Partie 1 : Récolte de données - Web scraping

In [1]:
import numpy as np
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import time #nécessaire pour les délais des requêtes
import random

In [2]:
# Pool of common browser User-Agent strings to rotate through
USER_AGENTS = [
    # Chrome on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    # Firefox on Linux
    'Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    # Safari on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    # Edge on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.54',
]


def get_random_headers():
    """Return a headers dict whose 'User-Agent' is drawn at random from USER_AGENTS."""
    return {'User-Agent': random.choice(USER_AGENTS)}

# Utilisation dans une requête :
# headers = get_random_headers()
# response = requests.get(url, headers=headers)

Code de statut HTTP renvoyé lors du téléchargement du fichier HTML :
 = 200 ==> tout s'est bien passé
 = 4xx ==> erreur de notre côté (client)
 = 5xx ==> erreur côté serveur
 

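En revanche, la requête `requests` ne permet pas de récupérer le fichier .html complet : on n'y trouve aucune occurrence de la classe `uni-link` (le contenu du tableau est vraisemblablement chargé par JavaScript après le chargement initial de la page). Nous avons donc préféré enregistrer manuellement la page web complète au format .html, et nous avons pu vérifier après téléchargement que les classes `uni-link` y sont bien présentes.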

In [3]:
# Download the QS ranking landing page and parse it with BeautifulSoup.
try:
    # Send a browser-like User-Agent with the request: generating the headers
    # is not enough, they must be passed through the `headers` argument.
    qs_world_ranking_html = requests.get(
        "https://www.topuniversities.com/world-university-rankings?items_per_page=150",
        headers=get_random_headers(),
        timeout=10,
    )
    qs_world_ranking_html.raise_for_status()  # raises an exception on a 4xx or 5xx status

    soup_qs_world_ranking = BeautifulSoup(qs_world_ranking_html.text, 'html.parser')
    print('Status du téléchargement du fichier html 200 (OK) : ', qs_world_ranking_html.status_code)
except requests.exceptions.RequestException as e:
    print(f'Erreur lors de la requête : {e}')
    # Re-raise to stop execution: calling exit() inside a notebook shuts the
    # kernel down and discards every variable computed so far.
    raise

Status du téléchargement du fichier html 200 (OK) :  200


In [8]:
def get_university_urls_from_api(api_url, timeout=10):
    """
    Fetch university URLs and metadata from the QS Rankings JSON endpoint.

    Args:
        api_url (str): The target JSON API URL.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        list: A list of dictionaries with the keys 'title', 'url', 'rank',
        'region' and 'country'. An empty list is returned when the request
        fails, the status code is not 200, or the payload cannot be processed.
    """
    base_url = "https://www.topuniversities.com"

    try:
        # Send a randomly chosen browser User-Agent with the request
        response = requests.get(api_url, headers=get_random_headers(), timeout=timeout)

        if response.status_code != 200:
            print(f"Request failed (Status {response.status_code}) for URL: {api_url}")
            return []

        # Parse the JSON response into a Python dictionary
        data = response.json()

        # List of university nodes (empty list when the key is absent)
        university_data = data.get("score_nodes", [])

        extracted_data = []
        for index, uni in enumerate(university_data):
            relative_path = uni.get('path')

            # Prefix the site root when the path is relative; otherwise keep
            # the value as-is (already absolute, or None when 'path' is missing)
            if relative_path and relative_path.startswith('/'):
                full_url = base_url + relative_path
            else:
                full_url = relative_path

            extracted_data.append({
                'title': uni.get('title'),
                'url': full_url,
                'rank': uni.get('rank'),
                'region': uni.get('region'),
                'country': uni.get('country')
            })

            # Visual feedback for the first 3 items of the current page
            if index < 3:
                print(
                    f"University {index + 1}: "
                    f"{extracted_data[-1]['title']} "
                    f"(rank: {extracted_data[-1]['rank']})"
                )

        print(f"Page processed: {len(extracted_data)} universities retrieved.")
        return extracted_data

    except Exception as e:
        # Best-effort by design: report the problem and let the caller
        # continue with the other pages.
        print(f"An error occurred during the API call: {e}")
        return []

In [9]:
# Global list collecting the universities from every results page
all_universities = []

# Number of result pages to request. Pages are zero-indexed and hold 150
# entries each: page=0 contains ranks 1-150, page=1 contains 151-300, etc.
N_PAGES = 11

for page_number in range(N_PAGES):
    print(f"\n--- Processing Page {page_number} ---")

    # Construct the API URL dynamically
    target_api = f"https://www.topuniversities.com/rankings/endpoint?nid=4061771&page={page_number}&items_per_page=150&tab=indicators&region=&countries=&cities=&search=&star=&sort_by=&order_by=&program_type=&scholarship=&fee=&english_score=&academic_score=&mix_student=&loggedincache="

    # Call the function and extend the global list
    page_results = get_university_urls_from_api(target_api)
    all_universities.extend(page_results)

    # Pause for a random 1-3 s between consecutive requests so the server is
    # not hit with rapid-fire calls (no pause needed after the last page)
    if page_number < N_PAGES - 1:
        time.sleep(random.uniform(1, 3))

print(f"\n EXTRACTION COMPLETE: {len(all_universities)} universities ready for content scraping.")


--- Processing Page 0 ---
University 1: Massachusetts Institute of Technology (MIT) (rank: 1)
University 2: Imperial College London (rank: 2)
University 3: Stanford University (rank: 3)
Page processed: 150 universities retrieved.

--- Processing Page 1 ---
University 1: Western University (rank: 151)
University 2: University of Vienna (rank: 152)
University 3: Universiti Teknologi Malaysia  (rank: 153)
Page processed: 150 universities retrieved.

--- Processing Page 2 ---
University 1: Universidad Carlos III de Madrid (UC3M) (rank: 301)
University 2: Stellenbosch University (rank: 302)
University 3: Jagiellonian University (rank: 303)
Page processed: 150 universities retrieved.

--- Processing Page 3 ---
University 1: UNESP (rank: 451)
University 2: Johannes Gutenberg Universität Mainz (rank: 452)
University 3: Shenzhen University (rank: 453)
Page processed: 150 universities retrieved.

--- Processing Page 4 ---
University 1: Taipei Medical University (TMU) (rank: 601)
University 2: T

In [12]:
# Preview the first five entries: title on one line, profile URL on the next
for uni in all_universities[:5]:
    print(uni['title'], uni['url'], sep='\n')

Massachusetts Institute of Technology (MIT)
https://www.topuniversities.com/universities/massachusetts-institute-technology-mit
Imperial College London
https://www.topuniversities.com/universities/imperial-college-london
Stanford University
https://www.topuniversities.com/universities/stanford-university
University of Oxford
https://www.topuniversities.com/universities/university-oxford
Harvard University
https://www.topuniversities.com/universities/harvard-university


In [None]:
# Fetch the profile page of the first university and extract the text of
# every <div class="card_content"> element.
url_first_uni = all_universities[0]['url']
print(url_first_uni)

response = requests.get(url_first_uni, headers=get_random_headers(), timeout=10)

if response.status_code == 200:
    # Show only a short preview: printing the whole document floods the output
    print(response.text[:500])

    soup = BeautifulSoup(response.text, 'html.parser')
    cards = soup.find_all('div', class_='card_content')

    if cards:
        # Join the text of each card; the generator must iterate over `cards`
        all_text = " ".join(
            card.get_text(separator=' ', strip=True) for card in cards
        )
        print("Extracted text", all_text)
    else:
        print('No div_card_content found')
else:
    # Parsing is skipped entirely on failure so no undefined `soup` is used
    print("Error", response.status_code)

https://www.topuniversities.com/universities/massachusetts-institute-technology-mit
<!DOCTYPE html>
<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
 <head attr="https://www.topuniversities.com/">
  <!-- AbTasty - DIG-130-->
  <script src="https://try.abtasty.com/9adf3d0b991db423705748a5e31c6121.js" type="text/javascript">
  </script>
  <!-- Botify Activation for TFBBL-90 -->
  <script async="" src="https://tags.pw.adn.cloud/EQRFNS/activation.js">
  </script>
  <script>
   (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push(

  {'gtm.start': new Date().getTime(),event:'gtm.js'}
  );var f=d.getElementsByTagName(s)[0],
  j=d.createElement(s),dl=l!='d

In [6]:
api_url = "https://www.topuniversities.com/rankings/endpoint?nid=4061771&page=0&items_per_page=150&tab=indicators&region=&countries=&cities=&search=&star=&sort_by=&order_by=&program_type=&scholarship=&fee=&english_score=&academic_score=&mix_student=&loggedincache="

# Stand-alone check of the JSON endpoint for the first results page
try:
    response = requests.get(api_url, headers=get_random_headers(), timeout=10)

    if response.status_code == 200:
        data = response.json()  # parse the JSON payload into a Python dictionary

        university_data = data.get("score_nodes", [])

        university_urls = []
        # The 'path' field only holds the part after the domain, so the site root is prepended
        base_url = "https://www.topuniversities.com"

        print("Extraction des hyperliens :")

        for index, uni in enumerate(university_data):
            relative_path = uni.get('path')

            # Build the full URL when the path is relative (starts with "/")
            if relative_path and relative_path.startswith('/'):
                full_url = base_url + relative_path
            else:
                full_url = relative_path  # already absolute (or missing)

            university_urls.append(full_url)

            if index < 5:
                print(f"University {index + 1}:{uni.get('title')} {full_url}")
        print(f"\n✅ Succès ! {len(university_urls)} liens récupérés sans aucun HTML à parser.")

    else:
        print(f"Erreur de requête : {response.status_code}")

except Exception as e:
    print(f"Une erreur est survenue : {e}")


Extraction des hyperliens :
University 1:Massachusetts Institute of Technology (MIT) https://www.topuniversities.com/universities/massachusetts-institute-technology-mit
University 2:Imperial College London https://www.topuniversities.com/universities/imperial-college-london
University 3:Stanford University https://www.topuniversities.com/universities/stanford-university
University 4:University of Oxford https://www.topuniversities.com/universities/university-oxford
University 5:Harvard University https://www.topuniversities.com/universities/harvard-university

✅ Succès ! 150 liens récupérés sans aucun HTML à parser.


In [None]:
def extract_html_from_file(file_name: str) -> "BeautifulSoup | None":
    """
    Read a locally saved HTML file and parse it with BeautifulSoup.

    Args:
        file_name (str): Path of the HTML file to open (read as UTF-8).

    Returns:
        BeautifulSoup | None: The parsed document, or None when the file is
        missing or any other error occurs while reading/parsing it.
    """
    print(f"Tentative d'ouverture du fichier : {file_name}")
    try:
        # Open the file in read mode ('r')
        with open(file_name, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # Parsing happens after the file handle has been released
        return BeautifulSoup(html_content, 'html.parser')

    except FileNotFoundError:
        print(f'Erreur : fichier non trouvé. Assurez-vous que {file_name} est dans le même dossier que le code')
        return None
    except Exception as e:
        print(f'Erreur inattendue lors de la lecture du fichier : {e}')
        return None

In [None]:
# Locally saved copy of the QS ranking page, parsed into a BeautifulSoup object
file_name = 'QS World University Rankings 2026_Full.html'
soup_qs_ranking_saved_html = extract_html_from_file(file_name=file_name)

Tentative d'ouverture du fichier : QS World University Rankings 2026_Full.html
<!DOCTYPE html>

<!-- saved from url=(0076)https://www.topuniversities.com/world-university-rankings?items_per_page=150 -->
<html class="js scrolled" data-once="cvjquery webform-dialog" dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# "><head attr="https://www.topuniversities.com/"><meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<!-- AbTasty - DIG-130-->
<script async="" defer="" src="./QS World University Rankings 2026_Full_files/client"></script><script async="" defer="" src="./QS World University Rankings 2026_Full_files/config.js.téléchargement"></

Sélecteur cible : 'a.uni-link'
pour rechercher toutes les balises <a> qui ont la classe "uni-link"
et extraire les hyperliens des 150 meilleures universités du classement QS_World

In [None]:
# Extract the university profile links from the locally saved ranking page.
if soup_qs_ranking_saved_html is not None:
    # Search the saved page (the same object that was just checked) for every
    # <a> tag carrying the "uni-link" class
    university_link_elements = soup_qs_ranking_saved_html.find_all('a', class_="uni-link")

    print("\n--- DÉBOGAGE : 10 PREMIERS ÉLÉMENTS TROUVÉS PAR 'a.uni-link' ---")
    if not university_link_elements:
        # Typical cause: the table content is loaded by JavaScript AFTER the
        # initial page load. Raise instead of exit(), which would shut the
        # notebook kernel down.
        raise RuntimeError("Attention : aucun hyperlien trouvé avec le sélecteur 'a.uni-link'")
    print(f" {len(university_link_elements)} hyperliens trouvés")

    university_urls = []
    # The href only holds the part after the domain (e.g. "/universities/..."),
    # so the site root is prepended
    base_url = "https://www.topuniversities.com"

    print("Extraction des hyperliens :")

    for index, link_element in enumerate(university_link_elements):
        relative_path = link_element.get('href')

        # Build the full URL when the link is relative (starts with "/")
        if relative_path and relative_path.startswith('/'):
            full_url = base_url + relative_path
        else:
            full_url = relative_path  # already absolute (or missing)

        university_urls.append(full_url)

        # Show the first 10 links as a sanity check
        if index < 10:
            print(f"Rang {index + 1: <3}: {full_url}")




--- DÉBOGAGE : 10 PREMIERS ÉLÉMENTS TROUVÉS PAR 'a.uni-link' ---
 150 hyperliens trouvés
Extraction des hyperliens :
Rang 1  : https://www.topuniversities.com/universities/massachusetts-institute-technology-mit
Rang 2  : https://www.topuniversities.com/universities/imperial-college-london
Rang 3  : https://www.topuniversities.com/universities/stanford-university
Rang 4  : https://www.topuniversities.com/universities/university-oxford
Rang 5  : https://www.topuniversities.com/universities/harvard-university
Rang 6  : https://www.topuniversities.com/universities/university-cambridge
Rang 7  : https://www.topuniversities.com/universities/eth-zurich
Rang 8  : https://www.topuniversities.com/universities/national-university-singapore-nus
Rang 9  : https://www.topuniversities.com/universities/ucl
Rang 10 : https://www.topuniversities.com/universities/california-institute-technology-caltech


In [12]:
# Final summary: number of links collected, plus the first and last URL
print("\n--- Résultat Final ---")
print("Nombre total d'hyperliens extraits :", len(university_urls))
print("Le premier lien est :", university_urls[0])
print("Le dernier lien extrait est :", university_urls[-1])


--- Résultat Final ---
Nombre total d'hyperliens extraits : 150
Le premier lien est : https://www.topuniversities.com/universities/massachusetts-institute-technology-mit
Le dernier lien extrait est : https://www.topuniversities.com/universities/university-cape-town


In [6]:
# Quick status check of the ranking page.
# NOTE(review): the variable name refers to Times Higher Education, but the URL
# requested here is the QS ranking page — confirm the intended target.
times_higher_education_html = requests.get(
    'https://www.topuniversities.com/world-university-rankings?items_per_page=150',
    headers=get_random_headers(),  # browser-like User-Agent, as for the other requests
    timeout=10,  # avoid hanging forever on a stalled connection
)
print(times_higher_education_html.status_code)

200
