Part 1: Web scraping

In [55]:
import numpy as np
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import time #required for request delays
import random
import json

In [2]:
# Pool of desktop browser User-Agent strings. One of them is drawn at random for
# each request so that consecutive requests do not all carry the same signature.
USER_AGENTS = [
    # Chrome on Windows
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    # Firefox on Linux
    'Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    # Safari on macOS
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    # Edge (Chromium) on Windows. Chromium-based Edge keeps the Chrome/Safari
    # tokens and appends "Edg/<version>"; the previous "Edge/91..." form without
    # those tokens does not correspond to any real Edge release.
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.54',
]


def get_random_headers():
    """
    Build HTTP request headers carrying a randomly chosen User-Agent.

    Returns:
        dict: A new dictionary with the single key 'User-Agent', whose value is
        one of the strings listed in USER_AGENTS.
    """
    return {'User-Agent': random.choice(USER_AGENTS)}


Part 1.1: Web scraping the QS World University Rankings 2026

HTML file download status codes:
    200 ==> Success
    4xx ==> Client-side error based on our request
    5xx ==> Server-side error based on the website's server

In [4]:
try:
    # Send a browser-like User-Agent with the request. The headers returned by
    # get_random_headers() must be passed to requests.get(); calling the helper
    # on its own line and discarding its result has no effect on the request.
    qs_world_ranking_html = requests.get(
        "https://www.topuniversities.com/world-university-rankings?items_per_page=150",
        headers=get_random_headers(),
        timeout=10,
    )
    qs_world_ranking_html.raise_for_status()  # Raises an exception if the status code is 4xx or 5xx

    soup_qs_world_ranking = BeautifulSoup(qs_world_ranking_html.text, 'html.parser')
    print('Status du téléchargement du fichier html 200 (OK) : ', qs_world_ranking_html.status_code)
except requests.exceptions.RequestException as e:
    print(f'Erreur lors de la requête : {e}')
    # Re-raise so that execution stops at this cell. Calling exit() inside a
    # notebook would shut the kernel down and discard all in-memory state.
    raise

Status du téléchargement du fichier html 200 (OK) :  200


Note: the standard 'requests' library did not retrieve the full HTML, as the 'uni-link' classes were missing from the source code. We therefore opted to query the JSON endpoint that provides the university URLs and metadata.

In [8]:
def get_university_urls_from_api(api_url, timeout=10):
    """
    Fetch university URLs and metadata from the QS Rankings JSON endpoint.

    Args:
        api_url (str): The target JSON API URL.
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 10, the value used by the other requests in this
            notebook; without a timeout a stalled connection would block forever.

    Returns:
        list: A list of dictionaries containing title, url, rank, region and
        country, one per entry of the "score_nodes" list in the response.
        An empty list is returned when the status code is not 200 or when any
        exception is raised (the problem is printed, not propagated).
    """
    try:
        # Execute the request using random headers to avoid bot detection
        response = requests.get(api_url, headers=get_random_headers(), timeout=timeout)

        if response.status_code != 200:
            print(f"Request failed (Status {response.status_code}) for URL: {api_url}")
            return []

        # Parse the JSON response into a Python dictionary
        data = response.json()

        # Extract the list of university nodes (default to empty list if not found)
        university_data = data.get("score_nodes", [])

        extracted_data = []
        base_url = "https://www.topuniversities.com"

        for index, uni in enumerate(university_data):
            relative_path = uni.get('path')

            # Reconstruct the absolute URL if the path is relative; any other
            # value (already absolute, or missing) is stored unchanged.
            if relative_path and relative_path.startswith('/'):
                full_url = base_url + relative_path
            else:
                full_url = relative_path

            # Store metadata (crucial for later Text Mining and categorisation)
            extracted_data.append({
                'title': uni.get('title'),
                'url': full_url,
                'rank': uni.get('rank'),
                'region': uni.get('region'),
                'country': uni.get('country')
            })

            # Visual feedback for the first 3 items of the current page
            if index < 3:
                print(
                    f"University {index + 1}: "
                    f"{extracted_data[-1]['title']} "
                    f"(rank: {extracted_data[-1]['rank']})"
                )

        print(f"Page processed: {len(extracted_data)} universities retrieved.")
        return extracted_data

    except Exception as e:
        # Deliberate best effort: one failing page must not abort the whole run
        print(f"An error occurred during the API call: {e}")
        return []

In [None]:
# Accumulator for the records of every ranking page (reset each time this cell runs)
all_universities = []

# The endpoint is paginated with 150 entries per page:
# page=0 contains ranks 1-150, page=1 contains 151-300, etc.
# Pages 0 through 10 are requested.
for page_number in range(11):
    print(f"\n--- Processing Page {page_number} ---")

    # Only the page number varies between requests
    target_api = f"https://www.topuniversities.com/rankings/endpoint?nid=4061771&page={page_number}&items_per_page=150&tab=indicators&region=&countries=&cities=&search=&star=&sort_by=&order_by=&program_type=&scholarship=&fee=&english_score=&academic_score=&mix_student=&loggedincache="

    # Fetch this page and append its records to the accumulator
    page_results = get_university_urls_from_api(target_api)
    all_universities += page_results

print(f"\n EXTRACTION COMPLETE: {len(all_universities)} universities ready for content scraping.")


--- Processing Page 0 ---
University 1: Massachusetts Institute of Technology (MIT) (rank: 1)
University 2: Imperial College London (rank: 2)
University 3: Stanford University (rank: 3)
Page processed: 150 universities retrieved.

--- Processing Page 1 ---
University 1: Western University (rank: 151)
University 2: University of Vienna (rank: 152)
University 3: Universiti Teknologi Malaysia  (rank: 153)
Page processed: 150 universities retrieved.

--- Processing Page 2 ---
University 1: Universidad Carlos III de Madrid (UC3M) (rank: 301)
University 2: Stellenbosch University (rank: 302)
University 3: Jagiellonian University (rank: 303)
Page processed: 150 universities retrieved.

--- Processing Page 3 ---
University 1: UNESP (rank: 451)
University 2: Johannes Gutenberg Universität Mainz (rank: 452)
University 3: Shenzhen University (rank: 453)
Page processed: 150 universities retrieved.

--- Processing Page 4 ---
University 1: Taipei Medical University (TMU) (rank: 601)
University 2: T

In [12]:
# Sanity check: print the title and the URL of the first five records
for uni in all_universities[:5]:
    print(uni['title'], uni['url'], sep="\n")

Massachusetts Institute of Technology (MIT)
https://www.topuniversities.com/universities/massachusetts-institute-technology-mit
Imperial College London
https://www.topuniversities.com/universities/imperial-college-london
Stanford University
https://www.topuniversities.com/universities/stanford-university
University of Oxford
https://www.topuniversities.com/universities/university-oxford
Harvard University
https://www.topuniversities.com/universities/harvard-university


In [56]:
def scrape_university_description(url):
    """
    Download a QS university profile page and extract its 'About' text.

    Input: University URL
    Output: The non-empty paragraphs of the 'About' overview joined by blank
            lines, or None when the status code is not 200, when one of the
            expected containers is missing, or when an exception is raised.
    """
    try:
        # Request the page with a randomly chosen User-Agent
        page = requests.get(url, headers=get_random_headers(), timeout=10)

        if page.status_code != 200:
            print("Error : ", page.status_code)
            return None

        soup = BeautifulSoup(page.text, 'html.parser')

        # Outer block of the 'About' section
        about_section = soup.find("div", class_="block about_section d-none d-md-block")
        if not about_section:
            return None

        # Inner container holding the overview paragraphs
        text_section = about_section.find("div", class_="textsection abt-overview-read")
        if not text_section:
            return None

        # Keep only paragraphs that still contain text after stripping
        chunks = []
        for paragraph in text_section.find_all("p"):
            content = paragraph.get_text(strip=True)
            if content:
                chunks.append(content)

        return "\n\n".join(chunks)

    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None

In [57]:
# 'all_universities' is expected to be the list of dictionaries built in the API step
print(f"Starting the extraction of {len(all_universities)} universities...")

for index, uni in enumerate(all_universities):
    # Progress line: position in the list and university title
    print(f"[{index + 1}/{len(all_universities)}] Scraping: {uni['title']}")

    # Scrape the profile page and attach the text (or None) to the record itself
    uni['QS_description'] = scrape_university_description(uni['url'])

    # Fixed 2-second pause after each request to avoid being detected as a bot
    time.sleep(2)

print("\n✅ All Possible URLs processed!")

Starting the extraction of 1504 universities...
[1/1504] Scraping: Massachusetts Institute of Technology (MIT)
[2/1504] Scraping: Imperial College London
[3/1504] Scraping: Stanford University
[4/1504] Scraping: University of Oxford
[5/1504] Scraping: Harvard University
[6/1504] Scraping: University of Cambridge
[7/1504] Scraping: ETH Zurich
[8/1504] Scraping: National University of Singapore (NUS)
[9/1504] Scraping: UCL
[10/1504] Scraping: California Institute of Technology (Caltech)
[11/1504] Scraping: The University of Hong Kong
[12/1504] Scraping: Nanyang Technological University, Singapore (NTU Singapore)
[13/1504] Scraping: University of Chicago
[14/1504] Scraping: Peking University
[15/1504] Scraping: University of Pennsylvania
[16/1504] Scraping: Cornell University
[17/1504] Scraping: Tsinghua University
[18/1504] Scraping: University of California, Berkeley (UCB)
[19/1504] Scraping: The University of Melbourne
[20/1504] Scraping: The University of New South Wales (UNSW Sydney)

In [58]:
# Convert the list of QS dictionaries to a DataFrame (one row per university)
df = pd.DataFrame(all_universities)

# index=False leaves the pandas index column out of the files.
# utf-8-sig makes the CSV easy to open in Excel.
df.to_csv("qs_university_corpus.csv", index=False, encoding='utf-8-sig')
df.to_parquet("qs_university_corpus.parquet", index=False)

# Report the files that were actually written (the previous message named a
# file, qs_university_missions_full.csv, that this cell never creates).
print("💾 Files saved: qs_university_corpus.csv, qs_university_corpus.parquet")

💾 File saved: qs_university_missions_full.csv


We chose to save the corpus in a CSV file to facilitate visual verification of the content. However, the .parquet format will be primarily used for its compactness and processing efficiency.

Part 1.2: Web scraping the Times Higher Education (THE) World University Rankings 2026

In [3]:

try:
    # A browser-like User-Agent is sent through the `headers` argument below.
    # (A separate call to get_random_headers() whose result is discarded has no
    # effect, so it has been removed.)
    the_world_ranking_html = requests.get(
        "https://www.timeshighereducation.com/world-university-rankings/latest/world-ranking",
        headers=get_random_headers(),
        timeout=10,
    )
    the_world_ranking_html.raise_for_status()  # Raises an exception if the status code is 4xx or 5xx

    soup_the_world_ranking = BeautifulSoup(the_world_ranking_html.text, 'html.parser')
    print('Status du téléchargement du fichier html 200 (OK) : ', the_world_ranking_html.status_code)
except requests.exceptions.RequestException as e:
    print(f'Erreur lors de la requête : {e}')
    # Re-raise so that execution stops at this cell. Calling exit() inside a
    # notebook would shut the kernel down and discard all in-memory state.
    raise


Status du téléchargement du fichier html 200 (OK) :  200


In [None]:
def get_universities(html_path="World University Rankings 2026 _ Times Higher Education (THE).html"):
    """
    Read a locally saved THE ranking page and return its list of universities.

    The list is taken from the JSON document embedded in the
    <script id="__NEXT_DATA__"> tag, under
    props -> pageProps -> page -> rankingsTableConfig -> rankingsData -> data.

    Args:
        html_path (str): Path of the saved HTML file. Defaults to the file name
            originally hard-coded in this notebook, so existing calls without
            arguments behave as before.

    Returns:
        list | None: The list found under the 'data' key (empty if a key on the
        path is absent), or None when the script tag is missing or when any
        exception is raised (missing file, invalid JSON, ...). Problems are
        printed rather than propagated.
    """
    try:
        # 1. Read the local HTML file
        with open(html_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'html.parser')

        # 2. JSON extraction from the script tag
        script_data = soup.find('script', id='__NEXT_DATA__')
        if not script_data:
            print("Erreur : Balise __NEXT_DATA__ introuvable.")
            return None

        json_content = json.loads(script_data.string)

        # 3. Navigate through the JSON structure to find the universities list
        table_content = json_content.get('props', {}).get('pageProps', {}).get('page', {}).get('rankingsTableConfig', {})
        ranking_data = table_content.get('rankingsData', {})
        universities_list = ranking_data.get('data', [])

        print(f"Number of universities found: {len(universities_list)}")

        # 4. Display the first university with selected keys only
        if universities_list:
            first_uni = universities_list[0]

            print("\n--- Available keys ---")
            print(list(first_uni.keys()))

            # Restrict the preview to the keys used by the rest of the notebook
            keys_to_show = ['rank', 'name', 'url', 'location']
            filtered_view = {k: first_uni.get(k) for k in keys_to_show}

            print("\n--- FILTERED DATA OF THE FIRST UNIVERSITY ---")
            print(json.dumps(filtered_view, indent=4, ensure_ascii=False))

        return universities_list

    except Exception as e:
        print(f"An error occured: {e}")
        return None

In [99]:
# Load the THE ranking entries from the locally saved HTML page.
# get_universities() returns None when the file cannot be read or parsed.
universities = get_universities()

Number of universities found: 3118

--- FILTERED DATA OF THE FIRST UNIVERSITY ---
{
    "rank": "1",
    "name": "University of Oxford",
    "url": "/world-university-rankings/university-oxford",
    "location": "United Kingdom"
}


In [106]:
def extract_university_data(universities):
    """
    Build the list of THE university records to scrape.

    Args:
        universities: Iterable of dictionaries read with the keys 'rank',
            'name', 'url' and 'location'.

    Returns:
        list | None: One dictionary per retained entry with the keys 'rank',
        'name', 'location' and 'url' (the path prefixed with the THE base URL).
        Entries with an empty/missing name or path, or whose rank is
        "Reporter", are left out. None is returned if any exception is raised
        while reading the input.
    """
    try:
        base_url = "https://www.timeshighereducation.com"
        links = []

        for entry in universities:
            rank, name = entry.get('rank'), entry.get('name')
            # The relative profile path is stored under the 'url' key
            path, location = entry.get('url'), entry.get('location')

            # Skip incomplete entries and "Reporter" entries
            if not (name and path) or rank == "Reporter":
                continue

            links.append(dict(rank=rank, name=name, location=location, url=base_url + path))

        print(f"Extracted data for {len(links)} universities.")
        return links

    except Exception as e:
        print(f"Error while accessing JSON data: {e}")
        return None

In [112]:
# Keep the entries that have a name and a URL path and whose rank is not "Reporter",
# each with its absolute profile URL (None is returned if the extraction fails).
url_from_universities =extract_university_data(universities=universities)

Extracted data for 2191 universities.


In [None]:
def scrape_university_description(url):
    """
    Download a THE university profile page and extract its description text.

    Note: this definition reuses the name of the QS scraper defined earlier in
    this notebook and replaces it from this cell onwards.

    Args:
        url (str): Absolute URL of the university profile page.

    Returns:
        str | None: The non-empty paragraphs found inside the description
        container joined by blank lines, or None when the status code is not
        200, when the container is missing, or when an exception is raised.
        Every failure is printed rather than propagated.
    """
    try:
        response = requests.get(url, headers=get_random_headers(), timeout=10)

        if response.status_code != 200:
            # Report the failure instead of returning None silently, so that
            # blocked or missing pages are visible in the scraping log.
            print(f"Request failed (Status {response.status_code}) for URL: {url}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # NOTE(review): "css-jxa8wh" looks like a generated class name and may
        # change when the site is redeployed; confirm it is still valid.
        anchor = soup.find("div", class_="css-jxa8wh")

        if not anchor:
            print(f"Description container not found for URL: {url}")
            return None

        paragraphs = anchor.find_all("p")
        # Join all non-empty paragraphs with double newlines
        full_text = "\n\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
        return full_text

    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None

In [127]:
# Smoke test: scrape the description of the first extracted university only,
# to check the scraper before running it over the full list.
first_university_url = url_from_universities[0]['url']
print(first_university_url)
scrape_university_description(first_university_url)


https://www.timeshighereducation.com/world-university-rankings/university-oxford
Success


'The University of Oxford is the oldest university in the English-speaking world and the world’s second oldest surviving university. While its exact founding date is unknown, there is evidence that teaching took place as far back as 1096.\n\nLocated in and around Oxford’s medieval city centre, the university comprises 44 colleges and halls, and over 100 libraries, making it the largest library system in the UK.\n\nStudents number around 22,000 in total, just over half of whom are undergraduates while over 40 per cent are international, representing 140 countries between them.\n\nCalled the "city of dreaming spires" by Victorian poet, Matthew Arnold, Oxford has the youngest population of any city in England and Wales: nearly a quarter of its residents are university students, which gives Oxford a noticeable buzz.\n\nOxford has an alumni network of over 250,000 individuals, including more than 120 Olympic medallists, 26 Nobel Prize winners, seven poets laureate, and over 30 modern world 

In [128]:
# Collects one enriched record per THE university (reset each time this cell runs)
all_universities_data = []

print(f"Starting the extraction of {len(url_from_universities)} universities...")

for index, uni in enumerate(url_from_universities):
    # Pull the fields of the current record in one go
    name, rank, location, url = (uni[key] for key in ('name', 'rank', 'location', 'url'))

    print(f"[{index + 1}/{len(url_from_universities)}] Scraping: {name}")

    # The stored record keeps name, rank, location and the scraped text (or None)
    all_universities_data.append({
        "name": name,
        "rank": rank,
        "location": location,
        "description": scrape_university_description(url),
    })

    # Fixed 2-second pause after each request
    time.sleep(2)

print("--- Scraping Complete ---")

Starting the extraction of 2191 universities...
[1/2191] Scraping: University of Oxford
Success
[2/2191] Scraping: Massachusetts Institute of Technology
Success
[3/2191] Scraping: Princeton University
Success
[4/2191] Scraping: University of Cambridge


KeyboardInterrupt: 

In [None]:
# Convert the list of THE dictionaries to a DataFrame.
# The THE scraping loop stores its results in 'all_universities_data';
# 'all_universities' holds the QS records and must not be written to the THE files.
df = pd.DataFrame(all_universities_data)

# index=False leaves the pandas index column out of the files.
# utf-8-sig makes the CSV easy to open in Excel.
df.to_csv("the_university_corpus.csv", index=False, encoding='utf-8-sig')
df.to_parquet("the_university_corpus.parquet", index=False)

print("💾 File saved: the_university_corpus.csv")