Part 1 : Web scraping

In [1]:
import numpy as np
import pandas as pd
import requests 
from bs4 import BeautifulSoup
import time #required for request delays
import random
import json

from tqdm import tqdm

In [7]:
# List of commonly used User-Agent strings, one per browser/OS combination
USER_AGENTS = [
    # Chrome on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    # Firefox on Linux
    "Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0",
    # Safari on macOS
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
    # Edge on Windows
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.54",
]


def get_random_headers():
    """Return a headers dictionary whose 'User-Agent' is drawn at random from USER_AGENTS."""
    return {"User-Agent": random.choice(USER_AGENTS)}


## Part 1.1 : Web scraping QS World University Rankings 2026

HTML file download status codes:
    200 ==> Success
    4xx ==> Client-side error caused by our request
    5xx ==> Server-side error on the website's server

In [10]:
# Download the QS ranking landing page and parse it.
# IMPORTANT : It is always wise to include a User-Agent to simulate a browser.
try:
    qs_world_ranking_html = requests.get(
        "https://www.topuniversities.com/world-university-rankings?items_per_page=150",
        # The random headers were previously computed but never passed to the request.
        headers=get_random_headers(),
        timeout=10,
    )
    qs_world_ranking_html.raise_for_status()  # Raises an exception if the status code is 4xx or 5xx

    soup_qs_world_ranking = BeautifulSoup(qs_world_ranking_html.text, 'html.parser')
    print('Downloading status of the html file 200 (OK) : ', qs_world_ranking_html.status_code)
except requests.exceptions.RequestException as e:
    print(f'Error : {e}')
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel down,
    # whereas an exception simply stops the run at this cell.
    raise

Downloading status of the html file 200 (OK) :  200


Note: the standard 'requests' library did not retrieve the full HTML, as the 'uni-link' classes were missing from the source code. We opted instead to use the JSON endpoint to retrieve the university URLs and metadata.

In [48]:
def get_university_urls_from_api(api_url, timeout=10):
    """
    Fetch university URLs and metadata from the QS Rankings JSON endpoint.

    Args:
        api_url (str): The target JSON API URL.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout, `requests.get` may block indefinitely on a
            stalled connection.

    Returns:
        list: A list of dictionaries containing title, url, rank, region and
        country. An empty list is returned when the status code is not 200 or
        when any exception is raised during the call.
    """
    base_url = "https://www.topuniversities.com"

    try:
        # Execute the request using random headers to avoid bot detection
        response = requests.get(api_url, headers=get_random_headers(), timeout=timeout)

        if response.status_code != 200:
            print(f"Request failed (Status {response.status_code}) for URL: {api_url}")
            return []

        # Parse the JSON response into a Python dictionary
        data = response.json()

        # Extract the list of university nodes (default to empty list if not found)
        university_data = data.get("score_nodes", [])

        extracted_data = []
        for index, uni in enumerate(university_data):
            relative_path = uni.get('path')

            # Reconstruct the absolute URL if the path is relative;
            # a missing or already absolute path is stored unchanged.
            if relative_path and relative_path.startswith('/'):
                full_url = base_url + relative_path
            else:
                full_url = relative_path

            # Store metadata (crucial for later Text Mining and categorisation)
            extracted_data.append({
                'title': uni.get('title'),
                'url': full_url,
                'rank': uni.get('rank'),
                'region': uni.get('region'),
                'country': uni.get('country')
            })

            # Visual feedback for the first 3 items of the current page
            if index < 3:
                print(
                    f"University {index + 1}: "
                    f"{extracted_data[-1]['title']} "
                    f"(rank: {extracted_data[-1]['rank']})"
                )

        print(f"Page processed: {len(extracted_data)} universities retrieved.")
        return extracted_data

    except Exception as e:
        # Best effort: a failing page must not abort the whole pagination loop.
        print(f"An error occurred during the API call: {e}")
        return []

In [49]:
# Accumulator for the universities returned by every API page
all_universities = []

# Walk through pages 0 to 10 of the endpoint.
# page=0 contains ranks 1-150, page=1 contains 151-300, etc.
for page_number in range(11):
    print(f"\n--- Processing Page {page_number} ---")

    # The page number is the only part of the query string that changes
    target_api = f"https://www.topuniversities.com/rankings/endpoint?nid=4061771&page={page_number}&items_per_page=150&tab=indicators&region=&countries=&cities=&search=&star=&sort_by=&order_by=&program_type=&scholarship=&fee=&english_score=&academic_score=&mix_student=&loggedincache="

    # A failed page yields an empty list, so the accumulator is simply left unchanged
    all_universities += get_university_urls_from_api(target_api)

print(f"\n EXTRACTION COMPLETE: {len(all_universities)} universities ready for content scraping.")


--- Processing Page 0 ---
University 1: Massachusetts Institute of Technology (MIT) (rank: 1)
University 2: Imperial College London (rank: 2)
University 3: Stanford University (rank: 3)
Page processed: 150 universities retrieved.

--- Processing Page 1 ---
University 1: Western University (rank: 151)
University 2: University of Vienna (rank: 152)
University 3: Universiti Teknologi Malaysia  (rank: 153)
Page processed: 150 universities retrieved.

--- Processing Page 2 ---
University 1: Universidad Carlos III de Madrid (UC3M) (rank: 301)
University 2: Stellenbosch University (rank: 302)
University 3: Jagiellonian University (rank: 303)
Page processed: 150 universities retrieved.

--- Processing Page 3 ---
University 1: UNESP (rank: 451)
University 2: Johannes Gutenberg Universität Mainz (rank: 452)
University 3: Shenzhen University (rank: 453)
Page processed: 150 universities retrieved.

--- Processing Page 4 ---
University 1: Taipei Medical University (TMU) (rank: 601)
University 2: T

In [50]:
# Preview the title and profile URL of the first five universities
for uni in all_universities[:5]:
    print(uni['title'], uni['url'], sep="\n")

Massachusetts Institute of Technology (MIT)
https://www.topuniversities.com/universities/massachusetts-institute-technology-mit
Imperial College London
https://www.topuniversities.com/universities/imperial-college-london
Stanford University
https://www.topuniversities.com/universities/stanford-university
University of Oxford
https://www.topuniversities.com/universities/university-oxford
Harvard University
https://www.topuniversities.com/universities/harvard-university


In [51]:
def scrape_university_description(url):
    """
    Download a QS university profile page and extract its 'About' text.

    Args:
        url (str): URL of the university profile page.

    Returns:
        str | None: The description text. None is returned when the status
        code is not 200, when the 'About' block or its text container is
        missing, or when any exception is raised while scraping.
    """
    try:
        # 1. Fetch the page with random headers
        response = requests.get(url, headers=get_random_headers(), timeout=10)

        if response.status_code != 200:
            print("Error : ", response.status_code)
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # 2. Locate the 'About' block
        about_section = soup.find("div", class_="block about_section d-none d-md-block")
        if not about_section:
            return None

        # 3. Targeted extraction of all paragraphs inside the text container
        text_section = about_section.find("div", class_="textsection abt-overview-read")
        if not text_section:
            # Made explicit: the original fell off the end of the function here.
            return None

        paragraph_texts = [p.get_text(strip=True) for p in text_section.find_all("p")]
        full_text = "\n\n".join(text for text in paragraph_texts if text)

        if not full_text:
            # Some pages keep the description directly in the container, split by
            # read-more / read-less spans. This fallback is also taken when <p> tags
            # exist but are all empty, which previously produced an empty string.
            for btn in text_section.find_all("span", class_=["read-more", "read-less"]):
                btn.decompose()
            full_text = text_section.get_text(separator=" ", strip=True)

        return full_text

    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None

We noticed that the descriptions of 367 universities were empty, for example "https://www.topuniversities.com/universities/universiti-malaya-um", because the HTML of these pages is not structured the same way: their descriptions are split by read-more and read-less HTML tags.

In [52]:
# Sanity check on one of the profiles whose description initially came back empty
scrape_university_description("https://www.topuniversities.com/universities/universiti-malaya-um")

'University of Malaya, the first University of the country, is situated on a 750-acre (309-hectare) campus in the southwest of Kuala Lumpur, the capital city of Malaysia.The University of Malaya grew out of a tradition of service to the society. Its predecessors, the King Edward VII College of Medicine established in 1905 and Raffles College in 1929, has been established to meet urgent demands, one in medicine and the other in education. When the two came together to form the University of Malaya in October 1949, this was so that they might perform together an even greater service - to help lay the foundations of a new nation by producing a generation of skilled and educated men. Hence the University of Malaya was established on 8 October 1949 as a national institution to serve the higher education needs of the Federation of Malaya and Singapore.The growth of the University was very rapid during the first decade of its establishment and this resulted in the setting up of two autonomous

In [53]:
# 'all_universities' is the list of dictionaries built from the API pages
print(f"Starting the extraction of {len(all_universities)} universities...")

for index, uni in enumerate(all_universities):
    # Progress line for the profile about to be fetched
    print(f"[{index + 1}/{len(all_universities)}] Scraping: {uni['title']}")

    # Fetch the 'About' text and attach it to the record in place
    uni['QS_description'] = scrape_university_description(uni['url'])

    # --- SAFETY PAUSE ---
    # Fixed 2-second wait between requests to avoid being detected as a bot
    time.sleep(2)

print("\n✅ All Possible URLs processed!")

Starting the extraction of 1504 universities...
[1/1504] Scraping: Massachusetts Institute of Technology (MIT)
[2/1504] Scraping: Imperial College London
[3/1504] Scraping: Stanford University
[4/1504] Scraping: University of Oxford
[5/1504] Scraping: Harvard University
[6/1504] Scraping: University of Cambridge
[7/1504] Scraping: ETH Zurich
[8/1504] Scraping: National University of Singapore (NUS)
[9/1504] Scraping: UCL
[10/1504] Scraping: California Institute of Technology (Caltech)
[11/1504] Scraping: The University of Hong Kong
[12/1504] Scraping: Nanyang Technological University, Singapore (NTU Singapore)
[13/1504] Scraping: University of Chicago
[14/1504] Scraping: Peking University
[15/1504] Scraping: University of Pennsylvania
[16/1504] Scraping: Cornell University
[17/1504] Scraping: Tsinghua University
[18/1504] Scraping: University of California, Berkeley (UCB)
[19/1504] Scraping: The University of Melbourne
[20/1504] Scraping: The University of New South Wales (UNSW Sydney)

Even when accounting for web pages where the description is split with read-more and read-less, three universities "https://www.topuniversities.com/universities/universiti-pendidikan-sultan-idris-upsi", "https://www.topuniversities.com/universities/universite-de-franche-comte" and "https://www.topuniversities.com/universities/lovely-professional-university-lpu" still do not contain any description text. We can also remove "https://www.topuniversities.com/universities/university-maryland-baltimore" as the description repeats the university's name.

In [None]:
# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(all_universities)

# index=False drops the pandas index column.
# utf-8-sig makes the CSV easy to open in Excel.
df.to_csv("DATA/RAW/CSV/qs_university_corpus.csv", index=False, encoding='utf-8-sig')
df.to_parquet("DATA/RAW/PARQUET/qs_university_corpus.parquet", index=False)

# The message now names the files that are actually written
# (it used to announce "qs_university_missions_full.csv").
print("💾 Files saved: qs_university_corpus.[csv/parquet]")

💾 File saved: qs_university_missions_full.csv


In [3]:
# One-off cleaning step, run from the saved parquet file so the scraping does not
# have to be repeated: rename 'title' to 'name' and 'QS_description' to 'description'.
# Ranks 577, 843 and 904 are removed because the description repeats the university's name.
# Ranks 1005, 942, 922, 900, 890, 546 and more have "." as their description.


df = pd.read_parquet("DATA/RAW/PARQUET/qs_university_corpus.parquet")

# Rename columns
df = df.rename(columns={'title': 'name', 'QS_description': 'description'})

# Ensure name and description are stripped strings.
# Missing descriptions are replaced by "" BEFORE the cast: astype(str) alone turns
# None / NaN into the literal strings "None" / "nan", which then pass every filter.
df['name'] = df['name'].astype(str).str.strip()
df['description'] = df['description'].fillna("").astype(str).str.strip()

# Build mask for rows to keep:
#  - description not empty (this now also covers missing descriptions)
#  - description not equal to "."
#  - description different from university name
mask_keep = (
    (df['description'] != "") &
    (df['description'] != ".") &
    (df['description'].str.lower() != df['name'].str.lower())
)

initial_rows = len(df)
df = df[mask_keep].copy()
removed_rows = initial_rows - len(df)
print(f"Removed {removed_rows} row(s) with empty / '.' / name-equal descriptions")

# Save cleaned datasets
df.to_parquet(
    "DATA/CLEAN/PARQUET/qs_university_corpus_no_cleaned_description.parquet",
    index=False
)
df.to_csv(
    "DATA/CLEAN/CSV/qs_university_corpus_no_cleaned_description.csv",
    index=False,
    encoding='utf-8-sig'
)

print("💾 Files saved: qs_university_corpus_no_cleaned_description.[parquet/csv]")


Removed 364 row(s) with empty / '.' / name-equal descriptions
💾 Files saved: qs_university_corpus_no_cleaned_description.[parquet/csv]


We chose to save the corpus in a CSV file to facilitate visual verification of the content. However, the .parquet format will be primarily used for its compactness and processing efficiency.

## Part 1.2 : Web scraping Times Higher Education (THE) World University Ranking 2026

In [None]:

# Download the current THE ranking page and parse it.
# IMPORTANT : It is always wise to include a User-Agent to simulate a browser.
try:
    # A stray get_random_headers() call whose result was discarded has been removed;
    # the headers actually sent are the ones built below.
    the_world_ranking_html = requests.get(
        "https://www.timeshighereducation.com/world-university-rankings/latest/world-ranking",
        headers=get_random_headers(),
        timeout=10,
    )
    the_world_ranking_html.raise_for_status()  # Raises an exception if the status code is 4xx or 5xx

    soup_the_world_ranking = BeautifulSoup(the_world_ranking_html.text, 'html.parser')
    print('Downloading status of the html file 200 (OK) : ', the_world_ranking_html.status_code)
except requests.exceptions.RequestException as e:
    print(f'Error : {e}')
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel down,
    # whereas an exception simply stops the run at this cell.
    raise


Status du téléchargement du fichier html 200 (OK) :  200


In [30]:
def get_universities(file):
    """
    Read a locally saved THE ranking page and return its list of universities.

    The list is taken from the JSON stored in the <script id="__NEXT_DATA__"> tag,
    under props > pageProps > page > rankingsTableConfig > rankingsData > data.
    A preview of the first entry is printed.

    Args:
        file (str): Path of the saved HTML file (read as UTF-8).

    Returns:
        list | None: The university entries found (possibly empty), or None when
        the script tag is missing or any exception is raised.
    """
    try:
        # 1. Read and parse the local HTML file
        with open(file, 'r', encoding='utf-8') as html_handle:
            soup = BeautifulSoup(html_handle.read(), 'html.parser')

        # 2. Locate the script tag that embeds the page data as JSON
        script_data = soup.find('script', id='__NEXT_DATA__')
        if not script_data:
            print("Erreur : Balise __NEXT_DATA__ introuvable.")
            return None

        # 3. Walk down the JSON structure, defaulting to an empty dict at every level
        node = json.loads(script_data.string)
        for key in ('props', 'pageProps', 'page', 'rankingsTableConfig', 'rankingsData'):
            node = node.get(key, {})
        universities_list = node.get('data', [])

        print(f"Number of universities found: {len(universities_list)}")

        # 4. Preview of the first university, restricted to a few keys
        if universities_list:
            first_uni = universities_list[0]

            print("\n--- Available keys ---")
            print(list(first_uni.keys()))

            filtered_view = {}
            for key in ('rank', 'name', 'url', 'location'):
                filtered_view[key] = first_uni.get(key)

            print("\n--- FILTERED DATA OF THE FIRST UNIVERSITY ---")
            print(json.dumps(filtered_view, indent=4, ensure_ascii=False))

        return universities_list

    except Exception as e:
        print(f"An error occured: {e}")
        return None

In [28]:
# Parse the locally saved 2026 THE ranking page (file path relative to the notebook)
universities = get_universities("DATA/HTML/World University Rankings 2026 _ Times Higher Education (THE).html")

Number of universities found: 3118

--- Available keys ---
['rank_order', 'rank', 'name', 'scores_overall', 'scores_overall_rank', 'scores_teaching', 'scores_teaching_rank', 'scores_research', 'scores_research_rank', 'scores_citations', 'scores_citations_rank', 'scores_industry_income', 'scores_industry_income_rank', 'scores_international_outlook', 'scores_international_outlook_rank', 'iid', 'record_type', 'member_level', 'url', 'nid', 'location', 'stats_number_students', 'stats_student_staff_ratio', 'stats_pc_intl_students', 'stats_female_male_ratio', 'aliases', 'closed', 'unaccredited', 'disabled', 'apply_link', 'cta_button', 'logo_url', 'enhanced']

--- FILTERED DATA OF THE FIRST UNIVERSITY ---
{
    "rank": "1",
    "name": "University of Oxford",
    "url": "/world-university-rankings/university-oxford",
    "location": "United Kingdom"
}


In [None]:
def extract_university_data(universities):
    """
    Build the list of THE profile links from the raw ranking entries.

    An entry is kept when it has a non-empty 'name', a non-empty 'url' and a
    'rank' different from "Reporter". The site base URL is prepended to 'url'.

    Args:
        universities (list): Dictionaries as returned by get_universities.

    Returns:
        list | None: Dictionaries with the keys rank, name, location and url,
        or None when an exception is raised while reading the entries.
    """
    base_url = "https://www.timeshighereducation.com"

    try:
        links = [
            {
                "rank": uni.get('rank'),
                "name": uni.get('name'),
                "location": uni.get('location'),
                "url": base_url + uni.get('url'),  # 'url' holds the site-relative path
            }
            for uni in universities
            if uni.get('name') and uni.get('url') and uni.get('rank') != "Reporter"
        ]
    except Exception as e:
        print(f"Error while accessing JSON data: {e}")
        return None

    print(f"Extracted data for {len(links)} universities.")
    return links

In [None]:
# Keep only the ranked universities (rank != "Reporter") and build their absolute profile URLs
url_from_universities =extract_university_data(universities=universities)

Extracted data for 2191 universities.


In [None]:
def scrape_university_description(url):
    """
    Download a THE university profile page and return its description text.

    NOTE(review): this redefines `scrape_university_description`, which an earlier
    cell of this notebook defines for QS pages. Once this cell has run, any call
    uses these THE selectors — confirm that re-running the QS loop is not expected.

    Args:
        url (str): Absolute URL of the THE profile page.

    Returns:
        str | None: The non-empty paragraphs joined by blank lines (possibly an
        empty string). None is returned when the status code is not 200, when
        the description container is missing, or when an exception is raised.
    """
    try:
        response = requests.get(url, headers=get_random_headers(), timeout=10)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # Container that holds the description paragraphs
        anchor = soup.find("div", class_="css-jxa8wh")
        if not anchor:
            print(f"Description container not found for URL: {url}")
            return None

        # Collect the non-empty paragraphs, then join them with double newlines
        texts = []
        for paragraph in anchor.find_all("p"):
            text = paragraph.get_text(strip=True)
            if text:
                texts.append(text)
        return "\n\n".join(texts)

    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None

In [None]:
# Sanity check: scrape the description of the first university in the list
first_university_url = url_from_universities[0]['url']
print(first_university_url)
scrape_university_description(first_university_url)


https://www.timeshighereducation.com/world-university-rankings/university-oxford


'The University of Oxford is the oldest university in the English-speaking world and the world’s second oldest surviving university. While its exact founding date is unknown, there is evidence that teaching took place as far back as 1096.\n\nLocated in and around Oxford’s medieval city centre, the university comprises 44 colleges and halls, and over 100 libraries, making it the largest library system in the UK.\n\nStudents number around 22,000 in total, just over half of whom are undergraduates while over 40 per cent are international, representing 140 countries between them.\n\nCalled the "city of dreaming spires" by Victorian poet, Matthew Arnold, Oxford has the youngest population of any city in England and Wales: nearly a quarter of its residents are university students, which gives Oxford a noticeable buzz.\n\nOxford has an alumni network of over 250,000 individuals, including more than 120 Olympic medallists, 26 Nobel Prize winners, seven poets laureate, and over 30 modern world 

In [None]:
# One enriched record (name, rank, location, description) per ranked university
all_universities_data = []

print(f"Starting the extraction of {len(url_from_universities)} universities...")

for index, uni in enumerate(url_from_universities):
    name = uni['name']
    url = uni['url']

    print(f"[{index + 1}/{len(url_from_universities)}] Scraping: {name}")

    # The description is None when the profile could not be scraped
    all_universities_data.append({
        "name": name,
        "rank": uni['rank'],
        "location": uni['location'],
        "description": scrape_university_description(url),
    })
    time.sleep(2)  # Safety pause
print("--- Scraping Complete ---")

Starting the extraction of 2191 universities...
[1/2191] Scraping: University of Oxford
[2/2191] Scraping: Massachusetts Institute of Technology
[3/2191] Scraping: Princeton University
[4/2191] Scraping: University of Cambridge
[5/2191] Scraping: Harvard University
[6/2191] Scraping: Stanford University
[7/2191] Scraping: California Institute of Technology
[8/2191] Scraping: Imperial College London
[9/2191] Scraping: University of California, Berkeley
[10/2191] Scraping: Yale University
[11/2191] Scraping: ETH Zurich
[12/2191] Scraping: Tsinghua University
[13/2191] Scraping: Peking University
[14/2191] Scraping: University of Pennsylvania
[15/2191] Scraping: The University of Chicago
[16/2191] Scraping: Johns Hopkins University
[17/2191] Scraping: National University of Singapore
[18/2191] Scraping: Cornell University
[19/2191] Scraping: University of California, Los Angeles
[20/2191] Scraping: Columbia University
[21/2191] Scraping: University of Toronto
[22/2191] Scraping: UCL
[23/2

Check the number of descriptions extracted:
the total number of universities with a rank is 2191,
but the profiles of Russian universities have been temporarily suspended,
and 3 universities (Delta University for Science and Technology; Shivaji University, Kolhapur; and South Valley University (Egypt)) encountered an issue while scraping.

In [None]:


# Drop the records whose description is None, empty or made only of whitespace.

def _has_description(uni):
    """Return a truthy value when the record's 'description' contains non-blank text."""
    return uni.get('description') and uni.get('description').strip()


print(f"Original count: {len(all_universities_data)}")

cleaned_universities_data = list(filter(_has_description, all_universities_data))

print(f"Cleaned count: {len(cleaned_universities_data)}")
print(f"Removed {len(all_universities_data) - len(cleaned_universities_data)} entries with no description.")

Original count: 2191
Cleaned count: 2106
Removed 85 entries with no description.


In [None]:
# Build a DataFrame from the cleaned list of records
df = pd.DataFrame(cleaned_universities_data)

# CSV copy: no pandas index column, utf-8-sig so the file opens easily in Excel
df.to_csv(
    "DATA/RAW/CSV/the_university_corpus.csv",
    index=False,
    encoding='utf-8-sig',
)
# Parquet copy: no pandas index column
df.to_parquet(
    "DATA/RAW/PARQUET/the_university_corpus.parquet",
    index=False,
)

print("File saved: the_university_corpus.csv")

💾 File saved: the_university_corpus.csv


In [None]:
# One-off cleaning step for the THE corpus, run from the saved CSV so the scraping
# does not have to be repeated:
#  - rename the 'location' column to 'country';
#  - drop Tianjin University, whose description is not useful
#    ("https://www.timeshighereducation.com/world-university-rankings/tianjin-university");
#  - drop Kyushu Institute of Technology (Kyutech), whose description is ".".

df = pd.read_csv("DATA/RAW/CSV/the_university_corpus.csv")

df = df.rename(columns={'location': 'country'})

uni_to_remove = ['Kyushu Institute of Technology (Kyutech)', 'Tianjin University']

index_to_drop = df[df['name'].astype(str).isin(uni_to_remove)].index

if not index_to_drop.empty:
    # Reassign instead of inplace=True so that re-running the cell stays predictable
    df = df.drop(index_to_drop)
    print(f"Removed {len(index_to_drop)} row(s)")
else:
    # The rows are matched on the university name, not on the rank
    print("No row named Kyushu Institute of Technology (Kyutech) or Tianjin University")

df.to_csv("DATA/CLEAN/CSV/the_university_corpus.csv", index=False, encoding='utf-8-sig')
df.to_parquet("DATA/CLEAN/PARQUET/the_university_corpus.parquet", index=False)

# The message now names the files that are actually written
# (it used to announce "the_university_missions_full.csv").
print("💾 Files saved: the_university_corpus.[csv/parquet]")

Removed 2 row(s)
💾 File saved: the_university_missions_full.csv


## Part 1.3: Web scraping Times Higher Education (THE) World University Ranking 2011


This approach will allow us to analyse how key terms and themes have evolved over time by comparing university descriptions from before 2015 (the pre-SDG era) with more recent ones. To achieve this, we must use the Wayback Machine: while current rankings and URLs can be retrieved directly from the THE website, the descriptions themselves are updated annually, making historical data inaccessible on the live site.

In [None]:
# Parse the locally saved 2011-2012 THE ranking page with get_universities.
# NOTE(review): universities_2012 is not referenced by the later cells visible here;
# the 2012 URLs are taken from the archived page instead — confirm it is still needed.
universities_2012 = get_universities("DATA/HTML/World University Rankings 2011-2012 _ Times Higher Education (THE).html")

In [None]:
# Settings for the Wayback Machine scraping.
# NOTE(review): the cells visible in this section hard-code their own pause (2 s) and
# define a local base_url inside the functions; these three names do not appear to be
# read afterwards — confirm before relying on them.
wayback_date ="20130101"
sleep_time = 2         
base_url = "https://web.archive.org/"

In [None]:
# Download the archived 2011-12 THE ranking page from the Wayback Machine and parse it.
# IMPORTANT : It is always wise to include a User-Agent to simulate a browser.
try:
    # A stray get_random_headers() call whose result was discarded has been removed;
    # the headers actually sent are the ones built below.
    the_world_ranking_html = requests.get(
        "https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking",
        headers=get_random_headers(),
        timeout=10,
    )
    the_world_ranking_html.raise_for_status()  # Raises an exception if the status code is 4xx or 5xx

    soup_the_world_ranking = BeautifulSoup(the_world_ranking_html.text, 'html.parser')
    print('Downloading status of the html file 200 (OK) : ', the_world_ranking_html.status_code)
except requests.exceptions.RequestException as e:
    print(f'Error : {e}')
    # Re-raise instead of calling exit(): in a notebook exit() shuts the kernel down,
    # whereas an exception simply stops the run at this cell.
    raise


Downloading status of the html file 200 (OK) :  200


In [8]:
def get_university_urls_from_2012(url):
    """
    Collect the university profile links from an archived THE ranking page.

    Args:
        url (str): URL of the archived ranking page on web.archive.org.

    Returns:
        list | None: The profile URLs found in the <td class="uni"> cells
        (an empty list when no such cell exists). None is returned when the
        status code is not 200, when the ranking table is missing, or when an
        exception is raised.
    """
    base_url = "https://web.archive.org"

    try:
        response = requests.get(url, headers=get_random_headers(), timeout=10)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        # The ranking table must be present, otherwise the page is not the expected one
        table = soup.find_all("table", class_="ranking main ind-OS overall")
        if not table:
            print(f"Description container not found for URL: {url}")
            return None
        print("Found the container")

        uni_td = soup.find_all("td", class_="uni")
        if not uni_td:
            # This check used to sit after the return statement and could never run.
            # The empty list is still returned below, as before.
            print(f"No university cell found for URL: {url}")

        university_links_2012 = []
        for td in uni_td:
            a_tag = td.find("a")
            if a_tag and 'href' in a_tag.attrs:
                href = a_tag['href']
                # Archived links are site-relative; an already absolute link is kept as is.
                # Plain concatenation is used on purpose: the path embeds "http://".
                if href.startswith(("http://", "https://")):
                    full_url = href
                else:
                    full_url = base_url + href
                university_links_2012.append(full_url)

        return university_links_2012

    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None

In [9]:
# Extract the profile URLs from the archived 2011-12 ranking page.
# NOTE: get_university_urls_from_2012 returns None on failure, in which case len() below raises a TypeError.
list_of_url_2012 = get_university_urls_from_2012("https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking")
print(f"Number of university URLs extracted for 2012: {len(list_of_url_2012)}")

Found the container
Number of university URLs extracted for 2012: 200


Once the URLs have been extracted, we can begin scraping the individual university pages. However, after several requests, an archive server saturation error occurs. To mitigate this issue, we have implemented a retry loop that attempts to scrape each URL up to three times, using an exponentially increasing delay between attempts.

In [14]:
def scrape_university_description_2011(url):
    """
    Scrape rank, name, description and region from an archived THE profile page.

    Args:
        url (str): URL of the archived university page.

    Returns:
        dict | None: A dictionary with the keys rank, name, description and
        region. The rank is "N/A" when its element is absent. None is returned
        when the status code is not 200, when the main content or the
        description section is missing, or when any exception is raised
        (including a missing details, header or region node).
    """
    try:
        response = requests.get(url, headers=get_random_headers(), timeout=30)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        main_content = soup.find("section", class_="main-content")
        if not main_content:
            print(f"Description container not found for URL: {url}")
            return None

        # Rank: read from the 'details' block, "N/A" when the span is absent
        rank_element = main_content.find("div", class_="details").find("span", class_="rank")
        if rank_element:
            rank = rank_element.get_text(strip=True)
        else:
            rank = "N/A"

        # Name: the <h1> of the page header
        universities_2012_name = (
            main_content.find("header", class_="header").find("h1").get_text(strip=True)
        )

        # Description: 'infomation' is the class name queried here; the misspelling is
        # intentional (per the original note, it is a mistake made by the site's developer)
        info_section = main_content.find("section", class_="infomation")
        if not info_section:
            print(f"No information section found for URL: {url}")
            return None

        non_empty_paragraphs = []
        for paragraph in info_section.find_all("p"):
            text = paragraph.get_text(strip=True)
            if text:
                non_empty_paragraphs.append(text)
        full_text = "\n\n".join(non_empty_paragraphs)

        # Region: the <span> inside <p class="region">
        region = main_content.find("p", class_='region').find("span").get_text(strip=True)

        return {
            "rank": rank,
            "name": universities_2012_name,
            "description": full_text,
            "region": region,
        }

    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None

In [15]:
# Sanity check on a single archived profile (California Institute of Technology)
scrape_university_description_2011("https://web.archive.org/web/20121007054238/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/california-institute-of-technology")

{'rank': '1',
 'name': 'California Institute of Technology',
 'description': "Caltech alumni include movie director Frank Capra, who graduated in 1918, but its 124-acre campus predates nearby Hollywood. More than 30 Caltech students have won Nobel prizes, and one alumnus - Harrison Schmitt - has walked on the Moon. Home to Nasa's Jet Propulsion Laboratory, it has a faculty of about 300 teaching around 2,000 students.",
 'region': 'North America'}

In [16]:
# Second sanity check on another archived profile (Emory University)
scrape_university_description_2011("https://web.archive.org/web/20121008005253/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/emory-university")


{'rank': '75',
 'name': 'Emory University',
 'description': 'A "million-dollar letter" from Asa Candler, founder of the Coca-Cola Company, in 1915 helped transform a small Methodist college, founded in 1832, into today\'s research university. It has around 13,000 students and more than 3,700 academics across four undergraduate and seven graduate schools on its 600-acre Druid Hills campus.',
 'region': 'North America'}

In [23]:
# Scrape every archived profile, retrying each URL with an exponential back-off.
universities_2012_data_success = []
universities_2012_data_failed = []

retries = 3

print(f"Starting the extraction of {len(list_of_url_2012)} universities...")

for url in tqdm(list_of_url_2012):
    success = False
    for attempt in range(retries):
        try:
            data = scrape_university_description_2011(url)
        except Exception as e:
            # The scraper already catches its own errors; this is a last safety net.
            data = None
            print(f" Exception on attempt {attempt + 1} for URL: {url} - {e}")
        else:
            if data:
                universities_2012_data_success.append({
                    "rank": data['rank'],
                    "name": data['name'],
                    "description": data['description'],
                    "region": data['region']
                })
                success = True
                break
            print(f"Attempt {attempt + 1} failed for URL: {url}")

        # Exponential backoff (5 s, 10 s, ...) — only when another attempt follows.
        # The original also slept after the final attempt, i.e. 20 s for nothing.
        if attempt < retries - 1:
            time.sleep(2**attempt * 5)

    if not success:
        universities_2012_data_failed.append({
            "url": url,
            "error": "Failed after retries"
        })
    time.sleep(2)  # Safety pause
print("--- Scraping Complete ---")
print(f"Success : {len(universities_2012_data_success)}")
print(f"Failed : {len(universities_2012_data_failed)}")

Starting the extraction of 200 universities...


  5%|▌         | 10/200 [00:40<13:01,  4.11s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/yale-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/yale-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D779010>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/yale-university


 10%|▉         | 19/200 [01:40<12:23,  4.11s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/cornell-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/cornell-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DD55A90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/cornell-university
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/cornell-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceed

 10%|█         | 20/200 [02:28<51:57, 17.32s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/carnegie-mellon-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/carnegie-mellon-university (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DB093D0>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/carnegie-mellon-university
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/inst

 10%|█         | 21/200 [03:14<1:16:50, 25.76s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-british-columbia: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-british-columbia (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6D7BFF90>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-british-columbia
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-

 12%|█▏        | 24/200 [04:06<51:01, 17.40s/it]  

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-washington: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-washington (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D7BFDD0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-washington


 14%|█▍        | 29/200 [04:55<24:32,  8.61s/it]  

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-tokyo: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-tokyo (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D6D1250>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-tokyo


 16%|█▌        | 31/200 [05:29<33:01, 11.72s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/karolinska-institute: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/karolinska-institute (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DFB1AD0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/karolinska-institute


 18%|█▊        | 37/200 [06:17<16:40,  6.14s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/australian-national-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/australian-national-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D5B73D0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/australian-national-university


 20%|██        | 40/200 [06:55<22:26,  8.41s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/washington-university-in-st-louis: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/washington-university-in-st-louis (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DBBCE90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/washington-university-in-st-louis


 20%|██        | 41/200 [07:25<39:13, 14.80s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-minnesota: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-minnesota (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DD4A450>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-minnesota
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-minnesota: HTTPSConnectionPool(host='web.archive.org', port=443)

 22%|██▏       | 44/200 [08:33<41:13, 15.86s/it]  

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/ludwig-maximilians-universitat-munchen: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/ludwig-maximilians-universitat-munchen (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DCE8350>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/ludwig-maximilians-universitat-munchen
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/ludwig-maximilians-universitat-mu

 22%|██▎       | 45/200 [09:34<1:15:51, 29.36s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/eacutecole-polytechnique-feacutedeacuterale-de-lausanne: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/eacutecole-polytechnique-feacutedeacuterale-de-lausanne (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DDAE690>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/eacutecole-polytechnique-feacutedeacuterale-de-lausanne


 24%|██▎       | 47/200 [10:08<56:06, 22.00s/it]  

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-manchester: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-manchester (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D9C3690>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-manchester


 25%|██▌       | 50/200 [10:46<34:45, 13.90s/it]  

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/pennsylvania-state-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/pennsylvania-state-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D5B57D0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/pennsylvania-state-university


 28%|██▊       | 57/200 [11:39<13:15,  5.56s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-sydney: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-sydney (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DDFFB90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-sydney


 32%|███▏      | 64/200 [12:31<10:36,  4.68s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/mcmaster-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/mcmaster-university (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6D81A910>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/mcmaster-university
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/mcmaster-univ

 32%|███▎      | 65/200 [13:12<35:00, 15.56s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-bristol: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-bristol (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DD70450>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-bristol
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/univers

 33%|███▎      | 66/200 [13:56<53:55, 24.15s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/katholieke-universiteit-leuven: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/katholieke-universiteit-leuven (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DDADC10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/katholieke-universiteit-leuven


 34%|███▎      | 67/200 [14:26<57:19, 25.86s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/utrecht-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/utrecht-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DDFC6D0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/utrecht-university


 36%|███▌      | 72/200 [15:11<19:58,  9.37s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/universitat-heidelberg: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/universitat-heidelberg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DDC9C10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/universitat-heidelberg


 42%|████▏     | 84/200 [16:22<07:38,  3.95s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-st-andrews: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-st-andrews (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6D8192D0>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-st-andrews


 42%|████▎     | 85/200 [16:39<14:57,  7.81s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-california-irvine: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-california-irvine (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DEF0950>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-california-irvine
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/wor

 44%|████▍     | 88/200 [17:55<27:27, 14.71s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-notre-dame: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-notre-dame (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6E14A850>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-notre-dame
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institutio

 46%|████▋     | 93/200 [18:53<15:16,  8.56s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-maryland-college-park: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-maryland-college-park (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D738C90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-maryland-college-park


 48%|████▊     | 95/200 [19:34<22:46, 13.01s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/michigan-state-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/michigan-state-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D68E190>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/michigan-state-university


 48%|████▊     | 97/200 [20:07<23:36, 13.76s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/purdue-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/purdue-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D5B73D0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/purdue-university


 50%|████▉     | 99/200 [20:41<23:45, 14.11s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-alberta: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-alberta (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D93CA90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-alberta


 53%|█████▎    | 106/200 [21:33<08:42,  5.56s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/royal-holloway-university-of-london: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/royal-holloway-university-of-london (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DF8EB90>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/royal-holloway-university-of-london


 54%|█████▎    | 107/200 [21:45<11:27,  7.39s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/tokyo-institute-of-technology: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/tokyo-institute-of-technology (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6D5B58D0>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/tokyo-institute-of-technology
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ran

 56%|█████▌    | 112/200 [22:43<10:35,  7.22s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-utah: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-utah (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DEAD1D0>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-utah
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-ut

 57%|█████▋    | 114/200 [23:32<20:22, 14.22s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/eindhoven-university-of-technology: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/eindhoven-university-of-technology (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DE1F9D0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/eindhoven-university-of-technology
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/eindhoven-university-of-technology: HTTPSConn

 62%|██████▏   | 123/200 [25:04<06:49,  5.32s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/seoul-national-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/seoul-national-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DCB6D50>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/seoul-national-university


 62%|██████▏   | 124/200 [25:33<16:04, 12.69s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-florida: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-florida (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D78C090>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-florida
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-florida: HTTPSConnectionPool(host='web.archive.org', port=443): Max re

 66%|██████▋   | 133/200 [27:04<05:43,  5.13s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-groningen: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-groningen (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D6C5A10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-groningen


 67%|██████▋   | 134/200 [27:34<13:44, 12.50s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/george-washington-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/george-washington-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E06BBD0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/george-washington-university
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/george-washington-university: HTTPSConnectionPool(host='web.arc

 72%|███████▏  | 143/200 [28:35<04:33,  4.81s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/rensselaer-polytechnic-institute: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/rensselaer-polytechnic-institute (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DAC5D10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/rensselaer-polytechnic-institute
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/rensselaer-polytechnic-institute: HTTPSConnectionPo

 74%|███████▍  | 148/200 [30:11<08:11,  9.44s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/birkbeck-university-of-london: HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=30)
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/birkbeck-university-of-london


 76%|███████▋  | 153/200 [31:06<05:40,  7.24s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/yeshiva-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/yeshiva-university (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DA16FD0>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/yeshiva-university


 77%|███████▋  | 154/200 [31:20<07:07,  9.30s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/national-taiwan-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/national-taiwan-university (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6D5F4D10>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/national-taiwan-university


 78%|███████▊  | 156/200 [31:36<05:55,  8.09s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/erasmus-university-rotterdam: HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=30)
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/erasmus-university-rotterdam


 80%|███████▉  | 159/200 [32:23<07:12, 10.54s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/radboud-university-nijmegen: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/radboud-university-nijmegen (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D9E1210>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/radboud-university-nijmegen


 82%|████████▏ | 163/200 [33:10<06:04,  9.84s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/texas-aampm-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/texas-aampm-university (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6D619150>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/texas-aampm-university


 84%|████████▎ | 167/200 [33:37<03:30,  6.37s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/rwth-aachen-university: HTTPSConnectionPool(host='web.archive.org', port=443): Read timed out. (read timeout=30)
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/rwth-aachen-university


 86%|████████▌ | 172/200 [34:34<03:16,  7.03s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/queens-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/queens-university (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DD0E890>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/queens-university


 86%|████████▋ | 173/200 [34:45<03:45,  8.34s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-auckland: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-auckland (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6D8B0850>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-auckland


 88%|████████▊ | 175/200 [35:01<03:16,  7.84s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-dundee: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-dundee (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DA02C90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-dundee
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-dundee: HTTPSConnectionPool(host='web.archive.org', port=443): Max retrie

 92%|█████████▏| 183/200 [36:49<01:37,  5.74s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/iowa-state-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/iowa-state-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DB27C90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/iowa-state-university


 94%|█████████▍| 188/200 [37:34<01:10,  5.90s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-western-australia: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-western-australia (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DC09090>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-western-australia


 96%|█████████▌| 192/200 [38:13<00:53,  6.67s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/city-university-of-hong-kong: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/city-university-of-hong-kong (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E1CBB50>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/city-university-of-hong-kong
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/city-university-of-hong-kong: HTTPSConnectionPool(host='web.arc

 98%|█████████▊| 195/200 [39:20<01:05, 13.04s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/karlsruhe-institute-of-technology: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/karlsruhe-institute-of-technology (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DD94F50>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/karlsruhe-institute-of-technology
 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/karlsruhe-institute-of-technology: HTTPSConnecti

100%|█████████▉| 199/200 [40:42<00:12, 12.87s/it]

 Error scraping https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-twente: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-twente (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E049DD0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20121006035856/http://www.timeshighereducation.co.uk/world-university-rankings/2011-12/world-ranking/institution/university-of-twente


100%|██████████| 200/200 [41:12<00:00, 12.36s/it]

--- Scraping Complete ---
Success : 196
Failed : 4





In [None]:
# Persist the successfully scraped 2011-2012 records in both raw formats (CSV + Parquet).
raw_csv_path_2012 = "DATA/RAW/CSV/the_university_corpus_2011-2012.csv"
raw_parquet_path_2012 = "DATA/RAW/PARQUET/the_university_corpus_2011-2012.parquet"

df_success_2012 = pd.DataFrame(universities_2012_data_success)

# 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the CSV file.
df_success_2012.to_csv(raw_csv_path_2012, index=False, encoding='utf-8-sig')
df_success_2012.to_parquet(raw_parquet_path_2012, index=False)

print("File saved: the_university_corpus_2011-2012.csv")

File saved: the_university_corpus_2011-2012.csv


In [None]:
# Load the raw 2011-2012 corpus, drop the row(s) ranked 192 and write the cleaned
# result to the CLEAN folders (Parquet + CSV).
# NOTE(review): the reason for excluding rank 192 is not documented in this cell — confirm.
filename = "DATA/RAW/CSV/the_university_corpus_2011-2012.csv"

try:
    df = pd.read_csv(filename)
    print(f"File '{filename}' read. Number of universities: {len(df)}")

    # Cast to str so the comparison does not depend on the dtype pandas inferred for `rank`.
    index_to_drop = df[df['rank'].astype(str) == "192"].index

    if not index_to_drop.empty:
        # Reassign rather than mutate in place so re-running the cell stays predictable.
        df = df.drop(index_to_drop)
        print(f"Removed {len(index_to_drop)} row(s) with rank 192")
    else:
        print("No row with rank 192 found")

    df.to_parquet("DATA/CLEAN/PARQUET/the_university_corpus_2011-2012.parquet", index=False)
    # Fix: the CSV export used to be written with a ".parquet" extension.
    df.to_csv("DATA/CLEAN/CSV/the_university_corpus_2011-2012.csv", index=False)

except FileNotFoundError:
    print(f"File '{filename}' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

File 'DATA/CSV/the_university_corpus_2011-2012.csv' read. Number of universities: 196
Removed 1 row(s) with rank 192


## Part 1.4: Web scraping Times Higher Education (THE) World University Rankings 2021


In [31]:
# Parse the locally saved THE "World University Rankings 2021" page into university records.
universities_2021 = get_universities(
    "DATA/HTML/2025_World University Rankings 2021 _ Times Higher Education (THE).html"
)

Number of universities found: 1526

--- Available keys ---
['rank_order', 'rank', 'name', 'scores_overall', 'scores_overall_rank', 'scores_teaching', 'scores_teaching_rank', 'scores_research', 'scores_research_rank', 'scores_citations', 'scores_citations_rank', 'scores_industry_income', 'scores_industry_income_rank', 'scores_international_outlook', 'scores_international_outlook_rank', 'iid', 'record_type', 'member_level', 'url', 'nid', 'location', 'stats_number_students', 'stats_student_staff_ratio', 'stats_pc_intl_students', 'stats_female_male_ratio', 'aliases', 'closed', 'unaccredited', 'disabled', 'apply_link', 'cta_button', 'logo_url', 'enhanced']

--- FILTERED DATA OF THE FIRST UNIVERSITY ---
{
    "rank": "1",
    "name": "University of Oxford",
    "url": "/world-university-rankings/university-oxford",
    "location": "United Kingdom"
}


Although 1,526 universities were found in the 2021 ranking, the 2011-2012 sample contains fewer than 200 universities. To keep the two years comparable, we narrow our scope to the Top 200: only the first 200 positions of the 2021 ranking are retained.

In [32]:
# Keep only the first 200 entries of the 2021 list.
# Assumes the list is already ordered by rank — TODO confirm against get_universities.
universities_2021 = universities_2021[0:200]

n_universities_2021 = len(universities_2021)
print(f"Number of universities extracted for 2021: {n_universities_2021}")

Number of universities extracted for 2021: 200


In [33]:
def get_university_urls_from_2021(filename):
    """Extract the profile URL of every university listed in a saved THE 2021 ranking page.

    Parameters
    ----------
    filename : str
        Path to a locally saved HTML copy of the ranking table.

    Returns
    -------
    list[str] | None
        Profile URLs in table order (relative links are prefixed with the Wayback
        Machine host). Returns None when the file cannot be read or parsed, or when
        the ranking table or its name cells are missing.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'html.parser')

        # The full class string is matched as-is, so this lookup depends on the exact
        # class order found in the saved page.
        data_table = soup.find("table", class_="table regWallVariants-processed wur-hash-processed wur-cols-processed wur-pagelen-processed dataTable no-footer rank-only stats usr-processed")
        if not data_table:
            print(f"Ranking table not found in file: {filename}")
            return None

        name_cells = data_table.find_all("td", class_="name namesearch")
        if not name_cells:
            print(f"No university name cells found in file: {filename}")
            return None

        base_url = "https://web.archive.org"
        university_links_2021 = []

        for td in name_cells:
            a_tag = td.find("a", class_="ranking-institution-title")
            if a_tag and 'href' in a_tag.attrs:
                raw_path = a_tag['href']

                # Absolute links are kept; relative ones are resolved against the archive host.
                full_url = raw_path if raw_path.startswith("http") else base_url + raw_path

                # Strip stray spaces and replace the "mp_/" marker in the path with "/".
                clean_url = full_url.replace(" ", "").replace("mp_/", "/")
                university_links_2021.append(clean_url)

        return university_links_2021

    except Exception as e:
        # Best effort: report the problem and let the caller decide what to do with None.
        print(f" Error scraping {filename}: {e}")
        return None
        

In [34]:
# Extract the profile URLs of the Top 200 universities from the saved 200-row ranking page.
list_of_url_2021 = get_university_urls_from_2021("DATA/HTML/2021_length_200_World University Rankings 2021 _ Times Higher Education (THE).html")

# The helper returns None on any failure; stop here with a clear message instead of
# letting len(None) raise an opaque TypeError.
if list_of_url_2021 is None:
    raise RuntimeError("Could not extract the 2021 university URLs; see the message printed above.")

print(f"Number of university URLs extracted for 2021: {len(list_of_url_2021)}")
print(list_of_url_2021[:5])


Number of university URLs extracted for 2021: 200
['https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-oxford', 'https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/stanford-university', 'https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/harvard-university', 'https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/california-institute-technology', 'https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/massachusetts-institute-technology']


In [38]:
def scrape_university_description_2021(url):
    """Scrape rank, name, country and description from an archived THE 2021 profile page.

    Parameters
    ----------
    url : str
        URL of the (archived) university profile page.

    Returns
    -------
    dict | None
        A dict with the keys "rank", "name", "description", "url" and "country".
        A field that cannot be located is filled with a "... not found" placeholder.
        Returns None when the request raises, the HTTP status is not 200, or the page
        has no "pane-content" container, so that the caller can retry.
    """
    try:
        response = requests.get(url, headers=get_random_headers(), timeout=30)
        if response.status_code != 200:
            # Report the status instead of failing silently, to ease diagnosis.
            print(f"Unexpected HTTP status {response.status_code} for URL: {url}")
            return None

        soup = BeautifulSoup(response.text, 'html.parser')

        name_tag = soup.find("h1", class_="institution-info__title")
        rank_tag = soup.find("span", class_="institution-info__ranking-number")

        name = name_tag.get_text(strip=True) if name_tag else "Name not found"
        rank = rank_tag.get_text(strip=True) if rank_tag else "Rank not found"

        panes = soup.find_all("div", class_="pane-content")
        if not panes:
            print(f"Description container not found for URL: {url}")
            return None

        # Fix: give `country` a default. It used to be assigned only when the country
        # block was found, so pages without it raised UnboundLocalError and were lost.
        country = "Country not found"
        # The country block is looked up in the first two panes.
        for pane in panes[:2]:
            potential_country = pane.find("div", class_="institution-info__country clearfix")
            if potential_country and potential_country.find("a"):
                country = potential_country.get_text(strip=True)
                break

        # The description is taken from the third pane, falling back to the fourth when
        # the third has no non-empty paragraph (the pane count differs between pages).
        paragraphs = []
        for pane in panes[2:4]:
            potential_p = pane.find_all("p")
            if any(p.get_text(strip=True) for p in potential_p):
                paragraphs = potential_p
                break

        if paragraphs:
            # Join the non-empty paragraphs, separated by a blank line.
            full_text = "\n\n".join([p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)])
        else:
            full_text = 'Description not found'

        # Single return point for the scraped record.
        return {
            "rank": rank,
            "name": name,
            "description": full_text,
            "url": url,
            "country": country
        }

    except Exception as e:
        print(f" Error scraping {url}: {e}")
        return None

In [39]:

# Check the scraper on a single archived profile page (University of Oxford).
scrape_university_description_2021(
    "https://web.archive.org/web/20210609021249/"
    "https://www.timeshighereducation.com/world-university-rankings/university-oxford"
)

{'rank': '1',
 'name': 'University of Oxford',
 'description': 'The University of Oxford is the oldest university in the English-speaking world and the world’s second oldest surviving university. While its exact founding date is unknown, there is evidence that teaching took place as far back as 1096.\n\nLocated in and around Oxford’s medieval city centre, the university comprises 44 colleges and halls, and over 100 libraries, making it the largest library system in the UK.\n\nStudents number around 22,000 in total, just over half of whom are undergraduates while over 40 per cent are international, representing 140 countries between them.\n\nCalled the "city of dreaming spires" by Victorian poet, Matthew Arnold, Oxford has the youngest population of any city in England and Wales: nearly a quarter of its residents are university students, which gives Oxford a noticeable buzz.\n\nOxford has an alumni network of over 250,000 individuals, including more than 120 Olympic medallists, 26 Nobel

The Stanford University page raised an error when extracting the description, because its number of `pane-content` containers differs from the other pages.

In [40]:
# Check the scraper on the Stanford University page.
scrape_university_description_2021(
    "https://web.archive.org/web/20210527123333/"
    "https://www.timeshighereducation.com/world-university-rankings/stanford-university"
)

{'rank': '2',
 'name': 'Stanford University',
 'description': "Located in the heart of Silicon Valley,Stanford Universitywas founded in 1885 by Jane and Leland Stanford, “to promote the public welfare by exercising an influence in behalf of humanity and civilization.” Since opening in 1891, Stanford's faculty and students have worked to\xa0improve the health and wellbeing\xa0of people around the world through\xa0the\xa0discovery and application of knowledge. Breakthroughs at Stanford\xa0include the first successful\xa0heart-lung transplant, the debut of the computer mouse, and the development of\xa0digital music.\n\nSituated on 8,180 acres, Stanford is one of the largest campuses in the United States with 18 interdisciplinary research institutes and seven schools on a single campus: Graduate School of Business; School of Earth, Energy & Environmental Sciences; Graduate School of Education; School of Engineering; School of Humanities and Sciences; Law School; and School of Medicine.\n\n

In [41]:
# Scrape every 2021 profile URL with up to `retries` attempts each.
# Successful records and permanently failed URLs are collected in separate lists.
universities_2021_data_success = []
universities_2021_data_failed = []

retries = 3

print(f"Starting the extraction of {len(list_of_url_2021)} universities...")

for url in tqdm(list_of_url_2021):
    success = False
    for attempt in range(retries):
        try:
            data = scrape_university_description_2021(url)
            if data:
                # Copy the fields in a fixed key order.
                universities_2021_data_success.append(
                    {key: data[key] for key in ("rank", "name", "description", "country", "url")}
                )
                success = True
                break
            print(f"Attempt {attempt + 1} failed for URL: {url}")
        except Exception as e:
            print(f" Exception on attempt {attempt + 1} for URL: {url} - {e}")

        # Exponential backoff (5 s, 10 s, ...) before the next attempt only:
        # waiting after the final attempt would add delay without another retry.
        if attempt < retries - 1:
            time.sleep(2**attempt * 5)

    if not success:
        universities_2021_data_failed.append({
            "url": url,
            "error": "Failed after retries"
        })
    time.sleep(2)  # Safety pause between two universities

print("--- Scraping Complete ---")
print(f"Success : {len(universities_2021_data_success)}")
print(f"Failed : {len(universities_2021_data_failed)}")

Starting the extraction of 200 universities...


  5%|▌         | 10/200 [00:41<12:52,  4.07s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/imperial-college-london: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/imperial-college-london (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D616CD0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/imperial-college-london


  9%|▉         | 18/200 [01:44<16:36,  5.48s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/cornell-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/cornell-university (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6F4DB310>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/cornell-university
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/cornell-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducati

 10%|█         | 20/200 [02:32<39:27, 13.15s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/tsinghua-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/tsinghua-university (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6D6B3A90>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/tsinghua-university
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/tsinghua-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshigheredu

 11%|█         | 22/200 [03:40<1:02:15, 20.99s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/peking-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/peking-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DECFC10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/peking-university


 18%|█▊        | 37/200 [05:09<11:02,  4.07s/it]  

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/georgia-institute-technology: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/georgia-institute-technology (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DC09D50>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/georgia-institute-technology


 20%|█▉        | 39/200 [05:44<26:35,  9.91s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/mcgill-university: ('Connection aborted.', ConnectionResetError(10054, 'Une connexion existante a dû être fermée par l’hôte distant', None, 10054, None))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/mcgill-university


 20%|██        | 40/200 [06:15<43:19, 16.24s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-munich: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-munich (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DF9F910>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-munich
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-munich: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/202106021

 20%|██        | 41/200 [07:01<1:06:18, 25.02s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/heidelberg-university-0: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/heidelberg-university-0 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E2258D0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/heidelberg-university-0


 26%|██▌       | 51/200 [08:08<12:06,  4.88s/it]  

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-sydney: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-sydney (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D8D2790>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-sydney


 28%|██▊       | 57/200 [08:58<12:58,  5.45s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-north-carolina-chapel-hill: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-north-carolina-chapel-hill (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E049510>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-north-carolina-chapel-hill
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-north-carolina-chapel-hill: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducatio

 29%|██▉       | 58/200 [10:00<52:41, 22.26s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/australian-national-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/australian-national-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D934450>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/australian-national-university


 30%|██▉       | 59/200 [10:30<57:59, 24.68s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/seoul-national-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/seoul-national-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D915DD0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/seoul-national-university


 32%|███▏      | 64/200 [11:18<21:49,  9.63s/it]  

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/monash-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/monash-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D6B0550>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/monash-university


 37%|███▋      | 74/200 [12:30<12:43,  6.06s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/charite-universitatsmedizin-berlin: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/charite-universitatsmedizin-berlin (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E04C210>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/charite-universitatsmedizin-berlin
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/charite-universitatsmedizin-berlin: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-

 38%|███▊      | 77/200 [13:39<26:55, 13.14s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/delft-university-technology: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/delft-university-technology (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DD72810>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/delft-university-technology
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/delft-university-technology: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/202106021

 39%|███▉      | 78/200 [14:04<33:47, 16.62s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-tubingen: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-tubingen (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D6D0690>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-tubingen


 40%|████      | 81/200 [14:43<24:50, 12.52s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ohio-state-university-main-campus: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ohio-state-university-main-campus (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DD71E10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ohio-state-university-main-campus


 43%|████▎     | 86/200 [15:30<14:04,  7.40s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ecole-polytechnique: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ecole-polytechnique (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DF8FA90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ecole-polytechnique


 44%|████▎     | 87/200 [16:00<26:40, 14.17s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-science-and-technology-china: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-science-and-technology-china (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6F399210>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-science-and-technology-china


 46%|████▋     | 93/200 [17:16<23:21, 13.10s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/purdue-university-west-lafayette: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/purdue-university-west-lafayette (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E0D0A10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/purdue-university-west-lafayette


 48%|████▊     | 95/200 [17:51<24:37, 14.07s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/korea-advanced-institute-science-and-technology-kaist: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/korea-advanced-institute-science-and-technology-kaist (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E1F0B10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/korea-advanced-institute-science-and-technology-kaist


 51%|█████     | 102/200 [18:45<09:29,  5.81s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ghent-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ghent-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D6B2390>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ghent-university


 52%|█████▎    | 105/200 [19:31<16:52, 10.65s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/aarhus-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/aarhus-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E10B9D0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/aarhus-university


 55%|█████▍    | 109/200 [20:13<12:34,  8.29s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/queen-mary-university-london: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/queen-mary-university-london (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D85D910>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/queen-mary-university-london


 56%|█████▌    | 111/200 [20:49<17:54, 12.07s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/uppsala-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/uppsala-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DA71F10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/uppsala-university


 57%|█████▋    | 114/200 [21:24<14:34, 10.16s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/penn-state-main-campus: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/penn-state-main-campus (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DBE09D0>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/penn-state-main-campus
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/penn-state-main-campus: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.ti

 57%|█████▊    | 115/200 [22:07<28:15, 19.94s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/vrije-universiteit-amsterdam: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/vrije-universiteit-amsterdam (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DD857D0>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/vrije-universiteit-amsterdam


 58%|█████▊    | 116/200 [22:37<32:15, 23.04s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-virginia-main-campus: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-virginia-main-campus (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D48A910>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-virginia-main-campus


 64%|██████▍   | 129/200 [24:07<06:35,  5.57s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-gottingen: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-gottingen (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6F4FE210>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-gottingen


 65%|██████▌   | 130/200 [24:19<08:57,  7.67s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-alberta: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-alberta (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6E1FE250>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-alberta
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-alberta: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducati

 67%|██████▋   | 134/200 [24:57<07:52,  7.15s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-hamburg: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-hamburg (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DBBE9D0>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-hamburg
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-hamburg: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducati

 68%|██████▊   | 135/200 [25:43<20:36, 19.02s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/lancaster-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/lancaster-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DD70250>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/lancaster-university
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/lancaster-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/lancaster-university (Caused by Conn

 70%|███████   | 141/200 [27:45<10:35, 10.77s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-berlin: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-berlin (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DECD590>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-berlin


 71%|███████   | 142/200 [28:15<16:00, 16.55s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ulm-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ulm-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DD96B50>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ulm-university
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ulm-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/ulm-university (Caused by ConnectTimeoutError(<urllib3.conne

 72%|███████▏  | 143/200 [29:16<28:28, 29.98s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-wurzburg: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-wurzburg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DDABB10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-wurzburg
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-wurzburg: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-wurzburg (Caused by ConnectTi

 76%|███████▌  | 151/200 [30:46<05:43,  7.01s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-florida: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-florida (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E238890>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-florida


 76%|███████▌  | 152/200 [31:16<11:10, 13.96s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/pompeu-fabra-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/pompeu-fabra-university (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6E20AA90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/pompeu-fabra-university


 78%|███████▊  | 155/200 [31:55<08:31, 11.37s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/trinity-college-dublin: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/trinity-college-dublin (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6F5B2C10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/trinity-college-dublin


 80%|███████▉  | 159/200 [32:39<05:59,  8.77s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-leeds: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-leeds (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D3F1010>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-leeds


 81%|████████  | 162/200 [33:21<06:38, 10.48s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-liverpool: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-liverpool (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DD97D50>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-liverpool
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-liverpool: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighe

 82%|████████▏ | 164/200 [33:51<07:11, 11.97s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-vienna: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-vienna (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D6F2890>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-vienna


 82%|████████▎ | 165/200 [34:22<10:11, 17.47s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/rutgers-state-university-new-jersey-0: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/rutgers-state-university-new-jersey-0 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D6F3990>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/rutgers-state-university-new-jersey-0


 86%|████████▋ | 173/200 [35:21<02:31,  5.62s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-exeter: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-exeter (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DA57B90>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-exeter


 89%|████████▉ | 178/200 [36:07<02:14,  6.13s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/newcastle-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/newcastle-university (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6DE80050>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/newcastle-university
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/newcastle-university: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighe

 90%|█████████ | 181/200 [36:44<02:38,  8.35s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/autonomous-university-barcelona: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/autonomous-university-barcelona (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6DE6EA50>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/autonomous-university-barcelona


 92%|█████████▏| 184/200 [37:23<02:30,  9.43s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-canberra: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-canberra (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002DB6D868910>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-canberra
 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-canberra: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshigheredu

 94%|█████████▍| 188/200 [38:42<02:24, 12.00s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-denmark: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-denmark (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6D8D0450>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/technical-university-denmark


 99%|█████████▉| 198/200 [39:48<00:09,  4.59s/it]

 Error scraping https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-erlangen-nuremberg: HTTPSConnectionPool(host='web.archive.org', port=443): Max retries exceeded with url: /web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-erlangen-nuremberg (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x000002DB6F45DC10>, 'Connection to web.archive.org timed out. (connect timeout=30)'))
Attempt 1 failed for URL: https://web.archive.org/web/20210602120510/https://www.timeshighereducation.com/world-university-rankings/university-erlangen-nuremberg


100%|██████████| 200/200 [40:22<00:00, 12.11s/it]

--- Scraping Complete ---
Success : 195
Failed : 5





In [None]:
# Build the 2021 corpus table from the records collected in
# `universities_2021_data_success` and persist the raw snapshot as CSV and Parquet.
df_success_2021 = pd.DataFrame(universities_2021_data_success)

# Backward-compatible alias: the frame was originally named with the transposed
# year "2012"; keep the old name bound so any cell still using it keeps working.
df_success_2012 = df_success_2021

# 'utf-8-sig' writes a BOM at the start of the CSV file.
df_success_2021.to_csv("DATA/RAW/CSV/the_university_corpus_2021.csv", index=False, encoding='utf-8-sig')
df_success_2021.to_parquet("DATA/RAW/PARQUET/the_university_corpus_2021.parquet", index=False)

print("File saved: the_university_corpus_2021.csv")

File saved: the_university_corpus_2021.csv


In [None]:
# The row with rank 59 has "Description" = 'Description not found', so it is
# removed from the cleaned corpus.
RANK_TO_DROP = "59"

filename = "DATA/RAW/CSV/the_university_corpus_2021.csv"


def drop_rows_with_rank(frame, rank):
    """Return a copy of `frame` without the rows whose 'rank' equals `rank`.

    The 'rank' column is compared as text, so `rank` may be given as a string
    or as a number. The input frame is left unmodified and the surviving rows
    keep their original index labels.
    """
    rank_mask = frame['rank'].astype(str) == str(rank)
    return frame.loc[~rank_mask].copy()


try:
    df = pd.read_csv(filename)
    print(f"File '{filename}' read. Number of universities: {len(df)}")

    rows_before = len(df)
    df = drop_rows_with_rank(df, RANK_TO_DROP)
    removed_count = rows_before - len(df)

    if removed_count:
        print(f"Removed {removed_count} row(s) with rank {RANK_TO_DROP}")
    else:
        # Report the rank that was actually searched for.
        print(f"No row with rank {RANK_TO_DROP} found")

    # Persist the cleaned corpus in both formats used by the project.
    df.to_parquet("DATA/CLEAN/PARQUET/the_university_corpus_2021.parquet", index=False)
    df.to_csv("DATA/CLEAN/CSV/the_university_corpus_2021.csv", index=False)

except FileNotFoundError:
    print(f"File '{filename}' not found.")
except Exception as e:
    # Best-effort cell: report the problem instead of interrupting the notebook.
    print(f"An error occurred: {e}")

File 'DATA/CSV/the_university_corpus_2021.csv' read. Number of universities: 195
Removed 1 row(s) with rank 59
