In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import numpy as np
from difflib import SequenceMatcher

# Top Universities

In [2]:
# Download the raw ranking feeds published by both sites.
# A timeout prevents the notebook from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors here instead of as an obscure JSON
# decoding failure in a later cell.
topu = requests.get('https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051_indicators.txt', timeout=30)
topu.raise_for_status()
times = requests.get('https://www.timeshighereducation.com/sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json', timeout=30)
times.raise_for_status()

In [3]:
# Decode the QS feed as JSON; the scraping loop reads its entries from the 'data' list.
page_topu = topu.json()
# Site root, prepended to each university's relative href to fetch its detail page.
url_topu = 'https://www.topuniversities.com'

In [4]:
def _parse_count(soup, css_class):
    """Return the integer shown in the 'number' div of the ``css_class`` block.

    Whitespace and thousands separators are stripped before conversion.
    Returns None when the block, its 'number' div, or a purely numeric value
    is missing, so one incomplete detail page does not abort the whole scrape.
    """
    block = soup.find('div', class_=css_class)
    number = block.find('div', class_='number') if block else None
    if number is None:
        return None
    # get_text() also works when the value is wrapped in nested tags,
    # where `.string` would be None.
    digits = re.sub(r'[\s,]', '', number.get_text())
    return int(digits) if digits.isdigit() else None


ranking = []
# Slicing (rather than indexing with range(200)) cannot raise IndexError
# when the feed contains fewer than 200 entries.
for uni in page_topu['data'][:200]:
    # Tied ranks are published with a leading '=' (hence the replace).
    rank = int(uni['overall_rank'].replace('=', ''))

    # The 'uni' field is an HTML snippet whose <a> carries the name and detail-page link.
    uni_info = BeautifulSoup(uni['uni'], 'html.parser').find('a')
    uni_url = uni_info.get('href')

    subrequest = requests.get(url_topu + uni_url, timeout=30)
    soup = BeautifulSoup(subrequest.text, 'html.parser')

    ranking.append({
        'Name': uni_info.string,
        'Rank': rank,
        'Country': uni['location'],
        'Region': uni['region'],
        'Nb faculty members (international)': _parse_count(soup, 'inter faculty'),
        'Nb faculty members (total)': _parse_count(soup, 'total faculty'),
        'Nb students (international)': _parse_count(soup, 'total inter'),
        'Nb students (total)': _parse_count(soup, 'total student'),
    })

In [5]:
# One row per scraped QS university; preview the first rows.
ranking_topu = pd.DataFrame(ranking)
ranking_topu.head()

Unnamed: 0,Country,Name,Nb faculty members (international),Nb faculty members (total),Nb students (international),Nb students (total),Rank,Region
0,United States,Massachusetts Institute of Technology (MIT),1679.0,2982.0,3717.0,11067.0,1,North America
1,United States,Stanford University,2042.0,4285.0,3611.0,15878.0,2,North America
2,United States,Harvard University,1311.0,4350.0,5266.0,22429.0,3,North America
3,United States,California Institute of Technology (Caltech),350.0,953.0,647.0,2255.0,4,North America
4,United Kingdom,University of Cambridge,2278.0,5490.0,6699.0,18770.0,5,Europe


In [6]:
def get_ratio(dataframe, param):
    """Rank universities inside each ``param`` group by the two ratio columns.

    ``dataframe`` must contain the columns 'Name', 'Ratio staff/student',
    'Ratio intl/student' and the grouping column ``param``.

    Returns a pair ``(staff_by_param, student_by_param)``; each frame keeps
    only 'Name' and the relevant ratio column, with rows grouped by ``param``
    and sorted by descending ratio inside every group.
    """
    def ranked(ratio_column):
        # Sort every group independently, then keep only the columns of interest.
        per_group = dataframe.groupby(param).apply(
            lambda group: group.sort_values(ratio_column, ascending=False)
        )
        return per_group[['Name', ratio_column]]

    staff_by_param = ranked('Ratio staff/student')
    student_by_param = ranked('Ratio intl/student')
    return staff_by_param, student_by_param

def ranking_by_ratios(dataframe):
    """Rank universities by staff/student and international-student ratios.

    Two derived columns are added to a copy of ``dataframe`` (the input is
    not modified):

    * 'Ratio staff/student' = 'Nb faculty members (total)' / 'Nb students (total)'
    * 'Ratio intl/student'  = 'Nb students (international)' / 'Nb students (total)'

    The augmented frame is then ranked per 'Country' and per 'Region' with
    ``get_ratio``.

    Returns the tuple
    ``(staff_by_country, staff_by_region, student_by_country, student_by_region)``.
    """
    # Cast the shared denominator once; missing totals become NaN and propagate.
    students_total = dataframe['Nb students (total)'].astype(float)

    # assign() names the new columns directly. Unlike concat + positional
    # renaming, it never duplicates a ratio column that is already present
    # and never coerces the existing column labels.
    ratios = dataframe.assign(**{
        'Ratio staff/student': dataframe['Nb faculty members (total)'] / students_total,
        'Ratio intl/student': dataframe['Nb students (international)'] / students_total,
    })

    staff_by_country, student_by_country = get_ratio(ratios, 'Country')
    staff_by_region, student_by_region = get_ratio(ratios, 'Region')

    return staff_by_country, staff_by_region, student_by_country, student_by_region
    

In [7]:
# Build the four QS tables: staff/student and international-student ratios,
# each ranked within country and within region.
topu_staff_country, topu_staff_region, topu_student_country, topu_student_region = ranking_by_ratios(ranking_topu)

In [8]:
# QS: staff/student ratio, ranked within each country (first rows).
topu_staff_country.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Ratio staff/student
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Argentina,74,Universidad de Buenos Aires (UBA),0.134267
Australia,19,The Australian National University,0.110788
Australia,47,The University of Queensland,0.08422
Australia,109,The University of Adelaide,0.081403
Australia,41,The University of Melbourne,0.078493


In [9]:
# QS: staff/student ratio, ranked within each region (first rows).
topu_staff_region.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Ratio staff/student
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,190,University of Cape Town,0.08845
Asia,71,Pohang University of Science And Technology (P...,0.213025
Asia,76,Tohoku University,0.191339
Asia,35,Kyoto University,0.176722
Asia,172,King Fahd University of Petroleum & Minerals,0.175828


In [10]:
# QS: international-student ratio, ranked within each country (first rows).
topu_student_country.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Ratio intl/student
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Argentina,74,Universidad de Buenos Aires (UBA),0.221658
Australia,41,The University of Melbourne,0.427434
Australia,19,The Australian National University,0.384365
Australia,109,The University of Adelaide,0.382015
Australia,49,The University of Sydney,0.36484


In [11]:
# QS: international-student ratio, ranked within each region (first rows).
topu_student_region.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Ratio intl/student
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,190,University of Cape Town,0.169703
Asia,25,The University of Hong Kong,0.407144
Asia,48,City University of Hong Kong,0.354221
Asia,10,"Nanyang Technological University, Singapore (NTU)",0.281724
Asia,29,The Hong Kong University of Science and Techno...,0.281542


# Times Higher Education

In [12]:
# Decode the Times Higher Education feed as JSON; its entries live under the 'data' key.
page_times = times.json()

In [13]:
# THE site root, prepended to each university's relative 'url' field when fetching its page.
url = 'https://www.timeshighereducation.com'

In [14]:
def _keystat_text(soup, css_class):
    """Return the text of the node immediately preceding the ``css_class`` div.

    The value is looked up as the first previous sibling of the label div.
    A missing div, sibling or text yields None instead of raising, matching
    the defensive parsing of the QS scraping loop.
    """
    label = soup.find('div', class_=css_class)
    value = label.previous_sibling if label is not None else None
    text = value.string if value is not None else None
    return text.strip() if text else None


ranking = []
# Slicing (rather than indexing with range(200)) cannot raise IndexError
# when the feed contains fewer than 200 entries.
for uni in page_times['data'][:200]:
    # Tied ranks are published with a leading '=' (hence the replace).
    rank = int(uni['rank'].replace('=', ''))

    subrequest = requests.get(url + uni['url'], timeout=30)
    soup = BeautifulSoup(subrequest.text, 'html.parser')

    # NOTE(review): og:locality holds a city / postal locality, not a world
    # region like the QS 'Region' column, so the two 'Region' columns are
    # not comparable.
    meta = soup.find('meta', property='og:locality')
    region = meta['content'] if meta else None

    pct_text = _keystat_text(soup, 'keystats pc_intl_students')
    ratio_text = _keystat_text(soup, 'keystats student_staff_ratio')
    students_text = _keystat_text(soup, 'keystats number_students')

    percentage_int_students = float(pct_text.replace('%', '')) / 100.0 if pct_text else None
    ratio_teacher_student = float(ratio_text) if ratio_text else None
    nb_students = int(students_text.replace(',', '')) if students_text else None

    # THE only publishes a percentage and a students-per-staff ratio, so the
    # head counts are derived (and truncated to integers).
    nb_students_international = (
        int(nb_students * percentage_int_students)
        if nb_students is not None and percentage_int_students is not None
        else None
    )
    # The truthiness test also rejects a ratio of 0, avoiding ZeroDivisionError.
    nb_staff = (
        int(nb_students / ratio_teacher_student)
        if nb_students is not None and ratio_teacher_student
        else None
    )

    ranking.append({
        'Name': uni['name'],
        'Rank': rank,
        'Country': uni['location'],
        'Region': region,
        'Nb faculty members (international)': None,
        'Nb faculty members (total)': nb_staff,
        'Nb students (international)': nb_students_international,
        'Nb students (total)': nb_students
    })

In [15]:
# One row per scraped THE university; preview the first rows.
ranking_times = pd.DataFrame(ranking)
ranking_times.head()

Unnamed: 0,Country,Name,Nb faculty members (international),Nb faculty members (total),Nb students (international),Nb students (total),Rank,Region
0,United Kingdom,University of Oxford,,1822,7755,20409,1,Oxford
1,United Kingdom,University of Cambridge,,1687,6436,18389,2,Cambridge
2,United States,California Institute of Technology,,339,596,2209,3,Pasadena
3,United States,Stanford University,,2112,3485,15845,3,Stanford
4,United States,Massachusetts Institute of Technology,,1284,3800,11177,5,Cambridge


In [16]:
# Build the four THE tables: staff/student and international-student ratios,
# each ranked within country and within 'Region'.
# NOTE: for THE, 'Region' is filled from the page's og:locality meta tag.
times_staff_country, times_staff_region, times_student_country, times_student_region = ranking_by_ratios(ranking_times)

In [17]:
# THE: staff/student ratio, ranked within each country (first rows).
times_staff_country.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Ratio staff/student
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,47,Australian National University,0.051795
Australia,60,University of Sydney,0.046282
Australia,111,University of Western Australia,0.04583
Australia,31,University of Melbourne,0.037587
Australia,84,University of New South Wales,0.03745


In [18]:
# THE: staff/student ratio, ranked within each 'Region' value (first rows).
times_staff_region.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Ratio staff/student
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000 GG Amsterdam,58,University of Amsterdam,0.081294
10117 Berlin,126,Charité - Universitätsmedizin Berlin,0.06169
3000 DR Rotterdam,71,Erasmus University Rotterdam,0.049712
3000 Leuven,46,KU Leuven,0.026952
6525 HP Nijmegen,121,Radboud University Nijmegen,0.05233


In [19]:
# THE: international-student ratio, ranked within each country (first rows).
times_student_country.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Ratio intl/student
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Australia,31,University of Melbourne,0.399991
Australia,47,Australian National University,0.349994
Australia,84,University of New South Wales,0.339995
Australia,60,University of Sydney,0.319978
Australia,80,Monash University,0.279981


In [20]:
# THE: international-student ratio, ranked within each 'Region' value (first rows).
times_student_region.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Ratio intl/student
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1000 GG Amsterdam,58,University of Amsterdam,0.119983
10117 Berlin,126,Charité - Universitätsmedizin Berlin,0.179918
3000 DR Rotterdam,71,Erasmus University Rotterdam,0.199982
3000 Leuven,46,KU Leuven,0.149982
6525 HP Nijmegen,121,Radboud University Nijmegen,0.11995


## Merging

In [21]:
def similar(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b``, a float in [0, 1]."""
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

def replace_l(string):
    """Strip filler tokens and short parenthesised tags from ``string``.

    Every filler is removed in turn, wherever it occurs as a plain substring
    (spaces go first, so later fillers also match across former word
    boundaries). Afterwards each parenthesised run of 1 to 11 characters from
    [a-zA-Z_0-9] is deleted. Matching is case-sensitive and all fillers are
    lower-case.
    """
    fillers = (' ', 'university', 'université', 'the', 'of', 'universität', 'universitaet', 'california')
    stripped = string
    for filler in fillers:
        stripped = stripped.replace(filler, '')
    return re.sub(r'\([a-zA-Z_0-9]{1,11}\)', '', stripped)

def sanitize(name, other):
    """Normalise two university names so they can be compared fuzzily.

    For each of the phrases 'Institute of Technology' and 'Science and
    Technology': when both names contain it and ``name`` is not already a
    substring of ``other``, the phrase is dropped from both, so that the
    shared boilerplate does not inflate the similarity score. Both names are
    then lower-cased and passed through ``replace_l``.

    Returns the pair ``(clean_name, clean_other)``.
    """
    for phrase in ('Institute of Technology', 'Science and Technology'):
        shared = phrase in name and phrase in other
        if name not in other and shared:
            name, other = name.replace(phrase, ''), other.replace(phrase, '')

    return replace_l(name.lower()), replace_l(other.lower())

def update_name(name, candidates=None, threshold=0.81):
    """Return the candidate name that best matches ``name``.

    Parameters
    ----------
    name : str
        University name to harmonise.
    candidates : iterable of str, optional
        Reference names to match against; defaults to ``ranking_topu['Name']``.
    threshold : float, optional
        Minimum similarity (exclusive) of the sanitized names for a candidate
        to be accepted.

    Returns
    -------
    str
        The best-scoring candidate above ``threshold``, or ``name`` unchanged
        when no candidate qualifies. Each accepted match is printed as
        ``"<name> // <match>"`` so the pairing can be reviewed.

    The best match is returned rather than the first one above the threshold,
    so an early near-miss can no longer shadow a later exact match.
    """
    if candidates is None:
        candidates = ranking_topu['Name']

    best_match, best_score = None, threshold
    for elem in candidates:
        clean_name, clean_elem = sanitize(name, elem)
        score = similar(clean_name, clean_elem)
        # Strict '>' enforces the threshold and keeps the earliest candidate on ties.
        if score > best_score:
            best_match, best_score = elem, score

    if best_match is None:
        return name
    print("{} // {}".format(name, best_match))
    return best_match


In [22]:
# Rename THE universities to their QS spelling so both tables can be merged on 'Name'.
# NOTE: this overwrites ranking_times['Name'] in place, so re-running the cell
# matches the already-renamed values again.
ranking_times['Name'] = ranking_times['Name'].apply(update_name)
# TODO(review): leftover manual override for a name the fuzzy matcher presumably
# misses; if it is still needed, apply it with .loc on the 'Name' column.
#ranking_times['Name']['LMU Munich']# = 'Ludwig-Maximilians-Universität München'

University of Oxford // University of Oxford
University of Cambridge // University of Cambridge
California Institute of Technology // California Institute of Technology (Caltech)
Stanford University // Stanford University
Massachusetts Institute of Technology // Massachusetts Institute of Technology (MIT) 
Harvard University // Harvard University
Princeton University // Princeton University
Imperial College London // Imperial College London
University of Chicago // University of Chicago
ETH Zurich – Swiss Federal Institute of Technology Zurich // ETH Zurich - Swiss Federal Institute of Technology
University of Pennsylvania // University of Pennsylvania
Yale University // Yale University
Johns Hopkins University // Johns Hopkins University
Columbia University // Columbia University
University of California, Los Angeles // University of California, Los Angeles (UCLA)
University College London // UCL (University College London)
Duke University // Duke University
University of California, 

Shanghai Jiao Tong University // Shanghai Jiao Tong University
Aalto University // Aalto University
University of Auckland // The University of Auckland
Lomonosov Moscow State University // Lomonosov Moscow State University
National Taiwan University // National Taiwan University (NTU)


In [23]:
# Outer-join both rankings on the harmonised name: QS columns get the '_x'
# suffix, THE columns '_y'. Every value missing on either side (text columns
# included) is replaced by the sentinel -1, and 'Name' becomes the index.
merged = (
    pd.merge(ranking_topu, ranking_times.drop('Region', axis=1), on='Name', how='outer')
    .fillna(-1)
    .set_index('Name')
)

In [24]:
# Preview only the first rows instead of dumping the whole merged table.
merged.head(10)

Unnamed: 0_level_0,Country_x,Nb faculty members (international)_x,Nb faculty members (total)_x,Nb students (international)_x,Nb students (total)_x,Rank_x,Region,Country_y,Nb faculty members (international)_y,Nb faculty members (total)_y,Nb students (international)_y,Nb students (total)_y,Rank_y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Massachusetts Institute of Technology (MIT),United States,1679.0,2982.0,3717.0,11067.0,1.0,North America,United States,-1,1284.0,3800.0,11177.0,5.0
Stanford University,United States,2042.0,4285.0,3611.0,15878.0,2.0,North America,United States,-1,2112.0,3485.0,15845.0,3.0
Harvard University,United States,1311.0,4350.0,5266.0,22429.0,3.0,North America,United States,-1,2283.0,5284.0,20326.0,6.0
California Institute of Technology (Caltech),United States,350.0,953.0,647.0,2255.0,4.0,North America,United States,-1,339.0,596.0,2209.0,3.0
University of Cambridge,United Kingdom,2278.0,5490.0,6699.0,18770.0,5.0,Europe,United Kingdom,-1,1687.0,6436.0,18389.0,2.0
University of Oxford,United Kingdom,2964.0,6750.0,7353.0,19720.0,6.0,Europe,United Kingdom,-1,1822.0,7755.0,20409.0,1.0
UCL (University College London),United Kingdom,2554.0,6345.0,14854.0,31080.0,7.0,Europe,United Kingdom,-1,2886.0,14848.0,30304.0,16.0
Imperial College London,United Kingdom,2071.0,3930.0,8746.0,16090.0,8.0,Europe,United Kingdom,-1,1390.0,8721.0,15857.0,8.0
University of Chicago,United States,635.0,2449.0,3379.0,13557.0,9.0,North America,United States,-1,2181.0,3381.0,13525.0,9.0
ETH Zurich - Swiss Federal Institute of Technology,Switzerland,1886.0,2477.0,7563.0,19815.0,10.0,Europe,Switzerland,-1,1317.0,7308.0,19233.0,10.0


In [25]:
# Universities found in only one ranking: the side that is missing carries rank -1.
merged[merged['Rank_x'].eq(-1) | merged['Rank_y'].eq(-1)]

Unnamed: 0_level_0,Country_x,Nb faculty members (international)_x,Nb faculty members (total)_x,Nb students (international)_x,Nb students (total)_x,Rank_x,Region,Country_y,Nb faculty members (international)_y,Nb faculty members (total)_y,Nb students (international)_y,Nb students (total)_y,Rank_y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
"Ecole normale supérieure, Paris",France,75.0,178.0,374.0,1907.0,43.0,Europe,-1,-1,-1.0,-1.0,-1.0,-1.0
Tokyo Institute of Technology,Japan,191.0,1563.0,1071.0,9832.0,56.0,Asia,-1,-1,-1.0,-1.0,-1.0,-1.0
Osaka University,Japan,296.0,2814.0,2106.0,22760.0,63.0,Asia,-1,-1,-1.0,-1.0,-1.0,-1.0
Ludwig-Maximilians-Universität München,Germany,674.0,3274.0,5084.0,35055.0,66.0,Europe,-1,-1,-1.0,-1.0,-1.0,-1.0
Ruprecht-Karls-Universität Heidelberg,Germany,756.0,3908.0,5298.0,28852.0,68.0,Europe,-1,-1,-1.0,-1.0,-1.0,-1.0
Universidad de Buenos Aires (UBA),Argentina,3165.0,16421.0,27109.0,122301.0,75.0,Latin America,-1,-1,-1.0,-1.0,-1.0,-1.0
Tohoku University,Japan,264.0,3411.0,1604.0,17827.0,76.0,Asia,-1,-1,-1.0,-1.0,-1.0,-1.0
Korea University,South Korea,339.0,3745.0,3638.0,25892.0,90.0,Asia,-1,-1,-1.0,-1.0,-1.0,-1.0
Pennsylvania State University,United States,211.0,3509.0,7081.0,45612.0,93.0,North America,-1,-1,-1.0,-1.0,-1.0,-1.0
Yonsei University,South Korea,290.0,3621.0,3379.0,25903.0,106.0,Asia,-1,-1,-1.0,-1.0,-1.0,-1.0


In [26]:
# Combined score (lower is better): (1 + |rank gap|) * best of the two ranks.
# Rows missing from one ranking hold the -1 sentinel, so their score is negative.
merged['delta ranking'] = (
    (merged['Rank_x'] - merged['Rank_y']).abs().add(1)
    * merged[['Rank_x', 'Rank_y']].min(axis=1)
)

In [27]:
# Keep only positive scores (universities present in both rankings), order them
# ascending and convert the scores into rank positions (ties share an average rank).
best_uni = (
    merged.loc[merged['delta ranking'] > 0, 'delta ranking']
    .sort_values()
    .rank()
)

In [28]:
# Show the combined ranking from the top down to, and including, EPFL (label-based slice).
best_uni.loc[:'Ecole Polytechnique Fédérale de Lausanne (EPFL)']

Name
Stanford University                                        1.0
Massachusetts Institute of Technology (MIT)                2.0
California Institute of Technology (Caltech)               3.5
University of Oxford                                       3.5
University of Cambridge                                    5.5
Imperial College London                                    5.5
University of Chicago                                      7.0
ETH Zurich - Swiss Federal Institute of Technology         8.0
Harvard University                                         9.0
University of Michigan                                    10.0
Princeton University                                      11.0
Yale University                                           12.0
Johns Hopkins University                                  13.0
UCL (University College London)                           14.5
Columbia University                                       14.5
Cornell University                                

In [29]:
# Count, per column, the universities that appear in both rankings (no -1 sentinel rank).
merged[merged['Rank_x'].ne(-1) & merged['Rank_y'].ne(-1)].count()

Country_x                               152
Nb faculty members (international)_x    152
Nb faculty members (total)_x            152
Nb students (international)_x           152
Nb students (total)_x                   152
Rank_x                                  152
Region                                  152
Country_y                               152
Nb faculty members (international)_y    152
Nb faculty members (total)_y            152
Nb students (international)_y           152
Nb students (total)_y                   152
Rank_y                                  152
delta ranking                           152
dtype: int64