# Background

In this homework we will extract interesting information from www.topuniversities.com and www.timeshighereducation.com, two platforms that maintain a global ranking of worldwide universities. This ranking is not offered as a downloadable dataset, so you will have to find a way to scrape the information we need! You are not allowed to manually download the entire ranking; instead, you have to understand how the server loads it in your browser. For this task, Postman with the Interceptor extension can help you greatly. We recommend that you watch this brief tutorial to quickly understand how to use it.

In [598]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

# 1. Obtain the 200 top-ranking universities in www.topuniversities.com

In [599]:
# Data file requested from topuniversities.com; its text is parsed as JSON
# further down in this notebook.
URL = 'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051.txt?_=1508168782318'
req = requests.get(URL)
# Stop here with a clear HTTP error instead of failing later while decoding
# an error page as JSON.
req.raise_for_status()

In [None]:
import json

try:
    # pandas >= 1.0 exposes json_normalize at the top level.
    from pandas import json_normalize
except ImportError:
    # Older pandas only provides it under pandas.io.json.
    from pandas.io.json import json_normalize

# The ranking rows live under the 'data' key of the payload; keep the first 200.
data = json.loads(req.text)
data = json_normalize(data['data'])[:200]

# .copy() makes df_top an independent frame, so the columns added to it later
# are written to df_top itself and not to a slice of `data`.
df_top = data[['rank_display','title','country','region','score','stars']].copy()
df_top.head()

Unnamed: 0,rank_display,title,country,region,score,stars
0,1,Massachusetts Institute of Technology (MIT),United States,North America,100.0,6
1,2,Stanford University,United States,North America,98.7,5
2,3,Harvard University,United States,North America,98.4,5
3,4,California Institute of Technology (Caltech),United States,North America,97.7,5
4,5,University of Cambridge,United Kingdom,Europe,95.6,5


In [None]:
# Columns filled from each university's detail page. They start as real NaN
# (not the string 'NaN') so they stay numeric and usable in arithmetic.
detail_columns = ['total_faculties', 'inter_faculties', 'total_students', 'inter_students']
for column in detail_columns:
    df_top[column] = float('nan')

# Iterate over the rows actually present instead of assuming exactly 200.
for i in range(len(df_top)):
    r = requests.get('https://www.topuniversities.com' + data.loc[i, 'url'])
    soup = BeautifulSoup(r.text, 'html.parser')
    numbers = soup.find_all('div', 'number')

    # Handling missing data: the values are only read when the page has
    # exactly 8 <div class="number"> elements; otherwise the row is reported
    # and left as NaN.
    if len(numbers) != 8:
        print('Missing data for '+df_top['title'].loc[i]+' in row '+str(i))
        continue

    # The first four numbers map, in order, onto the four detail columns.
    # A single .loc[row, column] assignment writes into df_top itself;
    # the chained df_top[col].loc[i] = ... form may write to a temporary.
    for column, tag in zip(detail_columns, numbers):
        df_top.loc[i, column] = float(tag.text.replace(',', ''))





Missing data for New York University (NYU) in row 51


In [None]:
# Handling missing data: values for row 189 are entered by hand.
# NOTE(review): the source of these figures is not shown here, and the scrape
# output printed in this notebook reports row 51 as missing - confirm the row.
# .loc[row, column] writes into df_top directly (no chained assignment), and
# the unknown value is a real NaN rather than the string 'Nan'.
df_top.loc[189, 'total_faculties'] = 423
df_top.loc[189, 'inter_faculties'] = float('nan')
df_top.loc[189, 'total_students'] = 4071
df_top.loc[189, 'inter_students'] = 47

df_top.head()

# Which are the best universities in terms of: (a) ratio between faculty members and students, (b) ratio of international students?

In [None]:
df_top_stat = df_top.copy()

# Coerce the scraped counts to numbers; any text placeholder (e.g. 'NaN')
# becomes a real NaN instead of breaking the divisions below.
count_columns = ['total_faculties', 'inter_faculties', 'total_students', 'inter_students']
df_top_stat[count_columns] = df_top_stat[count_columns].apply(pd.to_numeric, errors='coerce')

# Drop universities whose totals are missing, instead of dropping a
# hard-coded row position that may not be the incomplete one.
df_top_stat = df_top_stat.dropna(subset=['total_faculties', 'total_students'])

df_top_stat['ratio_faculty_student'] = df_top_stat.total_faculties / df_top_stat.total_students
df_top_stat['ratio_international_student'] = df_top_stat.inter_students / df_top_stat.total_students


Faculty members and students

In [None]:
# Five universities with the highest `ratio_faculty_student` (head() defaults to 5 rows).
df_top_stat.sort_values('ratio_faculty_student', ascending=False).head()

International students

In [None]:
# Five universities with the highest `ratio_international_student`.
df_top_stat.sort_values('ratio_international_student', ascending=False).head()

# Answer the previous question aggregating the data by (c) country and (d) region.

In [None]:
# Separate copy used as the input of the country/region aggregations below.
df_top_stat_agg = df_top_stat.copy()

Aggregate by country

In [None]:
# Sum the head-counts per country. The counts are coerced to numbers first so
# that text placeholders cannot end up in the sums.
count_columns = ['total_faculties', 'inter_faculties', 'total_students', 'inter_students']
country_counts = df_top_stat_agg[['country'] + count_columns].copy()
country_counts[count_columns] = country_counts[count_columns].apply(pd.to_numeric, errors='coerce')

df_top_stat_country = country_counts.groupby(by=['country']).sum()

# Recompute the ratios from the country totals. Summing the per-university
# ratios would grow with the number of universities in a country and would
# not be a ratio any more.
df_top_stat_country['ratio_faculty_student'] = (
    df_top_stat_country['total_faculties'] / df_top_stat_country['total_students'])
df_top_stat_country['ratio_international_student'] = (
    df_top_stat_country['inter_students'] / df_top_stat_country['total_students'])

In [None]:
# Top five countries by the `ratio_faculty_student` column of the country aggregate.
df_top_stat_country.sort_values('ratio_faculty_student', ascending=False).head()

In [None]:
# Top five countries by the `ratio_international_student` column of the country aggregate.
df_top_stat_country.sort_values('ratio_international_student', ascending=False).head()

Aggregate by region

In [None]:
# Sum the head-counts per region. The counts are coerced to numbers first so
# that text placeholders cannot end up in the sums.
count_columns = ['total_faculties', 'inter_faculties', 'total_students', 'inter_students']
region_counts = df_top_stat_agg[['region'] + count_columns].copy()
region_counts[count_columns] = region_counts[count_columns].apply(pd.to_numeric, errors='coerce')

df_top_stat_region = region_counts.groupby(by=['region']).sum()

# Recompute the ratios from the regional totals. Summing the per-university
# ratios would grow with the number of universities in a region and would
# not be a ratio any more.
df_top_stat_region['ratio_faculty_student'] = (
    df_top_stat_region['total_faculties'] / df_top_stat_region['total_students'])
df_top_stat_region['ratio_international_student'] = (
    df_top_stat_region['inter_students'] / df_top_stat_region['total_students'])

In [None]:
# Top five regions by the `ratio_faculty_student` column of the region aggregate.
df_top_stat_region.sort_values('ratio_faculty_student', ascending=False).head()

In [None]:
# Top five regions by the `ratio_international_student` column of the region aggregate.
df_top_stat_region.sort_values('ratio_international_student', ascending=False).head()

# Obtain the 200 top-ranking universities in www.timeshighereducation.com (ranking 2018)

In [None]:
# JSON file requested from timeshighereducation.com (2018 world ranking,
# going by the file name); it is parsed further down in this notebook.
URL = 'https://www.timeshighereducation.com//sites/default/files/the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'

req = requests.get(URL)
# Stop here with a clear HTTP error instead of failing later while decoding
# an error page as JSON.
req.raise_for_status()

In [None]:
import json

try:
    # pandas >= 1.0 exposes json_normalize at the top level.
    from pandas import json_normalize
except ImportError:
    # Older pandas only provides it under pandas.io.json.
    from pandas.io.json import json_normalize

# The ranking rows live under the 'data' key of the payload.
# NOTE(review): 201 rows are kept although the section title asks for the top
# 200 - presumably to include a tie at the cut-off; confirm.
data = json.loads(req.text)
data = json_normalize(data['data'])[:201]

# .copy() makes df_times an independent frame, so the columns added to it
# later are written to df_times itself and not to a slice of `data`.
df_times = data[['name', 'aliases', 'location', 'rank', 'stats_number_students', 'stats_pc_intl_students','stats_student_staff_ratio']].copy()
df_times.head()

In [None]:
# Student head-count: remove the ',' separators, then cast to int.
students_text = df_times['stats_number_students'].str.replace(',','')
df_times['nb_students'] = students_text.astype(int)

# International share: remove the '%' sign and turn the percentage into a fraction.
intl_text = df_times['stats_pc_intl_students'].str.replace('%','')
df_times['ratio_inter_students'] = intl_text.astype(float)/100

# The source column is a student/staff ratio; its inverse is the staff/student ratio.
students_per_staff = df_times['stats_student_staff_ratio'].astype(float)
df_times['staff_student_ratio'] = 1/students_per_staff

In [None]:
# Display the whole table, including the numeric columns derived above.
df_times

number of faculty members

In [None]:
import numpy as np

# Head-counts derived from the student total and the two ratios,
# rounded to whole numbers.
faculty_estimate = df_times['nb_students'] * df_times['staff_student_ratio']
df_times['nb_faculty_members'] = np.round(faculty_estimate).astype(int)

intl_estimate = df_times['nb_students'] * df_times['ratio_inter_students']
df_times['nb_inter_students'] = np.round(intl_estimate).astype(int)

# Keep only the rank, identity, head-count and ratio columns, in this order.
kept_columns = [
    'rank', 'name', 'location',
    'nb_faculty_members', 'nb_students', 'nb_inter_students',
    'staff_student_ratio', 'ratio_inter_students',
]
df_times = df_times[kept_columns]
df_times.head()

# Which are the best universities in terms of: (a) ratio between faculty members and students, (b) ratio of international students?


Faculty members and students

In [None]:
# Five universities with the highest `staff_student_ratio` (head() defaults to 5 rows).
df_times.sort_values('staff_student_ratio', ascending=False).head()

International students

In [None]:
# Five universities with the highest `ratio_inter_students`.
df_times.sort_values('ratio_inter_students', ascending=False).head()

# Answer the previous question aggregating the data by (c) country and (d) region.

In [None]:
# Separate copy used as the input of the aggregation by location below.
df_times_agg= df_times.copy()

In [None]:
# Sum the head-counts per location (the country column of this ranking).
count_columns = ['nb_faculty_members', 'nb_students', 'nb_inter_students']
df_times_country = df_times_agg[['location'] + count_columns].groupby(by=['location']).sum()

# Recompute the ratios from the summed head-counts. Summing the per-university
# ratios would grow with the number of universities in a location and would
# not be a ratio any more.
df_times_country['staff_student_ratio'] = (
    df_times_country['nb_faculty_members'] / df_times_country['nb_students'])
df_times_country['ratio_inter_students'] = (
    df_times_country['nb_inter_students'] / df_times_country['nb_students'])

In [None]:
# All locations, ordered by the aggregate's `staff_student_ratio` column, highest first.
df_times_country.sort_values('staff_student_ratio', ascending=False)

In [None]:
# All locations, ordered by the aggregate's `ratio_inter_students` column, highest first.
df_times_country.sort_values('ratio_inter_students', ascending=False)

# Merge

The idea is to rewrite the name of each university so that the two rankings use spellings that are as similar as possible:
 - Convert all letters to lower case
 - Translate important words (university, school, technical, ...)
 - Remove special characters
 - Remove prepositions
 - Remove parentheses and their content. Example: (UCB)


In [None]:
# Work on copies so that renaming columns and rewriting names for the merge
# leaves df_top and df_times unchanged.
df_top_tomerge = df_top.copy()
df_times_tomerge = df_times.copy()

In [None]:
def modify_tomerge(string):
    """Normalise university names so two rankings can be joined on them.

    The same steps are applied to every name: lower-casing, accent removal,
    removal of parenthesised content, translation of a few foreign words to
    English, removal of prepositions, and finally removal of punctuation and
    spaces.

    Parameters
    ----------
    string : pandas.Series
        University names. Despite the parameter name this is a Series of
        strings, not a single string; missing values stay missing.

    Returns
    -------
    pandas.Series
        The normalised names, with the same index as the input.
    """
    names = string.str.lower()

    # Accents are removed first so that the translations below also match
    # accented spellings (e.g. 'université', 'tecnológico').
    accents = str.maketrans({'é': 'e', 'ä': 'a', 'ã': 'a', 'ó': 'o', 'ö': 'o'})
    names = names.str.translate(accents)

    # Parenthesised content such as "(MIT)". [^)]* keeps each match inside a
    # single pair of parentheses, and regex=True is passed explicitly because
    # the default of Series.str.replace differs between pandas versions.
    names = names.str.replace(r'\([^)]*\)', '', regex=True)

    # Whole-word translations. Word boundaries keep one spelling from matching
    # inside another (e.g. 'universidad' inside 'universidade'), and
    # .str.replace is required here: Series.replace only substitutes values
    # that are equal to the whole pattern.
    translations = {
        'universite': 'university',
        'universitat': 'university',
        'universitaet': 'university',
        'universidad': 'university',
        'universiteit': 'university',
        'universidade': 'university',
        'universitari': 'university',
        'scuola': 'school',
        'technische': 'technical',
        'studi': 'studies',
        'estudios': 'studies',
        'freie': 'free',
        'tecnologico': 'technological',
    }
    for foreign, english in translations.items():
        names = names.str.replace(r'\b' + foreign + r'\b', english, regex=True)

    # Prepositions are removed only as whole words, so that words containing
    # them ('state', 'university', 'delft', ...) are left intact.
    names = names.str.replace(r'\b(?:of|the|at|de|y|di)\b', '', regex=True)

    # Remaining punctuation and all spaces.
    names = names.str.replace(r'[-&/, ]', '', regex=True)

    return names
   


In [None]:
# Give both tables a common 'name' column and distinct rank columns.
df_top_tomerge = df_top_tomerge.rename(columns={'title': 'name', 'rank_display':'rank_top'})
df_times_tomerge = df_times_tomerge.rename(columns={'rank': 'rank_times'})

# Normalise the names on both sides with the same function so that
# equivalent spellings compare equal.
df_times_tomerge['name'] = modify_tomerge(df_times_tomerge['name'])
df_top_tomerge['name'] = modify_tomerge(df_top_tomerge['name'])

# Join explicitly on the normalised name. Without `on`, merge joins on every
# column the two frames share, so an extra common column would silently
# change the result.
df = df_top_tomerge.merge(df_times_tomerge, how='inner', on='name')

df