In [1]:
import requests
import pandas as pd
import numpy as np
from IPython.display import display
from bs4 import BeautifulSoup
import re
from requests_futures.sessions import FuturesSession
import datetime

In [2]:
# Site root; the relative links scraped from the ranking table are appended to it.
TOP_UNI_URL = 'https://www.topuniversities.com'
# Taken before any work is done; the last cell subtracts it to report the total runtime.
start = datetime.datetime.now()

In [3]:
# Download the ranking table; the response is decoded as JSON and the rows live under 'data'.
r = requests.get(
    'https://www.topuniversities.com/sites/default/files/qs-rankings-data/357051_indicators.txt',
    timeout=30,  # without a timeout a stalled connection would hang the notebook indefinitely
)
# Fail here on a 4xx/5xx response instead of with an opaque decode/KeyError below.
r.raise_for_status()
# NOTE(review): this name would shadow the stdlib `json` module if it were imported;
# it is kept because the cell that builds the DataFrame reads it.
json = r.json()['data']

In [4]:
def getLink(x):
    """Extract the profile URL and display name from one HTML snippet.

    Parameters
    ----------
    x : str
        HTML markup; its first ``<a>`` element is used.

    Returns
    -------
    pandas.Series
        Indexed by ``'url'`` (TOP_UNI_URL prepended to the anchor's ``href``)
        and ``'name'`` (the anchor's text).
    """
    anchor = BeautifulSoup(x, 'html.parser').find("a")
    absolute_url = TOP_UNI_URL + anchor.get("href")
    display_name = anchor.text
    return pd.Series([absolute_url, display_name], index=['url', 'name'])

In [5]:
# Build the ranking table and keep only the columns the analysis uses.
df = pd.DataFrame(json)
df = df[['region','overall_rank','location','uni']]
# 'uni' holds an HTML snippet per row; getLink turns it into 'url' and 'name' columns.
df_temp = df.apply(lambda x: getLink(x['uni']),axis = 1)
# Use the `columns=` keyword: the positional axis form `drop('uni', 1)` was
# deprecated in pandas 1.3 and removed in pandas 2.0.
df = df.drop(columns='uni')
df = df.join(df_temp)

display(df)

Unnamed: 0,region,overall_rank,location,url,name
0,North America,1,United States,https://www.topuniversities.com/universities/m...,Massachusetts Institute of Technology (MIT)
1,North America,2,United States,https://www.topuniversities.com/universities/s...,Stanford University
2,North America,3,United States,https://www.topuniversities.com/universities/h...,Harvard University
3,North America,4,United States,https://www.topuniversities.com/universities/c...,California Institute of Technology (Caltech)
4,Europe,5,United Kingdom,https://www.topuniversities.com/universities/u...,University of Cambridge
5,Europe,6,United Kingdom,https://www.topuniversities.com/universities/u...,University of Oxford
6,Europe,7,United Kingdom,https://www.topuniversities.com/universities/u...,UCL (University College London)
7,Europe,8,United Kingdom,https://www.topuniversities.com/universities/i...,Imperial College London
8,North America,9,United States,https://www.topuniversities.com/universities/u...,University of Chicago
9,Europe,10,Switzerland,https://www.topuniversities.com/universities/e...,ETH Zurich - Swiss Federal Institute of Techno...


In [6]:
def find_number(soup, list_dict_div):
    """Walk down a chain of nested ``<div>`` elements and parse the last one as an int.

    Parameters
    ----------
    soup : object exposing ``find(tag, attrs)`` and ``text``
        Starting node of the search.
    list_dict_div : list of dict
        One attribute filter per nesting level, outermost first; each is passed
        to ``find("div", ...)`` on the node found at the previous level.

    Returns
    -------
    int
        The number in the innermost div's text with thousands separators
        (``,``) removed. ``0`` when any level of the chain is missing or when
        the text is not a plain integer.
    """
    node = soup
    for div_attrs in list_dict_div:
        node = node.find("div", div_attrs)
        if node is None:
            return 0

    digits = node.text.replace(",", "").strip()
    try:
        return int(digits)
    except ValueError:
        # Blank or placeholder text is treated like a missing div, so that one
        # odd page cannot abort the whole scrape.
        return 0

In [7]:
def getExtraData(x):
    """Scrape the four head-count figures from one downloaded page.

    Parameters
    ----------
    x : response-like object
        Only its ``text`` attribute (the page HTML) is read.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'total faculty', 'international faculty',
        'number of students' and 'international students'. Each value comes
        from find_number, which yields 0 when the matching div is absent.
    """
    page = BeautifulSoup(x.text, 'html.parser')

    # Column name -> CSS class of each nested div, outermost first.
    class_paths = {
        'total faculty': ("faculty-main wrapper col-md-4", "total faculty", "number"),
        'international faculty': ("faculty-main wrapper col-md-4", "inter faculty", "number"),
        'number of students': ("students-main wrapper col-md-4", "number"),
        'international students': ("int-students-main wrapper col-md-4", "number"),
    }

    row = [
        find_number(page, [{"class": css_class} for css_class in path])
        for path in class_paths.values()
    ]
    return pd.DataFrame(data=[row], columns=list(class_paths))

def async_getExtraData(sess,x):
    """Two-argument wrapper around getExtraData.

    ``sess`` is accepted but never used; ``x`` is forwarded unchanged and the
    resulting one-row DataFrame is returned.

    NOTE(review): the (session, response) signature looks like a callback hook,
    but nothing in the visible notebook calls this function - confirm before
    relying on it.
    """
    extra_data = getExtraData(x)
    return extra_data

In [8]:
# Restrict the analysis to the first 200 rows of the ranking table.
df = df.head(200)

df_temp = df[['url']]

# Flatten the one-column frame into a plain list of profile-page URLs.
urls = [page_url for row in df_temp.values.tolist() for page_url in row]

# Queue every GET up front so that up to 10 downloads run concurrently.
session = FuturesSession(max_workers=10)
futures = [(url, session.get(url)) for url in urls]

# Wait for each download in submission order, keeping the URL next to its response.
results = [(url, pending.result()) for url, pending in futures]

In [9]:
# One single-row frame per downloaded page: the four scraped counts side by side
# with the URL they came from (the URL is the join key used later).
list_dataframe = [
    pd.concat(
        [getExtraData(response), pd.DataFrame(data=[page_url], columns=['url'])],
        axis=1,
    )
    for page_url, response in results
]

print('done')

done


In [10]:
# Stack the per-page rows into one table. Every input frame has the single index
# label 0, so renumber the rows instead of keeping a column of duplicate labels.
df_temp = pd.concat(list_dataframe, ignore_index=True)
display(df_temp)

Unnamed: 0,total faculty,international faculty,number of students,international students,url
0,2982,1679,11067,3717,https://www.topuniversities.com/universities/m...
0,4285,2042,15878,3611,https://www.topuniversities.com/universities/s...
0,4350,1311,22429,5266,https://www.topuniversities.com/universities/h...
0,953,350,2255,647,https://www.topuniversities.com/universities/c...
0,5490,2278,18770,6699,https://www.topuniversities.com/universities/u...
0,6750,2964,19720,7353,https://www.topuniversities.com/universities/u...
0,6345,2554,31080,14854,https://www.topuniversities.com/universities/u...
0,3930,2071,16090,8746,https://www.topuniversities.com/universities/i...
0,2449,635,13557,3379,https://www.topuniversities.com/universities/u...
0,2477,1886,19815,7563,https://www.topuniversities.com/universities/e...


In [11]:
# Attach the scraped counts to the ranking rows by URL; the outer join keeps
# rows that appear on only one side.
df = df.merge(df_temp, on='url', how='outer')
# The URL was only needed as the join key. Use the `columns=` keyword: the
# positional axis form `drop('url', 1)` was removed in pandas 2.0.
df = df.drop(columns='url')

In [12]:
# Show the merged table: ranking columns plus the scraped counts, with the url column dropped.
display(df)

Unnamed: 0,region,overall_rank,location,name,total faculty,international faculty,number of students,international students
0,North America,1,United States,Massachusetts Institute of Technology (MIT),2982,1679,11067,3717
1,North America,2,United States,Stanford University,4285,2042,15878,3611
2,North America,3,United States,Harvard University,4350,1311,22429,5266
3,North America,4,United States,California Institute of Technology (Caltech),953,350,2255,647
4,Europe,5,United Kingdom,University of Cambridge,5490,2278,18770,6699
5,Europe,6,United Kingdom,University of Oxford,6750,2964,19720,7353
6,Europe,7,United Kingdom,UCL (University College London),6345,2554,31080,14854
7,Europe,8,United Kingdom,Imperial College London,3930,2071,16090,8746
8,North America,9,United States,University of Chicago,2449,635,13557,3379
9,Europe,10,Switzerland,ETH Zurich - Swiss Federal Institute of Techno...,2477,1886,19815,7563


In [13]:
def display_ratio(df):
    """Display ``df`` twice, ranked by two different ratios in turn.

    The first table is ordered by 'total faculty' / 'number of students', the
    second by 'international students' / 'number of students'. Both are sorted
    in descending order and carry the value in a 'ratio' column. ``df`` itself
    is not modified.

    Rows whose 'number of students' is 0 get a NaN ratio and sort last, rather
    than producing ``inf`` and ranking first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'total faculty', 'number of students' and
        'international students'.
    """
    def _student_counts(frame):
        # Mask zero head-counts so the division yields NaN instead of inf;
        # sort_values places NaN at the end by default.
        counts = frame['number of students']
        return counts.where(counts != 0)

    df_ratio = df.copy()

    df_ratio['ratio'] = df_ratio['total faculty'] / _student_counts(df_ratio)
    df_ratio = df_ratio.sort_values('ratio', ascending=False)
    display(df_ratio)

    # Build a new frame for the second ratio so the one already handed to
    # display() above is not overwritten in place.
    df_ratio = df_ratio.assign(
        ratio=df_ratio['international students'] / _student_counts(df_ratio)
    )
    df_ratio = df_ratio.sort_values('ratio', ascending=False)
    display(df_ratio)

### a, b) Universities ranked by faculty-to-student ratio and by share of international students

In [14]:
# Per-university ranking: one table per ratio, each sorted in descending order.
display_ratio(df)

Unnamed: 0,region,overall_rank,location,name,total faculty,international faculty,number of students,international students,ratio
3,North America,4,United States,California Institute of Technology (Caltech),953,350,2255,647,0.422616
15,North America,16,United States,Yale University,4940,1708,12402,2469,0.398323
5,Europe,6,United Kingdom,University of Oxford,6750,2964,19720,7353,0.342292
4,Europe,5,United Kingdom,University of Cambridge,5490,2278,18770,6699,0.292488
16,North America,17,United States,Johns Hopkins University,4462,1061,16146,4105,0.276353
1,North America,2,United States,Stanford University,4285,2042,15878,3611,0.269870
0,North America,1,United States,Massachusetts Institute of Technology (MIT),2982,1679,11067,3717,0.269450
185,North America,186,United States,University of Rochester,2569,488,9636,2805,0.266604
18,North America,19,United States,University of Pennsylvania,5499,1383,20639,4250,0.266437
17,North America,18,United States,Columbia University,6189,913,25045,8105,0.247115


Unnamed: 0,region,overall_rank,location,name,total faculty,international faculty,number of students,international students,ratio
34,Europe,35,United Kingdom,London School of Economics and Political Scien...,1088,687,9760,6748,0.691393
11,Europe,12,Switzerland,Ecole Polytechnique Fédérale de Lausanne (EPFL),1695,1300,10343,5896,0.570047
7,Europe,8,United Kingdom,Imperial College London,3930,2071,16090,8746,0.543567
199,Europe,200,Netherlands,Maastricht University,1277,502,16385,8234,0.502533
46,North America,47,United States,Carnegie Mellon University,1342,425,13356,6385,0.478062
6,Europe,7,United Kingdom,UCL (University College London),6345,2554,31080,14854,0.477928
91,Europe,92,United Kingdom,University of St Andrews,1140,485,8800,4030,0.457955
41,Oceania,41,Australia,The University of Melbourne,3311,1477,42182,18030,0.427434
126,Europe,127,United Kingdom,Queen Mary University of London,1885,801,16135,6806,0.421816
25,Asia,26,Hong Kong,The University of Hong Kong,3012,2085,20214,8230,0.407144


### c) Same two ratios, aggregated by location

In [15]:
# Sum the counts per location before computing the ratios. numeric_only=True keeps
# the text columns out of the aggregation: pandas >= 2.0 no longer drops them
# silently and would otherwise concatenate their strings.
display_ratio(df.groupby('location').sum(numeric_only=True))

Unnamed: 0_level_0,total faculty,international faculty,number of students,international students,ratio
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Russia,6709,373,30233,5098,0.22191
Denmark,11916,3904,67223,9543,0.177261
Saudi Arabia,1062,665,6040,989,0.175828
Singapore,9444,6079,58466,16168,0.16153
Malaysia,2755,655,17902,3476,0.153893
Japan,28395,2221,186222,16269,0.152479
South Korea,19851,2010,140071,16273,0.141721
Switzerland,15323,9208,109112,32995,0.140434
United Kingdom,79934,30216,583621,199426,0.136962
Israel,2249,454,16531,1034,0.136047


Unnamed: 0_level_0,total faculty,international faculty,number of students,international students,ratio
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Australia,22034,11382,301994,106359,0.352189
United Kingdom,79934,30216,583621,199426,0.341705
Hong Kong,10166,6296,78838,24499,0.310751
Austria,4117,1572,63446,19667,0.30998
Switzerland,15323,9208,109112,32995,0.302396
Singapore,9444,6079,58466,16168,0.276537
Canada,29317,10734,281514,73239,0.260161
New Zealand,3313,1404,48173,12439,0.258215
Ireland,2853,1171,34794,8187,0.235299
Netherlands,20287,5683,197631,46044,0.23298


### d) Same two ratios, aggregated by region

In [16]:
# Sum the counts per region before computing the ratios. numeric_only=True keeps
# the text columns out of the aggregation: pandas >= 2.0 no longer drops them
# silently and would otherwise concatenate their strings.
display_ratio(df.groupby('region').sum(numeric_only=True))

Unnamed: 0_level_0,total faculty,international faculty,number of students,international students,ratio
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Asia,106734,25462,807003,110100,0.13226
North America,182123,43836,1546353,292116,0.117776
Europe,223271,68267,1991280,454293,0.112124
Latin America,43126,5521,396902,34737,0.108657
Africa,1733,379,19593,3325,0.08845
Oceania,25347,12786,350167,118798,0.072385


Unnamed: 0_level_0,total faculty,international faculty,number of students,international students,ratio
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Oceania,25347,12786,350167,118798,0.339261
Europe,223271,68267,1991280,454293,0.228141
North America,182123,43836,1546353,292116,0.188906
Africa,1733,379,19593,3325,0.169703
Asia,106734,25462,807003,110100,0.136431
Latin America,43126,5521,396902,34737,0.08752


In [17]:
# Total wall-clock time since `start` was recorded in the setup cell.
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)

0:00:46.570278
