## Homework 2

In [2]:
# Import libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Simple Data Retrieval

We first find the appropriate lists giving us the needed data

In [4]:
#Base URL for websites to crawl
TIMES_EDUCATION = 'https://www.timeshighereducation.com'
TOP_UNIVERSITIES = 'https://www.topuniversities.com'
FILES = '/sites/default/files/'

#These are the main URLs we will be working with
TIMES_EDUCATION_JSON = TIMES_EDUCATION + FILES + 'the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'
TOP_UNIVERVERSITIES_TEXT = TOP_UNIVERSITIES + FILES + 'qs-rankings-data/357051.txt'

In this part, we request the JSON of the first ranking and preprocess the data (the steps are detailed in the code).

In [5]:
#We send the request and save the JSON
r = requests.get(TIMES_EDUCATION_JSON)
timesjson = r.json()

In [19]:
#Transforming to Data frame
times_frame = pd.DataFrame(timesjson['data']).head(200) #Keep the first 200 schools
times_frame = times_frame[[d for d in times_frame.columns if not ('score' in d)]] #Drop useless frames
del times_frame['member_level'],\
    times_frame['nid'],\
    times_frame['record_type'],\
    times_frame['subjects_offered'],\
    times_frame['rank']
times_frame['rank_order'] = times_frame['rank_order'].map(lambda x: int(int(x)/10)) #Keeping absolute order

#We display the format of the elements we retrieved to get a better visualization
times_frame.head()

Unnamed: 0,aliases,location,name,rank_order,stats_female_male_ratio,stats_number_students,stats_pc_intl_students,stats_student_staff_ratio,url
0,University of Oxford,United Kingdom,University of Oxford,1,46 : 54,20409,38%,11.2,/world-university-rankings/university-oxford
1,University of Cambridge,United Kingdom,University of Cambridge,2,45 : 55,18389,35%,10.9,/world-university-rankings/university-cambridge
2,California Institute of Technology caltech,United States,California Institute of Technology,3,31 : 69,2209,27%,6.5,/world-university-rankings/california-institut...
3,Stanford University,United States,Stanford University,4,42 : 58,15845,22%,7.5,/world-university-rankings/stanford-university
4,Massachusetts Institute of Technology,United States,Massachusetts Institute of Technology,5,37 : 63,11177,34%,8.7,/world-university-rankings/massachusetts-insti...


We simply repeat the same process for the second ranking (thus, we will not comment the code as above).

In [7]:
s = requests.get(TOP_UNIVERVERSITIES_TEXT)
topjson = s.json()

In [36]:
##NEED TO GET ONLY NUMERICAL VALUES FOR THE RANK DISPLAY!!

top_frame = pd.DataFrame(topjson['data']).head(200)
del top_frame['cc'],\
    top_frame['logo'],\
    top_frame['nid'],\
    top_frame['core_id'],\
    top_frame['stars'],\
    top_frame['guide'] 

top_frame.head()

Unnamed: 0,country,rank_display,region,score,title,url
0,United States,1,North America,100.0,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...
1,United States,2,North America,98.7,Stanford University,/universities/stanford-university
2,United States,3,North America,98.4,Harvard University,/universities/harvard-university
3,United States,4,North America,97.7,California Institute of Technology (Caltech),/universities/california-institute-technology-...
4,United Kingdom,5,Europe,95.6,University of Cambridge,/universities/university-cambridge


## Web Scraping

To be able to find additional elements, we need to visit every University URL. Below, we offer an example of usage (1st university of the Top Universities ranking) in order to clarify the code we will write in the next steps.

In [9]:
r = requests.get(TOP_UNIVERSITIES + '/universities/university-cambridge')
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

#In order to get the right numbers, we use [1:-1] in order to delete the leading and trailing spaces
student_number = (soup.find('div', class_='total student')).find('div', class_='number').text[1:-1]
int_student = (soup.find('div', class_='total inter')).find('div', class_='number').text[1:-1]
facult_number = (soup.find('div', class_='total faculty')).find('div', class_='number').text[1:-1]
int_faculty = (soup.find('div', class_='inter faculty')).find('div', class_='number').text[1:-1]

#We create a new DataFrame to visualize the new information
pd.DataFrame.from_dict({'students' : [student_number], 
              'international students': [int_student], 
              'faculty' : [facult_number], 
              'international faculty' : [int_faculty] })

Unnamed: 0,faculty,international faculty,international students,students
0,5490,2278,6699,18770


As asked, we need to ensure that we have the name, rank, country & region, number of faculty members (international and total) and number of students (international and total) and that they appear in the final DataFrame.

In [33]:
def get_additional_info(url):
    r = requests.get(TOP_UNIVERSITIES + url)
    page_body = r.text
    soup = BeautifulSoup(page_body, 'html.parser')
    
    #In the following steps, we make sure the values exist before fetching them
    student_number = (soup.find('div', class_='total student')) 
    if student_number:
        student_number = student_number.find('div', class_='number').text[1:-1].replace(",","")
        
    int_student = (soup.find('div', class_='total inter'))
    if int_student:
        int_student = int_student.find('div', class_='number').text[1:-1].replace(",","")    
    
    faculty_number = (soup.find('div', class_='total faculty'))
    if faculty_number:
        faculty_number = faculty_number.find('div', class_='number').text[1:-1].replace(",","")
        
    int_faculty = (soup.find('div', class_='inter faculty'))
    if int_faculty:
        int_faculty = int_faculty.find('div', class_='number').text[1:-1].replace(",","")
    
    frame = pd.DataFrame.from_dict({'students' : [student_number], 
              'international students': [int_student], 
              'faculty' : [faculty_number], 
              'international faculty' : [int_faculty] })
    return frame

In [34]:
#Takes a relatively long time (more than a minute to run)
missing_rows = pd.DataFrame()
for url in top_frame.url:
    new_info = get_additional_info(url)
    missing_rows = missing_rows.append(new_info, ignore_index = True)

missing_rows.head()

Unnamed: 0,faculty,international faculty,international students,students
0,2982,1679,3717,11067
1,4285,2042,3611,15878
2,4350,1311,5266,22429
3,953,350,647,2255
4,5490,2278,6699,18770


In [66]:
#In this step, we merge the missing information with the existing frame
merged_top = pd.concat([top_frame, missing_rows], axis=1, join_axes=[top_frame.index])
merged_top.head()

Unnamed: 0,country,rank_display,region,score,title,url,faculty,international faculty,international students,students
0,United States,1,North America,100.0,Massachusetts Institute of Technology (MIT),/universities/massachusetts-institute-technolo...,2982,1679,3717,11067
1,United States,2,North America,98.7,Stanford University,/universities/stanford-university,4285,2042,3611,15878
2,United States,3,North America,98.4,Harvard University,/universities/harvard-university,4350,1311,5266,22429
3,United States,4,North America,97.7,California Institute of Technology (Caltech),/universities/california-institute-technology-...,953,350,647,2255
4,United Kingdom,5,Europe,95.6,University of Cambridge,/universities/university-cambridge,5490,2278,6699,18770


We first try to determine which university is best according to the faculty/student ratio. To do that, we create a new DataFrame which only contains the name of the university, the ratio and the rank_display in case of a tie.

In [50]:
top_fac_stud = merged_top
top_fac_stud['faculty/students'] = top_fac_stud.apply(lambda row: int(row['faculty'])/int(row['students']), axis = 1)
top_fac_stud = top_fac_stud[['title', 'faculty/students', 'rank_display']]
top_fac_stud.head()

Unnamed: 0,title,faculty/students,rank_display
0,Massachusetts Institute of Technology (MIT),0.26945,1
1,Stanford University,0.26987,2
2,Harvard University,0.193945,3
3,California Institute of Technology (Caltech),0.422616,4
4,University of Cambridge,0.292488,5


In [51]:
top_fac_stud = top_fac_stud.sort_values(['faculty/students', 'rank_display'], ascending=[False, True])
top_fac_stud.index = range(len(top_fac_stud.index))
top_fac_stud.head()

Unnamed: 0,title,faculty/students,rank_display
0,California Institute of Technology (Caltech),0.422616,4
1,Yale University,0.398323,16
2,University of Oxford,0.342292,6
3,University of Cambridge,0.292488,5
4,Johns Hopkins University,0.276353,17


Now, we sort the universities according to their international students ratio.

In [60]:
top_int_stud = merged_top
top_int_stud['international/students'] = top_int_stud.apply(lambda row: int(row['international students'])/int(row['students']), axis = 1)
top_int_stud = top_int_stud[['title', 'international/students', 'rank_display']]
top_int_stud.head()

Unnamed: 0,title,international/students,rank_display
0,Massachusetts Institute of Technology (MIT),0.335863,1
1,Stanford University,0.227422,2
2,Harvard University,0.234785,3
3,California Institute of Technology (Caltech),0.286918,4
4,University of Cambridge,0.356899,5


In [61]:
top_int_stud = top_int_stud.sort_values(['international/students', 'rank_display'], ascending=[False, True])
top_int_stud.index = range(len(top_fac_stud.index))
top_int_stud.head()

Unnamed: 0,title,international/students,rank_display
0,London School of Economics and Political Scien...,0.691393,35
1,Ecole Polytechnique Fédérale de Lausanne (EPFL),0.570047,12
2,Imperial College London,0.543567,8
3,Maastricht University,0.502533,200
4,Carnegie Mellon University,0.478062,=47


It is time to aggregate the universities by country and region in order to sort them.

In [73]:
##AS RANK DOESN'T WORK (NOT NUMERICAL VALUES ONLY) WE USE THE SCORE

top_country = merged_top
top_country = top_country.groupby("country", as_index = False)["score"].max()
top_country = top_country.sort_values("score", ascending = False)
top_country.index = range(len(top_country.index))
top_country.head()

Unnamed: 0,country,score
0,United States,98.7
1,United Kingdom,95.6
2,Switzerland,93.3
3,Singapore,92.2
4,Australia,87.1


In [74]:
top_region = merged_top
top_region = top_region.groupby("region", as_index = False)["score"].max()
top_region = top_region.sort_values("score", ascending = False)
top_region.index = range(len(top_region.index))
top_region.head()

Unnamed: 0,region,score
0,North America,98.7
1,Europe,95.6
2,Asia,92.2
3,Oceania,87.1
4,Latin America,69.1
