## Homework 2

In [None]:
# Folder holding the saved rankings; it is used as a string prefix when
# building file paths, hence the trailing slash
DATA_FOLDER = 'Data/'

In [None]:
# Import libraries
import requests
import pandas as pd
from os.path import exists
import os #we can write this better
from bs4 import BeautifulSoup

In [None]:
# We define nice helper functions to reduce the running time
import pickle
def save_pkl(obj, path):
    """Pickle *obj* into the file located at *path* (overwriting it)."""
    with open(path, 'wb') as handle:
        pickle.Pickler(handle).dump(obj)

def load_pkl(path):
    """Return the object stored in the pickle file located at *path*."""
    # NOTE(review): unpickling can run arbitrary code, so this should only be
    # pointed at files produced by this notebook itself.
    with open(path, 'rb') as handle:
        return pickle.Unpickler(handle).load()

## Simple Data Retrieval

To find the appropriate links to the data, we did a preliminary exploration of the websites' responses and code using Postman and the Google Chrome dev tools.

- For the Times ranking, going to the ranking site and looking at the JSON response immediately led to the right data.
- For topuniversities, we searched the responses in Postman for occurrences of the string "data", which led us to the source of the data.
- To find the additional information needed for the topuniversities set, we used the Google Chrome dev tools to quickly check which classes identify the `<div>` tags containing the required information.

We first find the appropriate lists giving us the needed data

In [None]:
# Root addresses of the two ranking websites we crawl
TIMES_EDUCATION = 'https://www.timeshighereducation.com'
TOP_UNIVERSITIES = 'https://www.topuniversities.com'
# Path prefix under which both sites expose their data files
FILES = '/sites/default/files/'

# Full addresses of the two ranking data files (site root + prefix + file)
TIMES_EDUCATION_JSON = ''.join(
    (TIMES_EDUCATION, FILES, 'the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json')
)
TOP_UNIVERVERSITIES_TEXT = ''.join(
    (TOP_UNIVERSITIES, FILES, 'qs-rankings-data/357051.txt')
)

In this part, we request the JSON of the first ranking and preprocess the data (the steps are detailed in the code).

In [None]:
# Download the Times Higher Education ranking file
r = requests.get(TIMES_EDUCATION_JSON)
# Stop here with the HTTP status if the download failed, instead of failing
# later with an opaque JSON decoding error on an error page
r.raise_for_status()
# Decode the response body as JSON
timesjson = r.json()

In [None]:
# Build a DataFrame from the 'data' entry of the JSON, keeping the first 200 schools
times_frame = pd.DataFrame(timesjson['data']).head(200)

# Keep only the columns whose name does not mention 'score'
times_frame = times_frame.loc[:, [name for name in times_frame.columns if 'score' not in name]]

# Remove the remaining columns that are not used in this analysis
times_frame = times_frame.drop(
    ['member_level', 'nid', 'record_type', 'subjects_offered', 'stats_female_male_ratio'],
    axis=1,
)

# rank_order is converted to an integer and divided by 10 (truncated) so that it
# gives an absolute ordering; presumably the feed stores it scaled by ten - TODO confirm
times_frame['rank_order'] = times_frame['rank_order'].apply(lambda order: int(int(order) / 10))

# Show the first rows to visualise the format of the retrieved elements
times_frame.head()

In [None]:
# Dimensions of the table; 200 rows (universities) are expected
times_frame.shape

We simply repeat the same process for the second ranking (thus, we will not comment the code as above).

In [None]:
# Download the Top Universities ranking file
s = requests.get(TOP_UNIVERVERSITIES_TEXT)
# Stop here with the HTTP status if the download failed, instead of failing
# later with an opaque JSON decoding error on an error page
s.raise_for_status()
# Although the file has a .txt extension, its body is decoded as JSON
topjson = s.json()

In [None]:
# TODO: rank_display is not purely numerical yet; only the numerical part
# should be kept for the rank

# Build a DataFrame from the 'data' entry of the JSON, keeping the first 200 schools
top_frame = pd.DataFrame(topjson['data']).head(200)

# Remove the columns that are not used in this analysis
top_frame = top_frame.drop(['cc', 'logo', 'nid', 'core_id', 'stars', 'guide'], axis=1)

# Show the first rows to visualise the format of the retrieved elements
top_frame.head()

## Web Scraping

To be able to find additional elements, we need to visit every University URL. Below, we offer an example of usage (1st university of the Top Universities ranking) in order to clarify the code we will write in the next steps.

In [None]:
# Fetch and parse the page of one university as a worked example
r = requests.get(TOP_UNIVERSITIES + '/universities/university-cambridge')
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

# Each figure sits in a 'number' div nested in a div identified by its class.
# strip() removes the surrounding whitespace whatever its length, whereas a
# fixed [1:-1] slice would cut real digits off a value that has no padding.
student_number = (soup.find('div', class_='total student')).find('div', class_='number').text.strip()
int_student = (soup.find('div', class_='total inter')).find('div', class_='number').text.strip()
facult_number = (soup.find('div', class_='total faculty')).find('div', class_='number').text.strip()
int_faculty = (soup.find('div', class_='inter faculty')).find('div', class_='number').text.strip()

# We create a new one-row DataFrame to visualize the new information
pd.DataFrame.from_dict({'students' : [student_number],
              'international students': [int_student],
              'faculty' : [facult_number],
              'international faculty' : [int_faculty] })

As asked, we need to ensure that we have the name, rank, country & region, number of faculty members (international and total) and number of students (international and total) and that they appear in the final DataFrame.

In [None]:
def get_additional_info(url):
    """Scrape the student and faculty figures from one university page.

    Parameters
    ----------
    url : str
        Path of the university page, appended to TOP_UNIVERSITIES.

    Returns
    -------
    pandas.DataFrame
        A one-row frame with the columns 'students', 'international students',
        'faculty' and 'international faculty'. A figure that cannot be found
        on the page is reported as NaN.
    """
    r = requests.get(TOP_UNIVERSITIES + url)
    if not r.ok:
        # Best effort: report which page failed and carry on; the figures that
        # cannot be found in the returned page simply end up as NaN.
        print('Warning: HTTP status {} for {}'.format(r.status_code, TOP_UNIVERSITIES + url))
    soup = BeautifulSoup(r.text, 'html.parser')

    # Output column -> class of the div that wraps the corresponding figure
    wrappers = (
        ('students', 'total student'),
        ('international students', 'total inter'),
        ('faculty', 'total faculty'),
        ('international faculty', 'inter faculty'),
    )

    row = {}
    for column, css_class in wrappers:
        # Make sure the wrapper exists before looking for the figure inside it
        wrapper = soup.find('div', class_=css_class)
        number = wrapper.find('div', class_='number') if wrapper is not None else None
        row[column] = [remove_blank_convert_float(number)]

    return pd.DataFrame.from_dict(row)


def remove_blank_convert_float(x):
    """Convert the text of a scraped element into a float.

    Parameters
    ----------
    x : object with a ``text`` attribute, or a falsy value
        The element holding the figure (e.g. the result of ``soup.find``),
        or None when the element was not found.

    Returns
    -------
    float
        The parsed number, with surrounding whitespace and thousands
        separators (commas) removed. NaN is returned when *x* is falsy or its
        text is blank, which makes later computations propagate the unknown.

    Raises
    ------
    ValueError
        If the text is not blank but is not a valid number either.
    """
    if not x:
        # use nan for unknown values, facilitates computation
        return float('nan')

    # strip() drops the padding whatever its length; slicing with [1:-1] would
    # eat real digits whenever the text is not padded by exactly one character
    cleaned = x.text.strip().replace(",", "")
    if not cleaned:
        return float('nan')
    return float(cleaned)

In [None]:
# Scraping every university page takes a relatively long time, so the result
# is cached on disk the first time and simply reloaded on later runs.

# Create the place to put the data (no error if the folder already exists)
os.makedirs(DATA_FOLDER, exist_ok=True)

top_file = DATA_FOLDER + 'top_ranking.pkl'
if exists(top_file):
    # NOTE(review): unpickling can run arbitrary code; only load a cache file
    # that this notebook wrote itself.
    merged_top = load_pkl(top_file)
else:
    # Collect one single-row frame per university and stack them once at the
    # end; growing a DataFrame with append() inside the loop copies it at
    # every step and append() no longer exists in recent pandas versions.
    scraped_rows = [get_additional_info(url) for url in top_frame.url]
    missing_rows = pd.concat(scraped_rows, ignore_index=True) if scraped_rows else pd.DataFrame()
    # Put the scraped columns next to the ranking columns, keeping exactly the
    # rows of top_frame (reindex replaces the removed join_axes argument)
    merged_top = pd.concat([top_frame, missing_rows], axis=1).reindex(top_frame.index)
    save_pkl(merged_top, top_file)

We now check if we have any undefined values in the set and see that we do.

In [None]:
# Dimensions of the merged table; 200 rows (universities) are expected
merged_top.shape

In [None]:
# Select the rows in which at least one column holds a null value
merged_top[merged_top.isnull().any(axis=1)]

We have undefined values, but they are due to the website not containing this information.
For now, we propagate the NaN.

We first try to determine which university is best according to the faculty/student ratio. To do that, we create a new DataFrame which only contains the name of the university, the ratio and the rank_display in case of a tie.

In [None]:
# Faculty members per student for every university.
# NOTE: no copy is made, so the new column is stored in merged_top itself.
merged_top['faculty/students'] = merged_top['faculty'] / merged_top['students']

# Keep only the name, the ratio and the displayed rank (used to break ties)
top_fac_stud = merged_top[['title', 'faculty/students', 'rank_display']]

In [None]:
# Highest ratio first, ties ordered by rank_display; then renumber the rows 0..n-1
top_fac_stud = top_fac_stud.sort_values(
    by=['faculty/students', 'rank_display'],
    ascending=[False, True],
).reset_index(drop=True)
top_fac_stud.head()

Now, we sort the universities according to their international students ratio.

In [None]:
# Share of international students for every university.
# NOTE: no copy is made, so the new column is stored in merged_top itself.
merged_top['international/students'] = merged_top['international students'] / merged_top['students']

# Keep only the name, the ratio and the displayed rank (used to break ties)
top_int_stud = merged_top[['title', 'international/students', 'rank_display']]
top_int_stud.head()

In [None]:
# Highest international ratio first, ties ordered by rank_display
top_int_stud = top_int_stud.sort_values(['international/students', 'rank_display'], ascending=[False, True])
# Renumber the rows 0..n-1 from this frame's own length; the index must not be
# sized from another frame, which only works while both have the same length
top_int_stud = top_int_stud.reset_index(drop=True)
top_int_stud.head()

It is time to aggregate the universities by country and region in order to sort them by ranking.

In [None]:
# The rank is not made of numerical values only, so the score is used instead:
# countries in alphabetical order, best score first within each country
top_country = merged_top.sort_values(by=['country', 'score'], ascending=[True, False])

# Display only: the result of set_index is not stored, so top_country itself
# keeps 'country' as a regular column
top_country.set_index('country')

In [None]:
# TODO: still needs work, for the same reason as the per-country ranking
# (the score is used because the rank is not numerical)

# Best score of each region, highest first, with rows renumbered 0..n-1
top_region = (
    merged_top
    .groupby("region", as_index=False)["score"]
    .max()
    .sort_values("score", ascending=False)
    .reset_index(drop=True)
)
top_region.head()

To be able to define the regions by country, we define a dict

In [None]:
# Lookup table country -> region, built from the Top Universities ranking
# (if a country appears several times, its last occurrence is the one kept)
regions_by_country = {
    country: region
    for country, region in zip(top_frame['country'], top_frame['region'])
}

In [None]:
# Translate each location into its region; locations missing from the lookup
# table are left null
times_frame['region'] = times_frame['location'].map(regions_by_country)

# Show the rows for which no region could be found
times_frame[times_frame['region'].isnull()]

In [None]:
# Every row still lacking a region is assigned 'Europe': both unmatched
# locations are in Europe, and Luxembourg isn't present in the other ranking anyway
times_frame['region'] = times_frame['region'].fillna('Europe')

# Check on the Luxembourg rows that the assignment worked
times_frame[times_frame['location'] == 'Luxembourg']

In [None]:
# We want the rank as an int, so we keep only the digits of each value.
# - astype(str) makes the cell safe to run several times: once the column holds
#   ints, the .str accessor would otherwise raise.
# - findall always treats its pattern as a regular expression, whereas the
#   default of str.replace depends on the pandas version; joining the digit
#   groups gives the same result as deleting every non-digit character.
times_frame['rank'] = times_frame['rank'].astype(str).str.findall(r'\d+').str.join('').astype('int')
top_frame['rank_display'] = top_frame['rank_display'].astype(str).str.findall(r'\d+').str.join('').astype('int')