# Homework 2

We first import the needed libraries and set the elements we need for the rest of the Homework

In [None]:
# Folder (relative to the working directory) in which the scraped rankings
# are saved as pickle files, and where they are looked for on later runs
DATA_FOLDER = 'Data/'

In [None]:
# Importing libraries
import requests
import numpy as np
import pickle as pk
import pandas as pd
import seaborn as sns
import os, os.path as osp
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

In [None]:
# Global seaborn styling shared by every plot of the notebook
sns.set_context("notebook")
sns.set(style="whitegrid")
# Default colour cycle: 10 colours taken from the "hls" palette
sns.set_palette("hls", 10)

## Useful functions

The following helper functions allow us to keep the rest of the code cleaner.

In [None]:
#Functions allowing us to manipulate Pickle files (better running time)
def save_pkl(obj, path):
    """Pickle `obj` and write it to the file located at `path`.

    The file is opened in binary write mode, so an existing file at `path`
    is overwritten.
    """
    with open(path, mode='wb') as handle:
        pk.Pickler(handle).dump(obj)
        
        
def load_pkl(path):
    """Read the pickle file located at `path` and return the object it holds."""
    with open(path, mode='rb') as handle:
        return pk.Unpickler(handle).load()

In [None]:
#Functions allowing us to clean our data
def transform_rank(df, cname):
    """Convert a textual rank column of `df` to integers, in place.

    Every run of non-digit characters is removed from the values of the
    column before they are cast to int.

        df is the DataFrame to modify,
        cname is the name of the column holding the ranks as strings

    Raises an AssertionError when the column does not hold strings, because
    the `.str` accessor used below only works on string data.
    """
    column = df[cname]
    holds_strings = (pd.api.types.is_object_dtype(column)
                     or pd.api.types.is_string_dtype(column))
    assert holds_strings, "We do not have strings in " + cname
    # regex=True must be explicit: recent pandas versions treat the pattern
    # as a literal string by default, which would leave the ranks untouched
    # and make the integer conversion fail.
    df[cname] = column.str.replace(r'\D+', '', regex=True).astype('int')
    
def remove_blank_convert_float(x):
    """Parse the text of a scraped element into a float.

    x is an object exposing its content through a `.text` attribute, or a
    falsy value (such as None) when the element could not be found.

    Returns the number as a float. NaN is returned when `x` is falsy or when
    its text is blank, so that unknown values propagate through the later
    computations instead of breaking them.
    """
    #Using NAN for unknown values to facilitate computation
    if not x:
        return float('nan')
    # strip() rather than the positional slice [1:-1]: the slice silently
    # cuts real digits whenever the text is not padded by exactly one
    # character on each side.
    cleaned = x.text.strip().replace(",", "")
    return float(cleaned) if cleaned else float('nan')

In [None]:
#As we need to compute several ratios, we decided to modularize this operation
def ratios_calc(df, c1, c2, name):
    """Add to `df` a column holding the element-wise ratio of two columns.

        df is the DataFrame, modified in place,
        c1 is the name of the column used as numerator,
        c2 is the name of the column used as denominator,
        name is the name of the column receiving c1 / c2
    """
    numerator = df[c1]
    denominator = df[c2]
    # Stored directly in the DataFrame that was passed in; nothing is returned
    df[name] = numerator.div(denominator)

## Preparatory steps

The first step was a preliminary exploration of the website's responses and code using Postman and the Google Chrome dev tools.

- For the Times, looking at the JSON response from the ranking website immediately led to the right data.
- For TopUniversities, we searched for the occurrence of some string data in Postman, which led us to the source of the data.
- To find the additional information needed in the TopUniversities set, we used the Google dev tools to quickly check which classes pointed to the div tags containing the required information.

In [None]:
#Hosts of the two ranking websites, and the path under which both serve their data files
TOP_UNIVERSITIES = 'https://www.topuniversities.com'
TIMES_EDUCATION = 'https://www.timeshighereducation.com'
FILES = '/sites/default/files/'

#Full URLs of the two ranking files downloaded by the notebook
TOP_UNIVERVERSITIES_TEXT = ''.join([TOP_UNIVERSITIES, FILES, 'qs-rankings-data/357051.txt'])
TIMES_EDUCATION_JSON = ''.join([TIMES_EDUCATION, FILES, 'the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'])

## Exercise 1 : TopUniversities

### Data retrieval
The first step is pretty straightforward. We simply request the JSON and create a DataFrame containing all the needed information offered by the request.

In [None]:
#Download the TopUniversities ranking; the timeout keeps the cell from hanging forever
s = requests.get(TOP_UNIVERVERSITIES_TEXT, timeout=30)
#Fail right away on an HTTP error rather than on an obscure JSON decoding error
s.raise_for_status()
topjson = s.json()

In [None]:
#Only the first 200 entries of the ranking are kept
top_frame = pd.DataFrame(topjson['data']).head(200)

#Columns we do not need for the analysis
for unused_column in ('cc', 'logo', 'nid', 'core_id', 'stars', 'guide'):
    del top_frame[unused_column]

#Allows us to have a quick look at the data.
top_frame.head()

In [None]:
#Strips the non-digit characters from the displayed rank and converts it to an integer (in place),
#so that the ranks can be sorted and compared numerically
transform_rank(top_frame,'rank_display')

The second step is a little more complicated, which is why we decided to include an example of how additional data should be retrieved in order to better understand our methodology.

In [None]:
r = requests.get(TOP_UNIVERSITIES + '/universities/university-cambridge')
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

#Each figure sits in a 'number' div nested inside a div identified by its class.
#In order to get the right numbers, [1:-1] deletes the leading and trailing spaces.
student_number, int_student, facult_number, int_faculty = (
    soup.find('div', class_=css_class).find('div', class_='number').text[1:-1]
    for css_class in ('total student', 'total inter', 'total faculty', 'inter faculty')
)

#We create a new DataFrame to visualize the new information
pd.DataFrame({
    'students': [student_number],
    'international students': [int_student],
    'faculty': [facult_number],
    'international faculty': [int_faculty],
})

The following function allows us to get all the useful information. As asked, we make sure that we have the name, rank, country, region, number of faculty members (international and total) and number of students (international and total) in the final DataFrame.

In [None]:
def get_additional_info(url):
    """Scrape the student and faculty figures of one university page.

    url is the path of the page, appended to TOP_UNIVERSITIES.

    Returns a one-row DataFrame with the columns 'students',
    'international students', 'faculty' and 'international faculty'.
    A figure that cannot be found on the page is stored as NaN.

    Raises an AssertionError when the server answers with a 404.
    """
    response = requests.get(TOP_UNIVERSITIES + url)
    assert response.status_code != 404, 'Bad request: could not get data'
    parsed = BeautifulSoup(response.text, 'html.parser')

    def number_tag(css_class):
        #We make sure the enclosing div exists before fetching the number inside it
        wrapper = parsed.find('div', class_=css_class)
        return wrapper.find('div', class_='number') if wrapper else wrapper

    return pd.DataFrame.from_dict({
        'students': [remove_blank_convert_float(number_tag('total student'))],
        'international students': [remove_blank_convert_float(number_tag('total inter'))],
        'faculty': [remove_blank_convert_float(number_tag('total faculty'))],
        'international faculty': [remove_blank_convert_float(number_tag('inter faculty'))],
    })

In [None]:
#Takes a relatively long time to run the first time

#We create a folder to store all the data.
os.makedirs(DATA_FOLDER, exist_ok=True)

#We create a file to store the TopUniversities Ranking
top_file = DATA_FOLDER + 'top_ranking.pkl'
if osp.exists(top_file):
    #Reuse the result of a previous run instead of scraping every page again
    merged_top = load_pkl(top_file)
else:
    #One single-row frame per university, collected in ranking order and
    #concatenated once (DataFrame.append no longer exists in recent pandas)
    scraped = [get_additional_info(url) for url in top_frame.url]
    missing_rows = pd.concat(scraped, ignore_index=True) if scraped else pd.DataFrame()
    #reindex() replaces the removed join_axes argument of pd.concat
    merged_top = pd.concat([top_frame, missing_rows], axis=1).reindex(top_frame.index)
    save_pkl(merged_top, top_file)

We have undefined values in the "merged_top" set due to the lack of information on the website, so we simply propagate NAN values.

### Best universities by ratio :
We first determine which universities are the best according to the faculty/student ratio (and display the top of the list). To do that, we create a new DataFrame only containing the relevant information (university, ratio and relative rank for display).

In [None]:
#The ratio column is written into merged_top itself; top_fac_stud is a three-column selection of it
ratios_calc(merged_top, 'faculty', 'students', 'faculty/students')
top_fac_stud = merged_top[['title', 'faculty/students', 'rank_display']]

In [None]:
#Highest ratio first; universities with the same ratio are ordered by rank
top_fac_stud = (top_fac_stud
                .sort_values(by=['faculty/students', 'rank_display'], ascending=[False, True])
                .reset_index(drop=True))
top_fac_stud.head()

The other important ratio used to determine the quality of a school is international/total students. We use a similar method as before to obtain these numbers.

In [None]:
#The ratio column is written into merged_top itself; top_int_stud is a three-column selection of it
ratios_calc(merged_top, 'international students', 'students', 'international/students')
top_int_stud = merged_top[['title', 'international/students', 'rank_display']]
top_int_stud.head()

In [None]:
#Highest ratio first; universities with the same ratio are ordered by rank
top_int_stud = top_int_stud.sort_values(['international/students', 'rank_display'], ascending=[False, True])
#Renumber the rows from 0 using this frame's own length (the length of top_fac_stud was used before)
top_int_stud = top_int_stud.reset_index(drop=True)
top_int_stud.head()

To better understand and visualize the top universities according to each ratio, we plot the top 10 universities.

In [None]:
#First ten rows of each sorted frame, i.e. the ten highest ratios
best_fac = top_fac_stud.iloc[:10]
best_int = top_int_stud.iloc[:10]

In [None]:
#One horizontal bar per university in best_fac, coloured by its rank
sns.barplot(y="title", x="faculty/students", hue="rank_display", data=best_fac)
plt.title("Best universities by faculty to students ratio")
plt.show()

#Same chart for the international students ratio
sns.barplot(y="title", x="international/students", hue="rank_display", data=best_int)
plt.title("Best universities by international students to students ratio")
plt.show()

### Best universities by geographic location :
TODO : what happened to our code ?!

#### Best countries and regions

We understood that the question asked us to create a rating of the regions and countries with the most universities that were in the top 200. So a better region or country is not the one that has the highest university but the one that has the most.

In [None]:
#Number of universities per country
#Unfortunately, the index can't be used in seaborn, so the counted values are moved into a regular column
best_countries = merged_top['country'].value_counts().reset_index()
best_countries.columns = ['country', 'universities']

#Same counting, per region
best_regions = merged_top['region'].value_counts().reset_index()
best_regions.columns = ['region', 'universities']

There are only 5 regions so we will plot all of them, but for the countries, we will only take the 10 first countries and consider them the "best". Note that we have 34 countries total in the list.

In [None]:
#Keep only the ten countries with the most universities
best_countries = best_countries.head(10) #they are already ordered.

#One horizontal bar per country
sns.barplot(y='country', x='universities', data=best_countries)
plt.title("Best countries according to number of top universities")
plt.show()

#One horizontal bar per region (all regions are plotted)
sns.barplot(y="region", x="universities", data=best_regions)
plt.title("Regions according to number of top universities")
plt.show()

TODO: Keeping code below but not using it ^^"

It is time to aggregate the universities by country and region in order to sort them by ranking.

In [None]:
#Orders the universities by country, then by rank inside each country
#NOTE(review): ascending=False puts the numerically largest rank first within a country — confirm this is intended
top_country = merged_top.sort_values(['country', 'rank_display'], ascending=[True,False])
#The frame indexed by country is not assigned: top_country itself keeps its original index
top_country.set_index('country')

In [None]:
# Need to work on it (cf Above)
#Maximum of the 'score' column in each region, listed in decreasing order and renumbered from 0
top_region = (merged_top
              .groupby("region", as_index=False)["score"]
              .max()
              .sort_values("score", ascending=False)
              .reset_index(drop=True))
top_region.head()

## Exercise 2 : Times Higher Education

In the following exercise, we mostly follow the same steps as in the first exercise. Thus, we will not detail each step (except where the data is handled differently).

### Data retrieval

In [None]:
#Download the Times Higher Education ranking; the timeout keeps the cell from hanging forever
r = requests.get(TIMES_EDUCATION_JSON, timeout=30)
#Fail right away on an HTTP error rather than on an obscure JSON decoding error
r.raise_for_status()
timesjson = r.json()

In [None]:
#Keep the first 200 schools
times_frame = pd.DataFrame(timesjson['data']).head(200)

#Drop the columns we do not need: every column whose name contains 'score', then a few named ones
kept_columns = [column for column in times_frame.columns if 'score' not in column]
times_frame = times_frame[kept_columns]
for unused_column in ('member_level', 'nid', 'record_type', 'subjects_offered', 'stats_female_male_ratio'):
    del times_frame[unused_column]

#Keeping absolute order
times_frame['rank_order'] = times_frame['rank_order'].map(lambda raw_order: int(int(raw_order)/10))

#We display the format of the elements we retrieved to get a better visualization
times_frame.head()

In [None]:
#We create a file to store the Times Higher Education Ranking
times_file = DATA_FOLDER + 'times_ranking.pkl'
if osp.exists(times_file):
    #The cached frame must be bound to merged_times, the name used by the following cells.
    #Binding it to merged_top overwrote the TopUniversities data and left merged_times undefined.
    merged_times = load_pkl(times_file)
else:
    #NOTE(review): these are the TopUniversities URLs, so the scraped figures are paired
    #with the Times rows purely by position — confirm this is intended
    scraped = [get_additional_info(url) for url in top_frame.url]
    #Concatenated once (DataFrame.append no longer exists in recent pandas)
    missing_rows = pd.concat(scraped, ignore_index=True) if scraped else pd.DataFrame()
    #reindex() replaces the removed join_axes argument of pd.concat
    merged_times = pd.concat([times_frame, missing_rows], axis=1).reindex(top_frame.index)
    save_pkl(merged_times, times_file)

In [None]:
#Bare expression: its value is neither stored nor used by the next line
times_frame.shape
#Converts the 'rank' column of times_frame from strings to integers, in place
transform_rank(times_frame,'rank')

### Best universities by ratio :

This is the faculty/students ratio.

In [None]:
#The ratio column is written into merged_times itself; times_fac_stud is a three-column selection of it
#NOTE(review): the Times data is addressed through 'name' and 'rank' elsewhere in this notebook —
#confirm that 'title' and 'rank_display' exist in merged_times
ratios_calc(merged_times, 'faculty', 'students', 'faculty/students')
times_fac_stud = merged_times[['title', 'faculty/students', 'rank_display']]

In [None]:
#Highest ratio first; universities with the same ratio are ordered by rank
times_fac_stud = (times_fac_stud
                  .sort_values(by=['faculty/students', 'rank_display'], ascending=[False, True])
                  .reset_index(drop=True))
times_fac_stud.head()

This is the international/total ratio.

In [None]:
#The ratio column is written into merged_times itself; times_int_stud is a three-column selection of it
#NOTE(review): the Times data is addressed through 'name' and 'rank' elsewhere in this notebook —
#confirm that 'title' and 'rank_display' exist in merged_times
ratios_calc(merged_times, 'international students', 'students', 'international/students')
times_int_stud = merged_times[['title', 'international/students', 'rank_display']]
times_int_stud.head()

In [None]:
#Highest ratio first; universities with the same ratio are ordered by rank
times_int_stud = (times_int_stud
                  .sort_values(by=['international/students', 'rank_display'], ascending=[False, True])
                  .reset_index(drop=True))
times_int_stud.head()

This is the visualization of the best elements we have.

In [None]:
#Ten highest ratios of the Times ranking (the TopUniversities frames were selected here by mistake)
best_fac = times_fac_stud.head(10)
best_int = times_int_stud.head(10)

In [None]:
#One horizontal bar per university in best_fac, coloured by its rank
sns.barplot(y="title", x="faculty/students", hue="rank_display", data=best_fac)
plt.title("Best universities by faculty to students ratio")
plt.show()

#Same chart for the international students ratio
sns.barplot(y="title", x="international/students", hue="rank_display", data=best_int)
plt.title("Best universities by international students to students ratio")
plt.show()

### Best universities by geographic region :
TODO : same than before (but not defined yet)

Unlike the TopUniversities ranking, the Times Higher Education ranking does not include the region. Therefore, we decided to match the countries using the regions defined by the first ranking.

In [None]:
#Country -> region lookup table built from the TopUniversities frame
regions_by_country = {
    country: region
    for country, region in zip(top_frame['country'], top_frame['region'])
}

In [None]:
#Look up each Times location in the country -> region table
times_frame['region'] = times_frame['location'].map(regions_by_country)
#Rows whose location has no entry in the table
times_frame[times_frame['region'].isna()]

In [None]:
#Both are in Europe, and Luxembourg isn't present anyway
times_frame['region'] = times_frame['region'].fillna('Europe')
times_frame[times_frame['location'] == 'Luxembourg'] #assignment works :D
# Need to work on the details (do everything by hand)

## Exercise 3 :  Merging the two rankings

First, we try to merge the two frames in a naive manner, but we see that there is almost no overlap

In [None]:
#This is a bad idea: it only pairs universities whose name is spelled identically in both rankings
merged_frame = times_frame.merge(merged_top, how='outer', left_on='name', right_on='title')

So instead, we check if there is a python library to help us out. And there is! But we should probably ask if we are allowed to use this...

In [None]:
import difflib

def get_match(element, other):
    """Return the entry of `other` that is the closest match to `element`.

        element is the string to look for,
        other is the collection of candidate strings

    Returns the best candidate, or '' when no candidate is close enough.
    """
    #difflib gets the best matching elements and returns a list of possible matches in order of accuracy
    #The candidates come from the `other` argument, which was previously ignored
    #in favour of the global merged_top.title
    match = difflib.get_close_matches(element, other)
    return match[0] if match else ''

In [None]:
#Closest TopUniversities title for every Times university name ('' when nothing is close enough)
matches = times_frame['name'].map(lambda times_name: get_match(times_name, merged_top['title']))

In [None]:
#Store the matched title next to each Times row; it is the key used to merge the two rankings
times_frame['mergeindex'] = matches

In [None]:
#Outer join on the matched title, so that unmatched rows of both rankings are kept
merged_frame = times_frame.merge(merged_top, how='outer', left_on='mergeindex', right_on='title')
#Rows with at least one missing value
has_missing = merged_frame.isna().any(axis='columns')
nulls = merged_frame[has_missing]

In [None]:
#nulls_1: rows without a Times name; nulls_2: rows without a TopUniversities title
nulls_1 = nulls[nulls['name'].isna()]
nulls_2 = nulls[nulls['title'].isna()]
print(nulls_2['name'])
#Unmatched German universities of each ranking, side by side
german_titles = nulls_1.loc[nulls_1['country'] == 'Germany', 'title']
german_names = nulls_2.loc[nulls_2['location'] == 'Germany', 'name']
print(german_titles, german_names)


In [None]:
#Unmatched French universities of each ranking, side by side
french_titles = nulls_1.loc[nulls_1['country'] == 'France', 'title']
french_names = nulls_2.loc[nulls_2['location'] == 'France', 'name']
print(french_titles, french_names)

In [None]:
#Unmatched Swedish universities of each ranking, side by side
swedish_titles = nulls_1.loc[nulls_1['country'] == 'Sweden', 'title']
swedish_names = nulls_2.loc[nulls_2['location'] == 'Sweden', 'name']
print(swedish_titles, swedish_names)

We see that the only possible improvement is to combine LMU Munich and Ludwig-Maximilians-Universität München. As this is a single row, we just do it manually.

In [None]:
#TODO: add rank? I don't think we need anything else, all other values are junk that won't be needed for question 4 and 5

## Exercise 4 : Exploratory Analysis

## Exercise 5 : Best Universities