# Homework 2

We first import the needed libraries and set the elements we need for the rest of the Homework

In [None]:
# Used to look for the saved rankings
# Relative folder; the trailing slash matters because file names are appended
# to it by plain string concatenation.
DATA_FOLDER = 'Data/'

In [None]:
# Importing libraries
import re #for regexes
import requests
import numpy as np
import unicodedata #useful for ex 3
import pickle as pk
import pandas as pd
import seaborn as sns
import os, os.path as osp
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt

In [None]:
# Global seaborn settings applied to the plots drawn afterwards:
# dark grid style, "notebook" context and a 10-colour "RdPu_d" default palette.
sns.set(style="darkgrid")
sns.set_context("notebook")
sns.set_palette("RdPu_d", 10)

## Useful functions

The following functions are helper functions allowing us to have a cleaner code

In [None]:
#Functions allowing us to manipulate Pickle files (better running time)
def save_pkl(obj, path):
    """Pickle ``obj`` into the file located at ``path``.

    The file is opened in binary write mode, so an existing file is replaced.
    """
    with open(path, mode='wb') as handle:
        pk.dump(obj, handle)
        
def load_pkl(path):
    """Open the file at ``path`` in binary mode and return the unpickled object."""
    with open(path, mode='rb') as handle:
        loaded = pk.load(handle)
    return loaded

In [None]:
#Functions allowing us to clean our data
def transform_rank(df, cname):
    """Turn the textual rank column ``cname`` of ``df`` into integer ranks, in place.

    The column ``cname`` is renamed to ``relative_rank`` and every non-digit
    character is stripped from its values before they are converted to int.
    A new ``absolute_rank`` column is set to ``df.index + 1``, which is the
    1-based row position when ``df`` carries a default 0-based index.

    Parameters:
        df: DataFrame to modify in place.
        cname: name of the column holding the rank as strings.

    Returns:
        None; ``df`` is modified in place.

    Raises:
        TypeError: if the rank column does not hold strings.
    """
    df.rename(columns={cname: 'relative_rank'}, inplace=True)
    df['absolute_rank'] = df.index + 1

    ranks = df['relative_rank']
    # The .str accessor used below only works on string data, so fail early
    # with an explicit error (an assert would be stripped under ``python -O``).
    if not (pd.api.types.is_object_dtype(ranks) or pd.api.types.is_string_dtype(ranks)):
        raise TypeError("We do not have strings in " + 'relative_rank')

    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the non-digit characters
    # in place and make the int conversion fail.
    df['relative_rank'] = ranks.str.replace(r'\D+', '', regex=True).astype('int')
    
def remove_blank_convert_float(x):
    """Parse the text of a scraped element into a float.

    Parameters:
        x: an object exposing its content through a ``.text`` attribute, or a
           falsy value (e.g. ``None``) when the element was not found.

    Returns:
        The parsed number, or NaN when ``x`` is falsy or its text is blank.
        NaN is used for unknown values to facilitate later computations.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    if not x:
        return float('NAN')

    # strip() removes however much whitespace surrounds the number; slicing
    # with [1:-1] would cut real digits when there is no padding at all.
    text = x.text.strip().replace(",", "")
    if not text:
        return float('NAN')
    return float(text)

In [None]:
#As we need to compute several ratios, we decided to modularize this operation
def ratios_calc(df, c1, c2, name):
    """Add to ``df`` a column holding the element-wise ratio of two columns.

        df is the DataFrame (modified in place),
        c1 is the name of the numerator column,
        c2 is the name of the denominator column,
        name is the name of the column receiving c1 / c2
    """
    numerator = df[c1]
    denominator = df[c2]
    df[name] = numerator / denominator

## Preparatory steps

The first step was a preliminary exploration of the website's responses and code using Postman and the Google Chrome dev tools.

- For the Times, looking at the JSON response from the ranking website immediately led to the right data.
- For TopUniversities, we searched for the occurrence of some string data on Postman, leading to the source of the data.
- To find the additional information needed in the TopUniversities set, we used the Google dev tools to quickly check which classes pointed to the div tags containing the required information.

In [None]:
#Base URL for websites to crawl
# FILES is the path prefix shared by the data files of both sites.
FILES = '/sites/default/files/'
TOP_UNIVERSITIES = 'https://www.topuniversities.com'
TIMES_EDUCATION = 'https://www.timeshighereducation.com'

#These are the main URLs we will be working with
# NOTE: the spelling "UNIVERVERSITIES" is kept as is because the request cell
# refers to the constant under this exact name.
TOP_UNIVERVERSITIES_TEXT = TOP_UNIVERSITIES + FILES + 'qs-rankings-data/357051.txt'
TIMES_EDUCATION_JSON = TIMES_EDUCATION + FILES + 'the_data_rankings/world_university_rankings_2018_limit0_369a9045a203e176392b9fb8f8c1cb2a.json'

## Exercise 1 : TopUniversities

### Data retrieval
The first step is pretty straightforward. We simply request the JSON and create a DataFrame containing all the needed information offered by the request.

In [None]:
# Download the TopUniversities ranking and decode its JSON payload.
# The timeout prevents the cell from hanging forever, and raise_for_status()
# turns an HTTP error into an explicit exception instead of a confusing
# JSON decoding failure on an error page.
s = requests.get(TOP_UNIVERVERSITIES_TEXT, timeout=30)
s.raise_for_status()
topjson = s.json()

In [None]:
# Keep the first 200 entries and drop the columns we have no use for.
top_frame = pd.DataFrame(topjson['data']).head(200)
top_frame.drop(['cc', 'logo', 'nid', 'core_id', 'stars', 'guide'],
               axis=1, inplace=True)

# Clean the (relative) rank for comparison and add an (absolute) rank for display.
transform_rank(top_frame, 'rank_display')

# Quick look at the data.
top_frame.head()

The second step is a little more complicated, which is why we decided to include an example of how additional data should be retrieved in order to better understand our methodology.

In [None]:
# Worked example on a single university page.
r = requests.get(TOP_UNIVERSITIES + '/universities/university-cambridge')
page_body = r.text
soup = BeautifulSoup(page_body, 'html.parser')

# Each figure sits in a "number" div nested inside a div identified by its class.
# strip() removes the whitespace surrounding the number whatever its length
# (slicing with [1:-1] would cut digits if the padding ever changed).
student_number = soup.find('div', class_='total student').find('div', class_='number').text.strip()
int_student = soup.find('div', class_='total inter').find('div', class_='number').text.strip()
facult_number = soup.find('div', class_='total faculty').find('div', class_='number').text.strip()
int_faculty = soup.find('div', class_='inter faculty').find('div', class_='number').text.strip()

#We create a new DataFrame to visualize the new information
pd.DataFrame.from_dict({'students' : [student_number],
              'international students': [int_student],
              'faculty' : [facult_number],
              'international faculty' : [int_faculty] })

The following function allows us to get all the useful information. As asked, we make sure that we have the name, rank, country, region, number of faculty members (international and total) and number of students (international and total) in the final DataFrame.

In [None]:
def get_additional_info(url):
    """Scrape the student and faculty figures from a university page.

    Parameters:
        url: path of the university page, appended to ``TOP_UNIVERSITIES``.

    Returns:
        A one-row DataFrame with the columns ``students``,
        ``international students``, ``faculty`` and ``international faculty``.
        A figure that cannot be found on the page is reported as NaN.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (any 4xx/5xx, not only 404).
    """
    r = requests.get(TOP_UNIVERSITIES + url, timeout=30)
    # Explicit check instead of an assert: asserts are stripped under
    # ``python -O`` and the previous one let every error but 404 through,
    # which would have produced a silent all-NaN row.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Output column -> class of the div wrapping the corresponding figure.
    css_class_by_column = {
        'students': 'total student',
        'international students': 'total inter',
        'faculty': 'total faculty',
        'international faculty': 'inter faculty',
    }

    values = {}
    for column, css_class in css_class_by_column.items():
        # We make sure the wrapper exists before looking for the number inside it.
        wrapper = soup.find('div', class_=css_class)
        number = wrapper.find('div', class_='number') if wrapper else None
        values[column] = [remove_blank_convert_float(number)]

    return pd.DataFrame.from_dict(values)

In [None]:
#Takes a relatively long time to run the first time

#We create a folder to store all the data.
os.makedirs(DATA_FOLDER, exist_ok=True)

#We create a file to store the TopUniversities Ranking
top_file = DATA_FOLDER + 'top_ranking.pkl'
if osp.exists(top_file):
    # Cached result of a previous run: no need to crawl the pages again.
    merged_top = load_pkl(top_file)
else:
    # One single-row frame per university page, concatenated once at the end.
    # (DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.)
    missing_rows = pd.concat([get_additional_info(url) for url in top_frame.url],
                             ignore_index=True)
    #We simply concatenate all the information and sort them in order to have a better layout similar to the one below
    # reindex(top_frame.index) keeps exactly the rows of top_frame, which is
    # what the removed ``join_axes`` argument used to do.
    merged_top = pd.concat([top_frame, missing_rows], axis=1).reindex(top_frame.index)
    merged_top.rename(columns={'title': 'name'}, inplace=True)
    # reindex(columns=...) replaces the removed reindex_axis(..., axis=1).
    merged_top = merged_top.reindex(columns=['country', 'region', 'name', 'score', 'relative_rank',
                                             'absolute_rank', 'international faculty', 'faculty',
                                             'international students', 'students', 'url'])
    save_pkl(merged_top, top_file)

We have undefined values in the "merged_top" set due to the lack of information on the website, so we simply propagate NAN values.

### Best universities by ratio :
We first determine which universities are the best according to the faculty/student ratio (and display the top of the list). To do that, we create a new DataFrame only containing the relevant information (university, ratio and relative rank for display).

In [None]:
# NOTE: this binds a second name to merged_top (no copy is made), so the
# 'faculty/students' column added by ratios_calc also appears in merged_top.
top_fac_stud = merged_top
ratios_calc(top_fac_stud , 'faculty', 'students', 'faculty/students')
# Keep only the columns needed for display.
top_fac_stud = top_fac_stud[['name', 'faculty/students', 'relative_rank']]

In [None]:
# Highest ratio first; equal ratios are ordered by ascending relative rank.
top_fac_stud = (
    top_fac_stud
    .sort_values(by=['faculty/students', 'relative_rank'], ascending=[False, True])
    .reset_index()
)
top_fac_stud.head()

The other important ratio used to determine the quality of a school is international/total students. We use a similar method as before to obtain these numbers.

In [None]:
# NOTE: same object as merged_top (no copy), so the 'international/students'
# column added by ratios_calc also appears in merged_top.
top_int_stud = merged_top
ratios_calc(top_int_stud,'international students', 'students', 'international/students')
# Keep only the columns needed for display.
top_int_stud = top_int_stud[['name', 'international/students', 'relative_rank']]

In [None]:
# Highest ratio first; equal ratios are ordered by ascending relative rank.
top_int_stud = (
    top_int_stud
    .sort_values(by=['international/students', 'relative_rank'], ascending=[False, True])
    .reset_index()
)
top_int_stud.head()

To better understand and visualize the top universities according to each ratio, we plot the top 10 universities.

In [None]:
# The frames are already sorted by decreasing ratio, so the first 10 rows
# are the 10 best universities for each ratio.
best_fac = top_fac_stud.head(10)
best_int = top_int_stud.head(10)

In [None]:
# plt.get_cmap is available on every matplotlib release, whereas
# plt.cm.get_cmap was removed in matplotlib 3.9.
cmap = plt.get_cmap('Blues')


def _rank_colour(x):
    """Map a relative rank to a colour of ``cmap`` (same formula for both plots)."""
    return cmap(1-np.sqrt(1+(x - 200)/200))


# The palette is passed as a plain list holding one colour per bar.
sns.barplot(y="name", x="faculty/students",
            palette=best_fac['relative_rank'].map(_rank_colour).tolist(), data=best_fac)
plt.title("Best universities by faculty to students ratio")
plt.show()

sns.barplot(y="name", x="international/students",
            palette=best_int['relative_rank'].map(_rank_colour).tolist(), data=best_int)
plt.title("Best universities by international students to students ratio")
plt.show()

### Best universities by geographic location :
We begin by sorting the countries by their number of universities in the 200 top schools.

In [None]:
# Number of universities per country, most represented country first.
country_counts = merged_top['country'].value_counts()
best_countries = pd.DataFrame({'country': country_counts.index,
                               'universities': country_counts.values})
best_countries.head()

We perform the same actions for the regions we have

In [None]:
# Number of universities per region, most represented region first.
region_counts = merged_top['region'].value_counts()
best_regions = pd.DataFrame({'region': region_counts.index,
                             'universities': region_counts.values})
best_regions.head()

Finally we plot the best regions and countries.

In [None]:
# Only the 10 most represented countries are plotted.
best_countries = best_countries.head(10)
# head() without argument keeps the first 5 rows.
best_regions = best_regions.head() #works as there are only 5 regions

In [None]:
# Horizontal bar chart: one bar per country, length = number of universities.
sns.barplot(y='country', x='universities', data=best_countries)
plt.title("Best countries according to number of top universities")
plt.show()

# Same chart per region.
sns.barplot(y="region", x="universities", data=best_regions)
plt.title("Regions according to number of top universities")
plt.show()

## Exercise 2 : Times Higher Education

In the following exercise, we mostly follow the same steps as in the first exercise. Thus, we will not detail each step (except where the data is handled differently).

### Data retrieval

In [None]:
# Download the Times Higher Education ranking and decode its JSON payload.
# The timeout prevents the cell from hanging forever, and raise_for_status()
# turns an HTTP error into an explicit exception instead of a confusing
# JSON decoding failure on an error page.
r = requests.get(TIMES_EDUCATION_JSON, timeout=30)
r.raise_for_status()
timesjson = r.json()

In [None]:
# Keep the first 200 schools.
times_frame = pd.DataFrame(timesjson['data']).head(200)

# Drop every column whose name contains 'score', then the other unused columns.
kept_columns = [d for d in times_frame.columns if 'score' not in d]
times_frame = times_frame[kept_columns]
times_frame.drop(['member_level', 'nid', 'record_type', 'subjects_offered',
                  'stats_female_male_ratio', 'rank_order', 'aliases'],
                 axis=1, inplace=True)

# Same rank clean-up and column names as for the TopUniversities frame.
transform_rank(times_frame, 'rank')
times_frame.rename(
    columns={
        'stats_number_students': 'students',
        'stats_pc_intl_students': 'international/students',
        'stats_student_staff_ratio': 'faculty/students',
        'location': 'country',
    },
    inplace=True,
)

times_frame.head()

### Data cleaning :

There is a very big difference between the TopUniversities ranking and the Times Higher Education ranking when it comes to ratios: in the latter, they are already provided to us. For this reason, we only need to format these elements so they can be easily used. We then display the head to ensure the results are sound.

In [None]:
# The column was renamed from 'stats_student_staff_ratio', so it is inverted
# here to obtain the faculty/students ratio.
times_frame['faculty/students'] = 1/times_frame['faculty/students'].astype('float')
# The share of international students comes as a string ending with '%':
# remove the sign, parse as int and rescale to a fraction.
times_frame['international/students'] = (1/100) * times_frame['international/students'].str.replace('%', '').astype('int')
times_frame.head()

The Times Higher Education ranking also does not include the region. We therefore match the countries using the regions defined by the first ranking.

In [None]:
# country -> region lookup built from the TopUniversities frame; when a country
# appears several times, the region of its last row is the one kept.
regions_by_country = dict(zip(top_frame.country, top_frame.region))

In [None]:
# Countries missing from the lookup get a NaN region...
times_frame['region'] = times_frame.country.map(regions_by_country)
# ...so we display the rows that still need a region.
times_frame[times_frame.region.isnull()]

In [None]:
#Both universities are in Europe, so we simply "hardcode" them
# NOTE(review): this assigns 'Europe' to every row whose region is still
# missing; check the output of the previous cell before re-running on new data.
times_frame.loc[times_frame.region.isnull(), 'region'] = 'Europe' 
times_frame.head()

Once our data is ready, we save all the needed elements in a new file

In [None]:
#We create a file to store the Times Higher Education Ranking
times_file = DATA_FOLDER + 'times_ranking.pkl'
if osp.exists(times_file):
    # Cached result of a previous run.
    times_frame = load_pkl(times_file)
else:
    # reindex(columns=...) selects and orders the columns; it replaces
    # reindex_axis(..., axis=1), which no longer exists in current pandas.
    times_frame = times_frame.reindex(columns=['country', 'region', 'name', 'relative_rank',
                                               'absolute_rank', 'faculty/students', 'international/students',
                                               'students', 'url'])
    save_pkl(times_frame, times_file)

### Best universities by ratio :

We visualize the best universities according to their ratios (after sorting them).

In [None]:
# Keep only the columns needed for the plot.
best_fac = times_frame[['name', 'faculty/students', 'relative_rank']]
# Sort the selection itself (sorting times_frame here would throw the column
# selection above away): highest ratio first, ties ordered by ascending rank.
best_fac = best_fac.sort_values(['faculty/students', 'relative_rank'], ascending=[False, True])
# Renumber the rows from 0 after sorting.
best_fac.index = range(len(best_fac.index))
best_fac = best_fac.head(10)

In [None]:
# Start from the Times frame: reusing the previous ``best_int`` would plot the
# TopUniversities top 10 from Exercise 1 instead of the Times ranking.
best_int = times_frame[['name', 'international/students', 'relative_rank']]
# Highest ratio first, ties ordered by ascending relative rank.
best_int = best_int.sort_values(['international/students', 'relative_rank'], ascending=[False, True])
best_int = best_int.reset_index()
best_int = best_int.head(10)

In [None]:
# plt.get_cmap is available on every matplotlib release, whereas
# plt.cm.get_cmap was removed in matplotlib 3.9.
cmap = plt.get_cmap('Blues')


def _rank_colour(x):
    """Map a relative rank to a colour of ``cmap`` (same formula for both plots)."""
    return cmap(1-np.sqrt(1+(x - 200)/200))


# The palette is passed as a plain list holding one colour per bar.
sns.barplot(y="name", x="faculty/students",
            palette=best_fac['relative_rank'].map(_rank_colour).tolist(), data=best_fac)
plt.title("Best universities by faculty to students ratio")
plt.show()

sns.barplot(y="name", x="international/students",
            palette=best_int['relative_rank'].map(_rank_colour).tolist(), data=best_int)
plt.title("Best universities by international students to students ratio")
plt.show()

### Best universities by geographic region :

Unlike the TopUniversities ranking, the Times Higher Education ranking does not include the region. We therefore matched the countries using the regions defined by the first ranking.

This is the ranking by country.

In [None]:
# Number of universities per country, most represented country first.
country_counts = times_frame['country'].value_counts()
best_countries = pd.DataFrame({'country': country_counts.index,
                               'universities': country_counts.values})
best_countries.head()

This is the ranking by region.

In [None]:
# Number of universities per region, most represented region first.
region_counts = times_frame['region'].value_counts()
best_regions = pd.DataFrame({'region': region_counts.index,
                             'universities': region_counts.values})
best_regions.head()

Finally, we plot our results

In [None]:
# Only the 10 most represented countries are plotted.
best_countries = best_countries.head(10)
# head() without argument keeps the first 5 rows.
best_regions = best_regions.head()

In [None]:
# Horizontal bar chart: one bar per country, length = number of universities.
sns.barplot(y='country', x='universities', data=best_countries)
plt.title("Best countries according to number of top universities")
plt.show()

# Same chart per region.
sns.barplot(y="region", x="universities", data=best_regions)
plt.title("Regions according to number of top universities")
plt.show()

## Exercise 3 :  Merging the two rankings

### Naive Merge

We tried to merge the two DataFrames in a naive manner with a simple join, and found that barely any columns matched.

We also tried to use difflib, a Python library, but it gives us false positives for certain universities. If it gave us false negatives, it wouldn't be a huge problem, but the opposite is one as we don't want any errors in our mergers. 

Hence, we created our own method. The idea behind this is that non-alphanumeric characters in the strings make it so that the strings are not an exact match. It also happens that some university names in one DataFrame are a substring of those in the other.

For some languages (French and German for example), accented, non-ASCII characters (like é and è or ö and ü) impede the matching. Thus, removing the accents may increase our matching rate, without giving us any false positives.

In [None]:
def ascii(s):
    """Reduce a university name to a bare key used for matching.

    The steps are applied in this order:
      1. every run of non-word characters (spaces, punctuation, ...) is removed;
      2. every occurrence of the substring "at" is removed, wherever it is,
         since the words are already glued together at that point;
      3. combining accents are dropped, e.g. "é" becomes "e".

    NOTE: the name shadows the built-in ``ascii``; it is kept because other
    cells call the function under this name.
    """
    # Raw string: "\W" inside a normal literal is an invalid escape sequence,
    # reported as a warning by recent Python versions.
    s = re.sub(r"\W+", "", s)
    s = s.replace("at", '')
    # transforms é with e for example: NFD splits a letter from its accent,
    # and the accent (category 'Mn') is then filtered out.
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

# Both name columns go through the same pipeline (lower-case, then ascii())
# so that they can be compared with each other.
top_frame['title'] = top_frame['title'].map(lambda title: ascii(title.lower()))
times_frame['name'] = times_frame['name'].map(lambda name: ascii(name.lower()))

We create a dictionary with the different matches, making sure to fill it from both sides (check if a is in b and if b is in a). As we sometimes have multiple matches, it is best not to do anything right now at the risk of getting false positives.

In [None]:
# Maps a Times name to the TopUniversities title it was matched with.
university_name = dict()

top_titles = top_frame['title']
for name in times_frame['name']:
    # Titles containing the Times name; computed once per name.
    # regex=False: the name is searched as plain text.
    candidates = top_titles[top_titles.str.contains(name, regex=False)]
    if len(candidates) == 1:
        # A single candidate: direct match.
        university_name[name] = candidates.item()
    elif len(candidates) > 1 and (top_titles == name).any():
        #if we have multiple possibilities, take exact match if it exists
        # An exact match is the name itself; store the string (not a Series)
        # so that the dictionary can later be inverted and used with map().
        university_name[name] = name
#Number of matches.
len(university_name)

In [None]:
# Mostly the same but the other way around: look for each TopUniversities
# title inside the Times names. Keys stay Times names, values stay titles.
times_names = times_frame['name']
for name in top_frame['title']:
    # Times names containing the title; computed once per title.
    # regex=False: the title is searched as plain text.
    candidates = times_names[times_names.str.contains(name, regex=False)]
    if len(candidates) == 1:
        university_name[candidates.item()] = name
    elif len(candidates) > 1 and (times_names == name).any():
        # Several candidates: only keep the exact match, which is the title itself.
        university_name[name] = name
#Number of matches.
len(university_name)

As we can see, we only have 148 matches, which leaves us with (at least) 52 unmatched universities.

In [None]:
# Matched TopUniversities title for each Times row; NaN when no match was found.
times_frame['join_here'] = times_frame.name.map(university_name)
# Display the matches for a visual check.
university_name

### Specific Cases

We decided to take a look at all the unmatched universities to see if we can do better.

In [None]:
# Times rows without a match.
still_na = times_frame[times_frame['join_here'].isnull()]
# If several Times names point to the same title, the last one wins here.
inv_unimap = {v: k for k, v in university_name.items()} #inverse dictionary
top_frame['join_here'] = top_frame.title.map(inv_unimap)
# TopUniversities rows without a match.
top_na = top_frame[top_frame['join_here'].isnull()]

We go through each country to check whether its unmatched universities could still be matched or whether they exist in only one of the rankings. This will allow us to understand the difficulties in each one of them. We could use the aliases, but in practice they do not match, so we treat these universities separately below.

In [None]:
#This set gives us the set of countries for which both frames have unmatched universities
# (the group sizes are only used to list the distinct countries of each frame)
na_countries = set(top_na.groupby('country').size().index).intersection(set(still_na.groupby('country').size().index))
na_countries

In [None]:
def print_universities(country):
    """Print the still-unmatched university names of ``country`` in each ranking.

    Reads the ``top_na`` and ``still_na`` frames defined at notebook level.
    """
    top_names = top_na.loc[top_na.country == country, 'title']
    times_names = still_na.loc[still_na.country == country, 'name']

    print(country, "\n")
    print("TopUniversities university names :")
    print(top_names)
    print("\nTimes Higher Education university names")
    print(times_names)

Concerning Belgium, we have no possible matches between the 2 rankings.

In [None]:
print_universities("Belgium")

In Canada, the main issue of merging is the language.

In [None]:
print_universities("Canada")

Once again in France, the only possible merges are prevented because of the language.

In [None]:
print_universities("France")

We see that matchings in Germany can be improved a lot because of naming conventions in Germany.

In [None]:
print_universities("Germany")

This university always matches with 2 other universities (needs to be done manually).

In [None]:
print_universities("Hong Kong")

The only string that can be matched is not written in the same way in both rankings.

In [None]:
print_universities("South Korea")

Once again, universities in Spain cannot be merged because of translation issues.

In [None]:
print_universities("Spain")

These universities are all distinct.

In [None]:
print_universities("Sweden")

Finally, in the United Kingdom, we have no possible matches between the universities.

In [None]:
print_universities("United Kingdom")

We tried to match the universities using their number of students. However, we were unable to use this method because the difference between the number of students between the two rankings was too big.

After this, we decided to match them manually by adding them to the dictionary.

In [None]:
# Manual matches, written as {Times name: TopUniversities title}.
university_name.update({
    'universityofmontreal': 'universitedemontreal',
    'pierreandmariecurieuniversity': 'universitepierreetmariecurieupmc',
    'lmumunich': 'ludwigmaximiliansuniversitatmunchen',
    'humboldtuniversityofberlin': 'humboldtuniversitatzuberlin',
    'freeuniversityofberlin': 'freieuniversitaetberlin',
    'heidelberguniversity': 'ruprechtkarlsuniversitatheidelberg',
    'universityoftubingen': 'eberhardkarlsuniversitattubingen',
    'technicaluniversityofberlin': 'technischeuniversitatberlintuberlin',
    'universityoffreiburg': 'albertludwigsuniversitaetfreiburg',
    'universityofhongkong': 'theuniversityofhongkong',
    'koreaadvancedinstituteofscienceandtechnologykaist': 'kaistkoreaadvancedinstituteofsciencetechnology',
    'autonomousuniversityofbarcelona': 'universitautonomadebarcelona',
})

### Final merging

We start by adding a column to enable the merging of the two rankings.

In [None]:
# Recompute the join column so that it reflects the current content of university_name.
times_frame['join_here'] = times_frame.name.map(university_name)

In [None]:
# Normalise the TopUniversities names the same way as the matching keys.
# NOTE(review): ascii() removes every "at" it finds, so running this cell a
# second time on already-normalised names may strip more characters - run it once.
merged_top.name = merged_top.name.map(str.lower).map(ascii)

# Outer join: universities present in only one ranking are kept as well.
merged_frame = pd.merge(times_frame, 
                        merged_top, how='outer', left_on=['join_here'], right_on=['name'])

In [None]:
merged_frame

## Exercise 4 : Exploratory Analysis

## Exercise 5 : Best Universities