# Homework 2 - Montero, Peterssen, Brousse

## Part 1 - Getting links to the lists of students enrolled in CS from 2007 to 2017

First, import the libraries required for the homework.

In [1]:
from bs4 import BeautifulSoup
import requests
import re

Also, define some constants and utility methods.

In [2]:
# The assignment provides a single URL; it is split into a shared base and
# page-specific suffixes so that the base can be reused for the other ISA pages.
isa_base = 'http://isa.epfl.ch/imoniteur_ISAP/'
isa_report_page = '{}%21gedpublicreports.html'.format(isa_base)
isa_landing_page = '{}%21gedpublicreports.htm?ww_i_reportmodel=133685247'.format(isa_base)

In [3]:
def fetch(url, args=None):
    '''GET a web page and parse it.

    url  -- address of the page to download
    args -- optional dict of query-string parameters sent along with the request

    Returns a BeautifulSoup built from the response body with the
    'html.parser' backend, or None (after printing the HTTP status code)
    when the server does not answer with the "ok" status.
    '''
    # pass the query parameters by keyword: same request, but explicit about
    # what the second argument means
    req = requests.get(url, params=args)
    if req.status_code != requests.codes.ok:
        # keep a space between the label and the code ("Error: code 404")
        print('Error: code ' + str(req.status_code))
        return None

    return BeautifulSoup(req.text, 'html.parser')

Find the form in the web page.

In [4]:
# The search form is not in the landing page itself but in the document
# loaded by the frame named 'toc', so resolve that frame's URL first.
main_soup = fetch(isa_landing_page)
frame = main_soup.find('frame', {'name': 'toc'})
frame_url = frame['src']

# Download the frame's own document and pick the form named 'f' out of it.
frame_soup = fetch(isa_base + frame_url)
form = frame_soup.find('form', {'name': 'f'})

# The 'action' of the form is where the values go when the user clicks 'send'.
# Like the frame URL above, it has to be appended to the base ISA url.
form_action = isa_base + form['action']

It is now time to prepare the arguments we want to pass to the form. To find them, we had to use the `Interceptor` add-on and explore a bit.<br>
We will get ALL the possible lists of students for the section 'Informatique'. This is done with the following arguments (taken from our Postman exploration):

In [5]:
# Fixed part of the query: report model, output format and the wanted section.
args = {
    'ww_b_list': '1',
    'ww_i_reportmodel': '133685247',
    'ww_c_langue': '',
    'ww_i_reportModelXsl': '133685270',  # for html
    'zz_x_UNITE_ACAD': 'Informatique',
    'ww_x_UNITE_ACAD': '249847',  # this is Computer science section
}

# Filters left open so that all years, all semesters and both the summer and
# winter periods are listed: an empty zz_x_* value paired with a 'null' ww_x_* value.
for _open_filter in ('PERIODE_ACAD', 'PERIODE_PEDAGO', 'HIVERETE'):
    args['zz_x_' + _open_filter] = ''
    args['ww_x_' + _open_filter] = 'null'

args['dummy'] = 'ok'  # (we just accept that fact...)

We will now send our request to get all the links to the lists. This is done by filling the form with our custom values, sending it and collecting the links.

In [6]:
# submit the filled-in form; the answer lists one link per available student list
form_send_soup = fetch(form_action, args)

# every list is exposed as an anchor of class 'ww_x_GPS'
links = form_send_soup.find_all('a', attrs={'class' : 'ww_x_GPS'})

# the key of a list is the number that follows '=' in the anchor's onclick handler
pattern = r".*\w+=(\d+).*"

# keep a small dict (description + key) per link, leaving out the "Tous" link
clean_links = []
for a in links:
    if a.text == 'Tous':
        continue
    clean_links.append({
        'desc': a.text.strip(),
        'key': re.match(pattern, a.attrs['onclick']).group(1),
    })

# For demonstration, we include the head of the extracted links.
clean_links[:5]

[{'desc': 'Echange IN, 2005-2006, Semestre automne', 'key': '24065660'},
 {'desc': 'Echange IN, 2005-2006, Semestre printemps', 'key': '24065672'},
 {'desc': 'Echange IN, 2006-2007, Semestre automne', 'key': '39467937'},
 {'desc': 'Echange IN, 2006-2007, Semestre printemps', 'key': '39467957'},
 {'desc': 'Echange IN, 2007-2008, Semestre automne', 'key': '71454914'}]

Now, we want to keep only the links whose description matches:<br>
`Informatique, [period], [degree] semestre [semester]`.<br>
To this effect, we match the string above to extract the wanted information, and keep the key from before:

In [7]:
# Wanted descriptions read 'Informatique, <start>-<end>, <degree> semestre <n>'.
# re.match anchors at the beginning of the string, so descriptions that start
# differently (e.g. 'Echange IN, ...') do not match and are left out.
pattern = r"Informatique, (\d+)-(\d+), (\w+) semestre (\d)"
matched_links = ((re.match(pattern, link['desc']), link) for link in clean_links)
detailed_links = [
    {
        'start': int(match.group(1)),
        'end': int(match.group(2)),
        'degree': match.group(3),
        'semester': match.group(4),
        'key': link['key'],
    }
    for match, link in matched_links
    if match
]

# The first element is:
detailed_links[0]

{'degree': 'Bachelor',
 'end': 2005,
 'key': '2225262',
 'semester': '1',
 'start': 2004}

We can now filter the links to keep only the valid ones.<br>
The conditions for validity are: start_year >= 2007 and end_year <= 2017 (i.e. now).

In [8]:
# Keep the lists whose academic year lies entirely within 2007-2017:
# drop anything that starts before 2007 or ends after 2017.
valid_links = [
    link for link in detailed_links
    if not (link['start'] < 2007 or link['end'] > 2017)
]

# The first item is no longer before 2007, as wanted
valid_links[0]

{'degree': 'Bachelor',
 'end': 2008,
 'key': '71297531',
 'semester': '1',
 'start': 2007}

## Part 2 - Fetching the lists of students from the links

We will now extract the list of students for each of the semesters given by the links.<br>
The functions are chained, each one calling the next, and are listed from top-level to low-level for clarity.

In [9]:
def fetch_student_list_from_link(link):
    '''Download the report designated by link['key'] and return its parsed students.'''
    # the shared query arguments are reused as they are; only the report key
    # is set (or overwritten) for the current link
    args['ww_x_GPS'] = link['key']
    report_soup = fetch(isa_report_page, args)
    return parse_student_list(report_soup, link)

def parse_student_list(soup, link):
    '''Turn a parsed report page into a list of student dicts.

    soup -- parsed report page as returned by fetch(), or None when the
            download failed
    link -- dict describing the list the page belongs to; it is handed to
            parse_student() for every student row

    Returns one dict per student row (see parse_student), or an empty list
    when there is no page or the page has no header row.
    '''
    # fetch() returns None after an HTTP error (and has already printed it):
    # report no students for this list instead of crashing the whole scrape
    if soup is None:
        return []

    # in html, rows of a table are called 'tr'
    all_rows = soup.find_all('tr')

    # the headers of each column are in the second row of the table;
    # without that row there is nothing that can be parsed
    if len(all_rows) < 2:
        return []
    headers = [h.text for h in all_rows[1].find_all('th')]

    # the students are stored in the rows that follow the header row
    return [parse_student(headers, row, link) for row in all_rows[2:]]

def parse_student(headers, row, link):
    '''Build the dict describing one student from one row of the report table.

    headers -- column titles, in the same order as the cells of the row
    row     -- the row ('tr') holding the student
    link    -- dict of the list the row comes from; its 'start', 'end',
               'degree' and 'semester' entries are copied into the result

    Returns a dict with those four entries plus one entry per column, keyed
    by the column title and holding the text of the matching cell.
    '''
    # in html, individual cells of a row of a table are called 'td'
    cells = row.find_all('td')

    # start from the information that comes from the list itself ...
    student = {field: link[field] for field in ('start', 'end', 'degree', 'semester')}

    # ... then add the text of every cell under the title of its column
    student.update(zip(headers, (cell.text for cell in cells)))
    return student

As an example, let's parse the first student of the first link (2007-2008 BA1):

In [10]:
# download the list behind the first valid link and show its first student
first_list = fetch_student_list_from_link(valid_links[0])
first_list[0]

{'Civilité': 'Monsieur',
 'Ecole Echange': '',
 'Filière opt.': '',
 'Mineur': '',
 'No Sciper': '169569',
 'Nom Prénom': 'Arévalo\xa0Christian',
 'Orientation Bachelor': '',
 'Orientation Master': '',
 'Spécialisation': '',
 'Statut': 'Présent',
 'Type Echange': '',
 'degree': 'Bachelor',
 'end': 2008,
 'semester': '1',
 'start': 2007}

Now, we would like to concatenate <b>all</b> students from all lists. This is done by iterating over all the valid links:

In [11]:
# one download per valid link; the result holds one list of students per link
students_in_classes = list(map(fetch_student_list_from_link, valid_links))

The problem now is that `students_in_classes` is a list of lists of students. We want to have only a single list containing all the students, without sublevels. The solution for flattening is taken from [stackoverflow](http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python).

In [12]:
# flatten: append the students of every class, in order, to a single list
students = []
for class_list in students_in_classes:
    students.extend(class_list)

## Part 3 - Statistical analysis

In [13]:
import pandas as pd

In [14]:
# columns of the scraped records that are useful for the analysis
kept_columns = ['Civilité', 'No Sciper', 'Nom Prénom', 'start', 'end',
                'degree', 'semester', 'Statut', 'Spécialisation', 'Mineur']

# English names for the columns that carry a French title
english_names = {
    'Civilité': 'title',
    'No Sciper': 'sciper',
    'Nom Prénom': 'name',
    'Statut': 'status',
    'Spécialisation': 'spec',
    'Mineur': 'minor',
}

# put the students into a data frame, restricted to the useful columns and renamed
data = pd.DataFrame(students)[kept_columns].rename(columns=english_names)

# 'semester' was stored as text while parsing the links; make it an integer
data['semester'] = data['semester'].astype(int)

# index by sciper number
data = data.set_index('sciper')

data.sample(5)

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
195290,Monsieur,Prakash Aman,2011,2012,Bachelor,4,Présent,,
228408,Monsieur,Milliet Alain Georges Paul,2012,2013,Bachelor,1,Présent,,
223344,Monsieur,Forel Duncan Montana,2013,2014,Bachelor,4,Présent,,
242489,Monsieur,Gomez Vivolo Antoine,2014,2015,Bachelor,2,Présent,,
223287,Monsieur,Reich Nicolas Alexandre,2012,2013,Bachelor,2,Présent,,


### Part 3a - Bachelor data

We're keeping only students who were both in BA1 and BA6:

In [16]:
ba_data = data[data['degree'] == 'Bachelor']

# one frame per bachelor semester, BA1 to BA6
ba_1, ba_2, ba_3, ba_4, ba_5, ba_6 = (
    ba_data[ba_data.semester == semester_no] for semester_no in range(1, 7)
)

# report the number of rows found for each semester
for semester_no, semester_rows in enumerate((ba_1, ba_2, ba_3, ba_4, ba_5, ba_6), start=1):
    print('There were', len(semester_rows), 'students in IN-BA' + str(semester_no), 'over the years')

# scipers that appear both in a BA1 list and in a BA6 list
intersection_index = set(ba_1.index) & set(ba_6.index)

# all bachelor rows of those students, ordered by sciper
in_both = ba_data.index.isin(intersection_index)
intersection = ba_data[in_both].sort_index()

print('There were', len(intersection_index), 'students in BA1 and thereafter in BA6 over the years')

There were 1720 students in IN-BA1 over the years
There were 1160 students in IN-BA2 over the years
There were 730 students in IN-BA3 over the years
There were 630 students in IN-BA4 over the years
There were 890 students in IN-BA5 over the years
There were 664 students in IN-BA6 over the years
There were 397 students in BA1 and thereafter in BA6 over the years


Now each semester that every qualifying student has done is in the dataframe.<br>
We need to group the students by sciper and count the number of semesters.

In [498]:
# number of rows, i.e. of listed semesters, per sciper (the index of the frame)
semester_count = (
    intersection.groupby(level='sciper')[['semester']]
    .count()
    .rename(columns={'semester': 'semester_count'})
)

# attach the count to every row of the corresponding student
intersection_counted = intersection.merge(semester_count, right_index=True, left_index=True)

We want to keep one row per person and drop the columns that aren't useful for the analysis.

In [499]:
# collapse the rows of each student (same index value) into their first values
one_row_per_student = intersection_counted.groupby(level=0).first()

# only the title, the name and the number of semesters matter from here on
intersection_counted = one_row_per_student[['title', 'name', 'semester_count']]

We can now group by sex and compute the required stats:

In [504]:
# Average number of bachelor semesters per title (Madame / Monsieur).
# The numeric column is selected explicitly: recent pandas versions no longer
# drop non-numeric columns such as 'name' silently and raise a TypeError instead.
intersection_counted.groupby('title')[['semester_count']].mean()

Unnamed: 0_level_0,semester_count
title,Unnamed: 1_level_1
Madame,6.793103
Monsieur,7.105978


### Discussion:
TODO !!!!!

### Part 3b - Master data