# Homework 2 - Montero, Peterssen, Brousse

## Part 1 - Getting links to the lists of students enrolled in CS from 2007 to 2017

First, import the libraries required for the homework.

In [619]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np

Also, define some constants and utility methods.

In [212]:
# only link provided by assignment, splitted in two for convenience
isa_base = 'http://isa.epfl.ch/imoniteur_ISAP/'
isa_landing_page = isa_base + '%21gedpublicreports.htm?ww_i_reportmodel=133685247'
isa_report_page = isa_base + '%21gedpublicreports.html'

In [196]:
def fetch(url, args=None):
    '''GETs a webpage, determined by its url, and optional parameters.
       Returns a BeautifulSoup containing the parsed HTML page. '''
    req = requests.get(url, args)
    if req.status_code != requests.codes.ok:
        print('Error: code' + str(req.status_code))
        return None
    
    return BeautifulSoup(req.text, 'html.parser')

Find the form in the web page.

In [197]:
main_soup = fetch(isa_landing_page)

# the form is contained in a frame called 'toc'. We need to find the URL it is pointing to
frame = main_soup.find('frame', attrs={'name' : 'toc'})
frame_url = frame.attrs['src']

# now we can find the form inside the toc frame
frame_soup = fetch(isa_base + frame_url)
form = frame_soup.find('form', attrs={'name' : 'f'})

# We need to get the 'action' of the form, that is, where to go when the user clicks 'send'
# Don't forget to concatenate it with the base isa url
form_action = isa_base + form.attrs['action']

It is now time to prepare the arguments we want to pass to the form. To this effect, we needed to use the `interceptor` add-on and explore a bit.<br>
We will get ALL the possible lists of students for the section 'Informatique'. This is done with the following arguments (taken from our postman exploration) :

In [174]:
args = {
    'ww_b_list' : '1',
    'ww_i_reportmodel': '133685247',
    'ww_c_langue': '',
    'ww_i_reportModelXsl' : '133685270', # for html
    
    'zz_x_UNITE_ACAD' : 'Informatique',
    'ww_x_UNITE_ACAD' : '249847', # this is Computer science section
    
    'zz_x_PERIODE_ACAD' : '',
    'ww_x_PERIODE_ACAD' : 'null', # we want all possible years
    
    'zz_x_PERIODE_PEDAGO' : '',
    'ww_x_PERIODE_PEDAGO' : 'null', # we want all possible semesters
    
    'zz_x_HIVERETE' : '',
    'ww_x_HIVERETE' : 'null', # both summer and winter semesters
    
    'dummy' : 'ok' # (we just accept that fact...)
}

We will now send our request to get all the links to the lists. This is done by filling the form with our custom values, sending it and collecting the links.

In [172]:
# sending our request
form_send_soup = fetch(form_action, args)

# collect links, taking care of *not* including the first link ("tous")
links = form_send_soup.find_all('a', attrs={'class' : 'ww_x_GPS'})

# Drop the "tous" link, and transform into a dict: key -> description
pattern = r".*\w+=(\d+).*"
clean_links = [
    dict({
        'desc' : a.text.strip(),
        'key' : re.match(pattern, a.attrs['onclick']).group(1)
    }) for a in links if a.text != 'Tous'
]

# For demonstration, we include the head of the extracted links.
clean_links[:5]

[{'desc': 'Echange IN, 2005-2006, Semestre automne', 'key': '24065660'},
 {'desc': 'Echange IN, 2005-2006, Semestre printemps', 'key': '24065672'},
 {'desc': 'Echange IN, 2006-2007, Semestre automne', 'key': '39467937'},
 {'desc': 'Echange IN, 2006-2007, Semestre printemps', 'key': '39467957'},
 {'desc': 'Echange IN, 2007-2008, Semestre automne', 'key': '71454914'}]

Now, we want to keep only the links whose description match:<br>
`Informatique, [period], [degree] semestre [semester]`.<br>
To this effect, we match the string above to extract the wanted information, and keep the key from before:

In [575]:
pattern = r"Informatique, (\d+)-(\d+), (\w+) semestre (\d)"
detailed_links = []
for link in clean_links:
    match = re.match(pattern, link['desc'])
    if match:
        detailed_links.append(dict({
            'start' : int(match.group(1)),
            'end' : int(match.group(2)),
            'degree' : match.group(3),
            'semester' : match.group(4),
            'key' : link['key']
        }))

# The first element is:
detailed_links[0]

{'degree': 'Bachelor',
 'end': 2005,
 'key': '2225262',
 'semester': '1',
 'start': 2004}

We can now filter the links to keep only the valid ones.<br>
The conditions for validity are: start_year >= 2007 and end_year <= 2017 (i.e. now).

In [156]:
valid_links = [e for e in detailed_links if e['start'] >= 2007 and e['end'] <= 2017]

# The first item is no longer before 2007, as wanted
valid_links[0]

{'degree': 'Bachelor',
 'end': 2008,
 'key': '71297531',
 'semester': '1',
 'start': 2007}

## Part 2 - Fetching the lists of students from the links

We will now extract the list of students for each of the semesters given by the links.<br>
The methods are chained for more clarity (top-level to low-level).

In [260]:
def fetch_student_list_from_link(link):
    # init or overwrite the report key for the current link
    args['ww_x_GPS'] = link['key']
    return parse_student_list(fetch(isa_report_page, args), link)

def parse_student_list(soup, link):
    table = soup.find('table')
    
    # in html, rows of a table are called 'tr'
    all_rows = soup.find_all('tr')
    
    # the headers of each column are in the second row of the table
    # the students are stored in the subsequent rows
    data = all_rows[2:]
    headers = [h.text for h in all_rows[1].find_all('th')]
    
    parsed_students = [parse_student(headers, row, link) for row in data]
    return parsed_students

def parse_student(headers, row, link):
    # in html, individual cells of a row of a table are called 'td'
    cells = row.find_all('td')
    
    # initilaize a parsed student "object". It is a dict containing the 4 keys that we want,
    student = {
        'start' : link['start'],
        'end' : link['end'],
        'degree' : link['degree'],
        'semester' : link['semester']
    }
    
    for (h, c) in zip(headers, cells):
        student[h] = c.text

    return student  

As an example, let's parse the first student of the first link (2007-2008 BA1):

In [261]:
fetch_student_list_from_link(valid_links[0])[0]

{'Civilité': 'Monsieur',
 'Ecole Echange': '',
 'Filière opt.': '',
 'Mineur': '',
 'No Sciper': '169569',
 'Nom Prénom': 'Arévalo\xa0Christian',
 'Orientation Bachelor': '',
 'Orientation Master': '',
 'Spécialisation': '',
 'Statut': 'Présent',
 'Type Echange': '',
 'degree': 'Bachelor',
 'end': 2008,
 'semester': '1',
 'start': 2007}

Now, we would like to concatenate <b>all</b> students from all lists. This is done by iterating over all the valid links:

In [271]:
students_in_classes = [fetch_student_list_from_link(link) for link in valid_links]

The problem now is that `all_students` is a list of lists of students. We want to have only a single list containing all the students, without sublevels. The solution for flattening is taken from [stackoverflow](http://stackoverflow.com/questions/952914/making-a-flat-list-out-of-list-of-lists-in-python).

In [753]:
students = [s for class_list in students_in_classes for s in class_list]
len(students)

8509

## Part 3 - Statistical analysis

In [273]:
import pandas as pd

In [829]:
# incorporate the students into a data frame, keep only useful columns, and give them an English name
data = pd.DataFrame(students)
data = data[['Civilité', 'No Sciper', 'Nom Prénom', 'start', 'end', 'degree', 'semester', 'Statut', 'Spécialisation', 'Mineur']]
data = data.rename(columns={'Civilité' : 'title',
                            'No Sciper' : 'sciper',
                            'Nom Prénom' : 'name',
                            'Statut' : 'status',
                            'Spécialisation' : 'spec',
                            'Mineur' : 'minor'})

# 'semester' value is an integer
data.semester = data.semester.astype(int)

# index by sciper number
data = data.set_index('sciper')

data.sample(5)

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
180008,Monsieur,Wertenbroek Rick,2009,2010,Bachelor,3,Présent,,
213287,Monsieur,Jacobs Frédéric Jean H,2011,2012,Bachelor,1,Présent,,
234458,Monsieur,Hugi Romain Guillaume,2014,2015,Bachelor,1,Présent,,
204157,Monsieur,Cremese Guillaume Georges,2011,2012,Bachelor,1,Présent,,
170414,Monsieur,Eberle Julien,2008,2009,Master,2,Congé,Internet computing,


### Part 3a - Bachelor data

We're keeping only students who were both in BA1 and BA6:

In [775]:
# filter to keep only bachelor data and appropriate semesters
ba_data = data[data.degree == 'Bachelor']
ba_1 = ba_data[ba_data.semester == 1]
ba_6 = ba_data[ba_data.semester == 6]

print('There were', len(ba_1), 'students in IN-BA1 over the years')
print('There were', len(ba_6), 'students in IN-BA6 over the years')

There were 1719 students in IN-BA1 over the years
There were 665 students in IN-BA6 over the years


In [776]:
# compute the intersection by using sets
def compute_intersection_first_last_semester(df, first, last):
    intersection_index = set(first.index).intersection(set(last.index))
    intersection = df[df.index.isin(intersection_index)].sort_index()
    return intersection

Now each semester that every qualifying student has done is in the dataframe.<br>
We need to group the students by sciper and count the number of semesters.

In [777]:
def compute_semester_count(intersection):
    semester_count = intersection.reset_index()[['sciper', 'semester']].groupby(['sciper']).count()
    semester_count = semester_count.rename(columns={'semester' : 'semester_count'})
    intersection_counted = intersection.merge(semester_count, left_index=True, right_index=True)
    return intersection_counted

With use of the two defined functions, we can now get our semester count:

In [778]:
ba_counted = compute_semester_count(compute_intersection_first_last_semester(ba_data, ba_1, ba_6))

We want to keep one row per person and drop the columns that aren't useful for the analysis.

In [780]:
ba_counted = ba_counted.groupby(ba_counted.index).agg('first')
ba_counted = ba_counted[['title', 'name', 'semester_count']]

We can now group by sex and compute the required stats:

In [819]:
ba_counted.groupby('title').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,semester_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Madame,count,29.0
Madame,mean,6.793103
Madame,std,1.346406
Madame,min,6.0
Madame,25%,6.0
Madame,50%,6.0
Madame,75%,8.0
Madame,max,11.0
Monsieur,count,368.0
Monsieur,mean,7.105978


### Discussion:
TODO !!!!!

## Part 3b - Master data

<p>We will perform the same kind of computation as before to compute the number of semesters per student. We need to mind the fact that some students don't start in the first semester, but in the second one. Also, we need to distinguish between students with a specialization or a minor, and those without.</p>
<p><b>Assumptions:</b><br>
- Some students have a MA3 registered even though they don't do either a specialization or a minor. We will consider them anyways.<br>
</p>

In [782]:
# Proceed as before to filter into semesters
ma_data = data[data.degree == 'Master']
ma_1 = ma_data[ma_data.semester == 1]
ma_2 = ma_data[ma_data.semester == 2]
ma_3 = ma_data[ma_data.semester == 3]

print('There were', len(ma_1), 'students in IN-MA1 over the years')
print('There were', len(ma_2), 'students in IN-MA2 over the years')
print('There were', len(ma_3), 'students in IN-MA3 over the years')

There were 948 students in IN-MA1 over the years
There were 1062 students in IN-MA2 over the years
There were 706 students in IN-MA3 over the years


In the bachelor analysis, we needed to compute the semester count between the first BA1 and the last BA6.<br>
Here, it is not that simple, since the last semester of MA is not the same for everyone: some students finish in MA2, some finish in MA3, and they can very well succeed in both cases.<br>
=&gt; The problem here is then to figure out, for which student, if their last semester is MA2 or MA3. This is done with the following index:

In [783]:
last_index = ma_data.groupby(ma_data.index).agg({'semester' : np.max})

# 'semester' column here indicates the last semester number of the given student.
last_index.head(5)

Unnamed: 0_level_0,semester
sciper,Unnamed: 1_level_1
128911,3
129093,3
129326,3
138088,1
145546,3


Now, we merge this table back with all the master data. This is a typical `inner join`:

In [784]:
ma_last = ma_data.reset_index().merge(last_index.reset_index(), how='inner', on=['sciper','semester']).set_index('sciper')
ma_last.sort_index().head(5)

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
128911,Monsieur,Gulati Asheesh,2007,2008,Master,3,Présent,Internet computing,
129093,Monsieur,Zhou Maoan,2007,2008,Master,3,Stage,,
129326,Monsieur,Ni Zhong Zhong,2007,2008,Master,3,Présent,,
138088,Monsieur,Droz-dit-Busset Arnault,2007,2008,Master,1,Présent,,
145546,Monsieur,Clivaz Jean-Philippe,2007,2008,Master,3,Présent,,


We need to <b>not consider</b> students whose maximal semester number is 1: this would be an indication that the student did not graduate at EPFL:

In [785]:
ma_last = ma_last[ma_last.semester != 1]
ma_last.sort_index().head(5)

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
128911,Monsieur,Gulati Asheesh,2007,2008,Master,3,Présent,Internet computing,
129093,Monsieur,Zhou Maoan,2007,2008,Master,3,Stage,,
129326,Monsieur,Ni Zhong Zhong,2007,2008,Master,3,Présent,,
145546,Monsieur,Clivaz Jean-Philippe,2007,2008,Master,3,Présent,,
145957,Monsieur,Hügli Michael,2007,2008,Master,3,Présent,,


Now, we can use the functions that we defined in the Bachelor part, to compute the now usual intersection between students who were in the MA1 and then subsequently in their respective last semester of Master (MA2 or MA3).

In [839]:
ma_counted = compute_semester_count(compute_intersection_first_last_semester(ma_data, ma_1, ma_last))
ma_counted = ma_counted.groupby(ma_counted.index).agg('first')
ma_counted = ma_counted[['title', 'name', 'semester_count', 'minor', 'spec']]
ma_counted.sample(5)

Unnamed: 0_level_0,title,name,semester_count,minor,spec
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
214477,Madame,Jenatton Kateline,3,,
166263,Monsieur,Roux Alexandre,3,,Foundations of Software
169942,Monsieur,Habegger Mathieu,4,,
234277,Monsieur,Sharma Puneet,3,,
183920,Monsieur,Crisan Daniel,2,,


### Considering Master project ("PDM") data

It is now time to add the data for the students that did their PDM.<br>
All the students present in the `ma_counted` table above are already considered to have graduated successfully from EPFL, but the duration needs to be adjusted to take into account their PDM.<br>
We just need to fetch PDM data:

In [869]:
pdm_data = data[data.degree == 'PDM']

Some PDMs are longer than one semester. We can now compute the number of semesters of each student in `pdm_data`.

In [870]:
pdm_counted = pdm_data.reset_index()[['sciper', 'semester']].groupby(['sciper']).count()
pdm_counted = pdm_counted.rename(columns={'semester' : 'semester_count_pdm'})
pdm_counted.head()

Unnamed: 0_level_0,semester_count_pdm
sciper,Unnamed: 1_level_1
227209,1


Now we concatenate this table with `ma_counted`.<br>
It is assumed that every student that did a first and a last semester of courses also did a PDM, even if this is not registered in ISA (some other databases seem to exist). Thus we add one semester to the count of the students who are in that case:

In [871]:
ma_with_pdm = pd.concat([pdm_counted, ma_counted], axis=1)
ma_with_pdm['semester_count_pdm'].fillna(1, inplace=True)
ma_with_pdm.head()

Unnamed: 0,semester_count_pdm,title,name,semester_count,minor,spec
146330,1.0,Monsieur,Cardinaux Damien,3,,
146742,1.0,Monsieur,Marx Clément,5,,"Signals, Images and Interfaces"
146929,1.0,Monsieur,Junod Antoine,2,,
147008,1.0,Monsieur,Good Xavier,4,,"Signals, Images and Interfaces"
152232,1.0,Monsieur,Anagnostaras David,3,"Mineur en Management, technologie et entrepren...",


The final `semester_count` is given by the sum of itself with the number of semesters for the PDM:

In [872]:
ma_with_pdm.semester_count = ma_with_pdm.semester_count + ma_with_pdm.semester_count_pdm
ma_with_pdm = ma_with_pdm.drop('semester_count_pdm', axis=1)
ma_with_pdm.head()

Unnamed: 0,title,name,semester_count,minor,spec
146330,Monsieur,Cardinaux Damien,4.0,,
146742,Monsieur,Marx Clément,6.0,,"Signals, Images and Interfaces"
146929,Monsieur,Junod Antoine,3.0,,
147008,Monsieur,Good Xavier,5.0,,"Signals, Images and Interfaces"
152232,Monsieur,Anagnostaras David,4.0,"Mineur en Management, technologie et entrepren...",


### Students with a specialization

In [796]:
students_with_spec = intersection_counted.groupby('spec').mean()
students_with_spec

Index(['title', 'name', 'start', 'end', 'degree', 'semester', 'status', 'spec',
       'minor'],
      dtype='object')