# Homework 2 - Montero, Peterssen, Brousse

In [30]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np
import pandas as pd

## Fetching all students enrolled in CS (Bachelor and Master) from 2007 to 2017

In [32]:
# Base URL of the ISA public-reports endpoint. The cells below append
# ".filter?" (the form holding the parameter codes) and ".html?" (the report
# itself) to it.
base_url = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS"

## Part 1 - Fetch from ISA all codes used as parameters for fetching data

In [33]:
# Fetch the ISA filter form and collect, for every dropdown on it, the mapping
# from each option's visible label to the code ISA expects as a parameter.
select_url = base_url +  ".filter?"
select_params = {'ww_i_reportModel' : '133685247'}

r  = requests.get(select_url, params=select_params)
# Fail loudly on an HTTP error: otherwise an error page would be parsed into
# an empty `allCodes` and only surface later as a confusing KeyError.
r.raise_for_status()
data = r.text
soup = BeautifulSoup(data, "lxml")
allCodes = {}

# One entry per <select>: its `name` attribute maps to {option label: option value}.
# Options without any text (placeholder entries) are skipped.
for dropdown in soup.find_all('select'):
    key = dropdown.get('name')
    codes = {option.text: option.get('value')
             for option in dropdown.find_all('option') if option.text}
    allCodes[key] = codes

# Dictionary with all the codes
allCodes

{'ww_x_HIVERETE': {"Semestre d'automne": '2936286',
  'Semestre de printemps': '2936295'},
 'ww_x_PERIODE_ACAD': {'2007-2008': '978181',
  '2008-2009': '978187',
  '2009-2010': '978195',
  '2010-2011': '39486325',
  '2011-2012': '123455150',
  '2012-2013': '123456101',
  '2013-2014': '213637754',
  '2014-2015': '213637922',
  '2015-2016': '213638028',
  '2016-2017': '355925344'},
 'ww_x_PERIODE_PEDAGO': {'Bachelor semestre 1': '249108',
  'Bachelor semestre 2': '249114',
  'Bachelor semestre 3': '942155',
  'Bachelor semestre 4': '942163',
  'Bachelor semestre 5': '942120',
  'Bachelor semestre 5b': '2226768',
  'Bachelor semestre 6': '942175',
  'Bachelor semestre 6b': '2226785',
  'Master semestre 1': '2230106',
  'Master semestre 2': '942192',
  'Master semestre 3': '2230128',
  'Master semestre 4': '2230140',
  'Mineur semestre 1': '2335667',
  'Mineur semestre 2': '2335676',
  'Mise à niveau': '2063602308',
  'Projet Master automne': '249127',
  'Projet Master printemps': '3781783

In [34]:
def fetch_data (url, args) :
    '''Download a page with a GET request and parse it.

       url  -- address of the page to fetch
       args -- query-string parameters forwarded to `requests.get`

       Returns the page parsed with BeautifulSoup ('html.parser'), or None
       (after printing the status code) when the response status is not OK. '''
    response = requests.get(url, params=args)

    # Happy path first: a successful response is parsed and returned directly.
    if response.status_code == requests.codes.ok:
        return BeautifulSoup(response.text, 'html.parser')

    print('Error: code ' + str(response.status_code))
    return None

Parse an ISA table:

In [35]:
def parse_table(soup) :
    '''Return the <tr> rows of the first <table> found in a parsed page.

       soup -- parsed page (as returned by `fetch_data`), or None

       Returns the list of row elements. An empty list is returned when
       `soup` is None (which `fetch_data` returns on an HTTP error) or when
       the page contains no table, so callers can iterate over the result
       unconditionally. '''
    if soup is None:
        return []

    table = soup.find("table")
    if table is None:
        return []

    return table.find_all("tr")

Define the default URL and the constant request parameters:

In [36]:
# Endpoint that serves the report itself.
data_url = base_url + ".html?"

# Query parameters sent with every report request. The academic unit is fixed
# to 'Informatique'; the year, pedagogical period and season start as 'null'.
args = {
    'ww_x_GPS': '-1',
    'ww_i_reportModel': '133685247',
    'ww_i_reportModelXsl': '133685270',
    'ww_x_UNITE_ACAD': allCodes['ww_x_UNITE_ACAD']['Informatique'],
    'ww_x_PERIODE_ACAD': 'null',
    'ww_x_PERIODE_PEDAGO': 'null',
    'ww_x_HIVERETE': 'null',
}


Iterate over the academic years and the semester codes, fetch and parse the data, and store the result in a DataFrame, keeping only the useful columns.

In [37]:
def assign_semester (period):
    '''Translate an ISA period label into a [degree, semester number] pair.

       Labels containing 'Projet' map to the degree 'PDM', numbered 1 when the
       label contains 'automne' and 2 otherwise. Any other label yields its
       first space-separated word and the first integer found in the label
       (e.g. 'Bachelor semestre 5b' -> ['Bachelor', 5]). '''
    if 'Projet' not in period:
        degree_name = period.split(" ")[0]
        number = int(re.findall(r'\d+', period)[0])
        return [degree_name, number]

    return ['PDM', 1 if 'automne' in period else 2]

In [38]:
def fetch_table() :
    '''Scrape ISA for every academic year and every Bachelor/Master period.

       One request is issued per (academic year, pedagogical period) pair whose
       period label contains 'Bachelor' or 'Master' (this also matches the
       'Projet Master ...' periods). Each table row that has <td> cells becomes
       one record.

       Returns a DataFrame with the columns title, name, spec, minor, status,
       sciper, degree, semester, start and end. '''
    records = []
    for year, y_code in allCodes['ww_x_PERIODE_ACAD'].items():
        year_parts = year.split("-")

        for periode, p_code in allCodes['ww_x_PERIODE_PEDAGO'].items():
            if not any(degree in periode for degree in ['Bachelor', 'Master']):
                continue

            # Work on a copy so the shared `args` template is never modified.
            # The academic-year code goes to PERIODE_ACAD and the pedagogical
            # period code to PERIODE_PEDAGO (they were previously swapped).
            query = dict(args)
            query['ww_x_PERIODE_ACAD'] = y_code
            query['ww_x_PERIODE_PEDAGO'] = p_code

            soup = fetch_data(data_url, query)
            if soup is None:
                # fetch_data already reported the HTTP error; skip this pair.
                continue

            # The degree and semester depend only on the period label, so
            # compute them once per page rather than twice per row.
            degree_name, semester_number = assign_semester(periode)

            for row in parse_table(soup):
                cell = row.find_all('td')
                if cell:
                    records.append({'title' : cell[0].get_text(),
                                    'name' : cell[1].get_text(),
                                    'spec' : cell[4].get_text(),
                                    'minor' : cell[6].get_text(),
                                    'status' : cell[7].get_text(),
                                    'sciper' : cell[10].get_text(),
                                    'degree' : degree_name,
                                    'end' : year_parts[1],
                                    'semester' : semester_number,
                                    'start' : year_parts[0]})

    return pd.DataFrame(data = records)

In [39]:
# Run the full scrape: fetch_table issues one HTTP request per
# (academic year, Bachelor/Master period) combination.
# Note: this rebinds `data`, which previously held the raw HTML text of the
# ISA filter form.
data = fetch_table()

## Part 3 - Statistical analysis

In [40]:
# Put the columns in a readable order, make sure 'semester' is an integer
# column, and index the rows by sciper number.
data = (
    data[['sciper', 'title', 'name', 'start', 'end', 'degree', 'semester', 'status', 'spec', 'minor']]
    .astype({'semester': int})
    .set_index('sciper')
)

data.sample(5)

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
256846,Monsieur,Tallec Corentin Jean Daniel,2015,2016,Master,1,Présent,,
235644,Monsieur,Antelo Blanco Lucas,2013,2014,Bachelor,2,Présent,,
216134,Monsieur,Grivaz Thomas Gérard François,2011,2012,Bachelor,1,Présent,,
206923,Monsieur,Tran Le Hung,2011,2012,Master,2,Présent,,
257825,Monsieur,Alnasser Riyadh Abdulaziz M,2015,2016,Master,2,Présent,,


### Part 3a - Bachelor data

We're keeping only students who were both in BA1 and BA6:

In [43]:
# Restrict to Bachelor rows, then isolate the first (BA1) and last (BA6) semesters.
ba_data = data.loc[data['degree'] == 'Bachelor']
ba_1 = ba_data.loc[ba_data['semester'] == 1]
ba_6 = ba_data.loc[ba_data['semester'] == 6]

# Note: these are row counts, i.e. one per enrolment.
print('There were', len(ba_1), 'students in IN-BA1 over the years')
print('There were', len(ba_6), 'students in IN-BA6 over the years')

There were 1720 students in IN-BA1 over the years
There were 664 students in IN-BA6 over the years


In [44]:
def compute_intersection_first_last_semester(df, first, last):
    '''Keep the rows of `df` whose index value occurs in both `first` and `last`.

       df    -- frame to filter
       first -- frame whose index lists the students seen in the first semester
       last  -- frame whose index lists the students seen in the last semester

       Returns the matching rows of `df`, sorted by index. '''
    # Set intersection of the two indexes gives the students present in both.
    shared = set(first.index) & set(last.index)
    in_both = df.index.isin(shared)
    return df[in_both].sort_index()

Now each semester that every qualifying student has done is in the dataframe.<br>
We need to group the students by sciper and count the number of semesters.

In [45]:
def compute_semester_count(intersection):
    '''Append a 'semester_count' column giving the number of rows per student.

       intersection -- frame indexed by 'sciper' with a 'semester' column,
                       one row per semester attended

       Returns `intersection` with an extra 'semester_count' column holding,
       on every row, the number of (non-null) semester entries of that sciper. '''
    # Count the semester entries of each sciper directly on the index level.
    per_student = (
        intersection
        .groupby(level='sciper')['semester']
        .count()
        .rename('semester_count')
        .to_frame()
    )
    # Index-on-index join copies each student's count onto all of their rows.
    return intersection.merge(per_student, left_index=True, right_index=True)

With use of the two defined functions, we can now get our semester count:

In [54]:
# Keep only students seen in both BA1 and BA6, then attach their semester count.
ba_first_and_last = compute_intersection_first_last_semester(ba_data, ba_1, ba_6)
ba_counted = compute_semester_count(ba_first_and_last)
ba_counted.head(10)

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor,semester_count
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
147008,Monsieur,Good Xavier,2008,2009,Bachelor,1,Présent,,,6
147008,Monsieur,Good Xavier,2008,2009,Bachelor,2,Présent,,,6
147008,Monsieur,Good Xavier,2010,2011,Bachelor,5,Congé,,,6
147008,Monsieur,Good Xavier,2009,2010,Bachelor,4,Présent,,,6
147008,Monsieur,Good Xavier,2010,2011,Bachelor,6,Congé,,,6
147008,Monsieur,Good Xavier,2009,2010,Bachelor,3,Présent,,,6
169569,Monsieur,Arévalo Christian,2007,2008,Bachelor,1,Présent,,,6
169569,Monsieur,Arévalo Christian,2009,2010,Bachelor,5,Présent,,,6
169569,Monsieur,Arévalo Christian,2008,2009,Bachelor,4,Présent,,,6
169569,Monsieur,Arévalo Christian,2009,2010,Bachelor,6,Présent,,,6


We want to keep one row per person and drop the columns that aren't useful for the analysis.

In [55]:
# Collapse to one row per sciper (first value of each column) and keep only
# the columns needed for the statistics.
ba_counted = ba_counted.groupby(level=0).first()[['title', 'name', 'semester_count']]
ba_counted.head()

Unnamed: 0_level_0,title,name,semester_count
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
147008,Monsieur,Good Xavier,6
169569,Monsieur,Arévalo Christian,6
169731,Monsieur,Knecht Mathieu,8
169795,Monsieur,Scheiben Pascal,9
171195,Monsieur,Richter Arnaud,6


We can now group by sex and compute the required stats:

In [56]:
# The 'title' column (Madame / Monsieur) is used as the proxy for sex;
# describe() gives count, mean, std and quartiles of semester_count per group.
ba_counted.groupby('title').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,semester_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Madame,count,29.0
Madame,mean,6.793103
Madame,std,1.346406
Madame,min,6.0
Madame,25%,6.0
Madame,50%,6.0
Madame,75%,8.0
Madame,max,11.0
Monsieur,count,368.0
Monsieur,mean,7.105978


### Discussion:
TODO !!!!!

### Part 3b - Master data

<p>We will perform the same kind of computation as before to compute the number of semesters per student. We need to mind the fact that some students don't start in the first semester, but in the second one. Also, we need to distinguish between students with a specialization or a minor, and those without.</p>
<p><b>Assumptions:</b><br>
- Some students have a MA3 registered even though they don't do either a specialization or a minor. We will consider them anyways.<br>
</p>

In [57]:
# Same filtering as for the Bachelor: Master rows, split by semester number.
ma_data = data.loc[data['degree'] == 'Master']
ma_1 = ma_data.loc[ma_data['semester'] == 1]
ma_2 = ma_data.loc[ma_data['semester'] == 2]
ma_3 = ma_data.loc[ma_data['semester'] == 3]

# Note: these are row counts, i.e. one per enrolment.
print('There were', len(ma_1), 'students in IN-MA1 over the years')
print('There were', len(ma_2), 'students in IN-MA2 over the years')
print('There were', len(ma_3), 'students in IN-MA3 over the years')

There were 948 students in IN-MA1 over the years
There were 1062 students in IN-MA2 over the years
There were 706 students in IN-MA3 over the years


In the bachelor analysis, we needed to compute the semester count between the first BA1 and the last BA6.<br>
Here, it is not that simple, since the last semester of MA is not the same for everyone: some students finish in MA2, some finish in MA3, and they can very well succeed in both cases.<br>
=&gt; The problem is then to figure out, for each student, whether their last semester is MA2 or MA3. This is done with the following index:

In [58]:
# For every student (rows are indexed by sciper), take the highest Master
# semester number they were enrolled in. The aggregation is named by string:
# passing the NumPy callable np.max to agg() is deprecated in recent pandas.
last_index = ma_data.groupby(ma_data.index).agg({'semester' : 'max'})

# 'semester' column here indicates the last semester number of the given student.
last_index.head(5)

Unnamed: 0_level_0,semester
sciper,Unnamed: 1_level_1
128911,3
129093,3
129326,3
138088,1
145546,3


Now, we merge this table back with all the master data. This is a typical `inner join`:

In [59]:
# Inner join on (sciper, semester): keeps, for each student, only the Master
# rows whose semester number equals that student's last semester.
ma_last = (
    ma_data.reset_index()
    .merge(last_index.reset_index(), on=['sciper', 'semester'], how='inner')
    .set_index('sciper')
)
ma_last.sort_index().head(5)

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
128911,Monsieur,Gulati Asheesh,2007,2008,Master,3,Présent,Internet computing,
129093,Monsieur,Zhou Maoan,2007,2008,Master,3,Stage,,
129326,Monsieur,Ni Zhong Zhong,2007,2008,Master,3,Présent,,
138088,Monsieur,Droz-dit-Busset Arnault,2007,2008,Master,1,Présent,,
145546,Monsieur,Clivaz Jean-Philippe,2007,2008,Master,3,Présent,,


We need to <b>not consider</b> students whose maximal semester number is 1: this would be an indication that the student did not graduate at EPFL:

In [60]:
# Drop students whose last recorded Master semester is MA1.
ma_last = ma_last.loc[ma_last['semester'] != 1]
ma_last.sort_index().head(5)

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
128911,Monsieur,Gulati Asheesh,2007,2008,Master,3,Présent,Internet computing,
129093,Monsieur,Zhou Maoan,2007,2008,Master,3,Stage,,
129326,Monsieur,Ni Zhong Zhong,2007,2008,Master,3,Présent,,
145546,Monsieur,Clivaz Jean-Philippe,2007,2008,Master,3,Présent,,
145957,Monsieur,Hügli Michael,2007,2008,Master,3,Présent,,


Now, we can use the functions that we defined in the Bachelor part, to compute the now usual intersection between students who were in the MA1 and then subsequently in their respective last semester of Master (MA2 or MA3).

In [63]:
# Students seen both in MA1 and in their own last Master semester, with the
# number of Master semesters attached, reduced to one row per sciper.
# NOTE(review): 'first' takes the first non-null value per student. If a
# missing spec/minor is stored as an empty string rather than NaN (TODO
# confirm), a specialization recorded only on a later row would be lost here.
ma_first_and_last = compute_intersection_first_last_semester(ma_data, ma_1, ma_last)
ma_counted = (
    compute_semester_count(ma_first_and_last)
    .groupby(level=0)
    .first()[['title', 'name', 'semester_count', 'minor', 'spec']]
)
ma_counted.sample(5)

Unnamed: 0_level_0,title,name,semester_count,minor,spec
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
179426,Monsieur,Bloch Remi,3,,Software Systems
228358,Monsieur,Raemy Baptiste,3,,Internet computing
183629,Monsieur,Gratsias Georgios,2,,
243164,Monsieur,Brechbühler Gil,4,,
191181,Monsieur,Upadhyay Utkarsh,4,,Foundations of Software


### Considering Master project ("PDM") data

It is now time to add the data for the students that did their PDM.<br>
All the students present in the `ma_counted` table above are already considered to have graduated successfully from EPFL, but the duration needs to be adjusted to take into account their PDM.<br>
We just need to fetch PDM data:

In [64]:
# Master-project rows: assign_semester labels every 'Projet ...' period as 'PDM'.
pdm_data = data.loc[data['degree'] == 'PDM']
pdm_data.head()

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
166075,Monsieur,Amiguet Jérôme,2016,2017,PDM,1,Présent,,
243163,Monsieur,Antognini Diego Matteo,2016,2017,PDM,1,Présent,,
194182,Monsieur,Antognini Marco,2016,2017,PDM,1,Présent,Foundations of Software,
213664,Monsieur,Bouquet Stéphane,2016,2017,PDM,1,Présent,,"Mineur en Management, technologie et entrepren..."
214637,Monsieur,Bovet Sidney,2016,2017,PDM,1,Présent,,


Some PDMs are longer than one semester. We can now compute the number of semesters of each student in `pdm_data`.

In [65]:
# Number of PDM rows (i.e. PDM semesters) recorded for each sciper.
pdm_counted = (
    pdm_data
    .groupby(level='sciper')['semester']
    .count()
    .rename('semester_count_pdm')
    .to_frame()
)
pdm_counted.head()

Unnamed: 0_level_0,semester_count_pdm
sciper,Unnamed: 1_level_1
145957,1
146742,2
153819,1
154573,1
159516,1


Now we concatenate this table with `ma_counted`.<br>
It is assumed that every student who did a first and a last semester of courses also did a PDM, even if it is not registered in ISA (other databases seem to exist). We therefore count one PDM semester for the students who have no PDM entry:

In [71]:
# Align the PDM counts and the Master counts side by side on sciper.
# concat(axis=1) keeps the union of both indexes, so students that only have
# PDM rows appear with NaN in the Master columns.
ma_with_pdm = pd.concat([pdm_counted, ma_counted], axis=1)

# Students without a recorded PDM are assumed to have done one semester of it.
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form is not guaranteed to update
# the frame (it is a no-op under pandas Copy-on-Write).
ma_with_pdm['semester_count_pdm'] = ma_with_pdm['semester_count_pdm'].fillna(1)
ma_with_pdm.head()

Unnamed: 0,semester_count_pdm,title,name,semester_count,minor,spec
145957,1.0,,,,,
146330,1.0,Monsieur,Cardinaux Damien,3.0,,
146742,2.0,Monsieur,Marx Clément,5.0,,"Signals, Images and Interfaces"
146929,1.0,Monsieur,Junod Antoine,2.0,,
147008,1.0,Monsieur,Good Xavier,4.0,,


In [78]:
# Exploratory check (result is displayed, not stored): merge() defaults to an
# inner join on the columns the two frames share, so this lists only the
# students that have both a PDM count and a Master semester count.
pdm_counted.reset_index().merge(ma_counted.reset_index()).set_index('sciper')

Unnamed: 0_level_0,semester_count_pdm,title,name,semester_count,minor,spec
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
146742,2,Monsieur,Marx Clément,5,,"Signals, Images and Interfaces"
154573,1,Madame,Benabdallah Zeineb,5,,Biocomputing
159516,1,Monsieur,Raja Yanick,3,,
160213,2,Monsieur,Bécholey Alexandre,3,,
160225,1,Monsieur,Cassata Alexandre,3,,
161279,1,Monsieur,Biollay Jean Isaac Jamal Pachacutec,3,,
165819,1,Monsieur,Hofer Thomas,3,,Foundations of Software
166075,1,Monsieur,Amiguet Jérôme,4,,
166344,1,Monsieur,Blatter Jérémy,3,,
166491,1,Monsieur,Imperatori Alexandre,7,,


The final `semester_count` is given by the sum of itself with the number of semesters for the PDM:

In [None]:
# Total duration = course semesters + PDM semesters; the helper column is
# dropped once it has been folded into 'semester_count'.
ma_with_pdm = (
    ma_with_pdm
    .assign(semester_count=ma_with_pdm['semester_count'] + ma_with_pdm['semester_count_pdm'])
    .drop('semester_count_pdm', axis=1)
)
ma_with_pdm.head()

### Students with a specialization

In [None]:
# Average number of semesters per specialization.
# The previous version read `intersection_counted`, which only exists as a
# local variable inside compute_semester_count and is therefore undefined on a
# fresh kernel; the Master table built above is used instead. The mean is
# restricted to 'semester_count' because the other columns hold text.
# NOTE(review): students without a specialization presumably form their own
# group (empty 'spec' value) - filter it out if it should not be reported.
students_with_spec = ma_with_pdm.groupby('spec')[['semester_count']].mean()
students_with_spec