In [307]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import requests
from bs4 import BeautifulSoup
sns.set_context('notebook')

## Fetching all students enrolled in CS in Bachelor from 2007 to 2017

In [308]:
#Base url of isa
base_url = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS"

## part-1 Fetch from ISA all codes used as parameters for fetching data

In [309]:
# Get all parameters from the ISA form
select_url = base_url +  ".filter?"
select_params = {'ww_i_reportModel' : '133685247'}

r  = requests.get(select_url, params=select_params)
data = r.text
soup = BeautifulSoup(data, "lxml")
allCodes = {}

# For each dropdown on the form get the corresponding codes
for dropdown in soup.find_all('select'):
   key = dropdown.get('name')
   codes = dict([(option.text, option.get('value')) for option in dropdown.find_all('option') if option.text])
   allCodes[key] = codes

# Dictionary with all the codes
allCodes

{'ww_x_HIVERETE': {"Semestre d'automne": '2936286',
  'Semestre de printemps': '2936295'},
 'ww_x_PERIODE_ACAD': {'2007-2008': '978181',
  '2008-2009': '978187',
  '2009-2010': '978195',
  '2010-2011': '39486325',
  '2011-2012': '123455150',
  '2012-2013': '123456101',
  '2013-2014': '213637754',
  '2014-2015': '213637922',
  '2015-2016': '213638028',
  '2016-2017': '355925344'},
 'ww_x_PERIODE_PEDAGO': {'Bachelor semestre 1': '249108',
  'Bachelor semestre 2': '249114',
  'Bachelor semestre 3': '942155',
  'Bachelor semestre 4': '942163',
  'Bachelor semestre 5': '942120',
  'Bachelor semestre 5b': '2226768',
  'Bachelor semestre 6': '942175',
  'Bachelor semestre 6b': '2226785',
  'Master semestre 1': '2230106',
  'Master semestre 2': '942192',
  'Master semestre 3': '2230128',
  'Master semestre 4': '2230140',
  'Mineur semestre 1': '2335667',
  'Mineur semestre 2': '2335676',
  'Mise à niveau': '2063602308',
  'Projet Master automne': '249127',
  'Projet Master printemps': '3781783

In [310]:
def fetch_data (url, args) :
    '''GETs a webpage, determined by its url, and optional parameters.
       Returns a BeautifulSoup containing the parsed HTML page. '''
    req = requests.get(url, params=args)
    if req.status_code != requests.codes.ok:
        print('Error: code ' + str(req.status_code))
        return None
    
    return BeautifulSoup(req.text, 'html.parser')

In [311]:
def parse_table(soup) :
    table = soup.find("table")
    rows = table.find_all("tr")
    return rows

In [320]:
data_url = base_url + ".html?"

args = {'ww_x_GPS' : '-1', 
              'ww_i_reportModel' : '133685247',
              'ww_i_reportModelXsl':'133685270',
              'ww_x_UNITE_ACAD' : 'null',
              'ww_x_PERIODE_ACAD' : 'null', 
              'ww_x_PERIODE_PEDAGO' : 'null', 
              'ww_x_HIVERETE':'null'}

args['ww_x_UNITE_ACAD'] = allCodes['ww_x_UNITE_ACAD']['Informatique']


In [404]:
def fetch_table(degree, action) :
    rn = []
    for year, y_code in allCodes['ww_x_PERIODE_ACAD'].items():
        for periode, p_code in allCodes['ww_x_PERIODE_PEDAGO'].items():
            if degree in periode:
                args['ww_x_PERIODE_ACAD'] = p_code
                args['ww_x_PERIODE_PEDAGO'] = y_code

                soup = fetch_data(data_url, args)
                rows = parse_table(soup)

                rn.extend(action(rows, year, periode))
                
    return pd.DataFrame(data = rn)


In [405]:
def bachelor_action(rows, year, periode) :
    tmp_table = []
    # Take significant infromation from rows, add the corresponding year and semester
    for row in rows:
        cell = row.find_all('td')
        if cell:
            tmp_table.append({'Year'     : year,
                       'Semester' : periode,
                       'Civility' : cell[0].get_text(),
                       'Name'     : cell[1].get_text(), 
                       'Sciper'   : cell[10].get_text()})
    return tmp_table

In [406]:
data = fetch_table('Bachelor', bachelor_action)
data

Unnamed: 0,Civility,Name,Sciper,Semester,Year
0,Madame,Andriambololona Riana Miarantsoa,154157,Bachelor semestre 5,2007-2008
1,Monsieur,Barras Florian,170220,Bachelor semestre 5,2007-2008
2,Monsieur,Bécholey Alexandre,160213,Bachelor semestre 5,2007-2008
3,Monsieur,Beuret Thibaut,166701,Bachelor semestre 5,2007-2008
4,Monsieur,Biollay Jean Isaac Jamal Pachacutec,161279,Bachelor semestre 5,2007-2008
5,Monsieur,Blatter Jérémy,166344,Bachelor semestre 5,2007-2008
6,Monsieur,Boksanyi Michael,161634,Bachelor semestre 5,2007-2008
7,Monsieur,Bolkensteyn Dinesh,170451,Bachelor semestre 5,2007-2008
8,Monsieur,Bornet Cyril Antoine Michel,170219,Bachelor semestre 5,2007-2008
9,Monsieur,Candaux Yanick,169655,Bachelor semestre 5,2007-2008


# Part 2 : Fetch list of students in registered bachelor 1 and 6

For each student take the year when (s)he started (min of years if multiples entries for bachelor 1) and the last year (s)he was incribed in the bachelor 6 (max of years if multiple entries). Assuming Sciper, Name and Civility are constant for each student.

In [407]:
s1 = data[data['Semester'].str.contains("1")]
s6 = data[data['Semester'].str.contains("6")]

bachelor = pd.merge(s1, s6, on=['Sciper', 'Name', 'Civility'], suffixes=('_1', '_6'))
bachelor = bachelor.groupby(['Sciper', 'Name', 'Civility'])
bachelor = bachelor.aggregate({'Year_6' : np.min, 'Year_1' : np.max})
bachelor

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Year_6,Year_1
Sciper,Name,Civility,Unnamed: 3_level_1,Unnamed: 4_level_1
147008,Good Xavier,Monsieur,2010-2011,2008-2009
169569,Arévalo Christian,Monsieur,2009-2010,2007-2008
169731,Knecht Mathieu,Monsieur,2010-2011,2007-2008
169795,Scheiben Pascal,Monsieur,2009-2010,2007-2008
171195,Richter Arnaud,Monsieur,2009-2010,2007-2008
171619,Buchschacher Nicolas,Monsieur,2009-2010,2007-2008
174905,Aubelle Flavien,Monsieur,2010-2011,2007-2008
175190,Hanser Valérian,Monsieur,2009-2010,2007-2008
175280,Uberti Quentin,Monsieur,2009-2010,2007-2008
175379,Gremion Sébastien,Monsieur,2010-2011,2008-2009


In [381]:
def months_in_epfl(row) :
    start = row['Year_1'].split("-")[0]
    end = row['Year_6'].split("-")[1]
    return (int(end) - int(start)) * 12

In [382]:
bachelor['Months'] = bachelor.apply(months_in_epfl, axis=1)
bachelor

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Year_6,Year_1,Months
Sciper,Name,Civility,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
147008,Good Xavier,Monsieur,2010-2011,2008-2009,36
169569,Arévalo Christian,Monsieur,2009-2010,2007-2008,36
169731,Knecht Mathieu,Monsieur,2010-2011,2007-2008,48
169795,Scheiben Pascal,Monsieur,2009-2010,2007-2008,36
171195,Richter Arnaud,Monsieur,2009-2010,2007-2008,36
171619,Buchschacher Nicolas,Monsieur,2009-2010,2007-2008,36
174905,Aubelle Flavien,Monsieur,2010-2011,2007-2008,48
175190,Hanser Valérian,Monsieur,2009-2010,2007-2008,36
175280,Uberti Quentin,Monsieur,2009-2010,2007-2008,36
175379,Gremion Sébastien,Monsieur,2010-2011,2008-2009,36


In [319]:
bachelor = bachelor.reset_index()

gender = bachelor['Months'].groupby(value['Civility'])
g = pd.DataFrame(gender.aggregate(np.mean))
g

Unnamed: 0_level_0,Months
Civility,Unnamed: 1_level_1
Madame,36.0
Monsieur,36.195652


# Master

In [408]:
def master_action(rows, year, periode) :
    tmp_master_table = []
    # Take significant infromation from rows, add the corresponding year and semester
    for row in rows:
        cell = row.find_all('td')
        if cell:
            tmp_master_table.append({'Year'          : year,
                                   'Semester'        : periode,
                                   'Civility'        : cell[0].get_text(),
                                   'Name'            : cell[1].get_text(), 
                                   'Sciper'          : cell[10].get_text(),
                                   'Mineur'          : cell[6].get_text(), 
                                   'Specialisation'  : cell[4].get_text()})
    return tmp_master_table

In [409]:
master_data = fetch_table('Master', master_action)
master_data


Unnamed: 0,Civility,Mineur,Name,Sciper,Semester,Specialisation,Year
0,Monsieur,,Stewart Conail,173527,Projet Master automne,,2007-2008
1,Monsieur,,Aeberhard François-Xavier,153066,Master semestre 1,,2007-2008
2,Madame,,Agarwal Megha,180027,Master semestre 1,,2007-2008
3,Monsieur,,Anagnostaras David,152232,Master semestre 1,,2007-2008
4,Monsieur,,Auroux Damien,177395,Master semestre 1,,2007-2008
5,Monsieur,,Awalebo Joseph,161970,Master semestre 1,,2007-2008
6,Monsieur,,Balet Ken,166258,Master semestre 1,,2007-2008
7,Monsieur,,Barazzutti Raphaël Pierre,173600,Master semestre 1,,2007-2008
8,Monsieur,,Bayramoglu Ersoy,178879,Master semestre 1,,2007-2008
9,Madame,,Benabdallah Zeineb,154573,Master semestre 1,,2007-2008


In [591]:
m1 = master_data[master_data['Semester'].str.contains("1")]
m2 = master_data[master_data['Semester'].str.contains("2")]
m3 = master_data[master_data['Semester'].str.contains("3")]

pm = master_data[master_data['Semester'].str.contains("Projet")]

master = m1.merge(m2, how='outer', on=['Sciper', 'Name', 'Civility'], suffixes=('_1', '_2'))\
        .merge(m3, how='outer',on=['Sciper', 'Name', 'Civility'], suffixes=('_2', '_3')) \
        .merge(pm, how='outer', on=['Sciper', 'Name', 'Civility'], suffixes=('_3', '_pm'))

master = master.groupby(['Sciper', 'Name', 'Civility'])
master = master.aggregate({'Mineur_1': np.max, 
                           'Mineur_2': np.max, 
                           'Mineur_3': np.max, 
                           'Specialisation_1' : np.max,
                           'Specialisation_2' : np.max,
                           'Specialisation_3' : np.max,
                           'Year_1' : np.min, \
                           'Year_2' : np.max, \
                           'Year_3' : np.max, 'Year_pm': np.max})
master.reset_index()
master



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Specialisation_3,Specialisation_2,Year_1,Mineur_1,Year_2,Year_3,Year_pm,Mineur_3,Mineur_2,Specialisation_1
Sciper,Name,Civility,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
128911,Gulati Asheesh,Monsieur,Internet computing,Internet computing,,,2007-2008,2007-2008,,,,
129093,Zhou Maoan,Monsieur,,,,,2007-2008,2007-2008,,,,
129326,Ni Zhong Zhong,Monsieur,,,,,2007-2008,2007-2008,,,,
138088,Droz-dit-Busset Arnault,Monsieur,,,2007-2008,,,,,,,
145546,Clivaz Jean-Philippe,Monsieur,,,,,,2007-2008,,,,
145957,Hügli Michael,Monsieur,,,,,,2007-2008,2007-2008,,,
146330,Cardinaux Damien,Monsieur,,,2007-2008,,2007-2008,2008-2009,,,,
146441,Henriot Laetitia,Madame,,,,,,2007-2008,,,,
146742,Marx Clément,Monsieur,"Signals, Images and Interfaces","Signals, Images and Interfaces",2008-2009,,2009-2010,2012-2013,2010-2011,,,"Signals, Images and Interfaces"
146752,De Lamarter Erik,Monsieur,,,,,,2007-2008,,,,
