In [157]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import requests
from bs4 import BeautifulSoup

In [158]:
## Fetching all students enrolled in CS in Bachelor from 2007 to 2017

In [159]:
#Base url of isa
base_url = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS"

In [160]:
## part-1 Fetch from ISA all codes used as parameters for fetching data

In [161]:
# Get all parameters from the ISA form
select_url = base_url +  ".filter?"
select_params = {'ww_i_reportModel' : '133685247'}

r  = requests.get(select_url, params=select_params)
data = r.text
soup = BeautifulSoup(data, "lxml")
allCodes = {}

# For each dropdown on the form get the corresponding codes
for dropdown in soup.find_all('select'):
   key = dropdown.get('name')
   codes = dict([(option.text, option.get('value')) for option in dropdown.find_all('option') if option.text])
   allCodes[key] = codes

# Dictionary with all the codes
allCodes

{'ww_x_HIVERETE': {"Semestre d'automne": '2936286',
  'Semestre de printemps': '2936295'},
 'ww_x_PERIODE_ACAD': {'2007-2008': '978181',
  '2008-2009': '978187',
  '2009-2010': '978195',
  '2010-2011': '39486325',
  '2011-2012': '123455150',
  '2012-2013': '123456101',
  '2013-2014': '213637754',
  '2014-2015': '213637922',
  '2015-2016': '213638028',
  '2016-2017': '355925344'},
 'ww_x_PERIODE_PEDAGO': {'Bachelor semestre 1': '249108',
  'Bachelor semestre 2': '249114',
  'Bachelor semestre 3': '942155',
  'Bachelor semestre 4': '942163',
  'Bachelor semestre 5': '942120',
  'Bachelor semestre 5b': '2226768',
  'Bachelor semestre 6': '942175',
  'Bachelor semestre 6b': '2226785',
  'Master semestre 1': '2230106',
  'Master semestre 2': '942192',
  'Master semestre 3': '2230128',
  'Master semestre 4': '2230140',
  'Mineur semestre 1': '2335667',
  'Mineur semestre 2': '2335676',
  'Mise à niveau': '2063602308',
  'Projet Master automne': '249127',
  'Projet Master printemps': '3781783

In [162]:
def fetch_data (url, args) :
    '''GETs a webpage, determined by its url, and optional parameters.
       Returns a BeautifulSoup containing the parsed HTML page. '''
    req = requests.get(url, params=args)
    if req.status_code != requests.codes.ok:
        print('Error: code ' + str(req.status_code))
        return None
    
    return BeautifulSoup(req.text, 'html.parser')

In [163]:
def parse_table(soup) :
    table = soup.find("table")
    rows = table.find_all("tr")
    return rows

In [164]:
data_url = base_url + ".html?"

args = {'ww_x_GPS' : '-1', 
              'ww_i_reportModel' : '133685247',
              'ww_i_reportModelXsl':'133685270',
              'ww_x_UNITE_ACAD' : 'null',
              'ww_x_PERIODE_ACAD' : 'null', 
              'ww_x_PERIODE_PEDAGO' : 'null', 
              'ww_x_HIVERETE':'null'}

args['ww_x_UNITE_ACAD'] = allCodes['ww_x_UNITE_ACAD']['Informatique']


In [165]:
def assign_semester (period):
    if('Projet' in period) :
        semester = ['PM', 1] if ('autome' in period) else ['PM' , 2]
    else :
        semester = [period.split(" ")[0], int(re.findall(r'\d+', period)[0])]
   
    return semester

In [166]:
def fetch_table() :
    rn = []
    for year, y_code in allCodes['ww_x_PERIODE_ACAD'].items():
        for periode, p_code in allCodes['ww_x_PERIODE_PEDAGO'].items():
            if any(degree in periode for degree in ['Bachelor', 'Master']):
                args['ww_x_PERIODE_ACAD'] = p_code
                args['ww_x_PERIODE_PEDAGO'] = y_code

                soup = fetch_data(data_url, args)
                rows = parse_table(soup)
                
                for row in rows:
                    cell = row.find_all('td')
                    if cell:                      
                        rn.append({'Civilité' : cell[0].get_text(),
                                   'Nom Prénom'     : cell[1].get_text(), 
                                   'Orientation Bachelor'   : cell[2].get_text(),
                                   'Orientation Master' : cell[3].get_text(),
                                   'Spécialisation' : cell[4].get_text(),
                                   'Filière opt' : cell[5].get_text(),
                                   'Mineur' : cell[6].get_text(),
                                   'Statut' : cell[7].get_text(),
                                   'Type Echange' : cell[8].get_text(),
                                   'Ecole Echange' : cell[9].get_text(),
                                   'No Sciper' : cell[10].get_text(),
                                   'degree':  assign_semester(periode)[0],
                                   'end': year.split("-")[1],
                                   'semester':  assign_semester(periode)[1], 
                                   'start': year.split("-")[0]}) 
             
    return pd.DataFrame(data = rn)


In [167]:
data = fetch_table()
data

Unnamed: 0,Civilité,Ecole Echange,Filière opt,Mineur,No Sciper,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Statut,Type Echange,degree,end,semester,start
0,Monsieur,,,,202293,Aiulfi Loris Sandro,,,,Présent,,Bachelor,2013,4,2012
1,Monsieur,,,,215576,Alonso Seisdedos Florian,,,,Présent,,Bachelor,2013,4,2012
2,Monsieur,,,,213618,Amorim Afonso Caldeira Da Silva Pedro Maria,,,,Présent,,Bachelor,2013,4,2012
3,Monsieur,,,,212591,Beaud Guillaume François Paul,,,,Présent,,Bachelor,2013,4,2012
4,Monsieur,,,,206416,Berney Léonard,,,,Présent,,Bachelor,2013,4,2012
5,Monsieur,,,,217612,Bottinelli Rémi,,,,Présent,,Bachelor,2013,4,2012
6,Monsieur,,,,213664,Bouquet Stéphane,,,,Présent,,Bachelor,2013,4,2012
7,Monsieur,,,,214637,Bovet Sidney,,,,Présent,,Bachelor,2013,4,2012
8,Monsieur,,,,214603,Bron Sacha Allan,,,,Présent,,Bachelor,2013,4,2012
9,Monsieur,,,,203945,Cao Huu-Ân,,,,Présent,,Bachelor,2013,4,2012


In [168]:
#data.set_index('No Sciper').sort_index()

## Part 3 - Statistical analysis

In [169]:
# incorporate the students into a data frame, keep only useful columns, and give them an English name
#data = pd.DataFrame(students)
data = data[['Civilité', 'No Sciper', 'Nom Prénom', 'start', 'end', 'degree', 'semester', 'Statut', 'Spécialisation', 'Mineur']]
data = data.rename(columns={'Civilité' : 'title',
                            'No Sciper' : 'sciper',
                            'Nom Prénom' : 'name',
                            'Statut' : 'status',
                            'Spécialisation' : 'spec',
                            'Mineur' : 'minor'})

# 'semester' value is an integer
data.semester = data.semester.astype(int)

# index by sciper number
data = data.set_index('sciper')

data.sample(5)

Unnamed: 0_level_0,title,name,start,end,degree,semester,status,spec,minor
sciper,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
223572,Monsieur,Pirelli Solal Vincenzo,2014,2015,Bachelor,6,Présent,,
201042,Monsieur,Mettraux Vincent Michel,2016,2017,Master,3,Présent,,
170946,Monsieur,Zimmermann Florian,2014,2015,Master,1,Présent,,
239170,Monsieur,Chevalley Alain,2013,2014,Bachelor,1,Présent,,
227356,Monsieur,Leiva Loris Angel,2013,2014,Bachelor,4,Présent,,


### Part 3a - Bachelor data

In [170]:
# filter to keep only bachelor data and appropriate semesters
ba_data = data[data['degree'] == 'Bachelor']
ba_1 = ba_data[ba_data.semester == 1]
ba_2 = ba_data[ba_data.semester == 2]
ba_3 = ba_data[ba_data.semester == 3]
ba_4 = ba_data[ba_data.semester == 4]
ba_5 = ba_data[ba_data.semester == 5]
ba_6 = ba_data[ba_data.semester == 6]

print('There were', len(ba_1), 'students in IN-BA1 over the years')
print('There were', len(ba_2), 'students in IN-BA2 over the years')
print('There were', len(ba_3), 'students in IN-BA3 over the years')
print('There were', len(ba_4), 'students in IN-BA4 over the years')
print('There were', len(ba_5), 'students in IN-BA5 over the years')
print('There were', len(ba_6), 'students in IN-BA6 over the years')

# compute the intersection by using sets
intersection_index = set(ba_1.index).intersection(set(ba_6.index))
intersection = ba_data[ba_data.index.isin(intersection_index)].sort_index()

print('There were', len(intersection_index), 'students in BA1 and thereafter in BA6 over the years')

There were 1720 students in IN-BA1 over the years
There were 1160 students in IN-BA2 over the years
There were 738 students in IN-BA3 over the years
There were 636 students in IN-BA4 over the years
There were 890 students in IN-BA5 over the years
There were 664 students in IN-BA6 over the years
There were 397 students in BA1 and thereafter in BA6 over the years
