In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import urllib.parse
import re

Two base URLs are used to retrieve data: ```URL_FORM_BASE``` for the form and ```URL_DATA_BASE``` for the data (the table which contains the actual information).

In [2]:
# Base URL of the filter form (the page holding the <select> fields).
URL_FORM_BASE = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter"
# Base URL of the report data (the table which contains the actual information).
URL_DATA_BASE = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html"

By first inspecting the source code of the page, we can see that the form has some hidden fields; they tell the backend what kind of data to retrieve. We can thus build a dictionary of "default parameters".

In [3]:
# Values of the form's hidden fields; they are merged into every query string
# built by get_form_url / get_data_url.
FORM_PARAM_DEFAULT = {
    "ww_v_list" : 1,
    "ww_i_reportmodel": "133685247",
    #"ww_c_langue": 'en',
}

We then take an interest in the form itself. It has 5 different fields, each with a specific name attribute (when the form is submitted, the name attributes are used to generate the URL). We can thus retrieve all the options of each field by relying on its name attribute.

We will only work with the HTML data, so we take the parameters directly from the source code and don't bother parsing the HTML page to retrieve them (for HTML, use a value of 133685270 for the ww_i_reportModelXsl attribute).

In [4]:
def request2soup(url, params, timeout=30):
    """Fetch a page with an HTTP GET and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address to request.
    params : dict
        Query-string parameters forwarded to ``requests.get``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the "lxml" parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with a status code other than 200.
    """
    r = requests.get(url, params=params, timeout=timeout)
    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, which would let error pages be parsed silently.
    if r.status_code != 200:
        raise requests.HTTPError(
            "Unexpected status code {} for {}".format(r.status_code, r.url),
            response=r,
        )
    return BeautifulSoup(r.text, "lxml")

def get_select_options(soup, name_attr):
    """Collect the options of the <select> field named ``name_attr``.

    Only the first matching <select> is inspected (a field name is assumed
    to be unique in the form). Options are looked up with ``value=True`` and
    ``selected=False``, and options with an empty text are skipped.

    Returns
    -------
    dict
        Mapping of each option's ``value`` attribute to its text.
    """
    selector = 'select[name={}]'.format(name_attr)
    select_tag = soup.select(selector)[0]

    options = {}
    for option in select_tag.find_all('option', value=True, selected=False):
        if option.text != '':
            options[option['value']] = option.text
    return options

def get_fields_options(soup, fields):
    """Gather the options of several form fields at once.

    ``fields`` maps a human-readable label to the field's name attribute;
    only the name attributes are used. The result maps each name attribute
    to the dictionary returned by ``get_select_options`` for that field.
    """
    return {
        name_attr: get_select_options(soup, name_attr)
        for name_attr in fields.values()
    }

# Form fields whose options are collected below. Keys are the labels shown on
# the page; values are the HTML name attributes used in the query string.
FORM_FIELDS = {
    # Label : Name Attribute
    'Unité académique': 'ww_x_UNITE_ACAD',
    'Période académique': 'ww_x_PERIODE_ACAD',
    'Période pédagogique': 'ww_x_PERIODE_PEDAGO',
    'Type de semestre': 'ww_x_HIVERETE',
}

# Download and parse the form page (performs a network request).
soup_index = request2soup(URL_FORM_BASE, FORM_PARAM_DEFAULT)

# {name attribute: {option value: option text}} for every field in FORM_FIELDS.
FORM_FIELDS_OPTIONS = get_fields_options(soup_index, FORM_FIELDS)

In [5]:
# Display the options scraped from the form, keyed by field name attribute.
FORM_FIELDS_OPTIONS

{'ww_x_HIVERETE': {'2936286': "Semestre d'automne",
  '2936295': 'Semestre de printemps'},
 'ww_x_PERIODE_ACAD': {'123455150': '2011-2012',
  '123456101': '2012-2013',
  '213637754': '2013-2014',
  '213637922': '2014-2015',
  '213638028': '2015-2016',
  '355925344': '2016-2017',
  '39486325': '2010-2011',
  '978181': '2007-2008',
  '978187': '2008-2009',
  '978195': '2009-2010'},
 'ww_x_PERIODE_PEDAGO': {'2063602308': 'Mise à niveau',
  '2226616': 'Stage automne 4ème année',
  '2226626': 'Stage printemps 4ème année',
  '2226768': 'Bachelor semestre 5b',
  '2226785': 'Bachelor semestre 6b',
  '2227132': 'Stage printemps master',
  '2230106': 'Master semestre 1',
  '2230128': 'Master semestre 3',
  '2230140': 'Master semestre 4',
  '2335667': 'Mineur semestre 1',
  '2335676': 'Mineur semestre 2',
  '249108': 'Bachelor semestre 1',
  '249114': 'Bachelor semestre 2',
  '249127': 'Projet Master automne',
  '2754553': 'Semestre printemps',
  '3781783': 'Projet Master printemps',
  '942120': 

Submitting the form returns, below the form, a table filled with links. Of course ISA does not want to play it simple: instead of the href attribute, the links rely on JavaScript for their behaviour. When clicked, a link calls the JS function "loadReport", which generates the "data URL", i.e. the URL from which we will be able to get the data we are interested in. This URL is normally used to fill a frame inside the page, but we can use it directly.

We first create a "metadata" dataframe to ease our further investigation.

In [6]:
def get_form_url(params):
    """Build the URL of the filter form for the given query parameters.

    The entries of ``FORM_PARAM_DEFAULT`` are merged on top of ``params``
    (defaults win on a key clash) before the query string is encoded.
    ``params`` itself is left untouched.
    """
    query = dict(params)
    query.update(FORM_PARAM_DEFAULT)
    return URL_FORM_BASE + "?" + urllib.parse.urlencode(query)

def get_data_url(params, GPS=-1, type=133685270):
    """Build the URL of the report data for the given query parameters.

    The query string is assembled from, in increasing precedence:
    ``params``, ``FORM_PARAM_DEFAULT``, then ``ww_x_GPS`` (from ``GPS``) and
    ``ww_i_reportModelXsl`` (from ``type``; the default 133685270 is the
    value for the HTML report). ``params`` itself is left untouched.
    """
    query = dict(params)
    query.update(FORM_PARAM_DEFAULT)
    query.update({'ww_x_GPS': GPS, 'ww_i_reportModelXsl': type})
    return URL_DATA_BASE + "?" + urllib.parse.urlencode(query)

--------------

In [7]:
# Bachelor semester 1 (period 249108) of academic year 2007-2008 (978181)
# for academic unit 249847.
params = dict(
    ww_x_UNITE_ACAD=249847,
    ww_x_PERIODE_ACAD=978181,
    ww_x_PERIODE_PEDAGO=249108,
    ww_x_HIVERETE='',
)
ba1_url = get_data_url(params)
print(ba1_url)
ba1 = pd.read_html(ba1_url, header=1)[0]

# Same unit and year, bachelor semester 2 (period 249114).
params = dict(
    ww_x_UNITE_ACAD=249847,
    ww_x_PERIODE_ACAD=978181,
    ww_x_PERIODE_PEDAGO=249114,
    ww_x_HIVERETE='',
)
ba2_url = get_data_url(params)
print(ba2_url)
ba2 = pd.read_html(ba2_url, header=1)[0]


http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_i_reportModelXsl=133685270&ww_x_PERIODE_ACAD=978181&ww_x_UNITE_ACAD=249847&ww_v_list=1&ww_x_GPS=-1&ww_i_reportmodel=133685247&ww_x_HIVERETE=&ww_x_PERIODE_PEDAGO=249108
http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_i_reportModelXsl=133685270&ww_x_PERIODE_ACAD=978181&ww_x_UNITE_ACAD=249847&ww_v_list=1&ww_x_GPS=-1&ww_i_reportmodel=133685247&ww_x_HIVERETE=&ww_x_PERIODE_PEDAGO=249114


In [268]:
test2 = ba2.copy()[['No Sciper', 'Statut']]
test2['semester'] = 'BA1'
test2['period'] = '2007-2008'
test2 = test2.set_index(['No Sciper'])

In [269]:
test = ba1.copy()[['No Sciper', 'Statut']]
test['semester'] = 'BA2'
test['period'] = '2007-2008'
test = test.set_index(['No Sciper'])

In [270]:
# Stack both semester tables into one frame indexed by 'No Sciper'.
t = pd.concat([test2, test])

In [271]:
# Add 'semester' as a second index level next to 'No Sciper'.
t = t.set_index('semester', append=True)

In [272]:
# Order rows by index so the rows sharing a 'No Sciper' end up adjacent.
t = t.sort_index()

In [8]:
# Bachelor semesters to scrape: option value sent as ww_x_PERIODE_PEDAGO
# mapped to the label stored in the 'semester' column.
BACHELOR = {  
    '249108': 'Bachelor semestre 1',
    '249114': 'Bachelor semestre 2',
    '942155': 'Bachelor semestre 3',
    '942163': 'Bachelor semestre 4',
    '942120': 'Bachelor semestre 5',
    '942175': 'Bachelor semestre 6',
    '2226768': 'Bachelor semestre 5b',
    '2226785': 'Bachelor semestre 6b',
}

def get_bachelor_data(unit_acad=249847):
    """Download the student table of every bachelor semester and academic period.

    One table is requested per (pedagogical period, academic period) pair
    taken from ``BACHELOR`` and ``FORM_FIELDS_OPTIONS['ww_x_PERIODE_ACAD']``.
    Pairs whose table cannot be loaded are reported with a "No Data" line and
    skipped.

    Parameters
    ----------
    unit_acad : int, optional
        Value sent as ``ww_x_UNITE_ACAD``. Defaults to 249847, the value that
        used to be hard-coded here.

    Returns
    -------
    pandas.DataFrame
        Columns ``Statut`` and ``period``, indexed by
        (``No Sciper``, ``semester``) and sorted by that index.

    Raises
    ------
    ValueError
        If no pair yielded any data.
    """
    periods = FORM_FIELDS_OPTIONS['ww_x_PERIODE_ACAD']
    data_list = []

    for peda, semester_label in BACHELOR.items():
        for period, period_label in periods.items():
            params = {
                'ww_x_UNITE_ACAD': unit_acad,
                'ww_x_PERIODE_ACAD': period,
                'ww_x_PERIODE_PEDAGO': peda,
                'ww_x_HIVERETE': '',
            }
            try:
                data = pd.read_html(get_data_url(params), header=1)[0]
                df = data[['No Sciper', 'Statut']].copy()
            except Exception:
                # `Exception` instead of a bare `except:` so that Ctrl-C
                # (KeyboardInterrupt) and SystemExit still stop the scrape.
                print("No Data : " + semester_label + " " + period_label)
                continue

            df['semester'] = semester_label
            df['period'] = period_label
            data_list.append(df.set_index(['No Sciper']))

    if not data_list:
        # pd.concat([]) would fail with a much less helpful message.
        raise ValueError("No data could be retrieved for any bachelor semester")

    d = pd.concat(data_list)
    d = d.set_index('semester', append=True)
    return d.sort_index()

In [54]:
# Scrape every bachelor semester / academic period pair (one request each).
d = get_bachelor_data()

No Data : Bachelor semestre 5b 2014-2015
No Data : Bachelor semestre 5b 2012-2013
No Data : Bachelor semestre 5b 2011-2012
No Data : Bachelor semestre 5b 2013-2014
No Data : Bachelor semestre 5b 2015-2016
No Data : Bachelor semestre 5b 2016-2017
No Data : Bachelor semestre 5b 2009-2010
No Data : Bachelor semestre 5b 2008-2009
No Data : Bachelor semestre 5b 2010-2011
No Data : Bachelor semestre 5b 2007-2008
No Data : Bachelor semestre 6b 2014-2015
No Data : Bachelor semestre 6b 2012-2013
No Data : Bachelor semestre 6b 2011-2012
No Data : Bachelor semestre 6b 2013-2014
No Data : Bachelor semestre 6b 2015-2016
No Data : Bachelor semestre 6b 2016-2017
No Data : Bachelor semestre 6b 2009-2010
No Data : Bachelor semestre 6b 2008-2009
No Data : Bachelor semestre 6b 2010-2011
No Data : Bachelor semestre 6b 2007-2008


In [56]:
# Independent copy of the scraped frame (presumably kept as a backup so the
# scrape need not be repeated; `f` is not used in the cells shown here).
f = d.copy()

In [57]:
# Preview the first rows of the (No Sciper, semester)-indexed frame.
d.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Statut,period
No Sciper,semester,Unnamed: 2_level_1,Unnamed: 3_level_1
,Bachelor semestre 3,,2008-2009
,Bachelor semestre 3,,2007-2008
,Bachelor semestre 4,,2008-2009
,Bachelor semestre 4,,2007-2008
147008.0,Bachelor semestre 1,Présent,2008-2009
147008.0,Bachelor semestre 2,Présent,2008-2009
147008.0,Bachelor semestre 3,Présent,2009-2010
147008.0,Bachelor semestre 4,Présent,2009-2010
147008.0,Bachelor semestre 5,Congé,2010-2011
147008.0,Bachelor semestre 6,Congé,2010-2011


In [58]:
# Turn the (No Sciper, semester) index levels back into ordinary columns.
# NOTE: not idempotent - running this cell a second time adds an 'index' column.
d = d.reset_index()

In [59]:
# Preview the flattened frame.
d.head(20)

Unnamed: 0,No Sciper,semester,Statut,period
0,,Bachelor semestre 3,,2008-2009
1,,Bachelor semestre 3,,2007-2008
2,,Bachelor semestre 4,,2008-2009
3,,Bachelor semestre 4,,2007-2008
4,147008.0,Bachelor semestre 1,Présent,2008-2009
5,147008.0,Bachelor semestre 2,Présent,2008-2009
6,147008.0,Bachelor semestre 3,Présent,2009-2010
7,147008.0,Bachelor semestre 4,Présent,2009-2010
8,147008.0,Bachelor semestre 5,Congé,2010-2011
9,147008.0,Bachelor semestre 6,Congé,2010-2011


In [60]:
# Rows without a Sciper number carry no student (see the NaN rows in the
# preview of `d`); they also force the column to float, so a plain str() would
# produce '147008.0' / 'nan' and lookups such as == '169569' would never match.
d = d.dropna(subset=['No Sciper']).copy()
d['No Sciper'] = d['No Sciper'].astype(int).astype(str)
grouped = d.groupby(['No Sciper'])

In [61]:
# Keep the students that appear in both the first and the sixth bachelor
# semester. Only the 'semester' column is inspected: testing membership on
# `x.values` would search every column of the group.
_required_semesters = {"Bachelor semestre 1", "Bachelor semestre 6"}
filtered = grouped.filter(lambda x: _required_semesters.issubset(x['semester']))

In [64]:
# Spot-check a single student: all rows recorded for Sciper 169569.
d[d['No Sciper'] == '169569']

Unnamed: 0,No Sciper,semester,Statut,period
88,169569,Bachelor semestre 1,Présent,2007-2008
89,169569,Bachelor semestre 2,Présent,2007-2008
90,169569,Bachelor semestre 5,Présent,2009-2010
91,169569,Bachelor semestre 6,Présent,2009-2010
5600,169569,Bachelor semestre 3,Présent,2008-2009
5601,169569,Bachelor semestre 4,Présent,2008-2009


In [65]:
# Preview the retained students, indexed and sorted by (No Sciper, semester).
filtered.set_index(['No Sciper', "semester"]).sort_index().head(100)

Unnamed: 0_level_0,Unnamed: 1_level_0,Statut,period
No Sciper,semester,Unnamed: 2_level_1,Unnamed: 3_level_1
147008,Bachelor semestre 1,Présent,2008-2009
147008,Bachelor semestre 2,Présent,2008-2009
147008,Bachelor semestre 3,Présent,2009-2010
147008,Bachelor semestre 4,Présent,2009-2010
147008,Bachelor semestre 5,Congé,2010-2011
147008,Bachelor semestre 6,Congé,2010-2011
169569,Bachelor semestre 1,Présent,2007-2008
169569,Bachelor semestre 2,Présent,2007-2008
169569,Bachelor semestre 3,Présent,2008-2009
169569,Bachelor semestre 4,Présent,2008-2009
