In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import urllib.parse
import re

Two base URLs are used to retrieve the data: ```URL_FORM_BASE``` for the form and ```URL_DATA_BASE``` for the data (the table that contains the actual information).

In [None]:
# Base URL of the page that serves the search form.
URL_FORM_BASE = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter'
# Base URL of the page that serves the data (the table with the actual information).
URL_DATA_BASE = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html'

By first inspecting the source code of the page, we can see that the form has some hidden fields; they tell the backend what kind of data to retrieve. We can thus build a dictionary of "default parameters".

In [None]:
# Hidden form fields sent along with every request (form page and data page).
FORM_PARAM_DEFAULT = dict(
    ww_v_list=1,
    ww_i_reportmodel="133685247",
    # ww_c_langue='en',  # disabled; presumably selects the report language -- confirm
)

We then turn to the form itself. We can see that it has 5 different fields, each with a specific name attribute. (When the form is "posted", the name attribute is used to generate the URL.) We can thus retrieve all the options of each field by relying on its name attribute.

We will only work with the HTML data, so we take the parameters directly from the source code and don't bother parsing the HTML page to retrieve them (for HTML, use a value of 133685270 for the ww_i_reportModelXsl attribute).

In [None]:
def request2soup(url, params, timeout=30):
    """Fetch a page with a GET request and parse it into a BeautifulSoup tree.

    Args:
        url: address of the page to download.
        params: query-string parameters, passed to ``requests.get``.
        timeout: seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block forever on an unresponsive host.

    Returns:
        BeautifulSoup: the response body parsed with the "lxml" parser.

    Raises:
        requests.HTTPError: if the response status code is not 200.
        requests.RequestException: on connection problems or timeout.
    """
    r = requests.get(url, params=params, timeout=timeout)
    # An explicit check instead of `assert`: assertions are removed when
    # Python runs with -O, which would let error pages be parsed silently.
    if r.status_code != 200:
        raise requests.HTTPError(
            "Unexpected HTTP status {} for {}".format(r.status_code, r.url),
            response=r,
        )
    return BeautifulSoup(r.text, "lxml")

def get_select_options(soup, name_attr):
    """Collect the options of the ``<select>`` element with the given name.

    Only the first matching ``<select>`` is used (a field name is assumed to
    be unique in the page). Options are looked up with ``value=True`` and
    ``selected=False``; options whose text is empty are dropped.

    Args:
        soup: parsed page exposing ``select`` (CSS selector lookup).
        name_attr: value of the ``name`` attribute of the ``<select>``.

    Returns:
        dict: option ``value`` attribute -> option text.
    """
    selector = 'select[name={}]'.format(name_attr)
    select_tag = soup.select(selector)[0]

    options = {}
    for opt in select_tag.find_all('option', value=True, selected=False):
        if opt.text != '':
            key = opt['value']
            options[key] = opt.text
    return options

def get_fields_options(soup, fields):
    """Gather the options of several form fields at once.

    Args:
        soup: parsed form page, forwarded to ``get_select_options``.
        fields: mapping of label -> ``name`` attribute; only the values
            (the name attributes) are used.

    Returns:
        dict: name attribute -> options dict returned by ``get_select_options``.
    """
    return {
        name_attr: get_select_options(soup, name_attr)
        for name_attr in fields.values()
    }

# Form fields we query, as (label, name attribute of the <select>) pairs.
FORM_FIELDS = dict([
    ('Unité académique', 'ww_x_UNITE_ACAD'),
    ('Période académique', 'ww_x_PERIODE_ACAD'),
    ('Période pédagogique', 'ww_x_PERIODE_PEDAGO'),
    ('Type de semestre', 'ww_x_HIVERETE'),
])

# Download and parse the form page once, using the default hidden parameters.
soup_index = request2soup(url=URL_FORM_BASE, params=FORM_PARAM_DEFAULT)

# For every field of interest: name attribute -> {option value: option text}.
FORM_FIELDS_OPTIONS = get_fields_options(soup=soup_index, fields=FORM_FIELDS)

In [None]:
# Show the options scraped for every form field (name attribute -> {value: text}).
FORM_FIELDS_OPTIONS

By submitting the form, we get back a table filled with links at the bottom of the form. Of course, ISA does not want to play it simple: instead of the href attribute, it relies on JavaScript for the link behaviour. When clicked, a link calls the JS function "loadReport", which generates the "data URL", i.e. the URL from which we will be able to get the data we are interested in. This URL is normally used to fill a frame inside the page, but we can use it directly.

We first create a "metadata" dataframe to ease our further investigation.

In [None]:
def get_form_url(params):
    """Build the URL of the form page for the given field values.

    Args:
        params: field name -> value; entries of ``FORM_PARAM_DEFAULT``
            override entries of ``params`` that share the same key.

    Returns:
        str: ``URL_FORM_BASE`` followed by the URL-encoded query string.
    """
    query = dict(params)
    query.update(FORM_PARAM_DEFAULT)  # defaults take precedence over params
    return "{}?{}".format(URL_FORM_BASE, urllib.parse.urlencode(query))

def get_data_url(params, GPS=-1, type=133685270):
    """Build the URL of the data page (the report table) for the given fields.

    Args:
        params: field name -> value.
        GPS: value sent as ``ww_x_GPS``.
        type: value sent as ``ww_i_reportModelXsl`` (the notebook text gives
            133685270 as the value for HTML output).

    Returns:
        str: ``URL_DATA_BASE`` followed by the URL-encoded query string.
        On key clashes ``FORM_PARAM_DEFAULT`` overrides ``params``, and the
        two report arguments override both.
    """
    query = dict(params)
    query.update(FORM_PARAM_DEFAULT)
    query.update({'ww_x_GPS': GPS, 'ww_i_reportModelXsl': type})
    return "{}?{}".format(URL_DATA_BASE, urllib.parse.urlencode(query))

--------------

In [None]:
# Bachelor semesters to query, as (ww_x_PERIODE_PEDAGO value, label) pairs.
BACHELOR = dict([
    ('249108', 'Bachelor semestre 1'),
    ('249114', 'Bachelor semestre 2'),
    ('942155', 'Bachelor semestre 3'),
    ('942163', 'Bachelor semestre 4'),
    ('942120', 'Bachelor semestre 5'),
    ('942175', 'Bachelor semestre 6'),
    # Disabled entries, kept for reference:
    # ('2226768', 'Bachelor semestre 5b'),
    # ('2226785', 'Bachelor semestre 6b'),
])

def get_bachelor_data():
    """Download the student tables of every bachelor semester and academic period.

    For each (semester, academic period) pair the report table is read from
    the URL built by ``get_data_url`` and reduced to the 'No Sciper',
    'Statut' and 'Civilité' columns; a 'semester' and a 'period' column
    holding the corresponding labels are then added. Pairs whose report
    cannot be loaded are reported on stdout and skipped.

    Returns:
        pandas.DataFrame: the per-report frames concatenated (original row
        indexes are kept), or an empty frame with the expected columns when
        no report could be loaded.
    """
    columns = ['No Sciper', 'Statut', 'Civilité']
    periods = FORM_FIELDS_OPTIONS['ww_x_PERIODE_ACAD']
    frames = []

    for peda, semester_label in BACHELOR.items():
        for period, period_label in periods.items():
            params = {
                # NOTE(review): hard-coded academic unit id; presumably one of
                # the keys of FORM_FIELDS_OPTIONS['ww_x_UNITE_ACAD'] -- confirm.
                'ww_x_UNITE_ACAD': 249847,
                'ww_x_PERIODE_ACAD': period,
                'ww_x_PERIODE_PEDAGO': peda,
                'ww_x_HIVERETE': '',
            }
            try:
                table = pd.read_html(get_data_url(params), header=1)[0]
                # Subset first, then copy: only the needed columns are duplicated.
                df = table[columns].copy()
            except Exception as exc:
                # Best effort: many combinations simply have no report. Unlike a
                # bare `except:`, this lets KeyboardInterrupt/SystemExit through
                # and shows the real cause instead of hiding it.
                print("No Data : " + semester_label + " " + period_label
                      + " (" + repr(exc) + ")")
                continue

            df['semester'] = semester_label
            df['period'] = period_label
            frames.append(df)

    if not frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=columns + ['semester', 'period'])
    return pd.concat(frames)

In [None]:
# Fetch every bachelor report (network access, can be slow).
d = get_bachelor_data()
# Store the student identifier as text.
d['No Sciper'] = d['No Sciper'].map(str)
# Independent copy of the frame, kept under the name `f`.
f = d.copy()

In [None]:
# Keep only the students that appear in both the first and the sixth bachelor
# semester. The membership test is restricted to the 'semester' column: the
# previous test on `x.values` compared the labels against every column of the
# group. Requiring both labels already implies at least two rows per student.
grouped = d.groupby(['No Sciper'])
filtered = grouped.filter(
    lambda group: {"Bachelor semestre 1", "Bachelor semestre 6"}.issubset(group['semester'])
)
# NOTE(review): .head(100) truncates `filtered` itself, not just its display;
# drop it here if later cells need the complete result.
filtered = filtered.set_index(['No Sciper', 'semester']).sort_index().head(100)

In [None]:
# Display at most the first 100 rows of the filtered frame.
filtered.head(100)