In [1]:
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import re
from itertools import * 
import concurrent.futures



Extract all request parameters

In [2]:
# Public ISA report filter form. Its <input>/<select> elements list every
# value that can later be sent to the report endpoint.
FILTER_URL = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportmodel=133685247&ww_c_langue=&ww_i_reportModelXsl=133685270&zz_x_UNITE_ACAD=&ww_x_UNITE_ACAD=942293&zz_x_PERIODE_ACAD=&ww_x_PERIODE_ACAD=355925344&zz_x_PERIODE_PEDAGO=&ww_x_PERIODE_PEDAGO=942155&zz_x_HIVERETE=&ww_x_HIVERETE=2936286&dummy=ok"

# Use the response as a context manager so the underlying connection is
# closed as soon as the body has been read instead of being leaked.
with urllib.request.urlopen(FILTER_URL) as response:
    html_doc = response.read()
soup = BeautifulSoup(html_doc, 'html.parser')

In [29]:
# Position, among a result row's child elements, of the cell that parsePerson
# inspects to reject malformed rows (presumably the "Statut" column — TODO confirm).
OFFSET_STATUS = 7

def peek(iterable):
    """Return None when `iterable` is empty, otherwise an equivalent iterator.

    The first element is pulled to find out whether anything is there and is
    then chained back in front of the remaining elements, so nothing is lost.

    Parameters:
        iterable: any iterable (list, generator, iterator, ...).

    Returns:
        None if `iterable` yields no element, else an iterator over all of
        its elements in their original order.
    """
    # Normalise first: calling next() directly on a list or other
    # non-iterator iterable raises TypeError.
    iterator = iter(iterable)
    try:
        first = next(iterator)
    except StopIteration:
        return None
    return chain([first], iterator)

def buildRequest(params):
    """Return the report URL for the given query parameters.

    Parameters:
        params: mapping of query-parameter name to value; values are
            converted with str() before being encoded.

    Returns:
        The report endpoint URL followed by the URL-encoded query string.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urlencode

    # urlencode escapes reserved characters in keys and values and joins the
    # pairs with '&' without leaving a dangling separator at the end.
    return "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?" + urlencode(params)


def parsePeriod(period):
    """Split a pedagogic-period label into a (level, semester) pair.

    Parameters:
        period: label such as "Bachelor semestre 3" or "Master semestre 1".

    Returns:
        A 2-tuple (level, semester). `level` is 'Bachelor' or 'Master' and
        `semester` is whatever follows "<level> semestre " in the label.
        Labels that match neither pattern yield ('ToDo', 'ToDo').
    """
    for level in ('Bachelor', 'Master'):
        marker = level + ' semestre'
        if marker in period:
            # Skip the marker plus the single space that separates it from
            # the semester number.
            return (level, period[len(marker) + 1:])

    # Always return a pair: callers index the result with [0] and [1], and a
    # bare 'ToDo' string would silently yield 'T' and 'o'.
    # TODO: add support for master thesis and others
    return ('ToDo', 'ToDo')
    
def getSemester(semester):
    """Encode a semester label as an integer.

    Returns 0 for the exact label "Semestre d'automne" and 1 for any other
    value.
    """
    return 0 if semester == "Semestre d'automne" else 1
    
def parseTitle(e):
    """Pair the first content item of each child of `e` with that child's position.

    Parameters:
        e: element exposing findChildren(); each child must have a non-empty
            `contents` sequence.

    Returns:
        A list of (first_content, index) tuples in child order.
    """
    columns = []
    for position, cell in enumerate(e.findChildren()):
        columns.append((cell.contents[0], position))
    return columns

def parseResult(html):
    """Turn a report page into a DataFrame with one row per listed person.

    The first <tr> of the page is discarded, the second one is read as the
    column header (via parseTitle) and every following <tr> is treated as a
    candidate person row. Rows that are rejected contribute an empty frame.

    Parameters:
        html: markup of the report page, as accepted by BeautifulSoup.

    Returns:
        A DataFrame indexed by the 'No Sciper' value of each person, or an
        empty DataFrame when the page has no header row or no person rows.
    """
    def firstContent(cell):
        # Empty cells are represented by an empty string.
        children = cell.contents
        return children[0] if len(children) > 0 else ''

    def parsePerson(person, title):
        """Return {column name: cell value} for one row, or None if rejected."""
        cells = person.findChildren()
        if len(cells) < 12:
            return None
        # for when we have several tables because of Tous
        if "<th>Civilité</th>" in str(cells):
            return None
        # The element following the status cell is markup rather than text
        # (seen for at least one record in the source data).
        if "<td" in str(cells[OFFSET_STATUS].next):
            return None
        return {name: firstContent(cells[position]) for name, position in title}

    def genPersonDf(person, title):
        """Wrap one parsed row in a single-row DataFrame (empty if rejected)."""
        record = parsePerson(person, title)
        if record is None:
            return pd.DataFrame()
        return pd.DataFrame(record, index=[record['No Sciper']])

    rows = BeautifulSoup(html, 'html.parser').findAll('tr')[1:]
    if len(rows) == 0:
        return pd.DataFrame()

    title = parseTitle(rows[0])
    # peek() yields None when there is no person row, which pd.concat would reject.
    frames = peek(genPersonDf(row, title) for row in rows[1:])
    if frames is None:
        return pd.DataFrame()
    return pd.concat(frames)



def genDF(params):
    """Download one report and return it as a DataFrame tagged with its parameters.

    Parameters:
        params: indexable of four option elements, in the order
            (pedagogic period, academic period, academic unit, semester).
            Each must support ['value'] and `.next` (presumably the option's
            visible label — TODO confirm).

    Returns:
        The DataFrame produced by parseResult with the extra columns
        'year', 'section', 'period', 'level' and 'semester'.
    """
    pedagogicPeriod = params[0]
    academicPeriod = params[1]
    academicUnit = params[2]
    semesterOption = params[3]

    query = {
        'ww_x_GPS': ww_x_GPS,
        'ww_i_reportModel': ww_i_reportModel,
        'ww_i_reportModelXsl': ww_i_reportModelXsl,
        'ww_x_PERIODE_ACAD': academicPeriod['value'],
        'ww_x_UNITE_ACAD': academicUnit['value'],
        'ww_x_PERIODE_PEDAGO': pedagogicPeriod['value'],
        'ww_x_HIVERETE': semesterOption['value'],
    }

    with urllib.request.urlopen(buildRequest(query)) as response:
        df = parseResult(response.read())
        levelAndSemester = parsePeriod(pedagogicPeriod.next)
        df['year'] = academicPeriod.next
        df['section'] = academicUnit.next
        df['period'] = getSemester(semesterOption.next)
        df['level'] = levelAndSemester[0]
        df['semester'] = levelAndSemester[1]
        return df

def _inputValue(name):
    """Return the 'value' attribute of the first <input> called `name` on the filter page."""
    matches = soup.findAll("input", {"name": name})
    return matches[0]['value']


def _selectOptions(name):
    """Return the children of the first <select> called `name`, minus the first one.

    The first child is the null choice of the drop-down, so it is dropped.
    """
    dropdown = soup.findAll("select", {"name": name})[0]
    return dropdown.findChildren()[1:]


# Presumably -1 means "select all" — TODO confirm against the service.
ww_x_GPS = -1

ww_i_reportModel = _inputValue("ww_i_reportmodel")

# The first matching value is the one for the HTML report.
ww_i_reportModelXsl = _inputValue("ww_i_reportModelXsl")

# Every selectable value of the four drop-downs of the filter form.
ww_x_UNITE_ACADs = _selectOptions("ww_x_UNITE_ACAD")
ww_x_PERIODE_ACADs = _selectOptions("ww_x_PERIODE_ACAD")
ww_x_PERIODE_PEDAGOs = _selectOptions("ww_x_PERIODE_PEDAGO")
ww_x_HIVERETEs = _selectOptions("ww_x_HIVERETE")

def _labelled(keyword, options):
    """Return the options whose `.next` contains `keyword`, in their original order."""
    return [option for option in options if keyword in option.next]


# Each iterator yields (pedagogic period, academic period, academic unit,
# semester) tuples, which is the order genDF expects.
# Only the "Informatique" unit is kept, while all academic periods are used.
# To restrict to one year, filter ww_x_PERIODE_ACADs the same way, e.g. on "2015-2016".
itBachelor = product(
    _labelled("Bachelor", ww_x_PERIODE_PEDAGOs),
    ww_x_PERIODE_ACADs,
    _labelled("Informatique", ww_x_UNITE_ACADs),
    ww_x_HIVERETEs,
)

itMaster = product(
    _labelled("Master", ww_x_PERIODE_PEDAGOs),
    ww_x_PERIODE_ACADs,
    _labelled("Informatique", ww_x_UNITE_ACADs),
    ww_x_HIVERETEs,
)

# Upper bound on the number of reports downloaded concurrently.
MAX_WORKERS = 50


def fetchReports(paramCombinations, maxWorkers=MAX_WORKERS):
    """Download every report described by `paramCombinations` and stack the results.

    Parameters:
        paramCombinations: iterable of parameter tuples accepted by genDF.
        maxWorkers: size of the thread pool used for the downloads.

    Returns:
        One DataFrame holding the rows of all reports, in the order of
        `paramCombinations`; an empty DataFrame when there is nothing to fetch.

    Raises:
        Whatever genDF raises for any of the combinations.
    """
    # The with statement ensures the worker threads are cleaned up promptly.
    with concurrent.futures.ThreadPoolExecutor(max_workers=maxWorkers) as executor:
        # executor.map returns results in submission order, so the stacked
        # frame has the same row order on every run (as_completed does not).
        frames = list(executor.map(genDF, paramCombinations))

    # pd.concat raises ValueError when given no frames at all.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)


dfB = fetchReports(itBachelor)
dfM = fetchReports(itMaster)

Unnamed: 0,Civilité,Ecole Echange,Filière opt.,Mineur,No Sciper,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Statut,Type Echange,level,period,section,semester,year
246671,Monsieur,,,,246671,Abate Bryan Jeremy,,,,Présent,,Bachelor,0,Informatique,3,2015-2016
251759,Monsieur,,,,251759,Alami-Idrissi Ali,,,,Présent,,Bachelor,0,Informatique,3,2015-2016
248575,Monsieur,,,,248575,Albergoni Tobia,,,,Présent,,Bachelor,0,Informatique,3,2015-2016
249498,Monsieur,,,,249498,Aoun Leonardo,,,,Présent,,Bachelor,0,Informatique,3,2015-2016
234551,Monsieur,,,,234551,Bachmann Roman Christian,,,,Présent,,Bachelor,0,Informatique,3,2015-2016
249694,Monsieur,,,,249694,Badoux Christophe Dylan,,,,Présent,,Bachelor,0,Informatique,3,2015-2016
236818,Monsieur,,,,236818,Ballerini Marco Roberto Julian,,,,Présent,,Bachelor,0,Informatique,3,2015-2016
208714,Monsieur,,,,208714,Bardet Mike Douglas,,,,Présent,,Bachelor,0,Informatique,3,2015-2016
246184,Monsieur,,,,246184,Barman Raphaël,,,,Présent,,Bachelor,0,Informatique,3,2015-2016
249733,Monsieur,,,,249733,Bazzucchi Vincenzo,,,,Présent,,Bachelor,0,Informatique,3,2015-2016


In [16]:
# Number of rows per status value, Bachelor first and Master second.
print(dfB['Statut'].value_counts(), dfM['Statut'].value_counts(), sep='\n')

Présent    5502
Congé       296
Attente       9
Name: Statut, dtype: int64
Présent    2485
Stage       282
Congé        77
Name: Statut, dtype: int64


In [20]:
# Show the contents of the first semester option; getSemester maps the label
# "Semestre d'automne" to 0 and every other label to 1.
ww_x_HIVERETEs[0].contents

["Semestre d'automne"]

In [26]:
# Number of Bachelor rows per value of the 'semester' column set by genDF.
dfB['semester'].value_counts()

1    216
2    171
5    128
6    104
3     92
4     89
Name: semester, dtype: int64