In [1]:
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
import re
from itertools import * 
import concurrent.futures



Extract all request parameters

In [2]:
# URL of the ISA public-report filter form. The <input> and <select> elements
# of this page are read later in the notebook to obtain the request parameters.
FILTER_URL = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportmodel=133685247&ww_c_langue=&ww_i_reportModelXsl=133685270&zz_x_UNITE_ACAD=&ww_x_UNITE_ACAD=942293&zz_x_PERIODE_ACAD=&ww_x_PERIODE_ACAD=355925344&zz_x_PERIODE_PEDAGO=&ww_x_PERIODE_PEDAGO=942155&zz_x_HIVERETE=&ww_x_HIVERETE=2936286&dummy=ok"

# Use a context manager so the HTTP response is closed as soon as the body
# has been read, instead of staying open until it is garbage-collected.
with urllib.request.urlopen(FILTER_URL) as response:
    html_doc = response.read()
soup = BeautifulSoup(html_doc, 'html.parser')

In [6]:
# Positions of the fields of interest within the list returned by
# `findChildren()` on one <tr> of the result table (used by `parsePerson`).
# Civility, e.g. "Madame" / "Monsieur".
OFFSET_GENDER = 0
# Full name of the person.
OFFSET_NAME = 1
# Status, e.g. "Présent" / "Congé".
OFFSET_STATUS = 7
# SCIPER number; also used as the DataFrame index.
OFFSET_SCIPER = 10

def peek(iterable):
    """Return an iterator over `iterable`, or ``None`` when it is empty.

    The first element is consumed to test for emptiness and then put back in
    front of the remaining elements, so the returned iterator yields exactly
    the same sequence as the input.

    Args:
        iterable: any iterable (list, generator, iterator, ...). It is wrapped
            with ``iter()`` so that plain containers work too; calling
            ``next()`` directly on a list would raise ``TypeError``.

    Returns:
        ``None`` if `iterable` has no elements, otherwise an iterator over
        all of its elements.
    """
    iterator = iter(iterable)
    try:
        first = next(iterator)
    except StopIteration:
        return None
    return chain([first], iterator)

def buildRequest(params):
    """Return the report URL for the given query parameters.

    Args:
        params: mapping of query-parameter name to value. Values are
            converted to text and percent-encoded, so characters such as
            spaces, ``&`` or ``=`` inside a value cannot corrupt the query
            string.

    Returns:
        The ``!GEDPUBLICREPORTS.html`` URL followed by the encoded query
        string, with parameters in the mapping's iteration order and no
        trailing ``&``.
    """
    # Function-local import keeps this block self-contained; the file only
    # imports `urllib.request` at the top.
    from urllib.parse import urlencode

    return "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?" + urlencode(params)


def parsePeriod(period):
    """Split a pedagogical-period label into ``(level, semester)``.

    Args:
        period: label text such as ``"Bachelor semestre 1"`` or
            ``"Master semestre 3"``.

    Returns:
        A ``(level, semester)`` tuple where `level` is ``'Bachelor'`` or
        ``'Master'`` and `semester` is the text that follows
        ``"<level> semestre "``; ``None`` when the label matches neither
        pattern.
    """
    for level in ('Bachelor', 'Master'):
        marker = level + ' semestre'
        start = period.find(marker)
        if start != -1:
            # Slice relative to where the marker was found (not from index 0)
            # so leading text or whitespace does not end up in the semester.
            # The +1 skips the space between the marker and the semester.
            return (level, period[start + len(marker) + 1:])

    # TODO: add support for master thesis and others
    return None
    
def parseResult(html):
    """Parse one report page into a DataFrame with one row per person.

    Every ``<tr>`` after the first two is handed to `parsePerson`; rows it
    rejects are dropped. The accepted rows are collected first and turned into
    a single DataFrame, instead of building one DataFrame per row (plus empty
    ones for rejected rows) and concatenating them.

    Args:
        html: markup of the report page, as accepted by ``BeautifulSoup``.

    Returns:
        A DataFrame with columns ``gender``, ``name``, ``status`` and
        ``sciper``, indexed by ``sciper``; an empty DataFrame when no row is
        accepted.
    """
    def parsePerson(person):
        """Return the fields of one table row as a dict, or ``None`` to skip it."""
        elems = person.find_all()
        if (len(elems) < 12
                # header row repeated when the page holds several tables (the "Tous" case)
                or "<th>Civilité</th>" in str(elems)
                # the element following the status cell is another <td>, i.e. the
                # status cell has no text of its own
                or "<td" in str(elems[OFFSET_STATUS].next)):
            return None
        return {
            'gender': elems[OFFSET_GENDER].next,
            'name': elems[OFFSET_NAME].next,
            'status': elems[OFFSET_STATUS].next,
            'sciper': elems[OFFSET_SCIPER].next,
        }

    soup = BeautifulSoup(html, 'html.parser')

    # The first two <tr> elements are never person rows and are skipped.
    people = [
        fields
        for fields in map(parsePerson, soup.find_all('tr')[2:])
        if fields is not None
    ]
    if not people:
        return pd.DataFrame()
    return pd.DataFrame(people, index=[fields['sciper'] for fields in people])



def genDF(params):
    """Download and parse the report for one combination of form options.

    Args:
        params: sequence of four ``<option>`` tags, in the order
            (PERIODE_PEDAGO, PERIODE_ACAD, UNITE_ACAD, HIVERETE). Each tag's
            ``value`` attribute is sent in the request, and the text that
            follows the tag labels the resulting rows.

    Returns:
        The DataFrame produced by `parseResult`, with the extra columns
        ``year``, ``section``, ``level`` and ``semester``. ``level`` and
        ``semester`` are ``None`` when `parsePeriod` does not recognise the
        pedagogical period.
    """
    pedago, acad, unite, hiverete = params[0], params[1], params[2], params[3]

    request = buildRequest({
        'ww_x_GPS': ww_x_GPS,
        'ww_i_reportModel': ww_i_reportModel,
        'ww_i_reportModelXsl': ww_i_reportModelXsl,
        'ww_x_PERIODE_ACAD': acad['value'],
        'ww_x_UNITE_ACAD': unite['value'],
        'ww_x_PERIODE_PEDAGO': pedago['value'],
        'ww_x_HIVERETE': hiverete['value']
    })

    # Only the download needs the connection; release it before parsing.
    with urllib.request.urlopen(request) as c:
        html = c.read()

    df = parseResult(html)
    df['year'] = acad.next
    df['section'] = unite.next

    # parsePeriod returns None for labels it does not support; fall back to
    # empty level/semester rather than failing with a TypeError on `None[0]`.
    period = parsePeriod(pedago.next)
    level, semester = period if period is not None else (None, None)
    df['level'] = level
    df['semester'] = semester
    return df

# Value sent as ww_x_GPS with every request (original author's note: "select all?").
ww_x_GPS = -1

ww_i_reportModel = soup.find_all("input", {"name": "ww_i_reportmodel"})[0]['value']

# select the first value as it is the html report
ww_i_reportModelXsl = soup.find_all("input", {"name": "ww_i_reportModelXsl"})[0]['value']

# For every <select>, drop the first child: it is the empty ("null") choice.
ww_x_UNITE_ACADs = soup.find_all("select", {"name": "ww_x_UNITE_ACAD"})[0].find_all()[1:]
ww_x_PERIODE_ACADs = soup.find_all("select", {"name": "ww_x_PERIODE_ACAD"})[0].find_all()[1:]
ww_x_PERIODE_PEDAGOs = soup.find_all("select", {"name": "ww_x_PERIODE_PEDAGO"})[0].find_all()[1:]
ww_x_HIVERETEs = soup.find_all("select", {"name": "ww_x_HIVERETE"})[0].find_all()[1:]

# Every combination of (pedagogical period, academic period, unit, season);
# the tuple order is the one genDF expects. Only "Bachelor" periods are kept.
# To narrow a run while experimenting, wrap one of the lists in a filter, e.g.
#   filter(lambda x: "2015-2016" in x.next, ww_x_PERIODE_ACADs)
#   filter(lambda x: "Architecture" in x.next, ww_x_UNITE_ACADs)
it = product(
    filter(lambda x: "Bachelor" in x.next, ww_x_PERIODE_PEDAGOs),
    ww_x_PERIODE_ACADs,
    ww_x_UNITE_ACADs,
    ww_x_HIVERETEs,
)

# Sequential equivalent, handy for debugging: pd.concat(list(map(genDF, it)))

# The with statement ensures the worker threads are cleaned up promptly.
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # Start all downloads, remembering which parameters each future belongs to.
    submitted = [(executor.submit(genDF, params), params) for params in it]
    future_to_url = dict(submitted)
    # Collect results in submission order (not completion order) so the row
    # order of `df` is the same on every run.
    df = pd.concat([future.result() for future, _ in submitted])
df
#     for future in concurrent.futures.as_completed(future_to_url):
#         url = future_to_url[future]
#         print(future.result())
#         try:
#             data = future.result()
#         except Exception as exc:
#             print('%r generated an exception: %s' % (url, exc))
#         else:
#             print('%r page is %d bytes' % (url, len(data)))

Unnamed: 0,gender,level,name,sciper,section,semester,status,year
257357,Madame,Bachelor,Abuelseoud Passente Hani Ibrahim,257357,Chimie et génie chimique,1,Présent,2016-2017
259924,Madame,Bachelor,Ahmed Sugulle Sadia,259924,Chimie et génie chimique,1,Présent,2016-2017
260635,Madame,Bachelor,Aurand Laetitia Ludivine,260635,Chimie et génie chimique,1,Présent,2016-2017
258893,Madame,Bachelor,Azan-Soulié Camille Jaqueline Monique Marie,258893,Chimie et génie chimique,1,Présent,2016-2017
262172,Monsieur,Bachelor,Baumgartner Gaëtan Hans-Ulrich Beat,262172,Chimie et génie chimique,1,Présent,2016-2017
269910,Monsieur,Bachelor,Blauel Valentin Charles,269910,Chimie et génie chimique,1,Présent,2016-2017
270069,Monsieur,Bachelor,Bonhôte Damien Éric,270069,Chimie et génie chimique,1,Présent,2016-2017
259301,Madame,Bachelor,Bonneaux Laurie,259301,Chimie et génie chimique,1,Présent,2016-2017
270908,Monsieur,Bachelor,Bossonnet André,270908,Chimie et génie chimique,1,Présent,2016-2017
270856,Madame,Bachelor,Bouffard Eva Olga Marie Sunida,270856,Chimie et génie chimique,1,Présent,2016-2017


In [8]:

# for ww_x_PERIODE_PEDAGO in ww_x_PERIODE_PEDAGOs:
#     print(parsePeriod(ww_x_PERIODE_PEDAGO.next))

In [None]:
# Display the <option> elements read from the ww_x_PERIODE_ACAD <select> of the filter form.
ww_x_PERIODE_ACADs

In [None]:
# A single result-table row (status "Congé") used as a fixture to debug the row-parsing helpers defined below.
s = """<tr><td style="white-space:nowrap">Madame</td><td style="white-space:nowrap">Wicki Livia</td><td style="white-space:nowrap"></td><td style="white-space:nowrap"></td><td style="white-space:nowrap"></td><td style="white-space:nowrap"></td><td style="white-space:nowrap"></td><td style="white-space:nowrap">Congé</td><td style="white-space:nowrap">Erasmus</td><td style="white-space:nowrap">Royal Danish Academy of Fine Arts, School of Architecture, Copenhague</td><td>169981</td><td style="white-space:nowrap"></td></tr>"""

def genPersonDf(person):
    """Build a one-row DataFrame for `person`, indexed by its SCIPER.

    The row's fields come from `parsePerson`; the value stored under the
    ``'sciper'`` key is used as the single index label.
    """
    fields = parsePerson(person)
    row_label = fields['sciper']
    return pd.DataFrame(fields, index=[row_label])

def parsePerson(person):
    """Extract gender, name, status and SCIPER from one table row.

    Debugging variant: it prints the number of child elements and the row
    itself before returning, and performs no validation of the row.

    Returns:
        A dict with keys ``gender``, ``name``, ``status`` and ``sciper``, each
        holding the element that follows the corresponding child of `person`.
    """
    cells = person.findChildren()

    # Debug output: how many child elements were found, then the raw row.
    print(len(cells))
    print(person)

    field_offsets = (
        ('gender', OFFSET_GENDER),
        ('name', OFFSET_NAME),
        ('status', OFFSET_STATUS),
        ('sciper', OFFSET_SCIPER),
    )
    return {field: cells[offset].next for field, offset in field_offsets}

# Parse the sample row under dedicated names. Rebinding the module-level
# `soup` and `it` here would silently break a later re-run of the scraping
# cell, which reads the filter form from `soup`.
sample_soup = BeautifulSoup(s, 'html.parser')

sample_frames = [genPersonDf(person) for person in sample_soup.find_all('tr')]

sample_frames
