In [256]:
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
import re
from itertools import * 
import concurrent.futures
from functools import reduce

import scipy.stats as stats
import statsmodels.api as sm



Extract all request parameters

In [2]:
html_doc = urllib.request.urlopen("http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportmodel=133685247&ww_c_langue=&ww_i_reportModelXsl=133685270&zz_x_UNITE_ACAD=&ww_x_UNITE_ACAD=942293&zz_x_PERIODE_ACAD=&ww_x_PERIODE_ACAD=355925344&zz_x_PERIODE_PEDAGO=&ww_x_PERIODE_PEDAGO=942155&zz_x_HIVERETE=&ww_x_HIVERETE=2936286&dummy=ok").read()
soup = BeautifulSoup(html_doc, 'html.parser')

In [30]:
OFFSET_STATUS = 7

# transform the iterable in none if it's empty
def peek(iterable):
    try:
        first = next(iterable)
    except StopIteration:
        return None
    return chain([first], iterable)

# return the request url for the given parameters
def buildRequest(params):
    requestParams = ''
    for key in params:
        requestParams += key + '=' + str(params[key]) + '&'
    return "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?" + requestParams


def parsePeriod(period):
    if "Bachelor semestre" in period:
        return ('Bachelor', period[len('Bachelor semestre '):])
    if "Master semestre" in period:
        return ('Master', period[len('Master semestre '):])

    return 'ToDo'
    # TODO: add support for master thesis and others
    
def getSemester(semester):
    if semester == "Semestre d'automne":
        return 0
    else:
        return 1
    
def parseTitle(e):
    return [(x.contents[0], i) for i,x in enumerate(e.findChildren())] 

def parseResult(html):
    def genPersonDf(person, title):
        personDict = parsePerson(person, title)
        if personDict is None:
            return pd.DataFrame()
        return pd.DataFrame(personDict, index=[personDict['No Sciper']])
        
    def parsePerson(person, title):
        def getContent(e):
            content = e.contents
            if len(content) > 0:
                return content[0]
            else:
                return ''
            
            
        elems = person.findChildren()
        if (len(elems) < 12
        or "<th>Civilité</th>" in str(elems) # for when we have several tables because of Tous
        or "<td" in str(elems[OFFSET_STATUS].next)): # for Randriamparany Tahina Mikaela....
            return None
        return {x[0]: getContent(elems[x[1]])  for x in title}
#         return {
#             'gender': elems[OFFSET_GENDER].contents,
#             'name': elems[OFFSET_NAME].contents,
#             'status': elems[OFFSET_STATUS].contents,
#             'sciper': elems[OFFSET_SCIPER].contents,
#         }
    soup = BeautifulSoup(html, 'html.parser')
    
    persons = soup.findAll('tr')[1:]
    if len(persons) > 0:
        title = parseTitle(persons[0])
        it = (genPersonDf(person, title) for person in persons[1:])
        it = peek(it)
        if it is None:
            return pd.DataFrame()
        else:
            return pd.concat(it)
    else:
        return pd.DataFrame()



def genDF(params):
    request = buildRequest({
                                'ww_x_GPS': ww_x_GPS,
                                'ww_i_reportModel': ww_i_reportModel,
                                'ww_i_reportModelXsl': ww_i_reportModelXsl,
                                'ww_x_PERIODE_ACAD': params[1]['value'],
                                'ww_x_UNITE_ACAD': params[2]['value'],
                                'ww_x_PERIODE_PEDAGO': params[0]['value'],
                                'ww_x_HIVERETE': params[3]['value']
                             })
    with urllib.request.urlopen(request) as c:
        html = c.read()
        df = parseResult(html)
        df['year'] = params[1].next
        df['section'] = params[2].next
        period = parsePeriod(params[0].next)
        df['period'] = getSemester(params[3].next)
        df['level'] = period[0]
        df['semester'] = period[1]
        return df

# select all?
ww_x_GPS=-1

ww_i_reportModel =  soup.findAll("input", {"name": "ww_i_reportmodel"})[0]['value']

ww_i_reportModelXsl =  soup.findAll("input", {"name": "ww_i_reportModelXsl"})[0]['value'] # select the first value as it is the html report

# remove the null by doing removing the first element
ww_x_UNITE_ACADs =  soup.findAll("select", {"name": "ww_x_UNITE_ACAD"})[0].findChildren()[1:] 
ww_x_PERIODE_ACADs =  soup.findAll("select", {"name": "ww_x_PERIODE_ACAD"})[0].findChildren()[1:]
ww_x_PERIODE_PEDAGOs = soup.findAll("select", {"name": "ww_x_PERIODE_PEDAGO"})[0].findChildren()[1:]
ww_x_HIVERETEs = soup.findAll("select", {"name": "ww_x_HIVERETE"})[0].findChildren()[1:]


# ww_x_PERIODE_ACAD = ww_x_PERIODE_ACADs[0]
# ww_x_UNITE_ACAD = ww_x_UNITE_ACADs[0]
# ww_x_PERIODE_PEDAGO = ww_x_PERIODE_PEDAGOs[0]
# ww_x_HIVERETE = ww_x_HIVERETEs[0]

itBachelor = product(
            filter(lambda x: "Bachelor" in x.next, ww_x_PERIODE_PEDAGOs),
#             filter(lambda x: "2015-2016" in x.next, ww_x_PERIODE_ACADs),
            ww_x_PERIODE_ACADs,
             filter(lambda x: "Informatique" in x.next, ww_x_UNITE_ACADs),
#              ww_x_UNITE_ACADs,
             ww_x_HIVERETEs
           )

itMaster = product(
            filter(lambda x: "Master" in x.next, ww_x_PERIODE_PEDAGOs),
#             filter(lambda x: "2015-2016" in x.next, ww_x_PERIODE_ACADs),
            ww_x_PERIODE_ACADs,
             filter(lambda x: "Informatique" in x.next, ww_x_UNITE_ACADs),
#              ww_x_UNITE_ACADs,
             ww_x_HIVERETEs
           )
# for (x, y, z, a) in it:
#     print(x, y, z, a)
             
# df = pd.concat(list(map(genDF, it)))

# #We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(genDF, params): params for params in itBachelor}
    dfB = pd.concat(future.result() for future in concurrent.futures.as_completed(future_to_url))

with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(genDF, params): params for params in itMaster}
    dfM = pd.concat(future.result() for future in concurrent.futures.as_completed(future_to_url))

#     for future in concurrent.futures.as_completed(future_to_url):
#         url = future_to_url[future]
#         print(future.result())
#         try:
#             data = future.result()
#         except Exception as exc:
#             print('%r generated an exception: %s' % (url, exc))
#         else:
#             print('%r page is %d bytes' % (url, len(data)))

In [31]:
print(dfB['Statut'].value_counts())
print(dfM['Statut'].value_counts())

Présent    5502
Congé       296
Attente       9
Name: Statut, dtype: int64
Présent    2485
Stage       282
Congé        77
Name: Statut, dtype: int64


In [168]:
# def computeDuration():


def removeIncomplete(person):
    did1 = reduce(lambda x,y: x or y, person.semester == '1', False)
    did6 = reduce(lambda x,y: x or y, person.semester == '6', False)
    return did1 and did6

def bachelorDuration(person):
    sample = person.iloc(0)[0]
    sorted = person.sort_values(by=['year', 'period'])
    startYear = sorted.iloc[0]['year']
    startPeriod = sorted.iloc[0]['period']

    endYear = sorted.iloc[-1]['year']
    endPeriod = sorted.iloc[-1]['period']
    
    duration = 0.5 + float(endYear[:4]) - float(startYear[:4]) + (float(endPeriod) - float(startPeriod))/2
#     return pd.DataFrame({'sex': sample['Civilité'], 'duration': duration}, index=[sample['No Sciper']])
    return pd.Series({'sex': sample['Civilité'], 'duration': duration})

    

completed = dfB.groupby(dfB.index).filter(removeIncomplete)

s = completed.groupby(completed.index).apply(bachelorDuration)




In [192]:
mean = s.groupby('sex').mean()
mean.iloc(0)[1] - mean.iloc(0)[0]


duration    0.176818
dtype: float64

In [228]:

se2 = s.groupby('sex').var()/s.groupby('sex').count()

t = (mean['duration'][1] - mean['duration'][0]) / np.sqrt(se2['duration'][0] + se2['duration'][1])
df = s.groupby('sex').count().min()[0] - 1
print('t stat: ' + str(t))
print('degrees of freedom: ' + str(df))

p = (1-stats.t.cdf(t, df))*2
print('the p-value is: ' + str(p))

t stat: 1.34370056781
degrees of freedom: 28
the p-value is: 0.189833792999


We can't reject H0 (the 2 means are equal), => we don't observe any significant difference

In [229]:
M = s[s.sex == 'Monsieur']
F = s[s.sex != 'Monsieur']

stats.ttest_ind(M['duration'], F['duration'], equal_var=False)

Ttest_indResult(statistic=1.3437005678090845, pvalue=0.18785555340784144)

Here we get a small difference in p-value because the test is using the Welch-Satterthwaite degrees of freedom.

In [56]:
dfB[dfB.index =='160213'].sort_values(by=['year', 'period'])

Unnamed: 0,Civilité,Ecole Echange,Filière opt.,Mineur,No Sciper,Nom Prénom,Orientation Bachelor,Orientation Master,Spécialisation,Statut,Type Echange,level,period,section,semester,year
160213,Monsieur,,,,160213,Bécholey Alexandre,,,,Présent,,Bachelor,0,Informatique,5,2007-2008


In [129]:



s = dfB[(dfB.index == '217422') | (dfB.index == '213618')]

a = s.groupby(s.index).apply(bachelorDuration)
a

213618    {'sex': Monsieur, 'duration': 4.0}
217422    {'sex': Monsieur, 'duration': 5.0}
dtype: object

In [241]:
pop1 = M['duration']
pop2 = F['duration']
num1 = pop1.count()
num2 = pop2.count()
df = ((np.var(pop1)/num1 + np.var(pop2)/num2)**(2.0))/(   (np.var(pop1)/num1)**(2.0)/(num1-1) +  (np.var(pop2)/num2)**(2.0)/(num2-1) ) 
(1-stats.t.cdf(t, df))*2

0.18780024218122726

In [265]:
s2 = s
a = pd.Categorical.from_array(s2['sex'])
s2['sex'] = a.codes

sm.OLS(s2['duration'], sm.add_constant(s2['sex'])).fit(cov_type='HC3').summary()

0,1,2,3
Dep. Variable:,duration,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.749
Date:,"Sat, 15 Oct 2016",Prob (F-statistic):,0.187
Time:,10:44:04,Log-Likelihood:,-464.0
No. Observations:,397,AIC:,932.0
Df Residuals:,395,BIC:,940.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,3.3966,0.127,26.698,0.000,3.147 3.646
sex,0.1768,0.134,1.322,0.186,-0.085 0.439

0,1,2,3
Omnibus:,92.931,Durbin-Watson:,1.823
Prob(Omnibus):,0.0,Jarque-Bera (JB):,159.029
Skew:,1.385,Prob(JB):,2.9300000000000004e-35
Kurtosis:,4.392,Cond. No.,7.27


The coefficient of the sex dummy is non-significant <br>
We see that the average is 3.4 years for females (the sex dummy is encoded to 0) and 3.57 for males

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1,