In [1]:
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup
import re
from itertools import * 
import concurrent.futures
from functools import reduce

import scipy.stats as stats
import statsmodels.api as sm

## Data Crawler:


In [2]:
html_doc = urllib.request.urlopen("http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_b_list=1&ww_i_reportmodel=133685247&ww_c_langue=&ww_i_reportModelXsl=133685270&zz_x_UNITE_ACAD=&ww_x_UNITE_ACAD=942293&zz_x_PERIODE_ACAD=&ww_x_PERIODE_ACAD=355925344&zz_x_PERIODE_PEDAGO=&ww_x_PERIODE_PEDAGO=942155&zz_x_HIVERETE=&ww_x_HIVERETE=2936286&dummy=ok").read()
soup = BeautifulSoup(html_doc, 'html.parser')

In [219]:
# query all the pages, put the content of the array in a dataframe using the column names
# as titles and concatenate them together in a final df. Once for Bachelor data and once for master
OFFSET_STATUS = 7

# transform the iterable in none if it's empty
def peek(iterable):
    try:
        first = next(iterable)
    except StopIteration:
        return None
    return chain([first], iterable)

# return the request url for the given parameters
def buildRequest(params):
    requestParams = ''
    for key in params:
        requestParams += key + '=' + str(params[key]) + '&'
    return "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?" + requestParams

# split the period value in a tuple
def parsePeriod(period):
    if "Bachelor semestre" in period:
        return ('Bachelor', period[len('Bachelor semestre '):])
    if "Master semestre" in period:
        return ('Master', period[len('Master semestre '):])
    if "Projet Master" in period:
        return ('Master Project', 'PM')

    return 'Other'

# automne -> 0, printemps -> 1
def getSemester(semester):
    if semester == "Semestre d'automne":
        return 0
    else:
        return 1
    
def parseTitle(e):
    return [(x.contents[0], i) for i,x in enumerate(e.findChildren())] 

def parseResult(html):
    def genPersonDf(person, title):
        personDict = parsePerson(person, title)
        if personDict is None:
            return pd.DataFrame()
        return pd.DataFrame(personDict, index=[personDict['No Sciper']])
        
    def parsePerson(person, title):
        def getContent(e):
            content = e.contents
            if len(content) > 0:
                return content[0]
            else:
                return ''
            
            
        elems = person.findChildren()
        if (len(elems) < 12
        or "<th>Civilité</th>" in str(elems) # for when we have several tables because of Tous
        or "<td" in str(elems[OFFSET_STATUS].next)): # for Randriamparany Tahina Mikaela....
            return None
        return {x[0]: getContent(elems[x[1]])  for x in title}
    soup = BeautifulSoup(html, 'html.parser')
    
    persons = soup.findAll('tr')[1:]
    if len(persons) > 0:
        title = parseTitle(persons[0])
        it = (genPersonDf(person, title) for person in persons[1:])
        it = peek(it)
        if it is None:
            return pd.DataFrame()
        else:
            return pd.concat(it)
    else:
        return pd.DataFrame()



def genDF(params):
    request = buildRequest({
                                'ww_x_GPS': ww_x_GPS,
                                'ww_i_reportModel': ww_i_reportModel,
                                'ww_i_reportModelXsl': ww_i_reportModelXsl,
                                'ww_x_PERIODE_ACAD': params[1]['value'],
                                'ww_x_UNITE_ACAD': params[2]['value'],
                                'ww_x_PERIODE_PEDAGO': params[0]['value'],
                                'ww_x_HIVERETE': params[3]['value']
                             })
    with urllib.request.urlopen(request) as c:
        html = c.read()
        df = parseResult(html)
        df['year'] = params[1].next
        df['section'] = params[2].next
        period = parsePeriod(params[0].next)
        df['period'] = getSemester(params[3].next)
        df['level'] = period[0]
        df['semester'] = period[1]
        return df

# select all?
ww_x_GPS=-1

ww_i_reportModel =  soup.findAll("input", {"name": "ww_i_reportmodel"})[0]['value']

ww_i_reportModelXsl =  soup.findAll("input", {"name": "ww_i_reportModelXsl"})[0]['value'] # select the first value as it is the html report

# remove the null by doing removing the first element
ww_x_UNITE_ACADs =  soup.findAll("select", {"name": "ww_x_UNITE_ACAD"})[0].findChildren()[1:] 
ww_x_PERIODE_ACADs =  soup.findAll("select", {"name": "ww_x_PERIODE_ACAD"})[0].findChildren()[1:]
ww_x_PERIODE_PEDAGOs = soup.findAll("select", {"name": "ww_x_PERIODE_PEDAGO"})[0].findChildren()[1:]
ww_x_HIVERETEs = soup.findAll("select", {"name": "ww_x_HIVERETE"})[0].findChildren()[1:]

itBachelor = product(
            filter(lambda x: "Bachelor" in x.next, ww_x_PERIODE_PEDAGOs),
#             filter(lambda x: "2015-2016" in x.next, ww_x_PERIODE_ACADs),
            ww_x_PERIODE_ACADs,
             filter(lambda x: "Informatique" in x.next, ww_x_UNITE_ACADs),
#              ww_x_UNITE_ACADs,
             ww_x_HIVERETEs
           )

itMaster = product(
            filter(lambda x: "Master" in x.next or "Project" in x.next, ww_x_PERIODE_PEDAGOs),
#             filter(lambda x: "2015-2016" in x.next, ww_x_PERIODE_ACADs),
            ww_x_PERIODE_ACADs,
             filter(lambda x: "Informatique" in x.next, ww_x_UNITE_ACADs),
#              ww_x_UNITE_ACADs,
             ww_x_HIVERETEs
           )
     
# #We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(genDF, params): params for params in itBachelor}
    dfB = pd.concat(future.result() for future in concurrent.futures.as_completed(future_to_url))

with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(genDF, params): params for params in itMaster}
    dfM = pd.concat(future.result() for future in concurrent.futures.as_completed(future_to_url))

In [4]:
print(dfB['Statut'].value_counts())
print(dfM['Statut'].value_counts())

Présent    5502
Congé       296
Attente       9
Name: Statut, dtype: int64
Présent    2485
Stage       282
Congé        77
Name: Statut, dtype: int64


# Bachelor students analysis:
## Goals:
Here we show how long master students took to complete their bachelor per sex and if there are significant statistical  difference
## Assumptions:
We consider a student that has reached the 6th semester to have finished his bachelor but we compute the correct time. Indeed we compute the duration by ordering the rows by year and period. Then we find the duration using the first and last row. 



In [221]:
# remove people that don't have semester 1 and 6 in the data
# we consider that people that have an entry for semester 6 finished their bachelor 
# as they get removed from that list if they fail
def removeIncomplete(person):
    did1 = reduce(lambda x,y: x or y, person.semester == '1', False) # check if the student has a semester 1 entry
    did6 = reduce(lambda x,y: x or y, person.semester == '6', False) # check if the student has a semester 6 entry
    return did1 and did6

# sort the data chronologically and compute the difference in time between the first and last semester
# we compute it in years and multiply by 12
def bachelorDuration(person):
    sample = person.iloc(0)[0] # used to get general infos
    sorted = person.sort_values(by=['year', 'period'])
    startYear = sorted.iloc[0]['year'] # get the first bachelor period
    startPeriod = sorted.iloc[0]['period']

    endYear = sorted.iloc[-1]['year'] # get the last bachelor period
    endPeriod = sorted.iloc[-1]['period']
    
    duration = 12*(0.5 + float(endYear[:4]) - float(startYear[:4]) + (float(endPeriod) - float(startPeriod))/2)
    return pd.Series({'sex': sample['Civilité'], 'duration': duration})

#Remove the students that have not finished their bachelors
dfBCompleted = dfB.groupby(dfB.index).filter(removeIncomplete)

#Get duration of bachelor per student
s = dfBCompleted.groupby(dfBCompleted.index).apply(bachelorDuration)

mean = s.groupby('sex').mean()
print("Average duration per sex {}".format(mean))
print("Average difference in {}".format(mean.iloc(0)[1] - mean.iloc(0)[0]))


Average duration per sex            duration
sex                
Madame    40.758621
Monsieur  42.880435
Average difference in duration    2.121814
dtype: float64


## Statistical Significance Test  

In [223]:
# manual t-test
se2 = s.groupby('sex').var()/s.groupby('sex').count()

t = (mean['duration'][1] - mean['duration'][0]) / np.sqrt(se2['duration'][0] + se2['duration'][1])
df = s.groupby('sex').count().min()[0] - 1
print('t stat: ' + str(t))
print('degrees of freedom: ' + str(df))

p = (1-stats.t.cdf(t, df))*2
print('the p-value is: ' + str(p))

t stat: 1.34370056781
degrees of freedom: 28
the p-value is: 0.189833792999


We can't reject H0 (the 2 means are equal), => we don't observe any significant difference

Testing with a other methods

In [224]:
M = s[s.sex == 'Monsieur']
F = s[s.sex != 'Monsieur']

stats.ttest_ind(M['duration'], F['duration'], equal_var=False)

Ttest_indResult(statistic=1.3437005678090845, pvalue=0.18785555340784144)

Here we get a small difference in p-value compared to the previous one because the test is using the Welch-Satterthwaite degrees of freedom instead of the lowest df of the 2 series.

In [226]:
pop1 = M['duration']
pop2 = F['duration']
num1 = pop1.count()
num2 = pop2.count()
df = ((np.var(pop1)/num1 + np.var(pop2)/num2)**(2.0))/(   (np.var(pop1)/num1)**(2.0)/(num1-1) +  (np.var(pop2)/num2)**(2.0)/(num2-1) ) 
(1-stats.t.cdf(t, df))*2

0.18780024218122748

We can also use a linear regression to check if there a significant difference

In [227]:
s2 = s
a = pd.Categorical.from_array(s2['sex'])
s2['sex'] = a.codes

sm.OLS(s2['duration'], sm.add_constant(s2['sex'])).fit(cov_type='HC3').summary()

0,1,2,3
Dep. Variable:,duration,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,1.749
Date:,"Mon, 17 Oct 2016",Prob (F-statistic):,0.187
Time:,13:09:13,Log-Likelihood:,-1450.5
No. Observations:,397,AIC:,2905.0
Df Residuals:,395,BIC:,2913.0
Df Model:,1,,
Covariance Type:,HC3,,

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,40.7586,1.527,26.698,0.000,37.766 43.751
sex,2.1218,1.605,1.322,0.186,-1.023 5.267

0,1,2,3
Omnibus:,92.931,Durbin-Watson:,1.823
Prob(Omnibus):,0.0,Jarque-Bera (JB):,159.029
Skew:,1.385,Prob(JB):,2.9300000000000004e-35
Kurtosis:,4.392,Cond. No.,7.27


The coefficient of the sex dummy is non-significant <br>
We see that the average is 3.4 years for females (the sex dummy is encoded to 0) and 3.57 for males

# Master students analysis:
## Goals:
Here we show how long master students took to complete their master and if there are significant statistical  difference between sex and specialization
## Assumptions:
Since the data on the master project is incomplete, we didn't take it into account. In fact we had examples of people who should be on the list which were not. There for our results don't take into account the PDM. <br >
The stage is typically not done at EPFL so it's not useful here.
We also removed all rows where students were not present at EPFL. <br>
To figure out if someone finished their master, we check if they did two semesters and no spe/minor or if they did three semesters and a spe/minor <br >
We then count the time in years by using the last and first rows ordered by year and period (which work because we only kept "Présent" rows)



In [228]:
# Get all students who did a whole master
# We kept all student that:
# had a Master 1 entry
# had a Master 2 entry and no spe/mineur or had a spe/mineur and a master 3 entry
def removeIncompleteMaster(person):
    did1 = reduce(lambda x,y: x or y, person.semester == '1', False)
    did3 = reduce(lambda x,y: x or y, person.semester == '3', False)
    didspe = reduce(lambda x,y: x or y, person.spe != '', False)
    didminor = reduce(lambda x,y: x or y, person.Mineur != '', False)
    did2 = reduce(lambda x,y: x or y, person.semester == '3', False)

    return did1 and (did3 and (didspe or didminor)) or (did2 and ~didspe and ~didminor)

# sort the data chronologically and compute the difference in time between the first and last semester
# we compute it in years and multiply by 12
def masterDuration(person):
    sorted = person.sort_values(by=['year', 'period'])
    sample = sorted.iloc[-1] # we keep the info for the last period only so we have an update spe
    startYear = sorted.iloc[0]['year']
    startPeriod = sorted.iloc[0]['period']

    endYear = sorted.iloc[-1]['year']
    endPeriod = sorted.iloc[-1]['period']
    
    duration = 12*(0.5 + float(endYear[:4]) - float(startYear[:4]) + (float(endPeriod) - float(startPeriod))/2.0)
    
    return pd.Series({'sex': sample['Civilité'], 'spe': sample['spe'], 'duration': duration})


#Rename column for handiness
dfMP = dfM.rename(columns = {'Spécialisation':'spe'})

#Only keep present rows
dfMP = dfMP[dfMP.Statut != "Stage"] # we remove stage information

#Get finished master
dfMFinished = pd.DataFrame(dfMP.groupby(dfMP.index).filter(removeIncompleteMaster))

#Get duration for each student
dfMFinishedDuration =  dfMFinished.groupby(dfMFinished.index).apply(masterDuration)

#Sex
print ("Average finish time in months : {}".format(dfMFinishedDuration.mean()[0] ))
print ("Average finish time in months per sex : ")
print(dfMFinishedDuration.groupby('sex').mean())

#Spe
print ("Average finish time in months per spe :")
print(dfMFinishedDuration.groupby('spe').mean())
print ("Number of registered student per spe : ")
print(dfMFinishedDuration.groupby('spe').count())

Average finish time in months : 20.78838174273859
Average finish time in months per sex : 
           duration
sex                
Madame    20.307692
Monsieur  20.863309
Average finish time in months per spe :
                                 duration
spe                                      
                                19.677419
Biocomputing                    28.000000
Computer Engineering - SP       22.000000
Computer Science Theory         18.000000
Data Analytics                  18.000000
Foundations of Software         22.500000
Information Security - SP       21.428571
Internet Information Systems    18.000000
Internet computing              22.129870
Service science                 22.000000
Signals, Images and Interfaces  24.782609
Software Systems                20.142857
Number of registered student per spe : 
                                duration  sex
spe                                          
                                     279  279
Biocomputing           

## Statistical Significance Test  

In [213]:
M = dfMFinishedDuration[dfMFinishedDuration.sex == 'Monsieur']
F = dfMFinishedDuration[dfMFinishedDuration.sex != 'Monsieur']

stats.ttest_ind(M['duration'], F['duration'], equal_var=False)

Ttest_indResult(statistic=0.63657336700382761, pvalue=0.52616330907229658)

No significant difference in length by sex

In [214]:
Y = dfMFinishedDuration['duration']
# we need to remove the default category dummy otherwise we get multicolinearity problems
X = pd.concat([dfMFinishedDuration, pd.get_dummies(dfMFinishedDuration['spe'])], axis=1).drop(['duration', 'sex', 'spe', ''], axis = 1)
sm.OLS(Y, sm.add_constant(X)).fit(cov_type='HC3').summary()

0,1,2,3
Dep. Variable:,duration,R-squared:,0.066
Model:,OLS,Adj. R-squared:,0.044
Method:,Least Squares,F-statistic:,11.01
Date:,"Mon, 17 Oct 2016",Prob (F-statistic):,3.19e-18
Time:,12:59:51,Log-Likelihood:,-1550.2
No. Observations:,482,AIC:,3124.0
Df Residuals:,470,BIC:,3175.0
Df Model:,11,,
Covariance Type:,HC3,,

0,1,2,3,4,5
,coef,std err,z,P>|z|,[95.0% Conf. Int.]
const,19.6774,0.365,53.893,0.000,18.962 20.393
Biocomputing,8.3226,6.491,1.282,0.200,-4.400 21.045
Computer Engineering - SP,2.3226,1.370,1.695,0.090,-0.363 5.008
Computer Science Theory,-1.6774,3.710,-0.452,0.651,-8.950 5.595
Data Analytics,-1.6774,0.365,-4.594,0.000,-2.393 -0.962
Foundations of Software,2.8226,0.907,3.113,0.002,1.046 4.599
Information Security - SP,1.7512,2.418,0.724,0.469,-2.988 6.491
Internet Information Systems,-1.6774,4.017,-0.418,0.676,-9.550 6.195
Internet computing,2.4525,0.740,3.313,0.001,1.002 3.903

0,1,2,3
Omnibus:,71.002,Durbin-Watson:,1.889
Prob(Omnibus):,0.0,Jarque-Bera (JB):,155.433
Skew:,0.793,Prob(JB):,1.77e-34
Kurtosis:,5.286,Cond. No.,22.5


This is a comparison of the effect of having a spe with respect to not having one (not the population mean). We see that Data Analytics, FoS, Internet Computing and Signals, Images and Interfaces have a significant effect

In [215]:
# comparison with respect to the whole population (including students without a spe)
withSpe = dfMFinishedDuration[dfMFinishedDuration['spe'] != '']

for spe in withSpe['spe'].unique():
    p = stats.ttest_ind(dfMFinishedDuration['duration'], withSpe[withSpe['spe'] == spe]['duration'], equal_var=False).pvalue
    print(spe + ', p-value: ' + str(p))

Signals, Images and Interfaces, p-value: 0.0486902784227
Internet computing, p-value: 0.0580088733457
Biocomputing, p-value: 0.305952739373
Foundations of Software, p-value: 0.0534057515575
Computer Engineering - SP, p-value: 0.368425283238
Service science, p-value: 0.79081550307
Software Systems, p-value: 0.607060202005
Information Security - SP, p-value: 0.783532939799
Data Analytics, p-value: 8.90242324119e-21
Internet Information Systems, p-value: nan
Computer Science Theory, p-value: nan




However, when comparing to the population mean, the only 2 spes that have a significant difference in duration are Data Analytics (However, only 4 persons finished it) and Signals, Images and Interfaces (23 persons).
Internet computing and FoS are at very close to the threshold but not significant at 5%.

Data analytics is shorter: 18 months vs 20.79<br>
Signals, Images and Interfaces is longer: 4.78 vs 20.79