In [None]:
%matplotlib inline
import re
import urllib.parse

import numpy as np
import pandas as pd
import requests
import scipy as sp
from bs4 import BeautifulSoup
from scipy import stats

## Data Retrieval

Two base URLs are used to retrieve the data: ```URL_FORM_BASE``` for the form and ```URL_DATA_BASE``` for the data (the table which contains the actual information).

In [None]:
# Base URL of the form page (the page carrying the filter fields).
URL_FORM_BASE = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter"
# Base URL of the data page (the table which contains the actual information).
URL_DATA_BASE = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html"

By first inspecting the source code of the page, we can see that the form has some hidden fields; they tell the backend what kind of data to retrieve. We can thus build a dictionary of "default parameters".

In [None]:
# Hidden form fields merged into the query string of every URL we build
# (see get_form_url / get_data_url).
FORM_PARAM_DEFAULT = {
    "ww_v_list" : 1,
    "ww_i_reportmodel": "133685247",
    #"ww_c_langue": 'en', # We work with the French version to match the questions
}

We then take an interest in the form itself. We can see that it has 5 different fields, each with a specific name attribute. (When the form is "posted", the name attribute is used to generate the url.) We can thus retrieve all the options of each field by relying on its name attribute.

We will only work with the HTML data, so we take the parameter directly from the source code and don't bother parsing the HTML page to retrieve it (for HTML, use a value of 133685270 for the ww_i_reportModelXsl attribute).

In [None]:
def request2soup(url, params, timeout=30):
    """Download ``url`` with the query parameters ``params`` and parse it as HTML.

    Parameters
    ----------
    url : str
        Address to request (without query string).
    params : dict
        Query parameters, encoded by ``requests``.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled server would block the notebook forever.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the ``lxml`` parser.

    Raises
    ------
    requests.HTTPError
        If the server answers with any status other than 200. An explicit
        exception is used instead of ``assert`` so the check survives
        ``python -O`` and reports the offending URL and status.
    """
    r = requests.get(url, params=params, timeout=timeout)
    if r.status_code != 200:
        raise requests.HTTPError(
            "Unexpected HTTP status {} for {}".format(r.status_code, r.url),
            response=r,
        )
    return BeautifulSoup(r.text, "lxml")

def get_select_options(soup, name_attr):
    """Return ``{option value: option text}`` for the <select> named ``name_attr``.

    Only <option> tags returned by ``find_all('option', value=True, selected=False)``
    are considered (presumably this skips the pre-selected placeholder; confirm
    against BeautifulSoup's attribute-filter semantics), and options whose text
    is empty are dropped.
    """
    # We assume that we have only one field with a specific name, hence the [0].
    select_tag = soup.select('select[name={}]'.format(name_attr))[0]

    options = {}
    for option in select_tag.find_all('option', value=True, selected=False):
        if option.text != '':
            options[option['value']] = option.text
    return options

def get_fields_options(soup, fields):
    """Collect the options of every form field listed in ``fields``.

    ``fields`` maps a human-readable label to the field's name attribute; only
    the name attributes are used. The result maps each name attribute to the
    dictionary returned by ``get_select_options`` for it.
    """
    return {
        name_attr: get_select_options(soup, name_attr)
        for name_attr in fields.values()
    }

# Form fields whose options we want to discover.
FORM_FIELDS = {
    # Label : Name Attribute
    'Unité académique': 'ww_x_UNITE_ACAD',
    'Période académique': 'ww_x_PERIODE_ACAD',
    'Période pédagogique': 'ww_x_PERIODE_PEDAGO',
    'Type de semestre': 'ww_x_HIVERETE',
}

# Download and parse the form page (network request, done when the cell runs).
soup_index = request2soup(URL_FORM_BASE, FORM_PARAM_DEFAULT)

# {name attribute: {option value: option text}} for each field of FORM_FIELDS.
FORM_FIELDS_OPTIONS = get_fields_options(soup_index, FORM_FIELDS)

In [None]:
# Display the options discovered for every form field.
FORM_FIELDS_OPTIONS

By submitting the form, we get back a table filled with links at the bottom of the form. Of course ISA does not want to play it simple: instead of the href attribute, it relies on JavaScript for the link behaviour. A clicked link calls the JS function "loadReport", which generates the "data url" based on the ww_x_GPS attribute.

First we define two utility functions that generate the complete url for us based on the given parameters.

In [None]:
def get_form_url(params):
    """Return the URL of the form page (the one with the links) for ``params``.

    The entries of ``FORM_PARAM_DEFAULT`` are merged on top of ``params``, so a
    default overrides a caller-supplied value with the same key.
    """
    query = dict(params)
    query.update(FORM_PARAM_DEFAULT)
    return "{}?{}".format(URL_FORM_BASE, urllib.parse.urlencode(query))

def get_data_url(params, GPS=-1, type=133685270):
    """Return the URL of the data page (the table of student information).

    The query string is built from ``params``, then ``FORM_PARAM_DEFAULT``, then
    the two report-specific fields; later entries win on duplicate keys.

    ``GPS`` is sent as ``ww_x_GPS`` and ``type`` as ``ww_i_reportModelXsl``
    (the default 133685270 is the value the notebook uses for the HTML report).
    """
    query = dict(params)
    query.update(FORM_PARAM_DEFAULT)
    query.update({'ww_x_GPS': GPS, 'ww_i_reportModelXsl': type})
    return "{}?{}".format(URL_DATA_BASE, urllib.parse.urlencode(query))

Instead of parsing the form page to retrieve the ww_x_GPS attribute, we generate directly the data url based on the form options discovered above.

In [None]:
def get_data(period_peda, columns, unit_acad=249847):
    """Download the student tables of the given pedagogic periods.

    One request is made per (pedagogic period, academic period) pair, where the
    academic periods are all the options found in
    ``FORM_FIELDS_OPTIONS['ww_x_PERIODE_ACAD']``.

    Parameters
    ----------
    period_peda : dict
        Maps a ``ww_x_PERIODE_PEDAGO`` option value to its label; the label is
        stored in the ``semester`` column of the result.
    columns : list of str
        Columns to keep from each downloaded table.
    unit_acad : int, optional
        Value sent as ``ww_x_UNITE_ACAD``. The default is the value this
        notebook always used (the computer science faculty).

    Returns
    -------
    pandas.DataFrame
        The concatenated tables with the extra columns ``semester`` and
        ``period`` (row indices are not reset). If no table could be retrieved
        at all, an empty frame with the expected columns is returned instead of
        failing inside ``pd.concat``.
    """
    academic_periods = FORM_FIELDS_OPTIONS['ww_x_PERIODE_ACAD']
    frames = []
    for peda, semester_label in period_peda.items():
        for period, period_label in academic_periods.items():
            params = {
                'ww_x_UNITE_ACAD': unit_acad,
                'ww_x_PERIODE_ACAD': period,
                'ww_x_PERIODE_PEDAGO': peda,
                'ww_x_HIVERETE': '',
            }

            try:
                # By specifying ww_x_UNITE_ACAD, ww_x_PERIODE_ACAD and ww_x_PERIODE_PEDAGO
                # we expect a single table on the page, hence the [0].
                # header=1: the column names are on the second row of the table.
                data = pd.read_html(get_data_url(params), header=1)[0]
                # Select first, then copy: gives an independent frame we can extend.
                df = data[columns].copy()
            except Exception as err:
                # read_html raises when the page holds no table. `Exception` (not a bare
                # except) keeps Ctrl-C working, and the error type is reported so that a
                # network or parsing problem is not mistaken for a missing table.
                print("No Data : " + semester_label + " " + period_label
                      + " (" + type(err).__name__ + ": " + str(err) + ")")
                continue

            df['semester'] = semester_label  # Add semester information
            df['period'] = period_label      # Add period information
            frames.append(df)

    if not frames:
        return pd.DataFrame(columns=list(columns) + ['semester', 'period'])
    return pd.concat(frames)

--------------

# Q1 : Bachelor Data Analysis

In [None]:
# Pedagogic periods of the bachelor: ww_x_PERIODE_PEDAGO option value -> label.
# Kept under its own name so that `bachelor` only ever denotes the DataFrame below.
BACHELOR_SEMESTERS = {
    '249108': 'Bachelor semestre 1',
    '249114': 'Bachelor semestre 2',
    '942155': 'Bachelor semestre 3',
    '942163': 'Bachelor semestre 4',
    '942120': 'Bachelor semestre 5',
    '942175': 'Bachelor semestre 6',
    #'2226768': 'Bachelor semestre 5b', # No data, no need to retrieve page
    #'2226785': 'Bachelor semestre 6b', # No data, no need to retrieve page
}
columns = ['No Sciper', 'Statut', 'Civilité']

# Retrieve the data
d = get_data(BACHELOR_SEMESTERS, columns)
# The sciper number is an identifier, not a quantity: store it as text.
d['No Sciper'] = d['No Sciper'].apply(str)
bachelor = d.copy()

Now that we have the data in a pandas dataframe we can start working on it.

### Assumptions and remarks

 * There are 23 students that already have values for their spring semester in 2017. They are the students who are abroad for their 3rd year. We assume that they will pass their third year directly.
 * There is one student who can be considered an outlier: he spent 7 years in the bachelor.
 * In our setup, we need to add 0.5 year (1 semester) to find the full duration.
     
        Example: a student who starts in the fall semester of 2011 "starts" at 2011.5 and finishes the bachelor in the spring semester of 2014 (i.e. "finishes" at 2014). If we just compute 2014 - 2011.5 = 2.5, we miss the duration of the last semester.

In [None]:
def first_and_last(semesters):
    """Return whether both the BA1 and the BA6 labels occur in ``semesters``."""
    if "Bachelor semestre 1" not in semesters:
        return False
    return "Bachelor semestre 6" in semesters


def is_autumn(sem):
    """Return whether the semester label ends with an odd digit (autumn semester)."""
    last_digit = int(sem[-1:])
    return last_digit % 2 == 1


def idx_to_keep(sem):
    """Return 0 when ``sem`` (the is_autumn result) is truthy, 1 otherwise."""
    if sem:
        return 0
    return 1

In [None]:
# One group per student (sciper number)
grouped = d.groupby(['No Sciper'])
# Only keep the student for which we have at least information about BA1 and BA6
# NOTE(review): `first_and_last` is bound again in the Master section of this notebook;
# this cell needs the bachelor version, so the cells must be run in order.
filtered = grouped.filter(lambda x: len(x.values) >= 2 and first_and_last(x.values)).copy()
filtered.head()

In [None]:
# Add extracted information to ease our analysis
# is_autumn: whether the semester label denotes an autumn semester
filtered['is_autumn'] = filtered['semester'].apply(lambda x: is_autumn(x))
# date: the period string is split on "-"; an autumn semester takes the first
# year (index 0), any other semester the second one (index 1)
filtered['date'] = filtered.apply(lambda x: int(x['period'].split("-")[idx_to_keep(x['is_autumn'])]), axis=1)
# date_help: decimal date used for subtractions, autumn semesters sit at year + 0.5
filtered['date_help'] = filtered.apply(lambda x: float(x.date) + (.5 if x.is_autumn else 0), axis=1)
filtered.head()

In [None]:
grpd = filtered.groupby(['No Sciper'])
# Per student: span between the earliest and latest decimal date, plus .5 so
# that the last semester itself is counted
duration = grpd.apply(lambda x: max(x.date_help) - min(x.date_help) + .5)
# Finally add total time spend in bachelor
# (every row of a student receives that student's duration)
filtered['time_elapsed'] = filtered['No Sciper'].apply(lambda sciper: duration[sciper])
filtered.head()

In [None]:
# Strip down the dataframe to a more useful representation: one row per student,
# holding for each column the most frequent value among that student's rows.
# The columns are selected with a list ([[...]]): selecting several columns with
# a bare tuple was removed from pandas' groupby.
df = filtered.groupby(['No Sciper'])[['Civilité', 'time_elapsed']].agg(lambda x: x.value_counts().index[0])
df.head()

### Duration distribution by sex

As we can see, most of the students who finish their bachelor finish it in three years.

In [None]:
# Number of students per (sex, duration)
counts = df.reset_index().groupby(['Civilité', 'time_elapsed'])['No Sciper'].count()
# Percentage within each sex. transform('sum') keeps the original index, whereas
# groupby.apply prepends the group key on recent pandas versions and breaks the
# unstack below.
tmp = 100 * counts / counts.groupby(level=0).transform('sum')
# One column per sex, one row per duration
tmp = tmp.unstack(level=0)
graph = tmp.plot(kind='bar')
graph.set_xlabel('Length (in year)')
graph.set_ylabel('Percentage')
graph.set_title('Bachelor length by sex');

### Duration mean by sex

First we just look at the data using a simple describe. There does seem to be a difference in the mean duration depending on the sex of the student.

In [None]:
# Summary statistics of the bachelor duration, per sex
df.groupby(['Civilité']).describe()

#### Hypothesis testing
We will now test whether the difference between the two means is statistically significant.

In [None]:
# Bachelor durations (in years) of men and women. A boolean mask is used instead
# of groupby(['Civilité']).get_group('...'): with a list key, get_group expects a
# tuple on recent pandas versions.
M = df.loc[df['Civilité'] == 'Monsieur', 'time_elapsed']
F = df.loc[df['Civilité'] == 'Madame', 'time_elapsed']
# Durations are stored in years: multiply by 2 to report semesters
print("Men Mean (in semester): {}".format(M.mean()*2))
print("Female Mean (in semester): {}".format(F.mean()*2))

We will use as our H0 hypothesis:
    
    
    The mean duration for male and female students at epfl in the computer science faculty is identical
    
By looking at the distribution plot above, we can see that the durations don't seem to follow a normal distribution. We will therefore use the Mann-Whitney U test, a nonparametric test of the null hypothesis that two samples come from the same population, against the alternative hypothesis that one population tends to have larger values than the other.

In [None]:
# Two-sided Mann-Whitney U test on the bachelor durations of men (M) and women (F).
# `stats` (scipy.stats) is imported in the import cell at the top of the notebook.
statistic, p_value = stats.mannwhitneyu(M, F, alternative='two-sided')
print('P-value: {}'.format(p_value))

We find a p-value greater than 0.05, meaning that at a significance level of 5% we cannot reject H0: the data do not show a statistically significant difference in the distribution of the time spent in the bachelor between men and women.

It is important to note that this test might be biased. The Mann-Whitney U test assumes independence of the observations, but in reality each duration is not completely independent of the others. Indeed, an exam session might be tougher than the previous one, increasing the chance of adding a semester for every student who takes the exam during the tougher session.

# Q2 : Master Data Analysis

In [None]:
# Pedagogic periods of the master: ww_x_PERIODE_PEDAGO option value -> label.
# Kept under its own name so that `master` only ever denotes the DataFrame below.
MASTER_SEMESTERS = {
    #'953137':  'Stage automne 3ème année', # No Data
    #'983606':  'Stage printemps 3ème année', # No Data
    #'2226616': 'Stage automne 4ème année', # No Data
    #'2226626': 'Stage printemps 4ème année', # No Data
    #'2227132': 'Stage printemps master', # No Data
    '2230106': 'Master semestre 1',
    '942192':  'Master semestre 2',
    '2230128': 'Master semestre 3',
    #'2230140': 'Master semestre 4', # No Data
    #'2335667': 'Mineur semestre 1', # No Data
    #'2335676': 'Mineur semestre 2', # No Data
    #'2754553': 'Semestre printemps', # No Data
    #'953159':  'Semestre automne', # No Data
    '249127':  'Projet Master automne',
    '3781783': 'Projet Master printemps',
}


columns = ['No Sciper', 'Statut', 'Civilité', 'Mineur', 'Spécialisation',]

# Retrieve the data
d = get_data(MASTER_SEMESTERS, columns)
# The sciper number is an identifier, not a quantity: store it as text.
d['No Sciper'] = d['No Sciper'].apply(str)
master = d.copy()

In [None]:
# Preview of the raw master data
master.head()

### Assumption
 * We assume that a student needs at least 3 semesters to complete the master if he has a specialization or a minor in Master semester 3.
 * If the student has a specialization or a minor, we only keep him if he has at least a Master semester 1 and a Master semester 3 entry. Otherwise, we only keep students that have at least a Master semester 1 and a Master semester 2 entry.
 * If a student has neither an entry for a PDM (master project) nor a mention of a stage (internship) semester, we add 6 months to the total duration.
 * If a student has only a stage semester, we add 6 months for the PDM.
 * If a student has only a mention of a PDM, we don't add anything and assume that the student does the stage and the PDM at the same time.

In [None]:
def first_and_last(rows):
    """Return whether the student has the entries required to count as finished.

    ``rows`` holds the raw values of one student's rows (``DataFrame.values``),
    where position 3 of a row is the minor and position 4 the specialization.

    A student with a minor or a specialization on any row needs both
    "Master semestre 1" and "Master semestre 3"; any other student needs
    "Master semestre 1" and "Master semestre 2".

    Missing values are detected with ``pd.isnull`` rather than ``np.nan in [...]``:
    the latter is an identity test and misses NaN objects that are not the
    ``np.nan`` singleton (e.g. after a float column was converted to object),
    which wrongly flagged such students as having a minor/specialization.
    """
    has_minor = any(not pd.isnull(row[3]) for row in rows)
    has_spe = any(not pd.isnull(row[4]) for row in rows)

    last_required = "Master semestre 3" if (has_minor or has_spe) else "Master semestre 2"
    # The labels are searched in all the cells of `rows`, as before.
    return "Master semestre 1" in rows and last_required in rows
    
def is_autumn(sem):
    """Return whether the semester label ``sem`` denotes an autumn semester.

    Labels containing 'automne' / 'printemps' are decided by that word; any
    other label is decided by the parity of its last character (odd = autumn).
    """
    for keyword, autumn in (('automne', True), ('printemps', False)):
        if keyword in sem:
            return autumn
    return int(sem[-1:]) % 2 == 1
    
def idx_to_keep(sem):
    """Return 0 when ``sem`` (the is_autumn result) is truthy, 1 otherwise."""
    if sem:
        return 0
    return 1

def add_semester(rows):
    """Return whether one semester must be added to the student's duration.

    ``rows`` is the DataFrame of one student's rows. A semester is added exactly
    when none of the ``semester`` labels mentions a master project ('Projet').

    The previous expression, ``(not projet and stage) or (not projet and not stage)``,
    reduces to ``not projet``: whether a stage appears in ``Statut`` never changed
    the outcome. That lookup is therefore dropped; it could also leak NaN into
    the result or fail when ``Statut`` held no text at all.
    """
    has_project = bool(rows['semester'].str.contains('Projet', na=False).any())
    return not has_project

def get_spe(rows):
    """Return the specialization recorded on the student's 'Master semestre 3' rows.

    Returns None when those rows carry no specialization; at most one distinct
    value is expected (checked with an assertion).
    """
    ma3_rows = rows.loc[rows['semester'] == 'Master semestre 3']
    found = ma3_rows['Spécialisation'].value_counts()  # missing values are not counted
    if len(found) == 0:
        return None
    assert len(found) == 1
    return found.index[0]

def get_minor(rows):
    """Return the minor recorded on the student's 'Master semestre 3' rows.

    Returns None when those rows carry no minor; at most one distinct value is
    expected (checked with an assertion).
    """
    ma3_rows = rows.loc[rows['semester'] == 'Master semestre 3']
    found = ma3_rows['Mineur'].value_counts()  # missing values are not counted
    if len(found) == 0:
        return None
    assert len(found) == 1
    return found.index[0]

In [None]:
# One group per student (sciper number)
grouped = d.groupby(['No Sciper'])
# Only keep the students with at least two rows for which first_and_last holds:
# MA1 and MA3 with a minor/specialization, MA1 and MA2 otherwise
filtered = grouped.filter(lambda x: len(x.values) >= 2 and first_and_last(x.values)).copy()
filtered.sort_values('No Sciper').head()

In [None]:
# Add extracted information to ease our analysis
# is_autumn: whether the semester label denotes an autumn semester
filtered['is_autumn'] = filtered['semester'].apply(lambda x: is_autumn(x))
# date: the period string is split on "-"; an autumn semester takes the first
# year (index 0), any other semester the second one (index 1)
filtered['date'] = filtered.apply(lambda x: int(x['period'].split("-")[idx_to_keep(x['is_autumn'])]), axis=1)
# date_help: decimal date used for subtractions, autumn semesters sit at year + 0.5
filtered['date_help'] = filtered.apply(lambda x: float(x.date) + (.5 if x.is_autumn else 0), axis=1)
filtered.sort_values('No Sciper').head()

In [None]:
# One group per student; 'Civilité' is part of the key so it stays in the index
grpd = filtered.groupby(['No Sciper', 'Civilité'])
# Span between the earliest and latest decimal date, plus .5 for the last semester
duration = grpd.apply(lambda x: max(x.date_help) - min(x.date_help) + .5)
# Decimal date of the student's first semester
start_date = grpd.apply(lambda x: min(x.date_help))
# Whether an extra semester must be added (see add_semester)
add = grpd.apply(lambda x: add_semester(x)).astype('category')
# Minor / specialization found on the Master semestre 3 rows (None when absent)
minor = grpd.apply(lambda x: get_minor(x)).astype('category')
spe = grpd.apply(lambda x: get_spe(x)).astype('category')
# One row per student, one column per computed quantity
df = pd.concat({'duration': duration, 'add_semester':add, 'spe':spe, 'minor':minor, 'start_date':start_date }, axis=1)
df.head()

In [None]:
# Total time in master: the measured duration, plus half a year when add_semester is set
df['time_elapsed'] = df.apply(lambda x: x['duration'] + 0.5 if x['add_semester'] else x['duration'], axis=1)
df.head()

### Average duration in master in computer science at EPFL

In [None]:
# Number of students per (sex, duration)
counts = df.reset_index().groupby(['Civilité', 'time_elapsed'])['No Sciper'].count()
# Percentage within each sex. transform('sum') keeps the original index, whereas
# groupby.apply prepends the group key on recent pandas versions and breaks the
# unstack below.
tmp = 100 * counts / counts.groupby(level=0).transform('sum')
# One column per sex, one row per duration
tmp = tmp.unstack(level=0)
graph = tmp.plot(kind='bar')
graph.set_xlabel('Length (in year)')
graph.set_ylabel('Percentage')
graph.set_title('Master length by sex');

In [None]:
# Mean duration over all kept master students (durations are in years, hence the *2)
avg_master = df['time_elapsed'].mean()
print('Average duration for masters students at EPFL: {} semesters'.format(avg_master*2))

It is important to take into account that master students doing a minor or a specialization skew the average towards a higher value. We can thus look separately at the mean for students doing a minor/specialization and for students who don't.

In [None]:
# students_l: students with a minor or a specialization
students_l = df[~df['minor'].isnull() | ~df['spe'].isnull()]
# students_s: students with neither a minor nor a specialization
students_s = df[df['minor'].isnull() & df['spe'].isnull()]
avg_master_s = students_s['time_elapsed'].mean()
avg_master_l = students_l['time_elapsed'].mean()
# Durations are stored in years: multiply by 2 to report semesters
print("Number of students with a minor/spe: {} ({}%)".format(len(students_l), len(students_l)*100.0 / len(df) ))
print("  Avergage stay: {} semesters".format(avg_master_l*2))
print("Number of students without a minor/spe: {} ({}%)".format(len(students_s), len(students_s)*100.0 / len(df)))
print("  Avergage stay: {} semesters".format(avg_master_s*2))

### Average duration per specialization



When asked to determine whether our results are statistically significant with respect to the "general average", we found the term vague: it didn't make much sense to compare against the average of all students who obtained a master (with or without a specialization/minor). We therefore took the general average to be the average over masters obtained with a specialization only.

In [None]:
# Reference population: every student with a specialization
with_spe_only = df[~df['spe'].isnull()][['spe', 'time_elapsed']]
# Mean duration of that population, in years
general_average = with_spe_only['time_elapsed'].mean()
print("General average (with spe only): {}".format(general_average))

In [None]:
def compute_p_value(group):
    """Return the two-sided Mann-Whitney U p-value of ``group`` against all specializations.

    The ``time_elapsed`` values of ``group`` are compared with those of
    ``with_spe_only``. The column is selected directly instead of being rebuilt
    row by row with ``apply(axis=1)``, which is slower and yields a DataFrame
    instead of a Series when the frame is empty.

    NOTE(review): the reference sample contains the students of ``group``
    themselves, so the two samples are not independent; confirm that this is
    the intended comparison.
    """
    reference = with_spe_only['time_elapsed']
    sample = group['time_elapsed']
    return stats.mannwhitneyu(reference, sample, alternative='two-sided')[1]

In [None]:
# Mean duration (in years) and number of students for each specialization
by_spe = df[['spe', 'time_elapsed']].groupby('spe')
res = by_spe.mean()
# value_counts is indexed by specialization, so it aligns with the rows of res
res['count'] = df['spe'].value_counts()
res

In [None]:
# p-value of each specialization against all the students with a specialization
res['p_value'] = by_spe.apply(lambda grp: compute_p_value(grp))
res

As we can see, the difference in average is statistically significant only for the Signals, Images and Interfaces specialization (p-value of 0.03 < 0.05) at the 5% significance level. We can also see that the differences for the specializations with only 1 registered student are not considered statistically significant. They have only one student because they have only been available since the beginning of this semester.

# Q3 : Bonus

In [None]:
def scatter_ma(df, color='blue', ax=None):
    """Scatter-plot master duration against start date, sized by number of students.

    Each (start_date, time_elapsed) pair of ``df`` becomes one marker whose area
    is 50 times the number of rows sharing that pair. Returns the object
    returned by the pandas plot call, so a second call can draw on it via ``ax``.
    """
    counts = df.groupby(["start_date", "time_elapsed"]).size().reset_index(name='count')
    marker_sizes = counts['count']*50
    return counts.plot(
        kind='scatter',
        x ='start_date',
        y='time_elapsed',
        s=marker_sizes,
        color=color,
        figsize=(20, 10),
        ax=ax,
    )

# One row per student, with the index levels ('No Sciper', 'Civilité') back as columns.
# A dedicated name is used instead of `d`, which holds the raw master table that
# the filtering cell above reads: rebinding it here made that cell fail on re-run.
master_students = df.reset_index()
ax = scatter_ma(master_students[master_students['Civilité'] == 'Monsieur'])
# Women are drawn in red on the same axes
scatter_ma(master_students[master_students['Civilité'] == 'Madame'], color='red', ax=ax)
ax.set_xlabel('Start date (in year)')
ax.set_ylabel('Duration in year')
# The trailing ';' hides the text repr of the last call (replaces the print("") trick)
ax.set_title('Master length by sex and starting date');

We can see that very few people start their master during the spring semester.


The number of semesters needed to complete the master increases over time: the trend went from 3-4 semesters between 2007 and 2010 to 4-5 semesters later on. It is important to note that values are missing from 2014 onward: students who started in 2014 or later have not yet had the time to finish their master, so they do not appear in the data.