In [9]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import html5lib

In [10]:
basic_url = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.filter?ww_i_reportModel=133685247'
r = requests.get(basic_url)

In [11]:
soup = BeautifulSoup(r.text,'html.parser')

In [12]:
#The attributes we need are the section, the academic period and the period
#We print the name of the attributes of research in ISA and see which are the right ones
# soup.find_all('select) allows us to take all html code needed
#Then we select the name of the list we got from the precedent function

[print(i['name']) for i in soup.find_all('select')]

#We don't need only ww_x_HIVERETE
#We can see that ww_x_UNITE_ACAD is for the section

ww_x_UNITE_ACAD
ww_x_PERIODE_ACAD
ww_x_PERIODE_PEDAGO
ww_x_HIVERETE


[None, None, None, None]

In [315]:
#Generate all the attributes we need to set the urls to create the database
#To do it, we created a list where each sample is another list whith the value and the name
#For the sections, we only need the value of Informatique
#For the pedagogic period, we only need the Bachelor and Master values. We also have to be carefull since there
#is a Projet de master line, Bachelor semestre 5b/6c and Master semestre 4 without any students 

attributes = []
index_informatique = [i for i in range(len(soup.find_all('option'))) if 'Informatique' in soup.find_all('option')[i]][0]

for i in soup.find_all('select'):
    content = []
    if i['name'] != 'ww_x_HIVERETE':
        if i['name'] == 'ww_x_PERIODE_ACAD':
            for j in i:
                if j['value'] != 'null':
                    content.append([j['value'],j.string])
        elif i['name'] == 'ww_x_UNITE_ACAD':
            line =i.find_all('option')[index_informatique]
            content.append([line['value'],line.string])
        else:
            for j in i:
                if j['value'] != 'null' and ('Bachelor' in j.string or 'Master' in j.string) and \
                j.string != 'Bachelor semestre 5b' and j.string != 'Bachelor semestre 6b' and \
                j.string != 'Master semestre 4':
                    content.append([j['value'],j.string])
        attributes.append([i['name'],content])
        
attributes[1][1] = np.array(list(reversed(attributes[1][1])))

In [316]:
attributes

[['ww_x_UNITE_ACAD', [['249847', 'Informatique']]],
 ['ww_x_PERIODE_ACAD', array([['978181', '2007-2008'],
         ['978187', '2008-2009'],
         ['978195', '2009-2010'],
         ['39486325', '2010-2011'],
         ['123455150', '2011-2012'],
         ['123456101', '2012-2013'],
         ['213637754', '2013-2014'],
         ['213637922', '2014-2015'],
         ['213638028', '2015-2016'],
         ['355925344', '2016-2017']], 
        dtype='<U9')],
 ['ww_x_PERIODE_PEDAGO',
  [['249108', 'Bachelor semestre 1'],
   ['249114', 'Bachelor semestre 2'],
   ['942155', 'Bachelor semestre 3'],
   ['942163', 'Bachelor semestre 4'],
   ['942120', 'Bachelor semestre 5'],
   ['942175', 'Bachelor semestre 6'],
   ['2230106', 'Master semestre 1'],
   ['942192', 'Master semestre 2'],
   ['2230128', 'Master semestre 3'],
   ['249127', 'Projet Master automne'],
   ['3781783', 'Projet Master printemps']]]]

## Question 1

In [262]:
#Prepare_dataframe allows us to have a dataframe from a url. Then we select the right columns which are the Gender
#, the sciper number and date of the bachelor with the right semester
#Select_right_order returns first for the bachelor semestre 1 since we want the start date and we will concatenate
#all the DataFrames with a chronological order
#Concatenate_dataframes_right allows us to do the last point

def prepare_dataframe_bachelor(url,i,j):
    df = pd.read_html(requests.get(url).text)[0]
    df = df.drop([1,2,3,4,5,6,7,8,9,11],axis=1)
    df.columns = df.loc[1]
    df.columns.name = 0
    df = df.drop([0,1],axis=0) 
    df[i] = float(j[0:4])
    return df
    
def select_right_order(bachelor_name):    
    if bachelor_name == 'Bachelor semestre 5' or bachelor_name == 'Bachelor semestre 6':
        return 'last'
    else:
        return 'first'
    
def concatenate_dataframes_right(df1,df2,order):
    df1 = pd.concat([df1,df2])
    df1 = df1.drop_duplicates(['No Sciper'],keep=order)
    return df1

In [263]:
#We get all the urls for the bachelor semester 1,5 and 6 since they are the only important ones by concatenating all 
#the values of the variables in the array attributes
#Since we only need Informatique, there is only one value to concatenate each time so we did it
#at the beginning. We use the precedent functions to avoid redunduncies
# At the end, we take as index the sciper numbers and drop it from the columns

u = 'http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&'\
+ 'ww_i_reportModelXsl=133685270&'

unite_acad = attributes[0][0] + '=' + attributes[0][1][0][0]
u = u + unite_acad + '&'

dfs = []
for i in np.array(attributes[2][1])[[0,4,5],:]:
    url = u + attributes[1][0] + '=' + str(attributes[1][1][0][0]) + '&' + attributes[2][0] + '=' + str(i[0])
    dfs.append(prepare_dataframe_bachelor(url,i[1],attributes[1][1][0][1]))
    
k = 0    

for i in np.array(attributes[2][1])[[0,4,5],:]:
    for j in attributes[1][1]:
        url = u + attributes[1][0] + '=' + str(j[0]) + '&' + attributes[2][0] + '=' + str(i[0])
        df2 = prepare_dataframe_bachelor(url,i[1],j[1])
        order = select_right_order(i[1])
        dfs[k] = concatenate_dataframes_right(dfs[k],df2,order)
    dfs[k].index = dfs[k]['No Sciper']
    dfs[k] = dfs[k].drop(['No Sciper'],axis=1)
    k = k + 1

In [270]:
#The merge function with inner gives all the student who did their first and last semester at epfl

final_bachelor = pd.merge(dfs[0], dfs[2], left_index=True, right_index=True, how='inner',suffixes=['', '_y'])
final_bachelor = final_bachelor.drop(['Civilité_y'],axis=1)
final_bachelor = pd.merge(final_bachelor, dfs[1], left_index=True, right_index=True, how='inner',suffixes=['', '_y'])
final_bachelor = final_bachelor.drop(['Civilité_y'],axis=1)

In [272]:
#We compute the time for each student to get their bachelor. Since we take the last semestre 6 for each student, 
#we just have to see if the fifth semestre's year is the same than the sixth semestre's year and if not we
#add half of a year in the time

time = final_bachelor['Bachelor semestre 6'] - final_bachelor['Bachelor semestre 1'] + 1
for i in range(final_bachelor['Bachelor semestre 5'].shape[0]):
    if final_bachelor['Bachelor semestre 6'][i] != final_bachelor['Bachelor semestre 5'][i]:
        time[i] = time[i] + 0.5
final_bachelor['Count'] = time
final_bachelor = final_bachelor[['Civilité','Bachelor semestre 1','Bachelor semestre 5','Bachelor semestre 6','Count']]

In [293]:
#Make the groupby with respect to the Civilité and calculate the mean for each gender

gender_group = final_bachelor[['Civilité','Count']]
gender_group = gender_group.groupby(gender_group['Civilité'])
gender_group.mean()

Unnamed: 0_level_0,Count
Civilité,Unnamed: 1_level_1
Madame,3.396552
Monsieur,3.574728


In [314]:
diffence_in_average_gender = (gender_group.mean().loc['Monsieur'] - gender_group.mean().loc['Madame']).values[0]
final_bachelor.loc[final_bachelor['Civilité'] == 'Monsieur'][['Count']]
final_bachelor.loc[final_bachelor['Civilité'] == 'Madame'][['Count']]


Unnamed: 0_level_0,Count
No Sciper,Unnamed: 1_level_1
169569,3.0
179449,3.0
178553,3.0
178271,3.0
180731,3.0
171619,3.0
178656,3.0
181121,3.0
180570,3.0
181248,3.0


## Question 2

http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=978181&ww_x_PERIODE_PEDAGO=2230106
http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=978181&ww_x_PERIODE_PEDAGO=942192
http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=978181&ww_x_PERIODE_PEDAGO=2230128
http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=978181&ww_x_PERIODE_PEDAGO=249127
http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?ww_x_GPS=-1&ww_i_reportModel=133685247&ww_i_reportModelXsl=133685270&ww_x_UNITE_ACAD=249847&ww_x_PERIODE_ACAD=978181&ww_x_PERIODE_PEDAGO=

In [378]:
def prepare_dataframe_master(url,i,j):
    df = pd.read_html(requests.get(url).text)[0]
    if '(0 ét.)' in df.loc[0]:
        return df
    else:
        df = df.drop([1,2,3,5,7,8,9,11],axis=1)
        df.columns = df.loc[1]
        df.columns.name = 0
        df = df.drop([0,1],axis=0) 
        df[i] = float(j[0:4])
        return df

In [382]:


dfs = []
#for i in np.array(attributes[2][1])[range(6,11),:]:
#    url = u + attributes[1][0] + '=' + str(attributes[1][1][0][0]) + '&' + attributes[2][0] + '=' + str(i[0])
#    dfs.append(prepare_dataframe(url,i[1],attributes[1][1][0][1]))
    
k = 0    
for i in np.array(attributes[2][1])[range(6,11),:]:
    for j in attributes[1][1]:
        url = u + attributes[1][0] + '=' + str(j[0]) + '&' + attributes[2][0] + '=' + str(i[0])
        #dfs.append(pd.read_html(requests.get(url).text)[0])
        df2 = prepare_dataframe_master(url,i[1],j[1])
        if '(0 ét.)' in df2.loc[0]:
            print(df2)
        print(k)
        k = k+1
        #order = select_right_order(i[1])
        #dfs[k] = concatenate_dataframes_right(dfs[k],df2,order)
    #dfs[k].index = dfs[k]['No Sciper']
    #dfs[k] = dfs[k].drop(['No Sciper'],axis=1)
    #k = k + 1

KeyError: 'the label [0] is not in the [index]'

In [380]:
dfs[0]

IndexError: list index out of range

In [None]:
payload_tails_a = {}
payload_tails_p = {}

sem_a = soup.find('option', text = "Semestre d'automne")
payload_tails_a[sem_a.parent['name']] = sem_a['value']

sem_p = soup.find('option', text = "Semestre de printemps")
payload_tails_p[sem_p.parent['name']] = sem_p['value']