# Parsing Data from [datasport.com](https://www.datasport.com/en/)

We use Postman to inspect the parameters sent with the URL request that the exercise asks for.

(Note, however, that equivalent tools exist for other browsers - for instance, for Firefox:
http://stackoverflow.com/questions/28997326/postman-addons-like-in-firefox)

In [1]:
# important modules for this HW
import bs4 # doc: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
import requests as rq 


# previous useful modules
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

In [2]:
# Download the datasport.com landing page and parse its HTML so that the
# form elements it contains can be inspected in the following cells.
form_source = rq.get("https://www.datasport.com/en/")
form_soup = bs4.BeautifulSoup(form_source.text, "html.parser")
# print(form_soup.prettify())

Let's get all the `select` menus of the page, using the `find_all` method of *BeautifulSoup*, which searches for all tags of a given type.

In [None]:
# Collect every <select> tag of the parsed page and print how many were found.
selectors = form_soup.find_all('select')
print(len(selectors))

Most importantly, we can find out what each tag is about by printing its `name` attribute:

In [None]:
# Print the position of each <select> tag in `selectors` together with the
# value of its `name` attribute.
for num, s in enumerate(selectors):
    print("Select n°{} : {}".format(num, s.attrs['name']))

In [None]:
# For every <select> menu, print its name followed by one line per <option>
# formatted as "- <option text> [<option value>]".
for s in selectors:
    options = s.find_all('option')
    # (text, value) pairs; an <option> without a `value` attribute raises KeyError here
    options_desc_values = [(o.text, o.attrs['value']) for o in options]
    print(s.attrs['name'] + ':')
    for (d,v) in options_desc_values:
        print("- {} [{}]".format(d,v))

## Get all tables

In [14]:
# Scrape the datasport.com calendar month by month (1999-2015) and record, for
# every event row that links to a results page, the results URL(s), the event
# name, its date and its place in four parallel lists.
list_url = []
list_dates = []
list_names = []
list_places = []

# Fixed form fields sent with every calendar query.
etyp = 'Running'
eventlocation = 'CCH'
eventservice = 'all'

# Values of the varying form fields: months '01'..'12' and years '1999'..'2015'.
eventmonth = [str(month + 1).zfill(2) for month in range(12)]
eventyear = [str(year).zfill(4) for year in range(1999, 2016)]

# Per-row counters of what was extracted; compared and reset after every row.
yes_date = 0
yes_rank = 0
yes_name = 0
yes_place = 0

for year in eventyear:
    print(year, 'parsed.')

    for month in eventmonth:
        d = {'etyp': etyp, 'eventlocation': eventlocation,
             'eventmonth': month, 'eventservice': eventservice,
             'eventyear': year}
        post_source = rq.post('https://www.datasport.com/fr/calendrier/', data=d)
        form = bs4.BeautifulSoup(post_source.text, "html.parser")

        find_tr = form.find_all('tr')
        for tr in find_tr:
            # Only rows whose first CSS class is 'even' or 'odd' are processed.
            if tr.has_attr('class') and tr['class'][0] in ('even', 'odd'):
                all_td = tr.find_all('td')

                # Cell 4: keep the links to services.datasport.com, except
                # PDFs and hrefs ending in 'pavees' or 'mmc'.
                find_a = all_td[4].find_all('a')
                for a in find_a:
                    if (a['href'].startswith('http://services.datasport.com/')
                            and not a['href'].endswith(('.pdf', 'pavees', 'mmc'))):
                        list_url.append(a['href'])
                        yes_rank += 1
                find_a2 = all_td[14].find_all('a')

                # Place, name and date are only recorded for rows that
                # yielded at least one results link.
                if yes_rank > 0:
                    # Cell 14: the place is the text after the last '=' of
                    # the Google Maps link, up to the first comma.
                    for a2 in find_a2:
                        if a2['href'].startswith('http://maps.google.ch/'):
                            list_places.append(a2['href'].split('=')[-1].split(',')[0])
                            yes_place += 1

                    # Cell 1: the name is the first content node of the first link.
                    find_a = all_td[1].find_all('a')
                    list_names.append(find_a[0].contents[0])
                    yes_name += 1

                    # Cell 0: the date is held by the <span> whose class is empty.
                    find_date = all_td[0].find_all('span')
                    for date in find_date:
                        if date.has_attr('class') and date['class'][0] == '':
                            the_date = date.contents[0]
                            # Strip a trailing '+' and then a trailing ' bis'.
                            if the_date[-1] == '+':
                                the_date = the_date[:-1]
                            if the_date[-4:] == ' bis':
                                the_date = the_date[:-4]
                            # Store the cleaned value (the raw one used to be
                            # appended, which discarded the stripping above).
                            list_dates.append(the_date)
                            yes_date += 1

                # Debugging step: report rows for which the four extractions
                # did not produce the same number of items.
                if yes_date != yes_name or yes_date != yes_rank or yes_date != yes_place:
                    print(yes_rank, yes_place, month, year)
                    print(list_url[-1])
                    print(list_dates[-1])
                yes_rank = 0
                yes_name = 0
                yes_date = 0
                yes_place = 0

1999 parsed.
2000 parsed.
2001 parsed.
2002 parsed.
2003 parsed.
2004 parsed.
2005 parsed.
2006 parsed.
2007 parsed.
2008 parsed.
2009 parsed.
2010 parsed.
2011 parsed.
2012 parsed.
2013 parsed.
2014 parsed.
2015 parsed.


In [15]:
# Sanity check: print, for each consecutive pair of result lists, whether
# both lists have the same length.
for first, second in ((list_url, list_names),
                      (list_names, list_dates),
                      (list_dates, list_places)):
    print(len(first) == len(second))

True
True
True


In [16]:
# Gather the four parallel lists into a single DataFrame, one column per list.
run_columns = {
    'Name': list_names,
    'Date': list_dates,
    'Place': list_places,
    'URL': list_url,
}
all_runs_df = pd.DataFrame(run_columns)

In [17]:
all_runs_df.head()

Unnamed: 0,Date,Name,Place,URL
0,sam. 27.03.1999,Männedörfler Waldlauf,Männedorf,http://services.datasport.com/1999/zkb/maennedorf
1,sam. 20.03.1999,Kerzerslauf,Kerzers,http://services.datasport.com/1999/lauf/kerzers
2,sam. 24.04.1999,Luzerner Stadtlauf,Luzern,http://services.datasport.com/1999/lauf/luzern
3,sam. 24.04.1999,20km de Lausanne,Lausanne,http://services.datasport.com/1999/lauf/km20
4,sam. 24.04.1999,"Chäsitzerlouf, Kehrsatz",Kehrsatz,http://services.datasport.com/1999/lauf/kehrsatz


In [18]:
# Save the table of runs as 'links2runs.csv' (path relative to the working directory).
all_runs_df.to_csv('links2runs.csv')

## Let's get some data

To get started, we can now collect the results from the Lausanne marathon, one of the main early events in Switzerland.  

Understand the html of the main page, and __extract the relevant parameters__ to query:

In [None]:
# Download the main results page of the 2016 Lausanne marathon.
laus_mar_url = 'https://services.datasport.com/2016/lauf/lamara/'
result_html = rq.get(laus_mar_url)

# Use BeautifulSoup to collect the <font> tags, which hold the categories
# into which the data is divided:

result_soup = bs4.BeautifulSoup(result_html.text, "lxml")
result_font = result_soup.find_all('font')

print('number of categories in the main page:', len(result_font))

In [None]:
# Keep the positions (within `result_font`) of the <font> tags whose first
# descendant tag contains 'Overall'; '*** Overall ***' marks the most
# general categories.

# 'Overall' is probably a general keyword, as it is also found on event
# pages in other languages,
# like https://services.datasport.com/2016/lauf/ascona-locarno-marathon/

good_fonts_num = []

for n_font, font in enumerate(result_font):

    # NOTE(review): presumably every <font> has a child tag; otherwise
    # findChild() returns None and get_text() fails -- confirm.
    if 'Overall' in font.findChild().get_text():

        good_fonts_num.append(n_font)
        print(font.findChild().get_text())


# Convert the list of positions to a NumPy array.
good_fonts_num = np.asarray(good_fonts_num)

# KNOWN ISSUE: the men's marathon ("marathon hommes") entries do not have
# the same HTML structure as the other categories.

In [None]:
good_fonts_num

In [None]:
# we have to get all: href=RANG*** b

rang_to_query = []

for i in range(len(good_fonts_num)-1):
        
    my_font = result_font[good_fonts_num[i] + 1]
    a_tag = my_font.find_all('a')
    
    for t in a_tag:
    
        if 'RANG' in t['href']:
            
            rang_to_query.append(t['href'])
        
#             print(t['href'])

Query datasport.com with the right parameters and finally get the __tables__:

In [None]:
# Event base URL; the expression below displays the URL obtained by appending
# the first collected 'RANG' href to it as a path segment.
base_url = "https://services.datasport.com/2016/lauf/lamara"
base_url + '/' + rang_to_query[0]

In [None]:
# Get raw HTML response
# NOTE(review): the href is passed as the query string here, whereas the
# previous cell appends it to base_url as a path segment -- confirm which
# form the server expects.
result_html = rq.get(base_url, params=rang_to_query[0])

# Parse the response with BeautifulSoup (the extraction of the first table
# is currently commented out; the whole document is handed to pandas below)
result_soup = bs4.BeautifulSoup(result_html.text, "lxml")
# result_table = result_soup.find_all('table')[0]

# print(result_table.prettify())

# Let pandas parse the tables found in the serialized document
df_trail = pd.read_html(result_soup.decode())

In [None]:
df_trail.

# ******* ******* ******* ******* *******  
# OLD CODE 
# ******* ******* ******* ******* ******* ******* 

In [None]:
# Parse the HTML table into a DataFrame and preview it.
# NOTE(review): `result_table` is only assigned in a commented-out line of an
# earlier cell, so this presumably raises NameError as is -- confirm.
df = pd.read_html(result_table.decode())[0]
df.head()

In [None]:
# Reshape the raw table held in `df`: promote the row labelled 1 to column
# headers, remove the two leading rows and the NaN-named column, and index
# the rows by the 'No Sciper' column.
df.columns = df.loc[1]                # use row 2 as column names
df = df.drop([0, 1])                  # drop useless first rows
df = df.drop([np.nan], axis=1)        # drop useless nan column
df.index = df['No Sciper']            # use sciper column as index

# Drop some columns
df = df.drop(['Orientation Bachelor', 'Orientation Master', 'Filière opt.', 'Type Echange', 'Ecole Echange'], axis=1)

# Do some renaming
df.index.name = 'sciper'
df.columns = ['gender', 'full_name', 'specialization', 'minor', 'status', 'sciper']

# Map gender to more standard names. The result is assigned back to the
# column: calling replace(inplace=True) on `df.gender` acts on an intermediate
# Series and is not guaranteed to update `df` (pandas Copy-on-Write).
dict_gender = {'Monsieur': 'male','Madame': 'female'}
df['gender'] = df['gender'].replace(dict_gender)
df.head()

## Some tools

We can define a helper function which, given a base URL and a dictionary of parameters, will fetch the data and fill a DataFrame with it.

In [None]:
def get_data(base_url, params_dict):
    """Get data from IS-Academia in a pandas DataFrame.

    Sends a GET request to ``base_url`` with ``params_dict`` as query
    parameters, parses the first ``<table>`` of the response and cleans it:
    the row labelled 1 becomes the header, the two leading rows, the
    NaN-named column and the orientation/exchange columns are dropped, the
    remaining columns are renamed and the rows are indexed by sciper.

    Args:
        base_url: URL to query.
        params_dict: query parameters forwarded to ``requests.get``.

    Returns:
        The cleaned DataFrame, or an empty DataFrame when the page contains
        no table, the table has no text, or the table does not have the
        expected layout.
    """
    # Local import: pd.read_html expects a file-like object, passing a literal
    # HTML string is deprecated.
    import io

    result_html = rq.get(base_url, params=params_dict)
    result_soup = bs4.BeautifulSoup(result_html.text, "lxml")

    # find() returns None when the page has no <table>, instead of raising
    # IndexError like find_all('table')[0] would.
    result_table = result_soup.find('table')
    if result_table is None or result_table.text == '':
        # Return empty dataframe
        return pd.DataFrame()

    # Build a DataFrame containing the data, with SCIPER as index
    df = pd.read_html(io.StringIO(result_table.decode()))[0]
    try:
        df.columns = df.loc[1]                # use 2nd row as column names
        df = df.drop([0, 1])                  # drop useless first rows
        df = df.drop([np.nan], axis=1)        # drop useless nan column
        df.index = df['No Sciper']            # use sciper column as index

        # Drop some columns
        df = df.drop(['Orientation Bachelor', 'Orientation Master', 'Filière opt.', 'Type Echange', 'Ecole Echange'], axis=1)
        # Do some renaming
        df.index.name = 'sciper'
        df.columns = ['gender', 'full_name', 'specialization', 'minor', 'status', 'sciper']
        # Map gender to more standard names; assign the result back rather
        # than replacing in place on an intermediate Series.
        dict_gender = {'Monsieur': 'male', 'Madame': 'female'}
        df['gender'] = df['gender'].replace(dict_gender)
    except (KeyError, ValueError):
        # Missing labels (KeyError) or a wrong number of columns (ValueError)
        # mean the table does not have the expected layout.
        df = pd.DataFrame()

    return df

The following lines test this function with hardcoded values :

In [None]:
# Hardcoded endpoint and query parameters for one report, used to try
# get_data; the first rows of the result are displayed.
base_url = "http://isa.epfl.ch/imoniteur_ISAP/!GEDPUBLICREPORTS.html?"
params_dict = {
    'ww_x_GPS': 2021043255,
    'ww_i_reportModel': 133685247,
    'ww_i_reportModelXsl': 133685270,
    'ww_x_UNITE_ACAD': 249847,
    'ww_x_PERIODE_ACAD': 355925344,
    'ww_x_PERIODE_PEDAGO': 249108,
    'ww_x_HIVERETE':2936286
}

get_data(base_url, params_dict).head()

Finally let's get all the possible values in a cleaner way and keep them in variables that we will use throughout this notebook.

In [None]:
# Build, from the <select> menus in `selectors`, one dictionary per query
# parameter mapping each option's text to its integer value.
# NOTE(review): `selectors` is built in an earlier cell from the datasport.com
# page, while the names matched below are the ww_x_* parameters used with the
# isa.epfl.ch URL -- confirm which page `selectors` should come from.
acad_period = {}
level = {}
semester = {}
acad_unit = {}

for s in selectors:
    options = s.find_all('option')
    options_desc_values = [(o.text, o.attrs['value']) for o in options]
    s_name = s.attrs['name']
    # text -> int(value) for every option with a non-empty text; this is
    # evaluated for every menu, so a non-numeric value raises ValueError
    choices = {d: int(v) for (d,v) in options_desc_values if d!=''}
    
    if s_name == 'ww_x_PERIODE_ACAD':
        acad_period = choices
    elif s_name == 'ww_x_PERIODE_PEDAGO':
        level = choices
    elif s_name == 'ww_x_HIVERETE':
        # Semesters are stored under the short keys 'automne' / 'printemps'
        # rather than under the full option text.
        for (d,v) in options_desc_values:
            if 'automne' in d:
                semester['automne'] = int(v)
            elif 'printemps' in d:
                semester['printemps'] =int(v)
    elif s_name == 'ww_x_UNITE_ACAD':
        acad_unit = choices

# Example of result
acad_period

### Store data locally

In [None]:
# Create the directory in which the downloaded data is stored; if it already
# exists, keep it and just report that it is being reused.
import os
local_dir = '.local-data'
try:
    os.mkdir(local_dir)
except FileExistsError:
    # directory exists
    print("Using existing '" + local_dir + "' directory")

In [None]:
# Fixed values
params_dict = {
    'ww_x_GPS': -1,
    'ww_i_reportModel': 133685247,
    'ww_i_reportModelXsl': 133685270,
    'ww_x_UNITE_ACAD': acad_unit['Informatique']
}

# Iterate over all the varying params and keep only data for bachelors
for year_key, year_value in acad_period.items():
    for level_key, level_value in level.items():
        for semester_key, semester_value in semester.items():
            if 'bachelor' in level_key.lower():
                params_dict['ww_x_PERIODE_ACAD'] = year_value
                params_dict['ww_x_PERIODE_PEDAGO'] = level_value
                params_dict['ww_x_HIVERETE'] = semester_value
                
                df = get_data(base_url, params_dict)
                if not df.empty:
                    # Persist dataframe locally with pickle
                    filename = year_key + '-' + level_key.replace(' ', '-').lower() + '-' + semester_key
                    df.to_pickle(local_dir + '/' + filename)

In [None]:
# The download cell above is expected to produce 60 files; print the number of
# entries actually present in the local directory to check.
print(len(os.listdir(local_dir)))

Here is an example of a DataFrame loaded from one of the previously downloaded files:

In [None]:
# Load one of the pickled DataFrames back from the local directory and preview
# its first rows.
df_example = pd.read_pickle(local_dir + '/2007-2008-bachelor-semestre-6-printemps')
df_example.head()