# Data aggregation for visualization

* [Load data](#Load-data)
* [Detailed aggregation by race](#Detailed-aggregation-by-race)

## Load data

In [1]:
import json
import os
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_context('notebook')

In [2]:
# Load the full results database into `raw_df`.
# Alternative: pandas can read the CSV straight from its URL, but it takes a while:
#raw_df = pd.read_csv('https://www.dropbox.com/s/tt9z5bik6uqndbz/full_database.csv?dl=1')
# NOTE(review): the path below is absolute and specific to one machine; adjust it
# (or use the URL variant above) to run this notebook elsewhere.
raw_df = pd.read_csv('/Users/maximepeschard/Downloads/full_database.csv')

In [3]:
raw_df.shape

(1281195, 12)

In [4]:
raw_df.head()

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,Pace,len_name
0,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abächerli Walter,M,1952,Hinwil,47,"4:31.56,1","0:53.5,3","0:6.26,0",2
1,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abbringh Ellen,F,1962,NL-Doorn 3941 EB,91,"5:55.9,4","2:12.11,6","0:8.25,0",2
2,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abegglen Eddy,M,1954,Mürren,424,"5:45.21,9","2:20.33,8","0:8.11,0",2
3,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.328675,Abosa Emebet,F,1974,Zuoz,1,"3:21.46,1",False,"0:4.46,0",2
4,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abplanalp Michel,M,1960,Auvernier,143,"4:30.26,9","1:7.35,3","0:6.24,0",2


In [5]:
# Small hotfix for 'weird' dates: remove the '+' and 'bis' markers that appear in
# some date strings. Whitespace around the removed markers is left as it is.
raw_df.Date = raw_df.Date.map(
    lambda date_text: date_text.replace('+', '').replace('bis', '')
)

## Extra info

In [6]:
# Per-race extra information (weather, coordinates, calendar fields); the first CSV
# column is used as the index and the 'url' column is dropped.
races_info = pd.read_csv('../datasets/races-information.csv', index_col=0)
races_info = races_info.drop('url', axis=1)
races_info.head()

Unnamed: 0,date,name,location,min_temp,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year
0,sam. 27.03.1999,Männedörfler Waldlauf,Männedorf,,,,,47.2574625,8.6946733,saturday,27,3,1999
1,sam. 20.03.1999,Kerzerslauf,Kerzers,,,,,46.97488999999999,7.1954365,saturday,20,3,1999
2,sam. 24.04.1999,Luzerner Stadtlauf,Luzern,,,,,47.05016819999999,8.3093072,saturday,24,4,1999
3,sam. 24.04.1999,20km de Lausanne,Lausanne,,,,,46.5196535,6.6322734,saturday,24,4,1999
4,sam. 24.04.1999,"Chäsitzerlouf, Kehrsatz",Kehrsatz,,,,,,,saturday,24,4,1999


In [7]:
# Left-join the extra race information onto every result row, matching on
# (race name, date), then drop the join columns duplicated from races_info.
df = raw_df.merge(
    races_info,
    how='left',
    left_on=['Race', 'Date'],
    right_on=['name', 'date'],
).drop(['date', 'name', 'location'], axis=1)
print(df.shape)
df.head()

(1281195, 22)


Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,...,min_temp,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year
0,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abächerli Walter,M,1952,Hinwil,47,"4:31.56,1","0:53.5,3",...,,,,,,,saturday,6.0,9.0,2003.0
1,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abbringh Ellen,F,1962,NL-Doorn 3941 EB,91,"5:55.9,4","2:12.11,6",...,,,,,,,saturday,6.0,9.0,2003.0
2,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abegglen Eddy,M,1954,Mürren,424,"5:45.21,9","2:20.33,8",...,,,,,,,saturday,6.0,9.0,2003.0
3,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.328675,Abosa Emebet,F,1974,Zuoz,1,"3:21.46,1",False,...,,,,,,,saturday,6.0,9.0,2003.0
4,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abplanalp Michel,M,1960,Auvernier,143,"4:30.26,9","1:7.35,3",...,,,,,,,saturday,6.0,9.0,2003.0


## Convert coordinates to floats

In [8]:
# Make sure both coordinate columns hold Python-float-convertible values.
for coord_col in ('latitude', 'longitude'):
    df[coord_col] = df[coord_col].apply(float)

## Round distances

In [9]:
# Round every distance with the built-in round(); the displayed frames below show the
# results as whole numbers, and they later become the distance keys via str(race_dist).
df.Distance = df.Distance.apply(round)

## Clean race names

In [10]:
def clean_name(x):
    """Return `x` with every forward slash and backslash replaced by a space."""
    separators_to_space = str.maketrans({'/': ' ', '\\': ' '})
    return x.translate(separators_to_space)

# Replace '/' and '\' in every race name with a space.
df.Race = df.Race.apply(clean_name)

## Convert time string to seconds (ignore further precision)

In [11]:
def time_to_seconds(x):
    """Convert a time string such as '4:31.56,1' into whole seconds.

    The expected layout is 'hours:minutes.seconds,fraction'; everything after
    the comma is ignored.
    """
    by_colon = x.split(':')
    total = int(by_colon[0]) * 3600
    by_dot = by_colon[1].split('.')
    total += int(by_dot[0]) * 60
    # Keep only the whole seconds, dropping the part after the comma.
    total += int(by_dot[1].split(',')[0])
    return total

def seconds_to_time(x):
    """Format a number of seconds as 'hours:minutes:seconds' (no zero padding)."""
    total_minutes, seconds = divmod(x, 60)
    hours, minutes = divmod(total_minutes, 60)
    return ':'.join(str(part) for part in (hours, minutes, seconds))

In [12]:
# Whole-second value of each 'Time' string, stored in a new lowercase 'time' column.
df['time'] = df.Time.apply(time_to_seconds)

## Add a race+date index

In [13]:
# Combined "<race> ; <date>" key, e.g. "Course de l'Escalade, Genève ; sam. 05.12.2015".
df['race_index'] = df.Race + ' ; ' + df.Date

## Final global dataframe

In [14]:
df.tail()

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,...,uv_index,weather_desc,latitude,longitude,weekday,day,month,year,time,race_index
1281190,"Course de l'Escalade, Genève",sam. 05.12.2015,7,Zwahlen Guy,M,1959,Genève,920,"0:42.25,8","0:17.0,2",...,0.0,Sunny,46.204391,6.143158,saturday,5.0,12.0,2015.0,2545,"Course de l'Escalade, Genève ; sam. 05.12.2015"
1281191,"Course de l'Escalade, Genève",sam. 05.12.2015,7,Zwahlen Laurent,M,1967,Concise,1417,"0:37.43,4","0:13.27,1",...,0.0,Sunny,46.204391,6.143158,saturday,5.0,12.0,2015.0,2263,"Course de l'Escalade, Genève ; sam. 05.12.2015"
1281192,"Course de l'Escalade, Genève",sam. 05.12.2015,7,Zweigart Benjamin,M,1991,Confignon,357,"0:31.8,2","0:7.20,9",...,0.0,Sunny,46.204391,6.143158,saturday,5.0,12.0,2015.0,1868,"Course de l'Escalade, Genève ; sam. 05.12.2015"
1281193,"Course de l'Escalade, Genève",sam. 05.12.2015,7,Zwicky Pierre,M,1963,Genève,1089,"0:45.2,1","0:18.21,8",...,0.0,Sunny,46.204391,6.143158,saturday,5.0,12.0,2015.0,2702,"Course de l'Escalade, Genève ; sam. 05.12.2015"
1281194,"Course de l'Escalade, Genève",sam. 05.12.2015,2,Zwimpfer Maxime,M,2005,Vandoeuvres,223,"0:11.7,7","0:2.52,1",...,0.0,Sunny,46.204391,6.143158,saturday,5.0,12.0,2015.0,667,"Course de l'Escalade, Genève ; sam. 05.12.2015"


In [30]:
#df.Date[df.Race == 'Lausanne Marathon'].unique()
# Dates of the rows that have no weekday after the merge — presumably the rows that
# found no match in races_info (note the trailing space in the values shown below).
df.Date[df.weekday.isnull()].unique()

array(['jeu. 11.06.2015 ', 'jeu. 13.08.2015 '], dtype=object)

# Detailed aggregation by race

Steps :

* Build an `out_dict` which contains (hierarchically) the data for all races, for all dates, for all distances. At the same time, build another dictionary that maps full names to their 'encodings' (used for JSON file names).

```
out_dict = {
    encoded_name_1 : {
        'name': name,
        'longitude': longitude,
        'latitude': latitude,
        'dates': {
            'date_1': {
                'weekday': weekday,
                ...
                'distances': {
                    'distance_1': {
                        'count_men': count_men,
                        'count_women': count_women,
                        'times': [x1, ..., xN]
                    },
                    ...
                }
            },
            ...
        }
    },
    ...
}

names_dict = {
    encoded_name_1 : name_1,
    encoded_name_2 : name_2,
    ...
}
```

* Export `names_dict` to a JSON file.
* For each race name `encoded_name`, export `out_dict[encoded_name]` to a JSON file.

In [31]:
###### HELPERS ######
# French weekday abbreviations (the prefix of the Date strings) -> English weekday names.
week_dict = dict(
    lun='monday',
    mar='tuesday',
    mer='wednesday',
    jeu='thursday',
    ven='friday',
    sam='saturday',
    dim='sunday',
)


def fill_date(dataframe, dictionary):
    """Write the weekday, day, month and year of a race date into `dictionary`.

    The values are taken from the first row of the `weekday`, `day`, `month`
    and `year` columns of `dataframe`. If any of them is missing, all four are
    instead parsed from the first `Date` string, which is expected to look
    like 'sam. 06.09.2003' (French weekday abbreviation, then day.month.year).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows of a single race date; must have the columns `Date`, `weekday`,
        `day`, `month` and `year`.
    dictionary : dict
        Updated in place with the keys 'weekday' (str), 'day', 'month' and
        'year' (int).

    Returns
    -------
    None
    """
    weekday = dataframe.weekday.unique()[0]
    day = dataframe.day.unique()[0]
    # Read the month from the `month` column (it was previously read from `day`).
    month = dataframe.month.unique()[0]
    year = dataframe.year.unique()[0]
    if pd.isnull(weekday) or pd.isnull(day) or pd.isnull(month) or pd.isnull(year):
        # The calendar columns are incomplete: parse the date string instead.
        # 'jeu. 11.06.2015 ' -> ['jeu', '11', '06', '2015']
        date_parts = [part.strip() for part in dataframe.Date.iloc[0].split('.')]
        dictionary['weekday'] = week_dict[date_parts[0]]
        dictionary['day'] = int(date_parts[1])
        dictionary['month'] = int(date_parts[2])
        dictionary['year'] = int(date_parts[3])
    else:
        dictionary['weekday'] = weekday
        # The merged columns are floats (e.g. 6.0); export plain integers.
        dictionary['day'] = int(day)
        dictionary['month'] = int(month)
        dictionary['year'] = int(year)


def get_count(dataframe, column, value):
    """Return how many rows of `dataframe[column]` equal `value`, as an int.

    Returns 0 when the lookup raises KeyError (e.g. the value never occurs).
    """
    try:
        tally = dataframe[column].value_counts()
        occurrences = tally[value]
    except KeyError:
        return 0
    return int(occurrences)


###### ITERATION OVER RACES ######
# Build two dictionaries:
#   names_dict: encoded race name -> full race name
#   out_dict:   encoded race name -> race data, nested by date and then by distance
names_dict = {}
out_dict = {}
# groupby(sort=False) yields the groups in order of first appearance (the same order
# as Series.unique()) in a single pass, instead of re-filtering the whole frame with
# a boolean mask for every race, date and distance.
for race_name, race_df in df.groupby('Race', sort=False):

    date_wrapper = {}
    for race_date, date_df in race_df.groupby('Date', sort=False):
        date_dict = {}
        # Note that for some dates, we don't already have this info and have to compute it
        fill_date(date_df, date_dict)
        # TODO: weather !
        date_dict['count_men'] = get_count(date_df, 'Sex', 'M')
        date_dict['count_women'] = get_count(date_df, 'Sex', 'F')

        # One entry per distance run on that date, keyed by the distance as a string.
        date_dict['distances'] = {
            str(race_dist): {
                'count_men': get_count(dist_df, 'Sex', 'M'),
                'count_women': get_count(dist_df, 'Sex', 'F'),
                'times': dist_df['time'].tolist(),
            }
            for race_dist, dist_df in date_df.groupby('Distance', sort=False)
        }

        date_wrapper[race_date] = date_dict

    race_dict = {'name': race_name}
    for coord_col in ('latitude', 'longitude'):
        first_value = race_df[coord_col].iloc[0]
        # Missing coordinates are stored as None so that json.dump writes `null`;
        # a NaN would be written as the token `NaN`, which is not valid JSON.
        race_dict[coord_col] = None if pd.isnull(first_value) else float(first_value)
    race_dict['dates'] = date_wrapper

    # The encoded name keeps only ASCII letters and digits of the lower-cased name.
    # NOTE(review): two races whose names differ only in other characters would get
    # the same encoded name, and the later one would overwrite the earlier — confirm
    # this cannot happen with the current data.
    encoded_name = re.sub('[^0-9a-zA-Z]+', '', race_name.lower())
    names_dict[encoded_name] = race_name
    out_dict[encoded_name] = race_dict

In [32]:
# Write the encoded-name -> full-name mapping to a single JSON file.
with open('racesnames.json', 'w') as names_file:
    json.dump(names_dict, names_file)

In [33]:
# Write one JSON file per race into the 'races' directory, named after the encoded
# race name. The directory is created first so the cell also works on a fresh checkout.
os.makedirs('races', exist_ok=True)
for encoded_name, race_payload in out_dict.items():
    with open('races/' + encoded_name + '.json', 'w') as out_file:
        json.dump(race_payload, out_file)

In [None]:
#out_dict['20km de Lausanne']#['sam. 22.04.2006']['distances']

In [None]:
# --- SOME TESTS ----
#another_df = df[(df.Race=='10km de Payerne') & (df.Date=='sam. 27.03.2010')].copy()
# Histogram of the times for one race / date / distance, using 9 equal-width bins
# (10 integer edges) between the smallest and the largest time.
another_df = df[(df.Race=='Lausanne Marathon') & (df.Date=='dim. 31.10.2010') &(round(df.Distance)==21)].copy()
bins = np.linspace(another_df.time.min(), another_df.time.max(), 10, dtype=int)
# include_lowest=True closes the first interval on the left; without it the smallest
# time (which equals bins[0]) falls outside every bin and is left out of the counts.
another_df['time_bins'] = pd.cut(another_df.time, bins, include_lowest=True)
# Count once and reuse the result for the plot and the printed values.
bin_counts = another_df.time_bins.value_counts(sort=False)
bin_counts.plot(kind='bar');
print(bin_counts.values.tolist())
print(bins)
bin_counts.index.tolist()