# Data aggregation for runners data visualization
**Warning:** We assume that two records sharing the same name (first name and family name) and the same birth year refer to a single person.

## Load data
Requirements:
`pip install unidecode`

In [44]:
import json
import math
import os
import pickle
import re
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from unidecode import unidecode
%matplotlib inline
sns.set_context('notebook')

In [2]:
# Load the pickled results table (this can take a while).
# NOTE(review): the commented-out line below points at a Google Drive "view" page,
# and `pd.read_pickle` reads pickle files, not CSV — presumably it is kept only as a
# pointer to where the file is hosted; confirm before re-enabling it.
# raw_df = pd.read_pickle('https://drive.google.com/file/d/0BypxDaHZHjhfYTBsMGM2WVlFdkU/view?usp=sharing')
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere
# until this points at a local copy. Only unpickle files from a trusted source.
raw_df = pd.read_pickle('/home/ondine/Desktop/ADA/df_duplicates_fixed.pickle')

In [3]:
# Preview the first rows of the raw results table.
raw_df.head()

Unnamed: 0,Race,Date,RaceYear,RaceMonth,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Pace,Place,MinTemp,MaxTemp,Weather,RaceID
0,Kerzerslauf,sam. 18.03.2000,2000,3,15.0,Abgottspon Peter,M,1974.0,Zermatt,233,01:02:25,00:04:09,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers
1,Kerzerslauf,sam. 18.03.2000,2000,3,15.0,Abplanalp Michael,M,1964.0,Bern,32,00:55:11.700000,00:03:40,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers
2,Kerzerslauf,sam. 18.03.2000,2000,3,15.0,Abt Werner,M,1947.0,Spiez,155,01:12:42.900000,00:04:50,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers
3,Kerzerslauf,sam. 18.03.2000,2000,3,15.0,Ackermann Antoinette,F,1953.0,Alterswil,48,01:22:36.700000,00:05:30,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers
4,Kerzerslauf,sam. 18.03.2000,2000,3,15.0,Ackermann Hedy,F,1946.0,Alterswil FR,42,01:23:29.300000,00:05:33,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers


### Extra info

In [5]:
# Per-race metadata; the first CSV column is used as the index and the
# 'url' column is discarded.
races_info = (
    pd.read_csv('../datasets/races-information.csv', index_col=0)
    .drop('url', axis=1)
)
races_info.head()

Unnamed: 0,date,name,location,min_temp,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year
0,sam. 27.03.1999,Männedörfler Waldlauf,Männedorf,,,,,47.2574625,8.6946733,saturday,27,3,1999
1,sam. 20.03.1999,Kerzerslauf,Kerzers,,,,,46.97488999999999,7.1954365,saturday,20,3,1999
2,sam. 24.04.1999,Luzerner Stadtlauf,Luzern,,,,,47.05016819999999,8.3093072,saturday,24,4,1999
3,sam. 24.04.1999,20km de Lausanne,Lausanne,,,,,46.5196535,6.6322734,saturday,24,4,1999
4,sam. 24.04.1999,"Chäsitzerlouf, Kehrsatz",Kehrsatz,,,,,,,saturday,24,4,1999


In [6]:
# Attach the race metadata to every result row. The join is a left join on
# (race name, raw date string), so every row of raw_df is kept.
# After the merge, drop the right-hand key columns ('date', 'name') together
# with the raw weather / race-year / race-month / RaceID columns.
columns_to_drop = [
    'date', 'name',
    'MinTemp', 'MaxTemp', 'Weather',
    'RaceYear', 'RaceMonth', 'RaceID',
]
df = (
    raw_df
    .merge(races_info, how='left',
           left_on=['Race', 'Date'], right_on=['name', 'date'])
    .drop(columns_to_drop, axis=1)
)
print(df.shape)
df.columns

(1650887, 22)


Index(['Race', 'Date', 'Distance', 'Name', 'Sex', 'Year', 'LivingPlace',
       'Rank', 'Time', 'Pace', 'Place', 'location', 'min_temp', 'max_temp',
       'uv_index', 'weather_desc', 'latitude', 'longitude', 'weekday', 'day',
       'month', 'year'],
      dtype='object')

In [7]:
# Preview the merged table (results plus race metadata columns).
df.head()

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Pace,...,min_temp,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year
0,Kerzerslauf,sam. 18.03.2000,15.0,Abgottspon Peter,M,1974.0,Zermatt,233,01:02:25,00:04:09,...,,,,,46.97488999999999,7.1954365,saturday,18,3,2000
1,Kerzerslauf,sam. 18.03.2000,15.0,Abplanalp Michael,M,1964.0,Bern,32,00:55:11.700000,00:03:40,...,,,,,46.97488999999999,7.1954365,saturday,18,3,2000
2,Kerzerslauf,sam. 18.03.2000,15.0,Abt Werner,M,1947.0,Spiez,155,01:12:42.900000,00:04:50,...,,,,,46.97488999999999,7.1954365,saturday,18,3,2000
3,Kerzerslauf,sam. 18.03.2000,15.0,Ackermann Antoinette,F,1953.0,Alterswil,48,01:22:36.700000,00:05:30,...,,,,,46.97488999999999,7.1954365,saturday,18,3,2000
4,Kerzerslauf,sam. 18.03.2000,15.0,Ackermann Hedy,F,1946.0,Alterswil FR,42,01:23:29.300000,00:05:33,...,,,,,46.97488999999999,7.1954365,saturday,18,3,2000


### Processing - cleaning

In [8]:
# Replace the non-numeric placeholder found in each coordinate column with NaN
# so the columns can be converted to float afterwards.
# NOTE(review): 'n' and 'a' are presumably the two halves of an "n/a" marker — confirm.
for coord_column, placeholder in (('latitude', 'n'), ('longitude', 'a')):
    df.loc[df[coord_column] == placeholder, coord_column] = np.nan

In [82]:
# Normalise column dtypes with vectorised conversions instead of a Python-level
# call per row (the table has ~1.65M rows).
# Coordinates become floats (NaN is preserved).
df['latitude'] = df['latitude'].astype(float)
df['longitude'] = df['longitude'].astype(float)
# Distances are rounded to whole numbers and stored as integers;
# like the built-in round(), Series.round() rounds halves to the even neighbour.
df['Distance'] = df['Distance'].round().astype(int)
# Missing birth years are encoded as 0 so the column can be an integer.
df['Year'] = df['Year'].fillna(0).astype(int)

<class 'numpy.float64'>


In [10]:
def clean_name(x):
    """Return `x` with every forward slash and backslash replaced by a space."""
    slashes_to_space = str.maketrans({"/": " ", "\\": " "})
    return x.translate(slashes_to_space)

# Strip slashes / backslashes from every race name.
df['Race'] = df['Race'].map(clean_name)

In [12]:
# Add a numeric 'time' column: the finishing time in seconds (float), obtained by
# calling timedelta.total_seconds on each value of the 'Time' column.
df['time'] = df.Time.apply(timedelta.total_seconds)

In [13]:
# Index the results by (Name, Year), the pair used to identify a runner,
# and check whether that pair is unique across rows.
# NOTE(review): a non-unique index is presumably expected, since one runner can
# have results in several races — confirm.
doubleindex_df = df.set_index(['Name','Year'])
doubleindex_df.index.is_unique

False

In [14]:
# Preview the (Name, Year)-indexed table.
doubleindex_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Race,Date,Distance,Sex,LivingPlace,Rank,Time,Pace,Place,location,...,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year,time
Name,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Abgottspon Peter,1974,Kerzerslauf,sam. 18.03.2000,15,M,Zermatt,233,01:02:25,00:04:09,Kerzers,Kerzers,...,,,,46.97489,7.195436,saturday,18,3,2000,3745.0
Abplanalp Michael,1964,Kerzerslauf,sam. 18.03.2000,15,M,Bern,32,00:55:11.700000,00:03:40,Kerzers,Kerzers,...,,,,46.97489,7.195436,saturday,18,3,2000,3311.7
Abt Werner,1947,Kerzerslauf,sam. 18.03.2000,15,M,Spiez,155,01:12:42.900000,00:04:50,Kerzers,Kerzers,...,,,,46.97489,7.195436,saturday,18,3,2000,4362.9
Ackermann Antoinette,1953,Kerzerslauf,sam. 18.03.2000,15,F,Alterswil,48,01:22:36.700000,00:05:30,Kerzers,Kerzers,...,,,,46.97489,7.195436,saturday,18,3,2000,4956.7
Ackermann Hedy,1946,Kerzerslauf,sam. 18.03.2000,15,F,Alterswil FR,42,01:23:29.300000,00:05:33,Kerzers,Kerzers,...,,,,46.97489,7.195436,saturday,18,3,2000,5009.3


In [15]:
# Columns remaining once Name and Year have moved into the index.
doubleindex_df.columns

Index(['Race', 'Date', 'Distance', 'Sex', 'LivingPlace', 'Rank', 'Time',
       'Pace', 'Place', 'location', 'min_temp', 'max_temp', 'uv_index',
       'weather_desc', 'latitude', 'longitude', 'weekday', 'day', 'month',
       'year', 'time'],
      dtype='object')

In [16]:
# Columns of the flat table, including the derived 'time' column.
df.columns

Index(['Race', 'Date', 'Distance', 'Name', 'Sex', 'Year', 'LivingPlace',
       'Rank', 'Time', 'Pace', 'Place', 'location', 'min_temp', 'max_temp',
       'uv_index', 'weather_desc', 'latitude', 'longitude', 'weekday', 'day',
       'month', 'year', 'time'],
      dtype='object')

## Detailed aggregation by race

Steps :
* Build an `out_dict` which contains (hierarchically) the data for every runner and every race. At the same time, build another dictionary, `names_dict`, that maps each runner's 'encoding' (used for JSON file names) to their full name.

```
out_dict = {
    encoded_name_1 : {
        'name': Name,
        'birth': Year,
        'sex': Sex,
        'races': {
            'race_1': {
                'race': Race,
                'location': location,
                'latitude': latitude,
                'longitude': longitude,
                'date': {'date_1': {
                            'weekday': weekday,
                            'day': day,
                            'month': month,
                            'year': year,
                            'livingplace': LivingPlace,
                            'categories': {
                                'category_1': {
                                    'distance': Distance, 
                                    'rank': Rank, 
                                    'time': time, 
                                    'pace': Pace
                                    },
                                ...             # 'category_2', etc.
                                }
                          },
                          ...                   # 'date_2', etc.
                },
            }, 
            ...                                 # 'race_2', etc.
        }, 
    ...                                         # 'encoded_name_2', etc.
}

names_dict = {
    encoded_name_1 : name_1,
    encoded_name_2 : name_2,
    ...
}
```

* Export names_dict to a JSON file.
* For each runner's encoded_name, export out_dict[encoded_name] to a JSON file.

In [17]:
###### HELPERS ######
# Maps the French weekday abbreviation that prefixes the raw `Date` strings
# (e.g. 'sam. 18.03.2000') to the English weekday name used in the exports.
week_dict = {
    'lun': 'monday',
    'mar': 'tuesday',
    'mer': 'wednesday',
    'jeu': 'thursday',
    'ven': 'friday',
    'sam': 'saturday',
    'dim': 'sunday'
}

def fill_date(dataframe, dictionary):
    """Write the 'weekday', 'day', 'month' and 'year' of a race date into `dictionary`.

    The values are taken from the first row of `dataframe` (columns `weekday`,
    `day`, `month`, `year`). If any of them is missing, all four are instead
    parsed from the first raw `Date` string, formatted like 'sam. 18.03.2000'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Rows describing a single race date; must have the columns `weekday`,
        `day`, `month`, `year` and `Date`, and at least one row.
    dictionary : dict
        Updated in place; nothing is returned.

    Raises
    ------
    KeyError
        If the fallback parsing meets a weekday abbreviation not in `week_dict`.
    """
    weekday = dataframe.weekday.unique()[0]
    day = dataframe.day.unique()[0]
    # Read the month from the `month` column (it used to be read from `day`,
    # which exported the day number as the month).
    month = dataframe.month.unique()[0]
    year = dataframe.year.unique()[0]
    if pd.isnull(weekday) or pd.isnull(day) or pd.isnull(month) or pd.isnull(year):
        # Metadata is incomplete: rebuild everything from the raw date string.
        # 'sam. 18.03.2000'.split('.') -> ['sam', ' 18', '03', '2000']
        raw_parts = [part.strip() for part in dataframe.Date.unique()[0].split('.')]
        dictionary['weekday'] = week_dict[raw_parts[0]]
        dictionary['day'] = int(raw_parts[1])
        dictionary['month'] = int(raw_parts[2])
        dictionary['year'] = int(raw_parts[3])
    else:
        dictionary['weekday'] = weekday
        # Cast to built-in int so the values are JSON serialisable.
        dictionary['day'] = int(day)
        dictionary['month'] = int(month)
        dictionary['year'] = int(year)

In [93]:
### ITERATION OVER RUNNERS
# For every (name, birth year) pair, build the nested
# runner -> race -> date -> category dictionary, write it to
# runners/<encoded_name>.json, then write the encoded_name -> "name birth"
# index to runnersnames.json.

# Maximum number of runners to export; None exports every runner.
# NOTE(review): this cell was hard-coded to the first 100 runners — confirm
# whether the full export is intended before raising the cap.
MAX_RUNNERS = 100
RUNNERS_DIR = 'runners'

names_dict = {}
out_dict = {}
# Number of (runner, race, date, distance) combinations matching several rows,
# i.e. two results share name, birth year, race, date and distance.
ambiguous_results = 0

# Make sure the output directory exists so the export works on a fresh checkout.
os.makedirs(RUNNERS_DIR, exist_ok=True)

runner_keys = doubleindex_df.index.unique()[:MAX_RUNNERS]
i_max = len(runner_keys)
# Group once instead of scanning the whole table for every runner;
# sort=False keeps the groups in order of first appearance.
runner_groups = df.groupby(['Name', 'Year'], sort=False)

i = 0
for i, (runner, birth) in enumerate(runner_keys, start=1):
    sub_df = runner_groups.get_group((runner, birth))
    runner_dict = {
        'name': runner,
        'birth': int(birth),  # built-in int so json can serialise it
        'sex': sub_df.Sex.unique()[0],
    }
    race_wrapper = {}

    for race in sub_df.Race.unique():
        subsub_df = sub_df[sub_df.Race == race]
        # Location and coordinates come from this race's rows, not from the
        # runner's first race.
        # NOTE(review): missing values are written as NaN, which json.dump emits
        # as the non-standard token NaN — confirm the consumer accepts it.
        race_dict = {
            'race': race,
            'location': subsub_df.location.unique()[0],
            'latitude': float(subsub_df.latitude.unique()[0]),
            'longitude': float(subsub_df.longitude.unique()[0]),
        }
        date_wrapper = {}

        for date in subsub_df.Date.unique():
            subsubsub_df = subsub_df[subsub_df.Date == date]
            date_dict = {}
            # Date parts and living place come from this edition's rows only.
            # For some dates the metadata is missing and fill_date has to
            # compute it from the raw date string.
            fill_date(subsubsub_df, date_dict)
            # TODO: weather !
            # TODO: total number of runners !
            date_dict['livingplace'] = subsubsub_df.LivingPlace.unique()[0]
            cat_wrapper = {}

            for category in subsubsub_df.Distance.unique():
                lastsub_df = subsubsub_df[subsubsub_df.Distance == category]
                if lastsub_df.shape[0] != 1:
                    # Two results share name, birth year, race, date and
                    # distance. Keep the last distinct values (as before) and
                    # count the case, but keep processing the other distances.
                    ambiguous_results += 1
                # Values are read from the rows of this distance only.
                # NOTE(review): the unit of int(Pace) depends on the dtype of
                # the Pace column (not visible here) — confirm.
                cat_wrapper[int(category)] = {
                    'distance': int(category),
                    'rank': int(lastsub_df.Rank.unique()[-1]),
                    'time': float(lastsub_df['time'].unique()[-1]),
                    'pace': int(lastsub_df.Pace.unique()[-1]),
                }

            date_dict['categories'] = cat_wrapper
            date_wrapper[date] = date_dict

        race_dict['date'] = date_wrapper
        race_wrapper[race] = race_dict

    runner_dict['races'] = race_wrapper
    runner_id = runner+' '+str(birth)
    # File-system friendly key: ASCII-fold, lower-case, keep only letters and digits.
    encoded_name = re.sub('[^0-9a-zA-Z]+', '', unidecode(runner_id).lower())
    names_dict[encoded_name] = runner_id
    out_dict[encoded_name] = runner_dict

    with open(os.path.join(RUNNERS_DIR, encoded_name + '.json'), 'w') as out_file:
        json.dump(runner_dict, out_file)

    if i%1000 == 0:
        print(i,'runners out of',i_max,'have been analysed.')
print('All done.')
if ambiguous_results:
    print(ambiguous_results, 'results were ambiguous (same name, birth year, race, date and distance).')

with open('runnersnames.json', 'w') as out_file:
    json.dump(names_dict, out_file)
print('Files saved.')

All done.
Files saved.
