# Data aggregation for runners data visualization
**Warning:** We assume that two records sharing the same name (first name and family name) and the same birth year refer to one and the same person.

## Load data

In [1]:
import numpy as np
import pandas as pd
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_context('notebook')

In [2]:
# Reading the CSV straight from its URL also works, but it takes a while:
#raw_df = pd.read_csv('https://www.dropbox.com/s/tt9z5bik6uqndbz/full_database.csv?dl=1')
# NOTE(review): absolute, machine-specific path; point it at your local copy of full_database.csv.
raw_df = pd.read_csv('/home/ondine/Desktop/ADA/full_database.csv')

In [3]:
raw_df.head()

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,Pace,len_name
0,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abächerli Walter,M,1952,Hinwil,47,"4:31.56,1","0:53.5,3","0:6.26,0",2
1,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abbringh Ellen,F,1962,NL-Doorn 3941 EB,91,"5:55.9,4","2:12.11,6","0:8.25,0",2
2,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abegglen Eddy,M,1954,Mürren,424,"5:45.21,9","2:20.33,8","0:8.11,0",2
3,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.328675,Abosa Emebet,F,1974,Zuoz,1,"3:21.46,1",False,"0:4.46,0",2
4,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abplanalp Michel,M,1960,Auvernier,143,"4:30.26,9","1:7.35,3","0:6.24,0",2


In [4]:
# Some date strings carry stray '+' / 'bis' markers; strip them out
def _strip_date_markers(raw_date):
    """Return `raw_date` with every '+' and every 'bis' substring removed."""
    without_plus = raw_date.replace('+', '')
    return without_plus.replace('bis', '')

raw_df['Date'] = raw_df['Date'].apply(_strip_date_markers)

### Extra info

In [5]:
# Per-race metadata (location, weather, calendar fields); the 'url' column is dropped
races_info = pd.read_csv('../datasets/races-information.csv', index_col=0)
races_info = races_info.drop('url', axis=1)
races_info.head()

Unnamed: 0,date,name,location,min_temp,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year
0,sam. 27.03.1999,Männedörfler Waldlauf,Männedorf,,,,,47.2574625,8.6946733,saturday,27,3,1999
1,sam. 20.03.1999,Kerzerslauf,Kerzers,,,,,46.97488999999999,7.1954365,saturday,20,3,1999
2,sam. 24.04.1999,Luzerner Stadtlauf,Luzern,,,,,47.05016819999999,8.3093072,saturday,24,4,1999
3,sam. 24.04.1999,20km de Lausanne,Lausanne,,,,,46.5196535,6.6322734,saturday,24,4,1999
4,sam. 24.04.1999,"Chäsitzerlouf, Kehrsatz",Kehrsatz,,,,,,,saturday,24,4,1999


In [6]:
# Attach the race metadata to every result row, matching on race name and date.
# It is a left join, so results without matching metadata are kept (with NaN metadata).
df = raw_df.merge(
    races_info,
    how='left',
    left_on=['Race', 'Date'],
    right_on=['name', 'date'],
)
# The right-hand join keys duplicate 'Race' / 'Date'
df = df.drop(['date', 'name'], axis=1)
print(df.shape)
df.columns

(1281195, 23)


Index(['Race', 'Date', 'Distance', 'Name', 'Sex', 'Year', 'LivingPlace',
       'Rank', 'Time', 'Delay', 'Pace', 'len_name', 'location', 'min_temp',
       'max_temp', 'uv_index', 'weather_desc', 'latitude', 'longitude',
       'weekday', 'day', 'month', 'year'],
      dtype='object')

In [7]:
df.head()

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,...,min_temp,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year
0,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abächerli Walter,M,1952,Hinwil,47,"4:31.56,1","0:53.5,3",...,,,,,,,saturday,6.0,9.0,2003.0
1,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abbringh Ellen,F,1962,NL-Doorn 3941 EB,91,"5:55.9,4","2:12.11,6",...,,,,,,,saturday,6.0,9.0,2003.0
2,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abegglen Eddy,M,1954,Mürren,424,"5:45.21,9","2:20.33,8",...,,,,,,,saturday,6.0,9.0,2003.0
3,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.328675,Abosa Emebet,F,1974,Zuoz,1,"3:21.46,1",False,...,,,,,,,saturday,6.0,9.0,2003.0
4,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abplanalp Michel,M,1960,Auvernier,143,"4:30.26,9","1:7.35,3",...,,,,,,,saturday,6.0,9.0,2003.0


### Processing - cleaning

In [8]:
# Coordinates as floats, distances rounded to the nearest integer
for coord in ('latitude', 'longitude'):
    df[coord] = df[coord].apply(float)
df['Distance'] = df['Distance'].apply(round)

In [9]:
def clean_name(x):
    """Return `x` with every forward slash and backslash replaced by a space."""
    slash_to_space = str.maketrans({'/': ' ', '\\': ' '})
    return x.translate(slash_to_space)

# Race names must not contain slashes or backslashes
df['Race'] = df['Race'].map(clean_name)

In [10]:
def time_to_seconds(x):
    """Convert a time string such as '4:31.56,1' (hours:minutes.seconds,fraction) to whole seconds.

    The fractional part after the comma is ignored.
    """
    # Hours come before the first ':'
    hour_text, *after_colon = x.split(':')
    total = 3600 * int(hour_text)
    # Minutes come before the first '.' of the remainder
    minute_text, *after_dot = after_colon[0].split('.')
    total += 60 * int(minute_text)
    # Seconds come before the ',' that introduces the fraction
    total += int(after_dot[0].split(',')[0])
    return total

def seconds_to_time(x):
    """Format a number of seconds as 'hours:minutes:seconds' (no zero padding)."""
    total_minutes, seconds = divmod(x, 60)
    hours_and_minutes = divmod(total_minutes, 60)
    return '{}:{}:{}'.format(*(hours_and_minutes + (seconds,)))

In [11]:
# Finish time in whole seconds, next to the original 'Time' string
df['time'] = df['Time'].map(time_to_seconds)

In [12]:
# Index the results by the (name, birth year) pair used to identify a runner;
# the index is not expected to be unique since a runner can have several results.
doubleindex_df = df.set_index(keys=['Name', 'Year'])
doubleindex_df.index.is_unique

False

In [13]:
doubleindex_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Race,Date,Distance,Sex,LivingPlace,Rank,Time,Delay,Pace,len_name,...,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year,time
Name,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Abächerli Walter,1952,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42,M,Hinwil,47,"4:31.56,1","0:53.5,3","0:6.26,0",2,...,,,,,,saturday,6.0,9.0,2003.0,16316
Abbringh Ellen,1962,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42,F,NL-Doorn 3941 EB,91,"5:55.9,4","2:12.11,6","0:8.25,0",2,...,,,,,,saturday,6.0,9.0,2003.0,21309
Abegglen Eddy,1954,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42,M,Mürren,424,"5:45.21,9","2:20.33,8","0:8.11,0",2,...,,,,,,saturday,6.0,9.0,2003.0,20721
Abosa Emebet,1974,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42,F,Zuoz,1,"3:21.46,1",False,"0:4.46,0",2,...,,,,,,saturday,6.0,9.0,2003.0,12106
Abplanalp Michel,1960,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42,M,Auvernier,143,"4:30.26,9","1:7.35,3","0:6.24,0",2,...,,,,,,saturday,6.0,9.0,2003.0,16226


In [14]:
doubleindex_df.columns

Index(['Race', 'Date', 'Distance', 'Sex', 'LivingPlace', 'Rank', 'Time',
       'Delay', 'Pace', 'len_name', 'location', 'min_temp', 'max_temp',
       'uv_index', 'weather_desc', 'latitude', 'longitude', 'weekday', 'day',
       'month', 'year', 'time'],
      dtype='object')

In [15]:
df.columns

Index(['Race', 'Date', 'Distance', 'Name', 'Sex', 'Year', 'LivingPlace',
       'Rank', 'Time', 'Delay', 'Pace', 'len_name', 'location', 'min_temp',
       'max_temp', 'uv_index', 'weather_desc', 'latitude', 'longitude',
       'weekday', 'day', 'month', 'year', 'time'],
      dtype='object')

## Detailed aggregation by race

Steps :
* Build an out_dict which contains (hierarchically) the data of every runner, for every race. At the same time, build another dictionary, names_dict, that maps each runner's 'encoding' (used for the JSON file names) to their full name and birth year.

```
out_dict = {
    encoded_name_1 : {
        'name': Name,
        'birth': Year,
        'sex': Sex,
        'races': {
            'race_1': {
                'race': Race,
                'location': location,
                'latitude': latitude,
                'longitude': longitude,
                'date': {'date_1': {
                            'weekday': weekday,
                            'startday': day,
                            'month': month,
                            'year': year,
                            'livingplace': LivingPlace,
                            'category': {
                                'category_1': {
                                    'distance': Distance, 
                                    'day' : {
                                        'day_1': {
                                            'rank': Rank, 
                                            'time': Time, 
                                            'delay': Delay, 
                                            'pace': Pace
                                         },
                                         ...    # 'day_2', etc.
                                       }
                                    },
                                ...             # 'category_2', etc.
                                }
                          },
                          ...                   # 'date_2', etc.
                },
            }, 
            ...                                 # 'race_2', etc.
        }, 
    ...                                         # 'encoded_name_2', etc.
}

names_dict = {
    encoded_name_1 : name_1,
    encoded_name_2 : name_2,
    ...
}
```

* Export names_dict to a JSON file.
* For each runner's encoded_name, export out_dict[encoded_name] to a JSON file.

In [16]:
###### HELPERS ######
# French weekday abbreviations (the prefix of the 'Date' strings) -> English weekday names
week_dict = {
    'lun': 'monday',
    'mar': 'tuesday',
    'mer': 'wednesday',
    'jeu': 'thursday',
    'ven': 'friday',
    'sam': 'saturday',
    'dim': 'sunday'
}

def fill_date(dataframe, dictionary):
    """Write the calendar fields of a race date into `dictionary`.

    The keys 'weekday', 'startday', 'month' and 'year' are set from the first
    row of `dataframe`. The 'weekday', 'day', 'month' and 'year' columns are
    used when all four are available; otherwise the fields are parsed from the
    'Date' string, which looks like 'sam. 06.09.2003'.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Non-empty frame with the columns 'Date', 'weekday', 'day', 'month'
        and 'year'.
    dictionary : dict
        Updated in place.

    Raises
    ------
    IndexError
        If `dataframe` is empty.
    KeyError
        If the date has to be parsed and its weekday prefix is not in `week_dict`.
    """
    weekday = dataframe['weekday'].iloc[0]
    day = dataframe['day'].iloc[0]
    # Bug fix: the month used to be read from the 'day' column
    month = dataframe['month'].iloc[0]
    year = dataframe['year'].iloc[0]
    if pd.isnull(weekday) or pd.isnull(day) or pd.isnull(month) or pd.isnull(year):
        # No usable metadata for this date: parse '<weekday>. <day>.<month>.<year>'
        parts = [part.strip() for part in dataframe['Date'].iloc[0].split('.')]
        dictionary['weekday'] = week_dict[parts[0]]
        dictionary['startday'] = int(parts[1])
        dictionary['month'] = int(parts[2])
        dictionary['year'] = int(parts[3])
    else:
        dictionary['weekday'] = weekday
        # The metadata columns hold floats (e.g. 6.0) after the merge
        dictionary['startday'] = int(day)
        dictionary['month'] = int(month)
        dictionary['year'] = int(year)

In [None]:
### ITERATION OVER RUNNERS
# Builds out_dict (one nested dictionary per runner) and names_dict
# (encoded name -> 'Name Year'). Every level of the hierarchy reads its values
# from the rows of that level only (this race, this date, this distance, this
# result); previously several fields were read from a parent-level frame and
# therefore always repeated the first (or last) value found for the runner.

def _build_days(category_df):
    """Return {'1': {...}, '2': {...}, ...}: one entry per distinct 'Time' in `category_df`."""
    day_wrapper = {}
    for day_nb, (_, day_df) in enumerate(category_df.groupby('Time', sort=False), start=1):
        # All four fields come from the same row (the last one if several rows share this time)
        day_wrapper[str(day_nb)] = {
            'rank': day_df['Rank'].iloc[-1],
            'time': day_df['Time'].iloc[-1],
            'delay': day_df['Delay'].iloc[-1],
            'pace': day_df['Pace'].iloc[-1],
        }

        if day_df.shape[0] != 1:
            print('Two runners have the same name, same birth year and run in the same race,',
                  'the same distance, the same day.')
            print(day_df)
            print()
            print()
            # NOTE(review): the remaining times of this category are skipped after a clash,
            # as before; confirm this is intended.
            break
    return day_wrapper


def _build_categories(date_df):
    """Return one entry per distance run on that date, keyed by the distance."""
    cat_wrapper = {}
    for category, category_df in date_df.groupby('Distance', sort=False):
        cat_wrapper[category] = {
            'distance': category,
            'day': _build_days(category_df),
        }
    return cat_wrapper


def _build_dates(race_df):
    """Return one entry per edition (date) of the race, keyed by the 'Date' string."""
    date_wrapper = {}
    for date, date_df in race_df.groupby('Date', sort=False):
        date_dict = {}
        # Note that for some dates, we don't already have this info and have to compute it
        fill_date(date_df, date_dict)
        # TODO: weather !
        # TODO: total number of runners !
        date_dict['livingplace'] = date_df['LivingPlace'].iloc[0]
        date_dict['category'] = _build_categories(date_df)
        date_wrapper[date] = date_dict
    return date_wrapper


def _build_races(runner_df):
    """Return one entry per race the runner took part in, keyed by the race name."""
    race_wrapper = {}
    for race, race_df in runner_df.groupby('Race', sort=False):
        race_wrapper[race] = {
            'race': race,
            'location': race_df['location'].iloc[0],
            'latitude': race_df['latitude'].iloc[0],
            'longitude': race_df['longitude'].iloc[0],
            'date': _build_dates(race_df),
        }
    return race_wrapper


names_dict = {}
out_dict = {}

# One groupby pass instead of re-filtering the full frame once per runner.
# sort=False keeps the runners in order of first appearance.
runner_groups = df.groupby(['Name', 'Year'], sort=False)

i = 0
i_max = runner_groups.ngroups

for i, ((runner, birth), runner_df) in enumerate(runner_groups, start=1):
    runner_id = runner+' '+str(birth)
    # NOTE(review): every character outside [0-9a-zA-Z] is removed (accented letters
    # included), so two different runners can end up with the same encoded name and
    # the later one silently replaces the earlier one in both dictionaries.
    encoded_name = re.sub('[^0-9a-zA-Z]+', '', runner_id.lower())
    names_dict[encoded_name] = runner_id
    out_dict[encoded_name] = {
        'name': runner,
        'birth': birth,
        'sex': runner_df['Sex'].iloc[0],
        'races': _build_races(runner_df),
    }

    if i%100 == 0:
        print(i,'runners out of',i_max,'have been analysed.')
        print()
        print()
print('All done.')

100 runners out of 463331 have been analysed.


Two runners have the same name, same birth year and run in the same race, the same distance, the same day.
                                 Race             Date  Distance  \
735757  Jungfrau-Marathon, Interlaken  sam. 08.09.2012        42   
735758  Jungfrau-Marathon, Interlaken  sam. 08.09.2012        42   

                     Name Sex  Year       LivingPlace  Rank       Time  \
735757  Baldini Giovanni    M  1964   IT-Viterbo (VT)   146  5:31.29,5   
735758  Baldini Giovanni    M  1964   IT-Viterbo (VT)   589  5:31.29,5   

            Delay  ...   max_temp  uv_index weather_desc  latitude  longitude  \
735757  1:52.41,3  ...       21.0       0.0        Clear       NaN        NaN   
735758   2:5.34,8  ...       21.0       0.0        Clear       NaN        NaN   

         weekday  day  month    year   time  
735757  saturday  8.0    9.0  2012.0  19889  
735758  saturday  8.0    9.0  2012.0  19889  

[2 rows x 24 columns]


Two runner