# Data aggregation for races visualization
In this notebook, we create the JSON files used by the website that visualizes the data. Each JSON file corresponds to a specific **race** and contains all the necessary information about it. For more details about how these JSON files are created, see [here](#Detailed-aggregation-by-race). 

* [Load data](#Load-data)
* [Clean data](#Clean-data)
* [Detailed aggregation by race](#Detailed-aggregation-by-race)

# Load data

In [1]:
import numpy as np
import pandas as pd
import json
import re
import matplotlib.pyplot as plt
from datetime import timedelta
import seaborn as sns
%matplotlib inline
sns.set_context('notebook')

## Load information about each race

In [2]:
# Load the runner-level results table into `raw_df`.
#
# Note: pandas can also load a remote CSV file directly from its URL, but it is a bit long.

# Old version (CSV):
# URL : https://www.dropbox.com/s/tt9z5bik6uqndbz/full_database.csv?dl=0
#raw_df = pd.read_csv('/Users/maximepeschard/Downloads/full_database.csv')

# New version (pickle):
# URL : https://drive.google.com/file/d/0BypxDaHZHjhfNG9qbHA0NGJpbU0/view?usp=sharing
#raw_df = pd.read_pickle('../datasets/df_userID.pickle')
#
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# obtained from a trusted source. The relative path below presumably points at a
# local download of the file linked above -- confirm before running.
raw_df = pd.read_pickle('../../df_userID.pickle')

In [3]:
# Dimensions of the raw table as (rows, columns).
raw_df.shape

(1648676, 19)

In [4]:
# Preview the first rows of the raw table.
raw_df.head()

Unnamed: 0,Race,Date,RaceYear,RaceMonth,Category,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Pace,Place,MinTemp,MaxTemp,Weather,RaceID,UserID
0,Kerzerslauf,sam. 18.03.2000,2000,3,M20,15.0,Abgottspon Peter,M,1974.0,Zermatt,233,01:02:25,00:04:09,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers,Abgottspon Peter 1974.0
1,Kerzerslauf,sam. 18.03.2000,2000,3,M35,15.0,Abplanalp Michael,M,1964.0,Bern,32,00:55:11.700000,00:03:40,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers,Abplanalp Michael 1964.0
2,Kerzerslauf,sam. 18.03.2000,2000,3,M50,15.0,Abt Werner,M,1947.0,Spiez,155,01:12:42.900000,00:04:50,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers,Abt Werner 1947.0
3,Kerzerslauf,sam. 18.03.2000,2000,3,F45,15.0,Ackermann Antoinette,F,1953.0,Alterswil,48,01:22:36.700000,00:05:30,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers,Ackermann Antoinette 1953.0
4,Kerzerslauf,sam. 18.03.2000,2000,3,F50,15.0,Ackermann Hedy,F,1946.0,Alterswil,42,01:23:29.300000,00:05:33,Kerzers,,,,http://services.datasport.com/2000/lauf/kerzers,Ackermann Hedy 1946.0


In [5]:
# Small hotfix for 'weird' dates: remove every '+' and every 'bis' from the date strings.
def _strip_date_noise(date_text):
    """Return `date_text` with all '+' and 'bis' fragments removed."""
    for fragment in ('+', 'bis'):
        date_text = date_text.replace(fragment, '')
    return date_text

raw_df['Date'] = raw_df['Date'].apply(_strip_date_noise)

## Load extra information about the races

In [6]:
# Read the extra race information; the first CSV column becomes the index
# and the 'url' column is discarded.
races_info_path = '../datasets/races-information.csv'
races_info = pd.read_csv(races_info_path, index_col=0)
races_info = races_info.drop('url', axis=1)
races_info.head()

Unnamed: 0,date,name,location,min_temp,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year
0,sam. 27.03.1999,Männedörfler Waldlauf,Männedorf,,,,,47.257463,8.694673,saturday,27,3,1999
1,sam. 20.03.1999,Kerzerslauf,Kerzers,,,,,46.97489,7.195437,saturday,20,3,1999
2,sam. 24.04.1999,Luzerner Stadtlauf,Luzern,,,,,47.050168,8.309307,saturday,24,4,1999
3,sam. 24.04.1999,20km de Lausanne,Lausanne,,,,,46.519653,6.632273,saturday,24,4,1999
4,sam. 24.04.1999,"Chäsitzerlouf, Kehrsatz",Kehrsatz,,,,,,,saturday,24,4,1999


## Merge both tables

In [7]:
# Attach the extra race information to every result row. The join keys are
# (Race, Date) on the results side and (name, date) on the race-information
# side; the right-hand key columns and 'location' are dropped afterwards.
df = pd.merge(
    raw_df,
    races_info,
    how='left',
    left_on=['Race', 'Date'],
    right_on=['name', 'date'],
).drop(['date', 'name', 'location'], axis=1)

# A left join emits one output row per matching right-hand row, so duplicated
# (name, date) pairs in `races_info` would silently duplicate results and
# inflate every count computed later. Fail loudly instead.
assert len(df) == len(raw_df), (
    'merge changed the number of rows: {} -> {}; '
    'check races_info for duplicated (name, date) pairs'.format(len(raw_df), len(df))
)

print(df.shape)
df.head()

(1648676, 29)


Unnamed: 0,Race,Date,RaceYear,RaceMonth,Category,Distance,Name,Sex,Year,LivingPlace,...,min_temp,max_temp,uv_index,weather_desc,latitude,longitude,weekday,day,month,year
0,Kerzerslauf,sam. 18.03.2000,2000,3,M20,15.0,Abgottspon Peter,M,1974.0,Zermatt,...,,,,,46.97489,7.195437,saturday,18,3,2000
1,Kerzerslauf,sam. 18.03.2000,2000,3,M35,15.0,Abplanalp Michael,M,1964.0,Bern,...,,,,,46.97489,7.195437,saturday,18,3,2000
2,Kerzerslauf,sam. 18.03.2000,2000,3,M50,15.0,Abt Werner,M,1947.0,Spiez,...,,,,,46.97489,7.195437,saturday,18,3,2000
3,Kerzerslauf,sam. 18.03.2000,2000,3,F45,15.0,Ackermann Antoinette,F,1953.0,Alterswil,...,,,,,46.97489,7.195437,saturday,18,3,2000
4,Kerzerslauf,sam. 18.03.2000,2000,3,F50,15.0,Ackermann Hedy,F,1946.0,Alterswil,...,,,,,46.97489,7.195437,saturday,18,3,2000


# Clean data

## Convert types

In [8]:
# Cast both coordinate columns element by element with float().
for coord_col in ('latitude', 'longitude'):
    df[coord_col] = df[coord_col].apply(float)

## Round distances

In [9]:
# Replace every distance by the result of Python's built-in round().
df['Distance'] = df['Distance'].apply(round)

## Clean race names

In [10]:
def clean_name(x):
    """Return `x` with every forward slash and backslash replaced by a space."""
    slashes_to_spaces = str.maketrans({'/': ' ', '\\': ' '})
    return x.translate(slashes_to_spaces)

# Apply the slash clean-up to every race name.
df['Race'] = df['Race'].apply(clean_name)

## Convert time string to seconds (ignore further precision)

In [11]:
def time_to_seconds(x):
    """Convert a time string to a whole number of seconds.

    The expected layout is ``H:MM.SS,f``: hours before the colon, minutes
    before the dot and seconds before the comma. Whatever follows the comma
    is ignored. The hours field is optional: ``MM.SS,f`` (no colon) is read
    as zero hours.

    Args:
        x: the time string to convert.

    Returns:
        int: ``hours*3600 + minutes*60 + seconds``.

    Raises:
        ValueError: if a field is not a valid integer.
        IndexError: if the dot separating minutes from seconds is missing.
    """
    colon_parts = x.split(':')
    if len(colon_parts) == 1:
        # No colon means no hours field: the whole string is 'MM.SS,f'.
        hours = 0
        minutes_and_seconds = colon_parts[0]
    else:
        hours = int(colon_parts[0])
        minutes_and_seconds = colon_parts[1]

    dot_parts = minutes_and_seconds.split('.')
    minutes = int(dot_parts[0])
    # Keep only the digits before the comma, dropping the sub-second part.
    seconds = int(dot_parts[1].split(',')[0])
    return hours*3600 + minutes*60 + seconds

def seconds_to_time(x):
    """Format a number of seconds as 'hours:minutes:seconds'.

    The three components are written as-is, without zero-padding
    (3725 gives '1:2:5').
    """
    total_minutes, secs = divmod(x, 60)
    hrs, mins = divmod(total_minutes, 60)
    return ':'.join(str(component) for component in (hrs, mins, secs))

In [24]:
# Add lowercase 'time' and 'pace' columns holding the total number of seconds
# of the corresponding 'Time' and 'Pace' values.
for seconds_col, source_col in (('time', 'Time'), ('pace', 'Pace')):
    df[seconds_col] = df[source_col].apply(timedelta.total_seconds)

In [25]:
# List all column names of the merged table.
df.columns

Index(['Race', 'Date', 'RaceYear', 'RaceMonth', 'Category', 'Distance', 'Name',
       'Sex', 'Year', 'LivingPlace', 'Rank', 'Time', 'Pace', 'Place',
       'MinTemp', 'MaxTemp', 'Weather', 'RaceID', 'UserID', 'min_temp',
       'max_temp', 'uv_index', 'weather_desc', 'latitude', 'longitude',
       'weekday', 'day', 'month', 'year', 'time', 'pace'],
      dtype='object')

## Add a race+date index

In [26]:
# Label each row with '<race> ; <date>'.
df['race_index'] = df['Race'] + ' ; ' + df['Date']

## Final global dataframe

In [27]:
# Preview the last rows of the final table.
df.tail()

Unnamed: 0,Race,Date,RaceYear,RaceMonth,Category,Distance,Name,Sex,Year,LivingPlace,...,weather_desc,latitude,longitude,weekday,day,month,year,time,pace,race_index
1648671,"Course de l'Escalade, Genève",sam. 05.12.2015,2015,12,Esc-Hom4,7,Zwahlen Guy,M,1959.0,Geneve,...,Sunny,46.204391,6.143158,saturday,5,12,2015,2545.8,347.0,"Course de l'Escalade, Genève ; sam. 05.12.2015"
1648672,"Course de l'Escalade, Genève",sam. 05.12.2015,2015,12,Esc-Hom3,7,Zwahlen Laurent,M,1967.0,Concise,...,Sunny,46.204391,6.143158,saturday,5,12,2015,2263.4,309.0,"Course de l'Escalade, Genève ; sam. 05.12.2015"
1648673,"Course de l'Escalade, Genève",sam. 05.12.2015,2015,12,Esc-Hom1,7,Zweigart Benjamin,M,1991.0,Confignon,...,Sunny,46.204391,6.143158,saturday,5,12,2015,1868.2,255.0,"Course de l'Escalade, Genève ; sam. 05.12.2015"
1648674,"Course de l'Escalade, Genève",sam. 05.12.2015,2015,12,Mix3-H,7,Zwicky Pierre,M,1963.0,Geneve,...,Sunny,46.204391,6.143158,saturday,5,12,2015,2702.1,368.0,"Course de l'Escalade, Genève ; sam. 05.12.2015"
1648675,"Course de l'Escalade, Genève",sam. 05.12.2015,2015,12,EcoM-B10,2,Zwimpfer Maxime,M,2005.0,Vandoeuvres,...,Sunny,46.204391,6.143158,saturday,5,12,2015,667.7,285.0,"Course de l'Escalade, Genève ; sam. 05.12.2015"


# Detailed aggregation by race

Steps :

* Build an `out_dict` which contains (hierarchically) the data for all races, for all dates, for all distances. At the same time, build another dictionary that maps full names to their 'encodings' (used for JSON file names).

```
out_dict = {
    encoded_name_1 : {
        'name': name,
        'longitude': longitude,
        'latitude': latitude,
        'dates': {
            'date_1': {
                'weekday': weekday,
                ...
                'distances': {
                    'distance_1': {
                        'count_men': count_men,
                        'count_women': count_women,
                        'times': [x1, ..., xN]
                    },
                    ...
                },
                'categories': {
                    'category_1': {
                        'count': count,
                        'times': [x1, ..., xM],
                        'distance': Distance
                    },
                    ...
                }
            },
            ...
        }
    },
    ...
}

names_dict = {
    encoded_name_1 : name_1,
    encoded_name_2 : name_2,
    ...
}
```

* Export `names_dict` to a JSON file.
* For each race name `encoded_name`, export `out_dict[encoded_name]` to a JSON file.

The files are written directly into a local checkout of the website repository, [hopsuisse.github.io](https://github.com/hopsuisse/hopsuisse.github.io).

## Helpers and main loop to build the JSON files.

In [31]:
###### HELPERS ######
# French weekday abbreviation (the text before the first '.' of a `Date`
# string such as 'sam. 18.03.2000') -> English weekday name.
week_dict = {
    'lun': 'monday',
    'mar': 'tuesday',
    'mer': 'wednesday',
    'jeu': 'thursday',
    'ven': 'friday',
    'sam': 'saturday',
    'dim': 'sunday'
}

# Get date attributes from the dataframe and fill dictionary
def fill_date(dataframe, dictionary):
    """Store 'weekday', 'day', 'month' and 'year' in `dictionary`.

    The first value of the `weekday`, `day`, `month` and `year` columns is
    used when all four are present. If any of them is null, all four are
    instead parsed from the first row's `Date` string, which is split on '.'
    into weekday abbreviation, day, month and year.

    Args:
        dataframe: frame with `weekday`, `day`, `month`, `year` and `Date`
            columns.
        dictionary: dict updated in place; nothing is returned.

    Raises:
        KeyError: if the weekday abbreviation is not a key of `week_dict`.
        IndexError: if the `Date` string has fewer than four '.'-separated parts.
        ValueError: if day, month or year is not a valid integer.
    """
    weekday = dataframe.weekday.unique()[0]
    day = dataframe.day.unique()[0]
    month = dataframe.month.unique()[0]
    year = dataframe.year.unique()[0]
    if pd.isnull(weekday) or pd.isnull(day) or pd.isnull(month) or pd.isnull(year):
        # Only the first row's date is needed, so parse that single string
        # rather than transforming the whole column once per attribute.
        parts = dataframe.Date.iloc[0].split('.')
        dictionary['weekday'] = week_dict[parts[0].strip()]
        dictionary['day'] = int(parts[1].strip())
        dictionary['month'] = int(parts[2].strip())
        dictionary['year'] = int(parts[3].strip())
    else:
        dictionary['weekday'] = weekday
        # int() turns float/numpy values into plain ints.
        dictionary['day'] = int(day)
        dictionary['month'] = int(month)
        dictionary['year'] = int(year)

# Get weather attributes (if any) from the dataframe and fill dictionary
def fill_weather(dataframe, dictionary):
    """Store 'weather', 'min_temp', 'max_temp' and 'uv_index' in `dictionary`.

    The first value of the `weather_desc`, `min_temp`, `max_temp` and
    `uv_index` columns is read. If any of the four is null, `dictionary` is
    left untouched; otherwise all four are stored, the numeric ones as floats.
    """
    first_values = {
        'weather': dataframe.weather_desc.unique()[0],
        'min_temp': dataframe.min_temp.unique()[0],
        'max_temp': dataframe.max_temp.unique()[0],
        'uv_index': dataframe.uv_index.unique()[0],
    }
    # All-or-nothing: a single missing attribute skips the whole weather block.
    if any(pd.isnull(value) for value in first_values.values()):
        return

    dictionary['weather'] = first_values['weather']
    for numeric_key in ('min_temp', 'max_temp', 'uv_index'):
        dictionary[numeric_key] = float(first_values[numeric_key])

def get_count(dataframe, column, value):
    """Return how many rows of `dataframe` have `value` in `column`, as an int.

    A KeyError raised while looking up the column or the value yields 0.
    """
    try:
        frequencies = dataframe[column].value_counts()
        occurrences = frequencies[value]
    except KeyError:
        return 0
    return int(occurrences)


###### ITERATION OVER RACES ######
# Build two dictionaries keyed by the encoded race name:
#   names_dict: encoded name -> full race name
#   out_dict:   encoded name -> nested data for that race
#               (race -> dates -> distances / categories)
names_dict = {}
out_dict = {}

# Computed once: the list of races and its length do not change while iterating.
race_names = df.Race.unique()
race_total = len(race_names)

for race_number, race_name in enumerate(race_names, start=1):
    race_df = df[df.Race == race_name]

    race_dict = {}
    race_dict['name'] = race_name
    # Coordinates are taken from the race's first row and stored only when
    # both are available.
    latitude = race_df.latitude.unique()[0]
    longitude = race_df.longitude.unique()[0]
    if not (pd.isnull(latitude) or pd.isnull(longitude)):
        race_dict['latitude'] = float(latitude)
        race_dict['longitude'] = float(longitude)

    date_wrapper = {}
    for race_date in race_df.Date.unique():
        date_df = race_df[race_df.Date == race_date]

        date_dict = {}
        fill_date(date_df, date_dict)
        fill_weather(date_df, date_dict)
        date_dict['count_men'] = get_count(date_df, 'Sex', 'M')
        date_dict['count_women'] = get_count(date_df, 'Sex', 'F')

        # Per-distance statistics for this date.
        dist_wrapper = {}
        for race_dist in date_df.Distance.unique():
            dist_df = date_df[date_df.Distance == race_dist]
            dist_wrapper[str(race_dist)] = {
                'count_men': get_count(dist_df, 'Sex', 'M'),
                'count_women': get_count(dist_df, 'Sex', 'F'),
                'times': [float(t) for t in dist_df.time.tolist()],
            }
        date_dict['distances'] = dist_wrapper

        # Per-category statistics for this date.
        cat_wrapper = {}
        for race_cat in date_df.Category.unique():
            cat_df = date_df[date_df.Category == race_cat]
            cat_wrapper[str(race_cat)] = {
                # The count is the highest rank recorded in the category,
                # not the number of rows.
                'count': int(max(cat_df.Rank)),
                'times': [float(t) for t in cat_df.time.tolist()],
                # Distance is assumed to be unique, given a category.
                'distance': int(cat_df.Distance.unique()[0]),
            }
        date_dict['categories'] = cat_wrapper

        date_wrapper[race_date] = date_dict
    race_dict['dates'] = date_wrapper

    # The encoded name keeps only lowercase ASCII letters and digits, so two
    # different race names can map to the same key.
    encoded_name = re.sub('[^0-9a-zA-Z]+', '', race_name.lower())
    if encoded_name in out_dict:
        print('Warning:', repr(race_name), 'and', repr(names_dict[encoded_name]),
              'share the encoded name', repr(encoded_name),
              '- the earlier race is overwritten.')
    names_dict[encoded_name] = race_name
    out_dict[encoded_name] = race_dict

    if race_number % 50 == 0:
        print(race_number, 'races out of', race_total, 'have been analysed.')

print('All done.')

50 races out of 223 have been analysed.
100 races out of 223 have been analysed.
150 races out of 223 have been analysed.
200 races out of 223 have been analysed.
All done.


### Export JSON files

In [32]:
# Write the encoded-name -> race-name mapping as a single JSON file.
# (An earlier version targeted '../../website/_data/racesnames.json'.)
names_path = '../../hopsuisse.github.io/_data/racesnames.json'
with open(names_path, 'w') as names_file:
    json.dump(names_dict, names_file)

In [33]:
# Write one JSON file per race, named after the race's encoded name.
# (An earlier version targeted the '../../website/racedata/' directory.)
for encoded_name, race_payload in out_dict.items():
    race_path = '../../hopsuisse.github.io/racedata/' + encoded_name + '.json'
    with open(race_path, 'w') as race_file:
        json.dump(race_payload, race_file)