# Data aggregation for future visualization

* [Load data](#Load-data)
* [Aggregation by race](#Aggregation-by-race)
* [Aggregation by name](#Aggregation-by-name)

## Load data

In [1]:
import pandas as pd

In [2]:
# Change to full dataset local path
df = pd.read_csv('/Users/maximepeschard/Downloads/full_database.csv')

In [3]:
df.shape

(1281195, 12)

In [4]:
df.head()

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,Pace,len_name
0,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abächerli Walter,M,1952,Hinwil,47,"4:31.56,1","0:53.5,3","0:6.26,0",2
1,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abbringh Ellen,F,1962,NL-Doorn 3941 EB,91,"5:55.9,4","2:12.11,6","0:8.25,0",2
2,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abegglen Eddy,M,1954,Mürren,424,"5:45.21,9","2:20.33,8","0:8.11,0",2
3,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.328675,Abosa Emebet,F,1974,Zuoz,1,"3:21.46,1",False,"0:4.46,0",2
4,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abplanalp Michel,M,1960,Auvernier,143,"4:30.26,9","1:7.35,3","0:6.24,0",2


## Convert time string to seconds (ignore further precision)

In [5]:
def time_to_seconds(x):
    # Get hours
    split = x.split(':')
    hours = int(split[0])
    # Get minutes
    split = split[1].split('.')
    minutes = int(split[0])
    # Get seconds
    split = split[1].split(',')
    seconds = int(split[0])
    return hours*3600 + minutes*60 + seconds

def seconds_to_time(x):
    minutes, seconds = divmod(x, 60)
    hours, minutes = divmod(minutes, 60)
    return '{}:{}:{}'.format(hours, minutes, seconds)

In [6]:
df['time'] = df.Time.apply(time_to_seconds)

## Add a race+date index

In [7]:
df['race_index'] = df.Race + ' ; ' + df.Date

## Final global dataframe

In [8]:
df.head()

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,Pace,len_name,time,race_index
0,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abächerli Walter,M,1952,Hinwil,47,"4:31.56,1","0:53.5,3","0:6.26,0",2,16316,"Jungfrau-Marathon, Interlaken ; sam. 06.09.2003"
1,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abbringh Ellen,F,1962,NL-Doorn 3941 EB,91,"5:55.9,4","2:12.11,6","0:8.25,0",2,21309,"Jungfrau-Marathon, Interlaken ; sam. 06.09.2003"
2,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abegglen Eddy,M,1954,Mürren,424,"5:45.21,9","2:20.33,8","0:8.11,0",2,20721,"Jungfrau-Marathon, Interlaken ; sam. 06.09.2003"
3,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.328675,Abosa Emebet,F,1974,Zuoz,1,"3:21.46,1",False,"0:4.46,0",2,12106,"Jungfrau-Marathon, Interlaken ; sam. 06.09.2003"
4,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abplanalp Michel,M,1960,Auvernier,143,"4:30.26,9","1:7.35,3","0:6.24,0",2,16226,"Jungfrau-Marathon, Interlaken ; sam. 06.09.2003"


# Aggregation by race

In [9]:
def keep_first(x):
    return x.unique()[0]

def str_count(x, s=''):
    try:
        count = x.value_counts()[s]
    except KeyError:
        count = 0
    return count

aggregations = {
    'Distance': 'max',
    'Name': 'count',
    'Sex': {
        'M': lambda x: str_count(x, s='M'),
        'F': lambda x: str_count(x, s='F')
    },
    'Year': {
        'min_year': 'min', 
        'max_year': 'max',
        'mean_year': 'mean',
        'median_year': 'median'
    },
    'time': {
        'min_time': lambda x: seconds_to_time(x.min()),
        'max_time': lambda x: seconds_to_time(x.max()),
        'mean_time': lambda x: seconds_to_time(x.mean()),
        'median_time': lambda x: seconds_to_time(x.median()),
    }
}

# ---- INTERLUDE ----
#
# Not sure what's best here :
# - groupby 'race_index' and export all data in one huge JSON file (choice 1)
# - groupby ['Race', 'Date'] and export one JSON file by race name (choice 2)
#
# Maybe choice 2 is more practical when working with D3 / viz tools...
# -------------------

races_stats = df.groupby('race_index').agg(aggregations)     # choice 1
# races_stats = df.groupby(['Race', 'Date']).agg(aggregations) # choice 2

races_stats.columns = races_stats.columns.droplevel(0)

In [10]:
races_stats.head()

Unnamed: 0_level_0,max,median_year,mean_year,max_year,min_year,count,median_time,mean_time,min_time,max_time,M,F
race_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10km Schweizer Meisterschaften Lyss ; sam. 02.04.2011,10.0,1965.0,1966.01232,1997,1923,487,0.0:42.0:29.0,0.0:43.0:42.61190965092419,0:22:15,1:15:56,357,130
10km Schweizer Meisterschaften Lyss ; sam. 31.03.2012,10.0,1966.0,1966.77732,1998,1934,485,0.0:41.0:24.0,0.0:42.0:26.39381443298953,0:17:23,1:15:57,370,115
10km de Payerne ; sam. 27.03.2010,10.0,1971.0,1974.335025,2007,1923,591,0.0:38.0:58.0,0.0:35.0:24.844331641285862,0:1:23,1:28:14,403,188
10km de Payerne ; sam. 28.03.2009,10.0,1966.0,1964.978261,1995,1923,368,0.0:41.0:18.0,0.0:43.0:15.14673913043498,0:29:55,1:27:13,288,80
"10km, Payerne ; sam. 10.03.2012",10.0,1973.0,1976.005474,2010,1943,548,0.0:42.0:50.0,0.0:38.0:23.193430656934197,0:1:26,1:14:23,408,140


In [11]:
# Example of JSON export
races_stats[:2].to_json(orient='index')

'{"10km Schweizer Meisterschaften Lyss ; sam. 02.04.2011":{"max":10.0,"median_year":1965.0,"mean_year":1966.0123203285,"max_year":1997,"min_year":1923,"count":487,"median_time":"0.0:42.0:29.0","mean_time":"0.0:43.0:42.61190965092419","min_time":"0:22:15","max_time":"1:15:56","M":357,"F":130},"10km Schweizer Meisterschaften Lyss ; sam. 31.03.2012":{"max":10.0,"median_year":1966.0,"mean_year":1966.7773195876,"max_year":1998,"min_year":1934,"count":485,"median_time":"0.0:41.0:24.0","mean_time":"0.0:42.0:26.39381443298953","min_time":"0:17:23","max_time":"1:15:57","M":370,"F":115}}'

# Aggregation by name

**TO DO ! Beware of the name 'real' duplicates (two different persons with same name (and sometimes same birthyear...)), and the 'false' duplicates (different formatting for the same name, eg. trailing space).**

In [12]:
df.Name.value_counts().head()

Schmid Christian     180
Meier Thomas         170
Müller Martin        160
Müller Thomas        155
Meier Andreas        136
Name: Name, dtype: int64

In [13]:
name_agg_df = df[df.Name.str.strip() == 'Schmid Christian']

In [14]:
name_agg_df.head()

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,Pace,len_name,time,race_index
2304,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Schmid Christian,M,1981,DE-Waldbronn,332,"4:38.55,9","1:49.54,3","0:6.36,0",2,16735,"Jungfrau-Marathon, Interlaken ; sam. 06.09.2003"
2305,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Schmid Christian,M,1967,Berikon,160,"4:12.29,0","1:23.27,4","0:5.59,0",2,15149,"Jungfrau-Marathon, Interlaken ; sam. 06.09.2003"
2306,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Schmid Christian,M,1963,Morrens VD,397,"5:15.51,2","1:52.59,6","0:7.29,0",2,18951,"Jungfrau-Marathon, Interlaken ; sam. 06.09.2003"
5569,"Jungfrau-Marathon, Interlaken",sam. 11.09.2004,42.195,Schmid Christian,M,1967,Berikon,183,"4:16.34,4","1:17.3,5","0:6.4,0",2,15394,"Jungfrau-Marathon, Interlaken ; sam. 11.09.2004"
9147,"Jungfrau-Marathon, Interlaken",sam. 10.09.2005,42.195,Schmid Christian,M,1968,Zürich,270,"4:29.35,3","1:30.13,5","0:6.23,0",2,16175,"Jungfrau-Marathon, Interlaken ; sam. 10.09.2005"
