# 20 km of Brussels 2017

In [1]:
import pandas as pd
import requests
import json
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import numpy as np

## Data Download & Preparation

In [2]:
url = "http://prod.chronorace.be/api/Results/20km/Search?eventId=1187794680566157&year=2017&search=&race=1&gender=&yearOfBirth=&category=&nationality=&province=&postalCode=&team=&isInitial=1&maxEntries=36538&fromEntry=0&_=1496138014817"
data = requests.get(url).json()
data_bis = data['Matches']
data_fin = pd.DataFrame.from_records(data_bis)

### Finisher Statistics

In [152]:
data_fin.loc[:, 'finished'] = data_fin['x_time'] != '-' 
finish_cat = data_fin.groupby(['finished']).size().reset_index()
finish_sexe = data_fin.groupby(['finished', 'sexe']).size().reset_index()

In [157]:
print(finish_sexe.loc[(finish_sexe['finished'] == True) & (finish_sexe['sexe'] == 'F'), 0] / finish_sexe.loc[(finish_sexe['sexe'] == 'F'), 0].sum())
print(finish_sexe.loc[(finish_sexe['finished'] == True) & (finish_sexe['sexe'] == 'M'), 0] / finish_sexe.loc[(finish_sexe['sexe'] == 'M'), 0].sum())

3    0.770908
Name: 0, dtype: float64
4    0.79147
Name: 0, dtype: float64


In [95]:
print("Number of people that finished the run: {}".format(finisher.shape[0]))
print("Number of people that did not finish the run: {}".format(data_fin.shape[0] - finisher.shape[0]))

Number of people that finished the run: 28690
Number of people that did not finish the run: 7848


In [93]:
finisher = data_fin.loc[data_fin['x_time'] != '-', :]
finisher['radius'] = 1
finisher.loc[:, 'time'] = finisher['x_time'].apply(lambda x: datetime.strptime(x, '%H:%M:%S'))
finisher.loc[:, 'time'] = finisher['x_time'].apply(lambda x: datetime.strptime(x, '%H:%M:%S'))
finisher.loc[:, 'minutes'] = finisher['time'].apply(lambda x: x.hour*60 + x.minute + x.second/60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [119]:
finisher = finisher[finisher['sexe'] != u'Ù']

### Run Time Bucket

In [120]:
# Should bucket by total minutes
bins = [0, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 1000]
labels = ['<1h00', '1h00 - 1h10', '1h10 - 1h20', '1h20 - 1h30', '1h30 - 1h40', '1h40 - 1h50', '1h50 - 2h00', '2h00 - 2h10', '2h10 - 2h20', '2h20 - 2h30', '2h30 - 2h40', '2h40 - 2h50', '2h50 - 3h00', '>3h00']
finisher['categories'] = pd.cut(finisher['minutes'], bins, labels=labels)
time_cat = finisher.groupby(['categories']).size().reset_index()
time_cat_sexe = finisher.groupby(['categories', 'sexe']).size().reset_index()
average_men = finisher.loc[finisher['sexe'] == 'M', 'minutes'].mean()
average_women = finisher.loc[finisher['sexe'] == 'F', 'minutes'].mean()
average_total = finisher.loc[:, 'minutes'].mean()
average_men = finisher.loc[finisher['sexe'] == 'M', 'minutes'].mean()
average_women = finisher.loc[finisher['sexe'] == 'F', 'minutes'].mean()

In [160]:
print(str(timedelta(minutes=average_total))[:-3])
print(str(timedelta(minutes=average_men))[:-3])
print(str(timedelta(minutes=average_women))[:-3])

2:00:44.095
1:55:10.912
2:13:48.608


### Box Bucket

In [121]:
bins = [0, 6000, 12000, 18000, 24000, 33000, 42000]
labels = ['Box 1', 'Box 2', 'Box 3', 'Box 4', 'Box 5', 'Box 6']
finisher.loc[:, 'box'] = pd.cut(finisher['bib'], bins, labels=labels)

In [168]:
box_cat = finisher.groupby(['box']).agg(['mean', 'std']).reset_index()
box_cat_sexe = finisher.groupby(['box', 'sexe']).agg(['mean', 'std']).reset_index()

### Age Bucket

In [98]:
def cast_age(row):
    try:
        return float(row['x_age'])
    except:
        return row['x_age']

In [107]:
finisher.loc[:,'age'] = finisher.apply(cast_age, axis=1)

In [137]:
bins = [0, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 100]
labels = ['0 - 20', '21 - 25', '26 - 30', '31 - 35', '36 - 40', '41 - 45', '46 - 50', '51 - 55', '56 - 60', '61 - 65', '65+']
finisher.loc[:, 'age_box'] = pd.cut(finisher['age'], bins, labels=labels)

In [139]:
age_cat = finisher.groupby(['age_box']).agg('mean').reset_index()
age_cat_sexe = finisher.groupby(['age_box', 'sexe']).agg('mean').reset_index()

In [144]:
age_cat_sexe.ix[age_cat_sexe['sexe'] == "M", 5].values

array([ 112.71632896,  111.5027137 ,  111.87660592,  112.92153589,
        113.27851645,  114.53630164,  115.87263763,  119.67545305,
        122.20633484,  128.99785867,  145.28897338])

In [174]:
age_cat['minutes'].pct_change().mean()

0.02004035084662047