In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import datetime
from scipy import stats

import seaborn as sns
sns.set_context('notebook')
%config InlineBackend.figure_format = 'retina'

In [2]:
a=pd.read_csv('../parsing/full_database.csv')

In [3]:
del a['len_name']

In [4]:
a.head(2)

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,Pace
0,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abächerli Walter,M,1952,Hinwil,47,"4:31.56,1","0:53.5,3","0:6.26,0"
1,"Jungfrau-Marathon, Interlaken",sam. 06.09.2003,42.195,Abbringh Ellen,F,1962,NL-Doorn 3941 EB,91,"5:55.9,4","2:12.11,6","0:8.25,0"


In [5]:
# Load the per-race weather table and discard the two 'Unnamed' columns in one
# step (presumably leftover index columns from earlier CSV round-trips - TODO confirm).
weather_db=pd.read_csv('../datasets/races-information-weather.csv').drop(
    ['Unnamed: 0','Unnamed: 0.1'],axis=1)

In [6]:
weather_db.head(2)

Unnamed: 0,Date,Name,Place,URL,min_temp,max_temp,uv_index,weather_desc
0,sam. 27.03.1999,Männedörfler Waldlauf,Männedorf,http://services.datasport.com/1999/zkb/maennedorf,,,,
1,sam. 20.03.1999,Kerzerslauf,Kerzers,http://services.datasport.com/1999/lauf/kerzers,,,,


In [7]:
gr=a.groupby([a.Date,a.Race])

In [8]:
def merge_weather_info(x, weather_table=None):
    """Attach the weather record of a race to every result row of that race.

    Written for ``groupby([Date, Race]).apply``: the race name and date are
    read from the first row of ``x`` and assumed to hold for the whole group.

    Parameters
    ----------
    x : pandas.DataFrame
        Result rows of one group; needs ``Race`` and ``Date`` columns.
    weather_table : pandas.DataFrame, optional
        Table with ``Name``, ``Date``, ``Place``, ``URL``, ``min_temp``,
        ``max_temp`` and ``weather_desc`` columns. When omitted, the
        notebook-level ``weather_db`` is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` whose ``Date`` is cut to its first 15 characters and
        which gains ``Place``, ``MinTemp``, ``MaxTemp``, ``Weather`` and
        ``RaceID`` columns (same value on every row). ``x`` itself is left
        untouched, because mutating the group passed in by ``groupby.apply``
        is not supported by pandas.

    Raises
    ------
    IndexError
        If ``x`` is empty, or if no weather row matches the race and date.
    """
    if weather_table is None:
        weather_table = weather_db

    first_row = x.iloc[0]
    race = first_row['Race']
    # 15 characters is the length of a 'sam. 06.09.2003'-style date; anything
    # after that is dropped before looking the race up.
    date2 = first_row['Date'][:15]

    matches = weather_table[(weather_table['Name'] == race) &
                            (weather_table['Date'] == date2)]
    if matches.shape[0] == 0:
        # Same exception type as the bare .iloc[0] would raise, but naming the race.
        raise IndexError('no weather record for race %r on %r' % (race, date2))
    weather = matches.iloc[0]

    merged = x.copy()
    merged['Date'] = date2
    merged['Place'] = weather['Place']
    merged['MinTemp'] = weather['min_temp']
    merged['MaxTemp'] = weather['max_temp']
    merged['Weather'] = weather['weather_desc']
    merged['RaceID'] = weather['URL']

    return merged

In [9]:
full_df=gr.apply(merge_weather_info)

In [10]:
full_df=full_df.dropna()

In [11]:
full_df.shape

(1103009, 16)

In [12]:
full_df.head(5)

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Delay,Pace,Place,MinTemp,MaxTemp,Weather,RaceID
165242,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Abgottspon Helmut,M,1962,Staldenried,165,"6:10.37,2","2:32.19,2","0:8.47,0",Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165243,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Abgottspon Julia,F,1950,Gwatt (Thun),5,"5:34.8,2","0:38.30,9","0:7.55,0",Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165244,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Abgottspon Medard,M,1942,Visp,2,"5:13.54,7","0:27.3,3","0:7.26,0",Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165245,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Achterberg Nico,M,1957,NL-Veenendaal,80,"5:38.35,0","2:3.9,3","0:8.1,0",Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165246,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Ackeret Peter,M,1957,Deitingen,11,"4:26.3,5","0:50.37,8","0:6.18,0",Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...


# Parse times and select only meaningful features

In [13]:
# Zero point for turning parsed clock times into durations: strptime fills the
# missing date fields with 1900-01-01, so midnight of that day is what parsing
# '0:0.0,0' with '%H:%M.%S,%f' produces.
base=datetime.datetime(1900, 1, 1)

In [14]:
full_df['Pace'] = (pd.to_datetime(full_df.Pace, format='%H:%M.%S,%f')-base)

In [15]:
full_df.shape

(1103009, 16)

In [16]:
full_df=full_df[full_df.Distance<45]

In [17]:
max(full_df.Distance)

42.416259459459461

In [18]:
full_df.shape

(1096394, 16)

In [19]:
full_df['Time'] = (pd.to_datetime(full_df.Time, format='%H:%M.%S,%f')-base)

In [20]:
del full_df['Delay']

In [21]:
full_df.head(3)

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Pace,Place,MinTemp,MaxTemp,Weather,RaceID
165242,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Abgottspon Helmut,M,1962,Staldenried,165,06:10:37.200000,00:08:47,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165243,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Abgottspon Julia,F,1950,Gwatt (Thun),5,05:34:08.200000,00:07:55,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165244,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Abgottspon Medard,M,1942,Visp,2,05:13:54.700000,00:07:26,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...


# Final preprocessing

In [28]:
# Uses full_df because `df` is only created further down (cell In [22]); the Race
# column is not touched by the distance clean-up, so the counts are the same.
full_df.Race.value_counts()

Course de l'Escalade, Genève                        102157
20km de Lausanne                                     81646
Zürcher Silvesterlauf, Zürich                        68787
Morat-Fribourg                                       66547
Schweizer Frauenlauf Bern                            56882
Kerzerslauf                                          45636
Lausanne Marathon                                    39960
Basler Stadtlauf                                     37862
Hallwilerseelauf, Beinwil am See                     35876
Zürich Marathon, Teamrun und Cityrun                 30448
Lucerne Marathon, Luzern                             29933
SwissCityMarathon - Lucerne, Luzern                  24458
Jungfrau-Marathon, Interlaken                        21883
Stralugano, Lugano                                   21512
Int Greifenseelauf, Uster                            21505
Corrida Bulloise, Bulle                              21397
ASICS Bremgarter Reusslauf                           170

In [29]:
# NOTE(review): `df` is first assigned in cell In [22] further down, so this cell
# fails with NameError on a fresh top-to-bottom run. It was executed (as In [29])
# after the distance clean-up cells - move it below them or confirm the intent.
b=df[df.Race=='20km de Lausanne']

In [30]:
b.Distance.value_counts().index

Float64Index([10.0, 20.0, 2.0, 4.0], dtype='float64')

In [22]:
df=full_df.copy()

In [23]:
# Snap measured distances to whole kilometres: values below 4 km are kept as they
# are; longer ones are rounded when they lie within 0.55 % of the nearest integer.
# 21.0975 and 42.195 are inside that tolerance too (about 0.46 %), so they become
# 21 and 42 here and are mapped back by the two cells that follow.
df['Distance']=df['Distance'].apply(
    lambda km: round(km) if not (km<4 or abs(1-km/round(km))>0.0055) else km)

In [24]:
# Any distance above 4 km that is within 0.55 % of 21.0975 km is set to exactly
# 21.0975; everything else passes through unchanged.
df['Distance']=df['Distance'].apply(
    lambda km: km if not (km>4 and abs(1-km/21.0975)<0.0055) else 21.0975)

In [25]:
# Any distance above 4 km that is within 0.55 % of 42.195 km is set to exactly
# 42.195, which is the value the later `Distance==42.195` filter compares against.
df['Distance']=df['Distance'].apply(
    lambda km: km if not (km>4 and abs(1-km/42.195)<0.0055) else 42.195)

In [26]:
df

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Pace,Place,MinTemp,MaxTemp,Weather,RaceID
165242,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Abgottspon Helmut,M,1962,Staldenried,165,06:10:37.200000,00:08:47,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165243,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Abgottspon Julia,F,1950,Gwatt (Thun),5,05:34:08.200000,00:07:55,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165244,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Abgottspon Medard,M,1942,Visp,2,05:13:54.700000,00:07:26,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165245,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Achterberg Nico,M,1957,NL-Veenendaal,80,05:38:35,00:08:01,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165246,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Ackeret Peter,M,1957,Deitingen,11,04:26:03.500000,00:06:18,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165247,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Ackermann Alex,M,1963,Wolfwil,48,04:48:14.700000,00:06:49,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165248,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Ackermann Franz,M,1963,Balsthal,113,05:29:37.200000,00:07:48,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165249,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Adler Peter,M,1955,D-Emsdetten,42,05:07:15.500000,00:07:16,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165250,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Aebersold Thomas,M,1960,Bern,39,04:43:22.200000,00:06:42,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165251,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195000,Aegler Susanne,F,1968,Bolligen,8,04:57:04.700000,00:07:02,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...


In [31]:
df.to_pickle('../datasets/dataframe_weather.pickle')

In [37]:
full_df=df.copy()

In [60]:
# Years covered by the remaining races. Dates look like 'sam. 05.07.2008', so the
# year is whatever follows the last dot. A dedicated name is used instead of `a`
# so the raw results frame loaded at the top of the notebook is not overwritten
# by a list (which would break re-running the groupby cell).
race_years={date.rsplit('.',1)[-1] for date in full_df.Date.dropna().unique()}
race_years

{'2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015'}

# Categorize weather

In [None]:
full_df.Weather.value_counts().index

2 -> Sunny
1 -> Cloudy
0 -> bad weather

In [None]:
# Weather description -> favourability category
#   2: sunny, 1: cloudy, 0: bad weather
# NOTE(review): 'Moderate rain at times' is rated 1 while 'Moderate rain' and
# 'Heavy rain at times' are rated 0 - confirm this is intended.
dict_favorable_weather={
    'Partly cloudy':1,
    'Clear':2,
    'Sunny':2,
    'Mist':0,
    'Patchy rain possible':1,
    'Cloudy':1,
    'Light rain':0,
    'Light rain shower':0,
    'Moderate rain':0,
    'Overcast':1,
    'Fog':0,
    'Moderate or heavy rain shower':0,
    'Moderate rain at times':1,
    'Light drizzle':0,
    'Light snow':0,
    'Patchy light drizzle':0,
    'Moderate snow':0,
    'Patchy light rain':0,
    'Patchy light rain with thunder':0,
    'Freezing fog':0,
    'Heavy snow':0,
    'Heavy rain':0,
    'Moderate or heavy rain with thunder':0,
    'Moderate or heavy sleet':0,
    'Patchy moderate snow':0,
    'Heavy rain at times':0,
    'Light sleet':0,
    'Moderate or heavy snow with thunder':0,
}

In [None]:
def categorize_weather(x):
    """Return the favourability category stored for weather description ``x``.

    The lookup goes through ``dict_favorable_weather``; a description that is
    not a key of that dict raises ``KeyError``.
    """
    category = dict_favorable_weather[x]
    return category

In [None]:
# Guard so the cell can be re-run: once Weather holds the numeric categories,
# mapping them again would raise KeyError because 0/1/2 are not description keys.
if full_df.Weather.dtype.kind not in 'iuf':
    full_df.Weather=full_df.Weather.apply(categorize_weather)

# Analysis of men

In [38]:
men_full_df=full_df[full_df.Sex=='M']

In [41]:
men_marathon=men_full_df[men_full_df.Distance==42.195]

In [42]:
men_marathon.shape

(63969, 15)

In [43]:
men_marathon.head(3)

Unnamed: 0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Pace,Place,MinTemp,MaxTemp,Weather,RaceID
165242,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Abgottspon Helmut,M,1962,Staldenried,165,06:10:37.200000,00:08:47,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165244,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Abgottspon Medard,M,1942,Visp,2,05:13:54.700000,00:07:26,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...
165245,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Achterberg Nico,M,1957,NL-Veenendaal,80,05:38:35,00:08:01,Zermatt,8.0,20.0,Partly cloudy,http://services.datasport.com/2008/lauf/zermat...


In [44]:
def add_mean_std_race(x):
    """Add per-race summary columns to the result rows of one race.

    Written for ``groupby(RaceID).apply``.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single race; needs a ``Time`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` with three extra columns, constant over the group:
        ``MeanRace`` (``np.mean`` of ``Time``), ``StdRace`` (``np.std`` of
        ``Time`` divided by the square root of the row count, i.e. a standard
        error of the mean rather than a plain standard deviation) and
        ``Count`` (number of rows). ``x`` itself is not modified, because
        mutating the group passed in by ``groupby.apply`` is not supported
        by pandas.
    """
    n_rows = x.shape[0]
    out = x.copy()
    out['MeanRace'] = np.mean(x.Time)
    out['StdRace'] = np.std(x.Time) / np.sqrt(n_rows)
    out['Count'] = n_rows
    return out

In [45]:
men_marathon=men_marathon.groupby(men_marathon.RaceID).apply(add_mean_std_race)

In [46]:
men_marathon_means=men_marathon.groupby(men_marathon.RaceID).first()

In [47]:
men_marathon_means.head(1)

Unnamed: 0_level_0,Race,Date,Distance,Name,Sex,Year,LivingPlace,Rank,Time,Pace,Place,MinTemp,MaxTemp,Weather,MeanRace,StdRace,Count
RaceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
http://services.datasport.com/2008/lauf/jungfrau,"Jungfrau-Marathon, Interlaken",sam. 06.09.2008,42.195,Abächerli Walter,M,1952,Hinwil,51,04:57:05.200000,00:07:02,Interlaken,12.0,16.0,Patchy rain possible,05:12:08.378315,00:00:43.309769,3076


In [48]:
men_marathon_means=men_marathon_means[['Race','Date','Distance','MinTemp','MaxTemp','Weather','MeanRace','StdRace','Count']]

In [49]:
men_marathon_means['AvgTemp']=(men_marathon_means.MinTemp+men_marathon_means.MaxTemp)/2

In [50]:
men_marathon_means=men_marathon_means[['Race','Date','Distance','Weather','AvgTemp','MeanRace','StdRace','Count']]

In [51]:
men_marathon_means

Unnamed: 0_level_0,Race,Date,Distance,Weather,AvgTemp,MeanRace,StdRace,Count
RaceID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
http://services.datasport.com/2008/lauf/jungfrau,"Jungfrau-Marathon, Interlaken",sam. 06.09.2008,42.195,Patchy rain possible,14.0,05:12:08.378315,00:00:43.309769,3076
http://services.datasport.com/2008/lauf/tenero,"Maratona Ticino, Tenero",dim. 09.11.2008,42.195,Sunny,4.5,03:36:17.806542,00:03:06.803563,107
http://services.datasport.com/2008/lauf/zermattmarathon,"Zermatt Marathon, Zermatt",sam. 05.07.2008,42.195,Partly cloudy,14.0,05:17:43.698255,00:01:35.249823,860
http://services.datasport.com/2008/waffenlauf/frauenfelder,Frauenfelder,dim. 16.11.2008,42.195,Sunny,4.0,04:06:07.302506,00:02:12.962113,359
http://services.datasport.com/2009/lauf/biel,"Bieler Lauftage, Biel/Bienne",ven. 12.06.2009,42.195,Sunny,16.0,04:17:36.674789,00:04:28.234824,119
http://services.datasport.com/2009/lauf/jungfrau,"Jungfrau-Marathon, Interlaken",sam. 05.09.2009,42.195,Sunny,6.5,05:08:45.909620,00:00:44.579179,3191
http://services.datasport.com/2009/lauf/lucernemarathon,"Lucerne Marathon, Luzern",dim. 25.10.2009,42.195,Partly cloudy,8.5,03:45:51.895514,00:00:45.238636,1672
http://services.datasport.com/2009/lauf/neujahrsmarathon/,"Neujahrsmarathon Zürich, Schlieren",jeu. 01.01.2009,42.195,Partly cloudy,0.0,04:10:22.474468,00:03:15.850583,94
http://services.datasport.com/2009/lauf/tenero,"Maratona Ticino, Tenero",dim. 08.11.2009,42.195,Patchy light rain,1.0,03:32:28.143181,00:02:17.169413,176
http://services.datasport.com/2009/lauf/winterthur,Winterthur Marathon,dim. 24.05.2009,42.195,Clear,23.0,03:56:27.112972,00:02:30.903566,185


In [None]:
men_marathon_means.Weather.value_counts()

# Classify different marathons

In [None]:
gr_means=men_marathon_means.groupby(men_marathon_means.Race)

In [None]:
def add_place_mean(x):
    """Add per-event summary columns to the yearly rows of one race name.

    Written for ``groupby(Race).apply``.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group; needs ``MeanRace`` and ``Count`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` with three extra columns, constant over the group:
        ``MeanPlace`` (``np.mean`` of ``MeanRace``), ``CountPlace``
        (``np.mean`` of ``Count``) and ``NumberOfRaces`` (number of rows).
        ``x`` itself is not modified, because mutating the group passed in
        by ``groupby.apply`` is not supported by pandas.
    """
    out = x.copy()
    out['MeanPlace'] = np.mean(x.MeanRace)
    out['CountPlace'] = np.mean(x.Count)
    out['NumberOfRaces'] = x.shape[0]
    return out

In [None]:
place_mean=gr_means.apply(add_place_mean)

In [None]:
# Collapse to one row per race name (the first row of each group) and keep only
# the per-event summary columns.
place_mean=(place_mean
            .groupby(men_marathon_means.Race)
            .first()
            [['Distance','MeanPlace','CountPlace','NumberOfRaces']])

In [None]:
place_mean

As you can see, many marathons have too few participants to be statistically significant.
Moreover, the Interlaken and Zermatt marathons are slow-paced because of their courses (with a lot of uphill).
We exclude these competitions from our analysis.

The marathons to analyse are:
- Lausanne Marathon
- Lucerne Marathon, Luzern
- SwissCityMarathon - Lucerne, Luzern
- Zürich Marathon, Teamrun und Cityrun

# Analyse selected runs

In [None]:
# Keep only the yearly rows of the four marathons retained for the analysis.
final_marathons=men_marathon_means[men_marathon_means.Race.isin([
    'Lausanne Marathon',
    'Lucerne Marathon, Luzern',
    'SwissCityMarathon - Lucerne, Luzern',
    'Zürich Marathon, Teamrun und Cityrun',
])]

In [None]:
final_marathons[final_marathons.Race=='Zürich Marathon, Teamrun und Cityrun']