In [1]:
%matplotlib inline

import pickle
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from collections import Counter
import pandas as pd
import seaborn as sns
from datetime import time

In [2]:
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA

In [3]:
with open("bostonmarathon-master/results/2014/results.pkl", 'rb') as f:
    data = pickle.load(f)
marathon = pd.DataFrame.from_dict(data).transpose()

In [4]:
marathon.head()

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k,age,bib,city,...,ctz,division,gender,genderdiv,half,name,official,overall,pace,state
35540,0:58:22,2:01:01,2:42:47,3:21:59,4:07:49,4:45:17,0:29:52,30,35540,New York,...,CHN,5258,M,15604,2:08:38,"Zheng, Jiewu",5:06:24,27767,0:11:42,NY
35541,0:57:18,1:53:23,2:21:10,2:49:51,3:18:27,3:46:28,0:29:16,37,35541,Brunswick,...,,3767,F,6676,1:59:27,"Harrington, Kim L.",3:58:20,17647,0:09:06,OH
35542,1:02:28,2:15:31,2:57:57,3:36:45,4:12:27,4:56:56,0:31:03,37,35542,Boston,...,,5385,M,15885,2:24:34,"Gupta, Alok",5:12:24,28283,0:11:55,MA
35543,0:51:05,1:44:33,2:17:28,2:55:33,3:37:50,4:15:28,0:26:08,45,35543,Dorchester,...,,2445,M,13736,1:50:49,"Higgins, David J.",4:32:38,23843,0:10:24,MA
35544,1:19:19,2:47:08,-,4:30:17,5:21:07,6:08:52,0:40:21,22,35544,Waquoit,...,,6882,F,14150,2:57:27,"Bohnenberger, Helen",6:28:10,31567,0:14:49,MA


In [5]:
times = ['5k', '10k', '20k', '25k', '30k', '35k', '40k', 'half', 'official', 'pace']
for time in times:
    marathon[time] = pd.to_timedelta(marathon[time])

In [6]:
marathon['gender'] = np.where(marathon['gender'] == 'F', 0, 1)

In [7]:
marathon[['age', 'division', 'genderdiv', 'overall']] = marathon[['age', 'division', 'genderdiv', 'overall']].apply(pd.to_numeric, errors='coerce')

In [8]:
marathon.dtypes

10k          timedelta64[ns]
20k          timedelta64[ns]
25k          timedelta64[ns]
30k          timedelta64[ns]
35k          timedelta64[ns]
40k          timedelta64[ns]
5k           timedelta64[ns]
age                    int64
bib                   object
city                  object
country               object
ctz                   object
division               int64
gender                 int32
genderdiv              int64
half         timedelta64[ns]
name                  object
official     timedelta64[ns]
overall                int64
pace         timedelta64[ns]
state                 object
dtype: object

In [9]:
marathon.select_dtypes(include=['object']).nunique()

bib        31984
city        5935
country       78
ctz           85
name       31915
state         69
dtype: int64

In [10]:
marathon.drop(['bib', 'city', 'country', 'ctz', 'name', 'state'], 1, inplace=True)

In [11]:
# target variable
# time = marathon.official

In [12]:
# marathon.drop(['official'], axis=1, inplace=True)

In [13]:
marathon.describe()

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k,age,division,gender,genderdiv,half,official,overall,pace
count,31984,31984,31984,31984,31984,31984,31984,31984.0,31984.0,31984.0,31984.0,31984,31984,31984.0,31984
mean,0 days 00:51:43.097673,0 days 01:45:32.586480,0 days 02:12:42.583791,0 days 02:44:21.922617,0 days 03:16:38.899543,0 days 03:48:43.335136,0 days 00:25:42.822786,42.407079,1932.563032,0.550807,8051.044741,0 days 01:51:24.792052,0 days 04:02:59.838544,15939.587825,0 days 00:09:16.537206
std,0 days 00:09:11.358657,0 days 00:20:12.587623,0 days 00:27:45.506781,0 days 00:34:25.638167,0 days 00:42:26.661657,0 days 00:49:46.540264,0 days 00:04:26.826230,11.316496,1715.228694,0.49742,4754.005626,0 days 00:21:41.574109,0 days 00:52:18.025241,9232.978224,0 days 00:01:59.549132
min,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,18.0,1.0,0.0,1.0,0 days 00:00:00,0 days 01:20:36,1.0,0 days 00:03:05
25%,0 days 00:45:22,0 days 01:31:42,0 days 01:55:06,0 days 02:20:35,0 days 02:47:03,0 days 03:13:42,0 days 00:22:38,33.0,610.0,0.0,3972.0,0 days 01:36:45,0 days 03:25:31.750000,7943.75,0 days 00:07:51
50%,0 days 00:50:19,0 days 01:42:06,0 days 02:08:40,0 days 02:37:42,0 days 03:08:07,0 days 03:38:47,0 days 00:25:05,42.0,1425.0,1.0,7970.0,0 days 01:47:46.500000,0 days 03:52:22,15939.5,0 days 00:08:52
75%,0 days 00:57:09,0 days 01:56:29,0 days 02:27:33.250000,0 days 03:02:47.250000,0 days 03:40:07,0 days 04:17:24.250000,0 days 00:28:27.250000,50.0,2611.0,1.0,11968.0,0 days 02:03:02,0 days 04:33:14.250000,23935.25,0 days 00:10:26
max,0 days 01:52:23,0 days 03:44:21,0 days 04:49:01,0 days 06:16:23,0 days 07:29:20,0 days 08:28:21,0 days 01:26:57,81.0,6979.0,1.0,17575.0,0 days 03:56:40,0 days 08:58:53,31931.0,0 days 00:20:34


In [14]:
marathon.head()

Unnamed: 0,10k,20k,25k,30k,35k,40k,5k,age,division,gender,genderdiv,half,official,overall,pace
35540,00:58:22,02:01:01,02:42:47,03:21:59,04:07:49,04:45:17,00:29:52,30,5258,1,15604,02:08:38,05:06:24,27767,00:11:42
35541,00:57:18,01:53:23,02:21:10,02:49:51,03:18:27,03:46:28,00:29:16,37,3767,0,6676,01:59:27,03:58:20,17647,00:09:06
35542,01:02:28,02:15:31,02:57:57,03:36:45,04:12:27,04:56:56,00:31:03,37,5385,1,15885,02:24:34,05:12:24,28283,00:11:55
35543,00:51:05,01:44:33,02:17:28,02:55:33,03:37:50,04:15:28,00:26:08,45,2445,1,13736,01:50:49,04:32:38,23843,00:10:24
35544,01:19:19,02:47:08,00:00:00,04:30:17,05:21:07,06:08:52,00:40:21,22,6882,0,14150,02:57:27,06:28:10,31567,00:14:49


In [15]:
# Normalize the data.
X_norm = normalize(marathon)

# Reduce it to two components.
X_pca = PCA(2).fit_transform(X_norm)

# Calculate predicted values.
y_pred = KMeans(n_clusters=2, random_state=42).fit_predict(X_pca)

# Plot the solution.
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
plt.show()

# Check the solution against the data.
print('Comparing k-means clusters against the data:')
print(pd.crosstab(y_pred, y))

TypeError: float() argument must be a string or a number, not 'Timedelta'