In [1]:
'''General Libraries'''
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', 500)

'''Statistic'''
import scipy
from scipy import stats
from scipy.stats import norm
import datetime

'''Scikit Learn'''
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV

'''Ploting Libraries'''
from plotly.offline import iplot, plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.renderers.default = "iframe" 


import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set_palette('RdBu')

'''Miscellaneous'''
from yellowbrick.regressor import ResidualsPlot
import missingno as msno

'''Seeds'''
import random
random.seed(10)
np.random.seed(11)


import matplotlib.pyplot as plt  
from matplotlib import style 
from sklearn.cluster import KMeans 


style.use("fivethirtyeight") 

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np



In [2]:
df = pd.read_csv('results_kprototypes_k_3.csv')

In [3]:
df

Unnamed: 0,5k,10k,20k,half,25k,30k,35k,40k,official,pace,5k_avgSpeed,10k_avgSpeed,20k_avgSpeed,25k_avgSpeed,30k_avgSpeed,35k_avgSpeed,40k_avgSpeed,rank_overall,rank_in_gender_only,rank_in_division,age,age_division,gender,country,state,cluster
0,0.003409,0.007832,0.008234,0.008508,0.007689,0.008533,0.010421,0.010017,0.010147,0.010863,0.003409,0.007832,0.008234,0.007689,0.008533,0.010421,0.010017,0.000219,0.000398,0.001003,0.460317,0.3,M,JPN,others,2
1,0.106944,0.166667,0.157928,0.158276,0.150604,0.134029,0.128172,0.129555,0.127324,0.126930,0.106944,0.166667,0.157928,0.150604,0.134029,0.128172,0.129555,0.000626,0.000000,0.000000,0.238095,0.0,F,KEN,others,2
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.365079,0.2,M,RSA,others,2
3,0.106692,0.166562,0.157928,0.158276,0.150604,0.134029,0.128172,0.131521,0.130881,0.130932,0.106692,0.166562,0.157928,0.150604,0.134029,0.128172,0.131521,0.000814,0.000114,0.000287,0.095238,0.0,F,ETH,others,2
4,0.003409,0.005221,0.002550,0.002870,0.002344,0.001813,0.001074,0.001434,0.001375,0.001144,0.003409,0.005221,0.002550,0.002344,0.001813,0.001074,0.001434,0.000031,0.000057,0.000143,0.349206,0.2,M,JPN,others,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31643,0.232071,0.356099,0.337052,0.336035,0.321602,0.288188,0.281428,0.287149,0.286179,0.285878,0.232071,0.356099,0.337052,0.321602,0.288188,0.281428,0.287149,0.308425,0.426710,0.489109,0.222222,0.0,M,USA,CA,2
31644,0.294444,0.466374,0.490146,0.492449,0.490174,0.459413,0.464193,0.484372,0.484245,0.484277,0.294444,0.466374,0.490146,0.490174,0.459413,0.464193,0.484372,0.859850,0.879595,0.746776,0.253968,0.0,M,USA,MA,0
31645,0.257955,0.442565,0.463373,0.465868,0.457281,0.425718,0.424707,0.440023,0.439469,0.439680,0.257955,0.442565,0.463373,0.457281,0.425718,0.424707,0.440023,0.785813,0.613804,0.749498,0.047619,0.0,F,USA,CT,0
31646,0.293308,0.492168,0.498274,0.498389,0.501686,0.472197,0.470687,0.485298,0.484704,0.484277,0.293308,0.492168,0.498274,0.501686,0.472197,0.470687,0.485298,0.860476,0.683282,0.831040,0.317460,0.1,F,USA,MA,0


In [4]:
df.cluster.value_counts()

2    12190
1    11785
0     7673
Name: cluster, dtype: int64

In [5]:
cluster_0 = df[df.cluster==0]
cluster_1 = df[df.cluster==1]
cluster_2 = df[df.cluster==2]

# 1. Gender 

In [6]:
cluster_gender = {'cluster 0' : cluster_0.gender.value_counts(),
                  'cluster 1' : cluster_1.gender.value_counts(),
                  'cluster 2' : cluster_2.gender.value_counts()}


cluster_gender = pd.DataFrame(data=cluster_gender)

cluster_gender.fillna(0, inplace=True)

In [7]:
cluster_gender

Unnamed: 0,cluster 0,cluster 1,cluster 2
F,3605,6793,3766
M,4068,4992,8424


In [8]:
#fig = make_subplots(rows=1, cols=3)

fig = go.Figure()

x = ['F', 'M']

trace0 = go.Bar(x=x, y=cluster_gender['cluster 0'])
trace1 = go.Bar(x=x, y=cluster_gender['cluster 1'])
trace2 = go.Bar(x=x, y=cluster_gender['cluster 2'])

fig.add_trace(trace0)
fig.add_trace(trace1)
fig.add_trace(trace2)

fig.update_layout(barmode='group',
                  title='Clusters - Gender')

fig.show()

# Age Division

In [9]:
cluster_age = {'cluster 0' : cluster_0.age_division.value_counts(),
                  'cluster 1' : cluster_1.age_division.value_counts(),
                  'cluster 2' : cluster_2.age_division.value_counts()}


cluster_age = pd.DataFrame(data=cluster_age)

cluster_age.fillna(0, inplace=True)

In [10]:
cluster_age

Unnamed: 0,cluster 0,cluster 1,cluster 2
0.0,2899,1252,4475.0
0.1,1081,1069,2028.0
0.2,957,1779,2186.0
0.3,817,2646,1712.0
0.4,709,1998,1214.0
0.5,496,1559,464.0
0.6,364,976,94.0
0.7,223,408,15.0
0.8,94,79,2.0
0.9,26,18,0.0


In [11]:
#fig = make_subplots(rows=1, cols=3)

fig = go.Figure()

x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9 ,10]

trace0 = go.Bar(x=x, y=cluster_age['cluster 0'])
trace1 = go.Bar(x=x, y=cluster_age['cluster 1'])
trace2 = go.Bar(x=x, y=cluster_age['cluster 2'])

fig.add_trace(trace0)
fig.add_trace(trace1)
fig.add_trace(trace2)

fig.update_layout(barmode='group',
                  title='Clusters - Age Division')

fig.show()

# Pace

In [12]:
df1 = pd.read_csv('2014_clean.csv')
df2 = pd.read_csv('results_kprototypes_k_3.csv')

df_pace = pd.concat([df1, df2], axis=1)

df_pace

Unnamed: 0,name,5k,10k,20k,half,25k,30k,35k,40k,official,pace,rank_overall,rank_in_gender_only,rank_in_division,age,gender,country,state,city,bib,5k.1,10k.1,20k.1,half.1,25k.1,30k.1,35k.1,40k.1,official.1,pace.1,5k_avgSpeed,10k_avgSpeed,20k_avgSpeed,25k_avgSpeed,30k_avgSpeed,35k_avgSpeed,40k_avgSpeed,rank_overall.1,rank_in_gender_only.1,rank_in_division.1,age.1,age_division,gender.1,country.1,state.1,cluster
0,"Yamamoto, Hiroyuki",8.02,17.37,37.65,39.72,47.67,59.18,71.40,80.43,85.25,3.27,8,8,8,47,M,JPN,others,Fukuoka,W1,0.003409,0.007832,0.008234,0.008508,0.007689,0.008533,0.010421,0.010017,0.010147,0.010863,0.003409,0.007832,0.008234,0.007689,0.008533,0.010421,0.010017,0.000219,0.000398,0.001003,0.460317,0.3,M,JPN,others,2
1,"Jeptoo, Rita",16.22,32.58,65.83,69.47,82.43,99.33,116.37,132.10,138.95,5.30,21,1,1,33,F,KEN,others,Eldoret,F1,0.106944,0.166667,0.157928,0.158276,0.150604,0.134029,0.128172,0.129555,0.127324,0.126930,0.106944,0.166667,0.157928,0.150604,0.134029,0.128172,0.129555,0.000626,0.000000,0.000000,0.238095,0.0,F,KEN,others,2
2,"Van Dyk, Ernst F.",7.75,16.62,36.10,38.03,45.80,56.45,67.42,76.10,80.60,3.08,1,1,1,41,M,RSA,others,Paarl,W2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.365079,0.2,M,RSA,others,2
3,"Dibaba, Mare",16.20,32.57,65.83,69.47,82.43,99.33,116.37,132.95,140.58,5.37,27,3,3,24,F,ETH,others,Shoa,F2,0.106692,0.166562,0.157928,0.158276,0.150604,0.134029,0.128172,0.131521,0.130881,0.130932,0.106692,0.166562,0.157928,0.150604,0.134029,0.128172,0.131521,0.000814,0.000114,0.000287,0.095238,0.0,F,ETH,others,2
4,"Hokinoue, Kota",8.02,17.12,36.58,38.60,46.37,57.03,67.83,76.72,81.23,3.10,2,2,2,40,M,JPN,others,Nogata Fukuoka,W3,0.003409,0.005221,0.002550,0.002870,0.002344,0.001813,0.001074,0.001434,0.001375,0.001144,0.003409,0.005221,0.002550,0.002344,0.001813,0.001074,0.001434,0.000031,0.000057,0.000143,0.349206,0.2,M,JPN,others,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31643,"Anderson, Steve K.",26.13,50.72,99.55,104.78,124.02,148.65,174.90,200.22,211.75,8.08,9849,7500,3414,32,M,USA,CA,Larkspur,35908,0.232071,0.356099,0.337052,0.336035,0.321602,0.288188,0.281428,0.287149,0.286179,0.285878,0.232071,0.356099,0.337052,0.321602,0.288188,0.281428,0.287149,0.308425,0.426710,0.489109,0.222222,0.0,M,USA,CA,2
31644,"McCarthy, Michael P.",31.07,61.28,128.37,135.85,165.02,203.43,244.70,285.47,302.52,11.55,27456,15459,5212,34,M,USA,MA,Norwell,35909,0.294444,0.466374,0.490146,0.492449,0.490174,0.459413,0.464193,0.484372,0.484245,0.484277,0.294444,0.466374,0.490146,0.490174,0.459413,0.464193,0.484372,0.859850,0.879595,0.746776,0.253968,0.0,M,USA,MA,0
31645,"Brimmer, Delia C.",28.18,59.00,123.33,130.57,157.02,192.65,229.62,266.30,282.00,10.77,25092,10788,5231,21,F,USA,CT,West Simsbury,35910,0.257955,0.442565,0.463373,0.465868,0.457281,0.425718,0.424707,0.440023,0.439469,0.439680,0.257955,0.442565,0.463373,0.457281,0.425718,0.424707,0.440023,0.785813,0.613804,0.749498,0.047619,0.0,F,USA,CT,0
31646,"Morganthal, Amy",30.98,63.75,129.90,137.03,167.82,207.52,247.18,285.87,302.73,11.55,27476,12009,5800,38,F,USA,MA,North Andover,35911,0.293308,0.492168,0.498274,0.498389,0.501686,0.472197,0.470687,0.485298,0.484704,0.484277,0.293308,0.492168,0.498274,0.501686,0.472197,0.470687,0.485298,0.860476,0.683282,0.831040,0.317460,0.1,F,USA,MA,0


In [13]:
df_pace.pace.quantile([0.2])

Unnamed: 0,pace,pace.1
0.2,7.62,0.259577


In [14]:
pace_top_20 = df_pace['pace'].apply(lambda x: 0.0 if x<7.62 else 1.0)

df_pace.insert(11, 'pace_top_20', value=pace_top_20)

df_pace

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().