In [1]:
'''General Libraries'''
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', 500)

'''Statistic'''
import scipy
from scipy import stats
from scipy.stats import norm
import datetime

'''Scikit Learn'''
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV

'''Ploting Libraries'''
from plotly.offline import iplot, plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.renderers.default = "iframe" 


import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set_palette('RdBu')

'''Miscellaneous'''
from yellowbrick.regressor import ResidualsPlot
import missingno as msno

'''Seeds'''
import random
random.seed(10)
np.random.seed(11)


import matplotlib.pyplot as plt  
from matplotlib import style 
from sklearn.cluster import KMeans 


style.use("fivethirtyeight") 

from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np



In [2]:
# Read both input CSV files, then place their columns side by side;
# pd.concat along the column axis aligns the rows on the index.
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('1_to_kprototypes.csv', 'data_clusterLabel_2.csv')
)
df = pd.concat(objs=[df1, df2], axis='columns')

In [3]:
# Display the combined frame (the columns of df1 followed by those of df2).
df

Unnamed: 0.1,Unnamed: 0,5k,10k,20k,half,25k,30k,35k,40k,official,pace,5k_avgSpeed,10k_avgSpeed,20k_avgSpeed,25k_avgSpeed,30k_avgSpeed,35k_avgSpeed,40k_avgSpeed,rank_overall,rank_in_gender_only,rank_in_division,age,age_division,gender,country,state,city,cluster
0,0,0.003409,0.007832,0.008234,0.008508,0.007689,0.008533,0.010421,0.010017,0.010147,0.010863,0.003409,0.007832,0.008234,0.007689,0.008533,0.010421,0.010017,0.000219,0.000398,0.001003,0.460317,0.3,M,JPN,others,Fukuoka,1
1,1,0.106944,0.166667,0.157928,0.158276,0.150604,0.134029,0.128172,0.129555,0.127324,0.126930,0.106944,0.166667,0.157928,0.150604,0.134029,0.128172,0.129555,0.000626,0.000000,0.000000,0.238095,0.0,F,KEN,others,Eldoret,0
2,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.365079,0.2,M,RSA,others,Paarl,1
3,3,0.106692,0.166562,0.157928,0.158276,0.150604,0.134029,0.128172,0.131521,0.130881,0.130932,0.106692,0.166562,0.157928,0.150604,0.134029,0.128172,0.131521,0.000814,0.000114,0.000287,0.095238,0.0,F,ETH,others,Shoa,0
4,4,0.003409,0.005221,0.002550,0.002870,0.002344,0.001813,0.001074,0.001434,0.001375,0.001144,0.003409,0.005221,0.002550,0.002344,0.001813,0.001074,0.001434,0.000031,0.000057,0.000143,0.349206,0.2,M,JPN,others,Nogata Fukuoka,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31643,31643,0.232071,0.356099,0.337052,0.336035,0.321602,0.288188,0.281428,0.287149,0.286179,0.285878,0.232071,0.356099,0.337052,0.321602,0.288188,0.281428,0.287149,0.308425,0.426710,0.489109,0.222222,0.0,M,USA,CA,Larkspur,1
31644,31644,0.294444,0.466374,0.490146,0.492449,0.490174,0.459413,0.464193,0.484372,0.484245,0.484277,0.294444,0.466374,0.490146,0.490174,0.459413,0.464193,0.484372,0.859850,0.879595,0.746776,0.253968,0.0,M,USA,MA,Norwell,1
31645,31645,0.257955,0.442565,0.463373,0.465868,0.457281,0.425718,0.424707,0.440023,0.439469,0.439680,0.257955,0.442565,0.463373,0.457281,0.425718,0.424707,0.440023,0.785813,0.613804,0.749498,0.047619,0.0,F,USA,CT,West Simsbury,0
31646,31646,0.293308,0.492168,0.498274,0.498389,0.501686,0.472197,0.470687,0.485298,0.484704,0.484277,0.293308,0.492168,0.498274,0.501686,0.472197,0.470687,0.485298,0.860476,0.683282,0.831040,0.317460,0.1,F,USA,MA,North Andover,0


In [4]:
# Number of rows carrying each cluster label.
df['cluster'].value_counts()

1    17484
0    14164
Name: cluster, dtype: int64

# 1. Gender 

In [5]:
# One sub-frame per cluster label, selected with a boolean row mask.
cluster_0 = df.loc[df['cluster'] == 0]
cluster_1 = df.loc[df['cluster'] == 1]

In [6]:
# Gender distribution inside cluster 0.
cluster_0['gender'].value_counts()

F    14164
Name: gender, dtype: int64

In [7]:
# Gender distribution inside cluster 1.
cluster_1['gender'].value_counts()

M    17484
Name: gender, dtype: int64

In [8]:
# Map each cluster label to its gender value counts and hand the mapping
# to plotly express for a quick bar chart.
cluster_gender = dict([
    ('cluster 0', cluster_0['gender'].value_counts()),
    ('cluster 1', cluster_1['gender'].value_counts()),
])

px.bar(data_frame=cluster_gender)

In [9]:
# Non-null count of every column, per age division, within cluster 1.
cluster_1.groupby(by='age_division').count()

Unnamed: 0_level_0,Unnamed: 0,5k,10k,20k,half,25k,30k,35k,40k,official,pace,5k_avgSpeed,10k_avgSpeed,20k_avgSpeed,25k_avgSpeed,30k_avgSpeed,35k_avgSpeed,40k_avgSpeed,rank_overall,rank_in_gender_only,rank_in_division,age,gender,country,state,city,cluster
age_division,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
0.0,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795,3795
0.1,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109,2109
0.2,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618,2618
0.3,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905,2905
0.4,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458,2458
0.5,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768,1768
0.6,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097
0.7,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529,529
0.8,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157,157
0.9,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41


In [10]:
# Rows per age division in cluster 0 (count of the `cluster` column only).
age_cluster_0 = cluster_0.groupby('age_division')['cluster'].count()
px.bar(data_frame=age_cluster_0)



In [11]:
# Rows per age division in cluster 1 (count of the `cluster` column only).
age_cluster_1 = cluster_1.groupby('age_division')['cluster'].count()
px.bar(data_frame=age_cluster_1)

In [12]:
# Same per-division counts for cluster 1, shown as a plain Series.
cluster_1.groupby('age_division')['cluster'].count()

age_division
0.0    3795
0.1    2109
0.2    2618
0.3    2905
0.4    2458
0.5    1768
0.6    1097
0.7     529
0.8     157
0.9      41
1.0       7
Name: cluster, dtype: int64

# Row counts per age division for each cluster, drawn as two stacked subplots
# (cluster 1 in the top row, cluster 0 in the bottom row).
age_cluster_0 = cluster_0.groupby('age_division').count().cluster
age_cluster_1 = cluster_1.groupby('age_division').count().cluster

fig = make_subplots(rows=2, cols=1,
                    subplot_titles=('cluster 1', 'cluster 0'))

# Each trace takes its x values from its own index, so every bar stays paired
# with its age division even if the two clusters do not contain the same set
# of divisions.
trace_cluster_0 = go.Bar(x=age_cluster_0.index, y=age_cluster_0, name='cluster 0')
trace_cluster_1 = go.Bar(x=age_cluster_1.index, y=age_cluster_1, name='cluster 1')

# Subplot placement is an argument of add_trace; go.Bar has no row/col
# properties and raises if they are passed to its constructor.
fig.add_trace(trace_cluster_0, row=2, col=1)
fig.add_trace(trace_cluster_1, row=1, col=1)

fig.show()

# 1. Gender 

In [13]:
# Gender counts per cluster as a single table (rows: gender, columns: cluster).
# A gender that does not occur in a cluster gets 0 instead of NaN.
cluster_gender = pd.DataFrame(data={
    'cluster 0': cluster_0['gender'].value_counts(),
    'cluster 1': cluster_1['gender'].value_counts(),
}).fillna(0)

In [14]:
# Display the gender-by-cluster count table.
cluster_gender

Unnamed: 0,cluster 0,cluster 1
F,14164.0,0.0
M,0.0,17484.0


In [15]:
# Grouped bar chart of the gender counts: one bar series per cluster.
fig = go.Figure()

# Take the categories from the table's own index so the labels always follow
# the row order of the counts instead of relying on a hand-written list.
x = list(cluster_gender.index)

# Named traces give the legend meaningful entries instead of "trace 0/1".
trace0 = go.Bar(x=x, y=cluster_gender['cluster 0'], name='cluster 0')
trace1 = go.Bar(x=x, y=cluster_gender['cluster 1'], name='cluster 1')

fig.add_trace(trace0)
fig.add_trace(trace1)

fig.update_layout(barmode='group',
                  title='Clusters - Gender',
                  xaxis_title='gender',
                  yaxis_title='count')

fig.show()

# 2. Age Division

In [16]:
# Age-division counts per cluster as a single table (rows: age division,
# columns: cluster). A division missing from a cluster gets 0 instead of NaN.
cluster_age = pd.DataFrame(data={
    'cluster 0': cluster_0['age_division'].value_counts(),
    'cluster 1': cluster_1['age_division'].value_counts(),
}).fillna(0)

In [17]:
# Display the age-division-by-cluster count table.
cluster_age

Unnamed: 0,cluster 0,cluster 1
0.0,4831,3795
0.1,2069,2109
0.2,2304,2618
0.3,2270,2905
0.4,1463,2458
0.5,751,1768
0.6,337,1097
0.7,117,529
0.8,18,157
0.9,3,41


In [18]:
# Grouped bar chart of the age-division counts: one bar series per cluster.
fig = go.Figure()

# Use the table's own index (the age_division values) for the x axis. A
# hand-written 0..10 list only lines up while the table has exactly eleven
# divisions, and it hides the real division values from the reader.
x = list(cluster_age.index)

# Named traces give the legend meaningful entries instead of "trace 0/1".
trace0 = go.Bar(x=x, y=cluster_age['cluster 0'], name='cluster 0')
trace1 = go.Bar(x=x, y=cluster_age['cluster 1'], name='cluster 1')

fig.add_trace(trace0)
fig.add_trace(trace1)

fig.update_layout(barmode='group',
                  title='Clusters - Age Division',
                  xaxis_title='age_division',
                  yaxis_title='count')

fig.show()