In [1]:
'''General Libraries'''
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', 500)

'''Statistic'''
import scipy
from scipy import stats
from scipy.stats import norm
import datetime

'''Scikit Learn'''
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV

'''Ploting Libraries'''
from plotly.offline import iplot, plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.renderers.default = "notebook" 


import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
sns.set_palette('RdBu')

'''Miscellaneous'''
from yellowbrick.regressor import ResidualsPlot
import missingno as msno

'''Seeds'''
import random
random.seed(10)
np.random.seed(11)


In [2]:
df = pd.read_csv('cleanData_to_kPrototypes.csv')

In [3]:
df.head()

Unnamed: 0,5k,10k,20k,half,25k,30k,35k,40k,official,pace,5k_avgSpeed,10k_avgSpeed,20k_avgSpeed,25k_avgSpeed,30k_avgSpeed,35k_avgSpeed,40k_avgSpeed,rank_overall,rank_in_gender_only,rank_in_division,age,age_division,gender,country,state
0,0.003409,0.007832,0.008234,0.008508,0.007689,0.008533,0.010421,0.010017,0.010147,0.010863,0.003409,0.007832,0.008234,0.007689,0.008533,0.010421,0.010017,0.000219,0.000398,0.001003,0.460317,0.3,M,JPN,others
1,0.106944,0.166667,0.157928,0.158276,0.150604,0.134029,0.128172,0.129555,0.127324,0.12693,0.106944,0.166667,0.157928,0.150604,0.134029,0.128172,0.129555,0.000626,0.0,0.0,0.238095,0.0,F,KEN,others
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.365079,0.2,M,RSA,others
3,0.106692,0.166562,0.157928,0.158276,0.150604,0.134029,0.128172,0.131521,0.130881,0.130932,0.106692,0.166562,0.157928,0.150604,0.134029,0.128172,0.131521,0.000814,0.000114,0.000287,0.095238,0.0,F,ETH,others
4,0.003409,0.005221,0.00255,0.00287,0.002344,0.001813,0.001074,0.001434,0.001375,0.001144,0.003409,0.005221,0.00255,0.002344,0.001813,0.001074,0.001434,3.1e-05,5.7e-05,0.000143,0.349206,0.2,M,JPN,others


In [17]:
df[['age_division', 'gender', 'country', 'state']] = df[['age_division', 'gender', 'country', 'state']].astype('category')

In [18]:
df.dtypes

5k                      float64
10k                     float64
20k                     float64
half                    float64
25k                     float64
30k                     float64
35k                     float64
40k                     float64
official                float64
pace                    float64
5k_avgSpeed             float64
10k_avgSpeed            float64
20k_avgSpeed            float64
25k_avgSpeed            float64
30k_avgSpeed            float64
35k_avgSpeed            float64
40k_avgSpeed            float64
rank_overall            float64
rank_in_gender_only     float64
rank_in_division        float64
age                     float64
age_division           category
gender                 category
country                category
state                  category
dtype: object

# K-Prototypes Clustering

In [5]:
categories = ['age_division', 'gender', 'country', 'state']

categories = [df.columns.get_loc(col) for col in categories if col in df]

print(categories)

[21, 22, 23, 24]


In [17]:
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes

kp = KPrototypes(n_clusters=4, init='Huang', n_init=5, verbose=1, n_jobs=-1)
clusters = kp.fit_predict(df, categorical=categories)

# Print the cluster centroids
centroids = kp.cluster_centroids_


Best run was number 1


In [23]:
print(clusters)

[3 3 3 ... 0 0 1]


In [27]:
df['cluster'] = cluster_dict

In [30]:
df.head(20)

Unnamed: 0,5k,10k,20k,half,25k,30k,35k,40k,official,pace,5k_avgSpeed,10k_avgSpeed,20k_avgSpeed,25k_avgSpeed,30k_avgSpeed,35k_avgSpeed,40k_avgSpeed,rank_overall,rank_in_gender_only,rank_in_division,age,age_division,gender,country,state
0,0.003409,0.007832,0.008234,0.008508,0.007689,0.008533,0.010421,0.010017,0.010147,0.010863,0.003409,0.007832,0.008234,0.007689,0.008533,0.010421,0.010017,0.000219,0.000398,0.001003,0.460317,0.3,M,JPN,others
1,0.106944,0.166667,0.157928,0.158276,0.150604,0.134029,0.128172,0.129555,0.127324,0.12693,0.106944,0.166667,0.157928,0.150604,0.134029,0.128172,0.129555,0.000626,0.0,0.0,0.238095,0.0,F,KEN,others
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.365079,0.2,M,RSA,others
3,0.106692,0.166562,0.157928,0.158276,0.150604,0.134029,0.128172,0.131521,0.130881,0.130932,0.106692,0.166562,0.157928,0.150604,0.134029,0.128172,0.131521,0.000814,0.000114,0.000287,0.095238,0.0,F,ETH,others
4,0.003409,0.005221,0.00255,0.00287,0.002344,0.001813,0.001074,0.001434,0.001375,0.001144,0.003409,0.005221,0.00255,0.002344,0.001813,0.001074,0.001434,3.1e-05,5.7e-05,0.000143,0.349206,0.2,M,JPN,others
5,0.106944,0.166667,0.157928,0.158276,0.150687,0.134029,0.128172,0.131521,0.131099,0.130932,0.106944,0.166667,0.157928,0.150687,0.134029,0.128172,0.131521,0.000846,0.000171,0.00043,0.174603,0.0,F,KEN,others
6,0.007955,0.010756,0.008234,0.008508,0.007689,0.00672,0.007358,0.008629,0.008837,0.008576,0.007955,0.010756,0.008234,0.007689,0.00672,0.007358,0.008629,9.4e-05,0.000171,0.00043,0.15873,0.0,M,SUI,others
7,0.093687,0.144737,0.135564,0.135018,0.128073,0.112743,0.104868,0.108039,0.107423,0.108062,0.093687,0.144737,0.135564,0.128073,0.112743,0.104868,0.108039,0.000125,0.000228,0.000573,0.174603,0.0,M,ETH,others
8,0.003157,0.005221,0.002656,0.00287,0.002344,0.001813,0.001074,0.001388,0.001375,0.001144,0.003157,0.005221,0.002656,0.002344,0.001813,0.001074,0.001388,6.3e-05,0.000114,0.000287,0.396825,0.2,M,JPN,others
9,0.093434,0.144737,0.136414,0.136931,0.131075,0.119807,0.117436,0.124303,0.12488,0.125214,0.093434,0.144737,0.136414,0.131075,0.119807,0.117436,0.124303,0.000595,0.001081,0.002723,0.206349,0.0,M,USA,CA


In [24]:
df['cluster'] = clusters

In [21]:
df.to_csv('results_kprototypes.csv', index=False)

pca = PCA(n_components=2)
pca_result = pca.fit_transform(df[to_scale])

print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

plt.figure(figsize=(16,10))
sns.scatterplot(
    x=pca_result[:,0], y=pca_result[:, 1],
    hue=df['cluster'],
    #palette=sns.color_palette("hls", 10),
    
    legend="full",
    alpha=0.3
)

to_scale = list(df.select_dtypes(exclude='O'))




import hypertools as hyp
from scipy.linalg import toeplitz
from copy import copy

eda = hyp.plot(df[to_scale], '.')

