In [1]:
'''General Libraries'''
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', 500)

'''Statistic'''
import scipy
from scipy import stats
from scipy.stats import norm
import datetime

'''Scikit Learn'''
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV

from sklearn.cluster import KMeans, OPTICS 
from sklearn.metrics import silhouette_samples, silhouette_score

'''Ploting Libraries'''
from plotly.offline import iplot, plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.renderers.default = "iframe" 

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import style 
style.use("fivethirtyeight") 

import seaborn as sns
sns.set_palette('RdBu')

import warnings
warnings.filterwarnings("ignore")

'''Miscellaneous'''

'''Seeds'''
import random
random.seed(10)
np.random.seed(11)

In [22]:
df = pd.read_csv('data_Preprocessed_noneOnehot.csv')

In [23]:
df.head()

Unnamed: 0,official,age,gender,country
0,0.012196,0.460317,M,JPN
1,0.153041,0.238095,F,KEN
2,0.0,0.365079,M,RSA
3,0.157316,0.095238,F,ETH
4,0.001652,0.349206,M,JPN


In [24]:
df.dtypes

official    float64
age         float64
gender       object
country      object
dtype: object

# K-Prototypes Clustering

In [8]:
# Get indices of categorical columns

categories = ['gender', 'country']
categories = [df.columns.get_loc(col) for col in categories if col in df]

print(categories)

[2, 3]


In [25]:
# K-Prototypes
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes

n_clusters = 6   # where 6 is the optimum K got in CLUSTERING_kmeans notebook.

# Clustering
clusterer = KPrototypes(n_clusters=n_clusters, init='Huang', n_init=5, verbose=1,
                        n_jobs=-1,
                        max_iter=500,
                        gamma=0.1)
# The labels
cluster_labels = clusterer.fit_predict(df, categorical=categories)


Best run was number 5


# Evaluation by Spliting Numericals and Categories

In [26]:
df_num = df[['official', 'age']]

df_cat = df[['gender', 'country']]

In [27]:
df_cat = pd.get_dummies(df_cat)

In [31]:
# calculate the Silhouette score for K-means and K-modes sides 
silScore_KMeans = silhouette_score(df_num, cluster_labels, metric='euclidean')
silScore_KModes = silhouette_score(df_cat, cluster_labels, metric='hamming')

silScore = (silScore_KMeans + silScore_KModes) / 2

In [33]:
print('Silhouette score of KMeans for Numericals: ', silScore_KMeans)
print('Silhouette score of KModes for Categoricals with Onehot: ', silScore_KModes)
print('Average Silhouette score of KPrototypes: ', silScore)

Silhouette score of KMeans for Numericals:  0.07023969240902925
Silhouette score of KModes for Categoricals with Onehot:  -0.36697093772632533
Average Silhouette score of KPrototypes:  -0.14836562265864806


Why the score is so bad??

In [None]:
# Experiments: onehot encode entire data then using hamming metric.

df_test = pd.get_dummies(df)

score = silhouette_score(df_test, cluster_labels, metric='hamming')
print('Silhouette score: ', score)

## Evaluate with HEOM -- didn't work

In [19]:
# Label Encoding of categories to compute distances with the HEOM metrics
lb = LabelEncoder()
test['gender'], test['country'] = lb.fit_transform(df['gender']), lb.fit_transform(df['country'])

test

Unnamed: 0,official,age,gender,country
0,0.012196,0.460317,1,41
1,0.153041,0.238095,0,42
2,0.000000,0.365079,1,59
3,0.157316,0.095238,0,26
4,0.001652,0.349206,1,41
...,...,...,...,...
31629,0.343982,0.222222,1,74
31630,0.582055,0.253968,1,74
31631,0.528235,0.047619,0,74
31632,0.582606,0.317460,0,74


In [None]:
# Sihouette score

## A custom metric
from distython import HEOM # a custom metric class
heom_metric = HEOM(test, cat_ix=categories, normalised='normal')

## Sihouette --- CHẠY 12 TIẾNG KO XONG 
#score = silhouette_score(df, cluster_labels, metric = heom_metric.heom)
print('Silhouette score of k-prototypes clustering: ', score)

The computation of Silhouette score never ends. It took over 12 hours, over my patience, so I shut it down.

I am searching for the why.

# Save to csv
cluster = pd.DataFrame(data=cluster_labels, columns=['cluster'])
cluster.to_csv('clusters_OPTICS.csv', index=False)

# Testing


In [None]:

# Save to csv
cluster = pd.DataFrame(data=cluster_labels, columns=['cluster'])
cluster.to_csv('clusterLabel_kprototypes_6.csv', index=False)

In [17]:
test = df.copy()

In [18]:
test

Unnamed: 0,official,age,gender,country
0,0.012196,0.460317,M,JPN
1,0.153041,0.238095,F,KEN
2,0.000000,0.365079,M,RSA
3,0.157316,0.095238,F,ETH
4,0.001652,0.349206,M,JPN
...,...,...,...,...
31629,0.343982,0.222222,M,USA
31630,0.582055,0.253968,M,USA
31631,0.528235,0.047619,F,USA
31632,0.582606,0.317460,F,USA


In [19]:
lb = LabelEncoder()

test['gender'], test['country'] = lb.fit_transform(df['gender']), lb.fit_transform(df['country'])

test

Unnamed: 0,official,age,gender,country
0,0.012196,0.460317,1,41
1,0.153041,0.238095,0,42
2,0.000000,0.365079,1,59
3,0.157316,0.095238,0,26
4,0.001652,0.349206,1,41
...,...,...,...,...
31629,0.343982,0.222222,1,74
31630,0.582055,0.253968,1,74
31631,0.528235,0.047619,0,74
31632,0.582606,0.317460,0,74


In [21]:
# Importing a custom metric class
from distython import HEOM

# Declare the HEOM with a correct NaN equivalent value
heom_metric = HEOM(test, cat_ix=categories, normalised='normal')

# Declare NearestNeighbor and link the metric
score = silhouette_score(test, cluster_labels, metric = heom_metric.heom)


print(score)

KeyboardInterrupt: 