In [1]:
'''General Libraries'''
import numpy as np 
import pandas as pd 
pd.set_option('display.max_columns', 500)

'''Statistic'''
import scipy
from scipy import stats
from scipy.stats import norm
import datetime

'''Scikit Learn'''
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV

from sklearn.cluster import KMeans, OPTICS 
from sklearn.metrics import silhouette_samples, silhouette_score

'''Ploting Libraries'''
from plotly.offline import iplot, plot
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import plotly.io as pio
pio.renderers.default = "iframe" 

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import style 
style.use("fivethirtyeight") 

import seaborn as sns
sns.set_palette('RdBu')

import warnings
warnings.filterwarnings("ignore")

'''Miscellaneous'''

'''Seeds'''
import random
random.seed(10)
np.random.seed(11)

# The Dataset

In [2]:
df = pd.read_csv('data_Preprocessed_noneOnehot.csv')

In [3]:
df.head()

Unnamed: 0,official,age,gender,country
0,0.012196,0.460317,M,JPN
1,0.153041,0.238095,F,KEN
2,0.0,0.365079,M,RSA
3,0.157316,0.095238,F,ETH
4,0.001652,0.349206,M,JPN


In [4]:
df.dtypes

official    float64
age         float64
gender       object
country      object
dtype: object

# K-Prototypes Clustering

In [5]:
# Get indices of categorical columns, a arguments for k-prototypes
categories = ['gender', 'country']
categories = [df.columns.get_loc(col) for col in categories if col in df]

print(categories)

[2, 3]


In [6]:
# K-Prototypes
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes

n_clusters = 5   # where 6 is the optimum K got in CLUSTERING_kmeans notebook.

## Clustering
clusterer = KPrototypes(n_clusters=n_clusters, init='Huang', n_init=5, verbose=1,
                        n_jobs=-1,
                        max_iter=500,
                        gamma=0.1)
## The labels
cluster_labels = clusterer.fit_predict(df, categorical=categories)

Best run was number 4


# Evaluation with Onehot Encoding

***Experiment 1:***  
Divide features into: a Numerical set and Onehot encoding set, then compute scores respectively.


In [7]:
# Experiment 1: splitting
df_num = df[['official', 'age']]

df_cat = df[['gender', 'country']]
df_cat = pd.get_dummies(df_cat)

In [8]:
# Calculate the Silhouette score each part
silScore_num = silhouette_score(df_num, cluster_labels, metric='euclidean')
silScore_cate = silhouette_score(df_cat, cluster_labels, metric='hamming')

# The final score
silScore = (silScore_num + silScore_cate) / 2

# 
print('Silhouette score of KMeans for Numericals: ', silScore_num)
print('Silhouette score of KModes for Categoricals with Onehot: ', silScore_cate)
print('Average Silhouette score of KPrototypes: ', silScore)

Silhouette score of KMeans for Numericals:  0.07593431691940622
Silhouette score of KModes for Categoricals with Onehot:  -0.16244293184388328
Average Silhouette score of KPrototypes:  -0.04325430746223853


***Experiment 2:***  
We run onehot encoding over all of columns then scoring the whole data with the Hamming metric.

In [9]:
# Experiments 2: the whole
df_test = pd.get_dummies(df)

score = silhouette_score(df_test, cluster_labels, metric='hamming')
print('Silhouette score: ', score)

Silhouette score:  -0.015963802260992324


In [10]:
print('Silhouette score: ', score)

Silhouette score:  -0.015963802260992324


***Conclusion***  
Why the score is so bad??

## Evaluate with HEOM -- didn't work

 ***HEOM - Heterogeneous Euclidean-Overlap Metric***. 
 As Silhouette_score of sklearn doesn't work with categorical values, or more concisely, its metric functions don't work with categories, so I need to find another measure. And then I found this idea of HEOM on medium.com and decide to give it a shot.

In [11]:
test = df.copy()

# Label Encoding of categories to compute distances with the HEOM metrics
lb = LabelEncoder()
test['gender'], test['country'] = lb.fit_transform(df['gender']), lb.fit_transform(df['country'])

test

Unnamed: 0,official,age,gender,country
0,0.012196,0.460317,1,41
1,0.153041,0.238095,0,42
2,0.000000,0.365079,1,59
3,0.157316,0.095238,0,26
4,0.001652,0.349206,1,41
...,...,...,...,...
31629,0.343982,0.222222,1,74
31630,0.582055,0.253968,1,74
31631,0.528235,0.047619,0,74
31632,0.582606,0.317460,0,74


In [12]:
# Sihouette score with HEOM

## A custom metric
from distython import HEOM # a custom metric class
heom_metric = HEOM(test, cat_ix=categories, normalised='  normal')

## 
score = silhouette_score(test, cluster_labels, metric = heom_metric.heom)
print('Silhouette score of k-prototypes clustering: ', score)

KeyboardInterrupt: 

The computation of Silhouette score never ends. It took over 12 hours, over my patience, so I shut it down.

I am searching for the why.

# Verdict
With k = 5, which inherit from optimum k of K-means, K-prototypes didn't score very well, or badly indeed. Its score is close to 0.