In [13]:
from datasets import dutch_data
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

from sklearn.decomposition import PCA
import pandas as pd

import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

In [4]:
# Load the dataset through the project-local `dutch_data` helper. Later cells treat the
# result as a DataFrame that has a 'target' column alongside the feature columns.
df = dutch_data()

In [6]:
# Split the frame into labels and features: 'target' is the label column,
# every remaining column is used as a clustering feature.
train_y = df['target']
train_X = df.drop(columns='target')

In [7]:
# Display the feature matrix (rows are exemplars, columns are features).
train_X

features,bought in a store,breaks easily,can also be used out of the kitchen,can be a status symbol,can be automatized,can be carried,can be dangerous,can be dishwashed,can be electronic,can be found in a garage,...,you can earn money with it,you can injure someone with it,you can kill someone with it,you can transport things with it,you have to be able to read notes for it,you have to be intelligent to play it,you have to blow on it,you have to have talent to play it,you have to learn how to use it,you need a weapon license for it
exemplar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
(hot air) balloon,1,0,3,0,0,0,3,0,0,0,...,2,0,0,0,0,1,0,0,4,0
accordion,4,1,3,0,0,1,0,1,0,0,...,3,0,0,0,3,2,0,3,3,0
airplane,1,0,3,3,0,0,4,0,0,0,...,3,1,1,3,0,1,0,0,3,0
anvil,4,0,3,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
apron,4,0,4,0,0,4,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
whisk,4,0,0,0,0,1,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
wire brush,4,0,3,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
wok,4,0,0,0,0,1,0,4,0,0,...,0,0,0,0,0,0,0,0,1,0
wrench,4,0,3,0,0,1,0,1,0,2,...,0,1,0,0,0,0,0,0,2,0


In [8]:
# Number of exemplars per category. `count()` reports non-null values per column,
# so each row repeats the group size once for every feature column.
df.groupby('target').count()

features,bought in a store,breaks easily,can also be used out of the kitchen,can be a status symbol,can be automatized,can be carried,can be dangerous,can be dishwashed,can be electronic,can be found in a garage,...,you can earn money with it,you can injure someone with it,you can kill someone with it,you can transport things with it,you have to be able to read notes for it,you have to be intelligent to play it,you have to blow on it,you have to have talent to play it,you have to learn how to use it,you need a weapon license for it
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
clothing,29,29,29,29,29,29,29,29,29,29,...,29,29,29,29,29,29,29,29,29,29
kitchen utensil,33,33,33,33,33,33,33,33,33,33,...,33,33,33,33,33,33,33,33,33,33
musical instrument,27,27,27,27,27,27,27,27,27,27,...,27,27,27,27,27,27,27,27,27,27
tool,29,29,29,29,29,29,29,29,29,29,...,29,29,29,29,29,29,29,29,29,29
vehicle,30,30,30,30,30,30,30,30,30,30,...,30,30,30,30,30,30,30,30,30,30
weapon,18,18,18,18,18,18,18,18,18,18,...,18,18,18,18,18,18,18,18,18,18


## K-means

In [9]:
# K-means on the full feature matrix. Six clusters are requested, one per category
# present in `target`; 50 random initialisations are tried with a fixed seed.
kmeans = (
    KMeans(
        n_clusters=6,
        init='random',
        n_init=50,
        max_iter=300,
        random_state=0,
    )
    .fit(train_X)
)

In [28]:
# Attach the cluster assignment of the current `kmeans` model to the original frame.
# This mutates `df` in place: re-running the train_X cell afterwards would pull
# 'clusters' in as a feature, because that cell only drops 'target'.
# NOTE(review): the execution counts show this cell ran after the PCA section rebound
# `kmeans`; re-run the notebook top to bottom to confirm which model these labels come from.
df['clusters'] = kmeans.labels_

In [29]:
# Adjusted Rand Index: the Rand Index measures the similarity between two clusterings by
# considering all pairs of samples and counting pairs that are assigned to the same or to
# different clusters in the predicted and true clusterings; the adjusted variant corrects
# that score for chance agreement. It is invariant to how the cluster ids are numbered,
# so the arbitrary k-means ids can be compared directly with the category names.
adjusted_rand_score(df['target'], df['clusters'])

0.9407058741850403

In [30]:
# Sum of squared distances from each training observation to its closest centroid.
# NOTE(review): `kmeans` is rebound in the PCA section and the execution counts show this
# cell ran after that one; confirm the value refers to the full-feature model.
kmeans.inertia_

17966.485786789468

## PCA + K-means

In [14]:
# Fit a PCA that keeps the first three principal components of the feature matrix,
# so the exemplars can be drawn in a 3-D scatter plot.
pca = PCA(n_components=3).fit(train_X)

In [15]:
# Project the feature matrix onto the fitted principal components (three per the PCA above).
transformed_train_X = pca.transform(train_X)

In [16]:
# K-means on the 3-D PCA projection, again with six clusters and a fixed seed.
# Note: this rebinds `kmeans`, replacing the model fitted on the full feature matrix.
kmeans = (
    KMeans(
        n_clusters=6,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=0,
    )
    .fit(transformed_train_X)
)

## Clusters and Centroids

In [17]:
# Plotting frame: 3-D PCA coordinates plus cluster id, true category and exemplar name.
transformed_df = pd.DataFrame(transformed_train_X, columns=['x', 'y', 'z']).assign(
    # Cluster ids are nominal. As integers Plotly Express would map them onto a continuous
    # colour scale; as strings each cluster gets its own discrete colour and legend entry.
    clusters=kmeans.labels_.astype(str),
    target=train_y.values,
    name=df.index,
    size=1,  # constant marker size for every point; size_max below controls the rendering
)

fig = px.scatter_3d(
    transformed_df, x='x', y='y', z='z',
    color='clusters', hover_data=['name'], size='size', size_max=15,
    # Keep the legend in numeric cluster order rather than order of first appearance.
    category_orders={'clusters': sorted(transformed_df['clusters'].unique(), key=int)},
)

# Overlay the fitted centroids as a separate, named trace with a distinct marker so they
# are not confused with the data points of any one cluster.
centroids_df = pd.DataFrame(kmeans.cluster_centers_, columns=['x', 'y', 'z'])
centroid_trace = px.scatter_3d(centroids_df, x='x', y='y', z='z').data[0]
centroid_trace.update(
    name='centroids',
    showlegend=True,
    marker=dict(color='black', symbol='x', size=6),
)
fig.add_trace(centroid_trace)
fig.show()

## Ground truth clusters

In [19]:
# Same PCA projection as the cluster plot, coloured by the true category instead of
# the k-means assignment so the two figures can be compared side by side.
transformed_df = pd.DataFrame(transformed_train_X, columns=['x', 'y', 'z']).assign(
    clusters=kmeans.labels_,
    target=train_y.values,
    name=df.index,
    size=1,  # every point gets the same marker size
)
fig = px.scatter_3d(
    transformed_df,
    x='x',
    y='y',
    z='z',
    color='target',
    hover_data=['name'],
    size='size',
    size_max=15,
)
fig.show()