# 2. Clustering Analysis

In [41]:
from sklearn.metrics import *
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import kneighbors_graph
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [42]:
# Load the scaled HR dataset; the first CSV column is used as the row index.
df_01_scaled = pd.read_csv(
    'data/HR_comma_sep_01_scaled.csv',
    index_col=0,
)
# Every column of the frame is used as a clustering attribute; the estimators
# below work on a plain NumPy matrix (KMeans uses Euclidean distance).
train_data = df_01_scaled.to_numpy(copy=True)
# Preview the first rows as the cell output.
df_01_scaled.head(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,sales,salary
0,0.318681,0.265625,0.0,0.285047,0.125,0.0,1.0,0.0,0.777778,0.5
1,0.78022,0.78125,0.6,0.775701,0.5,0.0,1.0,0.0,0.777778,1.0
2,0.021978,0.8125,1.0,0.82243,0.25,0.0,1.0,0.0,0.777778,1.0
3,0.692308,0.796875,0.6,0.593458,0.375,0.0,1.0,0.0,0.777778,0.5
4,0.307692,0.25,0.0,0.294393,0.125,0.0,1.0,0.0,0.777778,0.5


## K-means

### Choosing the best value of k

In [44]:
# Sweep the number of clusters and record, for every k, the K-means sum of
# squared errors (inertia) and the silhouette score. Both lists are plotted
# against k in the next cell.
max_k = 50

# Start from empty lists: pre-filling with zeros and then appending would leave
# bogus leading zeros and make the lists longer than the plotted x-axis.
sse_list = []
silhouette_list = []

# k runs over 2 .. max_k - 1 so that the lists line up element-for-element
# with the `range(2, max_k)` x-axis used by the plotting cell.
for k in range(2, max_k):
    kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10, max_iter=100)
    kmeans.fit(train_data)
    sse_list.append(kmeans.inertia_)
    # NOTE: silhouette_score works on pairwise distances and is the slow step
    # of this loop; its `sample_size` argument can be used to subsample.
    silhouette_list.append(silhouette_score(train_data, kmeans.labels_))

In [None]:
# Plot SSE and silhouette score against k on one figure. The two curves share
# the x-axis but each has its own y-axis (twin axes), since their scales differ.
# first axis: SSE
fig, ax1 = plt.subplots()
ax1.plot(range(2, max_k), sse_list, 'b')
ax1.set_xlabel('K')
ax1.set_ylabel('SSE', color='b')
ax1.tick_params('y', colors='b')
# second axis: silhouette score
ax2 = ax1.twinx()
ax2.plot(range(2, max_k), silhouette_list, 'orange')
ax2.set_ylabel('Silhouette score', color='orange')
ax2.tick_params('y', colors='orange')

plt.title('K-Means: Sum of Squared Error vs Number of Clusters')
fig.tight_layout()
# Save before show(): once the figure has been shown, a later plt.savefig()
# writes out a new, empty figure instead of this one.
fig.savefig('images/sse_silhouette_vs_k')
plt.show()
# One way to choose the best k is to look for the 'elbow point', i.e. the value
# of k after which the SSE stops decreasing significantly.

In [None]:
# Fit the final model for the chosen k; its labels and centroids are analysed
# in the cells below.
# - random_state is fixed so cluster numbering (used in legends and centroid
#   labels) is reproducible across kernel restarts.
# - n_init is passed explicitly, as in the sweep above, because its default
#   depends on the installed scikit-learn version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
kmeans.fit(train_data)

In [None]:
# 2D PCA highlighting the clusters
pca = PCA(n_components=2)
X_scaled_2D = pca.fit_transform(df_01_scaled)
colors = ['c', 'm', 'g', 'r', 'k', 'y', 'k', 'b']
# Colours actually used for clusters 0..4 in the scatter plot.
cluster_colors = ['c', 'm', 'g', 'r', 'y']

fig, ax = plt.subplots()
# One scatter call per cluster (boolean mask over the rows) instead of one call
# per data point: far fewer artists, and the legend no longer depends on every
# cluster having at least one point.
for label, color in zip(range(kmeans.n_clusters), cluster_colors):
    in_cluster = kmeans.labels_ == label
    ax.scatter(
        X_scaled_2D[in_cluster, 0],
        X_scaled_2D[in_cluster, 1],
        c=color,
        label='Cluster %d' % (label + 1),
    )

ax.legend()
ax.set_title('2D-PCA combined with K-means')
# Save before show(): once the figure has been shown, a later plt.savefig()
# writes out a new, empty figure instead of this one.
fig.savefig('images/pca_kmeans.png')
plt.show()
# TODO plot centroids

## Analysis of k centroids

In [None]:
# Centroid profiles: one line per cluster, showing the centroid's value on each
# attribute (x-axis = columns of the scaled data frame).
plt.figure(1, figsize=(20, 10))
feature_positions = range(train_data.shape[1])
for i, centroid in enumerate(kmeans.cluster_centers_):
    plt.plot(feature_positions, centroid, label='Cluster %s' % i, linewidth=3)
plt.xticks(feature_positions, list(df_01_scaled.columns))
plt.legend()
# Save before show(): once the figure has been shown, a later plt.savefig()
# writes out a new, empty figure instead of this one.
plt.savefig('images/k-centroids_analysis')
plt.show()