# KMeans

## Install Libraries

In [None]:
!pip install numpy pandas pyparsing matplotlib seaborn sklearn

## Import 

In [None]:
import numpy as np;
import pandas as pd;
import matplotlib.pyplot as plt;
import seaborn as sns;
sns.set()
from sklearn.cluster import KMeans
from sklearn import preprocessing

# Get the data

In [None]:
# Download the example dataset (CSV) into the current working directory.
!wget https://raw.githubusercontent.com/salvo-nicotra/masterupa/main/satisfactionloyalty.csv

# Read the data

In [None]:
# Load the downloaded CSV into a DataFrame; later cells read its 'Satisfaction' and 'Loyalty' columns.
data = pd.read_csv("satisfactionloyalty.csv")

# Quick view

In [None]:
# Show the first 10 rows as a sanity check of the loaded data.
data.head(10)

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()

# A picture is worth a thousand words

In [None]:
# Scatter the raw data: one point per row, Satisfaction on the x-axis and Loyalty on the y-axis.
fig, ax = plt.subplots()
ax.scatter(data['Satisfaction'], data['Loyalty'])
ax.set_xlabel('Satisfaction')
ax.set_ylabel('Loyalty')
plt.show()

# Clustering

[Sklearn](https://scikit-learn.org/stable/modules/clustering.html#k-means) 

[Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans)

In [None]:
# Work on a copy so that columns added to `x` later (the cluster labels) leave `data` untouched.
x=data.copy()
x.head()

# Quick Clustering

In [None]:
# Two-cluster model. A fixed random_state makes the centroid initialisation (and therefore the
# cluster numbering and plot colours) reproducible across runs; n_init is set explicitly because
# its default differs between scikit-learn versions.
kmeans = KMeans(2, n_init=10, random_state=42)

# Clustering Results

fit_predict: Compute cluster centers and predict cluster index for each sample.

In [None]:
# Fit on the two feature columns only. Passing the whole of `x` would, when this cell is re-run,
# include the previously written 'cluster_pred' column as a third feature and distort the clusters.
# Selecting the columns explicitly also fixes the centroid column order (0 = Satisfaction,
# 1 = Loyalty) that the plotting cell relies on.
x['cluster_pred'] = kmeans.fit_predict(x[['Satisfaction', 'Loyalty']])

Get the centroids

In [None]:
# Coordinates of the fitted cluster centres: one row per cluster, one column per feature.
centroids = kmeans.cluster_centers_

In [None]:
# Display the centroid coordinates.
centroids

Get Labels

In [None]:
# labels_ holds the cluster index assigned to each sample during fitting;
# np.unique reduces it to the sorted set of distinct cluster ids.
labels = kmeans.labels_
unique_labels = np.unique(labels)
unique_labels

Label of column containing prediction

In [None]:
# Name of the DataFrame column that stores the predicted cluster id; used by the plotting cells.
label = 'cluster_pred'

# Plot

In [None]:
# Draw every cluster's members and its centroid (star marker) in the same colour.
figure_name = plt.figure(figsize=(10, 10))
colors = ['#DF2020', '#81DF20']
for cluster_id in unique_labels:
    members = x[x[label] == cluster_id]
    colour = colors[cluster_id]
    plt.scatter(members['Satisfaction'], members['Loyalty'], label=cluster_id, c=colour)
    # zorder keeps the centroid star on top of the data points.
    centre_x, centre_y = centroids[cluster_id, 0], centroids[cluster_id, 1]
    plt.scatter(centre_x, centre_y, marker="*", s=150, linewidths=5, zorder=10, c=colour)
plt.legend()
plt.xlabel('Satisfaction')
plt.ylabel('Loyalty')
plt.show()

# We can do better, can't we?

## 1. Standardizing the variables

In [None]:
# Standardize each column of `data` (zero mean, unit variance) so both features contribute
# comparably to the distances K-Means uses. The result is a NumPy array, not a DataFrame.
x_scaled=preprocessing.scale(data)

In [None]:
# Display the standardized values.
x_scaled

In [None]:
# Scatter the standardized data (column 0 = Satisfaction, column 1 = Loyalty).
# No `cmap` is passed: without a `c` array there is nothing to colour-map, so matplotlib would
# ignore it and emit a warning.
plt.scatter(x_scaled[:, 0], x_scaled[:, 1])
plt.xlabel('Satisfaction')
plt.ylabel('Loyalty')
plt.show()

## 2. Optimization with the Elbow Method

inertia_: Sum of squared distances of samples to their closest cluster center.

In [None]:
# Fit K-Means for k = 1..29 on the standardized data and record the within-cluster sum of
# squares (inertia_) for each k.
# A separate name (`kmeans_k`) is used so the fitted 2-cluster `kmeans` model from earlier is not
# overwritten, and a fixed random_state makes the resulting curve reproducible.
wcss = []
for i in range(1, 30):
    kmeans_k = KMeans(i, n_init=10, random_state=42)
    kmeans_k.fit(x_scaled)
    wcss.append(kmeans_k.inertia_)

In [None]:
# Display the recorded inertia values, one per tested number of clusters.
wcss

# Visualize elbow

In [None]:
# Plot WCSS against the number of clusters. The x-range is derived from len(wcss) (the first
# entry corresponds to k = 1) so the plot stays in sync if the tested range of k changes.
plt.plot(range(1, len(wcss) + 1), wcss)
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

The number of clusters is...

In [None]:
# Number of clusters used for the final model (presumably read off the elbow plot above).
ncluster=4

So let's start again

In [None]:
# Final model with the chosen number of clusters, fitted once on the standardized data.
# fit_predict both fits the model and returns the cluster id of every sample, so no separate
# fit() call is needed. random_state keeps the cluster numbering stable between runs, which
# the colour-based interpretation of the plot depends on.
kmeans_new = KMeans(ncluster, n_init=10, random_state=42)
cluster_new = pd.DataFrame(x_scaled, columns=['Satisfaction', 'Loyalty'])
cluster_new['cluster_pred'] = kmeans_new.fit_predict(x_scaled)

In [None]:
# Display the standardized features together with the assigned cluster id.
cluster_new

In [None]:
# Centroids of the final model, in standardized coordinates (the model was fitted on x_scaled).
# Note: this rebinds `centroids`, replacing the centroids of the earlier 2-cluster model.
centroids = kmeans_new.cluster_centers_

In [None]:
# Display the centroid coordinates of the final model.
centroids

In [None]:
# Cluster id assigned to each sample by the final model, and the sorted distinct ids.
labels = kmeans_new.labels_
unique_labels = np.unique(labels)

In [None]:
# Display the distinct cluster ids.
unique_labels

# Plot the new clusters

In [None]:
# Draw every cluster of the final model and its centroid (semi-transparent star) in one colour.
# `colors` is indexed by cluster id, so it needs one entry per cluster.
figure_name = plt.figure(figsize=(10, 10))
colors = ['red', 'purple','blue','green']
for cluster_id in unique_labels:
    members = cluster_new[cluster_new[label] == cluster_id]
    colour = colors[cluster_id]
    plt.scatter(members['Satisfaction'], members['Loyalty'], label=cluster_id, c=colour)
    # zorder keeps the centroid star on top of the data points.
    centre_x, centre_y = centroids[cluster_id, 0], centroids[cluster_id, 1]
    plt.scatter(centre_x, centre_y, marker="*", s=150, linewidths=5, zorder=10, c=colour, alpha=0.6)
plt.legend()
plt.xlabel('Satisfaction')
plt.ylabel('Loyalty')
plt.show()

# Results
1. The xxx dots are the people who are less satisfied and less loyal and therefore can be termed as alienated.
2. The yyy dots are people with high loyalty and less satisfaction.
3. The zzz dots are the people with high loyalty and high satisfaction and they are the fans.
4. The www dots are the people who are in the midst of things.

# Make a new prediction

In [None]:
# A single new observation as a one-row DataFrame. The coordinates are numeric (floats) rather
# than strings so they can be used directly as model input.
new = pd.DataFrame({'Satisfaction': [-1.5], 'Loyalty': [-1.5]})

In [None]:
# Point to classify, as one row of [Satisfaction, Loyalty]. The model it is passed to below was
# fitted on the standardized data, so these values are in standardized units.
# This assignment replaces the `new` created in the previous cell.
new = np.array([[1.5,1.5]])

In [None]:
# Display the point that will be classified.
new

In [None]:
# Assign the new point to the nearest centroid of the final model; returns an array of cluster ids.
prediction = kmeans_new.predict(new)

In [None]:
# Translate the predicted cluster id into the colour used for that cluster in the plot above.
colors[prediction[0]]

# References
- https://towardsdatascience.com/visualizing-clusters-with-pythons-matplolib-35ae03d87489