# In this lecture I'll show kmeans on a simple dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
#Make 4 clusters
# Seed the global NumPy generator so that restarting the kernel and re-running
# this cell always produces exactly the same points.
np.random.seed(0)

# One centre per quadrant. Each set is 100 two-dimensional standard-normal
# samples shifted onto its centre.
centers = [(4, 4), (4, -4), (-4, -4), (-4, 4)]
set1, set2, set3, set4 = (np.random.randn(100, 2) + center for center in centers)

# Stack the four sets into a single (400, 2) array, in set1..set4 order.
X = np.concatenate([set1, set2, set3, set4])

In [None]:
# Pretend we didn't know these points came from different sets:
# every point goes into one scatter call, so they all share one colour.
plt.scatter(x=X[:, 0], y=X[:, 1])


In [None]:
#We want to get this
# One scatter call per generating set so matplotlib gives each set its own
# colour; the labels feed the legend so colours can be mapped back to sets.
for name, points in [("set1", set1), ("set2", set2), ("set3", set3), ("set4", set4)]:
    plt.scatter(points[:, 0], points[:, 1], label=name)
plt.legend()

# This is a clustering task. I will show the simplest clustering algorithm, kmeans.

<img src="images/Kmeans.gif">

# How to do this in sklearn?

In [None]:
from sklearn.cluster import KMeans

In [None]:
#Need to tell it number of clusters. Will talk about how to infer this.
# random_state pins the random centroid initialisation, so re-running the
# notebook gives the same clustering (and the same arbitrary cluster ids).
kmeans = KMeans(n_clusters=4, random_state=0)

### We don't really talk about a train/test split for clustering, as there are no labels and therefore no right answer to test against

In [None]:
# Learn the cluster centres from the unlabeled points.
kmeans.fit(X=X)

In [None]:
# Get one integer cluster id per point in X.
# The ids are arbitrary: there is no logic as to why cluster 3 is called '3' instead of 2 for example.
preds = kmeans.predict(X=X)
preds

In [None]:
#How did it do?
# Split X into one array per predicted cluster id (0..3) using a boolean mask.
pred0, pred1, pred2, pred3 = (X[preds == cluster_id] for cluster_id in range(4))


In [None]:
# One scatter call per predicted cluster so each gets its own colour; the
# labels feed the legend so colours can be mapped back to cluster ids.
for cluster_id, points in enumerate([pred0, pred1, pred2, pred3]):
    plt.scatter(points[:, 0], points[:, 1], label=f"cluster {cluster_id}")
plt.legend()

In [None]:
#In real life we don't have labels for clustering, so we can't measure performance against a known right answer



# Kmeans, pros and cons
<img src="images/pros.png">

# How to infer number of clusters?

<img src="images/ElbowRule.png">

Distortion measures how far points are from their cluster center on average. We want to see where we get diminishing returns. As the number of clusters goes up, distortion of course goes down. Why? Think of the case where the number of clusters is the same as the number of data points.

In [None]:
# Fit one model per candidate cluster count (1..9) and record its distortion.
# The loop uses its own name for the model so that the 4-cluster `kmeans`
# fitted earlier is not silently replaced by the last (9-cluster) model.
scores = []
for i in range(1, 10):
    candidate = KMeans(n_clusters=i, random_state=0)
    candidate.fit(X)
    # score() is the negative of the k-means objective, so flip the sign to
    # get a positive distortion value.
    scores.append(-candidate.score(X))

In [None]:
# x-axis: the cluster counts that were tried, 1..len(scores), derived from
# `scores` so it cannot drift out of sync with the loop that filled it.
# y-axis: the distortion recorded for each count.
plt.plot(np.arange(1, len(scores) + 1), scores)
plt.xlabel("number of clusters")
plt.ylabel("distortion")

In [None]:
#4 clusters is best for this task


If we want to automate this, we can look at the differences between consecutive scores. Play around with this yourself if you want.