# Clustering

Clustering is an unsupervised learning technique that groups data points by similarity, without using any labels.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Load and plot data

In [None]:
# Load the iris measurements from CSV; the relative path is resolved against
# the notebook's working directory.
df = pd.read_csv('../data/iris.csv')

In [None]:
# Preview the first few rows to check the column names and values.
df.head()

In [None]:
# Unlabelled overview: sepal length against petal length for every flower.
text_style = {'size': 20}  # shared font size for the title and both axis labels

plt.figure(figsize=(15, 10))
plt.scatter(x=df['sepal_length_cm'], y=df['petal_length_cm'])
plt.title('Iris Flowers', **text_style)
plt.xlabel('Sepal Length', **text_style)
plt.ylabel('Petal Length', **text_style)

## Encode labels

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encoder that maps class labels to integer codes.
le = LabelEncoder()

In [None]:
# Learn the distinct values of the 'target' column and replace each row's
# label with its integer code.
target_n = le.fit_transform(df.target)

In [None]:
# Display the integer-encoded labels.
target_n

In [None]:
# Original label values; a label's position in this array is its integer code.
le.classes_

In [None]:
# Scatter of sepal length vs. petal length, coloured by the encoded class label.
plt.figure(figsize=(15,10))

# Keep the returned collection so per-class legend handles can be derived from it.
points = plt.scatter(df.sepal_length_cm, df.petal_length_cm, c=target_n)
plt.title('Iris Flowers')
plt.xlabel('Sepal Length')
plt.ylabel('Petal Length')

# A single scatter call produces one artist, so plt.legend(le.classes_) would
# label only the first class. legend_elements() returns one handle per distinct
# colour value in ascending order, which lines up with le.classes_ (where the
# index is the encoded label).
handles, _ = points.legend_elements()
plt.legend(handles, list(le.classes_), loc = 'best')

## K-Means clustering

In [None]:
from sklearn.cluster import KMeans

# Cluster on all four measurements; the column order here fixes the column
# order of km.cluster_centers_.
X = df[['sepal_length_cm','sepal_width_cm','petal_length_cm','petal_width_cm']]

# Three clusters. random_state pins the centroid initialisation so cluster ids
# and centres are reproducible between runs; n_init is set explicitly because
# its default differs across scikit-learn releases.
km = KMeans(n_clusters=3, n_init=10, random_state=0)
km.fit(X)

In [None]:
# Centroids found by the fit: one row per cluster, one column per feature,
# in the same column order as X.
centers = km.cluster_centers_
centers

In [None]:
# Side-by-side comparison: true classes on the left, K-Means assignments on the right.
sepal_len = df['sepal_length_cm']
petal_len = df['petal_length_cm']

# (subplot position, point colours, panel title)
panels = [
    (121, target_n, 'True Labels'),
    (122, km.labels_, 'K-Means Clusters'),
]

plt.figure(figsize=(14, 6))
for position, colours, heading in panels:
    plt.subplot(position)
    plt.scatter(sepal_len, petal_len, c=colours)
    plt.xlabel('Sepal Length (cm)')
    plt.ylabel('Petal length (cm)')
    plt.title(heading)

# The right-hand panel is still the current axes: overlay the centroids there.
# Columns 0 and 2 of the centres are sepal length and petal length.
plt.scatter(centers[:, 0], centers[:, 2], marker='o', c='r', s=100)
plt.draw()
plt.show()


## Exercises
1)
- learn about the silhouette score here: https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient
- calculate silhouette score for different values of k
- plot the silhouette score as a function of k for k between 2 and 10

2)
- experiment with changing the parameters of k-means
- discuss how the results change

3)
- learn about other clustering methods here:
https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html

4)
- try with different datasets

*Copyright &copy; 2015 Dataweekends.  All rights reserved.*