In [None]:
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

Clustering
=============

In [None]:
from sklearn.datasets import make_blobs
# Generate a synthetic dataset: the points (X) and, for each point, the index
# of the blob it was drawn from (y). random_state=42 fixes the generator so
# rerunning the cell gives the same data.
X, y = make_blobs(random_state=42)
# Show (n_samples, n_features) as the cell output.
X.shape

In [None]:
# Plot the first feature against the second, without any colouring.
plt.scatter(X[:, 0], X[:, 1])

In [None]:
from sklearn.cluster import KMeans
# Ask for three clusters. KMeans starts from randomly chosen centres, so pin
# random_state: otherwise the cluster numbering (and hence the accuracy score
# and confusion matrix computed further down) can change from run to run.
kmeans = KMeans(n_clusters=3, random_state=42)

In [None]:
# Learn the cluster centres from X alone; no labels are passed to fit.
kmeans.fit(X)

In [None]:
# Assign every row of X to one of the fitted clusters (one cluster id per row).
cluster_labels = kmeans.predict(X)

In [None]:
# Display the cluster id assigned to each sample.
cluster_labels

In [None]:
# Same scatter plot as before, now coloured by the assigned cluster id.
plt.scatter(X[:, 0], X[:, 1], c=cluster_labels)

In [None]:
# Ground-truth blob index of each sample, as returned by make_blobs.
y

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# NOTE: cluster ids are arbitrary and need not coincide with the values in y,
# so plain accuracy can look poor even when the grouping itself is right.
accuracy_score(y, cluster_labels)

In [None]:
# Rows correspond to the true labels in y, columns to the assigned cluster ids.
confusion_matrix(y, cluster_labels)

In [None]:
from sklearn.metrics import adjusted_rand_score

In [None]:
# The adjusted Rand index compares the two groupings irrespective of how the
# clusters happen to be numbered.
adjusted_rand_score(y, cluster_labels)

KMeans can generalize, SpectralClustering cannot
-----------------------------------------------------------
KMeans has a ``predict`` method, so a fitted model can assign cluster labels to new, unseen points.

In [None]:
# Draw 10 new 2-D points uniformly between the smallest and largest value
# found anywhere in X. A seeded generator keeps the sample (and therefore the
# plot in the next cell) identical across reruns.
rng = np.random.RandomState(42)
X_more = rng.uniform(X.min(), X.max(), size=(10, 2))
# The already-fitted KMeans model can label points it has never seen.
more_cluster_labels = kmeans.predict(X_more)
# Last expression of the cell, so the new points are actually displayed.
X_more

In [None]:
# Original points, coloured by their current cluster labels ...
plt.scatter(X[:, 0], X[:, 1], c=cluster_labels)
# ... and the new points drawn as crosses, coloured by the cluster predicted for them.
plt.scatter(X_more[:, 0], X_more[:, 1], marker="x", c=more_cluster_labels)

In [None]:
from sklearn.cluster import SpectralClustering
# Three clusters; gamma is the kernel coefficient used when building the
# affinity matrix. random_state is pinned so that repeated runs produce the
# same labelling.
spectral_clustering = SpectralClustering(n_clusters=3, gamma=.1, random_state=42)

In [None]:
# Fit on X; the resulting assignment is stored on the estimator as labels_.
spectral_clustering.fit(X)


In [None]:
# SpectralClustering has no ``predict`` method, so this call is expected to
# fail. Catch the error and show it instead of letting the exception abort a
# "Restart & Run All" of the notebook.
try:
    cluster_labels = spectral_clustering.predict(X)
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")

In [None]:
# Cluster assignment of the points the estimator was fitted on.
spectral_clustering.labels_

In [None]:
# fit_predict fits on X and returns the labels of those same points in one call.
cluster_labels = spectral_clustering.fit_predict(X)

In [None]:
# Points coloured by the spectral clustering assignment.
plt.scatter(X[:, 0], X[:, 1], c=cluster_labels)

There is no way to apply ``spectral_clustering`` to ``X_more``.

An overview of clustering algorithms
---------------------------------------

<img src="cluster_comparison.png" width=100%>


More on the website: https://scikit-learn.org/stable/modules/clustering.html

A less trivial example
-------------------------

In [None]:
from sklearn.datasets import load_digits
# Load the digits dataset bundled with scikit-learn.
digits = load_digits()

# NOTE: this rebinds X and y, replacing the blob data used in the cells above.
# X holds the feature vectors and y the corresponding target labels.
X, y = digits.data, digits.target

In [None]:
# Ten clusters for the digits data. random_state is pinned so that the fitted
# centres, and the score and images derived from them in the following cells,
# are the same on every run.
kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(X)

In [None]:
# Agreement between the KMeans grouping and the true labels, independent of
# how the clusters are numbered.
adjusted_rand_score(y, kmeans.predict(X))

In [None]:
# Draw every cluster centre as an 8x8 grayscale image on a 2x5 grid of axes.
_, axes = plt.subplots(2, 5)
# View each flat centre vector as an 8x8 image in a single reshape.
center_images = kmeans.cluster_centers_.reshape(-1, 8, 8)
for ax, image in zip(axes.flat, center_images):
    ax.matshow(image, cmap=plt.cm.gray)
    # Tick marks carry no information for these tiny images, so hide them.
    ax.set_xticks(())
    ax.set_yticks(())