# K-Means

In [None]:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Generate a synthetic dataset: 500 two-dimensional points drawn from
# 3 tight Gaussian blobs.
# Other configurations worth trying:
#   datasets.make_blobs(1000, centers=3, cluster_std=1.75)
#   datasets.make_blobs(n_samples=1000, centers=3, n_features=2, random_state=0)
X, Y = datasets.make_blobs(
    n_samples=500,
    n_features=2,
    centers=3,
    cluster_std=0.5,
    shuffle=True,
    random_state=0,
)

In [None]:
# Feature matrix returned by make_blobs: 500 samples with 2 features each.
X

In [None]:
# Blob index (0, 1 or 2) of each sample, as returned by make_blobs.
Y

In [None]:
# Plot the raw points, without labels, to see how they are distributed.
f, ax = plt.subplots(figsize=(7.5, 7.5))
# First feature on the x-axis, second feature on the y-axis.
ax.scatter(X[:, 0], X[:, 1])
ax.set_title("Dataset")


In [None]:
from sklearn.cluster import KMeans

In [None]:
# Define the K-Means model.
# The dataset was generated with three centers, so ask for three clusters.
k = 3

kmean = KMeans(
    n_clusters=k,
    n_init=10,
    max_iter=300,
    random_state=0,
)

**Tarea:** 

Revisa la documentación de KMeans e indica la funcionalidad de los parámetros: 
- n_init
- max_iter 

In [None]:
# Fit the model on X and get the cluster index assigned to each sample.
y_kmean = kmean.fit_predict(X)

In [None]:
# Coordinates of the fitted centroids, one row per cluster.
kmean.cluster_centers_

In [None]:
# Plot every sample together with the centroids found by K-Means.
# NOTE(review): points are colored by the generated labels Y, not by the
# K-Means assignments; index with y_kmean instead to show the fitted clusters.
colors_plot = np.array(['red', 'green', 'blue', 'pink'])
centroids = kmean.cluster_centers_
f, ax = plt.subplots(figsize=(7.5, 7.5))
ax.scatter(X[:, 0], X[:, 1], color=colors_plot[Y])
# Overlay the centroids as large black stars so they stand out.
ax.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker='*',
    s=250,
    color='black',
    label='Centers',
)
ax.set_title("Blobs")
ax.legend(loc='best')

In [None]:
# Within-cluster SSE (distortion): sum of squared distances from each sample
# to its closest centroid.
kmean.inertia_

## Optimal K

In [None]:
# Elbow method: fit K-Means for k = 1 .. k_max - 1 and record the
# within-cluster SSE (inertia) of each fit.
k_max = 11  # exclusive upper bound, so the largest k tried is 10

# Each model lives only inside the comprehension, so the 3-cluster `kmean`
# fitted earlier is not overwritten and cells that read it stay valid even
# when they are re-run after this one.
distortions = [
    KMeans(n_clusters=n_k, n_init=10, max_iter=300, random_state=0)
    .fit(X)
    .inertia_
    for n_k in range(1, k_max)
]

distortions

In [None]:
# Elbow plot: distortion against the number of clusters. The x range matches
# the values of k used to fill `distortions`.
plt.plot(range(1,k_max), distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

In [None]:
from sklearn.metrics import silhouette_samples
from matplotlib import cm

In [None]:
# Silhouette coefficient of every sample under the K-Means assignment.
silhouette_vals = silhouette_samples(X, y_kmean, metric='euclidean')
# Distinct cluster ids found in the assignment, and how many there are.
cluster_labels = np.unique(y_kmean)
n_clusters = len(cluster_labels)

In [None]:
# Silhouette plot: one band of horizontal bars per cluster, stacked along the
# y-axis, with each sample's coefficient as the bar length.
y_ax_lower = 0
y_ax_upper = 0
yticks = []
for i, c in enumerate(cluster_labels):
    # Boolean-mask indexing yields a copy, so sorting leaves silhouette_vals
    # untouched.
    c_silhouette_vals = np.sort(silhouette_vals[y_kmean == c])
    # This cluster's band starts where the previous one ended.
    y_ax_upper = y_ax_lower + len(c_silhouette_vals)
    color = cm.jet(i / n_clusters)
    plt.barh(
        range(y_ax_lower, y_ax_upper),
        c_silhouette_vals,
        height=1.0,
        edgecolor='none',
        color=color,
    )
    # Put the cluster's tick at the middle of its band.
    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower = y_ax_upper

# Dashed vertical line at the mean silhouette coefficient over all samples.
silhouette_avg = silhouette_vals.mean()
plt.axvline(silhouette_avg, color='red', linestyle="--")
# Show clusters as 1-based on the axis.
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')
plt.show()