In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
import plotly.express as px
import plotly.io as pio

from k_means import KMeans, RobustKMeans

# Use the 'ggplot2' Plotly template as the default for every figure created below.
pio.templates.default = 'ggplot2'

In [104]:
# Create a 2-D toy dataset: 100 samples drawn from `n_clusters` isotropic
# Gaussian blobs whose centres are placed at random inside the box (-10, 10).
n_clusters = 3
# center_box is documented as a (min, max) tuple; recent scikit-learn releases
# validate the argument type, so a tuple is passed instead of a list.
X_train, true_labels = make_blobs(n_samples=100, centers=n_clusters, random_state=42, center_box=(-10, 10))
# Standardise every feature to zero mean and unit variance before clustering.
X_train = StandardScaler().fit_transform(X_train)

In [105]:
# Ground truth: the generated samples coloured by the blob they were drawn from.
px.scatter(
    x=X_train[:, 0],
    y=X_train[:, 1],
    color=true_labels,
)

In [106]:
# Fit the project's KMeans (imported from k_means) on the standardised data,
# asking for as many clusters as blobs were generated.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_train)

In [107]:
# Display (inertia, score) for the fitted model; the score is computed against
# the ground-truth blob labels.
kmeans.inertia, kmeans.score(true_labels)

(5.5901093412274685, 1.0)

In [109]:
# Draw the samples first (coloured by assigned cluster) and add the centroids
# afterwards: traces are rendered in insertion order, so this keeps the
# centroid crosses on top of the samples instead of hidden underneath them.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,  # the centroid trace needs no legend entry
)
fig.show()

In [110]:
# Refit on the same data with RobustKMeans; this rebinds `kmeans`, so the plain
# KMeans model fitted above is no longer reachable.
# NOTE(review): the meaning of fit's `n` argument is defined in k_means and is
# not visible from this notebook — confirm before changing it.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(X_train, n=1000)

In [111]:
# Display (inertia, score) for the RobustKMeans fit, for comparison with the
# plain KMeans figures shown earlier.
kmeans.inertia, kmeans.score(true_labels)

(5.590132082655962, 1.0)

In [112]:
# Samples coloured by their assigned cluster, with the fitted centroids
# overlaid as black crosses.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,
)
fig.show()

# Closer clusters

In [None]:
# Create a 2-D toy dataset: 100 samples drawn from `n_clusters` isotropic
# Gaussian blobs whose centres are placed at random inside the box (-5, 5),
# i.e. closer together than in the first experiment.
n_clusters = 3
# center_box is documented as a (min, max) tuple; recent scikit-learn releases
# validate the argument type, so a tuple is passed instead of a list.
X_train, true_labels = make_blobs(n_samples=100, centers=n_clusters, random_state=42, center_box=(-5, 5))
# Standardise every feature to zero mean and unit variance before clustering.
X_train = StandardScaler().fit_transform(X_train)

In [None]:
# Ground truth: the generated samples coloured by the blob they were drawn from.
px.scatter(
    x=X_train[:, 0],
    y=X_train[:, 1],
    color=true_labels,
)

In [None]:
# Fit the project's KMeans (imported from k_means) on the closer-blob data.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_train)

In [None]:
# Display (inertia, score) for the fitted model; the score is computed against
# the ground-truth blob labels.
kmeans.inertia, kmeans.score(true_labels)

In [None]:
# Samples coloured by their assigned cluster, with the fitted centroids
# overlaid as black crosses.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,
)
fig.show()

In [None]:
# Refit on the same data with RobustKMeans; this rebinds `kmeans`, replacing
# the plain KMeans model fitted above.
# NOTE(review): the meaning of fit's `n` argument is defined in k_means and is
# not visible from this notebook — confirm before changing it.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(X_train, n=100)

In [None]:
# Display (inertia, score) for the RobustKMeans fit on the closer-blob data.
kmeans.inertia, kmeans.score(true_labels)

In [None]:
# Samples coloured by their assigned cluster, with the fitted centroids
# overlaid as black crosses.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,
)
fig.show()

# Even closer clusters

In [None]:
# Create a 2-D toy dataset: 100 samples drawn from `n_clusters` isotropic
# Gaussian blobs whose centres are placed at random inside the box (-2, 2),
# the tightest box used in the noise-free experiments.
n_clusters = 3
# center_box is documented as a (min, max) tuple; recent scikit-learn releases
# validate the argument type, so a tuple is passed instead of a list.
X_train, true_labels = make_blobs(n_samples=100, centers=n_clusters, random_state=42, center_box=(-2, 2))
# Standardise every feature to zero mean and unit variance before clustering.
X_train = StandardScaler().fit_transform(X_train)

In [None]:
# Ground truth: the generated samples coloured by the blob they were drawn from.
px.scatter(
    x=X_train[:, 0],
    y=X_train[:, 1],
    color=true_labels,
)

In [None]:
# Fit the project's KMeans (imported from k_means) on the tightest blob layout.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_train)

In [None]:
# Display (inertia, score) for the fitted model; the score is computed against
# the ground-truth blob labels.
kmeans.inertia, kmeans.score(true_labels)

In [None]:
# Samples coloured by their assigned cluster, with the fitted centroids
# overlaid as black crosses.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,
)
fig.show()

In [None]:
# Refit on the same data with RobustKMeans; this rebinds `kmeans`, replacing
# the plain KMeans model fitted above.
# NOTE(review): the meaning of fit's `n` argument is defined in k_means and is
# not visible from this notebook — confirm before changing it.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(X_train, n=100)

In [None]:
# Display (inertia, score) for the RobustKMeans fit on the tightest blob layout.
kmeans.inertia, kmeans.score(true_labels)

In [None]:
# Samples coloured by their assigned cluster, with the fitted centroids
# overlaid as black crosses.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,
)
fig.show()

# With noise

In [90]:
# Background noise: 100 points drawn uniformly from the square [-10, 10) x [-10, 10).
# A seeded Generator replaces the unseeded global np.random state so that the
# noise (and every figure derived from it) is identical on each Restart & Run All.
noise = (np.random.default_rng(42).random((100, 2)) - 0.5) * 20

In [91]:
# Arbitrary "true" labels in {0, 1, 2} for the 100 noise points. Drawn from a
# seeded Generator (instead of the unseeded global np.random state) so that the
# score computed against these labels is reproducible between runs.
noise_labels = np.random.default_rng(43).integers(0, 3, size=100)

In [92]:
# Create a 2-D toy dataset: 1000 samples drawn from `n_clusters` isotropic
# Gaussian blobs with centres placed at random inside the box (-10, 10),
# followed by the noise points generated in the previous cells.
n_clusters = 3
# center_box is documented as a (min, max) tuple; recent scikit-learn releases
# validate the argument type, so a tuple is passed instead of a list.
X_train, true_labels = make_blobs(n_samples=1000, centers=n_clusters, random_state=42, center_box=(-10, 10))
# Append the noise rows (and their labels) after the blob samples.
X_train = np.concatenate([X_train, noise], axis=0)
true_labels = np.concatenate([true_labels, noise_labels])
# Standardise blobs and noise together to zero mean and unit variance.
X_train = StandardScaler().fit_transform(X_train)

In [93]:
# Reference view: samples coloured by `true_labels` (blob index for the blob
# samples, the randomly drawn label for the noise points).
px.scatter(
    x=X_train[:, 0],
    y=X_train[:, 1],
    color=true_labels,
)

In [94]:
# Fit the project's KMeans (imported from k_means) on the blobs-plus-noise data.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_train)

In [95]:
# Display (inertia, score) for the fitted model. The noise points carry random
# labels, so they are included in the labels the score is computed against.
kmeans.inertia, kmeans.score(true_labels)

(159.28501404216277, 0.9354545454545454)

In [96]:
# Draw the samples first (coloured by assigned cluster) and add the centroids
# afterwards: traces are rendered in insertion order, so this keeps the
# centroid crosses on top of the samples instead of hidden underneath them.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,  # the centroid trace needs no legend entry
)
fig.show()

In [100]:
# Refit on the same noisy data with RobustKMeans; this rebinds `kmeans`,
# replacing the plain KMeans model fitted above.
# NOTE(review): the meaning of fit's `n` argument is defined in k_means and is
# not visible from this notebook — confirm before changing it.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(X_train, n=1000)

In [101]:
# Display (inertia, score) for the RobustKMeans fit on the noisy data, for
# comparison with the plain KMeans figures shown earlier.
kmeans.inertia, kmeans.score(true_labels)

(159.28621398059457, 0.9354545454545454)

In [102]:
# Draw the samples first (coloured by assigned cluster) and add the centroids
# afterwards: traces are rendered in insertion order, so this keeps the
# centroid crosses on top of the samples instead of hidden underneath them.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,  # the centroid trace needs no legend entry
)
fig.show()

# With noise: closer clusters

In [None]:
# Background noise: 100 points drawn uniformly from the square [-5, 5) x [-5, 5).
# A seeded Generator replaces the unseeded global np.random state so that the
# noise (and every figure derived from it) is identical on each Restart & Run All.
noise = (np.random.default_rng(42).random((100, 2)) - 0.5) * 10

In [None]:
# Arbitrary "true" labels in {0, 1, 2} for the 100 noise points. Drawn from a
# seeded Generator (instead of the unseeded global np.random state) so that the
# score computed against these labels is reproducible between runs.
noise_labels = np.random.default_rng(43).integers(0, 3, size=100)

In [None]:
# Create a 2-D toy dataset: 1000 samples drawn from `n_clusters` isotropic
# Gaussian blobs with centres placed at random inside the box (-5, 5),
# followed by the noise points generated in the previous cells.
n_clusters = 3
# center_box is documented as a (min, max) tuple; recent scikit-learn releases
# validate the argument type, so a tuple is passed instead of a list.
X_train, true_labels = make_blobs(n_samples=1000, centers=n_clusters, random_state=42, center_box=(-5, 5))
# Append the noise rows (and their labels) after the blob samples.
X_train = np.concatenate([X_train, noise], axis=0)
true_labels = np.concatenate([true_labels, noise_labels])
# Standardise blobs and noise together to zero mean and unit variance.
X_train = StandardScaler().fit_transform(X_train)

In [None]:
# Reference view: samples coloured by `true_labels` (blob index for the blob
# samples, the randomly drawn label for the noise points).
px.scatter(
    x=X_train[:, 0],
    y=X_train[:, 1],
    color=true_labels,
)

In [53]:
# Fit the project's KMeans (imported from k_means) on the blobs-plus-noise data.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_train)

In [54]:
# Display (inertia, score) for the fitted model. The noise points carry random
# labels, so they are included in the labels the score is computed against.
kmeans.inertia, kmeans.score(true_labels)

(314.11851267885976, 0.9327272727272727)

In [55]:
# Draw the samples first (coloured by assigned cluster) and add the centroids
# afterwards: traces are rendered in insertion order, so this keeps the
# centroid crosses on top of the samples instead of hidden underneath them.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,  # the centroid trace needs no legend entry
)
fig.show()

In [None]:
# Refit on the same noisy data with RobustKMeans; this rebinds `kmeans`,
# replacing the plain KMeans model fitted above.
# NOTE(review): the meaning of fit's `n` argument is defined in k_means and is
# not visible from this notebook — confirm before changing it.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(X_train, n=100)

In [None]:
# Display (inertia, score) for the RobustKMeans fit on the noisy data, for
# comparison with the plain KMeans figures shown earlier.
kmeans.inertia, kmeans.score(true_labels)

In [52]:
# Draw the samples first (coloured by assigned cluster) and add the centroids
# afterwards: traces are rendered in insertion order, so this keeps the
# centroid crosses on top of the samples instead of hidden underneath them.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,  # the centroid trace needs no legend entry
)
fig.show()

# With noise: even closer clusters

In [56]:
# Background noise: 100 points drawn uniformly from the square [-2, 2) x [-2, 2).
# A seeded Generator replaces the unseeded global np.random state so that the
# noise (and every figure derived from it) is identical on each Restart & Run All.
# NOTE(review): the blobs built in the next cell use a (-4, 4) centre box,
# whereas the earlier noise sections match the noise square to the centre box
# — confirm the factor 4 (rather than 8) is intended.
noise = (np.random.default_rng(42).random((100, 2)) - 0.5) * 4

In [57]:
# Arbitrary "true" labels in {0, 1, 2} for the 100 noise points. Drawn from a
# seeded Generator (instead of the unseeded global np.random state) so that the
# score computed against these labels is reproducible between runs.
noise_labels = np.random.default_rng(43).integers(0, 3, size=100)

In [58]:
# Create a 2-D toy dataset: 1000 samples drawn from `n_clusters` isotropic
# Gaussian blobs with centres placed at random inside the box (-4, 4),
# followed by the noise points generated in the previous cells.
n_clusters = 3
# center_box is documented as a (min, max) tuple; recent scikit-learn releases
# validate the argument type, so a tuple is passed instead of a list.
X_train, true_labels = make_blobs(n_samples=1000, centers=n_clusters, random_state=42, center_box=(-4, 4))
# Append the noise rows (and their labels) after the blob samples.
X_train = np.concatenate([X_train, noise], axis=0)
true_labels = np.concatenate([true_labels, noise_labels])
# Standardise blobs and noise together to zero mean and unit variance.
X_train = StandardScaler().fit_transform(X_train)

In [59]:
# Reference view: samples coloured by `true_labels` (blob index for the blob
# samples, the randomly drawn label for the noise points).
px.scatter(
    x=X_train[:, 0],
    y=X_train[:, 1],
    color=true_labels,
)

In [60]:
# Fit the project's KMeans (imported from k_means) on the blobs-plus-noise data.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_train)

In [61]:
# Display (inertia, score) for the fitted model. The noise points carry random
# labels, so they are included in the labels the score is computed against.
kmeans.inertia, kmeans.score(true_labels)

(415.5726177030058, 0.9245454545454546)

In [62]:
# Draw the samples first (coloured by assigned cluster) and add the centroids
# afterwards: traces are rendered in insertion order, so this keeps the
# centroid crosses on top of the samples instead of hidden underneath them.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,  # the centroid trace needs no legend entry
)
fig.show()

In [63]:
# Refit on the same noisy data with RobustKMeans; this rebinds `kmeans`,
# replacing the plain KMeans model fitted above.
# NOTE(review): the meaning of fit's `n` argument is defined in k_means and is
# not visible from this notebook — confirm before changing it.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(X_train, n=100)

In [64]:
# Display (inertia, score) for the RobustKMeans fit on the noisy data, for
# comparison with the plain KMeans figures shown earlier.
kmeans.inertia, kmeans.score(true_labels)

(415.65756196116376, 0.9245454545454546)

In [65]:
# Draw the samples first (coloured by assigned cluster) and add the centroids
# afterwards: traces are rendered in insertion order, so this keeps the
# centroid crosses on top of the samples instead of hidden underneath them.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,  # the centroid trace needs no legend entry
)
fig.show()

# With noise generated as blobs

In [77]:
# Two outlier samples drawn around the single centre (100, 1), far away from
# the three clusters built in the next cell. With only one centre, make_blobs
# labels both samples 0, i.e. the same label as the first main cluster.
# random_state is fixed so the outliers are identical on every run; the
# original call left it unset and produced different points each time.
noise, noise_labels = make_blobs(n_samples=2, centers=[[100, 1]], random_state=42)

In [78]:
# Create a 2-D toy dataset: 10000 samples drawn from three unit-variance
# Gaussian blobs with fixed centres, followed by the two outlier samples
# generated in the previous cell.
n_clusters = 3
# center_box only applies when centres are generated at random, so it is not
# passed here: the explicit `centers` list fully determines the layout.
X_train, true_labels = make_blobs(n_samples=10000, centers=[[1, 1], [10, 1], [1, 10]], random_state=42, cluster_std=1)
# Append the outlier rows (and their labels) after the blob samples.
X_train = np.concatenate([X_train, noise], axis=0)
true_labels = np.concatenate([true_labels, noise_labels])
# NOTE(review): unlike the earlier sections, this data is NOT standardised
# (the StandardScaler call was previously commented out) — confirm that
# clustering on the raw coordinates is intentional here.

In [79]:
# Reference view: all samples, including the two far-away outliers, coloured
# by `true_labels`.
px.scatter(
    x=X_train[:, 0],
    y=X_train[:, 1],
    color=true_labels,
)

In [87]:
# Fit the project's KMeans (imported from k_means) on the blobs-plus-outliers data.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(X_train)

In [88]:
# Display (inertia, score) for the fitted model; the score is computed against
# `true_labels`, which include the labels of the two outlier samples.
kmeans.inertia, kmeans.score(true_labels)

(35998.361286701496, 0.9998000399920016)

In [89]:
# Draw the samples first (coloured by assigned cluster) and add the centroids
# afterwards: traces are rendered in insertion order, so this keeps the
# centroid crosses on top of the samples instead of hidden underneath them.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,  # the centroid trace needs no legend entry
)
fig.show()

In [84]:
# Refit on the same data with RobustKMeans; this rebinds `kmeans`, replacing
# the plain KMeans model fitted above. Unlike the earlier sections, max_iter is
# left at the class default here.
# NOTE(review): the meaning of fit's `n` argument is defined in k_means and is
# not visible from this notebook — confirm before changing it.
kmeans = RobustKMeans(n_clusters=n_clusters)
kmeans.fit(X_train, n=100000)

In [85]:
# Display (inertia, score) for the RobustKMeans fit, for comparison with the
# plain KMeans figures shown earlier.
kmeans.inertia, kmeans.score(true_labels)

(35998.36204468409, 0.9998000399920016)

In [86]:
# Draw the samples first (coloured by assigned cluster) and add the centroids
# afterwards: traces are rendered in insertion order, so this keeps the
# centroid crosses on top of the samples instead of hidden underneath them.
fig = px.scatter(x=X_train[:, 0], y=X_train[:, 1], color=kmeans.labels)
fig.add_scatter(
    x=kmeans.centroids[:, 0],
    y=kmeans.centroids[:, 1],
    mode='markers',
    marker=dict(size=12, color='black', symbol='x-thin', line=dict(width=2)),
    showlegend=False,  # the centroid trace needs no legend entry
)
fig.show()