In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
import plotly.express as px
import plotly.io as pio

from k_means import KMeans, RobustKMeans

# Make 'ggplot2' the default Plotly template for every figure created below.
pio.templates.default = 'ggplot2'

In [2]:
# Build the clean 2D dataset: 100 samples drawn from `n_clusters` blobs whose
# centers are sampled inside the [-10, 10] box (seeded for reproducibility),
# then standardize each feature before clustering.
n_clusters = 3
x_train, true_labels = make_blobs(
    n_samples=100,
    centers=n_clusters,
    center_box=[-10, 10],
    random_state=42,
)
x_train_scaled = StandardScaler().fit(x_train).transform(x_train)

In [None]:
# Ground-truth view of the standardized data.
# The integer labels are cast to strings so Plotly Express assigns one discrete
# color per class instead of mapping the ids onto a continuous color bar.
px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=true_labels.astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'true label'},
    title='Ground-truth labels',
)

In [4]:
# Fit the baseline KMeans model (imported from the local k_means module) on the
# standardized data. The name `kmeans` is reused for every model in this notebook.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(x_train_scaled)

In [5]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(5.5901093412274685, 1.0)

In [6]:
# Predicted clusters with the fitted centroids.
# The data points are drawn first and the centroids are added afterwards so the
# centroid markers sit on top of the points instead of being hidden beneath them.
# Labels are cast to strings to get one discrete color per cluster.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

In [7]:
# Fit the RobustKMeans variant on the same standardized data; this rebinds
# `kmeans`, replacing the baseline model fitted earlier.
# NOTE(review): the meaning of `n` is defined in k_means (not visible here) — confirm.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(x_train_scaled, n=1000)

In [8]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(5.590132082655962, 1.0)

In [9]:
# Predicted clusters with the fitted centroids.
# The data points are drawn first and the centroids are added afterwards so the
# centroid markers sit on top of the points instead of being hidden beneath them.
# Labels are cast to strings to get one discrete color per cluster.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

In [10]:
# Inject two outliers drawn around (100, 1), far away from the blobs, and
# re-standardize the enlarged dataset.
# `random_state` is fixed so the outliers (and every metric computed below)
# are identical on each Restart & Run All.
# make_blobs labels samples by center index, so with a single center both
# outliers carry label 0 in `true_labels`.
# NOTE: this cell appends to `x_train` / `true_labels`; re-running it without
# re-running the dataset cell first adds two more outliers each time.
noise, noise_labels = make_blobs(n_samples=2, centers=[[100, 1]], random_state=42)
x_train = np.append(x_train, noise, axis=0)
true_labels = np.append(true_labels, noise_labels)
x_train_scaled = StandardScaler().fit_transform(x_train)

In [None]:
# Ground-truth view of the standardized data.
# The integer labels are cast to strings so Plotly Express assigns one discrete
# color per class instead of mapping the ids onto a continuous color bar.
px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=true_labels.astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'true label'},
    title='Ground-truth labels',
)

In [12]:
# Fit the baseline KMeans model (imported from the local k_means module) on the
# current standardized data; this rebinds `kmeans`.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(x_train_scaled)

In [13]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(80.53990216881081, 0.9803921568627451)

In [14]:
# Predicted clusters with the fitted centroids.
# The data points are drawn first and the centroids are added afterwards so the
# centroid markers sit on top of the points instead of being hidden beneath them.
# Labels are cast to strings to get one discrete color per cluster.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

In [15]:
# Fit the RobustKMeans variant on the current standardized data; this rebinds
# `kmeans`, replacing the baseline model fitted earlier.
# NOTE(review): the meaning of `n` is defined in k_means (not visible here) — confirm.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(x_train_scaled, n=10000)

In [16]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(80.53990641116276, 0.9803921568627451)

In [17]:
# Predicted clusters with the fitted centroids drawn on top of the data points.
# Labels are cast to strings so Plotly Express assigns one discrete color per
# cluster instead of mapping the cluster ids onto a continuous color bar.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

# Más cerca: centros de los blobs en [-5, 5]

In [18]:
# Build the clean 2D dataset: 100 samples drawn from `n_clusters` blobs whose
# centers are sampled inside the [-5, 5] box (seeded for reproducibility),
# then standardize each feature before clustering.
n_clusters = 3
x_train, true_labels = make_blobs(
    n_samples=100,
    centers=n_clusters,
    center_box=[-5, 5],
    random_state=42,
)
x_train_scaled = StandardScaler().fit(x_train).transform(x_train)

In [None]:
# Ground-truth view of the standardized data.
# The integer labels are cast to strings so Plotly Express assigns one discrete
# color per class instead of mapping the ids onto a continuous color bar.
px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=true_labels.astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'true label'},
    title='Ground-truth labels',
)

In [20]:
# Fit the baseline KMeans model (imported from the local k_means module) on the
# current standardized data; this rebinds `kmeans`.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(x_train_scaled)

In [21]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(20.713160136162955, 0.99)

In [22]:
# Predicted clusters with the fitted centroids.
# The data points are drawn first and the centroids are added afterwards so the
# centroid markers sit on top of the points instead of being hidden beneath them.
# Labels are cast to strings to get one discrete color per cluster.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

In [23]:
# Fit the RobustKMeans variant on the current standardized data; this rebinds
# `kmeans`, replacing the baseline model fitted earlier.
# NOTE(review): the meaning of `n` is defined in k_means (not visible here) — confirm.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(x_train_scaled, n=1000)

In [24]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(20.713221582332622, 0.99)

In [25]:
# Predicted clusters with the fitted centroids.
# The data points are drawn first and the centroids are added afterwards so the
# centroid markers sit on top of the points instead of being hidden beneath them.
# Labels are cast to strings to get one discrete color per cluster.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

In [26]:
# Inject two outliers drawn around (100, 1), far away from the blobs, and
# re-standardize the enlarged dataset.
# `random_state` is fixed so the outliers (and every metric computed below)
# are identical on each Restart & Run All.
# make_blobs labels samples by center index, so with a single center both
# outliers carry label 0 in `true_labels`.
# NOTE: this cell appends to `x_train` / `true_labels`; re-running it without
# re-running the dataset cell first adds two more outliers each time.
noise, noise_labels = make_blobs(n_samples=2, centers=[[100, 1]], random_state=42)
x_train = np.append(x_train, noise, axis=0)
true_labels = np.append(true_labels, noise_labels)
x_train_scaled = StandardScaler().fit_transform(x_train)

In [None]:
# Ground-truth view of the standardized data.
# The integer labels are cast to strings so Plotly Express assigns one discrete
# color per class instead of mapping the ids onto a continuous color bar.
px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=true_labels.astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'true label'},
    title='Ground-truth labels',
)

In [28]:
# Fit the baseline KMeans model (imported from the local k_means module) on the
# current standardized data; this rebinds `kmeans`.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(x_train_scaled)

In [29]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(98.00706380389678, 0.9509803921568627)

In [30]:
# Predicted clusters with the fitted centroids.
# The data points are drawn first and the centroids are added afterwards so the
# centroid markers sit on top of the points instead of being hidden beneath them.
# Labels are cast to strings to get one discrete color per cluster.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

In [31]:
# Fit the RobustKMeans variant on the current standardized data; this rebinds
# `kmeans`, replacing the baseline model fitted earlier.
# NOTE(review): the meaning of `n` is defined in k_means (not visible here) — confirm.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(x_train_scaled, n=10000)

In [32]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(98.0070654753607, 0.9509803921568627)

In [33]:
# Predicted clusters with the fitted centroids drawn on top of the data points.
# Labels are cast to strings so Plotly Express assigns one discrete color per
# cluster instead of mapping the cluster ids onto a continuous color bar.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

# Aún más cerca: centros de los blobs en [-2, 2]

In [38]:
# Build the clean 2D dataset: 100 samples drawn from `n_clusters` blobs whose
# centers are sampled inside the [-2, 2] box (seeded for reproducibility),
# then standardize each feature before clustering.
n_clusters = 3
x_train, true_labels = make_blobs(
    n_samples=100,
    centers=n_clusters,
    center_box=[-2, 2],
    random_state=42,
)
x_train_scaled = StandardScaler().fit(x_train).transform(x_train)

In [39]:
# Ground-truth view of the standardized data.
# The integer labels are cast to strings so Plotly Express assigns one discrete
# color per class instead of mapping the ids onto a continuous color bar.
px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=true_labels.astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'true label'},
    title='Ground-truth labels',
)

In [40]:
# Fit the baseline KMeans model (imported from the local k_means module) on the
# current standardized data; this rebinds `kmeans`.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(x_train_scaled)

In [41]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(69.94893654998057, 0.84)

In [42]:
# Predicted clusters with the fitted centroids.
# The data points are drawn first and the centroids are added afterwards so the
# centroid markers sit on top of the points instead of being hidden beneath them.
# Labels are cast to strings to get one discrete color per cluster.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

In [43]:
# Fit the RobustKMeans variant on the current standardized data; this rebinds
# `kmeans`, replacing the baseline model fitted earlier.
# NOTE(review): the meaning of `n` is defined in k_means (not visible here) — confirm.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(x_train_scaled, n=1000)

In [44]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(69.94907952139823, 0.84)

In [45]:
# Predicted clusters with the fitted centroids.
# The data points are drawn first and the centroids are added afterwards so the
# centroid markers sit on top of the points instead of being hidden beneath them.
# Labels are cast to strings to get one discrete color per cluster.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

In [46]:
# Inject two outliers drawn around (100, 1), far away from the blobs, and
# re-standardize the enlarged dataset.
# `random_state` is fixed so the outliers (and every metric computed below)
# are identical on each Restart & Run All.
# make_blobs labels samples by center index, so with a single center both
# outliers carry label 0 in `true_labels`.
# NOTE: this cell appends to `x_train` / `true_labels`; re-running it without
# re-running the dataset cell first adds two more outliers each time.
noise, noise_labels = make_blobs(n_samples=2, centers=[[100, 1]], random_state=42)
x_train = np.append(x_train, noise, axis=0)
true_labels = np.append(true_labels, noise_labels)
x_train_scaled = StandardScaler().fit_transform(x_train)

In [47]:
# Ground-truth view of the standardized data.
# The integer labels are cast to strings so Plotly Express assigns one discrete
# color per class instead of mapping the ids onto a continuous color bar.
px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=true_labels.astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'true label'},
    title='Ground-truth labels',
)

In [84]:
# Fit the baseline KMeans model (imported from the local k_means module) on the
# current standardized data; this rebinds `kmeans`.
kmeans = KMeans(n_clusters=n_clusters)
kmeans.fit(x_train_scaled)

In [85]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(114.36089420745236, 0.7156862745098039)

In [86]:
# Predicted clusters with the fitted centroids.
# The data points are drawn first and the centroids are added afterwards so the
# centroid markers sit on top of the points instead of being hidden beneath them.
# Labels are cast to strings to get one discrete color per cluster.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()

In [81]:
# Fit the RobustKMeans variant on the current standardized data; this rebinds
# `kmeans`, replacing the baseline model fitted earlier.
# NOTE(review): the meaning of `n` is defined in k_means (not visible here) — confirm.
kmeans = RobustKMeans(n_clusters=n_clusters, max_iter=300)
kmeans.fit(x_train_scaled, n=10000)

In [82]:
# Display (inertia, score against the ground-truth labels) for the model
# currently bound to `kmeans`. The exact score metric is defined in k_means.
kmeans.inertia, kmeans.score(true_labels)

(114.2794758875863, 0.7549019607843137)

In [83]:
# Predicted clusters with the fitted centroids drawn on top of the data points.
# Labels are cast to strings so Plotly Express assigns one discrete color per
# cluster instead of mapping the cluster ids onto a continuous color bar.
fig = px.scatter(
    x=x_train_scaled[:, 0],
    y=x_train_scaled[:, 1],
    color=np.asarray(kmeans.labels).astype(str),
    labels={'x': 'feature 1 (standardized)', 'y': 'feature 2 (standardized)', 'color': 'predicted cluster'},
    title=f'{type(kmeans).__name__}: predicted clusters and centroids',
)
fig.add_scatter(x=kmeans.centroids[:, 0], y=kmeans.centroids[:, 1],
                mode='markers', marker_size=12, marker_color='black', marker_symbol='x-thin', marker_line_width=2, showlegend=False)
fig.show()