# K-Means

In [None]:
from sklearn.cluster import KMeans

# Cluster X into k groups; fit_predict fits the model and returns the
# cluster index assigned to each sample in one call.
k=5
kmeans = KMeans(n_clusters=k)
y_pred = kmeans.fit_predict(X)

In [None]:
# Display the per-sample cluster indices returned by fit_predict.
y_pred

In [None]:
# Identity (not equality) check: is the array returned by fit_predict the very
# same object as the estimator's labels_ attribute?
y_pred is kmeans.labels_

In [None]:
# Display the centroid coordinates found by the fitted model.
kmeans.cluster_centers_

In [None]:
# Assign four hand-picked 2-D points to clusters using the fitted model.
X_new = np.array([[0, 2], [3, 2], [-3, 3], [-3, 2.5]])
kmeans.predict(X_new)

In [None]:
# Map X_new into cluster-distance space: one row per point, one column per
# centroid.
kmeans.transform(X_new)

# Centroid Initialization Methods

In [None]:
# Start K-Means from five hand-picked centroids. n_init=1 because an explicit
# init array leaves nothing to randomize between restarts.
good_init = np.array([[-3, 3], [-3, 2], [-3, 1], [-1, 2], [0, 2]])
kmeans = KMeans(n_clusters=5, init=good_init, n_init=1)
# Fit the new estimator: the following cells read inertia_ / labels_ and call
# score(), all of which require a fitted model.
kmeans.fit(X)

In [None]:
# Display the inertia_ attribute of the current `kmeans` estimator.
kmeans.inertia_

In [None]:
# Score X with the current `kmeans` estimator (scikit-learn reports the
# opposite of the K-Means objective, so greater is better).
kmeans.score(X)

# Accelerated K-Means and Mini-batch K-Means

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Fit the mini-batch variant of K-Means with five clusters on X.
minibatch_kmeans = MiniBatchKMeans(n_clusters=5)
minibatch_kmeans.fit(X)

# Finding the Optimal Number of Clusters

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of X under the cluster labels of the current `kmeans`.
silhouette_score(X, kmeans.labels_)

# Using clustering for image segmentation

In [None]:
from matplotlib.image import imread
# Load images/clustering/ladybug.png (path relative to the working directory)
# and display the shape of the resulting array.
image = imread(os.path.join("images", "clustering", "ladybug.png"))
image.shape

In [None]:
# Flatten the image into rows of three values each (one row per pixel).
X = image.reshape(-1, 3)

# Cluster the pixel values into 8 groups.
kmeans = KMeans(n_clusters=8)
kmeans.fit(X)

# Replace every pixel by the centroid of its cluster, then restore the
# original image shape.
segmented_img = kmeans.cluster_centers_[kmeans.labels_].reshape(image.shape)

# Using Clustering for Preprocessing

In [None]:
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Load the digits dataset as (features, targets) and split it into train and
# test sets.
X_digits, y_digits = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_digits, y_digits)

# Baseline: logistic regression on the raw features, scored on the test set.
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)

In [None]:
from sklearn.pipeline import Pipeline

# K-Means (50 clusters) as a preprocessing step feeding logistic regression.
steps = [
    ("kmeans", KMeans(n_clusters=50)),
    ("log_reg", LogisticRegression()),
]
pipeline = Pipeline(steps)
pipeline.fit(X_train, y_train)

# Accuracy of the whole pipeline on the held-out test set.
pipeline.score(X_test, y_test)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search over the number of clusters used by the pipeline's "kmeans" step
# (2 through 99) with 3-fold cross-validation.
param_grid = {"kmeans__n_clusters": range(2, 100)}
grid_clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, verbose=2)
grid_clf.fit(X_train, y_train)

In [None]:
# Display the parameter combination selected by the grid search.
grid_clf.best_params_

In [None]:
# Score the grid search's selected model on the held-out test set.
grid_clf.score(X_test, y_test)

# Using Clustering for Semi-Supervised Learning

In [None]:
# Baseline for the semi-supervised experiment: train on only the first
# n_labeled training instances and their labels.
n_labeled = 50
log_reg = LogisticRegression()
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])

In [None]:
# Test-set score of the most recently fitted log_reg.
log_reg.score(X_test, y_test)

In [None]:
# Cluster the training set into k clusters; fit_transform returns one row per
# training instance and one column per cluster.
k = 50
kmeans = KMeans(n_clusters=k)
X_digits_dist = kmeans.fit_transform(X_train)
# For each cluster (column), pick the row index with the smallest value, i.e.
# the training instance closest to that cluster's centroid.
representative_digit_idx = np.argmin(X_digits_dist, axis=0)
X_representative_digits = X_train[representative_digit_idx]

In [None]:
# Labels of the representative digits, one per cluster.
# A hard-coded list of hand-entered labels is only valid for one particular
# clustering, and neither the train/test split nor K-Means is seeded here, so
# the representatives change from run to run. Looking the labels up in
# y_train simulates manually labelling just these instances and always
# matches the representatives actually selected.
y_representative_digits = y_train[representative_digit_idx]

In [None]:
# Train on the representative digits only and score on the test set.
log_reg = LogisticRegression()
log_reg.fit(X_representative_digits, y_representative_digits)
log_reg.score(X_test, y_test)

In [None]:
y_train_propagated = np.empty(len(X_train), dtype=np.int32)
for i in range(k):
  y_train_propogated[kmeans.labels_==i] = y_representative_digits[i]

In [None]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train_propogated)
log_reg.score(X_test, y_test)

In [None]:
percentile_closest = 20

# For every training instance, take the entry of X_digits_dist in the column
# of the cluster it was assigned to (fancy indexing, so this is a copy).
row_idx = np.arange(len(X_train))
X_cluster_dist = X_digits_dist[row_idx, kmeans.labels_]

# Within each cluster, flag with -1 every instance whose value lies above the
# cluster's `percentile_closest`-th percentile.
for cluster_id in range(k):
  members = kmeans.labels_ == cluster_id
  threshold = np.percentile(X_cluster_dist[members], percentile_closest)
  X_cluster_dist[members & (X_cluster_dist > threshold)] = -1

# Keep only the unflagged instances together with their propagated labels.
partially_propagated = (X_cluster_dist != -1)
X_train_partially_propagated = X_train[partially_propagated]
y_train_partially_propagated = y_train_propagated[partially_propagated]

In [None]:
# Train on the partially propagated subset and score on the test set.
log_reg = LogisticRegression()
log_reg.fit(X_train_partially_propagated, y_train_partially_propagated)
log_reg.score(X_test, y_test)

In [None]:
# Fraction of the partially propagated labels that agree with the true labels.
# The element-wise comparison `==` is required: a single `=` here is parsed as
# a keyword argument to np.mean and raises a TypeError.
np.mean(y_train_partially_propagated == y_train[partially_propagated])

# DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

# Generate a 1000-sample two-moons dataset (this rebinds X), run DBSCAN on it
# and display the per-sample cluster labels.
X, y = make_moons(n_samples=1000, noise=0.05)
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(X)
dbscan.labels_

In [None]:
# Number of core samples DBSCAN identified.
len(dbscan.core_sample_indices_)

In [None]:
# Indices (into X) of the core samples.
dbscan.core_sample_indices_

In [None]:
# The core samples themselves.
dbscan.components_

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Train a 50-nearest-neighbours classifier on DBSCAN's core samples, using
# the cluster label of each core sample as its target.
core_labels = dbscan.labels_[dbscan.core_sample_indices_]
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, core_labels)

In [None]:
# Predict a cluster for four hand-picked 2-D points (this rebinds X_new).
X_new = np.array([[-0.5, 0], [0, 0.5], [1, -0.1], [2, 1]])
knn.predict(X_new)

In [None]:
# Per-class probability estimates for the same points.
knn.predict_proba(X_new)

In [None]:
# Distance to, and index of, the single nearest training sample of `knn` for
# each new point.
y_dist, y_pred_idx = knn.kneighbors(X_new, n_neighbors=1)

# Take that neighbour's cluster label, but mark any point whose nearest
# neighbour is farther than 0.2 with -1.
core_labels = dbscan.labels_[dbscan.core_sample_indices_]
y_pred = core_labels[y_pred_idx]
too_far = y_dist > 0.2
y_pred[too_far] = -1
y_pred.ravel()

# Gaussian Mixtures

In [None]:
from sklearn.mixture import GaussianMixture

# Fit a 3-component Gaussian mixture to X, using 10 initializations.
gm = GaussianMixture(n_components=3, n_init=10)
gm.fit(X)

In [None]:
# Weight of each mixture component.
gm.weights_

In [None]:
# Mean of each mixture component.
gm.means_

In [None]:
# Covariance of each mixture component.
gm.covariances_

In [None]:
# Whether the fit converged.
gm.converged_

In [None]:
# Number of iterations used by the retained fit.
gm.n_iter_

In [None]:
# Component assigned to each sample of X.
gm.predict(X)

In [None]:
# Per-component probabilities for each sample of X.
gm.predict_proba(X)

In [None]:
# Draw 6 new samples from the fitted mixture (this rebinds X_new); y_new holds
# the second value returned by sample().
X_new, y_new = gm.sample(6)
X_new

In [None]:
# Display the second array returned by gm.sample(6).
y_new

In [None]:
# Per-sample score of X under the fitted mixture (log-likelihood of each
# sample, per the scikit-learn documentation).
gm.score_samples(X)

# Anomaly Detection using Gaussian Mixtures

In [None]:
# Score every sample under the fitted mixture and treat those falling below
# the 4th percentile of the scores as anomalies.
densities = gm.score_samples(X)
density_threshold = np.percentile(densities, 4)
is_anomaly = densities < density_threshold
anomalies = X[is_anomaly]

# Selecting the Number of Clusters

In [None]:
# Bayesian information criterion of the fitted mixture on X.
gm.bic(X)

In [None]:
# Akaike information criterion of the fitted mixture on X.
gm.aic(X)

# Bayesian Gaussian Mixture Models

In [None]:
from sklearn.mixture import BayesianGaussianMixture

# Fit a Bayesian Gaussian mixture with up to 10 components (10
# initializations, fixed seed) and display the component weights rounded to
# two decimals.
bgm = BayesianGaussianMixture(n_components=10, n_init = 10, random_state=42)
bgm.fit(X)
np.round(bgm.weights_, 2)