In [None]:
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.datasets import load_digits

# Load the handwritten-digits data set as a feature matrix plus a target vector.
data, labels = load_digits(return_X_y=True)

# Problem dimensions: rows / columns of the feature matrix and the number of
# distinct target values.
n_samples, n_features = data.shape
n_digits = len(np.unique(labels))

print(f"# digits: {n_digits}; # samples: {n_samples}; # features {n_features}")

In [None]:
# Project the digits onto their first two principal components so the
# clustering can be drawn in the plane. The seed only matters when PCA's
# automatic solver choice picks a randomized solver, but pinning it keeps the
# projection identical across runs and library versions.
reduced_data = PCA(n_components=2, random_state=0).fit_transform(data)

# k-means++ seeding is randomized; pin the seed so the fitted centroids, the
# decision-boundary plot and the Rand scores computed from this model are the
# same on every Restart & Run All.
kmeans = KMeans(init="k-means++", n_clusters=n_digits, n_init=4, random_state=0)
kmeans.fit(reduced_data)

In [None]:
# Grid resolution in units of the PCA axes. Decrease it for a finer picture
# of the cluster regions (at the cost of more grid points to classify).
h = 0.02

# Bounding box of the projected samples, padded by one unit on each side.
x_min, y_min = reduced_data[:, :2].min(axis=0) - 1
x_max, y_max = reduced_data[:, :2].max(axis=0) + 1

# Every node of the mesh [x_min, x_max] x [y_min, y_max] is coloured by the
# cluster it is assigned to, which draws the decision boundaries.
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Cluster index of each mesh node according to the current `kmeans` model,
# arranged back into the shape of the mesh.
Z = kmeans.predict(np.column_stack((xx.ravel(), yy.ravel()))).reshape(xx.shape)

# Paint the cluster regions as an image on a cleared figure 1.
plt.figure(1)
plt.clf()
plt.imshow(
    Z,
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    origin="lower",
    aspect="auto",
    interpolation="nearest",
    cmap=plt.cm.Paired,
)

# Overlay the samples, coloured by their true label.
plt.scatter(reduced_data[:, 0], reduced_data[:, 1], s=5, c=labels, cmap="Set1")

# Mark the centroids with a white X drawn above everything else.
centroids = kmeans.cluster_centers_
plt.scatter(
    centroids[:, 0], centroids[:, 1], marker="x", s=169, linewidths=3, color="w", zorder=10
)

# Clamp the view to the padded bounding box and hide the tick marks.
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()

In [None]:
from sklearn.metrics import rand_score, adjusted_rand_score

# Rand index between the true digit labels and the k-means assignments.
rand_score(labels_true=labels, labels_pred=kmeans.labels_)

In [None]:
# Chance-adjusted Rand index for the same pair of labelings.
adjusted_rand_score(labels_true=labels, labels_pred=kmeans.labels_)

In [None]:
from sklearn import datasets
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import StratifiedKFold

# One plotting color per mixture component / iris class. make_ellipses and
# the scatter loops enumerate this list, so its length fixes how many
# components and classes get drawn.
colors = ["navy", "turquoise", "darkorange"]


In [None]:
def make_ellipses(gmm, ax):
    """Draw one covariance ellipse per mixture component onto ``ax``.

    Only the first two feature dimensions are drawn. One ellipse is added for
    each entry of the module-level ``colors`` list, using component ``n`` of
    ``gmm`` for the ``n``-th color.

    Parameters
    ----------
    gmm : fitted mixture model
        Must expose ``covariance_type``, ``covariances_`` and ``means_``.
    ax : matplotlib Axes
        Axes that receives the ellipse patches; its aspect is set to
        ``"equal"`` with ``"datalim"`` adjustment.

    Raises
    ------
    ValueError
        If ``gmm.covariance_type`` is not one of ``"full"``, ``"tied"``,
        ``"diag"`` or ``"spherical"``.
    """
    for n, color in enumerate(colors):
        # Build the 2x2 covariance of the first two features for component n.
        if gmm.covariance_type == "full":
            covariances = gmm.covariances_[n][:2, :2]
        elif gmm.covariance_type == "tied":
            covariances = gmm.covariances_[:2, :2]
        elif gmm.covariance_type == "diag":
            covariances = np.diag(gmm.covariances_[n][:2])
        elif gmm.covariance_type == "spherical":
            # Isotropic: the same variance on both plotted axes.
            covariances = np.eye(2) * gmm.covariances_[n]
        else:
            raise ValueError(
                f"unsupported covariance_type: {gmm.covariance_type!r}"
            )

        v, w = np.linalg.eigh(covariances)
        # eigh returns eigenvectors as columns, so the axis belonging to v[0]
        # is w[:, 0]. (Reading the row w[0] is only correct when the
        # eigenvector matrix happens to be symmetric.)
        u = w[:, 0]
        angle = np.degrees(np.arctan2(u[1], u[0]))
        # Eigenvalues are variances along the principal axes; turn them into
        # full axis lengths of 2 * sqrt(2) standard deviations.
        v = 2.0 * np.sqrt(2.0) * np.sqrt(v)
        # `angle` must be passed by keyword: recent matplotlib releases no
        # longer accept it positionally. An ellipse is unchanged by a 180
        # degree turn, so the offset is kept only for fidelity.
        ell = mpl.patches.Ellipse(
            gmm.means_[n, :2], v[0], v[1], angle=180 + angle, color=color
        )
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)

    # Equal scaling on both axes so the ellipses are not distorted.
    ax.set_aspect("equal", "datalim")

In [None]:

iris = datasets.load_iris()

# A 4-fold stratified split puts a quarter of the samples in each test fold.
# Keeping only the first fold therefore gives one non-overlapping
# 75% / 25% train / test partition.
skf = StratifiedKFold(n_splits=4)
train_index, test_index = next(iter(skf.split(iris.data, iris.target)))

X_train, y_train = iris.data[train_index], iris.target[train_index]
X_test, y_test = iris.data[test_index], iris.target[test_index]

# Number of distinct classes present in the training labels.
n_classes = np.unique(y_train).size


In [None]:
# Display the number of distinct classes found in the training labels.
n_classes

In [None]:

# One Gaussian mixture per covariance parameterisation, keyed by the name of
# that parameterisation, so the fits can be compared side by side.
estimators = dict(
    (
        cov_type,
        GaussianMixture(
            n_components=n_classes,
            covariance_type=cov_type,
            max_iter=20,
            random_state=0,
        ),
    )
    for cov_type in ("spherical", "diag", "tied", "full")
)

In [None]:
# Display the mapping from covariance type to its (not yet fitted) mixture model.
estimators

In [None]:

# Number of covariance variants; used to size the figure and subplot grid.
n_estimators = len(estimators)


In [None]:

# One subplot per covariance type, laid out on a two-row grid.
plt.figure(figsize=(3 * n_estimators // 2, 6))
plt.subplots_adjust(
    bottom=0.01, top=0.95, hspace=0.15, wspace=0.05, left=0.01, right=0.99
)

# NOTE: the loop uses its own names (`ax`, `class_points`, `test_points`)
# instead of rebinding `h` and `data`, which earlier cells use for the mesh
# step and the digits matrix. Rebinding them would break a re-run of those cells.
for index, (name, estimator) in enumerate(estimators.items()):
    # Since we have class labels for the training data, we can
    # initialize the GMM parameters in a supervised manner.
    estimator.means_init = np.array(
        [X_train[y_train == i].mean(axis=0) for i in range(n_classes)]
    )

    # Train the other parameters using the EM algorithm.
    estimator.fit(X_train)

    ax = plt.subplot(2, n_estimators // 2, index + 1)
    make_ellipses(estimator, ax)

    # All samples of each class as small dots (first two features only).
    for n, color in enumerate(colors):
        class_points = iris.data[iris.target == n]
        plt.scatter(
            class_points[:, 0],
            class_points[:, 1],
            s=0.8,
            color=color,
            label=iris.target_names[n],
        )
    # Plot the test data with crosses
    for n, color in enumerate(colors):
        test_points = X_test[y_test == n]
        plt.scatter(test_points[:, 0], test_points[:, 1], marker="x", color=color)

    # Component indices are compared to class labels directly; this relies on
    # the per-class mean initialisation above keeping component i on class i.
    y_train_pred = estimator.predict(X_train)
    train_accuracy = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100
    plt.text(0.05, 0.9, "Train accuracy: %.1f" % train_accuracy, transform=ax.transAxes)

    y_test_pred = estimator.predict(X_test)
    test_accuracy = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100
    plt.text(0.05, 0.8, "Test accuracy: %.1f" % test_accuracy, transform=ax.transAxes)

    plt.xticks(())
    plt.yticks(())
    plt.title(name)

# The legend is attached to the current (last) subplot only.
plt.legend(scatterpoints=1, loc="lower right", prop=dict(size=12))


plt.show()

<!-- https://archive.ics.uci.edu/ml/datasets/Glass+Identification
https://archive.ics.uci.edu/ml/datasets/Image+Segmentation -->
Data source: the Statlog (Landsat Satellite) data set from the UCI Machine Learning Repository, <https://archive.ics.uci.edu/ml/datasets/Statlog+(Landsat+Satellite)>

In [None]:
# Locations of the two Landsat files, relative to the notebook.
trainpath, testpath = (
    os.path.join("datasets", "landsat", fname) for fname in ("sat.trn", "sat.tst")
)

# Both files are read as space-separated tables without a header row.
train, test = (
    pd.read_csv(path, sep=" ", header=None) for path in (trainpath, testpath)
)

# The file-level train/test split is not kept: the rows of both files are
# stacked into a single matrix.
Xy = np.vstack([train.to_numpy(), test.to_numpy()])

# The first 36 columns are taken as features and column 36 as the label.
X, y = Xy[:, :36], Xy[:, 36]

In [None]:
# Display the feature matrix (first 36 columns of the stacked files).
X

In [None]:
# Display the label vector (column 36 of the stacked files).
y

In [None]:
# Largest label value occurring in y.
# NOTE(review): this equals the number of classes only if the label values
# are contiguous; confirm with np.unique(y) before relying on it.
max(y)

In [None]:
from sklearn.cluster import KMeans

# Partition the Landsat samples into six clusters; the seed is fixed so the
# assignment is repeatable. Note that this rebinds `kmeans`.
kmeans = KMeans(n_clusters=6, random_state=42)
kmeans = kmeans.fit(X)

In [None]:

# Display the cluster index that k-means assigned to each sample.
kmeans.labels_


In [None]:

# Display the fitted cluster centres.
kmeans.cluster_centers_

In [None]:
from sklearn.metrics import rand_score, adjusted_rand_score

# Rand index between the Landsat labels and the k-means assignments.
rand_score(labels_true=y, labels_pred=kmeans.labels_)

In [None]:
# Chance-adjusted Rand index for the k-means clustering.
adjusted_rand_score(labels_true=y, labels_pred=kmeans.labels_)

In [None]:
# 2-D PCA projection of the Landsat features, used only for plotting the
# clusterings below. Note that this rebinds `reduced_data`. The seed is pinned
# because PCA's automatic solver choice may select a randomized solver, and
# the plots should not change between runs.
reduced_data = PCA(n_components=2, random_state=42).fit_transform(X)

In [None]:
# Samples in the PCA plane, coloured by their label in y.
plt.scatter(x=reduced_data[:, 0], y=reduced_data[:, 1], s=3, c=y, cmap="Set1")

In [None]:
# Samples in the PCA plane, coloured by their k-means cluster.
plt.scatter(
    x=reduced_data[:, 0], y=reduced_data[:, 1], s=3, c=kmeans.labels_, cmap="Set1"
)

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into six clusters, with every other setting left
# at the library default.
ac = AgglomerativeClustering(n_clusters=6)
ac = ac.fit(X)

In [None]:
# Chance-adjusted Rand index for the agglomerative clustering.
adjusted_rand_score(labels_true=y, labels_pred=ac.labels_)

In [None]:
# Samples in the PCA plane, coloured by their agglomerative cluster.
plt.scatter(
    x=reduced_data[:, 0], y=reduced_data[:, 1], s=3, c=ac.labels_, cmap="Set1"
)

In [None]:
from sklearn.mixture import GaussianMixture

# Six-component Gaussian mixture on the raw features. The initialisation is
# randomized, so the seed is pinned (same value as the k-means fit on this
# data) to keep the predicted labels and their adjusted Rand score
# reproducible across runs.
gm = GaussianMixture(n_components=6, random_state=42).fit(X)

In [None]:
# Component index predicted by the mixture model for each sample.
pred = gm.predict(X)

# Samples in the PCA plane, coloured by their predicted component.
plt.scatter(x=reduced_data[:, 0], y=reduced_data[:, 1], s=3, c=pred, cmap="Set1")

In [None]:
# Chance-adjusted Rand index for the Gaussian-mixture labeling.
adjusted_rand_score(labels_true=y, labels_pred=pred)

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering with the library's default neighbourhood radius
# and minimum neighbour count, spelled out so the tuning knobs are visible.
# NOTE(review): X is used unscaled. If the features are on a much larger
# scale than eps, most samples may be labelled noise (-1); check
# np.unique(ds.labels_, return_counts=True) and tune eps / scale X if so.
ds = DBSCAN(eps=0.5, min_samples=5)
ds = ds.fit(X)

In [None]:
# Samples in the PCA plane, coloured by their DBSCAN label.
plt.scatter(
    x=reduced_data[:, 0], y=reduced_data[:, 1], s=3, c=ds.labels_, cmap="Set1"
)