# K-Means Clustering

In [1]:
import pandas as pd
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt

import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.cluster import KMeans

## Preprocessing of data

In [2]:
def get_array_of_matrix(dataset):
    array_of_images = []
    for row in dataset:
        row = np.asarray(row)
        matrix = np.reshape(row, (48, 48))
        array_of_images.append(matrix)
    return array_of_images


def crop_dataset(dataset, row, clmn):
    copped_dataset = []
    for image in dataset:
        y, x = image.shape
        first_x = x//2-(row//2)
        first_y = y//2-(clmn//2)
        copped_dataset.append(image[first_y:first_y + clmn, first_x:first_x + row])
    return copped_dataset


def reshape_dataset(dataset):
    reshaped_dataset = []
    for image in dataset:
        image = cv.resize(image, (48, 48)) # un po' bruttino
        image = image.flatten()
        reshaped_dataset.append(image)
    # reshaped_dataset = np.reshape(reshaped_dataset, (12660, 2304)) # un po' bruttino
    return reshaped_dataset


def apply_adaptive_threshold(dataset):
    dataset_with_filter = []
    for image in dataset:
        image = cv.adaptiveThreshold(image, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 11, 2)
        image = image.flatten()
        dataset_with_filter.append(image)
    dataset_with_filter = np.reshape(dataset_with_filter, (12660,2304))
    return dataset_with_filter

In [3]:
x_train_gr_smpl = pd.read_csv("./datasets/x_train_gr_smpl.csv", delimiter=",", dtype=np.uint8)
y_train_smpl = pd.read_csv("./datasets/y_train_smpl.csv", delimiter=",", dtype=np.uint8)

In [4]:
dataset = np.asmatrix(x_train_gr_smpl)
aom_dataset = get_array_of_matrix(dataset)
cropped_dataset = crop_dataset(aom_dataset, 40, 40)
new_dataset = reshape_dataset(cropped_dataset)
dataset1 = apply_adaptive_threshold(new_dataset)

In [15]:
df = np.append(dataset1, y_train_smpl, axis=1)
x = df[:, 0:1599]
y = df[:, -1]


# K-Means Clustering

In [7]:
kmeans = KMeans()

In [16]:
kmeans.fit(x)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [17]:
f = np.unique(kmeans.labels_)
print(f"Total number of clusters found: {len(f)}")

Total number of clusters found: 8


In [24]:
print("Confusion Matrix")
print(confusion_matrix(y,kmeans.labels_))
print("\n")
print("Classification report")
print(classification_report(y,kmeans.labels_))

Confusion Matrix
[[   6  347    6    0  440    5  532   74    0    0]
 [  11  665   19    0  366   12  598  189    0    0]
 [   2  188    8    0  112    4   30   76    0    0]
 [ 994  158    0    0    0    3    6  159    0    0]
 [  17  328   14    2    1 1364   88  286    0    0]
 [   1  233    0 1739    0   13    0  174    0    0]
 [  31    9  495    0    2    4   84  155    0    0]
 [   2   56   10    0   64   14   18   76    0    0]
 [  19  669  471    2   14    5   50  840    0    0]
 [   4   64   83    1    1   12    4  131    0    0]]


Classification report
              precision    recall  f1-score   support

           0       0.01      0.00      0.00      1410
           1       0.24      0.36      0.29      1860
           2       0.01      0.02      0.01       420
           3       0.00      0.00      0.00      1320
           4       0.00      0.00      0.00      2100
           5       0.01      0.01      0.01      2160
           6       0.06      0.11      0.08      

  'precision', 'predicted', average, warn_for)


### K-Means with n_clusters=10

In [21]:
kmeans1 = KMeans(n_clusters=10)
kmeans1.fit(x)
# f = np.unique(kmeans.labels_)
print(f"Total number of clusters found: {len(np.unique(kmeans1.labels_))}")

Total number of clusters found: 10


In [23]:
print("Confusion matrix")
print(confusion_matrix(y,kmeans1.labels_))
print("\n")
print("Classification report")
print(classification_report(y,kmeans1.labels_))

Confusion matrix
[[ 405   50  318  371  195   71    0    0    0    0]
 [ 529  136  259  381  408  145    0    0    1    1]
 [  10   32   89   40  167   81    0    0    0    1]
 [  40  125    0    0  121  111    0    0  923    0]
 [  84  239    0    7  280  383    0    0    2 1105]
 [   1  144    0    0  251  100 1664    0    0    0]
 [  33  129    4   71    1  100    0  441    1    0]
 [   8   27   49   12   41  102    0    0    0    1]
 [  59 1005   16   10  583  395    1    0    1    0]
 [   4  133    0    0   54  108    0    0    0    1]]


Classification report
              precision    recall  f1-score   support

           0       0.35      0.29      0.31      1410
           1       0.07      0.07      0.07      1860
           2       0.12      0.21      0.15       420
           3       0.00      0.00      0.00      1320
           4       0.13      0.13      0.13      2100
           5       0.06      0.05      0.05      2160
           6       0.00      0.00      0.00      