In [1]:
import pandas as pd
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt

import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.mixture import GaussianMixture

In [2]:
x_train_gr_smpl = pd.read_csv("./datasets/x_train_gr_smpl.csv", delimiter=",", dtype=np.uint8)
y_train_smpl = pd.read_csv("./datasets/y_train_smpl.csv", delimiter=",", dtype=np.uint8)

In [3]:
# Pre-processing methods for the dataset

def get_array_of_matrix(dataset):
    array_of_images = []
    for row in dataset:
        row = np.asarray(row)
        matrix = np.reshape(row, (48, 48))
        array_of_images.append(matrix)
    return array_of_images


def crop_dataset(dataset, row, clmn):
    copped_dataset = []
    for image in dataset:
        y, x = image.shape
        first_x = x//2-(row//2)
        first_y = y//2-(clmn//2)
        copped_dataset.append(image[first_y:first_y + clmn, first_x:first_x + row])
    return copped_dataset


def reshape_dataset(dataset):
    reshaped_dataset = []
    for image in dataset:
        image = cv.resize(image, (48, 48)) # un po' bruttino
        image = image.flatten()
        reshaped_dataset.append(image)
    # reshaped_dataset = np.reshape(reshaped_dataset, (12660, 2304)) # un po' bruttino
    return reshaped_dataset


def apply_adaptive_threshold(dataset):
    dataset_with_filter = []
    for image in dataset:
        image = cv.adaptiveThreshold(image, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 11, 2)
        image = image.flatten()
        dataset_with_filter.append(image)
    dataset_with_filter = np.reshape(dataset_with_filter, (12660,2304))
    return dataset_with_filter

In [4]:
dataset = np.asmatrix(x_train_gr_smpl)
aom_dataset = get_array_of_matrix(dataset)
cropped_dataset = crop_dataset(aom_dataset, 40, 40)
new_dataset = reshape_dataset(cropped_dataset)
dataset1 = apply_adaptive_threshold(new_dataset)

In [5]:
df = np.append(dataset1, y_train_smpl, axis=1)
x = df[:, 0:1599]
y = df[:, -1]

## Clustering

In [6]:
gmm = GaussianMixture()
gmm.fit(x)

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
        means_init=None, n_components=1, n_init=1, precisions_init=None,
        random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
        verbose_interval=10, warm_start=False, weights_init=None)

In [7]:
labels = gmm.predict(x)
print(f"Total clusters found: {len(np.unique(labels))}")

Total clusters found: 1


## Running with 10 n_components as a classifier

In [8]:
gmm1 = GaussianMixture(n_components=10)
gmm1.fit(x)
labels1 = gmm1.predict(x)
print(f"Total clusters found: {len(np.unique(labels1))}")

Total clusters found: 10


In [9]:
print("Confusion Matrix:")
print(confusion_matrix(y,labels1))
print("\n")
print("Classification report:")
print(classification_report(y,labels1))

Confusion Matrix:
[[ 141    0  414    6   78    0  463  305    3    0]
 [ 319    0  350    9  177    1  480  521    3    0]
 [ 107    0   93    1   31    0   32  155    1    0]
 [  98    0    0    0  145  923    3  151    0    0]
 [ 274    2    1    6  252    2   36  309 1217    1]
 [  71  731    0    0   95    0    0  219    4 1040]
 [ 104    0    3  470  140    2   57    4    0    0]
 [ 108    0   48    5   24    0   13   36    6    0]
 [ 329    1   11  385  731    2   23  587    1    0]
 [  74    0    1   68  101    0    1   54    0    1]]


Classification report:
              precision    recall  f1-score   support

           0       0.09      0.10      0.09      1410
           1       0.00      0.00      0.00      1860
           2       0.10      0.22      0.14       420
           3       0.00      0.00      0.00      1320
           4       0.14      0.12      0.13      2100
           5       0.00      0.00      0.00      2160
           6       0.05      0.07      0.06    

## Scaling the data to get better results

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
scaler = StandardScaler()

In [12]:
dataframe = pd.DataFrame(data=df, index=None,columns=None,dtype=np.uint8, copy=False)

In [13]:
dataframe.rename(columns ={1600:"Label"}, inplace =True) #Renaming the last column as "Label"
scaler.fit(dataframe.drop("Label", axis=1))# Fit the data to all columns except Labels
scaled_features = scaler.transform(dataframe.drop("Label", axis=1))
df_feat = pd.DataFrame(scaled_features,columns=dataframe.columns[:-1])

  return self.partial_fit(X, y)
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
X = df_feat
y = dataframe["Label"]
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12660 entries, 0 to 12659
Columns: 2304 entries, 0 to 2303
dtypes: float64(2304)
memory usage: 222.5 MB


In [15]:
gmm_scaled = GaussianMixture()
gmm_scaled.fit(X)
labels_scaled = gmm_scaled.predict(X)
print(f"Total clusters found: {len(np.unique(labels_scaled))}")

Total clusters found: 1
