In [1]:
import pandas as pd
import numpy as np
import cv2 as cv
import matplotlib.pyplot as plt

import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Pre-processing methods for the dataset

def get_array_of_matrix(dataset):
    array_of_images = []
    for row in dataset:
        row = np.asarray(row)
        matrix = np.reshape(row, (48, 48))
        array_of_images.append(matrix)
    return array_of_images


def crop_dataset(dataset, row, clmn):
    copped_dataset = []
    for image in dataset:
        y, x = image.shape
        first_x = x//2-(row//2)
        first_y = y//2-(clmn//2)
        copped_dataset.append(image[first_y:first_y + clmn, first_x:first_x + row])
    return copped_dataset


def reshape_dataset(dataset):
    reshaped_dataset = []
    for image in dataset:
        image = cv.resize(image, (48, 48)) # un po' bruttino
        image = image.flatten()
        reshaped_dataset.append(image)
    # reshaped_dataset = np.reshape(reshaped_dataset, (12660, 2304)) # un po' bruttino
    return reshaped_dataset


def apply_adaptive_threshold(dataset):
    dataset_with_filter = []
    for image in dataset:
        image = cv.adaptiveThreshold(image, 255, cv.ADAPTIVE_THRESH_GAUSSIAN_C, cv.THRESH_BINARY, 11, 2)
        image = image.flatten()
        dataset_with_filter.append(image)
    dataset_with_filter = np.reshape(dataset_with_filter, (12660, 1600))
    return dataset_with_filter

In [3]:
x_train_gr_smpl = pd.read_csv("./datasets/x_train_gr_smpl.csv", delimiter=",", dtype=np.uint8)
y_train_smpl = pd.read_csv("./datasets/y_train_smpl.csv", delimiter=",", dtype=np.uint8)

In [55]:
dataset = np.asmatrix(x_train_gr_smpl)
aom_dataset = get_array_of_matrix(dataset)
cropped_dataset = crop_dataset(aom_dataset, 40, 40)
#new_dataset = reshape_dataset(cropped_dataset)
new_dataset = apply_adaptive_threshold(cropped_dataset)

In [5]:
# add y_train_smpl to new_dataset ---KARAN
dataset = np.append(new_dataset, y_train_smpl, axis=1)
x = dataset[:, 0:1599]
y = dataset[:, -1]
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=.33, random_state=17)

In [34]:
# code for Naive Bayes algorithm
# Gaussian
GausNB = GaussianNB()
GausNB.fit(X_train, y_train)
print(GausNB)
y_expect = y_test
y_pred = GausNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))
print(" ")
print("Confusion matrix:")
print(confusion_matrix(y_expect, y_pred))
print(" ")
print("Classification report:")
print(classification_report(y_expect, y_pred, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

GaussianNB(priors=None, var_smoothing=1e-09)
0.8475347056007659
 
Confusion matrix:
[[324  78  21   0  12   2   3   2   4   1]
 [ 78 433  50   1  27   1   1   5   7   1]
 [  4  10 112   0   3   0   1   9   0   1]
 [  3  12   0 411  24   0   0   0   4   0]
 [  1  45   7   5 596  12   0   3  18   2]
 [  6  12   2   1  19 682   0   0  19   0]
 [  0   0   0   0   4   0 224   0   9   2]
 [  2   3   2   0   1   0   0  76   0   0]
 [  9  43   2   0  30   0   8   1 610   1]
 [  0   0   0   0   3   0   0   0   0  73]]
 
Classification report:
              precision    recall  f1-score   support

           0       0.76      0.72      0.74       447
           1       0.68      0.72      0.70       604
           2       0.57      0.80      0.67       140
           3       0.98      0.91      0.94       454
           4       0.83      0.87      0.85       689
           5       0.98      0.92      0.95       741
           6       0.95      0.94      0.94       239
           7       0.79    

In [7]:
BernNB = BernoulliNB()
BernNB.fit(X_train, y_train)
print(BernNB)
y_expect = y_test
y_pred = BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
0.8128291048348492


In [38]:
MultiNom = MultinomialNB()
MultiNom.fit(X_train, y_train)
print(MultiNom)
y_expect = y_test
y_pred = MultiNom.predict(X_test)
print(accuracy_score(y_expect, y_pred))
# print(" ")
# print("Confusion matrix for MultiNom: ")
# print(confusion_matrix(y_expect, y_pred))
# plt.imshow(confusion_matrix(y_expect, y_pred), cmap="Blues")
# plt.show()
# print(" ")
# print("Classification report for MultiNom: ")
# print(classification_report(y_expect, y_pred, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.8123504068932503


In [9]:
correlation_dict={0:[863,903,823,817,783,943,1179,581,541,1180],\
                  1:[818,659,699,1179,1180,541,1182,581,1181,1183],\
                  2:[1135,863,548,1134,823,1136,903,1192,1175,547],\
                  3:[821,861,862,781,822,902,901,1102,1101,820],\
                  4:[1142,946,906,1141,542,541,582,907,1382,1102],\
                  5:[980,1020,940,979,664,939,665,981,899,1019],\
                  6:[1148,1187,1185,1186,1149,1184,506,1142,505,1183],\
                  7:[814,821,862,822,861,774,813,773,815,781],\
                  8:[869,621,829,1262,785,872,873,833,832,662],\
                  9:[906,946,699,504,698,464,465,866,945,738]}

In [10]:
top_2_cols = [values[:2] for values in correlation_dict.values()]
top_2_cols = sorted(set([y for x in top_2_cols for y in x])) # Flatten & remove duplicates
top_5_cols = [values[:5] for values in correlation_dict.values()]
top_5_cols = sorted(set([y for x in top_5_cols for y in x])) # # Flatten & remove duplicates
top_10_cols = [values[::] for values in correlation_dict.values()]
top_10_cols = sorted(set([y for x in top_10_cols for y in x])) # # Flatten & remove duplicates

In [11]:
# Create a pandas dataframe  
dataframe = pd.DataFrame(data=dataset, index=None,columns=None,dtype=np.uint8, copy=False)

## Top 2 

In [12]:
top_2_df = dataframe.iloc[:,top_2_cols]

In [45]:
x = top_2_df
y = dataset[:, -1]
X_train2, X_test2, y_train2, y_test2 = train_test_split(x, y, test_size=.33, random_state=17)

In [49]:
GausNB = GaussianNB()
GausNB.fit(X_train2, y_train2)
print(GausNB)
y_expect2 = y_test2
y_pred2 = GausNB.predict(X_test2)

print(accuracy_score(y_expect2, y_pred2))
print(" ")
print("Confusion matrix:")
print(confusion_matrix(y_expect2, y_pred2))
print(" ")
print("Classification report:")
print(classification_report(y_expect2, y_pred2, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

GaussianNB(priors=None, var_smoothing=1e-09)
0.7108664432742939
 
Confusion matrix:
[[294  50  30   9  32   5  10   9   7   1]
 [148 273  49  22  36   3   6  23  40   4]
 [ 19  13  87   0   1   0   6  12   1   1]
 [  8  13   3 393  24   0   1   1  10   1]
 [ 22  30  15   5 362 139   6   3  40  67]
 [ 18   8   7   0  22 675   2   0   9   0]
 [  1   3   5   0   5   0 215   0   6   4]
 [  4   6   6   0   0   0   0  68   0   0]
 [ 12  35  26   5  31  24  23   2 544   2]
 [  0   1   1   0  13   1   0   0   1  59]]
 
Classification report:
              precision    recall  f1-score   support

           0       0.56      0.66      0.60       447
           1       0.63      0.45      0.53       604
           2       0.38      0.62      0.47       140
           3       0.91      0.87      0.89       454
           4       0.69      0.53      0.60       689
           5       0.80      0.91      0.85       741
           6       0.80      0.90      0.85       239
           7       0.58    

In [39]:
BernNB = BernoulliNB()
BernNB.fit(X_train, y_train)
print(BernNB)
y_expect = y_test
y_pred = BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))
# print(" ")
# print("Classification report for MultiNom: ")
# print(classification_report(y_expect, y_pred, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
0.8549545236955481


In [40]:
MultiNom = MultinomialNB()
MultiNom.fit(X_train, y_train)
print(MultiNom)
y_expect = y_test
y_pred = MultiNom.predict(X_test)
# print(accuracy_score(y_expect, y_pred))
# print(" ")
# print("Confusion matrix for MultiNom: ")
# print(confusion_matrix(y_expect, y_pred))
# plt.imshow(confusion_matrix(y_expect, y_pred), cmap="Blues")
# plt.show()
# print(" ")
# print("Classification report for MultiNom: ")
# print(classification_report(y_expect, y_pred, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


## Top 5

In [17]:
top_5_df = dataframe.iloc[:,top_5_cols]

In [48]:
x = top_5_df
y = dataset[:, -1]
X_train5, X_test5, y_train5, y_test5 = train_test_split(x, y, test_size=.33, random_state=17)

In [51]:
GausNB = GaussianNB()
GausNB.fit(X_train5, y_train5)
print(GausNB)
y_expect5 = y_test5
y_pred5 = GausNB.predict(X_test5)
print(accuracy_score(y_expect5, y_pred5))
print(" ")
print("Confusion matrix:")
print(confusion_matrix(y_expect5, y_pred5))
print(" ")
print("Classification report:")
print(classification_report(y_expect5, y_pred5, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

GaussianNB(priors=None, var_smoothing=1e-09)
0.8118717089516515
 
Confusion matrix:
[[327  60  21   2  12   2   1  16   5   1]
 [ 99 377  51   5  24   1   2  21  21   3]
 [  8   8  94   0   3   0   1  23   3   0]
 [  6   6   0 401  34   0   0   0   5   2]
 [  5  43  14   3 563  24   1   3  29   4]
 [  5  10   2   1  23 680   0   0  18   2]
 [  0   0   0   0   2   0 225   0  11   1]
 [  3   6   4   0   2   0   0  69   0   0]
 [  9  47   3   2  40   1  17   0 584   1]
 [  0   0   0   1   2   0   0   0   1  72]]
 
Classification report:
              precision    recall  f1-score   support

           0       0.71      0.73      0.72       447
           1       0.68      0.62      0.65       604
           2       0.50      0.67      0.57       140
           3       0.97      0.88      0.92       454
           4       0.80      0.82      0.81       689
           5       0.96      0.92      0.94       741
           6       0.91      0.94      0.93       239
           7       0.52    

In [20]:
BernNB = BernoulliNB()
BernNB.fit(X_train, y_train)
print(BernNB)
y_expect = y_test
y_pred = BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
0.8192915270464337


In [41]:
MultiNom = MultinomialNB()
MultiNom.fit(X_train, y_train)
print(MultiNom)
y_expect = y_test
y_pred = MultiNom.predict(X_test)
print(accuracy_score(y_expect, y_pred))
# print(" ")
# print("Confusion matrix for MultiNom: ")
# print(confusion_matrix(y_expect, y_pred))
# plt.imshow(confusion_matrix(y_expect, y_pred), cmap="Blues")
# plt.show()
# print(" ")
# print("Classification report for MultiNom: ")
# print(classification_report(y_expect, y_pred, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
0.8123504068932503


In [22]:
top_10_df = dataframe.iloc[:,top_10_cols]

In [52]:
x = top_10_df
y = dataset[:, -1]
X_train10, X_test10, y_train10, y_test10 = train_test_split(x, y, test_size=.33, random_state=17)

In [54]:
GausNB = GaussianNB()
GausNB.fit(X_train10, y_train10)
print(GausNB)
y_expect10 = y_test10
y_pred10 = GausNB.predict(X_test10)
print(accuracy_score(y_expect10, y_pred10))
print(" ")
print("Confusion matrix:")
print(confusion_matrix(y_expect10, y_pred10))
print(" ")
print("Classification report:")
print(classification_report(y_expect10, y_pred10, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

GaussianNB(priors=None, var_smoothing=1e-09)
0.8475347056007659
 
Confusion matrix:
[[324  78  21   0  12   2   3   2   4   1]
 [ 78 433  50   1  27   1   1   5   7   1]
 [  4  10 112   0   3   0   1   9   0   1]
 [  3  12   0 411  24   0   0   0   4   0]
 [  1  45   7   5 596  12   0   3  18   2]
 [  6  12   2   1  19 682   0   0  19   0]
 [  0   0   0   0   4   0 224   0   9   2]
 [  2   3   2   0   1   0   0  76   0   0]
 [  9  43   2   0  30   0   8   1 610   1]
 [  0   0   0   0   3   0   0   0   0  73]]
 
Classification report:
              precision    recall  f1-score   support

           0       0.76      0.72      0.74       447
           1       0.68      0.72      0.70       604
           2       0.57      0.80      0.67       140
           3       0.98      0.91      0.94       454
           4       0.83      0.87      0.85       689
           5       0.98      0.92      0.95       741
           6       0.95      0.94      0.94       239
           7       0.79    

In [25]:
BernNB = BernoulliNB()
BernNB.fit(X_train, y_train)
print(BernNB)
y_expect = y_test
y_pred = BernNB.predict(X_test)
print(accuracy_score(y_expect, y_pred))

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
0.8549545236955481


In [42]:
MultiNom = MultinomialNB()
MultiNom.fit(X_train, y_train)
print(MultiNom)
y_expect = y_test
y_pred = MultiNom.predict(X_test)
# print(accuracy_score(y_expect, y_pred))
# print(" ")
# print("Confusion matrix for MultiNom: ")
# print(confusion_matrix(y_expect, y_pred))
# plt.imshow(confusion_matrix(y_expect, y_pred), cmap="Blues")
# plt.show()
# print(" ")
# print("Classification report for MultiNom: ")
# print(classification_report(y_expect, y_pred, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
