# 1 Importing libraries and loading data

In [477]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm
from sklearn.metrics import confusion_matrix

np.seterr(divide='ignore', invalid='ignore')

# useful function to load CIFAR-10 datasets
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [235]:
# loading datasets

# fashion MNIST
fashion_mnist_train = pd.read_csv("./datasets/fashion-mnist_train.csv") 
fashion_mnist_test = pd.read_csv("./datasets/fashion-mnist_test.csv")

# CIFAR-10 train
baseName = "./datasets/cifar-10-batches-py/data_batch_"
X_train = np.array([row for batch in [unpickle(baseName + str(i))[b'data'] for i in range(1, 6)] for row in batch])
y_train = np.array([label for batch in [unpickle(baseName + str(i))[b'labels'] for i in range(1, 6)] for label in batch])
cifar10_train = pd.DataFrame(X_train)
cifar10_train["class"] = y_train

# CIFAR-10 test
X_test = np.array([row for batch in [unpickle("./datasets/cifar-10-batches-py/test_batch")[b'data'] for i in range(1, 6)] for row in batch])
y_test = np.array([label for batch in [unpickle("./datasets/cifar-10-batches-py/test_batch")[b'labels'] for i in range(1, 6)] for label in batch])
cifar10_test = pd.DataFrame(X_test)
cifar10_test["class"] = y_test

# 2 Data analysis

# 3 Classification

## 3.1 Naive Bayes implementation

In [468]:
class NaiveBayes:
    def __init__(self):
        pass
    
    def fit(self, X, y):
        separated = [[x for x, t in zip(X, y) if t == c] for c in np.unique(y)]
        self.means = np.array([np.mean(g, axis=0) for g in separated])
        self.stds = np.array([np.std(g, axis=0) for g in separated]) + 0.005
    
    def predict(self, X):
        return np.apply_along_axis(self.predict_class_by_row, 1, X)
        
    def predict_class_by_row(self, row):
        P = np.sum(np.log(np.divide(np.exp(-np.divide((row-self.means)**2, 2 * self.stds**2)), 2*self.stds*np.sqrt(2*np.pi))), axis=1)
        return np.argmax(P)

## 3.2 Naive Bayes on CIFAR-10

In [473]:
model = NaiveBayes()
X_train = cifar10_train.drop(columns="class").as_matrix()
y_train = cifar10_train["class"].as_matrix()
model.fit(X_train, y_train)

In [474]:
import time
start = time.time()
X_test = cifar10_test.drop(columns="class").as_matrix()
predicts = model.predict(X_test)
print(time.time() - start)

48.78584122657776


In [475]:
(cifar10_test["class"] == predicts).sum() / 50000 * 100

29.759999999999998

In [476]:
confusion_matrix(cifar10_test["class"], predicts)

array([[2470,  100,  195,   50,  425,  170,  250,   45,  995,  300],
       [ 705,  830,  120,  155,  330,  360,  960,   95,  605,  840],
       [1125,  120,  415,   75, 1460,  240, 1045,  105,  270,  145],
       [ 815,  180,  270,  380,  755,  645, 1310,  130,  170,  345],
       [ 430,   40,  285,  130, 2085,  190, 1325,  110,  250,  155],
       [ 780,   85,  275,  255,  835, 1320,  795,  180,  285,  190],
       [ 530,   10,  300,   90, 1140,  230, 2335,   75,   95,  195],
       [ 670,  120,  180,  205, 1140,  470,  510,  655,  360,  690],
       [ 840,  205,   90,   85,  280,  415,  195,   40, 2355,  495],
       [ 720,  335,   85,  100,  240,  160,  505,  115,  705, 2035]])

## 3.3 Naive Bayes on fashion MNIST

In [478]:
model2 = NaiveBayes()
X_train = fashion_mnist_train.drop(columns="label").as_matrix() 
y_train = fashion_mnist_train["label"].as_matrix()
model2.fit(X_train, y_train)

In [479]:
import time
start = time.time()
X_test = fashion_mnist_test.drop(columns="label").as_matrix()
predicts = model2.predict(X_test)
print(time.time() - start)

3.280240058898926


In [480]:
(fashion_mnist_test["label"] == predicts).sum() / len(fashion_mnist_test) * 100

59.38

In [481]:
confusion_matrix(fashion_mnist_test["label"], predicts)

array([[597,  40,  25, 198,  85,   0,  16,   0,  39,   0],
       [  0, 940,  16,  33,   2,   1,   8,   0,   0,   0],
       [  4,   7, 325,  68, 552,   0,  21,   0,  23,   0],
       [  6, 289,   6, 666,  25,   0,   5,   0,   3,   0],
       [  0,  32,  37, 156, 765,   0,   1,   0,   9,   0],
       [  1,   1,   1,   2,   0, 291,   6, 646,  14,  38],
       [113,  28, 102, 240, 432,   0,  39,   0,  46,   0],
       [  0,   0,   0,   0,   0,   3,   0, 977,   0,  20],
       [  4,   2,  17,  79, 161,   3,  23,   3, 707,   1],
       [  0,   0,   0,   1,   0,  28,   2, 327,  11, 631]])

# 4 Linear regression