# Naive Bayes on the MNIST dataset

Step 1: Import Packages

In [1]:
import cv2
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18 only ships the old module path
    from sklearn.cross_validation import train_test_split




Step 2: Prepare data.

In [2]:
# Load the training CSV (its first row is the header) as a plain array.
raw_data = pd.read_csv('../data/train.csv', header=0)
data = raw_data.values

# Column 0 is used as the label; every remaining column is a feature.
labels = data[:, 0]
imgs = data[:, 1:]

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    imgs,
    labels,
    test_size=0.33,
    random_state=23323,
)

Step 3: Build model.

In [3]:
class NaiveBayes(object):
    """Naive Bayes classifier over binarized features.

    Every feature is thresholded to 0 or 1 (see ``_binarization``) and the
    per-class probability of each binary value is estimated from counts with
    add-one smoothing.

    Attributes set by ``train``:
        prior_probability: array of shape ``(num_classes,)`` holding the
            smoothed class priors.
        conditional_probability: array of shape
            ``(num_classes, feature_dim, 2)``; entry ``[c, j, v]`` is the
            smoothed probability that binarized feature ``j`` equals ``v``
            given class ``c``.
        feature_dim: number of features per sample.
    """

    def __init__(self, num_classes=10):
        """Create an untrained model for ``num_classes`` classes."""
        self.num_classes = num_classes
        self.prior_probability = None
        self.conditional_probability = None
        self.feature_dim = None

    def _binarization(self, img):
        """Return a ``uint8`` array that is 0 where ``img`` > 50, else 1.

        The input is cast to ``uint8`` first, so the comparison is made on
        the 8-bit value. Works on arrays of any shape.
        """
        bin_img = np.asarray(img).astype(np.uint8)
        return (bin_img <= 50).astype(np.uint8)

    def train(self, features, labels):
        """Estimate priors and per-feature conditional probabilities.

        Args:
            features: 2-D array-like, one row per sample.
            labels: 1-D array-like of integer class indices in
                ``[0, num_classes)``, one per row of ``features``.

        Raises:
            ValueError: if ``features`` is empty or not 2-D, if ``labels``
                does not have one entry per row, or if a label falls outside
                ``[0, num_classes)``.
            TypeError: if ``labels`` is not of an integer dtype.
        """
        features = np.asarray(features)
        labels = np.asarray(labels)

        if features.ndim != 2 or features.shape[0] == 0:
            raise ValueError("features must be a non-empty 2-D array")
        if labels.shape != (features.shape[0],):
            raise ValueError("labels must be 1-D with one entry per feature row")
        if not np.issubdtype(labels.dtype, np.integer):
            raise TypeError("labels must be integer class indices")
        # Negative labels would otherwise silently index from the end.
        if labels.min() < 0 or labels.max() >= self.num_classes:
            raise ValueError("labels must lie in [0, num_classes)")
        labels = labels.astype(np.intp)

        num_values = 2  # a binarized feature is either 0 or 1
        self.feature_dim = features.shape[1]
        bin_features = self._binarization(features)

        # Number of training samples per class.
        class_counts = np.bincount(labels, minlength=self.num_classes).astype(float)

        # ones_counts[c, j]: samples of class c whose binarized feature j is 1.
        ones_counts = np.zeros((self.num_classes, self.feature_dim))
        for cls in range(self.num_classes):
            ones_counts[cls] = bin_features[labels == cls].sum(axis=0)
        zeros_counts = class_counts[:, np.newaxis] - ones_counts

        # Add-one smoothing keeps every probability strictly positive, which
        # predict() relies on when taking logarithms.
        self.prior_probability = (class_counts + 1) / (class_counts.sum() + self.num_classes)

        denominator = class_counts[:, np.newaxis] + num_values
        self.conditional_probability = np.empty(
            (self.num_classes, self.feature_dim, num_values)
        )
        self.conditional_probability[:, :, 0] = (zeros_counts + 1) / denominator
        self.conditional_probability[:, :, 1] = (ones_counts + 1) / denominator

    def predict(self, features):
        """Return the most probable class for each row of ``features``.

        Scores are accumulated as sums of log-probabilities. Multiplying one
        probability per feature underflows to 0.0 for every class once there
        are enough features, which makes the comparison meaningless.

        Args:
            features: 2-D array-like, one row per sample, with
                ``feature_dim`` columns.

        Returns:
            1-D float array of predicted class indices, one per row. When
            several classes share the best score, the highest class index
            is returned.

        Raises:
            RuntimeError: if called before ``train``.
        """
        if self.prior_probability is None or self.conditional_probability is None:
            raise RuntimeError("train() must be called before predict()")

        features = np.asarray(features)
        if features.shape[0] == 0:
            return np.zeros(0)

        bin_features = self._binarization(features)

        log_prior = np.log(self.prior_probability)
        log_conditional = np.log(self.conditional_probability)
        log_zero = log_conditional[:, :, 0]
        log_one = log_conditional[:, :, 1]

        # sum_j log P(b_j | c) = sum_j log P(0 | c) + sum_j b_j * (log P(1 | c) - log P(0 | c))
        scores = (
            log_prior
            + log_zero.sum(axis=1)
            + bin_features.dot((log_one - log_zero).T)
        )

        # argmax returns the first maximum; scanning the classes in reverse
        # makes ties resolve to the highest class index.
        best = scores.shape[1] - 1 - np.argmax(scores[:, ::-1], axis=1)
        return best.astype(float)

Step 4: Train model.

In [4]:
# Fit a classifier with the default number of classes on the training split.
nb = NaiveBayes()
nb.train(features=x_train, labels=y_train)

Step 5: Evaluate model.

In [5]:
# Predict labels for the held-out split and print the fraction that match.
y_predicted = nb.predict(x_test)
# accuracy_score takes (y_true, y_pred); passing them by keyword keeps the
# roles explicit even though accuracy itself does not depend on the order.
score = accuracy_score(y_true=y_test, y_pred=y_predicted)
print(score)

0.8326839826839827
