In [4]:
from __future__ import print_function, division
from builtins import range, input
from datetime import datetime
from scipy.stats import norm
from future.utils import iteritems
from scipy.stats import multivariate_normal as mvn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split




In [5]:
class NaiveBayes(object):
    """Gaussian Naive Bayes classifier.

    Every class is modelled with one Gaussian per feature (a diagonal
    covariance), combined with the empirical class prior. Labels may be any
    sortable, hashable values; they do not have to be the integers 0..K-1.
    """

    def fit(self, X, Y, smoothing=1e-2):
        """Estimate per-class feature means, variances and class priors.

        Args:
            X: 2-D array-like with one sample per row.
            Y: 1-D array-like of class labels aligned with the rows of X.
            smoothing: constant added to every per-feature variance, so a
                feature that is constant within a class never yields a zero
                variance.

        Side effects:
            Sets ``self.gaussians`` (label -> {'mean': ..., 'var': ...}) and
            ``self.priors`` (label -> fraction of samples with that label).
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        self.gaussians = dict()
        self.priors = dict()
        for c in np.unique(Y):
            mask = Y == c
            current_x = X[mask]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing,
            }
            self.priors[c] = float(mask.sum()) / len(Y)

    def _sorted_classes(self):
        """Return the fitted class labels in ascending order."""
        return sorted(self.gaussians)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or NaN when the ratio is undefined."""
        if denominator == 0:
            return float('nan')
        return float(numerator) / denominator

    def score(self, features, labels):
        """Print per-class recall and precision and return the accuracy.

        One line is printed for each class seen during ``fit``. A ratio whose
        denominator is zero (the class is absent from ``labels``, or is never
        predicted) is reported as ``nan`` instead of raising.

        Args:
            features: 2-D array-like with one sample per row.
            labels: 1-D array-like of true labels aligned with ``features``.

        Returns:
            The fraction of samples whose predicted label equals the true one.
        """
        labels = np.asarray(labels)
        P = self.predict(features)
        for c in self._sorted_classes():
            predicted_as_c = P == c
            actually_c = labels == c
            true_positives = int(np.count_nonzero(predicted_as_c & actually_c))
            recall = self._ratio(true_positives, int(np.count_nonzero(actually_c)))
            precision = self._ratio(true_positives, int(np.count_nonzero(predicted_as_c)))
            print(c, "- Recall:", round(recall, 2), "Precision:", round(precision, 2))
        return np.mean(P == labels)

    def predict(self, features):
        """Return the most probable class label for every row of ``features``.

        Args:
            features: 2-D array-like with one sample per row.

        Returns:
            1-D array with the label that maximises
            log-likelihood + log-prior for each sample. Ties go to the
            smallest label.
        """
        features = np.asarray(features)
        N, _ = features.shape
        classes = self._sorted_classes()
        log_posterior = np.zeros((N, len(classes)))
        # Columns are addressed by position in the sorted class list rather
        # than by the label value, so labels need not be 0..K-1 integers.
        for k, c in enumerate(classes):
            g = self.gaussians[c]
            # A 1-D `cov` is interpreted by scipy as a diagonal covariance.
            log_posterior[:, k] = (
                mvn.logpdf(features, mean=g['mean'], cov=g['var'])
                + np.log(self.priors[c])
            )
        return np.asarray(classes)[np.argmax(log_posterior, axis=1)]


In [6]:
def get_data(limit=None, path="train.csv", seed=None):
    """Load a label-first CSV, shuffle its rows and scale the features.

    Args:
        limit: if not None, keep only the first ``limit`` rows after the
            shuffle.
        path: CSV file to read. The first column is taken as the label and
            the remaining columns as features.
        seed: if not None, shuffle with a dedicated ``RandomState(seed)`` so
            the row order is repeatable; otherwise the global NumPy random
            state is used.

    Returns:
        (X, Y): X holds the feature columns divided by 255.0 (values are
        assumed to lie in 0..255), Y holds the label column.
    """
    print("Reading in and transforming data...")
    df = pd.read_csv(path, encoding="ISO-8859-1")
    # Copy before shuffling: `df.values` may be a read-only view of the frame
    # (pandas Copy-on-Write), and an in-place shuffle would then fail.
    data = df.values.copy()
    if seed is None:
        np.random.shuffle(data)
    else:
        np.random.RandomState(seed).shuffle(data)
    X = data[:, 1:] / 255.0  # data is from 0..255
    Y = data[:, 0]
    if limit is not None:
        X, Y = X[:limit], Y[:limit]
    return X, Y

In [7]:
# Seed the global NumPy RNG so the shuffle inside get_data() -- and with it
# the train/test split and every metric printed below -- is repeatable.
np.random.seed(0)

features, labels = get_data(42000)

# The first 30% of the shuffled rows are held out for testing; the remaining
# 70% are used for training.
n_test = len(labels) // 10 * 3
Ntrain = n_test  # old, misleading name kept in case a later cell refers to it
featuresTrain, labelsTrain = features[n_test:], labels[n_test:]
featuresTest, labelsTest = features[:n_test], labels[:n_test]

model = NaiveBayes()
t0 = datetime.now()
model.fit(featuresTrain, labelsTrain)
print("Training time:", (datetime.now() - t0))


def report_accuracy(split_name, split_features, split_labels):
    """Score `model` on one split, then print its accuracy and timing.

    model.score() prints the per-class recall/precision lines itself; this
    helper adds the accuracy (as a percentage rounded to two decimals), the
    elapsed time and the split size.
    """
    started = datetime.now()
    accuracy = model.score(split_features, split_labels)
    print("{} accuracy:".format(split_name), round(accuracy * 100, 2), "%")
    print("Time to compute {} accuracy:".format(split_name.lower()),
          (datetime.now() - started),
          "{} size:".format(split_name), len(split_labels))


report_accuracy("Train", featuresTrain, labelsTrain)
print()
report_accuracy("Test", featuresTest, labelsTest)

Reading in and transforming data...
Training time: 0:00:00.801129
0 - Recall: 0.9 Precision: 0.93
1 - Recall: 0.96 Precision: 0.79
2 - Recall: 0.74 Precision: 0.9
3 - Recall: 0.76 Precision: 0.81
4 - Recall: 0.65 Precision: 0.84
5 - Recall: 0.63 Precision: 0.87
6 - Recall: 0.92 Precision: 0.84
7 - Recall: 0.81 Precision: 0.94
8 - Recall: 0.75 Precision: 0.64
9 - Recall: 0.87 Precision: 0.62
Train accuracy: 80.21 %
Time to compute train accuracy: 0:00:18.091883 Train size: 29400

0 - Recall: 0.92 Precision: 0.93
1 - Recall: 0.96 Precision: 0.8
2 - Recall: 0.76 Precision: 0.89
3 - Recall: 0.76 Precision: 0.82
4 - Recall: 0.64 Precision: 0.84
5 - Recall: 0.61 Precision: 0.87
6 - Recall: 0.92 Precision: 0.84
7 - Recall: 0.8 Precision: 0.94
8 - Recall: 0.74 Precision: 0.66
9 - Recall: 0.88 Precision: 0.61
Test accuracy: 80.5 %
Time to compute test accuracy: 0:00:10.208429 Test size: 12600
