#### Naive Bayes Implementation ####

In [287]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import numpy.linalg as linalg
import scipy.ndimage
import scipy.stats as sp
import math

def readData():
    training = np.loadtxt('digits123-1.csv', delimiter=';')
    test = np.loadtxt('digits123-2.csv', delimiter=';')
    return training, test

# Calculate mean for all features in dataset
# return list of all means
def calcMean(X):
    m = len(X)
    n = len(X[0])
    totalSum = []
    for i in range(n):
        sum = 0
        for j in range(m):
            sum += float(X[j][i])

        totalSum.append(float(sum / len(X)))

    return totalSum

# Calculate variance for all feautures in dataset
# based on list of means and dataset
def calcVar(X, mean):
    m = len(X)
    n = len(X[0])
    totalSum = np.ones((m,n))
    for i in range(n):
        for j in range(m):
            totalSum[j][i] = float((X[j][i] - mean[i])**2)
    
    # calc mean of variance per feature
    var = calcMean(totalSum)
    
    return var
         
# returns the size of each class based on target values
def classSize(Y):
    size = []
    uniques = list(set(Y))
    for i in range(len(uniques)):
        counter = 0
        for j in range(len(Y)):
            if Y[j] == uniques[i]:
                counter +=1    
        size.append(counter)
    return size

# calculate normal distribution
def gaussian(mean, var, v):
    if var != 0:
        gaus = 1.0 / math.sqrt(2 * math.pi * var) * math.exp(-(v - mean)**2/(2*var))
    else:
        if v == mean:
            gaus = 1
        else:
            gaus = 0
            
    return gaus

# split dataset for each class
def splitClasses(X, Y):
    size = classSize(Y)
    classes = []
    class1 = X[0: size[0]]
    classes.append(class1)
    base = size[0]
    for i in range(len(size)-1):
        classN = X[base:base + size[i+1]]
        base = base + size[i+1]
        classes.append(classN)  
        
    return classes
    
# retrieve mean for each class
def classMean(classes):
    means = []
    for i in range(len(classes)):
        mean = calcMean(classes[i])
        means.append(mean)
    return means

# retrieve variance for each class
def classVars(classes, means):
    variances = []
    for i in range(len(classes)):
        var = calcVar(classes[0], means[0])
        variances.append(var)
    return variances

# claculate probabilities for each class
def calcPDF(means, variances, testSet):
    m = len(testSet)
    n = len(testSet[0])-1
    probabilities = np.ones((m,len(means)))
    for h in range(len(means)):
        for i in range(m):
            for j in range(n):
                prob = gaussian(means[h][j], variances[h][j] ,testSet[i][j])
                if prob != 0:
                    probabilities[i][h] *= prob

    return probabilities

# give best predictions for test set
def predict(probabilities, Y):
    uniques = list(set(Y))
    predictions = []
    finalProbs = []
    for i in range(len(probabilities)):
        best = probabilities[i].argmax(axis=0)
        predictions.append(uniques[best])
        
        probs = probabilities[i].max()
        finalProbs.append(probs)
        
    return predictions, finalProbs
    
# calculate the percentage of correct predictions
def getAccuracy(test_set, target):
    correct = 0
    miss = []
    for i in range(len(test_set)):
        if test_set[i][-1] == target[i]:
            correct += 1
        else:
            miss.append(i)
            
    accuracy = (correct/float(len(test_set))) * 100.0
    return accuracy, miss

# give back the n misclassifications with highset probability
def highestMiss(prob, miss, n):
    misclassified = {}
    for i in range(len(miss)):
        misclassified[prob[miss[i]]] = miss[i]
    keys = sorted(misclassified.items(), key=lambda x: x[0])
    return keys[-n:]
    
# main function
def naiveBayes():
    # preprocess data
    dataSets = readData()
    X = dataSets[0]
    Y = X[:,-1]
    X = np.delete(X, -1, 1)
    testSet = dataSets[1]
    
    # split classes and calculate mean and variance for each class
    classes = splitClasses(X, Y)
    means = classMean(classes)
    variances = classVars(classes, means)

    # calculate pdf, predictions and accuracy
    probs = calcPDF(means, variances, testSet)
    pred, finalProbs = predict(probs, Y)
    acc, miss = getAccuracy(testSet, pred)
    
    # examples with highest misclassification
    misclassified = highestMiss(finalProbs, miss, 10)
    
    print "Accuracy: ", acc
    print misclassified
        
naiveBayes()

Accuracy:  60.8333333333
[(4.5076557255914265e-65, 156), (4.8806794970637234e-65, 149), (6.3970153534652496e-65, 207), (2.4932007457562496e-63, 152), (1.6574780653238209e-62, 157), (9.8485919284844255e-62, 222), (1.6381044703392142e-57, 105), (9.2914534931736198e-57, 132), (2.5853255454796082e-55, 126), (3.1439978732073109e-54, 101)]


Accuracy:  60.83 % <br>
Ten misclassifications with highest probability (for digits123-2.csv): <br>
101, 126, 132, 105, 222, 157, 152, 207, 149, 156