In [6]:
import scipy.io as sio
import numpy as np
import pandas as pd
import math
import random
import matplotlib.pyplot as plt
%matplotlib inline

# Load the data
# Parse the .mat file once and take both arrays from the same dict instead of
# reading the file from disk a second time for the test set.
digits_mat = sio.loadmat('a1digits.mat')
X = digits_mat['digits_train'] # Training data
Y = digits_mat['digits_test'] # Testing data

# Normalize each digit
def normalizer(X):
    """Min-max scale every digit vector of a 3-D array to the range [0, 1].

    Each vector ``X[:, j, i]`` (all entries along axis 0 for one index ``j``
    on axis 1 and one index ``i`` on axis 2) is rescaled independently with
    ``(v - min(v)) / (max(v) - min(v))``.

    Parameters
    ----------
    X : numpy.ndarray
        Array with exactly three axes; axis 0 holds the features of one
        digit vector.

    Returns
    -------
    numpy.ndarray
        The normalized array. A floating-point input is rescaled in place and
        returned; any other dtype is first converted to a float copy so the
        scaled values are not truncated back to integers on assignment.

    Raises
    ------
    ValueError
        If ``X`` does not have exactly three axes.

    Notes
    -----
    A constant vector (max == min) is mapped to all zeros instead of being
    divided by zero, which would fill it with NaN.
    """
    X = np.asarray(X)
    if X.ndim != 3:
        raise ValueError("normalizer expects a 3-D array, got %d-D" % X.ndim)
    if not np.issubdtype(X.dtype, np.floating):
        # Writing fractions into an integer array would truncate them.
        X = X.astype(float)
    if X.size == 0:
        return X # Nothing to scale

    # Reduce along axis 0 only, so every (j, i) vector gets its own min/max.
    lowest = X.min(axis=0, keepdims=True)
    spread = X.max(axis=0, keepdims=True) - lowest
    # For constant vectors the numerator is already 0; divide by 1 to keep 0.
    spread[spread == 0] = 1
    X[...] = (X - lowest) / spread
    return X

# X = normalizer(X)
# Y = normalizer(Y)

# Binarize the data: a "pixel" becomes 1 when it is above 0.5 and 0 otherwise
X, Y = (np.where(data > 0.5, 1, 0) for data in (X, Y))

m, n, k = X.shape

# Pick one sample index at random and draw that sample for every class
num = random.randint(0, n - 1)
fig, axes = plt.subplots(nrows=1, ncols=k, figsize=(15, 2.3), dpi=300)
fig.suptitle('Digits for Sample %i' %num, size=15, x=0.2)

for i, panel in enumerate(axes):
    # Each column vector holds 64 values that are shown as an 8x8 image
    panel.imshow(X[:, num, i].reshape([8, 8]), cmap='Greys_r')
    panel.axis('off')
    panel.set_title(str(i+1))

FileNotFoundError: [Errno 2] No such file or directory: 'a1digits.mat'

In [None]:
# Eta for a class is the per-pixel mean over all samples of that class
# (axis 1 is the sample axis, so it is the one averaged away)
eta = X.mean(axis=1)

# Show each class mean as an 8x8 image, one panel per class
fig, axes = plt.subplots(nrows=1, ncols=k, figsize=(15, 2.3), dpi=300)
fig.suptitle('Naive Bayes Means, ' r'$\eta_{k}$', size=15, x=0.205)

for i, panel in enumerate(axes):
    panel.imshow(eta[:, i].reshape([8, 8]), cmap='Greys_r')
    panel.axis('off')
    panel.set_title(str(i+1))

In [None]:
def naive_bayes(eta, x):
    '''Score a sample against one class.

    Multiplies, over every position of ``x``, the term
    ``eta[i]**x[i] * (1 - eta[i])**(1 - x[i])`` and returns the running
    product. An empty ``x`` returns the starting value 1.
    '''
    likelihood = 1
    for idx, value in enumerate(x):
        p = eta[idx]
        on_term = p ** value               # contributes when the value is 1
        off_term = (1 - p) ** (1 - value)  # contributes when the value is 0
        likelihood = likelihood * on_term * off_term
    return likelihood

digit_index = 0
max_index = 0
l = []

# Score one fixed test sample (index 151 of class `digit_index`) against
# each of the ten class means
sample = Y[:, 151, digit_index]
for i in range(10):
    y = naive_bayes(eta[:, i], sample)
    l.append(y)

# Classes are reported 1-based, so shift the winning list position by one
max_index = l.index(max(l)) + 1
print('The predicted digit is:',max_index)
print('The actual digit is:', digit_index+1)

# Bar chart of the score obtained for every class
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(15, 3), dpi=300)
plt.xticks(np.arange(1, 11, step=1))
axes.bar(range(1, 11), l)
plt.show()

In [None]:
def nb_test(eta, Y):
    '''Evaluate the Naive Bayes classifier on test data, given trained etas.

    Every test vector ``Y[:, s, h]`` is scored against each class ``i`` with
    ``naive_bayes(eta[:, i], vector)`` and assigned to the first class with
    the highest score; the third-axis index ``h`` is taken as the true class.

    Parameters
    ----------
    eta : array indexed as ``eta[:, i]``
        Trained class parameters, one column per class.
    Y : numpy.ndarray
        Test data with three axes, indexed as ``Y[:, sample, true_class]``.
        The number of classes is read from its last axis.

    Returns
    -------
    (nb1, nb2, nb3) : tuple of pandas.DataFrame
        nb1 -- confusion matrix (rows ``C1..Ck`` are true classes, columns
        ``1..k`` are predicted classes) plus 'Error Count' and 'Error %'.
        nb2 -- error count and error percentage per class, with a final
        'Avg Errors' row holding the column means.
        nb3 -- single-row frame ('Naive Bayes') with the error count per
        class and the overall 'Total Error Rate'.
    '''
    m2, n2, k2 = Y.shape # Determine the shape of the test data

    class_labels = ['C%d' % d for d in range(1, k2 + 1)]
    class_count = [] # One row per true class: prediction counts + error stats

    # Iterate through all samples to predict which class (digit) they belong to
    for h in range(k2):
        c = [0] * k2
        for s in range(n2):
            sample = Y[:, s, h]
            scores = [naive_bayes(eta[:, i], sample) for i in range(k2)]
            winner = scores.index(max(scores)) # First maximum wins on ties
            c[winner] += 1

        error_count = n2 - c[h] # Number of misclassified samples for this class
        error_per = error_count / n2 if n2 else 0.0 # Avoid 0/0 with no samples
        c.extend([error_count, error_per])
        class_count.append(c)

    # Create a confusion matrix showing the number of misclassified digits per class
    columns = list(range(1, k2 + 1)) + ['Error Count', 'Error %']
    nb1 = pd.DataFrame(class_count, columns=columns, index=class_labels)
    nb1.index.name = "Class"
    nb1['Error %'] = pd.Series(["{0:.1f}%".format(val * 100) for val in nb1['Error %']], index = nb1.index)

    # Create a table showing the total number of errors
    stats = np.array(class_count)
    error_table = stats[:, k2:k2 + 2] # The two trailing columns: count and fraction
    error_table = np.append(error_table, [error_table.mean(axis=0)], 0)
    nb2 = pd.DataFrame(error_table, columns=['Error Count', 'Error %'],
                       index=class_labels + ['Avg Errors'])
    nb2['Error %'] = pd.Series(["{0:.1f}%".format(val * 100) for val in nb2['Error %']], index = nb2.index)

    # Create a separate table to just show the error count and total error rate
    error_counts = stats[:, k2]
    # Mean errors per class divided by samples per class gives the overall rate
    total_rate = np.mean(error_counts) / n2 if n2 else 0.0
    label = class_labels + ['Total Error Rate']
    nb3 = pd.DataFrame(np.append(error_counts, total_rate), index=label, columns=['Naive Bayes']).T
    nb3['Total Error Rate'] = pd.Series(["{0:.1f}%".format(val * 100) for val in nb3['Total Error Rate']], index = nb3.index)
    # Counts went through a float array above; show them as integers again
    nb3[class_labels] = nb3[class_labels].astype(np.int64)
    return (nb1, nb2, nb3) # Return the confusion matrix and error tables