In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import pandas as pd
from sklearn.metrics import confusion_matrix

## Part 2

In [2]:
images_data = np.genfromtxt("hw02_images.csv", delimiter = ",")
labels_data = np.genfromtxt("hw02_labels.csv", delimiter = ",")

## Part 3

In [3]:
train_images = images_data[:30000,:]
test_images = images_data[30000:,:]
train_label = labels_data[:30000]
test_label = labels_data[30000:]

## Part 4

In [4]:
K = int(np.max(train_label))
N = train_images.shape[0]

In [5]:
# calculate sample means
sample_means = [np.mean(train_images[train_label == (c + 1)], axis=0) for c in range(K)]

print("\033[4msample_means\033[0m \n")
for sample_mean in sample_means:
    print(sample_mean,"\n")

[4msample_means[0m 

[254.99866667 254.98416667 254.85616667 254.66733333 254.54466667
 254.274      253.36283333 249.56366667 239.67583333 221.92416667
 196.88683333 178.43316667 189.53316667 204.313      206.07166667
 197.24433333 176.36966667 180.95283333 207.72983333 231.29733333
 245.5595     252.19866667 254.05216667 254.42183333 254.697
 254.8475     254.982      254.996      254.9935     254.97333333
 254.7145     254.09983333 253.49366667 248.63316667 228.3305
 193.45316667 157.305      129.5195     111.14933333  90.59116667
  82.60266667  89.50133333  92.12233333  81.20633333  79.47516667
  99.91616667 117.53483333 138.8295     172.26616667 211.439
 241.081      251.82916667 253.85933333 254.521      254.90366667
 254.963      254.98466667 254.94216667 254.573      253.46616667
 247.69266667 211.78016667 161.32366667 136.2115     122.59783333
 113.05033333 106.335      103.0865      96.07983333  93.35983333
  94.4615      91.25516667 100.87116667 105.99616667 110.13816667
 

In [6]:
class_sizes= np.zeros(5)
for i in range(N):
    class_sizes[int(train_label[i]-1)]+=1

In [7]:
# calculate sample deviations 
sample_deviations = [np.std(train_images[train_label == (c + 1)], axis=0) for c in range(K)]

print("\033[4msample_deviations\033[0m \n")
for sample_deviation in sample_deviations:
    print(sample_deviation,"\n")

[4msample_deviations[0m 

[9.12773551e-02 2.56091075e-01 1.31090756e+00 3.80543465e+00
 5.27948907e+00 6.97889132e+00 1.07720867e+01 2.09088724e+01
 3.74438435e+01 5.25122406e+01 6.43785189e+01 7.09060378e+01
 6.86627306e+01 6.22709378e+01 6.19797698e+01 6.60298794e+01
 7.33258709e+01 7.11195000e+01 6.13707817e+01 4.58070656e+01
 2.86522563e+01 1.50082488e+01 7.59281098e+00 5.46698180e+00
 4.67702088e+00 2.99681671e+00 3.74178211e-01 1.58063278e-01
 2.62280543e-01 4.01607880e-01 5.07890964e+00 1.03206524e+01
 1.34200830e+01 2.58190550e+01 5.23572148e+01 7.39564543e+01
 8.42104109e+01 8.71560628e+01 8.38912910e+01 7.73179110e+01
 8.03720088e+01 8.25237400e+01 8.31962000e+01 8.15289709e+01
 7.79408454e+01 8.11367681e+01 8.62364334e+01 8.68170035e+01
 8.12445772e+01 6.58150663e+01 3.95159939e+01 1.96698496e+01
 1.21747777e+01 7.81098963e+00 3.62960786e+00 2.21591915e+00
 2.89996935e-01 1.37410164e+00 7.24785055e+00 1.33104165e+01
 2.90947684e+01 6.50654043e+01 8.43708969e+01 8.47847791e

In [8]:
np.shape(sample_deviations)

(5, 784)

In [9]:
# calculate prior probabilities
class_priors = [np.mean(train_label == (c + 1)) for c in range(K)]

print("\033[4mclass_priors\033[0m \n", class_priors)

[4mclass_priors[0m 
 [0.2, 0.2, 0.2, 0.2, 0.2]


## Part 5

$g_c(x)=\log{(p(x|y=c))}+\log{(P(y=c))}$

$g_c(x)=\log{(\dfrac{1}{\sqrt{2\pi\sigma^2_c}}.e^{-\dfrac{(x-\mu_c^2)^2}{2\sigma^2_c}}})+\log{(P(y=c))}$ 

$g_c(x)\approx\sum^N_{i=1}{[-\dfrac{1}{2}\log{2\pi\sigma^2_c}{-\dfrac{(x-\mu_c^2)^2}{2\sigma^2_c}}]}+\log{(P(y=c))}$ 

In [21]:
def score_def(x):
    scores = np.array([0, 0, 0, 0, 0])
    for i in range(K):
        scores[i]  = np.sum((-0.5 * ( np.log(2 * math.pi * (sample_deviations[i]**2) ))) + 
                            (-0.5 * ((x-sample_means[i])**2) /  sample_deviations[i]**2 )) + np.log(class_priors[i])
    return scores

In [22]:
g_scores_train = [score_def(train_images[i]) for i in range(np.shape(train_images)[0])]
g_scores_test = [score_def(train_images[i]) for i in range(np.shape(test_images)[0])]

In [23]:
def get_labels(pred,g_scores):
    for i in range(len(g_scores)):
        max_g=np.max(g_scores[i])
        if g_scores[i][0]==max_g:
            pred.append(1)
        elif g_scores[i][1]==max_g:
            pred.append(2)
        elif g_scores[i][2]==max_g:
            pred.append(3)
        elif g_scores[i][3]==max_g:
            pred.append(4)
        else :
            pred.append(5)
        
    pred_label=np.array(pred)
    return pred_label

train_pred = get_labels([],g_scores_train)
test_pred = get_labels([],g_scores_test)

#train_pred = np.argmax(g_scores_train, axis = 1)+1
#test_pred = np.argmax(g_scores_test, axis = 1)+1

In [24]:
confusion_matrix = pd.crosstab(train_pred, train_label, rownames=['y_pred'], colnames=['y_truth'])
print("\033[4mconfusion_matrix\033[0m \n", confusion_matrix)

[4mconfusion_matrix[0m 
 y_truth   1.0   2.0   3.0   4.0   5.0
y_pred                               
1        3688    49     4   680     6
2        1430  5667  1141  1380   533
3         505   208  4669  2949   892
4         234    60   123   686   180
5         143    16    63   305  4389


In [25]:
confusion_matrix = pd.crosstab(test_pred, test_label, rownames=['y_pred'], colnames=['y_truth'])
print("\033[4mconfusion_matrix\033[0m \n", confusion_matrix)

[4mconfusion_matrix[0m 
 y_truth  1.0  2.0  3.0  4.0  5.0
y_pred                          
1        148  138  145  143  174
2        328  362  346  326  323
3        300  309  285  337  288
4         40   46   69   46   42
5        184  145  155  148  173
