In [1]:
import pandas as pd
import numpy as np



In [None]:
def safe_log(array):
    """Natural logarithm with a guard against ``log(0)``.

    A tiny positive constant (1e-100) is added to the input before the
    logarithm is taken, so an exact zero yields a large negative finite
    value instead of ``-inf``.

    Parameters
    ----------
    array : numpy.ndarray or scalar
        Value(s) to take the logarithm of.

    Returns
    -------
    numpy.ndarray or numpy scalar
        ``np.log(array + 1e-100)``, element-wise.
    """
    tiny_offset = 1.e-100
    return np.log(array + tiny_offset)



def create_data(images_path="hw01_data_set_images.csv",
                labels_path="hw01_data_set_labels.csv"):
    """Load the feature matrix and the label column from two CSV files.

    Both files are read without a header row and returned as plain numpy
    arrays (``DataFrame.values``).

    Parameters
    ----------
    images_path : str or path-like, optional
        CSV file holding one sample per row. Defaults to
        ``"hw01_data_set_images.csv"`` in the current working directory.
    labels_path : str or path-like, optional
        CSV file holding one label per row. Defaults to
        ``"hw01_data_set_labels.csv"`` in the current working directory.

    Returns
    -------
    X : numpy.ndarray
        2-D array with the contents of ``images_path``.
    Y : numpy.ndarray
        2-D array with the contents of ``labels_path``.
    """
    X = pd.read_csv(images_path, header=None).values
    Y = pd.read_csv(labels_path, header=None).values
    return X, Y



def split_data(X, Y):
    """Split samples and labels into fixed train and test partitions.

    The slice boundaries treat the input as five consecutive groups of 39
    rows (rows 0-194). Within each group the first 25 rows go to the
    training partition and the remaining 14 rows go to the test partition.
    Group order is preserved in both partitions.

    Parameters
    ----------
    X : numpy.ndarray
        2-D sample matrix, one sample per row.
    Y : numpy.ndarray
        2-D label array, one label row per sample.

    Returns
    -------
    X_train, X_test, Y_train, Y_test : numpy.ndarray
        Row-wise concatenations of the selected slices.
    """
    group_size, train_size, last_row = 39, 25, 195

    train_spans = []
    test_spans = []
    for first in range(0, last_row, group_size):
        cut = first + train_size
        train_spans.append(slice(first, cut))
        test_spans.append(slice(cut, first + group_size))

    def gather(data, spans):
        # Stack the selected row ranges on top of each other.
        return np.concatenate([data[span, :] for span in spans])

    Y_train = gather(Y, train_spans)
    Y_test = gather(Y, test_spans)
    X_train = gather(X, train_spans)
    X_test = gather(X, test_spans)
    return X_train, X_test, Y_train, Y_test

In [None]:
def calculate_priors(Y_train):
    """Compute the empirical class frequencies of the labels "A".."E".

    Parameters
    ----------
    Y_train : numpy.ndarray
        Label array; entries are compared against the strings
        ``"A"``, ``"B"``, ``"C"``, ``"D"`` and ``"E"``.

    Returns
    -------
    numpy.ndarray
        Five values, ordered A, B, C, D, E: the number of entries equal to
        each label divided by ``len(Y_train)``.
    """
    sample_count = len(Y_train)

    tallies = []
    for letter in ("A", "B", "C", "D", "E"):
        # Boolean-mask selection keeps only the entries equal to this label.
        tallies.append(len(Y_train[Y_train == letter]))

    fractions = [tally / sample_count for tally in tallies]
    return np.array(fractions)


def calculate_pcd(X_train, Y_train, classes=("A", "B", "C", "D", "E")):
    """Estimate, for every class, the fraction of samples with each feature set to 1.

    Rows of ``X_train`` are grouped by the labels in ``Y_train`` rather than
    by fixed row positions, so the estimate stays correct for any row order
    and any number of samples per class.

    Parameters
    ----------
    X_train : numpy.ndarray
        2-D sample matrix, one sample per row; a feature counts as "on"
        when its value equals 1.
    Y_train : numpy.ndarray
        Labels for the rows of ``X_train``; either 1-D or a single column.
    classes : sequence, optional
        Class labels, in the order their rows should appear in the result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(classes), n_features)``; row ``c`` holds the
        per-feature fraction of class-``c`` samples whose value equals 1.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of samples, or if a
        class has no training samples (its estimate would be 0/0).
    """
    features = np.asarray(X_train)
    labels = np.asarray(Y_train).reshape(-1)
    if labels.shape[0] != features.shape[0]:
        raise ValueError(
            "X_train has %d rows but Y_train has %d labels"
            % (features.shape[0], labels.shape[0])
        )

    rows = []
    for label in classes:
        members = labels == label
        n_members = np.count_nonzero(members)
        if n_members == 0:
            raise ValueError("no training samples for class %r" % (label,))
        # Count of "on" features within the class, divided by the class size.
        rows.append((features[members] == 1).sum(axis=0) / n_members)

    pcd = np.array(rows)
    return pcd


def posteriors(pcd, X, priors):
    """Compute per-class log scores for every sample and pick the best class.

    For sample ``i`` and class ``c`` the score is::

        g[i, c] = sum_j ( X[i, j] * log(pcd[c, j])
                          + (1 - X[i, j]) * log(1 - pcd[c, j]) )
                  + log(priors[c])

    All logarithms go through ``safe_log`` so that probabilities of exactly
    0 or 1 do not produce infinities.

    Parameters
    ----------
    pcd : numpy.ndarray
        Array of shape ``(n_classes, n_features)`` with per-class feature
        probabilities.
    X : numpy.ndarray
        Sample matrix of shape ``(n_samples, n_features)``.
    priors : numpy.ndarray
        Array of ``n_classes`` class prior probabilities.

    Returns
    -------
    g : numpy.ndarray
        Score matrix of shape ``(n_samples, n_classes)``.
    pred_class : numpy.ndarray
        Index of the highest-scoring class for each sample.
    """
    log_on = safe_log(pcd)
    log_off = safe_log(1 - pcd)
    log_prior = safe_log(priors)

    # Matrix products sum the per-feature log terms for every (sample, class)
    # pair at once; the log prior is broadcast across all samples.
    g = (X.dot(log_on.T) + (1 - X).dot(log_off.T)) + log_prior

    pred_class = g.argmax(axis=1)

    return g, pred_class

In [2]:
def find_classes(Y_train, Y_test):
    """Translate letter labels into integer class indices.

    The mapping is ``"A" -> 0``, ``"B" -> 1``, ``"C" -> 2``, ``"D" -> 3``,
    ``"E" -> 4``. Only the first column of each label array is read. A row
    whose label is none of these five letters contributes nothing to the
    output, so the result can be shorter than the input.

    Parameters
    ----------
    Y_train : numpy.ndarray
        2-D array of training labels.
    Y_test : numpy.ndarray
        2-D array of test labels.

    Returns
    -------
    train_expected : list of int
        Class indices for ``Y_train``.
    test_expected : list of int
        Class indices for ``Y_test``.
    """
    letters = ("A", "B", "C", "D", "E")

    def encode(label_rows):
        # Walk the first column and record the position of each known letter.
        codes = []
        for row in range(label_rows.shape[0]):
            for code, letter in enumerate(letters):
                if label_rows[row, 0] == letter:
                    codes.append(code)
        return codes

    train_expected = encode(Y_train)
    test_expected = encode(Y_test)
    return train_expected, test_expected
    
    

def create_conf_matrix(expected, predicted, n_classes):
    """Build a confusion matrix of counts.

    Rows are indexed by the predicted class and columns by the expected
    class, i.e. entry ``[p][e]`` counts the samples predicted as ``p`` whose
    expected class is ``e``.

    Parameters
    ----------
    expected : iterable of int
        Expected class index per sample.
    predicted : iterable of int
        Predicted class index per sample, paired positionally with
        ``expected`` (pairing stops at the shorter of the two).
    n_classes : int
        Number of classes; the result is ``n_classes`` x ``n_classes``.

    Returns
    -------
    numpy.ndarray
        Float matrix of counts.
    """
    counts = np.zeros((n_classes, n_classes))
    for guessed, actual in zip(predicted, expected):
        row = counts[guessed]
        row[actual] += 1
    return counts


In [None]:

# Load the raw data and carve out the fixed train/test partitions.
X, Y = create_data()
X_train, X_test, Y_train, Y_test = split_data(X, Y)

# Estimate the model parameters from the training partition.
priors = calculate_priors(Y_train)
pcd = calculate_pcd(X_train, Y_train)

# Score both partitions with the fitted parameters.
g_train, predicted_train = posteriors(pcd, X_train, priors)
g_test, predicted_test = posteriors(pcd, X_test, priors)

# Compare predictions with the true classes (5 classes).
expected_train, expected_test = find_classes(Y_train, Y_test)
train_conf_matrix = create_conf_matrix(expected_train, predicted_train, 5)
test_conf_matrix = create_conf_matrix(expected_test, predicted_test, 5)

# Per-class feature probabilities, one class per printed array.
for class_index in range(5):
    print(pcd[class_index])

# Confusion matrices: rows are predicted classes, columns are expected classes.
print(train_conf_matrix)
print(test_conf_matrix)
