In [59]:
# Import library
import numpy as np
from scipy.stats import norm

#Define function
def load_data(filename):
    data = np.loadtxt(filename,delimiter=',')
    row_length,col_length = data.shape

    # Initialize training
    train_x = data[:,0:col_length-1]
    train_y = data[:,col_length-1:col_length]
    return [train_x,train_y]

def testing(filename,join_dist_neg,join_dist_pos):
    data = load_data(filename)
    train_x = data[0]
    train_y = data[1]
    row_length = train_x.shape[0]
    
    p_neg = train_y[train_y == 0].shape[0] / row_length
    p_pos = train_y[train_y == 1].shape[0] / row_length

    count = 0
    for i in range(0,row_length):
        p_neg_i = p_neg * np.product(join_dist_neg[i,:])
        p_pos_i = p_pos * np.product(join_dist_pos[i,:])
        result = 1
        if (p_neg_i >= p_pos_i):
            result = 0
        if (train_y[i] == result):
            count = count + 1
    return (100*count / row_length)
    
    
# Read data
train_file = "spam_train.data"
test_file = "spam_test.data"
valid_file = "spam_validation.data"
data = load_data(train_file)
train_x = data[0]
train_y = data[1]
row_length = train_x.shape[0]

# Training / Building Gaussian distribution for each class
X_neg = train_x[np.where(train_y==0)[0],:]
X_pos = train_x[np.where(train_y==1)[0],:]


# Build gaussian distribution for y = 0 using MLE
MLE_u_neg = np.mean(X_neg,axis=0,keepdims=True)
MLE_std_neg = np.square(X_neg - MLE_u_neg)
MLE_std_neg = np.mean(MLE_std_neg,axis=0,keepdims=True) / (row_length-1)


# join_dist_neg = norm.pdf(X_neg,loc = MLE_u_neg,scale = MLE_std_neg)
join_dist_neg = norm.pdf(train_x,loc = MLE_u_neg,scale = MLE_std_neg)
# join_dist_neg = np.product(join_dist_neg)

# Build gaussian distribution for y = 1 using MLE
MLE_u_pos = np.mean(X_pos,axis=0,keepdims=True)
MLE_std_pos = np.square(X_pos - MLE_u_pos)
MLE_std_pos = np.sum(MLE_std_pos,axis=0,keepdims=True) / (row_length-1)

join_dist_pos = norm.pdf(train_x,loc = MLE_u_pos,scale = MLE_std_pos)
# join_dist_pos = norm.pdf(X_pos,loc = MLE_u_pos,scale = MLE_std_pos)
# join_dist_pos = np.product(join_dist_pos)

# Predict / Testing
print("Accuracy of training: ",testing(train_file,join_dist_neg,join_dist_pos))
print("Accuracy of testing: ",testing(test_file,join_dist_neg,join_dist_pos))
print("Accuracy of validation: ",testing(valid_file,join_dist_neg,join_dist_pos))

Accuracy of training:  76.06666666666666
Accuracy of testing:  100.0
Accuracy of validation:  100.0


In [80]:
join_dist_neg.shape

(3000, 57)

In [46]:
MLE_u_neg = np.mean(X_neg,axis=0,keepdims=True)
MLE_std_neg = np.square(X_neg - MLE_u_neg)
MLE_std_neg = np.mean(MLE_std_neg,axis=0,keepdims=True) / (row_length-1)


# join_dist_neg = norm.pdf(X_neg,loc = MLE_u_neg,scale = MLE_std_neg)
join_dist_neg = norm.pdf(train_x,loc = MLE_u_neg,scale = MLE_std_neg)

In [58]:
def PCA(filename,k):
    # Read data
    data = load_data(filename)
    X = data[0]
    Y = data[1]
    # Define input and output
    num_data = X.shape[0]
    num_feature = X.shape[1]
    # Construct matrix W - sample covariance matrix 
    X_mean = np.mean(X,axis=0)
    X_mean = np.reshape(X_mean,(num_feature,1))
    W = X - np.dot(np.ones((num_data,1)),X_mean.T)
    # Find SVD of W - covariance matrix
    U, s, V = np.linalg.svd(W, full_matrices=True)
    # eigen vectors and eigen value of covariance matrix 
    eigen_vec = U
    # s = np.reshape(s,(s.shape[0],1))
    # eigen_value = np.dot(s,s.T)
    eigen_value = s * s
    # Top k eigen value
    print("Top ",k," eigen values: ",eigen_value[:k])
    V_r = V[:,:k]
    return V_r

In [69]:
V_r = PCA(train_file,3)

Top  3  eigen values:  [  1.52029434e+09   1.28884476e+08   3.44289156e+06]


In [70]:
V_r.shape

(57, 3)

In [101]:
k = list(range(1,11))
k

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [85]:
# Problem 1 - section 2
# Probability of first data point


def dist_pi(feature,V_r,k):
    pi = 0
    for i in range(0,k):
        pi = pi + np.square(V_r[feature,i])
    return (pi / k)

k_list = list(range(1,11))
s_list = list(range(1,20))
for k in k_list:
    V_r = PCA(train_file,k)
    for s in s_list:
        
# sum = 0
# for i in range(0,57):
#     sum = sum + dist_pi(i,V_r,k)
# print(sum)

1.0


In [92]:
dist = np.ones((3000,57))
for i in range(0,57):
    dist[:,i] = dist_pi(i,V_r,k)

In [93]:
testing(train_file,dist,dist)

60.43333333333333

In [91]:
dist_pi(0,V_r,k)

1.3780543061854113e-09

In [90]:
dist[:,0]

array([  1.37805431e-09,   1.37805431e-09,   1.37805431e-09, ...,
         1.37805431e-09,   1.37805431e-09,   1.37805431e-09])

In [81]:
join_dist_neg.shape

(3000, 57)

In [None]:
join_dist_neg = norm.pdf(train_x,loc = MLE_u_neg,scale = MLE_std_neg)

In [73]:
v = V_r * V_r

In [77]:
np.sum(v[:,2])

1.0

In [65]:
V_r[:,0]

array([  4.62210162e-05,  -1.72620590e-05,  -2.47331606e-04,
        -1.34398079e-02,   2.60249982e-02,  -4.71376754e-04,
         1.96087763e-02,  -2.90153582e-02,   1.81130966e-03,
        -2.48387733e-03,   1.90035300e-02,  -2.12007963e-02,
         1.74077525e-02,  -4.46678561e-03,   2.74060969e-02,
        -1.54270453e-02,   1.92243118e-02,  -2.65109929e-02,
        -6.49010270e-03,   7.57110978e-03,   2.46986057e-02,
         8.31206347e-03,  -3.13907449e-02,  -4.44065827e-02,
         1.72354710e-01,   7.37940853e-02,   3.93700960e-02,
        -9.72146983e-04,  -3.97912537e-02,  -7.58509570e-02,
        -1.71086787e-02,   8.79456762e-02,   2.23808846e-02,
         2.64352746e-03,   5.65721911e-02,   4.96268755e-02,
        -8.11395021e-02,  -2.30152468e-01,   7.43307153e-01,
         4.62817236e-01,   2.31052625e-02,   1.05328773e-01,
        -3.40978107e-02,  -2.64029223e-01,  -4.93390581e-02,
         1.25041457e-01,   3.81486564e-02,   2.88680526e-02,
         4.46831080e-02,