# Assignment 2 
### COMP 551 
### Olivier Simard-Morissette 
### 260563480 

## Q1

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
import seaborn as sns
import math
sns.set_palette(sns.color_palette("GnBu_d"))
%matplotlib inline




In [2]:
def get_mean_from_file(file_path):
    '''
    Reads a mean vector from a comma-separated text file.

    The files provided by the instructor end with an extra comma, which
    leaves an empty token after splitting. Empty tokens are skipped instead
    of blindly dropping the last entry, so a file without the trailing comma
    keeps its final value.

    Parameters:
        file_path: path of the text file holding the comma-separated means.

    Returns:
        A 1-D numpy array of floats, one entry per value found in the file.

    Raises:
        ValueError: if a non-empty token cannot be parsed as a float.
    '''
    with open(file_path) as file_handler:
        tokens = file_handler.read().strip().split(',')
    # np.float was only an alias of the builtin float and no longer exists in
    # recent NumPy releases, so the builtin is used directly.
    return np.array([float(token) for token in tokens if token.strip()], dtype=float)
 
def get_covariance_matrix_from_file(file_path):
    '''
    Loads a covariance matrix from a headerless CSV file.

    Every row of the provided files ends with an extra comma, which pandas
    parses as one additional column containing only NaN. Any column that is
    entirely NaN is dropped, so the function works for matrices of any
    width rather than only for the 20-column case.

    Parameters:
        file_path: path of the CSV file holding the matrix.

    Returns:
        A 2-D numpy array with the matrix values.
    '''
    df_covariance = pd.read_csv(file_path, header=None)
    df_covariance = df_covariance.dropna(axis=1, how='all')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray and is available on every pandas version.
    return df_covariance.values



# Load the DS1 generating parameters: one mean vector per class and the
# covariance matrix shared by both classes.
covariance_matrix = get_covariance_matrix_from_file('./hwk2_datasets_corrected/DS1_Cov.txt')
m0 = get_mean_from_file('./hwk2_datasets_corrected/DS1_m_0.txt')
m1 = get_mean_from_file('./hwk2_datasets_corrected/DS1_m_1.txt')

# Draw 2000 samples per class (class 0 first, then class 1).
m0_dataset = np.random.multivariate_normal(m0, covariance_matrix, size=2000)
m1_dataset = np.random.multivariate_normal(m1, covariance_matrix, size=2000)

# Column 20 carries the class label: 0 for a negative example and
# 1 for a positive example.
df_m0 = pd.DataFrame(m0_dataset)
df_m1 = pd.DataFrame(m1_dataset)
df_m0[20] = 0
df_m1[20] = 1

# Stack both classes into one dataframe and shuffle the rows.
df = pd.concat([df_m1, df_m0], ignore_index=True)
df = df.reindex(np.random.permutation(df.index))

# The first 1200 shuffled rows form the test set, the remainder the train set.
test_df = df.iloc[:1200].reset_index(drop=True)
train_df = df.iloc[1200:].reset_index(drop=True)

test_df.to_csv('./my_dataset/DS1_test.csv')
train_df.to_csv('./my_dataset/DS1_train.csv')



## Q2

In [3]:
import functools

def _get_column_vector_means(df, class_num):
    train_matrix_means = [] 
    class_df = df[df[20] == class_num]
    for column in class_df:
        if column != 20:
            train_matrix_means.append(class_df[column].mean())
    return np.asarray(train_matrix_means)

def _get_covariance_sum(df, class_num, mean_vector):
    count = 0 
    class_df = df[df[20] == class_num] # Only get the sum for this class. 
    class_df = class_df.drop([20], axis = 1) # Remove the class label 
    class_df_as_matrix = class_df.as_matrix()
    
    # Execute summation 
    covariance_running_sum = np.zeros((20,20))
    for i in range(len(class_df_as_matrix)):
        mean_difference = class_df_as_matrix[i] - mean_vector
        mean_diff_vector = mean_difference.reshape(len(mean_difference),1)
        mean_diff_vector_transpose = mean_diff_vector.transpose()
        covariance_component = np.divide(np.matmul(mean_diff_vector,mean_diff_vector_transpose),len(df))
        covariance_running_sum = np.add(covariance_running_sum, covariance_component )

    return covariance_running_sum
    
def _get_covariance(df, class_nums, class_mean_vectors):
    '''
    Builds the covariance matrix shared by all classes by adding up the
    contribution of each class in class_nums.

    class_mean_vectors maps a class label to that class's mean vector.
    The accumulator is a 20x20 matrix.
    '''
    pooled = np.zeros((20, 20))
    for label in class_nums:
        pooled = pooled + _get_covariance_sum(df, label, class_mean_vectors[label])
    return pooled

def _get_class_mean_vectors(df, class_nums):
    '''
    Returns a dictionary mapping each label in class_nums to the mean
    vector of the rows of df carrying that label.
    '''
    return {label: _get_column_vector_means(df, label) for label in class_nums}

def _get_class_probability(df, class_nums):
    class_probability = {}
    for class_num in class_nums:
        class_probability[class_num] = len(df[df[20] == class_num]) / float(len(df))
    return class_probability


def _decision_boundary(x,class_probability_lookup=None, class_mean_vector_lookup=None, covariance=None):
    '''
    Takes a test sample x as input and returns true if belonging to class zero,
    otherwise returns false 
    '''
    prob_zero = class_probability_lookup[0]
    prob_one = class_probability_lookup[1]
    covariance_inverse = np.linalg.inv(covariance)
    mv_zero = class_mean_vector_lookup[0]
    mv_zero = mv_zero.reshape(len(mv_zero), 1)
    mv_one = class_mean_vector_lookup[1]
    mv_one = mv_one.reshape(len(mv_one), 1)
    mv_zero_transpose = mv_zero.transpose()
    mv_one_transpose = mv_one.transpose()
    x_transpose = x.transpose()
    
    xtw1 = np.matmul(np.matmul(x_transpose,covariance_inverse),np.subtract(mv_zero,mv_one))
    x0_term1 = math.log(prob_zero) - math.log(prob_one)
    x0_term2 = -0.5 * np.matmul(np.matmul(mv_zero_transpose,covariance_inverse),mv_zero)
    x0_term3 = 0.5 * np.matmul(np.matmul(mv_one_transpose, covariance_inverse),mv_one)
    sum_decision = (x0_term1 + x0_term2[0][0] + x0_term3[0][0] +  xtw1[0][0])
    return 0 if sum_decision > 0  else 1 


def get_classifier(df, class_nums):
    '''
    Estimates the discriminant parameters from df and binds them to the
    decision function.

    Returns a tuple (classifier, covariance, class_mean_vectors) where
    classifier is a callable taking a single sample, covariance is the
    shared covariance matrix and class_mean_vectors maps each label to
    its mean vector.
    '''
    mean_lookup = _get_class_mean_vectors(df, class_nums)
    pooled_covariance = _get_covariance(df, class_nums, mean_lookup)
    prior_lookup = _get_class_probability(df, class_nums)

    decision_function = functools.partial(
        _decision_boundary,
        class_probability_lookup=prior_lookup,
        covariance=pooled_covariance,
        class_mean_vector_lookup=mean_lookup,
    )
    return decision_function, pooled_covariance, mean_lookup


def get_measurement_indicators(results):
    '''
    Computes accuracy, precision, recall and F1 from prediction pairs.

    Parameters:
        results: iterable of (prediction, answer) pairs. Class 1 is the
            positive class and class 0 the negative class; pairs whose
            answer is neither 0 nor 1 are ignored.

    Returns:
        A tuple (accuracy, precision, recall, f1_measure). A measure whose
        denominator is zero (for example precision when nothing was
        predicted positive, or every measure for empty input) is reported
        as 0.0 instead of raising ZeroDivisionError.
    '''
    true_positives = true_negatives = false_positives = false_negatives = 0
    # Single pass, so any iterable (not only a list) can be consumed.
    for result, answer in results:
        is_correct = result == answer
        if answer == 1:
            if is_correct:
                true_positives += 1
            else:
                false_negatives += 1
        elif answer == 0:
            if is_correct:
                true_negatives += 1
            else:
                false_positives += 1

    def _ratio(numerator, denominator):
        # Guarded division: an undefined measure is reported as 0.0.
        return float(numerator) / float(denominator) if denominator else 0.0

    total = true_positives + false_positives + false_negatives + true_negatives
    accuracy = _ratio(true_positives + true_negatives, total)
    precision = _ratio(true_positives, true_positives + false_positives)
    recall = _ratio(true_positives, true_positives + false_negatives)
    f1_measure = _ratio(2 * precision * recall, precision + recall)

    return accuracy, precision, recall, f1_measure

    
# Fit the discriminant on the DS1 training set for the two class labels.
class_nums = (0, 1)
classifier, covariance, class_mean_vectors = get_classifier(train_df, class_nums)

# Write the learned parameters to disk: the covariance matrix first, then
# one file per class mean vector (the class label is part of the file name).
covariance.tofile('DS1-covariance-matrix')
for vector, vector_arr in class_mean_vectors.items():
    vector_arr.tofile('DS1-Class-%s-Mean-Vector'%(vector))




In [4]:
# Split the test set into feature rows (columns 0-19) and labels (column 20).
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on every pandas version.
test_df_vector = test_df.iloc[:,0:20].values
answers = test_df.iloc[:,20].values

# Classify every test row and keep (prediction, true label) pairs.
results = []
for row, answer in zip(test_df_vector, answers):
    # The classifier is called with the sample as a column vector.
    row = row.reshape(len(row),1)
    result = classifier(row)
    results.append((result,answer))

The following definitions of true/false positives and true/false negatives were used. They were taken from 
slides 15 and 17 of the lecture on model evaluation.
<br />
True positive: Example of class 1 predicted as class 1.
<br />
False positive: Example of class 0 predicted as class 1. 
<br />
True negative: Example of class 0 predicted as class 0.
<br />
False negative: Example of class 1 predicted as class 0. 
<br/>

Accuracy = (TP + TN) / (TP + FP + FN + TN)
<br /> 
Precision = True positives / Total number of declared positives = TP / (TP+ FP)
<br />
Recall = True positives / Total number of actual positives = TP / (TP + FN)
<br />
F1 measure = (2 * Precision * Recall) / (Precision + Recall)





In [5]:
# Score the (prediction, answer) pairs collected above and print each
# measure with four decimal places.
accuracy, precision, recall, f1_measure = get_measurement_indicators(results)

for template, value in (("Accuracy:%.4f", accuracy),
                        ("Precision:%.4f", precision),
                        ("Recall:%.4f", recall),
                        ("F1 Measure:%.4f", f1_measure)):
    print(template%(value))

Accuracy:0.9625
Precision:0.9646
Recall:0.9630
F1 Measure:0.9638


## Q3

The algorithm for k-nearest neighbours used in this assignment is taken from slide 28 of lecture 7 on Instance Learning. During training, the data points are simply stored. When making a prediction, the Euclidean distance is used as the distance measure to compare the input vector with every training data point. The training points are then sorted by distance, and the input is assigned the majority class among the k training vectors closest to it. 


In [6]:
from collections import Counter


def _get_knn_classifier(x,train_data=None, class_labels = None, k=None):
    sample_norms = np.linalg.norm(x - train_data,axis=1)
    sample_scores_with_cl = [(score,label) for score,label in zip(sample_norms, class_labels)]
    sample_scores_cl_sorted = sorted(sample_scores_with_cl, key = lambda x : x[0])
    sample_scores_k_group = [label for score, label in sample_scores_cl_sorted[:k]]
    c = Counter(sample_scores_k_group)
    return c.most_common()[0][0] # get the most common
    
    
    
def knn_classifier(train_data, class_labels,k):
    '''
    Returns a callable that takes a single sample and predicts its label
    with k-nearest neighbours over the stored train_data / class_labels.
    '''
    bound_arguments = dict(train_data=train_data, class_labels=class_labels, k=k)
    return functools.partial(_get_knn_classifier, **bound_arguments)


    
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on every pandas version.
train_df_matrix = train_df.values

# Column 20 is the class label, columns 0-19 are the features.
train_class_labels = train_df_matrix[:,20]
train_data = train_df_matrix[:,0:20]

In [7]:
# Convert the test set once (DataFrame.as_matrix() was removed in pandas
# 1.0; .values is equivalent) and split it into features and labels.
test_matrix = test_df.values
test_vectors = test_matrix[:,0:20]
test_class_labels = test_matrix[:,20]

# Evaluate KNN for k = 1..20 and keep the measures of every run.
measurements = [] 
for k in range(1,21):
    knn_clfr = knn_classifier(train_data, train_class_labels, k)
    
    results = []
    for test_vector, class_label in zip(test_vectors, test_class_labels):
        prediction = knn_clfr(test_vector)
        results.append((prediction, class_label))


    accuracy, precision, recall, f1_measure = get_measurement_indicators(results)
    
    print(k)
    print("Accuracy:%.4f"%(accuracy))
    print("Precision:%.4f"%(precision))
    print("Recall:%.4f"%(recall))
    print("F1 Measure:%.4f"%(f1_measure))
    
    measurements.append((k,accuracy,precision,recall,f1_measure))
    


1
Accuracy:0.4992
Precision:0.5162
Recall:0.5370
F1 Measure:0.5264
2
Accuracy:0.5042
Precision:0.5410
Recall:0.2862
F1 Measure:0.3743
3
Accuracy:0.5150
Precision:0.5316
Recall:0.5402
F1 Measure:0.5359
4
Accuracy:0.5017
Precision:0.5305
Recall:0.3360
F1 Measure:0.4114
5
Accuracy:0.5117
Precision:0.5290
Recall:0.5273
F1 Measure:0.5282
6
Accuracy:0.5183
Precision:0.5505
Recall:0.3859
F1 Measure:0.4537
7
Accuracy:0.5350
Precision:0.5533
Recall:0.5338
F1 Measure:0.5434
8
Accuracy:0.5233
Precision:0.5543
Recall:0.4100
F1 Measure:0.4713
9
Accuracy:0.5275
Precision:0.5452
Recall:0.5338
F1 Measure:0.5394
10
Accuracy:0.5183
Precision:0.5460
Recall:0.4196
F1 Measure:0.4745
11
Accuracy:0.5325
Precision:0.5499
Recall:0.5402
F1 Measure:0.5450
12
Accuracy:0.5275
Precision:0.5569
Recall:0.4325
F1 Measure:0.4869
13
Accuracy:0.5467
Precision:0.5627
Recall:0.5627
F1 Measure:0.5627
14
Accuracy:0.5342
Precision:0.5624
Recall:0.4566
F1 Measure:0.5040
15
Accuracy:0.5308
Precision:0.5477
Recall:0.5450
F1 Meas

*Do you do better or worse than LDA?*

The performance of KNN across all evaluation measurements was worse than linear discriminant analysis. 

*Are there particular values of k which perform better?* 

Even values of k give a much lower recall than odd values of k. 

*Report the best fit accuracy, precision, recall and f-measure achieved by this classifier.*

The best fit was chosen by comparing the accuracy for k = 1 to 20; the highest accuracy was obtained at k=13. The accuracy, precision, recall and F1 were:

k = 13
<br />
<br />
Accuracy:0.5467
<br />
<br />
Precision:0.5627
<br />
<br />
Recall:0.5627
<br />
<br />
F1 Measure:0.5627



## Q4

In [8]:
from numpy.random import choice

# Mean vectors supplied for DS2: three per class.
c1_m1_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c1_m1.txt')
c1_m2_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c1_m2.txt')
c1_m3_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c1_m3.txt')
c2_m1_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c2_m1.txt')
c2_m2_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c2_m2.txt')
c2_m3_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c2_m3.txt')

# Covariance matrices supplied for DS2: one per mixture component.
covariance_one = get_covariance_matrix_from_file('./hwk2_datasets_corrected/DS2_Cov1.txt')
covariance_two = get_covariance_matrix_from_file('./hwk2_datasets_corrected/DS2_Cov2.txt')
covariance_three = get_covariance_matrix_from_file('./hwk2_datasets_corrected/DS2_Cov3.txt')

class_nums = (0, 1)

# Lookup tables: the mean is keyed by (class label, component name), the
# covariance by component name only.
means = {
    (0, "c1"): c1_m1_mean, (0, "c2"): c1_m2_mean, (0, "c3"): c1_m3_mean,
    (1, "c1"): c2_m1_mean, (1, "c2"): c2_m2_mean, (1, "c3"): c2_m3_mean,
}
covariances = {"c1": covariance_one, "c2": covariance_two, "c3": covariance_three}

# For each class draw 2000 samples; every sample first picks a component
# with probabilities 0.1 / 0.42 / 0.48 and is then drawn from the Gaussian
# given by that component's mean and covariance.
samples = []
for class_num in class_nums:
    class_samples = []
    for _ in range(2000):
        sample_class = choice(["c1","c2","c3"], 1, p=[0.1, 0.42, 0.48])
        component = sample_class[0]
        sample_mean = means[(class_num, component)]
        sample_covariance = covariances[component]
        sample = np.random.multivariate_normal(sample_mean, sample_covariance)
        class_samples.append(sample)
    samples.append(np.asarray(class_samples))

# Column 20 carries the class label.
class_one_df = pd.DataFrame(samples[0])
class_two_df = pd.DataFrame(samples[1])
class_one_df[20] = 0
class_two_df[20] = 1

# Stack both classes, shuffle the rows, then split: the first 1200 rows
# form the test set and the remainder the train set.
df = pd.concat([class_one_df, class_two_df], ignore_index=True)
df = df.reindex(np.random.permutation(df.index))
test_df = df.iloc[:1200].reset_index(drop=True)
train_df = df.iloc[1200:].reset_index(drop=True)

test_df.to_csv('./my_dataset/DS2_test.csv')
train_df.to_csv('./my_dataset/DS2_train.csv')



In [9]:
# Running experiment with LDA 

classifier,covariance,class_mean_vectors = get_classifier(train_df, class_nums)


# Write the learned parameters to disk: the covariance matrix and one file
# per class mean vector.
covariance.tofile('DS2-covariance-matrix')
for vector in class_mean_vectors:
    vector_arr = class_mean_vectors[vector]
    vector_arr.tofile('DS2-Class-%s-Mean-Vector'%(vector))
    

# Split the test set into feature rows (columns 0-19) and labels (column 20).
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on every pandas version.
test_df_vector = test_df.iloc[:,0:20].values
answers = test_df.iloc[:,20].values

# Classify every test row and keep (prediction, true label) pairs.
results = [] 
for row, answer in zip(test_df_vector, answers):
    # The classifier is called with the sample as a column vector.
    row = row.reshape(len(row),1)
    result = classifier(row)
    results.append((result,answer))
  

accuracy, precision, recall, f1_measure = get_measurement_indicators(results)

print("Accuracy:%.4f"%(accuracy))
print("Precision:%.4f"%(precision))
print("Recall:%.4f"%(recall))
print("F1 Measure:%.4f"%(f1_measure))





Accuracy:0.4808
Precision:0.4743
Recall:0.5153
F1 Measure:0.4939


In [10]:
# Running experiment with KNN

# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray on every pandas version.
train_df_matrix = train_df.values
train_class_labels = train_df_matrix[:,20]
train_data = train_df_matrix[:,0:20]

test_df_vector = test_df.iloc[:,0:20].values
answers = test_df.iloc[:,20].values

# Evaluate KNN for k = 1..20 and print the measures of every run.
for k in range(1,21):
    knn_clfr = knn_classifier(train_data, train_class_labels, k)
    
    results = []
    for test_vector, class_label in zip(test_df_vector, answers):
        prediction = knn_clfr(test_vector)
        results.append((prediction, class_label))


    accuracy, precision, recall, f1_measure = get_measurement_indicators(results)
    
    print(k)
    print("Accuracy:%.4f"%(accuracy))
    print("Precision:%.4f"%(precision))
    print("Recall:%.4f"%(recall))
    print("F1 Measure:%.4f"%(f1_measure))



1
Accuracy:0.5167
Precision:0.5084
Recall:0.5119
F1 Measure:0.5101
2
Accuracy:0.5192
Precision:0.5210
Recall:0.2729
F1 Measure:0.3582
3
Accuracy:0.5133
Precision:0.5052
Recall:0.4949
F1 Measure:0.5000
4
Accuracy:0.5408
Precision:0.5504
Recall:0.3610
F1 Measure:0.4360
5
Accuracy:0.5258
Precision:0.5178
Recall:0.5169
F1 Measure:0.5174
6
Accuracy:0.5383
Precision:0.5446
Recall:0.3729
F1 Measure:0.4427
7
Accuracy:0.5283
Precision:0.5208
Recall:0.5085
F1 Measure:0.5146
8
Accuracy:0.5283
Precision:0.5270
Recall:0.3966
F1 Measure:0.4526
9
Accuracy:0.5225
Precision:0.5144
Recall:0.5136
F1 Measure:0.5140
10
Accuracy:0.5242
Precision:0.5204
Recall:0.4102
F1 Measure:0.4588
11
Accuracy:0.5158
Precision:0.5076
Recall:0.5085
F1 Measure:0.5080
12
Accuracy:0.5358
Precision:0.5347
Recall:0.4305
F1 Measure:0.4770
13
Accuracy:0.5217
Precision:0.5133
Recall:0.5237
F1 Measure:0.5185
14
Accuracy:0.5258
Precision:0.5218
Recall:0.4254
F1 Measure:0.4687
15
Accuracy:0.5267
Precision:0.5181
Recall:0.5339
F1 Meas

The best fit was chosen by comparing the accuracy for k = 1 to 20, and k=15 was selected. Although the accuracy was higher for k=4, its recall was much lower, so k=4 was discarded. The accuracy, precision, recall and F1 were:

k = 15
Accuracy:0.5267
Precision:0.5181
Recall:0.5339
F1 Measure:0.5259

The performance of LDA is reduced dramatically. This can be explained by LDA's assumption that all classes share the same covariance matrix. In this case, the samples were generated using several different covariance matrices, and the performance was poor as a result. 

The performance of KNN improved from DS1 to DS2. This may be due to the increased variance between class zero and class one, which made the Euclidean distance a better indicator of the difference between the two classes. The performance of KNN in this case was better than that of LDA when an odd value of k was used.