# Assignment 2 
### COMP 551 
### Olivier Simard-Morissette 
### 260563480 

## Q1

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot
import seaborn as sns
import math
# Set seaborn's default colour palette to "GnBu_d".
sns.set_palette(sns.color_palette("GnBu_d"))
%matplotlib inline




In [5]:
def get_mean_from_file(file_path):
    '''
    Reads the comma-separated means of a distribution from a text file.

    The files provided by the instructor end with a trailing comma, which
    produces an empty final token. Empty tokens are skipped instead of
    blindly dropping the last entry, so a file without the trailing comma
    is also read completely.

    Parameters
    ----------
    file_path : str
        Path of the text file holding the comma-separated values.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per value found in the file.

    Raises
    ------
    ValueError
        If a non-empty token cannot be parsed as a float.
    '''
    with open(file_path) as file_handler:
        tokens = file_handler.read().strip().split(',')
    # The builtin float is used because the `np.float` alias no longer
    # exists in current NumPy releases.
    list_of_means_float = [float(token) for token in tokens if token.strip()]
    return np.array(list_of_means_float, dtype=float)
 
def get_covariance_matrix_from_file(file_path):
    '''
    Loads a covariance matrix stored as headerless comma-separated rows.

    Every row of the instructor files ends with a trailing comma, which
    pandas parses as one extra, entirely empty column. That column is
    removed when present, whatever the width of the matrix, so files
    without the trailing comma load unchanged.

    Parameters
    ----------
    file_path : str
        Path of the CSV file holding the matrix.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per line of the file.
    '''
    df_covariance = pd.read_csv(file_path, header=None)
    has_columns = df_covariance.shape[1] > 0
    if has_columns and df_covariance.iloc[:, -1].isnull().all():
        df_covariance = df_covariance.iloc[:, :-1]
    # `DataFrame.as_matrix()` was removed in pandas 1.0; `to_numpy()` is
    # its replacement.
    return df_covariance.to_numpy()



# Load the two class means and the covariance matrix supplied for DS1.
m0 = get_mean_from_file('./hwk2_datasets_corrected/DS1_m_0.txt')
m1 = get_mean_from_file('./hwk2_datasets_corrected/DS1_m_1.txt')
covariance_matrix = get_covariance_matrix_from_file('./hwk2_datasets_corrected/DS1_Cov.txt')

# Draw 2000 samples per class; both classes use the same covariance matrix.
# NOTE(review): no random seed is set in this cell, so each run generates
# a different dataset.
m0_dataset = np.random.multivariate_normal(m0, covariance_matrix, size=2000)
m1_dataset = np.random.multivariate_normal(m1, covariance_matrix, size=2000)

# Store the class label in the column named 20: 0 marks a negative
# example and 1 marks a positive example.
# NOTE(review): assumes the mean vectors have 20 entries, so the features
# occupy columns 0-19 -- confirm against the data files.
df_m0 = pd.DataFrame(m0_dataset)
df_m0[20] = 0

df_m1 = pd.DataFrame(m1_dataset)
df_m1[20] = 1

# Concatenate the two classes into a single dataframe (4000 rows), then
# shuffle the rows by re-indexing with a random permutation of the index.
# The first 1200 shuffled rows form the test set, the rest the train set.
df = pd.concat([df_m1, df_m0],ignore_index=True)
df = df.reindex(np.random.permutation(df.index))
test_df = df[:1200].reset_index(drop=True)
train_df = df[1200:].reset_index(drop=True)

# Persist the full dataset and both splits. `df` keeps its shuffled index
# here; only the two splits were re-indexed above.
df.to_csv('./DS1.csv')
test_df.to_csv('./DS1_test.csv')
train_df.to_csv('./DS1_train.csv')



## Q2

In [7]:
import functools

def _get_column_vector_means(df, class_num):
    '''
    Returns the per-column mean of the rows labelled `class_num`.

    Column 20 holds the class label and is left out of the result; every
    other column contributes one entry, in column order.
    '''
    rows_of_class = df[df[20] == class_num]
    feature_means = [
        rows_of_class[name].mean()
        for name in rows_of_class
        if name != 20
    ]
    return np.asarray(feature_means)

def _get_covariance_sum(df, class_num, mean_vector):
    '''
    Returns one class's contribution to the pooled LDA covariance matrix.

    Computes sum_i (x_i - mean)(x_i - mean)^T over the rows of `df` whose
    label (column 20) equals `class_num`, divided by the total number of
    rows in `df`. Dividing by the size of the whole dataset rather than
    the class size means that adding the contributions of all classes
    gives the shared covariance estimate directly.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples with the class label stored in column 20.
    class_num : int
        Label of the class whose rows are summed.
    mean_vector : array-like
        Mean of that class, one entry per feature column.

    Returns
    -------
    numpy.ndarray
        (n_features, n_features) matrix; all zeros when the class has no
        rows or `df` is empty.
    '''
    class_df = df[df[20] == class_num].drop([20], axis=1)
    # `DataFrame.as_matrix()` was removed in pandas 1.0; use `to_numpy()`.
    samples = class_df.to_numpy(dtype=float)
    centered = samples - np.asarray(mean_vector, dtype=float).reshape(-1)
    # centered^T @ centered equals the sum of the per-row outer products,
    # and its shape follows the number of feature columns instead of a
    # hard-coded 20x20.
    scatter = np.matmul(centered.transpose(), centered)
    if len(df) == 0:
        # No rows at all: the sum is empty, so avoid dividing by zero.
        return scatter
    return scatter / float(len(df))
    
def _get_covariance(df, class_nums, class_mean_vectors):
    '''
    Returns the shared covariance matrix used by LDA.

    Adds up the contribution of every class in `class_nums`, as computed
    by `_get_covariance_sum`.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples with the class label stored in column 20.
    class_nums : iterable
        Class labels to include.
    class_mean_vectors : dict
        Maps each class label to its mean vector.

    Returns
    -------
    numpy.ndarray
        (n_features, n_features) matrix; all zeros when `class_nums` is
        empty.
    '''
    # Size the accumulator from the data (every column except the label
    # column 20) instead of hard-coding 20x20.
    n_features = df.drop([20], axis=1).shape[1]
    covariance = np.zeros((n_features, n_features))
    for class_num in class_nums:
        mean_vector = class_mean_vectors[class_num]
        covariance = covariance + _get_covariance_sum(df, class_num, mean_vector)
    return covariance

def _get_class_mean_vectors(df, class_nums):
    '''
    Builds a lookup from each class label in `class_nums` to the mean
    vector of that class, as computed by `_get_column_vector_means`.
    '''
    return {
        label: _get_column_vector_means(df, label)
        for label in class_nums
    }

def _get_class_probability(df, class_nums):
    '''
    Builds a lookup from each class label in `class_nums` to the fraction
    of rows in `df` whose label (column 20) equals that class.
    '''
    return {
        label: len(df[df[20] == label]) / float(len(df))
        for label in class_nums
    }


def _decision_boundary(x,class_probability_lookup=None, class_mean_vector_lookup=None, covariance=None):
    '''
    Classifies one sample with the two-class LDA decision rule.

    Evaluates the log-odds of class 0 against class 1,

        log(P0 / P1) - 0.5 * m0^T S^-1 m0 + 0.5 * m1^T S^-1 m1
            + x^T S^-1 (m0 - m1),

    where P are the class priors, m the class means and S the shared
    covariance matrix.

    Parameters
    ----------
    x : array-like
        The sample, given either as a flat vector of n features or as an
        (n, 1) column vector.
    class_probability_lookup : dict
        Prior probability of class 0 and class 1, keyed by 0 and 1.
    class_mean_vector_lookup : dict
        Mean vector of class 0 and class 1, keyed by 0 and 1.
    covariance : numpy.ndarray
        Shared (n, n) covariance matrix; it must be invertible.

    Returns
    -------
    int
        0 when the log-odds are strictly positive (class 0), otherwise 1.
    '''
    # Flatten everything to 1-D so flat vectors and column vectors are
    # handled the same way.
    sample = np.asarray(x, dtype=float).reshape(-1)
    mean_zero = np.asarray(class_mean_vector_lookup[0], dtype=float).reshape(-1)
    mean_one = np.asarray(class_mean_vector_lookup[1], dtype=float).reshape(-1)
    covariance_inverse = np.linalg.inv(covariance)

    log_prior_ratio = (math.log(class_probability_lookup[0])
                       - math.log(class_probability_lookup[1]))
    weights = np.matmul(covariance_inverse, mean_zero - mean_one)
    bias = (log_prior_ratio
            - 0.5 * np.dot(mean_zero, np.matmul(covariance_inverse, mean_zero))
            + 0.5 * np.dot(mean_one, np.matmul(covariance_inverse, mean_one)))
    log_odds = bias + np.dot(sample, weights)
    return 0 if log_odds > 0 else 1


def get_classifier(df, class_nums):
    '''
    Fits the LDA parameters on `df` and returns a ready-to-use classifier.

    Returns a 3-tuple: a callable that takes one sample and forwards it to
    `_decision_boundary` with the fitted parameters bound, the covariance
    matrix computed by `_get_covariance`, and the dictionary mapping each
    class label to its mean vector.
    '''
    mean_by_class = _get_class_mean_vectors(df, class_nums)
    shared_covariance = _get_covariance(df, class_nums, mean_by_class)
    fitted_parameters = {
        "class_probability_lookup": _get_class_probability(df, class_nums),
        "covariance": shared_covariance,
        "class_mean_vector_lookup": mean_by_class,
    }
    predict = functools.partial(_decision_boundary, **fitted_parameters)
    return predict, shared_covariance, mean_by_class


def _ratio_or_zero(numerator, denominator):
    '''Returns numerator / denominator, or 0.0 when the denominator is zero.'''
    if denominator == 0:
        return 0.0
    return float(numerator) / float(denominator)


def get_measurement_indicators(results):
    '''
    Computes accuracy, precision, recall and F1 for binary predictions.

    Class 1 is the positive class and class 0 the negative class. Pairs
    whose true label is neither 0 nor 1 are ignored.

    Parameters
    ----------
    results : iterable of (prediction, answer) pairs
        Predicted label and true label of each test sample. The iterable
        is traversed once, so a generator is accepted.

    Returns
    -------
    tuple of float
        (accuracy, precision, recall, f1_measure). A measure whose
        denominator is zero (for example precision when nothing was
        predicted positive, or every measure on empty input) is reported
        as 0.0 instead of raising ZeroDivisionError.
    '''
    true_positives = true_negatives = 0
    false_positives = false_negatives = 0
    for result, answer in results:
        if answer == 1:
            if result == answer:
                true_positives += 1
            else:
                false_negatives += 1
        elif answer == 0:
            if result == answer:
                true_negatives += 1
            else:
                false_positives += 1

    total = true_positives + false_positives + false_negatives + true_negatives
    accuracy = _ratio_or_zero(true_positives + true_negatives, total)
    precision = _ratio_or_zero(true_positives, true_positives + false_positives)
    recall = _ratio_or_zero(true_positives, true_positives + false_negatives)
    f1_measure = _ratio_or_zero(2 * precision * recall, precision + recall)
    return accuracy, precision, recall, f1_measure


In [8]:
# Running experiment here 
# Fit LDA on the DS1 training split; the two classes are labelled 0 and 1.
class_nums = (0, 1)
classifier,covariance,class_mean_vectors = get_classifier(train_df, class_nums)

# Report the parameters learnt by LDA: the covariance matrix and one mean
# vector per class, each written to its own file with ndarray.tofile's
# default arguments.
covariance.tofile('DS1-covariance-matrix')
for vector in class_mean_vectors:
    # `vector` is the class label (the dictionary key), not the array.
    vector_arr = class_mean_vectors[vector]
    vector_arr.tofile('DS1-Class-%s-Mean-Vector'%(vector))

In [9]:
# Getting the results on the test set 

# `as_matrix()` was removed in pandas 1.0; `to_numpy()` is its replacement.
# Columns 0-19 hold the features and column 20 the true class label.
test_df_vector = test_df.iloc[:,0:20].to_numpy()
answers = test_df.iloc[:,20].to_numpy()

# Collect one (predicted label, true label) pair per test sample.
results = [] 
for i , row in enumerate(test_df_vector):
    answer = answers[i]
    # The classifier is called with the sample as a column vector.
    row = row.reshape(len(row),1)
    result = classifier(row)
    results.append((result,answer))

The following definitions were used for true/false positives and true/false negatives. They were taken from
slides 15 and 17 of the lecture on model evaluation.
<br />
True positive: Example of class 1 predicted as class 1.
<br />
False positive: Example of class 0 predicted as class 1. 
<br />
True negative: Example of class 0 predicted as class 0.
<br />
False negative: Example of class 1 predicted as class 0. 
<br/>

Accuracy = (TP + TN) / (TP + FP + FN + TN)
<br /> 
Precision = True positives / Total number of declared positives = TP / (TP+ FP)
<br />
Recall = True positives / Total number of actual positives = TP / (TP + FN)
<br />
F1 measure = (2 * Precision * Recall) / (Precision + Recall)





In [5]:
# Score the (prediction, answer) pairs collected in `results` and print
# each measure to four decimal places.
accuracy, precision, recall, f1_measure = get_measurement_indicators(results)

print("Accuracy:%.4f"%(accuracy))
print("Precision:%.4f"%(precision))
print("Recall:%.4f"%(recall))
print("F1 Measure:%.4f"%(f1_measure))

Accuracy:0.9625
Precision:0.9646
Recall:0.9630
F1 Measure:0.9638


## Q3

The algorithm for k-nearest neighbours used in this assignment is taken from slide 28 of lecture 7 on Instance Learning. During training, the data points are simply stored. When making a prediction, the Euclidean distance is used as the distance measure to compare the input vector with every training data point. The training points are then sorted by distance, and the input is assigned the majority class among the k points closest to it.


In [10]:
from collections import Counter


def _get_knn_classifier(x,train_data=None, class_labels = None, k=None):
    '''
    Predicts the label of sample `x` by majority vote of its k nearest
    training points.

    Distances are Euclidean norms of `x - train_data` taken along axis 1.
    The labels of the k smallest distances are counted and the most
    common one is returned.
    '''
    distances = np.linalg.norm(x - train_data, axis=1)
    ranked = sorted(zip(distances, class_labels), key=lambda pair: pair[0])
    votes = Counter(label for _, label in ranked[:k])
    return votes.most_common(1)[0][0]
    
    
    
def knn_classifier(train_data, class_labels,k):
    '''
    Returns a k-nearest-neighbour classifier for the given training set.

    "Training" only stores the data: the result is `_get_knn_classifier`
    with `train_data`, `class_labels` and `k` already bound, so it can be
    called with a single sample.
    '''
    bound_arguments = dict(train_data=train_data,
                           class_labels=class_labels,
                           k=k)
    return functools.partial(_get_knn_classifier, **bound_arguments)


    
# `as_matrix()` was removed in pandas 1.0; `to_numpy()` is its replacement.
train_df_matrix = train_df.to_numpy()

# Column 20 holds the class label; columns 0-19 hold the features.
train_class_labels = train_df_matrix[:,20]
train_data = train_df_matrix[:,0:20]

In [11]:
# `as_matrix()` was removed in pandas 1.0; convert once with `to_numpy()`
# and slice the features (columns 0-19) and labels (column 20) from it.
test_matrix = test_df.to_numpy()
test_vectors = test_matrix[:,0:20]
test_class_labels = test_matrix[:,20]

# Evaluate k-NN on the test split for every k from 1 to 20 and keep the
# measurements of each run as (k, accuracy, precision, recall, f1_measure).
measurements = [] 
for k in range(1,21):
    knn_clfr = knn_classifier(train_data, train_class_labels, k)
    
    results = []
    for test_vector, class_label in zip(test_vectors, test_class_labels):
        prediction = knn_clfr(test_vector)
        results.append((prediction, class_label))

    accuracy, precision, recall, f1_measure = get_measurement_indicators(results)
    
    print(k)
    print("Accuracy:%.4f"%(accuracy))
    print("Precision:%.4f"%(precision))
    print("Recall:%.4f"%(recall))
    print("F1 Measure:%.4f"%(f1_measure))
    
    measurements.append((k,accuracy,precision,recall,f1_measure))
    


1
Accuracy:0.5183
Precision:0.5109
Recall:0.5541
F1 Measure:0.5316
2
Accuracy:0.5000
Precision:0.4884
Recall:0.2855
F1 Measure:0.3603
3
Accuracy:0.5300
Precision:0.5216
Recall:0.5709
F1 Measure:0.5452
4
Accuracy:0.5425
Precision:0.5497
Recall:0.4020
F1 Measure:0.4644
5
Accuracy:0.5608
Precision:0.5509
Recall:0.5946
F1 Measure:0.5719
6
Accuracy:0.5442
Precision:0.5460
Recall:0.4510
F1 Measure:0.4940
7
Accuracy:0.5675
Precision:0.5542
Recall:0.6301
F1 Measure:0.5897
8
Accuracy:0.5567
Precision:0.5573
Recall:0.4932
F1 Measure:0.5233
9
Accuracy:0.5642
Precision:0.5511
Recall:0.6284
F1 Measure:0.5872
10
Accuracy:0.5558
Precision:0.5539
Recall:0.5118
F1 Measure:0.5320
11
Accuracy:0.5558
Precision:0.5429
Recall:0.6301
F1 Measure:0.5833
12
Accuracy:0.5500
Precision:0.5450
Recall:0.5321
F1 Measure:0.5385
13
Accuracy:0.5508
Precision:0.5387
Recall:0.6233
F1 Measure:0.5779
14
Accuracy:0.5458
Precision:0.5410
Recall:0.5236
F1 Measure:0.5322
15
Accuracy:0.5475
Precision:0.5363
Recall:0.6115
F1 Meas

*Do you do better or worse than LDA?*

The performance of KNN across all evaluation measurements was worse than linear discriminant analysis. 

*Are there particular values of k which perform better?* 

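Even values of k give a much lower recall than odd values of k. A likely cause is that an even k allows the vote between the two classes to tie, so the prediction then depends on how the tie is broken.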

*Report the best fit accuracy, precision, recall and f-measure achieved by this classifier.*

The best fit was selected by comparing the accuracy for k = 1 to 20, which gave k = 13. The accuracy, precision, recall and F1 measure were:

k = 13
<br />
Accuracy:0.5508
<br />
Precision:0.5387
<br />
Recall:0.6233
<br />
F1 Measure:0.5779
<br />



## Q4

In [12]:
from numpy.random import choice

# Load the DS2 parameters. In the file names "c1"/"c2" identify the class
# and "m1".."m3" the mixture component; there is one covariance matrix per
# mixture component.
c1_m1_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c1_m1.txt')
c1_m2_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c1_m2.txt')
c1_m3_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c1_m3.txt')
c2_m1_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c2_m1.txt')
c2_m2_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c2_m2.txt')
c2_m3_mean = get_mean_from_file('./hwk2_datasets_corrected/DS2_c2_m3.txt')
covariance_one = get_covariance_matrix_from_file('./hwk2_datasets_corrected/DS2_Cov1.txt')
covariance_two = get_covariance_matrix_from_file('./hwk2_datasets_corrected/DS2_Cov2.txt')
covariance_three = get_covariance_matrix_from_file('./hwk2_datasets_corrected/DS2_Cov3.txt')

class_nums = (0, 1)

# Lookup keyed by (class label, mixture component). Note that in these
# keys "c1".."c3" name the mixture component, unlike in the file names
# above where "c1"/"c2" name the class.
means = {
    (0,"c1"):c1_m1_mean,
    (0,"c2"):c1_m2_mean,
    (0,"c3"):c1_m3_mean,
    (1,"c1"):c2_m1_mean,
    (1,"c2"):c2_m2_mean,
    (1,"c3"):c2_m3_mean
}

# Covariance matrix of each mixture component, shared by both classes.
covariances = {
    "c1": covariance_one,
    "c2": covariance_two,
    "c3": covariance_three
}

# Draw 2000 samples per class. For every sample a mixture component is
# first picked with probabilities 0.1 / 0.42 / 0.48, then the sample is
# drawn from that component's Gaussian for the current class.
# NOTE(review): no random seed is set in this cell, so each run generates
# a different dataset.
samples = [] 
for class_num in class_nums:
    class_samples = [] 
    for _ in range(2000):
        # `choice` is asked for one draw and returns a length-1 array,
        # hence the [0] indexing below.
        sample_class = choice(["c1","c2","c3"], 1, p=[0.1, 0.42, 0.48])
        sample_mean = means[(class_num,sample_class[0])]
        sample_covariance = covariances[sample_class[0]]
        sample = np.random.multivariate_normal(sample_mean, sample_covariance)
        class_samples.append(sample)
    samples.append(np.asarray(class_samples))

# Store the class label in the column named 20: the first class is
# labelled 0 and the second class is labelled 1.
class_one_df = pd.DataFrame(samples[0])
class_one_df[20] = 0
class_two_df = pd.DataFrame(samples[1])
class_two_df[20] = 1


# Concatenate both classes, shuffle the rows with a random permutation of
# the index, then use the first 1200 shuffled rows as the test set and
# the rest as the train set.
df = pd.concat([class_one_df, class_two_df],ignore_index=True)
df = df.reindex(np.random.permutation(df.index))
test_df = df[:1200].reset_index(drop=True)
train_df = df[1200:].reset_index(drop=True)

# Persist the full dataset and both splits. `df` keeps its shuffled index
# here; only the two splits were re-indexed above.
df.to_csv('./DS2.csv')
test_df.to_csv('./DS2_test.csv')
train_df.to_csv('./DS2_train.csv')



In [14]:
# Running experiment with LDA 

# Fit LDA on the DS2 training split.
classifier,covariance,class_mean_vectors = get_classifier(train_df, class_nums)

# Report the learnt parameters: the covariance matrix and one mean vector
# per class (`vector` is the class label used as the dictionary key).
covariance.tofile('DS2-covariance-matrix')
for vector in class_mean_vectors:
    vector_arr = class_mean_vectors[vector]
    vector_arr.tofile('DS2-Class-%s-Mean-Vector'%(vector))
    

# `as_matrix()` was removed in pandas 1.0; `to_numpy()` is its replacement.
# Columns 0-19 hold the features and column 20 the true class label.
test_df_vector = test_df.iloc[:,0:20].to_numpy()
answers = test_df.iloc[:,20].to_numpy()

# Collect one (predicted label, true label) pair per test sample.
results = [] 
for i , row in enumerate(test_df_vector):
    answer = answers[i]
    # The classifier is called with the sample as a column vector.
    row = row.reshape(len(row),1)
    result = classifier(row)
    results.append((result,answer))
  

accuracy, precision, recall, f1_measure = get_measurement_indicators(results)

print("Accuracy:%.4f"%(accuracy))
print("Precision:%.4f"%(precision))
print("Recall:%.4f"%(recall))
print("F1 Measure:%.4f"%(f1_measure))





Accuracy:0.5375
Precision:0.5352
Recall:0.5343
F1 Measure:0.5348


In [15]:
# Running experiment with KNN

# `as_matrix()` was removed in pandas 1.0; `to_numpy()` is its replacement.
# Column 20 holds the class label; columns 0-19 hold the features.
train_df_matrix = train_df.to_numpy()
train_class_labels = train_df_matrix[:,20]
train_data = train_df_matrix[:,0:20]

test_df_vector = test_df.iloc[:,0:20].to_numpy()
answers = test_df.iloc[:,20].to_numpy()

# Evaluate k-NN on the DS2 test split for every k from 1 to 20.
for k in range(1,21):
    knn_clfr = knn_classifier(train_data, train_class_labels, k)
    
    results = []
    for test_vector, class_label in zip(test_df_vector, answers):
        prediction = knn_clfr(test_vector)
        results.append((prediction, class_label))

    accuracy, precision, recall, f1_measure = get_measurement_indicators(results)
    
    print(k)
    print("Accuracy:%.4f"%(accuracy))
    print("Precision:%.4f"%(precision))
    print("Recall:%.4f"%(recall))
    print("F1 Measure:%.4f"%(f1_measure))



1
Accuracy:0.5325
Precision:0.5306
Recall:0.5226
F1 Measure:0.5266
2
Accuracy:0.5258
Precision:0.5500
Recall:0.2580
F1 Measure:0.3512
3
Accuracy:0.5167
Precision:0.5150
Recall:0.4874
F1 Measure:0.5009
4
Accuracy:0.5383
Precision:0.5609
Recall:0.3317
F1 Measure:0.4168
5
Accuracy:0.5250
Precision:0.5241
Recall:0.4925
F1 Measure:0.5078
6
Accuracy:0.5275
Precision:0.5385
Recall:0.3518
F1 Measure:0.4255
7
Accuracy:0.5283
Precision:0.5275
Recall:0.4975
F1 Measure:0.5121
8
Accuracy:0.5225
Precision:0.5300
Recall:0.3551
F1 Measure:0.4253
9
Accuracy:0.5283
Precision:0.5275
Recall:0.4975
F1 Measure:0.5121
10
Accuracy:0.5383
Precision:0.5531
Recall:0.3752
F1 Measure:0.4471
11
Accuracy:0.5433
Precision:0.5451
Recall:0.4958
F1 Measure:0.5193
12
Accuracy:0.5375
Precision:0.5510
Recall:0.3802
F1 Measure:0.4500
13
Accuracy:0.5517
Precision:0.5526
Recall:0.5193
F1 Measure:0.5354
14
Accuracy:0.5550
Precision:0.5708
Recall:0.4255
F1 Measure:0.4875
15
Accuracy:0.5592
Precision:0.5612
Recall:0.5226
F1 Meas

The best fit was determined by finding the highest accuracy for k = 1 to 20. This was found to be k = 15. The accuracy, precision, recall and F1 measure were:

k = 15 
<br />
Accuracy:0.5592
<br />
Precision:0.5612
<br />
Recall:0.5226
<br />
F1 Measure:0.5412

The performance of LDA is reduced dramatically on DS2. This can be explained by LDA's assumptions that each class follows a single Gaussian distribution and that all classes share the same covariance matrix. In DS2, each class is instead a mixture of three Gaussians, each with its own covariance matrix, so these assumptions are violated and we got poor performance as a result.

The performance of KNN did not improve from DS1 to DS2. In this case, KNN performed better than LDA when an odd value of k was used.