In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the raw table from 'spambase.data'. The file has no header row, so
# pandas labels the columns by position.

data = pd.read_csv(filepath_or_buffer='spambase.data', header=None)
data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


In [3]:
# (number of rows, number of columns) of the loaded table
data.shape

(4601, 58)

In [4]:
# Shuffle the rows reproducibly, then split them: the first 2/3 (rounded up)
# become the training set and the remainder the validation set.

# copy=True guarantees an independent array. np.random.shuffle reorders its
# argument in place, and to_numpy() is allowed to return a view of the
# DataFrame's own buffer, so without the copy the shuffle could silently
# reorder `data` (or fail on a read-only view).
data_np = data.to_numpy(copy=True)
np.random.seed(0)
np.random.shuffle(data_np)

trainSplit = int(np.ceil(len(data_np) * (2/3)))
training, validation = data_np[:trainSplit], data_np[trainSplit:]

# The last column holds the class label; slicing with -1: keeps it as a
# column vector of shape (n, 1).
x_train, y_train = training[:, :-1], training[:, -1:]
x_valid, y_valid = validation[:, :-1], validation[:, -1:]


In [5]:
# Per-class statistics of the training set.
# Rows are selected by the label in the last column, which is then dropped so
# only the feature columns remain.

class_zero = training[training[:, -1] == 0][:, :-1]
class_one = training[training[:, -1] == 1][:, :-1]

# Feature-wise means, kept as row vectors of shape (1, n_features).
class_zero_mean = class_zero.mean(axis=0, keepdims=True)
class_one_mean = class_one.mean(axis=0, keepdims=True)

# Sample covariance matrices (columns are the variables, ddof=1).
cov_zero = np.cov(class_zero, rowvar=False, ddof=1)
cov_one = np.cov(class_one, rowvar=False, ddof=1)

In [6]:
# Scatter matrices for the two-class problem.
# sw: sum of the two per-class covariance matrices.
# sb: outer product of the class-mean difference with itself.

sw = np.add(cov_zero, cov_one)

# Difference of the class means as a column vector of shape (n_features, 1).
mean_difference = np.reshape(class_zero_mean - class_one_mean, (-1, 1))
sb = np.matmul(mean_difference, mean_difference.T)

In [7]:
# get eigenvalues and eigenvectors of pinv(sw) @ sb
# sort eigenvectors by eigenvalue and keep the top 1 as the projection weight

swsb = np.linalg.pinv(sw) @ sb
eigenvalues, eigenvectors = np.linalg.eig(swsb)

# swsb is not symmetric, so np.linalg.eig may hand back a complex dtype even
# when the leading eigenpair is real. Rank by the real part and drop the
# imaginary part of the selected vector before using it as a projection.
index = np.argsort(eigenvalues.real)[::-1]
sortedEigenvectors = eigenvectors[:, index]
weight = np.real(sortedEigenvectors[:, :1])

# An eigenvector is only defined up to its sign, but the classification step
# labels a sample 1 when its projection is NOT below the threshold. Orient the
# weight so that the class-one mean projects above the class-zero mean;
# otherwise an arbitrary sign flip from the solver would invert every
# prediction. mean_difference is (class_zero_mean - class_one_mean) as a
# column vector, so a positive product means class zero currently projects higher.
if (mean_difference.T @ weight).item() > 0:
    weight = -weight

In [8]:
# Decision threshold: midpoint between the projections of the two class means.
# Each sample is projected with z = x @ w; projections below the threshold are
# labelled 0, all others 1. Predictions are stored as plain Python lists.

projectedZeroMean = class_zero_mean @ weight
projectedOneMean = class_one_mean @ weight
threshold = (projectedZeroMean + projectedOneMean) / 2

trainingPrediciton = np.where(x_train @ weight < threshold, 0, 1).ravel().tolist()
validationPrediction = np.where(x_valid @ weight < threshold, 0, 1).ravel().tolist()

In [9]:
# compute accuracy for both training and validation data

def statistics(prediction, true_value):
    """Score binary predictions against the true labels.

    Parameters
    ----------
    prediction : sequence
        Predicted labels (0 or 1), one per sample.
    true_value : sequence
        True labels, indexable by the same positions as ``prediction``.
        Elements may be scalars or single-element arrays.

    Returns
    -------
    tuple
        ``(percentage_correct, precision, recall, fmeasure)`` where
        ``percentage_correct`` is on a 0-100 scale and the other three are
        on a 0-1 scale. Any ratio whose denominator is zero (no predicted
        positives, no actual positives, precision + recall == 0, or empty
        input) is reported as 0.0 instead of raising ZeroDivisionError.
    """

    def _safe_ratio(numerator, denominator):
        # An undefined ratio (zero denominator) is reported as 0.0.
        return numerator / denominator if denominator else 0.0

    truePositive = falsePositive = trueNegative = falseNegative = 0

    for i, predicted in enumerate(prediction):
        actual = true_value[i]

        if predicted == 1 and actual == 1:
            truePositive += 1
        elif predicted == 1 and actual == 0:
            falsePositive += 1
        elif predicted == 0 and actual == 0:
            trueNegative += 1
        else:
            # Every remaining combination is counted as a false negative.
            falseNegative += 1

    correct = truePositive + trueNegative

    percentage_correct = _safe_ratio(correct, len(prediction)) * 100
    precision = _safe_ratio(truePositive, truePositive + falsePositive)
    recall = _safe_ratio(truePositive, truePositive + falseNegative)
    fmeasure = _safe_ratio(2 * precision * recall, precision + recall)

    return percentage_correct, precision, recall, fmeasure

In [10]:
# get statistics for training and validation data
# NOTE: the misspelled names (trainingPrediciton, traininingAccuracy) are kept
# as-is; trainingPrediciton is defined in the prediction cell, and other cells
# may refer to traininingAccuracy.

traininingAccuracy, trainingPrecision, trainingRecall, trainingFmeasure = statistics(trainingPrediciton, y_train)
validationAccuracy, validationPrecision, validationRecall, validationFmeasure = statistics(validationPrediction, y_valid)

# Accuracy is reported for both splits; precision, recall and F-measure are
# reported for the validation split only.
print('Training Accuracy: {}%'.format(traininingAccuracy))
print('Validation Accuracy: {}%\n'.format(validationAccuracy))
print('Validation Precision: {}'.format(validationPrecision))
print('Validation Recall: {}'.format(validationRecall))
print('Validation F-Measure: {}'.format(validationFmeasure))

Training Accuracy: 91.03650586701434%
Validation Accuracy: 90.8023483365949%

Validation Precision: 0.8769497400346621
Validaiton Recall: 0.8784722222222222
Vaidaiton F-Measure: 0.8777103209019947
