In [1]:
import numpy as np
# Load the class means: c1 is the positive class (DS1_m_1), c2 the negative class (DS1_m_0).
# The last entry of each parsed vector is the nan column and is dropped
# (presumably produced by a trailing delimiter in the file -- TODO confirm).
c1_mean_raw = np.genfromtxt('./src/data/DS1_m_1.txt', delimiter=',')
c1_mean = np.delete(c1_mean_raw, c1_mean_raw.shape[0] - 1)
c2_mean_raw = np.genfromtxt('./src/data/DS1_m_0.txt', delimiter=',')
c2_mean = np.delete(c2_mean_raw, c2_mean_raw.shape[0] - 1)


In [2]:
# Load the covariance matrix and drop its last column, which is the nan column
cov_raw = np.genfromtxt('./src/data/DS1_Cov.txt', delimiter=',')
cov = np.delete(cov_raw, cov_raw.shape[1] - 1, axis=1)

In [3]:
# Generate N_SAMPLES sample points for each class
SEED = 0
N_SAMPLES = 2000
# Seed the global RNG so that the sampling below and every later np.random.shuffle
# give the same dataset and the same split on each Restart & Run All
np.random.seed(SEED)
c1_temp_samples = np.random.multivariate_normal(c1_mean, cov, N_SAMPLES)
c2_temp_samples = np.random.multivariate_normal(c2_mean, cov, N_SAMPLES)
col1 = np.ones(N_SAMPLES, dtype=int)
col0 = np.zeros(N_SAMPLES, dtype=int)
# Add a label of 1 for positive classes and 0 for negative classes (as the last column)
c1_samples = np.column_stack((c1_temp_samples, col1))
c2_samples = np.column_stack((c2_temp_samples, col0))
all_samples = np.vstack((c2_samples, c1_samples))
# Shuffle the dataset (rows only; each row keeps its own label)
np.random.shuffle(all_samples)
# Save dataset as DS1 with 10 decimal precision
np.savetxt('./src/data/DS1.csv', all_samples, delimiter=',', fmt='%.10f')

In [4]:
# Shuffle each class on its own so the split below is random within the class
np.random.shuffle(c2_samples)
np.random.shuffle(c1_samples)

# The first 30% of every class is held out for testing, the remaining 70% is used for training
c1_cut = int(len(c1_samples) * 0.3)
c2_cut = int(len(c2_samples) * 0.3)
c1_test, c1_train = c1_samples[:c1_cut], c1_samples[c1_cut:]
c2_test, c2_train = c2_samples[:c2_cut], c2_samples[c2_cut:]

# The last column is the class label, every column before it is a feature
c1_train_x, c1_train_y = c1_train[:, :-1], c1_train[:, -1]
c2_train_x, c2_train_y = c2_train[:, :-1], c2_train[:, -1]

# Merge both classes (c2 rows first) into one test and one training set
test = np.concatenate((c2_test, c1_test), axis=0)
train = np.concatenate((c2_train, c1_train), axis=0)

# Shuffle the rows so the two classes are interleaved
np.random.shuffle(test)
np.random.shuffle(train)

test_x, test_y = test[:, :-1], test[:, -1]
train_x, train_y = train[:, :-1], train[:, -1]


In [5]:
# Number of training rows in c1, in c2, and in the combined training set
n1, n2, n = len(c1_train), len(c2_train), len(train)

In [6]:
# Calculate mu1 for c1 (positive): the per-feature mean of the c1 training rows.
# reshape(-1, 1) yields a column vector for any number of features
# (the previous hard-coded 20 only worked for 20-dimensional data).
mu1 = c1_train_x.mean(axis=0).reshape(-1, 1)

In [7]:
# Calculate mu2 for c2 (negative): the per-feature mean of the c2 training rows.
# reshape(-1, 1) yields a column vector for any number of features
# (the previous hard-coded 20 only worked for 20-dimensional data).
mu2 = c2_train_x.mean(axis=0).reshape(-1, 1)

In [8]:
# Calculate S1 and S2: for each class, centre the training rows on the class mean
# and average the outer products of the centred rows
xminmu1 = c1_train_x - mu1.T
s1 = np.matmul(xminmu1.T, xminmu1) / n1

xminmu2 = c2_train_x - mu2.T
s2 = np.matmul(xminmu2.T, xminmu2) / n2

In [9]:
# Calculate sigma: S1 and S2 weighted by their class sizes and scaled by 1/n
sigma = (n1 * s1 + n2 * s2) * (1.0 / n)

In [10]:
# Calculate pi: the fraction of training rows that belong to c1
pi = n1 / float(n)

In [11]:
# Calculate the weight vector w and the bias term w0
sigmaInv = np.linalg.inv(sigma)
w = np.matmul(sigmaInv, mu1 - mu2)

# Quadratic forms mu^T sigma^-1 mu of the two class means
quad_mu1 = np.matmul(np.matmul(mu1.T, sigmaInv), mu1)
quad_mu2 = np.matmul(np.matmul(mu2.T, sigmaInv), mu2)
# The log term is the log-odds of the class prior pi
w0 = ((-0.5) * quad_mu1) + (0.5 * quad_mu2) + np.log(pi / (1 - pi))

# Save both coefficients with 10 decimal precision
np.savetxt('./src/DS1_LDA_w.csv', w, delimiter=',', fmt='%.10f')
np.savetxt('./src/DS1_LDA_w0.csv', w0, delimiter=',', fmt='%.10f')

In [12]:
# Logistic Sigmoid function
def sigmoid(a):
    """
    Numerically stable logistic sigmoid, 1 / (1 + e^(-a)), applied element-wise.

    Only exp(-|a|) is evaluated, which lies in [0, 1], so large negative inputs cannot
    overflow. The naive form computes e^(-a), which overflows for a below roughly -709
    and raises FloatingPointError when numpy is configured with np.seterr(all='raise').

    Args:
        :param a: Scalar or array-like of real values
    Returns:
        :return: Sigmoid of a, with the same shape as a (a numpy scalar for scalar input)
    """
    a = np.asarray(a, dtype=float)
    # exp(-|a|) underflowing to 0 for huge |a| is the exact limit we want, so it must not raise
    with np.errstate(under='ignore'):
        decay = np.exp(-np.abs(a))
        # a >= 0: 1 / (1 + e^-a)    a < 0: e^a / (1 + e^a)    (the same value, rearranged)
        result = np.where(a >= 0, 1.0, decay) / (1.0 + decay)
    # [()] turns a 0-d result into a numpy scalar and leaves real arrays untouched
    return result[()]

In [13]:
# Use the coefficients learnt to predict new input
# Floating point problems (overflow, invalid values, ...) raise instead of only warning
np.seterr(all='raise')
# One matrix product scores the whole test set at once instead of looping over its rows:
# test_x.dot(w) has one row per test point and w0 is broadcast onto each of them,
# so predictions keeps the shape (number of test points, 1)
predictions = sigmoid(test_x.dot(w) + w0)


In [14]:
# Calculate the true and false positive and negative predictions of the learnt model.
# A prediction of at least 0.5 counts as positive, anything below 0.5 as negative.
scores = predictions[:, 0]
says_pos = scores >= 0.5
says_neg = scores < 0.5
is_pos = test_y == 1
is_neg = test_y == 0

true_pos = np.count_nonzero(says_pos & is_pos)
true_neg = np.count_nonzero(says_neg & is_neg)
false_pos = np.count_nonzero(says_pos & is_neg)
false_neg = np.count_nonzero(says_neg & is_pos)

In [15]:
# Calculate the accuracy, precision, recall, and F1 of the learnt model
n_test = test.shape[0]
n_predicted_pos = true_pos + false_pos
n_actual_pos = true_pos + false_neg

accuracy = (true_neg + true_pos) / n_test
# Precision: share of positive predictions that are right
precision = true_pos / n_predicted_pos
# Recall: share of actual positives that were found
recall = true_pos / n_actual_pos
# F1: harmonic mean of precision and recall
f1 = (2 * precision * recall) / (precision + recall)

In [16]:
# Report every metric as a percentage with two decimals
print("Probabilistic LDA Results:")
for template, metric in (
    ("Accuracy: %.2f%%", accuracy),
    ("Precision: %.2f%%", precision),
    ("Recall: %.2f%%", recall),
    ("F1 Measure: %.2f%%", f1),
):
    print(template % (metric * 100))

Probabilistic LDA Results:
Accuracy: 95.33%
Precision: 94.88%
Recall: 95.83%
F1 Measure: 95.36%


In [17]:
# To implement k-NN as an alternative classification, we first implement euclidean distance
def euclidean(p1, p2, dimension):
    """
    Euclidean distance between two points, measured over their first `dimension` coordinates.

    Args:
        :param p1: First point (indexable by coordinate)
        :param p2: Second point (indexable by coordinate)
        :param dimension: Number of leading coordinates to include
    Returns:
        :return: Square root of the summed squared coordinate differences
    """
    squared_gaps = (np.power(p1[axis] - p2[axis], 2) for axis in range(dimension))
    # The built-in sum starts from 0 and adds left to right, like a running total
    return np.sqrt(sum(squared_gaps, 0))


#### We will also implement a function for getting the k nearest neighbors of a point

In [18]:
def neighbors(training_set: np.ndarray, test_point: np.ndarray, k: int) -> np.ndarray:
    """
    Calculate the k nearest neighbors in the training set for a given point in the test set of data,
    using Euclidean Distance (applicable for n dimensions)

    The distances to all training rows are computed in one vectorised call instead of a
    Python loop over the rows, and no hard-coded feature count is assumed.

    Args:
        :param training_set: The training set of data to look for neighbors in; 2-D, one row per
            sample, where the last column is the class and all other columns are the features
        :param test_point: The point in test set to look for neighbors for (features only)
        :param k: Number of nearest neighbors to return
    Returns:
        :return: Float array of shape (k, number of training_set columns) holding the k nearest
            training rows (features and class), ordered from nearest to farthest
    Raises:
        :raises ValueError: If training_set is not 2-D, or k is negative or larger than the
            number of training rows
    """
    samples = np.asarray(training_set, dtype=float)
    if samples.ndim != 2:
        raise ValueError("training_set must be 2-D (one row per sample), got %d-D" % samples.ndim)
    n_samples = samples.shape[0]
    if not 0 <= k <= n_samples:
        raise ValueError(
            "k must be between 0 and the training set size (%d), got %d" % (n_samples, k))

    # The last column is the class, not a feature, so it is left out of the distance
    distances = np.linalg.norm(samples[:, :-1] - test_point, axis=1)
    # Stable sort: rows at the same distance keep their training-set order
    nearest = np.argsort(distances, kind='mergesort')[:k]
    # Fancy indexing copies the rows, so the caller's training set is never aliased
    return samples[nearest]


In [19]:
# Result table: 200 rows by 4 columns, all zeros until filled in
kNN_results = np.zeros((200, 4), dtype=np.float64)

In [21]:
# Evaluate k-NN on the test set for every k from 1 up to the number of rows of kNN_results
max_k = kNN_results.shape[0]

# The k nearest neighbors of a point are the first k of its max_k nearest neighbors, so the
# training set is searched once per test point (not once per test point and per k) and only
# the neighbor classes, ordered from nearest to farthest, are kept
neighbor_labels = np.empty(shape=(len(test_x), max_k))
for i, test_sample in enumerate(test_x):
    neighbor_labels[i] = neighbors(train, test_sample, max_k)[:, -1]

# Column k - 1 holds the mean class of the k nearest neighbors, i.e. the k-NN score for that k
kNN_scores = np.cumsum(neighbor_labels, axis=1) / np.arange(1, max_k + 1)

kNN_is_pos = test_y == 1
kNN_is_neg = test_y == 0
for k in range(1, max_k + 1):
    kNN_prediction = kNN_scores[:, k - 1].reshape(-1, 1)
    kNN_says_pos = kNN_prediction[:, 0] >= 0.5
    kNN_says_neg = kNN_prediction[:, 0] < 0.5

    # Calculate the true and false positive and negative for the predictions
    kNN_tp = np.count_nonzero(kNN_says_pos & kNN_is_pos)
    kNN_tn = np.count_nonzero(kNN_says_neg & kNN_is_neg)
    kNN_fp = np.count_nonzero(kNN_says_pos & kNN_is_neg)
    kNN_fn = np.count_nonzero(kNN_says_neg & kNN_is_pos)

    # Calculate the accuracy, precision, recall, and f1 metrics of the k-NN model.
    # A metric whose denominator is zero is reported as 0.0 instead of aborting the sweep.
    kNN_accuracy = (kNN_tn + kNN_tp) / len(test_x)
    kNN_precision = kNN_tp / (kNN_tp + kNN_fp) if (kNN_tp + kNN_fp) else 0.0
    kNN_recall = kNN_tp / (kNN_tp + kNN_fn) if (kNN_tp + kNN_fn) else 0.0
    if kNN_precision + kNN_recall:
        kNN_f1 = (2 * kNN_precision * kNN_recall) / (kNN_precision + kNN_recall)
    else:
        kNN_f1 = 0.0

    # Row k - 1: accuracy, precision, recall, f1
    kNN_results[k - 1] = (kNN_accuracy, kNN_precision, kNN_recall, kNN_f1)

In [22]:
# Write the k-NN result table to disk as comma separated values with 4 decimals
np.savetxt('./src/DS1_kNN.csv', kNN_results, fmt='%.4f', delimiter=',')