In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import math
import random


In [2]:
df = pd.read_csv("16P.csv", encoding = "cp1252")

In [3]:
df = df.drop(columns="Response Id")

In [4]:
mapping = {
    'ESTJ': 0,
    'ENTJ': 1,
    'ESFJ': 2,
    'ENFJ': 3,
    'ISTJ': 4,
    'ISFJ': 5,
    'INTJ': 6,
    'INFJ': 7,
    'ESTP': 8,
    'ESFP': 9,
    'ENTP': 10,
    'ENFP': 11,
    'ISTP': 12,
    'ISFP': 13,
    'INTP': 14,
    'INFP': 15
}

df['Personality'] = df['Personality'].map(mapping)


In [5]:
# feature normalization
a = df["Personality"]
df1 = df.drop(columns="Personality")
new_data = df.to_numpy()
new_data = (new_data - new_data.min(axis=0)) / (new_data.max(axis=0) - new_data.min(axis=0))

In [6]:
data = df.to_numpy()
print(data.shape)

(59999, 61)


In [7]:
# Find the k nearest neighbors of a sample
def find_neighbors(X, y, sample, k):
    # calculate distance between sample and all samples in X
    distances = np.sqrt(np.sum((X - sample)**2, axis=1))
    # get the indices of the k smallest distances
    k_indices = np.argpartition(distances, k)[:k]
    # return the distances and corresponding labels for the k nearest neighbors
    return list(zip(distances[k_indices], y[k_indices]))



# Predict the class of a sample based on the classes of its k nearest neighbors
def predict(neighbors):
    class_counts = {}
    for _, class_ in neighbors:
        if class_ in class_counts:
            class_counts[class_] += 1
        else:
            class_counts[class_] = 1
    sorted_counts = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)
    return sorted_counts[0][0]

    

In [8]:
X = data[:, :-1]
y = data[:, -1]

In [9]:
def recall(y_test, predictions):
    true_positives = 0
    false_negatives = 0
    for i in range(len(y_test)):
        if y_test[i] == predictions[i]:
            true_positives += 1
        elif predictions[i] == 1:
            false_negatives += 1
    recall = true_positives / (true_positives + false_negatives)
    return recall
    

In [10]:
def precision(y_test, predictions):
    true_positives = 0
    false_positives = 0
    for i in range(len(y_test)):
        if y_test[i] == predictions[i]:
            true_positives += 1
        elif predictions[i] == 0:
            false_positives += 1
    precision = true_positives / (true_positives + false_positives)
    return precision


In [242]:
k = [1,3,5,7,9]

accuracies = []
recalls = []
precisions = []

num_samples = len(df)
num_folds = 5
num_samples_per_fold = num_samples // num_folds


for i in range(5):
    test_indices = random.sample(range(num_samples), num_samples_per_fold)
    X_test = X[test_indices]
    y_test = y[test_indices]
    train_indices = [i for i in range(num_samples) if i not in test_indices]
    X_train = X[train_indices]
    y_train = y[train_indices]


    # Perform predictions for each k value
    for j in k:
        predictions = []
        for sample in X_test:
            if sample.ndim == 1:
                sample = sample.reshape(1,-1)
            neighbors = find_neighbors(X_train, y_train, sample, j)
            predictions.append(predict(neighbors))
        accuracy = sum((predictions) == (y_test)) / len(y_test)
        r = recall(y_test, predictions)
        p = precision(y_test, predictions)
        accuracies.append(accuracy)
        recalls.append(r)
        precisions.append(p)

    mean_accuracies = [np.mean(accuracies[i:i+num_folds]) for i in range(0, len(accuracies), num_folds)]
    mean_recalls = [np.mean(recalls[i:i+num_folds]) for i in range(0, len(recalls), num_folds)]
    mean_precisions = [np.mean(precisions[i:i+num_folds]) for i in range(0, len(precisions), num_folds)]
    for j,i,a,l in zip(k,mean_accuracies,mean_recalls,mean_precisions):
        print(f'k = {j}: accuracy = {i}, recall = {a}, precision = {l}')


k = 1: accuracy = 0.985898824902075, recall = 0.9993733680655421, precision = 0.9993908737989037
k = 1: accuracy = 0.985898824902075, recall = 0.9993733680655421, precision = 0.9993908737989037
k = 3: accuracy = 0.9865988832402699, recall = 0.9985811484927989, precision = 0.9994585976120394
k = 1: accuracy = 0.985898824902075, recall = 0.9993733680655421, precision = 0.9993908737989037
k = 3: accuracy = 0.9865988832402699, recall = 0.9985811484927989, precision = 0.9994585976120394
k = 5: accuracy = 0.98698224852071, recall = 0.9988016913386456, precision = 0.9990028907004689
k = 1: accuracy = 0.985898824902075, recall = 0.9993733680655421, precision = 0.9993908737989037
k = 3: accuracy = 0.9865988832402699, recall = 0.9985811484927989, precision = 0.9994585976120394
k = 5: accuracy = 0.98698224852071, recall = 0.9988016913386456, precision = 0.9990028907004689
k = 7: accuracy = 0.988065672139345, recall = 0.9988189857047877, precision = 0.9991220922443107
k = 1: accuracy = 0.985898824

In [11]:
######################## wiht feature normalization ##############################
Xf = new_data[:, :-1]
yf = new_data[:, -1]

In [12]:
k = [1,3,5,7,9]

accuracies = []
recalls = []
precisions = []

num_samples = len(df)
num_folds = 5
num_samples_per_fold = num_samples // num_folds


for i in range(5):
    test_indices = random.sample(range(num_samples), num_samples_per_fold)
    X_test = Xf[test_indices]
    y_test = yf[test_indices]
    train_indices = [i for i in range(num_samples) if i not in test_indices]
    X_train = Xf[train_indices]
    y_train = yf[train_indices]


    # Perform predictions for each k value
    for j in k:
        predictions = []
        for sample in X_test:
            if sample.ndim == 1:
                sample = sample.reshape(1,-1)
            neighbors = find_neighbors(X_train, y_train, sample, j)
            predictions.append(predict(neighbors))
        accuracy = sum((predictions) == (y_test)) / len(y_test)
        r = recall(y_test, predictions)
        p = precision(y_test, predictions)
        accuracies.append(accuracy)
        recalls.append(r)
        precisions.append(p)

    mean_accuracies = [np.mean(accuracies[i:i+num_folds]) for i in range(0, len(accuracies), num_folds)]
    mean_recalls = [np.mean(recalls[i:i+num_folds]) for i in range(0, len(recalls), num_folds)]
    mean_precisions = [np.mean(precisions[i:i+num_folds]) for i in range(0, len(precisions), num_folds)]
    for j,i,a,l in zip(k,mean_accuracies,mean_recalls,mean_precisions):
        print(f'k = {j}: accuracy = {i}, recall = {a}, precision = {l}')


k = 1: accuracy = 0.9840986748895741, recall = 0.9986625738633389, precision = 0.9991183402664504
k = 1: accuracy = 0.9840986748895741, recall = 0.9986625738633389, precision = 0.9991183402664504
k = 3: accuracy = 0.9849820818401532, recall = 0.9990160643491055, precision = 0.999423290131984
k = 1: accuracy = 0.9840986748895741, recall = 0.9986625738633389, precision = 0.9991183402664504
k = 3: accuracy = 0.9849820818401532, recall = 0.9990160643491055, precision = 0.999423290131984
k = 5: accuracy = 0.9860821735144596, recall = 0.9987477274610168, precision = 0.9992548811061072
k = 1: accuracy = 0.9840986748895741, recall = 0.9986625738633389, precision = 0.9991183402664504
k = 3: accuracy = 0.9849820818401532, recall = 0.9990160643491055, precision = 0.999423290131984
k = 5: accuracy = 0.9860821735144596, recall = 0.9987477274610168, precision = 0.9992548811061072
k = 7: accuracy = 0.9845153762813567, recall = 0.9988644266683375, precision = 0.9993396440411226
k = 1: accuracy = 0.984