In [None]:
# Colab-only setup: mount Google Drive at /content/drive so files stored
# there (such as the dataset path used below) can be opened like local files.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import math
import csv
import pandas as pd

# CSV file read by pandas further down.
dataset_file = "/content/drive/MyDrive/AI assignment/Assignment3/dataset.csv"

# Filled later with one normalized record per dataset row.
mark_records = []

# Per-column lower bounds. The 100 here is only a placeholder; load_min_max()
# stores the values actually observed in the data into this dict.
min_marks = dict.fromkeys(
    (
        "Assignment-1",
        "Assignment-2",
        "Assignment-3",
        "Assignment-4",
        "Assignment-5",
        "Final",
        "Mid",
    ),
    100,
)

# Per-column upper bounds, same keys in the same order, placeholder of -100.
max_marks = dict.fromkeys(min_marks, -100)

def load_min_max(df):
    """Record the smallest and largest value of each inner column of *df*.

    Every column except the first and the last is scanned; its minimum and
    maximum are truncated to ``int`` and stored in the module-level
    ``min_marks`` and ``max_marks`` dicts under the column name.

    Args:
        df: DataFrame whose columns between the first and last are numeric.
    """
    inner_columns = df.columns[1:-1]
    for name in inner_columns:
        values = df[name]
        min_marks[name] = int(min(values))
        max_marks[name] = int(max(values))

def get_normalized_entry(row):
    """Min-max scale the inner values of one dataset row.

    The first and last items of *row* are skipped. Each remaining value is
    truncated to ``int`` and rescaled with the bounds stored for its column
    in the module-level ``min_marks`` / ``max_marks`` dicts; the column name
    is looked up by position in the module-level ``df``.

    Args:
        row: Sequence of values in the same column order as ``df``.

    Returns:
        A list of floats, one per inner column. A column whose stored minimum
        and maximum are equal yields ``0.0``.
    """
    result = []
    for idx, raw_value in enumerate(row[1:-1]):
        column_name = df.columns[idx + 1]
        low = min_marks[column_name]
        span = max_marks[column_name] - low
        if span == 0:
            # A constant column carries no information for distance
            # comparisons; emit 0.0 rather than dividing by zero.
            result.append(0.0)
            continue
        result.append((int(raw_value) - low) / span)
    return result

# Read the dataset and capture each inner column's min/max before scaling.
df = pd.read_csv(dataset_file)
load_min_max(df)

# Rebuild every row as [first column, normalized inner values..., last column]
# and collect the result in mark_records.
for row in df.itertuples(index=False):
    raw_values = list(row)
    mark_records.append(
        [raw_values[0], *get_normalized_entry(raw_values), raw_values[-1]]
    )

def euclidean_distance(row1, row2):
    """Return the Euclidean distance between two equal-length sequences.

    Args:
        row1: First sequence of numbers.
        row2: Second sequence of numbers.

    Returns:
        The square root of the summed squared differences, or ``None`` when
        the two sequences differ in length.
    """
    if len(row1) == len(row2):
        squared_total = sum((a - b) ** 2 for a, b in zip(row1, row2))
        return math.sqrt(squared_total)
    return None

def get_accuracy(true_output, predicted_output):
    """Return the percentage of predictions that match the true labels.

    Labels are compared position by position, using the index of each item
    in *true_output* to look up the corresponding prediction.

    Args:
        true_output: Sequence of expected labels.
        predicted_output: Sequence of predicted labels, at least as long as
            *true_output*.

    Returns:
        Accuracy as a float between 0 and 100. An empty *true_output* yields
        ``0.0`` instead of raising ``ZeroDivisionError``.

    Raises:
        IndexError: If *predicted_output* is shorter than *true_output*.
    """
    total = len(true_output)
    if total == 0:
        # Nothing to score (e.g. a split that came out empty on a tiny dataset).
        return 0.0
    correct = sum(
        1
        for idx, outcome in enumerate(true_output)
        if predicted_output[idx] == outcome
    )
    return correct / total * 100

def _knn_predict(entry, training_set, k):
    """Return the majority label among the k training records nearest to entry.

    Records are laid out as [first column, features..., label]. Distance is
    measured on the features only, and a tie in votes goes to the label that
    was seen first among the nearest neighbours.

    Args:
        entry: Record to classify.
        training_set: Records to search for neighbours.
        k: Number of nearest neighbours that vote.

    Returns:
        The label (last item) that received the most votes.

    Raises:
        ValueError: If no training record is available to vote.
    """
    features = entry[1:-1]
    # Training records identical to the entry in every field are skipped,
    # matching the behaviour of the original inline loops.
    distances = [
        (euclidean_distance(features, candidate[1:-1]), candidate[-1])
        for candidate in training_set
        if entry != candidate
    ]
    distances.sort(key=lambda pair: pair[0])

    class_votes = {}
    for _, neighbor_class in distances[:k]:
        class_votes[neighbor_class] = class_votes.get(neighbor_class, 0) + 1

    if not class_votes:
        raise ValueError("no training records available to classify the entry")
    return max(class_votes, key=class_votes.get)


# 80% / 10% / 10% split taken in file order (the records are not shuffled).
train_end = math.floor(len(mark_records) * 0.8)
validation_end = math.floor(len(mark_records) * 0.9)
training = mark_records[:train_end]
validation = mark_records[train_end:validation_end]
testing = mark_records[validation_end:]

k_values = [1, 3, 5, 7]
accuracy_results = []

# Score every candidate k on the validation split.
for k in k_values:
    predicted_output = [_knn_predict(entry, training, k) for entry in validation]
    true_output = [entry[-1] for entry in validation]
    accuracy = get_accuracy(true_output, predicted_output)
    accuracy_results.append(accuracy)
    print(f"Accuracy on validation set for K = {k}: {accuracy}%")

# On equal accuracy the earliest (smallest) k in k_values wins.
best_k = k_values[accuracy_results.index(max(accuracy_results))]
print(f"\nBest K value: k = {best_k}")

# Final evaluation of the chosen k on the held-out testing split.
predicted_output = [_knn_predict(entry, training, best_k) for entry in testing]
true_output = [entry[-1] for entry in testing]
accuracy = get_accuracy(true_output, predicted_output)
print(f"\nAccuracy on testing set for K = {best_k}: {accuracy}%")


Accuracy on validation set for K = 1: 33.33333333333333%
Accuracy on validation set for K = 3: 33.33333333333333%
Accuracy on validation set for K = 5: 66.66666666666666%
Accuracy on validation set for K = 7: 33.33333333333333%

Best K value: k = 5

Accuracy on testing set for K = 5: 50.0%
