In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time

In [2]:
# Load the raw feature table. header=None tells pandas the file has no header row,
# so the first line is treated as data and the columns get integer labels.
data = pd.read_csv('data/fea.csv', header=None)
# Preview the first rows as this cell's displayed output.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,35,45,45,48,50,52,54,60,61,65,...,7,6,7,6,6,8,8,9,9,9
1,13,12,11,13,11,12,11,12,16,22,...,47,68,74,64,71,72,72,67,62,58
2,12,15,20,24,27,30,42,48,49,57,...,6,6,12,17,15,10,9,9,8,7
3,9,8,7,7,8,9,15,19,29,34,...,123,54,10,10,10,7,6,7,7,18
4,114,112,112,112,112,108,116,118,118,115,...,65,62,59,53,49,44,37,36,37,42


In [9]:
# Constants
examples_per_label = 170  # default number of consecutive rows assigned to each label
num_splits = 5            # number of random train/test splits generated per configuration

def create_df(dataframe, num_labels, per_label=None):
    """Keep the leading rows of `dataframe` and build the matching label vector.

    Rows are assumed to be grouped by label in consecutive blocks of equal size:
    the first `per_label` rows get label 0, the next `per_label` rows label 1,
    and so on.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source rows; only the first ``num_labels * per_label`` rows are kept.
    num_labels : int
        Number of distinct labels to keep.
    per_label : int, optional
        Rows per label. Defaults to the module-level ``examples_per_label``.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The truncated frame and an integer label array of the same length.

    Raises
    ------
    ValueError
        If the truncated frame and the label array differ in length, e.g.
        because `dataframe` has too few rows or `num_labels` is negative.
    """
    if per_label is None:
        per_label = examples_per_label
    labels = np.repeat(np.arange(num_labels), per_label)
    truncated = dataframe.iloc[:num_labels * per_label]
    # Without this check a short frame (or a negative num_labels, which turns the
    # slice into "drop the last rows") would return rows and labels that do not align.
    if len(truncated) != len(labels):
        raise ValueError(
            f"expected {len(labels)} rows for {num_labels} labels x {per_label} "
            f"examples, but the dataframe slice has {len(truncated)}"
        )
    return truncated, labels

In [10]:
def normalize_data(truncated_dataframe):
    """Scale every row of the dataframe to unit Euclidean (L2) length.

    Returns a new DataFrame with the same column labels and a fresh default
    index. An AssertionError is raised if any resulting row does not have a
    norm close to 1.0.
    """
    raw_values = truncated_dataframe.values
    row_lengths = np.linalg.norm(raw_values, axis=1, keepdims=True)
    unit_rows = raw_values / row_lengths

    # Sanity check: after scaling, each row's norm must be (numerically) one.
    rescaled_lengths = np.linalg.norm(unit_rows, axis=1, keepdims=True)
    assert np.isclose(rescaled_lengths, 1.0).all()

    return pd.DataFrame(unit_rows, columns=truncated_dataframe.columns)

In [11]:
def create_data_splits(normalized_data_dataframe, num_labels, train_rows, test_rows, num_splits):
    """Build `num_splits` per-label train/test partitions of the dataframe.

    For each split index (also used as the random seed) the rows of every label
    in ``range(num_labels)`` are split separately with `train_test_split`, using
    `train_rows` as train_size and `test_rows` as test_size. The per-label pieces
    are then concatenated, shuffled with the same seed and re-indexed.

    The dataframe must contain a 'label' column. Returns a pair
    ``(train_dfs_list, test_dfs_list)``, each holding one DataFrame per split.
    """
    train_dfs_list, test_dfs_list = [], []
    for seed in range(num_splits):
        # One (train, test) pair per label, all drawn with the same seed.
        per_label_pairs = [
            train_test_split(
                normalized_data_dataframe.loc[normalized_data_dataframe['label'] == label].copy(),
                train_size=train_rows,
                test_size=test_rows,
                random_state=seed,
            )
            for label in range(num_labels)
        ]
        train_parts = [pair[0] for pair in per_label_pairs]
        test_parts = [pair[1] for pair in per_label_pairs]

        # Shuffle so rows are no longer grouped by label, then drop the old index.
        shuffled_train = pd.concat(train_parts).sample(frac=1.0, random_state=seed)
        shuffled_test = pd.concat(test_parts).sample(frac=1.0, random_state=seed)
        train_dfs_list.append(shuffled_train.reset_index(drop=True))
        test_dfs_list.append(shuffled_test.reset_index(drop=True))
    return train_dfs_list, test_dfs_list

In [12]:
# Experiment grid: number of labels kept, and (train rows, test rows) per label.
labels_list = [10,7,5]
train_test_pairs = [(150, 20), (100, 70)]

# Both lists are flat, ordered with labels in the outer loop and train/test pairs
# in the inner loop; each element is itself a list with one DataFrame per split.
train_dfs_all = []
test_dfs_all = []
for num_labels in labels_list:
    # The truncated, normalized and labelled frame depends only on num_labels,
    # so build it once per label count instead of once per train/test pair.
    truncated_df, labels = create_df(data, num_labels)
    normalized_data_df = normalize_data(truncated_df)
    normalized_data_df['label'] = labels
    for train_rows, test_rows in train_test_pairs:
        train_dfs_current_list, test_dfs_current_list = create_data_splits(normalized_data_df, num_labels, train_rows, test_rows, num_splits)
        train_dfs_all.append(train_dfs_current_list)
        test_dfs_all.append(test_dfs_current_list)

In [13]:
# Separate every split DataFrame into a feature matrix (all columns except 'label')
# and a label vector. The nesting is kept: outer list = configuration, inner = split.
X_train_all = []
y_train_all = []
X_test_all = []
y_test_all = []
for train_dfs_list, test_dfs_list in zip(train_dfs_all, test_dfs_all):
    X_train_all.append([split_df.drop(columns='label').values for split_df in train_dfs_list])
    y_train_all.append([split_df['label'].values for split_df in train_dfs_list])
    X_test_all.append([split_df.drop(columns='label').values for split_df in test_dfs_list])
    y_test_all.append([split_df['label'].values for split_df in test_dfs_list])

In [14]:
from knn import KNNClassifier

# Column order of the results table / CSV file.
result_columns = ["k_value", "distance_algo", "number_of_labels", "training_test_pair", "average_accuracy", "std_accuracy", "computation_time"]

# One (label count, (train rows, test rows)) entry per element of X_train_all, in the
# order the splits were generated: labels in the outer loop, train/test pairs inner.
# Deriving it from the lists avoids hard-coding how many train/test pairs exist.
experiment_configs = [(label_count, pair) for label_count in labels_list for pair in train_test_pairs]
assert len(experiment_configs) == len(X_train_all), "split lists do not match labels_list x train_test_pairs"

# Collect one record per run and build the DataFrame once at the end; growing a
# DataFrame with pd.concat inside the loop is quadratic and warns on empty frames.
result_records = []

for current_k in [3,5,7,9,11]:
    for distance in ['euclidean', 'manhattan', 'cosine']:
        knn_model = KNNClassifier(k=current_k, distance=distance)
        print(f'Results using k = {current_k} and distance measure of {distance}:')
        for (current_label, (current_training_examples, current_testing_values)), X_train_splits, y_train_splits, X_test_splits, y_test_splits in zip(experiment_configs, X_train_all, y_train_all, X_test_all, y_test_all):
            accuracy_list = []
            # perf_counter is a monotonic, high-resolution clock meant for measuring intervals.
            start_time = time.perf_counter()
            for X_train, y_train, X_test, y_test in zip(X_train_splits, y_train_splits, X_test_splits, y_test_splits):
                knn_model.fit(X_train, y_train)
                y_pred = knn_model.predict(X_test)
                accuracy_list.append(np.mean(y_pred == y_test))
            end_time = time.perf_counter()
            # Wall-clock time for fitting and predicting over all splits of this configuration.
            computation_time = end_time - start_time
            print(computation_time)
            average_accuracy = np.mean(accuracy_list)
            std_accuracy = np.std(accuracy_list)
            print(f'Using {current_label} labels, {current_training_examples} training examples and {current_testing_values} test cases PER SUBJECT over {len(accuracy_list)} random splits:')
            print(f'Average accuracy: {average_accuracy:.3f}                           Standard deviation over accuracy: {std_accuracy:.3f}')
            print('------------------------------------------------------------------------------------------')

            # Record the results of this configuration
            result_records.append({
                "k_value": current_k,
                "distance_algo": distance,
                "number_of_labels": current_label,
                "training_test_pair": f"{current_training_examples}_{current_testing_values}",
                "average_accuracy": average_accuracy,
                "std_accuracy": std_accuracy,
                "computation_time": computation_time
            })

        print('==========================================================================================')
    print('==========================================================================================')
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('==========================================================================================')

# Build the results table in one go and save it to a CSV file
results_df = pd.DataFrame(result_records, columns=result_columns)
results_df.to_csv("knn_results.csv", index=False)


Results using k = 3 and distance measure of euclidean:
12.034562826156616
Using 10 labels, 150 training examples and 20 test cases PER SUBJECT over 5 random splits:
Average accuracy: 0.964                           Standard deviation over accuracy: 0.028
------------------------------------------------------------------------------------------
