In [1]:
import csv
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.ensemble
import sklearn.preprocessing
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

In [2]:
#Importing the split data

# Every CSV is read with its first row as the header and its first column as
# the row index. Note that x_test / y_test hold the *validation* split
# (data/val/...), not a separate test set.
x_train, y_train, x_test, y_test = (
    pd.read_csv(csv_path, header=0, index_col=0)
    for csv_path in (
        "data/train/x_train.csv",
        "data/train/y_train.csv",
        "data/val/x_val.csv",
        "data/val/y_val.csv",
    )
)

In [3]:
#KNN Classifier Model
# 3-nearest-neighbour classifier; weights='distance' makes closer neighbours
# count more than distant ones when voting.
# NOTE(review): the features are used as loaded, with no scaling step in this
# notebook -- confirm they were scaled upstream, since KNN is distance-based.
train_labels = y_train.values.ravel()  # flatten the label frame to a 1-D array
classifier = KNeighborsClassifier(weights='distance', n_neighbors=3)
# Kept as the cell's last expression so the fitted estimator is displayed.
classifier.fit(x_train, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='distance')

In [4]:
#Returns the classifier's accuracy

def accuracy(classifier, features=None, labels=None) -> float:
    """Return the fraction of samples that ``classifier`` labels correctly.

    Args:
        classifier: Any fitted object exposing ``predict(X)`` that returns one
            label per row of ``X``.
        features: Samples to score. Defaults to the notebook-level ``x_test``
            so the original ``accuracy(classifier)`` call keeps working.
        labels: True labels for ``features`` (a 1-D sequence or a single-column
            table). Defaults to the notebook-level ``y_test``.

    Returns:
        Accuracy in ``[0, 1]`` as a plain Python float.

    Raises:
        ValueError: If there are no samples to score, or if the number of
            predictions does not match the number of labels.
    """
    if features is None:
        features = x_test
    if labels is None:
        labels = y_test

    # Flatten so a single-column DataFrame and a plain 1-D sequence compare
    # element-wise against the predictions in the same way.
    expected = np.asarray(labels).ravel()
    if expected.size == 0:
        raise ValueError("accuracy() needs at least one labelled sample")

    # One vectorised call instead of one predict() per row: much faster, and
    # the estimator receives the data in its original container (e.g. a
    # DataFrame with column names) rather than a bare list of values.
    predictions = np.asarray(classifier.predict(features)).ravel()
    if predictions.shape != expected.shape:
        raise ValueError(
            f"got {predictions.size} predictions for {expected.size} labels"
        )

    return float(np.mean(predictions == expected))

# Score the currently fitted model with the hand-written helper and report it.
holdout_accuracy = accuracy(classifier)
print(f"Classifier Accuracy: {holdout_accuracy}")

Classifier Accuracy: 0.9722222222222222


In [5]:
#Prediction

# Predict the whole validation split in a single call.
y_pred = classifier.predict(x_test)
# accuracy_score expects (y_true, y_pred). Accuracy happens to be symmetric,
# so the number is unchanged, but keeping the documented order matches the
# other metric calls in this notebook and avoids copy-paste bugs with
# asymmetric metrics such as F1.
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")

Accuracy: 0.9722222222222222


In [6]:
#hyperparameter tuning - testing k from 2 to 9 on the validation split

for k in range(2, 10):
    print("k = {}".format(k))

    # Use a loop-local estimator so the sweep does not silently replace the
    # notebook-level `classifier` with whatever k happened to run last.
    candidate = KNeighborsClassifier(n_neighbors=k, weights='distance')
    candidate.fit(x_train, y_train.values.ravel())
    y_pred = candidate.predict(x_test)
    # ROC AUC ranks continuous scores. Feeding it hard class predictions
    # collapses the curve to a single operating point, which makes it equal
    # to balanced accuracy. Column 1 of predict_proba is the probability of
    # candidate.classes_[1], i.e. the larger label.
    # NOTE(review): assumes binary labels with the larger label as the
    # positive class -- confirm against the label encoding.
    y_score = candidate.predict_proba(x_test)[:, 1]

    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("F1 Score:",metrics.f1_score(y_test, y_pred))
    print("BAS:", metrics.balanced_accuracy_score(y_test, y_pred))
    print("ROC:", metrics.roc_auc_score(y_test, y_score))
    print("\n")

k = 2
Accuracy: 0.9722222222222222
F1 Score: 0.888888888888889
BAS: 0.9
ROC: 0.9


k = 3
Accuracy: 0.9722222222222222
F1 Score: 0.888888888888889
BAS: 0.9
ROC: 0.9


k = 4
Accuracy: 0.9722222222222222
F1 Score: 0.888888888888889
BAS: 0.9
ROC: 0.9


k = 5
Accuracy: 0.9722222222222222
F1 Score: 0.888888888888889
BAS: 0.9
ROC: 0.9


k = 6
Accuracy: 0.9722222222222222
F1 Score: 0.888888888888889
BAS: 0.9
ROC: 0.9


k = 7
Accuracy: 0.9722222222222222
F1 Score: 0.888888888888889
BAS: 0.9
ROC: 0.9


k = 8
Accuracy: 0.9722222222222222
F1 Score: 0.888888888888889
BAS: 0.9
ROC: 0.9


k = 9
Accuracy: 0.9722222222222222
F1 Score: 0.888888888888889
BAS: 0.9
ROC: 0.9




In [7]:
#Training on all labelled data and predicting unlabelled cases

train_data = pd.read_csv('train_mean_features_labelled.csv',header=0, index_col=0)
# Boolean mask over the columns: True only for the 'label' column.
is_label = train_data.columns == 'label'
X_train = train_data.loc[:, ~is_label]  # every feature column
# NOTE(review): this rebinds `y_train`, which earlier held the labels of the
# train split only; re-running earlier cells after this one would pair
# `x_train` with the wrong labels.
y_train = train_data.loc[:, is_label]   # single-column frame of labels
X_test = pd.read_csv('test_mean_features.csv',header=0, index_col=0)

In [8]:
# Refit the same 3-NN, distance-weighted configuration on every labelled row
# (fit returns the estimator itself), then label the unlabelled samples.
classifier = KNeighborsClassifier(weights='distance', n_neighbors=3).fit(
    X_train, y_train.values.ravel()
)
predicted = classifier.predict(X_test)

In [9]:
# Summing the predictions counts positives only if labels are encoded as 0/1.
# NOTE(review): confirm the label encoding.
positive_count = sum(predicted)
total_count = len(predicted)
print(f"Number of positive cases identified in test data: {positive_count}")
print(f"Number of total cases in test data: {total_count}")

Number of positive cases identified in test data: 20
Number of total cases in test data: 180


In [10]:
# One row per test sample, one column per class (ordered as classifier.classes_).
class_probabilities = classifier.predict_proba(X_test)
print(class_probabilities)

[[1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.3319989  0.6680011 ]
 [1.         0.        ]
 [0.         1.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.         1.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [1.         0.        ]
 [0.         1.        ]


In [13]:
#Saving Prediction Results

# DataFrame.to_csv does not create missing parent folders, so make sure the
# output directory exists first (a fresh checkout may not contain it).
Path('data/results').mkdir(parents=True, exist_ok=True)

# Hard class predictions: one row per test sample, written without a header.
# NOTE(review): the first CSV column is the positional row number (0..n-1),
# not the index of X_test -- confirm that downstream consumers expect this.
df = pd.DataFrame(predicted)
df.to_csv('data/results/KNN_prediction.csv', index = True, header=False)

# Per-class probabilities, columns in the order of classifier.classes_.
df2 = pd.DataFrame(classifier.predict_proba(X_test))
df2.to_csv('data/results/KNN_prediction_prob_score.csv', index = True, header=False)