## K-nearest neighbours (KNN)

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, max_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [2]:
# getting the dataset
qsar = pd.read_csv('biodeg.csv', sep=';')
qsar = qsar.drop("Unnamed: 42", axis=1)  # removing the unnamed column if you have, no means, comment it out
#USE THIS COMMAND IF THERE IS AN UNNAMED 42 COLUMN
qsar.sample(3)

Unnamed: 0,SpMax_L,J_Dz(e),nHM,F01[N-N],F04[C-N],NssssC,nCb-,C%,nCp,nO,...,C-026,F02[C-N],nHDon,SpMax_B(m),Psi_i_A,nN,SM6_B(m),nArCOOR,nX,experimental_class
972,5.119,2.6427,0,0,0,0,4,60.0,0,0,...,0,0,0,3.976,1.889,0,8.661,0,0,NRB
616,4.675,4.5744,0,0,0,0,0,27.3,0,6,...,0,3,3,3.488,3.885,1,8.405,0,0,NRB
229,4.303,3.0634,0,0,0,0,0,21.4,2,3,...,0,0,3,3.102,3.722,0,6.978,0,0,RB


In [3]:
# changing the str label into 1s and 0s
qsar['experimental_class'] = qsar['experimental_class'].astype('category')
encode_map = {
    'RB': 1,
    'NRB': 0
}
qsar['experimental_class'].replace(encode_map, inplace=True)
# 1 reprsents RB and O represents NRB [changed it into a binary classification problem now]

In [4]:
qsar.isnull().values.any()  # checking if there is any missing value in data frame
qsar.isnull().sum().sum()  # sums of missing value in the data frame

0

In [5]:
# setting up the target variable and the feature variable
y = qsar['experimental_class']
x = qsar.drop(columns='experimental_class')

In [6]:
scaler = StandardScaler()
x_scaler = scaler.fit_transform(x)
# print(x_scaler)

In [7]:
### Splitting the dataset before conducting SFS [forward and backward]
seed_num = 80
# creating set for SFS usage, the remaining set is for testing the model
x_sfs, x_not_sfs, y_sfs, y_not_sfs = train_test_split(x_scaler, y, test_size=0.3, random_state=seed_num)

print(x_sfs.shape)
print(x_not_sfs.shape)
print(y_sfs.shape)
print(y_not_sfs.shape)

# will be using x_sfs and y_sfs for SFS training and for training the model itself
# the x_not_sfs and y_not_sfs is not used for the SFS training, it will be used for testing the model

(738, 41)
(317, 41)
(738,)
(317,)


In [8]:
# Initilaise the knn model
model_knn = KNeighborsClassifier()
model_knn.fit(x_sfs, y_sfs)
#Determine the number of features to be selected
fs = 5  #The features that is used for the selection
direction = "forward"

In [9]:
# feature selection is done here.
# the direction must be set manually by the user

sfs = SequentialFeatureSelector(model_knn, direction=direction, n_features_to_select=fs, cv=5)
x_after_sfs = sfs.fit_transform(x_sfs, y_sfs)  # x_forward holds the selected features [forward sfs]
selected_feature = x.columns[sfs.get_support(indices=True)]
print(selected_feature)

Index(['F04[C-N]', 'F03[C-O]', 'SdO', 'nCrt', 'SpMax_B(m)'], dtype='object')


In [10]:
grid_params = {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'],'metric': ['euclidean', 'manhattan']}
gsc = GridSearchCV(model_knn, grid_params, cv=3, n_jobs=-1)
result = gsc.fit(x_after_sfs, y_sfs)

In [11]:
print("feature Selection: ", fs)
print("The best parameters : " + str(result.best_params_))
# print("The best index : " + str(gsc.best_index_))
# print(gsc.best_estimator_)
print("Training score : " + str(result.best_score_) + "\n")

feature Selection:  5
The best parameters : {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'uniform'}
Training score : 0.8428184281842818



In [12]:
# based on sfs
# updating the test set, 'x_test_' according to 'sfs' selection
# assign 'x_not_sfs' to 'x_test_' --> to be used for testing later
x_test_ = x_not_sfs
x_test_ = sfs.transform(x_test_)
y_train = y_sfs
y_test = y_not_sfs

In [13]:
metric = input("Enter the metric : ")
weight = input("Enter the weight : ")
neighbours = input("Enter the neigbours : ")
print()
neighbours = int(neighbours)

Enter the metric : euclidean
Enter the weight : uniform
Enter the neigbours : 5



In [14]:
m_knn = KNeighborsClassifier(n_neighbors=neighbours, weights=weight, metric=metric)
m_knn.fit(x_after_sfs, y_train)
y_train_pred = m_knn.predict(x_after_sfs)  # training set ,"x_after_sfs_" is used for prediction
print("Training set ")
print("Mean absolute error : " + str(mean_absolute_error(y_train, y_train_pred)))
print("Mean squared error : " + str(mean_squared_error(y_train, y_train_pred)))
print("r2 score : " + str(r2_score(y_train, y_train_pred)))
print("The max error value : " + str(max_error(y_train, y_train_pred)) + "\n")
print("accuracy score: ", accuracy_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

y_pred = m_knn.predict(x_test_)  # testing set ,"x_test_" is used for prediction
print()
print("Testing set ")
print("Mean absolute error : " + str(mean_absolute_error(y_test, y_pred)))
print("Mean squared error : " + str(mean_squared_error(y_test, y_pred)))
print("r2 score : " + str(r2_score(y_test, y_pred)))
print("The max error value : " + str(max_error(y_test, y_pred)) + "\n")
print("accuracy score: ", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Training set 
Mean absolute error : 0.11653116531165311
Mean squared error : 0.11653116531165311
r2 score : 0.4787493532411857
The max error value : 1

accuracy score:  0.8834688346883469
[[448  41]
 [ 45 204]]
              precision    recall  f1-score   support

           0       0.91      0.92      0.91       489
           1       0.83      0.82      0.83       249

    accuracy                           0.88       738
   macro avg       0.87      0.87      0.87       738
weighted avg       0.88      0.88      0.88       738


Testing set 
Mean absolute error : 0.1640378548895899
Mean squared error : 0.1640378548895899
r2 score : 0.2663996439697375
The max error value : 1

accuracy score:  0.8359621451104101
[[185  25]
 [ 27  80]]
              precision    recall  f1-score   support

           0       0.87      0.88      0.88       210
           1       0.76      0.75      0.75       107

    accuracy                           0.84       317
   macro avg       0.82      0.81  