# K-Nearest Neighbourhood Classsification
Byju N Govindan

In [1]:
# import required libraries
import pandas as pd
import numpy as np
import math
import operator

from sklearn.model_selection import train_test_split , KFold
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV # Hyperparamter tuning
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score , precision_score , roc_auc_score ,roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore") #to remove unwanted warnings

In [2]:
# Iris dataset

from sklearn.datasets import load_iris

# import iris dataset
iris = load_iris()
# np.c_ is the numpy concatenate function
iris_df = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                      columns= iris['feature_names'] + ['target'])
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [3]:
#Describe the dataset
iris_df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [4]:
#Split into X and Y
#X= iris_df.iloc[:, :-1] # Extract all rows in all but last column in the df
#y= iris_df.iloc[:, -1]  # Extract all rows in last column in the df

X=iris_df.iloc[1:,:3]#features  # Extract all rows in all but last column in the df
y=iris_df.iloc[1:,4:]#class labels # Extract all rows in last column in the df


In [5]:
print(X.head())
y.head()

   sepal length (cm)  sepal width (cm)  petal length (cm)
1                4.9               3.0                1.4
2                4.7               3.2                1.3
3                4.6               3.1                1.5
4                5.0               3.6                1.4
5                5.4               3.9                1.7


Unnamed: 0,target
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0


In [6]:
#Divide the data into X_train,Y_train and X_test,Y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2,
                                                   shuffle= True, #shuffle the data to avoid bias
                                                   random_state= 42)
X_train= np.asarray(X_train)
y_train= np.asarray(y_train)

X_test= np.asarray(X_test)
y_test= np.asarray(y_test)

In [7]:
print(f'training set size: {X_train.shape[0]} samples \ntest set size: {X_test.shape[0]} samples')

training set size: 119 samples 
test set size: 30 samples


In [8]:
# Normalize the dataset
scaler= Normalizer().fit(X_train) # the scaler is fitted to the training set
normalized_X_train= scaler.transform(X_train) # the scaler is applied to the training set
normalized_X_test= scaler.transform(X_test) # the scaler is applied to the test set

In [9]:
print('X train before Normalization')
print(X_train[0:5])
print('\nX train after Normalization')
print(normalized_X_train[0:5])

X train before Normalization
[[5.1 3.3 1.7]
 [5.4 3.9 1.3]
 [5.6 3.  4.5]
 [4.8 3.  1.4]
 [5.  3.5 1.6]]

X train after Normalization
[[0.80850592 0.52315089 0.26950197]
 [0.79566782 0.57464898 0.19154966]
 [0.71930965 0.38534445 0.57801668]
 [0.82319321 0.51449576 0.24009802]
 [0.79245373 0.55471761 0.25358519]]


In [10]:
print('X test before Normalization')
print(X_test[0:5])
print('\nX test after Normalization')
print(normalized_X_test[0:5])

X test before Normalization
[[6.4 2.9 4.3]
 [5.1 3.8 1.5]
 [7.7 2.6 6.9]
 [5.7 2.6 3.5]
 [6.7 3.  5. ]]

X test after Normalization
[[0.77691418 0.35203924 0.52198921]
 [0.78047004 0.5815267  0.22955001]
 [0.72224892 0.24387626 0.64721007]
 [0.79427564 0.36230117 0.48771311]
 [0.75433425 0.3377616  0.562936  ]]


In [11]:
# knn=KNeighborsClassifier(K)
# knn.fit(normalized_X_train, y_train)
# y_pred_sklearn= knn.predict(normalized_X_test)
# print(y_pred_sklearn)

In [12]:
#Using Grid search cv to find the optimal K(n_neighbors) on train data that is not normalised
neigh = KNeighborsClassifier()
parameters = {'n_neighbors':(1,3,5,7,9,11)}
gs0_clf = GridSearchCV(neigh, parameters, cv = 5)
gs0_clf.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': (1, 3, 5, 7, 9, 11)})

In [13]:
#Using Grid search cv to find the optimal K(n_neighbors) on train data that is normalised
neigh = KNeighborsClassifier()
parameters = {'n_neighbors':(1,3,5,7,9,11)}
gs1_clf = GridSearchCV(neigh, parameters, cv = 5)
gs1_clf.fit(normalized_X_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': (1, 3, 5, 7, 9, 11)})

In [14]:
#To select optimal K (based on K-NN model fit to not normalized train data)
optimal_k0 = gs0_clf.best_params_.get('n_neighbors')
print(optimal_k0)


9


In [15]:
#To select optimal K (based on K-NN model fit to normalized train data)
optimal_k1 = gs1_clf.best_params_.get('n_neighbors')
print(optimal_k1)

# Model fit to normaised train data suggests K= 5 which is very lower than the alternative model considered

5


In [16]:
#Use the obtained optimal_k  (optimal_k1) to train our model
knn_final = KNeighborsClassifier(n_neighbors = optimal_k1)
knn_final.fit(normalized_X_train,y_train)
predictions_test = knn_final.predict(normalized_X_test)
predictions_train = knn_final.predict(normalized_X_train)
predictions_test

array([1., 0., 2., 1., 1., 0., 1., 1., 1., 2., 1., 0., 0., 0., 0., 1., 2.,
       2., 1., 2., 0., 2., 0., 2., 2., 2., 2., 2., 0., 0.])

In [17]:
#The test and train accuracy
test_acc = accuracy_score(y_test, predictions_test)*100
train_acc = accuracy_score(y_train, predictions_train)*100

In [18]:
test_acc

86.66666666666667

In [19]:
train_acc

96.63865546218487

In [20]:
confusion_matrix(y_test, predictions_test)

array([[10,  0,  0],
       [ 0,  7,  2],
       [ 0,  2,  9]], dtype=int64)

In [21]:
print(classification_report(y_test, predictions_test))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        10
         1.0       0.78      0.78      0.78         9
         2.0       0.82      0.82      0.82        11

    accuracy                           0.87        30
   macro avg       0.87      0.87      0.87        30
weighted avg       0.87      0.87      0.87        30



In [22]:
#nOW, Lets try the Brute force approach to implement KNN

In [23]:
# Run with Cross Validation: First lets try brute force on train data that is not normalised

cv_scores = []
neighbors = list(np.arange(3,50,2))
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors = n,algorithm = 'brute')
    
    cross_val = cross_val_score(knn,X_train,y_train,cv = 5 , scoring = 'accuracy')
    cv_scores.append(cross_val.mean())
    
error = [1-x for x in cv_scores]
optimal_n = neighbors[error.index(min(error)) ]
knn_optimal = KNeighborsClassifier(n_neighbors = optimal_n,algorithm = 'brute')
knn_optimal.fit(X_train,y_train)
pred = knn_optimal.predict(X_test)
acc = accuracy_score(y_test,pred)*100
print("The accuracy for optimal k = {0} using brute is {1}".format(optimal_n,acc))

The accuracy for optimal k = 3 using brute is 90.0


In [29]:
# Run with Cross Validation: First lets try brute force on NORMALISED train data

cv_scores = []
neighbors = list(np.arange(3,50,2))
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors = n,algorithm = 'brute')
    
    cross_val = cross_val_score(knn,normalized_X_train,y_train,cv = 5 , scoring = 'accuracy')
    cv_scores.append(cross_val.mean())
    
error = [1-x for x in cv_scores]
optimal_n = neighbors[error.index(min(error)) ]
#print("Error: ", error)
#print("Minimum Error: ", min(error))
#print("Neighbor: ", neighbors)
#print("Error index of minimum error: ", error.index(min(error)))
#print("neighbors value for the Error index of minimum error: ", optimal_n)
knn_optimal = KNeighborsClassifier(n_neighbors = optimal_n,algorithm = 'brute')
knn_optimal.fit(normalized_X_train,y_train)
pred = knn_optimal.predict(normalized_X_test)
acc = accuracy_score(y_test,pred)*100
print("The accuracy for optimal k = {0} using brute is {1}".format(optimal_n,acc))

Error:  [0.04202898550724643, 0.025362318840579712, 0.025362318840579712, 0.033695652173913127, 0.033695652173913127, 0.04202898550724643, 0.04202898550724643, 0.033695652173913127, 0.025362318840579712, 0.033695652173913127, 0.025362318840579712, 0.033695652173913127, 0.033695652173913127, 0.04202898550724632, 0.033695652173913127, 0.04202898550724632, 0.04202898550724632, 0.04202898550724632, 0.04202898550724632, 0.04202898550724632, 0.033695652173913127, 0.033695652173913127, 0.033695652173913127, 0.033695652173913127]
Minimum Error:  0.025362318840579712
Neighbor:  [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49]
Error index of minimum error:  1
neighbors value for the Error index of minimum error:  5
The accuracy for optimal k = 5 using brute is 86.66666666666667


In [30]:
# print(np.array_equal(y_test,pred)) #Accuracy is 86.67% only, and thus predictions for train data do not match well with the observed labels

False


# Now Lets change the train to test data ratio and rerun the models
## for non-normalised as well as normalised data:


In [50]:

#from sklearn.datasets import load_iris
iris = load_iris()
# Storing the data and labels into "X" and "y" varaibles
X = iris.data
y = iris.target
# from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle= True, random_state=42)
#from sklearn.neighbors import KNeighborsClassifier

# We “assumed” k(the number of neighbors i.e. n_neighbors)  = 3. It can also be 5, 7 … 10
model = KNeighborsClassifier(n_neighbors=3)
# Training or fitting the model with the train data
model.fit(X_train,y_train)
model.predict(X_test)
model.score(X_test,y_test)

0.98

In [51]:
#Using Grid search cv to find the optimal K(n_neighbors) on train data that is not normalised
neigh00 = KNeighborsClassifier()
parameters = {'n_neighbors':(1,3,5,7,9,11)}
gs00_clf = GridSearchCV(neigh00, parameters, cv = 5)
gs00_clf.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': (1, 3, 5, 7, 9, 11)})

In [52]:
#To select optimal K (based on K-NN model fit to not normalized train data)
optimal_k00 = gs00_clf.best_params_.get('n_neighbors')
print(optimal_k00)

1


In [54]:
#Use the obtained optimal_k to train our model
knn_final00 = KNeighborsClassifier(n_neighbors = optimal_k00)
knn_final00.fit(X_train,y_train)
predictions_test00 = knn_final00.predict(X_test)
predictions_train00 = knn_final00.predict(X_train)
predictions_test00

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 1, 2, 1, 2])

In [56]:
#The test and train accuracy
test_acc00 = accuracy_score(y_test, predictions_test00)*100
train_acc00 = accuracy_score(y_train, predictions_train00)*100

In [58]:
test_acc00  # Ran for learning purpose only' Even with K=1, it gives 98% accuracy. 
# But what purpose does a K-NN model with K=1 serve?
# For a very low value of k (suppose k=1), the model overfits on the training data,
# which leads to a high error rate on the validation set.
#On the other hand, for a high value of k, the model performs poorly on both train and validation set.

98.0

In [64]:
print(classification_report(y_test, predictions_test00))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.94      1.00      0.97        15
           2       1.00      0.94      0.97        16

    accuracy                           0.98        50
   macro avg       0.98      0.98      0.98        50
weighted avg       0.98      0.98      0.98        50



In [69]:
# Run with Cross Validation: First lets try brute force on train data that is not normalised

cv_scores = []
neighbors = list(np.arange(3,50,2))
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors = n,algorithm = 'brute')
    
    cross_val = cross_val_score(knn,X_train,y_train,cv = 5 , scoring = 'accuracy')
    cv_scores.append(cross_val.mean())
    
error = [1-x for x in cv_scores]
optimal_n = neighbors[error.index(min(error)) ]
knn_optimal = KNeighborsClassifier(n_neighbors = optimal_n,algorithm = 'brute')
knn_optimal.fit(X_train,y_train)
pred = knn_optimal.predict(X_test)
acc = accuracy_score(y_test,pred)*100
print("The accuracy for optimal k = {0} using brute is {1}".format(optimal_n,acc))

The accuracy for optimal k = 5 using brute is 98.0


In [59]:
#Lets normalise the data and rerun the knn model with GridSearchCV

# Normalize the dataset
scaler= Normalizer().fit(X_train) # the scaler is fitted to the training set
normalized_X_train= scaler.transform(X_train) # the scaler is applied to the training set
normalized_X_test= scaler.transform(X_test) # the scaler is applied to the test set

In [60]:
#Using Grid search cv to find the optimal K(n_neighbors) on train data that is normalised
neigh01 = KNeighborsClassifier()
parameters = {'n_neighbors':(1,3,5,7,9,11)}
gs01_clf = GridSearchCV(neigh01, parameters, cv = 5)
gs01_clf.fit(normalized_X_train,y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': (1, 3, 5, 7, 9, 11)})

In [61]:
#To select optimal K (based on K-NN model fit to normalized train data)
optimal_k01 = gs01_clf.best_params_.get('n_neighbors')
print(optimal_k01)


9


In [66]:
#Use the obtained optimal_k to train our model
knn_final01 = KNeighborsClassifier(n_neighbors = optimal_k01)
knn_final01.fit(normalized_X_train,y_train)
predictions_test01 = knn_final01.predict(normalized_X_test)
predictions_train01 = knn_final01.predict(normalized_X_train)
predictions_test01

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 1, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0, 1, 1, 2, 1, 2])

In [67]:
#The test and train accuracy
test_acc01 = accuracy_score(y_test, predictions_test01)*100
train_acc01 = accuracy_score(y_train, predictions_train01)*100

test_acc01

96.0

In [68]:
print(classification_report(y_test, predictions_test01))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       0.88      1.00      0.94        15
           2       1.00      0.88      0.93        16

    accuracy                           0.96        50
   macro avg       0.96      0.96      0.96        50
weighted avg       0.96      0.96      0.96        50



In [70]:
# Run with Cross Validation: First lets try brute force on NORMALISED train data

cv_scores = []
neighbors = list(np.arange(3,50,2))
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors = n,algorithm = 'brute')
    
    cross_val = cross_val_score(knn,normalized_X_train,y_train,cv = 5 , scoring = 'accuracy')
    cv_scores.append(cross_val.mean())
    
error = [1-x for x in cv_scores]
optimal_n = neighbors[error.index(min(error)) ]
#print("Error: ", error)
#print("Minimum Error: ", min(error))
#print("Neighbor: ", neighbors)
#print("Error index of minimum error: ", error.index(min(error)))
#print("neighbors value for the Error index of minimum error: ", optimal_n)
knn_optimal = KNeighborsClassifier(n_neighbors = optimal_n,algorithm = 'brute')
knn_optimal.fit(normalized_X_train,y_train)
pred = knn_optimal.predict(normalized_X_test)
acc = accuracy_score(y_test,pred)*100
print("The accuracy for optimal k = {0} using brute is {1}".format(optimal_n,acc))

Error:  [0.040000000000000036, 0.040000000000000036, 0.050000000000000044, 0.030000000000000027, 0.030000000000000027, 0.030000000000000027, 0.030000000000000027, 0.030000000000000027, 0.030000000000000027, 0.030000000000000027, 0.030000000000000027, 0.040000000000000036, 0.040000000000000036, 0.040000000000000036, 0.050000000000000044, 0.050000000000000044, 0.050000000000000044, 0.050000000000000044, 0.050000000000000044, 0.050000000000000044, 0.050000000000000044, 0.040000000000000036, 0.030000000000000027, 0.10999999999999999]
Minimum Error:  0.030000000000000027
Neighbor:  [3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49]
Error index of minimum error:  3
neighbors value for the Error index of minimum error:  9
The accuracy for optimal k = 9 using brute is 96.0


# K-fold Cross Validation

In [89]:
# Instantiating the K-Fold cross validation object with  K = 5 folds 
# Donot confuse this K (cross validation folds) with optimal k (the optimal_n) in above script[s])

Val_acc_scores = []
k_folds = KFold(n_splits = 5, shuffle = True, random_state = 42) # Iterating through each of the folds in K-Fold
for train_index, val_index in k_folds.split(X):
    
    # Splitting the training set from the validation set for this specific fold
    X_train, X_val = X[train_index, :], X[val_index, :]  # X is not pandas dataframe and hence X.iloc not needed
    y_train, y_val = y[train_index], y[val_index] #  # y is not pandas dataframe and hence X.iloc not needed
    
    # Instantiating a KNeighborsClassifier model
    knn_model = KNeighborsClassifier(n_neighbors = 9) # we choose n_neighbors = 9 (optimal_n) from the above model
    
    # Fitting the X_train and y_train datasets to the RandomForestClassifier model
    knn_model.fit(X_train, y_train)
    
    # Getting inferential predictions for the validation dataset
    val_preds = knn_model.predict(X_val)
    
    # Generating validation metrics by comparing the inferential predictions (val_preds) to the actuals (y_val)
    val_accuracy = accuracy_score(y_val, val_preds)
    val_confusion_matrix = confusion_matrix(y_val, val_preds)
    
    Val_acc_scores.append(val_accuracy)
    
    # Printing out the validation metrics
    print(f'Accuracy Score: {val_accuracy}')
    print(f'Confusion Matrix: \n{val_confusion_matrix}')

print("  ")
print(f'Avg. of the Validation Accuracy Score: {np.mean(Val_acc_scores)}')

Accuracy Score: 1.0
Confusion Matrix: 
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
Accuracy Score: 0.9666666666666667
Confusion Matrix: 
[[13  0  0]
 [ 0 10  0]
 [ 0  1  6]]
Accuracy Score: 0.9666666666666667
Confusion Matrix: 
[[12  0  0]
 [ 0 10  0]
 [ 0  1  7]]
Accuracy Score: 0.9333333333333333
Confusion Matrix: 
[[ 8  0  0]
 [ 0  8  2]
 [ 0  0 12]]
Accuracy Score: 0.9666666666666667
Confusion Matrix: 
[[ 7  0  0]
 [ 0 11  0]
 [ 0  1 11]]
  
Avg. of the Validation Accuracy Score: 0.9666666666666668


In [None]:
# Additional References:

In [25]:
#Reference #1: An implementation that focuses on hyperparameter tuning for kNN using the Iris dataset 
# implemented from scratch with no dependencies on existing python data science libraries.
# https://towardsdatascience.com/k-nn-on-iris-dataset-3b827f2591e

In [None]:
#Reference #2: https://medium.com/@avulurivenkatasaireddy/k-nearest-neighbors-and-implementation-on-iris-data-set-f5817dd33711

In [None]:
#Reference #3: https://deepnote.com/@ndungu/Implementing-KNN-Algorithm-on-the-Iris-Dataset-e7c16493-500c-4248-be54-9389de603f16