Import all modules needed for this notebook

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# The below suppresses all warnings in the notebook
# Only leave this uncommented for display purposes
import warnings
warnings.filterwarnings("ignore")

The below creates a binary classification dataset by creating 3D gaussians and drawing sample points from them

In [2]:
# Mean and covariance for Class 0
mean0 = [0, 0, 0]
cov0 = [[2550, 2000, 1500], [2000, 1500, 1200], [1500, 1200, 1900]]  

# Number of datapoints for class 0
m0 = 100

x0_1, x0_2, x0_3 = np.random.multivariate_normal(mean0, cov0, m0).T

# Concatenate the 3 dimensions of each feature to create the data matrix for class 0 
X0 = np.concatenate((x0_1.reshape(-1, 1), x0_2.reshape(-1, 1), x0_3.reshape(-1, 1)), axis=1)

# Create the target vector for class 0 (target is coded with zero)
X0_target = np.zeros((m0,), dtype=np.int).reshape(-1, 1)

# Mean and covariance for Class 1
mean1 = [3, 3, 3]
cov1 = [[2550, 2000, 1500], [2000, 1500, 1200], [1500, 1200, 1900]] 

# Number of datapoints for class 1
m1 = 100

# Generate class 1 data points from a multivariate (3D) Gaussian distribution
#    Here x1_1, x1_2 and x1_3 are 2 dimensions for each data (feature) point
x1_1, x1_2, x1_3 = np.random.multivariate_normal(mean1, cov1, m1).T

# Concatenate the 3 dimensions of each feature to create the data matrix for class 1
X1 = np.concatenate((x1_1.reshape(-1, 1), x1_2.reshape(-1, 1), x1_3.reshape(-1, 1)), axis=1)

# Create the target vector for class 1 (target is coded with one)
X1_target = np.ones((m1,), dtype=np.int).reshape(-1, 1)

#  Class 0 and 1 data are combined to create a single data matrix X
X = np.append(X0, X1, axis=0)

# Target values for class 0 & 1 are combined to create a single target vector
y = np.concatenate((X0_target, X1_target), axis=0)

Note that for the above-created distributions the mean is the same for all.<br>
In addition the covariance is not too different either.<br>
What this will do is provide a situation in which our approach of KNN fails badly, as will be seen below.

We first partition our data into testing and training

In [3]:
test_frac = 0.2

# Create training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_frac, random_state=0)
# Note y correspond to the target vector

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(160, 3)
(40, 3)
(160, 1)
(40, 1)


The below box standardizes our feature vectors.<br>
To test the impact of not standardizing these feature vectors this box could be left uncommented.

In [4]:
X_train = scale(X_train)
X_test = scale(X_test)

Now perform cross-validation to determine the optimal hyperparameters

In [5]:
param_grid = {'n_neighbors': np.arange(1,37, 2), 'p': [1, 2, 5,  10, 20, 30, 50, 100], 'weights': ["uniform", "distance"]}

knn = KNeighborsClassifier()

knn_cv = GridSearchCV(knn, param_grid, scoring='f1', cv=3)
knn_cv.fit(X_train, y_train)

print("Best Score: %f" % knn_cv.best_score_)
print("Optimal Hyperparameter Values: ", knn_cv.best_params_)

Best Score: 0.561382
Optimal Hyperparameter Values:  {'n_neighbors': 5, 'p': 5, 'weights': 'uniform'}


We now keep the optimal hyperparameters values, as indicated above, and use these to fit our model to the training data.

In [6]:
optimal_neighbors = 7
optimal_p = 20
optimal_weight = "uniform"

knn = KNeighborsClassifier(weights=optimal_weight, algorithm='auto', n_neighbors=optimal_neighbors, p=optimal_p)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=20,
           weights='uniform')

We now evaluate our training data using the below error metrics. For this we chose to keep the default decision threshold (0.5).

In [7]:
print('The below are the performance measures using the testing dataset:')

thresh = 0.5 # Chosen optimal threshold 

y_test_predicted = (knn.predict_proba(X_test)[:,1] >= thresh).astype(bool)

accuracy = accuracy_score(y_test, y_test_predicted) 
print("\nAccuracy (Threshold %.2f) = %f" % (thresh, accuracy))

precision = precision_score(y_test, y_test_predicted) 
print("Precision (Threshold %.2f) = %f" % (thresh, precision))

recall = recall_score(y_test, y_test_predicted)
print("Recall (Threshold %.2f) = %f" % (thresh, recall))

f1 = f1_score(y_test, y_test_predicted)
print("F1 Score = (Threshold %.2f) = %f" % (thresh, f1))

The below are the performance measures using the testing dataset:

Accuracy (Threshold 0.50) = 0.550000
Precision (Threshold 0.50) = 0.583333
Recall (Threshold 0.50) = 0.636364
F1 Score = (Threshold 0.50) = 0.608696


As expected, these error metrics indicate that our model has failed.<br>
This was expected as KNN requires us to find a clear decision boundary, which does not work well if the classes are too interrelated.

We now see how well these parameters work by using the testing dataset.

In [8]:
# Use the Mahalanobis distance metric
knn = 15; # Number of nearest neighbors to use
weight_fun = "distance" # Weight type to use

# Calculate the Covariance Matrix for the training features
covarianceMatrix = np.cov(X_train)
# Calculate the inverse of the Covariance Matrix for the training features

invCovarianceMatrix = np.linalg.inv(covarianceMatrix) # Use if it does not give an error
# The below will employ a pseudo-inverse in the case that it is not possible to calculate the exact inverse
# invCovarianceMatrix = np.linalg.pinv(covarianceMatrix) # Use if above gives an error

# Using best hyperparameter values found above we find the optimum model
knn = KNeighborsClassifier(weights=weight_fun, algorithm='brute',
n_neighbors=knn, metric = "mahalanobis", metric_params={'V':
covarianceMatrix, 'VI':invCovarianceMatrix})

# Fit the model
knn.fit(X_train, y_train)

print('The below are the performance measures using the testing dataset:')

thresh = 0.5 # Chosen optimal threshold 

y_test_predicted = (knn.predict_proba(X_test)[:,1] >= thresh).astype(bool)

accuracy = accuracy_score(y_test, y_test_predicted) 
print("\nAccuracy (Threshold %.2f) = %f" % (thresh, accuracy))

precision = precision_score(y_test, y_test_predicted) 
print("Precision (Threshold %.2f) = %f" % (thresh, precision))

recall = recall_score(y_test, y_test_predicted)
print("Recall (Threshold %.2f) = %f" % (thresh, recall))

f1 = f1_score(y_test, y_test_predicted)
print("F1 Score = (Threshold %.2f) = %f" % (thresh, f1))

The below are the performance measures using the testing dataset:

Accuracy (Threshold 0.50) = 0.450000
Precision (Threshold 0.50) = 0.000000
Recall (Threshold 0.50) = 0.000000
F1 Score = (Threshold 0.50) = 0.000000
