In [67]:
# Import libraries required for creating models and validating them
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from time import time
from sklearn import metrics as mt
from IPython.html import widgets 
import warnings
warnings.filterwarnings('ignore')

In [68]:
# Load the credit-card data set from the CSV file in the working directory.
ccdefault = pd.read_csv(filepath_or_buffer='./UCI_Credit_Card.csv')

In [69]:
# Give the target column a short name and rename PAY_0 to PAY_1.
# Both renames touch different columns, so one call covers them.
clearer_names = {
    'default.payment.next.month': 'DEFAULT',
    'PAY_0': 'PAY_1',
}
ccdefault.rename(columns=clearer_names, inplace=True)

In [70]:
#Cleaning of data
# Separate the target vector `y` from the feature matrix `X`, then draw
# stratified shuffle splits for training and testing.

# Fail loudly when the target column is missing. A silent skip here would leave
# `y` and `X` unassigned, so the cell would either die later with a NameError
# or quietly reuse stale values left in the kernel by an earlier run.
if 'DEFAULT' not in ccdefault:
    raise KeyError(
        "Column 'DEFAULT' not found in ccdefault; run the column-rename cell first."
    )

y = ccdefault['DEFAULT'].values

# Remove the target itself plus the six bill-amount columns from the features.
# drop() returns a new frame, so ccdefault stays untouched and this cell can be
# re-run safely.
dropped_columns = ['DEFAULT'] + ['BILL_AMT%d' % month for month in range(1, 7)]
cc = ccdefault.drop(columns=dropped_columns)
# NOTE(review): every remaining column becomes a feature. If the CSV carries a
# row-identifier column it is included here as well -- confirm this is intended.
X = cc.values

num_cv_iterations = 5
num_instances = len(y)
# 80/20 stratified shuffle splits; the fixed random_state makes them repeatable.
cv_object = StratifiedShuffleSplit(
    n_splits=num_cv_iterations,
    test_size=0.20,
    train_size=0.80,
    random_state=10,
)
print(cv_object)

# Each iteration overwrites the train/test arrays, so only the LAST split is
# still bound to X_train / X_test / y_train / y_test once the loop finishes.
for train_index, test_index in cv_object.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

StratifiedShuffleSplit(n_splits=5, random_state=10, test_size=0.2,
            train_size=0.8)
TRAIN: [26296   633  2863 ... 15141 19066  7166] TEST: [ 5469 17052 23411 ... 24675 28588 16031]
TRAIN: [17006  5992 28816 ... 10798 22887 23796] TEST: [  683 10937   999 ...   875  4336  2721]
TRAIN: [ 5957 25994 14047 ...    53   805 24297] TEST: [14399 23807  6988 ...  9804 17546 23927]
TRAIN: [11516  8559 14297 ... 14603 13364    89] TEST: [13759 19809 18958 ... 27816 11181 16164]
TRAIN: [ 2233 17193  9866 ... 12371  9657 11083] TEST: [28935 15053 22533 ... 22298  4357 23349]


In [71]:
# Standardise the features so that no explanatory variable dominates the
# distance computation just because of its numeric range.
# The scaler is fitted on the training rows only; fit() returns the scaler itself.
scl_obj = StandardScaler().fit(X_train)

# Apply the statistics fitted on the training rows to both partitions, so the
# test set values never influence the scaling.
X_train_scaled = scl_obj.transform(X_train)
X_test_scaled = scl_obj.transform(X_test)

In [91]:
# KNN with the Euclidean distance metric, evaluated for several neighbourhood sizes.

parameters = [10, 25, 50, 100]
for K in parameters:
    print("For k = ", K, "and metric = Euclidean: ")

    # fit() returns the estimator, so construction and training are chained.
    knn = KNeighborsClassifier(
        n_neighbors=K,
        weights='uniform',
        metric='euclidean',
    ).fit(X_train_scaled, y_train)
    y_hat = knn.predict(X_test_scaled)

    # Report accuracy, confusion matrix and per-class metrics on the test rows.
    acc = mt.accuracy_score(y_test, y_hat)
    print('accuracy:', acc )
    conf = mt.confusion_matrix(y_test, y_hat)
    print(conf )
    ClassReport = mt.classification_report(y_test, y_hat)
    print(ClassReport)

For k =  10 and metric = Euclidean: 
accuracy: 0.7695
[[4602   71]
 [1312   15]]
              precision    recall  f1-score   support

           0       0.78      0.98      0.87      4673
           1       0.17      0.01      0.02      1327

    accuracy                           0.77      6000
   macro avg       0.48      0.50      0.45      6000
weighted avg       0.64      0.77      0.68      6000

For k =  25 and metric = Euclidean: 
accuracy: 0.7781666666666667
[[4667    6]
 [1325    2]]
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      4673
           1       0.25      0.00      0.00      1327

    accuracy                           0.78      6000
   macro avg       0.51      0.50      0.44      6000
weighted avg       0.66      0.78      0.68      6000

For k =  50 and metric = Euclidean: 
accuracy: 0.7788333333333334
[[4673    0]
 [1327    0]]
              precision    recall  f1-score   support

           0       0.78 

In [93]:
# KNN with the cosine distance metric, evaluated for several neighbourhood sizes.

parameters = [10, 25, 50, 100]
for K in parameters:
    print("For k = ", K, "and metric = Cosine: ")

    # fit() returns the estimator, so construction and training are chained.
    knn = KNeighborsClassifier(
        n_neighbors=K,
        weights='uniform',
        metric='cosine',
    ).fit(X_train_scaled, y_train)
    y_hat = knn.predict(X_test_scaled)

    # Report accuracy, confusion matrix and per-class metrics on the test rows.
    acc = mt.accuracy_score(y_test, y_hat)
    print('accuracy:', acc )
    conf = mt.confusion_matrix(y_test, y_hat)
    print(conf )
    ClassReport = mt.classification_report(y_test, y_hat)
    print(ClassReport)

For k =  10 and metric = Cosine: 
accuracy: 0.7703333333333333
[[4605   68]
 [1310   17]]
              precision    recall  f1-score   support

           0       0.78      0.99      0.87      4673
           1       0.20      0.01      0.02      1327

    accuracy                           0.77      6000
   macro avg       0.49      0.50      0.45      6000
weighted avg       0.65      0.77      0.68      6000

For k =  25 and metric = Cosine: 
accuracy: 0.7775
[[4665    8]
 [1327    0]]
              precision    recall  f1-score   support

           0       0.78      1.00      0.87      4673
           1       0.00      0.00      0.00      1327

    accuracy                           0.78      6000
   macro avg       0.39      0.50      0.44      6000
weighted avg       0.61      0.78      0.68      6000

For k =  50 and metric = Cosine: 
accuracy: 0.7788333333333334
[[4673    0]
 [1327    0]]
              precision    recall  f1-score   support

           0       0.78      1.00

In [95]:
# KNN with the Manhattan distance metric, evaluated for several neighbourhood sizes.

parameters = [10, 25, 50, 100]
for K in parameters:
    print("For k = ", K, "and metric = Manhattan: ")

    # fit() returns the estimator, so construction and training are chained.
    knn = KNeighborsClassifier(
        n_neighbors=K,
        weights='uniform',
        metric='manhattan',
    ).fit(X_train_scaled, y_train)
    y_hat = knn.predict(X_test_scaled)

    # Report accuracy, confusion matrix and per-class metrics on the test rows.
    acc = mt.accuracy_score(y_test, y_hat)
    print('accuracy:', acc )
    conf = mt.confusion_matrix(y_test, y_hat)
    print(conf )
    ClassReport = mt.classification_report(y_test, y_hat)
    print(ClassReport)

For k =  10 and metric = Manhattan: 
accuracy: 0.769
[[4605   68]
 [1318    9]]
              precision    recall  f1-score   support

           0       0.78      0.99      0.87      4673
           1       0.12      0.01      0.01      1327

    accuracy                           0.77      6000
   macro avg       0.45      0.50      0.44      6000
weighted avg       0.63      0.77      0.68      6000

For k =  25 and metric = Manhattan: 
accuracy: 0.7785
[[4670    3]
 [1326    1]]
              precision    recall  f1-score   support

           0       0.78      1.00      0.88      4673
           1       0.25      0.00      0.00      1327

    accuracy                           0.78      6000
   macro avg       0.51      0.50      0.44      6000
weighted avg       0.66      0.78      0.68      6000

For k =  50 and metric = Manhattan: 
accuracy: 0.7788333333333334
[[4673    0]
 [1327    0]]
              precision    recall  f1-score   support

           0       0.78      1.00    

### References
Discussion of the relevance of the distance metric:
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4978658/