In [42]:
# Import libraries required for creating models and validating them
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from IPython.html import widgets 
import warnings
warnings.filterwarnings('ignore')

In [43]:
#Read the data
ccdefault=pd.read_csv('./UCI_Credit_Card.csv')

In [44]:
#rename the columns to clearer names
ccdefault.rename(columns={'default.payment.next.month':'DEFAULT'},inplace=True)
ccdefault.rename(columns={'PAY_0':'PAY_1'},inplace=True)

In [45]:
#Verifying the datatypes in the dataset
ccdefault.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
ID           30000 non-null int64
LIMIT_BAL    30000 non-null float64
SEX          30000 non-null int64
EDUCATION    30000 non-null int64
MARRIAGE     30000 non-null int64
AGE          30000 non-null int64
PAY_1        30000 non-null int64
PAY_2        30000 non-null int64
PAY_3        30000 non-null int64
PAY_4        30000 non-null int64
PAY_5        30000 non-null int64
PAY_6        30000 non-null int64
BILL_AMT1    30000 non-null float64
BILL_AMT2    30000 non-null float64
BILL_AMT3    30000 non-null float64
BILL_AMT4    30000 non-null float64
BILL_AMT5    30000 non-null float64
BILL_AMT6    30000 non-null float64
PAY_AMT1     30000 non-null float64
PAY_AMT2     30000 non-null float64
PAY_AMT3     30000 non-null float64
PAY_AMT4     30000 non-null float64
PAY_AMT5     30000 non-null float64
PAY_AMT6     30000 non-null float64
DEFAULT      30000 non-null int64
dtypes: float64

In [46]:
#Cleaning of data after EDA
cc = ccdefault.copy() # taking a copy in memory
from sklearn.model_selection import StratifiedShuffleSplit
if 'DEFAULT' in cc:
    y = cc['DEFAULT'].values
    del cc['DEFAULT']
    del cc['BILL_AMT1']
    del cc['BILL_AMT2']
    del cc['BILL_AMT3']
    del cc['BILL_AMT4']
    del cc['BILL_AMT5']
    del cc['BILL_AMT6']
    X = cc.values
num_cv_iterations = 5
num_instances = len(y)
cv_object = StratifiedShuffleSplit(n_splits = num_cv_iterations,
test_size = 0.20, train_size = 0.80, random_state=11)
cv_object.get_n_splits(X, y)
print(cv_object)
for train_index, test_index in cv_object.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

StratifiedShuffleSplit(n_splits=5, random_state=11, test_size=0.2,
            train_size=0.8)
TRAIN: [28976 29616 20315 ...  6063 16283 15621] TEST: [ 6669  6830 16516 ... 16163  1990  7999]
TRAIN: [11237  9898 17028 ...  1265   611  7250] TEST: [ 4174 19421 15502 ... 19995  6124 25090]
TRAIN: [23721 14143  5492 ...  4346  5119 23351] TEST: [21778  4074 19546 ...   941 14061 13584]
TRAIN: [19706  2009 13912 ...  3924 20112 21366] TEST: [ 7136 15572 21097 ... 24126  8440  7078]
TRAIN: [ 5794  1831 15948 ... 18673 15962 17274] TEST: [17565 28798 29978 ... 10151 11434 21212]


In [None]:
for train_index, test_index in cv_object.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [47]:
#Perform KNN with Euclidean distance as the metric 
parameters =[5,10,25,50,100,200,500]
accuracies = []
max_acc = 0
idx= -1
iter_num=0
for train_index, test_index in cv_object.split(X, y):
    print("Iteration", iter_num)
    iter_num+=1
    for K in parameters:
        clf = KNeighborsClassifier(n_neighbors=K, weights='uniform', metric='euclidean')
        clf.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        (max_acc , idx) = (acc, K )if acc> max_acc else (max_acc, idx)
        accuracies.append(acc)
        print('Accuracy of classifier with %d neighbors is: %.4f'%(K,acc))
        
       

print("Max accuracy is for k =  {} neighbors and the accuracy score is  {} ".format(idx , max_acc))   

Iteration 0
Accuracy of classifier with 5 neighbors is: 0.7527
Accuracy of classifier with 10 neighbors is: 0.7740
Accuracy of classifier with 25 neighbors is: 0.7763
Accuracy of classifier with 50 neighbors is: 0.7798
Accuracy of classifier with 100 neighbors is: 0.7787
Accuracy of classifier with 200 neighbors is: 0.7788
Accuracy of classifier with 500 neighbors is: 0.7788
Iteration 1
Accuracy of classifier with 5 neighbors is: 0.7527
Accuracy of classifier with 10 neighbors is: 0.7740
Accuracy of classifier with 25 neighbors is: 0.7763
Accuracy of classifier with 50 neighbors is: 0.7798
Accuracy of classifier with 100 neighbors is: 0.7787
Accuracy of classifier with 200 neighbors is: 0.7788
Accuracy of classifier with 500 neighbors is: 0.7788
Iteration 2
Accuracy of classifier with 5 neighbors is: 0.7527
Accuracy of classifier with 10 neighbors is: 0.7740
Accuracy of classifier with 25 neighbors is: 0.7763
Accuracy of classifier with 50 neighbors is: 0.7798
Accuracy of classifier wi

In [48]:
#Perform KNN with cosine distance as the metric 
parameters =[5,10,25,50,100,200,500]
accuracies = []
max_acc = 0
idx= -1
iter_num=0
for train_index, test_index in cv_object.split(X, y):
    print("Iteration", iter_num)
    iter_num+=1
    for K in parameters:
        clf = KNeighborsClassifier(n_neighbors=K, weights='uniform', metric='cosine')
        clf.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        (max_acc , idx) = (acc, K )if acc> max_acc else (max_acc, idx)
        accuracies.append(acc)
        print('Accuracy of classifier with %d neighbors is: %.4f'%(K,acc))

print("Max accuracy is for k =  {} neighbors and the accuracy score is  {} ".format(idx , max_acc))        

Iteration 0
Accuracy of classifier with 5 neighbors is: 0.7540
Accuracy of classifier with 10 neighbors is: 0.7772
Accuracy of classifier with 25 neighbors is: 0.7788
Accuracy of classifier with 50 neighbors is: 0.7797
Accuracy of classifier with 100 neighbors is: 0.7788
Accuracy of classifier with 200 neighbors is: 0.7788
Accuracy of classifier with 500 neighbors is: 0.7788
Iteration 1
Accuracy of classifier with 5 neighbors is: 0.7540
Accuracy of classifier with 10 neighbors is: 0.7772
Accuracy of classifier with 25 neighbors is: 0.7788
Accuracy of classifier with 50 neighbors is: 0.7797
Accuracy of classifier with 100 neighbors is: 0.7788
Accuracy of classifier with 200 neighbors is: 0.7788
Accuracy of classifier with 500 neighbors is: 0.7788
Iteration 2
Accuracy of classifier with 5 neighbors is: 0.7540
Accuracy of classifier with 10 neighbors is: 0.7772
Accuracy of classifier with 25 neighbors is: 0.7788
Accuracy of classifier with 50 neighbors is: 0.7797
Accuracy of classifier wi

In [None]:
#Perform KNN with Manhattan distance as the metric 
parameters =[5,10,25,50,100,200,500]
accuracies = []
max_acc = 0
idx= -1
iter_num=0
for train_index, test_index in cv_object.split(X, y):
    print("Iteration", iter_num)
    iter_num+=1
    for K in parameters:
        clf = KNeighborsClassifier(n_neighbors=K, weights='uniform', metric='manhattan')
        clf.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        (max_acc , idx) = (acc, K )if acc> max_acc else (max_acc, idx)
        accuracies.append(acc)
        print('Accuracy of classifier with %d neighbors is: %.4f'%(K,acc))

    
print("Max accuracy is for k =  {} neighbors and the accuracy score is  {} ".format(idx , max_acc))        

Iteration 0
Accuracy of classifier with 5 neighbors is: 0.7480
Accuracy of classifier with 10 neighbors is: 0.7743
Accuracy of classifier with 25 neighbors is: 0.7762
Accuracy of classifier with 50 neighbors is: 0.7800


### References
Discussion on relevance of distance metric
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4978658/