In [2]:
# Import libraries required for creating models and validating them
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
#from IPython.html import widgets 
import warnings
warnings.filterwarnings('ignore')

In [4]:
#Read the data
ccdefault=pd.read_csv('../data//UCI_Credit_Card.csv')

In [5]:
#rename the columns to clearer names
ccdefault.rename(columns={'default.payment.next.month':'DEFAULT'},inplace=True)
ccdefault.rename(columns={'PAY_0':'PAY_1'},inplace=True)

In [6]:
#Verifying the datatypes in the dataset
ccdefault.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
ID           30000 non-null int64
LIMIT_BAL    30000 non-null float64
SEX          30000 non-null int64
EDUCATION    30000 non-null int64
MARRIAGE     30000 non-null int64
AGE          30000 non-null int64
PAY_1        30000 non-null int64
PAY_2        30000 non-null int64
PAY_3        30000 non-null int64
PAY_4        30000 non-null int64
PAY_5        30000 non-null int64
PAY_6        30000 non-null int64
BILL_AMT1    30000 non-null float64
BILL_AMT2    30000 non-null float64
BILL_AMT3    30000 non-null float64
BILL_AMT4    30000 non-null float64
BILL_AMT5    30000 non-null float64
BILL_AMT6    30000 non-null float64
PAY_AMT1     30000 non-null float64
PAY_AMT2     30000 non-null float64
PAY_AMT3     30000 non-null float64
PAY_AMT4     30000 non-null float64
PAY_AMT5     30000 non-null float64
PAY_AMT6     30000 non-null float64
DEFAULT      30000 non-null int64
dtypes: float64

In [7]:
cc = ccdefault.copy() # taking a copy in memory
from sklearn.model_selection import StratifiedShuffleSplit
if 'DEFAULT' in cc:
    y = cc['DEFAULT'].values
    del cc['DEFAULT']
    del cc['BILL_AMT1']
    del cc['BILL_AMT2']
    del cc['BILL_AMT3']
    del cc['BILL_AMT4']
    del cc['BILL_AMT5']
    del cc['BILL_AMT6']
    X = cc.values
num_cv_iterations = 5
num_instances = len(y)
cv_object = StratifiedShuffleSplit(n_splits = num_cv_iterations,
test_size = 0.20, train_size = 0.80, random_state=10)
cv_object.get_n_splits(X, y)
print(cv_object)
for train_index, test_index in cv_object.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

StratifiedShuffleSplit(n_splits=5, random_state=10, test_size=0.2,
            train_size=0.8)
TRAIN: [26296   633  2863 ... 15141 19066  7166] TEST: [ 5469 17052 23411 ... 24675 28588 16031]
TRAIN: [17006  5992 28816 ... 10798 22887 23796] TEST: [  683 10937   999 ...   875  4336  2721]
TRAIN: [ 5957 25994 14047 ...    53   805 24297] TEST: [14399 23807  6988 ...  9804 17546 23927]
TRAIN: [11516  8559 14297 ... 14603 13364    89] TEST: [13759 19809 18958 ... 27816 11181 16164]
TRAIN: [ 2233 17193  9866 ... 12371  9657 11083] TEST: [28935 15053 22533 ... 22298  4357 23349]


In [8]:
#Perform KNN with Euclidean distance as the metric 
parameters =[1,5,10,15,25,50,100,200]
accuracies = []
max_acc = 0
idx= -1
for K in parameters:
    clf = KNeighborsClassifier(n_neighbors=K, weights='uniform', metric='euclidean')
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    (max_acc , idx) = (acc, K )if acc> max_acc else (max_acc, idx)
    accuracies.append(acc)
    print('Accuracy of classifier with %d neighbors is: %.4f'%(K,acc))
    

print("Max accuracy is for k =  {} neighbors and the accuracy score is  {} ".format(idx , max_acc))        

Accuracy of classifier with 1 neighbors is: 0.6830
Accuracy of classifier with 5 neighbors is: 0.7467
Accuracy of classifier with 10 neighbors is: 0.7750
Accuracy of classifier with 15 neighbors is: 0.7730
Accuracy of classifier with 25 neighbors is: 0.7767
Accuracy of classifier with 50 neighbors is: 0.7798
Accuracy of classifier with 100 neighbors is: 0.7790
Accuracy of classifier with 200 neighbors is: 0.7788
Max accuracy is for k =  50 neighbors and the accuracy score is  0.7798333333333334 


In [9]:
#Perform KNN with cosine distance as the metric 
parameters =[1,5,10,15,25,50,100,200]
accuracies = []
max_acc = 0
idx= -1
for K in parameters:
    clf = KNeighborsClassifier(n_neighbors=K, weights='uniform', metric='cosine')
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    (max_acc , idx) = (acc, K )if acc> max_acc else (max_acc, idx)
    accuracies.append(acc)
    print('Accuracy of classifier with %d neighbors is: %.4f'%(K,acc))

print("Max accuracy is for k =  {} neighbors and the accuracy score is  {} ".format(idx , max_acc))        

Accuracy of classifier with 1 neighbors is: 0.6858
Accuracy of classifier with 5 neighbors is: 0.7540
Accuracy of classifier with 10 neighbors is: 0.7782
Accuracy of classifier with 15 neighbors is: 0.7765
Accuracy of classifier with 25 neighbors is: 0.7780
Accuracy of classifier with 50 neighbors is: 0.7793
Accuracy of classifier with 100 neighbors is: 0.7792
Accuracy of classifier with 200 neighbors is: 0.7788
Max accuracy is for k =  50 neighbors and the accuracy score is  0.7793333333333333 


In [10]:
#Perform KNN with Manhattan distance as the metric 
parameters =[1,5,10,15,25,50,100,200]
accuracies = []
max_acc = 0
idx= -1
for K in parameters:
    clf = KNeighborsClassifier(n_neighbors=K, weights='uniform', metric='jaccard')
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    (max_acc , idx) = (acc, K )if acc> max_acc else (max_acc, idx)
    accuracies.append(acc)
    print('Accuracy of classifier with %d neighbors is: %.4f'%(K,acc))

    
print("Max accuracy is for k =  {} neighbors and the accuracy score is  {} ".format(idx , max_acc))        

Accuracy of classifier with 1 neighbors is: 0.7260
Accuracy of classifier with 5 neighbors is: 0.6635
Accuracy of classifier with 10 neighbors is: 0.7533
Accuracy of classifier with 15 neighbors is: 0.7640
Accuracy of classifier with 25 neighbors is: 0.7493
Accuracy of classifier with 50 neighbors is: 0.7648
Accuracy of classifier with 100 neighbors is: 0.7673
Accuracy of classifier with 200 neighbors is: 0.7658
Max accuracy is for k =  100 neighbors and the accuracy score is  0.7673333333333333 
