In [1]:
 #import packages
import numpy as np 
import pandas as pd
from matplotlib import pyplot as plt
import warnings 
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
#import data
bank_data = pd.read_csv('UniversalBank.csv')

In [3]:
#inspect data
bank_data.head()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [4]:
#separate the predictor and target variables. Drop unnecessary columns. 
X = bank_data.drop(columns=['ID', 'ZIP Code', 'Personal Loan'])
y= bank_data['Personal Loan']

In [5]:
X

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,1,0,0,0
1,45,19,34,3,1.5,1,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,3,0,0,0,1,0
4996,30,4,15,4,0.4,1,85,0,0,1,0
4997,63,39,24,2,0.3,3,0,0,0,0,0
4998,65,40,49,3,0.5,2,0,0,0,1,0


In [6]:
y

0       0
1       0
2       0
3       0
4       0
       ..
4995    0
4996    0
4997    0
4998    0
4999    0
Name: Personal Loan, Length: 5000, dtype: int64

In [7]:
#Make dummy variables for the Education Column. Transform predictors with more than two categories. 
X= pd.get_dummies(X, columns=['Education'])

In [8]:
X


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Securities Account,CD Account,Online,CreditCard,Education_1,Education_2,Education_3
0,25,1,49,4,1.6,0,1,0,0,0,True,False,False
1,45,19,34,3,1.5,0,1,0,0,0,True,False,False
2,39,15,11,1,1.0,0,0,0,0,0,True,False,False
3,35,9,100,1,2.7,0,0,0,0,0,False,True,False
4,35,8,45,4,1.0,0,0,0,0,1,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,0,0,0,1,0,False,False,True
4996,30,4,15,4,0.4,85,0,0,1,0,True,False,False
4997,63,39,24,2,0.3,0,0,0,0,0,False,False,True
4998,65,40,49,3,0.5,0,0,0,1,0,False,True,False


In [9]:
# Convert boolean columns to integers for consistency 
X[['Education_1', 'Education_2', 'Education_3']] = X[['Education_1', 'Education_2', 'Education_3']].astype(int)

In [10]:
X

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Securities Account,CD Account,Online,CreditCard,Education_1,Education_2,Education_3
0,25,1,49,4,1.6,0,1,0,0,0,1,0,0
1,45,19,34,3,1.5,0,1,0,0,0,1,0,0
2,39,15,11,1,1.0,0,0,0,0,0,1,0,0
3,35,9,100,1,2.7,0,0,0,0,0,0,1,0
4,35,8,45,4,1.0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,40,1,1.9,0,0,0,1,0,0,0,1
4996,30,4,15,4,0.4,85,0,0,1,0,1,0,0
4997,63,39,24,2,0.3,0,0,0,0,0,0,0,1
4998,65,40,49,3,0.5,0,0,0,1,0,0,1,0


In [60]:
#standardize X
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X)

In [61]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-1.774417,-1.666078,-0.538229,1.397414,-0.193385,-0.555524,2.928915,-0.25354,-1.216618,-0.645314,1.177071,-0.624538,-0.654965
1,-0.029524,-0.096330,-0.864109,0.525991,-0.250611,-0.555524,2.928915,-0.25354,-1.216618,-0.645314,1.177071,-0.624538,-0.654965
2,-0.552992,-0.445163,-1.363793,-1.216855,-0.536736,-0.555524,-0.341423,-0.25354,-1.216618,-0.645314,1.177071,-0.624538,-0.654965
3,-0.901970,-0.968413,0.569765,-1.216855,0.436091,-0.555524,-0.341423,-0.25354,-1.216618,-0.645314,-0.849566,1.601185,-0.654965
4,-0.901970,-1.055621,-0.625130,1.397414,-0.536736,-0.555524,-0.341423,-0.25354,-1.216618,1.549632,-0.849566,1.601185,-0.654965
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,-1.425438,-1.491662,-0.733757,-1.216855,-0.021710,-0.555524,-0.341423,-0.25354,0.821951,-0.645314,-0.849566,-0.624538,1.526798
4996,-1.338194,-1.404454,-1.276892,1.397414,-0.880087,0.280238,-0.341423,-0.25354,0.821951,-0.645314,1.177071,-0.624538,-0.654965
4997,1.540880,1.647835,-1.081363,-0.345432,-0.937312,-0.555524,-0.341423,-0.25354,-1.216618,-0.645314,-0.849566,-0.624538,1.526798
4998,1.715370,1.735043,-0.538229,0.525991,-0.822862,-0.555524,-0.341423,-0.25354,0.821951,-0.645314,-0.849566,1.601185,-0.654965


In [62]:
#Split data into training and validation sets. Partition data into training (60%) and validation (40%) sets. 
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation= train_test_split(X,y, test_size=0.4, random_state=0)

In [92]:
#standardize the data 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_validation= scaler.transform(X_validation)

In [64]:
X_train, X_validation

(array([[-1.161369  , -1.22601206, -0.96122097, ..., -0.86325008,
         -0.62567218,  1.55838744],
        [ 1.44708126,  1.54999525,  1.83476922, ...,  1.15841287,
         -0.62567218, -0.64168895],
        [-0.20493724, -0.09825909,  1.03281855, ..., -0.86325008,
          1.59828107, -0.64168895],
        ...,
        [-1.68305906, -1.6597632 , -1.06959268, ..., -0.86325008,
         -0.62567218,  1.55838744],
        [ 1.01233955,  1.11624411, -0.52773412, ...,  1.15841287,
         -0.62567218, -0.64168895],
        [-1.07442066, -0.96576138, -0.76615188, ..., -0.86325008,
         -0.62567218,  1.55838744]]),
 array([[ 0.76202845,  0.87360622, -1.12111848, ...,  1.20581414,
         -0.6228363 , -0.6749845 ],
        [-1.07957319, -0.97250504,  0.18724922, ...,  1.20581414,
         -0.6228363 , -0.6749845 ],
        [ 0.76202845,  0.34614586, -0.05261819, ..., -0.82931521,
         -0.6228363 ,  1.4815155 ],
        ...,
        [-0.29031534, -0.70877486, -0.99028171, ..., -

In [65]:
#KNN Classification with K=1
from sklearn.neighbors import KNeighborsClassifier
KNN_clss = KNeighborsClassifier(n_neighbors=1)
KNN_clss.fit(X_train,y_train)
KNN_clss.score(X_validation, y_validation) 

0.9605

In [81]:
#Success class is 1 (loan acceptance) and failure class is 0 (loan rejection). Default cutoff value is 0.5. 
#Predict new customer with given predictors

new_customer = [[40,10,84,2,2,0,0,0,1,1,0,1,0]]
new_customer = scaler.transform(new_customer)
new_customer= pd.DataFrame(new_customer)

In [82]:
new_customer

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,39.685989,9.919209,83.600153,1.974077,1.944388,0.008351,-0.011599,0.003379,0.990883,1.005115,-0.008096,0.987507,0.017148


In [83]:
#predict new customer
KNN_clss.predict(new_customer)

array([1], dtype=int64)

In [84]:
#answer a. 
#prediction is this new customer will accept the personal loan when K=1 

In [70]:
#Train the Classifier for different values of K

results=[]
for K in range(1,16):
    KNN_clss_K = KNeighborsClassifier(n_neighbors=K)
    KNN_clss_K.fit(X_train, y_train)
    results.append({'K':K, 'accuracy_train': KNN_clss_K.score(X_train, y_train), 'accuracy_validation':KNN_clss_K.score(X_validation, y_validation)})

#Convert results to a pandas data frame 
results = pd.DataFrame(results)
print (results)

     K  accuracy_train  accuracy_validation
0    1        1.000000               0.9605
1    2        0.970000               0.9525
2    3        0.977000               0.9610
3    4        0.964667               0.9545
4    5        0.968333               0.9605
5    6        0.960000               0.9525
6    7        0.962000               0.9550
7    8        0.955333               0.9495
8    9        0.959667               0.9525
9   10        0.952333               0.9485
10  11        0.955000               0.9520
11  12        0.949333               0.9470
12  13        0.951667               0.9525
13  14        0.948667               0.9475
14  15        0.952333               0.9480


In [71]:
#answer b. 
# when K is 3 is when it balances between overfitting and ignoring the predictor information

In [72]:
# using best K, predict y validation 

from sklearn.metrics import confusion_matrix, classification_report

KNN_clss_best = KNeighborsClassifier(n_neighbors=3)
KNN_clss_best.fit(X_train,y_train)
KNN_clss_best.score(X_validation, y_validation) 

y_pred_validation = KNN_clss_best.predict(X_validation)




In [73]:
y_pred_validation

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [74]:
#answer c. 
#Show the confusion matrix for the validation data that results from using the best K.
confusion_matrix = confusion_matrix(y_validation, y_pred_validation)
print (confusion_matrix)


[[1796   12]
 [  66  126]]


In [75]:
#Classify customer using best K
KNN_clss_best = KNeighborsClassifier(n_neighbors=3)
KNN_clss_best.fit(X_train,y_train)
KNN_clss_best.score(X_validation, y_validation) 

0.961

In [86]:
#new customer predictors standardized 

new_customer = [[40,10,84,2,2,0,0,0,1,1,0,1,0]]
new_customer = scaler.transform(new_customer)
new_customer = pd.DataFrame(new_customer)

In [87]:
new_customer

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,39.685989,9.919209,83.600153,1.974077,1.944388,0.008351,-0.011599,0.003379,0.990883,1.005115,-0.008096,0.987507,0.017148


In [88]:
#Predict new customer with given predictors using best K
KNN_clss_best.predict(new_customer)

array([1], dtype=int64)

In [89]:
#answer d. 
#prediction is this new customer will accept the personal loan when K=3 

In [90]:
#Repartitioning training, validation, and test sets (50%:30%:20%)

# First split: 50% training, 50% temporary (validation + test)
from sklearn.metrics import confusion_matrix, classification_report

X_train_new, X_temp, y_train_new, y_temp = train_test_split(X, y, test_size=0.5, random_state=0)

# Second split (just the 50% temporary part): 60% validation, 40% test from the temporary set. 40% of 50% is 20% of all the data. 60% of 50% temporary is 30% of all data. 
X_validation_new, X_test, y_validation_new, y_test = train_test_split(X_temp, y_temp, test_size=0.4, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_new = scaler.fit_transform(X_train_new) 
X_validation_new = scaler.transform(X_validation_new)
X_test = scaler.transform(X_test)

# Train the KNN model with the best K
KNN_clss_final = KNeighborsClassifier(n_neighbors=3)
KNN_clss_final.fit(X_train_new, y_train_new)

#Make predictions for all datasets
y_train_predictions = KNN_clss_final.predict(X_train_new)
y_validation_predictions = KNN_clss_final.predict(X_validation_new)
y_test_predictions = KNN_clss_final.predict(X_test)

#Evaluate performance using confusion matrices 

print("Training Confusion Matrix:")
print(confusion_matrix(y_train_new,y_train_predictions))
print(classification_report(y_train_new, y_train_predictions))

print("Validation Confusion Matrix:")
print(confusion_matrix(y_validation_new,y_validation_predictions))
print(classification_report(y_validation_new, y_validation_predictions))

print("Test Confusion Matrix:")
print(confusion_matrix(y_test,y_test_predictions))
print(classification_report(y_test, y_test_predictions))


Training Confusion Matrix:
[[2260    2]
 [  52  186]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2262
           1       0.99      0.78      0.87       238

    accuracy                           0.98      2500
   macro avg       0.98      0.89      0.93      2500
weighted avg       0.98      0.98      0.98      2500

Validation Confusion Matrix:
[[1336    8]
 [  58   98]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1344
           1       0.92      0.63      0.75       156

    accuracy                           0.96      1500
   macro avg       0.94      0.81      0.86      1500
weighted avg       0.95      0.96      0.95      1500

Test Confusion Matrix:
[[909   5]
 [ 34  52]]
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       914
           1       0.91      0.60      0.73        86

    accuracy              

In [91]:
#answer e.
#The test confusion matrix shows high overall accuracy (96%) at predicting unseen data. The training confusion matrix shows the highest overall accuracy (98%) at predicting training data. The test confusion matrix and Validation confusion matrix show the same overall accuracy (96%), but the Validation confusion matrix show slightly better prediction for loan acceptance correctly at (92%) compared to test confusion matrix at (91%)