In [1]:
# To enable plotting graphs in Jupyter notebook
%matplotlib inline 

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from sklearn.neighbors import KNeighborsClassifier

In [5]:
from scipy.stats import zscore

In [6]:
import seaborn as sns

In [7]:
# A small function to compare predicted values with actual and count how many are correct

def getAccuracy(testSet, predictions):
    """Return the percentage of predictions that match the actual labels.

    Parameters
    ----------
    testSet : sequence
        Actual labels (list, tuple or 1-D array).
    predictions : sequence
        Predicted labels, in the same order and of the same length as
        ``testSet``.

    Returns
    -------
    float
        Share of positions where both sequences agree, on a 0-100 scale.

    Raises
    ------
    ValueError
        If ``testSet`` is empty (accuracy is undefined) or if the two
        sequences have different lengths.
    """
    total = len(testSet)
    if total == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("testSet is empty; accuracy is undefined")
    if len(predictions) != total:
        # A length mismatch means the labels are misaligned; refuse to score.
        raise ValueError(
            "testSet and predictions differ in length: %d != %d"
            % (total, len(predictions))
        )

    correct = sum(
        1 for actual, predicted in zip(testSet, predictions) if actual == predicted
    )
    return (correct / float(total)) * 100.0

In [8]:
# k-NN classifier: 15 neighbours, equal vote per neighbour, Euclidean distance.
NNH = KNeighborsClassifier(
    n_neighbors=15,
    weights='uniform',
    metric='euclidean',
)

# weights (optional, default 'uniform') is the weight function used in prediction:
#   'uniform'  - all points in each neighbourhood are weighted equally.
#   'distance' - points are weighted by the inverse of their distance, so closer
#                neighbours of a query point have more influence than farther ones.
#
# Distance measures: with the Minkowski metric, p=1 is Manhattan and p=2 is
# Euclidean; any other p value gives the general Minkowski distance.

In [9]:
# Load the 'Data' sheet of the workbook. The keyword is ``sheet_name``: the older
# ``sheetname`` spelling was deprecated in pandas 0.21 and removed in later releases.
Bank_df = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx", sheet_name='Data')

In [10]:
# (rows, columns) of the frame as loaded, before any column is added or dropped.
Bank_df.shape

(5000, 14)

In [11]:
# Column dtypes of the frame as loaded, before the category conversions below.
Bank_df.dtypes

ID                      int64
Age                     int64
Experience              int64
Income                  int64
ZIP Code                int64
Family                  int64
CCAvg                 float64
Education               int64
Mortgage                int64
Personal Loan           int64
Securities Account      int64
CD Account              int64
Online                  int64
CreditCard              int64
dtype: object

In [12]:
# NOTE(review): this binds a second name to the SAME frame, it is not a copy, so
# the Mortgage_Binary column added below is visible through temp_df as well.
# If a snapshot of the raw data was intended, confirm and use .copy() instead.
temp_df = Bank_df

# Flag customers holding any mortgage: 1 when Mortgage > 0, otherwise 0.
Bank_df['Mortgage_Binary'] = Bank_df['Mortgage'].gt(0).astype('int64')
Bank_df['Mortgage_Binary'].value_counts()  # not the last expression, so nothing is displayed

# Remove the identifier, the ZIP code and the raw mortgage amount.
Bank_df = Bank_df.drop(['ID', 'ZIP Code', 'Mortgage'], axis='columns')

# Treat the target and the coded / binary attributes as categoricals.
Bank_df = Bank_df.astype(
    dict.fromkeys(
        [
            'Personal Loan',
            'Education',
            'Securities Account',
            'CD Account',
            'Online',
            'CreditCard',
            'Mortgage_Binary',
        ],
        'category',
    )
)
Bank_df.dtypes



Age                      int64
Experience               int64
Income                   int64
Family                   int64
CCAvg                  float64
Education             category
Personal Loan         category
Securities Account    category
CD Account            category
Online                category
CreditCard            category
Mortgage_Binary       category
dtype: object

In [13]:
# Non-null row counts per target class. The output below shows 4520 rows for
# class 0 and 480 for class 1, i.e. the classes are imbalanced.
Bank_df.groupby(["Personal Loan"]).count()



Unnamed: 0_level_0,Age,Experience,Income,Family,CCAvg,Education,Securities Account,CD Account,Online,CreditCard,Mortgage_Binary
Personal Loan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,4520,4520,4520,4520,4520,4520,4520,4520,4520,4520,4520
1,480,480,480,480,480,480,480,480,480,480,480


The first column of the raw data frame (`ID`) is an identifier that is not used in modeling; it was dropped in the preprocessing cell above, together with `ZIP Code` and `Mortgage`.

In [14]:
# Feature frame: every column except the target "Personal Loan"
# (i.e. the independent attributes only).
Bank_Features = Bank_df.drop("Personal Loan", axis="columns")

In [15]:
# Convert the features into z-scores, since the units / scales of the raw
# attributes are unknown, and store them in a new dataframe.
# It is always advised to scale numeric attributes in models that calculate distances.
# NOTE(review): zscore is applied to every column of Bank_Features, including the
# category-typed ones, and it uses all rows, i.e. before the train/test split
# further down. Confirm both are intended.

Bank_Features_z = Bank_Features.apply(zscore)  # convert all attributes to Z scale

In [16]:


# Target to predict: the "Personal Loan" column.
Bank_Predict = Bank_df["Personal Loan"]

In [17]:
# Store the z-scored feature frame as a NumPy array for the classifier.

X = np.array(Bank_Features_z)

In [18]:


# Store the target column as a NumPy array, aligned row-for-row with X.
Y = np.array(Bank_Predict)

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the split repeatable.
X_Train, X_test, Y_Train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

In [21]:
# Fit the nearest-neighbour classifier on the training portion only.

NNH.fit(X_Train, Y_Train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=15, p=2,
           weights='uniform')

In [22]:


# Predict the class of every held-out test row.
predicted_labels = NNH.predict(X_test)

In [23]:
# Actual labels of the test rows, for comparison with predicted_labels.
y_test

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [24]:
# Accuracy: the share of test cases predicted correctly out of all test cases,
# returned by getAccuracy as a percentage (0-100).

accuracy_score = getAccuracy(y_test, predicted_labels)

In [25]:
# Accuracy of the first model, in percent.
print(accuracy_score)

94.3333333333


In [26]:
# To improve performance ------------------------- Iteration 2 -----------------------------------


In [28]:

# Iteration 2: same z-scored features, minus Age, Experience and CreditCard.
bc_features_pruned_df_z = Bank_Features_z.drop(
    ['Age', 'Experience', 'CreditCard'],
    axis='columns',
)

In [29]:
# Rebinds X to the pruned feature matrix; the first-iteration matrix is no
# longer reachable through this name.
X = np.array(bc_features_pruned_df_z)

In [30]:
# Break the data into training and test

In [31]:
# Split the pruned features with the same test_size and random_state as the first iteration.
X_Train, X_test, Y_Train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42)

In [32]:
# Re-create and fit the nearest-neighbour classifier on the pruned features.
# Note: metric='minkowski' with the default p=2 (see the repr below) is the
# Euclidean distance, so the distance measure is the same as in iteration 1;
# only the feature set differs.
NNH = KNeighborsClassifier(n_neighbors= 15 , weights = 'uniform', metric='minkowski')
NNH.fit(X_Train, Y_Train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=15, p=2,
           weights='uniform')

In [33]:


# Predict the test rows with the second model (overwrites the first iteration's predictions).
predicted_labels = NNH.predict(X_test)

In [34]:
# Accuracy of the second model: the share of test cases predicted correctly,
# returned by getAccuracy as a percentage (0-100).

accuracy_score = getAccuracy(y_test, predicted_labels)

In [35]:
# Accuracy of the second model, in percent.
print(accuracy_score)

95.5333333333


In [40]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [38]:
# Per-class precision / recall / F1. In the output below, recall for class 1 is
# only 0.61 even though overall accuracy is about 95.5%.
print(classification_report(y_test,predicted_labels))

             precision    recall  f1-score   support

          0       0.96      1.00      0.98      1343
          1       0.94      0.61      0.74       157

avg / total       0.95      0.96      0.95      1500



In [41]:
# Confusion matrix: rows are actual classes, columns are predicted classes.
# In the output below, 61 actual class-1 rows are predicted as class 0.
print(confusion_matrix(y_test,predicted_labels))

[[1337    6]
 [  61   96]]
