In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to all subsequent figures.
sns.set(style='whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
from matplotlib import rcParams
# Patch defaults: force edges to be drawn and use blue as the default face colour.
rcParams.update({
    'patch.force_edgecolor': True,
    'patch.facecolor': 'b',
})

In [None]:
# Load the 'Data' sheet of the workbook located next to the notebook.
df = pd.read_excel(
    './Bank_Personal_Loan_Modelling.xlsx',
    sheet_name='Data',
)

# 1. Overview

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Distinct-value count per column; used further down to split categorical from continuous columns.
df.nunique()

In [None]:
# Remove fully identical rows.
# NOTE(review): 'ID' is still a regular column here (it only becomes the index in a
# later cell), so rows differing only by ID are not treated as duplicates - confirm
# that is the intended behaviour.
df = df.drop_duplicates()

In [None]:
# (rows, columns) after duplicate removal.
df.shape

In [None]:
# Use the 'ID' column as the row index so it is no longer treated as a feature column.
df = df.set_index('ID')

In [None]:
# Split columns by cardinality: at most 5 distinct values -> categorical, otherwise continuous.
unique_counts = df.nunique()
cate_var = unique_counts[unique_counts <= 5].index.tolist()
# The target column is kept out of the categorical feature list.
cate_var.remove('Personal Loan')
cont_var = unique_counts[unique_counts > 5].index.tolist()
print('Categorical variables:', cate_var)
print('Continuous variables:', cont_var)

# 2. Plotting

## 2.1. Continuous and Categorical variables

In [None]:
# One distribution plot per continuous column, laid out on a 2x3 grid.
fig_1 = plt.figure(figsize=(25, 9))
for position, column in enumerate(cont_var, start=1):
    ax = fig_1.add_subplot(2, 3, position)
    sns.distplot(df[column], color='y', ax=ax)

In [None]:
# One count plot per categorical column, laid out on a 2x3 grid.
fig_2 = plt.figure(figsize=(25,9))
for i, col in enumerate(cate_var):
    ax = fig_2.add_subplot(2,3,i+1)
    # Pass the column by keyword: a positional Series is interpreted as `data`
    # (not `x`) by seaborn releases with keyword-only plot variables.
    sns.countplot(x=col, data=df, palette='RdBu_r', ax=ax)

## 2.2. Personal Loan

In [None]:
# Box plot of each continuous column, grouped by the 'Personal Loan' target.
fig_3 = plt.figure(figsize=(25, 9))
for position, column in enumerate(cont_var, start=1):
    ax = fig_3.add_subplot(2, 3, position)
    sns.boxplot(x='Personal Loan', y=column, data=df, palette='RdBu_r', ax=ax)

In [None]:
# Overlay the density curves of each continuous column for the two target classes.
fig_4 = plt.figure(figsize=(25,9))
no_loan = df['Personal Loan'] == 0
has_loan = df['Personal Loan'] == 1
for i, col in enumerate(cont_var):
    ax = fig_4.add_subplot(2,3,i+1)
    # .loc with a boolean mask avoids chained indexing; both curves share the same axes.
    sns.distplot(df.loc[no_loan, col], hist=False, label='No Personal Loan', color='r', ax=ax)
    sns.distplot(df.loc[has_loan, col], hist=False, label='Is Personal Loan', color='b', ax=ax)
    # Draw the legend explicitly so the curve labels are visible even when
    # seaborn does not add one automatically.
    ax.legend()

In [None]:
# Mean of 'Personal Loan' per level of each categorical column (no error bars).
fig_5 = plt.figure(figsize=(25, 9))
for position, column in enumerate(cate_var, start=1):
    ax = fig_5.add_subplot(2, 3, position)
    sns.barplot(data=df, x=column, y='Personal Loan', ci=None, palette='RdBu_r', ax=ax)

In [None]:
# Row counts per level of each categorical column, split by the 'Personal Loan' target.
fig_6 = plt.figure(figsize=(25, 9))
for position, column in enumerate(cate_var, start=1):
    ax = fig_6.add_subplot(2, 3, position)
    sns.countplot(data=df, x=column, hue='Personal Loan', palette='RdBu_r', ax=ax)

## 2.3. Income

In [None]:
# Work on a separate list so cont_var itself stays intact; 'Income' is the x-axis
# of the plots below, so it is taken out of the y-axis candidates.
cont_var_temp = list(cont_var)
cont_var_temp.remove('Income')

In [None]:
# 'Income' against every other continuous column, coloured by the target.
fig_7 = plt.figure(figsize=(25,9))
for i, col in enumerate(cont_var_temp):
    ax = fig_7.add_subplot(2,3,i+1)
    # x must be passed by keyword: seaborn releases with keyword-only plot
    # variables treat the first positional argument as `data`.
    sns.scatterplot(x='Income', y=col, hue='Personal Loan', data=df, palette='RdBu_r', ax=ax)

In [None]:
# 'Income' against every categorical column, coloured by the target.
fig_8 = plt.figure(figsize=(25,9))
for i, col in enumerate(cate_var):
    ax = fig_8.add_subplot(2,3,i+1)
    # x must be passed by keyword: seaborn releases with keyword-only plot
    # variables treat the first positional argument as `data`.
    sns.scatterplot(x='Income', y=col, hue='Personal Loan', data=df, palette='RdBu_r', ax=ax)

## 2.4. CCAvg

In [None]:
# Rebuild the list from cont_var instead of removing in place, so re-running this
# cell does not raise ValueError once 'CCAvg' has already been removed.
cont_var_temp = [name for name in cont_var if name not in ('Income', 'CCAvg')]

In [None]:
# 'CCAvg' against the remaining continuous columns, coloured by the target.
fig_9 = plt.figure(figsize=(25,9))
for i, col in enumerate(cont_var_temp):
    ax = fig_9.add_subplot(2,2,i+1)
    # x and y are passed by keyword: positional plot variables are rejected by
    # seaborn releases whose plotting functions take them keyword-only.
    sns.scatterplot(x='CCAvg', y=col, hue='Personal Loan', data=df, palette='RdBu_r', ax=ax)

In [None]:
# 'CCAvg' against every categorical column, coloured by the target.
fig_10 = plt.figure(figsize=(25,9))
for i, col in enumerate(cate_var):
    ax = fig_10.add_subplot(2,3,i+1)
    # x and y are passed by keyword: positional plot variables are rejected by
    # seaborn releases whose plotting functions take them keyword-only.
    sns.scatterplot(x='CCAvg', y=col, hue='Personal Loan', data=df, palette='RdBu_r', ax=ax)

# 3. Correlation

In [None]:
# Pairwise correlations between all columns.
corr = df.corr()
# Boolean mask hiding the upper triangle (diagonal included) so each pair is shown once.
mask = np.triu(np.ones(corr.shape, dtype=bool))
with sns.axes_style('whitegrid'):
    fig_11, ax = plt.subplots(figsize=(25, 9))
    ax.set_title('Number Of Attributes Heatmap')
    sns.heatmap(
        corr,
        mask=mask,
        annot=True,
        square=True,
        linewidths=.5,
        cmap="RdBu_r",
        ax=ax,
    )

In [None]:
# Correlation between 'Age', 'Experience' and the target.
df[['Age', 'Experience', 'Personal Loan']].corr()

In [None]:
# Combined account feature: element-wise sum of the two account columns.
df['General Account'] = df['CD Account'].add(df['Securities Account'])
# Compare the combined feature and its two components against the target.
account_cols = ['General Account', 'CD Account', 'Securities Account', 'Personal Loan']
df[account_cols].corr()

In [None]:
# Combined service feature: element-wise sum of the 'Online' and 'CreditCard' columns.
df['General Service'] = df['Online'].add(df['CreditCard'])
# Compare the combined feature and its two components against the target.
service_cols = ['General Service', 'Online', 'CreditCard', 'Personal Loan']
df[service_cols].corr()

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Drop the columns that are not used for modelling. errors='ignore' keeps the cell
# re-runnable: a second run no longer raises KeyError for already-removed columns.
df = df.drop(
    columns=['ZIP Code', 'Experience', 'General Account', 'Online', 'CreditCard'],
    errors='ignore',
)

# Standardise every feature column (everything except the target).
# The original row index and column names are carried over so scaled_df stays
# label-aligned with df['Personal Loan'] instead of falling back to a 0..n-1 index.
# NOTE(review): the scaler is fitted on all rows before the train/test split that
# happens in a later cell, so test-set statistics influence the scaling.
features = df.drop(columns='Personal Loan')
scaled_df = pd.DataFrame(
    StandardScaler().fit_transform(features),
    index=features.index,
    columns=features.columns,
)

In [None]:
# Label the scaled features with the original column names (target excluded).
scaled_df.columns = df.columns.drop('Personal Loan')
scaled_df.head(10)

# 4. Modelling

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [None]:
# Feature matrix (scaled columns) and target vector.
x, y = scaled_df, df['Personal Loan']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Parallel lists filled by the model sections below: name, F1 score and accuracy per model.
model_list, model_f1_score, model_accuracy_score = [], [], []

## 4.1. Logistic Regression

In [None]:
# Logistic regression with default settings; its name is recorded for the result charts.
lm = LogisticRegression()
model_list.append('LogisticRegression')

In [None]:
# Fit on the training split.
lm.fit(x_train, y_train)

In [None]:
# Predicted labels for the held-out test features.
yhat_lm = lm.predict(x_test)

In [None]:
# F1 score on the test split, recorded for the comparison chart.
lm_score = f1_score(y_true=y_test, y_pred=yhat_lm)
model_f1_score.append(lm_score)
lm_score

In [None]:
# Accuracy on the test split, recorded for the comparison chart.
lm_accuracy = accuracy_score(y_true=y_test, y_pred=yhat_lm)
model_accuracy_score.append(lm_accuracy)
lm_accuracy

In [None]:
# Per-class precision / recall / F1 for the logistic regression predictions.
print(classification_report(y_test, yhat_lm))

## 4.2. Decision Tree Classifier

In [None]:
model_list.append('DecisionTreeClassifier')
# Fix the seed so the fitted tree (and therefore the reported scores) is reproducible
# across runs; the value matches the seed used for the train/test split.
tree = DecisionTreeClassifier(random_state=100)

In [None]:
# Fit on the training split.
tree.fit(x_train, y_train)

In [None]:
# Predicted labels for the held-out test features.
yhat_tree = tree.predict(x_test)

In [None]:
# F1 score on the test split, recorded for the comparison chart.
tree_score = f1_score(y_true=y_test, y_pred=yhat_tree)
model_f1_score.append(tree_score)
tree_score

In [None]:
# Accuracy on the test split, recorded for the comparison chart.
tree_accuracy = accuracy_score(y_true=y_test, y_pred=yhat_tree)
model_accuracy_score.append(tree_accuracy)
tree_accuracy

In [None]:
# Per-class precision / recall / F1 for the decision tree predictions.
print(classification_report(y_test, yhat_tree))

## 4.3. Random Forest Classifier

In [None]:
model_list.append('RandomForestClassifier')
# Fix the seed so the fitted forest (and therefore the reported scores) is reproducible
# across runs; the value matches the seed used for the train/test split.
forest = RandomForestClassifier(random_state=100)

In [None]:
# Fit on the training split.
forest.fit(x_train, y_train)

In [None]:
# Predicted labels for the held-out test features.
yhat_forest = forest.predict(x_test)

In [None]:
# F1 score on the test split, recorded for the comparison chart.
forest_score = f1_score(y_true=y_test, y_pred=yhat_forest)
model_f1_score.append(forest_score)
forest_score

In [None]:
# Accuracy on the test split, recorded for the comparison chart.
forest_accuracy = accuracy_score(y_true=y_test, y_pred=yhat_forest)
model_accuracy_score.append(forest_accuracy)
forest_accuracy

In [None]:
# Per-class precision / recall / F1 for the random forest predictions.
print(classification_report(y_test, yhat_forest))

## 4.4. SVC

In [None]:
# Support vector classifier with default settings; its name is recorded for the result charts.
svc = SVC()
model_list.append('SVC')

In [None]:
# Fit on the training split.
svc.fit(x_train, y_train)

In [None]:
# Predicted labels for the held-out test features.
yhat_svc = svc.predict(x_test)

In [None]:
# F1 score on the test split, recorded for the comparison chart.
svc_score = f1_score(y_true=y_test, y_pred=yhat_svc)
model_f1_score.append(svc_score)
svc_score

In [None]:
# Accuracy on the test split, recorded for the comparison chart.
svc_accuracy = accuracy_score(y_true=y_test, y_pred=yhat_svc)
model_accuracy_score.append(svc_accuracy)
svc_accuracy

In [None]:
# Per-class precision / recall / F1 for the SVC predictions.
print(classification_report(y_test, yhat_svc))

## 4.5. KNeighbors Classifier

In [None]:
# k-nearest-neighbours classifier with default settings; its name is recorded for the result charts.
neighbour = KNeighborsClassifier()
model_list.append('KNeighborsClassifier')

In [None]:
# Fit on the training split.
neighbour.fit(x_train,y_train)

In [None]:
# Predicted labels for the held-out test features.
yhat_neighbour = neighbour.predict(x_test)

In [None]:
# F1 score on the test split, recorded for the comparison chart.
neighbour_score = f1_score(y_true=y_test, y_pred=yhat_neighbour)
model_f1_score.append(neighbour_score)
neighbour_score

In [None]:
# Accuracy on the test split, recorded for the comparison chart.
neighbour_accuracy = accuracy_score(y_true=y_test, y_pred=yhat_neighbour)
model_accuracy_score.append(neighbour_accuracy)
neighbour_accuracy

In [None]:
# Per-class precision / recall / F1 for the k-nearest-neighbours predictions.
print(classification_report(y_test,yhat_neighbour))

# 5. Result

## 5.1. F1 Score

In [None]:
# Bar chart comparing the F1 score of every fitted model.
fig_12, ax = plt.subplots(figsize=(25,9))
sns.barplot(x=model_list, y=model_f1_score, palette='RdBu_r', ax=ax)
ax.set_title('Model F1 Score')
ax.set_ylabel('F1_Score')

# Write each bar's value (as a whole percentage) just above the bar.
# The bar coordinates use their own names so the notebook-level feature matrix `x`
# and target `y` are not overwritten by this cell.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    bar_x, bar_y = p.get_xy()
    ax.annotate('{:.0%}'.format(height), (bar_x+width*.45, bar_y+height*1.01))

## 5.2. Accuracy Score

In [None]:
# Bar chart comparing the accuracy of every fitted model.
fig_13, ax = plt.subplots(figsize=(25,9))
sns.barplot(x=model_list, y=model_accuracy_score, palette='RdBu_r', ax=ax)
ax.set_title('Model Accuracy')
ax.set_ylabel('Accuracy')

# Write each bar's value (as a whole percentage) just above the bar.
# The bar coordinates use their own names so the notebook-level feature matrix `x`
# and target `y` are not overwritten by this cell.
for p in ax.patches:
    width, height = p.get_width(), p.get_height()
    bar_x, bar_y = p.get_xy()
    ax.annotate('{:.0%}'.format(height), (bar_x+width*.45, bar_y+height*1.01))