# WE04 UNIVERSAL BANK

## 1. Import the Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier 
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import summarytools
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

np.random.seed(1)

## 2. Load the data

In [2]:
# load data

bank=pd.read_csv("UniversalBank.csv")

## 3. Conduct initial exploration of the data

In [3]:
# look at the data

bank.head(10) 

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
5,6,37,13,29,92121,4,0.4,2,155,0,0,0,1,0
6,7,53,27,72,91711,2,1.5,2,0,0,0,0,1,0
7,8,50,24,22,93943,1,0.3,3,0,0,0,0,0,1
8,9,35,10,81,90089,3,0.6,2,104,0,0,0,1,0
9,10,34,9,180,93023,1,8.9,3,0,1,0,0,0,0


In [4]:
bank['CD Account'].value_counts()

0    4698
1     302
Name: CD Account, dtype: int64

In [5]:
# Summary of the data

bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [6]:
# statistical summary 

bank.describe()

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,2500.5,45.3384,20.1046,73.7742,93152.503,2.3964,1.937938,1.881,56.4988,0.096,0.1044,0.0604,0.5968,0.294
std,1443.520003,11.463166,11.467954,46.033729,2121.852197,1.147663,1.747659,0.839869,101.713802,0.294621,0.305809,0.23825,0.490589,0.455637
min,1.0,23.0,-3.0,8.0,9307.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1250.75,35.0,10.0,39.0,91911.0,1.0,0.7,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2500.5,45.0,20.0,64.0,93437.0,2.0,1.5,2.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,3750.25,55.0,30.0,98.0,94608.0,3.0,2.5,3.0,101.0,0.0,0.0,0.0,1.0,1.0
max,5000.0,67.0,43.0,224.0,96651.0,4.0,10.0,3.0,635.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Check the missing values by summing the total na's for each variable
bank.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [8]:
bank.isnull().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [9]:
# Dropping the ID and ZIP Code

bank.drop(columns=["ID"])
bank.drop(columns=["ZIP Code"])

Unnamed: 0,ID,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,3,0.5,2,0,0,0,0,1,0


In [10]:
# Perform encoding

Encode = pd.get_dummies(bank, columns = ['Education','Family'])
Encode

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,CCAvg,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard,Education_1,Education_2,Education_3,Family_1,Family_2,Family_3,Family_4
0,1,25,1,49,91107,1.6,0,0,1,0,0,0,1,0,0,0,0,0,1
1,2,45,19,34,90089,1.5,0,0,1,0,0,0,1,0,0,0,0,1,0
2,3,39,15,11,94720,1.0,0,0,0,0,0,0,1,0,0,1,0,0,0
3,4,35,9,100,94112,2.7,0,0,0,0,0,0,0,1,0,1,0,0,0
4,5,35,8,45,91330,1.0,0,0,0,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,29,3,40,92697,1.9,0,0,0,0,1,0,0,0,1,1,0,0,0
4996,4997,30,4,15,92037,0.4,85,0,0,0,1,0,1,0,0,0,0,0,1
4997,4998,63,39,24,93023,0.3,0,0,0,0,0,0,0,0,1,0,1,0,0
4998,4999,65,40,49,90034,0.5,0,0,0,0,1,0,0,1,0,0,0,1,0


## SPLITTING THE DATA

In [11]:
# split the data into validation and training set
# Considering as train=tr and test=tt

tr_df, tt_df = train_test_split(bank, test_size=0.3)
trgt = 'CD Account'
predictor = list(bank.columns)
predictor.remove(trgt)

In [12]:
# create a standard scaler and fit it to the train set of predictor

# Considering as train=tr and test=tt

scaler = preprocessing.StandardScaler()
cols_to_stdize = predictor
     
               
# Transform the predictors of training and validation sets
tr_df[cols_to_stdize] = scaler.fit_transform(tr_df[cols_to_stdize]) 
tt_df[cols_to_stdize] = scaler.transform(tt_df[cols_to_stdize]) 

X_tr = tr_df[predictor]
y_tr = tr_df[trgt] 
X_tt = tt_df[predictor]
y_tt= tt_df[trgt]

## Creating a dataframe

In [13]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

## Logistic Regression

## Logistic Regression using Random Search CV

In [14]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'max_iter':np.arange(500,1000),
    'penalty': ['None','l1','l2','elasticnet'],
    'solver':['saga','liblinear']
}

log_reg = LogisticRegression()
random_search = RandomizedSearchCV(estimator = log_reg, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = random_search.fit(X_tr, y_tr)

print(f"The best {score_measure} score is {random_search.best_score_}")
print(f"... with parameters: {random_search.best_params_}")\

bestRecall = random_search.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


1180 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
270 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/meghanaraju/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/meghanaraju/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/meghanaraju/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 457, in _check_solver
    raise ValueError(
ValueError: Only 'saga' solver supports elasticnet 

The best recall score is 0.6662790697674419
... with parameters: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 707}


 0.66627907 0.66627907        nan 0.66627907 0.66627907        nan
        nan 0.66627907 0.66627907 0.66627907        nan        nan
 0.66627907 0.66627907 0.66627907        nan 0.66627907        nan
        nan        nan 0.66627907 0.66627907 0.66627907 0.66627907
        nan        nan        nan 0.66627907 0.66627907 0.66627907
 0.66627907        nan        nan        nan 0.66627907 0.66627907
        nan 0.66627907 0.66627907        nan        nan 0.66627907
 0.66627907 0.66627907 0.66627907 0.66627907        nan 0.66627907
 0.66627907 0.66627907        nan        nan 0.66627907        nan
 0.66627907 0.66627907 0.66627907        nan        nan        nan
        nan 0.66627907 0.66627907        nan        nan        nan
 0.66627907        nan 0.66627907 0.66627907        nan        nan
 0.66627907        nan 0.66627907        nan        nan 0.66627907
 0.66627907 0.66627907        nan        nan 0.66627907 0.66627907
        nan        nan        nan        nan 0.66627907       

In [15]:
c_matrix = confusion_matrix(y_tt, random_search.predict(X_tt))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Logistic Regression", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

##  Logistic Regression Grid Search CV

In [16]:
score_measure = "recall"
kfolds = 5
max_iter = random_search.best_params_['max_iter']
penalty = random_search.best_params_['penalty']
solver = random_search.best_params_['solver']

param_grid = {
    'max_iter': np.arange(max_iter-5,max_iter+5),  
    'penalty': [penalty],
    'solver': [solver]
}

log_reg = LogisticRegression()
grid_search = GridSearchCV(estimator = log_reg, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_tr, y_tr)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecall = grid_search.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
The best recall score is 0.6662790697674419
... with parameters: {'max_iter': 702, 'penalty': 'l1', 'solver': 'liblinear'}


In [17]:
c_matrix = confusion_matrix(y_tt, grid_search.predict(X_tt))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Logistic Regression", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## SVM Random Search CV using linear, rbf and poly kernal

In [18]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'C': np.arange(1,25),   
    'gamma': ['scale','auto'],
    'kernel':['linear','rbf','poly']
}

svm_model = SVC()
random_search = RandomizedSearchCV(estimator = svm_model, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = random_search.fit(X_tr, y_tr)

print(f"The best {score_measure} score is {random_search.best_score_}")
print(f"... with parameters: {random_search.best_params_}")


bestRecall = random_search.best_estimator_


Fitting 5 folds for each of 144 candidates, totalling 720 fits




The best recall score is 0.6894291754756872
... with parameters: {'kernel': 'poly', 'gamma': 'scale', 'C': 15}


In [19]:
c_matrix = confusion_matrix(y_tt, random_search.predict(X_tt))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"SVM", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## SVM Grid Search CV using linear,rbf and poly kernal

In [20]:
score_measure = "recall"
kfolds = 5

C = random_search.best_params_['C']
gamma = random_search.best_params_['gamma']
kernel = random_search.best_params_['kernel']

param_grid = {
    'C': np.arange(C-2,C+2),  
    'gamma': [gamma],
    'kernel': [kernel]
    
}

svm_model = SVC()
grid_search = GridSearchCV(estimator = svm_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_tr, y_tr)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecall = grid_search.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
The best recall score is 0.6894291754756872
... with parameters: {'C': 15, 'gamma': 'scale', 'kernel': 'poly'}


In [21]:
c_matrix = confusion_matrix(y_tt, grid_search.predict(X_tt))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"SVM", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## Decision tree by using Random search

In [22]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,200),  
    'min_samples_leaf': np.arange(1,200),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 200), 
    'max_depth': np.arange(1,50), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
random_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = random_search.fit(X_tr, np.ravel(y_tr))

print(f"The best {score_measure} score is {random_search.best_score_}")
print(f"... with parameters: {random_search.best_params_}")

bestRecallTree = random_search.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best recall score is 0.7029598308668076
... with parameters: {'min_samples_split': 142, 'min_samples_leaf': 10, 'min_impurity_decrease': 0.0036, 'max_leaf_nodes': 54, 'max_depth': 5, 'criterion': 'gini'}


20 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/meghanaraju/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/meghanaraju/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 937, in fit
    super().fit(
  File "/Users/meghanaraju/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

 0.         0.48890063

In [23]:
c_matrix = confusion_matrix(y_tt, random_search.predict(X_tt))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Dtree_random", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

## Decision tree by using Grid search

In [24]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(30,36),  
    'min_samples_leaf': np.arange(6,12),
    'min_impurity_decrease': np.arange(0.0048, 0.0054, 0.0001),
    'max_leaf_nodes': np.arange(162,168), 
    'max_depth': np.arange(15,21), 
    'criterion': ['entropy'],
}

dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_tr, np.ravel(y_tr))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 5 folds for each of 9072 candidates, totalling 45360 fits
The best recall score is 0.6480972515856237
... with parameters: {'criterion': 'entropy', 'max_depth': 15, 'max_leaf_nodes': 162, 'min_impurity_decrease': 0.0048, 'min_samples_leaf': 6, 'min_samples_split': 30}


In [25]:
c_matrix = confusion_matrix(y_tt, grid_search.predict(X_tt))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Dtree_grid", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])

In [26]:
performance.sort_values(by=['Recall'])

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,SVM,0.961333,0.681159,0.566265,0.618421
0,SVM,0.961333,0.681159,0.566265,0.618421
0,Logistic Regression,0.978,1.0,0.60241,0.75188
0,Logistic Regression,0.978,1.0,0.60241,0.75188
0,Dtree_grid,0.976,0.912281,0.626506,0.742857
0,Dtree_random,0.957333,0.606742,0.650602,0.627907


1. Two common methods for hyperparameter optimization in machine learning are random search and grid search. Other algorithms, such    as the use of support vector machines (SVM) and Decision Trees,Logistic can be used with these techniques.

2. From the above results, I can find that based on "TEST" data interms of highest recall score, the "Decision tree random search      model" performs better with 65% than other models.

Based on below mentioned "TRAIN" data,  "Decision tree using random search model" has the greatest recall scoring 70%.

 1. The best recall score is 0.66 - Log Reg using Random Search
 2. The best recall score is 0.66 - Log Reg using Grid Search
 3. The best recall score is 0.68 - SVM (Linear, Poly, Rbf kernels) using Random Search
 4. The best recall score is 0.68 - SVM (Linear, Poly, Rbf kernels) using Grid Search
 5. The best recall score is 0.70 - Decision tree using Random Search
 6. The best recall score is 0.64 - Decision tree using Grid Search