# Using Logistic Regression to predict whether a client will deposit cash in the future

In [40]:
# Importing Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# ML libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [2]:
bank = pd.read_csv('http://bit.ly/BankingDataset')
bank

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,blue-collar,married,basic.4y,unknown,yes,no,cellular,aug,thu,...,1,999,0,nonexistent,1.4,93.444,-36.1,4.963,5228.1,0
1,53,technician,married,unknown,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-0.1,93.200,-42.0,4.021,5195.8,0
2,28,management,single,university.degree,no,yes,no,cellular,jun,thu,...,3,6,2,success,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,services,married,high.school,no,no,no,cellular,apr,fri,...,2,999,0,nonexistent,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,retired,married,basic.4y,no,yes,no,cellular,aug,fri,...,1,3,1,success,-2.9,92.201,-31.4,0.869,5076.2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,59,retired,married,high.school,unknown,no,yes,telephone,jun,thu,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.866,5228.1,0
41184,31,housemaid,married,basic.4y,unknown,no,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.860,5191.0,0
41185,42,admin.,single,university.degree,unknown,yes,yes,telephone,may,wed,...,3,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
41186,48,technician,married,professional.course,no,no,yes,telephone,oct,tue,...,2,999,0,nonexistent,-3.4,92.431,-26.9,0.742,5017.5,0


## Data Cleaning

In [5]:
bank.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx',
       'cons_conf_idx', 'euribor3m', 'nr_employed', 'y'],
      dtype='object')

-> Columns to be used to predict are
- age
- job
- education
- housing
- loan
- marital status

In [6]:
# Cleaning

# Null values
bank.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp_var_rate      0
cons_price_idx    0
cons_conf_idx     0
euribor3m         0
nr_employed       0
y                 0
dtype: int64

In [7]:
bank.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp_var_rate      float64
cons_price_idx    float64
cons_conf_idx     float64
euribor3m         float64
nr_employed       float64
y                   int64
dtype: object

In [13]:
df = bank[['age', 'job', 'marital', 'education', 'housing', 'loan', 'y']]
df

Unnamed: 0,age,job,marital,education,housing,loan,y
0,44,blue-collar,married,basic.4y,yes,no,0
1,53,technician,married,unknown,no,no,0
2,28,management,single,university.degree,yes,no,1
3,39,services,married,high.school,no,no,0
4,55,retired,married,basic.4y,yes,no,1
...,...,...,...,...,...,...,...
41183,59,retired,married,high.school,no,yes,0
41184,31,housemaid,married,basic.4y,no,no,0
41185,42,admin.,single,university.degree,yes,yes,0
41186,48,technician,married,professional.course,no,yes,0


In [14]:
# Changing the categorical columns to ordinal

# Import Encoder library

from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder()

In [16]:
# Job
df['job_encoded'] = encoder.fit_transform(df[['job']])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['job_encoded'] = encoder.fit_transform(df[['job']])


Unnamed: 0,age,job,marital,education,housing,loan,y,job_encoded
0,44,blue-collar,married,basic.4y,yes,no,0,1.0
1,53,technician,married,unknown,no,no,0,9.0
2,28,management,single,university.degree,yes,no,1,4.0
3,39,services,married,high.school,no,no,0,7.0
4,55,retired,married,basic.4y,yes,no,1,5.0
...,...,...,...,...,...,...,...,...
41183,59,retired,married,high.school,no,yes,0,5.0
41184,31,housemaid,married,basic.4y,no,no,0,3.0
41185,42,admin.,single,university.degree,yes,yes,0,0.0
41186,48,technician,married,professional.course,no,yes,0,9.0


In [19]:
# Marital status
def encoderr(column):
    name = input('Enter col name : ')
    df[name] = encoder.fit_transform(df[[column]])
    return df

In [20]:
# marital
encoderr('marital')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = encoder.fit_transform(df[[column]])


Unnamed: 0,age,job,marital,education,housing,loan,y,job_encoded,marital_encoded
0,44,blue-collar,married,basic.4y,yes,no,0,1.0,1.0
1,53,technician,married,unknown,no,no,0,9.0,1.0
2,28,management,single,university.degree,yes,no,1,4.0,2.0
3,39,services,married,high.school,no,no,0,7.0,1.0
4,55,retired,married,basic.4y,yes,no,1,5.0,1.0
...,...,...,...,...,...,...,...,...,...
41183,59,retired,married,high.school,no,yes,0,5.0,1.0
41184,31,housemaid,married,basic.4y,no,no,0,3.0,1.0
41185,42,admin.,single,university.degree,yes,yes,0,0.0,2.0
41186,48,technician,married,professional.course,no,yes,0,9.0,1.0


In [21]:
# education
encoderr('education')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = encoder.fit_transform(df[[column]])


Unnamed: 0,age,job,marital,education,housing,loan,y,job_encoded,marital_encoded,education_encoded
0,44,blue-collar,married,basic.4y,yes,no,0,1.0,1.0,0.0
1,53,technician,married,unknown,no,no,0,9.0,1.0,7.0
2,28,management,single,university.degree,yes,no,1,4.0,2.0,6.0
3,39,services,married,high.school,no,no,0,7.0,1.0,3.0
4,55,retired,married,basic.4y,yes,no,1,5.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
41183,59,retired,married,high.school,no,yes,0,5.0,1.0,3.0
41184,31,housemaid,married,basic.4y,no,no,0,3.0,1.0,0.0
41185,42,admin.,single,university.degree,yes,yes,0,0.0,2.0,6.0
41186,48,technician,married,professional.course,no,yes,0,9.0,1.0,5.0


In [22]:
# Housing
encoderr('housing')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = encoder.fit_transform(df[[column]])


Unnamed: 0,age,job,marital,education,housing,loan,y,job_encoded,marital_encoded,education_encoded,housing_encoded
0,44,blue-collar,married,basic.4y,yes,no,0,1.0,1.0,0.0,2.0
1,53,technician,married,unknown,no,no,0,9.0,1.0,7.0,0.0
2,28,management,single,university.degree,yes,no,1,4.0,2.0,6.0,2.0
3,39,services,married,high.school,no,no,0,7.0,1.0,3.0,0.0
4,55,retired,married,basic.4y,yes,no,1,5.0,1.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...
41183,59,retired,married,high.school,no,yes,0,5.0,1.0,3.0,0.0
41184,31,housemaid,married,basic.4y,no,no,0,3.0,1.0,0.0,0.0
41185,42,admin.,single,university.degree,yes,yes,0,0.0,2.0,6.0,2.0
41186,48,technician,married,professional.course,no,yes,0,9.0,1.0,5.0,0.0


In [24]:
df['housing_encoded'].value_counts()

2.0    21576
0.0    18622
1.0      990
Name: housing_encoded, dtype: int64

In [25]:
#Loan
encoderr('loan')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[name] = encoder.fit_transform(df[[column]])


Unnamed: 0,age,job,marital,education,housing,loan,y,job_encoded,marital_encoded,education_encoded,housing_encoded,loan_encoded
0,44,blue-collar,married,basic.4y,yes,no,0,1.0,1.0,0.0,2.0,0.0
1,53,technician,married,unknown,no,no,0,9.0,1.0,7.0,0.0,0.0
2,28,management,single,university.degree,yes,no,1,4.0,2.0,6.0,2.0,0.0
3,39,services,married,high.school,no,no,0,7.0,1.0,3.0,0.0,0.0
4,55,retired,married,basic.4y,yes,no,1,5.0,1.0,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
41183,59,retired,married,high.school,no,yes,0,5.0,1.0,3.0,0.0,2.0
41184,31,housemaid,married,basic.4y,no,no,0,3.0,1.0,0.0,0.0,0.0
41185,42,admin.,single,university.degree,yes,yes,0,0.0,2.0,6.0,2.0,2.0
41186,48,technician,married,professional.course,no,yes,0,9.0,1.0,5.0,0.0,2.0


In [26]:
df['loan'].value_counts()

no         33950
yes         6248
unknown      990
Name: loan, dtype: int64

## Logistic Regression

In [27]:
# Separate the data into dependent variables and independent variables

df.columns

Index(['age', 'job', 'marital', 'education', 'housing', 'loan', 'y',
       'job_encoded', 'marital_encoded', 'education_encoded',
       'housing_encoded', 'loan_encoded'],
      dtype='object')

In [35]:
X = df[['age', 'job_encoded', 'marital_encoded', 'education_encoded', 'housing_encoded', 'loan_encoded']].values
y = df['y'].values

In [36]:
# Divide the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [37]:
# Train model
logistic_regressor = LogisticRegression()

logistic_regressor.fit(X_train, y_train)

LogisticRegression()

In [39]:
# Prediction
y_predict = logistic_regressor.predict(X_test)

df_predictions = pd.DataFrame({'OG Values' : y_test, 'Predicted Values' : y_predict})
df_predictions

Unnamed: 0,OG Values,Predicted Values
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
8233,0,0
8234,0,0
8235,0,0
8236,0,0


In [41]:
confusion_mat = confusion_matrix(y_test, y_predict)
confusion_mat

array([[7325,    0],
       [ 913,    0]])

In [None]:
# Our model predicted 7,300 values correctly and 900 wrong