## Output variable -> y
## y -> Whether the client has subscribed a term deposit or not 
## Binomial ("yes" or "no")

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix,accuracy_score

In [2]:
bank_data = pd.read_csv('bank-full.csv',sep=';')
bank_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [3]:
bank_data.shape

(45211, 17)

In [4]:
bank_data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [5]:
bank_data.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

## EDA

In [7]:
dummies = pd.get_dummies(bank_data,columns=["job","marital","education","default","housing","loan","month","poutcome","contact"])
dummies

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,...,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,contact_cellular,contact_telephone,contact_unknown
0,58,2143,5,261,1,-1,0,no,0,0,...,0,0,0,0,0,0,1,0,0,1
1,44,29,5,151,1,-1,0,no,0,0,...,0,0,0,0,0,0,1,0,0,1
2,33,2,5,76,1,-1,0,no,0,0,...,0,0,0,0,0,0,1,0,0,1
3,47,1506,5,92,1,-1,0,no,0,1,...,0,0,0,0,0,0,1,0,0,1
4,33,1,5,198,1,-1,0,no,0,0,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0,yes,0,0,...,1,0,0,0,0,0,1,1,0,0
45207,71,1729,17,456,2,-1,0,yes,0,0,...,1,0,0,0,0,0,1,1,0,0
45208,72,5715,17,1127,5,184,3,yes,0,0,...,1,0,0,0,0,1,0,1,0,0
45209,57,668,17,508,4,-1,0,no,0,1,...,1,0,0,0,0,0,1,0,1,0


In [8]:
merged = pd.concat([bank_data,dummies],axis='columns')
merged

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown,contact_cellular,contact_telephone,contact_unknown
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,1
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,1
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,0,0,0,0,0,0,1,0,0,1
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,0,0,0,0,0,0,1,0,0,1
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,...,1,0,0,0,0,0,1,1,0,0
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,...,1,0,0,0,0,0,1,1,0,0
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,...,1,0,0,0,0,1,0,1,0,0
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,...,1,0,0,0,0,0,1,0,1,0


In [9]:
merged.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y', 'age', 'balance', 'day', 'duration',
       'campaign', 'pdays', 'previous', 'y', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married',
       'marital_single', 'education_primary', 'education_secondary',
       'education_tertiary', 'education_unknown', 'default_no', 'default_yes',
       'housing_no', 'housing_yes', 'loan_no', 'loan_yes', 'month_apr',
       'month_aug', 'month_dec', 'month_feb', 'month_jan', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unkn

In [10]:
final = merged.drop(labels =["job","job_admin.","marital","marital_divorced","education","education_primary","default","default_no","housing_no","housing","loan","loan_no","month","month_apr","poutcome","poutcome_failure","contact","contact_cellular","y"], axis=1)
final

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,age.1,balance.1,day.1,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,contact_telephone,contact_unknown
0,58,2143,5,261,1,-1,0,58,2143,5,...,0,1,0,0,0,0,0,1,0,1
1,44,29,5,151,1,-1,0,44,29,5,...,0,1,0,0,0,0,0,1,0,1
2,33,2,5,76,1,-1,0,33,2,5,...,0,1,0,0,0,0,0,1,0,1
3,47,1506,5,92,1,-1,0,47,1506,5,...,0,1,0,0,0,0,0,1,0,1
4,33,1,5,198,1,-1,0,33,1,5,...,0,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0,51,825,17,...,0,0,1,0,0,0,0,1,0,0
45207,71,1729,17,456,2,-1,0,71,1729,17,...,0,0,1,0,0,0,0,1,0,0
45208,72,5715,17,1127,5,184,3,72,5715,17,...,0,0,1,0,0,0,1,0,0,0
45209,57,668,17,508,4,-1,0,57,668,17,...,0,0,1,0,0,0,0,1,1,0


In [12]:
final.dtypes

age                    int64
balance                int64
day                    int64
duration               int64
campaign               int64
pdays                  int64
previous               int64
age                    int64
balance                int64
day                    int64
duration               int64
campaign               int64
pdays                  int64
previous               int64
job_blue-collar        uint8
job_entrepreneur       uint8
job_housemaid          uint8
job_management         uint8
job_retired            uint8
job_self-employed      uint8
job_services           uint8
job_student            uint8
job_technician         uint8
job_unemployed         uint8
job_unknown            uint8
marital_married        uint8
marital_single         uint8
education_secondary    uint8
education_tertiary     uint8
education_unknown      uint8
default_yes            uint8
housing_yes            uint8
loan_yes               uint8
month_aug              uint8
month_dec     

In [13]:
X = final

In [14]:
X

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,age.1,balance.1,day.1,...,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_other,poutcome_success,poutcome_unknown,contact_telephone,contact_unknown
0,58,2143,5,261,1,-1,0,58,2143,5,...,0,1,0,0,0,0,0,1,0,1
1,44,29,5,151,1,-1,0,44,29,5,...,0,1,0,0,0,0,0,1,0,1
2,33,2,5,76,1,-1,0,33,2,5,...,0,1,0,0,0,0,0,1,0,1
3,47,1506,5,92,1,-1,0,47,1506,5,...,0,1,0,0,0,0,0,1,0,1
4,33,1,5,198,1,-1,0,33,1,5,...,0,1,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0,51,825,17,...,0,0,1,0,0,0,0,1,0,0
45207,71,1729,17,456,2,-1,0,71,1729,17,...,0,0,1,0,0,0,0,1,0,0
45208,72,5715,17,1127,5,184,3,72,5715,17,...,0,0,1,0,0,0,1,0,0,0
45209,57,668,17,508,4,-1,0,57,668,17,...,0,0,1,0,0,0,0,1,1,0


In [16]:
bank_data['y'] = np.where(bank_data['y'].str.contains("yes"), 1, 0)

In [17]:
y = bank_data['y'].values
y

array([0, 0, 0, ..., 1, 0, 0])

In [18]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [19]:
X_train.shape

(36168, 49)

In [20]:
X_test.shape

(9043, 49)

In [21]:
y_train.shape

(36168,)

In [22]:
y_test.shape

(9043,)

In [24]:
reg_model = LogisticRegression()

In [25]:
reg_model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [28]:
y_pred_test = reg_model.predict(X_test)
y_pred_test

array([0, 1, 0, ..., 0, 0, 0])

In [29]:
y_test

array([0, 1, 0, ..., 0, 1, 0])

In [33]:
confusion_matrix = confusion_matrix(y_test,y_pred_test)
confusion_matrix

array([[7857,  123],
       [ 893,  170]], dtype=int64)

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
accuracy_score(y_test,y_pred_test)

0.8876479044564857

In [36]:
# The model accuracy is calculated by (a+d)/(a+b+c+d)
(7857+170)/(7857+123+893+170)

0.8876479044564857

#### The accuracy of the model 88.7%

## Process to increase the accuracy 

In [None]:
#Scaling the data
