In [34]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [35]:
import warnings
warnings.filterwarnings('ignore')

In [36]:
df=pd.read_csv("bank-full.csv.crdownload",';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1.0,-1.0,0.0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1.0,-1.0,0.0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1.0,-1.0,0.0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1.0,-1.0,0.0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1.0,-1.0,0.0,unknown,no


In [37]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [38]:
df.shape

(3221, 17)

In [39]:
df['job'].value_counts()

blue-collar      1096
technician        458
management        434
admin.            418
services          360
retired           120
entrepreneur      103
unemployed         77
self-employed      75
student            37
housemaid          33
unknown            10
Name: job, dtype: int64

In [40]:
df['marital'].value_counts()

married     1974
single       824
divorced     423
Name: marital, dtype: int64

In [41]:
df['education'].value_counts()

secondary    1827
primary       637
tertiary      577
unknown       180
Name: education, dtype: int64

In [42]:
df['default'].value_counts()

no     3143
yes      78
Name: default, dtype: int64

In [43]:
df['contact'].value_counts()

unknown    3221
Name: contact, dtype: int64

In [44]:
df['day'].value_counts()

8     453
14    440
9     429
13    413
6     405
7     390
5     361
12    280
15     50
Name: day, dtype: int64

In [45]:
df['campaign'].value_counts()

1.0     1437
2.0      988
3.0      402
4.0      195
5.0       87
6.0       37
7.0       22
8.0       19
9.0       13
11.0       6
10.0       5
14.0       3
12.0       2
13.0       1
19.0       1
24.0       1
16.0       1
Name: campaign, dtype: int64

In [46]:
df['pdays'].value_counts()

-1.0    3220
Name: pdays, dtype: int64

In [47]:
df['previous'].value_counts()

0.0    3220
Name: previous, dtype: int64

In [48]:
df['poutcome'].value_counts()

unknown    3220
Name: poutcome, dtype: int64

In [49]:
df['balance'].value_counts()

 0       196
 1        20
 3        13
 4        11
 23       11
        ... 
 613       1
 1786      1
 1069      1
-176       1
 2515      1
Name: balance, Length: 1547, dtype: int64

In [50]:
df['balance'].describe()

count     3221.000000
mean       798.823657
std       2504.019805
min      -3313.000000
25%         42.000000
50%        245.000000
75%        697.000000
max      58544.000000
Name: balance, dtype: float64

In [51]:
df['balance']=pd.cut(df['balance'],bins=[-3313,0,10000,20000,30000,40000,50000,58544],labels=[0,1,2,3,4,5,6])
df['balance']

0       1
1       1
2       1
3       1
4       1
       ..
3216    1
3217    1
3218    1
3219    1
3220    1
Name: balance, Length: 3221, dtype: category
Categories (7, int64): [0 < 1 < 2 < 3 < 4 < 5 < 6]

In [52]:
df['balance'].value_counts()

1    2660
0     538
2      12
3       6
6       2
4       1
5       1
Name: balance, dtype: int64

In [53]:
df['duration'].describe()

count    3221.000000
mean      266.149643
std       256.169530
min         1.000000
25%       124.000000
50%       195.000000
75%       315.000000
max      3366.000000
Name: duration, dtype: float64

In [54]:
df['duration']=pd.cut(df['duration'],bins=[1,500,1000,1500,2000,2500,3000,3500],labels=[0,1,2,3,4,5,6])
df['duration']

0         0
1         0
2         0
3         0
4         0
       ... 
3216      0
3217      0
3218      0
3219      0
3220    NaN
Name: duration, Length: 3221, dtype: category
Categories (7, int64): [0 < 1 < 2 < 3 < 4 < 5 < 6]

In [55]:
df['duration'].value_counts()

0    2881
1     262
2      51
3      18
4       7
6       1
5       0
Name: duration, dtype: int64

In [56]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      1
housing      0
loan         0
contact      0
day          0
month        0
duration     1
campaign     1
pdays        1
previous     1
poutcome     1
y            1
dtype: int64

In [57]:
df=df.drop(columns=['campaign','pdays','previous','poutcome','contact'],axis=1)

In [58]:
df=df.dropna(axis=0)

In [59]:

df=df[['y','age','job','marital','education','default','balance','housing','loan','day','month','duration']]

In [60]:
df

Unnamed: 0,y,age,job,marital,education,default,balance,housing,loan,day,month,duration
0,no,58,management,married,tertiary,no,1,yes,no,5,may,0
1,no,44,technician,single,secondary,no,1,yes,no,5,may,0
2,no,33,entrepreneur,married,secondary,no,1,yes,yes,5,may,0
3,no,47,blue-collar,married,unknown,no,1,yes,no,5,may,0
4,no,33,unknown,single,unknown,no,1,no,no,5,may,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3215,no,53,retired,married,tertiary,no,1,yes,no,15,may,0
3216,no,54,technician,married,tertiary,no,1,yes,yes,15,may,0
3217,no,46,admin.,divorced,secondary,no,1,yes,yes,15,may,0
3218,no,48,admin.,divorced,secondary,no,1,yes,no,15,may,0


In [61]:
bank=pd.get_dummies(df,columns=['job','marital','education','default','balance','housing','loan','day','month','duration'])

In [62]:
label_encoder = preprocessing.LabelEncoder()
bank['y']=label_encoder.fit_transform(bank['y'])

In [63]:
bank

Unnamed: 0,y,age,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,...,day_14,day_15,month_may,duration_0,duration_1,duration_2,duration_3,duration_4,duration_5,duration_6
0,0,58,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,0,0,0,0
1,0,44,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
2,0,33,0,0,1,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,0,47,0,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
4,0,33,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,0,53,0,0,0,0,0,1,0,0,...,0,1,1,1,0,0,0,0,0,0
3216,0,54,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
3217,0,46,1,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
3218,0,48,1,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0


In [64]:
X=bank.iloc[:,1:]
Y=bank.iloc[:,0]

In [65]:
X

Unnamed: 0,age,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,day_14,day_15,month_may,duration_0,duration_1,duration_2,duration_3,duration_4,duration_5,duration_6
0,58,0,0,0,0,1,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
1,44,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
2,33,0,0,1,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
3,47,0,1,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
4,33,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3215,53,0,0,0,0,0,1,0,0,0,...,0,1,1,1,0,0,0,0,0,0
3216,54,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
3217,46,1,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0
3218,48,1,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,0,0


In [66]:
Y

0       0
1       0
2       0
3       0
4       0
       ..
3215    0
3216    0
3217    0
3218    0
3219    0
Name: y, Length: 3219, dtype: int32

In [67]:
Y.value_counts()

0    3137
1      82
Name: y, dtype: int64

## Model Building

In [68]:
model=LogisticRegression()
model.fit(X,Y)

LogisticRegression()

In [69]:
model.intercept_

array([-0.86640985])

In [70]:
model.coef_

array([[ 2.09000944e-02,  4.61510986e-01,  3.08959725e-01,
         5.89116754e-01, -4.14863811e-01, -1.83219561e-01,
        -2.70490109e-01, -7.86053447e-01, -3.33973089e-01,
        -3.59285926e-01,  2.60842494e-01,  4.82118082e-01,
        -7.90529035e-02,  3.97146823e-01, -7.30296093e-01,
         8.75846484e-03,  7.38685274e-02, -2.54328277e-01,
         2.87985162e-01, -4.31916218e-01, -4.41481489e-01,
         1.17090684e-01, -2.66581430e-01, -1.42869679e-01,
        -4.43141561e-02, -2.00410482e-01, -3.23870746e-03,
         3.44710640e-01, -1.16869899e-02, -7.41701984e-02,
        -2.50220607e-01, -3.33868631e-01,  9.47782600e-03,
        -6.20768270e-01,  5.90891587e-02, -4.85559990e-01,
         2.38113036e-01,  1.44807755e-01, -1.00172785e+00,
         3.24134994e-01,  2.95010103e-01,  7.22510258e-01,
        -3.24390805e-01, -4.05856689e+00,  2.08404064e-01,
         1.35663014e+00,  1.54430812e+00,  7.36909896e-01,
         0.00000000e+00, -1.12076142e-01]])

In [71]:
Y_pred=model.predict(X)

In [72]:
Y_pred_df=pd.DataFrame({"Actaual_Y":Y,"Predicted_Y":Y_pred})

In [73]:
Y_pred_df

Unnamed: 0,Actaual_Y,Predicted_Y
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
3215,0,0
3216,0,0
3217,0,0
3218,0,0


In [74]:

confusion_matrix = confusion_matrix(Y,Y_pred)

In [75]:
confusion_matrix

array([[3128,    9],
       [  69,   13]], dtype=int64)

In [76]:
accuracy_score(Y,Y_pred)

0.9757688723205965

In [77]:
print(classification_report(Y,Y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      3137
           1       0.59      0.16      0.25        82

    accuracy                           0.98      3219
   macro avg       0.78      0.58      0.62      3219
weighted avg       0.97      0.98      0.97      3219

