In [1]:
import warnings
warnings.filterwarnings('ignore')
# data cleaning
import numpy as np
import pandas as pd
from collections import Counter
# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#data preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#data modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor

#model evaluation
from sklearn.metrics import precision_recall_fscore_support, classification_report
from sklearn.metrics import confusion_matrix,plot_confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, average_precision_score, f1_score, log_loss
from sklearn.metrics import roc_curve, auc, plot_roc_curve, roc_auc_score, plot_precision_recall_curve


**Read and inspect data**

In [2]:
# Load bank.csv (semicolon-separated) into a DataFrame.
df= pd.read_csv('bank.csv',sep= ";")

In [3]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [4]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
# The explicit astype keeps the column int64 without depending on replace()
# downcasting an object column by itself, and it fails loudly if any label
# other than 'no'/'yes' is still present. Re-running the cell is harmless:
# values that are already 0/1 pass through replace() unchanged.
df['y'] = df['y'].replace({'no': 0, 'yes': 1}).astype('int64')

In [5]:
# Convert month abbreviations to month numbers (jan -> 1 ... dec -> 12).
# The result is assigned back to the column rather than using
# df.month.replace(..., inplace=True): the in-place form acts on an
# intermediate Series and does not update df when pandas Copy-on-Write is
# active. astype makes the integer dtype explicit and raises if an
# unexpected month label is left unmapped.
month_to_number = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}
df['month'] = df['month'].replace(month_to_number).astype('int64')

In [6]:
# Column dtypes and non-null counts after encoding y and month.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   int64 
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   int64 
dtypes: int64(9), object(8)
memory usage: 600.6+ KB


In [7]:
# Number of missing values per column.
df.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [8]:
# Summary statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

Unnamed: 0,age,balance,day,month,duration,campaign,pdays,previous,y
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.17,1422.66,15.92,6.17,263.96,2.79,39.77,0.54,0.12
std,10.58,3009.64,8.25,2.38,259.86,3.11,100.12,1.69,0.32
min,19.0,-3313.0,1.0,1.0,4.0,1.0,-1.0,0.0,0.0
25%,33.0,69.0,9.0,5.0,104.0,1.0,-1.0,0.0,0.0
50%,39.0,444.0,16.0,6.0,185.0,2.0,-1.0,0.0,0.0
75%,49.0,1480.0,21.0,8.0,329.0,3.0,-1.0,0.0,0.0
max,87.0,71188.0,31.0,12.0,3025.0,50.0,871.0,25.0,1.0


In [9]:
# Summary (count / unique / top / freq) of the object-dtype columns.
df.describe(include='O')

Unnamed: 0,job,marital,education,default,housing,loan,contact,poutcome
count,4521,4521,4521,4521,4521,4521,4521,4521
unique,12,3,4,2,2,2,3,4
top,management,married,secondary,no,yes,no,cellular,unknown
freq,969,2797,2306,4445,2559,3830,2896,3705


In [10]:
# Class balance of the encoded target (0 = no, 1 = yes).
df['y'].value_counts()

0    4000
1     521
Name: y, dtype: int64

**1.Choose features & process features**

In [11]:
# Preview the data again now that y and month are numeric.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,10,79,1,-1,0,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,5,220,1,339,4,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,4,185,1,330,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,6,199,4,-1,0,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,5,226,1,-1,0,unknown,0


In [12]:
# Feature frame: every column of df except the target 'y'.
x = df.drop(columns=['y'])
x.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,10,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,5,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,4,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,6,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,5,226,1,-1,0,unknown


In [13]:
# Target vector: the encoded 'y' column (0/1).
y=df['y']

In [14]:
# One-hot encode the categorical feature columns; drop_first=True omits the
# first level of each so the indicator columns are not redundant.
x= pd.get_dummies(x,drop_first= True)

**2.Split data**

In [15]:
# Hold out 30% of the rows as a test set; random_state fixes the shuffle.
# NOTE(review): the split is not stratified although y is imbalanced
# (4000 vs 521) - consider stratify=y.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

**3.Train model**

In [16]:
# Baseline: a decision tree with default hyper-parameters.
# random_state is fixed because the tree permutes candidate features while
# searching for splits, so without a seed the fitted tree (and the scores
# below) can change between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

DecisionTreeClassifier()

**4.Evaluate model**

In [17]:
# Baseline tree scored on the data it was trained on.
print(
    classification_report(
        y_train,
        model.predict(x_train),
        target_names=['negative', 'positive'],
    )
)

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      2795
    positive       1.00      1.00      1.00       369

    accuracy                           1.00      3164
   macro avg       1.00      1.00      1.00      3164
weighted avg       1.00      1.00      1.00      3164



In [18]:
# Baseline tree scored on the held-out test split.
# A perfect report on the training data next to a much weaker positive-class
# score here indicates overfitting.
print(
    classification_report(
        y_test,
        model.predict(x_test),
        target_names=['negative', 'positive'],
    )
)

              precision    recall  f1-score   support

    negative       0.92      0.92      0.92      1205
    positive       0.36      0.36      0.36       152

    accuracy                           0.86      1357
   macro avg       0.64      0.64      0.64      1357
weighted avg       0.86      0.86      0.86      1357



**6.Optimizing Decision Tree Performance with pre-pruning**

*The maximum depth of the tree can be used as a control variable for pre-pruning. In the following example, a decision tree is fitted on the same data with max_depth=3.*

In [19]:
# Pre-pruned tree: entropy criterion, depth capped at 3.
# random_state is fixed so that repeated runs build the same tree.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)

In [20]:
# Fit the depth-limited tree on the training split.
clf = clf.fit(x_train,y_train)

In [21]:
# Pre-pruned tree scored on the training split.
print(
    classification_report(
        y_train,
        clf.predict(x_train),
        target_names=['negative', 'positive'],
    )
)

              precision    recall  f1-score   support

    negative       0.92      0.97      0.95      2795
    positive       0.65      0.38      0.48       369

    accuracy                           0.90      3164
   macro avg       0.78      0.68      0.71      3164
weighted avg       0.89      0.90      0.89      3164



In [22]:
# Pre-pruned tree scored on the held-out test split.
print(
    classification_report(
        y_test,
        clf.predict(x_test),
        target_names=['negative', 'positive'],
    )
)

              precision    recall  f1-score   support

    negative       0.92      0.97      0.94      1205
    positive       0.57      0.34      0.43       152

    accuracy                           0.90      1357
   macro avg       0.74      0.65      0.68      1357
weighted avg       0.88      0.90      0.89      1357



**5.Visualization**

In [23]:
from IPython.display import Image
from sklearn import tree
import pydotplus

#https://dreampuf.github.io/GraphvizOnline/

In [24]:
# Export the fitted tree `model` as Graphviz DOT source.
# export_graphviz returns the DOT text only when out_file is None (it returns
# None when given a path), so the text is captured in dot_data first and then
# written to bank.txt explicitly. The file can be rendered with any Graphviz
# tool.
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this parameter.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=list(x.columns),
    class_names=['no', 'yes'],
)
with open('bank.txt', 'w', encoding='utf-8') as dot_file:
    dot_file.write(dot_data)