# Decision Tree

In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

In [46]:
df = pd.read_csv("bank.csv")

In [47]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,0,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3,1
1,56,0,1,1,0,45,0,0,2,5,8,1467,1,-1,0,3,1
2,41,9,1,1,0,1270,1,0,2,5,8,1389,1,-1,0,3,1
3,55,7,1,1,0,2476,1,0,2,5,8,579,1,-1,0,3,1
4,54,0,1,2,0,184,0,0,2,5,8,673,2,-1,0,3,1


In [48]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [49]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=1)

In [50]:
# Model function
def create_model(model):
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test,y_pred))
    return model

In [51]:
# Baseline model
lr = LogisticRegression()
create_model(lr)

              precision    recall  f1-score   support

           0       0.80      0.82      0.81      1760
           1       0.80      0.78      0.79      1589

    accuracy                           0.80      3349
   macro avg       0.80      0.80      0.80      3349
weighted avg       0.80      0.80      0.80      3349



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [52]:
dt = DecisionTreeClassifier()
dt = create_model(dt)

              precision    recall  f1-score   support

           0       0.79      0.81      0.80      1760
           1       0.78      0.76      0.77      1589

    accuracy                           0.79      3349
   macro avg       0.78      0.78      0.78      3349
weighted avg       0.79      0.79      0.79      3349



In [53]:
#Training score
dt.score(X_train,y_train)

1.0

In [55]:
#Important features
dt.feature_importances_

array([0.07939872, 0.03349062, 0.01789308, 0.01319226, 0.00084141,
       0.08306421, 0.04299821, 0.00616217, 0.06276539, 0.08045543,
       0.09913988, 0.34984958, 0.02000531, 0.06093876, 0.01356017,
       0.0362448 ])

## Pruning

In [56]:
dt2 = DecisionTreeClassifier(max_depth=8)
create_model(dt2)

              precision    recall  f1-score   support

           0       0.82      0.83      0.83      1760
           1       0.81      0.80      0.81      1589

    accuracy                           0.82      3349
   macro avg       0.82      0.82      0.82      3349
weighted avg       0.82      0.82      0.82      3349



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [57]:
dt3 = DecisionTreeClassifier(min_samples_leaf=50)
dt3 = create_model(dt3)

              precision    recall  f1-score   support

           0       0.87      0.78      0.82      1760
           1       0.78      0.87      0.83      1589

    accuracy                           0.83      3349
   macro avg       0.83      0.83      0.83      3349
weighted avg       0.83      0.83      0.83      3349



In [58]:
dt4 = DecisionTreeClassifier(min_samples_leaf=50,criterion="entropy")
dt4 = create_model(dt4)

              precision    recall  f1-score   support

           0       0.84      0.82      0.83      1760
           1       0.81      0.82      0.81      1589

    accuracy                           0.82      3349
   macro avg       0.82      0.82      0.82      3349
weighted avg       0.82      0.82      0.82      3349



## Cross validation

In [59]:
from sklearn.model_selection import cross_val_score

In [60]:
m1 = cross_val_score(dt3,X,y,cv=4)

In [61]:
m1

array([0.75250716, 0.76523297, 0.80967742, 0.82258065])

In [62]:
m1.mean()

0.7874995506875764

In [63]:
m2 = cross_val_score(dt4,X,y,cv=4)

In [64]:
m2

array([0.74355301, 0.76164875, 0.81935484, 0.80967742])

In [65]:
m2.mean()

0.7835585030450545