In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import six
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

# Step 1 - Read the Data from the CSV

In [2]:
dummy_df = pd.read_csv("../Data/bank.csv",na_values='NA')
temp = dummy_df.columns.values
temp

array(['age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'duration',
       'campaign', 'pdays', 'previous', 'poutcome', 'deposit'],
      dtype=object)

In [3]:
dummy_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


# Step 2 - Data Pre-Processing

In [4]:
contact_df = dummy_df
contact_df.iloc[:,-1]

0        yes
1        yes
2        yes
3        yes
4        yes
        ... 
11157     no
11158     no
11159     no
11160     no
11161     no
Name: deposit, Length: 11162, dtype: object

In [5]:
def preprocessor(df):
    res_df = df.copy()
    le = preprocessing.LabelEncoder()
    
    res_df['job'] = le.fit_transform(res_df['job'])
    res_df['marital'] = le.fit_transform(res_df['marital'])
    res_df['education'] = le.fit_transform(res_df['education'])
    res_df['default'] = le.fit_transform(res_df['default'])
    res_df['housing'] = le.fit_transform(res_df['housing'])
    res_df['month'] = le.fit_transform(res_df['month'])
    res_df['loan'] = le.fit_transform(res_df['loan'])
    res_df['contact'] = le.fit_transform(res_df['contact'])
    res_df['day_of_week'] = le.fit_transform(res_df['day'])
    res_df['poutcome'] = le.fit_transform(res_df['poutcome'])
    res_df['deposit'] = le.fit_transform(res_df['deposit'])
    return res_df

In [6]:
encoded_df = preprocessor(contact_df)
encoded_df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit,day_of_week
0,59,0,1,1,0,2343,1,0,2,5,8,1042,1,-1,0,3,1,4
1,56,0,1,1,0,45,0,0,2,5,8,1467,1,-1,0,3,1,4
2,41,9,1,1,0,1270,1,0,2,5,8,1389,1,-1,0,3,1,4
3,55,7,1,1,0,2476,1,0,2,5,8,579,1,-1,0,3,1,4
4,54,0,1,2,0,184,0,0,2,5,8,673,2,-1,0,3,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11157,33,1,2,0,0,1,1,0,0,20,0,257,1,-1,0,3,0,19
11158,39,7,1,1,0,733,0,0,2,16,6,83,4,-1,0,3,0,15
11159,32,9,2,1,0,29,0,0,0,19,1,156,2,-1,0,3,0,18
11160,43,9,1,1,0,0,0,1,0,8,8,9,2,172,5,0,0,7


In [7]:

X = encoded_df.drop(['deposit'],axis=1).values # .values takes in matrix form, its not necesary to give axis=
y = encoded_df['deposit'].values
y

array([1, 1, 1, ..., 0, 0, 0])

# Step 3 - Split Train / Test Data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

# Step 4 - Build the Decision Tree Model

In [9]:
model = DecisionTreeClassifier(random_state=1,max_depth=10) # Significance of random state
model.fit(X_train,y_train)
# model_score = model.score(X_train,y_train)
# model_score1 = model.score(X_test,y_test)

DecisionTreeClassifier(max_depth=10, random_state=1)

In [10]:
print(len(X_test)) # 20% is test data

2233


In [11]:
# Prediction of job vs deposit
y_pred = model.predict_proba(X_test)[:,1] # in test data we are taking job column prediction
print(len(y_pred)) # prediction of prob of deposit for givn job in test data
print(y_pred)

2233
[0.90909091 0.         0.33333333 ... 0.25510204 0.         0.08      ]


# Step 5 - TUNING

In [12]:
# GridSearchCV

import time
from sklearn.model_selection import GridSearchCV

In [13]:
np.random.seed(42)

param_dist = {'max_depth': [2, 3, 4],
              
              'max_features': ['auto', 'sqrt', 'log2', None],
              'criterion': ['gini', 'entropy']}

cv_rf = GridSearchCV(model, cv = 10,
                     param_grid=param_dist, 
                     n_jobs = 3)

cv_rf.fit(X_train,y_train)

print('Best Parameters using grid search: \n', cv_rf.best_params_)


Best Parameters using grid search: 
 {'criterion': 'gini', 'max_depth': 4, 'max_features': None}


In [14]:
# Retrain model using GridsearchCV parameters
model = DecisionTreeClassifier(criterion='gini',
                               max_features=None,max_depth=4) # Significance of random state
model.fit(X_train,y_train)
# model_score = model.score(X_train,y_train)
# model_score1 = model.score(X_test,y_test)

DecisionTreeClassifier(max_depth=4)

In [15]:
# Prediction of job vs deposit
y_pred = model.predict_proba(X_test)[:,1] # in test data we are taking job column prediction
print(len(y_pred)) # prediction of prob of deposit for givn job in test data
print(y_pred)

2233
[0.83333333 0.03170732 0.3094984  ... 0.48125478 0.03092784 0.13023783]


In [16]:
## Ways to TUNE
# Scaling of data does not matter so Skipping that
# Done with Hyperparameter tuning
# 

# Step 6 - Performance Metrics

In [17]:
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_pred) # roc curve is fpr vs tpr
roc_auc_dt = auc(fpr_dt, tpr_dt) # area under the curve
roc_auc_dt # more the area better the prediction, meaning true positive are much more than false positives

0.8565118620875732

In [18]:
# predict() is used to predict the actual class (In your case one of 0 or 1).
# predict_proba() is used to predict the class probabilities

predictions = model.predict(X_test)
predictions

array([1, 0, 0, ..., 0, 0, 0])

In [19]:
# Model Accuracy
print (model.score(X_test, y_test))
y_actual_result = y_test[0]

0.7765338110165696


In [20]:
for i in range(len(predictions)):
    if(predictions[i] == 1):
        y_actual_result = np.vstack((y_actual_result, y_test[i]))
y_actual_result

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [1],
       [1],
       [1],
    