In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/gdrive')
%cd / gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
[Errno 2] No such file or directory: '/ gdrive'
/content


In [7]:
trainfile = r'/gdrive/My Drive/Assignments/Assignment2/TargetMarketing/TRAIN.csv'
testfile = r'/gdrive/My Drive/Assignments/Assignment2/TargetMarketing/TEST.csv'

trainData = pd.read_csv(trainfile)
testData = pd.read_csv(testfile)

print(trainData.columns)


Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')


In [9]:
#hot encoding
trainData_copy = trainData.iloc[:, :-1]
testData_copy = testData.iloc[:, :-1]
categoricalFeatures = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

combined_Data = pd.concat([trainData_copy, testData_copy], keys= [0,1])
combined_Data = pd.get_dummies(combined_Data,columns = categoricalFeatures )

X_train = combined_Data.xs(0) 
X_test = combined_Data.xs(1)

Y_train = trainData["y"]
Y_test = testData["y"]

print(X_train.head())
print(Y_train.head())
print(X_test.head())
print(Y_test.head())

   age  balance  day  ...  poutcome_other  poutcome_success  poutcome_unknown
0   30     1787   19  ...               0                 0                 1
1   33     4789   11  ...               0                 0                 0
2   35     1350   16  ...               0                 0                 0
3   30     1476    3  ...               0                 0                 1
4   59        0    5  ...               0                 0                 1

[5 rows x 51 columns]
0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object
   age  balance  day  ...  poutcome_other  poutcome_success  poutcome_unknown
0   58     2143    5  ...               0                 0                 1
1   44       29    5  ...               0                 0                 1
2   33        2    5  ...               0                 0                 1
3   47     1506    5  ...               0                 0                 1
4   33        1    5  ...               0               

In [17]:
#normal prediction using Decision Tree Classifier 
clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)
X_train.copy()
clf_predict = clf.predict(X_test)
print("---Normal Decision Tree Classifier: ")
print("Classification Report")
print(confusion_matrix(Y_test, clf_predict))
print(classification_report(Y_test, clf_predict))

---Normal Decision Tree Classifier: 
Classification Report
[[37180  2742]
 [ 2564  2725]]
              precision    recall  f1-score   support

          no       0.94      0.93      0.93     39922
         yes       0.50      0.52      0.51      5289

    accuracy                           0.88     45211
   macro avg       0.72      0.72      0.72     45211
weighted avg       0.88      0.88      0.88     45211



In [18]:
#Hyperparameter Tuning
#Using random search for Decision tree classifier

parameters = {'min_samples_leaf': range(10,100,10), 'max_depth': range(5,30,5), 'criterion': ['gini', 'entropy']}
clf_random = RandomizedSearchCV(clf, parameters, n_iter =15 , cv = 5)
clf_random.fit(X_train, Y_train)
random_param = clf_random.best_params_
print(random_param)

#using best parameters obtained, we create decision tree classifier
clfr = DecisionTreeClassifier(**random_param)
clfr.fit(X_train, Y_train)
clfr_predict = clfr.predict(X_test)

print("Classification report for Decision tree aftee hypertuning using random search")
print(classification_report(Y_test, clfr_predict))


{'min_samples_leaf': 30, 'max_depth': 5, 'criterion': 'entropy'}
Classification report for Decision tree aftee hypertuning using random search
              precision    recall  f1-score   support

          no       0.91      0.98      0.94     39922
         yes       0.62      0.29      0.40      5289

    accuracy                           0.90     45211
   macro avg       0.77      0.64      0.67     45211
weighted avg       0.88      0.90      0.88     45211



In [19]:
#Using grid search for Decision tree classifier

clf_grid = GridSearchCV(clf, parameters, cv = 5)
clf_grid.fit(X_train, Y_train)
grid_param = clf_grid.best_params_
print(grid_param)

#using best parameters obtained, we create decision tree classifier
clfg = DecisionTreeClassifier(**grid_param)
clfg.fit(X_train, Y_train)
clfg_predict = clfg.predict(X_test)

print("Classification report for Decision tree aftee hypertuning using random search")
print(classification_report(Y_test, clfg_predict))

{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 30}
Classification report for Decision tree aftee hypertuning using random search
              precision    recall  f1-score   support

          no       0.91      0.98      0.94     39922
         yes       0.62      0.29      0.40      5289

    accuracy                           0.90     45211
   macro avg       0.77      0.64      0.67     45211
weighted avg       0.88      0.90      0.88     45211



In [20]:
#predicting using normal random forest trees 
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)
rfc_predict = rfc.predict(X_test)
print("Classification report for prediction using normal random forest trees:")
print(classification_report(Y_test, rfc_predict))

Classification report for prediction using normal random forest trees:
              precision    recall  f1-score   support

          no       0.92      0.98      0.95     39922
         yes       0.71      0.34      0.46      5289

    accuracy                           0.91     45211
   macro avg       0.82      0.66      0.70     45211
weighted avg       0.89      0.91      0.89     45211



In [22]:
#Hyperparameter tuning in Random Forest trees
#using randomized search

parametersRf = {'min_samples_leaf': range(10,100,10), 'max_depth': range(5,30,5), 'criterion': ['gini', 'entropy']}
rfc_random = RandomizedSearchCV(rfc, parametersRf, n_iter = 15, cv = 10)
rfc_random.fit(X_test, Y_test)
rfc_random_best_params = rfc_random.best_params_
print(rfc_random_best_params)

#using the best parameters obtained during randomized search for rfc
rfcr = RandomForestClassifier(**rfc_random_best_params)
rfcr.fit(X_train, Y_train)
rfcr_predict= rfcr.predict(X_test)
print("Classification report for random forest tree after hypertuning using random search")
print(classification_report(Y_test, rfcr_predict))

{'min_samples_leaf': 70, 'max_depth': 5, 'criterion': 'gini'}
Classification report for random forest tree after hypertuning using random search
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.89      0.00      0.01      5289

    accuracy                           0.88     45211
   macro avg       0.89      0.50      0.47     45211
weighted avg       0.88      0.88      0.83     45211



In [24]:
#Hyperparameter tuning in Random Forest trees
#using grid search

rfc_grid = GridSearchCV(rfc, parameters, cv = 5)
rfc_grid.fit(X_test, Y_test)
rfc_grid_best_params = rfc_grid.best_params_
print(rfc_grid_best_params)

#using the best parameters obtained during randomized search for rfc
rfcg = RandomForestClassifier(**rfc_grid_best_params)
rfcg.fit(X_train, Y_train)
rfcg_predict= rfcg.predict(X_test)
print("Classification report for random forest tree after hypertuning using grid search")
print(classification_report(Y_test, rfcg_predict))

{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 90}
Classification report for random forest tree after hypertuning using grid search
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.00      0.00      0.00      5289

    accuracy                           0.88     45211
   macro avg       0.44      0.50      0.47     45211
weighted avg       0.78      0.88      0.83     45211

