## Decision Tree Model Approach

In [1]:
# Update sklearn to prevent version mismatches
#!pip install sklearn --upgrade

### Read the CSV and Perform Basic Data Cleaning

In [2]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pickle
import os

In [3]:
# Load libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler,LabelEncoder
from sklearn.feature_selection import VarianceThreshold,SelectFromModel

In [4]:
# File path 
data_feature_file = os.path.join("","data","featureData","feature_dataframe.csv")
model_result = os.path.join("","data","results","DecisionTree.csv")

In [5]:
# Read data
df = pd.read_csv(data_feature_file)

In [6]:
# Display Sample data
df.head()

Unnamed: 0,age,job,education,contact,duration,campaign,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,...,jun,mar,may,nov,oct,sep,mon,thu,tue,wed
0,56,4,2,0,261,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
1,57,7,5,0,149,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
2,37,7,5,0,226,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
3,40,10,3,0,151,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0
4,56,7,5,0,307,1,1.1,93.994,-36.4,4.857,...,0,0,1,0,0,0,1,0,0,0


In [7]:
# Check data size
df.shape

(41188, 36)

In [8]:
# Let's seperate dependent and independent variables
X= df.drop(columns = ['y'])
y = df['y']

In [9]:
# Get feature list
feature_name = X.columns.tolist()

In [10]:
# Shape of independent variable or features
X.shape

(41188, 35)

### Pre-processing model performance

In [11]:
# Split data into train and test sets and check the size of the train data.
# Using an 80/20 split with random_state=50 so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=50, test_size=0.2)

# Print shape of train data
print(X_train.shape)

(32950, 35)


Decision trees and tree-based ensemble methods do not require feature scaling, as they are not sensitive to the variance in the data.

In [12]:
# Build Model
# A fixed random_state makes the fitted tree, and therefore the scores reported
# below, repeatable across runs (ties between equally good splits are otherwise
# broken randomly).
base_model = DecisionTreeClassifier(criterion='gini',max_depth=12,random_state=50)

In [13]:
# Train model
base_model.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=12)

In [14]:
# Predict 
pred_base = base_model.predict(X_test)

In [15]:
# Get train and test scores
training_score = round(base_model.score(X_train, y_train)*100,3)
test_score = round(accuracy_score(y_test, pred_base)*100,3)

In [16]:
# Evaluate predictions
print(accuracy_score(y_test, pred_base))
print(confusion_matrix(y_test, pred_base))
print(classification_report(y_test, pred_base))

0.8973051711580481
[[6892  396]
 [ 450  500]]
              precision    recall  f1-score   support

           0       0.94      0.95      0.94      7288
           1       0.56      0.53      0.54       950

    accuracy                           0.90      8238
   macro avg       0.75      0.74      0.74      8238
weighted avg       0.89      0.90      0.90      8238



In [17]:
print(f"Training Data Score: {training_score} %")
print(f"Testing Data Score: {test_score} %")

Training Data Score: 95.53 %
Testing Data Score: 89.731 %


#### Training (95.53%) and test (89.73%) accuracy do not match. The F1 score for class 1 (0.54) is far below the F1 score for class 0 (0.94), which means the model is biased towards class 0. In short, the model in its current state will not give the desired results.

In [18]:
# Let's use different methods to handle imbalanced data and test model results

In [19]:
from sklearn.metrics import classification_report
def decisionTree(X_train, y_train, X_test, y_test, max_depth=12, random_state=None):
    """Fit a gini decision tree and print its accuracy and classification report.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Held-out features and labels used for scoring and for the report.
    max_depth : int, default 12
        Depth limit forwarded to ``DecisionTreeClassifier``.
    random_state : int or None, default None
        Seed forwarded to ``DecisionTreeClassifier``. ``None`` keeps the
        original unseeded behaviour; pass an int for repeatable fits.

    Returns
    -------
    DecisionTreeClassifier
        The fitted model, so callers can inspect or reuse it. Existing callers
        that ignore the return value are unaffected.
    """
    # Build and fit the model
    model = DecisionTreeClassifier(criterion='gini', max_depth=max_depth,
                                   random_state=random_state)
    model.fit(X_train, y_train)

    # Score each split exactly once; the test accuracy is reused for both the
    # rounded percentage and the raw "Accuracy" line.
    train_accuracy = model.score(X_train, y_train)
    accuracy = model.score(X_test, y_test)

    print("\nTraining model score: ", round(train_accuracy*100, 3))
    print("Testing model score: ", round(accuracy*100, 3))
    print("\nAccuracy", accuracy, "\n")

    # Per-class precision / recall / F1 on the held-out data
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))

    return model

In [20]:
# Get Model results
def getModelResult(X,y):
    """Hold out 20% of ``(X, y)`` and print decision-tree results for that split."""
    # stratify=y keeps the class proportions of y the same in the train and test parts
    split = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)
    features_train, features_test, target_train, target_test = split

    # Fit on the training part, report on the held-out part
    decisionTree(features_train, target_train, features_test, target_test)

In [21]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.inspection import permutation_importance
import shap

def featureSelection(X,y,method):
    """Pick features with a tree-based importance measure, then evaluate the model.

    A ``DecisionTreeRegressor`` is fitted on the training part of a stratified
    80/20 split of ``(X, y)``. One of three importance measures then decides
    which columns of ``X`` to keep, and the reduced ``X`` is handed to
    ``getModelResult``, which prints the scores.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; columns are selected by name via ``X.columns``.
    y
        Target labels aligned with ``X``.
    method : {1, 2, 3}
        1 -- keep columns whose tree ``feature_importances_`` exceed 0.001.
        2 -- keep columns whose mean permutation importance on the held-out
             part is above 0.
        3 -- keep columns whose mean absolute SHAP value on the held-out part
             exceeds 0.002, ordered from most to least important.

    Raises
    ------
    ValueError
        If ``method`` is not 1, 2 or 3. Previously an unknown value fitted the
        tree and then silently did nothing.
    """
    # Validate before doing any work so a typo fails fast and loudly
    if method not in (1, 2, 3):
        raise ValueError(f"method must be 1, 2 or 3, got {method!r}")

    # Split data into train-test -- stratify=y keeps the class proportions of y in both parts
    # NOTE(review): getModelResult re-splits the data with a different seed, so
    # rows used here to rank features can land in either of its splits.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50, stratify=y)

    # Fit the helper tree whose importances drive the selection
    dt = DecisionTreeRegressor()
    dt.fit(X_train, y_train)

    if method == 1:
        # Impurity-based importances taken straight from the fitted tree
        X = X[X.columns[dt.feature_importances_>0.001]]
        print("\n-- Model performace after Decision Tree Regressor feature selection -- ")

    elif method == 2:
        # Permutation importance measured on the held-out part
        perm_importance = permutation_importance(dt, X_test, y_test)
        X = X[X.columns[perm_importance.importances_mean>0]]
        print("\n-- Model performace after Permutation feature selection -- ")

    else:
        # Mean absolute SHAP value per feature on the held-out part
        explainer = shap.TreeExplainer(dt)
        shap_values = explainer.shap_values(X_test)
        vals = np.abs(shap_values).mean(0)

        feature_importance = pd.DataFrame(list(zip(X.columns,vals)),columns=['col_name','feature_importance_vals'])
        feature_importance = feature_importance.sort_values(by=['feature_importance_vals'],ascending=False)
        feature_importance = feature_importance[feature_importance.feature_importance_vals > 0.002]

        # Keep the selected columns in importance order
        X = X[feature_importance.col_name]
        print("\n -- Model performace after SHAP feature selection -- ")

    # Evaluate the model on the reduced feature set
    getModelResult(X,y)

#### Method 1: Undersampling

In [22]:
# Class count -- look each class up by its label. value_counts() orders its
# result by frequency, so unpacking it positionally would silently swap the two
# counts if class 1 ever outnumbered class 0.
class_counts = df['y'].value_counts()
count_class_0 = int(class_counts.loc[0])
count_class_1 = int(class_counts.loc[1])

# Divide by class
df_class_0 = df[df['y'] == 0]
df_class_1 = df[df['y'] == 1]

In [23]:
# Undersample 0-class and concat the DataFrames of both classes.
# random_state pins which class-0 rows are drawn, so re-running the notebook
# gives the same undersampled data set.
df_class_0_under = df_class_0.sample(count_class_1, random_state=50)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

print('Random under-sampling:')
print(df_test_under.y.value_counts())

Random under-sampling:
1    4640
0    4640
Name: y, dtype: int64


In [24]:
# Get features and dependent variable data 
X = df_test_under.drop('y',axis='columns')
y = df_test_under['y']

In [25]:
# Get results witout feature selection
print("-- Model performace without feature selection -- ")
getModelResult(X,y)

-- Model performace without feature selection -- 

Training model score:  96.538
Testing model score:  85.614

Accuracy 0.8561422413793104 

              precision    recall  f1-score   support

           0       0.86      0.85      0.85       928
           1       0.85      0.87      0.86       928

    accuracy                           0.86      1856
   macro avg       0.86      0.86      0.86      1856
weighted avg       0.86      0.86      0.86      1856



#### F1 score is improved compared to the base model. Training results and test results are still not comparable. 

In [26]:
# Get Model performance for undersampled data using feature selection
featureSelection(X,y,1)
featureSelection(X,y,2)
featureSelection(X,y,3)


-- Model performace after Decision Tree Regressor feature selection -- 

Training model score:  96.525
Testing model score:  84.806

Accuracy 0.8480603448275862 

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       928
           1       0.85      0.85      0.85       928

    accuracy                           0.85      1856
   macro avg       0.85      0.85      0.85      1856
weighted avg       0.85      0.85      0.85      1856


-- Model performace after Permutation feature selection -- 

Training model score:  96.404
Testing model score:  85.399

Accuracy 0.8539870689655172 

              precision    recall  f1-score   support

           0       0.86      0.84      0.85       928
           1       0.85      0.86      0.86       928

    accuracy                           0.85      1856
   macro avg       0.85      0.85      0.85      1856
weighted avg       0.85      0.85      0.85      1856


 -- Model performace after SH

#### Training and test accuracy with undersampled data and feature selection are good, but they are still not close enough to each other. F1 scores for the two classes are comparable. 

#### Method 2: Over Sampling

In [27]:
# Oversample 1-class (drawing with replacement) and concat the DataFrames of both classes.
# random_state pins which class-1 rows are repeated, so re-running the notebook
# gives the same oversampled data set.
df_class_1_over = df_class_1.sample(count_class_0, replace=True, random_state=50)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.y.value_counts())

Random over-sampling:
1    36548
0    36548
Name: y, dtype: int64


In [28]:
# Get features and dependent variable data 
X = df_test_over.drop('y',axis='columns')
y = df_test_over['y']

In [29]:
# Get model results for oversampled data
print("-- Model performace without feature selection -- ")
getModelResult(X,y)

-- Model performace without feature selection -- 

Training model score:  94.045
Testing model score:  92.606

Accuracy 0.9260601915184679 

              precision    recall  f1-score   support

           0       0.97      0.88      0.92      7310
           1       0.89      0.97      0.93      7310

    accuracy                           0.93     14620
   macro avg       0.93      0.93      0.93     14620
weighted avg       0.93      0.93      0.93     14620



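#### F1 score is improved significantly compared to the base model. With oversampling and without feature selection, training (94.045%) and test (92.606%) accuracy are close. Note that the minority rows were duplicated before the train/test split, so copies of the same row can appear in both sets and the test score is likely optimistic. 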

In [30]:
# Get Model performance for oversampled data using feature selection
featureSelection(X,y,1)
featureSelection(X,y,2)
featureSelection(X,y,3)


-- Model performace after Decision Tree Regressor feature selection -- 

Training model score:  94.045
Testing model score:  92.681

Accuracy 0.926812585499316 

              precision    recall  f1-score   support

           0       0.97      0.88      0.92      7310
           1       0.89      0.98      0.93      7310

    accuracy                           0.93     14620
   macro avg       0.93      0.93      0.93     14620
weighted avg       0.93      0.93      0.93     14620


-- Model performace after Permutation feature selection -- 

Training model score:  94.051
Testing model score:  92.647

Accuracy 0.9264705882352942 

              precision    recall  f1-score   support

           0       0.97      0.88      0.92      7310
           1       0.89      0.98      0.93      7310

    accuracy                           0.93     14620
   macro avg       0.93      0.93      0.93     14620
weighted avg       0.93      0.93      0.93     14620


 -- Model performace after SHA

#### F1 score is improved significantly compared to the base model. With oversampling and feature selection, training and test accuracy remain close (about 94.0% vs. 92.7%). 

### Method 3: SMOTE

In [31]:
X = df.drop('y',axis='columns')
y = df['y']

In [32]:
from imblearn.over_sampling import SMOTE

# Get smote object (seeded so the synthetic minority samples are repeatable)
smote = SMOTE(sampling_strategy='minority', random_state=50)

# Fit data -- fit_resample is the supported name; the older fit_sample alias
# was deprecated and has been removed from current imbalanced-learn releases.
X_smote, y_smote = smote.fit_resample(X, y)

# Check SMOTE results 
y_smote.value_counts()

1    36548
0    36548
Name: y, dtype: int64

In [33]:
# Get model results for oversampled data
print("-- Model performace without feature selection -- ")
getModelResult(X_smote, y_smote)

-- Model performace without feature selection -- 

Training model score:  95.964
Testing model score:  93.003

Accuracy 0.9300273597811217 

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      7310
           1       0.92      0.94      0.93      7310

    accuracy                           0.93     14620
   macro avg       0.93      0.93      0.93     14620
weighted avg       0.93      0.93      0.93     14620



#### F1 score is improved significantly compared to the base model. With SMOTE and without feature selection, training (95.964%) and test (93.003%) accuracy are reasonably close. 

In [34]:
# Get Model performance for SMOTE data using feature selection
featureSelection(X_smote, y_smote,1)
featureSelection(X_smote, y_smote,2)
featureSelection(X_smote, y_smote,3)


-- Model performace after Decision Tree Regressor feature selection -- 

Training model score:  95.968
Testing model score:  92.955

Accuracy 0.9295485636114911 

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      7310
           1       0.92      0.94      0.93      7310

    accuracy                           0.93     14620
   macro avg       0.93      0.93      0.93     14620
weighted avg       0.93      0.93      0.93     14620


-- Model performace after Permutation feature selection -- 

Training model score:  95.98
Testing model score:  93.071

Accuracy 0.9307113543091655 

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      7310
           1       0.92      0.94      0.93      7310

    accuracy                           0.93     14620
   macro avg       0.93      0.93      0.93     14620
weighted avg       0.93      0.93      0.93     14620


 -- Model performace after SHA

In [35]:
# Best model test score
test_scoreSelect = 93.071

#### F1 score is improved significantly compared to the base model. SMOTE combined with permutation-based feature selection gave the highest test accuracy so far (93.071%). 

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [36]:
# Let's use smote data for grid search
len(X_smote),len(y_smote)

(73096, 73096)

In [37]:
def featureCSVSelection(X,y):
    """Keep the features whose mean |SHAP| exceeds 0.002, then run the grid-search evaluation."""
    # 80/20 split with class proportions preserved (stratify=y); it is only
    # used here to fit and explain the helper tree.
    parts = train_test_split(X, y, test_size=0.2, random_state=50, stratify=y)
    X_train, X_test, y_train, y_test = parts

    # Helper regressor whose SHAP values rank the features
    tree = DecisionTreeRegressor()
    tree.fit(X_train, y_train)

    shap_values = shap.TreeExplainer(tree).shap_values(X_test)
    mean_abs_shap = np.abs(shap_values).mean(0)

    # One row per feature, most important first, thresholded at 0.002
    ranking = pd.DataFrame(
        list(zip(X.columns, mean_abs_shap)),
        columns=['col_name','feature_importance_vals'],
    )
    ranking = ranking.sort_values(by=['feature_importance_vals'], ascending=False)
    ranking = ranking[ranking.feature_importance_vals > 0.002]

    # Restrict X to the selected columns, in importance order
    X = X[ranking.col_name]

    # Tune and evaluate on the reduced feature set
    print("\n -- Model performace after SHAP feature selection -- ")
    getCVModelResult(X,y)

In [38]:
# Use grid search to tune the model
from sklearn.model_selection import GridSearchCV

def getCVModelResult(X,y):
    """Tune a decision tree with ``GridSearchCV`` and print its evaluation.

    ``(X, y)`` is split 80/20 with ``stratify=y``. A 5-fold grid search over
    ``max_leaf_nodes``, ``criterion`` and ``max_depth`` is fitted on the
    training part, then train/test accuracy and a classification report for
    the held-out part are printed.

    Parameters
    ----------
    X
        Feature table.
    y
        Target labels aligned with ``X``.

    Returns
    -------
    GridSearchCV
        The fitted search object, so the caller can read ``best_params_`` and
        ``best_estimator_``. Existing callers that ignore the return value are
        unaffected.
    """
    # Split data into train-test -- stratify=y keeps the class proportions of y in both parts
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

    # Search space: 98 leaf limits x 2 criteria x 6 depths = 1176 candidates
    params = {'max_leaf_nodes': list(range(2, 100)), 'min_samples_split': [2], 
          'criterion': ['gini', 'entropy'], 'max_depth': [2,4,6,8,10,12]}
    grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=50), params, verbose=1, cv=5)

    # Fit model
    grid_search_cv.fit(X_train, y_train)

    # Score each split exactly once; the test accuracy is reused for both the
    # rounded percentage and the raw "Accuracy" line.
    train_accuracy = grid_search_cv.score(X_train, y_train)
    accuracy = grid_search_cv.score(X_test, y_test)

    print("\nTraining model score: ", round(train_accuracy*100, 3))
    print("Testing model score: ", round(accuracy*100, 3))
    print("\nAccuracy", accuracy, "\n")

    # Per-class precision / recall / F1 on the held-out data
    y_pred = grid_search_cv.predict(X_test)
    print(classification_report(y_test, y_pred))

    return grid_search_cv

In [39]:
# Get Model performance for SMOTE data using feature selection
featureCSVSelection(X_smote, y_smote)


 -- Model performace after SHAP feature selection -- 
Fitting 5 folds for each of 1176 candidates, totalling 5880 fits

Training model score:  92.231
Testing model score:  92.025

Accuracy 0.9202462380300958 

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      7310
           1       0.91      0.93      0.92      7310

    accuracy                           0.92     14620
   macro avg       0.92      0.92      0.92     14620
weighted avg       0.92      0.92      0.92     14620



In [40]:
# Test model score is the same regardless of the feature selection method
test_scoreTuned = 92.025

#### GridSearch gave us the best model: overall accuracy is 92%, the F1 score is 92% for each class, and training (92.231%) and test (92.025%) accuracy nearly match.

### Model Evaluation

In [41]:
# Save results in csv file

In [42]:
# Collect the test accuracy of each modelling stage into one table
evaluations = {'': ['Base Model', 'Select Features Model', 'Tuned Model'],
               'Accuracy': [f"{test_score}%", f"{test_scoreSelect}%", f"{test_scoreTuned}%"]}

evaluations_df = pd.DataFrame(evaluations)
evaluations_df = evaluations_df.set_index('')

# to_csv does not create missing folders, so make sure the results directory
# exists before writing (a no-op when it is already there).
results_dir = os.path.dirname(model_result)
if results_dir:
    os.makedirs(results_dir, exist_ok=True)

evaluations_df.to_csv(model_result)
evaluations_df

Unnamed: 0,Accuracy
,
Base Model,89.731%
Select Features Model,93.071%
Tuned Model,92.025%


### Summary: 
-  The Decision Tree model tuned with GridSearch gave the best results for each class, with an accuracy of 92.025%.  