In [6]:
# importing the necessary packages
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics 

# importing packages to visualize the decision tree
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

# Reading in the csv file
data = pd.read_csv("final_data.csv", encoding = 'ISO-8859-1')
data.columns

Index(['Unnamed: 0.1', 'year', 'state', 'lp_amount', 'status', 'months',
       'pay_method_ACH', 'pay_method_Other', 'pay_method_credit card',
       'pay_method_paper check', 'region_central', 'region_north_east',
       'region_rocky', 'region_south', 'region_south_east', 'region_west',
       'status_Active', 'status_Decline', 'status_Returned',
       'status_Returned < 90 days', 'status_Returned_90', 'income',
       '18_months'],
      dtype='object')

In [7]:
# Dropping rows that will not be used in the logistic regression
data = data.drop(['Unnamed: 0', 'Unnamed: 0.1', 'months', 'state', 'status', 'pay_method_Other', 'lp_amount'], axis = 1)

In [8]:
# Splitting the dataset into the X and y variables
X = data.iloc[:,:-1]
y = data['18_months']

# Splitting the data using the train_test_split package
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019)

In [9]:
# Using the RandomOverSampler package to deal with the imbalanced dataset
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [11]:
# Create Decision Tree classifer
clf = DecisionTreeClassifier()

# Training the Decision Tree Classifer
clf = clf.fit(X_train_ros ,y_train_ros)

# Predicting the response for test dataset
y_pred = clf.predict(X_test)

# Creating the confusion matrix for the decision tree
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[22854,  2061],
       [ 1284,  1509]])

In [12]:
# Printing the accuracy, precision, recall, and f1 score for the decision tree
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.8792767431788653
Precision: 0.4226890756302521
Recall: 0.5402792696025779
F1 Score: 0.8856915063123052


In [13]:
# Printing out the classification report for the decision tree
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.92      0.93     24915
           1       0.42      0.54      0.47      2793

    accuracy                           0.88     27708
   macro avg       0.68      0.73      0.70     27708
weighted avg       0.89      0.88      0.89     27708



In [14]:
# Running the trained model on the training and test data to make sure the model is not overfitting

# Running accuracy on both sets
print("Train Accuracy:",metrics.accuracy_score(y_train, clf.predict(X_train)))
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Running precision on both sets
print("Train Precision:",metrics.precision_score(y_train, clf.predict(X_train)))
print("Test Precision:",metrics.precision_score(y_test, y_pred))

# Running recall on both sets
print("Train Recall:",metrics.recall_score(y_train, clf.predict(X_train)))
print("Test Recall:",metrics.recall_score(y_test, y_pred))

Train Accuracy: 0.9631991145651625
Test Accuracy: 0.8792767431788653
Train Precision: 0.7385981550133632
Test Precision: 0.4226890756302521
Train Recall: 0.9968582732138701
Test Recall: 0.5402792696025779


In [15]:
# Create Decision Tree classifer object to prune the tree
# Help to deal with overfitting of the model
clf_prune = DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Training the  Decision Tree Classifer
clf_prune = clf_prune.fit(X_train_ros,y_train_ros)

# Predict the response for test dataset
y_pred = clf_prune.predict(X_test)

# Creating the confusion matrix for the decision tree
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[19009,  5906],
       [  250,  2543]])

In [16]:
# Printing the accuracy, precision, recall, and f1 score for the data
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.7778258986574275
Precision: 0.30098236477689666
Recall: 0.9104905119942714
F1 Score: 0.8194917860956958


In [17]:
# Printing out the classification report for the decision tree
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86     24915
           1       0.30      0.91      0.45      2793

    accuracy                           0.78     27708
   macro avg       0.64      0.84      0.66     27708
weighted avg       0.92      0.78      0.82     27708



In [18]:
# Running the trained model on the training and test data to make sure the model is not overfitting

# Running accuracy on both sets
print("Train Accuracy:",metrics.accuracy_score(y_train, clf.predict(X_train)))
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Running precision on both sets
print("Train Precision:",metrics.precision_score(y_train, clf.predict(X_train)))
print("Test Precision:",metrics.precision_score(y_test, y_pred))

# Running recall on both sets
print("Train Recall:",metrics.recall_score(y_train, clf.predict(X_train)))
print("Test Recall:",metrics.recall_score(y_test, y_pred))

Train Accuracy: 0.9631991145651625
Test Accuracy: 0.7778258986574275
Train Precision: 0.7385981550133632
Test Precision: 0.30098236477689666
Train Recall: 0.9968582732138701
Test Recall: 0.9104905119942714
