In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics 
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus



In [2]:
data = pd.read_csv("final_data.csv", encoding = 'ISO-8859-1')
data.head()
data.columns

Index(['Unnamed: 0', 'year', 'state_0', 'state_1', 'state_2', 'state_3',
       'state_4', 'state_5', 'state_6', 'status_0', 'status_1', 'status_2',
       'status_3', 'status_4', 'months', 'pay_method_ACH', 'pay_method_Other',
       'pay_method_credit card', 'pay_method_paper check', 'income',
       'lp_amount', 'Y'],
      dtype='object')

In [3]:
# Creating a separate dataframe for the dropped columns that can't be used in a logistic regression
data_dropped = data[[ 'months']]

# Dropping rows that will not be used in the logistic regression
data = data.drop(['Unnamed: 0', 'months'], axis = 1)

# Entering the column names into a dataframe
col_names = ['year', 'state_0', 'state_1', 'state_2', 'state_3',
       'state_4', 'state_5', 'state_6', 'status_0', 'status_1', 'status_2',
       'status_3', 'status_4', 'pay_method_ACH', 'pay_method_Other',
       'pay_method_credit card', 'pay_method_paper check', 'income',
       'lp_amount']

In [4]:
# Splitting the dataset into the features and target variables
feature_cols = col_names[:-1]
X = data[feature_cols]
y = data['Y']

# Splitting the data using the train_test_split sklearn package
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019)

In [5]:
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [6]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train_ros ,y_train_ros)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Creating the confusion matrix for the regression
cnf_mat2 = metrics.confusion_matrix(y_test, y_pred)
cnf_mat2

array([[21978,  1685],
       [ 1247,  1651]])

In [7]:
# Printing the accuracy, precision, recall, and f1 score for the data
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.889612589887429
Precision: 0.4949040767386091
Recall: 0.5697032436162871
F1 Score: 0.8929749051770234


In [8]:
# Printing out the classification report for the decision tree
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94     23663
           1       0.49      0.57      0.53      2898

    accuracy                           0.89     26561
   macro avg       0.72      0.75      0.73     26561
weighted avg       0.90      0.89      0.89     26561



In [9]:
# Running the trained model on the training and test data to make sure the model is not overfitting

# Running accuracy on both sets
print("Train Accuracy:",metrics.accuracy_score(y_train, clf.predict(X_train)))
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Running precision on both sets
print("Train Precision:",metrics.precision_score(y_train, clf.predict(X_train)))
print("Test Precision:",metrics.precision_score(y_test, y_pred))

# Running recall on both sets
print("Train Recall:",metrics.recall_score(y_train, clf.predict(X_train)))
print("Test Recall:",metrics.recall_score(y_test, y_pred))

Train Accuracy: 0.969629276373585
Test Accuracy: 0.889612589887429
Train Precision: 0.7790141622218135
Test Precision: 0.4949040767386091
Train Recall: 0.9979971724787936
Test Recall: 0.5697032436162871


In [10]:
# Create Decision Tree classifer object
clf_prune = DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifer
clf_prune = clf_prune.fit(X_train_ros,y_train_ros)

#Predict the response for test dataset
y_pred = clf_prune.predict(X_test)

# Creating the confusion matrix for the regression
cnf_mat2 = metrics.confusion_matrix(y_test, y_pred)
cnf_mat2

array([[17534,  6129],
       [   88,  2810]])

In [11]:
# Printing the accuracy, precision, recall, and f1 score for the data
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.765935017506871
Precision: 0.3143528358876832
Recall: 0.9696342305037957
F1 Score: 0.8085377204060327


In [12]:
print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

[[17534  6129]
 [   88  2810]]
              precision    recall  f1-score   support

           0       1.00      0.74      0.85     23663
           1       0.31      0.97      0.47      2898

    accuracy                           0.77     26561
   macro avg       0.65      0.86      0.66     26561
weighted avg       0.92      0.77      0.81     26561



In [13]:
# Running the trained model on the training and test data to make sure the model is not overfitting

# Running accuracy on both sets
print("Train Accuracy:",metrics.accuracy_score(y_train, clf.predict(X_train)))
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Running precision on both sets
print("Train Precision:",metrics.precision_score(y_train, clf.predict(X_train)))
print("Test Precision:",metrics.precision_score(y_test, y_pred))

# Running recall on both sets
print("Train Recall:",metrics.recall_score(y_train, clf.predict(X_train)))
print("Test Recall:",metrics.recall_score(y_test, y_pred))

Train Accuracy: 0.969629276373585
Test Accuracy: 0.765935017506871
Train Precision: 0.7790141622218135
Test Precision: 0.3143528358876832
Train Recall: 0.9979971724787936
Test Recall: 0.9696342305037957
