In [9]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics 
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus

In [10]:
data = pd.read_csv("final_data.csv", encoding = 'ISO-8859-1')
data.head()

Unnamed: 0.1,Unnamed: 0,year,state_0,state_1,state_2,state_3,state_4,state_5,state_6,months,pay_method_ACH,pay_method_Other,pay_method_credit card,pay_method_paper check,income,lp_amount,Y
0,0,2014.0,0,0,0,0,0,0,1,0.87,0,0,1,0,0.456715,0.5,0
1,1,2016.0,0,0,0,0,0,0,1,0.67,0,0,1,0,0.456715,0.625,0
2,2,2014.0,0,0,0,0,0,0,1,0.57,0,0,1,0,0.621274,0.5,0
3,3,2017.0,0,0,0,0,0,0,1,3.73,0,0,1,0,0.621274,0.625,0
4,4,2014.0,0,0,0,0,0,0,1,0.6,0,0,1,0,0.602031,0.5,0


In [11]:
# Creating a separate dataframe for the dropped columns that can't be used in a logistic regression
data_dropped = data[[ 'months']]

# Dropping rows that will not be used in the logistic regression
data = data.drop(['Unnamed: 0', 'months'], axis = 1)

In [12]:
# Splitting the dataset into the features and target variables
X = data.iloc[:,:-1]
y = data['Y']

# Splitting the data using the train_test_split sklearn package
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019)

In [13]:
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [14]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train_ros ,y_train_ros)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Creating the confusion matrix for the regression
cnf_mat2 = metrics.confusion_matrix(y_test, y_pred)
cnf_mat2

array([[15233,  3118],
       [ 2168,   643]])

In [15]:
# Printing the accuracy, precision, recall, and f1 score for the data
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.7502126453076269
Precision: 0.17096516883807497
Recall: 0.2287442191390964
F1 Score: 0.7649476897312667


In [16]:
# Printing out the classification report for the decision tree
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.83      0.85     18351
           1       0.17      0.23      0.20      2811

    accuracy                           0.75     21162
   macro avg       0.52      0.53      0.52     21162
weighted avg       0.78      0.75      0.76     21162



In [17]:
# Running the trained model on the training and test data to make sure the model is not overfitting

# Running accuracy on both sets
print("Train Accuracy:",metrics.accuracy_score(y_train, clf.predict(X_train)))
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Running precision on both sets
print("Train Precision:",metrics.precision_score(y_train, clf.predict(X_train)))
print("Test Precision:",metrics.precision_score(y_test, y_pred))

# Running recall on both sets
print("Train Recall:",metrics.recall_score(y_train, clf.predict(X_train)))
print("Test Recall:",metrics.recall_score(y_test, y_pred))

Train Accuracy: 0.9373877705320858
Test Accuracy: 0.7502126453076269
Train Precision: 0.6870527000650618
Test Precision: 0.17096516883807497
Train Recall: 0.9851895043731779
Test Recall: 0.2287442191390964
