In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from imblearn.over_sampling import RandomOverSampler
from sklearn import metrics 
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus



In [2]:
data = pd.read_csv("final_data.csv", encoding = 'ISO-8859-1')
data.head()

Unnamed: 0.1,Unnamed: 0,year,status,pay_method_ACH,pay_method_credit card,pay_method_paper check,state_0,state_1,state_2,state_3,...,months,state,status_0,status_1,status_2,status_3,status_4,lp_amount,income,Y
0,0,2014.0,Returned < 90 days,0,1,0,0,0,0,0,...,0.87,MA,0,0,0,0,1,0.5,0.462722,0
1,1,2015.0,Returned < 90 days,0,1,0,0,0,0,0,...,0.0,MA,0,0,0,0,1,0.375,0.462722,0
2,2,2016.0,Returned < 90 days,0,1,0,0,0,0,0,...,0.67,MA,0,0,0,0,1,0.625,0.462722,0
3,3,2014.0,Decline,0,1,0,0,0,0,0,...,0.57,MA,0,0,0,1,0,0.5,0.621078,0
4,4,2017.0,Returned,0,1,0,0,0,0,0,...,3.73,MA,0,0,0,1,1,0.625,0.621078,0


In [3]:
# Creating a separate dataframe for the dropped columns that can't be used in a logistic regression
data_dropped = data[['state', 'months', 'status']]

# Dropping rows that will not be used in the logistic regression
data = data.drop(['Unnamed: 0', 'state', 'months', 'status'], axis = 1)

In [4]:
# Splitting the dataset into the features and target variables
X = data.iloc[:,:-1]
y = data['Y']

# Splitting the data using the train_test_split sklearn package
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2019)

In [5]:
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [6]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train_ros ,y_train_ros)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Creating the confusion matrix for the regression
cnf_mat2 = metrics.confusion_matrix(y_test, y_pred)
cnf_mat2

array([[21189,  1463],
       [ 1229,  1613]])

In [7]:
# Printing the accuracy, precision, and recall for the balanced dataset
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.8944065270259669
Precision: 0.5243823146944083
Recall: 0.567558057705841


In [9]:
# Printing out the classification report for the decision tree
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94     22652
           1       0.52      0.57      0.55      2842

    accuracy                           0.89     25494
   macro avg       0.73      0.75      0.74     25494
weighted avg       0.90      0.89      0.90     25494



In [11]:
# Getting the f1 score for the balanced dataset
metrics.f1_score(y_test, y_pred, average='weighted')

0.8962200137013216