In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler



In [2]:
data = pd.read_csv('final_data.csv', encoding = 'ISO-8859-1')
data.head()

Unnamed: 0.1,Unnamed: 0,year,state,status,months,pay_method_ACH,pay_method_Other,pay_method_credit card,pay_method_paper check,state_CA,...,state_OH,state_PA,state_TX,status_Active,status_Decline,status_Returned,status_Returned < 90 days,income,lp_amount,Y
0,0,2014.0,MA,Returned < 90 days,0.87,0,0,1,0,0,...,0,0,0,0,0,0,1,0.459624,0.5,0
1,1,2015.0,MA,Returned < 90 days,0.0,0,0,1,0,0,...,0,0,0,0,0,0,1,0.459624,0.375,0
2,2,2016.0,MA,Switched to LW,0.0,0,0,1,0,0,...,0,0,0,0,0,0,0,0.459624,0.375,0
3,3,2016.0,MA,Returned < 90 days,0.67,0,0,1,0,0,...,0,0,0,0,0,0,1,0.459624,0.625,0
4,4,2014.0,MA,Decline,0.57,0,0,1,0,0,...,0,0,0,0,1,0,0,0.619935,0.5,0


In [3]:
# Creating a separate dataframe for the dropped columns that can't be used in a random forest
data_dropped = data[['months', 'lp_amount', 'state', 'status', 'pay_method_Other']]

# Dropping rows that will not be used in the random forest
data = data.drop(['Unnamed: 0', 'months', 'lp_amount', 'state', 'status', 'pay_method_Other'], axis = 1)

In [4]:
# Splitting features and target
X = data.iloc[:,:-1]
Y = data.Y

# Splitting for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=2019)

In [5]:
rando = RandomOverSampler(random_state=2019)
X_train_rando, y_train_rando = rando.fit_resample(X_train, y_train)

In [5]:
# Creating and using the random forest model 
clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

# Creating the confusion matrix
cnf_mat2 = metrics.confusion_matrix(y_test, y_pred)
cnf_mat2

array([[22548,  1115],
       [ 1595,  1303]])

In [6]:
# Printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94     23663
           1       0.54      0.45      0.49      2898

    accuracy                           0.90     26561
   macro avg       0.74      0.70      0.72     26561
weighted avg       0.89      0.90      0.89     26561



In [7]:
# Printing the accuracy, precision, and recall for the balanced dataset
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.8979707089341515
Precision: 0.5388751033912325
Recall: 0.44962042788129747


In [8]:
# Getting the f1 score for the balanced dataset
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("AUC:",roc_auc_score(y_test, y_pred))

F1 Score: 0.893876637062795
AUC: 0.7012502257734679
