In [12]:
# importing the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler

# reading in the csv file
data = pd.read_csv('final_data.csv', encoding = 'ISO-8859-1', index_col=0, header=0)
data.head()

Unnamed: 0,Unnamed: 0.1,year,state,lp_amount,status,months,pay_method_ACH,pay_method_Other,pay_method_credit card,pay_method_paper check,...,region_south,region_south_east,region_west,status_Active,status_Decline,status_Returned,status_Returned < 90 days,status_Returned_90,income,18_months
0,0,2014.0,north_east,34.95,Returned_90,0.87,0,0,1,0,...,0,0,0,0,0,0,0,1,0.465506,0
1,1,2015.0,north_east,29.95,Returned < 90 days,0.0,0,0,1,0,...,0,0,0,0,0,0,1,0,0.465506,0
2,2,2016.0,north_east,29.95,Switched to LW,0.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0.465506,0
3,3,2016.0,north_east,39.95,Returned_90,0.67,0,0,1,0,...,0,0,0,0,0,0,0,1,0.465506,0
4,5,2014.0,north_east,34.95,Decline,0.57,0,0,1,0,...,0,0,0,0,1,0,0,0,0.624609,0


In [13]:
# Dropping rows that will not be used in the random forest
data = data.drop(['Unnamed: 0.1', 'months', 'state', 'status', 'pay_method_Other'], axis = 1)

In [16]:
# Splitting features and target
X = data.iloc[:,:-1]
Y = data['18_months']

# Splitting for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=2019)

In [17]:
# Using the RandomOverSampler package to deal with the imbalanced dataset
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [18]:
# Creating and using the random forest model 
clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Creating the confusion matrix
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[23764,  1151],
       [ 1397,  1396]])

In [19]:
# Printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.95     24915
           1       0.55      0.50      0.52      2793

    accuracy                           0.91     27708
   macro avg       0.75      0.73      0.74     27708
weighted avg       0.90      0.91      0.91     27708



In [20]:
# Printing the accuracy, precision, and recall for the classifier
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.9080409989894616
Precision: 0.5480957989791913
Recall: 0.49982098102398853


In [21]:
# Getting the f1 score for the classifier
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("AUC:",roc_auc_score(y_test, y_pred))

F1 Score: 0.9061487180358649
AUC: 0.7268119554929293


In [22]:
# Running the trained model on the training and test data to make sure the model is not overfitting

# Running accuracy on both sets
print("Train Accuracy:",metrics.accuracy_score(y_train, clf.predict(X_train)))
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Running precision on both sets
print("Train Precision:",metrics.precision_score(y_train, clf.predict(X_train)))
print("Test Precision:",metrics.precision_score(y_test, y_pred))

# Running recall on both sets
print("Train Recall:",metrics.recall_score(y_train, clf.predict(X_train)))
print("Test Recall:",metrics.recall_score(y_test, y_pred))

#Running f1 score on both sets
print("Train F1:",metrics.f1_score(y_train, clf.predict(X_train), average='weighted'))
print("Test F1:",metrics.f1_score(y_test, y_pred, average='weighted'))

Train Accuracy: 0.9916268662103148
Test Accuracy: 0.9080409989894616
Train Precision: 0.9804136253041362
Test Precision: 0.5480957989791913
Train Recall: 0.9377472655340936
Test Recall: 0.49982098102398853
Train F1: 0.9915442210437967
Test F1: 0.9061487180358649
