In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler

In [20]:
data = pd.read_csv('final_data.csv', encoding = 'ISO-8859-1')
data.head()

Unnamed: 0.1,Unnamed: 0,year,state_0,state_1,state_2,state_3,state_4,state_5,state_6,status_0,...,status_3,status_4,months,pay_method_ACH,pay_method_Other,pay_method_credit card,pay_method_paper check,income,lp_amount,Y
0,0,2014.0,0,0,0,0,0,0,1,0,...,0,1,0.87,0,0,1,0,0.459624,0.5,0
1,1,2015.0,0,0,0,0,0,0,1,0,...,0,1,0.0,0,0,1,0,0.459624,0.375,0
2,2,2016.0,0,0,0,0,0,0,1,0,...,1,0,0.0,0,0,1,0,0.459624,0.375,0
3,3,2016.0,0,0,0,0,0,0,1,0,...,0,1,0.67,0,0,1,0,0.459624,0.625,0
4,4,2014.0,0,0,0,0,0,0,1,0,...,1,1,0.57,0,0,1,0,0.619935,0.5,0


In [21]:
# Creating a separate dataframe for the dropped columns that can't be used in a random forest
data_dropped = data[['months']]

# Dropping rows that will not be used in the random forest
data = data.drop(['Unnamed: 0', 'months'], axis = 1)

In [22]:
# Splitting features and target
X = data.iloc[:,:-1]
Y = data.Y

# Splitting for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=2019)

In [23]:
rando = RandomOverSampler(random_state=2019)
X_train_rando, y_train_rando = rando.fit_resample(X_train, y_train)

In [24]:
# Creating and using the random forest model 
clf = RandomForestClassifier(n_estimators=100)
clf = clf.fit(X_train_rando,y_train_rando)
y_pred = clf.predict(X_test)

# Creating the confusion matrix
cnf_mat2 = metrics.confusion_matrix(y_test, y_pred)
cnf_mat2

array([[22256,  1407],
       [ 1266,  1632]])

In [25]:
# Printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.94     23663
           1       0.54      0.56      0.55      2898

    accuracy                           0.90     26561
   macro avg       0.74      0.75      0.75     26561
weighted avg       0.90      0.90      0.90     26561



In [26]:
# Printing the accuracy, precision, and recall for the balanced dataset
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.899363728775272
Precision: 0.5370187561697927
Recall: 0.5631469979296067


In [27]:
# Getting the f1 score for the balanced dataset
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("AUC:",roc_auc_score(y_test, y_pred))

F1 Score: 0.9004083901592841
AUC: 0.7518435408022711
