In [1]:
# importing the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import mean_squared_error

# reading in the csv file
data = pd.read_csv('final_data.csv', encoding = 'ISO-8859-1', index_col=0, header=0)
data.columns



Index(['year', 'agent', 'pay_method', 'state', 'zip_code', 'status', 'months',
       'pay_method_ACH', 'pay_method_credit card', 'pay_method_paper check',
       'region_central', 'region_north_east', 'region_rocky', 'region_south',
       'region_south_east', 'region_west', 'status_Active', 'status_Decline',
       'status_Returned', 'status_Returned_90', 'status_Switched to LW',
       'income', 'Y'],
      dtype='object')

In [2]:
# Dropping rows that will not be used in the random forest
data = data.drop(['agent', 'pay_method', 'state', 'months', 'zip_code', 'status'], axis = 1)

In [3]:
# Splitting features and target

X = data.iloc[:,:-1]
Y = data['Y']

# Splitting for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=2019)

In [4]:
# Creating and using the random forest model 
clf = RandomForestClassifier(n_estimators=300, max_depth = 10)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Creating the confusion matrix
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[6509, 1832],
       [1332, 4123]])

In [5]:
# Printing the accuracy, precision, and recall for the balanced dataset
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred, average='weighted'))
print("AUC:", metrics.roc_auc_score(y_test,y_pred))
print("MCC:",metrics.matthews_corrcoef(y_test, y_pred))

Accuracy: 0.770658161786025
Precision: 0.6923593618807724
Recall: 0.755820348304308
F1 Score: 0.772140017405941
AUC: 0.7680912076013808
MCC: 0.5292884229720595


In [6]:
# Printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.78      0.80      8341
           1       0.69      0.76      0.72      5455

    accuracy                           0.77     13796
   macro avg       0.76      0.77      0.76     13796
weighted avg       0.78      0.77      0.77     13796



In [7]:
# Creating the feature importance variable to display 
feature_imp = pd.Series(clf.feature_importances_, index = X.columns).sort_values(ascending=False)
feature_imp

status_Returned_90        0.311645
status_Active             0.240971
status_Returned           0.204633
year                      0.127850
income                    0.045282
status_Decline            0.032399
pay_method_ACH            0.009556
status_Switched to LW     0.009271
pay_method_credit card    0.008704
pay_method_paper check    0.002330
region_central            0.001470
region_south              0.001377
region_north_east         0.001345
region_west               0.001213
region_south_east         0.001153
region_rocky              0.000803
dtype: float64

In [8]:
# Calculating the RMSE number for the model
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("RMSE : %f" % (rmse))

RMSE : 0.478896
