In [1]:
# importing the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import mean_squared_error

# reading in the csv file
data = pd.read_csv('final_data1.csv', encoding = 'ISO-8859-1', index_col=0, header=0)
data.columns



Index(['year', 'state', 'months', 'pay_method_ACH', 'pay_method_credit card',
       'pay_method_paper check', 'region_central', 'region_north_east',
       'region_rocky', 'region_south', 'region_south_east', 'region_west',
       'status_Active', 'status_Decline', 'status_Returned',
       'status_Returned_90', 'income', '18_months'],
      dtype='object')

In [2]:
# Dropping rows that will not be used in the random forest
data = data.drop(['state', 'months'], axis = 1)

In [3]:
# Splitting features and target

X = data.iloc[:,:-1]
Y = data['18_months']

# Splitting for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=2019)

In [4]:
# Using the SMOTE package to deal with the imbalanced dataset
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [5]:
# Creating and using the random forest model 
clf = RandomForestClassifier(n_estimators=300, max_depth = 10)
clf = clf.fit(X_train_ros, y_train_ros)
y_pred = clf.predict(X_test)

# Creating the confusion matrix
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[13047,  5053],
       [  248,  2629]])

In [7]:
# Printing the accuracy, precision, and recall for the balanced dataset
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred, average='weighted'))
print("AUC:", metrics.roc_auc_score(y_test,y_pred))
print("MCC:",metrics.matthews_corrcoef(y_test, y_pred))

Accuracy: 0.7472946560518663
Precision: 0.34222858630564956
Recall: 0.9137990962808481
F1 Score: 0.7854547024294932
AUC: 0.817313912781308
MCC: 0.45315523989543616


In [8]:
# Printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.72      0.83     18100
           1       0.34      0.91      0.50      2877

    accuracy                           0.75     20977
   macro avg       0.66      0.82      0.66     20977
weighted avg       0.89      0.75      0.79     20977



In [9]:
# Creating the feature importance variable to display 
feature_imp = pd.Series(clf.feature_importances_, index = X.columns).sort_values(ascending=False)
feature_imp

status_Active             0.271471
status_Returned_90        0.219445
status_Returned           0.179653
status_Decline            0.136788
year                      0.127791
income                    0.029693
pay_method_credit card    0.014176
pay_method_ACH            0.012494
pay_method_paper check    0.003958
region_south_east         0.001036
region_north_east         0.000888
region_west               0.000792
region_central            0.000672
region_south              0.000632
region_rocky              0.000510
dtype: float64

In [10]:
# Calculating the RMSE number for the model
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("RMSE : %f" % (rmse))

RMSE : 0.502698
