In [1]:
# importing the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import mean_squared_error

# reading in the csv file
data = pd.read_csv('final_data.csv', encoding = 'ISO-8859-1', index_col=0, header=0)
data.columns



Index(['Unnamed: 0.1', 'year', 'state', 'lp_amount', 'status', 'months',
       'pay_method_ACH', 'pay_method_Other', 'pay_method_credit card',
       'pay_method_paper check', 'region_central', 'region_north_east',
       'region_rocky', 'region_south', 'region_south_east', 'region_west',
       'status_Active', 'status_Decline', 'status_Returned',
       'status_Returned_90', 'income', '18_months'],
      dtype='object')

In [2]:
# Dropping rows that will not be used in the random forest
data = data.drop(['Unnamed: 0.1', 'months', 'state', 'status', 'pay_method_Other', 'lp_amount'], axis = 1)

In [3]:
# Splitting features and target
X = data.iloc[:,:-1]
Y = data['18_months']

# Splitting for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=2019)

In [4]:
# Using the RandomOverSampler package to deal with the imbalanced dataset
random = RandomOverSampler(random_state=2019)
X_train_ros, y_train_ros = random.fit_resample(X_train, y_train)

In [5]:
# Creating and using the random forest model 
clf = RandomForestClassifier(n_estimators=300, max_depth = 10)
clf = clf.fit(X_train_ros, y_train_ros)
y_pred = clf.predict(X_test)

# Creating the confusion matrix
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[19763,  5152],
       [  264,  2529]])

In [6]:
# Printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.79      0.88     24915
           1       0.33      0.91      0.48      2793

    accuracy                           0.80     27708
   macro avg       0.66      0.85      0.68     27708
weighted avg       0.92      0.80      0.84     27708



In [8]:
# Printing the accuracy, precision, recall, and f1 score for the classifier
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.8045329868629999
Precision: 0.3292540033849759
Recall: 0.9054779806659506
F1 Score: 0.8395134695246069


In [9]:
# Creating the feature importance variable to display 
feature_imp = pd.Series(clf.feature_importances_, index = X.columns).sort_values(ascending=False)
feature_imp

status_Active             0.274240
status_Returned           0.239255
status_Decline            0.179670
status_Returned_90        0.139128
year                      0.107164
income                    0.023923
pay_method_credit card    0.014986
pay_method_ACH            0.012986
pay_method_paper check    0.005291
region_north_east         0.000775
region_south_east         0.000722
region_central            0.000547
region_south              0.000499
region_rocky              0.000435
region_west               0.000381
dtype: float64

In [10]:
# Running the trained model on the training and test data to make sure the model is not overfitting

# Running accuracy on both sets
print("Train Accuracy:",metrics.accuracy_score(y_train, clf.predict(X_train)))
print("Test Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Running precision on both sets
print("Train Precision:",metrics.precision_score(y_train, clf.predict(X_train)))
print("Test Precision:",metrics.precision_score(y_test, y_pred))

# Running recall on both sets
print("Train Recall:",metrics.recall_score(y_train, clf.predict(X_train)))
print("Test Recall:",metrics.recall_score(y_test, y_pred))

Train Accuracy: 0.8069607689809078
Test Accuracy: 0.8045329868629999
Train Precision: 0.33950723638869745
Test Precision: 0.3292540033849759
Train Recall: 0.9171515010472423
Test Recall: 0.9054779806659506


In [11]:
# Calculating the RMSE number for the model
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("RMSE : %f" % (rmse))

RMSE : 0.442117
