In [1]:
# importing the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from sklearn.metrics import mean_squared_error

# reading in the csv file
data = pd.read_csv('final_data.csv', encoding = 'ISO-8859-1', index_col=0, header=0)
data.columns



Index(['year', 'agent', 'pay_method', 'state', 'zip_code', 'status', 'months',
       'pay_method_ACH', 'pay_method_credit card', 'pay_method_paper check',
       'region_central', 'region_north_east', 'region_rocky', 'region_south',
       'region_south_east', 'region_west', 'status_Active', 'status_Decline',
       'status_Returned', 'status_Returned_90', 'income', 'Y'],
      dtype='object')

In [2]:
# Dropping rows that will not be used in the random forest
data = data.drop(['agent', 'pay_method', 'state', 'months', 'zip_code', 'status'], axis = 1)

In [3]:
# Splitting features and target

X = data.iloc[:,:-1]
Y = data['Y']

# Splitting for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=2019)

In [4]:
# Creating and using the random forest model 
clf = RandomForestClassifier(n_estimators=300, max_depth = 10)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Creating the confusion matrix
cnf_mat = metrics.confusion_matrix(y_test, y_pred)
cnf_mat

array([[7451, 1452],
       [1865, 2874]])

In [5]:
# Printing the accuracy, precision, and recall for the dataset
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred, average='weighted'))
print("AUC:", metrics.roc_auc_score(y_test,y_pred))
print("MCC:",metrics.matthews_corrcoef(y_test, y_pred))

Accuracy: 0.7568538337487172
Precision: 0.6643550624133149
Recall: 0.60645705845115
F1 Score: 0.7540708796315743
AUC: 0.7216829827805564
MCC: 0.45364475663656895


In [6]:
# Printing the classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82      8903
           1       0.67      0.60      0.63      4739

    accuracy                           0.76     13642
   macro avg       0.73      0.72      0.73     13642
weighted avg       0.75      0.76      0.75     13642



In [7]:
# Creating the feature importance variable to display 
feature_imp = pd.Series(clf.feature_importances_, index = X.columns).sort_values(ascending=False)
print(feature_imp)

status_Active             0.335581
status_Returned_90        0.284398
status_Returned           0.137831
year                      0.118684
income                    0.057820
status_Decline            0.042781
pay_method_ACH            0.006377
pay_method_credit card    0.005611
pay_method_paper check    0.002893
region_north_east         0.001468
region_central            0.001421
region_south              0.001394
region_rocky              0.001262
region_west               0.001253
region_south_east         0.001225
dtype: float64
