In [1]:
import numpy as np
import pandas as pd  

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import shallow_pipeline as pipeline 

# Let pandas render up to 500 rows and 500 columns before truncating a frame.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
)

In [2]:
# Relative path to the complaints CSV file.
# NOTE(review): not referenced by any cell visible here (pipeline.load_data is
# called without a path argument) -- confirm whether it is still needed.
COMPLAINTS_CSV = 'data/complaints.csv'
# Name of the complaint narrative column; a column with this name is listed in
# the missing-value summary printed by the data-loading cell.
# NOTE(review): also unused in the visible cells -- confirm against later cells.
COMPLAINT_COL = 'consumer_complaint_narrative'

In [3]:
# Load data
# NOTE(review): verbose=True presumably triggers the dataset summary printed
# below this cell -- confirm in shallow_pipeline.load_data.
cfpb = pipeline.load_data(verbose=True)

# Process features
# Each entry is (column name, limit). The pair is passed through to
# pipeline.process_cat_features untouched; the limit is presumably the maximum
# number of categories kept for that column, with None meaning "keep all"
# (TODO confirm against shallow_pipeline).
#
# This is an ordered list rather than a set literal on purpose: iteration order
# of a set of string-bearing tuples changes between interpreter sessions
# (string hashing is randomised per process), so if process_cat_features walks
# cat_columns to build the feature frame, a set would make the column order of
# cfpb_X differ after every kernel restart.
cat_columns = [
    ('product', 50),
    ('sub-product', 50),
    ('issue', 50),
    ('sub-issue', 50),
    ('company', 100),
    ('state', None),
    ('tags', None),
    ('consumer_disputed', None),
]

cfpb_X = pipeline.process_cat_features(cfpb, cat_columns)

# Process label
# Binary target: 1 when the company response is one of the two "relief"
# outcomes listed here, 0 for every other response value.
cfpb_y = np.where(
    cfpb['company_response_to_consumer'].isin(
        ['Closed with non-monetary relief', 'Closed with monetary relief']),
    1,
    0,
)

Date range: 2015-03-19 00:00:00 to 2019-12-31 00:00:00
Number of complaints: 491146

Distribution of company response: 
Closed with explanation            0.808145
Closed with non-monetary relief    0.124195
Closed with monetary relief        0.053729
Closed                             0.007617
Untimely response                  0.006314
Name: company_response_to_consumer, dtype: float64

Distribution of missing values: 
date_received                   0.000000
product                         0.000000
sub-product                     0.106225
issue                           0.000000
sub-issue                       0.260953
consumer_complaint_narrative    0.000000
company_public_response         0.520477
company                         0.000000
state                           0.003848
zip_code                        0.224844
tags                            0.827795
consumer_consent_provided       0.000000
submitted_via                   0.000000
date_sent_to_company            0.000000
c

In [4]:
# The two 0.2 values below are unrelated; naming them keeps them from being
# confused or changed together by accident.
TEST_FRACTION = 0.2        # share of rows held out for evaluation
DECISION_THRESHOLD = 0.2   # predict class 1 when its probability is >= this value

# Split training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    cfpb_X, cfpb_y, test_size=TEST_FRACTION, random_state=42)

# Build random forest classifier
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# Predict
# Column 1 of predict_proba is the probability of the positive class (label 1).
# A custom threshold is applied instead of clf.predict's default majority vote.
positive_proba = clf.predict_proba(X_test)[:, 1]
y_pred = (positive_proba >= DECISION_THRESHOLD).astype(bool)

# Evaluate
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
# Displayed as the cell result; rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

Accuracy: 0.8088974854932302
Precision: 0.4298454837706108
Recall: 0.23795855100752053


array([[75313,  5498],
       [13274,  4145]])

In [5]:
# Rank the model inputs by the fitted forest's feature_importances_.
# The column is labelled 'coef', but the values are importance scores, not
# signed regression coefficients.
coefs = pd.DataFrame(dict(
    feature=X_train.columns.values,
    coef=clf.feature_importances_.ravel(),
))
# Show the ten highest-scoring features as the cell result.
coefs.sort_values('coef', ascending=False).head(10)

Unnamed: 0,feature,coef
105,company_Experian Information Solutions Inc.,0.084399
179,product_Mortgage,0.079204
172,product_Credit card,0.072371
292,sub-product_Checking account,0.06181
86,"company_CITIBANK, N.A.",0.055046
316,sub-product_Missing,0.050729
136,company_Other,0.049803
153,company_SYNCHRONY FINANCIAL,0.048632
173,product_Credit card or prepaid card,0.043456
104,"company_Empowerment Ventures, LLC",0.029388
