In [1]:
import warnings
import numpy as np
import pandas as pd  

# Suppress FutureWarning messages so they do not clutter cell output.
warnings.simplefilter('ignore', FutureWarning)

# Raise pandas' display limits so frames of up to 500 rows / 500 columns
# are rendered without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the helper module are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Helper module that provides the load / process / model / report functions
# called as `pipeline.<name>` in the cells below.
import shallow_pipeline as pipeline 

Using TensorFlow backend.


In [3]:
# Path to the complaints CSV.
COMPLAINTS_CSV = 'data/complaints.csv'

# Label column and free-text narrative column.
RESPONSE_COL = 'company_response_to_consumer'
NARRATIVE_COL = 'consumer_complaint_narrative'

# (column, limit) pairs for the categorical features. The second element is
# either an integer or None; presumably a cap on the number of distinct
# categories kept per column -- TODO confirm against pipeline.process_features.
CAT_COLUMNS = (
    [(name, 50) for name in ('product', 'sub-product', 'issue', 'sub-issue')]
    + [('company', 100)]
    + [(name, None) for name in ('state', 'tags', 'week')]
)

# Hyperparameter grid handed to pipeline.hypertune_RF. The "estimator__"
# prefix looks like scikit-learn's nested-parameter syntax for a wrapped
# estimator -- TODO confirm against the pipeline module.
RF_PARAMS = dict(
    estimator__n_estimators=[10, 20],
    estimator__max_depth=[3, 5, 10],
)

In [4]:
# Load the complaints data through the pipeline helper. With verbose=True the
# call prints the summary saved below: date range, number of complaints,
# distribution of company responses, and unique-value counts per column.
# NOTE(review): COMPLAINTS_CSV is defined in the config cell but is not passed
# here -- presumably load_data falls back to its own default path; confirm.
cfpb = pipeline.load_data(verbose=True)

Date range: 2016-01-01 00:00:00 to 2019-12-31 00:00:00
Number of complaints: 436393

Distribution of company response: 
Closed with explanation            0.813714
Closed with non-monetary relief    0.123405
Closed with monetary relief        0.051419
Untimely response                  0.006313
Closed                             0.005149
Name: company_response_to_consumer, dtype: float64

Number of unique values in each column: 
date_received                     1460
product                             18
sub-product                         75
issue                              161
sub-issue                          216
consumer_complaint_narrative    409660
company_public_response             10
company                           4220
state                               63
zip_code                         10227
tags                                 3
consumer_consent_provided            1
submitted_via                        1
date_sent_to_company              1485
company_response_to_c

In [5]:
# Process features: build the model inputs from the loaded frame, using the
# categorical column spec (CAT_COLUMNS) and the free-text narrative column.
# NOTE(review): the exact encoding is inside pipeline.process_features; the
# feature names in the importance output (e.g. "product_...", "company_Other")
# suggest one-hot columns with an "Other" bucket -- confirm.
cfpb_X = pipeline.process_features(cfpb, CAT_COLUMNS, NARRATIVE_COL)

# Define label: the company's response category for each complaint.
# Kept after process_features on purpose; whether that call modifies `cfpb`
# is not visible from here, so the statement order should not be changed.
cfpb_y = cfpb[RESPONSE_COL]

# Split training and testing data. verbose=True prints the distribution of the
# training labels.
# NOTE(review): in the saved output that distribution is nearly identical to
# the raw label distribution printed by load_data, so it is not evident what
# the "resample" step changes -- confirm against pipeline.split_resample.
X_train, X_test, y_train, y_test = pipeline.split_resample(cfpb_X, cfpb_y, verbose=True)

# Build models: tune over the RF_PARAMS grid and keep the best fitted model.
# verbose=True prints the best score and the winning parameters.
# NOTE(review): the saved best score is 0.2 with the smallest grid values
# selected; with five classes this matches the macro-average recall of a model
# that predicts a single class -- verify the scoring metric and the grid.
best_RF = pipeline.hypertune_RF(X_train, y_train, RF_PARAMS, verbose=True)

# Predict (highest probability): one predicted class label per test row.
y_pred = best_RF.predict(X_test)
    
# Get prediction probabilities: per-class probabilities for each test row.
y_pred_proba = best_RF.predict_proba(X_test)


Distribution of training labels: 
Closed with explanation            0.813307
Closed with non-monetary relief    0.123536
Closed with monetary relief        0.051576
Untimely response                  0.006327
Closed                             0.005253
Name: company_response_to_consumer, dtype: float64

Best score: 0.2
Best parameters: {'estimator__max_depth': 3, 'estimator__n_estimators': 10}


In [6]:
# Summarize predictions: the saved output is a per-class precision / recall /
# F1 / support report comparing y_test with y_pred.
# NOTE(review): in the saved run only "Closed with explanation" has non-zero
# precision and recall, i.e. the model effectively predicts the majority class
# for every test row; the 0.82 accuracy simply mirrors that class's share.
pipeline.summarize_probs(best_RF, y_test, y_pred)

                                 precision    recall  f1-score   support

                         Closed       0.00      0.00      0.00       413
        Closed with explanation       0.82      1.00      0.90     71162
    Closed with monetary relief       0.00      0.00      0.00      4433
Closed with non-monetary relief       0.00      0.00      0.00     10725
              Untimely response       0.00      0.00      0.00       546

                       accuracy                           0.82     87279
                      macro avg       0.16      0.20      0.18     87279
                   weighted avg       0.66      0.82      0.73     87279



In [7]:
# Summarize probabilities: the saved output is a table with one row per true
# class (y_test) and one column per predicted class.
# NOTE(review): each row sums to ~100, so the values appear to be average
# predicted probabilities expressed in percent -- confirm against
# pipeline.summarize_probas.
pipeline.summarize_probas(best_RF, y_test, y_pred_proba)

Unnamed: 0_level_0,Closed,Closed with explanation,Closed with monetary relief,Closed with non-monetary relief,Untimely response
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Closed,1.857,81.661,5.608,9.914,0.959
Closed with explanation,0.529,81.733,5.054,12.028,0.656
Closed with monetary relief,0.442,79.214,9.443,10.347,0.554
Closed with non-monetary relief,0.479,79.407,4.13,15.474,0.511
Untimely response,0.847,82.539,4.82,10.4,1.394


In [8]:
# Get feature importance for the tuned model, using X_train for the feature
# names. With verbose=True the helper prints, for each response class, a short
# ranked list of features with their importance scores (see saved output).
# The return value is kept in `feature_importance` for later use.
feature_importance = pipeline.feature_importance(best_RF, X_train, verbose=True)


Class: Closed
                                         feature  importance
132                   sub-issue_Debt is not mine    0.152492
208                                  company_ERC    0.147510
82   issue_Cont'd attempts collect debt not owed    0.115977
88         issue_Disclosure verification of debt    0.081964
239                                company_Other    0.076971

Class: Closed with explanation
                                         feature  importance
209  company_Experian Information Solutions Inc.    0.226000
20                  sub-product_Checking account    0.119924
4            product_Credit card or prepaid card    0.106111
90                        issue_Fees or interest    0.047973
23        sub-product_Conventional home mortgage    0.042416

Class: Closed with monetary relief
                                               feature  importance
4                  product_Credit card or prepaid card    0.184058
20                        sub-product_Checking acco