In [1]:
import datetime
import time
import warnings

import numpy as np
import pandas as pd

warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
%load_ext autoreload
%autoreload 2

import shallow_pipeline as pipeline 

Using TensorFlow backend.


In [3]:
COMPLAINTS_CSV = 'data/complaints.csv'

RESPONSE_COL = 'company_response_to_consumer'
NARRATIVE_COL = 'consumer_complaint_narrative'

CAT_COLUMNS = [
    ('product', 50), 
    ('sub-product', 50), 
    ('issue', 50), 
    ('sub-issue', 50), 
    ('company', 100), 
    ('state', None), 
    ('tags', None), 
    ('week', None)
]

RF_PARAMS = {
    "estimator__n_estimators": [10, 20], 
    "estimator__max_depth": [3, 5, 10]
}

In [4]:
# Load data 
cfpb = pipeline.load_data(verbose=True)

Date range: 2016-01-01 00:00:00 to 2019-12-31 00:00:00
Number of complaints: 436393

Distribution of company response: 
Closed with explanation            0.813714
Closed with non-monetary relief    0.123405
Closed with monetary relief        0.051419
Untimely response                  0.006313
Closed                             0.005149
Name: company_response_to_consumer, dtype: float64

Number of unique values in each column: 
date_received                     1460
product                             18
sub-product                         75
issue                              161
sub-issue                          216
consumer_complaint_narrative    409660
company_public_response             10
company                           4220
state                               63
zip_code                         10227
tags                                 3
consumer_consent_provided            1
submitted_via                        1
date_sent_to_company              1485
company_response_to_c

In [5]:
# Start timer 
start = datetime.datetime.now()

# Process features  
cfpb_X = pipeline.process_features(cfpb, CAT_COLUMNS, NARRATIVE_COL)

# Define label 
cfpb_y = cfpb[RESPONSE_COL]

# Split training and testing data 
X_train, X_test, y_train, y_test = pipeline.split_resample(cfpb_X, cfpb_y, resample=True, verbose=True)

# Build models 
best_RF = pipeline.hypertune_RF(X_train, y_train, RF_PARAMS, verbose=True)

# Predict (highest probability) 
y_pred = best_RF.predict(X_test)
    
# Get prediction probabilities 
y_pred_proba = best_RF.predict_proba(X_test)

# End timer
stop = datetime.datetime.now()
print("Time Elapsed:", stop - start)


Distribution of training labels: 
Closed with monetary relief        0.2
Closed with non-monetary relief    0.2
Closed                             0.2
Untimely response                  0.2
Closed with explanation            0.2
Name: company_response_to_consumer, dtype: float64

Best score: 0.6471935379828355
Best parameters: {'estimator__max_depth': 10, 'estimator__n_estimators': 10}
Time Elapsed: 2:10:25.166102


In [6]:
# Summarize predictions 
pipeline.summarize_probs(best_RF, y_test, y_pred)

                                 precision    recall  f1-score   support

                         Closed       0.04      0.47      0.08       413
        Closed with explanation       0.89      0.58      0.70     71162
    Closed with monetary relief       0.21      0.75      0.33      4433
Closed with non-monetary relief       0.30      0.46      0.36     10725
              Untimely response       0.05      0.38      0.09       546

                       accuracy                           0.57     87279
                      macro avg       0.30      0.53      0.31     87279
                   weighted avg       0.78      0.57      0.64     87279



In [7]:
# Summarize probabilities 
pipeline.summarize_probas(best_RF, y_test, y_pred_proba)

Unnamed: 0_level_0,Closed,Closed with explanation,Closed with monetary relief,Closed with non-monetary relief,Untimely response
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Closed,30.963,23.863,14.133,11.222,19.818
Closed with explanation,12.859,36.994,15.927,22.185,12.035
Closed with monetary relief,12.973,25.699,38.582,13.13,9.615
Closed with non-monetary relief,10.2,33.449,12.468,34.975,8.908
Untimely response,18.766,30.071,9.173,13.82,28.17


In [8]:
# Get feature importance 
feature_importance = pipeline.feature_importance(best_RF, X_train, verbose=True)


Class: Closed
                                              feature  importance
6   product_Credit reporting, credit repair servic...    0.150396
27                       sub-product_Credit reporting    0.071081
50  sub-product_Other (i.e. phone, health club, etc.)    0.071010
82        issue_Cont'd attempts collect debt not owed    0.069646
96         issue_Incorrect information on your report    0.057184

Class: Closed with explanation
                                               feature  importance
10                                    product_Mortgage    0.063576
15                                product_Student loan    0.056444
267     company_TRANSUNION INTERMEDIATE HOLDINGS, INC.    0.050048
328                                           state_TX    0.047536
6    product_Credit reporting, credit repair servic...    0.045034

Class: Closed with monetary relief
                                               feature  importance
239                                      company_Oth