In [1]:
import pandas as pd
import numpy as np

from joblib import dump, load

from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the complaint narratives together with their issue labels.
# The first CSV column has no header and appears to be a saved row index
# (it is displayed as "Unnamed: 0" when read as data), so it is used as the
# DataFrame index instead of being kept as a spurious column.
complaints = pd.read_csv('../data/complaints.csv', index_col=0)
complaints.head()

Unnamed: 0,Consumer complaint narrative,Issue
0,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report
1,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam
2,I have a particular account that is stating th...,Incorrect information on your report
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report


In [3]:
# Number of complaints per issue label, most frequent first (class balance check).
issue_labels = complaints['Issue']
issue_labels.value_counts()

Issue
Incorrect information on your report    229305
Attempts to collect debt not owed        73163
Communication tactics                    21243
Struggling to pay mortgage               17374
Fraud or scam                            12347
Name: count, dtype: int64

In [4]:
# Print three reproducibly sampled 'Fraud or scam' narratives, each followed
# by a separator line, to get a feel for the raw text of this class.
seed = 11
fraud_narratives = complaints.loc[
    complaints['Issue'] == 'Fraud or scam',
    'Consumer complaint narrative',
]
for narrative in fraud_narratives.sample(3, random_state=seed):
    print(narrative, '-----------------------------', sep='\n')
    print('-----------------------------')

On XX/XX/21 I received an email from Venmo asking for {$400.00}. The picture, name, and ID all match my wife 's account so I paid it. As soon as I paid it, the picture changed to a woman who was not my wife, and the name changed to @ XXXX. I feel like Venmo should have better protected our accounts and not allow a scammer/hacker to use my wife 's name and picture. I contacted Venmo and they refuse to look into it or try to recover my {$400.00} from the scammer. I also contacted XXXX XXXX XXXX XXXX County Police and they were hesitant to grant a police report because they said there is no way they will catch the scammer and that will be the only way to get my money back. I place my trust in Venmo to protect my accounts and not allow hackers to use my wife 's info. Venmo says they are not responsible but I have seen very similar cases on this site when Venmo refunds the full amount which is usually over {$400.00}. I am asking Venmo to refund me the full {$400.00}. Screen shots are attach

In [5]:
# Print three reproducibly sampled 'Struggling to pay mortgage' narratives,
# each followed by a separator line.
seed = 11
mortgage_narratives = complaints.loc[
    complaints['Issue'] == 'Struggling to pay mortgage',
    'Consumer complaint narrative',
]
for narrative in mortgage_narratives.sample(3, random_state=seed):
    print(narrative, '-----------------------------', sep='\n')
    print('-----------------------------')

In XX/XX/XXXX I was injured at work and was unable to work. I my employer contested my workmans compensation claim and I fell behind in all my bills including my Mortgage with Wells Fargo. I called bi weekly, I informed the bank of what I was dealing with and that I expected the outcome to be in my favor. I spoke with the Bank at least twice every two weeks from XX/XX/XXXX XX/XX/XXXX, ( XX/XX/XXXX being the month I went to court with Compensation company ). I won the case and as stated, on XX/XX/XXXX XXXX the commission awarded me back pay and weekly amount until I go back to work. I called Wells Fargo on XX/XX/XXXX regarding my case and the following are the 3 incidents that followed of which I am seeking assistance in repayment and clarity : 1 ) I called Wells Fargo to inform them that I won my case and payment was made to me in the form of a check on XX/XX/XXXX and that I would be paying the back amount in full by that Friday ( XX/XX/XXXX ) at the latest Monday ( XX/XX/XXXX ) becaus

In [6]:
# Print three reproducibly sampled 'Communication tactics' narratives,
# each followed by a separator line.
seed = 11
tactics_narratives = complaints.loc[
    complaints['Issue'] == 'Communication tactics',
    'Consumer complaint narrative',
]
for narrative in tactics_narratives.sample(3, random_state=seed):
    print(narrative, '-----------------------------', sep='\n')
    print('-----------------------------')

On XX/XX/2017 I received a called from I received a call from XXXX from XXXX, he called back to back three times, when I answered the he asked to speak to XXXX, I responded this she, he identified himself, Then he proceeded to as for my ss # and I said I 'm not going to verified anything with he then asked for my birthday, I repeated my self, he continue to asked for my address I refused. He became very upset with and his whole attitude change. I told him you do n't talk any kind of way. He snapped you have a great afternoon, I 've just got out of the hostipal and not supposed to be under any stress. 
My husband grab the phone he hung up this is the second they called and got smart. My husband explained to them that I have a serious XXXX XXXX and their respond was somebody have to pay the bill. If we had the money the bill would be paid nobody like to be harrass
-----------------------------
First, the debt collector never told me this was an attempted to collect a debt. I have been hi

In [7]:
# Print three reproducibly sampled 'Attempts to collect debt not owed'
# narratives, each followed by a separator line.
seed = 11
debt_narratives = complaints.loc[
    complaints['Issue'] == 'Attempts to collect debt not owed',
    'Consumer complaint narrative',
]
for narrative in debt_narratives.sample(3, random_state=seed):
    print(narrative, '-----------------------------', sep='\n')
    print('-----------------------------')

XXXX XXXX XXXX. I do not know what this is for. I believe it is XXXX XXXX  who forces subscribers to open new accounts when you transfer services which I have done twice and have noticed they reported put those accounts into collections after charging me under my old account when I was paying regularly on my new account. In XX/XX/XXXX'17, I had a balance of around {$40.00} plus whatever they charged for a late fee. They waived the late fee and credited my new account {$20.00} and I paid the balance. Now I see this on my report that was reported in XX/XX/XXXX so I can only assume it originated from XXXX.
-----------------------------
I have sent several letters to XXXX XXXX XXXX and XXXX XXXX XXXX about the debts listed with their companies. I have asked repeatedly for the actual applications from the company they are collecting for. I get the bills and letters stating that the debt is valid. They have severely damaged my credit to the point I had to hire a credit repair agency. And the

In [8]:
# Print three reproducibly sampled 'Incorrect information on your report'
# narratives, each followed by a separator line.
seed = 11
report_narratives = complaints.loc[
    complaints['Issue'] == 'Incorrect information on your report',
    'Consumer complaint narrative',
]
for narrative in report_narratives.sample(3, random_state=seed):
    print(narrative, '-----------------------------', sep='\n')
    print('-----------------------------')

A Geo rgia State Tax Lien  (  XXXX   XXXX   Superior Court ) is showing  on my credit rep ort. Tax Lien is   showing as filed  XXXX / XXXX / 2013 . I do not a delinquent tax/tax lien with the state of Georgia. I am requesting this tax lien be removed from my credit report as soon as possible.
-----------------------------
Hi I am submitting this XXXX XXXX this isn't any influence and this is not a third party. XXXX has low and unfair credit number for me in their report. I have complained. The problem has not been resolved. my fico has me at a credit score over 719XXXX has me at a score around 590. That is a huge difference. XXXX paints me as a XXXX. my fico say I have good credit. What the heck is going on here. i have almost no debt and my identity was stolen causing my score to drop n i made this clear for 60 days straight with XXXX i spoke to a representative agent name XXXX and XXXX and XXXX from the fraud department I prefer to speak to a us rept but they refused they had me on m

In [9]:
# Features are kept as a one-column DataFrame (the narrative text);
# the target is the issue label.
X = complaints.loc[:, ['Consumer complaint narrative']]
y = complaints.loc[:, 'Issue']

# Stratify on the label so both splits keep the same class proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=11,
)

In [10]:
# Bag-of-words features: the vocabulary is learned from the training
# narratives only and then applied unchanged to the test narratives.
vect = CountVectorizer()

train_narratives = X_train['Consumer complaint narrative']
test_narratives = X_test['Consumer complaint narrative']

X_train_vec = vect.fit_transform(train_narratives)
X_test_vec = vect.transform(test_narratives)

In [11]:
# Baseline: multinomial Naive Bayes with default settings on raw token counts.
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)

# Score the baseline on the held-out split.
y_pred = nb.predict(X_test_vec)
baseline_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {baseline_accuracy}')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8002784128205709
[[12086  2087   489  3306   323]
 [  571  4483    59    99    99]
 [   72    58  2784   123    50]
 [ 6554   919   850 47147  1856]
 [   39    41    26    26  4211]]


In [12]:
word = 'debt'

# Per-class likelihood of `word` under the fitted baseline model: one value
# per row of nb.feature_log_prob_ (presumably in nb.classes_ order).
# The word's column is selected BEFORE exponentiating, so only that column is
# exponentiated instead of a temporary copy of the whole log-probability
# matrix; the resulting values are identical.
# Raises KeyError if `word` is not in the fitted vocabulary.
word_idx = vect.vocabulary_[word]
np.exp(nb.feature_log_prob_[:, word_idx])

array([0.01065047, 0.00826678, 0.00030694, 0.00197996, 0.00055034])

In [13]:
# Tune the bag-of-words + Naive Bayes pipeline with a randomized search.
#
# The step names "vect" and "clf" are part of the interface: they prefix the
# parameter names in param_grid and are used later to index the fitted
# pipeline. The vectorizer is created inline rather than assigned to `vect`,
# so the already-fitted module-level `vect` used by the baseline model is not
# replaced by an unfitted instance.
clf = MultinomialNB()

pipe = Pipeline([("vect", CountVectorizer()), ("clf", clf)])

param_grid = {
    'vect__ngram_range':[(1,1), (1,2), (1,3)],
    'vect__min_df':[1, 2, 5, 10, 20],
    'clf__fit_prior':[False, True]
}

# random_state pins which parameter combinations are sampled, so that
# re-running the notebook evaluates the same candidates; 11 matches the seed
# used for the train/test split.
rs = RandomizedSearchCV(
    estimator = pipe,
    param_distributions = param_grid,
    random_state = 11,
    verbose = 2,
    n_jobs = -1,
)
rs.fit(X_train['Consumer complaint narrative'], y_train)

# Persist the whole fitted search object so the results can be reloaded
# without repeating the search.
dump(rs, "../models/cv_01.joblib")

Fitting 5 folds for each of 10 candidates, totalling 50 fits


['../models/cv_01.joblib']

In [14]:
# Reload the persisted search object and report the winning configuration.
# NOTE: joblib files are pickle-based; only load artifacts you produced yourself.
rs = load("../models/cv_01.joblib")

best_params, best_score = rs.best_params_, rs.best_score_
print(f'Best params: {best_params}')
print(f'Best score: {best_score}')

Best params: {'vect__ngram_range': (1, 3), 'vect__min_df': 2, 'clf__fit_prior': False}
Best score: 0.8313188103280795


In [15]:
# Evaluate the best pipeline found by the search on the held-out narratives.
best_pipe = rs.best_estimator_
y_pred = best_pipe.predict(X_test['Consumer complaint narrative'])

tuned_accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {tuned_accuracy}')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8341746078453564
[[14806   751   187  2202   345]
 [  933  4096    39    76   167]
 [  138    21  2723   115    90]
 [ 6993   173   345 47809  2006]
 [   32    14     9    16  4272]]


In [16]:
word = 'give me money'

# Per-class likelihood of the n-gram held in `word` under the tuned pipeline:
# one value per row of the classifier's feature_log_prob_.
# The n-gram's column is selected BEFORE exponentiating, so only that column
# is exponentiated instead of a temporary copy of the whole log-probability
# matrix; the resulting values are identical.
# Raises KeyError if the n-gram is not in the fitted vocabulary.
best_pipe = rs.best_estimator_
ngram_idx = best_pipe['vect'].vocabulary_[word]
np.exp(best_pipe['clf'].feature_log_prob_[:, ngram_idx])

array([6.79747524e-08, 2.14321659e-07, 5.28332853e-07, 1.16803441e-08,
       2.01144148e-07])