In [1]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import string
from tqdm import tqdm

You have been provided with a dataset of over 350,000 such complaints for 5 common issue types. 

> Your goal is to *train* a **text classification** model to **identify** the **issue type** based on the consumer **complaint narrative**.

In [2]:
# Read in data
complaints = pd.read_csv('../data/complaints.csv')

# Standardized column naming
complaints.columns = [x.replace('Consumer ', '').lower().replace(' ', '_') for x in complaints.columns]
complaints['complaint_length'] = complaints['complaint_narrative'].apply(len)

We standardized the column naming conventions for team coordination.

In [3]:
# Explore the count of unique complaints for each issue.
complaints['issue'].value_counts()

Incorrect information on your report    229305
Attempts to collect debt not owed        73163
Communication tactics                    21243
Struggling to pay mortgage               17374
Fraud or scam                            12347
Name: issue, dtype: int64

In [4]:
complaints.head()

Unnamed: 0,complaint_narrative,issue,complaint_length
0,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report,711
1,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam,1958
2,I have a particular account that is stating th...,Incorrect information on your report,294
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed,3444
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report,876


It appears that the data has several entries redacted due to personally idenfiyiable information.  We will write a function to remove cases of text redaction.

In [5]:
# Explore some data from the narratives column.
#there appears to be multiple areas where data is redacted.
complaints['complaint_narrative']

0         My name is XXXX XXXX this complaint is not mad...
1         I searched on XXXX for XXXXXXXX XXXX  and was ...
2         I have a particular account that is stating th...
3         I have not supplied proof under the doctrine o...
4         Hello i'm writing regarding account on my cred...
                                ...                        
353427           Collections account I have no knowledge of
353428    Dear CFPB Team, The reason for my complaint is...
353429    FRCA violations : Failing to Follow Debt Dispu...
353430    My Father, a XXXX XXXX  acquired an HECM rever...
353431    I have tried to contact cash app about a fraud...
Name: complaint_narrative, Length: 353432, dtype: object

In [6]:
# Looking at some narratives inside the "Incorrect information" complaint.
seed = 123
for complaint in complaints.loc[complaints['issue'] == 'Incorrect information on your report', 'complaint_narrative'].sample(3, random_state=seed):
    print(complaint)
    print('-----------------------------')

I just reviewed a copy of my Experian credit report and found the following information to be an error. I am a victim of identity theft and did not make these charges. I reported the theft of my identity to the Federal Trade Commission.
-----------------------------
after my legal separation from my husband he started to open credit in my name with no permission I have a legal case against him but can not find him he has ruined my life his name is XXXX XXXX XXXX   remove I have disputed by mail with all three credit bureaus. act # XXXX owing an alleged {$2200.00} XXXX   XXXX  orig creditor XXXX
-----------------------------
Equifax report a collections account " XXXX XXXX XXXX ''. I have no knowledge of this account. I have not been furnished any proof or verifications of this account. I don't have any signed contract agreements with XXXX XXXX XXXX, verbal agreements nor any paperwork associated with them. A proper investigation according to FCRA was not conducted, word of mouth from t

In [7]:
# replacing strings in narrative to remove consecutive x's
complaints['complaint_narrative'] = complaints['complaint_narrative'].str.replace(r'[xX][xX]+','', regex=True)

# stripping all numeric values from complaints
complaints['complaint_narrative'] = complaints['complaint_narrative'].str.replace('\d+', '', regex=True)

#Looking at cleaned output that has occurence of only one "x"
complaints.groupby('issue')['complaint_narrative'].apply(lambda x: x.str.extractall(r'(\w+X+\w+)').groupby(0).size().reset_index(name='count'))

Unnamed: 0_level_0,Unnamed: 1_level_0,0,count
issue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Attempts to collect debt not owed,0,AMERICANEXPRESS,2
Attempts to collect debt not owed,1,ANXIETY,1
Attempts to collect debt not owed,2,APPROXIMATELY,3
Attempts to collect debt not owed,3,AXCESS,1
Attempts to collect debt not owed,4,AXCSSFN,6
...,...,...,...
Struggling to pay mortgage,131,TOXIC,5
Struggling to pay mortgage,132,TXHAF,11
Struggling to pay mortgage,133,UNEXCUSABLE,1
Struggling to pay mortgage,134,UNEXPLAINED,1


I will encode the categores to numeric values for the purpose of analysis.

In [8]:
# Use mapping dictionary to convert columns to numerical values
issue_mapping = {'Attempts to collect debt not owed': 1, 
                 'Communication tactics': 2, 
                 'Fraud or scam': 3, 
                 'Incorrect information on your report': 4,
                 'Struggling to pay mortgage': 5}
# replace the issue values with numerical values
complaints['issue'] = complaints['issue'].replace(issue_mapping)
complaints.head(n=5)

Unnamed: 0,complaint_narrative,issue,complaint_length
0,My name is this complaint is not made in err...,4,711
1,I searched on for and was pointed to a web...,3,1958
2,I have a particular account that is stating th...,4,294
3,I have not supplied proof under the doctrine o...,1,3444
4,Hello i'm writing regarding account on my cred...,4,876


> Naive Bayes Using the Text Field

Tokenization of complaint_narrative (Warning: 6 minute runtime)

In [9]:
# Tokenizing text (Careful! 6 min run time)
complaints['complaint_narrative_tokens'] = list(tqdm(complaints['complaint_narrative'].apply(nltk.word_tokenize), desc="Tokenizing"))

Tokenizing: 100%|██████████| 353432/353432 [00:00<00:00, 2388762.08it/s]


Applying stopwords to complaints narratives.


In [10]:
# define stopwords
stop_words = stop_words = set(stopwords.words('english'))

# apply stopword removal to the 'complaint_narrative' column
complaints['complaint_narrative'] = complaints['complaint_narrative'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

Making lowercase and removing special characters

In [11]:
# Lowercasing text
complaints['complaint_narrative'] = complaints['complaint_narrative'].apply(lambda x: x.lower())

# Removing punctuation and special characters
translator = str.maketrans('', '', string.punctuation)
complaints['complaint_narrative'] = complaints['complaint_narrative'].apply(lambda x: x.translate(translator))

Stemming Text (Warning: 7 Minute run time)

In [12]:
# Instantiate PorterStemmer
stemmer = PorterStemmer()

# Apply PorterStemmer to cleaned 'complaint_narrative' column.
complaints['complaint_narrative_stemmed'] = complaints['complaint_narrative'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))

complaints.head()

Unnamed: 0,complaint_narrative,issue,complaint_length,complaint_narrative_tokens,complaint_narrative_stemmed
0,name complaint made error neither made third p...,4,711,"[My, name, is, this, complaint, is, not, made,...",name complaint made error neither made third p...
1,searched pointed website legitimately believed...,3,1958,"[I, searched, on, for, and, was, pointed, to, ...",search point websit legitim believ websit wher...
2,particular account stating owe listed credit ...,4,294,"[I, have, a, particular, account, that, is, st...",particular account state owe list credit repor...
3,supplied proof doctrine estoppel silence engel...,1,3444,"[I, have, not, supplied, proof, under, the, do...",suppli proof doctrin estoppel silenc engelhard...
4,hello im writing regarding account credit repo...,4,876,"[Hello, i, 'm, writing, regarding, account, on...",hello im write regard account credit report be...


Train Test Split

In [13]:
X = complaints[['complaint_narrative']]
y = complaints['issue']

X_train, X_test, y_train, y_test = train_test_split( X, y, random_state = 321, stratify = y)

Count Vectorizer

In [14]:
#Fill in the code to fit and transform a CountVectorizer (using all defaults) on the text column of X_train and X_test
vect = CountVectorizer()

#Fit 
X_train_vec = vect.fit_transform(X_train["complaint_narrative"])
X_test_vec = vect.transform(X_test["complaint_narrative"]) 

look at the vocabulary

In [15]:
vect.vocabulary_

{'filed': 30062,
 'identity': 37077,
 'theft': 75700,
 'report': 64091,
 'learned': 43405,
 'someone': 71016,
 'using': 81167,
 'incarceration': 38285,
 'submitted': 73184,
 'documentation': 23992,
 'transunion': 78053,
 'disregarded': 23540,
 'it': 41485,
 'even': 27601,
 'block': 9443,
 'account': 640,
 'days': 19884,
 'upon': 80830,
 'receiving': 61830,
 'information': 39196,
 'also': 3564,
 'received': 61785,
 'letter': 43812,
 'irs': 41267,
 'stating': 72233,
 'tax': 74794,
 'return': 65892,
 'name': 49551,
 'us': 80958,
 'dept': 21654,
 'educatio': 25438,
 'acct': 1021,
 'closed': 14079,
 'send': 68614,
 'written': 84766,
 'notification': 51581,
 'reported': 64154,
 'consumer': 16714,
 'reporting': 64233,
 'agencies': 2609,
 'never': 50350,
 'ever': 27633,
 'late': 43006,
 'although': 3623,
 'updated': 80713,
 'chargeoff': 12943,
 'providing': 60101,
 'severe': 69215,
 'scandalous': 67693,
 'action': 1414,
 'destroying': 22010,
 'perfect': 56297,
 'payment': 55659,
 'history': 35