# Exploratory Data Analysis on Complaints Dataset

In [28]:
import pandas as pd 
import numpy as np 
import re

import spacy
nlp = spacy.load('en_core_web_sm')

from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords

from string import punctuation

from collections import Counter

from joblib import dump, load

from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from numpy import ravel

from sklearn.model_selection import GridSearchCV

In [2]:
complaints = pd.read_csv('../data/complaints.csv')

In [3]:
complaints.head(5)

Unnamed: 0,Consumer complaint narrative,Issue
0,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report
1,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam
2,I have a particular account that is stating th...,Incorrect information on your report
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report


In [4]:
complaints['Issue'].unique()

array(['Incorrect information on your report', 'Fraud or scam',
       'Attempts to collect debt not owed', 'Communication tactics',
       'Struggling to pay mortgage'], dtype=object)

In [5]:
complaints = complaints.rename(columns = {'Consumer complaint narrative':'text'})

In [6]:
complaints.head(5)

Unnamed: 0,text,Issue
0,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report
1,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam
2,I have a particular account that is stating th...,Incorrect information on your report
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report


In [7]:
complaints['Issue'].value_counts().sort_index()

Issue
Attempts to collect debt not owed        73163
Communication tactics                    21243
Fraud or scam                            12347
Incorrect information on your report    229305
Struggling to pay mortgage               17374
Name: count, dtype: int64

In [8]:
first_complaint = complaints.loc[0, "text"]

In [15]:
first_complaint

'My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.'

## Function to show the before and after cleaning text

In [19]:
def print_text(sample, clean):
    print(f"Before: {sample}")
    print("------------------")
    print(f"After: {clean}")

## Cleaning the text

In [None]:
# remove punctuation and numbers
complaints['text'] = complaints['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
# xxx
complaints['text'] = complaints['text'].apply(lambda x: re.sub(r'\b[xX]{2,}\b', '', x))
# line breaks
complaints['text'] = complaints['text'].apply(lambda x: re.sub(r'\n|\r|\t', ' ', x))
# html tags
complaints['text'] = complaints['text'].apply(lambda x: re.sub(r'<.*?>', '', x))

In [None]:
complaints.head(10)

In [None]:
seed = 123
for statement in complaints.loc[complaints['Issue'] == 'Attempts to collect debt not owed', 'text'].sample(3, random_state=seed):
    print(statement)
    print('-----------------------------')

### Remove non-alphabetic characters

In [20]:
sample_text = complaints.loc[0, "text"]
clean_text = " ".join([w for w in sample_text.split() if w.isalpha()]) # Side effect: removes extra spaces
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third I declare under penalty of perjury I am alleging that a person or without my from unauthorized use of my social security number and card used my personal identif

## Remove all special characters and punctuation

In [21]:
clean_text = re.sub(r"[^A-Za-z0-9\s]+", "", sample_text)
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party I declare under penalty of perjury I am alleging that a person or company without my authorization from unauthorized use of my social security number and c

## Remove extra spaces, tabs, and line breaks

In [34]:
clean_text = " ".join(sample_text.split())
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number an

## Remove punctuation

In [35]:
clean_text = re.sub(f"[{re.escape(punctuation)}]", "", sample_text)
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party I declare under penalty of perjury I am alleging that a person or company without my authorization from unauthorized use of my social security number and c

## Remove repeated characters

In [22]:
clean_text = re.sub(r'(.)\1{3,}',r'\1', sample_text)
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name is X X this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card

## Remove digits

For when you want to remove numbers but not dates.

In [36]:
clean_text = " ".join([w for w in sample_text.split() if not w.isdigit()]) # Side effect: removes extra spaces
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number an

## Remove stopwords

In [24]:
stop_words = set(stopwords.words('english'))

stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [30]:
tokens = sample_text.split()
# clean_tokens = Counter([t for t in tokens if not t in stop_words])
clean_tokens = [t for t in tokens if not t in stop_words]
clean_text = " ".join(clean_tokens)
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name XXXX XXXX complaint made error neither made third party. I declare penalty perjury I alleging person company- without authorization- unauthorized use social security number card used personal identifying information apply goods, services, money ;

## Remove short tokens

When you want to remove tokens with few characters

In [39]:
tokens = sample_text.split()
clean_tokens = [t for t in tokens if len(t) > 2]
clean_text = " ".join(clean_tokens)
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: name XXXX XXXX this complaint not made error neither being made third party. declare under penalty perjury alleging that person company- without authorization- from unauthorized use social security number and card used personal identifying information ap

## Transform emojis into characters

Use the emoji library.

In [40]:
# from emoji import demojize

# sample_text2 = "I love 🥑"
# clean_text = demojize(sample_text2)
# print_text(sample_text2, clean_text)

ModuleNotFoundError: No module named 'emoji'

## Split text into sentences using NLTK

In [41]:
sentences = sent_tokenize(sample_text)
print_text(sample_text, sentences)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: ['My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party.', 'I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security numb

## Function to clean text

In [42]:
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\b[xX]{2,}\b', '', text)  # Remove XXX's
    text = " ".join([w for w in text.split() if w.isalpha()])  # Remove non-alphabetic characters
    tokens = text.split()  # Create tokens
    clean_tokens = [t for t in tokens if not t in stop_words]  # Remove stop words
    clean_text = " ".join(clean_tokens)  # Join clean tokens
    clean_text = " ".join(clean_text.split())  # Remove extra spaces, tabs, and new lines
    return clean_text

complaints["text"].map(preprocess_text)

0         name complaint made error neither made third d...
1         searched pointed website legitimately believed...
2         particular account stating owe listed credit r...
3         supplied proof doctrine estoppel engelhardt v ...
4         hello writing regarding account credit reports...
                                ...                        
353427                        collections account knowledge
353428    dear cfpb reason complaint tried resolve accou...
353429    frca violations failing follow debt dispute pr...
353430    acquired hecm reverse mortgage property review...
353431    tried contact cash app fraudulent payment refu...
Name: text, Length: 353432, dtype: object

In [46]:
complaints['text'] = complaints['text'].apply(preprocess_text)

In [48]:
complaints.iloc[353431]

text     tried contact cash app fraudulent payment refu...
Issue                                        Fraud or scam
Name: 353431, dtype: object

## Export clean text as csv

In [49]:
complaints.to_csv('../data/complaints_clean.csv')

## Model Practice

In [50]:
X = complaints[['text']]
y = complaints['Issue']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

Use CountVectorizer to transform text into a vector based on the frequency of the word in the text. 

In [51]:
vect = CountVectorizer()

X_train_vect = vect.fit_transform(X_train['text'])
X_test_vect = vect.transform(X_test['text'])

In [53]:
X_train_vect

<265074x60047 sparse matrix of type '<class 'numpy.int64'>'
	with 12518802 stored elements in Compressed Sparse Row format>

Use Multinomial Naive Bayes to calculate the probability distribution of text data. 

In [54]:
nb = MultinomialNB().fit(X_train_vect, y_train)

y_pred = nb.predict(X_test_vect)

In [55]:
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8207972113447566
[[11754  2003   454  3785   295]
 [  586  4494    47    96    88]
 [   69    59  2804   117    38]
 [ 5006   768   707 49251  1594]
 [   30    38    15    39  4221]]


Investigate how the model uses different words.

In [56]:
word = 'great'

np.exp(nb.feature_log_prob_)[:, vect.vocabulary_[word]]

array([0.00011038, 0.00010508, 0.00014907, 0.00013521, 0.00017622])