# Exploratory Data Analysis on Complaints Dataset

In [177]:
import pandas as pd 
import numpy as np 
import re

from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from string import punctuation

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

import time 

In [131]:
complaints = pd.read_csv('../data/complaints.csv')

In [4]:
complaints['Issue'].unique()

array(['Incorrect information on your report', 'Fraud or scam',
       'Attempts to collect debt not owed', 'Communication tactics',
       'Struggling to pay mortgage'], dtype=object)

In [7]:
complaints['Issue'].value_counts().sort_index()

Issue
Attempts to collect debt not owed        73163
Communication tactics                    21243
Fraud or scam                            12347
Incorrect information on your report    229305
Struggling to pay mortgage               17374
Name: count, dtype: int64

In [132]:
complaints = complaints.rename(columns = {'Consumer complaint narrative':'text'})

In [125]:
complaints.head(5)

Unnamed: 0,text,Issue
0,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report
1,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam
2,I have a particular account that is stating th...,Incorrect information on your report
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report


## Finding rows with problematic text

In [74]:
complaints[complaints.isna().any(axis=1)]

Unnamed: 0,Consumer complaint narrative,Issue


In [77]:
complaints.loc[2410, "text"]

'These are not my accounts.'

In [133]:
nan_list = [2410,
 2431,
 4163,
 8183,
 22304,
 49893,
 51227,
 52408,
 52740,
 53738,
 54829,
 55072,
 55151,
 56325,
 56611,
 57646,
 60719,
 62092,
 63781,
 71570,
 72267,
 76569,
 77525,
 78353,
 78846,
 80300,
 82745,
 82794,
 83522,
 84829,
 87554,
 91708,
 94894,
 126931,
 131587,
 141918,
 143158,
 145081,
 145319,
 150396,
 163623,
 166353,
 168677,
 175038,
 182763,
 183190,
 193133,
 198269,
 198980,
 199390,
 201068,
 206651,
 207243,
 209405,
 211673,
 212579,
 213690,
 214934,
 217725,
 221983,
 222420,
 224388,
 227249,
 229688,
 229955,
 231079,
 236970,
 239142,
 241021,
 241827,
 243189,
 249356,
 249987,
 252887,
 253195,
 253645,
 253790,
 254940,
 256994,
 257403,
 257592,
 267301,
 268967,
 271094,
 271373,
 272625,
 274793,
 276234,
 279190,
 279589,
 282824,
 285705,
 287403,
 289891,
 294438,
 296938,
 303779,
 310185,
 311182,
 315658,
 316775,
 317451,
 320765,
 330193,
 342502,
 342538,
 348214,
 348432,
 349723]

In [134]:
nan_rows = complaints.loc[nan_list]
nan_rows.head(50)

Unnamed: 0,text,Issue
2410,These are not my accounts.,Incorrect information on your report
2431,2nd Request..these are not my accounts.,Incorrect information on your report
4163,IHAVENEVERWENTTOXXXXXXXXXXXX.\nITISSHOWINGTOAC...,Incorrect information on your report
8183,Thecreditbureausarereportinginaccurate/outdate...,Incorrect information on your report
22304,XXXX XXXX XXXX is not my account. this is fraud.,Incorrect information on your report
49893,Iamavictimofidentitytheftandthisdebtdoesnotbel...,Attempts to collect debt not owed
51227,IhaveanaccountonmyfilethatisnotmineIfiledafrau...,Incorrect information on your report
52408,"OnXX/XX/2019,Iplacedasecurityfreezeonmyfilewit...",Incorrect information on your report
52740,"XXXXXXXXXXXXXXXXXXXXXXXXXXXX,OHXXXXDOBXXXXSSXX...",Incorrect information on your report
53738,This is not my account.,Attempts to collect debt not owed


## Function to show the before and after cleaning text

In [19]:
def print_text(sample, clean):
    print(f"Before: {sample}")
    print("------------------")
    print(f"After: {clean}")

# Cleaning the text

In [None]:
# remove punctuation and numbers
complaints['text'] = complaints['text'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
# xxx
complaints['text'] = complaints['text'].apply(lambda x: re.sub(r'\b[xX]{2,}\b', '', x))
# line breaks
complaints['text'] = complaints['text'].apply(lambda x: re.sub(r'\n|\r|\t', ' ', x))
# html tags
complaints['text'] = complaints['text'].apply(lambda x: re.sub(r'<.*?>', '', x))

## Remove non-alphabetic characters

In [20]:
sample_text = complaints.loc[0, "text"]
clean_text = " ".join([w for w in sample_text.split() if w.isalpha()]) # Side effect: removes extra spaces
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third I declare under penalty of perjury I am alleging that a person or without my from unauthorized use of my social security number and card used my personal identif

## Remove extra spaces, tabs, and line breaks

In [34]:
clean_text = " ".join(sample_text.split())
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number an

## Remove stopwords

In [151]:
stop_words = set(stopwords.words('english'))

stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [30]:
tokens = sample_text.split()
# clean_tokens = Counter([t for t in tokens if not t in stop_words])
clean_tokens = [t for t in tokens if not t in stop_words]
clean_text = " ".join(clean_tokens)
print_text(sample_text, clean_text)

Before: My name is XXXX XXXX this complaint is not made in error neither is it being made by a third party. I declare under penalty of perjury I am alleging that a person or company- without my authorization- from unauthorized use of my social security number and card used my personal identifying information to apply for goods, services, or money ; and, was successful in creating some accounts. I have no knowledge of this and all is being investigated by the FTC and Under 15 U.S. Code 1681b Permissible purposes of consumer reports I never gave any written consent to report anything on my consumer reports. XXXX XXXX XXXX  XXXX XXXX Date Opened : XX/XX/2018 Balance : {$98000.00}. Please delete this account ASAP.
------------------
After: My name XXXX XXXX complaint made error neither made third party. I declare penalty perjury I alleging person company- without authorization- unauthorized use social security number card used personal identifying information apply goods, services, money ;

## Update list of stopwords

Exclude the following from the list of stopwords: 
These, this, not, my, account 

In [152]:
words_to_exclude = ['these', 
                    'this', 
                    'not',
                   'my',
                   'is',
                   'are']
custom_stopwords = stop_words.difference(words_to_exclude)

# Function to clean text

In [156]:
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\b[xX]{2,}\b', '', text)  # Remove XXX's
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and numbers
    tokens = text.split()  # Create tokens 
    clean_tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in custom_stopwords]  # Lemmatize and remove stop words
    clean_text = " ".join(clean_tokens)  # Join clean tokens
    clean_text = " ".join(clean_text.split())  # Remove extra spaces, tabs, and new lines
    return clean_text

In [157]:
# Practice run with problematic rows df
start = time.time()

problem_df = complaints.loc[nan_list]
problem_df['text'] = problem_df['text'].apply(preprocess_text)


end = time.time()
elapsed_time = end - start
print('Execution time:', elapsed_time, 'seconds')

display(problem_df.head(50))

Execution time: 0.019951820373535156 seconds


Unnamed: 0,text,Issue
2410,these are not my account,Incorrect information on your report
2431,nd requestthese are not my account,Incorrect information on your report
4163,ihaveneverwenttoxxxxxxxxxxxx itisshowingtoacco...,Incorrect information on your report
8183,thecreditbureausarereportinginaccurateoutdated...,Incorrect information on your report
22304,is not my account this is fraud,Incorrect information on your report
49893,iamavictimofidentitytheftandthisdebtdoesnotbel...,Attempts to collect debt not owed
51227,ihaveanaccountonmyfilethatisnotmineifiledafrau...,Incorrect information on your report
52408,onxxiplacedasecurityfreezeonmyfilewithxxxxxxxx...,Incorrect information on your report
52740,ohxxxxdobxxxxssxxxxmyinformationisclearlyshown...,Incorrect information on your report
53738,this is not my account,Attempts to collect debt not owed


In [158]:
# Apply to entire df
start = time.time()

complaints['text'] = complaints['text'].apply(preprocess_text)

end = time.time()
elapsed_time = end - start
print('Execution time:', elapsed_time, 'seconds')

Execution time: 302.92202711105347 seconds


## Drop rows with missing values

Found 21 rows with empty strings.

In [196]:
complaints[complaints.isnull().any(axis=1)]

Unnamed: 0,text,Issue


In [195]:
(complaints['text'] == "").sum()

0

In [197]:
empty_text_rows = complaints[complaints['text'] == ""]
print(empty_text_rows)

Empty DataFrame
Columns: [text, Issue]
Index: []


In [198]:
nan_rows = complaints.loc[nan_list]
nan_rows.head(50)

KeyError: '[57646, 77525, 83522, 150396, 175038, 182763, 193133, 198269, 212579, 214934, 221983, 227249, 249356, 253645, 257403, 271094, 310185, 317451, 320765, 348432, 349723] not in index'

In [193]:
complaints = complaints.drop(complaints[complaints['text'] == ""].index)

In [194]:
complaints.info()

<class 'pandas.core.frame.DataFrame'>
Index: 353411 entries, 0 to 353431
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    353411 non-null  object
 1   Issue   353411 non-null  object
dtypes: object(2)
memory usage: 8.1+ MB


## Export clean df as csv

In [199]:
complaints.to_csv('../data/complaints_clean.csv', index = False)

## Model Practice

In [200]:
X = complaints[['text']]
y = complaints['Issue']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

Use CountVectorizer to transform text into a vector based on the frequency of the word in the text. 

In [201]:
vect = CountVectorizer()

X_train_vect = vect.fit_transform(X_train['text'])
X_test_vect = vect.transform(X_test['text'])

Use Multinomial Naive Bayes to calculate the probability distribution of text data. 

In [202]:
nb = MultinomialNB().fit(X_train_vect, y_train)

y_pred = nb.predict(X_test_vect)

In [203]:
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.82861928853576
[[11892  1873   389  3822   313]
 [  609  4483    43    95    81]
 [   67    51  2815   114    40]
 [ 4696   669   607 49791  1560]
 [   35    31    13    34  4230]]


Investigate how the model uses different words.

In [204]:
word = 'fraud'

np.exp(nb.feature_log_prob_)[:, vect.vocabulary_[word]]

array([0.00194687, 0.00037304, 0.00506862, 0.00200693, 0.00060378])