In [None]:
import pandas as pd
import numpy as np

from collections import Counter
from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords

In [None]:
# Read the complaints export (path is relative to the notebook's working directory)
complaints = pd.read_csv(filepath_or_buffer='../data/complaints.csv')

# Quick structural summary of the freshly loaded frame
complaints.info()

In [None]:
complaints.head()

In [None]:
# Complaints by Issue
# Name the index and the counts explicitly instead of renaming whatever
# reset_index() happens to call them: pandas < 2.0 produces the columns
# ['index', 'Issue'], pandas >= 2.0 produces ['Issue', 'count'], and with the
# latter a rename of 'Issue' -> 'count' leaves two columns both called 'count'.
(
    complaints['Issue']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='count')
)

In [None]:
# Simplify column names
# The assignment is positional, so it only succeeds while the frame has exactly
# two columns; it must run before any of the derived columns are added.
# presumably the first column is the complaint text and the second is 'Issue' — TODO confirm against the CSV header
# NOTE: after this cell the 'Issue' column no longer exists, so the
# "Complaints by Issue" cell cannot be re-run without reloading the data.
complaints.columns = ['complaint', 'category']
complaints

In [None]:
# Create columns with word count, character count, and average word length for each complaint
complaints['word_count'] = complaints['complaint'].str.split().str.len()
complaints['char_count'] = complaints['complaint'].str.len()

# Average word length = characters that belong to words / number of words.
# Dividing the raw character count by the word count would fold every space
# and newline into the average, so whitespace is stripped out first.
# (str.split() with no arguments splits on runs of whitespace, so the
# non-whitespace length is exactly the summed length of the split words.)
word_chars = complaints['complaint'].str.replace(r'\s+', '', regex=True).str.len()

# A complaint with no words gets NaN instead of a division by zero.
complaints['avg_word_len'] = (
    (word_chars / complaints['word_count'])
    .where(complaints['word_count'] > 0)
)

#### Explore duplicate complaints

In [None]:
# Every row whose complaint text occurs more than once (all copies are kept),
# ordered by text so identical complaints sit next to each other
is_repeated_text = complaints['complaint'].duplicated(keep=False)
dups = complaints.loc[is_repeated_text].sort_values('complaint')

In [None]:
# For each duplicated complaint text, count the distinct categories it was filed under
categories_per_text = dups.groupby('complaint')['category'].nunique()
# Ascending sort: texts filed under the most categories end up at the bottom
categories_per_text.reset_index().sort_values('category')  # append .iloc[201, 0] to read one complaint's text

We discovered that, for at least one complaint, nearly identical text was submitted over 100 times. Based on the specific details that stay constant across the submissions, we are quite confident that this is the same person copying and pasting almost exactly the same complaint over and over.

In [None]:
# Pull every complaint that contains the passage shared by the repeated submissions.
# The passage is ordinary prose, so it is matched literally (regex=False) rather
# than compiled as a regular expression in which '.' would match any character.
# na=False counts a missing complaint as "no match" so the mask stays boolean.
repeated_passage = 'My information was used to obtain an apartment, cell phone, and an auto loan with my ex-partner. I am a victim of identity theft and have put this off for too long'
complaints.loc[complaints['complaint'].str.contains(repeated_passage, regex=False, na=False)]#['category'].value_counts()

#### Make some plots

In [None]:
import plotly.express as px

In [None]:
complaints['category'].value_counts().reset_index()

In [None]:
# Number of complaints by category
# Build the two plot columns by name rather than renaming reset_index()'s
# defaults: pandas >= 2.0 already calls the counts column 'count' and the
# label column 'category', so the old rename turned 'category' into a second
# 'count' column and left px.bar without a 'category' column to plot.
df = (
    complaints['category']
    .value_counts()
    .rename_axis('category')
    .reset_index(name='count')
)

fig = px.bar(df, x = 'count', y = 'category')
fig.show()

In [None]:
# Per-category summary statistics, reshaped to long form (one row per
# category/statistic pair) so the statistics can be used as plot facets.
# Each entry maps an output column to (source column, aggregation).
summary_spec = {
    'avg_wc': ('word_count', 'mean'),
    'max_wc': ('word_count', 'max'),
    'avg_cc': ('char_count', 'mean'),
    'total_complaints': ('complaint', 'count'),
    'avg_word_len': ('avg_word_len', 'mean'),
}

complaint_overview = (
    complaints
    .groupby('category')
    .agg(**summary_spec)
    .reset_index()
    .melt(id_vars='category')
)

complaint_overview

In [None]:
# Bar chart, one facet row per statistic: average word count, max word count
# and average character count by category. The other two statistics in the
# long-form summary are left out of this figure.
not_plotted = ['total_complaints', 'avg_word_len']
keep_row = ~complaint_overview['variable'].isin(not_plotted)
df = complaint_overview.loc[keep_row]
fig = px.bar(
    df,
    x="value",
    y="category",
    color="category",
    facet_row="variable",
)
fig.show()

In [None]:
# Histogram of word counts over every complaint
df = complaints
fig = px.histogram(data_frame=df, x='word_count')
fig.show()

In [None]:
# Word-count histograms, one facet row (and colour) per category;
# histnorm='percent' makes categories of different sizes comparable
df = complaints
fig = px.histogram(
    data_frame=df,
    x='word_count',
    color='category',
    facet_row='category',
    histnorm='percent',
)
fig.show()

In [None]:
# Character-count histograms, one facet row (and colour) per category;
# histnorm='percent' makes categories of different sizes comparable
df = complaints
fig = px.histogram(
    data_frame=df,
    x='char_count',
    color='category',
    facet_row='category',
    histnorm='percent',
)
fig.show()

#### Taking a look at redacted material

In [None]:
# Complaints containing a whitespace-delimited run of X's or the start of a redacted date.
# The regex is a raw string: '\s' inside an ordinary string literal is an invalid
# escape sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# na=False counts a missing complaint as "no match" so the mask stays boolean.
has_x_run = complaints['complaint'].str.contains(r'\sX{1,}\s', regex=True, na=False)
has_redacted_date_start = complaints['complaint'].str.contains('XX/XX/', regex=False, na=False)
complaints.loc[has_x_run | has_redacted_date_start]

In [None]:
# Complaints that contain a redacted date (e.g. XX/XX/XXXX) but no redacted date
# that is followed by a space; show the first column of the first such row.
# '/' needs no escaping in a Python regex, so the patterns use a plain '/' —
# the old '\/' in a non-raw string was an invalid escape sequence
# (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# na=False counts a missing complaint as "no match" so the masks stay boolean.
redacted_date = 'X{1,2}/X{1,2}/X{2,4}'
has_date = complaints['complaint'].str.contains(redacted_date, regex=True, na=False)
has_date_then_space = complaints['complaint'].str.contains(redacted_date + ' ', regex=True, na=False)
complaints.loc[has_date & ~has_date_then_space].iloc[0, 0]

In [None]:
# Complaints with a partially redacted date: day and month are X's, the year is
# digits, and a space follows.
# Raw string with a plain '/': the old '\/' and '\d' in a non-raw string were
# invalid escape sequences (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# na=False counts a missing complaint as "no match" so the mask stays boolean.
has_partial_date = complaints['complaint'].str.contains(r'X{1,2}/X{1,2}/\d{2,4} ', regex=True, na=False)
complaints.loc[has_partial_date]#.iloc[0,0]

In [None]:
# Second complaint (row position 1, first column) containing a space-delimited "XX"
has_double_x = complaints['complaint'].str.contains(' X{2} ', regex=True)
complaints.loc[has_double_x].iloc[1, 0]

In [None]:
# Preview of X-removal: strip every 'X', then show the first complaint whose
# ORIGINAL text contained a space-delimited "XX"
has_double_x = complaints['complaint'].str.contains(' X{2} ', regex=True)
without_x = complaints['complaint'].str.replace('X', '')
without_x.loc[has_double_x].iloc[0]

In [None]:
# Create a version of the complaints free of redacted materials plus a little bit of other cleaning.
# All three patterns are literal text, so regex=False is passed explicitly.
# Series.str.replace defaulted to regex=True before pandas 2.0, where '.00'
# meant "any character followed by 00" and turned e.g. '$500' into '$'.
complaints['complaint_clean'] = (
    complaints['complaint']
    .str.replace('X', '', regex=False)     # redaction placeholders
    .str.replace('//', '', regex=False)    # what remains of a fully redacted XX/XX/XXXX date
    .str.replace('.00', '', regex=False)   # presumably the cents on round dollar amounts — TODO confirm
)
# NOTE(review): removing every capital 'X' also alters ordinary words that
# contain one; confirm that is acceptable for the token counts below.

In [None]:
# Third row (position 2, first column) among complaints whose cleaned text contains "--"
has_double_dash = complaints['complaint_clean'].str.contains('--', regex=True)
complaints.loc[has_double_dash].iloc[2, 0]

#### Now do some tokenizing

In [None]:
# Concatenate every cleaned complaint into one string for corpus-wide tokenizing.
# Join with a space: with an empty separator the last word of one complaint and
# the first word of the next fuse into a single bogus token whenever a
# complaint does not end in punctuation or whitespace.
# dropna() skips missing complaints, which str.join cannot concatenate.
all_complaints = ' '.join(complaints['complaint_clean'].dropna())

In [None]:
# NLTK's English stop-word list, held as a set for the membership tests in the
# token filters below
stop_words = set(stopwords.words('english'))

In [None]:
# Count every lower-cased token in the corpus that is not a stop word.
# A token is a run of word characters, hyphens and apostrophes. The pattern is
# a raw string because '\w' in an ordinary string literal is an invalid escape
# sequence (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
# Each token is lower-cased once and the result reused for both the
# stop-word test and the count.
all_counter = Counter(
    token
    for token in (raw.lower() for raw in regexp_tokenize(all_complaints, r"[-'\w]+"))
    if token not in stop_words
)

In [None]:
all_counter.most_common()

In [None]:
# Number of unique words after removing redactions and stop words
len(all_counter)

In [None]:
# Total occurrences of all words after removing redactions and stop words
sum(all_counter.values())

In [None]:
# Total occurrences of all words with no cleaning
complaints['word_count'].sum()

In [None]:
# First row (first column) among raw complaints containing a space-delimited "00"
has_bare_double_zero = complaints['complaint'].str.contains(' 00 ')
complaints.loc[has_bare_double_zero].iloc[0, 0]

#### Tokenize by group

In [None]:
def get_cat_tokens(category):
    """Count the non-stop-word tokens in the cleaned complaints of one category.

    Reads the notebook-level ``complaints`` frame (columns ``category`` and
    ``complaint_clean``) and the ``stop_words`` set.

    Parameters
    ----------
    category : str
        Value of the ``category`` column selecting which complaints to tokenize.

    Returns
    -------
    list of (str, int)
        Lower-cased tokens paired with their counts, most frequent first
        (the result of ``Counter.most_common()``). Empty if no complaint
        has the given category.
    """
    in_category = complaints['category'] == category

    # Join with a space: with an empty separator the last word of one complaint
    # and the first word of the next fuse into a single bogus token.
    # dropna() skips missing complaints, which str.join cannot concatenate.
    group_text = ' '.join(complaints.loc[in_category, 'complaint_clean'].dropna())

    # Raw-string pattern ('\w' is an invalid escape in an ordinary literal):
    # a token is a run of word characters, hyphens and apostrophes.
    lowered = (raw.lower() for raw in regexp_tokenize(group_text, r"[-'\w]+"))
    group_counter = Counter(token for token in lowered if token not in stop_words)

    return group_counter.most_common()

In [None]:
complaints['category'].unique()

In [None]:
info_counter = get_cat_tokens('Incorrect information on your report')

In [None]:
info_counter

In [None]:
fraud_counter = get_cat_tokens('Fraud or scam')

In [None]:
fraud_counter

In [None]:
dno_counter = get_cat_tokens('Attempts to collect debt not owed')

In [None]:
dno_counter

In [None]:
comm_counter = get_cat_tokens('Communication tactics')

In [None]:
comm_counter

In [None]:
mortgage_counter = get_cat_tokens('Struggling to pay mortgage')

In [None]:
mortgage_counter

In [None]:
#import pickle
#complaints.to_pickle('../data/complaints_df.pkl')