# Import libraries and load text

In [64]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
import string
import plotly
import plotly.graph_objects as go
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [65]:
# Prepping dataframe as done in 1_EDA.ipynb

# Load the export and keep only the label column and the free-text column.
df = pd.read_csv('complaints_1year.csv')
df = df[['Product', 'Consumer complaint narrative']]

df = df.rename(columns={"Product": "product", "Consumer complaint narrative": "narrative"})

# Class distribution before the categories are collapsed.
print(df['product'].value_counts())

# Collapse the product categories into a binary target:
# 'Money Transfer' vs. 'Non Money Transfer'.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `df['product']`: that form mutates a
# temporary selection, emits a FutureWarning on pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is enabled.
df['product'] = df['product'].replace({
    'Credit reporting, credit repair services, or other personal consumer reports': 'Non Money Transfer',
    'Credit card or prepaid card': 'Non Money Transfer',
    'Mortgage': 'Non Money Transfer',
    'Vehicle loan or lease': 'Non Money Transfer',
    'Payday loan, title loan, or personal loan': 'Non Money Transfer',
    'Student loan': 'Non Money Transfer',
    'Debt collection': 'Non Money Transfer',
    'Money transfer, virtual currency, or money service': 'Money Transfer',
    'Checking or savings account': 'Non Money Transfer',
})

Credit reporting, credit repair services, or other personal consumer reports    103116
Debt collection                                                                  28593
Credit card or prepaid card                                                      17991
Mortgage                                                                         13884
Checking or savings account                                                      10723
Money transfer, virtual currency, or money service                                6180
Vehicle loan or lease                                                             4171
Payday loan, title loan, or personal loan                                         2321
Student loan                                                                      2179
Name: product, dtype: int64


In [66]:
# Class counts after relabelling into the two binary classes.
df['product'].value_counts()

Non Money Transfer    182978
Money Transfer          6180
Name: product, dtype: int64

In [67]:
# Draw the training sample separately per class (20% of 'Money Transfer',
# 27% of 'Non Money Transfer'); the rows that are not drawn are kept aside.
# A fixed seed makes the split, and therefore the CSVs written at the end of
# the notebook, identical on every Restart & Run All.
SPLIT_SEED = 42

train_pos = df[df['product'] == 'Money Transfer'].sample(frac=0.2, random_state=SPLIT_SEED)
df_rest_pos = df[df['product'] == 'Money Transfer'].drop(train_pos.index)

train_neg = df[df['product'] == 'Non Money Transfer'].sample(frac=0.27, random_state=SPLIT_SEED)
df_rest_neg = df[df['product'] == 'Non Money Transfer'].drop(train_neg.index)


In [68]:
# Recombine the per-class pieces: df_rest holds the rows left over after
# sampling, df_train holds the sampled rows.
df_rest = pd.concat([df_rest_pos, df_rest_neg])
df_train = pd.concat([train_pos, train_neg])

In [69]:
# Class counts in the training sample.
df_train['product'].value_counts()

Non Money Transfer    49404
Money Transfer         1236
Name: product, dtype: int64

In [71]:
# Class counts in the remaining (non-sampled) rows.
df_rest['product'].value_counts()

Non Money Transfer    133574
Money Transfer          4944
Name: product, dtype: int64

In [72]:
# (rows, columns) of the training sample.
df_train.shape


(50640, 2)

In [73]:
# Peek at the last rows; the index still carries the row labels inherited from df.
df_train.tail(10)

Unnamed: 0,product,narrative
39284,Non Money Transfer,On XX/XX/XXXX I received a settlement offer fr...
37645,Non Money Transfer,on XX/XX/2019 My Wife had her Hours Cut back a...
184492,Non Money Transfer,I have an unverified accounts on my credit rep...
167633,Non Money Transfer,I recieved notices dated XX/XX/2021 and XX/XX/...
18464,Non Money Transfer,according to sunoco rewards card there saying ...
121146,Non Money Transfer,"XXXX XXXX XXXX XXXX XXXX XXXX XXXX,XX/XX/XXXXG..."
62676,Non Money Transfer,On XX/XX/2020 I sent a letter regarding inaccu...
102099,Non Money Transfer,I am the victim of an identity theft where a c...
47200,Non Money Transfer,My home loan was placed in forbearance as of X...
144561,Non Money Transfer,I contacted Experian for a credit report as we...


In [74]:
# Relabel the rows 0..len-1 so the label-based `.loc[i]` lookups later in the
# notebook can address every row by its position.
df_train.index = np.arange(0, len(df_train))

In [75]:
# Check the tail again to confirm the index is now consecutive.
df_train.tail(10)

Unnamed: 0,product,narrative
50630,Non Money Transfer,On XX/XX/XXXX I received a settlement offer fr...
50631,Non Money Transfer,on XX/XX/2019 My Wife had her Hours Cut back a...
50632,Non Money Transfer,I have an unverified accounts on my credit rep...
50633,Non Money Transfer,I recieved notices dated XX/XX/2021 and XX/XX/...
50634,Non Money Transfer,according to sunoco rewards card there saying ...
50635,Non Money Transfer,"XXXX XXXX XXXX XXXX XXXX XXXX XXXX,XX/XX/XXXXG..."
50636,Non Money Transfer,On XX/XX/2020 I sent a letter regarding inaccu...
50637,Non Money Transfer,I am the victim of an identity theft where a c...
50638,Non Money Transfer,My home loan was placed in forbearance as of X...
50639,Non Money Transfer,I contacted Experian for a credit report as we...


In [76]:
# Class counts again after the re-index (re-indexing does not add or drop rows).
df_train['product'].value_counts()

Non Money Transfer    49404
Money Transfer         1236
Name: product, dtype: int64

In [77]:
# Show the raw text of the row labelled 0 (single row/column lookup).
df_train.loc[0, 'narrative']

'My wife received settlement from XXXX XXXX company in the amount of {$9000.00} She opted to have the funds sent through PayPal for the so called quick and convenient service. She received the money uploaded her debit card information, she tried to cash out the money but was only allowed to send {$500.00}. So she called PayPal numerous times and got messages stating due to high call volume no-one maybe be able to answer the call and then the call drops. She finally got a representative explained her issue and the representative restored her account. My wife then sent {$5000.00} to my PayPal account then she sent {$3500.00}. I received the money and tried to cash it down to my XXXX XXXX account and when I did that I immediately received an email from PayPal stating that my account was shut down and closed and that I could look at my account but was unable to claim my money or perform any transactions and that they would be holding my money for 180 days/6 months. This is unacceptable, we

In [78]:
# Number of rows in the training sample.
len(df_train)

50640

# Define helpers to tokenize narratives, remove stopwords and lemmatize

In [79]:
# Tokens dropped during cleaning, in order: NLTK's English stopwords, each
# character of string.punctuation, multi-character quote/ellipsis tokens, and
# the masking placeholders that appear in the narratives.
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``',
    '--', 'xxxx', 'xx/xx/2020', 'xx/xx/2021',
]

In [80]:
# function to tokenize data and remove stopwords
def process_narrative(narrative):
    """Tokenize one narrative and keep only lowercase, purely alphabetic, non-stopword tokens.

    Parameters
    ----------
    narrative : str
        Raw complaint text. Any non-string value (e.g. the float NaN pandas
        uses for a missing cell, or None) is treated as an empty narrative.

    Returns
    -------
    list of str
        Lowercased tokens, in their original order, that are not in the
        module-level `stopwords_list` and consist solely of letters.
    """
    # A missing narrative would otherwise make the tokenizer raise; an empty
    # token list lets the caller produce an empty processed string instead.
    if not isinstance(narrative, str):
        return []

    # Set membership is O(1) per token, versus a linear scan of the list.
    stop_tokens = set(stopwords_list)

    kept_tokens = []
    for token in nltk.word_tokenize(narrative):
        lowered = token.lower()
        # isalpha() drops tokens containing digits or punctuation.
        if lowered not in stop_tokens and lowered.isalpha():
            kept_tokens.append(lowered)
    return kept_tokens


# function to concat words (used in function below)
def concat_words(list_of_words):
    """Join words into one space-separated string.

    Parameters
    ----------
    list_of_words : iterable of str
        Words to join, in order.

    Returns
    -------
    str
        The words separated by single spaces, with leading and trailing
        whitespace stripped. An empty input yields ''.
    """
    # str.join builds the result in one pass instead of re-copying the
    # accumulated string on every `+=`.
    return ' '.join(list_of_words).strip()

# function to lemmatize words and merge each complaint into a single space-separated string

# One shared lemmatizer instance, reused for every call.
lemm = WordNetLemmatizer()

def make_lemma_and_concat(list_of_words):
    """Lemmatize each word and merge the results into one space-separated string.

    Parameters
    ----------
    list_of_words : iterable
        Tokens of a single complaint. Entries that are not strings (such as
        NaN placeholders) are skipped.

    Returns
    -------
    str
        The lemmatized words joined by `concat_words`.
    """
    # Filter by type rather than `is not np.nan`: the identity test only
    # matches the np.nan singleton, so any other NaN object (float('nan'),
    # a NaN produced by a computation) would reach the lemmatizer.
    lemmatized_list = [
        lemm.lemmatize(word) for word in list_of_words if isinstance(word, str)
    ]

    # make the list into a single string with the words separated by ' '
    return concat_words(lemmatized_list)

# Prepare dataframe for modeling

In [81]:
# Create the 'processed narrative' column as a copy of the raw text; the
# following cell replaces its contents with the cleaned text.
df_train['processed narrative'] = df_train['narrative']
df_train.shape

(50640, 3)

In [82]:
# Clean every training narrative: tokenize / filter, then lemmatize and re-join.
# Results are collected in a list and assigned to the column in one step.
# The earlier `df_train['processed narrative'].loc[i] = narr` form was chained
# assignment: it triggers SettingWithCopyWarning and writes to a temporary
# (leaving df_train untouched) once pandas Copy-on-Write is enabled.
processed_narratives = []
for i, narrative in enumerate(df_train['narrative']):
    tokens = process_narrative(narrative)
    processed_narratives.append(make_lemma_and_concat(tokens))
    if i % 3000 == 0:
        print(f'Finished line number {i}')

df_train['processed narrative'] = processed_narratives
df_train.head()

Finished line number 0
Finished line number 3000
Finished line number 6000
Finished line number 9000
Finished line number 12000
Finished line number 15000
Finished line number 18000
Finished line number 21000
Finished line number 24000
Finished line number 27000
Finished line number 30000
Finished line number 33000
Finished line number 36000
Finished line number 39000
Finished line number 42000
Finished line number 45000
Finished line number 48000


Unnamed: 0,product,narrative,processed narrative
0,Money Transfer,My wife received settlement from XXXX XXXX com...,wife received settlement company amount opted ...
1,Money Transfer,My girlfriend sent me {$5000.00} on XX/XX/XXXX...,girlfriend sent using paypal help pay work don...
2,Money Transfer,"On XX/XX/2021, I initiated a wire transfer of ...",initiated wire transfer gemini account externa...
3,Money Transfer,Can not link my PNC Bank account to numerous X...,link pnc bank account numerous platform
4,Money Transfer,I receive two payment for the amount of {$1500...,receive two payment amount cash app try cash m...


In [85]:
# (rows, columns) of the remaining rows.
# NOTE(review): the recorded output shows 3 columns, but no visible cell adds
# 'processed narrative' to df_rest before this point (execution counts also
# skip 83-84) -- presumably a since-removed cell created it; confirm.
df_rest.shape

(138518, 3)

In [88]:
# Peek at df_rest before cleaning; in the recorded output 'processed narrative'
# still equals the raw text.
df_rest.head()

Unnamed: 0,product,narrative,processed narrative
17,Money Transfer,My ex boyfriend worked for the call center at ...,My ex boyfriend worked for the call center at ...
19,Money Transfer,"Money was removed from my account, seemingly t...","Money was removed from my account, seemingly t..."
44,Money Transfer,The loan company Money Key Loans called me on ...,The loan company Money Key Loans called me on ...
61,Money Transfer,Here is a time Line of the Wire that was sent ...,Here is a time Line of the Wire that was sent ...
113,Money Transfer,Coinbase of XXXX XXXX has frozen my available ...,Coinbase of XXXX XXXX has frozen my available ...


In [89]:
# Apply the same cleaning to the rows that were not sampled for training.
df_rest.index = np.arange(0, len(df_rest))

# Collect the cleaned text in a list and assign the whole column at once.
# This creates 'processed narrative' if it does not exist yet (so the cell no
# longer depends on a column made by a cell that is not in the notebook) and
# avoids the chained `df_rest['processed narrative'].loc[i] = narr` write,
# which is ineffective under pandas Copy-on-Write.
processed_narratives = []
for i, narrative in enumerate(df_rest['narrative']):
    tokens = process_narrative(narrative)
    processed_narratives.append(make_lemma_and_concat(tokens))
    if i % 3000 == 0:
        print(f'Finished line number {i}')

df_rest['processed narrative'] = processed_narratives
df_rest.head()

Finished line number 0
Finished line number 3000
Finished line number 6000
Finished line number 9000
Finished line number 12000
Finished line number 15000
Finished line number 18000
Finished line number 21000
Finished line number 24000
Finished line number 27000
Finished line number 30000
Finished line number 33000
Finished line number 36000
Finished line number 39000
Finished line number 42000
Finished line number 45000
Finished line number 48000
Finished line number 51000
Finished line number 54000
Finished line number 57000
Finished line number 60000
Finished line number 63000
Finished line number 66000
Finished line number 69000
Finished line number 72000
Finished line number 75000
Finished line number 78000
Finished line number 81000
Finished line number 84000
Finished line number 87000
Finished line number 90000
Finished line number 93000
Finished line number 96000
Finished line number 99000
Finished line number 102000
Finished line number 105000
Finished line number 108000
Finis

Unnamed: 0,product,narrative,processed narrative
0,Money Transfer,My ex boyfriend worked for the call center at ...,ex boyfriend worked call center u bank wiscons...
1,Money Transfer,"Money was removed from my account, seemingly t...",money removed account seemingly cashapp able c...
2,Money Transfer,The loan company Money Key Loans called me on ...,loan company money key loan called saying saw ...
3,Money Transfer,Here is a time Line of the Wire that was sent ...,time line wire sent activity since party invol...
4,Money Transfer,Coinbase of XXXX XXXX has frozen my available ...,coinbase frozen available cash withdraw availa...


# Save dataframes as CSV for use in other notebooks

In [90]:
# Write the processed training sample to disk. `index=False` is not passed,
# so the 0..n-1 index is written as the first column.
df_train.to_csv('complaints_processed_1year_0719.csv')

In [91]:
# Final look at the processed remaining rows before saving.
df_rest.head()

Unnamed: 0,product,narrative,processed narrative
0,Money Transfer,My ex boyfriend worked for the call center at ...,ex boyfriend worked call center u bank wiscons...
1,Money Transfer,"Money was removed from my account, seemingly t...",money removed account seemingly cashapp able c...
2,Money Transfer,The loan company Money Key Loans called me on ...,loan company money key loan called saying saw ...
3,Money Transfer,Here is a time Line of the Wire that was sent ...,time line wire sent activity since party invol...
4,Money Transfer,Coinbase of XXXX XXXX has frozen my available ...,coinbase frozen available cash withdraw availa...


In [92]:
# Relabel rows 0..len-1 before export. The cell that processes df_rest
# already performs this same assignment, so this is a repeat.
df_rest.index = np.arange(0, len(df_rest))

In [93]:
# Write the processed remaining rows to the '_test' CSV (the index is
# included as the first column, as with the training file).
df_rest.to_csv('complaints_processed_1year_0719_test.csv')