# Import libraries and load text

In [53]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.stem.wordnet import WordNetLemmatizer
import string
import plotly
import plotly.graph_objects as go
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [54]:
# NOTE(review): this install cell sits after the cell that already imports plotly,
# so on a fresh environment the import cell above runs first — consider moving this up.
!pip3 install plotly



In [55]:
# Prepping dataframe as done in 1_EDA.ipynb

df = pd.read_csv('complaints_1year.csv')
df = df[['Product', 'Consumer complaint narrative']]

df = df.rename(columns={"Product": "product", "Consumer complaint narrative": "narrative"})

# Product categories that are dropped from the frame
excluded_products = [
    "Debt collection",
    "Money transfer, virtual currency, or money service",
    "Checking or savings account",
]
# .copy() makes the filtered frame independent of the original, so the column
# assignment below cannot raise SettingWithCopyWarning.
df = df[~df['product'].isin(excluded_products)].copy()

# Collapse the remaining product labels into two groups
product_groups = {
    'Credit reporting, credit repair services, or other personal consumer reports': 'credit_related',
    'Credit card or prepaid card': 'credit_related',
    'Mortgage': 'Mortgage and loans',
    'Vehicle loan or lease': 'Mortgage and loans',
    'Payday loan, title loan, or personal loan': 'Mortgage and loans',
    'Student loan': 'Mortgage and loans',
}
# Assign the result back instead of replace(..., inplace=True) on a selected
# column: the in-place form works on a temporary object and is not guaranteed
# to update df (it does nothing when pandas Copy-on-Write is enabled).
df['product'] = df['product'].replace(product_groups)

In [56]:
# Show the first 10 rows of the filtered, relabeled frame
df.head(10)

Unnamed: 0,product,narrative
0,Mortgage and loans,"Caliber Loan Number : XXXX On XX/XX/2020, I sp..."
2,credit_related,This closed XXXX XXXX account ( last four XXXX...
3,credit_related,THESE ACCOUNTS WAS LISTED ON MY CREDIT REPORT ...
4,credit_related,I reviewed a copy of my credit report and show...
5,Mortgage and loans,"My previous complaint was closed, XXXX, but no..."
7,credit_related,THESE ACCOUNTS WAS LISTED ON MY CREDIT REPORT ...
9,Mortgage and loans,"XXXX XXXX XXXX FM XXXX XXXX, TX XXXX Re : Sant..."
11,credit_related,It would be ideal if you be exhorted that I ha...
12,credit_related,"AS A CONSUMER, I AM TRYING TO REMOVE THE FRAUD..."
15,credit_related,I have now called several times today. I was j...


In [57]:
# Renumber the rows 0 .. len(df)-1 so that row labels match row positions
df.index = np.arange(len(df))

In [58]:
# Preview the first five rows to check the renumbered index
df.head()

Unnamed: 0,product,narrative
0,Mortgage and loans,"Caliber Loan Number : XXXX On XX/XX/2020, I sp..."
1,credit_related,This closed XXXX XXXX account ( last four XXXX...
2,credit_related,THESE ACCOUNTS WAS LISTED ON MY CREDIT REPORT ...
3,credit_related,I reviewed a copy of my credit report and show...
4,Mortgage and loans,"My previous complaint was closed, XXXX, but no..."


In [59]:
# Count the rows for each product label
df['product'].value_counts()

credit_related        121107
Mortgage and loans     22555
Name: product, dtype: int64

In [60]:
# Display the narrative text of the row labeled 0
df.loc[0, 'narrative']

'Caliber Loan Number : XXXX On XX/XX/2020, I spoke with a loan agent by the name of XXXX XXXX about refinancing options and interest rates only. He asked if I wanted to submit a loan application and I explicitly told him, " No \'\'. I filed a complaint with Caliber Homes and no excuse or reason was provided for his unethical actions. Their response was, " Since you decided not to move forward with the loan refinance process, the Loan Consultant was required to withdraw your application in our system. " ( Direct quote ). As if this solves the problem. The issue is, I told him not to submit a loan application in the first place from the beginning. The call was recorded on their end. Caliber is saying that a loan number was required to be opened as part of their inquiry, so that the Loan Consultant could gather my information and be able to provide me the most accurate information available in regards to my refinance options. Even if this were the case, I should have been told that upfron

In [61]:
# Number of rows in df
len(df)

143662

# Loop through narratives to remove stopwords, tokenize and lemmatize

In [62]:
# Tokens to discard: NLTK's English stopwords, every single punctuation
# character, multi-character quote/ellipsis tokens, and the redaction
# placeholders that appear in the narratives.
stopwords_list = [
    *stopwords.words('english'),
    *string.punctuation,
    "''", '""', '...', '``',
    '--', 'xxxx', 'xx/xx/2020', 'xx/xx/2021',
]

In [63]:
# function to tokenize data and remove stopwords
def process_narrative(narrative):
    """Tokenize one complaint narrative and drop stopwords, punctuation and numbers.

    Parameters
    ----------
    narrative : str
        Raw complaint text. Any non-string value (for example None, or the
        float NaN pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    list of str
        Lower-cased tokens, in their original order, that are not in the
        module-level ``stopwords_list`` and contain only alphabetic characters.
    """
    # A missing narrative is not text and cannot be tokenized; return no tokens
    # instead of failing part-way through a long run.
    if not isinstance(narrative, str):
        return []

    # Set membership is constant-time; testing every token against the list is not.
    stopword_set = set(stopwords_list)

    kept_tokens = []
    for token in nltk.word_tokenize(narrative):
        lowered = token.lower()
        # isalpha() removes every token that contains a digit or punctuation
        if lowered not in stopword_set and lowered.isalpha():
            kept_tokens.append(lowered)
    return kept_tokens


# function to concat words (used in function below)
def concat_words(list_of_words):
    """Join words into a single space-separated string.

    Parameters
    ----------
    list_of_words : iterable of str
        Words to join, in order.

    Returns
    -------
    str
        The words separated by single spaces, with leading and trailing
        whitespace stripped. An empty input gives an empty string.
    """
    # str.join builds the result in one pass; repeated += re-copies the
    # growing string on every word.
    return ' '.join(list_of_words).strip()

# function to lemmatize words and merge each complaint into a single space-separated string

lemm = WordNetLemmatizer()

def make_lemma_and_concat(list_of_words):
    """Lemmatize each word and merge the results into one space-separated string.

    Parameters
    ----------
    list_of_words : iterable
        Tokens for one complaint. Entries that are not strings (such as NaN
        or None) are skipped.

    Returns
    -------
    str
        The lemmatized words joined by single spaces; an empty string when
        there is nothing to lemmatize.
    """
    # Keep only real strings. An identity test against np.nan only catches that
    # one singleton object, so other NaN floats would slip through it.
    lemmatized_list = [
        lemm.lemmatize(word) for word in list_of_words if isinstance(word, str)
    ]

    # make the list into a single string with the words separated by ' '
    return concat_words(lemmatized_list)

# Prepare dataframe for modeling

In [65]:
# Create the new column from the raw text; the next cell replaces these values
# with the processed text.
df['processed narrative'] = df['narrative']
df.shape

(143662, 3)

In [66]:
# Process every narrative first and assign the column once at the end.
# Writing through df['processed narrative'].loc[i] = ... is chained assignment:
# it is slow, raises SettingWithCopyWarning, and does not update df at all when
# pandas Copy-on-Write is enabled.
processed_narratives = []
for i, narrative in enumerate(df['narrative']):
    processed_narratives.append(make_lemma_and_concat(process_narrative(narrative)))
    if i % 3000 == 0:
        print(f'Finished line number {i}')

# A plain list is assigned by position, so this does not depend on the index labels
df['processed narrative'] = processed_narratives
df.head()

Finished line number 0
Finished line number 3000
Finished line number 6000
Finished line number 9000
Finished line number 12000
Finished line number 15000
Finished line number 18000
Finished line number 21000
Finished line number 24000
Finished line number 27000
Finished line number 30000
Finished line number 33000
Finished line number 36000
Finished line number 39000
Finished line number 42000
Finished line number 45000
Finished line number 48000
Finished line number 51000
Finished line number 54000
Finished line number 57000
Finished line number 60000
Finished line number 63000
Finished line number 66000
Finished line number 69000
Finished line number 72000
Finished line number 75000
Finished line number 78000
Finished line number 81000
Finished line number 84000
Finished line number 87000
Finished line number 90000
Finished line number 93000
Finished line number 96000
Finished line number 99000
Finished line number 102000
Finished line number 105000
Finished line number 108000
Finis

Unnamed: 0,product,narrative,processed narrative
0,Mortgage and loans,"Caliber Loan Number : XXXX On XX/XX/2020, I sp...",caliber loan number spoke loan agent name refi...
1,credit_related,This closed XXXX XXXX account ( last four XXXX...,closed account last four show year month old c...
2,credit_related,THESE ACCOUNTS WAS LISTED ON MY CREDIT REPORT ...,account listed credit report never opened acco...
3,credit_related,I reviewed a copy of my credit report and show...,reviewed copy credit report show inaccurate in...
4,Mortgage and loans,"My previous complaint was closed, XXXX, but no...",previous complaint closed resolved satisfactio...


# Save dataframe as csv for use in other notebooks

In [67]:
# NOTE: with to_csv's default index=True the row index is also written, as an
# unnamed first column of the CSV.
df.to_csv('complaints_processed_1year.csv')