In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
import string

In [2]:
complaints = pd.read_csv('complaints.csv')

In [3]:
complaints = complaints.rename(columns = {'Consumer complaint narrative': 'complaints_narrative',
                                         'Issue': 'complaints_issue'})

In [4]:
complaints

Unnamed: 0,complaints_narrative,complaints_issue
0,My name is XXXX XXXX this complaint is not mad...,Incorrect information on your report
1,I searched on XXXX for XXXXXXXX XXXX and was ...,Fraud or scam
2,I have a particular account that is stating th...,Incorrect information on your report
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report
...,...,...
353427,Collections account I have no knowledge of,Attempts to collect debt not owed
353428,"Dear CFPB Team, The reason for my complaint is...",Attempts to collect debt not owed
353429,FRCA violations : Failing to Follow Debt Dispu...,Attempts to collect debt not owed
353430,"My Father, a XXXX XXXX acquired an HECM rever...",Struggling to pay mortgage


In [5]:
#remove X's
complaints['complaints_narrative'] = complaints['complaints_narrative'].str.replace(r'[xX][xX]+',"", regex = True)

In [26]:
#remove numbers
complaints['complaints_narrative'] = complaints['complaints_narrative'].str.replace('\d+', '', regex=True)

In [27]:
complaints['complaint_length'] = complaints['complaints_narrative'].apply(len)

In [21]:
#Asha's code to remove stop words and punctuation
stopwords_list = stopwords.words('english') + list(string.punctuation)

def tokenize_and_remove_stopwords(dataframe, column_name):
    stopwords_list = stopwords.words('english') + list(string.punctuation) 

    tokenized_data = []
    for row in dataframe[column_name]:
        tokens = nltk.word_tokenize(row.lower())
        filtered_tokens = [token.lemma_.strip() for token in tokens if token not in stopwords_list]
        tokenized_data.append(filtered_tokens)

    return tokenized_data

complaints['issues_tokenized'] = tokenize_and_remove_stopwords(complaints, 'complaints_narrative')

In [23]:
complaints

Unnamed: 0,complaints_narrative,complaints_issue,complaint_length,issues_tokenized
0,My name is this complaint is not made in err...,Incorrect information on your report,679,"[name, complaint, made, error, neither, made, ..."
1,I searched on for and was pointed to a web...,Fraud or scam,1890,"[searched, pointed, website, legitimately, bel..."
2,I have a particular account that is stating th...,Incorrect information on your report,294,"[particular, account, stating, owe, 10000.00, ..."
3,I have not supplied proof under the doctrine o...,Attempts to collect debt not owed,3396,"[supplied, proof, doctrine, estoppel, silence,..."
4,Hello i'm writing regarding account on my cred...,Incorrect information on your report,860,"[hello, 'm, writing, regarding, account, credi..."
...,...,...,...,...
353427,Collections account I have no knowledge of,Attempts to collect debt not owed,42,"[collections, account, knowledge]"
353428,"Dear CFPB Team, The reason for my complaint is...",Attempts to collect debt not owed,4290,"[dear, cfpb, team, reason, complaint, tried, r..."
353429,FRCA violations : Failing to Follow Debt Dispu...,Attempts to collect debt not owed,4248,"[frca, violations, failing, follow, debt, disp..."
353430,"My Father, a acquired an HECM reverse mortg...",Struggling to pay mortgage,5206,"[father, acquired, hecm, reverse, mortgage, pr..."


In [None]:
#add lemmatization to Asha's code using nltk
from nltk.stem import WordNetLemmatizer


def tokenize_and_lemmatize(dataframe, column_name):
    stopwords_list = stopwords.words('english') + list(string.punctuation)
    lemmatizer = WordNetLemmatizer()
    tokenized_data = []
    
    for row in dataframe[column_name]:
        tokens = nltk.word_tokenize(row.lower())
        filtered_tokens = [lemmatizer.lemmatize(token.strip()) for token in tokens if token not in stopwords_list]
        tokenized_data.append(filtered_tokens)

    return tokenized_data

In [16]:
#Using Spacy to tokenize and lemmatize
#or Using Spacy
import spacy
nlp = spacy.load('en_core_web_sm')

In [17]:
#Using Spacy to tokenize and lemmatize
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower().strip() for token in doc if not token.is_stop
             and not token.is_digit
             and not token.is_punct
             and not token.is_space]
    return ' '.join(tokens)

In [None]:
#this took more than 30 minutes and never finished
complaints['clean_narrative'] = complaints['complaints_narrative'].apply(preprocess_text)

In [None]:
#attempt #3 Using Spacy nlp to tokenize and lemmatize

In [None]:
# Lemmatization with stopwords removal
complaints['lemmatized']=complaints['complaints_narrative'].apply(lambda x: ' '.join([token.lemma_ for token in list(nlp(x)) if (token.is_stop==False)]))

In [31]:
# Load spaCy pipeline
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

# Process large text as a stream via `nlp.pipe()` and iterate over the results, extracting lemmas
lemma_text_list = []
for doc in nlp.pipe(complaints['complaints_narrative']):
    lemma_text_list.append(" ".join(token.lemma_ for token in doc if not token.is_stop))
complaints["text_lemma"] = lemma_text_list

In [None]:
# Use mapping to convert columns to numerical values
issue_mapping = {'Attempts to collect debt not owed': 1, 
                 'Communication tactics': 2, 
                 'Fraud or scam': 3, 
                 'Incorrect information on your report': 4,
                 'Struggling to pay mortgage': 5}

# replace the issue values with numerical values

complaints['issue'] = complaints['issue'].replace(issue_mapping)

### Tim's pickle file

In [3]:
complaints_tim = pd.read_pickle('complaints_nlp.pkl')

In [4]:
complaints_tim

Unnamed: 0,issue,complaint_length,tokens,stems,lemmas
0,4,711,"[name, complaint, made, error, neither, made, ...","[name, complaint, made, error, neither, made, ...","[name, complaint, made, error, neither, made, ..."
1,3,1958,"[searched, pointed, website, legitimately, bel...","[search, point, websit, legitim, believ, websi...","[searched, pointed, website, legitimately, bel..."
2,4,294,"[particular, account, stating, owe, listed, cr...","[particular, account, state, owe, list, credit...","[particular, account, stating, owe, listed, cr..."
3,1,3444,"[supplied, proof, doctrine, estoppel, silence,...","[suppli, proof, doctrin, estoppel, silenc, eng...","[supplied, proof, doctrine, estoppel, silence,..."
4,4,876,"[hello, im, writing, regarding, account, credi...","[hello, im, write, regard, account, credit, re...","[hello, im, writing, regarding, account, credi..."
...,...,...,...,...,...
353427,1,42,"[collections, account, knowledge]","[collect, account, knowledg]","[collection, account, knowledge]"
353428,1,4586,"[dear, cfpb, team, reason, complaint, tried, r...","[dear, cfpb, team, reason, complaint, tri, res...","[dear, cfpb, team, reason, complaint, tried, r..."
353429,1,4328,"[frca, violations, failing, follow, debt, disp...","[frca, violat, fail, follow, debt, disput, pro...","[frca, violation, failing, follow, debt, dispu..."
353430,5,5418,"[father, acquired, hecm, reverse, mortgage, pr...","[father, acquir, hecm, revers, mortgag, proper...","[father, acquired, hecm, reverse, mortgage, pr..."


In [10]:
complaints_proc = pd.read_csv('complaints_proc.csv')

In [15]:
complaints_proc

Unnamed: 0,complaints_narrative,complaints_issue,complaint_length,clean_narrative
0,My name is this complaint is not made in err...,4,679,complaint error party declare penalty perjury ...
1,I searched on for and was pointed to a web...,3,1890,search point website legitimately believe webs...
2,I have a particular account that is stating th...,4,294,particular account state owe $ 10000.00 list c...
3,I have not supplied proof under the doctrine o...,1,3396,supply proof doctrine estoppel silence engelha...
4,Hello i'm writing regarding account on my cred...,4,860,hello write account credit report belong open ...
...,...,...,...,...
353427,Collections account I have no knowledge of,1,42,collection account knowledge
353428,"Dear CFPB Team, The reason for my complaint is...",1,4290,dear cfpb team reason complaint try resolve ac...
353429,FRCA violations : Failing to Follow Debt Dispu...,1,4248,frca violation fail follow debt dispute proced...
353430,"My Father, a acquired an HECM reverse mortg...",5,5206,father acquire hecm reverse mortgage property ...


In [14]:
# Use mapping to convert columns to numerical values
issue_mapping = {'Attempts to collect debt not owed': 1, 
                 'Communication tactics': 2, 
                 'Fraud or scam': 3, 
                 'Incorrect information on your report': 4,
                 'Struggling to pay mortgage': 5}

# replace the issue values with numerical values

complaints_proc['complaints_issue'] = complaints_proc['complaints_issue'].replace(issue_mapping)

In [19]:
X = complaints_proc[['clean_narrative']].fillna('')
y = complaints_proc['complaints_issue']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [53]:
vect = CountVectorizer()

X_train_vec = vect.fit_transform(X_train['clean_narrative'])
X_test_vec = vect.transform(X_test['clean_narrative'])

In [33]:
len(vect.vocabulary_)

61372

In [34]:
vect.vocabulary_['great']

26055

In [59]:
X_train_vec.sum(axis=0).tolist()[0]

matrix([[240312,   1052,      9, ...,      1,      1,      1]],
       dtype=int64)

In [60]:
# Fill this in to build a DataFrame of words and their counts
word_counts = pd.DataFrame({
    'words': vect.get_feature_names_out(),
    'frequency': X_train_vec.sum(axis=0).tolist()[0]
})

word_counts.head()

Unnamed: 0,words,frequency
0,0,240312
1,0,1052
2,0,9
3,0,2
4,90514807004515,1


In [61]:
word_counts.sort_values('frequency')

Unnamed: 0,words,frequency
30685,interese,1
32918,legaslative,1
32919,legate,1
32920,legatee,1
32921,legation,1
...,...,...
0,00,240312
29880,information,290524
3627,account,507766
46229,report,509968


In [None]:
#TRYING CHUNKING FOR WORD_FREQUENCIES

In [24]:
from tqdm import tqdm

In [43]:
vect = CountVectorizer()

chunk_size = 1000

chunks = [X_train[i:i+chunk_size]['clean_narrative'] for i in range(0, len(X_train), chunk_size)]


word_freq = []


for chunk in tqdm(chunks):
    X_train_vec_chunk = vect.fit_transform(chunk)
    word_freq.append(X_train_vec_chunk.toarray().sum(axis=0))


word_freq = sum(word_freq)


word_counts = pd.DataFrame({
    'words': vect.get_feature_names_out(),
    'frequency': word_freq
})

100%|██████████| 266/266 [00:34<00:00,  7.63it/s]


ValueError: operands could not be broadcast together with shapes (4823,) (4831,) 

In [40]:
word_freq

[array([938,  16,   1, ...,  10,   1,   1], dtype=int64),
 array([889,   2,   1, ...,   3,   1,   1], dtype=int64),
 array([852,   7,   1, ...,  10,   1,   2], dtype=int64),
 array([782,   4,   1, ...,   3,  13,   7], dtype=int64),
 array([889,   6,   4, ...,   5,  15,   5], dtype=int64),
 array([843,   1,   7, ...,   8,   2,   3], dtype=int64),
 array([920,   4,   1, ...,  14,   9,   2], dtype=int64),
 array([1097,    3,    1, ...,   17,    8,    1], dtype=int64),
 array([801,  12,   1, ...,   3,   1,   1], dtype=int64),
 array([792,  10,   1, ...,   1,   1,   2], dtype=int64),
 array([869,   5,   1, ...,   2,   2,   2], dtype=int64),
 array([961,   4,   1, ...,  11,   1,   2], dtype=int64),
 array([909,   4,   1, ...,   2,  14,   2], dtype=int64),
 array([881,   2,   3, ...,  12,   8,   2], dtype=int64),
 array([966,   3,   2, ...,  13,   1,   1], dtype=int64),
 array([955,   1,   1, ...,   8,   7,   2], dtype=int64),
 array([789,   2,   3, ...,   4,  11,   2], dtype=int64),
 array([

In [44]:
len(word_counts['words']) 

1

In [45]:
len(word_counts['frequency'])

1

In [None]:
import string
stopwords_list = stopwords.words('english') + list(string.punctuation)

In [None]:
from nltk.corpus import stopwords

In [None]:
X = complaints[['complaints_narrative']]
y = complaints['complaints_issue']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 321, stratify = y)

In [None]:
vect = CountVectorizer()

X_train_vec = vect.fit_transform(X_train['complaints_narrative'])
X_test_vec = vect.transform(X_test['complaints_narrative'])

In [None]:
X_train_vec.shape