In [1]:
#imports
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/norahajjar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the complaints export downloaded from the CFPB database.
# The CSV is too large for GitHub, so it is kept locally; save your own copy
# next to this notebook if you want to run it.
df = pd.read_csv(filepath_or_buffer="complaints.csv")

In [3]:
# Drop rows without a consumer complaint narrative (consistent with the EDA).
# Reassign rather than use inplace=True: the cell then produces a new frame
# from its input instead of silently mutating the loaded one.
df = df.dropna(subset=['Consumer complaint narrative'])

In [4]:
# Same column normalisation as the EDA notebook:
# lower-case, spaces replaced by underscores, question marks removed.
new_names = list(
    df.columns.str.lower()
    .str.replace(" ", "_", regex=False)
    .str.replace("?", "", regex=False)
)
df.columns = new_names

In [5]:
# Topic modelling only uses the text, so keep an independent copy holding
# just the complaint id and its narrative.
df2 = df.loc[:, ['complaint_id', 'consumer_complaint_narrative']].copy()

Steps:
1) convert to lowercase
2) remove special characters and tokenize the text into terms
3) remove stop words
4) stemming
5) construct term-document matrix

Steps based on a locally saved PDF, "Latent Dirichlet Allocation (LDA) for Topic Modeling" (file:///Users/norahajjar/Downloads/Latent_Dirichlet_Allocation_LDA_for_Topic_Modeling.pdf)
Cleaning code based on: https://ashwin-ks.github.io/2018-08-15-NLP-Consumer-Complaints-Classification-ML-DL/

In [6]:
# Clean the narratives in four passes:
#   1. strip the punctuation characters , . ! ?
#   2. convert to lowercase
#   3. drop NLTK English stop words
#   4. remove redacted "xx..." runs that are followed by whitespace
stop = stopwords.words('english')
# Set membership is O(1); testing every token against the list is a linear scan.
stop_set = set(stop)
df2['consumer_complaint_narrative'] = (
    df2['consumer_complaint_narrative']
    # Raw-string pattern avoids the invalid "\." escape of a plain string.
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
    .apply(lambda text: ' '.join(tok for tok in text.split() if tok not in stop_set))
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the redactions in place.
    .str.replace(r"xx+\s", "", regex=True)
)

In [7]:
# Peek at the first cleaned narrative.
df2['consumer_complaint_narrative'].head(1)

0    transworld systems inc trying collect debt min...
Name: consumer_complaint_narrative, dtype: object

In [8]:
# Standardise text using external normalisation dictionaries.
# Sources of the lookup files read below:
#   http://people.eng.unimelb.edu.au/tbaldwin/etc/emnlp2012-lexnorm.tgz
#   http://luululu.com/tweet/typo-corpus-r1.txt
# `dico` maps the first whitespace-separated field of each line to the second;
# entries from a later file overwrite entries from an earlier one.
dico = {}
for dict_path in ('emnlp_dict.txt', 'typo-corpus-r1.txt'):
    # `with` closes the handle even if reading or parsing a line raises.
    with open(dict_path, encoding='utf8') as dict_file:
        for line in dict_file:
            fields = line.split()
            # Blank or single-field lines carry no mapping; skip them rather
            # than fail with IndexError.
            if len(fields) < 2:
                continue
            dico[fields[0]] = fields[1]

In [9]:
def txt_std(words):
    """Normalise a text with the module-level ``dico`` lookup.

    The text is split on whitespace; every token that is a key of ``dico`` is
    replaced by its mapped value and all other tokens are kept as they are.
    The tokens are returned joined by single spaces.
    """
    return ' '.join(dico.get(token, token) for token in words.split())

In [10]:
# Run the dictionary-based normalisation over every narrative.
df2['consumer_complaint_narrative'] = df2['consumer_complaint_narrative'].map(txt_std)

In [11]:
# Second stop-word pass, run after the dictionary normalisation
# (presumably because the replacements can introduce new stop words - confirm).
stop = stopwords.words('english')
stop_lookup = frozenset(stop)
df2['consumer_complaint_narrative'] = df2['consumer_complaint_narrative'].map(
    lambda text: ' '.join(tok for tok in text.split() if tok not in stop_lookup)
)

In [12]:
# Show the first five cleaned rows.
df2.head(n=5)

Unnamed: 0,complaint_id,consumer_complaint_narrative
0,3384392,transworld systems trying collect debt mine ow...
2,3417821,would like request suppression following items...
3,3433198,past 2 weeks receiving excessive amounts telep...
11,3366475,sold access event digitally screenshots detail...
12,3385399,checking credit report noticed three collectio...


In [13]:
# Single comma-joined string of all narratives, used as the word-cloud input.
words = ','.join(df2['consumer_complaint_narrative'])

In [14]:
# Word-cloud renderer: white background, at most 100 words,
# steel-blue contour of width 3.
wordcloud = WordCloud(
    max_words=100,
    background_color="white",
    contour_color='steelblue',
    contour_width=3,
)

In [None]:
# Build the cloud from the joined narratives and display it as an image
# (generate() returns the WordCloud itself, so the calls can be chained).
wordcloud.generate(words).to_image()

In [None]:
# Helper function
def plot_10_most_common_words(count_data, count_vectorizer):
    """Plot a bar chart of the ten terms with the highest total count.

    Parameters
    ----------
    count_data
        Document-term count matrix whose columns line up with the
        vectorizer's feature names, e.g. the result of
        ``count_vectorizer.fit_transform(...)``.
    count_vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` (or the legacy
        ``get_feature_names()``).

    The function draws on matplotlib figure number 2, changes the seaborn
    plotting context, shows the figure and returns ``None``.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()

    # One column sum replaces densifying and adding every row separately.
    total_counts = np.asarray(count_data.sum(axis=0)).ravel().astype(float)

    # A stable sort on the negated counts keeps tied terms in vocabulary
    # order, matching sorted(..., reverse=True) on (word, count) pairs.
    top_idx = np.argsort(-total_counts, kind='stable')[:10]
    words = [words[i] for i in top_idx]
    counts = [total_counts[i] for i in top_idx]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    # seaborn >= 0.12 only accepts the data vectors as keyword arguments.
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

In [None]:
# Build the bag-of-words representation.
# The vectorizer is configured with scikit-learn's built-in English stop-word list.
narratives = df2['consumer_complaint_narrative']
count_vectorizer = CountVectorizer(stop_words='english')

# Learn the vocabulary and produce the document-term count matrix in one step.
count_data = count_vectorizer.fit_transform(narratives)

# Chart the ten most frequent terms with plot_10_most_common_words.
plot_10_most_common_words(count_data, count_vectorizer)

There are several existing algorithms you can use to perform topic modeling. The most common are:
Latent Semantic Analysis (LSA/LSI), 
Probabilistic Latent Semantic Analysis (pLSA), and 
Latent Dirichlet Allocation (LDA)
https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

In [None]:
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model
        Fitted topic model exposing ``components_``, iterated as one weight
        vector per topic.
    count_vectorizer
        Fitted vectorizer whose feature names are indexed by the positions
        in each weight vector.
    n_top_words
        Number of words printed per topic, in descending weight order.

    Output goes to stdout; the function returns ``None``.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older installations.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        words = count_vectorizer.get_feature_names_out()
    else:
        words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so walk the last n_top_words indices backwards.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print(" ".join(str(words[i]) for i in top_indices))
        
# Tweak the two parameters below
number_topics = 5   # number of topics the model is asked to find
number_words = 10   # words printed per topic

# Create and fit the LDA model.
# LDA fitting is stochastic: a fixed random_state makes the printed topics
# (and anything cached from this model) reproducible across kernel restarts.
lda = LDA(n_components=number_topics, n_jobs=-1, random_state=42)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)

In [None]:
%%time
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis
LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(number_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
with open(LDAvis_data_filepath, 'w') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath) as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(number_topics) +'.html')

In [None]:
#DBSCAN? 

In [None]:
#any other models?