# Topic Modeling: Latent Dirichlet Allocation (LDA)

In [1]:
import re
import gensim
import pandas as pd
import swifter
import pyLDAvis
import pyLDAvis.gensim_models

import plotly.express as px

from tqdm import tqdm

pyLDAvis.enable_notebook()
tqdm.pandas()

  from imp import reload


In [2]:
df = pd.read_hdf('./../../code/data/starbucks/data.h5', key='preprocessed_starbucks')

In [3]:
df.shape

(3313, 29)

In [4]:
def sentence_to_words(sentences):
    """Function to convert sentences to words"""
    return (gensim.utils.simple_preprocess(str(sentence), deacc=True) for sentence in sentences)

data = df['preprocessed_tweet'].values.tolist()
data_words = list(sentence_to_words(data))
bigram = gensim.models.phrases.Phrases(data_words, min_count=5, threshold=10, connector_words=gensim.models.phrases.ENGLISH_CONNECTOR_WORDS)
bigram_model = gensim.models.phrases.Phraser(bigram)
df.loc[:, 'sep_words'] = df['preprocessed_tweet'].swifter.apply(lambda x: list(sentence_to_words([x]))[0])
df['bigram'] = df['sep_words'].swifter.apply(lambda x: bigram_model[x])

Pandas Apply:   0%|          | 0/3313 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/3313 [00:00<?, ?it/s]

In [5]:
id2word = gensim.corpora.Dictionary(df['bigram'].values.tolist())
corpus = [id2word.doc2bow(text) for text in df['bigram'].values.tolist()]

In [6]:
topics_range = range(3, 10, 1)

model_results = {
    'Number of topics': [],
    'Coherence Score': []
}

for k in tqdm(topics_range):

    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=k,
                                            random_state=42,
                                            chunksize=500,
                                            passes=10,
                                            alpha='asymmetric',
                                            eta='auto',
                                            per_word_topics=True)
    
    coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, 
                                                    texts=df['bigram'].values.tolist(), 
                                                    dictionary=id2word,
                                                    coherence='u_mass')
    coherence_score = coherence_model_lda.get_coherence()

    model_results['Number of topics'].append(k)
    model_results['Coherence Score'].append(coherence_score)


100%|██████████| 7/7 [00:23<00:00,  3.33s/it]


In [7]:
model_results = pd.DataFrame(model_results)
model_results

Unnamed: 0,Number of topics,Coherence Score
0,3,-7.062664
1,4,-7.452986
2,5,-6.083892
3,6,-6.740066
4,7,-7.565121
5,8,-8.779543
6,9,-8.24949


In [8]:
fig = px.line(model_results, x="Number of topics", y="Coherence Score", title='Scree Plot', markers=True)
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



The number of topics corresponding to the lowest coherence score is 5.

In [14]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=5,
                                            random_state=42,
                                            chunksize=100,
                                            passes=3,
                                            alpha='asymmetric',
                                            eta='auto',
                                            per_word_topics=True)
    
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, 
                                                texts=df['bigram'].values.tolist(), 
                                                dictionary=id2word, 
                                                coherence='u_mass')
coherence_score = coherence_model_lda.get_coherence()

In [15]:
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [11]:
def dominant_topics(ldamodel, corpus, tweets):
    sent_topics_df = pd.DataFrame()
    for row in tqdm(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j==0:
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(
                    pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True
                )
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    contents = pd.Series(tweets)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    sent_topics_df.rename(columns={0: "Tweet"}, inplace=True)
    return sent_topics_df.copy()

topics_df = dominant_topics(ldamodel=lda_model, corpus=corpus, tweets=df['tweet'].values.tolist())

100%|██████████| 3313/3313 [00:01<00:00, 1900.61it/s]


In [12]:
topics_df.head()

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Tweet
0,0,0.8766,"coffee, drink, like, cup, want, good, order, d...",WHY Y’all lie &amp; say Starbucks took EBT now...
1,0,0.8473,"coffee, drink, like, cup, want, good, order, d...",Thanks A Latta Giveaway\n#WIN a $10 Starbucks ...
2,3,0.6863,"like, order, want, use, drink, coffee, hour, p...",I used to hate Starbucks but now I love it so ...
3,0,0.787,"coffee, drink, like, cup, want, good, order, d...",philz needs to replace the starbucks on story ...
4,2,0.7681,"coffee, work, day, time, people, like, think, ...",@staceyabrams @BeeForGeorgia There were more p...
