In [1]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pyLDAvis
import pyLDAvis.gensim_models
from nltk.tokenize import word_tokenize
import re
from unidecode import unidecode
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter

from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

In [2]:
# Load the data.
# Documents that became empty during preprocessing are written to CSV as empty
# fields and come back from read_csv as NaN (a float), which would make
# `.split()` below raise AttributeError. Drop them here so the DataFrame rows
# and the corpus stay aligned one-to-one.
processed_df = (
    pd.read_csv('../data/lda_processed_content.csv')
    .dropna(subset=['processed_content'])
    .reset_index(drop=True)
)

# Extract processed content as a list of lists (tokenized documents).
# str() guards against a cell that pandas parsed as a number (e.g. a document
# consisting of a single numeric token).
processed_content = [str(content).split() for content in processed_df['processed_content']]

# Create a dictionary (token -> integer id mapping) from the processed content
dictionary = corpora.Dictionary(processed_content)

# Convert the list of documents (corpus) into Document Term Matrix using the dictionary prepared above
doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_content]

In [3]:


# Fit an LDA model with a small number of topics as a first baseline
num_topics = 3
lda_model = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,        # number of full sweeps over the corpus during training
    random_state=17,  # fixed seed so the run is repeatable
)

# Show the top-weighted words of every topic, one topic per line
for topic in lda_model.print_topics():
    print(topic)

# Evaluate the fitted model with the c_v coherence measure
coherence_model = CoherenceModel(
    model=lda_model,
    texts=processed_content,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(f'Coherence Score: {coherence_score:.4f}')

(0, '0.021*"zacks" + 0.017*"earnings" + 0.012*"company" + 0.012*"year" + 0.011*"stocks" + 0.010*"growth" + 0.008*"rank" + 0.008*"nasdaq" + 0.007*"stock" + 0.007*"quarter"')
(1, '0.016*"year" + 0.012*"company" + 0.011*"quarter" + 0.010*"million" + 0.010*"nvidia" + 0.009*"nasdaq" + 0.008*"billion" + 0.007*"zacks" + 0.007*"revenues" + 0.006*"technology"')
(2, '0.013*"nasdaq" + 0.007*"nyse" + 0.006*"market" + 0.006*"u" + 0.006*"week" + 0.005*"trade" + 0.005*"stock" + 0.005*"stocks" + 0.005*"nvidia" + 0.005*"shares"')
Coherence Score: 0.4203


In [4]:
# Interactive topic visualization with pyLDAvis.
# enable_notebook() switches pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)
# The last expression is the cell output
pyLDAvis.display(vis)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, passes, limit, start=2, step=3, random_state=17):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every topic count in
    ``range(start, limit, step)`` and scored with a c_v CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    passes : Number of training passes handed to each LdaModel
    limit : Upper bound on the number of topics (exclusive, as in ``range``)
    start : First number of topics to try (default 2)
    step : Increment between successive numbers of topics (default 3)
    random_state : Seed handed to each LdaModel (default 17, the value that
        was previously hard-coded)

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics

    Both lists are empty when ``range(start, limit, step)`` is empty.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        lda_model = LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            passes=passes,
            random_state=random_state,
        )
        model_list.append(lda_model)
        coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherence_model.get_coherence())

    return model_list, coherence_values


# We can now run the function to compute the coherence values for different numbers of topics, and then plot those values to determine the optimal number of topics.

In [None]:
# Topic-count grid. Defined once and used for both the sweep and the x-axis so
# the two cannot drift apart.
start = 2
limit = 20  # exclusive upper bound, as in range()
step = 6

model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary,
    corpus=doc_term_matrix,
    texts=processed_content,
    passes=10,
    start=start,
    limit=limit,
    step=step
)

x = range(start, limit, step)

# Label the line on the plot call itself. Passing the bare string
# ("Coherence Values") to plt.legend() is not a one-element tuple: matplotlib
# iterates it character by character and the legend entry ends up as "C".
plt.plot(x, coherence_values, label="Coherence Values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence Score")
plt.legend(loc='best')
plt.title("Coherence Scores vs. Number of Topics")
plt.grid()
plt.show()

# Among the topic counts evaluated (2, 8, and 14), we can see that 8 is probably the best number of topics for now since it has the highest coherence score. We can now train the LDA model with 8 topics and visualize the topics.

In [None]:
# Refit the LDA model with the topic count picked from the coherence plot
num_topics = 8
lda_model = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=num_topics,
    passes=10,        # number of full sweeps over the corpus during training
    random_state=17,  # fixed seed so the run is repeatable
)

# Show the top-weighted words of every topic, one topic per line
for topic in lda_model.print_topics():
    print(topic)

# Evaluate the refitted model with the c_v coherence measure
coherence_model = CoherenceModel(
    model=lda_model,
    texts=processed_content,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(f'Coherence Score: {coherence_score:.4f}')

In [None]:
# Interactive topic visualization with pyLDAvis for the current `lda_model`.
# enable_notebook() switches pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=doc_term_matrix,
    dictionary=dictionary,
)
# The last expression is the cell output
pyLDAvis.display(vis)