In [1]:
# import numpy as np
# import random
# from collections import Counter, defaultdict
# import re

# Load dataset and pre-processing

In [2]:
# Placeholder binding for df_summary; presumably overwritten by the preprocessing notebook run in the next cell — TODO confirm
df_summary = 0

In [None]:
# This code is downloading the notebook from GitHub and running it
import requests
from pathlib import Path
url = "https://raw.githubusercontent.com/nbakas/NLP/refs/heads/main/02-Preprocessing.ipynb"
filename = url.split("/")[-1]
local_path = Path.cwd() / filename
if not local_path.exists():
    response = requests.get(url)
    response.raise_for_status()
    local_path.write_bytes(response.content)
%run $local_path

In [None]:
# Display df_summary as it stands after the previous cell has run
df_summary

# Libraries

In [5]:
# We will use gensim library for topic modeling
# Import corpora module for document processing
from gensim import corpora
# Import LdaMulticore for parallel LDA implementation
from gensim.models.ldamulticore import LdaMulticore
# Import matplotlib for visualization
import matplotlib.pyplot as plt

# Convert df_summary to list of texts

In [None]:
# Cast every entry of df_summary to str and collect the values in a plain Python list
my_texts = df_summary.astype(str).to_list()
# Preview the first five texts
my_texts[0:5]

# Tokenize the texts

In [None]:
# Whitespace tokenisation: str.split() without arguments splits on any run of whitespace
processed_texts = list(map(str.split, my_texts))
# Preview the token lists of the first five texts
processed_texts[0:5]

# Create a dictionary mapping words to their IDs

In [None]:
# Build the gensim Dictionary from the tokenised texts
my_dictionary = corpora.Dictionary(documents=processed_texts)
my_dictionary

In [None]:
# Peek at 10 randomly chosen (id, word) pairs to see how the dictionary is organised
import random
print("10 random items from the dictionary:")
# Sample 10 distinct ids from the dictionary's keys
random_ids = random.sample(list(my_dictionary.keys()), 10)
# One line per sampled id, showing the word stored under it
print("\n".join(f"Word ID {word_id}: {my_dictionary[word_id]}" for word_id in random_ids))

In [None]:
# Number of entries in the dictionary at this point (before the filtering cell below)
len(my_dictionary)

# Filter out extreme values

In [None]:
# Filter out extreme values (optional)
# Filter out extreme values to improve LDA performance and quality
# no_below=10: Remove words that appear in fewer than 10 documents (rare terms)
#   - Removes noise and very specific terms that don't help identify general topics
# no_above=0.1: Remove words that appear in more than 10% of documents (too common)
#   - Removes overly common words that appear across many topics and don't help differentiate
# This filtering reduces my_dictionary size, speeds up computation, and helps LDA focus on meaningful topic-specific words
# Note: the call below is made for its effect on my_dictionary itself; no new object is assigned
my_dictionary.filter_extremes(no_below=10, no_above=0.1)
my_dictionary

In [None]:
# Number of entries in the dictionary after filter_extremes has been applied
len(my_dictionary)

# Create a document-term matrix

In [None]:
# Build the "bag-of-words" representation of the processed texts with Gensim:
# every token list in `processed_texts` is passed through `my_dictionary.doc2bow()`.
my_corpus = list(map(my_dictionary.doc2bow, processed_texts))

# Reading the preview below: each document is a list of (word_id, frequency) tuples,
# i.e. a word identified by its dictionary ID and how many times it appears in that document.
# Example (illustrative — the actual output depends on the data):
# `[[(0, 1), (1, 1), (2, 1)], [(3, 1)], [(4, 1)], [(5, 1), (6, 1)], [(7, 1)]]` would mean
#   1. First document contains words with IDs 0, 1, and 2, each appearing once
#   2. Second document contains word with ID 3, appearing once
#   3. Third document contains word with ID 4, appearing once
#   4. Fourth document contains words with IDs 5 and 6, each appearing once
#   5. Fifth document contains word with ID 7, appearing once
my_corpus[0:5]

# Set LDA parameters

In [14]:
num_topics = 10  # Number of topics to be extracted
my_passes = 10 # Number of passes of the corpus through the model during training. More passes means better accuracy but longer runtime
workers = 4  # Number of worker processes for parallel computing

# Train the LDA model

In [15]:
# Train the LDA model on the bag-of-words corpus.
# It will take ~10 minutes to train the model.
# https://radimrehurek.com/gensim/models/ldamulticore.html
lda_model = LdaMulticore(
    corpus=my_corpus, # The bag-of-words corpus we created earlier
    id2word=my_dictionary, # Maps word IDs to actual words for interpretable output
    num_topics=num_topics, # Number of topics to extract
    passes=my_passes, # Number of training passes through the corpus
    workers=workers, # Number of parallel processes to use
    alpha='symmetric', # Topic distribution prior - 'symmetric' uses the same prior weight for every topic
    eta='auto', # Word distribution prior (influences how words are distributed across topics). 'auto' lets the model learn the prior from the corpus. β in notes.
    random_state=42 # Fixed seed for the model's random initialisation, so that re-running the notebook gives comparable topics
)

# Evaluate LDA model performance

## Coherence score

The coherence score measures the semantic similarity between the high-scoring words within each topic, i.e. how well each topic is defined by its top words. Here it is computed with the c_v (coherence of vectors) measure, which is based on a sliding window, normalized pointwise mutual information (NPMI) and cosine similarity. It takes values between 0 and 1, with 1 being the highest coherence. Typical values are between 0.3 and 0.6.

In [None]:
# Score the trained model with the c_v coherence measure, using the tokenised texts as reference
# Docs: https://radimrehurek.com/gensim/models/coherencemodel.html
from gensim.models.coherencemodel import CoherenceModel
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=processed_texts,
    dictionary=my_dictionary,
    coherence='c_v',
)
coherence_score = coherence_model_lda.get_coherence()
print(f"Coherence Score: {coherence_score:.4f}")

## Perplexity 

Perplexity - a measure of how well the model predicts a sample (lower is better). 
- It's the exponential of the negative average log-likelihood per word
- Typical perplexity values for LDA models are usually in the range of 100–1000
- Lower values (e.g., < 100) indicate better generalization (less surprise), but very low perplexity on the training set (e.g., < 50) can be a sign of overfitting, meaning the model fits the training data too closely and may not generalize well to unseen data
- Very high values (e.g., > 1000) suggest poor topic modeling or an inappropriate number of topics

**Perplexity Formula:**

If `log_perplexity` is the average per-word log-likelihood bound returned by Gensim (a negative number, as in the table below):

Perplexity = e^(-log_perplexity)

Where:
- `log_perplexity` is returned by `lda_model.log_perplexity(corpus)`
- `exp` is the exponential function (base *e*)

| log_perplexity | Actual perplexity | Interpretation                  |
|----------------|-------------------|---------------------------------|
| -5             | ~148              | Very good fit                   |
| -6             | ~403              | Good                            |
| -7             | ~1097             | Acceptable to borderline high   |
| -8             | ~2980             | Likely too high → poor generalization |

In [None]:
import math

# log_perplexity() returns a log-scale per-word bound, not the perplexity itself,
# so it is stored and reported under its own name.
log_perplexity = lda_model.log_perplexity(my_corpus)
# Convert to the actual perplexity with the formula from the notes above: exp(-log_perplexity)
perplexity = math.exp(-log_perplexity)
print(f"Log perplexity (per-word bound): {log_perplexity:.4f}")
print(f"Perplexity: {perplexity:.4f}")

## Topic-term weight statistics

In [None]:
import numpy as np
# Topic-term matrix: one row per topic, holding that topic's weight for every term
topic_term_matrix = lda_model.get_topics()
# Report the matrix dimensions, then show the matrix itself
print(np.shape(topic_term_matrix))
topic_term_matrix

In [None]:
# For every topic, list its 10 highest-weight terms together with their weights
for topic_idx, topic in enumerate(topic_term_matrix):
    # Sort descending by weight and keep the first 10 positions
    top_terms_indices = topic.argsort()[::-1][:10]
    # Translate the positions into words and look up the matching weights
    top_terms = [lda_model.id2word[idx] for idx in top_terms_indices]
    top_terms_probs = [topic[idx] for idx in top_terms_indices]
    top_terms_with_probs = [
        f"{term} ({prob:.4f})"
        for term, prob in zip(top_terms, top_terms_probs)
    ]
    print(f"Topic {topic_idx + 1}:")
    print("Top terms:", ", ".join(top_terms_with_probs))
    print()

In [None]:
# Topic distinctiveness: 1 minus the average cosine similarity between a topic and
# every OTHER topic, computed on the union of each topic's top-N words.
top_n = 30
top_words = set()

# Collect the column indices of the top-N words across all topics
for topic in topic_term_matrix:
    top_indices = topic.argsort()[-top_n:][::-1]
    top_words.update(top_indices)

# Convert to sorted list so the column order is deterministic
top_words = sorted(top_words)

# Slice topic_term_matrix to include only the collected top-N words
reduced_topic_matrix = topic_term_matrix[:, top_words]

# Pairwise cosine similarity: normalise every row to unit length, then take all dot products at once
unit_rows = reduced_topic_matrix.astype(np.float64)
unit_rows /= np.linalg.norm(unit_rows, axis=1, keepdims=True)
topic_similarities = unit_rows @ unit_rows.T
# A topic is not compared with itself
np.fill_diagonal(topic_similarities, 0.0)

# Average over the OTHER topics only: dividing by the full row length would count the
# zeroed diagonal as a similarity of 0 and understate the average.
other_topics = max(topic_similarities.shape[0] - 1, 1)  # guard against a single-topic model
avg_similarities = topic_similarities.sum(axis=1) / other_topics
topic_distinctiveness = 1 - avg_similarities

print("\nTopic Distinctiveness (higher is better):")
for i, d in enumerate(topic_distinctiveness):
    print(f"Topic {i+1}: {d:.4f}")
print(f"Average Topic Distinctiveness: {np.mean(topic_distinctiveness):.4f}")


In [None]:
# List every topic of the model in gensim's own string format (num_topics=-1 requests all topics)
print("\nGensim LDA Topics:")
for idx, topic in lda_model.print_topics(num_topics=-1):
    # Topics are numbered from 1 in the printout
    print(f"Topic {idx+1}: {topic}")

# Visualize LDA topics using pyLDAvis

In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Prepare the visualization data from the trained model, the bag-of-words corpus and the dictionary
vis_data = gensimvis.prepare(lda_model, my_corpus, my_dictionary)

# Enable pyLDAvis output inside the notebook (no figure size is set by this call)
pyLDAvis.enable_notebook()

# Display the interactive visualization
pyLDAvis.display(vis_data)

In [None]:
# Show the strongest topics of 10 randomly chosen documents
import random
import numpy as np

# Draw 10 distinct positions from the corpus
random_doc_indices = random.sample(range(len(my_corpus)), 10)

print("\nTopic Distribution for 10 Random Documents:")
print("-" * 50)

for idx in random_doc_indices:
    # Topic mixture of this document, ordered from most to least probable
    doc_topics = sorted(
        lda_model.get_document_topics(my_corpus[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Entry of df_summary at the same position as the corpus document
    original_text = df_summary.iloc[idx]

    print(f"\nDocument {idx}: \"{original_text}\"")
    print("Topic Distribution:")

    # Report at most the three most probable topics
    for topic_id, prob in doc_topics[:3]:
        # Five highest-weight words of this topic, joined for display
        topic_words = lda_model.show_topic(topic_id, topn=5)
        words = ", ".join(word for word, _ in topic_words)

        # Probability expressed as a percentage
        prob_percent = prob * 100

        print(f"  Topic {topic_id+1}: {prob_percent:.2f}% ({words})")
