# 07 - Topic Modeling & Clustering
## INSY 669 Text Analytics | GLP-1 Weight Loss Drugs

This notebook discovers latent themes in each corpus using:
1. **LDA (Latent Dirichlet Allocation)** - probabilistic topic modeling
2. **K-Means Clustering** - unsupervised document grouping
3. **Topic comparison** - how topics differ between public and media corpora

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
# Silence only deprecation-style noise from library upgrades. Other warnings
# (for example scikit-learn's fitting/convergence warnings) stay visible so
# that problems in the LDA and K-Means cells are not hidden from the reader.
for _noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=_noisy_category)

# Shared plotting style for every figure in this notebook.
plt.style.use('seaborn-v0_8-whitegrid')

## 7.1 Load Data

In [None]:
# Read both preprocessed corpora and discard rows whose 'clean' text is
# missing, so every remaining document has text for the vectorizers used later.
corpus_paths = {
    'public': '../data/public_processed.csv',
    'media': '../data/media_processed.csv',
}
df_public = pd.read_csv(corpus_paths['public']).dropna(subset=['clean'])
df_media = pd.read_csv(corpus_paths['media']).dropna(subset=['clean'])

# Report how many usable documents each corpus contributes.
print(f"Public: {len(df_public)} | Media: {len(df_media)}")

## 7.2 LDA Topic Modeling

LDA assumes each document is a **mixture of topics** and each topic is a **distribution over words**.
Unlike K-Means (hard assignment), LDA provides soft, probabilistic topic membership.

In [None]:
def fit_lda(texts, n_topics=5, max_features=3000, n_top_words=10,
            min_df=5, max_df=0.9, max_iter=20, random_state=42):
    """Fit an LDA topic model on a collection of texts and print its topics.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorize and model.
    n_topics : int, default 5
        Number of LDA components (topics).
    max_features : int, default 3000
        Vocabulary size cap passed to ``CountVectorizer``.
    n_top_words : int, default 10
        How many highest-weighted words to report for each topic.
    min_df, max_df : int or float, defaults 5 and 0.9
        Document-frequency bounds passed to ``CountVectorizer``.
    max_iter : int, default 20
        Maximum number of LDA iterations.
    random_state : int, default 42
        Seed passed to ``LatentDirichletAllocation`` for reproducibility.

    Returns
    -------
    lda : LatentDirichletAllocation
        The fitted model.
    cv : CountVectorizer
        The fitted vectorizer (holds the vocabulary).
    dtm : sparse matrix
        Document-term count matrix the model was fitted on.
    topics : list of list of str
        For each topic, its ``n_top_words`` highest-weighted words,
        strongest first.

    Side effects
    ------------
    Prints one line per topic listing its top words.
    """
    cv = CountVectorizer(max_features=max_features, min_df=min_df, max_df=max_df)
    dtm = cv.fit_transform(texts)

    lda = LatentDirichletAllocation(
        n_components=n_topics,
        random_state=random_state,
        max_iter=max_iter,
        learning_method='online'
    )
    lda.fit(dtm)

    feature_names = cv.get_feature_names_out()
    topics = []
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending, so reverse it and keep the heaviest words.
        strongest_first = topic.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in strongest_first]
        topics.append(top_words)
        print(f"  Topic {topic_idx + 1}: {', '.join(top_words)}")

    return lda, cv, dtm, topics

In [None]:
# Public corpus: fit a 5-topic LDA; fit_lda prints each topic's top words
# underneath this header.
print("PUBLIC CORPUS - LDA Topics", "-" * 50, sep="\n")
lda_pub, cv_pub, dtm_pub, topics_pub = fit_lda(texts=df_public['clean'], n_topics=5)

In [None]:
# Media corpus: fit a 5-topic LDA; fit_lda prints each topic's top words
# underneath this header.
print("MEDIA CORPUS - LDA Topics", "-" * 50, sep="\n")
lda_med, cv_med, dtm_med, topics_med = fit_lda(texts=df_media['clean'], n_topics=5)

## 7.3 Choosing Number of Topics

We evaluate different values of k on the public corpus using **perplexity** (lower is better) and the approximate **log-likelihood** returned by scikit-learn's `score` method (higher is better).

In [None]:
# Candidate numbers of topics to compare.
k_range = range(2, 12)

# Document-term matrix for the public corpus.
cv_all = CountVectorizer(max_features=3000, min_df=5, max_df=0.9)
dtm_all_pub = cv_all.fit_transform(df_public['clean'])

# Hold out 20% of the documents for scoring. Perplexity / log-likelihood
# measured on the very documents the model was fitted on is an in-sample
# estimate and is not a sound basis for choosing k, so each candidate model
# is fitted on the remaining 80% and scored on the unseen 20%.
split_rng = np.random.RandomState(42)
shuffled_rows = split_rng.permutation(dtm_all_pub.shape[0])
n_holdout = max(1, int(0.2 * len(shuffled_rows)))
dtm_holdout = dtm_all_pub[shuffled_rows[:n_holdout]]
dtm_fit = dtm_all_pub[shuffled_rows[n_holdout:]]

perplexities = []
log_likelihoods = []

for k in k_range:
    lda_k = LatentDirichletAllocation(
        n_components=k, random_state=42, max_iter=20, learning_method='online'
    )
    lda_k.fit(dtm_fit)
    perplexities.append(lda_k.perplexity(dtm_holdout))
    log_likelihoods.append(lda_k.score(dtm_holdout))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: perplexity (lower is better).
axes[0].plot(k_range, perplexities, 'o-', color='#E94560', linewidth=2)
axes[0].set_xlabel('Number of Topics (k)')
axes[0].set_ylabel('Held-out Perplexity')
axes[0].set_title('LDA Perplexity vs Number of Topics', fontweight='bold')

# Right panel: approximate log-likelihood (higher is better).
axes[1].plot(k_range, log_likelihoods, 's-', color='#2196F3', linewidth=2)
axes[1].set_xlabel('Number of Topics (k)')
axes[1].set_ylabel('Held-out Log-Likelihood')
axes[1].set_title('LDA Log-Likelihood vs Number of Topics', fontweight='bold')

plt.tight_layout()
plt.savefig('../figures/lda_topic_selection.png', dpi=150, bbox_inches='tight')
plt.show()

## 7.4 Topic Distribution per Document

In [None]:
# Topic mixture of every document under each corpus's own LDA model.
doc_topics_pub = lda_pub.transform(dtm_pub)
doc_topics_med = lda_med.transform(dtm_med)

# Mean weight of each topic across all documents of a corpus.
avg_pub = doc_topics_pub.mean(axis=0)
avg_med = doc_topics_med.mean(axis=0)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One bar chart per corpus; both panels share the same layout.
panels = [
    (axes[0], avg_pub, '#2196F3', 'Public: Average Topic Distribution'),
    (axes[1], avg_med, '#FF9800', 'Media: Average Topic Distribution'),
]
for ax, avg_share, bar_color, panel_title in panels:
    topic_numbers = range(1, len(avg_share) + 1)  # 1-based topic labels
    ax.bar(topic_numbers, avg_share, color=bar_color, alpha=0.8)
    ax.set_xlabel('Topic')
    ax.set_ylabel('Average Proportion')
    ax.set_title(panel_title, fontweight='bold')
    ax.set_xticks(topic_numbers)

plt.tight_layout()
plt.savefig('../figures/topic_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

## 7.5 K-Means Text Clustering

K-Means provides **hard assignment** of documents to clusters. We cluster the combined corpus
and see if clusters naturally separate public from media documents.

In [None]:
# Stack the two corpora (public first, then media) into a single text series,
# and build a parallel list of source labels in the same order.
n_public, n_media = len(df_public), len(df_media)
all_clean = pd.concat(
    [df_public['clean'], df_media['clean']],
    ignore_index=True,
)
all_labels = ['Public'] * n_public + ['Media'] * n_media

# TF-IDF features for the combined corpus; these feed the K-Means cells.
tfidf = TfidfVectorizer(max_features=3000, min_df=5, max_df=0.9)
X_tfidf = tfidf.fit_transform(all_clean)

In [None]:
# Sweep the number of clusters, recording inertia (for the elbow plot) and a
# silhouette score computed on a fixed-seed sample of 500 documents.
k_range = range(2, 11)
inertias = []
silhouettes = []

for n_clusters in k_range:
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10).fit(X_tfidf)
    inertias.append(model.inertia_)
    sampled_silhouette = silhouette_score(
        X_tfidf, model.labels_, sample_size=500, random_state=42
    )
    silhouettes.append(sampled_silhouette)

fig, (ax_elbow, ax_sil) = plt.subplots(1, 2, figsize=(14, 5))

# Left: within-cluster sum of squares against k.
ax_elbow.plot(k_range, inertias, 'o-', color='#E94560', linewidth=2)
ax_elbow.set_xlabel('Number of Clusters (k)')
ax_elbow.set_ylabel('Inertia (WCSS)')
ax_elbow.set_title('Elbow Method', fontweight='bold')

# Right: silhouette score against k.
ax_sil.plot(k_range, silhouettes, 's-', color='#2196F3', linewidth=2)
ax_sil.set_xlabel('Number of Clusters (k)')
ax_sil.set_ylabel('Silhouette Score')
ax_sil.set_title('Silhouette Score vs k', fontweight='bold')

plt.tight_layout()
plt.savefig('../figures/kmeans_selection.png', dpi=150, bbox_inches='tight')
plt.show()

# The k whose silhouette score is highest.
best_position = np.argmax(silhouettes)
best_k = k_range[best_position]
print(f"Best k by silhouette: {best_k} (score: {max(silhouettes):.4f})")

In [None]:
# Fit K-Means with k=2 (natural media vs public split)
km2 = KMeans(n_clusters=2, random_state=42, n_init=10)
km2.fit(X_tfidf)

# Cross-tabulation: do clusters align with media vs public?
# Rows are the true source label, columns are the K-Means cluster id.
ct = pd.crosstab(
    pd.Series(all_labels, name='True Label'),
    pd.Series(km2.labels_, name='Cluster')
)
print("K-Means (k=2) vs True Labels:")
print(ct)

# Purity: credit each CLUSTER with its majority label, then divide by the
# number of documents. Clusters are the columns of `ct`, so the maximum is
# taken down each column (axis=0). Taking it across each row (axis=1) would
# instead reward putting every document into one cluster with a score of 1.0.
purity = ct.max(axis=0).sum() / ct.values.sum()
print(f"\nCluster purity: {purity:.4f}")

In [None]:
# Top terms per cluster: rank the vocabulary by centroid weight, highest first.
feature_names = tfidf.get_feature_names_out()
order_centroids = km2.cluster_centers_.argsort()[:, ::-1]

# Number of terms to list for each cluster.
n_top_terms = 15

# Iterate over the fitted centroids themselves rather than a hard-coded
# cluster count, so this cell keeps working if km2 is refitted with another k.
for cluster_id, ranked_term_ids in enumerate(order_centroids):
    top_terms = [feature_names[ind] for ind in ranked_term_ids[:n_top_terms]]
    print(f"Cluster {cluster_id}: {', '.join(top_terms)}")

## 7.6 Summary

In [None]:
# Recap of the notebook's results. Topic counts are read from the fitted
# results (one top-word list per topic) instead of being typed in, so the
# summary cannot drift from what was actually run.
print("TOPIC MODELING & CLUSTERING SUMMARY")
print("-" * 50)
print("\nLDA Topics discovered:")
print(f"  Public corpus: {len(topics_pub)} topics")
print(f"  Media corpus:  {len(topics_med)} topics")
print("\nK-Means Clustering:")
print(f"  Best k by silhouette: {best_k}")
print(f"  k=2 cluster purity: {purity:.4f}")

# Headline interpretation, keyed on the k=2 purity exceeding 0.7.
if purity > 0.7:
    key_insight = 'Clusters naturally separate media from public text'
else:
    key_insight = 'Some overlap between media and public language'
print(f"\nKey insight: {key_insight}")