In [None]:
# STEP 7 — BERTopic for all data
# Input : merged_preprocessed_for_topics.csv
# Outputs: topics_per_doc_<RUN_TAG>.csv, topic_summary_<RUN_TAG>.csv
# FINAL run (MTS=30)

# Config & imports
import os, math
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import hdbscan
from sentence_transformers import SentenceTransformer

# Files
# Input CSV produced by the preprocessing step, plus a run tag that is embedded
# in both output file names so different runs do not overwrite each other.
INPUT = "merged_preprocessed_for_topics.csv"   # has clean_for_topics, dt, subreddit, id, kind, etc.
RUN_TAG = "mts30_fast"                         

TOPICS_PER_DOC_CSV = f"topics_per_doc_{RUN_TAG}.csv"
TOPIC_SUMMARY_CSV  = f"topic_summary_{RUN_TAG}.csv"

# Hyperparams
MIN_TOPIC_SIZE = 30           # MTS; used as HDBSCAN min_cluster_size below
TRAIN_MAX_DOCS = 100_000      # subset size for model fit
CAP_PER_WEEK   = 500          # stratify: max docs per week per subreddit/kind
BATCH_SIZE     = 2048         # number of documents passed to each transform() call
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" 

# Speed knobs for BERTopic
# 1) Light embedding model
embedder = SentenceTransformer(EMB_MODEL_NAME)

# 2) Fast UMAP (lower n_components, mild neighbors)
# random_state is fixed so repeated runs reduce the embeddings the same way.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.1,
    metric="cosine",
    random_state=42,
    low_memory=True,
)

# 3) HDBSCAN tuned for speed/stability
cluster_model = hdbscan.HDBSCAN(
    min_cluster_size=MIN_TOPIC_SIZE,
    min_samples=max(5, MIN_TOPIC_SIZE // 2),
    metric="euclidean",
    cluster_selection_method="eom",
    # Not a speed knob: this caches the extra data HDBSCAN needs to assign
    # previously unseen points, which the later transform() calls rely on.
    prediction_data=True,
)

# 4) Vectorizer: cap vocab (min_df filters rares, max_features caps size)
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=10,             # drop very rare terms
    max_df=0.5,            # drop extremely frequent terms
    max_features=60_000,   # cap vocab
    stop_words="english",
)

# Assemble the topic model from the components configured above.
topic_model = BERTopic(
    embedding_model=embedder,
    umap_model=umap_model,
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer_model,
    # NOTE(review): the BERTopic docs state min_topic_size is not used when a
    # custom hdbscan_model is passed; the effective setting is
    # min_cluster_size on cluster_model above. Confirm before tuning this.
    min_topic_size=MIN_TOPIC_SIZE,
    nr_topics="auto",                 # let BERTopic merge similar topics automatically after clustering
    calculate_probabilities=False,    # speeds up
    # NOTE(review): documented as a UMAP-only switch; with a custom umap_model
    # the low_memory=True set on umap_model is presumably what matters - confirm.
    low_memory=True,                  # reduces RAM
    verbose=True,
)

# Load data
df_full = pd.read_csv(INPUT, low_memory=False)

# Validate the schema with explicit raises rather than `assert`: assertions are
# stripped when Python runs with -O, which would let a malformed input reach
# the (very long) model fit before failing.
for required_col, problem in (
    ("clean_for_topics", "Missing 'clean_for_topics' column from preprocessing step."),
    ("dt", "Missing 'dt' column."),
):
    if required_col not in df_full.columns:
        raise ValueError(problem)

# Keep only rows with usable text (at least 3 whitespace-separated tokens)
df_full["clean_for_topics"] = df_full["clean_for_topics"].astype(str).str.strip()
df_full = df_full[df_full["clean_for_topics"].str.split().str.len() >= 3].copy()
# Unparseable timestamps are coerced to NaT instead of raising; everything is normalised to UTC.
df_full["dt"] = pd.to_datetime(df_full["dt"], errors="coerce", utc=True)

# Build stratified training subset (balanced in time & subreddit/kind)
# Periods cannot carry a timezone, so drop it explicitly (values are already
# UTC) instead of letting to_period() warn about discarding it.
df_full["week"] = df_full["dt"].dt.tz_localize(None).dt.to_period("W-MON").astype(str)

# "week" was just created, so it is always present; subreddit/kind are optional
# and only used for stratification when the input provides them.
group_cols = [c for c in ["subreddit","kind","week"] if c in df_full.columns]

def cap_group(g, cap=CAP_PER_WEEK):
    """Limit one group to at most ``cap`` rows.

    Groups that already fit are returned untouched; larger groups are
    down-sampled to exactly ``cap`` rows with a fixed seed (42) so the
    selection is reproducible.
    """
    exceeds_cap = len(g) > cap
    return g.sample(cap, random_state=42) if exceeds_cap else g

# Cap every stratum first, then draw the training subset from the capped frame.
df_capped = (df_full
             .groupby(group_cols, group_keys=False)
             .apply(cap_group, cap=CAP_PER_WEEK))

# Bound the sample size by the *capped* frame, not by df_full: capping can
# leave fewer than TRAIN_MAX_DOCS rows, and sampling more rows than exist
# (without replacement) raises ValueError.
n_train = min(TRAIN_MAX_DOCS, len(df_capped))
df_train = df_capped.sample(n_train, random_state=42).reset_index(drop=True)

print(f"[info] Training subset: {len(df_train):,} docs (of {len(df_full):,})")

# Fit on subset
train_texts = df_train["clean_for_topics"].tolist()
# The second return value is discarded; the model is configured with
# calculate_probabilities=False.
topics_train, _ = topic_model.fit_transform(train_texts)
print("[ok] Fit complete.")

# Build & save topic summary from the trained model
# Columns are lower-cased so later code can index by "topic"/"name"/"count".
# NOTE(review): the counts presumably describe the training subset only, not
# the full dataset - confirm before reporting them.
topic_info = topic_model.get_topic_info().rename(columns={"Count":"count","Name":"name","Topic":"topic"})
topic_info.to_csv(TOPIC_SUMMARY_CSV, index=False)
print(f"[ok] Wrote topic summary → {TOPIC_SUMMARY_CSV}")

# Transform the full dataset in batches (assign topics)
def batched(iterable, n):
    """Yield ``(offset, chunk)`` pairs covering ``iterable`` in order.

    ``iterable`` must support ``len()`` and slicing. Each chunk holds up to
    ``n`` consecutive items (the last one may be shorter) and ``offset`` is
    the index of the chunk's first item.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = start + n
        yield start, iterable[start:stop]

all_topics = []
all_probs  = []  # not used (probabilities disabled)
# Materialise the text list once and reuse it for both batching and the
# progress-bar total.
texts_full = df_full["clean_for_topics"].tolist()
for _start, batch in tqdm(batched(texts_full, BATCH_SIZE),
                          total=math.ceil(len(texts_full)/BATCH_SIZE), desc="Transform full"):
    # Only the topic assignments are kept; the second return value is ignored.
    bt, _ = topic_model.transform(batch)
    all_topics.extend(bt)

# Collect per-doc outputs
# Only "clean_for_topics" and "dt" are guaranteed by the loading step; the
# other metadata columns are kept when present instead of raising KeyError,
# matching how the stratification treats subreddit/kind as optional.
wanted_cols = ["id","subreddit","kind","dt","clean_for_topics"]
out = df_full[[c for c in wanted_cols if c in df_full.columns]].copy()
# all_topics was built in df_full row order, so positional assignment lines up.
out["topic"] = all_topics
# map to human-readable names now (can also do later)
name_map = topic_info.set_index("topic")["name"].to_dict()
out["topic_name"] = out["topic"].map(name_map)

out.to_csv(TOPICS_PER_DOC_CSV, index=False)
print(f"[ok] Wrote per-doc topics → {TOPICS_PER_DOC_CSV}")

# Small quality check: the 10 most frequent topics, listed in topic-id order
print(out["topic"].value_counts().head(10).sort_index())
print("\nPreview:")
display(out.head(5))


2025-10-19 19:10:08,199 - BERTopic - Embedding - Transforming documents to embeddings.


[info] Training subset: 100,000 docs (of 447,012)


Batches: 100%|██████████| 3125/3125 [28:03<00:00,  1.86it/s] 
2025-10-19 19:38:17,096 - BERTopic - Embedding - Completed ✓
2025-10-19 19:38:17,097 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-19 23:41:36,473 - BERTopic - Dimensionality - Completed ✓
2025-10-19 23:41:36,481 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-20 07:39:42,401 - BERTopic - Cluster - Completed ✓
2025-10-20 07:39:42,403 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-10-20 07:39:55,081 - BERTopic - Representation - Completed ✓
2025-10-20 07:39:55,087 - BERTopic - Topic reduction - Reducing number of topics
2025-10-20 07:39:55,345 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-20 07:40:14,287 - BERTopic - Representation - Completed ✓
2025-10-20 07:40:14,305 - BERTopic - Topic reduction - Reduced number of topics from 303 to 116


[ok] Fit complete.
[ok] Wrote topic summary → topic_summary_mts30_fast.csv


Batches: 100%|██████████| 64/64 [2:00:15<00:00, 112.74s/it]
2025-10-20 09:40:31,640 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-10-20 09:40:48,399 - BERTopic - Dimensionality - Completed ✓
2025-10-20 09:40:48,399 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-10-20 09:40:49,020 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 64/64 [00:53<00:00,  1.19it/s]8, 7233.25s/it]
2025-10-20 09:41:42,795 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-10-20 09:41:45,257 - BERTopic - Dimensionality - Completed ✓
2025-10-20 09:41:45,257 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-10-20 09:41:45,873 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 64/64 [00:55<00:00,  1.15it/s]0, 3011.85s/it]
2025-10-20 09:42:41,465 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-10-20 09:42:44,330 - BERTopic - Dimensionality - Com

[ok] Wrote per-doc topics → topics_per_doc_mts30_fast.csv
topic
-1     290290
 0     114522
 1       3831
 2       5133
 3       4852
 4        927
 6       1380
 8       1003
 11      1353
 16       922
Name: count, dtype: int64

Preview:


Unnamed: 0,id,subreddit,kind,dt,clean_for_topics,topic,topic_name
0,gf34wp,IsraelPalestine,post,2020-05-07 09:08:08+00:00,israelis should be more critical of israel as ...,-1,-1_israel_jews_palestinians_hamas
1,gnle2w,IsraelPalestine,post,2020-05-20 23:18:00+00:00,can we stop blaming israel for kids hamas kill...,0,0_genocide_comments_israel_hamas
2,ho0fmu,IsraelPalestine,post,2020-07-09 09:58:12+00:00,conceding some points to pro palestinians sinc...,-1,-1_israel_jews_palestinians_hamas
3,hm3cjv,IsraelPalestine,post,2020-07-06 07:46:19+00:00,you don t need to pick sides the entire rhetor...,-1,-1_israel_jews_palestinians_hamas
4,i3c9om,IsraelPalestine,post,2020-08-04 03:54:01+00:00,as a left wing european i can t support palest...,-1,-1_israel_jews_palestinians_hamas
