In [None]:
# STEP 5 — BERTopic
# Input : merged_preprocessed_for_topics.csv
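# Outputs: topics_per_doc_mts<N>.csv, topic_summary_mts<N>.csv, bertopic_model_mts<N>.pkl,
#          topics_overview_mts<N>.html, topic_barchart_top20_mts<N>.html  (<N> = MIN_TOPIC_SIZE)
# Test runs: MTS (MIN_TOPIC_SIZE) = 10 / 30 / 50
# FINAL run: MTS = 30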

# %pip install -q bertopic sentence-transformers umap-learn hdbscan plotly

# Inspired by:
#   https://www.pinecone.io/learn/bertopic/
#   https://www.kaggle.com/code/samvelkoch/tutorial-bertopic-best-practices
#   https://maartengr.github.io/BERTopic/getting_started/quickstart/quickstart.html
#   https://github.com/MaartenGr/BERTopic

import time
from pathlib import Path
import numpy as np
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# --- Configuration ----------------------------------------------------------
# Input produced by the cleaning step, and the text column the model is fit on.
INPUT_CSV = "merged_preprocessed_for_topics.csv"
TEXT_COL = "clean_for_topics"

# Optional metadata: id columns are kept when present; of the timestamp
# candidates, one is kept if present.
ID_COLS_PREF = ["id"]
TIME_COL_PREFS = ["dt", "date", "created_utc"]

# Sampling: cap the corpus for quick tests; None means use ALL rows.
MAX_DOCS = 50_000
RANDOM_STATE = 42

# BERTopic hyperparameters.
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
MIN_TOPIC_SIZE = 50  # try 50, 30, 10 in separate runs
N_GRAM_RANGE = (1, 2)
CALC_PROBS = True

# Output filenames. Every artefact carries the min-topic-size tag so that runs
# with different settings do not overwrite each other.
_RUN_TAG = f"mts{MIN_TOPIC_SIZE}"
TOPICS_PER_DOC_CSV = "topics_per_doc_" + _RUN_TAG + ".csv"
TOPIC_SUMMARY_CSV = "topic_summary_" + _RUN_TAG + ".csv"
MODEL_PKL = "bertopic_model_" + _RUN_TAG + ".pkl"
HTML_TOPICS_OVERV = "topics_overview_" + _RUN_TAG + ".html"
HTML_BARCHART_TOPN = "topic_barchart_top20_" + _RUN_TAG + ".html"

# Load & prepare data
# Reads the cleaned corpus, validates the text column, removes rows without
# usable text, optionally down-samples, and builds the `docs` list for BERTopic.
df = pd.read_csv(INPUT_CSV, low_memory=False)
if TEXT_COL not in df.columns:
    raise ValueError(f"Expected '{TEXT_COL}' in {INPUT_CSV}.")

# Keep optional metadata: every preferred id column that exists, and the first
# timestamp candidate that exists (None when there is none).
id_cols = [c for c in ID_COLS_PREF if c in df.columns]
time_col = next((c for c in TIME_COL_PREFS if c in df.columns), None)

# Make sure dt is proper datetime if present (unparseable values are coerced).
if "dt" in df.columns:
    df["dt"] = pd.to_datetime(df["dt"], errors="coerce", utc=True)

# Drop rows without usable text. Missing values are excluded explicitly:
# astype(str) alone turns NaN into the literal string "nan", which is non-empty
# and would otherwise be modelled as a real document.
text_series = df[TEXT_COL]
has_text = text_series.notna() & text_series.astype(str).str.strip().ne("")
df = df[has_text].copy()
if df.empty:
    # Fail here with a clear message instead of deep inside the model fit.
    raise ValueError(f"No non-empty '{TEXT_COL}' rows found in {INPUT_CSV}.")

# Optional down-sampling; seeded so the same subset is drawn on every run.
if (MAX_DOCS is not None) and (len(df) > MAX_DOCS):
    df = df.sample(MAX_DOCS, random_state=RANDOM_STATE).copy()

docs = df[TEXT_COL].astype(str).tolist()
print(f"[info] Documents to model: {len(df):,}")

# Fit BERTopic
# The sentence-embedding model is loaded once and handed to BERTopic together
# with the hyperparameters from the config section.
# NOTE(review): no random seed is passed to the model here, so topic
# assignments presumably vary between runs — confirm against the BERTopic docs.
embedder = SentenceTransformer(EMBEDDING_MODEL)
bertopic_kwargs = dict(
    embedding_model=embedder,
    language="english",
    min_topic_size=MIN_TOPIC_SIZE,
    n_gram_range=N_GRAM_RANGE,
    calculate_probabilities=CALC_PROBS,
    verbose=True,
)
topic_model = BERTopic(**bertopic_kwargs)

# Time the full fit so the cost of each configuration shows up in the log.
fit_started = time.time()
topics, probs = topic_model.fit_transform(docs)
elapsed_minutes = (time.time() - fit_started) / 60.0
print(f"[done] Fit completed in {elapsed_minutes:.1f} minutes.")

# Save per-document topics
# Builds one row per modelled document (topic id, a probability score, optional
# metadata and the text columns) and writes it to TOPICS_PER_DOC_CSV.
out = pd.DataFrame({"topic_id": topics})

# topic_prob defaults to None and is filled only when probabilities exist.
# NOTE(review): presumably `probs` is an (n_docs, n_topics) matrix when
# CALC_PROBS is True; the row maximum is then the best score over all topics,
# which need not belong to topic_id (e.g. outliers, topic -1) — confirm.
out["topic_prob"] = None
if CALC_PROBS and probs is not None:
    prob_arr = np.asarray(probs)
    if prob_arr.ndim == 2 and prob_arr.shape[1] > 0:
        out["topic_prob"] = prob_arr.max(axis=1)
    elif prob_arr.ndim == 1 and len(prob_arr) == len(out):
        # Already one value per document; max(axis=1) would raise here.
        out["topic_prob"] = prob_arr

# Bring back metadata (if available). `.values` copies positionally, so the
# non-contiguous index left by filtering/sampling `df` does not matter.
meta_cols = list(id_cols)
if time_col:
    meta_cols.append(time_col)
meta_cols += ["subreddit", "kind"]
for c in meta_cols:
    if c in df.columns:
        out[c] = df[c].values

# Keep raw & clean text for later reference
keep_text_cols = []
if "text" in df.columns and "text" != TEXT_COL:
    keep_text_cols.append("text")
keep_text_cols.append(TEXT_COL)
for c in keep_text_cols:
    if c in df.columns:
        out[c] = df[c].values

# Nice ordering: ids, subreddit/kind, timestamp, text, then the topic columns.
# "subreddit" and "kind" are checked independently so that "kind" keeps its
# slot even when "subreddit" is absent.
preferred = (
    id_cols
    + [c for c in ("subreddit", "kind") if c in out.columns]
    + ([time_col] if time_col and time_col in out.columns else [])
    + keep_text_cols
    + ["topic_id", "topic_prob"]
)
ordered_cols = [c for c in preferred if c in out.columns] + \
               [c for c in out.columns if c not in preferred]
out = out[ordered_cols]

out.to_csv(TOPICS_PER_DOC_CSV, index=False)
print(f"[ok] per-document topics → {TOPICS_PER_DOC_CSV}  (rows: {len(out):,})")

# Save topic summary
# One row per entry returned by get_topic_info(); the reported count is the
# number of rows in that table.
topic_info = topic_model.get_topic_info()
topic_info.to_csv(TOPIC_SUMMARY_CSV, index=False)
n_topic_rows = len(topic_info)
print(f"[ok] topic summary → {TOPIC_SUMMARY_CSV}  (topics: {n_topic_rows:,})")
summary_preview = topic_info.head(10)
display(summary_preview)


# Save model as a single file
# NOTE(review): the file is written with the library's default serialization
# (pickle, per the inline comment); pickle files should only be loaded from
# trusted sources.
topic_model.save(MODEL_PKL)  # default serialization="pickle"
print(f"[ok] saved model → {MODEL_PKL}")


# HTML visuals
# Both figures are best-effort: a failure is reported as a warning and never
# aborts the run. Each figure has its own try block so that a failure in one
# does not prevent the other from being written.

# Topics overview
try:
    fig_over = topic_model.visualize_topics()
    fig_over.write_html(HTML_TOPICS_OVERV)
    print(f"[ok] topics overview HTML → {HTML_TOPICS_OVERV}")
except Exception as e:
    print(f"[warn] visualization skipped: {e}")

# Bar chart of the top 20 topics by size (the -1 topic row is excluded)
try:
    top_topic_ids = topic_info[topic_info.Topic != -1].sort_values("Count", ascending=False)["Topic"].head(20).tolist()
    fig_bar = topic_model.visualize_barchart(topics=top_topic_ids)
    fig_bar.write_html(HTML_BARCHART_TOPN)
    print(f"[ok] topic barchart HTML → {HTML_BARCHART_TOPN}")
except Exception as e:
    print(f"[warn] visualization skipped: {e}")

print("\nNext: map topic IDs to human-readable labels and assign stance (pro-Israel / pro-Palestine / neutral).")


[info] Documents to model: 50,000


2025-10-16 11:10:34,111 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 1563/1563 [15:30<00:00,  1.68it/s]
2025-10-16 11:26:06,799 - BERTopic - Embedding - Completed ✓
2025-10-16 11:26:06,799 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-16 11:26:44,269 - BERTopic - Dimensionality - Completed ✓
2025-10-16 11:26:44,282 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-16 11:27:30,659 - BERTopic - Cluster - Completed ✓
2025-10-16 11:27:30,688 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-16 11:27:41,106 - BERTopic - Representation - Completed ✓


[done] Fit completed in 17.2 minutes.
[ok] per-document topics → topics_per_doc_mts50.csv  (rows: 50,000)
[ok] topic summary → topic_summary_mts50.csv  (topics: 103)


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,28287,-1_the_and_to_of,"[the, and, to, of, that, is, in, you, it, they]",[the problem with blatant intellectual dishone...
1,0,1518,0_land_the_jews_of,"[land, the, jews, of, and, to, was, the land, ...",[i m genuinely trying to understand what you m...
2,1,1018,1_hamas_hamas is_is_they,"[hamas, hamas is, is, they, to, the, and, isra...",[as far as the reason this conflict can t end ...
3,2,864,2_idf_the idf_the_idf is,"[idf, the idf, the, idf is, they, is, and, to,...","[is that what the idf is doing in gaza, the id..."
4,3,819,3_land_you_the_to,"[land, you, the, to, it, of, that, and, is, pe...",[that means nothing all of those settlements a...
5,4,788,4_israel_us_the us_israel is,"[israel, us, the us, israel is, the, is, to, i...",[those are all great questions reality is it w...
6,5,691,5_genocide_is_the_of,"[genocide, is, the, of, it, that, to, israel, ...",[you begin with a serious sounding disclaimer ...
7,6,662,6_islam_muslims_muslim_religion,"[islam, muslims, muslim, religion, the, and, i...",[wtf is this i wanna see those statistics prov...
8,7,622,7_genocide_group_of_it,"[genocide, group, of, it, the, of genocide, yo...",[genocide is the deliberate and systematic ext...
9,8,553,8_state_solution_state solution_palestinians,"[state, solution, state solution, palestinians...",[because at the time when we are trying to ach...




[ok] saved model → bertopic_model_mts50.pkl
[ok] topics overview HTML → topics_overview_mts50.html
[ok] topic barchart HTML → topic_barchart_top20_mts50.html

Next: map topic IDs to human-readable labels and assign stance (pro-Israel / pro-Palestine / neutral).
