In [None]:
# STEP 9 — Zoom in on Topic 0 (split into subtopics)

# Extract all documents assigned to topic 0 from topics_per_doc_mts30_fast.csv and save them to a separate CSV file.

import pandas as pd
from pathlib import Path

# Input / Output paths
# Both paths are relative, so they resolve against the kernel's current working directory.
INPUT_FILE = Path("topics_per_doc_mts30_fast.csv")  # per-document topic assignments; must contain a "topic" column
OUTPUT_FILE = Path("topic_0.csv")  # receives only the rows whose topic == 0

def main():
    """Extract every document assigned to topic 0 into its own CSV file.

    Reads INPUT_FILE, keeps the rows whose ``topic`` column equals 0, writes
    them to OUTPUT_FILE (index omitted) and prints a five-row preview.
    When no row matches, a warning is printed and nothing is written.

    Raises:
        ValueError: if the input file has no ``topic`` column.
    """
    print(f"[info] Loading file: {INPUT_FILE}")
    all_docs = pd.read_csv(INPUT_FILE, low_memory=False)

    # Fail early with a clear message instead of a KeyError further down.
    if "topic" not in all_docs.columns:
        raise ValueError("Column 'topic' not found in input file!")

    # Copy the selection so it is an independent frame, not a view of all_docs.
    is_topic0 = all_docs["topic"] == 0
    topic0_docs = all_docs[is_topic0].copy()
    match_count = len(topic0_docs)
    print(f"[info] Extracted {match_count:,} rows with topic == 0")

    # Nothing matched: do not create an empty output file.
    if not match_count:
        print("[warn] No rows found for topic == 0. Check topic IDs in your data.")
        return

    topic0_docs.to_csv(OUTPUT_FILE, index=False)
    print(f"[ok] Saved Topic 0 data → {OUTPUT_FILE.resolve()}")

    # Plain-text preview of the first rows.
    print("\nPreview:")
    preview = topic0_docs.head(5)
    print(preview.to_string(index=False))

# Run the extraction when this cell/script is executed directly
# (the captured cell output shows main() running under this guard).
if __name__ == "__main__":
    main()




[info] Loading file: topics_per_doc_mts30_fast.csv
[info] Extracted 114,522 rows with topic == 0
[ok] Saved Topic 0 data → C:\Users\Matilde\Desktop\thesis\Thesis_project_2.0\topic_0.csv

Preview:
    id       subreddit kind                        dt                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               

In [None]:
# Re-run BERTopic on Topic 0 data (zoom-in)
# Input  : topic_0.csv  (subset of all docs originally assigned to topic 0)
# Outputs: topic0_per_doc_mts30_topic0.csv, topic0_summary_mts30_topic0.csv

# Config & imports
import os, math
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path

from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
import hdbscan
from sentence_transformers import SentenceTransformer

# Files
INPUT = "topic_0.csv"                 # extracted Topic 0 file
RUN_TAG = "mts30_topic0"              # unique tag so outputs don’t collide with the global run
TOPICS_PER_DOC_CSV = f"topic0_per_doc_{RUN_TAG}.csv"   # one row per document with its sub-topic
TOPIC_SUMMARY_CSV  = f"topic0_summary_{RUN_TAG}.csv"   # one row per sub-topic (topic info table)

# Hyperparams
MIN_TOPIC_SIZE = 30        # passed to HDBSCAN as min_cluster_size and to BERTopic as min_topic_size
TRAIN_MAX_DOCS = 20_000    # upper bound on the number of documents the model is fitted on
CAP_PER_WEEK   = 500       # max rows kept per group; groups are (subreddit, kind, week) where available
BATCH_SIZE     = 2048      # number of documents per topic_model.transform() call on the full set
EMB_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2" 

# Speed knobs for BERTopic
# Light embedding model
embedder = SentenceTransformer(EMB_MODEL_NAME)

# Fast UMAP (lower n_components, mild neighbors)
umap_model = UMAP(
    n_neighbors=10,
    n_components=3,
    min_dist=0.2,
    metric="cosine",
    random_state=42,  # fixed seed so the reduced space is reproducible across runs
    low_memory=True,
)

# HDBSCAN tuned for speed/stability
cluster_model = hdbscan.HDBSCAN(
    min_cluster_size=MIN_TOPIC_SIZE,
    min_samples=max(5, MIN_TOPIC_SIZE // 2),  # 15 with MIN_TOPIC_SIZE = 30
    metric="euclidean",
    cluster_selection_method="eom",
    # Not a speed option: this caches the data HDBSCAN needs to label unseen
    # points later. The model is fitted on a subset and then assigns topics to
    # the full set via transform(), which approximates new points with this
    # clusterer, so it must stay enabled.
    prediction_data= True,
)

# Vectorizer: cap vocab (min_df filters rares, max_features caps size)
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=10,
    max_df=0.5,
    max_features=60_000,
    stop_words="english",
)

# Create the BERTopic model
topic_model = BERTopic(
    embedding_model=embedder,
    umap_model=umap_model,
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer_model,
    # NOTE(review): a custom hdbscan_model is supplied above, so min_topic_size
    # here is presumably ignored in favour of min_cluster_size — confirm
    # against the BERTopic documentation.
    min_topic_size=MIN_TOPIC_SIZE,
    nr_topics="auto",  # automatic topic reduction after clustering
    calculate_probabilities=False,
    low_memory=True,
    verbose=True,
)

# Load and prepare data
df_full = pd.read_csv(INPUT, low_memory=False)

# Columns the rest of the cell cannot work without, with their failure messages.
required_columns = {
    "clean_for_topics": "Missing 'clean_for_topics' column from preprocessing step.",
    "dt": "Missing 'dt' column.",
}
for column, complaint in required_columns.items():
    assert column in df_full.columns, complaint

# Preserve parent topic information before overwriting it later
if "topic" in df_full.columns:
    parent_names = {"topic": "parent_topic", "topic_name": "parent_topic_name"}
    df_full = df_full.rename(columns=parent_names)

# Clean and filter text: normalise to stripped strings, then keep only
# documents with at least three whitespace-separated tokens.
stripped_text = df_full["clean_for_topics"].astype(str).str.strip()
df_full["clean_for_topics"] = stripped_text
token_counts = stripped_text.str.split().str.len()
df_full = df_full[token_counts >= 3].copy()

# Unparseable timestamps become NaT instead of raising; everything is UTC.
df_full["dt"] = pd.to_datetime(df_full["dt"], errors="coerce", utc=True)

print(f"[info] Loaded {len(df_full):,} documents for Topic 0 re-clustering")

# Build stratified training subset (balanced in time & subreddit/kind)
# Weekly bucket label per document ("W-MON" = weekly periods anchored on Monday).
# "dt" is tz-aware (UTC); converting tz-aware timestamps straight to periods
# drops the timezone anyway but emits a UserWarning on every run, so the
# timezone is removed explicitly first. The resulting labels are unchanged.
df_full["week"] = (
    df_full["dt"].dt.tz_localize(None).dt.to_period("W-MON").astype(str)
)

# "subreddit" and "kind" are optional; "week" was created just above and is
# therefore always present, so group_cols can never be empty and no fallback
# is needed.
group_cols = [c for c in ["subreddit", "kind", "week"] if c in df_full.columns]

def cap_group(g, cap=CAP_PER_WEEK):
    """Limit one group to at most ``cap`` rows.

    Args:
        g: the rows of a single group (passed in by ``groupby(...).apply``).
        cap: maximum number of rows to keep; defaults to CAP_PER_WEEK.

    Returns:
        ``g`` itself when it has ``cap`` rows or fewer, otherwise a random
        sample of exactly ``cap`` rows drawn with a fixed seed (42).
    """
    needs_downsampling = len(g) > cap
    return g.sample(cap, random_state=42) if needs_downsampling else g

# Step 1: cap every group so no single subreddit/kind/week dominates training.
df_capped = (
    df_full.groupby(group_cols, group_keys=False)
           .apply(cap_group, cap=CAP_PER_WEEK)
)

# Step 2: draw the training sample from the capped frame.
# The sample size must be bounded by the size of the *capped* frame, not by
# len(df_full): capping can leave fewer rows than TRAIN_MAX_DOCS, and asking
# .sample() for more rows than exist (without replacement) raises ValueError.
# Whenever the capped frame has at least TRAIN_MAX_DOCS rows, the size and the
# seed are the same as before, so the selected rows are unchanged.
n_train = min(TRAIN_MAX_DOCS, len(df_capped))
df_train = df_capped.sample(n_train, random_state=42).reset_index(drop=True)
print(f"[info] Training subset: {len(df_train):,} docs (of {len(df_full):,})")

# Fit on subset
# Only the capped/sampled training texts are used to fit the model; the full
# data set is assigned to the fitted topics afterwards via transform().
train_texts = df_train["clean_for_topics"].tolist()
# fit_transform returns a pair; only the per-document topic ids are kept here,
# the second element is discarded.
topics_train, _ = topic_model.fit_transform(train_texts)
print("[ok] Fit complete on training subset.")

# Build & save topic summary
# Lower-case the column names so they line up with the per-document output
# ("topic" / "name" are used later to map topic ids to topic names).
summary_column_names = {"Topic": "topic", "Count": "count", "Name": "name"}
raw_topic_info = topic_model.get_topic_info()
topic_info = raw_topic_info.rename(columns=summary_column_names)

topic_info.to_csv(TOPIC_SUMMARY_CSV, index=False)
print(f"[ok] Wrote topic summary → {TOPIC_SUMMARY_CSV}")


# Transform the full dataset in batches (assign topics)

def batched(iterable, n):
    """Yield ``(start, chunk)`` pairs that cover ``iterable`` in slices of ``n``.

    ``iterable`` must support ``len()`` and slicing (e.g. a list or a string).
    ``start`` is the offset of the chunk's first element; the final chunk may
    hold fewer than ``n`` items.
    """
    offsets = range(0, len(iterable), n)
    yield from ((offset, iterable[offset:offset + n]) for offset in offsets)

# Assign a topic to every document of the full set, one batch at a time, so
# each transform() call only has to handle BATCH_SIZE texts.
all_topics = []
full_texts = df_full["clean_for_topics"].tolist()
n_batches = math.ceil(len(df_full) / BATCH_SIZE)

for _start, chunk in tqdm(batched(full_texts, BATCH_SIZE), total=n_batches, desc="Transform full"):
    # transform returns a pair; only the topic ids are collected.
    chunk_topics, _ = topic_model.transform(chunk)
    all_topics.extend(chunk_topics)

# Save per-doc results
# Output columns in their final order. "id", "dt" and "clean_for_topics" are
# required (a missing one still raises KeyError). The others are optional and
# are kept only when present:
#   - "subreddit" / "kind" are already treated as optional when the grouping
#     columns are built;
#   - "parent_topic_name" exists only if the input had a "topic_name" column
#     (the earlier rename silently skips missing columns), so it must not be
#     assumed just because "parent_topic" exists.
ordered_cols = [
    "id", "subreddit", "kind", "dt", "clean_for_topics",
    "parent_topic", "parent_topic_name",
]
optional_cols = {"subreddit", "kind", "parent_topic", "parent_topic_name"}
base_cols = [
    c for c in ordered_cols
    if c not in optional_cols or c in df_full.columns
]

out = df_full[base_cols].copy()
# all_topics is in the same row order as df_full, so it is assigned positionally.
out["topic"] = all_topics
# Attach the human-readable topic name from the summary table.
name_map = topic_info.set_index("topic")["name"].to_dict()
out["topic_name"] = out["topic"].map(name_map)
out.to_csv(TOPICS_PER_DOC_CSV, index=False)
print(f"[ok] Wrote per-doc topics → {TOPICS_PER_DOC_CSV}")


# Small quality check
# The ten most frequent topics, listed in topic-id order (so ids that are not
# among the ten largest are simply absent from the listing).
top10_counts = out["topic"].value_counts().head(10)
print(top10_counts.sort_index())

print("\nPreview:")
preview_rows = out.head(5)
display(preview_rows)


[info] Loaded 114,522 documents for Topic 0 re-clustering


2025-10-27 14:18:02,484 - BERTopic - Embedding - Transforming documents to embeddings.


[info] Training subset: 20,000 docs (of 114,522)


Batches: 100%|██████████| 625/625 [06:27<00:00,  1.61it/s]
2025-10-27 14:24:30,666 - BERTopic - Embedding - Completed ✓
2025-10-27 14:24:30,666 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-10-27 14:24:52,389 - BERTopic - Dimensionality - Completed ✓
2025-10-27 14:24:52,391 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-10-27 14:24:58,808 - BERTopic - Cluster - Completed ✓
2025-10-27 14:24:58,810 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2025-10-27 14:25:01,844 - BERTopic - Representation - Completed ✓
2025-10-27 14:25:01,845 - BERTopic - Topic reduction - Reducing number of topics
2025-10-27 14:25:01,910 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-10-27 14:25:04,775 - BERTopic - Representation - Completed ✓
2025-10-27 14:25:04,779 - BERTopic - Topic reduction - Reduced number of topics from 130 to 49


[ok] Fit complete on training subset.
[ok] Wrote topic summary → topic0_summary_mts30_topic0.csv


Batches: 100%|██████████| 64/64 [00:57<00:00,  1.11it/s]
2025-10-27 14:26:02,886 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-10-27 14:26:08,281 - BERTopic - Dimensionality - Completed ✓
2025-10-27 14:26:08,282 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-10-27 14:26:08,423 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 64/64 [00:40<00:00,  1.60it/s]3s/it]
2025-10-27 14:26:48,542 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-10-27 14:26:49,665 - BERTopic - Dimensionality - Completed ✓
2025-10-27 14:26:49,666 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-10-27 14:26:49,814 - BERTopic - Cluster - Completed ✓
Batches: 100%|██████████| 64/64 [00:40<00:00,  1.57it/s]4s/it]
2025-10-27 14:27:30,617 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-10-27 14:27:31,685 - BERTopic - Dimensionality - Completed ✓
2025-10-27 1

[ok] Wrote per-doc topics → topic0_per_doc_mts30_topic0.csv
topic
-1    12232
 0    72234
 1     3496
 2     3529
 3     3305
 4     2856
 5     1128
 6     1101
 7      989
 9      690
Name: count, dtype: int64

Preview:


Unnamed: 0,id,subreddit,kind,dt,clean_for_topics,parent_topic,parent_topic_name,topic,topic_name
0,gnle2w,IsraelPalestine,post,2020-05-20 23:18:00+00:00,can we stop blaming israel for kids hamas kill...,0,0_genocide_comments_israel_hamas,0,0_hamas_gaza_genocide_palestinians
1,lb9cr8,IsraelPalestine,post,2021-02-02 23:08:05+00:00,the israel iran conflict,0,0_genocide_comments_israel_hamas,0,0_hamas_gaza_genocide_palestinians
2,mlaert,IsraelPalestine,post,2021-04-06 12:23:37+00:00,bds founder in his own words please see some q...,0,0_genocide_comments_israel_hamas,0,0_hamas_gaza_genocide_palestinians
3,na4hkg,IsraelPalestine,post,2021-05-11 18:52:42+00:00,no child should be scared they won t see tomor...,0,0_genocide_comments_israel_hamas,0,0_hamas_gaza_genocide_palestinians
4,nm40u4,IsraelPalestine,post,2021-05-27 09:59:57+00:00,people are accusing israel of genocide human r...,0,0_genocide_comments_israel_hamas,0,0_hamas_gaza_genocide_palestinians
