In [1]:
# STEP 6 - compare the results from BERTopic

import pandas as pd

# Load the topic table of every run, keeping only real topics
# (Topic == -1 is the outlier/unassigned bucket and is dropped).
summaries = {}
for size in (50, 30, 10):
    topic_table = pd.read_csv(f"topic_summary_mts{size}.csv")
    assigned = topic_table[topic_table["Topic"] != -1]
    summaries[size] = assigned

    # Quick textual overview: topic count, assigned documents, five first rows.
    print(f"\n--- MIN_TOPIC_SIZE = {size} ---")
    print(f"Topics: {len(assigned)}, Total docs: {assigned['Count'].sum():,}")
    print(assigned.head(5)[["Topic","Count","Name"]])

def topic_diversity(model_topics):
    """
    Share of distinct words among the top words of all topics.

    Every topic contributes the *set* of its words (a word repeated inside one
    topic counts once). The score is
    ``unique words across all topics / total words across all topics``:
    1.0 means no word is shared between topics, lower values mean more overlap.

    Parameters
    ----------
    model_topics : iterable of str
        One representation string per topic, either a plain ``"w1, w2, w3"``
        string or a Python list literal such as ``"['w1', 'w2', 'w3']"``.

    Returns
    -------
    float
        Diversity in [0, 1]; 0.0 when there are no words at all.
    """
    import ast  # local import: the cell defining this function does not import ast

    def _words(rep):
        # List literals must be parsed, not split: splitting "['a', 'b']" on
        # ", " yields "['a'" and "'b']", which never match the same word in
        # another topic and therefore inflate the diversity score.
        text = rep.strip()
        if text.startswith("[") and text.endswith("]"):
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, (list, tuple)):
                return {str(w) for w in parsed if str(w)}
        # Plain "w1, w2" form; empty tokens are not words.
        return {w for w in rep.split(", ") if w}

    per_topic = [_words(t) for t in model_topics]
    total_words = sum(len(words) for words in per_topic)
    if total_words == 0:
        # No topics or only empty representations: avoid dividing by zero.
        return 0.0
    unique_words = len(set().union(*per_topic))
    return unique_words / total_words

# Topic diversity per run, computed from the "Representation" column.
for size in (50, 30, 10):
    representations = summaries[size]["Representation"].astype(str).tolist()
    print(f"MIN_TOPIC_SIZE={size} → topic diversity={topic_diversity(representations):.2f}")

# One row of topic-size statistics per run; the DataFrame on the last line
# is the cell's displayed result.
summary = [
    {
        "min_topic_size": size,
        "num_topics": len(df),
        "mean_docs_per_topic": df["Count"].mean(),
        "median_docs_per_topic": df["Count"].median(),
        "max_topic": df["Count"].max(),
    }
    for size, df in summaries.items()
]

pd.DataFrame(summary)





--- MIN_TOPIC_SIZE = 50 ---
Topics: 102, Total docs: 21,713
   Topic  Count                          Name
1      0   1518            0_land_the_jews_of
2      1   1018      1_hamas_hamas is_is_they
3      2    864      2_idf_the idf_the_idf is
4      3    819             3_land_you_the_to
5      4    788  4_israel_us_the us_israel is

--- MIN_TOPIC_SIZE = 30 ---
Topics: 163, Total docs: 21,749
   Topic  Count                                      Name
1      0    875                  0_idf_the idf_idf is_the
2      1    805                      1_you_my_answer_your
3      2    766    2_hamas_hamas is_support_support hamas
4      3    627  3_genocide_group_of genocide_genocide is
5      4    588              4_israel_us_the us_israel is

--- MIN_TOPIC_SIZE = 10 ---
Topics: 436, Total docs: 22,909
   Topic  Count                                       Name
1      0    870              0_idf_the idf_idf is_soldiers
2      1    592  1_genocide_intent_of genocide_genocide is
3      2    587 

Unnamed: 0,min_topic_size,num_topics,mean_docs_per_topic,median_docs_per_topic,max_topic
0,50,102,212.872549,132.5,1518
1,30,163,133.429448,85.0,875
2,10,436,52.543578,29.0,870


In [None]:
# One-time setup: install the packages the wordcloud cell imports
# (wordcloud, matplotlib, pandas).
# NOTE(review): versions are not pinned — consider pinning them so the
# notebook is reproducible. pip's own output notes that a kernel restart
# may be needed before the updated packages can be used.
# %pip install --upgrade pip
%pip install wordcloud matplotlib pandas


^C
Note: you may need to restart the kernel to use updated packages.
Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   -- ------------------------------------- 0.1/1.8 MB 1.3 MB/s eta 0:00:02
   ----------------- ---------------------- 0.8/1.8 MB 6.1 MB/s eta 0:00:01
   ---------------------------------- ----- 1.5/1.8 MB 9.6 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 9.3 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-25.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Wordclouds for BERTopic summaries (mts=50,30,10)
import ast
import os
from pathlib import Path
import math
import re
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Config
# Topic-summary CSV of each run, keyed by the run's min_topic_size.
FILES = {mts: f"topic_summary_mts{mts}.csv" for mts in (50, 30, 10)}

# Topic selection and montage layout
TOP_K_TOPICS = 12   # how many topics per run to visualize
GRID_COLS = 4       # grid columns for montage

# Wordcloud rendering (passed to WordCloud as max_words / width / height)
WC_MAX_WORDS = 80
WC_WIDTH, WC_HEIGHT = 900, 600

# Directory that receives the generated PNG files
OUTPUT_DIR = Path("wordclouds")

# Helpers 
def parse_representation(rep):
    """
    Parse a 'Representation' cell into an ordered list of unique, lower-cased terms.

    Accepted inputs:
      - a python-list-like string: "['word1', 'the word2', ...]"
      - a comma separated string:  "word1, the word2, word3"
      - a whitespace separated string (no commas): "word1 word2 word3"

    Multi-word terms keep their inner spaces in the list and comma forms, and
    letters of any script are preserved. Anything that is not a string
    (e.g. NaN from an empty CSV cell) yields an empty list.

    Returns
    -------
    list[str]
        Terms in first-seen order, lower-cased, without empty entries or
        case-insensitive duplicates.
    """
    if not isinstance(rep, str):
        return []
    rep = rep.strip()
    words = []

    # Try list literal first
    if rep.startswith("[") and rep.endswith("]"):
        try:
            parsed = ast.literal_eval(rep)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid literal: fall through to the plain-text parsing below.
            parsed = None
        if isinstance(parsed, (list, tuple)):
            words = [str(w).strip() for w in parsed]

    if not words:
        # Fallback: split on commas; if there were none, split on whitespace.
        parts = rep.split(",")
        if len(parts) == 1:
            parts = rep.split()
        # Drop punctuation (brackets, quotes, ...) but keep word characters of
        # any script, hyphens and inner spaces, so "hamas is" stays two words
        # instead of collapsing to "hamasis" and "café" is not cut to "caf".
        words = [" ".join(re.sub(r"[^\w\-\s]+", "", p).split()) for p in parts]

    # Deduplicate (case-insensitively) preserving order; skip empty terms.
    seen = set()
    out = []
    for w in words:
        wl = w.lower()
        if wl and wl not in seen:
            seen.add(wl)
            out.append(wl)
    return out

def rank_weights(words, base_weight=1.0, decay=0.9):
    """
    Turn a ranked word list into geometrically decaying weights.

    The word at position i receives base_weight * decay**i, so the first word
    gets base_weight, the second base_weight*decay, and so on. If a word
    occurs more than once, the weight of its last occurrence wins.

    Returns a dict word->weight suitable for WordCloud.generate_from_frequencies().
    """
    return {
        word: base_weight * (decay ** rank)
        for rank, word in enumerate(words)
    }

def make_wordcloud(freqs, title=None, width=WC_WIDTH, height=WC_HEIGHT, max_words=WC_MAX_WORDS):
    """
    Draw a single wordcloud on its own matplotlib figure.

    freqs:     dict word->weight, handed to WordCloud.generate_from_frequencies().
    title:     optional text shown above the cloud.
    width, height, max_words: forwarded to WordCloud; width and height also
               set the figure size (width/100 x height/100 inches at 100 dpi).

    Returns the matplotlib Figure; closing it is left to the caller.
    """
    cloud = WordCloud(
        width=width,
        height=height,
        background_color="white",
        max_words=max_words,
    ).generate_from_frequencies(freqs)

    figure, axis = plt.subplots(figsize=(width/100, height/100), dpi=100)
    axis.imshow(cloud, interpolation="bilinear")
    axis.axis("off")
    if title:
        axis.set_title(title, fontsize=14, pad=6)
    figure.tight_layout()
    return figure

def grid_wordclouds(entries, cols=GRID_COLS, title=None, save_path=None):
    """
    Render a grid (montage) of wordclouds, filled row by row.

    entries:   list of (fig_title, freq_dict); one grid cell per entry.
    cols:      number of grid columns; the row count follows from len(entries).
    title:     optional figure-level title.
    save_path: if given, the figure is also written to this path.

    Cells beyond the last entry stay blank. An entry with an empty freq_dict
    keeps its title but no cloud is drawn. An empty `entries` list produces
    a single blank row instead of an error.

    Returns the matplotlib Figure; closing it is left to the caller.
    """
    n = len(entries)
    # plt.subplots rejects a zero-row grid, so always allocate at least one row.
    rows = max(1, math.ceil(n / cols))
    # squeeze=False always returns a 2-D array of axes, whatever rows/cols are,
    # so no special-casing of single-row / single-column grids is needed.
    fig, axes = plt.subplots(rows, cols, figsize=(cols*3.8, rows*3.0), dpi=150, squeeze=False)

    # axes.flat walks the grid row by row, matching the order of `entries`.
    for idx, ax in enumerate(axes.flat):
        ax.axis("off")
        if idx >= n:
            continue  # unused trailing cell
        t, freqs = entries[idx]
        ax.set_title(t, fontsize=10, pad=4)
        if not freqs:
            # WordCloud raises on an empty frequency dict; leave the cell blank.
            continue
        wc = WordCloud(width=600, height=400, background_color="white", max_words=WC_MAX_WORDS)
        img = wc.generate_from_frequencies(freqs)
        ax.imshow(img, interpolation="bilinear")

    if title:
        fig.suptitle(title, fontsize=14)
    # Leave a small margin at the top and bottom of the figure for the suptitle.
    fig.tight_layout(rect=[0, 0.02, 1, 0.98])
    if save_path:
        fig.savefig(save_path, bbox_inches="tight")
    return fig

# Main
# For every run: read its topic summary, pick the largest topics, save one PNG
# per topic plus one montage PNG, all inside OUTPUT_DIR.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for mts, path in FILES.items():
    if not Path(path).exists():
        print(f"[skip] {path} not found.")
        continue

    df = pd.read_csv(path, low_memory=False)
    if not {"Topic", "Count", "Representation"}.issubset(df.columns):
        print(f"[warn] {path} missing expected columns (Topic, Count, Representation). Skipping.")
        continue

    # Drop the outlier topic (-1) and take top-K by Count
    topics_df = df[df["Topic"] != -1].copy()
    topics_df = topics_df.sort_values("Count", ascending=False).head(TOP_K_TOPICS)

    # Build entries: (title, freq_dict)
    entries = []
    for _, row in topics_df.iterrows():
        topic_id = int(row["Topic"])
        words = parse_representation(row["Representation"])
        if not words:
            # An empty or NaN representation gives no words, and WordCloud
            # raises on an empty frequency dict: skip this topic rather than
            # aborting the whole run.
            print(f"[warn] {path}: topic {topic_id} has no usable words. Skipping.")
            continue
        freqs = rank_weights(words, base_weight=1.0, decay=0.92)

        # Prefer the topic's Name when it is a non-blank string.
        name = row["Name"] if "Name" in row else None
        label = name if isinstance(name, str) and name.strip() else f"Topic {topic_id}"
        title = f"{label} (#{topic_id})"
        entries.append((title, freqs))

        # Save individual wordcloud
        fig = make_wordcloud(freqs, title=title)
        indiv_path = OUTPUT_DIR / f"wordcloud_mts{mts}_topic{topic_id}.png"
        fig.savefig(indiv_path, bbox_inches="tight")
        plt.close(fig)  # free the figure; many are created in this loop

    if not entries:
        # Nothing to put in the montage (no topics, or none with usable words).
        print(f"[warn] {path}: no topics to plot. Skipping grid.")
        continue

    # Save grid montage
    grid_path = OUTPUT_DIR / f"wordclouds_grid_mts{mts}.png"
    grid_wordclouds(entries, cols=GRID_COLS, title=f"Top {len(entries)} topics — min_topic_size={mts}", save_path=grid_path)
    plt.close("all")

    print(f"[ok] Saved {len(entries)} individual wordclouds and grid → {grid_path}")


[ok] Saved 12 individual wordclouds and grid → wordclouds\wordclouds_grid_mts50.png
[ok] Saved 12 individual wordclouds and grid → wordclouds\wordclouds_grid_mts30.png
[ok] Saved 12 individual wordclouds and grid → wordclouds\wordclouds_grid_mts10.png
