In [3]:
# This cell sits above the pipeline cell that imports pandas, so import it
# here too; otherwise a fresh "Restart & Run All" fails with NameError on `pd`.
import pandas as pd

# Raw speech table; run_macro_series reads the columns political_group,
# speech_content, speaker_name, topic, macro_topic, language and date from it.
df = pd.read_csv('eu_speeches_all_2025-10-16.csv')

# Pipeline

In [2]:
import numpy as np, pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# ===== settings (defaults; can be overridden in run_macro_series) =====
# NOTE: MIN_PER_PARTY and BOILER are read directly as globals; run_macro_series
# exposes no parameter for them, so they can only be changed here.

# Period alias passed to `Series.dt.to_period` when binning speech dates.
TIME_BIN   = "Y"   # "Y", "2Y", "3Y"
# Speaker mode only: every kept party needs at least this many speakers in a bin.
MIN_PER_PARTY = 10
# Number of label permutations used to build the null distribution per bin.
NULL_ITERS = 300
# Number of bootstrap resamples used for the 2.5/97.5 percentile interval per bin.
BOOT_ITERS = 300
# Model identifier handed to SentenceTransformer(...).
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# Upper bound on the number of PCA components kept after embedding.
MAX_DIMS   = 100
# Lower-case phrases that _scrub_boiler replaces with a space before embedding.
# All entries are English; matching is plain substring replacement.
BOILER = [
    "mr president","madam president","honourable members",
    "president","commissioner","high representative","thank you"
]

# ===== helpers =====

def _scrub_boiler(s: str) -> str:
    """
    Lower-case `s` and blank out every phrase listed in BOILER.

    Each phrase is replaced by a single space using plain substring
    replacement, in the order the phrases appear in BOILER. Because the match
    is not word-bounded, a phrase is also removed when it occurs inside a
    longer word.
    """
    cleaned = str(s).lower()
    for phrase in BOILER:
        cleaned = cleaned.replace(phrase, " ")
    return cleaned

def _length_in_tokens(text) -> int:
    """Crude proxy for speaking time; replace with duration if available."""
    return max(1, len(str(text).split()))

def compute_cpc(X, y, w=None):
    """
    Cluster-Polarization Coefficient (CPC) and its adjusted variant (CPC_adj).

    CPC is the share of total scatter that lies between group centroids,
    BSS / TSS.

    Parameters
    ----------
    X : array-like, shape (N, J)
        Data matrix (e.g., embeddings or PCA scores).
    y : array-like, shape (N,)
        Group labels (e.g., party IDs).
    w : array-like, shape (N,), optional
        Observation weights. Negative values are clipped to zero and the
        weights are normalised to sum to one, so in the weighted case
        BSS/WSS/TSS are weighted averages rather than raw sums.

    Returns
    -------
    stats : dict
        Keys "BSS", "WSS", "TSS", "CPC", "CPC_adj", "N", "J", "K".
        "CPC_adj" is NaN when weights are given, or when N <= J or N <= J*K.

    Raises
    ------
    ValueError
        If weights are given and their clipped sum is not positive.
    """
    data = np.asarray(X)
    labels = np.asarray(y)
    n_obs, n_dims = data.shape
    group_ids, group_sizes = np.unique(labels, return_counts=True)
    n_groups = len(group_ids)

    between = 0.0
    within = 0.0
    adjusted = np.nan  # stays NaN unless the unweighted adjustment applies

    if w is not None:
        # Weighted variant: normalise the (clipped) weights to shares.
        weights = np.clip(np.asarray(w, dtype=float), 0.0, None)
        if weights.sum() <= 0:
            raise ValueError("All weights are zero or negative.")
        share = weights / weights.sum()

        grand_mean = (share[:, None] * data).sum(axis=0)
        total = (share[:, None] * (data - grand_mean) ** 2).sum()

        for gid in group_ids:
            member = labels == gid
            g_share = share[member]
            g_data = data[member]
            mass = g_share.sum()
            if mass <= 0:
                # A group carrying no weight contributes nothing.
                continue
            centre = (g_share[:, None] * g_data).sum(axis=0) / mass
            within += (g_share[:, None] * (g_data - centre) ** 2).sum()
            between += mass * ((centre - grand_mean) ** 2).sum()

        ratio = between / (total + 1e-12)
    else:
        # Unweighted variant: raw sums of squares.
        grand_mean = data.mean(axis=0)
        for gid, size in zip(group_ids, group_sizes):
            g_data = data[labels == gid]
            centre = g_data.mean(axis=0)
            within += ((g_data - centre) ** 2).sum()
            between += size * ((centre - grand_mean) ** 2).sum()
        total = between + within
        ratio = between / (total + 1e-12)

        # Adjusted CPC (Equation 3 in Mehlhaff); N > J*K also rules out a
        # zero denominator.
        if n_obs > n_dims and n_obs > n_dims * n_groups:
            adjusted = 1.0 - (1.0 - ratio) * (n_obs - n_dims) / (n_obs - n_dims * n_groups)

    return {
        "BSS": float(between),
        "WSS": float(within),
        "TSS": float(total),
        "CPC": float(ratio),
        "CPC_adj": float(adjusted),
        "N": int(n_obs),
        "J": int(n_dims),
        "K": int(n_groups),
    }

def between_within_scatter(X, labels):
    """Legacy helper: (BSS, WSS, TSS) from the unweighted compute_cpc."""
    result = compute_cpc(X, labels)
    return tuple(result[key] for key in ("BSS", "WSS", "TSS"))

def bss_tss(X, y):
    """Legacy helper: unadjusted, unweighted CPC (BSS / TSS) from compute_cpc."""
    return compute_cpc(X, y)["CPC"]

def _weighted_cpc(X, y, w):
    """Convenience wrapper: the "CPC" entry of compute_cpc(X, y, w=w)."""
    return compute_cpc(X, y, w=w)["CPC"]

def pairwise_std_centroid_dists(X, y):
    """
    Standardised distance between every pair of group centroids.

    For groups a and b the value is ||mu_a - mu_b|| divided by the mean of
    the two groups' spreads, where a group's spread is
    sqrt(sum of squared deviations from its centroid / (n - 1)) and is 0 for
    groups with at most one member. Purely descriptive; not part of CPC.

    Returns a DataFrame with columns party_a, party_b, std_dist, sorted by
    std_dist in descending order.
    """
    data = np.asarray(X)
    labels = np.asarray(y)
    parties = np.unique(labels)

    centroid = {}
    spread = {}
    for party in parties:
        members = data[labels == party]
        centre = members.mean(axis=0)
        centroid[party] = centre
        n_members = members.shape[0]
        if n_members > 1:
            sq_dev = ((members - centre) ** 2).sum()
            spread[party] = float(np.sqrt(sq_dev / max(n_members - 1, 1)))
        else:
            spread[party] = 0.0

    records = []
    n_parties = len(parties)
    for i in range(n_parties):
        for j in range(i + 1, n_parties):
            a, b = parties[i], parties[j]
            gap = float(np.linalg.norm(centroid[a] - centroid[b]))
            # The small constant keeps the ratio finite when both spreads are 0.
            scale = 0.5 * (spread[a] + spread[b]) + 1e-12
            records.append((a, b, gap / scale))

    table = pd.DataFrame(records, columns=["party_a", "party_b", "std_dist"])
    return table.sort_values("std_dist", ascending=False)

def _aggregate_by_speaker(X_bin, df_bin):
    """Equal weight per speaker (one vector per MEP per bin)."""
    Xs, ys = [], []
    for _, g in df_bin.groupby("speaker_name", sort=False):
        pos = g.index.to_numpy()  # df_bin must be 0..N-1 (reset_index done upstream)
        Xs.append(X_bin[pos].mean(axis=0))
        ys.append(g["political_group"].mode().iloc[0])
    return np.stack(Xs), np.array(ys)

def _build_speech_matrix(X_bin, d_bin, cap_per_speaker=5):
    """
    Speech-level design in which each speaker contributes a limited number
    of speeches, so that prolific speakers cannot dominate a bin.

    Speakers are visited in the order of their categorical code (built from
    the stringified speaker_name) and, for each, the first `cap_per_speaker`
    rows in `d_bin` order are kept.

    Returns six aligned arrays
    (X_use, y_use, spk_ids, lg_use, tp_use, w_use): features, party labels,
    speaker codes, languages, topics and token-count weights. All six are
    None when no rows are selected.
    """
    speaker_code = pd.Categorical(d_bin["speaker_name"].astype(str)).codes  # 0..S-1 per row

    picked_rows = []
    picked_codes = []
    for code in np.unique(speaker_code):
        rows_of_speaker = np.where(speaker_code == code)[0]
        picked_rows.extend(rows_of_speaker[:cap_per_speaker])
        picked_codes.extend([code] * min(len(rows_of_speaker), cap_per_speaker))

    if len(picked_rows) == 0:
        return (None,) * 6

    picked_rows = np.array(picked_rows, dtype=int)
    picked_codes = np.array(picked_codes, dtype=int)

    def _as_text(column):
        # Stringified column values for the selected rows, in selection order.
        return d_bin[column].astype(str).iloc[picked_rows].values

    selected_text = d_bin["speech_content"].iloc[picked_rows]
    weights = np.array(list(map(_length_in_tokens, selected_text)), dtype=float)

    return (
        X_bin[picked_rows],
        _as_text("political_group"),
        picked_codes,
        _as_text("language"),
        _as_text("topic"),
        weights,
    )

# ===== main: multilingual, language-centered, agenda-controlled =====

# Column order of the result table (shared by empty, failed and valid rows).
_RESULT_COLUMNS = [
    "languages", "macro_topic", "time_bin", "ok", "reason",
    "obs_bss_tss", "cpc", "cpc_adj", "bss", "wss", "tss",
    "null_mean", "null_sd", "z", "ci_lo", "ci_hi",
    "n_docs", "n_parties", "counts", "mode", "weight_by",
]
_VALID_MODES = ("speaker", "speech_capped", "speech_weighted")


def _first_mode(values):
    """Most frequent value of a Series (first entry of Series.mode())."""
    return values.mode().iloc[0]


def _center_within(E, *keys):
    """Subtract, in place, the mean of every cell spanned by the key arrays."""
    names = [f"k{i}" for i in range(len(keys))]
    cells = pd.DataFrame(dict(zip(names, keys))).groupby(names, sort=False).indices
    for rows in cells.values():
        E[rows] -= E[rows].mean(axis=0, keepdims=True)


def _bin_row(d_bin, macro_label, bin_val, mode, weight_by, *, reason, n_docs=0, counts=None):
    """Result row for one bin with every statistic set to NaN; ok iff reason is None."""
    row = dict.fromkeys(_RESULT_COLUMNS, np.nan)
    row.update(
        languages=",".join(sorted(d_bin["language"].unique())),
        macro_topic=macro_label,
        time_bin=bin_val,
        ok=reason is None,
        reason=reason,
        n_docs=int(n_docs),
        n_parties=0 if counts is None else int(len(counts)),
        counts={} if counts is None else counts.to_dict(),
        mode=mode,
        weight_by=(weight_by if mode == "speech_weighted" else None),
    )
    return row


def _speaker_design(X_bin, d_bin, keep_min_speakers):
    """One row per speaker; returns (X, y, kept) where `kept` masks the speakers retained."""
    X, y = _aggregate_by_speaker(X_bin, d_bin)
    kept = np.ones(len(y), dtype=bool)
    if keep_min_speakers and keep_min_speakers > 0:
        sizes = pd.Series(y).value_counts()
        kept = np.isin(y, sizes[sizes >= keep_min_speakers].index)
    return X[kept], y[kept], kept


def _speaker_units(d_bin, kept):
    """Permutation units for speaker mode: (row_unit, unit_lab, unit_meta), one unit per kept speaker."""
    by_spk = d_bin.groupby("speaker_name", sort=False)
    meta = pd.DataFrame(
        {
            "lab": by_spk["political_group"].agg(_first_mode).astype(str),
            "lg": by_spk["language"].agg(_first_mode).astype(str),
            "tp": by_spk["topic"].agg(_first_mode).astype(str),
        }
    )
    # Keep exactly the speakers that survived the party filter so the
    # permuted labels stay aligned with the rows of the design matrix.
    meta = meta[kept].reset_index(drop=True)
    return np.arange(len(meta)), meta["lab"].to_numpy(dtype=object), meta[["lg", "tp"]]


def _speech_design(
    X_bin, d_bin, *, mode, cap_per_speaker, weight_by,
    min_speeches_per_party, min_unique_speakers_per_party,
):
    """
    Speech-level design for the two speech modes after the party filters.

    Returns (X, y, spk_ids, lg, tp, w) as aligned arrays (w is None unless
    token weighting is active), or None when capping selects no rows.
    """
    if mode == "speech_capped":
        X, y, spk, lg, tp, _ = _build_speech_matrix(
            X_bin, d_bin, cap_per_speaker=cap_per_speaker
        )
        if X is None:
            return None
        w = None
    else:
        X = X_bin
        y = d_bin["political_group"].astype(str).values
        spk = pd.Categorical(d_bin["speaker_name"].astype(str)).codes
        lg = d_bin["language"].astype(str).values
        tp = d_bin["topic"].astype(str).values
        w = None
        if weight_by == "tokens":
            w = np.array(
                [_length_in_tokens(t) for t in d_bin["speech_content"]], dtype=float
            )

    n_speeches = pd.Series(y).value_counts()
    n_speakers = pd.Series(spk).groupby(pd.Series(y)).nunique()
    keep = [
        p
        for p in n_speeches.index
        if n_speeches.get(p, 0) >= min_speeches_per_party
        and n_speakers.get(p, 0) >= min_unique_speakers_per_party
    ]
    m = np.isin(y, keep)
    return X[m], y[m], spk[m], lg[m], tp[m], (None if w is None else w[m])


def _speech_units(y, spk_ids, lg, tp):
    """
    Permutation units for speech modes: one unit per speaker, in order of
    first appearance, labelled with the speaker's most frequent party and
    the language/topic of the speaker's first row.
    """
    row_unit, _ = pd.factorize(spk_ids)
    frame = pd.DataFrame({"u": row_unit, "lab": y, "lg": lg, "tp": tp})
    unit_lab = frame.groupby("u", sort=True)["lab"].agg(_first_mode).to_numpy(dtype=object)
    unit_meta = frame.drop_duplicates("u")[["lg", "tp"]].reset_index(drop=True)
    return row_unit, unit_lab, unit_meta


def _null_and_ci(
    X, y, w, row_unit, unit_lab, unit_meta, *,
    shuffle_strata, null_iters, boot_iters, pick,
):
    """
    Cluster-aware permutation null and percentile bootstrap interval.

    Labels are shuffled between units (speakers) inside each stratum and then
    broadcast to the unit's rows. The bootstrap resamples units with
    replacement and keeps every row of a unit once per time the unit is
    drawn. Returns (null_mean, null_sd, ci_lo, ci_hi).
    """
    strata_keys = {
        "language_topic": ["lg", "tp"],
        "language": ["lg"],
        "topic": ["tp"],
    }.get(shuffle_strata)
    if strata_keys is None:
        blocks = [np.arange(len(unit_lab))]
    else:
        # unit_meta has a 0..U-1 index, so block indices are unit positions.
        blocks = [
            blk.index.to_numpy()
            for _, blk in unit_meta.groupby(strata_keys, sort=False)
        ]

    rng = np.random.default_rng(42)
    nulls = np.empty(null_iters)
    for i in range(null_iters):
        perm = unit_lab.copy()
        for idx in blocks:
            vals = unit_lab[idx]  # fancy indexing already yields a copy
            rng.shuffle(vals)
            perm[idx] = vals
        nulls[i] = pick(compute_cpc(X, perm[row_unit], w=w))
    null_mean = float(nulls.mean())
    null_sd = float(nulls.std(ddof=1))

    # Row positions of every unit, so a unit drawn k times contributes its
    # rows k times (a boolean membership mask would lose that multiplicity).
    order = np.argsort(row_unit, kind="stable")
    unit_rows = np.split(order, np.flatnonzero(np.diff(row_unit[order])) + 1)
    n_units = len(unit_rows)

    rngb = np.random.default_rng(1000)
    boots = np.empty(boot_iters)
    for i in range(boot_iters):
        drawn = rngb.integers(0, n_units, size=n_units)
        take = np.concatenate([unit_rows[u] for u in drawn])
        boots[i] = pick(
            compute_cpc(X[take], y[take], w=None if w is None else w[take])
        )
    lo, hi = np.percentile(boots, [2.5, 97.5])
    return null_mean, null_sd, float(lo), float(hi)


def run_macro_series(
    df,
    macro_topic,
    languages=None,
    *,
    # mode switch:
    mode="speaker",                 # "speaker" | "speech_capped" | "speech_weighted"
    cap_per_speaker=5,              # used only if mode=="speech_capped"
    keep_min_speakers=0,            # speaker-mode only: drop parties with < keep_min_speakers speakers (0 = off)
    min_speeches_per_party=30,      # speech_* modes: min speeches/party/bin (after cap for speech_capped)
    min_unique_speakers_per_party=5,# speech_* modes: also require ≥ this many distinct speakers/party/bin
    require_parties=3,              # both modes: require at least this many parties kept
    weight_by="count",              # speech_weighted only: "count" or "tokens"
    demean_level="language_topic",  # "language_topic" (default), "language", or None
    shuffle_strata="language_topic",# "language_topic" | "language" | "topic" | None
    drop_parties=("ID",),           # parties to drop; set to None or () to keep all
    time_bin=None,                  # override global TIME_BIN if not None
    model_name=None,                # override global MODEL_NAME if not None
    pca_max_dims=None,              # override global MAX_DIMS if not None
    use_cpc_adj=True,               # if True, z/CI are based on CPC_adj when defined, else CPC
    null_iters=None,
    boot_iters=None,
):
    """
    Run the Mehlhaff-style CPC pipeline on a macro-topic slice (or ALL).

    Steps
    -----
    1. Filter `df` to the macro topic / languages, drop rows with missing
       fields, speeches shorter than 40 characters and `drop_parties`.
    2. Embed every speech once (multilingual model), remove language or
       language x topic means (`demean_level`), centre globally and fit one
       PCA for the whole slice so bins are comparable.
    3. For every time bin build the design for `mode`, compute CPC / CPC_adj,
       a speaker-clustered stratified permutation null and a speaker-clustered
       bootstrap percentile interval.
    4. Compute pairwise standardised centroid distances on the design of the
       latest valid bin.

    Parameters
    ----------
    df : DataFrame with columns political_group, speech_content,
        speaker_name, topic, macro_topic, language, date.
    macro_topic : value matched against df["macro_topic"]; None, "all",
        "__all__" or "overall" (case-insensitive) selects every speech.
    languages : optional iterable of language codes to keep.
    Other keyword arguments are described next to their defaults above.

    Returns
    -------
    (res, pw)
        res : one row per time bin with the columns in _RESULT_COLUMNS.
              `obs_bss_tss` holds the measure actually used: CPC_adj when
              `use_cpc_adj` is set and it is finite, otherwise raw CPC.
        pw  : DataFrame from pairwise_std_centroid_dists for the latest valid
              bin, or None when no bin is valid or fewer than two parties
              remain.

    Raises
    ------
    ValueError
        If `mode` is not one of the three supported modes (checked before any
        embedding work is done).
    """
    if mode not in _VALID_MODES:
        raise ValueError("mode must be 'speaker', 'speech_capped', or 'speech_weighted'")

    # resolve defaults
    time_bin = TIME_BIN if time_bin is None else time_bin
    model_name = MODEL_NAME if model_name is None else model_name
    pca_max_dims = MAX_DIMS if pca_max_dims is None else pca_max_dims
    null_iters = NULL_ITERS if null_iters is None else null_iters
    boot_iters = BOOT_ITERS if boot_iters is None else boot_iters

    # ---- filter / hygiene ----
    if (macro_topic is None) or (str(macro_topic).lower() in {"all", "__all__", "overall"}):
        d = df.copy()
        macro_label = "All speeches"
    else:
        d = df[df["macro_topic"] == macro_topic].copy()
        macro_label = macro_topic

    if languages is not None:
        d = d[d["language"].isin(languages)].copy()

    d = d.dropna(
        subset=["political_group", "speech_content", "speaker_name", "topic", "language", "date"]
    )
    d = d[d["speech_content"].astype(str).str.len() >= 40]

    if drop_parties:
        d = d[~d["political_group"].isin(drop_parties)].copy()

    d["time_bin"] = pd.to_datetime(d["date"]).dt.to_period(time_bin).dt.to_timestamp()
    d = d.reset_index(drop=True)

    if d.empty:
        return pd.DataFrame(columns=_RESULT_COLUMNS), None

    # ---- Embed once for the whole slice (multilingual) ----
    model = SentenceTransformer(model_name)
    model.max_seq_length = 512
    texts = d["speech_content"].astype(str).map(_scrub_boiler).tolist()
    E = model.encode(
        texts,
        batch_size=32,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )

    # ---- Remove language / agenda offsets before PCA (controlled by demean_level) ----
    langs = d["language"].astype(str).values
    topics = d["topic"].astype(str).values

    E_adj = E.copy()
    if demean_level in {"language", "language_topic"}:
        _center_within(E_adj, langs)            # (a) centre within language
    if demean_level == "language_topic":
        _center_within(E_adj, langs, topics)    # (b) finer: language x micro-topic
    # demean_level None -> no de-meaning

    # global center + PCA once (bins comparable)
    E_c = E_adj - E_adj.mean(axis=0, keepdims=True)
    n_samples, n_features = E_c.shape
    pca_k = max(2, min(pca_max_dims, n_features, n_samples - 1))
    E_p = PCA(n_components=pca_k, random_state=42).fit_transform(E_c)

    def _pick(stats):
        # Measure used for obs / null / bootstrap: CPC_adj when requested and defined.
        if use_cpc_adj and np.isfinite(stats["CPC_adj"]):
            return stats["CPC_adj"]
        return stats["CPC"]

    rows = []
    latest = None  # (X, y) design of the most recent valid bin

    # ---- Per-bin loop (bins visited in chronological order) ----
    for bin_val, g in d.groupby("time_bin", sort=True):
        idx = g.index.to_numpy()
        X_bin = E_p[idx]
        d_bin = d.iloc[idx].reset_index(drop=True)

        w = None
        if mode == "speaker":
            X_mat, y_lab, kept = _speaker_design(X_bin, d_bin, keep_min_speakers)
            counts = pd.Series(y_lab).value_counts()
            ok_bin = (len(counts) >= require_parties) and (counts.min() >= MIN_PER_PARTY)
        else:
            design = _speech_design(
                X_bin, d_bin,
                mode=mode,
                cap_per_speaker=cap_per_speaker,
                weight_by=weight_by,
                min_speeches_per_party=min_speeches_per_party,
                min_unique_speakers_per_party=min_unique_speakers_per_party,
            )
            if design is None:
                rows.append(
                    _bin_row(d_bin, macro_label, bin_val, mode, weight_by, reason="empty")
                )
                continue
            X_mat, y_lab, spk_ids, lg_use, tp_use, w = design
            counts = pd.Series(y_lab).value_counts()
            ok_bin = len(counts) >= require_parties

        if not ok_bin:
            rows.append(
                _bin_row(
                    d_bin, macro_label, bin_val, mode, weight_by,
                    reason="low_n", n_docs=len(y_lab), counts=counts,
                )
            )
            continue

        # observed CPC / CPC_adj (CPC_adj is NaN whenever weights are used)
        obs_stats = compute_cpc(X_mat, y_lab, w=w)
        obs_val = _pick(obs_stats)

        if mode == "speaker":
            row_unit, unit_lab, unit_meta = _speaker_units(d_bin, kept)
        else:
            row_unit, unit_lab, unit_meta = _speech_units(y_lab, spk_ids, lg_use, tp_use)

        null_mean, null_sd, lo, hi = _null_and_ci(
            X_mat, y_lab, w, row_unit, unit_lab, unit_meta,
            shuffle_strata=shuffle_strata,
            null_iters=null_iters,
            boot_iters=boot_iters,
            pick=_pick,
        )

        row = _bin_row(
            d_bin, macro_label, bin_val, mode, weight_by,
            reason=None, n_docs=X_mat.shape[0], counts=counts,
        )
        row.update(
            obs_bss_tss=float(obs_val),
            cpc=float(obs_stats["CPC"]),
            cpc_adj=float(obs_stats["CPC_adj"]),
            bss=float(obs_stats["BSS"]),
            wss=float(obs_stats["WSS"]),
            tss=float(obs_stats["TSS"]),
            null_mean=float(null_mean),
            null_sd=float(null_sd),
            z=float((obs_val - null_mean) / (null_sd + 1e-12)),
            ci_lo=float(lo),
            ci_hi=float(hi),
        )
        rows.append(row)
        latest = (X_mat, y_lab)

    res = pd.DataFrame(rows, columns=_RESULT_COLUMNS).sort_values("time_bin")

    # ---- Pairwise distances for latest valid bin (same design as the active mode) ----
    pw = None
    if latest is not None and len(np.unique(latest[1])) >= 2:
        pw = pairwise_std_centroid_dists(*latest)

    return res, pw




# Run all topics

### Machine

In [None]:
# =========================
# EP Polarization: V6 Batch Runner (VERBOSE)
# =========================
# Prereqs in memory: df, run_macro_series (V6 version)
# Creates per-topic folders under reports_v6/<topic-slug>/ with CSVs + a one-page PDF,
# plus an ALL-speeches slice under reports_v6/_all_speeches/.

import os, re, sys, time, math, datetime as dt
import numpy as np
import pandas as pd

import matplotlib
matplotlib.use("Agg")  # headless-safe
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mtick
from matplotlib.gridspec import GridSpec

# ---------- Pretty logging ----------
def _now():
    return dt.datetime.now().strftime("%H:%M:%S")

def log(msg):
    """Print `msg` prefixed with the current time in brackets, flushing immediately."""
    line = "[{}] {}".format(_now(), msg)
    print(line, flush=True)

# ---------- Progress bar helper ----------
try:
    from tqdm import tqdm
except Exception:
    tqdm = None

def _progress(iterable, desc):
    """
    Wrap `iterable` in a tqdm progress bar when tqdm could be imported;
    otherwise log `desc` once and hand the iterable back unchanged.
    """
    if tqdm is None:
        log(desc)
        return iterable
    return tqdm(iterable, desc=desc, ncols=100)

# ---------- Topics (deduped + normalized) ----------
# Macro-topic labels as written by hand. They are passed through
# _normalize_topic and de-duplicated into TOPICS below.
# NOTE(review): _normalize_topic turns the en dash in
# "Taxation & anti–money laundering" into a plain hyphen — confirm that the
# normalized label still matches the values stored in df["macro_topic"].
# NOTE(review): "Justice, security & policing" and "Security & policing" are
# both listed — confirm both exist as separate macro topics in the data.
RAW_TOPICS = [
    "Agriculture & fisheries",
    "Climate, environment & biodiversity",
    "Development & humanitarian aid",
    "Digital policy & data protection",
    "EU budget & MFF",
    "Economy & industrial policy",
    "Education, culture & sport",
    "Energy & energy security",
    "Enlargement & neighbourhood policy",
    "Foreign policy — Americas",
    "Foreign policy — Asia-Pacific",
    "Foreign policy — Europe & Eastern Neighbourhood",
    "Foreign policy — Middle East & North Africa",
    "Foreign policy — Sub-Saharan Africa",
    "Health",
    "Institutional affairs & governance",
    "Justice, security & policing",
    "Media, information & disinformation",
    "Migration & asylum",
    "Monetary & financial stability",
    "Procedural & Parliamentary business",
    "Research, innovation & space",
    "Rule of law & fundamental rights",
    "Security & defence",
    "Security & policing",
    "Single market, competition & consumer protection",
    "Social policy & employment",
    "Taxation & anti–money laundering",
    "Trade & globalization",
    "Transport & mobility",
]

def _normalize_topic(t):
    t = t.replace("&amp;", "&")
    t = t.replace("\u2011", "-")   # non-breaking hyphen → hyphen
    t = t.replace("\u2013", "-")   # en dash → hyphen
    t = t.replace("\u2014", "—")   # keep em dash for display
    t = re.sub(r"\s+", " ", t).strip()
    return t

# Normalized, de-duplicated and alphabetically sorted topic labels.
TOPICS = sorted(set(_normalize_topic(t) for t in RAW_TOPICS))

# ---------- Filenames & folders ----------
# Root output folder; created as soon as this cell runs (no error if it
# already exists).
BASE_DIR = "reports_v6"
os.makedirs(BASE_DIR, exist_ok=True)

def _slugify(name: str) -> str:
    s = name.lower()
    s = s.replace("&", "and")
    s = re.sub(r"[^\w\s-]", "", s)
    s = re.sub(r"\s+", "-", s).strip("-")
    s = re.sub(r"-+", "-", s)
    return s

# ---------- Robust plotting helpers ----------
def _ensure_time_and_numeric(df):
    df = df.copy()
    if "time_bin" in df:
        df["time_bin"] = pd.to_datetime(df["time_bin"], errors="coerce")
    for col in ["obs_bss_tss","ci_lo","ci_hi"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    return df

def _split_ok_low(df):
    if "ok" not in df.columns:
        ok = df.copy(); low = df.iloc[0:0].copy()
    else:
        ok  = df[df["ok"]==True].copy()
        low = df[df["ok"]!=True].copy()
    ok  = ok.sort_values("time_bin")
    low = low.sort_values("time_bin")
    return ok, low

def _shared_ylim(res_list):
    """Compute one y-range that fits the valid rows of several result frames.

    For every frame the upper CI bound (`ci_hi`) of its valid rows is used
    when that column exists, otherwise the point estimate (`obs_bss_tss`).

    Parameters
    ----------
    res_list : iterable of pandas.DataFrame
        Result tables that will be drawn on a shared axis.

    Returns
    -------
    (0, top) : tuple
        `top` is 15% above the largest collected value. When nothing usable
        is found, or the largest value is not positive, 0.05 is used as the
        base so the axis never collapses or flips.
    """
    default_top = 0.05
    tops = []
    for res in res_list:
        ok, _ = _split_ok_low(_ensure_time_and_numeric(res))
        if ok.empty:
            continue
        if "ci_hi" in ok:
            tops.extend(ok["ci_hi"].dropna().tolist())
        elif "obs_bss_tss" in ok:
            tops.extend(ok["obs_bss_tss"].dropna().tolist())
    ymax = max(tops) if tops else default_top
    # A non-positive maximum would give (0, <=0): identical or inverted limits
    # once passed to set_ylim. Mirror the guard used for the colorbar range.
    if not ymax > 0:
        ymax = default_top
    return (0, ymax * 1.15)

def _heat_cbar_limits(pw_list):
    mx = 0.0
    for pw in pw_list:
        if pw is None or len(pw)==0 or "std_dist" not in (pw.columns if hasattr(pw, "columns") else []): 
            continue
        vals = pd.to_numeric(pw["std_dist"], errors="coerce").dropna()
        if not vals.empty: mx = max(mx, float(vals.max()))
    if mx <= 0: mx = 0.01
    return (0.0, mx)

def plot_time_series(ax, res_df, title, show_legend=False, ylim=None):
    """Draw one CPC-over-time panel on `ax`.

    Valid rows are drawn as a marked line (with a shaded band when both
    `ci_lo` and `ci_hi` are available); rows that are not flagged ok are
    drawn as hollow grey circles at y = 0. If nothing can be drawn the axis
    is switched off and a "No valid rows" note is shown instead.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    res_df : results table; coerced through _ensure_time_and_numeric.
    title : panel title.
    show_legend : add a legend when something was drawn.
    ylim : optional (bottom, top) pair applied to the y-axis.
    """
    frame = _ensure_time_and_numeric(res_df)
    valid, flagged = _split_ok_low(frame)
    has_artists = False

    line_ready = (not valid.empty) and {"time_bin", "obs_bss_tss"}.issubset(valid.columns)
    if line_ready:
        if {"ci_lo", "ci_hi"}.issubset(valid.columns):
            # fill_between is given the dates as matplotlib date numbers
            band_x = mdates.date2num(valid["time_bin"])
            ax.fill_between(band_x, valid["ci_lo"].values, valid["ci_hi"].values, alpha=0.2, label="95% CI")
        ax.plot(valid["time_bin"], valid["obs_bss_tss"], marker="o", lw=1.8, label="CPC (adj/unadj)")
        ax.xaxis.set_major_locator(mdates.YearLocator(base=2))
        ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
        has_artists = True

    markers_ready = (not flagged.empty) and ("time_bin" in flagged)
    if markers_ready:
        baseline = np.zeros(len(flagged))
        ax.scatter(flagged["time_bin"], baseline, facecolors="none", edgecolors="grey", lw=1.0, label="low_n")
        has_artists = True

    ax.set_title(title, fontsize=11)
    ax.set_ylabel("CPC")
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    if ylim:
        ax.set_ylim(*ylim)
    ax.grid(alpha=0.25)

    if has_artists:
        if show_legend:
            ax.legend(loc="upper left", fontsize=8, frameon=False)
    else:
        ax.axis("off")
        ax.text(0.5, 0.5, "No valid rows", ha="center", va="center", fontsize=9)

def plot_heatmap(ax, pw_df, title, vmin=0.0, vmax=0.4):
    """Draw a symmetric party-by-party distance heatmap on `ax`.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    pw_df : pandas.DataFrame or None
        Pairwise table with `party_a`, `party_b` and `std_dist` columns.
        Non-numeric distances are dropped.
    title : panel title (also shown on placeholder panels).
    vmin, vmax : colour-scale limits passed to imshow.

    Notes
    -----
    When the table is missing, empty, lacks a required column or has no
    numeric distance, the axis is switched off and a short message is shown.
    Cells for pairs that do not occur in the table, and the diagonal, are 0.
    """
    def _placeholder(message):
        # Blank panel that still carries the title.
        ax.axis("off"); ax.set_title(title, fontsize=11)
        ax.text(0.5, 0.5, message, ha="center", va="center", fontsize=9)

    needed = {"party_a", "party_b", "std_dist"}
    if pw_df is None or len(pw_df) == 0 or not needed.issubset(pw_df.columns):
        _placeholder("No pairwise table for latest valid bin.")
        return

    d = pw_df.copy()
    d["std_dist"] = pd.to_numeric(d["std_dist"], errors="coerce")
    d = d.dropna(subset=["std_dist"])
    if d.empty:
        _placeholder("No numeric distances.")
        return

    parties = sorted(set(d["party_a"]).union(set(d["party_b"])))
    idx = {p: i for i, p in enumerate(parties)}
    M = np.zeros((len(parties), len(parties)), dtype=float)
    # Fill both triangles row by row, in table order, so a later row for the
    # same pair overrides an earlier one exactly as before.
    for a, b, dist in zip(d["party_a"], d["party_b"], d["std_dist"]):
        i, j = idx[a], idx[b]
        M[i, j] = dist
        M[j, i] = dist
    np.fill_diagonal(M, 0.0)

    im = ax.imshow(M, aspect="auto", vmin=vmin, vmax=vmax)
    ax.set_xticks(range(len(parties))); ax.set_yticks(range(len(parties)))
    ax.set_xticklabels(parties, rotation=45, ha="right", fontsize=8)
    ax.set_yticklabels(parties, fontsize=8)
    ax.set_title(title, fontsize=11)
    # Attach the colorbar through the figure that owns `ax` rather than through
    # pyplot's "current figure", which need not be the same object.
    cbar = ax.figure.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    cbar.set_label("std. centroid distance", fontsize=8)
    cbar.ax.tick_params(labelsize=8)

# ---------- One-page PDF ----------
def build_topic_pdf(topic_title, res_meps, res_cap, res_w, pw_meps, pw_cap, pw_w, outpath):
    """Render the one-page report for a topic and save it to `outpath`.

    Layout (4 rows x 3 columns): title/subtitle, three time-series panels
    sharing one y-range, three heatmaps sharing one colour range, footnote.

    Parameters
    ----------
    topic_title : str
        Display name used in the page title and log lines.
    res_meps, res_cap, res_w : pandas.DataFrame
        Result tables for the MEP-averaged, speech-capped and speech-weighted
        modes.
    pw_meps, pw_cap, pw_w : pandas.DataFrame or None
        Matching pairwise-distance tables.
    outpath : str
        Destination passed to ``fig.savefig``.

    Notes
    -----
    Sets the global matplotlib font size to 10. The figure is always closed,
    even when drawing or saving raises, so a failing topic inside a batch
    does not leave an open figure behind; the exception still propagates.
    """
    log(f'  step: build PDF layout for "{topic_title}"')
    ylim = _shared_ylim([res_meps, res_cap, res_w])
    vmin, vmax = _heat_cbar_limits([pw_meps, pw_cap, pw_w])

    plt.rcParams.update({"font.size": 10})
    fig = plt.figure(figsize=(11.69, 8.27))  # A4 landscape dimensions in inches
    try:
        gs  = GridSpec(4, 3, figure=fig,
                       height_ratios=[0.9, 3, 3, 0.6],
                       hspace=0.75, wspace=0.6)

        # Title + subtitle
        ax_title = fig.add_subplot(gs[0, :]); ax_title.axis("off")
        ax_title.text(0.5, 0.78,
                      f"European Parliament Rhetorical Polarization — {topic_title}",
                      ha="center", va="center", fontsize=16, weight="bold")
        ax_title.text(0.5, 0.22,
                      "Line charts: CPC per year (higher = parties sound more distinct); shaded = 95% CI; "
                      "hollow circles at 0% = years failing data thresholds.  "
                      "Heatmaps: who is far/near in the latest valid year (larger = farther, relative to within-party variation).",
                      ha="center", va="center", fontsize=9, color="dimgray")

        # Time series row
        ax_ts1 = fig.add_subplot(gs[1, 0])
        ax_ts2 = fig.add_subplot(gs[1, 1], sharey=ax_ts1)
        ax_ts3 = fig.add_subplot(gs[1, 2], sharey=ax_ts1)
        plot_time_series(ax_ts1, res_meps, "MEP-averaged (speaker-equal)", show_legend=True,  ylim=ylim)
        plot_time_series(ax_ts2, res_cap,  "Speech-capped (per-MEP cap)", show_legend=False, ylim=ylim)
        plot_time_series(ax_ts3, res_w,    "Speech-weighted (as heard)",  show_legend=False, ylim=ylim)
        # Only the left-most panel keeps its y tick labels and axis label.
        for ax in (ax_ts2, ax_ts3):
            plt.setp(ax.get_yticklabels(), visible=False)
            ax.set_ylabel("")

        # Heatmaps row
        ax_hm1 = fig.add_subplot(gs[2, 0])
        ax_hm2 = fig.add_subplot(gs[2, 1])
        ax_hm3 = fig.add_subplot(gs[2, 2])
        plot_heatmap(ax_hm1, pw_meps, "Pairwise — MEP-averaged", vmin, vmax)
        plot_heatmap(ax_hm2, pw_cap,  "Pairwise — Speech-capped", vmin, vmax)
        plot_heatmap(ax_hm3, pw_w,    "Pairwise — Speech-weighted", vmin, vmax)

        # Footnote
        ax_foot = fig.add_subplot(gs[3, :]); ax_foot.axis("off")
        ax_foot.text(
            0.01, 0.55,
            "Notes: CPC = BSS/TSS (Mehlhaff’s group-based polarization). For unweighted modes we use the small-sample adjusted CPC (CPC_adj); "
            "uncertainty via speaker-cluster bootstrap. Distances standardized by within-party spread. "
            "Language/topic de-meaning and one PCA keep years comparable; guardrails can yield low_n (no valid bin).",
            ha="left", va="center", fontsize=8, color="dimgray"
        )

        fig.savefig(outpath, dpi=300, bbox_inches="tight")
    finally:
        # Release the figure whether or not drawing/saving succeeded.
        plt.close(fig)
    log(f"  step: PDF saved → {outpath}")

# ---------- Small summaries ----------
def _summarize_res(df):
    """Summarize a results table for logging.

    Returns a dict with the total number of bins, the number of valid bins,
    the first and latest valid `time_bin`, and the mean and maximum
    `obs_bss_tss` over valid bins. When there is no valid bin the count is 0
    and the remaining fields are None.
    """
    frame = _ensure_time_and_numeric(df)
    valid, _ = _split_ok_low(frame)
    info = {
        "n_bins_total": int(len(frame)),
        "n_bins_ok": 0,
        "first_ok": None,
        "latest_ok": None,
        "eta_mean": None,
        "eta_max": None,
    }
    if valid.empty:
        return info

    info["n_bins_ok"] = int(len(valid))
    info["first_ok"] = pd.to_datetime(valid["time_bin"]).min()
    info["latest_ok"] = pd.to_datetime(valid["time_bin"]).max()
    scores = valid["obs_bss_tss"]
    info["eta_mean"] = float(scores.mean())
    info["eta_max"] = float(scores.max())
    return info

# ---------- Core runner for one topic (VERBOSE) ----------
def run_one_topic(df, topic, out_dir,
                  cap_per_speaker=5,
                  min_speeches_per_party=20,
                  min_unique_speakers_per_party=5,
                  require_parties=3):
    """
    Run the three analysis modes for one topic, save CSVs and the PDF report.

    topic: macro_topic string, or the special label 'ALL' for all speeches.

    For each mode (speaker, speech_capped, speech_weighted) run_macro_series
    is called and its two return values are kept as the results table and the
    pairwise table (which may be None). Results are written to `out_dir` as
    res_*.csv / pw_*.csv and topic_report.pdf.

    Returns a summary dict with the display name, the topic label, the output
    folder, the latest valid time bin per mode (NaT if none) and the PDF path.
    """
    os.makedirs(out_dir, exist_ok=True)
    is_all = str(topic).upper() == "ALL"
    pretty_name = "All speeches" if is_all else topic
    log(f'▶ topic: "{pretty_name}" → {out_dir}')
    t0 = time.time()

    # quick pre-count (skipped when the frame has no macro_topic column)
    if is_all:
        pre_n = len(df)
    elif "macro_topic" in df.columns:
        pre_n = int((df["macro_topic"] == topic).sum())
    else:
        pre_n = None
    if pre_n is not None:
        log(f"  step: filter count → {pre_n} speeches in df for this topic (before guardrails)")

    def _run_mode(banner, **series_kwargs):
        # Run one mode, then log timing, row count and a short summary.
        log(banner)
        started = time.time()
        res, pw = run_macro_series(df, topic, **series_kwargs)
        log(f"    done in {time.time()-started:.1f}s | rows={len(res)} | pw={'ok' if pw is not None else 'none'}")
        info = _summarize_res(res)
        log(f"    bins_ok={info['n_bins_ok']} latest_ok={info['latest_ok']} CPC_max={info['eta_max']}")
        return res, pw

    # Thresholds shared by the two speech-level modes.
    guardrails = dict(
        min_speeches_per_party=min_speeches_per_party,
        min_unique_speakers_per_party=min_unique_speakers_per_party,
        require_parties=require_parties,
    )

    # 1) MEP-averaged
    res_meps, pw_meps = _run_mode(
        "  step: run mode=speaker (MEP-averaged)",
        mode="speaker", keep_min_speakers=0, require_parties=require_parties,
    )
    # 2) Speech-capped
    res_cap, pw_cap = _run_mode(
        "  step: run mode=speech_capped (per-MEP cap)",
        mode="speech_capped", cap_per_speaker=cap_per_speaker, **guardrails,
    )
    # 3) Speech-weighted
    res_w, pw_w = _run_mode(
        "  step: run mode=speech_weighted (as heard)",
        mode="speech_weighted", weight_by="tokens", **guardrails,
    )

    # 4) Save CSVs: all result tables first, then whichever pairwise tables exist
    log("  step: save CSVs")
    by_suffix = (
        ("meps", res_meps, pw_meps),
        ("cap",  res_cap,  pw_cap),
        ("w",    res_w,    pw_w),
    )
    for suffix, res, _ in by_suffix:
        res.to_csv(os.path.join(out_dir, f"res_{suffix}.csv"), index=False)
    for suffix, _, pw in by_suffix:
        if pw is not None:
            pw.to_csv(os.path.join(out_dir, f"pw_{suffix}.csv"), index=False)
    log("    CSVs saved.")

    # 5) PDF
    pdf_path = os.path.join(out_dir, "topic_report.pdf")
    build_topic_pdf(pretty_name, res_meps, res_cap, res_w, pw_meps, pw_cap, pw_w, pdf_path)

    log(f'✔ topic done: "{pretty_name}" in {time.time()-t0:.1f}s | PDF → {pdf_path}\n')

    def _last_ok(res):
        # Latest time_bin among rows flagged ok, or NaT when there is none.
        passed = res[res["ok"] == True]
        if passed.empty:
            return pd.NaT
        return pd.to_datetime(passed["time_bin"]).max()

    return {
        "topic": pretty_name,
        "macro_topic": topic,
        "out_dir": out_dir,
        "latest_ok_meps": _last_ok(res_meps),
        "latest_ok_cap":  _last_ok(res_cap),
        "latest_ok_w":    _last_ok(res_w),
        "pdf": pdf_path
    }

# ---------- Topics summary based on saved CSVs ----------
def _metrics_for_mode(res_df, base_end_year=2018):
    """Condense one mode's results table into headline metrics.

    Only rows with ``ok == True`` are used. Returned keys:

    - base_eta: mean score for years up to and including `base_end_year`
    - peak_eta / peak_year / peak_z: highest score, its year and its `z` value
    - latest_eta / latest_year: score of the most recent valid bin
    - last3_eta: mean score over the last (up to) three valid years
    - uplift_latest / uplift_last3: relative change versus base_eta
      (NaN when the baseline is missing, non-finite or not positive)
    - coverage: share of rows that are valid

    A missing or empty table, or one without valid rows, yields NaN for every
    metric and a coverage of 0.0.
    """
    def _no_data():
        # Fresh dict per call so callers never share a mutable result.
        blank = dict.fromkeys(
            ("base_eta", "peak_eta", "peak_year", "peak_z", "latest_eta",
             "latest_year", "last3_eta", "uplift_latest", "uplift_last3"),
            np.nan,
        )
        blank["coverage"] = 0.0
        return blank

    def _uplift(value, baseline):
        usable = baseline is not None and np.isfinite(baseline) and baseline > 0
        return (value - baseline) / baseline if usable else np.nan

    if res_df is None or res_df.empty:
        return _no_data()

    frame = _ensure_time_and_numeric(res_df)
    frame["year"] = pd.to_datetime(frame["time_bin"], errors="coerce").dt.year
    valid = frame[frame["ok"] == True].copy()
    if valid.empty:
        return _no_data()

    coverage = len(valid) / len(frame)

    baseline_rows = valid[valid["year"] <= base_end_year]
    base_eta = np.nan if baseline_rows.empty else float(baseline_rows["obs_bss_tss"].mean())

    peak_at = valid["obs_bss_tss"].idxmax()
    peak_eta = float(valid.loc[peak_at, "obs_bss_tss"])
    peak_year = int(valid.loc[peak_at, "year"])
    peak_z = float(valid.loc[peak_at, "z"])

    latest_year = int(valid["year"].max())
    newest = valid[valid["year"] == latest_year].sort_values("time_bin").iloc[-1]
    latest_eta = float(newest["obs_bss_tss"])

    # Slicing already copes with fewer than three distinct years.
    recent_years = sorted(valid["year"].dropna().unique())[-3:]
    recent = valid[valid["year"].isin(recent_years)]
    last3_eta = np.nan if recent.empty else float(recent["obs_bss_tss"].mean())

    return {
        "base_eta": base_eta,
        "peak_eta": peak_eta,
        "peak_year": peak_year,
        "peak_z": peak_z,
        "latest_eta": latest_eta,
        "latest_year": latest_year,
        "last3_eta": last3_eta,
        "uplift_latest": _uplift(latest_eta, base_eta),
        "uplift_last3": _uplift(last3_eta, base_eta),
        "coverage": coverage,
    }

def _top_pair_from_pw(pw_df):
    if pw_df is None or pw_df.empty or "std_dist" not in pw_df.columns:
        return (None, np.nan)
    d = pw_df.copy()
    d["std_dist"] = pd.to_numeric(d["std_dist"], errors="coerce")
    d = d.dropna(subset=["std_dist"])
    if d.empty:
        return (None, np.nan)
    row = d.loc[d["std_dist"].idxmax()]
    pair = f"{row['party_a']}–{row['party_b']}"
    return (pair, float(row["std_dist"]))

def build_topics_summary(base_dir=BASE_DIR, topics=None, include_all=True):
    """
    Build a simple topics_summary_v6.csv using the saved res_*.csv and pw_*.csv.

    For every topic (optionally preceded by the 'ALL' slice) the per-mode
    result and pairwise CSVs are read from the topic's folder under
    `base_dir`; a file that cannot be read is treated as missing. Each output
    row holds the topic names followed by the metrics and top pair of the
    MEP-averaged (m_), capped (c_) and weighted (w_) modes. The table is
    written to <base_dir>/topics_summary_v6.csv and returned.
    """
    if topics is None:
        topics = TOPICS

    def _load(folder, filename):
        # Unreadable or absent files simply count as "no data".
        try:
            return pd.read_csv(os.path.join(folder, filename))
        except Exception:
            return None

    metric_keys = (
        "base_eta", "peak_eta", "peak_year", "peak_z", "latest_eta",
        "latest_year", "last3_eta", "uplift_latest", "uplift_last3", "coverage",
    )
    # (column prefix, file suffix) for each mode, in output column order
    modes = (("m", "meps"), ("c", "cap"), ("w", "w"))

    topics_for_summary = (["ALL"] if include_all else []) + list(topics)

    rows = []
    for topic in topics_for_summary:
        is_all = str(topic).upper() == "ALL"
        slug = "_all_speeches" if is_all else _slugify(topic)
        out_dir = os.path.join(base_dir, slug)

        row = {
            "topic": "All speeches" if is_all else topic,
            "macro_topic": topic,
        }
        for prefix, suffix in modes:
            metrics = _metrics_for_mode(_load(out_dir, f"res_{suffix}.csv"))
            pair, dist = _top_pair_from_pw(_load(out_dir, f"pw_{suffix}.csv"))
            for key in metric_keys:
                row[f"{prefix}_{key}"] = metrics[key]
            row[f"{prefix}_top_pair"] = pair
            row[f"{prefix}_top_dist"] = dist
        rows.append(row)

    ts_df = pd.DataFrame(rows)
    out_path = os.path.join(base_dir, "topics_summary_v6.csv")
    ts_df.to_csv(out_path, index=False)
    log(f"Topics summary saved → {out_path}")
    return ts_df

# ---------- Batch all topics (VERBOSE) ----------
def run_all_topics(df, topics=TOPICS, base_dir=BASE_DIR, include_all=True):
    """Run every topic (and optionally the all-speeches slice) from scratch.

    Each topic is processed by run_one_topic inside its own folder under
    `base_dir`. A topic that raises is logged, gets an ERROR.txt in its
    folder and an error row in the summary, and the batch carries on.
    Writes batch_summary.csv, rebuilds topics_summary_v6.csv and returns the
    batch summary as a DataFrame.
    """
    log(f"Batch start — {len(topics)} topics → base folder: {base_dir}")
    summaries = []

    def _attempt(macro, label, out_dir):
        # Run one topic; convert any failure into an error row instead of aborting.
        try:
            return run_one_topic(df, macro, out_dir)
        except Exception as e:
            os.makedirs(out_dir, exist_ok=True)
            msg = f"{type(e).__name__}: {e}"
            log(f"✖ ERROR topic: {macro} → {msg}")
            with open(os.path.join(out_dir, "ERROR.txt"), "w", encoding="utf-8") as f:
                f.write(f"{macro}\n{msg}\n")
            return {
                "topic": label, "macro_topic": macro,
                "out_dir": out_dir, "latest_ok_meps": pd.NaT,
                "latest_ok_cap": pd.NaT, "latest_ok_w": pd.NaT,
                "pdf": os.path.join(out_dir, "topic_report.pdf"), "error": msg
            }

    # ALL-speeches slice first (optional)
    if include_all:
        all_dir = os.path.join(base_dir, "_all_speeches")
        summaries.append(_attempt("ALL", "All speeches", all_dir))

    # Macro topics
    for topic in _progress(topics, "Processing topics"):
        topic_dir = os.path.join(base_dir, _slugify(topic))
        summaries.append(_attempt(topic, topic, topic_dir))

    sm_df = pd.DataFrame(summaries)
    out_summary = os.path.join(base_dir, "batch_summary.csv")
    sm_df.to_csv(out_summary, index=False)
    log(f"Batch finished → {out_summary}")

    # Build topics_summary_v6.csv
    build_topics_summary(base_dir=base_dir, topics=topics, include_all=include_all)

    return sm_df



In [5]:
import os
import pandas as pd

# ---------- helper: does this topic look complete? ----------
def _topic_is_complete(out_dir):
    """
    A topic is considered 'done' if the three result CSVs and the PDF exist
    and are non-empty. If any are missing or zero-byte, we will re-run it.
    """
    required_files = [
        os.path.join(out_dir, "res_meps.csv"),
        os.path.join(out_dir, "res_cap.csv"),
        os.path.join(out_dir, "res_w.csv"),
        os.path.join(out_dir, "topic_report.pdf"),
    ]
    for f in required_files:
        if not os.path.exists(f):
            return False
        try:
            if os.path.getsize(f) <= 0:
                return False
        except OSError:
            return False
    return True

def _last_ok_from_res(path):
    """
    Read a res_*.csv and return the latest time_bin with ok==True, or NaT.
    Used only to populate the summary_df row on resume.
    """
    try:
        df = pd.read_csv(path)
    except Exception:
        return pd.NaT
    if "ok" not in df.columns or "time_bin" not in df.columns:
        return pd.NaT
    v = df[df["ok"] == True].copy()
    if v.empty:
        return pd.NaT
    v["time_bin"] = pd.to_datetime(v["time_bin"], errors="coerce")
    return v["time_bin"].max()

# ---------- NEW: resume-aware batch runner ----------
def run_all_topics_resume(df, topics=TOPICS, base_dir=BASE_DIR, include_all=True):
    """
    Resume-aware version of run_all_topics:
    - SKIPS topics (and ALL-speeches) whose outputs already exist
      (as judged by _topic_is_complete); their summary rows are rebuilt
      from the saved res_*.csv files.
    - Re-runs only incomplete topics; a failure is logged, written to
      ERROR.txt and recorded as an error row, and the batch continues.
    - Writes batch_summary_resume.csv and rebuilds topics_summary_v6.csv
      at the end.

    Returns the batch summary as a DataFrame.
    """
    log(f"Resume batch start — {len(topics)} topics → base folder: {base_dir}")
    summaries = []

    def _skip_or_run(macro, label, out_dir):
        # Produce the summary row for one topic, reusing saved outputs when possible.
        pdf_path = os.path.join(out_dir, "topic_report.pdf")

        if _topic_is_complete(out_dir):
            log(f'▶ SKIP "{label}" — existing outputs found in {out_dir}')
            return {
                "topic": label,
                "macro_topic": macro,
                "out_dir": out_dir,
                "latest_ok_meps": _last_ok_from_res(os.path.join(out_dir, "res_meps.csv")),
                "latest_ok_cap":  _last_ok_from_res(os.path.join(out_dir, "res_cap.csv")),
                "latest_ok_w":    _last_ok_from_res(os.path.join(out_dir, "res_w.csv")),
                "pdf": pdf_path
            }

        log(f'▶ RESUME "{label}" — outputs missing/incomplete, re-running')
        try:
            os.makedirs(out_dir, exist_ok=True)
            return run_one_topic(df, macro, out_dir)
        except Exception as e:
            msg = f"{type(e).__name__}: {e}"
            log(f"✖ ERROR topic: {macro} → {msg}")
            with open(os.path.join(out_dir, "ERROR.txt"), "w", encoding="utf-8") as f:
                f.write(f"{macro}\n{msg}\n")
            return {
                "topic": label, "macro_topic": macro,
                "out_dir": out_dir, "latest_ok_meps": pd.NaT,
                "latest_ok_cap": pd.NaT, "latest_ok_w": pd.NaT,
                "pdf": pdf_path, "error": msg
            }

    # ---- ALL-speeches slice first (optional) ----
    if include_all:
        all_dir = os.path.join(base_dir, "_all_speeches")
        summaries.append(_skip_or_run("ALL", "All speeches", all_dir))

    # ---- Macro topics ----
    for topic in _progress(topics, "Processing topics (resume)"):
        topic_dir = os.path.join(base_dir, _slugify(topic))
        summaries.append(_skip_or_run(topic, topic, topic_dir))

    sm_df = pd.DataFrame(summaries)
    out_summary = os.path.join(base_dir, "batch_summary_resume.csv")
    sm_df.to_csv(out_summary, index=False)
    log(f"Resume batch finished → {out_summary}")

    # Rebuild topics_summary_v6.csv from whatever is now in the base folder
    build_topics_summary(base_dir=base_dir, topics=topics, include_all=include_all)

    return sm_df




In [6]:
# ---------- CALL THIS instead of run_all_topics(df) ----------
# Topics whose three result CSVs and PDF already exist are skipped; only the
# remaining topics are recomputed, then the summary CSVs are rewritten.
summary_df = run_all_topics_resume(df)

[11:27:59] Resume batch start — 30 topics → base folder: reports_v6
[11:27:59] ▶ SKIP "All speeches" — existing outputs found in reports_v6/_all_speeches


Processing topics (resume):   0%|                                            | 0/30 [00:00<?, ?it/s]

[11:27:59] ▶ SKIP "Agriculture & fisheries" — existing outputs found in reports_v6/agriculture-and-fisheries
[11:27:59] ▶ SKIP "Climate, environment & biodiversity" — existing outputs found in reports_v6/climate-environment-and-biodiversity
[11:27:59] ▶ SKIP "Development & humanitarian aid" — existing outputs found in reports_v6/development-and-humanitarian-aid
[11:27:59] ▶ SKIP "Digital policy & data protection" — existing outputs found in reports_v6/digital-policy-and-data-protection
[11:27:59] ▶ SKIP "EU budget & MFF" — existing outputs found in reports_v6/eu-budget-and-mff
[11:27:59] ▶ SKIP "Economy & industrial policy" — existing outputs found in reports_v6/economy-and-industrial-policy
[11:27:59] ▶ SKIP "Education, culture & sport" — existing outputs found in reports_v6/education-culture-and-sport
[11:27:59] ▶ SKIP "Energy & energy security" — existing outputs found in reports_v6/energy-and-energy-security
[11:27:59] ▶ SKIP "Enlargement & neighbourhood policy" — existing outputs 

Processing topics (resume):  67%|██████████████████████▋           | 20/30 [00:00<00:00, 195.42it/s]

[11:27:59] ▶ SKIP "Procedural & Parliamentary business" — existing outputs found in reports_v6/procedural-and-parliamentary-business
[11:27:59] ▶ SKIP "Research, innovation & space" — existing outputs found in reports_v6/research-innovation-and-space
[11:27:59] ▶ SKIP "Rule of law & fundamental rights" — existing outputs found in reports_v6/rule-of-law-and-fundamental-rights
[11:27:59] ▶ SKIP "Security & defence" — existing outputs found in reports_v6/security-and-defence
[11:27:59] ▶ SKIP "Security & policing" — existing outputs found in reports_v6/security-and-policing
[11:27:59] ▶ SKIP "Single market, competition & consumer protection" — existing outputs found in reports_v6/single-market-competition-and-consumer-protection
[11:27:59] ▶ RESUME "Social policy & employment" — outputs missing/incomplete, re-running
[11:27:59] ▶ topic: "Social policy & employment" → reports_v6/social-policy-and-employment
[11:27:59]   step: filter count → 7525 speeches in df for this topic (before guardr

Batches:   0%|          | 0/167 [00:00<?, ?it/s]

Processing topics (resume):  67%|██████████████████████▋           | 20/30 [00:20<00:00, 195.42it/s]

[11:33:45]     done in 345.9s | rows=10 | pw=ok
[11:33:45]     bins_ok=3 latest_ok=2023-01-01 00:00:00 CPC_max=0.04620823204813713
[11:33:45]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/167 [00:00<?, ?it/s]

[11:39:05]     done in 319.5s | rows=10 | pw=ok
[11:39:05]     bins_ok=10 latest_ok=2024-01-01 00:00:00 CPC_max=0.041418160839152514
[11:39:05]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/167 [00:00<?, ?it/s]

[11:45:25]     done in 380.4s | rows=10 | pw=ok
[11:45:25]     bins_ok=10 latest_ok=2024-01-01 00:00:00 CPC_max=0.04703684753298065
[11:45:25]   step: save CSVs
[11:45:25]     CSVs saved.
[11:45:25]   step: build PDF layout for "Social policy & employment"
[11:45:27]   step: PDF saved → reports_v6/social-policy-and-employment/topic_report.pdf
[11:45:27] ✔ topic done: "Social policy & employment" in 1047.6s | PDF → reports_v6/social-policy-and-employment/topic_report.pdf



Processing topics (resume):  90%|███████████████████████████████▌   | 27/30 [17:27<02:29, 49.89s/it]

[11:45:27] ▶ RESUME "Taxation & anti-money laundering" — outputs missing/incomplete, re-running
[11:45:27] ▶ topic: "Taxation & anti-money laundering" → reports_v6/taxation-and-anti-money-laundering
[11:45:27]   step: filter count → 0 speeches in df for this topic (before guardrails)
[11:45:27]   step: run mode=speaker (MEP-averaged)
[11:45:27]     done in 0.0s | rows=0 | pw=none
[11:45:27]     bins_ok=0 latest_ok=None CPC_max=None
[11:45:27]   step: run mode=speech_capped (per-MEP cap)
[11:45:27]     done in 0.0s | rows=0 | pw=none
[11:45:27]     bins_ok=0 latest_ok=None CPC_max=None
[11:45:27]   step: run mode=speech_weighted (as heard)
[11:45:27]     done in 0.0s | rows=0 | pw=none
[11:45:27]     bins_ok=0 latest_ok=None CPC_max=None
[11:45:27]   step: save CSVs
[11:45:27]     CSVs saved.
[11:45:27]   step: build PDF layout for "Taxation & anti-money laundering"
[11:45:27]   step: PDF saved → reports_v6/taxation-and-anti-money-laundering/topic_report.pdf
[11:45:27] ✔ topic done: "Ta

Processing topics (resume):  93%|████████████████████████████████▋  | 28/30 [17:27<01:33, 46.73s/it]

[11:45:27] ▶ RESUME "Trade & globalization" — outputs missing/incomplete, re-running
[11:45:27] ▶ topic: "Trade & globalization" → reports_v6/trade-and-globalization
[11:45:27]   step: filter count → 5979 speeches in df for this topic (before guardrails)
[11:45:27]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/130 [00:00<?, ?it/s]

Processing topics (resume):  93%|████████████████████████████████▋  | 28/30 [17:41<01:33, 46.73s/it]

[11:50:34]     done in 307.3s | rows=11 | pw=none
[11:50:34]     bins_ok=0 latest_ok=None CPC_max=None
[11:50:34]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/130 [00:00<?, ?it/s]

[11:55:44]     done in 310.0s | rows=11 | pw=ok
[11:55:44]     bins_ok=10 latest_ok=2025-01-01 00:00:00 CPC_max=0.038201171587525735
[11:55:44]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/130 [00:00<?, ?it/s]

[12:00:32]     done in 287.7s | rows=11 | pw=ok
[12:00:32]     bins_ok=10 latest_ok=2025-01-01 00:00:00 CPC_max=0.041836972641064424
[12:00:32]   step: save CSVs
[12:00:32]     CSVs saved.
[12:00:32]   step: build PDF layout for "Trade & globalization"
[12:00:32]   step: PDF saved → reports_v6/trade-and-globalization/topic_report.pdf
[12:00:32] ✔ topic done: "Trade & globalization" in 905.6s | PDF → reports_v6/trade-and-globalization/topic_report.pdf



Processing topics (resume):  97%|████████████████████████████████▊ | 29/30 [32:33<01:58, 118.36s/it]

[12:00:32] ▶ RESUME "Transport & mobility" — outputs missing/incomplete, re-running
[12:00:32] ▶ topic: "Transport & mobility" → reports_v6/transport-and-mobility
[12:00:32]   step: filter count → 2769 speeches in df for this topic (before guardrails)
[12:00:32]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[12:05:33]     done in 300.4s | rows=10 | pw=none
[12:05:33]     bins_ok=0 latest_ok=None CPC_max=None
[12:05:33]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[12:10:21]     done in 288.2s | rows=10 | pw=ok
[12:10:21]     bins_ok=5 latest_ok=2019-01-01 00:00:00 CPC_max=0.022026286748800365
[12:10:21]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[12:13:41]     done in 199.4s | rows=10 | pw=ok
[12:13:41]     bins_ok=5 latest_ok=2019-01-01 00:00:00 CPC_max=0.05495741967304367
[12:13:41]   step: save CSVs
[12:13:41]     CSVs saved.
[12:13:41]   step: build PDF layout for "Transport & mobility"
[12:13:41]   step: PDF saved → reports_v6/transport-and-mobility/topic_report.pdf
[12:13:41] ✔ topic done: "Transport & mobility" in 788.8s | PDF → reports_v6/transport-and-mobility/topic_report.pdf



Processing topics (resume): 100%|███████████████████████████████████| 30/30 [45:42<00:00, 91.41s/it]

[12:13:41] Resume batch finished → reports_v6/batch_summary_resume.csv





[12:13:42] Topics summary saved → reports_v6/topics_summary_v6.csv


In [5]:
# Run the full per-topic batch on the loaded speeches frame `df` and keep the
# result in `summary_df` (presumably a summary table; `run_all_topics` is
# defined elsewhere in this notebook — confirm its return type there).
# Per the captured log of this run: 30 topics are written under reports_v6/,
# each processed in three modes (speaker, speech_capped, speech_weighted) and
# followed by CSV and PDF output. This is a long-running cell: the
# "All speeches" pass alone logged 9542.4s.
# NOTE(review): that pass logged CPC_max=-0.2234 for mode=speech_capped, while
# every other CPC_max visible in this log is positive or None — verify whether
# a negative maximum is expected before relying on that series.
summary_df = run_all_topics(df)


[20:19:44] Batch start — 30 topics → base folder: reports_v6
[20:19:44] ▶ topic: "All speeches" → reports_v6/_all_speeches
[20:19:44]   step: filter count → 163079 speeches in df for this topic (before guardrails)
[20:19:44]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/3344 [00:00<?, ?it/s]

[21:09:10]     done in 2965.4s | rows=11 | pw=ok
[21:09:10]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.04029521102753136
[21:09:10]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/3344 [00:00<?, ?it/s]

[22:19:11]     done in 4201.5s | rows=11 | pw=ok
[22:19:11]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=-0.22342949501500553
[22:19:11]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/3344 [00:00<?, ?it/s]

[22:58:46]     done in 2374.6s | rows=11 | pw=ok
[22:58:46]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.010345831829924676
[22:58:46]   step: save CSVs
[22:58:46]     CSVs saved.
[22:58:46]   step: build PDF layout for "All speeches"
[22:58:47]   step: PDF saved → reports_v6/_all_speeches/topic_report.pdf
[22:58:47] ✔ topic done: "All speeches" in 9542.4s | PDF → reports_v6/_all_speeches/topic_report.pdf



Processing topics:   0%|                                                     | 0/30 [00:00<?, ?it/s]

[22:58:47] ▶ topic: "Agriculture & fisheries" → reports_v6/agriculture-and-fisheries
[22:58:47]   step: filter count → 5505 speeches in df for this topic (before guardrails)
[22:58:47]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/121 [00:00<?, ?it/s]

[23:00:18]     done in 91.2s | rows=11 | pw=ok
[23:00:18]     bins_ok=2 latest_ok=2024-01-01 00:00:00 CPC_max=0.053718635337507536
[23:00:18]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/121 [00:00<?, ?it/s]

[23:02:10]     done in 111.8s | rows=11 | pw=ok
[23:02:10]     bins_ok=10 latest_ok=2024-01-01 00:00:00 CPC_max=0.040072851550602966
[23:02:10]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/121 [00:00<?, ?it/s]

[23:04:01]     done in 111.5s | rows=11 | pw=ok
[23:04:01]     bins_ok=10 latest_ok=2024-01-01 00:00:00 CPC_max=0.04018157745738253
[23:04:01]   step: save CSVs
[23:04:01]     CSVs saved.
[23:04:01]   step: build PDF layout for "Agriculture & fisheries"
[23:04:02]   step: PDF saved → reports_v6/agriculture-and-fisheries/topic_report.pdf
[23:04:02] ✔ topic done: "Agriculture & fisheries" in 315.1s | PDF → reports_v6/agriculture-and-fisheries/topic_report.pdf



Processing topics:   3%|█▍                                        | 1/30 [05:15<2:32:16, 315.06s/it]

[23:04:02] ▶ topic: "Climate, environment & biodiversity" → reports_v6/climate-environment-and-biodiversity
[23:04:02]   step: filter count → 9654 speeches in df for this topic (before guardrails)
[23:04:02]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/215 [00:00<?, ?it/s]

[23:06:37]     done in 155.4s | rows=11 | pw=ok
[23:06:37]     bins_ok=3 latest_ok=2023-01-01 00:00:00 CPC_max=0.03872135353547916
[23:06:37]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/215 [00:00<?, ?it/s]

[23:09:43]     done in 185.5s | rows=11 | pw=ok
[23:09:43]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.0348974593595183
[23:09:43]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/215 [00:00<?, ?it/s]

[23:12:48]     done in 184.8s | rows=11 | pw=ok
[23:12:48]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.03463035592939664
[23:12:48]   step: save CSVs
[23:12:48]     CSVs saved.
[23:12:48]   step: build PDF layout for "Climate, environment & biodiversity"
[23:12:48]   step: PDF saved → reports_v6/climate-environment-and-biodiversity/topic_report.pdf
[23:12:48] ✔ topic done: "Climate, environment & biodiversity" in 526.2s | PDF → reports_v6/climate-environment-and-biodiversity/topic_report.pdf



Processing topics:   7%|██▊                                       | 2/30 [14:01<3:24:58, 439.25s/it]

[23:12:48] ▶ topic: "Development & humanitarian aid" → reports_v6/development-and-humanitarian-aid
[23:12:48]   step: filter count → 3771 speeches in df for this topic (before guardrails)
[23:12:48]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/81 [00:00<?, ?it/s]

[23:13:56]     done in 68.0s | rows=11 | pw=ok
[23:13:56]     bins_ok=1 latest_ok=2023-01-01 00:00:00 CPC_max=0.04871206791435968
[23:13:56]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/81 [00:00<?, ?it/s]

[23:15:13]     done in 77.4s | rows=11 | pw=ok
[23:15:13]     bins_ok=8 latest_ok=2023-01-01 00:00:00 CPC_max=0.04120650835584948
[23:15:13]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/81 [00:00<?, ?it/s]

[23:16:32]     done in 78.5s | rows=11 | pw=ok
[23:16:32]     bins_ok=8 latest_ok=2023-01-01 00:00:00 CPC_max=0.04749766052454758
[23:16:32]   step: save CSVs
[23:16:32]     CSVs saved.
[23:16:32]   step: build PDF layout for "Development & humanitarian aid"
[23:16:33]   step: PDF saved → reports_v6/development-and-humanitarian-aid/topic_report.pdf
[23:16:33] ✔ topic done: "Development & humanitarian aid" in 224.4s | PDF → reports_v6/development-and-humanitarian-aid/topic_report.pdf



Processing topics:  10%|████▏                                     | 3/30 [17:45<2:33:31, 341.15s/it]

[23:16:33] ▶ topic: "Digital policy & data protection" → reports_v6/digital-policy-and-data-protection
[23:16:33]   step: filter count → 4735 speeches in df for this topic (before guardrails)
[23:16:33]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

[23:17:54]     done in 81.5s | rows=11 | pw=ok
[23:17:54]     bins_ok=2 latest_ok=2025-01-01 00:00:00 CPC_max=0.03520844333003744
[23:17:54]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

[23:19:29]     done in 95.2s | rows=11 | pw=ok
[23:19:29]     bins_ok=8 latest_ok=2025-01-01 00:00:00 CPC_max=0.021086207013183644
[23:19:29]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

[23:21:07]     done in 98.1s | rows=11 | pw=ok
[23:21:07]     bins_ok=8 latest_ok=2025-01-01 00:00:00 CPC_max=0.10350981499545676
[23:21:07]   step: save CSVs
[23:21:07]     CSVs saved.
[23:21:07]   step: build PDF layout for "Digital policy & data protection"
[23:21:08]   step: PDF saved → reports_v6/digital-policy-and-data-protection/topic_report.pdf
[23:21:08] ✔ topic done: "Digital policy & data protection" in 275.4s | PDF → reports_v6/digital-policy-and-data-protection/topic_report.pdf



Processing topics:  13%|█████▌                                    | 4/30 [22:21<2:16:35, 315.21s/it]

[23:21:08] ▶ topic: "EU budget & MFF" → reports_v6/eu-budget-and-mff
[23:21:08]   step: filter count → 6868 speeches in df for this topic (before guardrails)
[23:21:08]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/143 [00:00<?, ?it/s]

[23:22:58]     done in 110.1s | rows=11 | pw=ok
[23:22:58]     bins_ok=2 latest_ok=2022-01-01 00:00:00 CPC_max=0.041946279600748634
[23:22:58]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/143 [00:00<?, ?it/s]

[23:25:06]     done in 128.0s | rows=11 | pw=ok
[23:25:06]     bins_ok=10 latest_ok=2025-01-01 00:00:00 CPC_max=0.029884687025901153
[23:25:06]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/143 [00:00<?, ?it/s]

[23:27:15]     done in 129.2s | rows=11 | pw=ok
[23:27:15]     bins_ok=10 latest_ok=2025-01-01 00:00:00 CPC_max=0.03443085650113968
[23:27:15]   step: save CSVs
[23:27:15]     CSVs saved.
[23:27:15]   step: build PDF layout for "EU budget & MFF"
[23:27:16]   step: PDF saved → reports_v6/eu-budget-and-mff/topic_report.pdf
[23:27:16] ✔ topic done: "EU budget & MFF" in 368.0s | PDF → reports_v6/eu-budget-and-mff/topic_report.pdf



Processing topics:  17%|███████                                   | 5/30 [28:29<2:19:15, 334.24s/it]

[23:27:16] ▶ topic: "Economy & industrial policy" → reports_v6/economy-and-industrial-policy
[23:27:16]   step: filter count → 4939 speeches in df for this topic (before guardrails)
[23:27:16]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/107 [00:00<?, ?it/s]

[23:28:40]     done in 84.0s | rows=11 | pw=ok
[23:28:40]     bins_ok=1 latest_ok=2023-01-01 00:00:00 CPC_max=0.038428952077089594
[23:28:40]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/107 [00:00<?, ?it/s]

[23:30:20]     done in 99.8s | rows=11 | pw=ok
[23:30:20]     bins_ok=8 latest_ok=2024-01-01 00:00:00 CPC_max=0.03363367484027276
[23:30:20]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/107 [00:00<?, ?it/s]

[23:31:59]     done in 99.1s | rows=11 | pw=ok
[23:31:59]     bins_ok=8 latest_ok=2024-01-01 00:00:00 CPC_max=0.044274473127485614
[23:31:59]   step: save CSVs
[23:31:59]     CSVs saved.
[23:31:59]   step: build PDF layout for "Economy & industrial policy"
[23:32:00]   step: PDF saved → reports_v6/economy-and-industrial-policy/topic_report.pdf
[23:32:00] ✔ topic done: "Economy & industrial policy" in 283.9s | PDF → reports_v6/economy-and-industrial-policy/topic_report.pdf



Processing topics:  20%|████████▍                                 | 6/30 [33:12<2:06:50, 317.12s/it]

[23:32:00] ▶ topic: "Education, culture & sport" → reports_v6/education-culture-and-sport
[23:32:00]   step: filter count → 2705 speeches in df for this topic (before guardrails)
[23:32:00]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/58 [00:00<?, ?it/s]

[23:32:47]     done in 47.3s | rows=11 | pw=none
[23:32:47]     bins_ok=0 latest_ok=None CPC_max=None
[23:32:47]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/58 [00:00<?, ?it/s]

[23:33:40]     done in 52.4s | rows=11 | pw=ok
[23:33:40]     bins_ok=5 latest_ok=2021-01-01 00:00:00 CPC_max=0.02498099652400391
[23:33:40]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/58 [00:00<?, ?it/s]

[23:34:34]     done in 54.9s | rows=11 | pw=ok
[23:34:34]     bins_ok=6 latest_ok=2021-01-01 00:00:00 CPC_max=0.04381369997675613
[23:34:34]   step: save CSVs
[23:34:34]     CSVs saved.
[23:34:34]   step: build PDF layout for "Education, culture & sport"
[23:34:35]   step: PDF saved → reports_v6/education-culture-and-sport/topic_report.pdf
[23:34:35] ✔ topic done: "Education, culture & sport" in 155.0s | PDF → reports_v6/education-culture-and-sport/topic_report.pdf



Processing topics:  23%|█████████▊                                | 7/30 [35:47<1:41:14, 264.11s/it]

[23:34:35] ▶ topic: "Energy & energy security" → reports_v6/energy-and-energy-security
[23:34:35]   step: filter count → 4493 speeches in df for this topic (before guardrails)
[23:34:35]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

[23:35:58]     done in 82.8s | rows=11 | pw=ok
[23:35:58]     bins_ok=2 latest_ok=2023-01-01 00:00:00 CPC_max=0.04270453002888198
[23:35:58]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

[23:37:30]     done in 92.6s | rows=11 | pw=ok
[23:37:30]     bins_ok=8 latest_ok=2025-01-01 00:00:00 CPC_max=0.035881773470076914
[23:37:30]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

[23:39:02]     done in 91.7s | rows=11 | pw=ok
[23:39:02]     bins_ok=8 latest_ok=2025-01-01 00:00:00 CPC_max=0.04856699910393851
[23:39:02]   step: save CSVs
[23:39:02]     CSVs saved.
[23:39:02]   step: build PDF layout for "Energy & energy security"
[23:39:03]   step: PDF saved → reports_v6/energy-and-energy-security/topic_report.pdf
[23:39:03] ✔ topic done: "Energy & energy security" in 267.8s | PDF → reports_v6/energy-and-energy-security/topic_report.pdf



Processing topics:  27%|███████████▏                              | 8/30 [40:15<1:37:16, 265.29s/it]

[23:39:03] ▶ topic: "Enlargement & neighbourhood policy" → reports_v6/enlargement-and-neighbourhood-policy
[23:39:03]   step: filter count → 2844 speeches in df for this topic (before guardrails)
[23:39:03]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/61 [00:00<?, ?it/s]

[23:39:53]     done in 50.4s | rows=11 | pw=none
[23:39:53]     bins_ok=0 latest_ok=None CPC_max=None
[23:39:53]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/61 [00:00<?, ?it/s]

[23:40:46]     done in 53.0s | rows=11 | pw=ok
[23:40:46]     bins_ok=5 latest_ok=2023-01-01 00:00:00 CPC_max=0.029007621302950263
[23:40:46]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/61 [00:00<?, ?it/s]

[23:41:40]     done in 53.8s | rows=11 | pw=ok
[23:41:40]     bins_ok=5 latest_ok=2023-01-01 00:00:00 CPC_max=0.04204899925647839
[23:41:40]   step: save CSVs
[23:41:40]     CSVs saved.
[23:41:40]   step: build PDF layout for "Enlargement & neighbourhood policy"
[23:41:40]   step: PDF saved → reports_v6/enlargement-and-neighbourhood-policy/topic_report.pdf
[23:41:40] ✔ topic done: "Enlargement & neighbourhood policy" in 157.6s | PDF → reports_v6/enlargement-and-neighbourhood-policy/topic_report.pdf



Processing topics:  30%|████████████▌                             | 9/30 [42:53<1:21:03, 231.61s/it]

[23:41:40] ▶ topic: "Foreign policy — Americas" → reports_v6/foreign-policy-americas
[23:41:40]   step: filter count → 2655 speeches in df for this topic (before guardrails)
[23:41:40]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

[23:42:31]     done in 51.3s | rows=11 | pw=ok
[23:42:31]     bins_ok=1 latest_ok=2021-01-01 00:00:00 CPC_max=0.06092986764646691
[23:42:31]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

[23:43:26]     done in 54.3s | rows=11 | pw=ok
[23:43:26]     bins_ok=7 latest_ok=2025-01-01 00:00:00 CPC_max=0.040164930512165374
[23:43:26]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/63 [00:00<?, ?it/s]

[23:44:19]     done in 53.2s | rows=11 | pw=ok
[23:44:19]     bins_ok=7 latest_ok=2025-01-01 00:00:00 CPC_max=0.061193472746280765
[23:44:19]   step: save CSVs
[23:44:19]     CSVs saved.
[23:44:19]   step: build PDF layout for "Foreign policy — Americas"
[23:44:19]   step: PDF saved → reports_v6/foreign-policy-americas/topic_report.pdf
[23:44:19] ✔ topic done: "Foreign policy — Americas" in 159.2s | PDF → reports_v6/foreign-policy-americas/topic_report.pdf



Processing topics:  33%|█████████████▋                           | 10/30 [45:32<1:09:45, 209.27s/it]

[23:44:19] ▶ topic: "Foreign policy — Asia-Pacific" → reports_v6/foreign-policy-asia-pacific
[23:44:19]   step: filter count → 956 speeches in df for this topic (before guardrails)
[23:44:19]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

[23:44:44]     done in 24.3s | rows=10 | pw=none
[23:44:44]     bins_ok=0 latest_ok=None CPC_max=None
[23:44:44]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

[23:45:04]     done in 20.1s | rows=10 | pw=ok
[23:45:04]     bins_ok=1 latest_ok=2022-01-01 00:00:00 CPC_max=0.030443242275798296
[23:45:04]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/20 [00:00<?, ?it/s]

[23:45:23]     done in 19.5s | rows=10 | pw=ok
[23:45:23]     bins_ok=1 latest_ok=2022-01-01 00:00:00 CPC_max=0.0332040882552528
[23:45:23]   step: save CSVs
[23:45:23]     CSVs saved.
[23:45:23]   step: build PDF layout for "Foreign policy — Asia-Pacific"
[23:45:24]   step: PDF saved → reports_v6/foreign-policy-asia-pacific/topic_report.pdf
[23:45:24] ✔ topic done: "Foreign policy — Asia-Pacific" in 64.3s | PDF → reports_v6/foreign-policy-asia-pacific/topic_report.pdf



Processing topics:  37%|███████████████▊                           | 11/30 [46:36<52:13, 164.91s/it]

[23:45:24] ▶ topic: "Foreign policy — Europe & Eastern Neighbourhood" → reports_v6/foreign-policy-europe-and-eastern-neighbourhood
[23:45:24]   step: filter count → 6807 speeches in df for this topic (before guardrails)
[23:45:24]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

[23:47:20]     done in 116.2s | rows=11 | pw=ok
[23:47:20]     bins_ok=3 latest_ok=2023-01-01 00:00:00 CPC_max=0.046839587564849544
[23:47:20]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

[23:49:35]     done in 135.4s | rows=11 | pw=ok
[23:49:35]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.024967707789997212
[23:49:35]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

[23:51:54]     done in 138.4s | rows=11 | pw=ok
[23:51:54]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.12567029581601577
[23:51:54]   step: save CSVs
[23:51:54]     CSVs saved.
[23:51:54]   step: build PDF layout for "Foreign policy — Europe & Eastern Neighbourhood"
[23:51:55]   step: PDF saved → reports_v6/foreign-policy-europe-and-eastern-neighbourhood/topic_report.pdf
[23:51:55] ✔ topic done: "Foreign policy — Europe & Eastern Neighbourhood" in 390.8s | PDF → reports_v6/foreign-policy-europe-and-eastern-neighbourhood/topic_report.pdf



Processing topics:  40%|████████████████▍                        | 12/30 [53:07<1:10:05, 233.63s/it]

[23:51:55] ▶ topic: "Foreign policy — Middle East & North Africa" → reports_v6/foreign-policy-middle-east-and-north-africa
[23:51:55]   step: filter count → 4195 speeches in df for this topic (before guardrails)
[23:51:55]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/102 [00:00<?, ?it/s]

[23:53:15]     done in 80.6s | rows=11 | pw=none
[23:53:15]     bins_ok=0 latest_ok=None CPC_max=None
[23:53:15]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/102 [00:00<?, ?it/s]

[23:54:47]     done in 91.7s | rows=11 | pw=ok
[23:54:47]     bins_ok=9 latest_ok=2024-01-01 00:00:00 CPC_max=0.06760599637545363
[23:54:47]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/102 [00:00<?, ?it/s]

[23:56:19]     done in 92.1s | rows=11 | pw=ok
[23:56:19]     bins_ok=9 latest_ok=2024-01-01 00:00:00 CPC_max=0.07041668827710333
[23:56:19]   step: save CSVs
[23:56:19]     CSVs saved.
[23:56:19]   step: build PDF layout for "Foreign policy — Middle East & North Africa"
[23:56:20]   step: PDF saved → reports_v6/foreign-policy-middle-east-and-north-africa/topic_report.pdf
[23:56:20] ✔ topic done: "Foreign policy — Middle East & North Africa" in 265.0s | PDF → reports_v6/foreign-policy-middle-east-and-north-africa/topic_report.pdf



Processing topics:  43%|█████████████████▊                       | 13/30 [57:32<1:08:53, 243.13s/it]

[23:56:20] ▶ topic: "Foreign policy — Sub-Saharan Africa" → reports_v6/foreign-policy-sub-saharan-africa
[23:56:20]   step: filter count → 1027 speeches in df for this topic (before guardrails)
[23:56:20]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

[23:56:43]     done in 23.7s | rows=10 | pw=none
[23:56:43]     bins_ok=0 latest_ok=None CPC_max=None
[23:56:43]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

[23:57:02]     done in 19.1s | rows=10 | pw=none
[23:57:02]     bins_ok=0 latest_ok=None CPC_max=None
[23:57:02]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

[23:57:21]     done in 18.4s | rows=10 | pw=none
[23:57:21]     bins_ok=0 latest_ok=None CPC_max=None
[23:57:21]   step: save CSVs
[23:57:21]     CSVs saved.
[23:57:21]   step: build PDF layout for "Foreign policy — Sub-Saharan Africa"
[23:57:21]   step: PDF saved → reports_v6/foreign-policy-sub-saharan-africa/topic_report.pdf
[23:57:21] ✔ topic done: "Foreign policy — Sub-Saharan Africa" in 61.5s | PDF → reports_v6/foreign-policy-sub-saharan-africa/topic_report.pdf



Processing topics:  47%|████████████████████                       | 14/30 [58:34<50:12, 188.27s/it]

[23:57:21] ▶ topic: "Health" → reports_v6/health
[23:57:21]   step: filter count → 4529 speeches in df for this topic (before guardrails)
[23:57:21]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

[23:58:45]     done in 83.6s | rows=11 | pw=ok
[23:58:45]     bins_ok=3 latest_ok=2023-01-01 00:00:00 CPC_max=0.059408918488772985
[23:58:45]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

[00:00:21]     done in 96.1s | rows=11 | pw=ok
[00:00:21]     bins_ok=9 latest_ok=2023-01-01 00:00:00 CPC_max=0.04236106398103326
[00:00:21]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/103 [00:00<?, ?it/s]

[00:01:57]     done in 96.7s | rows=11 | pw=ok
[00:01:57]     bins_ok=9 latest_ok=2023-01-01 00:00:00 CPC_max=0.04382748506354786
[00:01:57]   step: save CSVs
[00:01:57]     CSVs saved.
[00:01:57]   step: build PDF layout for "Health"
[00:01:58]   step: PDF saved → reports_v6/health/topic_report.pdf
[00:01:58] ✔ topic done: "Health" in 276.9s | PDF → reports_v6/health/topic_report.pdf



Processing topics:  50%|████████████████████▌                    | 15/30 [1:03:11<53:44, 215.00s/it]

[00:01:58] ▶ topic: "Institutional affairs & governance" → reports_v6/institutional-affairs-and-governance
[00:01:58]   step: filter count → 5378 speeches in df for this topic (before guardrails)
[00:01:58]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/116 [00:00<?, ?it/s]

[00:03:31]     done in 92.8s | rows=11 | pw=ok
[00:03:31]     bins_ok=3 latest_ok=2023-01-01 00:00:00 CPC_max=0.041146661006434666
[00:03:31]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/116 [00:00<?, ?it/s]

[00:05:21]     done in 110.2s | rows=11 | pw=ok
[00:05:21]     bins_ok=9 latest_ok=2023-01-01 00:00:00 CPC_max=0.03712445940004549
[00:05:21]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/116 [00:00<?, ?it/s]

[00:07:14]     done in 113.4s | rows=11 | pw=ok
[00:07:14]     bins_ok=9 latest_ok=2023-01-01 00:00:00 CPC_max=0.050152685809933414
[00:07:14]   step: save CSVs
[00:07:14]     CSVs saved.
[00:07:14]   step: build PDF layout for "Institutional affairs & governance"
[00:07:15]   step: PDF saved → reports_v6/institutional-affairs-and-governance/topic_report.pdf
[00:07:15] ✔ topic done: "Institutional affairs & governance" in 317.0s | PDF → reports_v6/institutional-affairs-and-governance/topic_report.pdf



Processing topics:  53%|█████████████████████▊                   | 16/30 [1:08:28<57:19, 245.70s/it]

[00:07:15] ▶ topic: "Justice, security & policing" → reports_v6/justice-security-and-policing
[00:07:15]   step: filter count → 3314 speeches in df for this topic (before guardrails)
[00:07:15]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/72 [00:00<?, ?it/s]

[00:08:13]     done in 57.6s | rows=11 | pw=none
[00:08:13]     bins_ok=0 latest_ok=None CPC_max=None
[00:08:13]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/72 [00:00<?, ?it/s]

[00:09:17]     done in 63.9s | rows=11 | pw=ok
[00:09:17]     bins_ok=6 latest_ok=2025-01-01 00:00:00 CPC_max=0.030035118315010913
[00:09:17]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/72 [00:00<?, ?it/s]

[00:10:21]     done in 64.5s | rows=11 | pw=ok
[00:10:21]     bins_ok=6 latest_ok=2025-01-01 00:00:00 CPC_max=0.04113036926376177
[00:10:21]   step: save CSVs
[00:10:21]     CSVs saved.
[00:10:21]   step: build PDF layout for "Justice, security & policing"
[00:10:22]   step: PDF saved → reports_v6/justice-security-and-policing/topic_report.pdf
[00:10:22] ✔ topic done: "Justice, security & policing" in 186.5s | PDF → reports_v6/justice-security-and-policing/topic_report.pdf



Processing topics:  57%|███████████████████████▏                 | 17/30 [1:11:34<49:22, 227.90s/it]

[00:10:22] ▶ topic: "Media, information & disinformation" → reports_v6/media-information-and-disinformation
[00:10:22]   step: filter count → 2012 speeches in df for this topic (before guardrails)
[00:10:22]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

[00:11:03]     done in 41.4s | rows=10 | pw=none
[00:11:03]     bins_ok=0 latest_ok=None CPC_max=None
[00:11:03]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

[00:11:48]     done in 45.3s | rows=10 | pw=ok
[00:11:48]     bins_ok=5 latest_ok=2024-01-01 00:00:00 CPC_max=0.027449693931147018
[00:11:48]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/46 [00:00<?, ?it/s]

[00:12:33]     done in 45.0s | rows=10 | pw=ok
[00:12:33]     bins_ok=5 latest_ok=2024-01-01 00:00:00 CPC_max=0.03475289473624651
[00:12:33]   step: save CSVs
[00:12:33]     CSVs saved.
[00:12:33]   step: build PDF layout for "Media, information & disinformation"
[00:12:34]   step: PDF saved → reports_v6/media-information-and-disinformation/topic_report.pdf
[00:12:34] ✔ topic done: "Media, information & disinformation" in 132.1s | PDF → reports_v6/media-information-and-disinformation/topic_report.pdf



Processing topics:  60%|████████████████████████▌                | 18/30 [1:13:46<39:49, 199.13s/it]

[00:12:34] ▶ topic: "Migration & asylum" → reports_v6/migration-and-asylum
[00:12:34]   step: filter count → 6863 speeches in df for this topic (before guardrails)
[00:12:34]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/153 [00:00<?, ?it/s]

[00:14:29]     done in 115.7s | rows=10 | pw=ok
[00:14:29]     bins_ok=3 latest_ok=2023-01-01 00:00:00 CPC_max=0.052932842726926534
[00:14:29]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/153 [00:00<?, ?it/s]

[00:16:44]     done in 134.2s | rows=10 | pw=ok
[00:16:44]     bins_ok=9 latest_ok=2023-01-01 00:00:00 CPC_max=0.03297528472895107
[00:16:44]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/153 [00:00<?, ?it/s]

[00:18:58]     done in 134.9s | rows=10 | pw=ok
[00:18:58]     bins_ok=9 latest_ok=2023-01-01 00:00:00 CPC_max=0.0358150184283815
[00:18:58]   step: save CSVs
[00:18:58]     CSVs saved.
[00:18:58]   step: build PDF layout for "Migration & asylum"
[00:19:00]   step: PDF saved → reports_v6/migration-and-asylum/topic_report.pdf
[00:19:00] ✔ topic done: "Migration & asylum" in 386.0s | PDF → reports_v6/migration-and-asylum/topic_report.pdf



Processing topics:  63%|█████████████████████████▉               | 19/30 [1:20:12<46:47, 255.25s/it]

[00:19:00] ▶ topic: "Monetary & financial stability" → reports_v6/monetary-and-financial-stability
[00:19:00]   step: filter count → 2339 speeches in df for this topic (before guardrails)
[00:19:00]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

[00:19:41]     done in 41.0s | rows=11 | pw=none
[00:19:41]     bins_ok=0 latest_ok=None CPC_max=None
[00:19:41]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

[00:20:24]     done in 43.5s | rows=11 | pw=ok
[00:20:24]     bins_ok=4 latest_ok=2018-01-01 00:00:00 CPC_max=0.02518752059689635
[00:20:24]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

[00:21:08]     done in 44.1s | rows=11 | pw=ok
[00:21:08]     bins_ok=4 latest_ok=2018-01-01 00:00:00 CPC_max=0.042044462903326564
[00:21:08]   step: save CSVs
[00:21:08]     CSVs saved.
[00:21:08]   step: build PDF layout for "Monetary & financial stability"
[00:21:09]   step: PDF saved → reports_v6/monetary-and-financial-stability/topic_report.pdf
[00:21:09] ✔ topic done: "Monetary & financial stability" in 129.0s | PDF → reports_v6/monetary-and-financial-stability/topic_report.pdf



Processing topics:  67%|███████████████████████████▎             | 20/30 [1:22:21<36:13, 217.36s/it]

[00:21:09] ▶ topic: "Procedural & Parliamentary business" → reports_v6/procedural-and-parliamentary-business
[00:21:09]   step: filter count → 23685 speeches in df for this topic (before guardrails)
[00:21:09]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/456 [00:00<?, ?it/s]

[00:26:45]     done in 336.1s | rows=11 | pw=ok
[00:26:45]     bins_ok=10 latest_ok=2025-01-01 00:00:00 CPC_max=0.032377938030626975
[00:26:45]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/456 [00:00<?, ?it/s]

[00:32:34]     done in 348.7s | rows=11 | pw=ok
[00:32:34]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.014436044856369727
[00:32:34]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/456 [00:00<?, ?it/s]

[00:38:25]     done in 351.3s | rows=11 | pw=ok
[00:38:25]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.02625492788259982
[00:38:25]   step: save CSVs
[00:38:25]     CSVs saved.
[00:38:25]   step: build PDF layout for "Procedural & Parliamentary business"
[00:38:26]   step: PDF saved → reports_v6/procedural-and-parliamentary-business/topic_report.pdf
[00:38:26] ✔ topic done: "Procedural & Parliamentary business" in 1036.8s | PDF → reports_v6/procedural-and-parliamentary-business/topic_report.pdf



Processing topics:  70%|███████████████████████████▎           | 21/30 [1:39:38<1:09:30, 463.34s/it]

[00:38:26] ▶ topic: "Research, innovation & space" → reports_v6/research-innovation-and-space
[00:38:26]   step: filter count → 1098 speeches in df for this topic (before guardrails)
[00:38:26]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

[00:38:52]     done in 26.8s | rows=10 | pw=none
[00:38:52]     bins_ok=0 latest_ok=None CPC_max=None
[00:38:52]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

[00:39:14]     done in 21.3s | rows=10 | pw=ok
[00:39:14]     bins_ok=1 latest_ok=2024-01-01 00:00:00 CPC_max=0.02135981669253606
[00:39:14]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/23 [00:00<?, ?it/s]

[00:39:34]     done in 20.4s | rows=10 | pw=ok
[00:39:34]     bins_ok=1 latest_ok=2024-01-01 00:00:00 CPC_max=0.024143781123930234
[00:39:34]   step: save CSVs
[00:39:34]     CSVs saved.
[00:39:34]   step: build PDF layout for "Research, innovation & space"
[00:39:34]   step: PDF saved → reports_v6/research-innovation-and-space/topic_report.pdf
[00:39:34] ✔ topic done: "Research, innovation & space" in 69.0s | PDF → reports_v6/research-innovation-and-space/topic_report.pdf



Processing topics:  73%|██████████████████████████████           | 22/30 [1:40:47<45:59, 344.98s/it]

[00:39:34] ▶ topic: "Rule of law & fundamental rights" → reports_v6/rule-of-law-and-fundamental-rights
[00:39:34]   step: filter count → 17208 speeches in df for this topic (before guardrails)
[00:39:34]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/366 [00:00<?, ?it/s]

[00:44:11]     done in 276.9s | rows=11 | pw=ok
[00:44:11]     bins_ok=8 latest_ok=2023-01-01 00:00:00 CPC_max=0.0314824227908932
[00:44:11]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/366 [00:00<?, ?it/s]

[00:49:05]     done in 293.9s | rows=11 | pw=ok
[00:49:05]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.03625478308685593
[00:49:05]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/366 [00:00<?, ?it/s]

[00:54:03]     done in 297.5s | rows=11 | pw=ok
[00:54:03]     bins_ok=11 latest_ok=2025-01-01 00:00:00 CPC_max=0.038898469582491955
[00:54:03]   step: save CSVs
[00:54:03]     CSVs saved.
[00:54:03]   step: build PDF layout for "Rule of law & fundamental rights"
[00:54:04]   step: PDF saved → reports_v6/rule-of-law-and-fundamental-rights/topic_report.pdf
[00:54:04] ✔ topic done: "Rule of law & fundamental rights" in 869.4s | PDF → reports_v6/rule-of-law-and-fundamental-rights/topic_report.pdf



Processing topics:  77%|███████████████████████████████▍         | 23/30 [1:55:17<58:36, 502.36s/it]

[00:54:04] ▶ topic: "Security & defence" → reports_v6/security-and-defence
[00:54:04]   step: filter count → 4084 speeches in df for this topic (before guardrails)
[00:54:04]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/92 [00:00<?, ?it/s]

[00:55:19]     done in 75.2s | rows=11 | pw=ok
[00:55:19]     bins_ok=1 latest_ok=2023-01-01 00:00:00 CPC_max=0.052268053454380084
[00:55:19]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/92 [00:00<?, ?it/s]

[00:56:45]     done in 86.1s | rows=11 | pw=ok
[00:56:45]     bins_ok=9 latest_ok=2025-01-01 00:00:00 CPC_max=0.03346005418730572
[00:56:45]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/92 [00:00<?, ?it/s]

[00:58:11]     done in 85.7s | rows=11 | pw=ok
[00:58:11]     bins_ok=9 latest_ok=2025-01-01 00:00:00 CPC_max=0.06475367889776883
[00:58:11]   step: save CSVs
[00:58:11]     CSVs saved.
[00:58:11]   step: build PDF layout for "Security & defence"
[00:58:11]   step: PDF saved → reports_v6/security-and-defence/topic_report.pdf
[00:58:11] ✔ topic done: "Security & defence" in 247.5s | PDF → reports_v6/security-and-defence/topic_report.pdf



Processing topics:  80%|████████████████████████████████▊        | 24/30 [1:59:24<42:35, 425.89s/it]

[00:58:11] ▶ topic: "Security & policing" → reports_v6/security-and-policing
[00:58:11]   step: filter count → 41 speeches in df for this topic (before guardrails)
[00:58:11]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[00:58:23]     done in 11.9s | rows=1 | pw=none
[00:58:23]     bins_ok=0 latest_ok=None CPC_max=None
[00:58:23]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[00:58:31]     done in 7.7s | rows=1 | pw=none
[00:58:31]     bins_ok=0 latest_ok=None CPC_max=None
[00:58:31]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[00:58:38]     done in 6.7s | rows=1 | pw=none
[00:58:38]     bins_ok=0 latest_ok=None CPC_max=None
[00:58:38]   step: save CSVs
[00:58:38]     CSVs saved.
[00:58:38]   step: build PDF layout for "Security & policing"
[00:58:38]   step: PDF saved → reports_v6/security-and-policing/topic_report.pdf
[00:58:38] ✔ topic done: "Security & policing" in 26.8s | PDF → reports_v6/security-and-policing/topic_report.pdf



Processing topics:  83%|██████████████████████████████████▏      | 25/30 [1:59:51<25:30, 306.15s/it]

[00:58:38] ▶ topic: "Single market, competition & consumer protection" → reports_v6/single-market-competition-and-consumer-protection
[00:58:38]   step: filter count → 3302 speeches in df for this topic (before guardrails)
[00:58:38]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/73 [00:00<?, ?it/s]

[00:59:34]     done in 55.9s | rows=11 | pw=none
[00:59:34]     bins_ok=0 latest_ok=None CPC_max=None
[00:59:34]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/73 [00:00<?, ?it/s]

[01:00:45]     done in 70.7s | rows=11 | pw=ok
[01:00:45]     bins_ok=9 latest_ok=2023-01-01 00:00:00 CPC_max=0.040029495684586844
[01:00:45]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/73 [00:00<?, ?it/s]

[01:02:00]     done in 74.7s | rows=11 | pw=ok
[01:02:00]     bins_ok=9 latest_ok=2023-01-01 00:00:00 CPC_max=0.048004157544610994
[01:02:00]   step: save CSVs
[01:02:00]     CSVs saved.
[01:02:00]   step: build PDF layout for "Single market, competition & consumer protection"
[01:02:00]   step: PDF saved → reports_v6/single-market-competition-and-consumer-protection/topic_report.pdf
[01:02:00] ✔ topic done: "Single market, competition & consumer protection" in 201.9s | PDF → reports_v6/single-market-competition-and-consumer-protection/topic_report.pdf



Processing topics:  87%|███████████████████████████████████▌     | 26/30 [2:03:13<18:19, 274.89s/it]

[01:02:00] ▶ topic: "Social policy & employment" → reports_v6/social-policy-and-employment
[01:02:00]   step: filter count → 7525 speeches in df for this topic (before guardrails)
[01:02:00]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/167 [00:00<?, ?it/s]

[01:04:05]     done in 124.3s | rows=10 | pw=ok
[01:04:05]     bins_ok=3 latest_ok=2023-01-01 00:00:00 CPC_max=0.04568527396685119
[01:04:05]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/167 [00:00<?, ?it/s]

[01:06:36]     done in 151.6s | rows=10 | pw=ok
[01:06:36]     bins_ok=10 latest_ok=2024-01-01 00:00:00 CPC_max=0.041476691412939484
[01:06:36]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/167 [00:00<?, ?it/s]

[01:09:08]     done in 152.4s | rows=10 | pw=ok
[01:09:08]     bins_ok=10 latest_ok=2024-01-01 00:00:00 CPC_max=0.04723697924084339
[01:09:08]   step: save CSVs
[01:09:09]     CSVs saved.
[01:09:09]   step: build PDF layout for "Social policy & employment"
[01:09:09]   step: PDF saved → reports_v6/social-policy-and-employment/topic_report.pdf
[01:09:09] ✔ topic done: "Social policy & employment" in 428.8s | PDF → reports_v6/social-policy-and-employment/topic_report.pdf



Processing topics:  90%|████████████████████████████████████▉    | 27/30 [2:10:22<16:03, 321.08s/it]

[01:09:09] ▶ topic: "Taxation & anti-money laundering" → reports_v6/taxation-and-anti-money-laundering
[01:09:09]   step: filter count → 0 speeches in df for this topic (before guardrails)
[01:09:09]   step: run mode=speaker (MEP-averaged)
[01:09:09]     done in 0.0s | rows=0 | pw=none
[01:09:09]     bins_ok=0 latest_ok=None CPC_max=None
[01:09:09]   step: run mode=speech_capped (per-MEP cap)
[01:09:09]     done in 0.0s | rows=0 | pw=none
[01:09:09]     bins_ok=0 latest_ok=None CPC_max=None
[01:09:09]   step: run mode=speech_weighted (as heard)
[01:09:09]     done in 0.0s | rows=0 | pw=none
[01:09:09]     bins_ok=0 latest_ok=None CPC_max=None
[01:09:09]   step: save CSVs
[01:09:09]     CSVs saved.
[01:09:09]   step: build PDF layout for "Taxation & anti-money laundering"
[01:09:09]   step: PDF saved → reports_v6/taxation-and-anti-money-laundering/topic_report.pdf
[01:09:09] ✔ topic done: "Taxation & anti-money laundering" in 0.3s | PDF → reports_v6/taxation-and-anti-money-laundering/to

Processing topics:  93%|██████████████████████████████████████▎  | 28/30 [2:10:22<07:29, 224.83s/it]

[01:09:09] ▶ topic: "Trade & globalization" → reports_v6/trade-and-globalization
[01:09:09]   step: filter count → 5979 speeches in df for this topic (before guardrails)
[01:09:09]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/130 [00:00<?, ?it/s]

[01:10:43]     done in 93.4s | rows=11 | pw=none
[01:10:43]     bins_ok=0 latest_ok=None CPC_max=None
[01:10:43]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/130 [00:00<?, ?it/s]

[01:12:40]     done in 117.4s | rows=11 | pw=ok
[01:12:40]     bins_ok=10 latest_ok=2025-01-01 00:00:00 CPC_max=0.036857308243540304
[01:12:40]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/130 [00:00<?, ?it/s]

[01:14:38]     done in 118.0s | rows=11 | pw=ok
[01:14:38]     bins_ok=10 latest_ok=2025-01-01 00:00:00 CPC_max=0.04280351490659047
[01:14:38]   step: save CSVs
[01:14:38]     CSVs saved.
[01:14:38]   step: build PDF layout for "Trade & globalization"
[01:14:39]   step: PDF saved → reports_v6/trade-and-globalization/topic_report.pdf
[01:14:39] ✔ topic done: "Trade & globalization" in 329.3s | PDF → reports_v6/trade-and-globalization/topic_report.pdf



Processing topics:  97%|███████████████████████████████████████▋ | 29/30 [2:15:51<04:16, 256.19s/it]

[01:14:39] ▶ topic: "Transport & mobility" → reports_v6/transport-and-mobility
[01:14:39]   step: filter count → 2769 speeches in df for this topic (before guardrails)
[01:14:39]   step: run mode=speaker (MEP-averaged)


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[01:15:30]     done in 51.6s | rows=10 | pw=none
[01:15:30]     bins_ok=0 latest_ok=None CPC_max=None
[01:15:30]   step: run mode=speech_capped (per-MEP cap)


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[01:16:26]     done in 56.0s | rows=10 | pw=ok
[01:16:26]     bins_ok=5 latest_ok=2019-01-01 00:00:00 CPC_max=0.020376982914868974
[01:16:26]   step: run mode=speech_weighted (as heard)


Batches:   0%|          | 0/60 [00:00<?, ?it/s]

[01:17:22]     done in 55.3s | rows=10 | pw=ok
[01:17:22]     bins_ok=5 latest_ok=2019-01-01 00:00:00 CPC_max=0.05378946345341698
[01:17:22]   step: save CSVs
[01:17:22]     CSVs saved.
[01:17:22]   step: build PDF layout for "Transport & mobility"
[01:17:22]   step: PDF saved → reports_v6/transport-and-mobility/topic_report.pdf
[01:17:22] ✔ topic done: "Transport & mobility" in 163.7s | PDF → reports_v6/transport-and-mobility/topic_report.pdf



Processing topics: 100%|█████████████████████████████████████████| 30/30 [2:18:35<00:00, 277.18s/it]

[01:17:22] Batch finished → reports_v6/batch_summary.csv





[01:17:23] Topics summary saved → reports_v6/topics_summary_v6.csv


## Redo report summaries

In [11]:
import os, re, datetime as dt
import numpy as np
import pandas as pd

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mtick
from matplotlib.gridspec import GridSpec

# ---------- logging ----------
def _now():
    return dt.datetime.now().strftime("%H:%M:%S")

def log(msg):
    """Print ``msg`` prefixed with a ``[HH:MM:SS]`` timestamp, flushing immediately."""
    stamp = _now()
    line = f"[{stamp}] {msg}"
    print(line, flush=True)

# ---------- topics & paths ----------
# Raw macro-topic labels, kept exactly as they were collected.
# Some entries appear more than once or in an HTML-escaped form ("&amp;");
# a few that look identical here possibly differ only in look-alike Unicode
# characters (hyphen/dash variants) -- TODO confirm against the data source.
# The list is not used directly: TOPICS below is built from it by running
# every entry through _normalize_topic() and de-duplicating.
RAW_TOPICS = [
    "Agriculture & fisheries",
    "Climate, environment & biodiversity",
    "Development & humanitarian aid",
    "Digital policy & data protection",
    "EU budget & MFF",
    "Economy & industrial policy",
    "Education, culture & sport",
    "Energy & energy security",
    "Enlargement & neighbourhood policy",
    "Foreign policy — Americas",
    "Foreign policy — Asia-Pacific",
    "Foreign policy — Asia-Pacific",
    "Foreign policy — Europe & Eastern Neighbourhood",
    "Foreign policy — Europe &amp; Eastern Neighbourhood",
    "Foreign policy — Middle East & North Africa",
    "Foreign policy — Sub-Saharan Africa",
    "Foreign policy — Sub-Saharan Africa",
    "Health",
    "Institutional affairs & governance",
    "Justice, security & policing",
    "Media, information & disinformation",
    "Migration & asylum",
    "Monetary & financial stability",
    "Procedural & Parliamentary business",
    "Research, innovation & space",
    "Rule of law & fundamental rights",
    "Security & defence",
    "Security & policing",
    "Single market, competition & consumer protection",
    "Social policy & employment",
    "Taxation & anti–money laundering",
    "Trade & globalization",
    "Transport & mobility",
]

def _normalize_topic(t):
    t = t.replace("&amp;", "&")
    t = t.replace("\u2011", "-")
    t = t.replace("\u2013", "-")
    t = t.replace("\u2014", "—")
    t = re.sub(r"\s+", " ", t).strip()
    return t

# Canonical topic list: every raw label normalised, de-duplicated and sorted.
TOPICS = sorted({_normalize_topic(raw) for raw in RAW_TOPICS})

# Root folder that holds one sub-folder of CSVs/PDFs per topic.
BASE_DIR = "reports_v6"

def _slugify(name: str) -> str:
    s = name.lower()
    s = s.replace("&", "and")
    s = re.sub(r"[^\w\s-]", "", s)
    s = re.sub(r"\s+", "-", s).strip("-")
    s = re.sub(r"-+", "-", s)
    return s

# ---------- helpers ----------
def _ensure_time_and_numeric(df):
    df = df.copy()
    if "time_bin" in df:
        df["time_bin"] = pd.to_datetime(df["time_bin"], errors="coerce")
    for col in ["cpc", "cpc_adj", "obs_bss_tss", "ci_lo", "ci_hi"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    if "ok" not in df.columns:
        df["ok"] = True
    return df

def _ok_rows(df):
    """
    Return only the rows flagged ``ok == True``, sorted by ``time_bin``.

    The table is first passed through ``_ensure_time_and_numeric``, so dtypes are
    coerced and a missing ``ok`` column defaults to True for every row.
    """
    prepared = _ensure_time_and_numeric(df)
    keep = prepared["ok"] == True  # element-wise comparison on the column
    return prepared.loc[keep].copy().sort_values("time_bin")

def _y_limits_cpc(res_list):
    """
    Y-limits for CPC plots.

    - Base range on CPC line (0–1 scale).
    - Include CI bounds, but ignore insane outliers so the axis
      doesn't explode to -1000% if one bootstrap goes wild.
    """
    cpc_vals = []
    ci_vals = []

    for df in res_list:
        if df is None or len(df) == 0:
            continue
        ok = _ok_rows(df)

        if "cpc" in ok.columns:
            cpc_vals.extend(
                ok["cpc"].replace([np.inf, -np.inf], np.nan).dropna().tolist()
            )

        # collect CI bounds but only if they look reasonable for a share-of-variance metric
        if {"ci_lo", "ci_hi"} <= set(ok.columns):
            ci = pd.concat([ok["ci_lo"], ok["ci_hi"]])
            ci = pd.to_numeric(ci, errors="coerce").replace([np.inf, -np.inf], np.nan)
            # keep only values roughly in [-5%, 50%] on the 0–1 scale
            ci = ci[(ci > -0.05) & (ci < 0.5)]
            ci_vals.extend(ci.dropna().tolist())

    # Fallback if nothing valid
    if not cpc_vals and not ci_vals:
        return (0.0, 0.05)

    vals = cpc_vals + ci_vals
    vmin = min(vals)
    vmax = max(vals)

    # For CPC we always show the baseline at 0
    vmin = min(0.0, vmin)

    span = vmax - vmin
    if span <= 0:
        span = 0.05  # small default span (5 percentage points)

    # a bit more generous margin so CI never kisses the frame
    margin = 0.15 * span
    return (vmin - margin, vmax + margin)


def _y_limits_cpc_adj(res_list):
    """
    Y-limits for CPC_adj plots.

    - Base range on CPC_adj line.
    - Include CI bounds, but clamp to a sensible range so one crazy value
      doesn't dominate. CPC_adj can be negative, but we cap it around [-0.5, 0.5].
    """
    adj_vals = []
    ci_vals = []

    for df in res_list:
        if df is None or len(df) == 0:
            continue
        ok = _ok_rows(df)

        if "cpc_adj" in ok.columns:
            adj_vals.extend(
                ok["cpc_adj"].replace([np.inf, -np.inf], np.nan).dropna().tolist()
            )

        if {"ci_lo", "ci_hi"} <= set(ok.columns):
            ci = pd.concat([ok["ci_lo"], ok["ci_hi"]])
            ci = pd.to_numeric(ci, errors="coerce").replace([np.inf, -np.inf], np.nan)
            # keep CI values in a sensible band for adjusted shares
            ci = ci[(ci > -0.6) & (ci < 0.6)]
            ci_vals.extend(ci.dropna().tolist())

    if not adj_vals and not ci_vals:
        return (-0.5, 0.05)

    vals = adj_vals + ci_vals
    vmin = min(vals)
    vmax = max(vals)

    # Clamp crazy negatives so a few bins don't dominate everything
    vmin = max(vmin, -0.5)
    vmin = min(vmin, 0.0)

    span = vmax - vmin
    if span <= 0:
        span = 0.05

    margin = 0.15 * span
    return (vmin - margin, vmax + margin)



def _heat_limits(pw_list):
    mx = 0.0
    for pw in pw_list:
        if pw is None or len(pw) == 0 or "std_dist" not in (pw.columns if hasattr(pw, "columns") else []):
            continue
        vals = pd.to_numeric(pw["std_dist"], errors="coerce").dropna()
        if not vals.empty:
            mx = max(mx, float(vals.max()))
    if mx <= 0:
        mx = 0.01
    return (0.0, mx)

# ---------- plotting primitives ----------
def _fill_ci_band(ax, ok, ci_mask, label=None, alpha=0.18):
    if "ci_lo" not in ok.columns or "ci_hi" not in ok.columns:
        return
    ci_lo = ok["ci_lo"]
    ci_hi = ok["ci_hi"]
    mask = ci_mask & ci_lo.notna() & ci_hi.notna()
    if not mask.any():
        return
    x_ord = mdates.date2num(ok.loc[mask, "time_bin"])
    ax.fill_between(x_ord,
                    ci_lo.loc[mask].values,
                    ci_hi.loc[mask].values,
                    alpha=alpha,
                    label=label)

def plot_cpc(ax, res_df, title, ylim, show_legend=False):
    """
    Draw the raw CPC time series (with its CI band, if available) on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Target panel.
    res_df : DataFrame or None
        Per-time-bin results; only rows with ``ok == True`` are plotted.
    title : str
        Panel title.
    ylim : (float, float)
        Y-limits applied to the panel.
    show_legend : bool, default False
        Whether to draw the legend.

    When ``res_df`` is None/empty, or has no ok rows or no ``cpc`` column, the
    axes are switched off and a short placeholder text is shown instead.
    """
    ax.set_title(title, fontsize=11)
    if res_df is None or len(res_df) == 0:
        ax.axis("off")
        ax.text(0.5, 0.5, "No data.", ha="center", va="center", fontsize=9)
        return

    ok = _ok_rows(res_df)
    if ok.empty or "cpc" not in ok.columns:
        ax.axis("off")
        ax.text(0.5, 0.5, "No valid CPC.", ha="center", va="center", fontsize=9)
        return

    # CI band: use any row with ci_lo/hi (this is the engine metric, usually CPC).
    # Tables without CI columns are still plotted, just without a band; indexing
    # the columns unconditionally would raise KeyError for them.
    if {"ci_lo", "ci_hi"} <= set(ok.columns):
        ci_mask = ok["ci_lo"].notna() & ok["ci_hi"].notna()
        _fill_ci_band(ax, ok, ci_mask, label="95% CI (engine metric)")

    # CPC line
    ax.plot(ok["time_bin"], ok["cpc"], marker="o", lw=1.8, label="CPC (raw)")

    ax.set_ylabel("CPC")
    # values are on a 0-1 scale, shown as percentages
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    ax.set_ylim(*ylim)
    ax.grid(alpha=0.25)
    ax.axhline(0.0, lw=0.8, alpha=0.7)
    ax.xaxis.set_major_locator(mdates.YearLocator(base=2))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))

    if show_legend:
        ax.legend(loc="upper left", fontsize=8, frameon=False)

def plot_cpc_adj(ax, res_df, title, ylim, show_legend=False, note_if_missing=False):
    """
    Draw the adjusted CPC time series on ``ax``.

    Only rows with ``ok == True`` are used. A CI band is shaded for bins whose
    ``obs_bss_tss`` matches ``cpc_adj`` (within 1e-10), provided the table has
    ``obs_bss_tss``, ``ci_lo`` and ``ci_hi`` columns.

    When ``res_df`` is None/empty, or it holds no non-missing ``cpc_adj`` value,
    the axes are switched off and a placeholder is shown; ``note_if_missing``
    selects the "(token-weighted)" wording for that placeholder.
    """
    ax.set_title(title, fontsize=11)

    if res_df is None or len(res_df) == 0:
        ax.axis("off")
        ax.text(0.5, 0.5, "No data.", ha="center", va="center", fontsize=9)
        return

    rows = _ok_rows(res_df)
    has_adj = "cpc_adj" in rows.columns and rows["cpc_adj"].notna().any()
    if not has_adj:
        ax.axis("off")
        if note_if_missing:
            placeholder = "CPC_adj not defined\n(token-weighted)"
        else:
            placeholder = "No CPC_adj values."
        ax.text(0.5, 0.5, placeholder, ha="center", va="center", fontsize=9)
        return

    # CI band only where the engine metric (obs_bss_tss) is CPC_adj itself.
    if {"obs_bss_tss", "ci_lo", "ci_hi"}.issubset(rows.columns):
        engine_is_adj = np.isclose(rows["obs_bss_tss"], rows["cpc_adj"], atol=1e-10)
        band_rows = rows["cpc_adj"].notna() & engine_is_adj
        _fill_ci_band(ax, rows, band_rows, label="95% CI (engine metric)")

    # CPC_adj line
    ax.plot(
        rows["time_bin"], rows["cpc_adj"],
        marker="s", lw=1.6, linestyle="--", label="CPC_adj",
    )

    ax.set_ylabel("CPC_adj")
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0))
    ax.set_ylim(*ylim)
    ax.grid(alpha=0.25)
    ax.axhline(0.0, lw=0.8, alpha=0.7)
    ax.xaxis.set_major_locator(mdates.YearLocator(base=2))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))

    if show_legend:
        ax.legend(loc="upper left", fontsize=8, frameon=False)

def plot_heatmap(ax, pw_df, title, vmin, vmax):
    """
    Render the pairwise party-distance table as a symmetric heatmap on ``ax``.

    ``pw_df`` must provide ``party_a``, ``party_b`` and ``std_dist``; rows whose
    distance is not numeric are dropped. Each remaining row fills both (a, b)
    and (b, a); the diagonal is forced to 0. ``vmin``/``vmax`` fix the colour
    range. If the table is None/empty, lacks a required column, or has no
    numeric distance, the axes are switched off and a placeholder is shown.
    """
    ax.set_title(title, fontsize=11)

    required = {"party_a", "party_b", "std_dist"}
    if pw_df is None or len(pw_df) == 0 or not required.issubset(pw_df.columns):
        ax.axis("off")
        ax.text(0.5, 0.5, "No pairwise table.", ha="center", va="center", fontsize=9)
        return

    table = pw_df.copy()
    table["std_dist"] = pd.to_numeric(table["std_dist"], errors="coerce")
    table = table.dropna(subset=["std_dist"])
    if table.empty:
        ax.axis("off")
        ax.text(0.5, 0.5, "No numeric distances.", ha="center", va="center", fontsize=9)
        return

    labels = sorted(set(table["party_a"]) | set(table["party_b"]))
    position = {party: k for k, party in enumerate(labels)}
    size = len(labels)

    grid = np.zeros((size, size), dtype=float)
    # Fill row by row so that, for repeated pairs, the last row wins on both sides.
    for left, right, dist in zip(table["party_a"], table["party_b"], table["std_dist"]):
        a, b = position[left], position[right]
        grid[a, b] = dist
        grid[b, a] = dist
    np.fill_diagonal(grid, 0.0)

    image = ax.imshow(grid, aspect="auto", vmin=vmin, vmax=vmax)

    tick_positions = range(size)
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=7)
    ax.set_yticklabels(labels, fontsize=7)
    ax.tick_params(axis="x", pad=2)

    colorbar = plt.colorbar(image, ax=ax, fraction=0.04, pad=0.03)
    colorbar.set_label("std. centroid distance", fontsize=7)
    colorbar.ax.tick_params(labelsize=7)


# ---------- full 9-panel report ----------
def _hide_redundant_yaxis(row_axes):
    """Keep y tick labels and the y label only on the first visible panel of a shared-y row."""
    first_visible_seen = False
    for ax in row_axes:
        if not getattr(ax, "axison", True):
            # Placeholder panel ("No data." etc.): its axis is off, nothing to hide.
            continue
        if not first_visible_seen:
            # This panel carries the y scale for the whole row.
            first_visible_seen = True
            continue
        plt.setp(ax.get_yticklabels(), visible=False)
        ax.set_ylabel("")


def build_topic_report_9panels(topic_title,
                               res_meps, res_cap, res_w,
                               pw_meps, pw_cap, pw_w,
                               outpath):
    """
    Build the one-page, 9-panel report for a topic and save it to ``outpath``.

    Layout (5 rows x 3 columns): a title row, a CPC row, a CPC_adj row, a
    pairwise-heatmap row and a footer text row. The three columns are the
    MEP-averaged, speech-capped and speech-weighted variants.

    Parameters
    ----------
    topic_title : str
        Human-readable topic name used in the page title.
    res_meps, res_cap, res_w : DataFrame or None
        Per-time-bin result tables for the three variants.
    pw_meps, pw_cap, pw_w : DataFrame or None
        Pairwise distance tables for the three variants.
    outpath : str
        Destination file; an existing file at this path is removed first.

    Notes
    -----
    - Y-limits and the heatmap colour range are shared across the three columns.
    - Within the CPC and CPC_adj rows, the y tick labels stay on the first panel
      that is actually drawn. If the left panel is only a placeholder (no valid
      bins), the next drawn panel keeps its labels so the row never ends up
      without a visible y scale.
    - The figure is always closed, even when saving fails.
    - Sets the global Matplotlib ``font.size`` to 10.
    """
    log(f"  build 9-panel report → {outpath}")

    ylim_cpc     = _y_limits_cpc([res_meps, res_cap, res_w])
    ylim_cpc_adj = _y_limits_cpc_adj([res_meps, res_cap, res_w])
    vmin, vmax   = _heat_limits([pw_meps, pw_cap, pw_w])

    if os.path.exists(outpath):
        os.remove(outpath)

    # base font
    plt.rcParams.update({"font.size": 10})

    # Wide like A4 landscape, but taller so the three data rows get enough height.
    fig = plt.figure(figsize=(11.69, 12.0))
    try:
        # Layout: 5 rows x 3 cols
        # row0: title (1x3)
        # row1: CPC (3)
        # row2: CPC_adj (3)
        # row3: heatmaps (3)
        # row4: text block (1x3)
        gs = GridSpec(
            5, 3, figure=fig,
            # More weight for the 3 data rows, slimmer title/footer.
            height_ratios=[1.1, 4.3, 4.3, 4.3, 1.6],
            # Smaller hspace so the graphs themselves get more height.
            hspace=0.35,
            wspace=0.38
        )

        # ---- Title row ----
        ax_title = fig.add_subplot(gs[0, :])
        ax_title.axis("off")

        title_str = f"European Parliament Rhetorical Polarization — {topic_title}"
        subtitle_str = (
            "Row 1: CPC (raw share of variance in rhetoric explained by party) with 95% bootstrap bands "
            "for the engine metric.\n"
            "Row 2: adjusted CPC (CPC_adj). Row 3: standardized pairwise distances between party centroids "
            "in the latest valid year."
        )

        ax_title.text(
            0.5, 0.70,
            title_str,
            ha="center", va="center",
            fontsize=16, weight="bold"
        )
        ax_title.text(
            0.5, 0.20,
            subtitle_str,
            ha="center", va="center",
            fontsize=9, color="dimgray",
            linespacing=1.25
        )

        # ---- Row 1: CPC + CI ----
        ax_cpc1 = fig.add_subplot(gs[1, 0])
        ax_cpc2 = fig.add_subplot(gs[1, 1], sharey=ax_cpc1)
        ax_cpc3 = fig.add_subplot(gs[1, 2], sharey=ax_cpc1)

        plot_cpc(ax_cpc1, res_meps, "MEP-averaged (speaker-equal)", ylim_cpc, show_legend=True)
        plot_cpc(ax_cpc2, res_cap,  "Speech-capped (per-MEP cap)", ylim_cpc, show_legend=False)
        plot_cpc(ax_cpc3, res_w,    "Speech-weighted (as heard)", ylim_cpc, show_legend=True)

        # The y scale is shared; show it once, on the first panel that was drawn.
        _hide_redundant_yaxis((ax_cpc1, ax_cpc2, ax_cpc3))

        # ---- Row 2: CPC_adj + CI where available ----
        ax_adj1 = fig.add_subplot(gs[2, 0])
        ax_adj2 = fig.add_subplot(gs[2, 1], sharey=ax_adj1)
        ax_adj3 = fig.add_subplot(gs[2, 2], sharey=ax_adj1)

        plot_cpc_adj(ax_adj1, res_meps, "MEP-averaged — CPC_adj", ylim_cpc_adj, show_legend=True)
        plot_cpc_adj(ax_adj2, res_cap,  "Speech-capped — CPC_adj", ylim_cpc_adj, show_legend=False)
        plot_cpc_adj(
            ax_adj3, res_w,
            "Speech-weighted — CPC_adj", ylim_cpc_adj,
            show_legend=True, note_if_missing=True
        )

        _hide_redundant_yaxis((ax_adj1, ax_adj2, ax_adj3))

        # ---- Row 3: heatmaps ----
        ax_h1 = fig.add_subplot(gs[3, 0])
        ax_h2 = fig.add_subplot(gs[3, 1])
        ax_h3 = fig.add_subplot(gs[3, 2])

        plot_heatmap(ax_h1, pw_meps, "Pairwise — MEP-averaged", vmin, vmax)
        plot_heatmap(ax_h2, pw_cap,  "Pairwise — Speech-capped", vmin, vmax)
        plot_heatmap(ax_h3, pw_w,    "Pairwise — Speech-weighted", vmin, vmax)

        # ---- Row 4: description block ----
        ax_text = fig.add_subplot(gs[4, :])
        ax_text.axis("off")

        footer_text = (
            "CPC = BSS/TSS: share of variance in de-meaned embeddings explained by party identity\n"
            "(higher values = parties sound more distinct). CPC_adj applies a degrees-of-freedom correction,\n"
            "analogous to adjusted R²: it penalises apparent separation when there are many dimensions and parties\n"
            "relative to the number of speakers, and can be negative when between-party differences are not reliably above chance.\n\n"
            "Pairwise heatmaps show standardized distances between party centroids in the latest valid year.\n"
            "Distances are divided by within-party spread, so a value around 1 means two parties are about as far apart as the\n"
            "average internal dispersion of their rhetoric; larger numbers indicate stronger rhetorical separation."
        )

        ax_text.text(
            0.03, 0.96,
            footer_text,
            ha="left", va="top",
            fontsize=8.5, color="dimgray",
            linespacing=1.25
        )

        # Generous page margins so labels are not cut off.
        fig.subplots_adjust(
            left=0.09,   # room for y labels & heatmap tick labels
            right=0.97,
            top=0.93,
            bottom=0.13
        )

        fig.savefig(outpath, dpi=300)
    finally:
        # Release the figure even if plotting or saving raised; this function is
        # called once per topic and open figures would otherwise accumulate.
        plt.close(fig)

# ---------- rebuild for all topics ----------
def _read_csv_or_none(path):
    """Read ``path`` as a CSV; return None if it is missing or cannot be parsed."""
    if not os.path.isfile(path):
        # A missing table is expected for some modes/topics; stay quiet.
        return None
    try:
        return pd.read_csv(path)
    except Exception as exc:
        # Best-effort: keep the batch going, but say why the table was dropped
        # instead of silently treating a broken file like a missing one.
        log(f"  ⚠ could not read {path}: {type(exc).__name__}: {exc}")
        return None


def rebuild_all_reports_9panels(base_dir=BASE_DIR, topics=TOPICS, include_all=True):
    """
    Rebuild ``topic_report_extended.pdf`` for every topic folder under ``base_dir``.

    Parameters
    ----------
    base_dir : str
        Root folder containing one sub-folder per topic slug.
    topics : iterable of str
        Topic labels; each is mapped to its folder via ``_slugify``.
    include_all : bool, default True
        Also process the pooled "All speeches" report stored in ``_all_speeches``.

    For each topic the six CSVs ``res_{meps,cap,w}.csv`` and ``pw_{meps,cap,w}.csv``
    are loaded from the topic folder. Missing files are tolerated; files that
    exist but cannot be read are logged and treated as missing. A topic is
    skipped (with a log line) when its folder does not exist or none of the
    three ``res_*.csv`` tables could be loaded.
    """
    if include_all:
        topics_list = ["ALL"] + list(topics)
    else:
        topics_list = list(topics)

    for topic in topics_list:
        if str(topic).upper() == "ALL":
            slug = "_all_speeches"
            pretty_name = "All speeches"
        else:
            slug = _slugify(topic)
            pretty_name = topic

        topic_dir = os.path.join(base_dir, slug)
        if not os.path.isdir(topic_dir):
            log(f"✖ skip {pretty_name} (no folder {topic_dir})")
            continue

        res_meps = _read_csv_or_none(os.path.join(topic_dir, "res_meps.csv"))
        res_cap  = _read_csv_or_none(os.path.join(topic_dir, "res_cap.csv"))
        res_w    = _read_csv_or_none(os.path.join(topic_dir, "res_w.csv"))
        pw_meps  = _read_csv_or_none(os.path.join(topic_dir, "pw_meps.csv"))
        pw_cap   = _read_csv_or_none(os.path.join(topic_dir, "pw_cap.csv"))
        pw_w     = _read_csv_or_none(os.path.join(topic_dir, "pw_w.csv"))

        # if we have literally no CPC results for this topic, skip
        if res_meps is None and res_cap is None and res_w is None:
            log(f"✖ skip {pretty_name} (no res_*.csv files)")
            continue

        out_pdf = os.path.join(topic_dir, "topic_report_extended.pdf")
        build_topic_report_9panels(
            pretty_name,
            res_meps, res_cap, res_w,
            pw_meps, pw_cap, pw_w,
            out_pdf
        )

    log("All 9-panel PDFs rebuilt.")


# --------- RUN THIS ---------
# Entry point: rebuild the extended PDF for every topic with the default
# base directory and topic list. The guard keeps this from running if the
# code is imported as a module instead of being executed directly.
if __name__ == "__main__":
    rebuild_all_reports_9panels()


[12:25:53]   build 9-panel report → reports_v6/_all_speeches/topic_report_extended.pdf
[12:25:54]   build 9-panel report → reports_v6/agriculture-and-fisheries/topic_report_extended.pdf
[12:25:54]   build 9-panel report → reports_v6/climate-environment-and-biodiversity/topic_report_extended.pdf
[12:25:55]   build 9-panel report → reports_v6/development-and-humanitarian-aid/topic_report_extended.pdf
[12:25:55]   build 9-panel report → reports_v6/digital-policy-and-data-protection/topic_report_extended.pdf
[12:25:56]   build 9-panel report → reports_v6/eu-budget-and-mff/topic_report_extended.pdf
[12:25:56]   build 9-panel report → reports_v6/economy-and-industrial-policy/topic_report_extended.pdf
[12:25:57]   build 9-panel report → reports_v6/education-culture-and-sport/topic_report_extended.pdf
[12:25:57]   build 9-panel report → reports_v6/energy-and-energy-security/topic_report_extended.pdf
[12:25:58]   build 9-panel report → reports_v6/enlargement-and-neighbourhood-policy/topic_repor

In [19]:
# Preview the first rows of the speeches table loaded from the CSV.
df.head()

Unnamed: 0,id,sitting_id,date,speech_order,speaker_name,political_group,title,speech_content,language,topic,macro_topic,specific_focus
0,575688,eli/dl/event/MTG-PL-2025-07-10-OTH-2017033239364,2025-07-10,1,President,,,The President has received from the Council it...,EN,Council position at first reading,Procedural & Parliamentary business,
1,575689,eli/dl/event/MTG-PL-2025-07-10-OTH-2017033239364,2025-07-10,2,President,,,The next item on the agenda is the debate on t...,EN,Post-2027 Common Agricultural Policy,Agriculture & fisheries,CAP post-2027 reform
2,575690,eli/dl/event/MTG-PL-2025-07-10-OTH-2017033239364,2025-07-10,3,Christophe Hansen,,Member of the Commission,"Madam President, honourable Members, dear coll...",EN,Post-2027 Common Agricultural Policy,Agriculture & fisheries,CAP post-2027 reform
3,575691,eli/dl/event/MTG-PL-2025-07-10-OTH-2017033239364,2025-07-10,4,Herbert Dorfmann,PPE,,"Frau Präsidentin, Herr Kommissar, Kolleginnen ...",DE,Post-2027 Common Agricultural Policy,Agriculture & fisheries,CAP post-2027 reform
4,575692,eli/dl/event/MTG-PL-2025-07-10-OTH-2017033239364,2025-07-10,5,Dario Nardella,S&D,,"Signora Presidente, signor Commissario, onorev...",IT,Post-2027 Common Agricultural Policy,Agriculture & fisheries,CAP post-2027 reform


In [21]:
# Average number of whitespace-separated words per speech over the first n
# speeches, where n is capped at 10000 (not 500: the variable name
# `avg_words_first_500` is a leftover from an earlier sample size).
n = min(10000, len(df))
# _scrub_boiler lower-cases the text and blanks out the BOILER phrases,
# so those phrases do not contribute to the word counts.
texts = df["speech_content"].astype(str).iloc[:n].map(_scrub_boiler)  # use existing scrubber
word_counts = texts.map(lambda t: len(t.split()))
avg_words_first_500 = float(word_counts.mean())

print(f"Counted {n} speeches. Average words per speech (first {n}): {avg_words_first_500:.2f}")

Counted 10000 speeches. Average words per speech (first 10000): 190.72


In [12]:
# Look up the speech whose id is 620840.
df[df["id"] == 620840]

Unnamed: 0,id,sitting_id,date,speech_order,speaker_name,political_group,title,speech_content,language,topic,macro_topic,specific_focus
24296,620840,eli/dl/event/MTG-PL-2023-11-21-OTH-25530000,2023-11-21,376,Ernő Schaller-Baross,NI,Frage nach dem Verfahren der „blauen Karte“,Herr Freund! Sie haben fünf Jahre für die Soro...,DE,"Continuing threat to the rule of law, the inde...",Rule of law & fundamental rights,Hungary: rule of law & funding conditionality
