In [None]:
import re
import json
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.cluster import AgglomerativeClustering

!pip install hnswlib pip install sentence-transformers hnswlib
import hnswlib
from sentence_transformers import SentenceTransformer



# 1) Text cleaning

def _strip_markdown_links(s: str) -> str:
    # [text](url) -> text
    return re.sub(r"\[([^\]]+)\]\((https?://[^)]+)\)", r"\1", s)

def _normalize_ws(s: str) -> str:
    s = s.replace("\u00a0", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _is_time_line(line: str) -> bool:
    return bool(re.fullmatch(r"\d{1,2}:\d{2}", line))

def _is_small_digit_line(line: str) -> bool:
    return line.isdigit() and len(line) <= 2

def _should_drop_line(line: str) -> bool:
    """Decide whether a single line is service noise rather than news content.

    The Cyrillic and emoji literals below were restored from mis-decoded text
    (UTF-8 bytes read as Mac Roman); in their garbled form they could never
    match real input.

    Args:
        line: One line of a scraped message.

    Returns:
        True when the line is empty, a separator, a metadata header, a media
        marker, a Telegram UI label, a bare time or 1-2 digit number, a bare
        URL, an advertising/subscription prompt, or a contact line;
        False otherwise.
    """
    if not line:
        return True

    # _PUA_RE is defined elsewhere in the file; whatever it matches is removed
    # first so that the checks below only see the remaining text.
    line2 = _PUA_RE.sub("", line).strip()
    if not line2:
        return True

    # Separators: lines made up only of dashes and underscores.
    if set(line2) <= set("-–—_"):
        return True

    # Metadata headers.
    if line2.startswith(("📢 From:", "📁 Category:", "🔗 Link:", "⏰ Time:")):
        return True

    # Media markers.
    if line2.startswith(("[📷", "[🎬")):
        return True

    # Telegram UI labels.
    if line2 == "Telegram":
        return True
    if any(label in line2 for label in ("VIEW MESSAGE", "VIEW CHANNEL", "INSTANT VIEW")):
        return True

    # Junk: a bare clock time or a 1-2 digit number on its own line.
    if _is_time_line(line2) or _is_small_digit_line(line2):
        return True

    # A line that is nothing but a URL.
    if re.fullmatch(r"https?://\S+", line2):
        return True

    # Advertising, subscription and "send us news" prompts.
    ad_markers = (
        "РЕКЛАМА",
        "Подписаться",
        "Прислать новости",
        "Новости сюда",
        "knd.gov.ru/license",
    )
    if any(marker in line2 for marker in ad_markers):
        return True

    # Contact lines: an "@" handle together with a bot/ads/admin-style keyword.
    lowered = line2.lower()
    if "@" in line2 and any(w in lowered for w in ("bot", "реклама", "admin", "pr", "подбор")):
        return True

    return False

def extract_semantic_text(text: str) -> str:
    """Reduce a raw scraped message to its meaningful lines.

    Markdown links are unwrapped, then each non-empty line is filtered through
    ``_should_drop_line``; surviving lines lose inline URLs and ``**``/``__``
    markers and get their whitespace normalized.

    Args:
        text: Raw message text. Any non-string value yields ``""``.

    Returns:
        The kept lines joined with newlines (possibly an empty string).
    """
    if not isinstance(text, str):
        return ""

    meaningful = []
    drop_next = False  # armed right after a bare "Telegram" line

    for raw_line in _strip_markdown_links(text).splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        # The line following "Telegram" is usually the channel name - drop it.
        if drop_next:
            drop_next = False
            continue

        candidate = _PUA_RE.sub("", stripped).strip()

        if candidate == "Telegram":
            drop_next = True
            continue

        if _should_drop_line(candidate):
            continue

        # Blank out URLs embedded inside the line.
        candidate = re.sub(r"http\S+|www\.\S+", " ", candidate).strip()

        # Strip markdown emphasis markers, then collapse whitespace.
        candidate = _normalize_ws(candidate.replace("**", "").replace("__", ""))

        if candidate:
            meaningful.append(candidate)

    return "\n".join(meaningful).strip()

def make_title_from_semantic(semantic_text: str, max_len: int = 180) -> str:
    """Return the first non-blank line of ``semantic_text``, cut to ``max_len`` chars.

    Lines are whitespace-normalized before the blank check; ``""`` is returned
    when no line has content.
    """
    for raw_line in semantic_text.splitlines():
        normalized = _normalize_ws(raw_line)
        if normalized:
            return normalized[:max_len]
    return ""

# Phrases that mark a digest/round-up post. The Cyrillic was restored from
# mis-decoded text (UTF-8 read as Mac Roman) which could never match real input.
_DIGEST_RE = re.compile(
    r"(главные новости|к этому часу|сводка|дайджест|главное за день)",
    flags=re.IGNORECASE,
)


def is_digest_text(semantic_text: str) -> bool:
    """Return True when the text contains one of the digest marker phrases.

    The search is case-insensitive and matches anywhere in ``semantic_text``.
    """
    return _DIGEST_RE.search(semantic_text) is not None


# 2) Date formatting

def _infer_base_date_from_texts(df: pd.DataFrame) -> datetime:
    """Pick the latest timestamp found in the ``text`` column as the base date.

    Each text is searched with ``_SCRAPE_TS_RE`` (defined elsewhere in the
    file); groups 1 and 2 of a match are joined with a space and parsed by
    pandas. Unparseable matches are ignored.

    Returns:
        The maximum parsed timestamp, or ``datetime.now()`` when none is found.
    """
    stamps = []
    for raw_text in df["text"].fillna("").tolist():
        hit = _SCRAPE_TS_RE.search(raw_text)
        if hit is None:
            continue
        parsed = pd.to_datetime(f"{hit.group(1)} {hit.group(2)}", errors="coerce")
        if pd.isna(parsed):
            continue
        stamps.append(parsed.to_pydatetime())

    if not stamps:
        return datetime.now()
    return max(stamps)

def parse_telegram_date(raw: str, base_date: datetime) -> pd.Timestamp:
    """Build a full timestamp from a raw "day + HH:MM" string.

    The month and year are taken from ``base_date``. If the resulting date
    does not exist in that month (e.g. day 31 in a 30-day month) or lies
    "in the future" relative to ``base_date``, the previous month is used.

    Args:
        raw: Raw date text. Everything except digits, colons and whitespace
            is discarded; the last 1-2 digit token is the day and the last
            ``H:MM``/``HH:MM`` token is the time.
        base_date: Reference moment that supplies the year and month.

    Returns:
        The parsed timestamp, or ``pd.NaT`` when ``raw`` is not a non-blank
        string, lacks a day or a time, or no valid date can be built.
    """
    if not isinstance(raw, str) or not raw.strip():
        return pd.NaT

    def _at(year: int, month: int, day: int, hour: int, minute: int):
        # Building from integers avoids string-format pitfalls (unpadded hours)
        # and turns an impossible calendar date into NaT instead of raising.
        try:
            return pd.Timestamp(year=year, month=month, day=day, hour=hour, minute=minute)
        except ValueError:
            return pd.NaT

    try:
        # _PUA_RE is defined elsewhere in the file; its matches become spaces.
        cleaned = _PUA_RE.sub(" ", raw)
        cleaned = re.sub(r"[^\d:\s]", " ", cleaned)
        cleaned = _normalize_ws(cleaned)

        day = hour = minute = None
        for part in cleaned.split():
            if part.isdigit() and len(part) <= 2:
                day = int(part)
            elif re.fullmatch(r"\d{1,2}:\d{2}", part):
                hh, mm = part.split(":")
                hour, minute = int(hh), int(mm)

        if day is None or hour is None:
            return pd.NaT

        candidate = _at(base_date.year, base_date.month, day, hour, minute)

        # Previously an invalid day for the base month returned NaT outright,
        # losing e.g. "31 14:00" scraped on the 2nd of a shorter month.
        if pd.isna(candidate) or candidate.to_pydatetime() > base_date:
            prev = pd.Timestamp(base_date.year, base_date.month, 1) - pd.Timedelta(days=1)
            candidate = _at(prev.year, prev.month, day, hour, minute)

        return candidate
    except (ValueError, TypeError, AttributeError, OverflowError):
        # Best effort: malformed input or an unusable base_date yields NaT.
        return pd.NaT



# 3) DataFrame preparation

def preprocess_data(df: pd.DataFrame, min_semantic_chars: int = 40) -> pd.DataFrame:
    """Clean the raw messages and derive the columns used for clustering.

    Works on a copy of ``df`` and adds ``semantic_text``, ``title``,
    ``is_digest``, ``is_short`` and ``full_text``; ``published_at`` is replaced
    by parsed timestamps. Rows with empty semantic text are removed (with a
    printed notice), then rows whose date could not be parsed are removed.

    Args:
        df: Frame with at least the ``text`` and ``published_at`` columns.
        min_semantic_chars: Texts shorter than this are flagged ``is_short``.

    Returns:
        The cleaned copy; the original index labels of kept rows are preserved.
    """
    df = df.copy()

    # Dates: resolved relative to a base date inferred from the texts.
    base_date = _infer_base_date_from_texts(df)
    df["published_at"] = df["published_at"].apply(lambda x: parse_telegram_date(x, base_date))

    # Text
    df["semantic_text"] = df["text"].apply(extract_semantic_text)
    df["title"] = df["semantic_text"].apply(make_title_from_semantic)

    # Flags
    text_len = df["semantic_text"].fillna("").str.len()
    df["is_digest"] = df["semantic_text"].apply(is_digest_text)
    df["is_short"] = text_len < min_semantic_chars

    # Drop records with no meaningful content and report how many were removed.
    before = len(df)
    df = df[text_len > 0].copy()
    after = len(df)
    if after < before:
        # Message restored from mis-decoded text (it was printed as mojibake).
        print(f"Внимание: удалено {before-after} записей без смыслового текста (служебные/пустые).")

    df.dropna(subset=["published_at"], inplace=True)

    df["full_text"] = df["semantic_text"]
    return df



# 4) Embeddings

def embed_texts(texts: list[str], model: SentenceTransformer, batch_size: int = 32) -> np.ndarray:
    """Encode ``texts`` with ``model`` and return the vectors as float32.

    The encoder is asked for NumPy output with normalized embeddings and shows
    a progress bar while running.
    """
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_numpy": True,
        "normalize_embeddings": True,
    }
    vectors = model.encode(texts, **encode_options)
    return vectors.astype(np.float32)



# 5) Clustering (first pass)

def _fit_agglom_cosine(embeddings: np.ndarray, distance_threshold: float, linkage: str = "complete") -> np.ndarray:
    try:
        model = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            linkage=linkage,
            metric="cosine",
        )
    except TypeError:
        model = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            linkage=linkage,
            affinity="cosine",
        )
    return model.fit_predict(embeddings)

def cluster_first_pass(
    df: pd.DataFrame,
    embeddings_all: np.ndarray,
    dist_thr_main: float = 0.16,
) -> pd.DataFrame:
    """
    First pass: cluster ONLY the event news.
    Digest news get their own cluster_id values later.
    Embeddings are computed once for all rows and passed in as embeddings_all.
    """
    result = df.copy()
    result["cluster_id"] = -1

    # "Real" news: rows that are neither short nor a digest.
    is_event = ~(result["is_short"] | result["is_digest"])
    if is_event.any():
        event_vectors = embeddings_all[is_event.to_numpy()]
        event_labels = _fit_agglom_cosine(
            event_vectors,
            distance_threshold=dist_thr_main,
            linkage="complete",
        )
        result.loc[is_event, "cluster_id"] = event_labels

    # Everything else (digest and short) stays at -1 for now.
    result["cluster_id"] = result["cluster_id"].astype(int)
    return result

def assign_non_event_clusters(df: pd.DataFrame) -> pd.DataFrame:
    """
    Digest and short news are not treated as "events".
    short: every record becomes its own cluster
    digest: grouped by (normalized title, publication time floored to the hour)
    so that identical "main news of the day" posts are not split apart.

    New ids start right after the largest existing ``cluster_id`` (or at 0 when
    there is none >= 0). Returns a modified copy; ``df`` is left untouched.
    """
    df = df.copy()

    # Starting offset for the new clusters (max() is NaN for an empty frame,
    # and NaN >= 0 is False, so that case also starts at 0).
    current_max = df["cluster_id"].max()
    offset = int(current_max) + 1 if current_max >= 0 else 0

    # short: one unique cluster per row
    mask_s = df["is_short"]
    if mask_s.any():
        n = int(mask_s.sum())
        df.loc[mask_s, "cluster_id"] = np.arange(offset, offset + n)
        offset += n

    # digest: rule based on title + publication time
    mask_d = df["is_digest"] & (~df["is_short"])
    if mask_d.any():
        def norm_title(t: str) -> str:
            # Lower-case, collapse whitespace, keep the first 80 characters.
            t = (t or "").lower()
            t = re.sub(r"\s+", " ", t).strip()
            return t[:80]

        tmp = df.loc[mask_d, ["title", "published_at"]].copy()
        tmp["digest_key"] = tmp["title"].astype(str).apply(norm_title)
        # Lower-case "h": the upper-case "H" alias is deprecated in recent pandas.
        tmp["hour_bucket"] = pd.to_datetime(tmp["published_at"]).dt.floor("h")

        keys = tmp["digest_key"].astype(str) + "||" + tmp["hour_bucket"].astype(str)
        codes, _ = pd.factorize(keys, sort=True)

        df.loc[mask_d, "cluster_id"] = codes + offset

    df["cluster_id"] = df["cluster_id"].astype(int)
    return df


# 6) Canonical news selection and summary

def select_canonical_and_summary(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Flag one canonical row per cluster and build a per-cluster summary.

    Rows are sorted by ``cluster_id`` then ``published_at``; the first row of
    each cluster in that order is the canonical one.

    Args:
        df: Frame with ``id``, ``title``, ``cluster_id`` and ``published_at``.

    Returns:
        A tuple ``(sorted_df, summary)``. ``sorted_df`` is the sorted copy with
        a boolean ``is_canonical`` column. ``summary`` has one row per cluster
        (ascending ``cluster_id``) with the columns ``cluster_id``, ``size``,
        ``canonical_news_id``, ``canonical_title`` (at most 220 characters)
        and ``news_ids`` (list of member ids in sorted order).
    """
    df = df.sort_values(["cluster_id", "published_at"], ascending=[True, True])

    # Mark positionally: label-based marking via .loc flagged extra rows
    # whenever the incoming index was not unique.
    df["is_canonical"] = ~df.duplicated(subset="cluster_id", keep="first").to_numpy()

    ids_by_cluster = df.groupby("cluster_id")["id"]
    sizes = ids_by_cluster.size()
    news_ids = ids_by_cluster.apply(list)

    # Exactly one canonical row per cluster; reindex pins the row order to the
    # cluster order of ``sizes`` instead of relying on implicit alignment.
    canon = df[df["is_canonical"].to_numpy()].set_index("cluster_id").reindex(sizes.index)

    summary = pd.DataFrame({
        "cluster_id": sizes.index.to_numpy(),
        "size": sizes.to_numpy(),
        "canonical_news_id": canon["id"].to_numpy(),
        "canonical_title": canon["title"].fillna("").astype(str).str.slice(0, 220).to_numpy(),
        "news_ids": news_ids.to_numpy(),
    })

    return df, summary



# 7) Second merge pass (cluster level)

def merge_clusters_second_pass_centroid(
    df: pd.DataFrame,
    embeddings_all: np.ndarray,
    sim_threshold: float = 0.91,
    top_k: int = 10,
    time_window_hours: int = 96,
) -> pd.DataFrame:
    """
    Second pass: merge clusters by the mean embedding (centroid) of each cluster.
    Embeddings are not recomputed - embeddings_all is reused.
    Only event clusters (neither digest nor short) are merged.

    Two clusters are united when one is among the other's ``top_k`` nearest
    centroids, their cosine similarity is at least ``sim_threshold`` and their
    earliest ``published_at`` values differ by at most ``time_window_hours``.
    Unions are transitive. Event clusters are renumbered from 0 in order of
    their old id; non-event rows keep their ids.

    Args:
        df: Frame with ``cluster_id``, ``is_digest``, ``is_short`` and
            ``published_at``.
        embeddings_all: Array whose rows are positionally aligned with ``df``.
        sim_threshold: Minimum centroid cosine similarity for a merge.
        top_k: Number of nearest centroids examined per cluster.
        time_window_hours: Maximum gap between the clusters' earliest times.

    Returns:
        A copy of ``df`` with updated ``cluster_id``; returned unchanged when
        there is at most one event row or at most one event cluster.
    """
    df = df.copy()

    event_mask = (~df["is_digest"]) & (~df["is_short"])
    if event_mask.sum() <= 1:
        return df

    df_event = df[event_mask]
    emb_event = embeddings_all[event_mask.to_numpy()]

    cluster_ids = df_event["cluster_id"].to_numpy()
    uniq = np.unique(cluster_ids)

    if len(uniq) <= 1:
        return df

    # L2-normalized mean embedding per cluster, in the order of ``uniq``.
    centroids = []
    for cid in uniq:
        c = emb_event[cluster_ids == cid].mean(axis=0)
        c = c / (np.linalg.norm(c) + 1e-12)  # epsilon guards a zero vector
        centroids.append(c.astype(np.float32))
    centroids = np.vstack(centroids).astype(np.float32)
    uniq_list = [int(cid) for cid in uniq]

    # Earliest publication time per cluster, computed in a single grouped pass.
    first_seen = pd.to_datetime(df_event["published_at"]).groupby(cluster_ids).min()
    times = first_seen.reindex(uniq).to_numpy().astype("datetime64[ns]")

    K, D = centroids.shape
    k = min(top_k, K)
    index = hnswlib.Index(space="cosine", dim=D)
    index.init_index(max_elements=K, ef_construction=200, M=32)
    index.add_items(centroids, np.arange(K))
    # hnswlib requires the query-time ef to be at least k; a fixed 80 broke
    # that for larger top_k values.
    index.set_ef(max(80, k))

    nbrs, dists = index.knn_query(centroids, k=k)
    sims = 1.0 - dists  # cosine space returns distances (1 - similarity)
    window = np.timedelta64(time_window_hours, "h")

    # Merge through a disjoint-set union (path halving + union by rank).
    class DSU:
        def __init__(self, n):
            self.p = list(range(n))
            self.r = [0] * n

        def find(self, x):
            while self.p[x] != x:
                self.p[x] = self.p[self.p[x]]
                x = self.p[x]
            return x

        def union(self, a, b):
            ra, rb = self.find(a), self.find(b)
            if ra == rb:
                return
            if self.r[ra] < self.r[rb]:
                self.p[ra] = rb
            elif self.r[ra] > self.r[rb]:
                self.p[rb] = ra
            else:
                self.p[rb] = ra
                self.r[ra] += 1

    dsu = DSU(K)

    for i in range(K):
        for j, sim in zip(nbrs[i], sims[i]):
            j = int(j)
            if j == i:
                continue
            if sim < sim_threshold:
                continue
            if np.abs(times[i] - times[j]) > window:
                continue
            dsu.union(i, j)

    # Number the merged components 0..M-1 in order of first appearance.
    root2new = {}
    old2new = {}
    for i, old_id in enumerate(uniq_list):
        root = dsu.find(i)
        if root not in root2new:
            root2new[root] = len(root2new)
        old2new[old_id] = root2new[root]

    df.loc[event_mask, "cluster_id"] = df.loc[event_mask, "cluster_id"].map(old2new).astype(int)

    return df

def reindex_cluster_ids(df: pd.DataFrame) -> pd.DataFrame:
    """Renumber ``cluster_id`` densely as 0..K-1, keeping the ascending id order.

    Returns a copy; the input frame is not modified.
    """
    renumbered = df.copy()
    # ``inverse`` holds, for every row, the rank of its id among the sorted
    # distinct ids - exactly the dense new id.
    _, inverse = np.unique(renumbered["cluster_id"].to_numpy(), return_inverse=True)
    renumbered["cluster_id"] = inverse.astype(int)
    return renumbered



# 8) Pipeline

def run_pipeline(
    input_path: str,
    model_name: str = "ai-forever/sbert_large_nlu_ru",
    batch_size: int = 32,
    min_semantic_chars: int = 40,
    dist_thr_main: float = 0.16,
    merge_sim_threshold: float = 0.92,
    merge_time_window_hours: int = 96,
    merge_top_k: int = 25,
):
    """Run the whole deduplication pipeline on a JSON file of messages.

    Steps: load and validate -> preprocess -> embed -> first-pass clustering
    -> ids for short/digest rows -> centroid merge pass -> dense renumbering
    -> canonical selection and summary.

    Args:
        input_path: Path to a JSON file readable by ``pd.read_json`` with the
            columns ``id``, ``text``, ``source`` and ``published_at``.
        model_name: SentenceTransformer model name to load.
        batch_size: Encoding batch size.
        min_semantic_chars: Threshold below which a text is flagged short.
        dist_thr_main: Distance threshold of the first clustering pass.
        merge_sim_threshold: Centroid similarity required to merge clusters.
        merge_time_window_hours: Maximum time gap between merged clusters.
        merge_top_k: Nearest centroids examined per cluster in the merge pass.

    Returns:
        ``(final_df, clusters_summary_df)`` as produced by
        ``select_canonical_and_summary``.

    Raises:
        ValueError: If a required column is missing from the input.
    """
    # Load. pandas would otherwise try to auto-convert columns whose name ends
    # in "_at" to datetimes; parse_telegram_date needs the raw strings.
    df_raw = pd.read_json(input_path, convert_dates=False)
    required = {"id", "text", "source", "published_at"}
    missing = required - set(df_raw.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    df = preprocess_data(df_raw, min_semantic_chars=min_semantic_chars)
    # Message restored from mis-decoded text (it was printed as mojibake).
    print(f"Общее количество новостей после очистки: {len(df)}")

    model = SentenceTransformer(model_name)

    # Embeddings are computed once, row-aligned with ``df``, and reused below.
    embeddings_all = embed_texts(df["full_text"].tolist(), model, batch_size=batch_size)

    df = cluster_first_pass(
        df,
        embeddings_all=embeddings_all,
        dist_thr_main=dist_thr_main,
    )

    df = assign_non_event_clusters(df)

    df = merge_clusters_second_pass_centroid(
        df,
        embeddings_all=embeddings_all,
        sim_threshold=merge_sim_threshold,
        top_k=merge_top_k,
        time_window_hours=merge_time_window_hours,
    )

    df = reindex_cluster_ids(df)

    final_df, clusters_summary_df = select_canonical_and_summary(df)
    return final_df, clusters_summary_df



# 9) Run the pipeline, print a report and save the results

INPUT_PATH = "tg_messages (116).json"  # tg_messages (116).json - 116 records // news.json - 2000 records

final_df, clusters_summary_df = run_pipeline(
    input_path=INPUT_PATH,
    model_name="ai-forever/sbert_large_nlu_ru",
    batch_size=32,
    min_semantic_chars=40,
    dist_thr_main=0.16,
    merge_sim_threshold=0.92,
    merge_time_window_hours=96,
    merge_top_k=25,
)

# User-facing messages below were restored from mis-decoded (mojibake) text.
print("\nНайдено кластеров (итог):", final_df["cluster_id"].nunique())

print("\n--- Сводка по кластерам ---")
print(
    clusters_summary_df[["cluster_id", "size", "canonical_title"]]
      .sort_values("size", ascending=False)
)

print("\n--- Финальный результат (фрагмент) ---")
result_columns = ["id", "title", "source", "published_at", "cluster_id", "is_canonical"]
print(final_df[result_columns].sort_values("cluster_id").head(60))

final_df.to_csv("news_dedup_full.csv", index=False, encoding="utf-8")

clusters_summary_df.to_csv("clusters_summary.csv", index=False, encoding="utf-8")

# Compact JSON export: one {"group_id", "news_ids"} record per cluster.
clusters_list = (
    clusters_summary_df[["cluster_id", "news_ids"]]
      .rename(columns={"cluster_id": "group_id"})
      .to_dict("records")
)
with open("clusters_list.json", "w", encoding="utf-8") as f:
    json.dump(clusters_list, f, ensure_ascii=False, indent=2)

# List only the files that are actually written above (no .jsonl is produced).
print("\nSaved files: news_dedup_full.csv, clusters_summary.csv, clusters_list.json")


In [None]:
# Cluster consistency check: every cluster must have exactly one canonical row.
canonical_per_cluster = final_df.groupby("cluster_id")["is_canonical"].sum()
assert (canonical_per_cluster == 1).all()

# The summary must agree with final_df on each cluster's canonical id.
canonical_rows = final_df.loc[final_df["is_canonical"], ["cluster_id", "id"]]
canon_map = canonical_rows.set_index("cluster_id")["id"].to_dict()
sum_map = clusters_summary_df.set_index("cluster_id")["canonical_news_id"].to_dict()
bad = []
for cid, summary_id in sum_map.items():
    frame_id = canon_map.get(cid)
    if frame_id != summary_id:
        bad.append((cid, frame_id, summary_id))
print("Canonical mismatches:", bad[:10])


In [None]:
# Cluster visualization
# Reuse embeddings already present in the notebook namespace; re-encode the
# texts only when neither `embeddings_all` nor `embeddings` exists.
if "embeddings_all" in globals():
    emb_vis = embeddings_all
elif "embeddings" in globals():
    emb_vis = embeddings
else:
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer("ai-forever/sbert_large_nlu_ru")
    if "semantic_text" in final_df.columns:
        texts = final_df["semantic_text"].tolist()
    else:
        texts = final_df["text"].astype(str).tolist()
    emb_vis = model.encode(texts, batch_size=32, show_progress_bar=True, convert_to_numpy=True)


import matplotlib.pyplot as plt

X = np.asarray(emb_vis)

# 2-D projection: UMAP when it imports and runs, PCA as the fallback.
try:
    import umap
    reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine", random_state=42)
    X2 = reducer.fit_transform(X)
    method = "UMAP"
except Exception:
    from sklearn.decomposition import PCA
    X2 = PCA(n_components=2, random_state=42).fit_transform(X)
    method = "PCA(2D)"


# Figure 1: every point, coloured by its cluster id.
n_points = len(final_df)
n_clusters = final_df["cluster_id"].nunique()
plt.figure(figsize=(10, 8))
plt.scatter(X2[:, 0], X2[:, 1], s=12, c=final_df["cluster_id"].astype(int), alpha=0.8)
plt.title(f"Clusters visualization ({method}) | N={n_points} | K={n_clusters}")
plt.xlabel("dim-1")
plt.ylabel("dim-2")
plt.colorbar(label="cluster_id")
plt.show()


# Figure 2: faded background with the canonical points drawn large and outlined.
canon_mask = final_df["is_canonical"].astype(bool).values
plt.figure(figsize=(10, 8))
plt.scatter(X2[:, 0], X2[:, 1], s=10, c=final_df["cluster_id"].astype(int), alpha=0.25)
plt.scatter(
    X2[canon_mask, 0],
    X2[canon_mask, 1],
    s=120,
    edgecolors="black",
    linewidths=1.0,
    c=final_df.loc[canon_mask, "cluster_id"].astype(int),
)
plt.title(f"Canonical points highlighted ({method})")
plt.xlabel("dim-1")
plt.ylabel("dim-2")
plt.show()
