In [None]:
import math
import os
import warnings
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from dotenv import load_dotenv
from razdel import sentenize
from tqdm.auto import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    set_seed,
)

import src.utils.data as data_utils
import src.utils.io as io_utils
import src.utils.models as model_utils

In [None]:
load_dotenv()

warnings.filterwarnings("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2

EXTERNAL = Path(os.getenv("EXTERNAL_STORAGE_DIR"))
ROOT = io_utils.repo_root()
SPLIT_DIR = ROOT / "data/splits"
CONFIG_DIR = ROOT / "config"
METRIC_DIR = ROOT / "metrics"
RANDOM_STATE = 42
set_seed(RANDOM_STATE)

In [None]:
# Look up the validation-id file in the dataset config, then read it.
# header=None: the id file is read as having no header row.
dataset_ids_cfg = io_utils.load_yaml(CONFIG_DIR / "dataset.ids.yml")
VAL_IDS_PATH = dataset_ids_cfg["splits_ids"]["val_ids"]
val_ids = pd.read_csv(ROOT / VAL_IDS_PATH, header=None)

In [None]:
# Fetch the Gazeta corpus via `datasets.load_dataset` and materialise its
# validation split as a pandas DataFrame.
gazeta = load_dataset("IlyaGusev/gazeta")
raw_val = gazeta["validation"].to_pandas()

print(f"raw val shape: {raw_val.shape}")
raw_val.head()

In [None]:
# Keep only the rows whose index labels are listed in `val_ids`.
# `iloc[:, 0]` always yields a Series; `squeeze()` would collapse a one-row
# id file to a scalar, making `.loc` return a Series instead of a DataFrame.
selected_ids = val_ids.iloc[:, 0]

# Explicit copy so the per-column assignments below never write into a view
# of `raw_val`.
val = raw_val.loc[selected_ids, ["title", "text", "summary"]].copy()
for col in val.columns:
    # Each column is passed through the project's text-cleaning helper.
    val[col] = data_utils.clean(val[col])
val.head(2)

In [None]:
MODEL_CFG_PATH = CONFIG_DIR / "models.params.yml"

# Pick the config section that matches the available hardware.
model_cfg_key = "cuda_model" if torch.cuda.is_available() else "cpu_model"
model_cfg = io_utils.load_yaml(MODEL_CFG_PATH)[model_cfg_key]

model_cfg

In [None]:
# Unpack the runtime settings from the selected model config section.
device, model_id, n_eval, use_4bit, device_map = (
    model_cfg[key]
    for key in ("device", "model_id", "n_eval", "use_4bit", "device_map")
)

# float32 off-GPU; on CUDA prefer bfloat16 and fall back to float16.
if device != "cuda":
    torch_dtype = torch.float32
elif torch.cuda.is_bf16_supported():
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float16

# n_eval=None means "evaluate on everything"; otherwise draw a seeded sample
# capped at the number of available rows.
subset_val = (
    val
    if n_eval is None
    else val.sample(n=min(n_eval, val.shape[0]), random_state=RANDOM_STATE)
)

subset_val.head(2)

In [None]:
# Optional 4-bit (NF4, double-quantised) loading; stays None when disabled
# or when building the config fails.
quantization_config = None
if use_4bit:
    bnb_kwargs = {
        "load_in_4bit": True,
        "bnb_4bit_compute_dtype": torch_dtype,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
    }
    try:
        quantization_config = BitsAndBytesConfig(**bnb_kwargs)
    except Exception as err:
        # Best effort: report the problem and continue without quantisation.
        print("bitsandbytes не готов, продолжаем без 4-бит:", err)
        quantization_config = None

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)

# When a quantisation config is present, no explicit dtype is passed.
model_load_kwargs = {
    "torch_dtype": None if quantization_config else torch_dtype,
    "device_map": device_map,
    "quantization_config": quantization_config,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)

# Explicit move only for non-CUDA devices.
if device != "cuda":
    model.to(device)

In [None]:
# System instruction sent with every request (Russian: summarise the news
# text neutrally in 3-5 sentences without adding facts).
SYSTEM_PROMPT = " ".join(
    [
        "Ты помощник по резюмированию русскоязычных новостей.",
        "Сделай краткое, нейтральное резюме исходного текста (3–5 предложений).",
        "Не добавляй фактов, которых нет в тексте.",
    ]
)

# Greedy decoding (no sampling), capped at 160 newly generated tokens.
GEN_EVAL = GenerationConfig(do_sample=False, max_new_tokens=160)

# Prompt-length limit derived by the project helper from the tokenizer and
# the generation settings.
MAX_INPUT_TOKENS = model_utils.get_max_input_tokens(tokenizer, GEN_EVAL)


def build_chat(text: str):
    """Build the two-message chat (system + user) asking for a summary of ``text``.

    Args:
        text: Article body to embed in the user message.

    Returns:
        A list of two ``{"role", "content"}`` dicts: the system message with
        ``SYSTEM_PROMPT``, then the user message containing the article.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {
        "role": "user",
        "content": f"Задача: кратко резюмируй.\n\nТекст статьи:\n{text}",
    }
    return [system_turn, user_turn]


# Slack (in tokens) reserved when pre-truncating an article: decoding a cut
# token sequence and re-tokenizing it inside the chat template is not
# guaranteed to round-trip to exactly the same length.
_TRUNCATION_SLACK_TOKENS = 8


def _truncate_to_token_budget(text: str, budget: int) -> str:
    """Return ``text`` cut to at most ``budget`` tokens; unchanged when it fits."""
    token_ids = tokenizer(text, add_special_tokens=False)["input_ids"]
    if len(token_ids) <= budget:
        return text
    return tokenizer.decode(token_ids[:budget], skip_special_tokens=True)


def generate_batch(
    texts: list[str], batch_size: int = 1, show_progress: bool = True
) -> list[str]:
    """Generate one summary per input text with the notebook-level ``model``.

    Each text is wrapped by ``build_chat``, rendered with the tokenizer's chat
    template, and decoded with ``GEN_EVAL``. Only the newly generated tokens
    are returned, stripped of surrounding whitespace.

    Args:
        texts: Article bodies to summarize.
        batch_size: Number of prompts per ``model.generate`` call (>= 1).
        show_progress: Wrap the batch loop in a tqdm progress bar.

    Returns:
        Generated summaries, in the same order as ``texts``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    out = []
    model.eval()

    it = range(0, len(texts), batch_size)
    if show_progress:
        it = tqdm(
            it, total=math.ceil(len(texts) / batch_size), desc="Generating", leave=False
        )

    # Fall back to EOS as the padding token when the tokenizer defines none.
    if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token

    # Token budget left for the article once the template itself (system
    # prompt, role markers, generation prompt) is accounted for. Truncating
    # the article up front keeps the trailing generation prompt intact;
    # truncating the fully rendered prompt on the right would cut it off.
    text_budget = None
    if isinstance(MAX_INPUT_TOKENS, int):
        empty_prompt = tokenizer.apply_chat_template(
            build_chat(""), tokenize=False, add_generation_prompt=True
        )
        template_tokens = len(
            tokenizer(empty_prompt, add_special_tokens=False)["input_ids"]
        )
        text_budget = MAX_INPUT_TOKENS - template_tokens - _TRUNCATION_SLACK_TOKENS

    # Decoder-only models continue from the last position of each row, so
    # batched prompts must be padded on the left. The tokenizer's original
    # setting is restored afterwards.
    previous_padding_side = tokenizer.padding_side
    tokenizer.padding_side = "left"
    try:
        for i in it:
            chunk = texts[i : i + batch_size]
            if text_budget is not None and text_budget > 0:
                chunk = [_truncate_to_token_budget(t, text_budget) for t in chunk]

            # 1) chat template -> prompt strings
            prompts = [
                tokenizer.apply_chat_template(
                    build_chat(t), tokenize=False, add_generation_prompt=True
                )
                for t in chunk
            ]

            # 2) prompt strings -> tensors. The rendered template already
            # contains its special tokens, so none are added again here.
            # `truncation` stays on as a safety net for the length limit.
            inputs = tokenizer(
                prompts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=MAX_INPUT_TOKENS,
                add_special_tokens=False,
            ).to(device)

            with torch.no_grad():
                output_ids = model.generate(
                    **inputs,
                    generation_config=GEN_EVAL,
                    pad_token_id=tokenizer.pad_token_id,
                    eos_token_id=getattr(tokenizer, "eos_token_id", None),
                )

            # Keep only the continuation: drop the (left-padded) prompt columns.
            gen_ids = output_ids[:, inputs["input_ids"].shape[1] :]
            decoded = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
            out.extend(d.strip() for d in decoded)
    finally:
        tokenizer.padding_side = previous_padding_side

    return out


# Batches of six on CUDA; one prompt at a time on any other device.
BATCH = 6 if device == "cuda" else 1

# Generate a summary for every article in the evaluation subset and collect
# the matching reference summaries.
val_texts = subset_val["text"].tolist()
preds_llm = generate_batch(val_texts, batch_size=BATCH, show_progress=True)
refs_llm = subset_val["summary"].tolist()
len(preds_llm), len(refs_llm)

In [None]:
# Peek at the first two generated summaries.
preds_llm[:2]

In [None]:
# Peek at the first two reference summaries for comparison.
refs_llm[:2]

In [None]:
# Score the predictions against the references with the project's ROUGE helper.
# NOTE(review): the result is read below via `.get("rouge1", ...)`, so it is
# presumably a dict of ROUGE F1 values keyed by metric name — confirm in
# src.utils.data.
rouge_scores = data_utils.get_rouge_f1(preds_llm, refs_llm)

rouge_scores

In [None]:
Path(METRIC_DIR).mkdir(parents=True, exist_ok=True)

# Label the evaluated split: the full validation set or a sample of N rows.
split_name = "validation_full" if n_eval is None else f"validation_{len(subset_val)}"

# Single-row summary of this run.
df_metrics = pd.DataFrame(
    [
        {
            "system": f"llm_zero_shot_{model_id.split('/')[-1]}",
            "split": split_name,
            "rouge1": rouge_scores.get("rouge1", 0.0),
            "rouge2": rouge_scores.get("rouge2", 0.0),
            "rougeL": rouge_scores.get("rougeL", 0.0),
            "rougeLsum": rouge_scores.get("rougeLsum", 0.0),
            # Whitespace-separated word count per prediction; no tokenizer is
            # involved despite the column name.
            "avg_pred_len_tokens": float(np.mean([len(p.split()) for p in preds_llm])),
            "k": None,
            "n_examples": len(subset_val),
        }
    ]
)
df_metrics.to_csv(METRIC_DIR / f"llm_zero_shot_validation_{device}.csv", index=False)

# Qualitative examples: one row per example. Passing a list with a single
# dict of lists would instead create one row whose cells hold whole lists.
# The count is capped so all three columns always have the same length.
n_samples = min(3, len(subset_val), len(refs_llm), len(preds_llm))
if "title" in subset_val:
    sample_titles = subset_val["title"].head(n_samples).tolist()
else:
    sample_titles = [""] * n_samples

df_sampels = pd.DataFrame(
    {
        "title": sample_titles,
        "reference": refs_llm[:n_samples],
        "prediction": preds_llm[:n_samples],
    }
)
df_sampels.to_csv(
    METRIC_DIR / f"llm_zero_shot_examples_{device}.tsv", sep="\t", index=False
)