In [None]:
import math
import warnings
from pathlib import Path

import pandas as pd
import torch
import peft
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from tqdm.auto import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
    set_seed,
)
from trl import SFTConfig, SFTTrainer

import src.utils.data as data_utils
import src.utils.io as io_utils
import src.utils.models as model_utils

In [None]:
# Session-wide setup: warning filter, notebook magics, project paths and the seed.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical warnings
# coming from the training stack — consider narrowing the filter.
warnings.filterwarnings("ignore")
%matplotlib inline
# Reload edited project modules automatically so changes under src/ are picked up.
%load_ext autoreload
%autoreload 2

# Disabled: external storage location taken from the environment
# (re-enabling it also requires `import os`, which the import cell does not provide).
# EXTERNAL = Path(os.getenv("EXTERNAL_STORAGE_DIR"))
# All project directories are resolved relative to the value of io_utils.repo_root().
ROOT = io_utils.repo_root()
SPLIT_DIR = ROOT / "data/splits"
CONFIG_DIR = ROOT / "config"
METRIC_DIR = ROOT / "metrics"
RANDOM_STATE = 42

# Fix the seed once, before any sampling or model initialisation happens.
set_seed(RANDOM_STATE)

In [None]:
# Display the resolved repository root as a quick sanity check.
ROOT

In [None]:
# The "splits_ids" section of the YAML config holds the ROOT-relative locations
# of the id files for each split.
IDS_PATH = io_utils.load_yaml(CONFIG_DIR / "dataset.ids.yml")["splits_ids"]
TRAIN_IDS_PATH, VAL_IDS_PATH = IDS_PATH["train_ids"], IDS_PATH["val_ids"]

# header=None: the first line of each file is data, not a column header.
train_ids, val_ids = (
    pd.read_csv(ROOT / ids_file, header=None)
    for ids_file in (TRAIN_IDS_PATH, VAL_IDS_PATH)
)

In [None]:
# Resolve the dataset once and take both splits from the same DatasetDict,
# instead of calling load_dataset() separately for each split.
gazeta = load_dataset("IlyaGusev/gazeta")
raw_train = gazeta["train"].to_pandas()
raw_val = gazeta["validation"].to_pandas()

print("raw train shape:", raw_train.shape, "raw val shape:", raw_val.shape)
raw_val.head()

In [None]:
# Keep only the article body and its reference summary for the selected rows.
columns = ["text", "summary"]

# squeeze("columns") turns the single-column id frame into a Series even when the
# file has exactly one row; a bare squeeze() would collapse that case to a scalar
# and `.loc` would then return a Series instead of a DataFrame.
train_idx = train_ids.squeeze("columns")
val_idx = val_ids.squeeze("columns")

# Explicit copies: the cleaned columns are written into these frames only.
train = raw_train.loc[train_idx, columns].copy()
val = raw_val.loc[val_idx, columns].copy()

for col in columns:
    train[col] = data_utils.clean(train[col])
    val[col] = data_utils.clean(val[col])
val.head(2)

In [None]:
# Pick the CUDA or CPU section of the model config depending on what is available.
MODEL_CFG_PATH = CONFIG_DIR / "models.params.yml"
cfg_section = "cuda_model" if torch.cuda.is_available() else "cpu_model"
model_cfg = io_utils.load_yaml(MODEL_CFG_PATH)[cfg_section]

model_cfg

In [None]:
# Unpack the run parameters from the selected config section.
device, model_id = model_cfg["device"], model_cfg["model_id"]
n_eval, n_train = model_cfg["n_eval"], model_cfg["n_train"]
use_4bit, device_map = model_cfg["use_4bit"], model_cfg["device_map"]

# Compute dtype: float32 off-GPU, otherwise bfloat16 when supported, else float16.
if device != "cuda":
    torch_dtype = torch.float32
elif torch.cuda.is_bf16_supported():
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float16

# Seeded random subsets, capped at the number of rows actually available.
val_size = min(n_eval, val.shape[0])
subset_val = val.sample(n=val_size, random_state=RANDOM_STATE).reset_index(drop=True)

train_size = min(n_train, train.shape[0])
subset_train = train.sample(n=train_size, random_state=RANDOM_STATE).reset_index(
    drop=True
)

subset_val.head(2)

In [None]:
# Optional 4-bit NF4 quantization settings; stays None when disabled or when
# building the config fails.
quantization_config = None
if use_4bit:
    bnb_kwargs = dict(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch_dtype,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    try:
        quantization_config = BitsAndBytesConfig(**bnb_kwargs)
    except Exception as err:
        # Best effort: report the problem and continue without quantization.
        print("bitsandbytes не готов, продолжаем без 4-бит:", err)

In [None]:
# Tokenizer first: pad and truncate on the left so every prompt ends at the last
# position of the batch (generate_batch slices new tokens off by prompt length).
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
tokenizer.padding_side = "left"
tokenizer.truncation_side = "left"

# Fall back to EOS as the padding token when the tokenizer defines none.
if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
    tokenizer.pad_token = tokenizer.eos_token

# With a quantization config the dtype is left unset; otherwise use torch_dtype.
load_dtype = None if quantization_config else torch_dtype
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=load_dtype,
    device_map=device_map,
    quantization_config=quantization_config,
)

# Keep the model's padding id in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
gen_cfg = getattr(model, "generation_config", None)
if gen_cfg is not None:
    gen_cfg.pad_token_id = tokenizer.pad_token_id

# Only non-CUDA devices are moved explicitly.
if device != "cuda":
    model.to(device)

In [None]:
# System instruction shared by the training examples and the inference prompts.
SYSTEM_PROMPT = " ".join(
    (
        "Ты помощник по резюмированию русскоязычных новостей.",
        "Сделай краткое, нейтральное резюме исходного текста (3–5 предложений).",
        "Не добавляй фактов, которых нет в тексте.",
    )
)


def build_messages(row):
    """Turn one article/summary row into a three-turn chat training example.

    Args:
        row: mapping with a "text" entry (article body) and a "summary" entry
            (reference summary).

    Returns:
        A dict with a single "messages" key holding the system, user and
        assistant turns, each as a {"role", "content"} dict.
    """
    user_content = f"Задача: кратко резюмируй.\n\nТекст статьи:\n{row['text']}"
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", user_content),
        ("assistant", row["summary"]),
    )
    return {"messages": [{"role": role, "content": content} for role, content in turns]}

In [None]:
def _to_chat_dataset(frame):
    """Convert a text/summary DataFrame into a Dataset of chat "messages" records.

    The original DataFrame columns are removed after mapping, so only the output
    of build_messages remains.
    """
    return Dataset.from_pandas(frame).map(
        build_messages, remove_columns=frame.columns.to_list()
    )


train_ds = _to_chat_dataset(subset_train)
val_ds = _to_chat_dataset(subset_val)

train_ds[0]

In [None]:
import torch.nn as nn

# Module classes that count as "linear": plain nn.Linear plus the bitsandbytes
# quantized variants when that package can be imported.
try:
    import bitsandbytes as bnb

    LinearLike = (nn.Linear, bnb.nn.Linear4bit, bnb.nn.Linear8bitLt)
except Exception:
    LinearLike = (nn.Linear,)

# Last path component of every linear-like module name: these are the candidate
# values for LoRA `target_modules`.
suffixes = sorted(
    {
        module_name.rpartition(".")[2]
        for module_name, module in model.named_modules()
        if isinstance(module, LinearLike)
    }
)
print("Linear suffixes:", suffixes)
# example output: ['W_pack', 'o_proj', 'w1', 'w2', 'w3']  # e.g. for Qwen-1.5

In [None]:
# LoRA adapters on the attention and MLP projections listed below; lm_head is
# additionally registered via modules_to_save.
peft_cfg = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['down_proj', 'gate_proj', 'k_proj', 'o_proj', 'q_proj', 'up_proj', 'v_proj'],
    modules_to_save=["lm_head"],
)

# Directory for trainer checkpoints and the final adapter.
CHECK_PATH = str(ROOT / "src/checkpoints")

# Make the cell safe to re-run: without this guard a second execution would run
# k-bit preparation on the already wrapped model and stack another PEFT wrapper
# on top of the first one.
if isinstance(model, peft.PeftModel):
    print("model is already a PeftModel; skipping k-bit preparation and LoRA wrapping")
else:
    if getattr(model, "is_loaded_in_4bit", False) or getattr(
        model, "is_loaded_in_8bit", False
    ):
        # Quantized base model: prepare it for training before adding adapters.
        model = prepare_model_for_kbit_training(
            model,
            use_gradient_checkpointing=True,
            gradient_checkpointing_kwargs={"use_reentrant": False},
        )
    model = get_peft_model(model, peft_cfg)

In [None]:
# 1) Did get_peft_model actually wrap the model?
print("is PeftModel:", isinstance(model, peft.PeftModel))

# 2) Are there any modules carrying lora_A (i.e. LoRA-injected layers)?
wrapped = [
    module_name
    for module_name, module in model.named_modules()
    if hasattr(module, "lora_A")
]
print("LoRA-wrapped modules (first 20):", wrapped[:20])

# 3) Did lm_head become trainable (because of modules_to_save)?
head_weight = getattr(getattr(model, "lm_head", None), "weight", None)
head_trainable = head_weight is not None and model.lm_head.weight.requires_grad
print("lm_head requires_grad:", head_trainable)

# 4) Summary, plus a hard stop if nothing would be trained.
model.print_trainable_parameters()
assert any(p.requires_grad for p in model.parameters()), "No trainable parameters!"

In [None]:
# bf16 autocast and TF32 matmuls are hardware features: the training arguments
# reject bf16=True / tf32=True on GPUs that lack them (e.g. pre-Ampere cards such
# as the T4), so each flag is enabled only when the current device supports it.
# NOTE(review): on CUDA devices without bf16 no mixed precision is requested here;
# enabling fp16 is left as a deliberate follow-up decision.
on_cuda = device == "cuda" and torch.cuda.is_available()
use_bf16 = on_cuda and torch.cuda.is_bf16_supported()
use_tf32 = on_cuda and torch.cuda.get_device_capability()[0] >= 8

sft_cfg = SFTConfig(
    output_dir=CHECK_PATH,
    num_train_epochs=1,
    # One sequence per step, accumulated over 16 steps per optimizer update.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    logging_steps=20,
    # Evaluate and checkpoint on the same schedule; keep only the last two checkpoints.
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    report_to=[],
    packing=False,
    max_length=2048,
    bf16=use_bf16,
    tf32=use_tf32,
    # 8-bit optimizer alongside the 4-bit base model; fused AdamW on plain CUDA.
    optim=(
        "adamw_bnb_8bit"
        if use_4bit
        else ("adamw_torch_fused" if device == "cuda" else "adamw_torch")
    ),
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    group_by_length=True,
    eos_token=tokenizer.eos_token,
    dataset_text_field=None,
)

# `model` was already wrapped with get_peft_model(model, peft_cfg) earlier in the
# notebook, so the LoRA config is not handed to the trainer a second time:
# passing an existing PeftModel together with `peft_config` is redundant and its
# handling depends on the installed TRL version.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    args=sft_cfg,
)

# Deterministic (non-sampling) decoding with at most 200 new tokens per summary.
GEN_EVAL = GenerationConfig(do_sample=False, max_new_tokens=200)

# Prompt-length budget used when tokenizing inference inputs.
MAX_INPUT_TOKENS = model_utils.get_max_input_tokens(tokenizer, GEN_EVAL)

In [None]:
from peft import PeftModel

# Confirm the trainer still holds the PEFT-wrapped model and show what it trains.
print("trainer.model is PeftModel:", isinstance(trainer.model, PeftModel))
active = getattr(trainer.model, "active_adapters", None)
print("active adapters:", active)
trainer.model.print_trainable_parameters()

In [None]:
# Names of the modules inside the trainer's model that carry LoRA weights.
wrapped = [
    module_name
    for module_name, module in trainer.model.named_modules()
    if hasattr(module, "lora_A")
]
print(wrapped[:20])

In [None]:
# The KV cache is switched off for training (gradient checkpointing is enabled in
# the trainer config); it is turned back on before inference further below.
if hasattr(trainer.model, "config"):
    trainer.model.config.use_cache = False
# Run fine-tuning and show the returned training summary as the cell output.
train_result = trainer.train()
train_result

In [None]:
# Persist the trained model via the trainer, then store the tokenizer next to it
# in the same output directory.
trainer.save_model()
tokenizer.save_pretrained(sft_cfg.output_dir)

In [None]:
def build_chat(text: str):
    """Render the inference prompt for one article as a chat-template string.

    The prompt contains the system and user turns only and ends with the
    generation prompt, so the model continues with the assistant's summary.

    Args:
        text: article body to summarise.

    Returns:
        The untokenized prompt string produced by the tokenizer's chat template.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {
        "role": "user",
        "content": f"Задача: кратко резюмируй.\n\nТекст статьи:\n{text}",
    }
    return tokenizer.apply_chat_template(
        [system_turn, user_turn], tokenize=False, add_generation_prompt=True
    )

In [None]:
def generate_batch(texts, batch_size=4, show_progress=True):
    """Generate one summary per input text with the notebook's current model.

    Uses the notebook globals `tokenizer`, `model`, `device`, `GEN_EVAL` and
    `MAX_INPUT_TOKENS`.

    Args:
        texts: sequence of article bodies.
        batch_size: number of prompts sent to `model.generate` at once.
        show_progress: wrap the batch loop in a tqdm progress bar.

    Returns:
        List of stripped summaries, in the same order as `texts`.
    """
    out = []
    starts = range(0, len(texts), batch_size)
    if show_progress:
        starts = tqdm(
            starts,
            total=math.ceil(len(texts) / batch_size),
            desc="Generating SFT infer",
            leave=False,
        )

    for start in starts:
        chunk = [build_chat(t) for t in texts[start : start + batch_size]]
        inputs = tokenizer(
            chunk,
            return_tensors="pt",
            padding=True,
            truncation=True,
            pad_to_multiple_of=8,
            max_length=MAX_INPUT_TOKENS,
            # The chat template has already rendered every special token the
            # prompt needs; letting the tokenizer add its own again would
            # duplicate e.g. BOS on tokenizers that prepend one.
            add_special_tokens=False,
        ).to(device)

        with torch.no_grad():
            output_ids = model.generate(**inputs, generation_config=GEN_EVAL)

        # Inputs are left-padded to a common length, so everything after that
        # length is newly generated text.
        prompt_len = inputs["input_ids"].shape[1]
        gen_ids = output_ids[:, prompt_len:]
        decoded = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
        out.extend(d.strip() for d in decoded)

    return out

In [None]:
import torch

def prepare_for_inference_with_aligned_head(model):
    """Switch `model` to inference mode and align the output head's dtype.

    Steps:
      1. Disable gradient checkpointing, enable the KV cache, call `eval()`.
      2. Pick a reference dtype from the final normalisation layer.
      3. Cast `lm_head` to that dtype.
      4. Cast every PEFT `ModulesToSaveWrapper` to that dtype.

    Args:
        model: the (possibly PEFT-wrapped) causal LM; modified in place.

    Returns:
        The dtype the head was aligned to.
    """
    # 1) inference mode: gradient checkpointing off, KV cache on
    if hasattr(model, "gradient_checkpointing_disable"):
        model.gradient_checkpointing_disable()
    if hasattr(model, "config"):
        model.config.use_cache = True
    model.eval()

    # 2) reference dtype: the final norm sits closest to the hidden states that
    #    are fed into the head
    target_dtype = None
    try:
        target_dtype = model.base_model.model.norm.weight.dtype
    except AttributeError:
        # Attribute path not present for this model/wrapper: search by name below.
        pass
    if target_dtype is None:
        norm_suffixes = ("norm.weight", "final_layernorm.weight", "ln_f.weight")
        # Per-layer norms (e.g. "...input_layernorm.weight") match these suffixes
        # too, so keep the LAST match instead of stopping at the first one.
        # Assumes the final norm is registered after the decoder layers.
        for name, param in model.named_parameters():
            if name.endswith(norm_suffixes):
                target_dtype = param.dtype
    if target_dtype is None:
        # Fallback: dtype of the first parameter.
        target_dtype = next(model.parameters()).dtype

    # 3) cast lm_head to the reference dtype
    if getattr(model, "lm_head", None) is not None:
        model.lm_head.to(target_dtype)

    # 4) cast every ModulesToSaveWrapper as well (import path as in peft 0.17)
    try:
        from peft.tuners.tuners_utils import ModulesToSaveWrapper
    except ImportError:
        # Empty tuple: isinstance() below then matches nothing.
        ModulesToSaveWrapper = ()

    for name, module in model.named_modules():
        if isinstance(module, ModulesToSaveWrapper):
            try:
                module.to(target_dtype)
            except Exception as err:
                # Still best effort, but no longer silent.
                print(f"could not cast {name} to {target_dtype}: {err}")

    return target_dtype

# Put the trained model into inference mode and report the dtype its head was aligned to.
target_dtype = prepare_for_inference_with_aligned_head(model)
print("Aligned target dtype:", target_dtype)

In [None]:
# Larger generation batches on GPU, one example at a time elsewhere.
BATCH = 6 if device == "cuda" else 1
texts, refs = (subset_val[column].tolist() for column in ("text", "summary"))
preds_sft = generate_batch(texts, batch_size=BATCH, show_progress=True)

In [None]:
# Peek at the first two generated summaries.
preds_sft[:2]

In [None]:
# The matching reference summaries, for a side-by-side read.
refs[:2]

In [None]:
# Score predictions against references; the result is read below via
# scores.get(...) with ROUGE, BERTScore and length keys.
scores = data_utils.get_all_scores(preds_sft, refs, device=device)
scores

In [None]:
Path(METRIC_DIR).mkdir(parents=True, exist_ok=True)

# Score columns written to the CSV, in output order; missing scores become 0.0.
score_keys = [
    "rouge1",
    "rouge2",
    "rougeL",
    "rougeLsum",
    "bertscore_precision",
    "bertscore_recall",
    "bertscore_f1",
    "avg_len_pred",
    "avg_len_ref",
    "len_ratio_pred_to_ref",
]

# NOTE(review): "SRT QLoRA" looks like a typo for "SFT QLoRA", and the split is
# labelled "validation_full" although a sample of the validation ids is scored —
# both labels are left unchanged in case other reports match on them.
metrics_row = {
    "system": "SRT QLoRA",
    "split": "validation_full",
    **{key: scores.get(key, 0.0) for key in score_keys},
    "k": None,
    # Number of examples actually scored; the evaluation subset is capped at the
    # size of the validation frame, so it can be smaller than the requested n_eval.
    "n_examples": len(preds_sft),
}
df_metrics = pd.DataFrame([metrics_row])
df_metrics.to_csv(
    METRIC_DIR / f"llm_qlora_validation_{device}_{n_eval}.csv", index=False
)

# Up to three example rows. Building the frame from a dict of equal-length
# columns yields one row per example; wrapping that dict in a list (as before)
# produced a single row whose cells held whole lists.
n_samples = min(3, len(refs), len(preds_sft))
if "title" in subset_val:
    sample_titles = subset_val["title"].head(n_samples).tolist()
else:
    # No title column in the evaluation frame: keep the column, leave it empty.
    sample_titles = [""] * n_samples

df_sampels = pd.DataFrame(
    {
        "title": sample_titles,
        "reference": refs[:n_samples],
        "prediction": preds_sft[:n_samples],
    }
)
df_sampels.to_csv(
    METRIC_DIR / f"llm_qlora_examples_{device}.tsv", sep="\t", index=False
)

In [None]:
# Colab bootstrap, step 1: show the GPU attached to the runtime and confirm that
# torch can see it.
# NOTE(review): this setup cell sits at the end of the notebook but has to run
# before the cells above on a fresh Colab runtime.
!nvidia-smi

import torch

print("torch:", torch.__version__, "| CUDA доступна:", torch.cuda.is_available())

# ----------------------------------------------------------------------------------

# Step 2: mount Google Drive (Colab only); force_remount refreshes an existing mount.
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

# ----------------------------------------------------------------------------------

import os

# Step 3: make sure the working folders exist on Drive.
BASE = "/content/drive/MyDrive/llm-news"
for sub in ("models", "metrics", "hf_cache"):
    target_dir = os.path.join(BASE, sub)
    os.makedirs(target_dir, exist_ok=True)

print("Созданы/проверены папки:", os.listdir(BASE))

# ----------------------------------------------------------------------------------

import subprocess
import sys

# Step 4: fetch the project sources into the Colab filesystem.
REPO_URL = "https://github.com/mdayssi/llm-news-summarizer-ru.git"
REPO_DIR = "/content/llm-news"

# Clone only when the checkout is missing; an existing checkout is left as is.
if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL} {REPO_DIR}
else:
    print("Репозиторий уже есть:", REPO_DIR)


# Work from the repository root and print the short hash of the checked-out commit.
%cd {REPO_DIR}
!git rev-parse --short HEAD


# ----------------------------------------------------------------------------------

from pathlib import Path

# Step 5: write the .env file that points the project at the Drive folders.
# NOTE(review): EXTERNAL_METRICS_DIR targets "metrics_big", while the folder
# created on Drive earlier in this cell is "metrics" — confirm this is intended.
env_path = Path(REPO_DIR) / ".env"
kv = {
    "EXTERNAL_MODELS_DIR": "/content/drive/MyDrive/llm-news/models",
    "EXTERNAL_METRICS_DIR": "/content/drive/MyDrive/llm-news/metrics_big",
    "EXTERNAL_CACHE_DIR": "/content/drive/MyDrive/llm-news/hf_cache",
}
# One KEY=value line per entry, each terminated by a newline.
text = "".join(f"{key}={value}\n" for key, value in kv.items())
env_path.write_text(text, encoding="utf-8")

print(".env создано:")
print(env_path.read_text())


# ----------------------------------------------------------------------------------
# Step 6: install/upgrade the Python dependencies into the kernel's environment.
# NOTE(review): no versions are pinned, so a re-run may pull different releases.
%pip -q install --upgrade \
  evaluate rouge-score bert_score\
  razdel bitsandbytes accelerate\
  python-dotenv pyyaml peft trl

# Import everything once as a smoke test that the environment is usable.
# sentencepiece is imported here but is not in the install list above —
# presumably it is preinstalled in the runtime; verify.
import accelerate
import bert_score
import bitsandbytes
import datasets
import dotenv
import evaluate
import razdel
import rouge_score
import sentencepiece
import torch
import tqdm
import transformers
import yaml

# Record the versions of the core libraries for this run.
print("torch:", torch.__version__, "| cuda avail:", torch.cuda.is_available())
print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("evaluate:", evaluate.__version__)

# ----------------------------------------------------------------------------------
import sys

# Step 7: make the project importable.
# The notebook imports `src.utils.*`, which resolves from the repository ROOT
# (the parent of src/), so the root is registered explicitly instead of relying
# on the current working directory. The src/ entry is kept for compatibility.
repo_src = "/content/llm-news/src"
repo_root = "/content/llm-news"
for import_dir in (repo_src, repo_root):
    if import_dir not in sys.path:
        sys.path.insert(0, import_dir)
print("sys.path ok")

In [None]:
!cp /content/llm-news/src/checkpoints /content/drive/MyDrive/llm-news/cache