In [None]:
"""
Fine-tune **ruGPT-3-small** (≈ 1.3 B, GPT-2-style) с LoRA
на MacBook (Apple M-series, 16 GB unified memory, MPS).

— контекст 1024 токенов;
— без bitsandbytes (MPS его не поддерживает);
— минимальный batch (1) + градиент-аккумуляция, чтобы не вылетать по памяти.
"""

In [2]:
import os, gc, torch
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"   # <- глушим fp16/bf16
gc.collect(); torch.mps.empty_cache()

In [4]:
# ============================================================
# 0. УСТАНОВКА
# ============================================================
# Внутри venv / conda с PyTorch ≥ 2.2 и поддержкой mps_device.
!pip install -U transformers datasets peft accelerate evaluate sentencepiece



In [6]:
# ============================================================
# 1. ЧТЕНИЕ КОРПУСА (CSV/TSV/JSONL) → src/tgt
# ============================================================
import pandas as pd, numpy as np, torch, textwrap, random, os
from datasets import Dataset
from sklearn.model_selection import train_test_split

DATA_FILE = "finetuning_text_pairs_3_clean_v2.csv"
FILE_TYPE = "csv"          # csv | tsv | jsonl
COL_SRC   = "finetuning_text_pairs_3"
COL_TGT   = "Unnamed: 1"
VAL_SPLIT = 0.1
SEED      = 42

if FILE_TYPE == "tsv":
    df = pd.read_csv(DATA_FILE, sep="\t")
elif FILE_TYPE == "csv":
    df = pd.read_csv(DATA_FILE)
elif FILE_TYPE == "jsonl":
    df = pd.read_json(DATA_FILE, lines=True)
else:
    raise ValueError("unknown FILE_TYPE")

df = df.dropna(subset=[COL_SRC, COL_TGT]).rename(
    columns={COL_SRC: "src", COL_TGT: "tgt"}
)
COL_SRC = "src"; COL_TGT = "tgt"

train_df, val_df = train_test_split(df, test_size=VAL_SPLIT, random_state=SEED)
train_ds, val_ds = map(Dataset.from_pandas, (train_df, val_df))

In [8]:
# ============================================================
# 2. МОДЕЛЬ + LoRA (без квантов, всё fp16 на MPS)
# ============================================================
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

MODEL_ID = "sberbank-ai/rugpt3small_based_on_gpt2"

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
tok.pad_token = tok.eos_token        # GPT-2 нет pad, используем eos

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
)
base_model.to(device)

lora_cfg = LoraConfig(
    r=8, lora_alpha=16,
    target_modules=["c_attn", "c_proj"],  # основные линейные в GPT-2-блоке
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(base_model, lora_cfg)
model.print_trainable_parameters()

trainable params: 811,008 || all params: 126,042,624 || trainable%: 0.6434




In [10]:
# ============================================================
# 3. ТОКЕНИЗАЦИЯ (контекст 1024, -100 на prompt)
# ============================================================
MAXLEN  = 1024
PROMPT  = "Упрости текст, сохраняя смысл:\n\n{src}\n\nУпрощённая версия:"

def tokenize(batch):
    input_ids, labels, attention = [], [], []
    for src, tgt in zip(batch[COL_SRC], batch[COL_TGT]):
        prompt_ids  = tok(PROMPT.format(src=src), add_special_tokens=False).input_ids
        target_ids  = tok(tgt, add_special_tokens=False).input_ids + [tok.eos_token_id]
        ids   = (prompt_ids + target_ids)[:MAXLEN]
        label = ([-100]*len(prompt_ids) + target_ids)[:MAXLEN]
        input_ids.append(ids)
        labels.append(label)
        attention.append([1]*len(ids))
    return {"input_ids":input_ids, "labels":labels, "attention_mask":attention}

train_ds = train_ds.map(tokenize, batched=True, remove_columns=train_ds.column_names)
val_ds   = val_ds.map(tokenize, batched=True, remove_columns=val_ds.column_names)

Map:   0%|          | 0/2576 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2499 > 2048). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/287 [00:00<?, ? examples/s]

In [14]:
# ============================================================
# 4. ТРЕНИРОВКА (batch 1, GA 32 → effective BS 32)
# ============================================================
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import DataCollatorForSeq2Seq

args = TrainingArguments(
    output_dir="rugpt3_lora",
    num_train_epochs=8,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    fp16=False,     # ← выключаем
    bf16=False,     # ← и это тоже
    dataloader_pin_memory=False,   # MPS не любит pin_memory
    report_to="none",
)

collator = DataCollatorForSeq2Seq(
    tok,
    model=model,
    pad_to_multiple_of=8,   # аккуратный padding
    label_pad_token_id=-100
)
trainer  = Trainer(model=model, args=args,
                   train_dataset=train_ds, eval_dataset=val_ds,
                   data_collator=collator)

trainer.train()
model.save_pretrained("rugpt3_lora_adapter")
tok.save_pretrained("rugpt3_lora_adapter")

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,1.7555,
2,1.5864,
3,1.5224,
4,1.4534,
5,1.4241,
6,1.4172,
7,1.3862,


('rugpt3_lora_adapter/tokenizer_config.json',
 'rugpt3_lora_adapter/special_tokens_map.json',
 'rugpt3_lora_adapter/vocab.json',
 'rugpt3_lora_adapter/merges.txt',
 'rugpt3_lora_adapter/added_tokens.json',
 'rugpt3_lora_adapter/tokenizer.json')

In [18]:
!pip install rouge_score

python(34381) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
[33m  DEPRECATION: Building 'rouge_score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge_score'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=cb05602d865717c78b94c170a08b79cc8ee840812be03d85c60b66a1ec57a33a
  Stored in directory: /Users/aleksandraskorodumova/Library/Caches/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e

In [22]:
!pip install sacrebleu

python(34407) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [sacrebleu]
[1A[2KSuccessfully installed portalocker-3.1.1 sacrebleu-2.5.1


In [24]:
# ============================================================
# 5. БЫСТРАЯ ПРОВЕРКА (ROUGE-L + chrF)
# ============================================================
import evaluate, textwrap
rouge = evaluate.load("rouge")
chrf  = evaluate.load("chrf")

@torch.inference_mode()
def simplify(text, temp=0.3, top_p=0.9, max_new=256):
    prompt = PROMPT.format(src=text)
    ids = tok(prompt, return_tensors="pt").to(device)
    out = model.generate(**ids, do_sample=True, temperature=temp,
                         top_p=top_p, max_new_tokens=max_new,
                         eos_token_id=tok.eos_token_id)
    return tok.decode(out[0], skip_special_tokens=True)\
             .split("Упрощённая версия:")[-1].strip()

sample = val_df.sample(10, random_state=SEED)
preds  = [simplify(t) for t in sample[COL_SRC]]

for i, (src, ref, pr) in enumerate(zip(sample[COL_SRC], sample[COL_TGT], preds), 1):
    print(f"\n--- #{i}\nSRC: {textwrap.shorten(src, 90)}"
          f"\nREF: {textwrap.shorten(ref, 90)}"
          f"\nPRD: {textwrap.shorten(pr,  90)}")

print("\nROUGE-L:", rouge.compute(predictions=preds, references=sample[COL_TGT])["rougeL"])
print("chrF:",     chrf.compute(predictions=preds, references=sample[COL_TGT]))



--- #1
SRC: Антивирусное программное обеспечение способно обнаружить и уничтожить большинство [...]
REF: Для защиты от вредоносных программ на вашем компьютере предустановлен антивирус [...]
PRD: ### Антивирусное программное обеспечение: как защитить от вирусов #### Инструкция по [...]

--- #2
SRC: НАСТРОЙКИ ДЛЯ ОТДЕЛЬНОГО ПРИЛОЖЕНИЯ Можно настроить инструменты для использования в [...]
REF: Настройки для отдельного приложения Вы можете настроить инструменты для конкретного [...]
PRD: ### Установка инструментов для работы с приложениями #### Как настроить инструмент [...]

--- #3
SRC: Масса нетто кг 34 3. ПОДГОТОВКА К ЭКСПЛУАТАЦИИ СТАНКА. Если станок внесен в зимнее [...]
REF: ### Подготовка к эксплуатации станка **Масса нетто:** 34 кг #### Если станок занесли [...]
PRD: ### Подготовка к установке станка 1. **Подготовка к установке** 1. **Установка** 2. [...]

--- #4
SRC: Макро Менеджер 15 Проверим функцию в игре: В реальных шутерах FPS, отдача оружия [...]
REF: Макро Менеджер 15: Про

KeyError: 0