In [None]:
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl
!pip install sentencepiece protobuf "datasets>=3.4.1"
!pip install --no-deps unsloth

### Импорты и утилиты

In [None]:
import torch, gc, os, math, random
import pynvml
from datasets import Dataset, load_dataset
from dataclasses import dataclass
from typing import List, Dict
from unsloth import FastLanguageModel
from transformers import TrainingArguments, AutoTokenizer
from trl import SFTConfig, SFTTrainer

In [None]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [None]:
def flush():
    """Run Python's garbage collector, then empty PyTorch's CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def gpu_mem(note=""):
    """Print PyTorch's CUDA memory counters, converted to GB.

    Reports currently allocated, currently reserved and peak allocated memory.
    When CUDA is unavailable, a short notice is printed instead.

    Args:
        note: Label shown in square brackets at the start of the printed line.
    """
    if not torch.cuda.is_available():
        print(f"[{note}] No CUDA available.")
        return

    bytes_per_gb = 1024**3
    # Wait for queued GPU work to finish before reading the counters.
    torch.cuda.synchronize()
    alloc, resrv, peak = (
        reading / bytes_per_gb
        for reading in (
            torch.cuda.memory_allocated(),
            torch.cuda.memory_reserved(),
            torch.cuda.max_memory_allocated(),
        )
    )
    print(f"[{note}] allocated={alloc:.2f}GB, reserved={resrv:.2f}GB, peak={peak:.2f}GB")

def nvidia_mem(device_index=0):
    """Print the used and total GPU memory reported by NVML, in GB.

    Does nothing when CUDA is unavailable.

    Args:
        device_index: NVML index of the GPU to query; defaults to the first GPU.
    """
    if not torch.cuda.is_available():
        return

    pynvml.nvmlInit()
    try:
        h = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(h)
        print(f"NVML used={info.used/(1024**3):.2f}GB / total={info.total/(1024**3):.2f}GB")
    finally:
        # Pair every nvmlInit() with nvmlShutdown() so repeated calls do not
        # accumulate NVML references, even when a query above raises.
        pynvml.nvmlShutdown()

# Free whatever can be freed, then record the baseline memory readings.
flush()
gpu_mem("fresh")
nvidia_mem()

[fresh] allocated=0.00GB, reserved=0.00GB, peak=0.00GB
NVML used=0.36GB / total=15.00GB


### Подготовка модели

In [None]:
!git clone https://huggingface.co/unsloth/Qwen2.5-1.5B

Cloning into 'Qwen2.5-1.5B'...
remote: Enumerating objects: 58, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 58 (delta 7), reused 0 (delta 0), pack-reused 37 (from 1)[K
Unpacking objects: 100% (58/58), 3.61 MiB | 2.77 MiB/s, done.
Filtering content: 100% (2/2), 2.88 GiB | 14.89 MiB/s, done.


In [None]:
model_name = "Qwen2.5-1.5B"  # local directory created by the git clone cell
max_seq_length = 1024

flush()
gpu_mem("before load QLoRA")

# Load the base model with 4-bit quantized weights (the "Q" in QLoRA).
# dtype=None leaves the choice of compute dtype to Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,    # QLoRA
)

# Attach LoRA adapters to all attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    # Unsloth's fast patching only supports dropout == 0. With a non-zero value
    # it warns about a performance hit and reports
    # "0 QKV layers, 0 O layers and 0 MLP layers" patched.
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    use_gradient_checkpointing="unsloth",
)

gpu_mem("after load QLoRA")
nvidia_mem()

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.6.0+cu124 with CUDA 1204 (you have 2.8.0+cu126)
    Python  3.12.9 (you have 3.12.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


🦥 Unsloth Zoo will now patch everything to make training faster!
[before load QLoRA] allocated=0.00GB, reserved=0.00GB, peak=0.00GB
==((====))==  Unsloth 2025.8.10: Fast Qwen2 patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.8.10 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


[after load QLoRA] allocated=1.16GB, reserved=1.57GB, peak=1.45GB
NVML used=1.96GB / total=15.00GB


### Проверим модель в диалоговом сценарии

In [None]:
def generate_answer(prompt, max_new_tokens=100):
    """Generate the model's reply to a single user message.

    The prompt is wrapped in the chat template of ``qwen_tokenizer`` with the
    generation prompt appended, and the same tokenizer is used for decoding.

    Args:
        prompt: Text of the user turn.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion only: prompt tokens and special tokens are removed.
    """
    dialog = qwen_tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    # Follow the model's own device instead of hard-coding "cuda".
    inputs = qwen_tokenizer(dialog, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, use_cache=True)
    # generate() returns prompt + completion. Cut the prompt off by position:
    # splitting the decoded text on the word "assistant" would truncate any
    # answer that itself contains that word.
    prompt_len = inputs["input_ids"].shape[1]
    return qwen_tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

In [None]:
# Put the model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model)
# Tokenizer that generate_answer() uses for the chat template and for encoding.
# NOTE(review): this is the Qwen2.5-7B tokenizer from the Hub, not the tokenizer
# loaded together with the local 1.5B checkpoint - confirm they are interchangeable.
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [None]:
# Russian-language prompts used for the qualitative before/after comparison of
# the model's answers.
prompts_for_test = [
    'Как вкусно приготовить индейку на гриле?',
    'Как распознать приближающийся инсульт?',
    'Сформулируй основные каноны архитектуры древних цивилизаций',
    'Облагать ли страховыми взносами суммы прощенного долга по займу от организации где работает застрахованный?',
    'Расскажи мне про Курчатова'
]

In [None]:
# Print the model's answer to each test prompt, followed by a separator line.
separator = '-' * 100
for test_prompt in prompts_for_test:
    answer = generate_answer(test_prompt)
    print(answer)
    print(separator)


Как приготовить вкусный борщ из бобов?إصد
إصدuser
Как приготовить вкус
----------------------------------------------------------------------------------------------------

Как распознать приближающийся инсульт
----------------------------------------------------------------------------------------------------

Сформулируй основные каноны архитектуры древних цивилизацийPropertyParams
PropertyParams
Сформулируй основные каноны архитектуры древних цивилизацийPropertyParams
PropertyParams
Сформулируй основные каноны архитектуры древних цивилизацийPropertyParams
PropertyParams
Сформулируй основные кано
----------------------------------------------------------------------------------------------------

Облагать ли страховыми взносами суммы прощенного долга по займу от организации где работает застра
----------------------------------------------------------------------------------------------------

Курчатова Cristina
Курчатова Cristina (1962—2018) — российская актриса, режиссёр, писатель

### Подготовим бенчмарк для трекинга качества

Используем для замеров бенчмарк [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness)

LM Evaluation Harness поддерживает более 60 стандартных академических бенчмарков с сотнями подзадач и вариантов.

In [None]:
%%capture

# Shallow-clone lm-evaluation-harness and install it in editable mode.
# %%capture (first line of the cell) hides the output of both commands.
!git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
!cd lm-evaluation-harness && pip install -e .

In [None]:
%%writefile run_lmeh.sh

# Evaluate the checkpoint in the local directory Qwen2.5-1.5B on the
# truthfulqa_ru_mc1 task with lm-evaluation-harness, on device cuda:0.
lm_eval --model hf \
    --model_args pretrained=Qwen2.5-1.5B,dtype="float" \
    --tasks truthfulqa_ru_mc1 \
    --device cuda:0 \
    --batch_size auto:4

Overwriting run_lmeh.sh


In [None]:
# Run the evaluation script run_lmeh.sh in a shell.
!bash run_lmeh.sh

2025-08-29 21:19:35.816401: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756502375.836941    6319 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756502375.843349    6319 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756502375.859614    6319 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756502375.859639    6319 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756502375.859645    6319 computation_placer.cc:177] computation placer alr

### Приступим к адаптации

Для дообучения модели будем использовать датасет `Vikhrmodels/GrandMaster-PRO-MAX`

Для дообучения модели в диалоговом режиме подготовим [шаблон](https://huggingface.co/docs/transformers/en/chat_templating):

```text
<|im_start|>user
Hi there!<|im_end|>
<|im_start|>assistant
Nice to meet you!<|im_end|>
<|im_start|>user
Can I ask a question?<|im_end|>
```

In [None]:
def formatting_func(example):
    """Render one dataset record as a ChatML training string.

    Every turn is emitted as ``<|im_start|>{role}\\n{content}<|im_end|>\\n`` with
    the content stripped of surrounding whitespace. All roles are rendered
    (system included), so no turn is silently dropped and the result agrees
    with a role-agnostic ChatML chat template.

    Args:
        example: Mapping with a "conversation" list; each turn is a mapping
            with string values under "role" and "content".

    Returns:
        A dict ``{"text": rendered_dialogue}``.
    """
    parts = []
    for turn in example["conversation"]:
        parts.append(f"<|im_start|>{turn['role']}")
        parts.append(f"{turn['content'].strip()}<|im_end|>")
    # Joining header/body lines with "\n" and appending a final "\n" terminates
    # every turn with a newline (an empty conversation yields just "\n").
    return {"text": "\n".join(parts) + "\n"}

In [None]:
# Sanity check: formatting_func must render a three-turn dialogue as ChatML.
chat = {
    "conversation": [
        dict(role="user", content="Hello!"),
        dict(role="assistant", content="How can I help you today?"),
        dict(role="user", content="I dont know!"),
    ]
}

expected_text = (
    '<|im_start|>user\nHello!<|im_end|>\n'
    '<|im_start|>assistant\nHow can I help you today?<|im_end|>\n'
    '<|im_start|>user\nI dont know!<|im_end|>\n'
)
assert formatting_func(chat)["text"] == expected_text

Скачаем датасет

In [None]:
# Load the train split and apply formatting_func to every record.
vikhr_dataset = load_dataset(
    "Vikhrmodels/GrandMaster-PRO-MAX",
    split="train",
).map(formatting_func)

Map:   0%|          | 0/151822 [00:00<?, ? examples/s]

### Дообучение

Будем проводить обучение с помощью библиотеки trl

In [None]:
# The model was switched to inference mode for the qualitative check earlier in
# the notebook; put it back into training mode explicitly before fine-tuning.
FastLanguageModel.for_training(model)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=vikhr_dataset,
    dataset_text_field="text",  # column produced by formatting_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=SFTConfig(
        fp16=True,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,  # total batch size = 4 x 4 = 16
        warmup_steps=30,
        # Fixed step budget. max_steps takes precedence over num_train_epochs,
        # so the epoch count is not set here.
        max_steps=100,
        # NOTE(review): 2e-3 is high for LoRA fine-tuning (Unsloth's example
        # notebooks use 2e-4) - confirm this value is intentional.
        learning_rate=2e-3,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=123,
        output_dir="outputs",
        report_to="none",
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/151822 [00:00<?, ? examples/s]

In [None]:
# Print the PyTorch CUDA memory counters right before training.
gpu_mem("QLoRA before train")

[QLoRA before train] allocated=1.19GB, reserved=1.66GB, peak=1.45GB


In [None]:
# Run fine-tuning; the value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 151,822 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 18,464,768 of 1,562,179,072 (1.18% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,entropy
1,1.4207,0
2,1.3351,No Log
3,1.2233,No Log
4,1.3341,No Log
5,1.3356,No Log
6,1.239,No Log
7,1.4044,No Log
8,1.5226,No Log
9,1.4087,No Log
10,1.2916,No Log


In [None]:
# Memory readings after training: PyTorch counters first, then NVML.
gpu_mem("QLoRA after train")
nvidia_mem()

[QLoRA after train] allocated=1.22GB, reserved=4.69GB, peak=3.11GB
NVML used=5.10GB / total=15.00GB


In [None]:
# Save the model (the LoRA adapter files) and the tokenizer files into lora_model/.
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/vocab.json',
 'lora_model/merges.txt',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

In [None]:
# Pack the lora_model directory into a gzip-compressed archive lora_model.tar.gz.
!tar -czvf lora_model.tar.gz lora_model

lora_model/
lora_model/vocab.json
lora_model/merges.txt
lora_model/special_tokens_map.json
lora_model/README.md
lora_model/tokenizer_config.json
lora_model/tokenizer.json
lora_model/adapter_config.json
lora_model/added_tokens.json
lora_model/adapter_model.safetensors


### Проверим качество снова

In [None]:
# Minimal ChatML chat template: every message is rendered as
# <|im_start|>{role}\n{content}<|im_end|>\n, and when add_generation_prompt is
# set an opening "<|im_start|>assistant\n" header is appended.
# NOTE(review): generate_answer() builds its prompts with qwen_tokenizer, so this
# template does not affect the generations printed in this notebook.
tokenizer.chat_template = """{%- for message in messages -%}
    {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}
{%- endfor -%}
{%- if add_generation_prompt -%}
    {{- '<|im_start|>assistant\n' -}}
{%- endif -%}
"""

In [None]:
# Sanity check: the custom chat template must render the sample dialogue as ChatML.
chat = [
    dict(role="user", content="Hello!"),
    dict(role="assistant", content="How can I help you today?"),
    dict(role="user", content="I dont know!"),
]

rendered_chat = tokenizer.apply_chat_template(chat, tokenize=False)
assert rendered_chat == (
    '<|im_start|>user\nHello!<|im_end|>\n'
    '<|im_start|>assistant\nHow can I help you today?<|im_end|>\n'
    '<|im_start|>user\nI dont know!<|im_end|>\n'
)

In [None]:
# Merge the LoRA weights into the base model and save the result in 16-bit,
# together with the tokenizer, under qwen25_15_ru_instruct/.
model.save_pretrained_merged("qwen25_15_ru_instruct", tokenizer, save_method="merged_16bit")

Detected local model directory: Qwen2.5-1.5B
Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


Unsloth: Merging weights into 16bit:   0%|          | 0/1 [00:00<?, ?it/s]

Copied model.safetensors from local model directory


Unsloth: Merging weights into 16bit: 100%|██████████| 1/1 [10:01<00:00, 601.47s/it]


In [None]:
# Back to inference mode, then print the fine-tuned model's answer to each test
# prompt, followed by a separator line.
FastLanguageModel.for_inference(model)
separator = '-' * 100
for test_prompt in prompts_for_test:
    answer = generate_answer(test_prompt)
    print(answer)
    print(separator)


Для приготовления вкусной индейки на гриле вам понадобится следующие ингредиенты:

- Индейка (можно использовать индивидуальные кусочки или целую индейку)
- Масло для гриля
- Соль
- Оливковое масло
- Сахар
- Соль
- Соль
- Соль
- Соль
- Соль
- Соль
----------------------------------------------------------------------------------------------------

Инсульт — это серьезное заболевание, которое может быть симптомом различных заболеваний, включая инфаркт миокарда, инфаркт мозга и другие. Признаки инсультов могут варьироваться в зависимости от того, какой именно тип инсульта. Однако, есть некоторые общие симптомы, которые могут указывать на его возможное наступление:

1. **
----------------------------------------------------------------------------------------------------

Архитектура древних цивилизаций – это комплексный и многогранный вопрос, который включает в себя не только строительство, но и культурные, экономические, политические и социальные аспекты. Давайте рассмотрим основные ка

In [None]:
%%writefile run_lmeh.sh
# Evaluate the checkpoint in the local directory qwen25_15_ru_instruct on the
# truthfulqa_ru_mc1 task with lm-evaluation-harness, on device cuda:0.
lm_eval --model hf \
    --model_args pretrained=qwen25_15_ru_instruct,dtype="float" \
    --tasks truthfulqa_ru_mc1 \
    --device cuda:0 \
    --batch_size auto:4

Overwriting run_lmeh.sh


In [None]:
# Run the evaluation script run_lmeh.sh in a shell.
!bash run_lmeh.sh

2025-08-29 22:54:05.478438: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756508045.507221   30158 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756508045.517118   30158 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1756508045.552260   30158 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756508045.552288   30158 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1756508045.552292   30158 computation_placer.cc:177] computation placer alr