In [1]:
from IPython.display import clear_output

In [2]:
!pip install transformers peft datasets huggingface_hub trl==0.10.1 omegaconf autoawq --upgrade
clear_output()

In [3]:
from kaggle_secrets import UserSecretsClient
import os
api_keys = UserSecretsClient()

os.environ['HF_TOKEN'] = api_keys.get_secret("huggingface-cli")
os.environ['WANDB_API_KEY'] = api_keys.get_secret("wandb")

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
import torch.nn.functional as F

import transformers
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM 
from transformers import TrainingArguments, Trainer, Seq2SeqTrainingArguments, Seq2SeqTrainer, GenerationConfig, DataCollatorWithPadding
from transformers import pipeline
from peft import BOFTConfig, get_peft_model, LoraConfig, TaskType
from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer, check_target_module_exists, onload_layer
from datasets import load_dataset, DatasetDict
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

import wandb
from omegaconf import OmegaConf

import tqdm.notebook as tqdm

import gc
import warnings

# Сравнение pre-trained и instruction following моделей

#### Pre-trained model

In [5]:
pl = pipeline(
    task="text-generation",
    model="meta-llama/Llama-3.2-1B",
)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Device set to use cuda:0


In [6]:
prompt = """What is the purpose of the list C in the code below?
def binomial_coefficient(n, r):
    C = [0 for i in range(r + 1)];
    C[0] = 1;
    for i in range(1, n + 1):
        j = min(i, r);
        while j > 0:
        C[j] += C[j - 1];
        j -= 1;
    return C[r]
"""

print(prompt)

What is the purpose of the list C in the code below?
def binomial_coefficient(n, r):
    C = [0 for i in range(r + 1)];
    C[0] = 1;
    for i in range(1, n + 1):
        j = min(i, r);
        while j > 0:
        C[j] += C[j - 1];
        j -= 1;
    return C[r]



In [7]:
model_pred = pl(
    prompt,
    max_new_tokens=64, 
    return_full_text=True,
    do_sample=True,
    temperature=None,
    top_p=None,
)

print(model_pred[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What is the purpose of the list C in the code below?
def binomial_coefficient(n, r):
    C = [0 for i in range(r + 1)];
    C[0] = 1;
    for i in range(1, n + 1):
        j = min(i, r);
        while j > 0:
        C[j] += C[j - 1];
        j -= 1;
    return C[r]
Please explain why the author thinks C is zero? It doesn’t look like it is zero after taking r = 3 and n = 5 into consideration.
Is there a typo? I think r has to be > 0
And do we always take the sum in the reverse order?
The C in the bottom


In [8]:
del pl

gc.collect()
torch.cuda.empty_cache()

#### Instruction following model

In [9]:
pl = pipeline(
    task="text-generation",
    model="meta-llama/Llama-3.2-1B-Instruct",
)

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Device set to use cuda:0


In [10]:
prompt = """What is the purpose of the list C in the code below?
def binomial_coefficient(n, r):
    C = [0 for i in range(r + 1)];
    C[0] = 1;
    for i in range(1, n + 1):
        j = min(i, r);
        while j > 0:
        C[j] += C[j - 1];
        j -= 1;
    return C[r]
"""

print(prompt)

What is the purpose of the list C in the code below?
def binomial_coefficient(n, r):
    C = [0 for i in range(r + 1)];
    C[0] = 1;
    for i in range(1, n + 1):
        j = min(i, r);
        while j > 0:
        C[j] += C[j - 1];
        j -= 1;
    return C[r]



In [11]:
model_pred = pl(
    prompt,
    max_new_tokens=256, 
    return_full_text=True,
    do_sample=True,
    temperature=None,
    top_p=None,
)

print(model_pred[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


What is the purpose of the list C in the code below?
def binomial_coefficient(n, r):
    C = [0 for i in range(r + 1)];
    C[0] = 1;
    for i in range(1, n + 1):
        j = min(i, r);
        while j > 0:
        C[j] += C[j - 1];
        j -= 1;
    return C[r]
return binomial_coefficient(n, r)

def calculate_max_profit(prices):
    max_profit = 0;
    for i in range(len(prices) - 1):
        if prices[i] < prices[i+1]:
            profit = prices[i+1] - prices[i]
            max_profit = max(max_profit, profit);
    return max_profit

def calculate_max_profit_given(prices, n, r):
    C = [0 for i in range(r + 1)];
    C[0] = 1;
    for i in range(1, n + 1):
        j = min(i, r);
        while j > 0:
        C[j] += C[j - 1];
        if prices[i - 1] < prices[i] and j > 0:
            C[j] += C[j - 1];
            C[j] += C[j - 1];
            return C[r];
    return 0

def calculate_max_profit_by_iterative_strategy(prices, n, r):
    final_profit = 0;
    i = 0;
    j = 0;
    wh

Подготавливаем промпт в формате ...

In [12]:
system_prompt = "You are a precise and efficient code assistant. Analyze user requests carefully, provide clean, idiomatic code with best practices, and explain concepts concisely. Prioritize correctness, performance, and readability. Highlight edge cases and errors proactively. Tailor responses to the user’s expertise level."

print(f"{system_prompt=}")

instruction = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": prompt},
]

system_prompt='You are a precise and efficient code assistant. Analyze user requests carefully, provide clean, idiomatic code with best practices, and explain concepts concisely. Prioritize correctness, performance, and readability. Highlight edge cases and errors proactively. Tailor responses to the user’s expertise level.'


In [13]:
model_pred = pl(
    instruction,
    max_new_tokens=1024, 
    return_full_text=True,
    do_sample=True,
    temperature=None,
    top_p=None,
)

print(model_pred[0]['generated_text'][-1]['content'])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


The list C in the given code is an implementation of the matrix "Combinatorial formula" notation, specifically representing the binomial coefficient C(n, r).

A binomial coefficient C(n, r) represents the number of ways to choose r items from a set of n distinct items, without regard to the order of selection.

Here's a breakdown of how it works:

- `C[r]` represents the binomial coefficient C(n, r), where n is the total number of distinct items, and r is the number of items to choose.

In this implementation, an array `C` of size `r + 1` is used to store the values of `C`. Each element `C[j]` represents the number of ways to choose j items from n distinct items, which is calculated by summing the values of `C[j - 1]`.

For example, if we want to compute C(5, 3), we would calculate C[3] using the following steps:

1. Calculate C[3]: C[3] = C[3 - 1] = C[2] = 2 (1 way to choose 0 items, 1 way to choose 1 item, 1 way to choose 2 items)
2. Calculate C[4]: C[4] = C[3] + C[2] = 2 + 2 = 4 (1 

In [14]:
del pl

gc.collect()
torch.cuda.empty_cache()

# Задачи NLP

<div style="max-width:1000px;margin-right: auto; margin-right: 0;">
    <img alt="Классификация задач NLP" src="https://media.mobidev.biz/2024/12/nlp-tasks.png?strip=all&lossy=1&ssl=1&_gl=1*5860h5*_gcl_au*MjYwODA2Nzc0LjE3NDM0NDc1MDc.">
</div>

# Дообучаем модель генерировать анекдоты

#### Загружаем датасет

In [15]:
dataset = load_dataset("igorktech/anekdots_dialogs")

dataset

README.md:   0%|          | 0.00/3.47k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/100834 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['original', 'parsed', 'total_mark', 'date', 'downvote', 'total_votes', 'upvote', 'hash', 'alpha_frac', 'LDR', 'days_since_publication', 'time_decay', 'LDR_time_decay'],
        num_rows: 100834
    })
})

In [16]:
dataset = dataset.remove_columns(
    [col for col in dataset.column_names['train'] if col not in ('original', 'upvote')]
)

dataset

DatasetDict({
    train: Dataset({
        features: ['original', 'upvote'],
        num_rows: 100834
    })
})

#### Загружаем модель и токенизатор

In [17]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

tokenizer.pad_token = tokenizer.eos_token 

#### Генерируем анекдоты

In [18]:
system_prompt = "Вы профессиональный генератор анекдотов на русском языке. Создавайте оригинальные, культурно-релевантные шутки с использованием игры слов, иронии и кратких историй. Избегайте запрещенных тем, стереотипов и оскорбительного контента. Адаптируйте стиль под аудиторию (семейный, сатирический и т.д.), если не указано иное."

user_prompt = "Сгенерируй анекдот про Петьку и Василия Ивановича"

print(f"{system_prompt=}\n\n{user_prompt=}")

instruction = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt},
]

system_prompt='Вы профессиональный генератор анекдотов на русском языке. Создавайте оригинальные, культурно-релевантные шутки с использованием игры слов, иронии и кратких историй. Избегайте запрещенных тем, стереотипов и оскорбительного контента. Адаптируйте стиль под аудиторию (семейный, сатирический и т.д.), если не указано иное.'

user_prompt='Сгенерируй анекдот про Петьку и Василия Ивановича'


In [19]:
pl = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer
)

Device set to use cuda:0


In [20]:
model_pred = pl(
    instruction,
    max_new_tokens=128, 
    return_full_text=True,
    do_sample=True,
    temperature=None,
    top_p=None,
)

print(model_pred[0]['generated_text'][-1]['content'])

Всегда найдешь Петьку в клоthing store в КурсKE. Он стоит 500 долларов.


#### Сомнительно, но окей... Давайте дообучать

### Пишем свою LoRA

Создаем слой, на который будут заменяться линейные слои модели

[Исходники PEFT](https://github.com/huggingface/peft/tree/v0.14.0/src/peft/tuners/lora)

In [21]:
# # del model
# # gc.collect()

# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")

# tokenizer.pad_token = tokenizer.eos_token 

In [22]:
def get_layer(model, name):
    layer = model
    for attr in name.split("."):
        layer = getattr(layer, attr)
    return layer

def set_layer(model, name, layer):
    try:
        attrs, name = name.rsplit(".", 1)
        model = get_layer(model, attrs)
    except ValueError:
        pass
    setattr(model, name, layer)

def get_submodules(model, key):
    parent = model.get_submodule(".".join(key.split(".")[:-1]))
    target_name = key.split(".")[-1]
    target = model.get_submodule(key)
    return parent, target, target_name

def print_num_trainable(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    frac = (float(trainable_params) / total_params) * 100
    
    print(f"trainable: {trainable_params}  |  total: {total_params}  |  trainable(%): {frac:.6f}")

In [23]:
class LoRALayer(nn.Module):
    def __init__(self, base_layer: nn.Linear, r: int, lora_alpha: int, dropout_p: float = 0.0):
        super().__init__()

        self.in_features = base_layer.in_features
        self.out_features = base_layer.out_features
        self.r = r
        self.lora_alpha = lora_alpha
        self.scale = self.lora_alpha / r
        self._enabled = True
        self._merged = False

        self.base_layer = base_layer
        self.lora_A = nn.Linear(self.in_features, r, bias=False, device=base_layer.weight.device)
        self.lora_B = nn.Linear(r, self.out_features, bias=False, device=base_layer.weight.device)
        self.lora_dropout = nn.Dropout(dropout_p).to(base_layer.weight.device)
        self.reset_parameters()

    @torch.no_grad()
    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.lora_A.weight, a=np.sqrt(5))
        nn.init.zeros_(self.lora_B.weight)

    def enable_adapters(self, enable: bool = True):
        self._enabled = enable 
    
    def forward(self, x):       
        if self._enabled:
            y = self.lora_dropout(x)
            y = self.lora_A(y)
            y = self.lora_B(y)
            
            y = self.scale * y + self.base_layer(x)
        else:
            y = self.base_layer(x)

        return y
    
    @torch.no_grad()
    def merge(self) -> None:
        if self._merged:
            return
            
        self._merged = True
        W_0 = self.base_layer.weight # (out_f, in_f)
        A = self.lora_A.weight       # (r,     in_f)
        B = self.lora_B.weight       # (out_f, r)
        
        delta_W = self.scale * (B @ A).to(W_0.dtype)
        self.base_layer.weight.data = W_0 + delta_W

    @torch.no_grad()
    def unmerge(self) -> None:
        if not self._merged:
            return
            
        self._merged = False
        W_0 = self.base_layer.weight # (out_f, in_f)
        A = self.lora_A.weight       # (r,     in_f)
        B = self.lora_B.weight       # (out_f, r)
        
        delta_W = self.scale * (B @ A).to(W_0.dtype)
        self.base_layer.weight.data = W_0 - delta_W

In [24]:
# del LoRALayer
# del model
# del model_adapter

# gc.collect()

In [25]:
from dataclasses import dataclass

@dataclass
class LoRAConfig:
    r: int
    lora_dropout: float
    lora_alpha: float
    target_modules: list[str]

In [26]:
lora_config = LoRAConfig(
    r=16,
    lora_dropout=0.05,
    lora_alpha=32,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
)

lora_config

LoRAConfig(r=16, lora_dropout=0.05, lora_alpha=32, target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'])

In [27]:
class LoRAModel(nn.Module):
    def __init__(self, model, lora_config: LoRAConfig):
        super().__init__()

        self.model = model
        self.device = model.device # ???
        self.config = model.config
        self.lora_config = lora_config

        self.generation_config = model.generation_config
        self.can_generate = model.can_generate
        
        self._setup_adapters()

    def _setup_adapters(self) -> None:
        for param_name, param in self.model.named_parameters():
            param.requires_grad = False

        for name, module in self.model.named_modules():
            if not check_target_module_exists(self.lora_config, name):
                continue
    
            if not isinstance(module, nn.Linear):
                continue

            out_f, in_f = module.weight.shape
            kwargs = {
                'r': self.lora_config.r,
                'lora_alpha': self.lora_config.lora_alpha,
                'dropout_p': self.lora_config.lora_dropout,
            }

            lora_layer = LoRALayer(module, **kwargs)
            set_layer(self.model, name, lora_layer)

    def enable_adapters(self):
        for name, layer in self.named_modules():
            if isinstance(layer, LoRALayer):
                layer.enable_adapters(True)

    def disable_adapters(self):
        for name, layer in self.named_modules():
            if isinstance(layer, LoRALayer):
                layer.enable_adapters(False)

    def unload_and_optionally_merge(self, merge: bool = True, progressbar: bool = False) -> nn.Module:
        desc = "Unloading " + ("and merging " if merge else "") + "model"
        named_modules = list(filter(
            lambda p: isinstance(p[1], LoRALayer),
            self.named_modules()
        ))
    
        for name, lora_layer in tqdm.tqdm(named_modules, disable=not progressbar, desc=desc):
            parent, target, target_name = get_submodules(self, name)
            
            # new_module = gs_linear.merge() if merge else gs_linear.pre_layer
            if merge:
                lora_layer.merge()
            set_layer(parent, target_name, lora_layer.base_layer)
    
        return self.model

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_ids=None,
        **kwargs,
    ):
        
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **kwargs,
        )

    def generate(self, *args, **kwargs):
        return self.model.generate(*args, **kwargs)

In [28]:
# model_adapter.unload_and_optionally_merge(False)

In [29]:
model_adapter = LoRAModel(model, lora_config)

model_adapter

LoRAModel(
  (model): LlamaForCausalLM(
    (model): LlamaModel(
      (embed_tokens): Embedding(128256, 2048)
      (layers): ModuleList(
        (0-15): 16 x LlamaDecoderLayer(
          (self_attn): LlamaSdpaAttention(
            (q_proj): LoRALayer(
              (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
              (lora_A): Linear(in_features=2048, out_features=16, bias=False)
              (lora_B): Linear(in_features=16, out_features=2048, bias=False)
              (lora_dropout): Dropout(p=0.05, inplace=False)
            )
            (k_proj): LoRALayer(
              (base_layer): Linear(in_features=2048, out_features=512, bias=False)
              (lora_A): Linear(in_features=2048, out_features=16, bias=False)
              (lora_B): Linear(in_features=16, out_features=512, bias=False)
              (lora_dropout): Dropout(p=0.05, inplace=False)
            )
            (v_proj): LoRALayer(
              (base_layer): Linear(in_features=2048

In [30]:
print_num_trainable(model_adapter)

trainable: 3407872  |  total: 1239222272  |  trainable(%): 0.275001


In [31]:
# from peft import get_peft_model, LoraConfig as PeftLoRAConfig, TaskType

In [32]:
# peft_lora_config = PeftLoRAConfig(
#     task_type=TaskType.CAUSAL_LM,
#     inference_mode=False,
#     r=16,
#     lora_dropout=0.05,
#     lora_alpha=32,
#     target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
# )

# model_adapter =  get_peft_model(model, peft_lora_config)

In [33]:
# peft_model_adapter.unload()

In [34]:
pl = pipeline(
    "text-generation",
    model=model_adapter,
    tokenizer=tokenizer
)

Device set to use cuda:0
The model 'LoRAModel' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 

In [35]:
pl("Hello", max_new_tokens=16)

[{'generated_text': "Hello, I'm looking for a new laptop for my family. I'm looking for"}]

### Пишем collator для SFT

In [36]:
from transformers import DataCollatorForLanguageModeling
from typing import Union, Any

In [37]:
class DataCollatorForCompletionOnlyLM(DataCollatorForLanguageModeling):
    """
    Data collator used for completion tasks. It ensures that all the tokens of the labels are set to an 'ignore_index'
    when they do not come from the assistant. This ensure that the loss is only
    calculated on the completion made by the assistant.

    Args:
        response_template (`Union[str, list[int]]`): the template form that indicates the start of the response, typically something like
            '### Response:\n'. It can also be passed as tokenized ids, which can be useful when using a tokenizer that encodes the response
            differently if it does not have proper context.
        instruction_template (`Union[str, list[int]]`): the template form that indicates the start of the human instruction, typically something like
            '### Human:\n'. Useful for assistant-style conversation datasets. It can also be passed as tokenized ids.
        mlm (`bool`, *optional*, defaults to `False`): Whether to use masked language modeling in the underlying
            `DataCollatorForLanguageModeling` class. Note that this option currently has no effect but is present
             for flexibility and backwards-compatibility.
        ignore_index (`int`, *optional*, defaults to `-100`):
            The index to use to ignore the initial tokens with
    """

    def __init__(
        self,
        response_template: str,
        # instruction_template: str = None,
        *args,
        mlm: bool = False,
        ignore_index: int = -100,
        **kwargs,
    ):
        super().__init__(*args, mlm=mlm, **kwargs)

        # self.instruction_template = instruction_template
        # self.instruction_token_ids = self.tokenizer.encode(self.instruction_template, add_special_tokens=False)

        self.response_template = response_template
        self.response_token_ids = self.tokenizer.encode(self.response_template, add_special_tokens=False)

        self.instruction_template = None
        if not self.mlm and self.instruction_template and self.tokenizer.pad_token_id == self.tokenizer.eos_token_id:
            warnings.warn(
                "The pad_token_id and eos_token_id values of this tokenizer are identical. "
                "If you are planning for multi-turn training, "
                "it can result in the model continuously generating questions and answers without eos token. "
                "To avoid this, set the pad_token_id to a different value.",
                UserWarning,
            )

        self.ignore_index = ignore_index

    def torch_call(self, examples: list[Union[list[int], Any, dict[str, Any]]]) -> dict[str, Any]:
        batch = super().torch_call(examples)

        for i in range(len(examples)):
            response_token_ids_start_idx = None

            for idx in np.where(batch["labels"][i] == self.response_token_ids[0])[0]:
                # `response_token_ids` is `'### Response:\n'`, here we are just making sure that the token IDs match
                if (
                    self.response_token_ids
                    == batch["labels"][i][idx : idx + len(self.response_token_ids)].tolist()
                ):
                    response_token_ids_start_idx = idx

            if response_token_ids_start_idx is None:
                warnings.warn(
                    f"Could not find response key `{self.response_template}` in the following instance: "
                    f"{self.tokenizer.decode(batch['input_ids'][i])}. This instance will be ignored in loss "
                    "calculation. Note, if this happens often, consider increasing the `max_length`.",
                    UserWarning,
                )
                batch["labels"][i, :] = self.ignore_index
            else:
                response_token_ids_end_idx = response_token_ids_start_idx + len(self.response_token_ids)

                # Make pytorch loss function ignore all tokens up through the end of the response key
                batch["labels"][i, :response_token_ids_end_idx] = self.ignore_index

        return batch

In [38]:
dataset

DatasetDict({
    train: Dataset({
        features: ['original', 'upvote'],
        num_rows: 100834
    })
})

In [39]:
def get_instructions(example):
    system_prompt = "Вы профессиональный генератор анекдотов на русском языке. Создавайте оригинальные, культурно-релевантные шутки с использованием игры слов, иронии и кратких историй. Избегайте запрещенных тем, стереотипов и оскорбительного контента. Адаптируйте стиль под аудиторию (семейный, сатирический и т.д.), если не указано иное."
    user_prompt = "Сгенерируй смешной анекдот"
    agent_ans = example['original']

    instructions = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
        {'role': 'agent', 'content': agent_ans}
    ]

    text = tokenizer.apply_chat_template(instructions, tokenize=False)
    return {'text': text}

In [40]:
dataset = dataset.sort("upvote", reverse=True)
dataset = dataset['train'].select(range(2_000))
dataset = dataset.map(get_instructions)
dataset = dataset.train_test_split(test_size=64)

dataset

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['original', 'upvote', 'text'],
        num_rows: 1936
    })
    test: Dataset({
        features: ['original', 'upvote', 'text'],
        num_rows: 64
    })
})

In [41]:
training_args = SFTConfig(
    output_dir='./SFT_sem',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=32,
    dataset_text_field='text',
    fp16=True,
    bf16=False,
    lr_scheduler_type='linear',
    warmup_ratio=0.1,
    max_steps=10,
    learning_rate=1e-4,
    weight_decay=0.01,
    eval_strategy='steps',
    eval_steps=10,
    logging_steps=10,
    seed=42,
    data_seed=42,
    report_to='none',
    save_strategy='no',
)

In [42]:
response_template = '<|start_header_id|>agent<|end_header_id|>'
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [43]:
trainer = SFTTrainer(
    model=model_adapter,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=collator,
    max_seq_length=1024
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1936 [00:00<?, ? examples/s]

Map:   0%|          | 0/64 [00:00<?, ? examples/s]

  super().__init__(


In [44]:
trainer.train()

Step,Training Loss,Validation Loss
10,3.9214,3.932203


TrainOutput(global_step=10, training_loss=3.921402359008789, metrics={'train_runtime': 91.8324, 'train_samples_per_second': 3.485, 'train_steps_per_second': 0.109, 'total_flos': 0.0, 'train_loss': 3.921402359008789, 'epoch': 0.1652892561983471})