In [1]:
import sys
sys.path.append('../../..')

In [7]:
import torch

from omegaconf import OmegaConf
from peft import LoraConfig
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTConfig, SFTTrainer
from tqdm import tqdm

from src.utils import seed_everything
from src.data_prepocessing import load_ds, tokenize_ds
from src.evaluation import Evaluator

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Experiment setup

In [8]:
config = OmegaConf.load("vikhr_gemma_lora_config.yaml")
print(OmegaConf.to_yaml(config))

model_name: Vikhrmodels/Vikhr-Gemma-2B-instruct
prompt: 'Перепиши неполное высказывание на основе истории диалога. Твой ответ должен
  содержать только переписанное неполное высказвание. '
sft_args:
  packing: true
  report_to: wandb
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 512
  num_train_epochs: 10
  optim: paged_adamw_8bit
  learning_rate: 0.003
  eos_token: <end_of_turn>
  do_eval: true
  eval_strategy: steps
  eval_steps: 1
  logging_steps: 1
lora_args:
  r: 1
  target_modules:
  - q_proj
  - v_proj
  task_type: CAUSAL_LM



In [9]:
seed_everything(42)

# Model and data loading

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    quantization_config=bnb_config,
    device_map={"": torch.cuda.current_device()}
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

In [None]:
def preprocess_function(sample):
    prompt = (config.prompt
              + "История: " 
              + sample['history'][-1]
              + " Неполное высказвание: " 
              + sample["phrase"])

    msg = {"prompt": "<start_of_turn>user\n" + prompt,
           "completion": "<start_of_turn>model\n" + sample["rewrite"]}
    
    return msg

In [None]:
ds = load_ds("2rca_checked_version.json")
tokenized_ds = tokenize_ds(ds, preprocess_function)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 4411/4411 [00:01<00:00, 3861.14 examples/s]
Map: 100%|██████████| 551/551 [00:00<00:00, 3956.06 examples/s]
Map: 100%|██████████| 551/551 [00:00<00:00, 3977.76 examples/s]


In [None]:
tokenized_ds["train"]["prompt"][0]

'<start_of_turn>user\nПерепиши неполное высказывание на основе истории диалога. Твой ответ должен содержать только переписанное неполное высказвание. История: Моей собаке уже 5 лет, и я даже не представляю, как я могла жить без своей собаки раньше?! Я думаю, что у тебя всё получится и у вас скоро обязательно появится питомец! Ведь собаки такие милые! Что сегодня будешь готовить на ужин? Неполное высказвание: Сегодня будет мясо с кровью! Вот только надо в магазин... Эх, пойду прогуляюсь под дождём, это успокаивает.'

In [None]:
tokenized_ds["train"]["completion"][0]

'<start_of_turn>model\nСегодня на ужин будет мясо с кровью! Вот только надо в магазин... Эх, пойду прогуляюсь под дождём, это успокаивает.'

# Model training

In [None]:
lora_config = LoraConfig(**config.lora_args.__dict__)
training_args = SFTConfig(**config.sft_args.__dict__)

trainer = SFTTrainer(
    model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["val"],
    peft_config=lora_config
)


trainer.train()

Converting train dataset to ChatML:   0%|          | 0/4411 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/4411 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/4411 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/4411 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/551 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/551 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/551 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/551 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Currently logged in as: [33mpvlshkunov[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss,Validation Loss
1,2.0436,1.919052


In [None]:
def infer_ds(ds, model):
    test_results = []

    for i in tqdm(range(len(ds['test']))):
        messages = [
            {
                "role": "user",
                "content": (config.prompt 
                            + "История: " 
                            + ds['test']['history'][i][-1] 
                            + " Неполное высказвание: " 
                            + ds['test']["phrase"][i])
                }
            ]
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to('cuda')
            outputs = model.generate(inputs, 
                                     max_new_tokens=config.inference_args.max_new_tokens, 
                                     num_beams=config.inference_args.num_beams)
        out = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        test_results.append(out[0])

    return test_results

In [None]:
evaluator = Evaluator(dataset=tokenized_ds, 
                      model=model, 
                      tokenizer=tokenizer, 
                      infer_func=infer_ds)

evaluator.evaluate()

Unnamed: 0_level_0,bleu_score,rouge-1,rouge-2,rouge-3,rouge-4,rouge-l,rf_score_1,rf_score_2,rf_score_3,rf_score_4
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2rca,81.584098,0.827842,0.773599,0.734291,0.691708,0.827763,0.129307,0.102482,0.090379,0.0842
