In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from math import factorial

import json
import re

import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoConfig

from datasets import Dataset
from transformers import AutoModelForCausalLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import LoraConfig, get_peft_model

import sys
from eval_utils import get_metrics_computer, PrintCallback
from utils import get_preprocessor, Format, ShuffleCollator, count_parameters



In [2]:
# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Echo the selected device as the cell output.
device

'cuda:0'

In [3]:
# Experiment configuration (model checkpoint, output format, train/eval
# settings) is kept in a JSON file and parsed into a plain dict.
config_path = "configs/config_ruT5-base-st.json"
with open(config_path, "rb") as config:
    params = json.load(config)

# Echo the parsed configuration as the cell output.
params

{'format': 'SpecTokens',
 'max_bundles': 5,
 'model': 'ai-forever/ruT5-base',
 'add_nl_token': False,
 'add_eos_token': False,
 'change_pad_to_eos': False,
 'shuffle_bundles': True,
 'save_folder': 'ruT5-base',
 'train': {'n_epochs': 10,
  'lr': 5e-05,
  'batch_size': 16,
  'weight_decay': 0.01,
  'scheduler': 'cosine',
  'warmup_steps': 500,
  'fp16': True},
 'eval': {'batch_size': 16, 'show': 5}}

In [4]:
# Translate the config's "format" string into a `Format` value: "SpecTokens"
# selects Format.SpecTokens; every other value falls back to Format.JustJson.
if params["format"] == "SpecTokens":
    out_format = Format.SpecTokens
else:
    out_format = Format.JustJson

In [5]:
# Load the training ads and drop rows whose bundle count exceeds the optional
# "max_bundles" limit from the config (no limit when the key is absent).
train_data = pd.read_csv("~/work/resources/data/ads_train.csv")
bundle_limit = params.get("max_bundles", np.inf)
within_limit = train_data["n_bundles"] <= bundle_limit
train_data = train_data[within_limit]

# Renumber the surviving rows 0..n-1 so the index is contiguous after filtering.
train_data = train_data.set_index(np.arange(len(train_data)))

print(f"We have train datset of size {len(train_data)}")
train_data.head()

We have train datset of size 8632


Unnamed: 0,Text,bundles,n_bundles
0,Продам телефон POCO F4 128GB телефону 8 месяце...,"[{""Title"": ""\u0442\u0435\u043b\u0435\u0444\u04...",1
1,"продам sup борд 10'6 - 320х76х15 см, пользова...","[{""Title"": ""sup \u0431\u043e\u0440\u0434 \u043...",1
2,Изменились планы . Отдам за 2000,[],0
3,Продам рацию MegaJet MJ-333 в хорошем рабочем ...,"[{""Title"": ""\u0440\u0430\u0434\u0438\u043e\u04...",1
4,Продам PlayStation 3\n3 джойстик и все диски\n...,"[{""Title"": ""PlayStation 3 \u0441 3 \u0434\u043...",1


In [6]:
# Load the evaluation ads and optionally truncate them to the first
# "max_eval_size" rows (the whole file is kept when the key is absent).
eval_data = pd.read_csv("~/work/resources/data/ads_eval.csv")
eval_limit = params.get("max_eval_size", len(eval_data))
eval_data = eval_data[:eval_limit]

# Renumber rows 0..n-1 with an explicit integer index.
eval_data = eval_data.set_index(np.arange(len(eval_data)))

print(f"We have eval datset of size {len(eval_data)}")
eval_data.head()

We have eval datset of size 611


Unnamed: 0,Text,bundles,n_bundles
0,Распродажа \nПлатья \nЦена: 300р\nРазмер: 44-4...,"[{""Title"": ""\u041f\u043b\u0430\u0442\u044c\u04...",1
1,"вегетарианская энциклопедия вкусов, что с чем ...","[{""Title"": ""\u0432\u0435\u0433\u0435\u0442\u04...",1
2,"продам щетку для сухого массажа, новая (так и ...","[{""Title"": ""\u0429\u0435\u0442\u043a\u0430 \u0...",1
3,"Зимний тёплый комбинезон, пух, Crockid, p.74...","[{""Title"": ""\u0417\u0438\u043c\u043d\u0438\u04...",1
4,"удлинитель икеа 1,5 м на 6 розеток\n25 лари\n📍...","[{""Title"": ""\u0443\u0434\u043b\u0438\u043d\u04...",1


In [7]:
# Load the pretrained tokenizer and seq2seq model named in the config.
model_checkpoint = params["model"]
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Collect every extra vocabulary entry, in the same order they were previously
# registered: currency symbols first, then the optional newline marker, then
# the markers required by the selected output format.
extra_tokens = ["₾", "$", "€"]
if params.get("add_nl_token", False):
    extra_tokens.append("<NL>")
if out_format == Format.SpecTokens:
    extra_tokens.extend(
        ["<BOB>", "<EOB>", "<BOT>", "<EOT>", "<BOP>", "<EOP>", "<BOC1>", "<EOC1>", "<BOC2>", "<EOC2>"]
    )
elif out_format == Format.JustJson:
    extra_tokens.extend(["{", "}"])
tokenizer.add_tokens(extra_tokens, special_tokens=False)

# Resize the embedding matrix to the enlarged vocabulary; the resulting
# embedding module is the cell output.
model.resize_token_embeddings(len(tokenizer))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  return self.fget.__get__(instance, owner)()


Embedding(32112, 768)

In [8]:
def _build_split(frame):
    """Turn a pandas frame of ads into a preprocessed, flattened `Dataset`.

    Only the "Text" and "bundles" columns are passed on. The preprocessor
    returned by `get_preprocessor(tokenizer, out_format)` is applied in
    batches over 4 worker processes, and every original column is removed
    from the mapped result.

    Returns:
        (raw_dataset, processed_dataset): the un-mapped `Dataset` and the
        mapped + flattened one.
    """
    raw_dataset = Dataset.from_pandas(frame[["Text", "bundles"]])
    processed = raw_dataset.map(
        get_preprocessor(tokenizer, out_format),
        batched=True,
        num_proc=4,
        remove_columns=raw_dataset.column_names,
    )
    return raw_dataset, processed.flatten()


# Same pipeline for both splits; train is processed before eval.
train_dataset, train_ads = _build_split(train_data)
eval_dataset, eval_ads = _build_split(eval_data)

Map (num_proc=4): 100%|██████████| 8632/8632 [00:00<00:00, 9892.45 examples/s] 
Map (num_proc=4): 100%|██████████| 611/611 [00:00<00:00, 1290.79 examples/s]


In [9]:
# Optionally reuse the EOS token as the padding token.
use_eos_as_pad = params.get("change_pad_to_eos", False)
if use_eos_as_pad:
    tokenizer.pad_token = tokenizer.eos_token

# Both collator switches are off unless the config enables them.
add_eos_token = params.get("add_eos_token", False)
shuffle_bundles = params.get("shuffle_bundles", False)
data_collator = ShuffleCollator(tokenizer, out_format, add_eos_token, shuffle_bundles)

In [10]:
# Training / evaluation sub-sections of the config, used by the cells below.
train_params = params["train"]
eval_params = params["eval"]

# LoRA is opt-in: it is enabled only when the config has a "lora" section
# with a "rank" entry. Otherwise the model is fine-tuned as loaded.
lora_section = params.get("lora", {})
lora_rank = lora_section.get("rank")
if lora_rank is not None:
    lora_settings = {
        "r": lora_rank,
        "lora_alpha": 32,
        # Module names to adapt; presumably the attention projections plus the
        # output head of the T5-style model -- verify against the checkpoint.
        "target_modules": ["k", "q", "v", "o", "lm_head"],
        "lora_dropout": 0.05,
        "bias": "none",
        "task_type": "SEQ_2_SEQ_LM",
    }
    lora_config = LoraConfig(**lora_settings)
    model = get_peft_model(model, lora_config)

# Report parameter counts for the (possibly LoRA-wrapped) model.
count_parameters(model)

# Resolve the optional optimisation settings up front, with their defaults.
weight_decay = train_params.get("weight_decay", 0)
scheduler_name = train_params.get("scheduler", "cosine")
warmup_steps = train_params.get("warmup_steps", 0)
use_fp16 = train_params.get("fp16", False)

training_args = Seq2SeqTrainingArguments(
    # Checkpointing: scratch directory, overwritten on rerun, one checkpoint kept.
    output_dir="tmp_checkpoints",
    overwrite_output_dir=True,
    save_total_limit=1,
    # Optimisation schedule.
    num_train_epochs=train_params["n_epochs"],
    learning_rate=train_params["lr"],
    lr_scheduler_type=scheduler_name,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    fp16=use_fp16,
    # Batching.
    per_device_train_batch_size=train_params["batch_size"],
    per_device_eval_batch_size=eval_params["batch_size"],
    group_by_length=False,
    # Evaluation: once per epoch, scoring generated sequences of up to 256 tokens.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    generation_max_length=256,
)

# Metric function used at evaluation time, built for the current tokenizer
# and output format.
metrics_fn = get_metrics_computer(tokenizer, out_format)

# Project callback; `show` falls back to 3 when the eval config omits it.
# Presumably prints a few sample predictions during evaluation -- confirm in eval_utils.
preview_callback = PrintCallback(
    out_format,
    show=eval_params.get("show", 3),
    device=device,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ads,
    eval_dataset=eval_ads,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metrics_fn,
    callbacks=[preview_callback],
)

Total parameters: 222,891,264
Trainable parameters: 222,891,264
Non-trainable parameters: 0


In [None]:
# Launch fine-tuning with the trainer built earlier in the notebook. Evaluation
# runs at the end of every epoch (evaluation_strategy="epoch"), and the value
# returned by train() is shown as the cell output.
trainer.train()

  9%|▉         | 500/5400 [01:55<18:46,  4.35it/s]

{'loss': 2.3505, 'grad_norm': 0.9308339357376099, 'learning_rate': 4.92e-05, 'epoch': 0.93}


 10%|█         | 540/5400 [02:59<18:12,  4.45it/s]   
  0%|          | 0/39 [00:00<?, ?it/s][A
  5%|▌         | 2/39 [00:01<00:31,  1.19it/s][A
  8%|▊         | 3/39 [00:03<00:39,  1.09s/it][A
 10%|█         | 4/39 [00:11<02:16,  3.90s/it][A
 13%|█▎        | 5/39 [00:20<03:09,  5.57s/it][A
 15%|█▌        | 6/39 [00:29<03:36,  6.57s/it][A
 18%|█▊        | 7/39 [00:33<03:05,  5.79s/it][A
 21%|██        | 8/39 [00:41<03:28,  6.73s/it][A
 23%|██▎       | 9/39 [00:44<02:41,  5.38s/it][A
 26%|██▌       | 10/39 [00:47<02:13,  4.60s/it][A
 28%|██▊       | 11/39 [00:56<02:45,  5.91s/it][A
 31%|███       | 12/39 [01:04<03:03,  6.80s/it][A
 33%|███▎      | 13/39 [01:06<02:15,  5.22s/it][A
 36%|███▌      | 14/39 [01:11<02:05,  5.02s/it][A
 38%|███▊      | 15/39 [01:16<02:01,  5.07s/it][A
 41%|████      | 16/39 [01:18<01:35,  4.13s/it][A
 44%|████▎     | 17/39 [01:26<02:01,  5.51s/it][A
 46%|████▌     | 18/39 [01:28<01:34,  4.49s/it][A
 49%|████▊     | 19/39 [01:30<01:11,  3.60s/i

{'eval_loss': 0.12973809242248535, 'eval_Global BLEU': 61.197050713994074, 'eval_valid_answer_structure_precent': 95.34109816971714, 'eval_valid_bundles_precent': 100.0, 'eval_n_bundles_mae': 0.3787085514834206, 'eval_mean_over_bundles': 0.06806282722513089, 'eval_mean_under_bundles': 0.3106457242582897, 'eval_title_bleu_1_bundle': 31.959071138182768, 'eval_title_chrf_1_bundle': 69.25242721199052, 'eval_price_match_precent_1_bundle': 84.28030303030303, 'eval_currency_match_precent_1_bundle': 92.23484848484848, 'eval_count_match_precent_1_bundle': 89.01515151515152, 'eval_title_bleu_multi_bundle': 19.27024936521683, 'eval_title_chrf_multi_bundle': 49.01640137530453, 'eval_price_match_precent_multi_bundle': 33.33333333333333, 'eval_currency_match_precent_multi_bundle': 83.33333333333334, 'eval_count_match_precent_multi_bundle': 100.0, 'eval_runtime': 886.8077, 'eval_samples_per_second': 0.689, 'eval_steps_per_second': 0.044, 'epoch': 1.0}



                                               [A


ORIGINAL TEXT
	RAW
	 Распродажа Платья Цена: 300р Размер: 44-46-48-50 Без выбора цвета Арт: 2Г-14 корпус Б</s>

TARGET
	RAW
	 <BOB> <BOT> Платья размер 44-46-48-50 <EOT> <BOP> 300 <EOP> <BOC1> RUB <EOC1> <BOC2> 1 <EOC2> <EOB></s>
	DECODED:
	 [{'title': 'Платья размер 44-46-48-50', 'price': '300', 'currency': 'RUB', 'count': '1'}]

PREDICTED
	RAW
	 <BOB> <BOT> Платья <EOT> <BOP> 300 <EOP> <BOC1> RUB <EOC1> <BOC2> 1 <EOC2> <EOB></s>
	DECODED:
	 [{'title': 'Платья', 'price': '300', 'currency': 'RUB', 'count': '1'}]
--------------------------------------------------


ORIGINAL TEXT
	RAW
	 вегетарианская энциклопедия вкусов, что с чем сочетается, справочник от шеф поваров. 15 eur. лимассол.</s>

TARGET
	RAW
	 <BOB> <BOT> вегетарианская энциклопедия вкусов, что с чем сочетается, справочник от шеф поваров <EOT> <BOP> 15 <EOP> <BOC1> EUR <EOC1> <BOC2> 1 <EOC2> <EOB></s>
	DECODED:
	 [{'title': 'вегетарианская энциклопедия вкусов, что с чем сочетается, справочник от шеф поваров', 'price': '15',

 19%|█▊        | 1000/5400 [19:32<16:27,  4.46it/s]    

{'loss': 0.1674, 'grad_norm': 1.9292196035385132, 'learning_rate': 4.876648718415608e-05, 'epoch': 1.85}


 20%|██        | 1080/5400 [20:45<15:15,  4.72it/s]   
  0%|          | 0/39 [00:00<?, ?it/s][A
  5%|▌         | 2/39 [00:04<01:27,  2.35s/it][A
  8%|▊         | 3/39 [00:13<03:02,  5.07s/it][A
 10%|█         | 4/39 [00:22<03:44,  6.41s/it][A
 13%|█▎        | 5/39 [00:30<04:04,  7.20s/it][A
 15%|█▌        | 6/39 [00:35<03:29,  6.34s/it][A
 18%|█▊        | 7/39 [00:41<03:20,  6.28s/it][A
 21%|██        | 8/39 [00:50<03:40,  7.11s/it][A
 23%|██▎       | 9/39 [00:55<03:14,  6.48s/it][A
 26%|██▌       | 10/39 [01:04<03:28,  7.18s/it][A
 28%|██▊       | 11/39 [01:13<03:33,  7.64s/it][A
 31%|███       | 12/39 [01:19<03:18,  7.36s/it][A
 33%|███▎      | 13/39 [01:28<03:21,  7.76s/it][A
 36%|███▌      | 14/39 [01:31<02:37,  6.29s/it][A
 38%|███▊      | 15/39 [01:40<02:47,  6.99s/it][A
 41%|████      | 16/39 [01:45<02:27,  6.39s/it][A
 44%|████▎     | 17/39 [01:53<02:35,  7.07s/it][A
 46%|████▌     | 18/39 [01:58<02:16,  6.48s/it][A
 49%|████▊     | 19/39 [02:04<02:05,  6.27s/

{'eval_loss': 0.09975825250148773, 'eval_Global BLEU': 71.0072311918113, 'eval_valid_answer_structure_precent': 95.50748752079868, 'eval_valid_bundles_precent': 100.0, 'eval_n_bundles_mae': 0.2613240418118467, 'eval_mean_over_bundles': 0.21080139372822299, 'eval_mean_under_bundles': 0.050522648083623695, 'eval_title_bleu_1_bundle': 33.791613281413156, 'eval_title_chrf_1_bundle': 71.7810092731908, 'eval_price_match_precent_1_bundle': 91.08695652173913, 'eval_currency_match_precent_1_bundle': 94.34782608695652, 'eval_count_match_precent_1_bundle': 91.95652173913044, 'eval_title_bleu_multi_bundle': 26.152254277990227, 'eval_title_chrf_multi_bundle': 58.698369275574166, 'eval_price_match_precent_multi_bundle': 62.01877934272301, 'eval_currency_match_precent_multi_bundle': 85.91549295774648, 'eval_count_match_precent_multi_bundle': 89.20187793427229, 'eval_runtime': 984.5117, 'eval_samples_per_second': 0.621, 'eval_steps_per_second': 0.04, 'epoch': 2.0}



                                               [A


ORIGINAL TEXT
	RAW
	 Распродажа Платья Цена: 300р Размер: 44-46-48-50 Без выбора цвета Арт: 2Г-14 корпус Б</s>

TARGET
	RAW
	 <BOB> <BOT> Платья размер 44-46-48-50 <EOT> <BOP> 300 <EOP> <BOC1> RUB <EOC1> <BOC2> 1 <EOC2> <EOB></s>
	DECODED:
	 [{'title': 'Платья размер 44-46-48-50', 'price': '300', 'currency': 'RUB', 'count': '1'}]

PREDICTED
	RAW
	 <BOB> <BOT> Платья <EOT> <BOP> 300 <EOP> <BOC1> RUB <EOC1> <BOC2> 1 <EOC2> <EOB></s>
	DECODED:
	 [{'title': 'Платья', 'price': '300', 'currency': 'RUB', 'count': '1'}]
--------------------------------------------------


ORIGINAL TEXT
	RAW
	 вегетарианская энциклопедия вкусов, что с чем сочетается, справочник от шеф поваров. 15 eur. лимассол.</s>

TARGET
	RAW
	 <BOB> <BOT> вегетарианская энциклопедия вкусов, что с чем сочетается, справочник от шеф поваров <EOT> <BOP> 15 <EOP> <BOC1> EUR <EOC1> <BOC2> 1 <EOC2> <EOB></s>
	DECODED:
	 [{'title': 'вегетарианская энциклопедия вкусов, что с чем сочетается, справочник от шеф поваров', 'price': '15',

 28%|██▊       | 1500/5400 [38:47<14:45,  4.41it/s]     

{'loss': 0.129, 'grad_norm': 1.0328357219696045, 'learning_rate': 4.511177134582914e-05, 'epoch': 2.78}


 30%|███       | 1620/5400 [40:08<13:48,  4.56it/s]   
  0%|          | 0/39 [00:00<?, ?it/s][A
  5%|▌         | 2/39 [00:04<01:15,  2.03s/it][A
  8%|▊         | 3/39 [00:12<02:55,  4.88s/it][A
 10%|█         | 4/39 [00:21<03:42,  6.35s/it][A
 13%|█▎        | 5/39 [00:27<03:32,  6.25s/it][A
 15%|█▌        | 6/39 [00:32<03:08,  5.71s/it][A
 18%|█▊        | 7/39 [00:38<03:04,  5.75s/it][A
 21%|██        | 8/39 [00:44<03:00,  5.82s/it][A
 23%|██▎       | 9/39 [00:49<02:46,  5.55s/it][A
 26%|██▌       | 10/39 [00:57<03:03,  6.34s/it][A
 28%|██▊       | 11/39 [01:06<03:18,  7.10s/it][A
 31%|███       | 12/39 [01:12<03:06,  6.91s/it][A
 33%|███▎      | 13/39 [01:19<02:56,  6.80s/it][A
 36%|███▌      | 14/39 [01:21<02:19,  5.58s/it][A
 38%|███▊      | 15/39 [01:28<02:17,  5.73s/it][A
 41%|████      | 16/39 [01:30<01:49,  4.77s/it][A
 44%|████▎     | 17/39 [01:33<01:29,  4.08s/it][A
 46%|████▌     | 18/39 [01:38<01:31,  4.37s/it][A
 49%|████▊     | 19/39 [01:41<01:23,  4.17s/

{'eval_loss': 0.09591393172740936, 'eval_Global BLEU': 77.593588620695, 'eval_valid_answer_structure_precent': 99.0, 'eval_valid_bundles_precent': 100.0, 'eval_n_bundles_mae': 0.19023569023569023, 'eval_mean_over_bundles': 0.13973063973063973, 'eval_mean_under_bundles': 0.050505050505050504, 'eval_title_bleu_1_bundle': 36.03530959297528, 'eval_title_chrf_1_bundle': 72.44280639645692, 'eval_price_match_precent_1_bundle': 93.34763948497854, 'eval_currency_match_precent_1_bundle': 94.63519313304721, 'eval_count_match_precent_1_bundle': 91.41630901287554, 'eval_title_bleu_multi_bundle': 25.831816699965792, 'eval_title_chrf_multi_bundle': 60.79868806603791, 'eval_price_match_precent_multi_bundle': 63.789682539682545, 'eval_currency_match_precent_multi_bundle': 90.47619047619048, 'eval_count_match_precent_multi_bundle': 92.81746031746032, 'eval_runtime': 1058.458, 'eval_samples_per_second': 0.577, 'eval_steps_per_second': 0.037, 'epoch': 3.0}



                                               [A


ORIGINAL TEXT
	RAW
	 Распродажа Платья Цена: 300р Размер: 44-46-48-50 Без выбора цвета Арт: 2Г-14 корпус Б</s>

TARGET
	RAW
	 <BOB> <BOT> Платья размер 44-46-48-50 <EOT> <BOP> 300 <EOP> <BOC1> RUB <EOC1> <BOC2> 1 <EOC2> <EOB></s>
	DECODED:
	 [{'title': 'Платья размер 44-46-48-50', 'price': '300', 'currency': 'RUB', 'count': '1'}]

PREDICTED
	RAW
	 <BOB> <BOT> Платья <EOT> <BOP> 300 <EOP> <BOC1> RUB <EOC1> <BOC2> 1 <EOC2> <EOB></s>
	DECODED:
	 [{'title': 'Платья', 'price': '300', 'currency': 'RUB', 'count': '1'}]
--------------------------------------------------


ORIGINAL TEXT
	RAW
	 вегетарианская энциклопедия вкусов, что с чем сочетается, справочник от шеф поваров. 15 eur. лимассол.</s>

TARGET
	RAW
	 <BOB> <BOT> вегетарианская энциклопедия вкусов, что с чем сочетается, справочник от шеф поваров <EOT> <BOP> 15 <EOP> <BOC1> EUR <EOC1> <BOC2> 1 <EOC2> <EOB></s>
	DECODED:
	 [{'title': 'вегетарианская энциклопедия вкусов, что с чем сочетается, справочник от шеф поваров', 'price': '15',

 37%|███▋      | 2000/5400 [59:15<13:12,  4.29it/s]     

{'loss': 0.1126, 'grad_norm': 1.0584112405776978, 'learning_rate': 3.940789717249119e-05, 'epoch': 3.7}


 40%|████      | 2160/5400 [1:00:46<12:17,  4.39it/s]   
  0%|          | 0/39 [00:00<?, ?it/s][A
  5%|▌         | 2/39 [00:04<01:19,  2.16s/it][A
  8%|▊         | 3/39 [00:08<01:53,  3.14s/it][A
 10%|█         | 4/39 [00:17<03:02,  5.21s/it][A
 13%|█▎        | 5/39 [00:23<03:08,  5.54s/it][A
 15%|█▌        | 6/39 [00:28<02:52,  5.22s/it][A
 18%|█▊        | 7/39 [00:32<02:40,  5.03s/it][A
 21%|██        | 8/39 [00:38<02:37,  5.08s/it][A
 23%|██▎       | 9/39 [00:42<02:25,  4.85s/it][A
 26%|██▌       | 10/39 [00:48<02:34,  5.32s/it][A
 28%|██▊       | 11/39 [00:55<02:37,  5.61s/it][A
 31%|███       | 12/39 [01:00<02:33,  5.67s/it][A
 33%|███▎      | 13/39 [01:06<02:25,  5.61s/it][A
 36%|███▌      | 14/39 [01:09<01:58,  4.76s/it][A
 38%|███▊      | 15/39 [01:14<01:56,  4.84s/it][A
 41%|████      | 16/39 [01:16<01:35,  4.15s/it][A
 44%|████▎     | 17/39 [01:25<02:00,  5.49s/it][A
 46%|████▌     | 18/39 [01:29<01:47,  5.13s/it][A
 49%|████▊     | 19/39 [01:33<01:32,  4.65

In [None]:
# When LoRA was enabled, merge the adapters into the wrapped model and keep
# the merged result, so that is the model written to disk.
if lora_rank is not None:
    model = model.merge_and_unload()

# The inner subscript deliberately uses single quotes: reusing the f-string's
# own double quotes inside the replacement field is a SyntaxError on every
# Python version before 3.12.
output_dir = f"../good_checkpoints/{params['save_folder']}"

# Save the model and tokenizer side by side in the same directory.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)