# Fine-tuning ruT5  

В данном ноутбуке дообучаю seq2seq модель `sberbank-ai/ruT5-base` для задачи генерации описаний карточек товаров (одежды) по названию и характеристикам.

Входные данные: название и характеристики.   
Выходные данные: развернутое рекламное описание.

In [2]:
pip install -U transformers



In [3]:
pip install datasets evaluate rouge_score

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=f8cdd4c943e3268f979e196401b998ac4ac8e105fc2fbaf90cb61d01ca573cb6
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.5 rouge_score-0.1.2


## 1. Загрузка и подготовка данных

In [4]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import os
import pandas as pd
import numpy as np

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data = pd.read_csv('/content/drive/MyDrive/proj_cloth_desc_gen/data_cleaned_short.csv', index_col = 0)
print(len(data))
data.head()

In [6]:
import torch
import random

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

Делим датасет на:
- трейн - обучение
- валидация - отбор «лучшей модели»
- тест - тестирование качества генераций моделей

In [None]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(data, test_size=0.3, random_state=42, stratify = data['type'])
df_train, df_val = train_test_split(df_train, test_size=0.10, random_state=42, stratify = df_train['type'])
len(df_train), len(df_val), len(df_test)

(44727, 4970, 21299)

In [None]:
df_test.to_csv('drive/MyDrive/proj_cloth_desc_gen/df_test.csv')
df_train.to_csv('drive/MyDrive/proj_cloth_desc_gen/df_train.csv')
df_val.to_csv('drive/MyDrive/proj_cloth_desc_gen/df_val.csv')

In [None]:
df_test =  pd.read_csv('drive/MyDrive/proj_cloth_desc_gen/df_test.csv', index_col =0)
df_train = pd.read_csv('drive/MyDrive/proj_cloth_desc_gen/df_train.csv', index_col =0)
df_val = pd.read_csv('drive/MyDrive/proj_cloth_desc_gen/df_val.csv', index_col =0)

## 2. Преобразование данных для seq2seq и токенизация
В своей задаче использую русскоязычную T5 `ai-forever/rut5-base` от Sber AI

In [None]:
from datasets import Dataset

train_dataset = Dataset.from_dict({'input_text': df_train['input'], 'target_text': df_train['final_desc']})
val_dataset = Dataset.from_dict({'input_text': df_val['input'], 'target_text': df_val['final_desc']})
test_dataset = Dataset.from_dict({'input_text': df_test['input'], 'target_text': df_test['final_desc']})

In [None]:
# загрузка токенизатора и модели
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "ai-forever/rut5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# токенизация
def preprocess_function(examples):
    inputs = [item for item in examples["input_text"]]

    # токенизация входа 
    tokenized_inputs = tokenizer(
        inputs,
        max_length=160,
        truncation=True,
        padding="max_length"
    )

    # токенизация выхода - описания
    labels = tokenizer(
        examples["target_text"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    for label in labels["input_ids"]:
        if any(token >= tokenizer.vocab_size or token < 0 for token in label):
            print("Incorrect tokens in label:", label)

    # заменяем pad_token_id на -100, чтобы игнорировать паддинги при расчёте loss 
    tokenized_inputs["labels"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in example]
        for example in labels["input_ids"]
    ]

    return tokenized_inputs


In [None]:
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=['input_text', 'target_text'])
tokenized_val = val_dataset.map(preprocess_function, batched=True, remove_columns=['input_text', 'target_text'])
tokenized_test = test_dataset.map(preprocess_function, batched=True, remove_columns=['input_text', 'target_text'])


Map:   0%|          | 0/44727 [00:00<?, ? examples/s]

Map:   0%|          | 0/4970 [00:00<?, ? examples/s]

Map:   0%|          | 0/21299 [00:00<?, ? examples/s]

## 3. Fine-tune модели ruT5
- Обучаю модель на GPU A100 с использованием fp16 для ускорения.  
- Лучшая модель выбирается автоматически по метрике ROUGE-L на валидации. Метрика ROUGE-L измеряет длину наибольшей общей подпоследовательности между эталонным описанием и сгенерированным текстом, что позволяет оценивать не только совпадение слов, но и их порядок. 
- Во время валидации используется генерация, так как для расчета метрики ROUGE-L требуется генерация текста.  
- Также использую раннюю остановку, чтобы не тратить время на лишние эпохи, если качество перестаёт расти.

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_name)

In [None]:
import evaluate

rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")
meteor_metric = evaluate.load("meteor")

# функция декодирования, чтобы не падало если появляется токен вне словаря
def safe_decode(preds, tokenizer):
   unk_token_id = tokenizer.unk_token_id
   vocab_size = tokenizer.vocab_size
   safe_preds = [ [token if 0 <= token < vocab_size else unk_token_id for token in pred]
                for pred in preds ]
   decoded_texts = tokenizer.batch_decode(safe_preds, skip_special_tokens=True)
   return decoded_texts

# функция расчета метрик
def compute_metrics_train(eval_pred):

    preds, labels = eval_pred
    print(f"Shape of preds: {preds.shape}, Shape of labels: {labels.shape}")

    # декодируем предсказания
    decoded_preds = safe_decode(preds, tokenizer)

    # возвращаем обратно паддинги, чтобы можно было декодировать
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    rouge_results = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
    rouge_1 = rouge_results["rouge1"] * 100
    rouge_2 = rouge_results["rouge2"] * 100
    rouge_l = rouge_results["rougeL"] * 100

    bleu_results = bleu_metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])
    bleu = bleu_results["bleu"] * 100

    meteor_results = meteor_metric.compute(predictions=decoded_preds, references=decoded_labels)
    meteor = meteor_results["meteor"] * 100

    metrics = {
        "rouge1": rouge_1,
        "rouge2": rouge_2,
        "rougeL": rouge_l,
        "bleu": bleu,
        "meteor": meteor,
    }

    return metrics

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
import torch
import gc

torch.cuda.empty_cache()
gc.collect()

30

In [None]:
!nvidia-smi

Tue Jun  3 10:46:44 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P0             47W /  400W |       5MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# пайплайн обучения
from transformers import  Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/proj_cloth_desc_gen/results",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=3e-5, 
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=15,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    metric_for_best_model="eval_rougeL",
    greater_is_better=True,
    predict_with_generate=True,
    generation_max_length=512,
    # generation_num_beams = 1,
    optim="adafactor",
    seed=42,
    fp16=True,
    dataloader_num_workers=4,
    report_to="none",
    logging_strategy="steps",
    logging_steps=100
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_train,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

In [None]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu,Meteor
1,2.6589,2.292594,1.295422,0.313,1.302076,1.931478,13.762413
2,2.4324,2.172657,1.282488,0.27229,1.278255,2.743541,15.683539
3,2.3308,2.102715,1.342835,0.289251,1.333051,3.243134,16.627683
4,2.2809,2.059923,1.591822,0.311566,1.587838,3.749806,16.155527
5,2.2287,2.034802,1.511416,0.350884,1.502459,3.538695,15.919805
6,2.1854,1.998801,1.821017,0.375951,1.806832,3.578144,15.939519


Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)


In [None]:
checkpoints = [d for d in os.listdir("drive/MyDrive/proj_cloth_desc_gen/results") if d.startswith("checkpoint")]
checkpoints.sort(key=lambda x: int(x.split("-")[-1]))
print(checkpoints[-1]) 

checkpoint-4194


In [None]:
trainer.train(resume_from_checkpoint='drive/MyDrive/proj_cloth_desc_gen/results/checkpoint-4194')

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu,Meteor
7,2.1538,1.98308,1.912019,0.353367,1.892035,3.308132,15.597947
8,2.1316,1.973265,1.861841,0.373513,1.841243,3.289819,15.654258
9,2.1237,1.956903,2.105618,0.389234,2.092297,3.271575,15.700356
10,2.1173,1.948321,2.292069,0.421228,2.274883,3.140431,15.461627
11,2.0942,1.939864,1.995189,0.39509,1.993335,3.116032,15.470513
12,2.0747,1.934288,2.166715,0.362786,2.174963,3.170089,15.508807
13,2.0733,1.930493,2.160514,0.389256,2.166935,3.243425,15.773367


Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=9087, training_loss=1.1385063999646619, metrics={'train_runtime': 25441.1246, 'train_samples_per_second': 26.371, 'train_steps_per_second': 0.412, 'total_flos': 1.106497420075008e+17, 'train_loss': 1.1385063999646619, 'epoch': 13.0})

In [None]:
checkpoints = [d for d in os.listdir("drive/MyDrive/proj_cloth_desc_gen/results") if d.startswith("checkpoint")]
checkpoints.sort(key=lambda x: int(x.split("-")[-1]))
print(checkpoints[-1]) 

checkpoint-9087


In [None]:
trainer.train(resume_from_checkpoint='drive/MyDrive/proj_cloth_desc_gen/results/checkpoint-6990')

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu,Meteor
11,2.0942,1.939864,1.995189,0.39509,1.993335,3.116032,15.470513
12,2.0747,1.934288,2.166715,0.362786,2.174963,3.170089,15.508807
13,2.0733,1.930493,2.160514,0.389256,2.166935,3.243425,15.773367


Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)
Shape of preds: (4970, 512), Shape of labels: (4970, 512)


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=9087, training_loss=0.4816788473682355, metrics={'train_runtime': 11158.1955, 'train_samples_per_second': 60.127, 'train_steps_per_second': 0.94, 'total_flos': 1.106497420075008e+17, 'train_loss': 0.4816788473682355, 'epoch': 13.0})

In [None]:
# сохранение финальной модели
trainer.save_model('drive/MyDrive/proj_cloth_desc_gen/final_model')
tokenizer.save_pretrained('drive/MyDrive/proj_cloth_desc_gen/final_model')

('drive/MyDrive/proj_cloth_desc_gen/final_model/tokenizer_config.json',
 'drive/MyDrive/proj_cloth_desc_gen/final_model/special_tokens_map.json',
 'drive/MyDrive/proj_cloth_desc_gen/final_model/tokenizer.json')

## 4. Оценка модели на тесте

In [None]:
# загружаем финальную модель
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = 'drive/MyDrive/proj_cloth_desc_gen/final_model'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
model.eval().to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
from transformers import  Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/proj_cloth_desc_gen/results",  #
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=15,
    lr_scheduler_type="linear",
    load_best_model_at_end=True,
    metric_for_best_model="eval_rougeL",
    greater_is_better=True,
    predict_with_generate=True,
    generation_max_length=512,
    optim="adafactor",
    seed=42,
    fp16=True,
    dataloader_num_workers=4,
    report_to="none",
    logging_strategy="steps",
    logging_steps=100
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics_train,
    eval_dataset=tokenized_test
)


In [None]:
test_result = trainer.predict(tokenized_test)
print("Test metrics:")
for key, value in test_result.metrics.items():
    print(f"{key}: {value:.2f}")


Shape of preds: (21299, 512), Shape of labels: (21299, 512)
Test metrics:
test_loss: 1.94
test_model_preparation_time: 0.01
test_rouge1: 2.45
test_rouge2: 0.58
test_rougeL: 2.44
test_bleu: 3.18
test_meteor: 15.50
test_runtime: 13115.16
test_samples_per_second: 1.62
test_steps_per_second: 0.10


Метрики на тесте примерно совпадают с валидацией.  
Их значения низкие, но это ожидаемо: цель не копировать текст один в один, а генерировать связные маркетинговые описания.  
Поэтому основной упор делаем на ручной анализ качества примеров генерации.

## 5. Генерация текста
Беру сэмпл из 200 примеров из тестовой выборки. Для каждого примера генерирую описание и сохраняю результат в CSV. 

Используемые параметры генерации:
- num_beams = 5, do_sample = False - beam search для уменьшения вариативности и галлюцинаций
- no_repeat_ngram_size = 3, repetition_penalty = 1.5  - защита от повторов и клише


In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = 'drive/MyDrive/proj_cloth_desc_gen/final_model'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
df_test = pd.read_csv('drive/MyDrive/proj_cloth_desc_gen/df_test.csv', index_col =0)
print(df_test.shape[0])
df_test.head()

In [1]:
# функция генерации текста
def generate_description(input_text, gen_params = None):
    if gen_params is None:
        gen_params = {}

    inputs = tokenizer(
        input_text, 
        max_length=160, 
        truncation=True, 
        padding="max_length", 
        return_tensors="pt").to(model.device)

    outputs = model.generate(**inputs, **gen_params)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
random_row = df_test.sample(n=1).iloc[0]

input_text = random_row['input']
true_target = random_row['final_desc']

print(f"Input text:\n{input_text}\n")
print(f"True target:\n{true_target}\n")

Input text:
Сгенерируй описание одежды для карточки товара:
Наименование товара: Спортивный костюм теплый с начесом
Утеплитель: начес
Температурный режим: от -15 °С до +10 °С
Фактура материала: плотная
Опции капюшона: несъемный капюшон
Тип карманов: кенгуру
Вид застежки: завязки
Особенности модели: с начесом; капюшон; теплый
Сезон: зима
Тип рукава: длинные
Покрой: оверсайз
Комплектация: худи; джоггеры
Страна производства: Россия

True target:
Спортивныи костюм мужскои с начесом - это настоящии лидер среди спортивнои одежды. Демисезонныи костюм изготовлен из высококачественного футера с 80% хлопком.
Ткань отлично держит тепло, не образует катышки, приятна к телу и проста в уходе.Утепленныи спортивныи комплект состоит из худи с капюшоном, длинными рукавами, карманом кенгуру и брюк - джоггеров с резинкои в поясе и прорезными боковыми карманами. Подоидет высоким и не высоким парням.Благодаря тому, что этот костюм изготовлен на собственном производстве в России - вы можете не сомневаться в 

In [None]:
gen_params = dict(
    num_beams=5,
    repetition_penalty=1.5,
    length_penalty=1.0,
    no_repeat_ngram_size=3,
    do_sample=False
)

generated_text = generate_description(input_text, gen_params=gen_params))

print(f"Input text:\n{input_text}\n")
print(f"Generated text:\n{generated_text}\n")

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Input text:
Сгенерируй описание одежды для карточки товара:
Наименование товара: Спортивный костюм теплый с начесом
Утеплитель: начес
Температурный режим: от -15 °С до +10 °С
Фактура материала: плотная
Опции капюшона: несъемный капюшон
Тип карманов: кенгуру
Вид застежки: завязки
Особенности модели: с начесом; капюшон; теплый
Сезон: зима
Тип рукава: длинные
Покрой: оверсайз
Комплектация: худи; джоггеры
Страна производства: Россия

Generated text:
Спортивныи костюм теплыи с начесом от бренда представляет собои идеальныи выбор для мужчин, ищущих комфорт и стиль в повседневнои одежде. Этот костюм выполнен из высококачественного материала, состоящего из хлопка и полиэстера, что обеспечивает не только тепло, но и долговечность изделия. Особенностью этого костюма является его оверсаиз крои, которыи обеспечивает свободу движении и удобство при носке. Костюм оснащен капюшоном и карманами кенгуру, что делает его удобным и практичным выбором для ношения в холодное время года. Отсутствие декоратив

In [None]:
gen_params = dict(
    num_beams=5,
    repetition_penalty=1.5,
    length_penalty=1.0,
    no_repeat_ngram_size=3,
    do_sample=False
)


sample_df = df_test.sample(n=200, random_state=42)
sample_df["t5_gen"] = sample_df["input"].apply(lambda x: generate_description(x, gen_params=gen_params)

In [14]:
sample_df.to_csv('drive/MyDrive/proj_cloth_desc_gen/df_sample_pred.csv')