In [7]:
!pip install -qq evaluate rouge_score

/bin/bash: /opt/conda/lib/libtinfo.so.6: no version information available (required by /bin/bash)


In [8]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict

from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer
from transformers import get_scheduler

import pandas as pd
import numpy as np
import re, os

from sklearn.model_selection import train_test_split

from nltk.tokenize import sent_tokenize
import nltk
nltk.download("punkt")

import evaluate
import torch

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
model_checkpoint = "sberbank-ai/ruT5-base"

os.environ["WANDB_DISABLED"] = "true"
api_key = "56c1808ba8ff36c4a47631bc7ed7085928332d7b"

In [10]:
df = pd.read_csv("/kaggle/input/recipes-and-interpretation-dim/all_recepies_inter.csv", sep="\t", usecols=["name", "Инструкции"]).rename(columns={"Инструкции": "Instructions"})

In [11]:
df["name"] = df["name"].apply(lambda x: x.replace(u'\xa0', u' '))
df["Instructions"] = df["Instructions"].apply(lambda x: x.replace(u'\xa0', u' '))
df["Instructions"] = df["Instructions"].apply(lambda x: re.sub("\n|\r|\t", " ",  x))

In [12]:
train, valid = train_test_split(df, test_size=0.3, random_state=42)
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)

In [13]:
train_ds = Dataset.from_pandas(train)
valid_ds = Dataset.from_pandas(valid)

ds = DatasetDict()

ds['train'] = train_ds
ds['validation'] = valid_ds

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/980k [00:00<?, ?B/s]

In [15]:
max_input_length = 512
max_target_length = 16


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["Instructions"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["name"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
tokenized_datasets = ds.map(preprocess_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [18]:
batch_size = 8
num_train_epochs = 3

logging_steps = len(tokenized_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-sber-ai",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=False,
    report_to="none",
)

In [20]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [21]:
tokenized_datasets = tokenized_datasets.remove_columns(
    ds["train"].column_names
)

In [22]:
rouge_score = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    # result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v * 100, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [23]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [24]:
trainer.train()

***** Running training *****
  Num examples = 70
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 27


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,7.6581,3.71955,0.0,0.0,0.0,0.0
2,5.1441,2.303535,0.0,0.0,0.0,0.0
3,4.1863,2.130424,0.0,0.0,0.0,0.0


***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8
***** Running Evaluation *****
  Num examples = 30
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=27, training_loss=5.48429376107675, metrics={'train_runtime': 18.8287, 'train_samples_per_second': 11.153, 'train_steps_per_second': 1.434, 'total_flos': 85358769315840.0, 'train_loss': 5.48429376107675, 'epoch': 3.0})

In [27]:
def generate_title(model, text=None, verbose=True):
    if text is None:
        idx = np.random.randint(len(valid))
        text = valid.iloc[idx]["Instructions"]
        if verbose:
            print(f"GT: {text}")
    
    inputs = tokenizer(text, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    
    outputs = model.generate(
                            **inputs, 
                            do_sample=False,
                            max_length=max_target_length+15, 
                            repetition_penalty=5., 
                            temperature=0.5,
                            num_beams=10,
                        )
    decoded = tokenizer.decode(outputs[0])
    decoded = decoded.replace("<pad>", "").replace("</s>", "") 
    if verbose:
        print(f"Title: {decoded}\n")
    return decoded

In [28]:
for _ in range(5):
    t = generate_title(model)

GT: 1. Вскипятите в кастрюле 1 литр слегка подсоленной воды, засыпать кускус и варить до готовности, около 10 минут. Если кускус готов, но не вся вода впиталась, то просто слейте ее.  2. Изюм промойте, залейте на несколько минут горячей водой. Затем слейте воду, добавьте изюм к кускусу и поставьте в теплое место.  3. Куриное филе нарежьте кубиками.  4. Стручки перца разрежьте на 4 части, удалите семена и разрежьте на квадратики.  5. Лук нарежьте тонкими кольцами.  6. Куриное филе обжарьте вместе с орехами в течение 2–3 минут, затем добавьте сладкий перец, лук и тушите еще 8 минут.  7. Смешайте все с кускусом, приправьте солью, тмином, карри и измельченным чесноком.
Title:  Куриное филе с кускусом и орехами

GT: 1. Мелко нарезать лук, натереть на крупной тёрке морковь и нарезать небольшими кусочками весь чеснок.  2. Обжарить лук до полуготовности. Часть из него отложить на фарш.  3. В лук всыпать натертую морковь и чеснок — тушить на среднем огне минут 7, периодически помешивая. Нарезат

# [SEE RESULTS](https://www.kaggle.com/code/pankratozzi/hf-torch-text-summarization)