In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from matplotlib import pyplot as plt

import pandas as pd
import transformers as tf
import datasets as ds

In [None]:
# One seed for both the shuffle and the split, so the exact train/test
# membership is reproducible across kernel restarts.
SEED = 42

# Load the riddle/answer pairs from the local CSV; the 'csv' loader puts
# everything into a single 'train' split, which is extracted after shuffling.
data = ds.load_dataset('csv', data_files='puzzles_dataset.csv')
data = data.shuffle(seed=SEED)['train']

# Hold out 5% of the rows for testing. Without an explicit seed the split
# is re-drawn randomly on every run, which makes results incomparable.
data = data.train_test_split(test_size=0.05, seed=SEED)

# Drop the 'Unnamed: 0' column (presumably a leftover index column from the
# CSV export -- confirm) so only 'riddle' and 'answer' remain.
train = data['train'].remove_columns(['Unnamed: 0'])
test = data['test'].remove_columns(['Unnamed: 0'])
print(train, test)



  0%|          | 0/1 [00:00<?, ?it/s]



Dataset({
    features: ['riddle', 'answer'],
    num_rows: 1502
}) Dataset({
    features: ['riddle', 'answer'],
    num_rows: 80
})


In [None]:
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer

# Hub checkpoint to fine-tune; change this name to pick a different model size.
model_name = "cointegrated/rut5-small"

# Tokenizer and seq2seq model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
)

loading configuration file https://huggingface.co/cointegrated/rut5-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/c273e887f3514427c040f14673c0ae0b98e39840a74a5daf3ac418d11036772e.baa5ae8d61b82b2d6c8a21d560eed1d911b7e8a664d2ed99c18d570b052c71a5
Model config MT5Config {
  "_name_or_path": "cointegrated/rut5-small",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "mt5",
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "transformers_version": 

In [None]:
# Task prefix prepended to every riddle before it is fed to the model.
prefix = 'guess: '

def preprocess_function(examples):
    """Tokenize a batch of riddles (inputs) and answers (labels).

    Args:
        examples: a batch from the dataset, i.e. a mapping with a "riddle"
            list and an "answer" list of equal length (as produced by
            ``map(..., batched=True)``).

    Returns:
        The tokenizer output for the prefixed riddles, with an extra
        "labels" entry holding the token ids of the answers.
    """
    inputs = [prefix + doc for doc in examples["riddle"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    # `text_target=` is the supported way to tokenize targets; it replaces
    # the deprecated `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=examples["answer"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply preprocess_function to every split of `data`, one batch at a time;
# the columns it returns are added alongside the existing ones.
tokenized_data = data.map(
    preprocess_function,
    batched=True,
)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch builder for seq2seq training: pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
EPOCH_NUM = 4

# T5-family models need a higher learning rate than the Trainer default when
# optimised with AdamW; the T5 documentation recommends 1e-4 .. 3e-4.
# The previous value (2e-5) left the training loss above 5 and the model
# mostly echoing its input.
LEARNING_RATE = 1e-4

training_args = tf.Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=LEARNING_RATE,
    do_train=True,
    logging_steps=8,        # report the training loss every 8 optimisation steps
    save_steps=100,         # write a checkpoint every 100 steps ...
    save_total_limit=3,     # ... keeping only the 3 most recent ones on disk
    warmup_steps=32,
    num_train_epochs=EPOCH_NUM,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
)

# NOTE: `eval_dataset` is only consumed when an evaluation strategy is
# configured or `trainer.evaluate()` is called; this cell does neither, so
# only the training loss is reported.
trainer = tf.Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `MT5ForConditionalGeneration.forward` and have been ignored: riddle, Unnamed: 0, answer. If riddle, Unnamed: 0, answer are not expected by `MT5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1502
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 376


Step,Training Loss
8,7.8629
16,8.3063
24,7.7594
32,7.5217
40,7.2017
48,6.8612
56,6.5911
64,6.0727
72,5.8641
80,5.6679


Saving model checkpoint to ./results/checkpoint-100
Configuration saved in ./results/checkpoint-100/config.json
Model weights saved in ./results/checkpoint-100/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-100/special_tokens_map.json
Copy vocab file to ./results/checkpoint-100/spiece.model
Saving model checkpoint to ./results/checkpoint-200
Configuration saved in ./results/checkpoint-200/config.json
Model weights saved in ./results/checkpoint-200/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-200/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-200/special_tokens_map.json
Copy vocab file to ./results/checkpoint-200/spiece.model
Saving model checkpoint to ./results/checkpoint-300
Configuration saved in ./results/checkpoint-300/config.json
Model weights saved in ./results/checkpoint-300/pytorch_model.bin
tokenizer config file saved in ./results

TrainOutput(global_step=376, training_loss=5.244500393563128, metrics={'train_runtime': 1916.3655, 'train_samples_per_second': 3.135, 'train_steps_per_second': 0.196, 'total_flos': 135584603541504.0, 'train_loss': 5.244500393563128, 'epoch': 4.0})

In [None]:
def guess_the_riddle(sample):
  """Generate the model's answer for one riddle.

  Args:
      sample: a single dataset row (mapping) with a 'riddle' string.

  Returns:
      The same mapping with an added 'prediction' key holding the decoded
      model output (special tokens stripped).
  """
  # Reuse the training-time prefix so inference inputs are formatted exactly
  # like the examples the model was fine-tuned on.
  encoded = tokenizer(prefix + sample['riddle'], return_tensors='pt')

  # The Trainer may have moved the model to an accelerator; the inputs must
  # live on the same device or generate() fails with a device mismatch.
  tokens = encoded.input_ids.to(model.device)

  outputs = model.generate(tokens)
  sample['prediction'] = tokenizer.decode(outputs[0], skip_special_tokens=True)

  return sample

In [None]:
# Run the model over the held-out split. This must execute in this cell:
# with the line commented out, `riddles` is undefined on a fresh kernel and
# the loop below raises NameError.
riddles = test.map(guess_the_riddle)

# Show a sample of ten predictions (rows 10..19), clamped to the split size.
for i in range(10, min(20, len(riddles))):
  # Fetch the row once instead of materialising each full column per lookup.
  row = riddles[i]
  print(f'''
    RIDDLE: {row['riddle']}
    ANSWER: {row['answer']}
    PRED_ANSWER: {row['prediction']}''')


    RIDDLE: На дне, где тихо и темно,  Лежит усатое бревно.
    ANSWER: Сом
    PRED_ANSWER: На дне, где тихо и темно

    RIDDLE: Домик круглый, домик белый,  Домик был сначала целый,  А как треснул наконец,  Так и выскочил жилец!
    ANSWER: цыпленок
    PRED_ANSWER: Что такое Домик круглый, домик белый, домик

    RIDDLE: Твоему мячу подобен, Только вкусен и съедобен. Хоть зелёные бока, Мякоть красная сладка.
    ANSWER: Арбуз
    PRED_ANSWER: Твоему мячу

    RIDDLE: Не тронь его мякоть, А то придётся плакать: Но поваров такая боль Не остановит ни на сколь. 
    ANSWER: Лук
    PRED_ANSWER: Что ж

    RIDDLE: Хвостиком виляет, зубаста, а не лает.
    ANSWER: собака
    PRED_ANSWER: Что такое зубаста

    RIDDLE: Маленькая, горбатенькая, Скачет, несется, В руки  Не дается
    ANSWER: Блоха
    PRED_ANSWER: Что такое Маленькая, горбатенькая

    RIDDLE: Бывают ли у дождика Четыре колеса? Скажи, как называются Такие чудеса?
    ANSWER: Поливальная машина
    PRED_ANSWER: Как называют