In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch as pt
import transformers as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datasets as ds

In [None]:
model_name = "cointegrated/rubert-tiny2"
#model_name = 'sberbank-ai/ruBert-base'
#model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = tf.BertTokenizer.from_pretrained(model_name)
model = tf.EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)

loading file https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/16af2afaa4ceaa8d50b689bd4c2f7ef7fe3bfac06c0aac7d82a5c1c72298b62a.cc3312d07ccf88871a3c2b7cb3442138e6785101efead94d9f77e96301cf7f4a
loading file https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/20317640533199c6b37a557395cd5ee5fcb8777be7c89bb1314bfd43058b35e9.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/e818c3e83969c6aa46da3b5b2eafe049b197e0e787503bf7c643ec64422a51fc.1cec470309dd45bda58f63ce3bb829fe84e2a93e1fc2857ceff76e77262d7944
loading configuration file https://huggingface.co/cointegrate

In [None]:
# hyperparameters
BATCH_SIZE = 32
EPOCH_NUM = 3

encoder_max_length = 256
decoder_max_length = 16

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size

In [None]:
def process_data_to_model_inputs(batch):
  # Tokenize the input and target data
  inputs = tokenizer(batch['riddle'], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch['answer'], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

In [None]:
data = ds.load_dataset('csv', data_files='puzzles_dataset.csv')['train'].shuffle(seed=42)
data = data.train_test_split(test_size=0.1)
train_data = data['train']
test_data = data['test']

train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=BATCH_SIZE, 
    remove_columns=['riddle', 'answer', 'Unnamed: 0']
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
print(len(train_data))



  0%|          | 0/1 [00:00<?, ?it/s]



1423


In [None]:
training_args = tf.Seq2SeqTrainingArguments(
    output_dir = '/',
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    predict_with_generate=True,
    #evaluate_during_training=True,
    do_train=True,
    logging_steps=1,
    save_steps=100, 
    warmup_steps=32,  
    #max_steps=1500, # delete for full training
    num_train_epochs = EPOCH_NUM, #TRAIN_EPOCHS
    save_total_limit=1,
)

# instantiate trainer
trainer = tf.Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

***** Running training *****
  Num examples = 1423
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 135


Step,Training Loss
1,9.9525
2,9.8378
3,10.2866
4,8.9286
5,9.3593
6,9.689
7,9.2959
8,9.6759
9,9.1669
10,8.6835


Saving model checkpoint to /checkpoint-100
Configuration saved in /checkpoint-100/config.json
Model weights saved in /checkpoint-100/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=135, training_loss=1.8325515544490405, metrics={'train_runtime': 1132.0777, 'train_samples_per_second': 3.771, 'train_steps_per_second': 0.119, 'total_flos': 39722502666240.0, 'train_loss': 1.8325515544490405, 'epoch': 3.0})

In [None]:
def guess_the_riddle(batch):
    # cut off at BERT max length 512
    inputs = tokenizer(batch['riddle'], padding="max_length", truncation=True, max_length=decoder_max_length, return_tensors="pt")
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch['pred_answer'] = output_str

    return batch

In [None]:
model.config.max_length = decoder_max_length
model.config.min_length = 1
model.config.no_repeat_ngram_size = 5
model.config.early_stopping = True
model.config.repetition_penalty = 10.0
model.config.temperature = 0.01
model.config.length_penalty = 5.0
model.config.num_beams = 10
model.config.bad_words = ['.', '-', ')', '(', '"', "'", ',', '«', '»']

In [None]:
print(type(test_data))

results = test_data.map(guess_the_riddle, batched=True, batch_size=BATCH_SIZE, remove_columns=[])
"""dummy_model = tf.EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)

dummy_model.config.decoder_start_token_id = tokenizer.cls_token_id
dummy_model.config.eos_token_id = tokenizer.sep_token_id
dummy_model.config.pad_token_id = tokenizer.pad_token_id
dummy_model.config.vocab_size = model.config.encoder.vocab_size"""

<class 'datasets.arrow_dataset.Dataset'>


  0%|          | 0/5 [00:00<?, ?ba/s]

'dummy_model = tf.EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)\n\ndummy_model.config.decoder_start_token_id = tokenizer.cls_token_id\ndummy_model.config.eos_token_id = tokenizer.sep_token_id\ndummy_model.config.pad_token_id = tokenizer.pad_token_id\ndummy_model.config.vocab_size = model.config.encoder.vocab_size'

In [None]:
for i in range(10):
  #print(results['riddle'][i], results['answer'][i], '\n', results['pred_answer'][i])
  print(results['pred_answer'][i])

это это это это это Это « « « « « » »
женщина женщина женщина женщина женщина Женщина Женщина Женщина Женщина Женщина Женщины
« « « « « „ „ » » » »
женщина женщина женщина женщина женщина Женщина Женщина Женщина Женщина Женщина Женщины
женщина женщина женщина женщина женщина Женщина Женщина Женщина Женщина Женщина Женщины
,, ( ( ( ( ( ) ) ) )
,,,, ( ( ( « « « «
с с с с с С С С С - - -
- - - - В В В С С С С
« « « « „ „ » » » » »
