In [1]:
import re, json

from typing import Union

import torch as tt

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from transformers import EvalPrediction
from datasets import Dataset

In [2]:
# Single checkpoint id so the tokenizer and the model are always loaded from the same source.
MODEL_NAME = "ai-forever/rugpt3small_based_on_gpt2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably the checkpoint ships without its own pad token - confirm.
tokenizer.pad_token = tokenizer.eos_token

  return self.fget.__get__(instance, owner)()


In [3]:
# Load the whole dataset file into memory; later cells read its "train", "val" and "test" entries.
with open("tf_dataset_pretty_filtered.json", encoding="utf8") as inp:
    tf_dataset = json.load(inp)

In [4]:
# Wrap every split of the loaded JSON in a `datasets.Dataset`, in train / val / test order.
tf_dataset_train, tf_dataset_val, tf_dataset_test = (
    Dataset.from_list(tf_dataset[split_name])
    for split_name in ("train", "val", "test")
)

In [5]:
# Number of examples in the train / val / test splits.
tuple(len(split) for split in (tf_dataset_train, tf_dataset_val, tf_dataset_test))

(3288, 175, 187)

In [6]:
# Distinct answer labels that occur in the training split.
{item["answer"] for item in tf_dataset_train}

{'A', 'B', 'C', 'D'}

In [7]:
# Distinct answer labels that occur in the validation split.
{item["answer"] for item in tf_dataset_val}

{'A', 'B', 'C', 'D'}

In [8]:
# Distinct answer labels that occur in the test split.
{item["answer"] for item in tf_dataset_test}

{'A', 'B', 'C', 'D'}

In [9]:
# Position of each answer letter inside an example's list of options.
option_id_dict = {
    'A': 0, 'B': 1, 'C': 2, 'D': 3
}

# Cues searched for in the lower-cased English question.
_NEGATION_CUES = ("not true", "false", "n't true", "untrue")
# A double negation ("not false") turns the question back into a positive one.
_DOUBLE_NEGATION_CUES = ("not false", "n't false")


def to_new_format(example: dict[str, Union[str, list[str]]]) -> dict[str, str]:
    """Build the single-string training prompt for one example.

    The prompt is the Russian article, a fixed Russian question (asking which
    statement does / does not match the text), the correct option, and then
    the remaining non-empty options listed as wrong answers.

    Args:
        example: Mapping with the keys ``"question"`` (original question text,
            scanned for negation cues), ``"article_ru"``, ``"options_ru"``
            (list of option strings) and ``"answer"`` (a key of
            ``option_id_dict``). The mapping is not modified.

    Returns:
        ``{"inp": prompt}``.

    Raises:
        KeyError: If a required key is missing or ``"answer"`` is not a key of
            ``option_id_dict``.
        IndexError: If ``"options_ru"`` has no entry at the answer's position.
    """
    options = example["options_ru"]

    # Resolve the answer letter against the UNFILTERED options: dropping empty
    # options first would shift the positions and select a different option
    # whenever an empty one precedes the correct answer.
    right_answer = options[option_id_dict[example["answer"]]]

    # Empty options are skipped; options equal to the correct answer's text are
    # never listed as wrong.
    wrong_answers = [
        option for option in options if option and option != right_answer
    ]

    qtext_orig = example["question"].lower()
    asks_for_mismatch = any(cue in qtext_orig for cue in _NEGATION_CUES) and not any(
        cue in qtext_orig for cue in _DOUBLE_NEGATION_CUES
    )

    if asks_for_mismatch:
        question_ru = "ВОПРОС: Какое высказывание НЕ СООТВЕТСТВУЕТ тексту? "
    else:
        question_ru = "ВОПРОС: Какое высказывание СООТВЕТСТВУЕТ тексту? "

    # Every branch joins article and question with a single space, so all
    # examples share one prompt layout.
    outp = example["article_ru"] + " " + question_ru
    outp += f"ПРАВИЛЬНЫЙ ОТВЕТ: {right_answer}"
    outp += "\nНЕПРАВИЛЬНЫЕ ВАРИАНТЫ ОТВЕТА:"
    outp += "".join(f"\n  {option}" for option in wrong_answers)
    return {"inp": outp}

In [10]:
# Add the "inp" prompt column to every split (train, then val, then test).
tf_dataset_train, tf_dataset_val, tf_dataset_test = (
    split.map(to_new_format)
    for split in (tf_dataset_train, tf_dataset_val, tf_dataset_test)
)

Map:   0%|          | 0/3288 [00:00<?, ? examples/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

In [11]:
def preprocess_function(examples):
    """Tokenize the "inp" prompts and attach labels that mirror the input ids.

    Uses the notebook-level `tokenizer`. The labels are a copy of
    `input_ids`, so every token of the prompt is a prediction target.

    NOTE(review): no truncation or max length is passed, so prompts keep
    their full token length - confirm the longest one fits the model.
    """
    encoded = tokenizer(examples["inp"])
    encoded["labels"] = list(encoded["input_ids"])
    return encoded

In [12]:
# Tokenize every split in batches (train, then val, then test).
tf_dataset_train, tf_dataset_val, tf_dataset_test = (
    split.map(preprocess_function, batched=True)
    for split in (tf_dataset_train, tf_dataset_val, tf_dataset_test)
)

Map:   0%|          | 0/3288 [00:00<?, ? examples/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Map:   0%|          | 0/187 [00:00<?, ? examples/s]

In [13]:
#max([len(example["labels"]) for example in tf_dataset_train]), max([len(example["input_ids"]) for example in tf_dataset_train])

In [14]:
#max([len(example["labels"]) for example in tf_dataset_test])

In [15]:
# mlm=False turns off masked-language-model masking, i.e. batches are
# prepared for causal (left-to-right) language modelling.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [16]:
# Run-level knobs reused by the arguments below.
NUM_TRAIN_EPOCHS = 20
BATCH_SIZE = 1

training_args = TrainingArguments(
    # Output locations.
    output_dir="./RuGPT3-RuRACE",
    logging_dir="./rugpt3-rurace-tf-logs",
    # Optimisation.
    num_train_epochs=NUM_TRAIN_EPOCHS,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_checkpointing=True,
    # Evaluate and checkpoint once per epoch, keep at most three checkpoints
    # on disk, and reload the best one when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # Only the loss is reported during evaluation.
    prediction_loss_only=True,
)

In [17]:
# Wire the model, the training configuration, the tokenized train/validation
# splits and the batch collator into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tf_dataset_train,
    eval_dataset=tf_dataset_val,
)

In [None]:
# Start fine-tuning; the per-epoch training / validation loss table shown
# under this cell is produced by this call.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,2.81,2.74577
2,2.4771,2.715271
3,2.24,2.761566
4,1.9252,2.833804
