In [None]:
# Install the Hugging Face libraries this notebook imports (transformers, datasets, evaluate).
# NOTE(review): versions are not pinned, so a re-run may pull different releases than the
# ones that produced the outputs saved in this notebook.
!pip install --q transformers datasets evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# NOTE(review): this cell repeats the first lines of `run_experiment` at module level.
# It reads `dataset_name`, `model_name`, `transform_df_to_mnli` and `device`, and uses
# `load_dataset`, `Dataset` and `AutoTokenizer`; none of these are defined or imported by
# a cell above this one in the notebook, so it only works if the kernel already holds
# them from an earlier, out-of-order execution.
dataset = load_dataset("RussianNLP/tape", dataset_name)

# Pull the Arrow tables of both splits into pandas DataFrames.
df_train = dataset['train'].data.to_pandas()
df_test = dataset['test'].data.to_pandas()

# Convert the train questions to premise/hypothesis/label rows and wrap them in a Dataset.
train_nli = Dataset.from_pandas(transform_df_to_mnli(df_train), split="train")

tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(example):
    # Tokenize premise/hypothesis pairs, truncating and padding every pair to 300 tokens,
    # and move the resulting tensors to `device`.
    return tokenizer(example['premise'], example['hypothesis'], truncation=True, padding='max_length', max_length=300, return_tensors='pt').to(device)

# Apply the tokenizer to the whole train set in batches.
train_dataset = train_nli.map(preprocess_function, batched=True)

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from datasets import load_dataset, Dataset
import transformers
import torch
from sklearn.metrics import accuracy_score
from transformers import XLMRobertaForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np


# Use the GPU when torch reports one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# To force CPU execution, uncomment the next line:
# device = 'cpu'

In [None]:
def get_episode_data(data: pd.DataFrame, episode: int) -> pd.DataFrame:
    """
    Select the rows of ``data`` that belong to one episode.

    Parameters
    ----------
    data: pd.DataFrame
        frame with an ``episode`` column; each cell must support the ``in``
        operator (it is tested with ``episode in cell``)
    episode: int
        episode number to look for

    Returns
    -------
    pd.DataFrame
        the rows of ``data`` whose ``episode`` cell contains ``episode``
    """
    def belongs_to_episode(row_episodes):
        # Membership test on a single cell of the ``episode`` column.
        return episode in row_episodes

    in_episode = data.episode.apply(belongs_to_episode)
    return data[in_episode]

In [None]:
# Accuracy metric loaded through the `evaluate` library; `compute_metrics` below calls it.
metric = evaluate.load("accuracy")


def get_hypotheses(question: list, prompt: str) -> list:
  """
  Build one hypothesis string per answer option.

  Parameters
  ----------
  question: list
      exactly five items: the question stem followed by the texts of options
      A, B, C and D (any other length raises ``ValueError`` when unpacking)
  prompt: str
      template containing the ``[ANSWER]`` placeholder

  Returns
  -------
  list
      four strings: ``prompt`` with ``[ANSWER]`` replaced by each option, in A-D order
  """
  _, ans_A, ans_B, ans_C, ans_D = question
  return [prompt.replace('[ANSWER]', ans) for ans in (ans_A, ans_B, ans_C, ans_D)]


def get_label(index: int, answer: int) -> int:
  """
  Return 1 when the row at ``index`` holds the correct option, else 0.

  Rows are laid out four per question, so ``index % 4`` is the option position
  (0 for A through 3 for D) and ``answer`` is the position of the correct option.
  """
  return 1 if index % 4 == answer else 0


def transform_df_to_mnli(df: pd.DataFrame) -> pd.DataFrame:
  """
  Turn multiple-choice questions into premise/hypothesis/label rows.

  Each input row becomes four output rows, one per answer option. The premise is
  the question with all its options, the hypothesis is the prompt filled with a
  single option, and the label is 1 for the correct option and 0 otherwise.

  The input frame is left untouched, so the function can be called repeatedly on
  the same frame.

  Parameters
  ----------
  df: pd.DataFrame
      frame with a ``question`` column (text containing the markers ``(A)``,
      ``(B)``, ``(C)``, ``(D)``) and an ``answer`` column holding one of the
      letters ``A``-``D``; any other answer value gives label 0 on all four rows

  Returns
  -------
  pd.DataFrame
      columns ``premise``, ``hypothesis`` and ``label`` with a fresh 0..n-1 index
  """
  PROMPT = 'Ответ: [ANSWER]'

  # Work on a private copy: the steps below overwrite `question` and add columns,
  # which must not leak into the caller's frame.
  df = df.copy()

  # divide question text into the question and the answers
  df['question'] = df['question'].replace(to_replace=r'\((A|B|C|D)\)', value='[SEP]', regex=True).apply(lambda x: x.split('[SEP]'))
  df['premise'] = 'Вопрос: ' + df['question'].apply(lambda x: x[0] + f' Варианты ответа: (A) {x[1]} (B) {x[2]} (C) {x[3]} (D) {x[4]}')
  df['hypothesis'] = df['question'].apply(lambda parts: get_hypotheses(question=parts, prompt=PROMPT))
  # one row per (question, option); the four options of a question stay adjacent
  df = df.explode('hypothesis')

  # turn answers to binary labels
  df['answer'] = df['answer'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3})
  # after the reset, position % 4 identifies the option held by each row
  df = df.reset_index(drop=True)
  df['label'] = [get_label(position, answer) for position, answer in zip(df.index, df['answer'])]

  return df[['premise', 'hypothesis', 'label']]


def compute_metrics(eval_pred):
    """
    Score a batch of predictions with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class of each
    row is the arg-max over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

def transform_predictions(pred, df):
    """
    Convert per-option model outputs into one answer letter per question.

    ``pred.predictions`` holds one row per (question, option) pair, four
    consecutive rows per question. Column 1 is taken as the score of each option,
    and the highest-scoring option of every question is returned as a letter.

    ``df`` is only used for its length, which is the number of questions.
    """
    n_questions = len(df)
    option_scores = pred.predictions[:, 1]
    scores_per_question = option_scores.reshape(n_questions, 4)
    best_options = scores_per_question.argmax(axis=1)

    letters = 'ABCD'
    answers = [letters[option] for option in best_options]

    assert len(answers) == len(df)

    return answers

            
def run_experiment(model_name, dataset_name, device,
                   results_dir="drive/My Drive/Colab Notebooks/dl4nlp_labs/project/eval_results"):
    """
    Fine-tune and evaluate a sequence-classification model on every episode of a TAPE dataset.

    For each episode number found in the train split, a fresh model is loaded,
    fine-tuned on the rows of that episode (converted to premise/hypothesis pairs
    by ``transform_df_to_mnli``) and used to answer every ``perturbation`` slice
    of the test split. The predictions of each episode are written to
    ``{results_dir}/{model}_{dataset}_{episode}.csv`` as soon as they are ready.

    Parameters
    ----------
    model_name: str
        Hugging Face model id; the part after the last ``/`` is used in file names
    dataset_name: str
        configuration of ``RussianNLP/tape``; the part before the first ``.`` is
        used in file names
    device: str
        torch device the model is moved to after loading
    results_dir: str
        existing directory that receives the per-episode CSV files

    Returns
    -------
    list of dict
        one record per (episode, test slice) with keys ``episode``, ``shot``,
        ``slice`` and ``preds`` (list of answer letters)
    """
    dataset = load_dataset("RussianNLP/tape", dataset_name)

    df_train = dataset['train'].data.to_pandas()
    df_test = dataset['test'].data.to_pandas()

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def preprocess_function(example):
        # Plain Python lists are enough here: Dataset.map stores the result in Arrow
        # and the Trainer builds the tensors and places them on its device itself.
        return tokenizer(example['premise'], example['hypothesis'], truncation=True, padding='max_length', max_length=300)

    def build_nli_dataset(frame, split):
        # Pass a copy so the source frame keeps its original columns whatever the
        # transform does to its argument.
        nli_frame = transform_df_to_mnli(frame.copy())
        return Dataset.from_pandas(nli_frame, split=split).map(preprocess_function, batched=True)

    # The test slices do not depend on the episode: transform and tokenize them once.
    test_slices = [
        (perturbation, test, build_nli_dataset(test, "test"))
        for perturbation, test in df_test.groupby('perturbation')
    ]

    # The last path component is a valid file-name stem for ids with or without an owner prefix.
    model_tag = model_name.split('/')[-1]
    dataset_tag = dataset_name.split('.')[0]

    evaluation_results = []
    episodes = list(np.unique(np.hstack(df_train.episode.values)))

    for episode in sorted(episodes):
        print('Training for episode ', str(episode))

        # Start every episode from the pretrained weights.
        if 'roberta' in model_name:
            nli_model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
        else:
            nli_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

        # Only the rows of this episode are used for fine-tuning.
        k_shots = get_episode_data(df_train, episode)
        n_shots = k_shots.shape[0]
        train_dataset = build_nli_dataset(k_shots, "train") if n_shots else None

        training_args = TrainingArguments(
            output_dir=f'.tape/{model_name}/results',
            evaluation_strategy='no',
            save_strategy='no',
            learning_rate=2e-5,
            per_device_train_batch_size=4,
            num_train_epochs=5,
            weight_decay=0.01,
            push_to_hub=False,
            logging_dir=f'.tape/{model_name}/logs',
            logging_steps=500,
            # nothing is evaluated or checkpointed during training, so there is no
            # "best" checkpoint that could be reloaded at the end
            load_best_model_at_end=False,
            report_to='none'
            )

        trainer = Trainer(
            model=nli_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=train_dataset,
            tokenizer=tokenizer,
            compute_metrics = compute_metrics
        )

        # Episode 4 is evaluated without fine-tuning.
        # NOTE(review): presumably the zero-shot episode - confirm against the TAPE episode layout.
        if episode != 4 and train_dataset is not None:
            trainer.train()

        if train_dataset is not None:
            train_pred = trainer.predict(train_dataset)
            train_answers = transform_predictions(train_pred, k_shots)
            train_acc = accuracy_score(k_shots.answer, train_answers)
            print('Episode: ', episode, 'Shot: ', n_shots, 'Train Accuracy: ', train_acc)

        episode_results = []
        for perturbation, test, test_sample_dataset in test_slices:
            test_pred = trainer.predict(test_sample_dataset)
            predictions = transform_predictions(test_pred, test)

            episode_results.append({
                "episode": episode,
                "shot": n_shots,
                "slice": perturbation,
                "preds": predictions
            })

        # Persist the episode right away so a later crash does not lose finished work.
        pd.DataFrame(episode_results).to_csv(f"{results_dir}/{model_tag}_{dataset_tag}_{episode}.csv")
        evaluation_results.extend(episode_results)

        del trainer, nli_model

    return evaluation_results

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Run the episode loop for 'DeepPavlov/rubert-base-cased' on the 'ru_openbook.episodes' configuration.
evaluation_results = run_experiment('DeepPavlov/rubert-base-cased', 'ru_openbook.episodes', device)

Downloading builder script:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

Downloading and preparing dataset tape/ru_openbook.episodes to /root/.cache/huggingface/datasets/RussianNLP___tape/ru_openbook.episodes/0.0.1/df7620d31cfdf6508b0b8442f699b93aad2bdd0dac90800435e8f42628280577...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/16.7k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset tape downloaded and prepared to /root/.cache/huggingface/datasets/RussianNLP___tape/ru_openbook.episodes/0.0.1/df7620d31cfdf6508b0b8442f699b93aad2bdd0dac90800435e8f42628280577. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Map:   0%|          | 0/192 [00:00<?, ? examples/s]

Training for episode  5


Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  5 Shot:  1 Train Accuracy:  0.5


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  6


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  6 Shot:  1 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  7


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  7 Shot:  1 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  8


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  8 Shot:  1 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  9


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  9 Shot:  1 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  10


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  10 Shot:  4 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  11


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  11 Shot:  4 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  12


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  12 Shot:  4 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  13


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  13 Shot:  4 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  14


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  14 Shot:  4 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  15


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  15 Shot:  8 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  16


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  16 Shot:  8 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training for episode  17


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Step,Training Loss


Episode:  17 Shot:  8 Train Accuracy:  0.7083333333333334


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
def create_submission(model: str, dataset: str, episodes=range(4, 20), results_dir: str = './eval_results'):
  """
  Merge the per-episode prediction files of one model/dataset pair into one frame.

  Parameters
  ----------
  model: str
      model tag used in the file names
  dataset: str
      dataset tag used in the file names
  episodes: iterable of int
      episode numbers to load; one file ``{results_dir}/{model}_{dataset}_{ep}.csv``
      is read per episode, in the given order
  results_dir: str
      directory holding the per-episode CSV files

  Returns
  -------
  pd.DataFrame
      all rows of the loaded files with a fresh 0..n-1 index, without the saved
      index column, and with ``preds`` parsed back from text into Python lists
  """
  import ast

  # Read everything first and concatenate once: growing a frame inside the loop is
  # quadratic, and DataFrame.append no longer exists in pandas 2.x.
  episode_frames = [pd.read_csv(f'{results_dir}/{model}_{dataset}_{ep}.csv') for ep in episodes]
  submission_df = pd.concat(episode_frames, ignore_index=True)

  # 'Unnamed: 0' is the index column written by to_csv; it carries no information.
  submission_df = submission_df.drop(columns=['Unnamed: 0'], errors='ignore')
  # The CSV stores each prediction list as its repr. literal_eval parses Python
  # literals only, so file content can never be executed as code.
  submission_df['preds'] = submission_df['preds'].apply(ast.literal_eval)

  return submission_df

In [None]:
# (model tag, dataset tag, output file) for every submission file that gets exported.
SUBMISSION_EXPORTS = [
    ('rubert-base-cased-nli-twoway', 'ru_worldtree', "./rubert_nli_submission/predictions/RuWorldTree.json"),
    ('rubert-base-cased-nli-twoway', 'ru_openbook', "./rubert_nli_submission/predictions/RuOpenBookQA.json"),
    ('rubert-base-cased', 'ru_worldtree', "./rubert_submission/predictions/RuWorldTree.json"),
    ('rubert-base-cased', 'ru_openbook', "./rubert_submission/predictions/RuOpenBookQA.json"),
    ('xlm-roberta-base', 'ru_worldtree', "./xlm-r_submission/predictions/RuWorldTree.json"),
    ('xlm-roberta-base', 'ru_openbook', "./xlm-r_submission/predictions/RuOpenBookQA.json"),
]

# Write each merged prediction frame as a JSON array of records, keeping non-ASCII text as is.
for model_tag, dataset_tag, json_path in SUBMISSION_EXPORTS:
    create_submission(model_tag, dataset_tag).to_json(
        json_path,
        orient="records",
        force_ascii=False,
    )