# Installing libraries

Installing HuggingFace Transformers (https://github.com/huggingface/transformers)

In [None]:
!pip install datasets transformers scikit-learn torch pandas evaluate tensorboardX

# Dataset processing

Uploading the dataset, splitting the data into train, validation and test sets

In [None]:
import pandas as pd
from datasets import DatasetDict, Dataset


exploded_df = pd.json_normalize(pd.json_normalize(pd.read_json('../data/translated_alina.json')['data'])['paragraphs'].explode())
exploded_df = exploded_df[~exploded_df['context_en'].isna()].reset_index(drop=True)
exploded_df = exploded_df.drop('qas', axis=1).join(pd.DataFrame(exploded_df['qas'].explode())).reset_index(drop=True)
exploded_df = exploded_df.join(pd.json_normalize(exploded_df['qas'])).drop('qas', axis=1)
exploded_df = exploded_df[~exploded_df['question_en'].isna()].reset_index(drop=True)

exploded_df['answers'] = exploded_df['answers'].apply(lambda an: {
    'text_en': [an[0]['text_en']],
    'answer_start_en': [an[0]['answer_start']],
    'answer_end_en': [an[0]['answer_end']],
})

alina_translation = Dataset.from_pandas(exploded_df)

squad_alina = DatasetDict({
    'validation': alina_translation,
    'train': alina_translation
})

Getting contexts, questions and answers from the train and validation sets

In [4]:
def prepare_train_features(examples):
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride.
    # This results in one example possible giving several features when a context is long,
    # each of those features having a context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question_en"],
        examples["context_en"],
        truncation="only_second",  # truncate context, not the question
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context.
    # This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start_en"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start_en"][0]
            end_char = start_char + len(answers["text_en"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [7]:
import pickle
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, TrainingArguments, Trainer
import torch
import collections
from datetime import datetime
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

set_seed(42)

set_verbosity_error()
squad_v2_metric = load("squad_v2")

val_answers = [a['text_en'][0] for a in squad_alina['validation']['answers']]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_times = {}
for batch, lr, epochs, model_name, model_path in [
    (16, 2e-4, 6, 'alina-bert', 'google-bert/bert-base-uncased'),
    (16, 2e-4, 6, 'alina-m-bert', 'bert-base-multilingual-cased'),
    (16, 2e-4, 6, 'alina-m-distil-bert', 'distilbert/distilbert-base-multilingual-cased'),
    (16, 2e-4, 6, 'alina-xlm-roberta', 'FacebookAI/xlm-roberta-base'),
    (16, 2e-4, 6, 'alina-ru-bert', 'DeepPavlov/rubert-base-cased'),
]:
    model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    tokenized_datasets = squad_alina.map(prepare_train_features, batched=True, remove_columns=squad_alina["train"].column_names)

    with open(f"../data/engish_tokenized_{model_name}_datasets.pkl","wb") as file:
        pickle.dump(tokenized_datasets, file)


    args = TrainingArguments(
        output_dir=f"../models/en_{model_name}",
        evaluation_strategy = "epoch",
        save_strategy="epoch", 
        learning_rate=lr,
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=batch,
        num_train_epochs=epochs,
        report_to='tensorboard',
        logging_dir=f'../logs/en_{model_name}',
        load_best_model_at_end=True,
        # weight_decay=0.01,
    )


    data_collator = DefaultDataCollator()

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    start_time = datetime.now()
    trainer.train()
    print("english - model", model_name, "train time", datetime.now() - start_time)
    train_times[model_name] = datetime.now() - start_time

    trainer.save_model()

Map: 100%|██████████| 51/51 [00:00<00:00, 5172.39 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 3871.17 examples/s]


{'eval_loss': 4.036243438720703, 'eval_runtime': 0.1772, 'eval_samples_per_second': 287.801, 'eval_steps_per_second': 22.573, 'epoch': 1.0}
{'eval_loss': 2.8767335414886475, 'eval_runtime': 0.1766, 'eval_samples_per_second': 288.773, 'eval_steps_per_second': 22.649, 'epoch': 2.0}
{'eval_loss': 2.1038503646850586, 'eval_runtime': 0.1767, 'eval_samples_per_second': 288.618, 'eval_steps_per_second': 22.637, 'epoch': 3.0}
{'eval_loss': 1.6164547204971313, 'eval_runtime': 0.1777, 'eval_samples_per_second': 286.99, 'eval_steps_per_second': 22.509, 'epoch': 4.0}
{'eval_loss': 1.4257676601409912, 'eval_runtime': 0.1783, 'eval_samples_per_second': 286.1, 'eval_steps_per_second': 22.439, 'epoch': 5.0}
{'eval_loss': 1.335403323173523, 'eval_runtime': 0.1784, 'eval_samples_per_second': 285.945, 'eval_steps_per_second': 22.427, 'epoch': 6.0}
{'train_runtime': 11.2601, 'train_samples_per_second': 27.176, 'train_steps_per_second': 2.131, 'train_loss': 2.910313606262207, 'epoch': 6.0}
english - model 

Map: 100%|██████████| 51/51 [00:00<00:00, 5363.02 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 5713.09 examples/s]


{'eval_loss': 4.43958044052124, 'eval_runtime': 0.1825, 'eval_samples_per_second': 279.487, 'eval_steps_per_second': 21.921, 'epoch': 1.0}
{'eval_loss': 3.15783953666687, 'eval_runtime': 0.182, 'eval_samples_per_second': 280.261, 'eval_steps_per_second': 21.981, 'epoch': 2.0}
{'eval_loss': 2.162809133529663, 'eval_runtime': 0.1838, 'eval_samples_per_second': 277.424, 'eval_steps_per_second': 21.759, 'epoch': 3.0}
{'eval_loss': 1.6234484910964966, 'eval_runtime': 0.1837, 'eval_samples_per_second': 277.66, 'eval_steps_per_second': 21.777, 'epoch': 4.0}
{'eval_loss': 1.4975240230560303, 'eval_runtime': 0.1832, 'eval_samples_per_second': 278.408, 'eval_steps_per_second': 21.836, 'epoch': 5.0}
{'eval_loss': 1.3576291799545288, 'eval_runtime': 0.1833, 'eval_samples_per_second': 278.265, 'eval_steps_per_second': 21.825, 'epoch': 6.0}
{'train_runtime': 17.3477, 'train_samples_per_second': 17.639, 'train_steps_per_second': 1.383, 'train_loss': 3.1401100158691406, 'epoch': 6.0}
english - model a

Map: 100%|██████████| 51/51 [00:00<00:00, 5945.73 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 3155.33 examples/s]


{'eval_loss': 4.197112560272217, 'eval_runtime': 0.1295, 'eval_samples_per_second': 393.807, 'eval_steps_per_second': 30.887, 'epoch': 1.0}
{'eval_loss': 2.6797759532928467, 'eval_runtime': 0.1276, 'eval_samples_per_second': 399.764, 'eval_steps_per_second': 31.354, 'epoch': 2.0}
{'eval_loss': 1.8237992525100708, 'eval_runtime': 0.1289, 'eval_samples_per_second': 395.666, 'eval_steps_per_second': 31.033, 'epoch': 3.0}
{'eval_loss': 1.3446433544158936, 'eval_runtime': 0.1278, 'eval_samples_per_second': 398.913, 'eval_steps_per_second': 31.287, 'epoch': 4.0}
{'eval_loss': 1.209793210029602, 'eval_runtime': 0.128, 'eval_samples_per_second': 398.374, 'eval_steps_per_second': 31.245, 'epoch': 5.0}
{'eval_loss': 1.1174259185791016, 'eval_runtime': 0.1294, 'eval_samples_per_second': 394.101, 'eval_steps_per_second': 30.91, 'epoch': 6.0}
{'train_runtime': 12.7731, 'train_samples_per_second': 23.957, 'train_steps_per_second': 1.879, 'train_loss': 2.7692626317342124, 'epoch': 6.0}
english - mode

Map: 100%|██████████| 51/51 [00:00<00:00, 5696.96 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 5873.73 examples/s]


{'eval_loss': 5.245875358581543, 'eval_runtime': 0.2621, 'eval_samples_per_second': 194.571, 'eval_steps_per_second': 15.26, 'epoch': 1.0}
{'eval_loss': 4.924711227416992, 'eval_runtime': 0.2573, 'eval_samples_per_second': 198.251, 'eval_steps_per_second': 15.549, 'epoch': 2.0}
{'eval_loss': 5.524502754211426, 'eval_runtime': 0.2495, 'eval_samples_per_second': 204.433, 'eval_steps_per_second': 16.034, 'epoch': 3.0}
{'eval_loss': 5.410308837890625, 'eval_runtime': 0.2569, 'eval_samples_per_second': 198.507, 'eval_steps_per_second': 15.569, 'epoch': 4.0}
{'eval_loss': 4.9071149826049805, 'eval_runtime': 0.2535, 'eval_samples_per_second': 201.173, 'eval_steps_per_second': 15.778, 'epoch': 5.0}
{'eval_loss': 4.76708984375, 'eval_runtime': 0.2524, 'eval_samples_per_second': 202.062, 'eval_steps_per_second': 15.848, 'epoch': 6.0}
{'train_runtime': 26.6631, 'train_samples_per_second': 11.477, 'train_steps_per_second': 0.9, 'train_loss': 5.5441023508707685, 'epoch': 6.0}
english - model alina-

Map: 100%|██████████| 51/51 [00:00<00:00, 5130.83 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 4758.73 examples/s]


{'eval_loss': 4.441382884979248, 'eval_runtime': 0.1832, 'eval_samples_per_second': 278.353, 'eval_steps_per_second': 21.832, 'epoch': 1.0}
{'eval_loss': 2.8611700534820557, 'eval_runtime': 0.1814, 'eval_samples_per_second': 281.116, 'eval_steps_per_second': 22.048, 'epoch': 2.0}
{'eval_loss': 2.0255956649780273, 'eval_runtime': 0.1835, 'eval_samples_per_second': 277.972, 'eval_steps_per_second': 21.802, 'epoch': 3.0}
{'eval_loss': 1.6546499729156494, 'eval_runtime': 0.1827, 'eval_samples_per_second': 279.144, 'eval_steps_per_second': 21.894, 'epoch': 4.0}
{'eval_loss': 1.282227635383606, 'eval_runtime': 0.1811, 'eval_samples_per_second': 281.651, 'eval_steps_per_second': 22.09, 'epoch': 5.0}
{'eval_loss': 1.1587496995925903, 'eval_runtime': 0.1822, 'eval_samples_per_second': 279.96, 'eval_steps_per_second': 21.958, 'epoch': 6.0}
{'train_runtime': 17.0947, 'train_samples_per_second': 17.9, 'train_steps_per_second': 1.404, 'train_loss': 2.981151262919108, 'epoch': 6.0}
english - model a

In [8]:
train_times

{'alina-bert': datetime.timedelta(seconds=11, microseconds=384706),
 'alina-m-bert': datetime.timedelta(seconds=17, microseconds=469055),
 'alina-m-distil-bert': datetime.timedelta(seconds=12, microseconds=904222),
 'alina-xlm-roberta': datetime.timedelta(seconds=26, microseconds=781594),
 'alina-ru-bert': datetime.timedelta(seconds=17, microseconds=242755)}

In [9]:
import pickle
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, TrainingArguments, Trainer
import torch
import collections
from datetime import datetime
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

set_seed(42)

set_verbosity_error()
squad_v2_metric = load("squad_v2")

val_answers = [a['text_en'][0] for a in squad_alina['validation']['answers']]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for model_name in ['alina-bert', 'alina-m-bert', 'alina-m-distil-bert', 'alina-xlm-roberta', 'alina-ru-bert']:

    model = AutoModelForQuestionAnswering.from_pretrained(f"../models/en_{model_name}").cuda()
    tokenizer = AutoTokenizer.from_pretrained(f"../models/en_{model_name}")

    tokenized_datasets = squad_alina.map(prepare_train_features, batched=True, remove_columns=squad_alina["train"].column_names)

    eval_answers = []

    for instance in tqdm(squad_alina['validation']):
        context = instance['context_en']
        question = instance['question_en']

        given_answer = instance['answers']['text_en'][0]  # Assuming the first answer is the correct one

        inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            output = model(**inputs)
    
        start_idx = torch.argmax(output.start_logits)
        end_idx = torch.argmax(output.end_logits)

        predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))

        eval_answers.append(predicted_answer)

    num_c = []
    num_p = []
    num_g = []

    for a in range(len(eval_answers)):

        common = collections.Counter(eval_answers[a].split()) & collections.Counter(eval_answers[a].split()) # tokens shared between gold and predicted answers
        num_common = sum(common.values())

        num_pred = len(str(eval_answers[a]).split()) # the number of predicted tokens

        num_gold = len(str(val_answers[a]).split()) # the number of gold tokens

        num_c.append(num_common)
        num_p.append(num_pred)
        num_g.append(num_gold)

    precision = 1.0 * sum(num_c) / sum(num_p) # the num of tokens shared between gold and predicted answers / the num of predicted tokens
    recall = 1.0 * sum(num_c) / sum(num_g) # the num of tokens shared between gold and predicted answers / the num of gold tokens
    invalid_f1_score= (2 * precision * recall) / (precision + recall)
    print("english model", model_name, "invalid f1 score", invalid_f1_score)

    predictions = [{'prediction_text': a, 'id': str(idx), 'no_answer_probability': 0.} for idx, a in enumerate(eval_answers)]
    references = [{'answers': a, 'id': str(idx)} for idx, a in enumerate([{'answer_start': r['answer_start_en'], 'text': r['text_en']} for r in squad_alina['validation']['answers']])]

    results = squad_v2_metric.compute(predictions=predictions, references=references)
    print("english model", model_name, "squad results", results)

Map: 100%|██████████| 51/51 [00:00<00:00, 5186.82 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 5823.99 examples/s]
100%|██████████| 51/51 [00:00<00:00, 351.59it/s]


english model alina-bert invalid f1 score 0.9861818181818183
english model alina-bert squad results {'exact': 1.9607843137254901, 'f1': 30.7736734953822, 'total': 51, 'HasAns_exact': 1.9607843137254901, 'HasAns_f1': 30.7736734953822, 'HasAns_total': 51, 'best_exact': 1.9607843137254901, 'best_exact_thresh': 0.0, 'best_f1': 30.7736734953822, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 51/51 [00:00<00:00, 5548.74 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 5538.11 examples/s]
100%|██████████| 51/51 [00:00<00:00, 371.49it/s]


english model alina-m-bert invalid f1 score 1.1979286536248561
english model alina-m-bert squad results {'exact': 1.9607843137254901, 'f1': 35.4097687609201, 'total': 51, 'HasAns_exact': 1.9607843137254901, 'HasAns_f1': 35.4097687609201, 'HasAns_total': 51, 'best_exact': 1.9607843137254901, 'best_exact_thresh': 0.0, 'best_f1': 35.4097687609201, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 51/51 [00:00<00:00, 6210.36 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 6597.87 examples/s]
100%|██████████| 51/51 [00:00<00:00, 535.84it/s]


english model alina-m-distil-bert invalid f1 score 1.251342642320086
english model alina-m-distil-bert squad results {'exact': 3.9215686274509802, 'f1': 42.41026929298009, 'total': 51, 'HasAns_exact': 3.9215686274509802, 'HasAns_f1': 42.41026929298009, 'HasAns_total': 51, 'best_exact': 3.9215686274509802, 'best_exact_thresh': 0.0, 'best_f1': 42.41026929298009, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 51/51 [00:00<00:00, 3797.10 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 3844.04 examples/s]
100%|██████████| 51/51 [00:00<00:00, 334.78it/s]


english model alina-xlm-roberta invalid f1 score 1.219484882418813
english model alina-xlm-roberta squad results {'exact': 0.0, 'f1': 26.4158789761732, 'total': 51, 'HasAns_exact': 0.0, 'HasAns_f1': 26.4158789761732, 'HasAns_total': 51, 'best_exact': 0.0, 'best_exact_thresh': 0.0, 'best_f1': 26.4158789761732, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 51/51 [00:00<00:00, 5322.19 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 5421.47 examples/s]
100%|██████████| 51/51 [00:00<00:00, 347.61it/s]

english model alina-ru-bert invalid f1 score 1.1092651757188499
english model alina-ru-bert squad results {'exact': 1.9607843137254901, 'f1': 37.90860526240914, 'total': 51, 'HasAns_exact': 1.9607843137254901, 'HasAns_f1': 37.90860526240914, 'HasAns_total': 51, 'best_exact': 1.9607843137254901, 'best_exact_thresh': 0.0, 'best_f1': 37.90860526240914, 'best_f1_thresh': 0.0}





In [10]:
import pandas as pd
from datasets import DatasetDict, Dataset


train_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_train.json', lines=True, encoding='utf-8')

validation_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_validation.json', lines=True, encoding='utf-8')


squad = DatasetDict(
    {'train': Dataset.from_pandas(train_data).shuffle(),
     'validation': Dataset.from_pandas(validation_data).shuffle()
     })

In [11]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, TrainingArguments, Trainer
from tqdm import tqdm
import torch
import collections
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

set_seed(42)

set_verbosity_error()
squad_v2_metric = load("squad_v2")

val_answers = [a['text_en'][0] for a in squad['validation']['answers']]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for model_name in ['alina-bert', 'alina-m-bert', 'alina-m-distil-bert', 'alina-xlm-roberta', 'alina-ru-bert']:

    model = AutoModelForQuestionAnswering.from_pretrained(f"../models/en_{model_name}").cuda()
    tokenizer = AutoTokenizer.from_pretrained(f"../models/en_{model_name}")

    tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)

    eval_answers = []

    for instance in tqdm(squad['validation']):
        context = instance['context_en']
        question = instance['question_en']

        given_answer = instance['answers']['text_en'][0]  # Assuming the first answer is the correct one

        inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            output = model(**inputs)
    
        start_idx = torch.argmax(output.start_logits)
        end_idx = torch.argmax(output.end_logits)

        predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))

        eval_answers.append(predicted_answer)

    num_c = []
    num_p = []
    num_g = []

    for a in range(len(eval_answers)):

        common = collections.Counter(eval_answers[a].split()) & collections.Counter(eval_answers[a].split()) # tokens shared between gold and predicted answers
        num_common = sum(common.values())

        num_pred = len(str(eval_answers[a]).split()) # the number of predicted tokens

        num_gold = len(str(val_answers[a]).split()) # the number of gold tokens

        num_c.append(num_common)
        num_p.append(num_pred)
        num_g.append(num_gold)

    precision = 1.0 * sum(num_c) / sum(num_p) # the num of tokens shared between gold and predicted answers / the num of predicted tokens
    recall = 1.0 * sum(num_c) / sum(num_g) # the num of tokens shared between gold and predicted answers / the num of gold tokens
    invalid_f1_score= (2 * precision * recall) / (precision + recall)
    print("english model", model_name, "invalid f1 score", invalid_f1_score)

    predictions = [{'prediction_text': a, 'id': str(idx), 'no_answer_probability': 0.} for idx, a in enumerate(eval_answers)]
    references = [{'answers': a, 'id': str(idx)} for idx, a in enumerate([{'answer_start': r['answer_start_en'], 'text': r['text_en']} for r in squad['validation']['answers']])]

    results = squad_v2_metric.compute(predictions=predictions, references=references)
    print("english model", model_name, "squad results", results)

Map: 100%|██████████| 2901/2901 [00:00<00:00, 7335.83 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 6827.95 examples/s]
100%|██████████| 643/643 [00:01<00:00, 389.93it/s]


english model alina-bert invalid f1 score 0.7119780842622331
english model alina-bert squad results {'exact': 2.332814930015552, 'f1': 14.457673616217312, 'total': 643, 'HasAns_exact': 2.332814930015552, 'HasAns_f1': 14.457673616217312, 'HasAns_total': 643, 'best_exact': 2.332814930015552, 'best_exact_thresh': 0.0, 'best_f1': 14.457673616217312, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 2901/2901 [00:00<00:00, 6026.50 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 7452.24 examples/s]
100%|██████████| 643/643 [00:01<00:00, 381.16it/s]


english model alina-m-bert invalid f1 score 0.891102797657775
english model alina-m-bert squad results {'exact': 2.6438569206842923, 'f1': 18.26755486119045, 'total': 643, 'HasAns_exact': 2.6438569206842923, 'HasAns_f1': 18.26755486119045, 'HasAns_total': 643, 'best_exact': 2.6438569206842923, 'best_exact_thresh': 0.0, 'best_f1': 18.26755486119045, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 2901/2901 [00:00<00:00, 8588.45 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 8895.44 examples/s]
100%|██████████| 643/643 [00:01<00:00, 547.37it/s]


english model alina-m-distil-bert invalid f1 score 0.8482007095793208
english model alina-m-distil-bert squad results {'exact': 2.488335925349922, 'f1': 21.2490252044137, 'total': 643, 'HasAns_exact': 2.488335925349922, 'HasAns_f1': 21.2490252044137, 'HasAns_total': 643, 'best_exact': 2.488335925349922, 'best_exact_thresh': 0.0, 'best_f1': 21.2490252044137, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 2901/2901 [00:00<00:00, 7656.14 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 8305.50 examples/s]
100%|██████████| 643/643 [00:01<00:00, 344.61it/s]


english model alina-xlm-roberta invalid f1 score 0.7837837837837839
english model alina-xlm-roberta squad results {'exact': 4.043545878693624, 'f1': 19.014945898528016, 'total': 643, 'HasAns_exact': 4.043545878693624, 'HasAns_f1': 19.014945898528016, 'HasAns_total': 643, 'best_exact': 4.043545878693624, 'best_exact_thresh': 0.0, 'best_f1': 19.014945898528016, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 2901/2901 [00:00<00:00, 7400.38 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 7327.60 examples/s]
100%|██████████| 643/643 [00:01<00:00, 343.80it/s]


english model alina-ru-bert invalid f1 score 0.6488629044245157
english model alina-ru-bert squad results {'exact': 5.443234836702955, 'f1': 15.95582529409292, 'total': 643, 'HasAns_exact': 5.443234836702955, 'HasAns_f1': 15.95582529409292, 'HasAns_total': 643, 'best_exact': 5.443234836702955, 'best_exact_thresh': 0.0, 'best_f1': 15.95582529409292, 'best_f1_thresh': 0.0}
