# Installing libraries

Installing HuggingFace Transformers (https://github.com/huggingface/transformers)

In [15]:
!pip install datasets transformers scikit-learn torch pandas evaluate tensorboardX

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Dataset processing

Uploading the dataset, splitting the data into train, validation and test sets

In [16]:
import pandas as pd
from datasets import DatasetDict, Dataset


train_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_train.json', lines=True, encoding='utf-8')

validation_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_validation.json', lines=True, encoding='utf-8')


squad = DatasetDict(
    {'train': Dataset.from_pandas(train_data).shuffle(),
     'validation': Dataset.from_pandas(validation_data).shuffle()
     })

Getting contexts, questions and answers from the train and validation sets

In [86]:
def prepare_train_features(examples):
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride.
    # This results in one example possible giving several features when a context is long,
    # each of those features having a context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question_en"],
        examples["context_en"],
        truncation="only_second",  # truncate context, not the question
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context.
    # This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start_en"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start_en"][0]
            end_char = start_char + len(answers["text_en"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [22]:
import pickle
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, TrainingArguments, Trainer
import torch
import collections
from datetime import datetime
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

set_seed(42)

set_verbosity_error()
squad_v2_metric = load("squad_v2")

val_answers = [a['text_en'][0] for a in squad['validation']['answers']]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_times = {}
for batch, lr, epochs, model_name, model_path in [
    (16, 2e-5, 6, 'bert', 'google-bert/bert-base-uncased'),
    (16, 2e-5, 6, 'm-bert', 'bert-base-multilingual-cased'),
    (16, 2e-5, 6, 'm-distil-bert', 'distilbert/distilbert-base-multilingual-cased'),
    (16, 2e-5, 6, 'xlm-roberta', 'FacebookAI/xlm-roberta-base'),
    (16, 2e-5, 6, 'ru-bert', 'DeepPavlov/rubert-base-cased'),
]:
    model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)

    with open(f"../data/engish_tokenized_{model_name}_datasets.pkl","wb") as file:
        pickle.dump(tokenized_datasets, file)


    args = TrainingArguments(
        output_dir=f"../models/en_{model_name}",
        evaluation_strategy = "epoch",
        save_strategy="epoch", 
        learning_rate=lr,
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=batch,
        num_train_epochs=epochs,
        report_to='tensorboard',
        logging_dir=f'../logs/en_{model_name}',
        load_best_model_at_end=True,
        # weight_decay=0.01,
    )


    data_collator = DefaultDataCollator()

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        tokenizer=tokenizer,
    )

    start_time = datetime.now()
    trainer.train()
    print("english - model", model_name, "train time", datetime.now() - start_time)
    train_times[model_name] = datetime.now() - start_time

    trainer.save_model()

Map: 100%|██████████| 2901/2901 [00:00<00:00, 7245.54 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 7196.55 examples/s]


{'eval_loss': 2.6011884212493896, 'eval_runtime': 2.2423, 'eval_samples_per_second': 286.761, 'eval_steps_per_second': 18.285, 'epoch': 1.0}
{'eval_loss': 2.317234754562378, 'eval_runtime': 2.2176, 'eval_samples_per_second': 289.954, 'eval_steps_per_second': 18.488, 'epoch': 2.0}
{'loss': 2.3436, 'grad_norm': 19.98981285095215, 'learning_rate': 1.0842490842490842e-05, 'epoch': 2.7472527472527473}
{'eval_loss': 2.366225004196167, 'eval_runtime': 2.3309, 'eval_samples_per_second': 275.86, 'eval_steps_per_second': 17.59, 'epoch': 3.0}
{'eval_loss': 2.460500717163086, 'eval_runtime': 2.2817, 'eval_samples_per_second': 281.809, 'eval_steps_per_second': 17.969, 'epoch': 4.0}
{'eval_loss': 2.541428804397583, 'eval_runtime': 2.2937, 'eval_samples_per_second': 280.336, 'eval_steps_per_second': 17.875, 'epoch': 5.0}
{'loss': 1.0727, 'grad_norm': 15.659433364868164, 'learning_rate': 1.6849816849816852e-06, 'epoch': 5.4945054945054945}
{'eval_loss': 2.5757017135620117, 'eval_runtime': 2.3041, 'eva

Map: 100%|██████████| 2901/2901 [00:00<00:00, 6799.65 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 6128.60 examples/s]


{'eval_loss': 2.3052632808685303, 'eval_runtime': 2.2691, 'eval_samples_per_second': 283.366, 'eval_steps_per_second': 18.068, 'epoch': 1.0}
{'eval_loss': 2.1180062294006348, 'eval_runtime': 2.3144, 'eval_samples_per_second': 277.83, 'eval_steps_per_second': 17.715, 'epoch': 2.0}
{'loss': 2.0051, 'grad_norm': 24.252626419067383, 'learning_rate': 1.0842490842490842e-05, 'epoch': 2.7472527472527473}
{'eval_loss': 2.2582759857177734, 'eval_runtime': 2.3065, 'eval_samples_per_second': 278.78, 'eval_steps_per_second': 17.776, 'epoch': 3.0}
{'eval_loss': 2.3540070056915283, 'eval_runtime': 2.3137, 'eval_samples_per_second': 277.91, 'eval_steps_per_second': 17.721, 'epoch': 4.0}
{'eval_loss': 2.4695611000061035, 'eval_runtime': 2.2948, 'eval_samples_per_second': 280.203, 'eval_steps_per_second': 17.867, 'epoch': 5.0}
{'loss': 0.8447, 'grad_norm': 12.329636573791504, 'learning_rate': 1.6849816849816852e-06, 'epoch': 5.4945054945054945}
{'eval_loss': 2.569683313369751, 'eval_runtime': 2.3206, '

Map: 100%|██████████| 2901/2901 [00:00<00:00, 8460.12 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 8397.51 examples/s]


{'eval_loss': 2.639968156814575, 'eval_runtime': 1.5788, 'eval_samples_per_second': 407.265, 'eval_steps_per_second': 25.969, 'epoch': 1.0}
{'eval_loss': 2.4489474296569824, 'eval_runtime': 1.5609, 'eval_samples_per_second': 411.941, 'eval_steps_per_second': 26.267, 'epoch': 2.0}
{'loss': 2.4212, 'grad_norm': 18.29389762878418, 'learning_rate': 1.0842490842490842e-05, 'epoch': 2.7472527472527473}
{'eval_loss': 2.4486610889434814, 'eval_runtime': 1.542, 'eval_samples_per_second': 416.997, 'eval_steps_per_second': 26.589, 'epoch': 3.0}
{'eval_loss': 2.5039780139923096, 'eval_runtime': 1.5577, 'eval_samples_per_second': 412.801, 'eval_steps_per_second': 26.322, 'epoch': 4.0}
{'eval_loss': 2.532639980316162, 'eval_runtime': 1.5437, 'eval_samples_per_second': 416.518, 'eval_steps_per_second': 26.559, 'epoch': 5.0}
{'loss': 1.3143, 'grad_norm': 19.260374069213867, 'learning_rate': 1.6849816849816852e-06, 'epoch': 5.4945054945054945}
{'eval_loss': 2.637362241744995, 'eval_runtime': 1.5466, 'e

Map: 100%|██████████| 2901/2901 [00:00<00:00, 5798.88 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 7040.99 examples/s]


{'eval_loss': 2.073545217514038, 'eval_runtime': 2.9259, 'eval_samples_per_second': 219.76, 'eval_steps_per_second': 14.013, 'epoch': 1.0}
{'eval_loss': 1.797814965248108, 'eval_runtime': 2.955, 'eval_samples_per_second': 217.601, 'eval_steps_per_second': 13.875, 'epoch': 2.0}
{'loss': 2.2504, 'grad_norm': 22.51395034790039, 'learning_rate': 1.0842490842490842e-05, 'epoch': 2.7472527472527473}
{'eval_loss': 1.7448272705078125, 'eval_runtime': 2.8168, 'eval_samples_per_second': 228.276, 'eval_steps_per_second': 14.556, 'epoch': 3.0}
{'eval_loss': 1.841406226158142, 'eval_runtime': 2.8039, 'eval_samples_per_second': 229.324, 'eval_steps_per_second': 14.623, 'epoch': 4.0}
{'eval_loss': 1.926429033279419, 'eval_runtime': 2.9238, 'eval_samples_per_second': 219.923, 'eval_steps_per_second': 14.023, 'epoch': 5.0}
{'loss': 0.9787, 'grad_norm': 17.096050262451172, 'learning_rate': 1.6849816849816852e-06, 'epoch': 5.4945054945054945}
{'eval_loss': 1.9846575260162354, 'eval_runtime': 2.8638, 'eva

Map: 100%|██████████| 2901/2901 [00:00<00:00, 7158.42 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 6708.93 examples/s]


{'eval_loss': 2.677781581878662, 'eval_runtime': 2.3204, 'eval_samples_per_second': 277.109, 'eval_steps_per_second': 17.669, 'epoch': 1.0}
{'eval_loss': 2.4873955249786377, 'eval_runtime': 2.3389, 'eval_samples_per_second': 274.912, 'eval_steps_per_second': 17.529, 'epoch': 2.0}
{'loss': 2.3693, 'grad_norm': 20.97149658203125, 'learning_rate': 1.0842490842490842e-05, 'epoch': 2.7472527472527473}
{'eval_loss': 2.5279481410980225, 'eval_runtime': 2.3139, 'eval_samples_per_second': 277.89, 'eval_steps_per_second': 17.719, 'epoch': 3.0}
{'eval_loss': 2.5625193119049072, 'eval_runtime': 2.3331, 'eval_samples_per_second': 275.604, 'eval_steps_per_second': 17.573, 'epoch': 4.0}
{'eval_loss': 2.6762866973876953, 'eval_runtime': 2.255, 'eval_samples_per_second': 285.149, 'eval_steps_per_second': 18.182, 'epoch': 5.0}
{'loss': 1.1846, 'grad_norm': 17.931636810302734, 'learning_rate': 1.6849816849816852e-06, 'epoch': 5.4945054945054945}
{'eval_loss': 2.7333152294158936, 'eval_runtime': 2.3024, '

In [23]:
train_times

{'bert': datetime.timedelta(seconds=209, microseconds=779003),
 'm-bert': datetime.timedelta(seconds=224, microseconds=494248),
 'm-distil-bert': datetime.timedelta(seconds=154, microseconds=549765),
 'xlm-roberta': datetime.timedelta(seconds=293, microseconds=334490),
 'ru-bert': datetime.timedelta(seconds=224, microseconds=340473)}

In [87]:
import pickle
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, TrainingArguments, Trainer
import torch
import collections
from datetime import datetime
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

set_seed(42)

set_verbosity_error()
squad_v2_metric = load("squad_v2")

val_answers = [a['text_en'][0] for a in squad['validation']['answers']]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for model_name in ['bert', 'm-bert', 'm-distil-bert', 'xlm-roberta', 'ru-bert']:

    model = AutoModelForQuestionAnswering.from_pretrained(f"../models/en_{model_name}").cuda()
    tokenizer = AutoTokenizer.from_pretrained(f"../models/en_{model_name}")

    tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)

    eval_answers = []

    for instance in tqdm(squad['validation']):
        context = instance['context_en']
        question = instance['question_en']

        given_answer = instance['answers']['text_en'][0]  # Assuming the first answer is the correct one

        inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            output = model(**inputs)
    
        start_idx = torch.argmax(output.start_logits)
        end_idx = torch.argmax(output.end_logits)

        predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))

        eval_answers.append(predicted_answer)

    num_c = []
    num_p = []
    num_g = []

    for a in range(len(eval_answers)):

        common = collections.Counter(eval_answers[a].split()) & collections.Counter(eval_answers[a].split()) # tokens shared between gold and predicted answers
        num_common = sum(common.values())

        num_pred = len(str(eval_answers[a]).split()) # the number of predicted tokens

        num_gold = len(str(val_answers[a]).split()) # the number of gold tokens

        num_c.append(num_common)
        num_p.append(num_pred)
        num_g.append(num_gold)

    precision = 1.0 * sum(num_c) / sum(num_p) # the num of tokens shared between gold and predicted answers / the num of predicted tokens
    recall = 1.0 * sum(num_c) / sum(num_g) # the num of tokens shared between gold and predicted answers / the num of gold tokens
    invalid_f1_score= (2 * precision * recall) / (precision + recall)
    print("english model", model_name, "invalid f1 score", invalid_f1_score)

    predictions = [{'prediction_text': a, 'id': str(idx), 'no_answer_probability': 0.} for idx, a in enumerate(eval_answers)]
    references = [{'answers': a, 'id': str(idx)} for idx, a in enumerate([{'answer_start': r['answer_start_en'], 'text': r['text_en']} for r in squad['validation']['answers']])]

    results = squad_v2_metric.compute(predictions=predictions, references=references)
    print("english model", model_name, "squad results", results)

Map: 100%|██████████| 2901/2901 [00:00<00:00, 7161.71 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 7380.74 examples/s]
100%|██████████| 643/643 [00:01<00:00, 373.98it/s]


english model bert invalid f1 score 0.9980526876584488
english model bert squad results {'exact': 6.531881804043546, 'f1': 43.97983294109015, 'total': 643, 'HasAns_exact': 6.531881804043546, 'HasAns_f1': 43.97983294109015, 'HasAns_total': 643, 'best_exact': 6.531881804043546, 'best_exact_thresh': 0.0, 'best_f1': 43.97983294109015, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 2901/2901 [00:00<00:00, 7036.54 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 7343.46 examples/s]
100%|██████████| 643/643 [00:01<00:00, 370.82it/s]


english model m-bert invalid f1 score 0.9872994652406417
english model m-bert squad results {'exact': 6.842923794712286, 'f1': 51.11511206246131, 'total': 643, 'HasAns_exact': 6.842923794712286, 'HasAns_f1': 51.11511206246131, 'HasAns_total': 643, 'best_exact': 6.842923794712286, 'best_exact_thresh': 0.0, 'best_f1': 51.11511206246131, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 2901/2901 [00:00<00:00, 6184.59 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 8689.37 examples/s]
100%|██████████| 643/643 [00:01<00:00, 542.31it/s]


english model m-distil-bert invalid f1 score 0.8855741724560686
english model m-distil-bert squad results {'exact': 4.976671850699844, 'f1': 34.947770437423365, 'total': 643, 'HasAns_exact': 4.976671850699844, 'HasAns_f1': 34.947770437423365, 'HasAns_total': 643, 'best_exact': 4.976671850699844, 'best_exact_thresh': 0.0, 'best_f1': 34.947770437423365, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 2901/2901 [00:00<00:00, 8104.58 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 8234.50 examples/s]
100%|██████████| 643/643 [00:01<00:00, 334.65it/s]


english model xlm-roberta invalid f1 score 1.0148477294895415
english model xlm-roberta squad results {'exact': 11.35303265940902, 'f1': 68.92959734397188, 'total': 643, 'HasAns_exact': 11.35303265940902, 'HasAns_f1': 68.92959734397188, 'HasAns_total': 643, 'best_exact': 11.35303265940902, 'best_exact_thresh': 0.0, 'best_f1': 68.92959734397188, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 2901/2901 [00:00<00:00, 7175.14 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 7194.75 examples/s]
100%|██████████| 643/643 [00:01<00:00, 324.47it/s]

english model ru-bert invalid f1 score 1.0199813124416015
english model ru-bert squad results {'exact': 2.7993779160186625, 'f1': 47.17175718834036, 'total': 643, 'HasAns_exact': 2.7993779160186625, 'HasAns_f1': 47.17175718834036, 'HasAns_total': 643, 'best_exact': 2.7993779160186625, 'best_exact_thresh': 0.0, 'best_f1': 47.17175718834036, 'best_f1_thresh': 0.0}





In [89]:
exploded_df = pd.json_normalize(pd.json_normalize(pd.read_json('../data/translated_alina.json')['data'])['paragraphs'].explode())
exploded_df = exploded_df[~exploded_df['context_en'].isna()].reset_index(drop=True)
exploded_df = exploded_df.drop('qas', axis=1).join(pd.DataFrame(exploded_df['qas'].explode())).reset_index(drop=True)
exploded_df = exploded_df.join(pd.json_normalize(exploded_df['qas'])).drop('qas', axis=1)
exploded_df = exploded_df[~exploded_df['question_en'].isna()].reset_index(drop=True)

exploded_df['answers'] = exploded_df['answers'].apply(lambda an: {
    'text_en': [an[0]['text_en']],
    'answer_start_en': [an[0]['answer_start']],
    'answer_end_en': [an[0]['answer_end']],
})

alina_translation = Dataset.from_pandas(exploded_df)

squad_alina = DatasetDict({
    'validation': alina_translation,
    'train': alina_translation
})

In [91]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, DefaultDataCollator, TrainingArguments, Trainer
from tqdm import tqdm
import torch
import collections
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

set_seed(42)

set_verbosity_error()
squad_v2_metric = load("squad_v2")

val_answers = [a['text_en'][0] for a in squad_alina['validation']['answers']]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for model_name in ['bert', 'm-bert', 'm-distil-bert', 'xlm-roberta', 'ru-bert']:

    model = AutoModelForQuestionAnswering.from_pretrained(f"../models/en_{model_name}").cuda()
    tokenizer = AutoTokenizer.from_pretrained(f"../models/en_{model_name}")

    tokenized_datasets = squad_alina.map(prepare_train_features, batched=True, remove_columns=squad_alina["train"].column_names)

    eval_answers = []

    for instance in tqdm(squad_alina['validation']):
        context = instance['context_en']
        question = instance['question_en']

        given_answer = instance['answers']['text_en'][0]  # Assuming the first answer is the correct one

        inputs = tokenizer(question, context, return_tensors='pt', max_length=512, truncation=True)

        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            output = model(**inputs)
    
        start_idx = torch.argmax(output.start_logits)
        end_idx = torch.argmax(output.end_logits)

        predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][start_idx:end_idx + 1]))

        eval_answers.append(predicted_answer)

    num_c = []
    num_p = []
    num_g = []

    for a in range(len(eval_answers)):

        common = collections.Counter(eval_answers[a].split()) & collections.Counter(eval_answers[a].split()) # tokens shared between gold and predicted answers
        num_common = sum(common.values())

        num_pred = len(str(eval_answers[a]).split()) # the number of predicted tokens

        num_gold = len(str(val_answers[a]).split()) # the number of gold tokens

        num_c.append(num_common)
        num_p.append(num_pred)
        num_g.append(num_gold)

    precision = 1.0 * sum(num_c) / sum(num_p) # the num of tokens shared between gold and predicted answers / the num of predicted tokens
    recall = 1.0 * sum(num_c) / sum(num_g) # the num of tokens shared between gold and predicted answers / the num of gold tokens
    invalid_f1_score= (2 * precision * recall) / (precision + recall)
    print("english model", model_name, "invalid f1 score", invalid_f1_score)

    predictions = [{'prediction_text': a, 'id': str(idx), 'no_answer_probability': 0.} for idx, a in enumerate(eval_answers)]
    references = [{'answers': a, 'id': str(idx)} for idx, a in enumerate([{'answer_start': r['answer_start_en'], 'text': r['text_en']} for r in squad_alina['validation']['answers']])]

    results = squad_v2_metric.compute(predictions=predictions, references=references)
    print("english model", model_name, "squad results", results)

Map: 100%|██████████| 51/51 [00:00<00:00, 4826.48 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 5468.17 examples/s]
100%|██████████| 51/51 [00:00<00:00, 344.87it/s]


english model bert invalid f1 score 1.3346062052505967
english model bert squad results {'exact': 1.9607843137254901, 'f1': 24.43551092730909, 'total': 51, 'HasAns_exact': 1.9607843137254901, 'HasAns_f1': 24.43551092730909, 'HasAns_total': 51, 'best_exact': 1.9607843137254901, 'best_exact_thresh': 0.0, 'best_f1': 24.43551092730909, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 51/51 [00:00<00:00, 5259.90 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 5218.96 examples/s]
100%|██████████| 51/51 [00:00<00:00, 370.32it/s]


english model m-bert invalid f1 score 1.2659294365455502
english model m-bert squad results {'exact': 3.9215686274509802, 'f1': 32.98134741266544, 'total': 51, 'HasAns_exact': 3.9215686274509802, 'HasAns_f1': 32.98134741266544, 'HasAns_total': 51, 'best_exact': 3.9215686274509802, 'best_exact_thresh': 0.0, 'best_f1': 32.98134741266544, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 51/51 [00:00<00:00, 5845.16 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 6201.17 examples/s]
100%|██████████| 51/51 [00:00<00:00, 547.74it/s]


english model m-distil-bert invalid f1 score 1.2016036655211915
english model m-distil-bert squad results {'exact': 0.0, 'f1': 25.119571312907087, 'total': 51, 'HasAns_exact': 0.0, 'HasAns_f1': 25.119571312907087, 'HasAns_total': 51, 'best_exact': 0.0, 'best_exact_thresh': 0.0, 'best_f1': 25.119571312907087, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 51/51 [00:00<00:00, 5275.85 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 5326.17 examples/s]
100%|██████████| 51/51 [00:00<00:00, 344.24it/s]


english model xlm-roberta invalid f1 score 1.0894839973873287
english model xlm-roberta squad results {'exact': 0.0, 'f1': 10.600878891004594, 'total': 51, 'HasAns_exact': 0.0, 'HasAns_f1': 10.600878891004594, 'HasAns_total': 51, 'best_exact': 0.0, 'best_exact_thresh': 0.0, 'best_f1': 10.600878891004594, 'best_f1_thresh': 0.0}


Map: 100%|██████████| 51/51 [00:00<00:00, 5092.96 examples/s]
Map: 100%|██████████| 51/51 [00:00<00:00, 3692.23 examples/s]
100%|██████████| 51/51 [00:00<00:00, 333.77it/s]


english model ru-bert invalid f1 score 1.153612629022465
english model ru-bert squad results {'exact': 1.9607843137254901, 'f1': 28.09003672524135, 'total': 51, 'HasAns_exact': 1.9607843137254901, 'HasAns_f1': 28.09003672524135, 'HasAns_total': 51, 'best_exact': 1.9607843137254901, 'best_exact_thresh': 0.0, 'best_f1': 28.09003672524135, 'best_f1_thresh': 0.0}
