# Installing libraries

Installing HuggingFace Transformers (https://github.com/huggingface/transformers)

In [15]:
!pip install datasets transformers scikit-learn torch pandas evaluate tensorboardX

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Dataset processing

Loading the dataset: reading the pre-split train and validation files and combining them into a single `DatasetDict`

In [16]:
import pandas as pd
from datasets import DatasetDict, Dataset


# Fixed seed so the shuffled row order of both splits is reproducible across
# kernel restarts (this cell runs before set_seed() is called further down).
SHUFFLE_SEED = 42

# Both files are read as JSON Lines (lines=True: one record per line).
train_data = pd.read_json(
    '../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_train.json',
    lines=True,
    encoding='utf-8',
)

validation_data = pd.read_json(
    '../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_validation.json',
    lines=True,
    encoding='utf-8',
)

# Wrap the two frames into one DatasetDict with 'train' and 'validation' splits.
squad = DatasetDict(
    {'train': Dataset.from_pandas(train_data).shuffle(seed=SHUFFLE_SEED),
     'validation': Dataset.from_pandas(validation_data).shuffle(seed=SHUFFLE_SEED)
     })

Getting contexts, questions and answers from the train and validation sets

In [None]:
def add_answer_clean(r):
    """Attach the reference answer string to a record under ``answer_clean``.

    Args:
        r: A dict-like record with an ``is_impossible`` flag and, for
            answerable records, an ``answers`` mapping whose ``text_en``
            entry is a sequence of answer strings.

    Returns:
        The same record object (mutated in place) with ``answer_clean`` set to
        the first ``text_en`` answer, or to ``''`` when the record is marked
        impossible or carries no answer text.
    """
    if r['is_impossible']:
        # Impossible questions never touch r['answers'], as before.
        r['answer_clean'] = ''
    else:
        texts = r['answers']['text_en']
        # Guard against an empty answer list instead of raising IndexError.
        r['answer_clean'] = texts[0] if len(texts) > 0 else ''
    return r

# Run add_answer_clean over every record so each one gains an 'answer_clean' field.
squad = squad.map(add_answer_clean)


def get_text(r):
    """Build the training text for a single record.

    The "valid" layout is used: context, question and cleaned answer, joined by
    newlines. The alternative layout marked "invalid used in paper" in the
    original version omitted the context and used only
    ``f"{r['question']}\n{r['answer_clean']}"``.

    Args:
        r: A record providing ``context_en``, ``question_en`` and ``answer_clean``.

    Returns:
        A dict with the single key ``'text'``.
    """
    context = r['context_en']
    question = r['question_en']
    answer = r['answer_clean']
    text = f"{context}\n{question}\n{answer}"
    return {'text': text}

# Run get_text over every record so each one gains the 'text' field it returns.
squad = squad.map(get_text)

In [None]:
import pickle
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import torch
import collections
from datetime import datetime
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

# Seed the libraries handled by transformers.set_seed and silence
# non-error transformers log messages.
set_seed(42)
set_verbosity_error()

# SQuAD v2 scorer loaded through the `evaluate` library.
squad_v2_metric = load("squad_v2")

# First English reference answer of every validation record, in dataset order.
val_answers = [answer['text_en'][0] for answer in squad['validation']['answers']]

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Training durations keyed by model name.
train_times = {}

# Hyper-parameters and model identifiers.
# A per-device batch of 16 did not fit in memory on the author's machine, so a
# smaller batch is combined with gradient_accumulation_steps instead.
batch = 2
lr = 3e-5
epochs = 3
model_name = 'en_gpt2-large'
model_path = 'openai-community/gpt2-large'

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def _mask_padding_labels(input_ids, attention_mask, ignore_index=-100):
    """Return causal-LM labels for one sequence with padding excluded from the loss.

    Real tokens keep their id. The first padding position also keeps its id
    (the pad token is the EOS token here), so the model is still trained to end
    the sequence; every later padding position becomes ``ignore_index``.
    """
    labels = []
    eos_kept = False
    for token_id, is_real in zip(input_ids, attention_mask):
        if is_real:
            labels.append(token_id)
        elif not eos_kept:
            labels.append(token_id)
            eos_kept = True
        else:
            labels.append(ignore_index)
    return labels


def prepare_train_features(examples):
    """Tokenize texts for causal-LM fine-tuning and attach the labels.

    Args:
        examples: A mapping with a ``"text"`` entry holding either a list of
            strings (as passed by ``Dataset.map(..., batched=True)``) or a
            single string.

    Returns:
        The tokenizer encoding, padded/truncated to 512 tokens, with an added
        ``"labels"`` entry. Labels mirror ``input_ids`` on real tokens; padding
        positions after the first are set to -100 so that the padding, which
        ``padding="max_length"`` adds to every short example, does not
        contribute to the training loss.
    """
    encoding = tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    if isinstance(examples["text"], str):
        # Single, un-batched text: the encoding holds flat lists.
        encoding["labels"] = _mask_padding_labels(input_ids, attention_mask)
    else:
        encoding["labels"] = [
            _mask_padding_labels(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    return encoding

# Tokenize both splits; the raw columns are dropped so only the tokenizer
# outputs and labels remain, then returned as torch tensors.
tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)
tokenized_datasets.set_format("torch")

# Persist the tokenized splits next to the raw data.
with open(f"../data/english_tokenized_{model_name}_datasets.pkl","wb") as file:
    pickle.dump(tokenized_datasets, file)

# Target number of examples per optimizer step; reached by accumulating
# gradients over several small per-device batches.
effective_batch_size = 16

# NOTE(review): `evaluation_strategy` is reportedly renamed `eval_strategy` in
# newer transformers releases — confirm against the installed version.
args = TrainingArguments(
    output_dir=f"../models/{model_name}",
    evaluation_strategy = "epoch",
    save_strategy="epoch", 
    learning_rate=lr,
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    num_train_epochs=epochs,
    report_to='tensorboard',
    logging_dir=f'../logs/{model_name}',
    load_best_model_at_end=True,
    # Integer division, floored at 1 so a batch larger than the target
    # never yields 0 accumulation steps.
    gradient_accumulation_steps=max(1, effective_batch_size // batch)
    # weight_decay=0.01,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Measure the wall-clock training time once so the printed and the stored
# durations are the same value.
start_time = datetime.now()
trainer.train()
elapsed = datetime.now() - start_time
print("english model", model_name, "train time", elapsed)
train_times[model_name] = elapsed

# Write the final model to args.output_dir.
trainer.save_model()

In [23]:
# Display the training durations recorded so far, keyed by model name.
# NOTE(review): the saved output under this cell lists keys that this notebook
# never writes — presumably stale output from another run; re-run to refresh.
train_times

{'bert': datetime.timedelta(seconds=209, microseconds=779003),
 'm-bert': datetime.timedelta(seconds=224, microseconds=494248),
 'm-distil-bert': datetime.timedelta(seconds=154, microseconds=549765),
 'xlm-roberta': datetime.timedelta(seconds=293, microseconds=334490),
 'ru-bert': datetime.timedelta(seconds=224, microseconds=340473)}

In [None]:
from transformers import pipeline
model_name = 'en_gpt2-large'

# Evaluate the fine-tuned weights that trainer.save_model() wrote to the
# training output_dir; loading `model_path` here would score the base
# checkpoint and ignore the fine-tuning entirely.
finetuned_model_dir = f"../models/{model_name}"
model = GPT2LMHeadModel.from_pretrained(finetuned_model_dir)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)


# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The context manager closes the file handle once the dataset is loaded.
with open("../data/squad_dataset.pkl", 'rb') as dataset_file:
    squad_dataset = pickle.load(dataset_file)
squad_dataset = squad_dataset.map(add_answer_clean)

def get_text(r):
    """Build the generation prompt for a single record.

    The "valid" layout is used: context and question, each terminated by a
    newline, with no answer appended. The alternative marked "invalid used in
    paper" in the original version was ``f"{r['question']}\n"``.

    NOTE: this definition replaces the earlier ``get_text`` that also appended
    the answer; re-run that earlier cell before building training text again.

    Args:
        r: A record providing ``context_en`` and ``question_en``.

    Returns:
        A dict with the single key ``'text'``.
    """
    context = r['context_en']
    question = r['question_en']
    prompt = f"{context}\n{question}\n"
    return {'text': prompt}

# Replace each record's 'text' with the prompt layout and make the reloaded
# dataset the active `squad`.
squad_dataset = squad_dataset.map(get_text)
squad = squad_dataset


tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)

# Gold answers taken from the very split that is iterated below, so that
# prediction i and gold answer i always refer to the same record. Records
# without any answer text contribute an empty string.
gold_answers = [
    a['text_en'][0] if len(a['text_en']) > 0 else ''
    for a in squad['validation']['answers']
]

# Upper bound on generated tokens per answer; set explicitly so the limit does
# not depend on the prompt length.
# NOTE(review): 32 is a guess for short extractive answers — tune if needed.
max_new_tokens = 32

eval_answers = []

for item in tqdm(squad['validation']):
    input_text = get_text(item)['text']
    output = qa_pipeline(
        input_text,
        num_return_sequences=1,
        do_sample=False,          # greedy decoding (was misspelled `do_samples`)
        return_full_text=False,   # only the continuation, not the prompt
        max_new_tokens=max_new_tokens,
    )
    # The prompt ends with a newline, so the answer is the first generated
    # line. Indexing the full text by line would return the question instead.
    prediction = output[0]['generated_text'].split("\n")[0].strip()
    eval_answers.append(prediction)

num_c = []
num_p = []
num_g = []

for predicted, gold in zip(eval_answers, gold_answers):
    pred_tokens = str(predicted).split()
    gold_tokens = str(gold).split()

    # Tokens shared between gold and predicted answers (multiset intersection).
    common = collections.Counter(gold_tokens) & collections.Counter(pred_tokens)

    num_c.append(sum(common.values()))
    num_p.append(len(pred_tokens))  # the number of predicted tokens
    num_g.append(len(gold_tokens))  # the number of gold tokens

total_common = sum(num_c)
total_pred = sum(num_p)
total_gold = sum(num_g)

# Corpus-level precision / recall; each ratio falls back to 0.0 when its
# denominator is zero instead of raising ZeroDivisionError.
precision = 1.0 * total_common / total_pred if total_pred else 0.0
recall = 1.0 * total_common / total_gold if total_gold else 0.0
if precision + recall:
    invalid_f1_score = (2 * precision * recall) / (precision + recall)
else:
    invalid_f1_score = 0.0
print("english model", model_name, "invalid f1 score", invalid_f1_score)

# NOTE(review): the squad_v2 metric's reference schema may also expect an
# `answer_start` field next to `text` — confirm this call accepts it missing.
predictions = [{'prediction_text': a, 'id': str(idx), 'no_answer_probability': 0.} for idx, a in enumerate(eval_answers)]
references = [{'answers': a, 'id': str(idx)} for idx, a in enumerate([{'text': r['text_en']} for r in squad['validation']['answers']])]

results = squad_v2_metric.compute(predictions=predictions, references=references)
print("english model", model_name, "squad results", results)