# Installing libraries

Installing HuggingFace Transformers (https://github.com/huggingface/transformers)

In [None]:
!pip install datasets transformers scikit-learn torch pandas evaluate tensorboardX

# Dataset processing

Loading the dataset and splitting the data into train, validation and test sets

In [6]:
import pandas as pd
import json
from pathlib import Path

Getting contexts, questions and answers from the train and validation sets

In [7]:
import pickle
from datasets import Dataset, DatasetDict


# Load the pickled dataset; the context manager closes the file handle afterwards.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so only
# load a pickle that comes from a trusted source.
with open("../data/squad_dataset.pkl", 'rb') as dataset_file:
    squad_dataset = pickle.load(dataset_file)

In [8]:
def add_answer_clean(r):
    """Add an 'answer_clean' field holding a single answer string to the example.

    Args:
        r: one example (mapping) with an 'is_impossible' flag and, for answerable
            examples, an 'answers' mapping whose 'text' entry is a list of strings.

    Returns:
        The same mapping `r`, updated in place with 'answer_clean' set to the first
        answer text, or to '' when the example is flagged impossible or has no
        answer text at all.
    """
    # Impossible questions are checked first so that 'answers' is never touched for them.
    if r['is_impossible']:
        r['answer_clean'] = ''
        return r

    answer_texts = r['answers']['text']
    # An answerable row with an empty answer list yields '' instead of an IndexError.
    r['answer_clean'] = answer_texts[0] if len(answer_texts) > 0 else ''
    return r

# Apply add_answer_clean to every example so each one gains an 'answer_clean' field.
squad_dataset = squad_dataset.map(add_answer_clean)


def get_text(r):
    """Build the training text of one example: question, newline, clean answer.

    The context is deliberately left out, mirroring the original source code that
    this notebook reproduces (the variant it marks as invalid).

    Args:
        r: example with 'question' and 'answer_clean' fields.

    Returns:
        A dict with a single 'text' entry.
    """
    # valid variant (not used): f"{r['context']}\n{r['question']}\n{r['answer_clean']}"
    question = r['question']
    answer = r['answer_clean']
    return {'text': f"{question}\n{answer}"}

# Add the 'text' field produced by get_text to every example.
squad_dataset = squad_dataset.map(get_text)
# `squad` is the name the following cells use for the prepared dataset.
squad = squad_dataset

Map: 100%|██████████| 2901/2901 [00:00<00:00, 21791.34 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 21883.80 examples/s]
Map: 100%|██████████| 2901/2901 [00:00<00:00, 40928.92 examples/s]
Map: 100%|██████████| 643/643 [00:00<00:00, 39087.18 examples/s]


In [None]:
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DefaultDataCollator
import torch
import collections
from datetime import datetime
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

# Seed the random number generators before anything else runs in this cell.
set_seed(42)

# Reduce transformers logging to errors and load the SQuAD v2 metric.
set_verbosity_error()
squad_v2_metric = load("squad_v2")

# First gold answer text of every validation example.
val_answers = [answer['text'][0] for answer in squad['validation']['answers']]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training durations, keyed by model name.
train_times = {}

# Hyperparameters. A batch of 16 does not fit into memory on the author's PC,
# so a small batch is combined with gradient_accumulation_steps instead.
batch = 2
lr = 3e-5
epochs = 30
model_name = 'gpt2-large'
model_path = 'openai-community/gpt2-large'

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
# The end-of-sequence token doubles as the padding token used by prepare_train_features.
tokenizer.pad_token = tokenizer.eos_token

def prepare_train_features(examples):
    """Tokenize the "text" field of a batch of examples for language-model training.

    Every sequence is truncated or padded to exactly 512 tokens, and the token ids
    are duplicated into a "labels" entry.

    Args:
        examples: batch with a "text" entry.

    Returns:
        The tokenizer output with an additional "labels" entry.
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # NOTE(review): the labels are a plain copy of input_ids, so the padded positions
    # are part of the labels as well - confirm that this is intended for the loss.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Tokenize every split in batches; the original columns are dropped so that only
# the tokenizer output (plus "labels") remains.
tokenized_datasets = squad.map(
    prepare_train_features,
    batched=True,
    remove_columns=squad["train"].column_names,
)
tokenized_datasets.set_format("torch")

# Keep a pickled copy of the tokenized dataset next to the raw data.
with open(f"../data/tokenized_{model_name}_datasets.pkl", "wb") as file:
    pickle.dump(tokenized_datasets, file)


# Training configuration.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
args = TrainingArguments(
    # Output locations: checkpoints/model and TensorBoard logs.
    output_dir=f"../models/{model_name}",
    logging_dir=f'../logs/{model_name}',
    report_to='tensorboard',
    # Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    # int(16 / batch) accumulation steps stand in for the batch of 16 that does not fit in memory.
    gradient_accumulation_steps=int(16 / batch),
    # weight_decay=0.01,
)

from transformers.trainer_utils import get_last_checkpoint

# Resume only when the output directory really contains a checkpoint folder.
# A directory that merely holds other files (for example a model written by
# trainer.save_model()) must not trigger a resume, because
# trainer.train(resume_from_checkpoint=True) fails when no checkpoint is found.
output_dir = f"../models/{model_name}"
resume_from_checkpoint = False
if os.path.isdir(output_dir) and get_last_checkpoint(output_dir) is not None:
    resume_from_checkpoint = True

# Trainer wired to the tokenized train/validation splits.
# NOTE(review): newer transformers releases deprecate the `tokenizer` argument in
# favour of `processing_class` - confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

# Train (resuming when a checkpoint was detected) and record how long it took.
start_time = datetime.now()
trainer.train(resume_from_checkpoint = resume_from_checkpoint)
# Measure once so that the printed and the stored duration are the same value.
train_time = datetime.now() - start_time
print("model", model_name, "train time", train_time)
train_times[model_name] = train_time

# Write the final model to the trainer's output directory.
trainer.save_model()

### Training time in hours

In [14]:
# Use the duration recorded right after training, so the value does not depend on
# when this cell is run. total_seconds() also counts whole days, which the
# `.seconds` attribute of a timedelta leaves out.
train_times[model_name].total_seconds() / 60 / 60

3.0375

In [14]:
import torch
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import collections
from tqdm import tqdm

from evaluate import load
# Metric for the SQuAD v2 style evaluation further below.
squad_v2_metric = load("squad_v2")

model_name = 'gpt2-large'

# Directory the training cell used as its output_dir.
model_path = '../models/gpt2-large'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# First gold answer text of every validation example.
val_answers = [answer['text'][0] for answer in squad['validation']['answers']]

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# Text-generation pipeline around the fine-tuned model, limited to 512 tokens.
# NOTE(review): no seed is set in this cell - if generation samples, the predictions
# can differ between runs; confirm the generation settings.
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=512,
)


# Reload the dataset from disk (closing the file afterwards) and re-add 'answer_clean'.
# NOTE(review): pickle.load can execute arbitrary code stored in the file, so only
# load a pickle that comes from a trusted source.
with open("../data/squad_dataset.pkl", 'rb') as dataset_file:
    squad_dataset = pickle.load(dataset_file)
squad_dataset = squad_dataset.map(add_answer_clean)


def get_text(r):
    """Build the generation prompt of one example: the question followed by a newline.

    This redefinition replaces the training-time get_text; the answer is left out so
    the model has to generate it. The context is deliberately omitted, mirroring the
    original source code that this notebook reproduces (the variant it marks as invalid).

    Args:
        r: example with a 'question' field.

    Returns:
        A dict with a single 'text' entry.
    """
    # valid variant (not used): f"{r['context']}\n{r['question']}\n"
    question = r['question']
    return {'text': f"{question}\n"}

# Add the prompt-only 'text' field produced by get_text to every example.
squad_dataset = squad_dataset.map(get_text)
# `squad` is the name the evaluation code below reads the dataset from.
squad = squad_dataset

# Generate one answer per validation example.
eval_answers = []

for item in tqdm(squad['validation']):
    input_text = get_text(item)['text']
    # return_full_text=False returns only the continuation, without the prompt.
    # Splitting the full text on "\n" and taking element 1 would return a piece of
    # the question whenever the question itself contains a newline.
    output = qa_pipeline(input_text, num_return_sequences=1, return_full_text=False)
    # The prediction is the first generated line after the prompt.
    prediction = output[0]['generated_text'].split("\n")[0].strip()
    eval_answers.append(prediction)

# Per-example token counts: common, predicted and gold.
num_c, num_p, num_g = [], [], []

for idx, predicted in enumerate(eval_answers):
    # NOTE(review): both operands of the intersection are built from the prediction
    # (the gold answer is not involved), so the "common" count always equals the
    # number of predicted tokens. Kept unchanged because the result is reported as
    # the "invalid" F1 score.
    predicted_counts = collections.Counter(predicted.split())
    overlap = predicted_counts & collections.Counter(predicted.split())

    shared_tokens = sum(overlap.values())
    predicted_tokens = len(str(predicted).split())  # the number of predicted tokens
    gold_tokens = len(str(val_answers[idx]).split())  # the number of gold tokens

    num_c.append(shared_tokens)
    num_p.append(predicted_tokens)
    num_g.append(gold_tokens)

# Corpus-level ratios: the counts are summed over all examples before dividing.
total_shared = sum(num_c)
precision = 1.0 * total_shared / sum(num_p)  # common tokens / predicted tokens
recall = 1.0 * total_shared / sum(num_g)  # common tokens / gold tokens
invalid_f1_score = (2 * precision * recall) / (precision + recall)
print("model", model_name, "invalid f1 score", invalid_f1_score)

# Build the prediction / reference records for the squad_v2 metric. The position of
# an example in the validation split serves as its id on both sides.
predictions = [
    {'prediction_text': text, 'id': str(position), 'no_answer_probability': 0.}
    for position, text in enumerate(eval_answers)
]
references = [
    {'answers': gold, 'id': str(position)}
    for position, gold in enumerate(squad['validation']['answers'])
]

results = squad_v2_metric.compute(predictions=predictions, references=references)
print("model", model_name, "squad results", results)

model gpt2-large invalid f1 score 1.0409366168752523
model gpt2-large squad results {'exact': 6.842923794712286, 'f1': 13.966658223507563, 'total': 643, 'HasAns_exact': 6.842923794712286, 'HasAns_f1': 13.966658223507563, 'HasAns_total': 643, 'best_exact': 6.842923794712286, 'best_exact_thresh': 0.0, 'best_f1': 13.966658223507563, 'best_f1_thresh': 0.0}


### testing better checkpoint