# Installing libraries

Installing HuggingFace Transformers (https://github.com/huggingface/transformers)

In [15]:
!pip install datasets transformers scikit-learn torch pandas evaluate tensorboardX

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Dataset processing

Uploading the dataset, splitting the data into train, validation and test sets

In [4]:
import pandas as pd
from datasets import DatasetDict, Dataset


train_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_train.json', lines=True, encoding='utf-8')

validation_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_validation.json', lines=True, encoding='utf-8')


squad = DatasetDict(
    {'train': Dataset.from_pandas(train_data).shuffle(),
     'validation': Dataset.from_pandas(validation_data).shuffle()
     })

Getting contexts, questions and answers from the train and validation sets

In [5]:
def add_answer_clean(r):
    r['answer_clean'] = '' if r['is_impossible'] else r['answers']['text_en'][0]
    return r

squad = squad.map(add_answer_clean)


def get_text(r):
    return {
        'text': f"{r['context_en']}\n{r['question_en']}\n{r['answer_clean']}" # valid
        # 'text': f"{r['question']}\n{r['answer_clean']}" # invalid used in paper
    }

squad = squad.map(get_text)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2901/2901 [00:00<00:00, 11303.83 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 643/643 [00:00<00:00, 17790.88 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2901/2901 [00:00<00:00, 30961.64 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 643/643 [00:00<00:00, 29573.30 examples/s]


In [5]:
import pickle
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import torch
import collections
from datetime import datetime
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

set_seed(42)
set_verbosity_error()
squad_v2_metric = load("squad_v2")

val_answers = [a['text_en'][0] for a in squad['validation']['answers']]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_times = {}
# batch = 16 # not enough memory on my PC, using gradient_accumulation_steps
batch, lr, epochs, model_name, model_path = 2, 3e-5, 3, 'en_gpt2-large', 'openai-community/gpt2-large'
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

def prepare_train_features(examples):
    encoding = tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)
tokenized_datasets.set_format("torch")

with open(f"../data/english_tokenized_{model_name}_datasets.pkl","wb") as file:
    pickle.dump(tokenized_datasets, file)


args = TrainingArguments(
    output_dir=f"../models/{model_name}",
    evaluation_strategy = "epoch",
    save_strategy="epoch", 
    learning_rate=lr,
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    num_train_epochs=epochs,
    report_to='tensorboard',
    logging_dir=f'../logs/{model_name}',
    load_best_model_at_end=True,
    gradient_accumulation_steps=int(16 / batch)
    # weight_decay=0.01,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

start_time = datetime.now()
trainer.train()
print("english model", model_name, "train time", datetime.now() - start_time)
train_times[model_name] = datetime.now() - start_time

trainer.save_model()

2024-08-11 17:05:37.456240: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-11 17:05:37.463297: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-11 17:05:37.471585: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-11 17:05:37.474059: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-11 17:05:37.480230: I tensorflow/core/platform/cpu_feature_guar

{'eval_loss': 0.5049452185630798, 'eval_runtime': 21.0993, 'eval_samples_per_second': 30.475, 'eval_steps_per_second': 15.261, 'epoch': 0.9979324603721571}
{'eval_loss': 0.5330626368522644, 'eval_runtime': 20.9925, 'eval_samples_per_second': 30.63, 'eval_steps_per_second': 15.339, 'epoch': 1.9958649207443142}
{'loss': 0.2909, 'grad_norm': 0.3425661325454712, 'learning_rate': 2.375690607734807e-06, 'epoch': 2.7567195037904892}
{'eval_loss': 0.5520020723342896, 'eval_runtime': 20.9112, 'eval_samples_per_second': 30.749, 'eval_steps_per_second': 15.398, 'epoch': 2.9937973811164715}
{'train_runtime': 1166.1404, 'train_samples_per_second': 7.463, 'train_steps_per_second': 0.466, 'train_loss': 0.2797871935652984, 'epoch': 2.9937973811164715}
english model en_gpt2-large train time 0:19:26.238848


In [6]:
train_times

{'en_gpt2-large': datetime.timedelta(seconds=1166, microseconds=238883)}

In [11]:
import torch
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import collections
from tqdm import tqdm

from evaluate import load
squad_v2_metric = load("squad_v2")

model_name = 'en_gpt2-large'
model_path = f"../models/{model_name}"

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device, max_length=512)


train_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_train.json', lines=True, encoding='utf-8')

validation_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_validation.json', lines=True, encoding='utf-8')


squad = DatasetDict(
    {'train': Dataset.from_pandas(train_data).shuffle(),
     'validation': Dataset.from_pandas(validation_data).shuffle()
     })


def get_text(r):
    return {
        'text': f"{r['context_en']}\n{r['question_en']}\n" # valid
        # 'text': f"{r['question']}\n" # invalid used in paper
    }

squad = squad.map(get_text)

tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)

eval_answers = []

for item in tqdm(squad['validation']):
    input_text = get_text(item)['text']
    output = qa_pipeline(input_text, num_return_sequences=1)
    prediction = output[0]['generated_text'].split("\n")[1].strip()
    eval_answers.append(prediction)

num_c = []
num_p = []
num_g = []

for a in range(len(eval_answers)):

    common = collections.Counter(eval_answers[a].split()) & collections.Counter(eval_answers[a].split()) # tokens shared between gold and predicted answers
    num_common = sum(common.values())

    num_pred = len(str(eval_answers[a]).split()) # the number of predicted tokens

    num_gold = len(str(val_answers[a]).split()) # the number of gold tokens

    num_c.append(num_common)
    num_p.append(num_pred)
    num_g.append(num_gold)

precision = 1.0 * sum(num_c) / sum(num_p) # the num of tokens shared between gold and predicted answers / the num of predicted tokens
recall = 1.0 * sum(num_c) / sum(num_g) # the num of tokens shared between gold and predicted answers / the num of gold tokens
invalid_f1_score= (2 * precision * recall) / (precision + recall)
print("english model", model_name, "invalid f1 score", invalid_f1_score)

predictions = [{'prediction_text': a, 'id': str(idx), 'no_answer_probability': 0.} for idx, a in enumerate(eval_answers)]
references = [{'answers': a, 'id': str(idx)} for idx, a in enumerate([{'text': r['text_en'], 'answer_start': r['answer_start_en']} for r in squad['validation']['answers']])]

results = squad_v2_metric.compute(predictions=predictions, references=references)
print("english model", model_name, "squad results", results)

english model en_gpt2-large invalid f1 score 0.6212649780069771
english model en_gpt2-large squad results {'exact': 0.6220839813374806, 'f1': 17.330527419380857, 'total': 643, 'HasAns_exact': 0.6220839813374806, 'HasAns_f1': 17.330527419380857, 'HasAns_total': 643, 'best_exact': 0.6220839813374806, 'best_exact_thresh': 0.0, 'best_f1': 17.330527419380857, 'best_f1_thresh': 0.0}


In [8]:
import torch
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import collections
from tqdm import tqdm

from evaluate import load
squad_v2_metric = load("squad_v2")

model_name = 'en_gpt2-large'
model_path = f"../models/{model_name}"

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device, max_length=512)


exploded_df = pd.json_normalize(pd.json_normalize(pd.read_json('../data/translated_alina.json')['data'])['paragraphs'].explode())
exploded_df = exploded_df[~exploded_df['context_en'].isna()].reset_index(drop=True)
exploded_df = exploded_df.drop('qas', axis=1).join(pd.DataFrame(exploded_df['qas'].explode())).reset_index(drop=True)
exploded_df = exploded_df.join(pd.json_normalize(exploded_df['qas'])).drop('qas', axis=1)
exploded_df = exploded_df[~exploded_df['question_en'].isna()].reset_index(drop=True)

exploded_df['answers'] = exploded_df['answers'].apply(lambda an: {
    'text_en': [an[0]['text_en']],
    'answer_start_en': [an[0]['answer_start']],
    'answer_end_en': [an[0]['answer_end']],
})

alina_translation = Dataset.from_pandas(exploded_df)

squad_alina = DatasetDict({
    'validation': alina_translation,
    'train': alina_translation
})


def get_text(r):
    return {
        'text': f"{r['context_en']}\n{r['question_en']}\n" # valid
        # 'text': f"{r['question']}\n" # invalid used in paper
    }

squad_alina = squad_alina.map(get_text)

def prepare_train_features(examples):
    encoding = tokenizer(
        examples["text"],
        max_length=512,
        padding="max_length",
        truncation=True,
    )
    encoding["labels"] = encoding["input_ids"].copy()
    return encoding

tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad_alina["train"].column_names)

eval_answers = []

for item in tqdm(squad_alina['validation']):
    input_text = get_text(item)['text']
    output = qa_pipeline(input_text, num_return_sequences=1)
    prediction = output[0]['generated_text'].split("\n")[1].strip()
    eval_answers.append(prediction)

num_c = []
num_p = []
num_g = []

for a in range(len(eval_answers)):

    common = collections.Counter(eval_answers[a].split()) & collections.Counter(eval_answers[a].split()) # tokens shared between gold and predicted answers
    num_common = sum(common.values())

    num_pred = len(str(eval_answers[a]).split()) # the number of predicted tokens

    num_gold = len(str(val_answers[a]).split()) # the number of gold tokens

    num_c.append(num_common)
    num_p.append(num_pred)
    num_g.append(num_gold)

precision = 1.0 * sum(num_c) / sum(num_p) # the num of tokens shared between gold and predicted answers / the num of predicted tokens
recall = 1.0 * sum(num_c) / sum(num_g) # the num of tokens shared between gold and predicted answers / the num of gold tokens
invalid_f1_score= (2 * precision * recall) / (precision + recall)
print("english model", model_name, "invalid f1 score", invalid_f1_score)

predictions = [{'prediction_text': a, 'id': str(idx), 'no_answer_probability': 0.} for idx, a in enumerate(eval_answers)]
references = [{'answers': a, 'id': str(idx)} for idx, a in enumerate([{'text': r['text_en'], 'answer_start': r['answer_start_en']} for r in squad_alina['validation']['answers']])]

results = squad_v2_metric.compute(predictions=predictions, references=references)
print("english model", model_name, "squad results", results)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 51/51 [00:00<00:00, 14546.72 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 51/51 [00:00<00:00, 15568.38 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2901/2901 [00:00<00:00, 3265.37 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 643/643 [00:00<00:00, 3402.32 examples/s]
100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 51/51 [00:17<00:00,  2.89it/s]


english model en_gpt2-large invalid f1 score 0.53125
english model en_gpt2-large squad results {'exact': 3.9215686274509802, 'f1': 18.902089253130494, 'total': 51, 'HasAns_exact': 3.9215686274509802, 'HasAns_f1': 18.902089253130494, 'HasAns_total': 51, 'best_exact': 3.9215686274509802, 'best_exact_thresh': 0.0, 'best_f1': 18.902089253130494, 'best_f1_thresh': 0.0}
