# Installing libraries

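Installing HuggingFace Transformers (https://github.com/huggingface/transformers) and the other libraries used in this notebook (datasets, evaluate, scikit-learn, torch, pandas, tensorboardX).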

In [None]:
!pip install datasets transformers scikit-learn torch pandas evaluate tensorboardX

# Dataset processing

Loading the translated dataset and building the train and validation splits (no separate test split is created in this notebook).

In [None]:
import pandas as pd
from datasets import DatasetDict, Dataset


# Load the translated train/validation files (JSON Lines: one example per line).
train_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_train.json', lines=True, encoding='utf-8')

validation_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_validation.json', lines=True, encoding='utf-8')

# Shuffle with a fixed seed so the example order is the same on every
# "Restart & Run All"; without a seed the order is not pinned.
squad_tower = DatasetDict(
    {'train': Dataset.from_pandas(train_data).shuffle(seed=42),
     'validation': Dataset.from_pandas(validation_data).shuffle(seed=42)
     })

Flattening the second translated dataset (`translated_alina.json`) into one row per question, with its context, question and answers

In [None]:
# Flatten the nested JSON: one row per paragraph of every entry under 'data'.
exploded_df = pd.json_normalize(pd.json_normalize(pd.read_json('../data/translated_alina.json')['data'])['paragraphs'].explode())
# Keep only paragraphs that have an English context.
exploded_df = exploded_df[~exploded_df['context_en'].isna()].reset_index(drop=True)
# One row per question: explode 'qas' and join it back on the paragraph index.
exploded_df = exploded_df.drop('qas', axis=1).join(pd.DataFrame(exploded_df['qas'].explode())).reset_index(drop=True)
# Expand each question dict into its own columns.
exploded_df = exploded_df.join(pd.json_normalize(exploded_df['qas'])).drop('qas', axis=1)
# Keep only questions that have an English translation.
exploded_df = exploded_df[~exploded_df['question_en'].isna()].reset_index(drop=True)

# Reduce 'answers' to the first answer only, stored as single-element lists.
# NOTE(review): 'answer_start_en' is filled from the 'answer_start' key — confirm
# that this key holds the offset into the English context.
exploded_df['answers'] = exploded_df['answers'].apply(lambda an: {
    'text_en': [an[0]['text_en']],
    'answer_start_en': [an[0]['answer_start']],
    'answer_end_en': [an[0]['answer_end']],
})

alina_translation = Dataset.from_pandas(exploded_df)

# Only a 'train' split is created for this dataset. The shuffle is seeded so the
# example order is the same on every fresh run.
squad_alina = DatasetDict({
    'train': alina_translation.shuffle(seed=42)
})

In [None]:
def reorder_columns(dataset, column_order):
  """Return a new Dataset whose columns follow ``column_order``.

  Args:
    dataset: object exposing ``to_pandas()`` (a ``Dataset`` in this notebook).
    column_order: iterable of column names to keep, in the desired order.

  Returns:
    A ``Dataset`` built from the re-ordered DataFrame; columns that are not
    listed in ``column_order`` are dropped.

  Raises:
    KeyError: if a requested column does not exist in ``dataset``.
  """
  df = dataset.to_pandas()
  # Use the argument: the previous body read the module-level
  # ``new_column_order`` and silently ignored whatever the caller passed in.
  df = df[list(column_order)]
  return Dataset.from_pandas(df)

# Column layout applied to every split of squad_alina.
new_column_order = [
    'question', 'context', 'is_impossible',
    'answers', 'context_en', 'question_en',
]
# Rebuild each split with its columns in that order.
aligned_squad_alina = DatasetDict({
    split_name: reorder_columns(split_dataset, new_column_order)
    for split_name, split_dataset in squad_alina.items()
})

In [None]:
def concatenate_squad_datasets(dataset1, dataset2):
  """Stack two datasets row-wise after flattening their 'answers' dicts.

  Each dataset is converted to a DataFrame and the keys of its 'answers'
  dicts become separate columns. A column present on only one side is added
  to the other side filled with ``pd.NA``. The rows of ``dataset1`` come
  first; the column order is that of the flattened ``dataset2``.

  Args:
    dataset1: first dataset (must expose ``to_pandas()`` and an 'answers' column).
    dataset2: second dataset, same requirements.

  Returns:
    A ``Dataset`` built from the concatenated DataFrame.
  """
  def flatten_answers(frame):
    # Turn each 'answers' dict into one column per key.
    expanded = frame['answers'].apply(pd.Series)
    return pd.concat([frame.drop(columns=['answers']), expanded], axis=1)

  left = flatten_answers(dataset1.to_pandas())
  right = flatten_answers(dataset2.to_pandas())

  # Work out the missing columns before either frame is modified.
  left_columns, right_columns = set(left.columns), set(right.columns)
  for name in right_columns - left_columns:
    left[name] = pd.NA
  for name in left_columns - right_columns:
    right[name] = pd.NA

  # Give both frames the same column order, then stack them.
  left = left[right.columns]
  stacked = pd.concat([left, right], axis=0, ignore_index=True, sort=False)

  return Dataset.from_pandas(stacked)

In [None]:
def create_answer_field(dataset):
  """Pack the flattened answer columns back into a single 'answers' dict column.

  Args:
    dataset: dataset exposing ``to_pandas()`` with the columns 'answer_start',
      'text', 'text_en', 'answer_start_en' and 'answer_end_en'.

  Returns:
    A ``Dataset`` in which those five columns are replaced by one 'answers'
    column holding a dict with the same five keys per row.
  """
  # Key order matches the order in which the dict is built for every row.
  answer_keys = ['answer_start', 'text', 'text_en', 'answer_start_en', 'answer_end_en']

  frame = dataset.to_pandas()
  frame['answers'] = frame.apply(
      lambda row: {key: row[key] for key in answer_keys},
      axis=1,
  )

  # The individual columns now live inside 'answers'.
  frame = frame.drop(columns=answer_keys)
  return Dataset.from_pandas(frame)

In [None]:
# Merge both training sets and rebuild the nested 'answers' column.
concatenated_train = concatenate_squad_datasets(squad_tower['train'], squad_alina['train'])
normalized_train = create_answer_field(concatenated_train)

# Training uses the merged data; validation stays the tower validation split.
# The shuffle is seeded so the training order is the same on every fresh run.
squad = DatasetDict({
    'train': normalized_train.shuffle(seed=42),
    'validation': squad_tower['validation']
})

In [None]:
def add_answer_clean(r):
    """Add 'answer_clean' to the example and return the same example.

    The value is an empty string for impossible questions, otherwise the
    first entry of ``r['answers']['text_en']``.
    """
    if r['is_impossible']:
        r['answer_clean'] = ''
    else:
        r['answer_clean'] = r['answers']['text_en'][0]
    return r

# Add the 'answer_clean' column to every split of squad.
squad = squad.map(add_answer_clean)


def get_text(r):
    """Build the training text: English context, question and answer, one per line.

    Returns a dict with the single key 'text'.
    """
    context, question, answer = r['context_en'], r['question_en'], r['answer_clean']
    # This is the "valid" format. The variant marked "invalid used in paper"
    # was f"{r['question']}\n{r['answer_clean']}".
    return {'text': f"{context}\n{question}\n{answer}"}

# Add the 'text' column (context, question and answer) to every split of squad.
squad = squad.map(get_text)

Map:   0%|          | 0/2952 [00:00<?, ? examples/s]

Map:   0%|          | 0/643 [00:00<?, ? examples/s]

Map:   0%|          | 0/2952 [00:00<?, ? examples/s]

Map:   0%|          | 0/643 [00:00<?, ? examples/s]

In [None]:
import pickle
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
import torch
import collections
from datetime import datetime
from tqdm import tqdm
from evaluate import load
from transformers.utils.logging import set_verbosity_error
from transformers import set_seed

# Fix the seed and silence transformers logging below error level.
set_seed(42)
set_verbosity_error()
squad_v2_metric = load("squad_v2")

# First English gold answer of every validation example, in the split's current order.
val_answers = [answer['text_en'][0] for answer in squad['validation']['answers']]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training duration per model name, filled in after training.
train_times = {}

# batch = 16 # not enough memory on my PC, using gradient_accumulation_steps
batch = 2
lr = 3e-5
epochs = 3
model_name = 'en_gpt2-large'
model_path = 'openai-community/gpt2-large'

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

def prepare_train_features(examples):
    """Tokenize ``examples["text"]`` and add language-modelling labels.

    Every text is padded or truncated to 512 tokens by the module-level
    ``tokenizer``. ``labels`` is a copy of ``input_ids``.

    Returns:
        The tokenizer output with an extra "labels" entry.
    """
    tokenizer_options = dict(max_length=512, padding="max_length", truncation=True)
    features = tokenizer(examples["text"], **tokenizer_options)
    # NOTE(review): labels are copied from input_ids as-is, so padded positions
    # are part of the labels too — confirm whether they should be masked (-100).
    features["labels"] = features["input_ids"].copy()
    return features

# Tokenize every split in batches and drop the original columns, so only what
# prepare_train_features returns is kept; expose the result as torch tensors.
tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)
tokenized_datasets.set_format("torch")

# Save the tokenized splits to disk with pickle.
with open(f"../data/english_tokenized_{model_name}_datasets.pkl","wb") as file:
    pickle.dump(tokenized_datasets, file)


# Training configuration: evaluate and checkpoint once per epoch, log to
# TensorBoard, and reload the best checkpoint when training finishes.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir=f"../models/{model_name}",
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch,
    per_device_eval_batch_size=batch,
    num_train_epochs=epochs,
    report_to='tensorboard',
    logging_dir=f'../logs/{model_name}',
    load_best_model_at_end=True,
    # Accumulate int(16 / batch) steps per update, standing in for the batch
    # size of 16 that did not fit in memory.
    gradient_accumulation_steps=int(16 / batch)
    # weight_decay=0.01,
)


# Train on the tokenized 'train' split and evaluate on the tokenized 'validation' split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

start_time = datetime.now()
trainer.train()
# Take the elapsed time once, so the printed and the stored duration are the
# same value (two separate datetime.now() calls gave two different values).
train_time = datetime.now() - start_time
print("english model", model_name, "train time", train_time)
train_times[model_name] = train_time

# Write the trained model to the trainer's output directory.
trainer.save_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/2952 [00:00<?, ? examples/s]

Map:   0%|          | 0/643 [00:00<?, ? examples/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 

In [None]:
# Display the recorded training duration per model name.
train_times

In [None]:
import torch
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import collections
from tqdm import tqdm

from evaluate import load
squad_v2_metric = load("squad_v2")

# Load the fine-tuned model and tokenizer from the local models directory.
model_name = 'en_gpt2-large'
model_path = f"../models/{model_name}"

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
# `device` is not defined in this cell; it must already exist in the kernel
# (it is created in the training cell).
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device, max_length=512)


# Reload the translated train/validation files (JSON Lines: one example per line).
train_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_train.json', lines=True, encoding='utf-8')

validation_data = pd.read_json('../data/translated_Unbabel_TowerInstruct-v0.1_substring_logic_validation.json', lines=True, encoding='utf-8')


# Shuffle with a fixed seed so the evaluation order is the same on every run.
squad = DatasetDict(
    {'train': Dataset.from_pandas(train_data).shuffle(seed=42),
     'validation': Dataset.from_pandas(validation_data).shuffle(seed=42)
     })


def get_text(r):
    """Build the generation prompt: English context and question, each followed by a newline.

    Returns a dict with the single key 'text'.
    """
    context, question = r['context_en'], r['question_en']
    # This is the "valid" prompt. The variant marked "invalid used in paper"
    # was f"{r['question']}\n".
    return {'text': f"{context}\n{question}\n"}

# Add the prompt 'text' column to every split.
squad = squad.map(get_text)

# `prepare_train_features` is not defined in this cell; it must already exist
# in the kernel (it is defined in the training cell).
# NOTE(review): `tokenized_datasets` is not read again in this cell — confirm it is still needed.
tokenized_datasets = squad.map(prepare_train_features, batched=True, remove_columns=squad["train"].column_names)

# --- Generate one answer per validation example ------------------------------
validation_split = squad['validation']
eval_answers = []

for item in tqdm(validation_split):
    input_text = get_text(item)['text']
    # return_full_text=False makes the pipeline return only the continuation.
    # Splitting the full text on "\n" and taking index 1 returned the question
    # line of the prompt ("context\nquestion\n..."), not the generated answer,
    # and raised IndexError when there was no second line.
    output = qa_pipeline(input_text, num_return_sequences=1, return_full_text=False)
    prediction = output[0]['generated_text'].split("\n")[0].strip()
    eval_answers.append(prediction)

# --- Gold answers, taken from the same split in the same order ---------------
# Rebuilt here instead of reusing a list created in another cell from a
# separately shuffled dataset, which would not line up with eval_answers.
validation_answers = validation_split['answers']
val_answers = [a['text_en'][0] if a['text_en'] else '' for a in validation_answers]
assert len(eval_answers) == len(val_answers), "predictions and gold answers must align"

# --- Token-overlap F1 over the whole split -----------------------------------
num_c = []
num_p = []
num_g = []

for predicted, gold in zip(eval_answers, val_answers):
    pred_tokens = str(predicted).split()
    gold_tokens = str(gold).split()

    # Tokens shared between the gold and the predicted answer. The previous code
    # intersected the prediction with itself, so every predicted token counted as shared.
    common = collections.Counter(gold_tokens) & collections.Counter(pred_tokens)

    num_c.append(sum(common.values()))
    num_p.append(len(pred_tokens))  # the number of predicted tokens
    num_g.append(len(gold_tokens))  # the number of gold tokens

total_common, total_pred, total_gold = sum(num_c), sum(num_p), sum(num_g)
# Guard the divisions: empty predictions or no overlap give 0.0 instead of ZeroDivisionError.
precision = 1.0 * total_common / total_pred if total_pred else 0.0
recall = 1.0 * total_common / total_gold if total_gold else 0.0
invalid_f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
print("english model", model_name, "invalid f1 score", invalid_f1_score)

# --- SQuAD v2 metric -----------------------------------------------------------
predictions = [
    {'prediction_text': text, 'id': str(idx), 'no_answer_probability': 0.}
    for idx, text in enumerate(eval_answers)
]
references = [
    {'answers': {'text': r['text_en'], 'answer_start': r['answer_start_en']}, 'id': str(idx)}
    for idx, r in enumerate(validation_answers)
]

results = squad_v2_metric.compute(predictions=predictions, references=references)
print("english model", model_name, "squad results", results)

In [None]:
import torch
from transformers import pipeline
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import collections
from tqdm import tqdm

from evaluate import load
squad_v2_metric = load("squad_v2")

# Load the fine-tuned model and tokenizer from the local models directory.
model_name = 'en_gpt2-large'
model_path = f"../models/{model_name}"

model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
# `device` is not defined in this cell; it must already exist in the kernel
# (it is created in the training cell).
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device, max_length=512)


def get_text(r):
    """Build the generation prompt: English context and question, each followed by a newline.

    Returns a dict with the single key 'text'.
    """
    context, question = r['context_en'], r['question_en']
    # This is the "valid" prompt. The variant marked "invalid used in paper"
    # was f"{r['question']}\n".
    return {'text': f"{context}\n{question}\n"}

# Add the prompt 'text' column to every split of squad_alina.
squad_alina = squad_alina.map(get_text)

def prepare_train_features(examples):
    """Tokenize ``examples["text"]`` and add language-modelling labels.

    Every text is padded or truncated to 512 tokens by the module-level
    ``tokenizer``. ``labels`` is a copy of ``input_ids``.

    Returns:
        The tokenizer output with an extra "labels" entry.
    """
    tokenizer_options = dict(max_length=512, padding="max_length", truncation=True)
    features = tokenizer(examples["text"], **tokenizer_options)
    # NOTE(review): labels are copied from input_ids as-is, so padded positions
    # are part of the labels too — confirm whether they should be masked (-100).
    features["labels"] = features["input_ids"].copy()
    return features

# Tokenize squad_alina itself: the columns being removed are squad_alina's, so
# mapping the unrelated `squad` DatasetDict here was a mix-up of the two names.
tokenized_datasets = squad_alina.map(prepare_train_features, batched=True, remove_columns=squad_alina["train"].column_names)

# --- Pick the split to evaluate ----------------------------------------------
# squad_alina is built with only a 'train' split in this notebook, so indexing
# 'validation' raises KeyError. Use 'validation' when it exists, else 'train'.
# NOTE(review): when this falls back to 'train', the scores are measured on
# data that was part of the training set.
eval_split = 'validation' if 'validation' in squad_alina else 'train'
alina_eval = squad_alina[eval_split]

# --- Generate one answer per example -----------------------------------------
eval_answers = []

for item in tqdm(alina_eval):
    input_text = get_text(item)['text']
    # return_full_text=False makes the pipeline return only the continuation.
    # Splitting the full text on "\n" and taking index 1 returned the question
    # line of the prompt ("context\nquestion\n..."), not the generated answer,
    # and raised IndexError when there was no second line.
    output = qa_pipeline(input_text, num_return_sequences=1, return_full_text=False)
    prediction = output[0]['generated_text'].split("\n")[0].strip()
    eval_answers.append(prediction)

# --- Gold answers, taken from the same split in the same order ---------------
# Rebuilt here: the list created in the training cell holds answers of a
# different dataset and does not line up with eval_answers.
alina_answers = alina_eval['answers']
val_answers = [a['text_en'][0] if a['text_en'] else '' for a in alina_answers]
assert len(eval_answers) == len(val_answers), "predictions and gold answers must align"

# --- Token-overlap F1 over the whole split -----------------------------------
num_c = []
num_p = []
num_g = []

for predicted, gold in zip(eval_answers, val_answers):
    pred_tokens = str(predicted).split()
    gold_tokens = str(gold).split()

    # Tokens shared between the gold and the predicted answer. The previous code
    # intersected the prediction with itself, so every predicted token counted as shared.
    common = collections.Counter(gold_tokens) & collections.Counter(pred_tokens)

    num_c.append(sum(common.values()))
    num_p.append(len(pred_tokens))  # the number of predicted tokens
    num_g.append(len(gold_tokens))  # the number of gold tokens

total_common, total_pred, total_gold = sum(num_c), sum(num_p), sum(num_g)
# Guard the divisions: empty predictions or no overlap give 0.0 instead of ZeroDivisionError.
precision = 1.0 * total_common / total_pred if total_pred else 0.0
recall = 1.0 * total_common / total_gold if total_gold else 0.0
invalid_f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
print("english model", model_name, "invalid f1 score", invalid_f1_score)

# --- SQuAD v2 metric -----------------------------------------------------------
predictions = [
    {'prediction_text': text, 'id': str(idx), 'no_answer_probability': 0.}
    for idx, text in enumerate(eval_answers)
]
references = [
    {'answers': {'text': r['text_en'], 'answer_start': r['answer_start_en']}, 'id': str(idx)}
    for idx, r in enumerate(alina_answers)
]

results = squad_v2_metric.compute(predictions=predictions, references=references)
print("english model", model_name, "squad results", results)