In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import Trainer, TrainingArguments

In [2]:
# import subprocess
# import os

# result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
# output = result.stdout
# for line in output.splitlines():
#     if '=' in line:
#         var, value = line.split('=', 1)
#         os.environ[var] = value

# load base modal and tokenizer

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-small")


In [4]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# load dataset

In [5]:
import json

def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

file_path = '../MISC/Dataset/train-v2.0.json'
data = load_data(file_path)

In [6]:
train_data = data['data']

In [7]:
# Prepare data to be converted into the format required by the datasets library
def extract_qas(data):
    questions = []
    contexts = []
    answers = []
    is_impossible = []

    for article in data:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                questions.append(qa['question'])
                contexts.append(context)  # Reuse the same context
                is_impossible.append(qa['is_impossible'])

                # Answers
                if qa['is_impossible']:
                    if 'plausible_answers' in qa and qa['plausible_answers']:
                        answers.append(qa['plausible_answers'][0]['text'])
                    else:
                        answers.append("")  
                else:
                    if qa['answers']:
                        answers.append(qa['answers'][0]['text'])
                    else:
                        answers.append("")  

    return {
        'question': questions,
        'context': contexts,
        'answer': answers,
        'is_impossible': is_impossible
    }





#  processe data
processed_data = extract_qas(train_data)
dataset = Dataset.from_dict(processed_data)

In [8]:
dataset

Dataset({
    features: ['question', 'context', 'answer', 'is_impossible'],
    num_rows: 130319
})

In [9]:
def preprocess_t5_qa(examples):
    # Combine questions and context into the format required for model input
    questions = examples['question']
    contexts = examples['context']
    source_texts = [f"question: {q} context: {c}" for q, c in zip(questions, contexts)]

    answers = examples['answer']
    target_texts = [ans if ans else "no answer" for ans in answers]

    # Use tokenizer to process batch data
    model_inputs = tokenizer(source_texts, max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    model_outputs = tokenizer(target_texts, max_length=128, truncation=True, padding="max_length", return_tensors="pt")

    # print("Input IDs shape:", model_inputs['input_ids'].shape)
    # print("Labels shape:", model_outputs['input_ids'].shape)
    return {
        "input_ids": model_inputs['input_ids'],
        "attention_mask": model_inputs['attention_mask'],
        "labels": model_outputs['input_ids']
    }


processed_dataset = dataset.map(preprocess_t5_qa, batched=True, batch_size=64)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

In [10]:
processed_dataset

Dataset({
    features: ['question', 'context', 'answer', 'is_impossible', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 130319
})

# eval

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Convert NumPy arrays to Python lists and then to strings
    predictions = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predictions.tolist()]
    labels = [tokenizer.decode(label, skip_special_tokens=True) for label in labels.tolist()]

    # Ensure predictions and labels are text data
    predictions = [pred.strip() for pred in predictions]
    labels = [label.strip() for label in labels]

    # Compute EM and F1 scores
    squad_metric = load_metric("squad")
    results = squad_metric.compute(predictions=predictions, references=labels)

    return {"exact_match": results["exact_match"], "f1": results["f1"]}


# Training

In [13]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir='./base',
    # evaluation_strategy='epoch',  # Evaluation strategy set to 'epoch'
    save_strategy = 'epoch',
    # evaluation_strategy='steps',  # Evaluation strategy set to 'epoch'
    # eval_steps=20,
    logging_steps=100,
    save_total_limit=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    load_best_model_at_end=False,
    metric_for_best_model='f1',
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)




In [14]:
trainer.train()

Step,Training Loss
100,3.7575
200,0.0857
300,0.0322
400,0.027
500,0.025
600,0.0236
700,0.0236
800,0.0228
900,0.0224
1000,0.0229


TrainOutput(global_step=6111, training_loss=0.08271375485526582, metrics={'train_runtime': 2814.5731, 'train_samples_per_second': 138.905, 'train_steps_per_second': 2.171, 'total_flos': 5.29128246780887e+16, 'train_loss': 0.08271375485526582, 'epoch': 3.0})

# Command line test

In [15]:
import torch


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Define problem and context
context = "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."

question = "What century did the Normans first gain their separate identity?"
truth = "10th"
# Encode the input text using a tokenizer and make sure the input data is also on the correct device
input_text = f"question: {question} context: {context}"
input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)

# Generate answer
outputs = model.generate(input_ids, max_length=40)

# Convert the output token IDs to text
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Answer:", answer)
print("truth:", truth)

Generated Answer: 10th
truth: 10th


In [16]:
question = "Who was the Norse leader?"
truth = "Rollo"

input_text = f"question: {question} context: {context}"
input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)


outputs = model.generate(input_ids, max_length=40)


answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Answer:", answer)
print("truth:", truth)

Generated Answer: King Charles III of West Francia
truth: Rollo


In [17]:
question = "From which countries did the Norse originate?"
truth = "Denmark, Iceland and Norway"

input_text = f"question: {question} context: {context}"
input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)


outputs = model.generate(input_ids, max_length=40)


answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Answer:", answer)
print("truth:", truth)

Generated Answer: Denmark, Iceland and Norway
truth: Denmark, Iceland and Norway


In [18]:
question = "In what country is Normandy located?"
truth = "France"

input_text = f"question: {question} context: {context}"
input_ids = tokenizer(input_text, return_tensors='pt').input_ids.to(device)


outputs = model.generate(input_ids, max_length=40)


answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated Answer:", answer)
print("truth:", truth)

Generated Answer: France
truth: France
