# Load simple-legal-questions-pl dataset

In [1]:
import json
from datasets import Dataset

# Passages: one JSON object per line, indexed by passage id for both the
# passage text and its title.
passage_dict = {}
title_dict = {}
with open("data/passages.jl", "r", encoding="utf-8") as passages_file:
    for raw_line in passages_file:
        passage = json.loads(raw_line)
        passage_id = passage["_id"]
        passage_dict[passage_id] = passage["text"]
        title_dict[passage_id] = passage["title"]

# Questions: question id -> question text.
with open("data/questions.jl", "r", encoding="utf-8") as questions_file:
    question_dict = {
        question["_id"]: question["text"]
        for question in map(json.loads, questions_file)
    }

# Answers: question id -> answer text.
with open("data/answers.jl", "r", encoding="utf-8") as answers_file:
    answer_dict = {
        answer["question-id"]: answer["answer"]
        for answer in map(json.loads, answers_file)
    }

# Column-oriented container, filled one (question, relevant passage) pair at a time.
test_columns = {
    "id": [],
    "title": [],
    "context": [],
    "question": [],
    "answers": [],
}

with open("data/relevant.jl", "r", encoding="utf-8") as relevant_file:
    for raw_line in relevant_file:
        pair = json.loads(raw_line)
        question_id = pair["question-id"]
        passage_id = pair["passage-id"]

        # Test only on questions that have a non-empty answer.
        if answer_dict.get(question_id, "") == "":
            continue

        test_columns["id"].append(question_id)
        test_columns["title"].append(title_dict[passage_id])
        test_columns["context"].append(passage_dict[passage_id])
        test_columns["question"].append(question_dict[question_id])
        test_columns["answers"].append(answer_dict[question_id])

test_dataset = Dataset.from_dict(test_columns)

# Load and process PoQuAD dataset

In [2]:
from datasets import load_dataset

# Load the PoQuAD dataset by its "clarin-pl/poquad" identifier; its "train"
# and "validation" splits are consumed further down in this notebook.
poquad = load_dataset("clarin-pl/poquad")

In [3]:
def load(dataset, path):
    """Pair a PoQuAD split with the generative answers from its raw JSON file.

    The ``id``/``title``/``context``/``question`` columns are taken from
    ``dataset``; the ``answers`` column is rebuilt from the SQuAD-style JSON
    file at ``path`` by walking articles -> paragraphs -> questions in file
    order and collecting each answerable question's ``generative_answer``.

    Args:
        dataset: Mapping-like split supporting column access by name
            (``dataset["id"]`` etc.).
        path: Path to the raw JSON file holding the same split.

    Returns:
        dict with the keys ``id``, ``title``, ``context``, ``question`` and
        ``answers``, suitable for ``Dataset.from_dict``.

    Raises:
        ValueError: If the number of collected answers differs from the number
            of rows in ``dataset``. The pairing is purely positional, so a
            mismatch would silently attach answers to the wrong questions.
    """
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)

    answers = []
    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                # Unanswerable questions carry no generative answer.
                if qa.get("is_impossible", False) or not qa["answers"]:
                    continue
                # Exactly one answer per question: appending every reference
                # answer would shift all following rows out of alignment.
                answers.append(qa["answers"][0]["generative_answer"])

    ids = dataset["id"]
    if len(answers) != len(ids):
        raise ValueError(
            f"{path}: found {len(answers)} answerable questions but the "
            f"dataset split has {len(ids)} rows; cannot align answers."
        )

    return {
        "id": ids,
        "title": dataset["title"],
        "context": dataset["context"],
        "question": dataset["question"],
        "answers": answers,
    }

In [4]:
# Build each split in a single step: merge the hub columns with the generative
# answers read from the raw JSON file, then wrap the result as a Dataset.
train_dataset = Dataset.from_dict(load(poquad["train"], "poquad-train.json"))

valid_dataset = Dataset.from_dict(load(poquad["validation"], "poquad-dev.json"))

# Load and prepare plt5 model

In [5]:
import torch

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [6]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("allegro/plt5-base")

# Important: the checkpoint is loaded through AutoModelForSeq2SeqLM, not the
# bare AutoModel. NOTE(review): the original remark here was cut off
# mid-sentence; presumably the point is that only the seq2seq LM class carries
# the language-modelling head the Trainer needs to turn `labels` into a loss —
# confirm.
model = AutoModelForSeq2SeqLM.from_pretrained("allegro/plt5-base")
model.to(device)
[]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


[]

In [7]:
def preprocess(examples):
    """Tokenize a batch of question/context/answer examples for seq2seq training.

    Each model input is the prompt ``"pytanie: <question> \\n kontekst: <context>"``
    truncated/padded to 512 tokens; each target is the answer text
    truncated/padded to 128 tokens.

    Args:
        examples: Batch mapping with the columns ``question``, ``context`` and
            ``answers`` (lists of strings), as passed by ``Dataset.map`` with
            ``batched=True``.

    Returns:
        The tokenizer output for the prompts with an added ``labels`` entry
        holding the target token ids, where padding positions are replaced by
        -100 so they are ignored in the loss computation.
    """
    inputs = [
        f"pytanie: {q} \n kontekst: {c}"
        for q, c in zip(examples["question"], examples["context"])
    ]
    targets = examples["answers"]

    # Plain Python lists are returned on purpose (no return_tensors): the
    # label masking below then compares ints instead of 0-d tensors.
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=128,
        padding="max_length",
        truncation=True,
    )

    # Replace padding token id for labels to ignore index (-100) in loss computation
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


# Tokenize all three splits with the same batched preprocessing function,
# in the order train -> validation -> test.
(
    train_dataset_preprocessed,
    valid_dataset_preprocessed,
    test_dataset_preprocessed,
) = (
    split.map(preprocess, batched=True)
    for split in (train_dataset, valid_dataset, test_dataset)
)

Map:   0%|          | 0/46187 [00:00<?, ? examples/s]



Map:   0%|          | 0/5764 [00:00<?, ? examples/s]

Map:   0%|          | 0/593 [00:00<?, ? examples/s]

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluation and checkpointing run on the same 2000-step interval, and the
    # best checkpoint is reloaded once training finishes.
    eval_strategy="steps",
    eval_steps=2000,
    save_strategy="steps",
    save_steps=2000,
    load_best_model_at_end=True,
    # Logging / reporting: log every 10 steps, no external tracker, no hub upload.
    logging_steps=10,
    report_to="none",
    push_to_hub=False,
)

# Fine-tune on the PoQuAD training split, validating on the PoQuAD validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset_preprocessed,
    eval_dataset=valid_dataset_preprocessed,
)

# Train
trainer.train()




Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

# Evaluate model

In [None]:
from evaluate import load
import numpy as np

# NOTE(review): the second positional parameter of evaluate.load is the metric
# configuration name, so this appears to load only "exact_match" (with config
# "f1") rather than both an exact-match and an F1 metric — confirm against the
# evaluate documentation.
metric = load("exact_match", "f1")

def _normalize_answer(text):
    """Lower-case ``text``, drop Unicode punctuation and collapse whitespace."""
    import unicodedata

    without_punctuation = "".join(
        ch for ch in text.lower() if not unicodedata.category(ch).startswith("P")
    )
    return " ".join(without_punctuation.split())


def _token_f1(prediction, reference):
    """Return the token-overlap F1 (0..1) of two already-normalized strings."""
    from collections import Counter

    pred_tokens = prediction.split()
    ref_tokens = reference.split()
    if not pred_tokens or not ref_tokens:
        # Both empty counts as a match; exactly one empty is a complete miss.
        return float(pred_tokens == ref_tokens)

    overlap = sum((Counter(pred_tokens) & Counter(ref_tokens)).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(pred_tokens)
    recall = overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)


def compute_metrics(eval_preds):
    """Compute exact-match and token-level F1 for a prediction output.

    Args:
        eval_preds: Sequence whose first item holds the predictions and whose
            second item holds the label ids (e.g. the result of
            ``trainer.predict``). Predictions may be token ids of shape
            (examples, length), logits of shape (examples, length, vocab), or
            a tuple whose first element is one of those.

    Returns:
        Tuple ``(exact_match, f1, decoded_preds, decoded_labels)`` where both
        scores are percentages (0-100) averaged over all examples and the last
        two items are the decoded prediction and reference strings.
    """
    predictions, labels = eval_preds[0], eval_preds[1]

    # Model outputs can arrive as a tuple of arrays; the first entry is used.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    if predictions.ndim == 3:
        # Logits -> most likely token id per position.
        # NOTE(review): with a plain Trainer these are presumably
        # teacher-forced logits rather than free generation — confirm.
        predictions = predictions.argmax(axis=-1)
    labels = np.asarray(labels)

    # -100 marks ignored/padded positions and is not a valid token id, so map
    # it back to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels < 0, pad_id, labels)

    print("decoding predictions")
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    print("decoding labels")
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    print("computing metrics")
    n = len(decoded_preds)
    if n == 0:
        return 0.0, 0.0, decoded_preds, decoded_labels

    exact_hits = 0
    f1_total = 0.0
    for pred_text, ref_text in zip(decoded_preds, decoded_labels):
        norm_pred = _normalize_answer(pred_text)
        norm_ref = _normalize_answer(ref_text)
        exact_hits += int(norm_pred == norm_ref)
        f1_total += _token_f1(norm_pred, norm_ref)

    exact_match = 100.0 * exact_hits / n
    f1 = 100.0 * f1_total / n
    return exact_match, f1, decoded_preds, decoded_labels


### Test

In [None]:
# Run the trainer's evaluation on the preprocessed test split and print the
# returned metrics.
results = trainer.evaluate(eval_dataset=test_dataset_preprocessed)
print(results)

Before training, the same evaluation gave: `{'eval_loss': 14.340641021728516, 'eval_model_preparation_time': 0.005, 'eval_runtime': 8.9796, 'eval_samples_per_second': 66.039, 'eval_steps_per_second': 4.232}`

In [None]:
# Model outputs for the test split; consumed by compute_metrics in the next cell.
predictions = trainer.predict(test_dataset_preprocessed)

In [None]:
from time import time
# This cell takes far too much time.
# NOTE(review): a likely cause is that `predictions` holds full-vocabulary
# logits rather than token ids — confirm by inspecting its shape.

start = time()
exact_match, f1, test_preds, test_labels = compute_metrics(predictions)
end = time()

print(f"Evaluation time: {end - start} s")
print(f"Exact Match:     {exact_match}")
print(f"F1 Score:        {f1}")

### Valid

In [None]:
# Run the trainer's evaluation on the preprocessed validation split and print
# the returned metrics.
results = trainer.evaluate(eval_dataset=valid_dataset_preprocessed)
print(results)

In [None]:
# Model outputs for the validation split; this rebinds `predictions`, which
# previously held the test-split outputs.
predictions = trainer.predict(valid_dataset_preprocessed)

In [None]:
# Time the metric computation on the validation predictions and report the scores.
start = time()
exact_match, f1, valid_predictions, valid_labels = compute_metrics(predictions)
end = time()

print(f"Evaluation time: {end - start} s")
print(f"Exact Match:     {exact_match}")
print(f"F1 Score:        {f1}")

# Compare results

In [None]:
# NOTE(review): the original cell contained only the truncated name `test_p`,
# which raises NameError; assumed to mean the decoded test predictions.
# Only the first few are shown to keep the output readable.
test_preds[:10]

In [None]:
# Evaluate on the preprocessed test split again (same call as in the Test
# section) and print the returned metrics.
results = trainer.evaluate(eval_dataset=test_dataset_preprocessed)
print(results)

In [None]:
# Recompute the test-split model outputs; `predictions` last held the
# validation-split outputs.
predictions = trainer.predict(test_dataset_preprocessed)

In [None]:
# The other cells in this notebook unpack compute_metrics as the 4-tuple
# (exact match, F1, decoded predictions, decoded labels), so the scores are
# read by position here rather than by dictionary key.
metrics = compute_metrics(predictions)
print("Exact Match:", metrics[0])
print("F1 Score:", metrics[1])