# Load simple-legal-questions-pl dataset\

In [1]:
import json
from datasets import Dataset

passage_dict = {}
title_dict = {}
with open("data/passages.jl", "r", encoding="utf-8") as file:
    for line in file:
        d = json.loads(line)
        passage_dict[d["_id"]] = d["text"]
        title_dict[d["_id"]] = d["title"]

question_dict = {}
with open("data/questions.jl", "r", encoding="utf-8") as file:
    for line in file:
        d = json.loads(line)
        question_dict[d["_id"]] = d["text"]

answer_dict = {}
with open("data/answers.jl", "r", encoding="utf-8") as file:
    for line in file:
        d = json.loads(line)
        answer_dict[d["question-id"]] = d["answer"]

test_dataset = {
    "id": [],
    "title": [],
    "context": [],
    "question": [],
    "answers": [],
}

with open("data/relevant.jl", "r", encoding="utf-8") as file:
    for line in file:
        d = json.loads(line)
        # test only on questions that have answers
        if d["question-id"] not in answer_dict or answer_dict[d["question-id"]] == "":
            continue
        test_dataset["id"].append(d["question-id"])
        test_dataset["title"].append(title_dict[d["passage-id"]])
        test_dataset["context"].append(passage_dict[d["passage-id"]])
        test_dataset["question"].append(question_dict[d["question-id"]])
        test_dataset["answers"].append(answer_dict[d["question-id"]])

test_dataset = Dataset.from_dict(test_dataset)

# Load and process PoQuAD dataset

In [2]:
from datasets import load_dataset

poquad = load_dataset("clarin-pl/poquad")

In [3]:
def load(dataset, path):
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)

    answers = []
    for data_dict in data["data"]:
        for p in data_dict["paragraphs"]:
            for qa in p["qas"]:
                if qa["is_impossible"]:
                    continue
                for answer in qa["answers"]:
                    answers.append(answer["generative_answer"])

    poquad_dataset = {
        "id": dataset["id"],
        "title": dataset["title"],
        "context": dataset["context"],
        "question": dataset["question"],
        "answers": answers,
    }

    return poquad_dataset

In [4]:
train_dataset = load(poquad["train"], "poquad-train.json")
train_dataset = Dataset.from_dict(train_dataset)

valid_dataset = load(poquad["validation"], "poquad-dev.json")
valid_dataset = Dataset.from_dict(valid_dataset)

# Load and prepare plt5 model

In [5]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("allegro/plt5-base")

# # Important class. Without that the trainer won't convert the
# model = AutoModelForSeq2SeqLM.from_pretrained("allegro/plt5-base")

model = AutoModelForSeq2SeqLM.from_pretrained("results/checkpoint-8661")
model.to(device)
[]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


[]

In [7]:
def preprocess(examples):
    inputs = [
        f"pytanie: {q} \n kontekst: {c}"
        for q, c in zip(examples["question"], examples["context"])
    ]
    targets = examples["answers"]

    # Tokenizing inputs and targets
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Tokenizing targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    # Replace padding token id for labels to ignore index (-100) in loss computation
    labels["input_ids"] = [
        [l if l != tokenizer.pad_token_id else -100 for l in label]
        for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Create dataset and preprocess

train_dataset_preprocessed = train_dataset.map(preprocess, batched=True)
valid_dataset_preprocessed = valid_dataset.map(preprocess, batched=True)
test_dataset_preprocessed = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/46187 [00:00<?, ? examples/s]



Map:   0%|          | 0/5764 [00:00<?, ? examples/s]

Map:   0%|          | 0/593 [00:00<?, ? examples/s]

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=2000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="steps",
    save_steps=2000,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_preprocessed,
    eval_dataset=valid_dataset_preprocessed,
    processing_class=tokenizer,
)

# # Training done. Now we load the model from checkpoint
# trainer.train()




# Evaluate model

In [65]:
from evaluate import load
import numpy as np

exact_match = load("exact_match")


def compute_metrics(predictions, labels):

    encoded_preds = tokenizer(
        predictions,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]

    encoded_labels = tokenizer(
        labels,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]

    print("computing metrics")
    em = exact_match.compute(predictions=predictions, references=labels)
    f1 = f1_score.compute(predictions=encoded_preds, references=encoded_labels)

    em, f1

# Test

In [10]:
results = trainer.evaluate(eval_dataset=test_dataset_preprocessed)
print(results)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'eval_loss': 2.360666275024414, 'eval_model_preparation_time': 0.005, 'eval_runtime': 8.7814, 'eval_samples_per_second': 67.529, 'eval_steps_per_second': 4.327}


```{'eval_loss': 14.340641021728516, 'eval_model_preparation_time': 0.005, 'eval_runtime': 8.9796, 'eval_samples_per_second': 66.039, 'eval_steps_per_second': 4.232} before training```

In [11]:
from tqdm import tqdm

batch_size = 16

generated = []
for i in tqdm(range(0, len(test_dataset_preprocessed), batch_size)):
    batch_inputs = torch.tensor(
        test_dataset_preprocessed[i : i + batch_size]["input_ids"]
    ).to(device)
    att_mask = torch.tensor(
        test_dataset_preprocessed[i : i + batch_size]["attention_mask"]
    ).to(device)

    batch_generated_ids = model.generate(
        input_ids=batch_inputs,
        attention_mask=att_mask,
        max_length=128,
    )
    generated.append(batch_generated_ids)

100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [00:27<00:00,  1.37it/s]


In [26]:
generated[0]

tensor([[   0,  279,    1,  ...,    0,    0,    0],
        [   0,  580,    1,  ...,    0,    0,    0],
        [   0,  272,  853,  ...,    0,    0,    0],
        ...,
        [   0,  279,    1,  ...,    0,    0,    0],
        [   0,  423, 1389,  ...,    0,    0,    0],
        [   0, 1192,  264,  ...,    0,    0,    0]], device='cuda:0')

In [37]:
import itertools

decoded_preds = [tokenizer.batch_decode(g, skip_special_tokens=True) for g in generated]
decoded_preds = list(itertools.chain.from_iterable(decoded_preds))

labels = test_dataset_preprocessed["answers"]

In [66]:
from time import time

# This cell takes far too much time. I don't know why but it doesn't seem right

start = time()
em, f1 = compute_metrics(decoded_preds, labels)
end = time()

print(f"Evaluation time: {end - start} s")
print(f"Exact Match:     {em}")
print(f"F1 Score:        {f1}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

# Compare results

In [None]:
decoded_preds[:10]

In [None]:
labels[:10]

# Valid

In [None]:
results = trainer.evaluate(eval_dataset=valid_dataset_preprocessed)
print(results)

In [None]:
predictions = trainer.predict(valid_dataset_preprocessed)

In [None]:
start = time()
exact_match, f1, valid_preds, valid_labels = compute_metrics(predictions)
end = time()

print(f"Evaluation time: {end - start} s")
print(f"Exact Match:     {exact_match}")
print(f"F1 Score:        f1}")