# Load simple-legal-questions-pl dataset\

In [1]:
import json
from datasets import Dataset

passage_dict = {}
title_dict = {}
with open("data/passages.jl", "r", encoding="utf-8") as file:
    for line in file:
        d = json.loads(line)
        passage_dict[d["_id"]] = d["text"]
        title_dict[d["_id"]] = d["title"]

question_dict = {}
with open("data/questions.jl", "r", encoding="utf-8") as file:
    for line in file:
        d = json.loads(line)
        question_dict[d["_id"]] = d["text"]

answer_dict = {}
with open("data/answers.jl", "r", encoding="utf-8") as file:
    for line in file:
        d = json.loads(line)
        answer_dict[d["question-id"]] = d["answer"]

test_dataset = {
    "id": [],
    "title": [],
    "context": [],
    "question": [],
    "answers": [],
}

with open("data/relevant.jl", "r", encoding="utf-8") as file:
    for line in file:
        d = json.loads(line)
        # test only on questions that have answers
        if d["question-id"] not in answer_dict or answer_dict[d["question-id"]] == "":
            continue
        test_dataset["id"].append(d["question-id"])
        test_dataset["title"].append(title_dict[d["passage-id"]])
        test_dataset["context"].append(passage_dict[d["passage-id"]])
        test_dataset["question"].append(question_dict[d["question-id"]])
        test_dataset["answers"].append(answer_dict[d["question-id"]])

test_dataset = Dataset.from_dict(test_dataset)

# Load and process PoQuAD dataset

In [2]:
from datasets import load_dataset

poquad = load_dataset("clarin-pl/poquad")

In [3]:
def load(dataset, path):
    with open(path, "r", encoding="utf-8") as file:
        data = json.load(file)

    answers = []
    for data_dict in data["data"]:
        for p in data_dict["paragraphs"]:
            for qa in p["qas"]:
                if qa["is_impossible"]:
                    continue
                for answer in qa["answers"]:
                    answers.append(answer["generative_answer"])

    poquad_dataset = {
        "id": dataset["id"],
        "title": dataset["title"],
        "context": dataset["context"],
        "question": dataset["question"],
        "answers": answers,
    }

    return poquad_dataset

In [4]:
train_dataset = load(poquad["train"], "poquad-train.json")
train_dataset = Dataset.from_dict(train_dataset)

valid_dataset = load(poquad["validation"], "poquad-dev.json")
valid_dataset = Dataset.from_dict(valid_dataset)

# Load and prepare plt5 model

In [5]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("allegro/plt5-base")

# # Useful class
# model = AutoModelForSeq2SeqLM.from_pretrained("allegro/plt5-base")

model = AutoModelForSeq2SeqLM.from_pretrained("results/checkpoint-8661")
model.to(device)
[]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


[]

In [7]:
def preprocess(examples):
    inputs = [
        f"pytanie: {q} \n kontekst: {c}"
        for q, c in zip(examples["question"], examples["context"])
    ]
    targets = examples["answers"]

    # Tokenizing inputs and targets
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Tokenizing targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

    # Replace padding token id for labels to ignore index (-100) in loss computation
    labels["input_ids"] = [
        [l if l != tokenizer.pad_token_id else -100 for l in label]
        for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Create dataset and preprocess

train_dataset_preprocessed = train_dataset.map(preprocess, batched=True)
valid_dataset_preprocessed = valid_dataset.map(preprocess, batched=True)
test_dataset_preprocessed = test_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/46187 [00:00<?, ? examples/s]



Map:   0%|          | 0/5764 [00:00<?, ? examples/s]

Map:   0%|          | 0/593 [00:00<?, ? examples/s]

In [8]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=2000,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="steps",
    save_steps=2000,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_preprocessed,
    eval_dataset=valid_dataset_preprocessed,
    processing_class=tokenizer,
)

# # Training done. Now we load the model from checkpoint
# trainer.train()



# Evaluate model

In [53]:
from evaluate import load
import numpy as np

exact_match = load("exact_match")


def compute_f1(predictions, labels):
    f1 = 0
    for predicted_tokens, true_tokens in zip(predictions, labels):
        true_tokens = set(true_tokens.tolist())
        predicted_tokens = set(predicted_tokens.tolist())

        if 0 in true_tokens:
            true_tokens.remove(0)
        if 0 in predicted_tokens:
            predicted_tokens.remove(0)

        # Compute the intersection
        common_tokens = true_tokens.intersection(predicted_tokens)

        # Compute precision and recall
        precision = (
            len(common_tokens) / len(predicted_tokens) if predicted_tokens else 0
        )
        recall = len(common_tokens) / len(true_tokens) if true_tokens else 0

        # Compute F1 score
        if precision + recall == 0:
            return 0.0
        f1 += 2 * precision * recall / (precision + recall)
    f1 = f1 / len(predictions)
    return f1


def compute_metrics(predictions, labels):

    encoded_preds = tokenizer(
        predictions,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]

    encoded_labels = tokenizer(
        labels,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]

    print("computing metrics")
    em = exact_match.compute(predictions=predictions, references=labels)["exact_match"]
    f1 = compute_f1(encoded_preds, encoded_labels)

    return em, f1

# Test

In [11]:
results = trainer.evaluate(eval_dataset=test_dataset_preprocessed)
print(results)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'eval_loss': 2.360666275024414, 'eval_model_preparation_time': 0.003, 'eval_runtime': 8.9475, 'eval_samples_per_second': 66.276, 'eval_steps_per_second': 4.247}


```{'eval_loss': 14.340641021728516, 'eval_model_preparation_time': 0.005, 'eval_runtime': 8.9796, 'eval_samples_per_second': 66.039, 'eval_steps_per_second': 4.232} before training```

In [50]:
from tqdm import tqdm

batch_size = 16

generated_test = []
for i in tqdm(range(0, len(test_dataset_preprocessed), batch_size)):
    batch_inputs = torch.tensor(
        test_dataset_preprocessed[i : i + batch_size]["input_ids"]
    ).to(device)
    att_mask = torch.tensor(
        test_dataset_preprocessed[i : i + batch_size]["attention_mask"]
    ).to(device)

    batch_generated_ids = model.generate(
        input_ids=batch_inputs,
        attention_mask=att_mask,
        max_length=128,
    )
    generated_test.append(batch_generated_ids)

100%|██████████████████████████████████████████████████████████████████████████████████| 38/38 [00:29<00:00,  1.28it/s]


In [56]:
import itertools

decoded_preds_test = [
    tokenizer.batch_decode(g, skip_special_tokens=True) for g in generated_test
]
decoded_preds_test = list(itertools.chain.from_iterable(decoded_preds_test))

labels_test = test_dataset_preprocessed["answers"]

In [82]:
from time import time
from pandas import DataFrame

em, f1 = compute_metrics(decoded_preds_test, labels_test)

df = DataFrame()
df["metric"] = ["Exact Match", "F1 Score"]
df["value"] = [em, f1]

df

computing metrics


Unnamed: 0,metric,value
0,Exact Match,0.150084
1,F1 Score,0.525069


# Valid

In [35]:
results = trainer.evaluate(eval_dataset=valid_dataset_preprocessed)
print(results)

{'eval_loss': 0.6636286973953247, 'eval_model_preparation_time': 0.003, 'eval_runtime': 82.5138, 'eval_samples_per_second': 69.855, 'eval_steps_per_second': 4.375}


In [58]:
batch_size = 16

generated_valid = []
for i in tqdm(range(0, len(valid_dataset_preprocessed), batch_size)):
    batch_inputs = torch.tensor(
        valid_dataset_preprocessed[i : i + batch_size]["input_ids"]
    ).to(device)
    att_mask = torch.tensor(
        valid_dataset_preprocessed[i : i + batch_size]["attention_mask"]
    ).to(device)

    batch_generated_ids = model.generate(
        input_ids=batch_inputs,
        attention_mask=att_mask,
        max_length=128,
    )
    generated_valid.append(batch_generated_ids)

100%|████████████████████████████████████████████████████████████████████████████████| 361/361 [02:30<00:00,  2.40it/s]


In [60]:
decoded_preds_valid = [
    tokenizer.batch_decode(g, skip_special_tokens=True) for g in generated_valid
]
decoded_preds_valid = list(itertools.chain.from_iterable(decoded_preds_valid))

labels_valid = valid_dataset_preprocessed["answers"]

In [83]:
em, f1 = compute_metrics(decoded_preds_valid, labels_valid)

df = DataFrame()
df["metric"] = ["Exact Match", "F1 Score"]
df["value"] = [em, f1]

df

computing metrics


Unnamed: 0,metric,value
0,Exact Match,0.408223
1,F1 Score,0.718381


The results are much better on the validation set! It's more similar to the training dataset. The answers are more brief than in the test dataset

# Compare results - test

In [78]:
idx_start = 30
idx_end = 35

for q, p, a in zip(
    test_dataset_preprocessed["question"][idx_start:idx_end],
    decoded_preds_test[idx_start:idx_end],
    labels_test[idx_start:idx_end],
):
    print(f"question:\n    {q}")

    print(f"prediction:\n    {p}")

    print(f"actual_answer:\n    {a}\n")

question:
    Jakiej karze podlega armator, który wykonuje rybołówstwo morskie w polskich obszarach morskich, z naruszeniem przepisów ustawy?
prediction:
    do wysokości nieprzekraczającej pięćdziesięciokrotnego przeciętnego wynagrodzenia miesięcznego w gospodarce narodowej za rok poprzedzający, ogłaszanego przez Prezesa Głównego Urzędu Statystycznego
actual_answer:
    w zależności od wielkości łodzi podlega karze pieniężnej do wysokości nieprzekraczającej pięćdziesięciokrotnego przeciętnego wynagrodzenia miesięcznego w gospodarce narodowej za rok poprzedzający

question:
    Czy żołnierze przy wykonaniu czynności służbowej nie muszą się przedstawiać?
prediction:
    nie
actual_answer:
    eie umundurowany funkcjonariusz jest obowiązany na żądanie obywatela, wobec którego wykonuje czynność służbową, okazać legitymację służbową

question:
    W jakim przypadku policja może dokonać przeszukania pomieszczeń w domu?
prediction:
    w wypadkach nie cierpiących zwłoki


# Compare results - validation

In [81]:
idx_start = 10
idx_end = 15

for q, p, a in zip(
    valid_dataset_preprocessed["question"][idx_start:idx_end],
    decoded_preds_valid[idx_start:idx_end],
    labels_valid[idx_start:idx_end],
):
    print(f"question:\n    {q}")

    print(f"prediction:\n    {p}")

    print(f"actual_answer:\n    {a}\n")

question:
    Wynagrodzenie w jakiej wysokości otrzymała para za zezwolenie na opublikowanie relacji ze ślubu?
prediction:
    pięćset tysięcy funtów
actual_answer:
    pięćset tysięcy funtów

question:
    Dlaczego w zapusty ludzie spożywali więcej jedzenia?
prediction:
    tak
actual_answer:
    chciano zaspokoić głód przed zbliżającym się postem

question:
    Jakie dania jedzono w czasie karnawału? 
prediction:
    placki ziemniaczane
actual_answer:
    tłuste

question:
    Czy w zapusty dozwolone było spożywanie mięsa?
prediction:
    nie
actual_answer:
    tak

question:
    Co oprócz placków ziemniaczanych smażono w czasie karnawału? 
prediction:
    pączki, faworki i bliny
actual_answer:
    pączki, faworki i bliny


# Questions

1. Does the performance on the validation dataset reflects the performance on your test set?
    - It doesn't. Even though the test dataset contains abstractive answers, the abstractive training/validation targets extracted from the json files are much shorter. For example "nie" instead of "jest to zabronione". Because of that the model learns to also give very short answers. The abstractive naswers prepared by students for the simple-legal-questions-pl dataset are much more complex than simply "yes"/"no".
2. What are the outcomes of the model on your test questions? Are they satisfying? If not, what might be the reason
   for that?
    - The answers on the test dataset aren't perfect but we can see that model learned to find some of the information that we want. In some cases it gives the wrong answer. It's possible that using the larger model or training it for more epochs would improve the performance
4. Why extractive question answering is not well suited for inflectional languages?
    - An answer extracted for a qa task in inflectional language will very often, if not most of the time, in the incorrect form that does not match the form used to formulate the actual answer even if the extracted information is correct.