Marcin Wardyński  
czwartek, 8:00

Dla przygotowania danych treningowych i walidacyjnych dla modelu trzeba przerobić istniejące pliki json. Format nowych plików powininen zawierać elementy:
- id
- title
- context
- question
- generative_answer
- is_impossible
a każdy element powinien zawierać się w pojedyńczym wierszu

In [13]:
import json

def convert_data(data):
    results = []
    i = 0
    for article in data.get("data", []):
        for paragraph in article.get("paragraphs", []):
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                answers = qa['answers'] if 'answers' in qa.keys() else qa['plausible_answers']
                for answer in answers:
                    i += 1
                    results.append({
                        "id": i,
                        "context": context,
                        "question": question,
                        "answers": {
                            "text": [answer["generative_answer"]]
                        }
                    })
    return results


def convert_format(input_filepath, output_filepath):
    with open(input_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    output_data = convert_data(data)
    output_wrapped_data = {"version": "0.1.0", "data": output_data}

    with open(output_filepath, "w", encoding="utf-8") as f:
        json.dump(output_wrapped_data, f, ensure_ascii=False, indent=2)


In [46]:
convert_format("poquad-train.json", "poquad-conv-train.json")
convert_format("poquad-dev.json", "poquad-conv-dev.json")

python run_seq2seq_qa.py \
  --model_name_or_path allegro/plt5-base \
  --dataset_name clarin-pl/poquad \
  --context_column context \
  --question_column question \
  --answer_column answers \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 12 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir ../../../../model_poquad_extr

python run_seq2seq_qa.py \
  --model_name_or_path allegro/plt5-base \
  --train_file /Users/mwardynski/Documents/ds/_semestr_9/przetwarzanie_jezyka_naturalnego/labs/lab9/poquad-conv-train.json \
  --validation_file /Users/mwardynski/Documents/ds/_semestr_9/przetwarzanie_jezyka_naturalnego/labs/lab9/poquad-conv-dev.json \
  --context_column context \
  --question_column question \
  --answer_column answers \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 12 \
  --learning_rate 3e-5 \
  --num_train_epochs 3 \
  --max_seq_length 384 \
  --doc_stride 128 \
  --output_dir ../../../../model_poquad_abstr

In [14]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

ready_model_name = "apohllo/plt5-base-poquad"
ready_model_tokenizer = AutoTokenizer.from_pretrained(ready_model_name)
ready_model = AutoModelForSeq2SeqLM.from_pretrained(ready_model_name)

In [15]:
context = "Art. 345. § 1. Żołnierz, który dopuszcza się czynnej napaści na przełożonego, podlega karze aresztu wojskowego albo pozbawienia wolności do lat 3. § 2. Jeżeli sprawca dopuszcza się czynnej napaści w związku z pełnieniem przez przełożonego obowiązków służbowych albo wspólnie z innymi żołnierzami lub w obecności zebranych żołnierzy, podlega karze pozbawienia wolności od 6 miesięcy do lat 8. § 3. Jeżeli sprawca czynu określonego w § 1 lub 2 używa broni, noża lub innego podobnie niebezpiecznego przedmiotu, podlega karze pozbawienia wolności od roku do lat 10. § 4. Karze przewidzianej w § 3 podlega sprawca czynu określonego w § 1 lub 2, jeżeli jego następstwem jest skutek określony w art. 156 lub 157 § 1."
question = "Czy żołnierz, który dopuszcza się czynnej napaści na przełożonego podlega karze pozbawienia wolności?"

input_text = f"question: {question} context: {context}"

inputs = ready_model_tokenizer(input_text, return_tensors="pt")

outputs = ready_model.generate(inputs["input_ids"], max_length=50, num_beams=5, early_stopping=True)

answer = ready_model_tokenizer.decode(outputs[0], skip_special_tokens=True)

print(f"Question: {question}")
print(f"Answer: {answer}")

Question: Czy żołnierz, który dopuszcza się czynnej napaści na przełożonego podlega karze pozbawienia wolności?
Answer: tak


In [17]:
import json

NO_ANS = "no_ans"

class QA:
    def __init__(self, question_id, question, answer):
        self.question_id = question_id
        self.question = question
        self.answer = answer

class Entry:
    def __init__(self, passage_id, passage_text, qas):
        self.passage_id = passage_id
        self.passage_text = passage_text
        self.qas = qas

def init_qas_with_answers(filepath):
    qa_dict = {}
    with open(filepath, "r") as file:
        for line in file:
            record = json.loads(line.strip())
            
            if "score" in record and record["score"] == "1"\
                    and "question-id" in record and "answer" in record:
                qa = QA(record["question-id"], None, record["answer"])
                qa_dict[record["question-id"]] = qa
    return qa_dict

def match_questions_to_answers(filepath, qa_dict):
    q_wo_a = []
    with open(filepath, "r") as file:
        for line in file:
            record = json.loads(line.strip())
            
            if "text" in record and "_id" in record:
                if record["_id"] in qa_dict.keys():
                    qa = qa_dict[record["_id"]]
                    qa.question = record["text"]
                else:
                    qa_dict[NO_ANS].append(record["text"])


def organize_question_to_context_relations(filepath, qa_dict):
    qc_dict = {}
    with open(filepath, "r") as file:
        for line in file:
            record = json.loads(line.strip())
            
            if "score" in record and record["score"] == "1"\
                    and "passage-id" in record\
                    and "question-id" in record and record["question-id"] in qa_dict.keys():
                if record["passage-id"] not in qc_dict.keys():
                    qc_dict[record["passage-id"]] = []
                qc_dict[record["passage-id"]].append(record["question-id"])
    return qc_dict

def load_passages(filepath, qc_dict, qa_dict):
    entries = []
    with open(filepath, "r") as file:
        for line in file:
            record = json.loads(line.strip())
            
            if "text" in record and "_id" in record and record["_id"] in qc_dict.keys():
                qa_ids = qc_dict[record["_id"]]
                qas = []
                for qa_id in qa_ids:
                    qas.append(qa_dict[qa_id])
                entries.append(Entry(record["_id"], record["text"], qas))
    return entries

In [22]:
qa_dict = init_qas_with_answers("simple-legal-questions-pl-main/answers.jl")

qa_dict[NO_ANS] = []
match_questions_to_answers("simple-legal-questions-pl-main/questions.jl", qa_dict)

qc_dict = organize_question_to_context_relations("simple-legal-questions-pl-main/relevant.jl", qa_dict)
test_passages = load_passages("simple-legal-questions-pl-main/passages.jl", qc_dict, qa_dict)

In [19]:
def convert_poquad_data(filepath):
    val_entries = []
    with open(filepath, "r") as file:
        json_content = json.load(file)
        for data in json_content['data']:
            for paragraph in data['paragraphs']:
                qa_list = []
                for qa in paragraph['qas']:
                    answers = qa['answers'] if 'answers' in qa.keys() else qa['plausible_answers']
                    for answer in answers:
                        qa_obj = QA(None, qa['question'], answer['generative_answer'])
                        qa_list.append(qa_obj)
                    
                entry = Entry(None, paragraph['context'], qa_list)
                val_entries.append(entry)
    return val_entries

val_passages = convert_poquad_data("poquad-dev.json")

In [20]:
from tqdm import tqdm

def exec_passages(model, tokenizer, passages):
    answers = []
    expected_answers = []
    for passage in tqdm(passages):
        for qa in passage.qas:
            input_text = f"question: {qa.question} context: {passage.passage_text}"
            inputs = tokenizer(input_text, return_tensors="pt")

            outputs = model.generate(inputs["input_ids"], max_length=100, num_beams=5, early_stopping=True)

            answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
            answers.append(answer)
            expected_answers.append(qa.answer)
    return answers, expected_answers


In [36]:
len(val_passages[:1])

1

In [24]:
answers, expected_answers = exec_passages(ready_model, ready_model_tokenizer, test_passages)

100%|██████████| 557/557 [16:48<00:00,  1.81s/it]


In [53]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

my_t5_base_model_name = "./model_poquad_abstr"
my_t5_base_model_tokenizer = AutoTokenizer.from_pretrained(my_t5_base_model_name)
my_t5_base_model = AutoModelForSeq2SeqLM.from_pretrained(ready_model_name)

In [54]:
my_answers, expected_answers = exec_passages(my_t5_base_model, my_t5_base_model_tokenizer, test_passages)

100%|██████████| 557/557 [14:06<00:00,  1.52s/it]


In [32]:
answers[:6]

['tak',
 'tak',
 'tak',
 '1 stycznia',
 '5% miesięcznego wynagrodzenia zasadniczego',
 'trzymiesięcznego wynagrodzenia']

In [29]:
my_answers

['tak',
 'tak',
 'tak',
 '1 stycznia',
 '5% miesięcznego wynagrodzenia zasadniczego',
 'trzymiesięcznego wynagrodzenia']

In [31]:
expected_answers

['Tak, urzędnikiem może zostać osoba, która odbyła staż urzędniczy w prokuraturze',
 'Tak, osoba, która cieszy się nieposzlakowaną opinią, może zostać urzędnikiem (zgodnie z ustawą o pracownikach samorządowych).',
 'Tak',
 '1 stycznia każdego roku',
 'Wysokość trzymiesięcznego wynagrodzenia',
 'Pracownikowi sądu, który prawcował 15 lat w sądzie, przysługuje jednorazowa odprawa w wysokości trzymiesięcznego wynagrodzenia']

In [49]:
def calculate_exact_matches(answers, expected_answers):
    matches = 0
    for s1, s2 in zip(answers, expected_answers):
        if s1.lower() == s2.lower():
            matches += 1
    return matches/len(answers)

In [55]:
calculate_exact_matches(my_answers, expected_answers)

0.2456445993031359

In [56]:
calculate_exact_matches(answers, expected_answers)

0.2456445993031359

In [62]:
from collections import Counter
import re

# Sample data
list1 = ["the cat sat on the mat", "a dog barked loudly", "birds are singing"]
list2 = ["cat is on the mat", "a dog howled loudly", "birds are chirping"]

def tokenize(text):
    text = text.lower()
    tokens = re.split(r"[^\w]+", text)
    return tokens

# Compute confusion matrix and F1 score based on token counts
def compute_single_f1(tokens1, tokens2):
    TP = sum((tokens1 & tokens2).values())
    FP = sum((tokens1 - tokens2).values())
    FN = sum((tokens2 - tokens1).values())

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1

def compute_f1(answers, expected_answers, tokenize_fun):

    f1_scores = []
    for s1, s2 in zip(answers, expected_answers):
        s1_t = Counter(tokenize_fun(s1))
        s2_t = Counter(tokenize_fun(s2))
        f1 = compute_single_f1(s1_t, s2_t)
        f1_scores.append(f1)

    return sum(f1_scores)/len(f1_scores)


In [64]:
compute_f1(answers, expected_answers, tokenize)

0.5184004475490456

In [None]:
compute_f1(my_answers, expected_answers, tokenize)