In [1]:
test_file = "../data/COLIEE2024statute_data-English/train/riteval_R04_en.xml"
dev_file = "../data/COLIEE2024statute_data-English/train/riteval_R03_en.xml"

In [2]:
from os import listdir
from os.path import isfile, join

def get_all_files_from_path(mypath):
    filenames = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f))]
    return filenames

from bs4 import BeautifulSoup
import re
import json

def get_article(articles):
    result = {}
    current_statue = "(non-statute)"
    for i in re.split(r"(.*)", articles.strip()):
        if len(i) == 0 or i == "\n":
            continue
        if re.search(r"^\(.*\)$", i):
            current_statue = i.strip()
            if current_statue not in result:
                result.update({current_statue: []})
        else:
            if current_statue not in result:
                result.update({current_statue: []})
            result[current_statue].append(i)
    return result

def build_test(filename):
    result = {}
    with open(filename, 'r') as f:
        data = f.read()

    data = BeautifulSoup(data, "xml").find_all('pair')
    for i in data:
        id = i.get('id')
        result.update({id: {}})
        result[id].update({"label": i.get('label')})
        articles = i.find('t1').text.strip()
        # articles = get_article(articles)
        result[id].update({"result": articles})
        result[id].update({"content": i.find('t2').text.strip()})
    return result

def write_json(filename, data):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

import xml.etree.ElementTree as Et
import glob

def format_first_line(text):
    lines = text.split("\n")
    results = []
    for line in lines:
        if line[0] == "":
            continue
        if line[0] == "(" and line[-1] == ")":
            continue
        results.append(line)
    return "\n".join(results)

def load_samples(filexml):
    # try:
        tree = Et.parse(filexml)
        root = tree.getroot()
        samples = []
        for i in range(0, len(root)):
            sample = {'result': []}
            for j, e in enumerate(root[i]):
                if e.tag == "t1":
                    sample['result'] = format_first_line(e.text.strip())
                elif e.tag == "t2":
                    question = e.text.strip()
                    sample['content'] = question if len(question) > 0 else None
            sample.update(
                {'index': root[i].attrib['id'], 'label': root[i].attrib.get('label', "N")})
            # filter the noise samples
            if sample['content'] is not None:
                samples.append(sample)
            else:
                print("[Important warning] samples {} is ignored".format(sample))
        return samples

def load_test_data_samples(path_folder_base, test_id):
    data = []
    test = load_samples(f"{path_folder_base}/riteval_{test_id}.xml")
    for file_path in glob.glob(f"{path_folder_base}/riteval_{test_id}.xml"):
        data = data + load_samples(file_path)
    return data


def load_all_data_samples(path_folder_base):
    data = []
    for file_path in glob.glob("{}/*.xml".format(path_folder_base)):
        data = data + load_samples(file_path)
    return data

def check_false_labels(pred, false_labels):
	for label in false_labels:
		if label in pred:
			return True
	return False

from tqdm import tqdm

def format_output(text):
	CLEANR = re.compile('<.*?>') 
	cleantext = re.sub(CLEANR, '', text)
	return cleantext.strip().lower()

def readfile(filename):
    f = open(filename)
    data = json.load(f)
    return data

In [3]:
# def prompting(prompt="{{premise}}\nBased on the previous passage, {{hypothesis}}? ", data=None):
#     return prompt.replace("{{premise}}", data['result']).replace('{{hypothesis}}', data['content'])

In [4]:
datas = get_all_files_from_path("../data/COLIEE2024statute_data-English/train")
corpus = []
content = []
labels = []
for data in datas:
    data = load_samples(data)
    for item in data:
        corpus.append(item["result"])
        content.append(item["content"])
        labels.append(item["label"])
print(len(corpus))

1097


In [5]:
from sentence_transformers import SentenceTransformer, util
passage_encoder = SentenceTransformer('facebook-dpr-ctx_encoder-single-nq-base')

passage_embeddings = passage_encoder.encode(corpus)

query_encoder = SentenceTransformer('facebook-dpr-question_encoder-single-nq-base')


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from transformers import AutoTokenizer, BloomForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

model_name = "google/flan-t5-xxl"
cache_dir = "/home/congnguyen/drive/.cache"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(
		model_name, device_map="auto", cache_dir=cache_dir, torch_dtype=torch.float16, load_in_8bit=True
	)

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████| 5/5 [00:12<00:00,  2.42s/it]


In [7]:
from tqdm import tqdm

def format_output(text):
	CLEANR = re.compile('<.*?>') 
	cleantext = re.sub(CLEANR, '', text)
	return cleantext.strip().lower()
    
def few_shot_prompting(indexes, corpus, content, labels, prompt_template):
    result = ""
    for i in indexes:
        answer = "Yes"
        if "N" == labels[i]:
            answer = "No"
        prompt = prompt_template.replace("{{premise}}", corpus[i]).replace('{{hypothesis}}', content[i]).replace('{{answer}}', answer)
        result += prompt
    return result
    
prompt_template = "{{premise}}\nBased on the previous passage, {{hypothesis}}?\nAnswer: {{answer}}\n\n"
final_prompt = "{{premise}}\nBased on the previous passage, {{hypothesis}}?\nAnswer: "

In [8]:
data = load_samples(test_file)

In [9]:
result = []
count = 0
for item in tqdm(data):
    label = item["label"]
    query = item["result"]
    query_embedding = query_encoder.encode(query)
    #Important: You must use dot-product, not cosine_similarity
    scores = util.dot_score(query_embedding, passage_embeddings)
    indexes = torch.topk(scores, 3).indices[0]
    
    text = few_shot_prompting(indexes, corpus, content, labels, prompt_template)
    text += final_prompt.replace("{{premise}}", item["result"]).replace('{{hypothesis}}', item["content"])
    if count < 5:
        print("------------------------------")
        print(prompt_template)
        print(text)
        print(final_prompt)
    # print(text)
    inputs = tokenizer(text, return_tensors="pt")["input_ids"].cuda()
    outputs = model.generate(inputs, max_new_tokens=10)
    output_text = format_output(tokenizer.decode(outputs[0]).replace(text, "").split("\n")[-1])
    if "yes" in output_text:
        output_text = "Y"
    else:
        output_text = "N"
    if output_text == label:
        count+=1
    # print("predict label: ", output_text, "label: ", label)

count/len(data)
len(data)
count

  0%|                                                                                                                 | 0/101 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (604 > 512). Running this sequence through the model will result in indexing errors


------------------------------
{{premise}}
Based on the previous passage, {{hypothesis}}?
Answer: {{answer}}


Article 5 (1) A minor must obtain the consent of the minor's legal representative to perform a juridical act;provided, however, that this does not apply to a juridical act for merely acquiring a right or being released from an obligation.
(2) A juridical act in contravention of the provisions of the preceding paragraph is voidable.
Article 6 (1) A minor who is permitted to conduct one or multiple types of business has the same capacity to act as an adult as far as that business is concerned.
Based on the previous passage, A juridical act by a minor who is permitted to conduct business may not be rescinded even if the judicial act does not concern the business.?
Answer: No

Article 6
(1) A minor who is permitted to conduct one or multiple types of business has the same capacity to act as an adult as far as that business is concerned.
(2) In a case as referred to in the precedin

  1%|█                                                                                                        | 1/101 [00:00<01:32,  1.09it/s]

------------------------------
{{premise}}
Based on the previous passage, {{hypothesis}}?
Answer: {{answer}}


Article 5 (1) A minor must obtain the consent of the minor's legal representative to perform a juridical act;provided, however, that this does not apply to a juridical act for merely acquiring a right or being released from an obligation.
(2) A juridical act in contravention of the provisions of the preceding paragraph is voidable.
Article 120 (1) An act that is voidable on the grounds of the qualified legal capacity to act of the person who did the act may be rescinded only by the person with qualified legal capacity (in the case of an act performed by the person as a legal representative of another person with limited capacity, including that other person with limited capacity), or an agent or successor thereof, or a person who has the authority to give consent thereto.
Based on the previous passage, When a minor concludes a contract without obtaining the consent of the pers

  2%|██                                                                                                       | 2/101 [00:02<01:46,  1.08s/it]

------------------------------
{{premise}}
Based on the previous passage, {{hypothesis}}?
Answer: {{answer}}


Article 124
(1) The ratification of a voidable act does not become effective unless it is made after the circumstances that made the act voidable cease to exist and the person ratifying the act becomes aware of the right to rescind it.
(2) In the following cases, the ratification referred to in the preceding paragraph is not required to be made after the circumstances that made the act voidable cease to exist:
(i) if a legal representative or a curator or assistant of a person with qualified legal capacity ratifies the act; or
(ii) if a person with qualified legal capacity (excluding an adult ward) makes the ratification with the consent of a legal representative, curator or assistant.
Article 126
The right to rescind an act is extinguished by the operation of the prescription if it is not exercised within five years from the time when it becomes possible to ratify the act. Th

  3%|███                                                                                                      | 3/101 [00:03<01:51,  1.14s/it]

------------------------------
{{premise}}
Based on the previous passage, {{hypothesis}}?
Answer: {{answer}}


Article 9 A juridical act performed by an adult ward is voidable;provided, however, that this does not apply to the purchase of daily necessities or to any other act involved in day-to-day life.
Based on the previous passage, If an adult ward acquires land by gift, the guardian of the adult ward may not rescind the gift.?
Answer: No

Article 9
A juridical act performed by an adult ward is voidable;provided, however, that this does not apply to the purchase of daily necessities or to any other act involved in day-to-day life.
Based on the previous passage, The purchase of daily household items may not be rescinded, even if an adult ward performed the act.?
Answer: Yes

Article 9
A juridical act performed by an adult ward is voidable;provided, however, that this does not apply to the purchase of daily necessities or to any other act involved in day-to-day life.
Based on the prev

  4%|████▏                                                                                                    | 4/101 [00:03<01:31,  1.06it/s]

------------------------------
{{premise}}
Based on the previous passage, {{hypothesis}}?
Answer: {{answer}}


Article 9 A juridical act performed by an adult ward is voidable;provided, however, that this does not apply to the purchase of daily necessities or to any other act involved in day-to-day life.
Article 102 An act that a person with qualified legal capacity performs as an agent of another person may not be rescinded on the grounds of qualified legal capacity;provided, however, that this does not apply to an act performed by a person with qualified legal capacity as a legal representative of another person with qualified legal capacity.
Based on the previous passage, An act that A, an adult ward, performs as legal representative of B, a minor, may not be rescinded on the grounds of qualified legal capacity of A.?
Answer: No

Article 102
An act that a person with qualified legal capacity performs as an agent of another person may not be rescinded on the grounds of qualified lega

  5%|█████▏                                                                                                   | 5/101 [00:04<01:21,  1.17it/s]

------------------------------
{{premise}}
Based on the previous passage, {{hypothesis}}?
Answer: {{answer}}


Article 158 (2) If a minor or an adult ward has a right vis-a-vis the minor's or the adult ward's father, mother, or guardian who manages the property, the prescription period does not expire with respect to that right until six months have passed from the time when that minor or adult ward becomes a person with capacity to act, or a succeeding legal representative assumes the position.
Based on the previous passage, If A, an adult ward, has a right vis-a-vis the adult ward's guardian who manages the property, the prescription period will not expire with respect to that right until the statutory period has passed from the time when A becomes a person with capacity to act, or a succeeding legal representative assumes the position.?
Answer: Yes

Article 7
The family court may decide to commence a guardianship in respect of a person who constantly lacks the capacity to appreciate

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 101/101 [01:16<00:00,  1.32it/s]


87

# Evaluate R0-R4

In [10]:
path_file = "../data/COLIEE2024statute_data-English/train/"

In [11]:
list_prompt = readfile("../data/prompt.json")
# list_prompt

In [12]:
def prompting(premise, hypothesis, template=None):
    text = template.replace("{{premise}}", premise).replace("{{hypothesis}}", hypothesis)
    return text

def writefile(data, filename):
    # Serializing json
    json_object = json.dumps(data, indent=1)
    # Writing to sample.json
    with open(filename, "w") as outfile:
        outfile.write(json_object)
        
def few_shot_prompting(indexes, corpus, content, labels, prompt_template):
    result = ""
    for i in indexes:
        if "true or false" in prompt_template.lower():
            answer = "True"
            if "N" == labels[i]:
                answer = "False"
        else:
            answer = "Yes"
            if "N" == labels[i]:
                answer = "No"
        prompt = prompt_template.replace("{{premise}}", corpus[i]).replace('{{hypothesis}}', content[i]).replace('{{answer}}', answer)
        result += prompt
    return result

In [1]:
def predict(model, tokenizer, path_file, passage_embeddings, files=["riteval_R01_en","riteval_R02_en","riteval_R03_en","riteval_R04_en"], output="../output/accuracy2/newpromt_"):
    for file in files:
        test_file = path_file+file+".xml"
        f = open(path_file+file+".txt", "w", encoding="utf-8")
        data = load_samples(test_file)
        
        acc = {}
        for template_prompt in list_prompt:
            template_prompt = template_prompt["prompt"]
            query_prompt = template_prompt+"\nAnswer: "
            prompt_template = template_prompt+"\nAnswer: {{answer}}\n\n"
            
            result = []
            count = 0
            for item in tqdm(data):
                label = item["label"]
                hypothesis = item["content"]
                premise = item["result"]
                #Important: You must use dot-product, not cosine_similarity
                query_embedding = query_encoder.encode(premise)
                scores = util.dot_score(query_embedding, passage_embeddings)
                indexes = torch.topk(scores, 3).indices[0]
                few_shot = few_shot_prompting(indexes, corpus, content, labels, prompt_template)
                text = few_shot + prompting(premise, hypothesis, query_prompt)
                #############################################
                inputs = tokenizer(text, return_tensors="pt")["input_ids"].cuda()
                outputs = model.generate(inputs, max_new_tokens=10)
                output_text = format_output(tokenizer.decode(outputs[0]).replace(text, "").split("\n")[-1])
                f.write(text)
                f.write("\n3-shot answer:"+output_text+"\n")
                f.write("=========================================\n")
                if count < 1:
                    print(text)
                # if count < 3:
                #     print(output_text)
                if "yes" in output_text or "true" in output_text:
                    output_text = "Y"
                else:
                    output_text = "N"
                if output_text == label:
                    count+=1
                # print("predict label: ", output_text, "label: ", label)
            # print("=======================================")
            # print(template_prompt)
            # print(count, "/", len(data))
            acc.update({template_prompt: count/len(data)})
            print(acc)
        writefile(acc, output+file+".json")
        f.close()
predict(model, tokenizer, path_file, passage_embeddings, ["riteval_R04_en"], "../output/newpromt_")

NameError: name 'model' is not defined