In [2]:
from os import listdir
from os.path import isfile, join

def get_all_files_from_path(mypath):
    filenames = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f))]
    return filenames

from bs4 import BeautifulSoup
import re
import json

def get_article(articles):
    result = {}
    current_statue = "(non-statute)"
    for i in re.split(r"(.*)", articles.strip()):
        if len(i) == 0 or i == "\n":
            continue
        if re.search(r"^\(.*\)$", i):
            current_statue = i.strip()
            if current_statue not in result:
                result.update({current_statue: []})
        else:
            if current_statue not in result:
                result.update({current_statue: []})
            result[current_statue].append(i)
    return result

def build_test(filename):
    result = {}
    with open(filename, 'r') as f:
        data = f.read()

    data = BeautifulSoup(data, "xml").find_all('pair')
    for i in data:
        id = i.get('id')
        result.update({id: {}})
        result[id].update({"label": i.get('label')})
        articles = i.find('t1').text.strip()
        # articles = get_article(articles)
        result[id].update({"result": articles})
        result[id].update({"content": i.find('t2').text.strip()})
    return result

def write_json(filename, data):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

import xml.etree.ElementTree as Et
import glob

def format_first_line(text):
    lines = text.split("\n")
    results = []
    for line in lines:
        if line[0] == "":
            continue
        if line[0] == "(" and line[-1] == ")":
            continue
        results.append(line)
    return "\n".join(results)

def load_samples(filexml):
    # try:
    tree = Et.parse(filexml)
    root = tree.getroot()
    samples = []
    for i in range(0, len(root)):
        sample = {'result': []}
        for j, e in enumerate(root[i]):
            if e.tag == "t1":
                sample['result'] = format_first_line(e.text.strip())
            elif e.tag == "t2":
                question = e.text.strip()
                sample['content'] = question if len(question) > 0 else None
        sample.update(
            {'index': root[i].attrib['id'], 'label': root[i].attrib.get('label', "N")})
        # filter the noise samples
        if sample['content'] is not None:
            samples.append(sample)
        else:
            print("[Important warning] samples {} is ignored".format(sample))
    return samples

def load_test_data_samples(path_folder_base, test_id):
    data = []
    test = load_samples(f"{path_folder_base}/riteval_{test_id}.xml")
    for file_path in glob.glob(f"{path_folder_base}/riteval_{test_id}.xml"):
        data = data + load_samples(file_path)
    return data


def load_all_data_samples(path_folder_base):
    data = []
    for file_path in glob.glob("{}/*.xml".format(path_folder_base)):
        data = data + load_samples(file_path)
    return data

def check_false_labels(pred, false_labels):
	for label in false_labels:
		if label in pred:
			return True
	return False

from tqdm import tqdm

def format_output(text):
	CLEANR = re.compile('<.*?>') 
	cleantext = re.sub(CLEANR, '', text)
	return cleantext.strip().lower()

def readfile(filename):
    f = open(filename)
    data = json.load(f)
    return data

In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    TextStreamer,
    pipeline,
)

MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir="/home/congnguyen/drive/.cache")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, cache_dir="/home/congnguyen/drive/.cache", torch_dtype=torch.float16, 
    load_in_4bit=True, 
)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 19/19 [05:19<00:00, 16.84s/it]


In [33]:
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1
generation_config.temperature = 0.0001
generation_config.do_sample = True
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    streamer=streamer,
)

In [17]:
test_file = "../data/COLIEE2024statute_data-English/fewshot/riteval_R04_en.xml"
data = load_samples(test_file)
# data

In [4]:
def format_prompt(prompt, system_prompt=""):
    if system_prompt.strip():
        return f"[INST] {system_prompt} {prompt} [/INST]"
    return f"[INST] {prompt} [/INST]"

In [52]:
result = {}
count = 0
for item in data:
    id = item['index']
    context = item['result'].replace("\n", " ").strip()
    query = item['content'].replace("\n", " ").strip()
    label = item['label']
    prompt = "(No explain textual entailment) Premise: {{premise}} Hypothesis: {{hypothesis}} Yes or No"
    prompt = format_prompt(prompt.replace("{{premise}}", context).replace("{{hypothesis}}", query))
    # prompt = format_prompt(prompt)
    # inputs = tokenizer(prompt, return_tensors="pt")
    # outputs = model.generate(**inputs, max_new_tokens=128)
    # out = format_output(tokenizer.decode(outputs[0], skip_special_tokens=True).replace(prompt, ""))
    response = llm(format_prompt(prompt.strip()))
    response = format_output(response[0]['generated_text'].replace(prompt.strip(), ""))
    # result.update({id: out})
    # if count < 1:
    #     print(prompt)
    answer = "N"
    if "yes" in response or "true" in response:
        answer = "Y"
    
    if answer == label:
        count += 1
    # print(out)
    print(id, ": ", answer, ", ", label)
    print("====================================")



[INST] [INST] (No explain textual entailment) Premise: Article 5 (1) A minor must obtain the consent of the minor's legal representative to perform a juridical act;provided, however, that this does not apply to a juridical act for merely acquiring a right or being released from an obligation. (2) A juridical act in contravention of the provisions of the preceding paragraph is voidable. Article 6 (1) A minor who is permitted to conduct one or multiple types of business has the same capacity to act as an adult as far as that business is concerned. Hypothesis: A juridical act by a minor who is permitted to conduct business may not be rescinded even if the judicial act does not concern the business. Yes or No [/INST] [/INST] Based
R04-01-I :  N ,  N
Yes
R04-01-U :  Y ,  N
Yes
R04-01-E :  Y ,  Y
No
R04-02-A :  N ,  N
Yes
R04-02-U :  Y ,  N
Yes
R04-02-E :  Y ,  Y
No
R04-03-A :  N ,  N
No
R04-03-I :  N ,  N
Yes
R04-03-U :  Y ,  Y
No
R04-03-E :  N ,  N
Yes
R04-03-O :  Y ,  Y
Based
R04-04-A :  

In [53]:
count

70

In [32]:
f = open("../data/infer/R04_mistral.xml", "w")
for k in result:
    if "Yes" in result[k]:
        f.write(k + " Y " + "JNLP"+"\n")
    elif "No" in result[k]:
        f.write(k + " N " + "JNLP"+"\n")