In [1]:
import json
import os
# Load the RAGTruth test split. The encoding is given explicitly so that
# decoding does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 (the standard JSON encoding) — confirm.
with open("../dataset/rag_truth_test.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

In [2]:
# Peek at the first record to see the fields each test item carries.
test_data[0]

{'ref': 'The FBI charged a Philadelphia woman on Thursday with trying to travel overseas to fight for ISIS. She\'s one of three women arrested this week on terror charges. Two New York women were also taken into custody. An FBI complaint cites numerous social media messages dating back to August 2013 that were sent by Keonna Thomas, 30, also known as "Young Lioness" and "Fatayat Al Khilafah." One Twitter message said, "If we truly knew the realities ... we all would be rushing to join our brothers in the front lines pray ALLAH accept us as shuhada [martyrs]." Another said, "When you\'re a mujahid [violent jihadi fighter] your death becomes a wedding." The FBI said Thomas purchased an electronic visa to Turkey on March 23. Turkey is known as the easiest place from which to enter Syria and join ISIS. An ISIS manual advises recruits to buy round-trip tickets to vacation spots such as Spain and then purchase tickets for their real destination once they arrive overseas, the FBI said. On Mar

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Seed torch's global RNG before anything else touches it.
torch.random.manual_seed(0)

# One name for the checkpoint so the model and tokenizer cannot drift apart.
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"

model_kwargs = dict(
    device_map="cuda",
    torch_dtype="auto",
    # to avoid the error "AttributeError: 'DynamicCache' object has no attribute 'get_max_length'"
    trust_remote_code=False,
)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.30s/it]


In [None]:
from tqdm import tqdm

# Raw generated strings, one per test item, in dataset order.
outputs = []

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Greedy decoding. `temperature` is deliberately not passed: it is only used
# when sampling, and combining it with do_sample=False makes transformers warn
# about an unused generation flag without changing the generated text.
generation_args = {
    "max_new_tokens": 5,
    "return_full_text": False,
    "do_sample": False,
}

# Loop-invariant instruction appended directly after the reference document in
# the system message (text kept exactly as in the original prompt).
JUDGE_INSTRUCTION = (
    "Please judge the following Text whether it includes hallucination or not based on the Input_Document above and output 1 if it includes hallucination and 0 if not. Output should be only an number (1 or 0). You mustn't output any description other than a number."
)

for d in tqdm(test_data):
    # System turn: reference document + instruction; user turn: the text to judge.
    messages = [
        {"role": "system", "content": "Input_Document:" + d["ref"] + JUDGE_INSTRUCTION},
        {"role": "user", "content": "Text:" + d["text"] + "Output:"},
    ]
    output = pipe(messages, **generation_args)
    # With return_full_text=False only the newly generated continuation is kept.
    outputs.append(output[0]['generated_text'])
    

In [7]:
# How many generations were collected.
n_outputs = len(outputs)
print(n_outputs)
# Distinct raw generations: verify by eye that each one is just a 0 or a 1
# (possibly with surrounding whitespace) before parsing them.
distinct_outputs = set(outputs)
distinct_outputs

2208


{' 0', ' 1'}

In [8]:
def _parse_label(raw):
    """Turn one raw generation (e.g. " 0" / " 1") or an already-parsed int into 0 or 1.

    Raises:
        ValueError: if the value is not exactly a 0/1 label once whitespace is stripped.
    """
    # Parse the whole stripped string instead of indexing character 1, so a
    # missing leading space or extra characters fail loudly instead of being
    # silently misread.
    label = int(str(raw).strip())
    if label not in (0, 1):
        raise ValueError(f"expected a 0/1 label, got {raw!r}")
    return label


# Gold labels, in dataset order.
answers = [d["labels"] for d in test_data]
# Convert the raw generations to ints. Going through str() keeps this cell
# idempotent: re-running it on already-converted ints is a no-op.
outputs = [_parse_label(o) for o in outputs]

In [None]:
from collections import Counter
# Tally how many items received each predicted value.
count = Counter()
count.update(outputs)

print(count)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Accuracy, F1, precision and recall (in that order) of the predictions
# against the gold labels, displayed as a single tuple.
metric_fns = (accuracy_score, f1_score, precision_score, recall_score)
tuple(metric_fn(answers, outputs) for metric_fn in metric_fns)

In [11]:
import os

# Reuse the accumulated results file when it exists; otherwise start a fresh
# list of records (position id, gold label, task type) built from the test set.
results_path = "../results_test.json"
if not os.path.exists(results_path):
    data = [
        {"id": i, "label": d["labels"], "task": d["task_type"]}
        for i, d in enumerate(test_data)
    ]
else:
    with open(results_path, "r") as results_file:
        data = json.load(results_file)

len(data)

2208

In [12]:
# Store this model's prediction on each result record under the "phi3.5" key.
# Records and predictions are matched purely by position, so refuse to proceed
# (before mutating anything) when the two lists differ in length, e.g. because
# an existing results file was produced from a different dataset version.
if len(data) != len(outputs):
    raise ValueError(
        f"`data` has {len(data)} records but there are {len(outputs)} predictions"
    )
for record, prediction in zip(data, outputs):
    record["phi3.5"] = prediction

In [None]:
# Inspect the first result record after the "phi3.5" field has been added.
data[0]

In [15]:
# Write the updated records back, overwriting any previous results file.
with open("../results_test.json", "w") as results_file:
    json.dump(obj=data, fp=results_file, indent=4)