## Evaluation

In [1]:
import json

# Read the test split from disk and parse it as JSON.
with open("dataset/rag_truth_test.json", "r") as test_file:
    test_data = json.load(test_file)


In [2]:
def add_prefix(data, prefix="Please judge the following statement whether it includes hallucination or not (based on the references above): "):
    """Return copies of the samples with the judging instruction prepended to ``text``.

    Args:
        data: Iterable of sample dicts, each holding a string under ``"text"``.
        prefix: Instruction to prepend. Defaults to the prompt used in this notebook.

    Returns:
        A new list of shallow-copied dicts. The input dicts are not modified, and a
        text that already starts with ``prefix`` is left as is, so re-running the
        calling cell does not stack the instruction several times.
    """
    prefixed = []
    for d in data:
        sample = dict(d)  # shallow copy keeps the caller's dict untouched
        if not sample["text"].startswith(prefix):
            sample["text"] = prefix + sample["text"]
        prefixed.append(sample)
    return prefixed


# Prepend the judging instruction to the text of every test sample.
test_data = add_prefix(test_data)
# Preview the first sample.
test_data[0]



{'ref': 'The FBI charged a Philadelphia woman on Thursday with trying to travel overseas to fight for ISIS. She\'s one of three women arrested this week on terror charges. Two New York women were also taken into custody. An FBI complaint cites numerous social media messages dating back to August 2013 that were sent by Keonna Thomas, 30, also known as "Young Lioness" and "Fatayat Al Khilafah." One Twitter message said, "If we truly knew the realities ... we all would be rushing to join our brothers in the front lines pray ALLAH accept us as shuhada [martyrs]." Another said, "When you\'re a mujahid [violent jihadi fighter] your death becomes a wedding." The FBI said Thomas purchased an electronic visa to Turkey on March 23. Turkey is known as the easiest place from which to enter Syria and join ISIS. An ISIS manual advises recruits to buy round-trip tickets to vacation spots such as Spain and then purchase tickets for their real destination once they arrive overseas, the FBI said. On Mar

In [3]:
# task_type: QA, Data2txt, Summary
task_name = "Summary"
# Keep only the samples of the selected task. test_data is overwritten here, so after
# changing task_name the notebook has to be re-run from the loading cell.
test_data = list(filter(lambda sample: sample["task_type"] == task_name, test_data))

In [4]:
from datasets import Dataset, DatasetDict
import pandas as pd


# Convert the list of sample dicts into a DataFrame, then into a datasets.Dataset.
test_df = pd.DataFrame(test_data)
test_ds = Dataset.from_pandas(test_df)

# The only split defined here is "test"; later cells must index that same key.
raw_datasets = DatasetDict({"test":test_ds})
raw_datasets

DatasetDict({
    dev: Dataset({
        features: ['ref', 'text', 'labels', 'source', 'model', 'task_type', 'source_id'],
        num_rows: 900
    })
})

### no_doc

In [5]:
from transformers import AutoModel, AutoTokenizer
from models_phi import NoDocModel

# Load the Phi-3.5-mini-instruct backbone from the Hugging Face hub.
base_model = AutoModel.from_pretrained("microsoft/Phi-3.5-mini-instruct")
# load trained model
# NOTE(review): NoDocModel is project code (models_phi); presumably this restores the
# fine-tuned weights stored under `name` on top of the backbone - confirm.
name = "./trained/no_doc_phi"
model = NoDocModel.from_pretrained(base_model,name)
# The tokenizer is read from the same directory as the trained model.
tokenizer = AutoTokenizer.from_pretrained(name)


2025-01-23 13:24:49.871431: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-23 13:24:50.268171: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-23 13:24:50.448942: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-23 13:24:50.449335: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-23 13:24:50.779741: I tensorflow/core/platform/cpu_feature_gua

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
import torch

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [8]:
from transformers import DataCollatorWithPadding

def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch, truncating to at most 512 tokens."""
    statements = examples["text"]
    return tokenizer(statements, truncation=True, max_length=512)


# Tokenize in batches and drop the raw text column once it has been encoded.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True).remove_columns(["text"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

In [9]:
tokenized_datasets

DatasetDict({
    dev: Dataset({
        features: ['ref', 'labels', 'source', 'model', 'task_type', 'source_id', 'input_ids', 'attention_mask'],
        num_rows: 900
    })
})

In [10]:
# One record per test sample; the model predictions are attached to these dicts later.
results = [
    {"id": idx, "label": row["labels"], "task": row["task_type"]}
    for idx, row in enumerate(tokenized_datasets["test"])
]

results[0]

{'id': 0, 'label': 0, 'task': 'Summary'}

In [11]:
# inference
import torch
from tqdm import tqdm

model.eval()

# NOTE(review): a recorded run of this cell failed with a cpu/cuda device-mismatch
# RuntimeError raised inside the model. Both inputs are moved to `device` here, so the
# stray CPU tensor presumably lives inside NoDocModel - confirm in models_phi.

# Wrap the dataset itself (not the enumerate object) so tqdm can read its length and
# display a real progress bar with a total.
for i, d in enumerate(tqdm(tokenized_datasets["test"])):
    # Add a leading batch dimension of 1 and move the inputs to the model's device.
    input_ids = torch.tensor(d["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.tensor(d["attention_mask"]).unsqueeze(0).to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        predicted_index = torch.argmax(logits, dim=-1)  # predicted label
    # Store NumPy copies on the CPU; they are converted to lists before saving.
    results[i]["no_rag_logits"] = logits.cpu().numpy()
    results[i]["no_rag_label"] = predicted_index.cpu().numpy()
        
    


0it [00:00, ?it/s]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_bmm)

### with_doc

In [11]:
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding
import torch
from models_phi import WithDocModel

# Load the Phi-3.5-mini-instruct backbone from the Hugging Face hub.
base_model = AutoModel.from_pretrained("microsoft/Phi-3.5-mini-instruct")

# NOTE(review): WithDocModel is project code (models_phi); presumably this restores the
# fine-tuned weights stored under `name` on top of the backbone - confirm.
name = "./trained/with_doc_phi"
model = WithDocModel.from_pretrained(base_model,name)
# The tokenizer is read from the same directory as the trained model.
tokenizer = AutoTokenizer.from_pretrained(name)

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


def tokenize_function(examples):
    """Tokenize references and statements separately, each truncated to 512 tokens."""
    encoded = {
        field: tokenizer(examples[field], truncation=True, max_length=512)
        for field in ("ref", "text")
    }
    return {
        "ref_input_ids": encoded["ref"]["input_ids"],
        "ref_attention_mask": encoded["ref"]["attention_mask"],
        "text_input_ids": encoded["text"]["input_ids"],
        "text_attention_mask": encoded["text"]["attention_mask"],
    }

# Tokenize in batches and drop the raw string columns once they have been encoded.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True).remove_columns(["text","ref"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [12]:
import torch
from tqdm import tqdm

model.eval()

# raw_datasets only defines a "test" split, so that is the split to iterate; it is also
# the split the `results` records were built from, which keeps index i aligned.
# Wrapping the dataset itself lets tqdm read its length and show a total.
for i, d in enumerate(tqdm(tokenized_datasets["test"])):
    # Reference document and statement are encoded separately and passed as a pair.
    ref_input_ids = torch.tensor(d["ref_input_ids"]).unsqueeze(0).to(device)
    text_input_ids = torch.tensor(d["text_input_ids"]).unsqueeze(0).to(device)
    input_ids = [ref_input_ids, text_input_ids]

    ref_attention_mask = torch.tensor(d["ref_attention_mask"]).unsqueeze(0).to(device)
    text_attention_mask = torch.tensor(d["text_attention_mask"]).unsqueeze(0).to(device)
    attention_mask = [ref_attention_mask, text_attention_mask]

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs["logits"]
        predicted_index = torch.argmax(logits, dim=-1)  # predicted label

    # Store NumPy copies on the CPU; they are converted to lists before saving.
    results[i]["rag_logits"] = logits.cpu().numpy()
    results[i]["rag_label"] = predicted_index.cpu().numpy()
        

2700it [12:19,  3.65it/s]


### triplet

In [46]:
# triplet loss is not calculated during inference, so the other data is dummy ("hallucination"/"not hallucination")

def create_trip_ver2(data):
    """Turn labelled samples into (anchor, positive, negative) triplet records.

    The reference is always the anchor. A sample labelled 0 places its text in the
    positive slot with the dummy string "hallucination" as negative; any other label
    places the text in the negative slot with "not hallucination" as positive and is
    recorded with label 1.
    """
    def to_triplet(sample):
        if sample["labels"] == 0:  # not hallucination
            return {"anchor": sample["ref"], "positive": sample["text"], "negative": "hallucination", "labels": 0}
        return {"anchor": sample["ref"], "positive": "not hallucination", "negative": sample["text"], "labels": 1}

    return [to_triplet(sample) for sample in data]
        

In [47]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Build the triplet records from the test samples.
test_trip = create_trip_ver2(test_data)

# Convert the list of triplet dicts into a DataFrame, then into a datasets.Dataset.
test_df = pd.DataFrame(test_trip)
test_ds = Dataset.from_pandas(test_df)

# Rebinds raw_datasets: from here on it holds the triplet columns under the "test" split.
raw_datasets = DatasetDict({"test":test_ds})
raw_datasets

DatasetDict({
    dev: Dataset({
        features: ['anchor', 'positive', 'negative', 'labels'],
        num_rows: 2700
    })
})

In [48]:
import torch.nn.functional as F
import torch
import torch.nn as nn

def triplet_loss(anchor_output, positive_output, negative_output, positive_logits, negative_logits):
    """Compute the classification loss and the cosine-distance triplet loss.

    Args:
        anchor_output: Embeddings of the anchors.
        positive_output: Embeddings of the positives; its first dimension gives the
            number of positive targets.
        negative_output: Embeddings of the negatives; its first dimension gives the
            number of negative targets.
        positive_logits: Class logits for the positives (target class 0).
        negative_logits: Class logits for the negatives (target class 1).

    Returns:
        Tuple ``(classification_loss, triplet_loss)``. The classification loss is the
        mean of the two cross-entropy terms; the triplet loss uses margin 1.0 with
        ``1 - cosine_similarity`` as the distance.
    """
    # Create the targets on the logits' own device instead of relying on a notebook-level
    # `device` variable, so the function works no matter which cell ran before it.
    positive_targets = torch.zeros(
        positive_output.size(0), dtype=torch.long, device=positive_logits.device
    )  # not hallucination
    negative_targets = torch.ones(
        negative_output.size(0), dtype=torch.long, device=negative_logits.device
    )  # hallucination

    cross_entropy = nn.CrossEntropyLoss()
    positive_loss = cross_entropy(positive_logits, positive_targets)
    negative_loss = cross_entropy(negative_logits, negative_targets)

    classification_loss = (positive_loss + negative_loss) / 2.0

    triplet_loss_fn = nn.TripletMarginWithDistanceLoss(
        margin=1.0,
        distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y),
    )
    # Named differently from the function so the local does not shadow it.
    triplet_loss_value = triplet_loss_fn(anchor_output, positive_output, negative_output)

    return classification_loss, triplet_loss_value

In [49]:
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding
import torch
from models_phi import TripletModel


# Load the Phi-3.5-mini-instruct backbone from the Hugging Face hub.
base_model = AutoModel.from_pretrained("microsoft/Phi-3.5-mini-instruct")

# NOTE(review): TripletModel is project code (models_phi); presumably this restores the
# fine-tuned weights stored under `name` and keeps triplet_loss as its loss - confirm.
name = "./trained/triplet_phi"
model = TripletModel.from_pretrained(base_model, triplet_loss, name)
# The tokenizer is read from the same directory as the trained model.
tokenizer = AutoTokenizer.from_pretrained(name)

# Use the GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


def tokenize_function(examples):
    """Tokenize the anchor, positive and negative columns, each truncated to 512 tokens."""
    encoded = {
        role: tokenizer(examples[role], truncation=True,max_length=512)
        for role in ("anchor", "positive", "negative")
    }

    return {
        "anchor_input_ids": encoded["anchor"]["input_ids"],
        "anchor_attention_mask": encoded["anchor"]["attention_mask"],
        "positive_input_ids": encoded["positive"]["input_ids"],
        "positive_attention_mask": encoded["positive"]["attention_mask"],
        "negative_input_ids": encoded["negative"]["input_ids"],
        "negative_attention_mask": encoded["negative"]["attention_mask"],
    }


# Tokenize in batches and drop the raw string columns once they have been encoded.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True).remove_columns(["anchor", "positive", "negative"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

In [50]:
import torch
from tqdm import tqdm

model.eval()

# Wrap the dataset itself (not the enumerate object) so tqdm can read its length and
# display a real progress bar with a total.
for i, d in enumerate(tqdm(tokenized_datasets["test"])):

    # create_trip_ver2 places the real statement in the "positive" slot for label 0 and
    # in the "negative" slot for label 1; the other slot holds a dummy string.
    # NOTE(review): the gold label therefore decides which logits are read below.
    is_hallucination = d["labels"] == 1

    # Add a leading batch dimension of 1 and move every input to the model's device.
    anchor_input_ids = torch.tensor(d["anchor_input_ids"], dtype=torch.long).unsqueeze(0).to(device)
    positive_input_ids = torch.tensor(d["positive_input_ids"], dtype=torch.long).unsqueeze(0).to(device)
    negative_input_ids = torch.tensor(d["negative_input_ids"], dtype=torch.long).unsqueeze(0).to(device)
    input_ids = [anchor_input_ids, positive_input_ids, negative_input_ids]
    anchor_attention_mask = torch.tensor(d["anchor_attention_mask"], dtype=torch.long).unsqueeze(0).to(device)
    positive_attention_mask = torch.tensor(d["positive_attention_mask"], dtype=torch.long).unsqueeze(0).to(device)
    negative_attention_mask = torch.tensor(d["negative_attention_mask"], dtype=torch.long).unsqueeze(0).to(device)
    attention_mask = [anchor_attention_mask, positive_attention_mask, negative_attention_mask]

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        if is_hallucination:
            logits = outputs.logits[1]  # negative_logit
        else:
            logits = outputs.logits[0]  # positive_logit
        predicted_index = torch.argmax(logits, dim=-1)
    # Store NumPy copies on the CPU; they are converted to lists before saving.
    results[i]["triplet_logits"] = logits.cpu().numpy()
    results[i]["triplet_label"] = predicted_index.cpu().numpy()
    
    
        

2700it [15:29,  2.90it/s]


In [38]:
results[-1]

{'id': 2699,
 'label': 0,
 'task': 'QA',
 'no_rag_logits': [[5.56593656539917, -5.5176544189453125]],
 'no_rag_label': [0],
 'rag_logits': [[2.7037570476531982, -1.0352840423583984]],
 'rag_label': [0],
 'triplet_logits': [[1.4126927852630615, -0.585303544998169]],
 'triplet_label': [0]}

In [23]:
# save results
import numpy as np
import json

# Make every record JSON-serializable in place. Converting whatever NumPy values are
# present (instead of naming six fixed keys) means results can still be saved when one
# of the inference cells did not run and its keys are missing.
for result in results:
    for key, value in list(result.items()):
        if isinstance(value, np.ndarray):
            result[key] = value.tolist()
        elif isinstance(value, np.generic):
            # NumPy scalars are not JSON-serializable either; unwrap to Python scalars.
            result[key] = value.item()


with open("check_improve_phi.json", "w") as f:
    json.dump(results, f)

## View results

In [5]:
import json

# Reload the saved predictions so the analysis below can run without redoing inference.
with open('check_improve_phi.json',"r") as results_file:
    results = json.load(results_file)
    

In [6]:
# compare no_doc and triplet
#
# Each sample is bucketed by (no_doc prediction, triplet prediction, gold label);
# the list name spells out those three digits in that order.
buckets = {
    (no_doc_pred, triplet_pred, gold): []
    for no_doc_pred in (0, 1)
    for triplet_pred in (0, 1)
    for gold in (0, 1)
}

for result in results:
    key = (result["no_rag_label"][0], result["triplet_label"][0], result["label"])
    # Combinations outside 0/1 are ignored.
    if key in buckets:
        buckets[key].append(result["id"])

id_000 = buckets[(0, 0, 0)]  # both correctly raise no detection
id_001 = buckets[(0, 0, 1)]  # both fail to detect
id_010 = buckets[(0, 1, 0)]  # over-detection only in triplet
id_011 = buckets[(0, 1, 1)]  # detected only by triplet
id_100 = buckets[(1, 0, 0)]  # correctly not detected only by triplet
id_101 = buckets[(1, 0, 1)]  # missed only by triplet
id_110 = buckets[(1, 1, 0)]  # both over-detect
id_111 = buckets[(1, 1, 1)]  # both detect


# 000:972, 001:181, 010:502, 011:169, 100:71, 101:80, 110:212, 111:513
print(len(id_000),len(id_001),len(id_010),len(id_011),len(id_100),len(id_101),len(id_110),len(id_111))

972 181 502 169 71 80 212 513


### Hallucination type

In [5]:
import json

# Load the span-annotated test, train and dev splits, in that order.
with open("dataset/rag_truth_span_test.json", "r") as test_file:
    test_data_span = json.load(test_file)
with open("dataset/rag_truth_span_train.json", "r") as train_file:
    train_data_span = json.load(train_file)
with open("dataset/rag_truth_span_dev.json", "r") as dev_file:
    dev_data_span = json.load(dev_file)


In [4]:
test_data_span[0]

{'ref': 'The FBI charged a Philadelphia woman on Thursday with trying to travel overseas to fight for ISIS. She\'s one of three women arrested this week on terror charges. Two New York women were also taken into custody. An FBI complaint cites numerous social media messages dating back to August 2013 that were sent by Keonna Thomas, 30, also known as "Young Lioness" and "Fatayat Al Khilafah." One Twitter message said, "If we truly knew the realities ... we all would be rushing to join our brothers in the front lines pray ALLAH accept us as shuhada [martyrs]." Another said, "When you\'re a mujahid [violent jihadi fighter] your death becomes a wedding." The FBI said Thomas purchased an electronic visa to Turkey on March 23. Turkey is known as the easiest place from which to enter Syria and join ISIS. An ISIS manual advises recruits to buy round-trip tickets to vacation spots such as Spain and then purchase tickets for their real destination once they arrive overseas, the FBI said. On Mar

In [5]:
from collections import defaultdict

# Per-task tallies of hallucination label types over the whole span-annotated test split.
hal_type_qa = defaultdict(int)
hal_type_d2t = defaultdict(int)
hal_type_sum = defaultdict(int)

# Any task type other than QA / Data2txt falls through to the summary tally.
type_tally_by_task = {"QA": hal_type_qa, "Data2txt": hal_type_d2t}

for d in test_data_span:
    annotations = d["hallucination_id"]
    if annotations == []:
        continue
    tally = type_tally_by_task.get(d["task_type"], hal_type_sum)
    # One count per annotation, so a sample with several annotations counts several times.
    for annotation in annotations:
        tally[annotation["label_type"]] += 1

for tally in (hal_type_qa, hal_type_d2t, hal_type_sum):
    print(dict(tally))

{'Evident Baseless Info': 156, 'Evident Conflict': 30, 'Subtle Baseless Info': 49}
{'Evident Conflict': 489, 'Evident Baseless Info': 465, 'Subtle Baseless Info': 95, 'Subtle Conflict': 5}
{'Evident Conflict': 100, 'Evident Baseless Info': 117, 'Subtle Conflict': 11, 'Subtle Baseless Info': 16}


In [6]:
from collections import defaultdict

# Same per-task tally of label types, restricted to the ids in id_001 + id_011
# (gold label 1 with a no_doc prediction of 0).
# NOTE(review): the per-model cell further down selects id_011 + id_111 instead -
# confirm which subset is intended for the "_imp" statistics.
# NOTE(review): ids are positions in the evaluated test list; they index test_data_span
# correctly only if that list was not task-filtered and shares its order - confirm.
hal_type_qa_imp = defaultdict(int)
hal_type_d2t_imp = defaultdict(int)
hal_type_sum_imp = defaultdict(int)

# Any task type other than QA / Data2txt falls through to the summary tally.
imp_tally_by_task = {"QA": hal_type_qa_imp, "Data2txt": hal_type_d2t_imp}

for i in id_001 + id_011:
    entry = test_data_span[i]
    annotations = entry["hallucination_id"]
    if annotations == []:
        continue
    tally = imp_tally_by_task.get(entry["task_type"], hal_type_sum_imp)
    for annotation in annotations:
        tally[annotation["label_type"]] += 1

for tally in (hal_type_qa_imp, hal_type_d2t_imp, hal_type_sum_imp):
    print(dict(tally))

{'Evident Conflict': 23, 'Evident Baseless Info': 63, 'Subtle Baseless Info': 18}
{'Evident Conflict': 61, 'Evident Baseless Info': 73, 'Subtle Baseless Info': 19, 'Subtle Conflict': 2}
{'Evident Conflict': 82, 'Subtle Conflict': 9, 'Evident Baseless Info': 86, 'Subtle Baseless Info': 6}


In [7]:
# QA samples from id_011 that carry at least one "Evident Conflict" annotation.
sample = [
    test_data_span[i]
    for i in id_011
    if test_data_span[i]["task_type"] == "QA"
    and "Evident Conflict" in [span["label_type"] for span in test_data_span[i]["hallucination_id"]]
]

In [8]:
sample[-2]

{'ref': '{\'question\': \'history of the word gemini\', \'passages\': "passage 1:The word “Gemini” is a Latin word for twins and it is one of the constellations that actually look like its name would suggest. The first known reference of the Gemini constellation was in Aristotle’s Meteorologica, over 300 years BC. He mentions that he observed an occulting a star in Gemini and speaks of observing Jupiter in conjunction with it.\\n\\npassage 2:Gemini History. The History of Gemini. As other signs in the zodiac, Gemini is not in the same position as the constellation of Gemini. In the zodiac, it follows Taurus and takes the third 30 degrees of the zodiacal circle. Gemini is a mutable sign that is preceding the summer, and as such, it announces change while ruling the time of year when Taurus spring has ended, and life on Earth is about to change.\\n\\npassage 3:Gemini Mythology The Story Behind the Gemini Zodiac Sign. The origins of some constellation myths are heavily debated. Gemini myt

In [9]:
# Data2txt samples from id_101 annotated with both "Subtle Baseless Info" and
# "Evident Baseless Info".
sample2 = []
for i in id_101:
    entry = test_data_span[i]
    if entry["task_type"] == "Data2txt":
        label_types = [span["label_type"] for span in entry["hallucination_id"]]
        if "Subtle Baseless Info" in label_types and "Evident Baseless Info" in label_types:
            sample2.append(entry)

In [10]:
sample2[0]

{'ref': '{\'name\': \'Finch & Fork\', \'address\': \'31 W Carrillo St\', \'city\': \'Santa Barbara\', \'state\': \'CA\', \'categories\': \'Breakfast & Brunch, American (New), Restaurants, American (Traditional), Nightlife, Bars\', \'hours\': {\'Monday\': \'17:30-23:0\', \'Tuesday\': \'17:0-21:0\', \'Wednesday\': \'17:0-21:0\', \'Thursday\': \'17:0-21:0\', \'Friday\': \'17:0-21:0\', \'Saturday\': \'17:0-21:0\', \'Sunday\': \'9:0-14:0\'}, \'attributes\': {\'BusinessParking\': {\'garage\': True, \'street\': True, \'validated\': True, \'lot\': False, \'valet\': True}, \'RestaurantsReservations\': True, \'OutdoorSeating\': False, \'WiFi\': \'free\', \'RestaurantsTakeOut\': True, \'RestaurantsGoodForGroups\': True, \'Music\': False, \'Ambience\': {\'romantic\': False, \'intimate\': False, \'classy\': True, \'hipster\': False, \'divey\': False, \'touristy\': False, \'trendy\': False, \'upscale\': False, \'casual\': False}}, \'business_stars\': 4.0, \'review_info\': [{\'review_stars\': 5.0, \'

In [11]:
# Summary samples from id_011 annotated with "Subtle Baseless Info" or
# "Evident Baseless Info".
sample3 = []
for i in id_011:
    entry = test_data_span[i]
    if entry["task_type"] == "Summary":
        label_types = [span["label_type"] for span in entry["hallucination_id"]]
        if "Subtle Baseless Info" in label_types or "Evident Baseless Info" in label_types:
            sample3.append(entry)

In [13]:
sample3[6]

{'ref': 'A nuclear submarine being repaired at a Russian shipyard has caught on fire, according to a law enforcement source speaking to Russia\'s state-run news agency ITAR-Tass. "The submarine is in a dry dock," Tass reports, citing the source, and there is no ammunition on board. "The rubber insulation between the submarine\'s light and pressure hull is on fire," Tass reported. Russia\'s RIA Novosti news agency says insulation caught on fire as welding work was being done on the submarine. Tass reported that the fire began on a sub in the Zvyozdochka shipyard in northwestern Russia. Zvyozdochka spokesman Yevgeny Gladyshev told the news agency that the sub had been undergoing repairs since November 2013. "Nuclear fuel from the sub\'s reactor has been unloaded," he reportedly said. "There are no armaments or chemically active, dangerous substances, fissionable materials on it," Gladyshev said to Tass. "The enterprise\'s personnel left the premises when the submarine caught fire, no one

### Model

In [1]:
import json

# Load the span-annotated test, train and dev splits, in that order.
with open("dataset/rag_truth_span_test.json", "r") as test_file:
    test_data_span = json.load(test_file)
with open("dataset/rag_truth_span_train.json", "r") as train_file:
    train_data_span = json.load(train_file)
with open("dataset/rag_truth_span_dev.json", "r") as dev_file:
    dev_data_span = json.load(dev_file)

In [2]:
test_data_span[0]

{'ref': 'The FBI charged a Philadelphia woman on Thursday with trying to travel overseas to fight for ISIS. She\'s one of three women arrested this week on terror charges. Two New York women were also taken into custody. An FBI complaint cites numerous social media messages dating back to August 2013 that were sent by Keonna Thomas, 30, also known as "Young Lioness" and "Fatayat Al Khilafah." One Twitter message said, "If we truly knew the realities ... we all would be rushing to join our brothers in the front lines pray ALLAH accept us as shuhada [martyrs]." Another said, "When you\'re a mujahid [violent jihadi fighter] your death becomes a wedding." The FBI said Thomas purchased an electronic visa to Turkey on March 23. Turkey is known as the easiest place from which to enter Syria and join ISIS. An ISIS manual advises recruits to buy round-trip tickets to vacation spots such as Spain and then purchase tickets for their real destination once they arrive overseas, the FBI said. On Mar

In [9]:
from collections import defaultdict

# Per-task count of annotated samples, keyed by the "model" field of each sample.
hal_model_qa = defaultdict(int)
hal_model_d2t = defaultdict(int)
hal_model_sum = defaultdict(int)

# Any task type other than QA / Data2txt falls through to the summary tally.
model_tally_by_task = {"QA": hal_model_qa, "Data2txt": hal_model_d2t}

for d in test_data_span:
    # Samples without any hallucination annotation are not counted.
    if d["hallucination_id"] == []:
        continue
    tally = model_tally_by_task.get(d["task_type"], hal_model_sum)
    tally[d["model"]] += 1

for tally in (hal_model_qa, hal_model_d2t, hal_model_sum):
    print(dict(tally))

{'llama-2-13b-chat': 36, 'mistral-7B-instruct': 31, 'llama-2-7b-chat': 52, 'llama-2-70b-chat': 35, 'gpt-3.5-turbo-0613': 5, 'gpt-4-0613': 1}
{'gpt-4-0613': 35, 'mistral-7B-instruct': 134, 'llama-2-7b-chat': 123, 'llama-2-13b-chat': 138, 'llama-2-70b-chat': 112, 'gpt-3.5-turbo-0613': 37}
{'llama-2-13b-chat': 33, 'llama-2-70b-chat': 24, 'mistral-7B-instruct': 86, 'gpt-3.5-turbo-0613': 4, 'llama-2-7b-chat': 51, 'gpt-4-0613': 6}


In [10]:
from collections import defaultdict

# Same per-model count, restricted to the ids in id_011 + id_111
# (gold label 1 with a triplet prediction of 1).
# NOTE(review): ids are positions in the evaluated test list; they index test_data_span
# correctly only if that list was not task-filtered and shares its order - confirm.
hal_model_qa_imp = defaultdict(int)
hal_model_d2t_imp = defaultdict(int)
hal_model_sum_imp = defaultdict(int)

# Any task type other than QA / Data2txt falls through to the summary tally.
imp_model_tally_by_task = {"QA": hal_model_qa_imp, "Data2txt": hal_model_d2t_imp}

for i in id_011 + id_111:
    d = test_data_span[i]
    # Samples without any hallucination annotation are not counted.
    if d["hallucination_id"] == []:
        continue
    tally = imp_model_tally_by_task.get(d["task_type"], hal_model_sum_imp)
    tally[d["model"]] += 1

for tally in (hal_model_qa_imp, hal_model_d2t_imp, hal_model_sum_imp):
    print(dict(tally))

{'llama-2-13b-chat': 35, 'llama-2-7b-chat': 51, 'llama-2-70b-chat': 29, 'mistral-7B-instruct': 24, 'gpt-3.5-turbo-0613': 4}
{'llama-2-70b-chat': 80, 'llama-2-13b-chat': 132, 'mistral-7B-instruct': 121, 'llama-2-7b-chat': 101, 'gpt-4-0613': 1}
{'llama-2-13b-chat': 20, 'mistral-7B-instruct': 39, 'llama-2-7b-chat': 34, 'llama-2-70b-chat': 11}
