# Claim-based Question-Answer Hallucination Detection

Import necessary packages.

In [1]:
import time
from uqlm.longform.black_box import ClaimQAScorer
from uqlm import BlackBoxUQ

#### Load LLM and device

In [2]:
# Alternative backend (Vertex AI Gemini); uncomment these two lines to use it instead of Azure OpenAI:
# from langchain_google_vertexai import ChatVertexAI
# llm = ChatVertexAI(model="gemini-1.5-flash")

from dotenv import load_dotenv, find_dotenv
from langchain_openai import AzureChatOpenAI

# Load environment variables from the .env file located by find_dotenv().
# NOTE(review): presumably this supplies the Azure OpenAI endpoint and API key — confirm.
load_dotenv(find_dotenv())

# Chat model shared by every scorer created in this notebook.
llm = AzureChatOpenAI(
    deployment_name="gpt-4o-mini",
    openai_api_type="azure",
    openai_api_version="2024-02-15-preview",
    temperature=1,  # User to set temperature
)

In [3]:
import torch

# Choose the torch device in priority order: NVIDIA GPU (cuda), then Apple
# Silicon (mps), then CPU. The conditional expression is evaluated lazily, so
# the MPS check only runs when CUDA is unavailable.
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device.type} device")

Using cuda device


#### Setup Prompts and Black Box Scorer

In [4]:
# One long-form generation prompt per entity to be scored.
prompts = [
    "write a paragraph about Paul McCartney",
    "write a paragraph about John Lennon",
]

In [5]:
# Black-box scorer configured with the "exact_match" scorer on the selected device.
# NOTE(review): `bb_scorer` is not referenced by any other cell visible in this
# notebook — confirm it is still needed.
bb_scorer = BlackBoxUQ(
    llm=llm,
    max_calls_per_min=500,  # set value to avoid rate limit error
    device=device,
    scorers=["exact_match"],
)

#### Claim-QA class

There are three methods that can be used to compute Claim-QA scores.
- `generate_and_score`: If you only have prompts, call this method to generate a long-form response for each prompt, decompose each response into factoids, generate questions for each factoid, and compute question-level, factoid-level, and response-level scores.
- `score`: If you have already generated the long-form responses, call this method.
- `evaluate`: If you have already decomposed the long-form responses into factoids, call this method.

##### 1. `generate_and_score` method

In [6]:
claim_qa = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], response_template="atomic", max_calls_per_min=500, num_questions=2)
start_time = time.time()
result = await claim_qa.generate_and_score(prompts=prompts)

print(f"Computation time: {time.time() - start_time} seconds")

Number of factoids per response:  [28, 24]


Output()

Number of total questions:  104


Length of BB result:  104
tmp:  [1.0, 1.0, 0.0, 0.8, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.8, 0.0, 0.2, 0.8, 1.0, 0.6, 1.0, 0.0, 1.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.6, 0.0, 0.0, 0.6, 0.2, 0.4, 0.6, 1.0, 1.0, 0.8, 0.8, 0.8, 0.6, 1.0, 0.6, 0.0, 0.4, 0.4, 0.2, 0.8, 1.0, 1.0, 0.2, 0.2, 0.0, 0.2, 0.0, 0.0, 0.0]
tmp_factoid_scores:  [1.0, 0.4, 0.5, 0.5, 1.0, 0.4, 0.5, 0.8, 0.5, 0.9, 0.0, 0.0, 0.0, 0.7, 0.0, 0.4, 0.5, 1.0, 0.8, 0.7, 0.8, 0.2, 0.30000000000000004, 0.9, 0.6, 0.1, 0.1, 0.0]
tmp:  [0.8, 0.8, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.8, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.6, 0.6, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.0]
tmp_factoid_scores:  [0.8, 0.5, 1.0, 0.5, 0.5, 1.0, 0.9, 0.0, 0.1, 0.0, 0.0, 1.0, 0.6, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 1.0, 0.5, 0.0, 0.2, 0.0]
Computation time: 77.72662901878357 seconds


In [7]:
# Factoid-level exact-match scores: one list per response, one score per factoid.
# The length of each inner list is therefore the number of factoids in that
# response (28 and 24 in the saved run), not factoids times questions.
factoid_scores = result.to_dict()["data"]["factoid_scores_exact_match"]
print(" Number of factoids for first response: ", len(factoid_scores[0]))
print(" Number of factoids for second response: ", len(factoid_scores[1]))

 Number of factoids*questions for first response:  28
 Number of factoids*questions for second response:  24


In [8]:
# Show the generate_and_score results as a table, one row per prompt.
result.to_df()

Unnamed: 0,prompt,response,response_scores_exact_match,factoid_scores_exact_match,factoid,response_fact_question,response_fact_questions_response,response_fact_questions_sampled_response
0,write a paragraph about Paul McCartney,write a paragraph about Paul McCartney,0.485714,"[1.0, 0.4, 0.5, 0.5, 1.0, 0.4, 0.5, 0.8, 0.5, ...",[Paul McCartney is a legendary British musicia...,"[[ Who is a legendary British musician? , Wha...","[[David Bowie., British.], [A British musician...","[[[David Bowie., David Bowie., David Bowie., D..."
1,write a paragraph about John Lennon,write a paragraph about John Lennon,0.4,"[0.8, 0.5, 1.0, 0.5, 0.5, 1.0, 0.9, 0.0, 0.1, ...","[John Lennon was an iconic British musician., ...","[[ Who was an iconic British musician? , What...","[[David Bowie., British], [Elvis Presley., Mus...","[[[David Bowie., David Bowie., David Bowie, Da..."


##### 2. `score` method

In [9]:
claim_qa2 = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], response_template="atomic", max_calls_per_min=500)
start_time = time.time()
result2 = await claim_qa2.score(prompts=claim_qa.prompts, responses=claim_qa.responses)
print(f"Computation time: {time.time() - start_time} seconds")

Number of factoids per response:  [26, 25]


Output()

Number of total questions:  51


Length of BB result:  51
tmp:  [1.0, 1.0, 1.0, 1.0, 0.8, 0.2, 0.0, 1.0, 0.8, 0.0, 0.2, 0.0, 0.0, 0.0, 0.6, 1.0, 0.8, 0.4, 0.6, 1.0, 0.0, 0.6, 0.4, 0.0, 0.0, 0.0]
tmp_factoid_scores:  [1.0, 1.0, 1.0, 1.0, 0.8, 0.2, 0.0, 1.0, 0.8, 0.0, 0.2, 0.0, 0.0, 0.0, 0.6, 1.0, 0.8, 0.4, 0.6, 1.0, 0.0, 0.6, 0.4, 0.0, 0.0, 0.0]
tmp:  [0.4, 1.0, 0.0, 1.0, 0.8, 1.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.8, 0.8, 0.6, 0.0, 0.0, 0.6, 0.0, 0.4, 0.0, 0.8, 1.0, 0.0, 0.0, 0.0]
tmp_factoid_scores:  [0.4, 1.0, 0.0, 1.0, 0.8, 1.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.8, 0.8, 0.6, 0.0, 0.0, 0.6, 0.0, 0.4, 0.0, 0.8, 1.0, 0.0, 0.0, 0.0]
Computation time: 17.740710973739624 seconds


In [10]:
# Show the score() results as a table, one row per prompt.
result2.to_df()

Unnamed: 0,prompt,response,response_scores_exact_match,factoid_scores_exact_match,factoid,response_fact_question,response_fact_questions_response,response_fact_questions_sampled_response
0,write a paragraph about Paul McCartney,write a paragraph about Paul McCartney,0.476923,"[1.0, 1.0, 1.0, 1.0, 0.8, 0.2, 0.0, 1.0, 0.8, ...",[Paul McCartney is a legendary British musicia...,[[Who is a legendary British musician known fo...,"[[Paul McCartney.], [Musician.], [Paul McCartn...","[[[Paul McCartney., Paul McCartney., Paul McCa..."
1,write a paragraph about John Lennon,write a paragraph about John Lennon,0.4,"[0.4, 1.0, 0.0, 1.0, 0.8, 1.0, 0.8, 0.0, 0.0, ...","[John Lennon was an iconic British musician., ...",[[What was John Lennon's role in music that ma...,"[[Co-founder of The Beatles, influential songw...",[[[Co-founder of The Beatles; revolutionary mu...


##### 3. `evaluate` method

In [6]:
import json
import os
from pathlib import Path

import pandas as pd

# Directory holding the FactScore experiment files. Set the FACTSCORE_DATA_DIR
# environment variable to point elsewhere; the default is the original location.
FACTSCORE_DIR = Path(os.environ.get("FACTSCORE_DATA_DIR", "/home/jupyter/longform-experiments/factscore"))

# Files are opened by explicit path rather than via os.chdir(), so running this
# cell no longer changes the kernel's working directory for the other cells.
# The encoding is pinned so the JSON is decoded the same way on every platform.
with open(FACTSCORE_DIR / "responses_gemini_flash.json", "r", encoding="utf-8") as f:
    gemini_data = json.load(f)

# Keep only the columns used below and rename them to question/answer.
factscore = pd.read_parquet(FACTSCORE_DIR / "factscore.parquet")[["entity", "hundredw_prompt", "wikipedia_text"]].rename(
    columns={"hundredw_prompt": "question", "wikipedia_text": "answer"}
)

In [8]:
# Preview entries 224-226: the per-response sentence lists and the raw responses.
gemini_data["sentences"][224:227], gemini_data["responses"][224:227]

([['Rachel Bilson is an American actress best known for her breakout role as Summer Roberts in the hit teen drama *The O.C.* (2003-2007).',
   'She later starred as Dr. Zoe Hart in the CW series *Hart of Dixie* (2011-2015).\n\n',
   'Bilson also appeared in films like *Jumper* and *The Last Kiss*.',
   "Beyond acting, she's recognized for her fashion sense and has been involved in various design collaborations.",
   'A mother, she continues to work in television and podcasting, maintaining a presence in Hollywood.'],
  ["Xi Jinping, born in 1953, is the current General Secretary of the Chinese Communist Party and President of the People's Republic of China.",
   'Son of a revolutionary veteran, he endured hardship during the Cultural Revolution before steadily rising through provincial leadership.',
   "He became China's top leader in 2012.",
   'Since then, he has consolidated power, launched a sweeping anti-corruption campaign, and championed "Xi Jinping Thought."',
   'His tenure ha

In [9]:
claim_qa4 = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], max_calls_per_min=500, num_questions=3, num_claim_qa_responses=4)
start = time.time()
result4 = await claim_qa4.evaluate(
    factoids=gemini_data["sentences"][224:226],
    entities=factscore["entity"].tolist()[224:226],
    # responses=gemini_data["responses"][224:226]
)
stop = time.time()
print(f"Computation time: {stop - start} seconds")

Number of factoids per response:  [5, 5]


Output()

Number of total questions:  30


Length of BB result:  30
Computation time: 10.036797285079956 seconds


In [10]:
# Convert the evaluate() result to a DataFrame (one row per response) and display it.
result4_df = result4.to_df()
result4_df

Unnamed: 0,response,response_scores_exact_match,factoid_scores_exact_match,factoid,response_fact_question,response_fact_questions_response,response_fact_questions_sampled_response
0,,0.466667,"[0.6666666666666666, 0.3333333333333333, 0.583...",[Rachel Bilson is an American actress best kno...,"[[We are writing some facts about ""Rachel Bils...","[[""The O.C."", Summer Roberts., 2003 to 2007.],...","[[[""The OC."", ""The O.C."", ""The OC"", ""The O.C.""..."
1,,0.6,"[0.3333333333333333, 0.9166666666666666, 1.0, ...","[Xi Jinping, born in 1953, is the current Gene...","[[We are writing some facts about ""Xi Jinping....","[[1953, General Secretary of the Communist Par...","[[[1953, 1953, 1953, 1953], [General Secretary..."


In [11]:
# Question prompts for the first response: one inner list per factoid.
result4_df["response_fact_question"][0]

[['We are writing some facts about "Rachel Bilson."\n\nConsider the following question and answer with as few words as possible:\n\n What is Rachel Bilson best known for? \n\nNow your answer is:',
  'We are writing some facts about "Rachel Bilson."\n\nConsider the following question and answer with as few words as possible:\n\n Which character did Rachel Bilson play in *The O.C.*? \n\nNow your answer is:',
  'We are writing some facts about "Rachel Bilson."\n\nConsider the following question and answer with as few words as possible:\n\n In what years did *The O.C.* air?\n\nNow your answer is:'],
 ['We are writing some facts about "Rachel Bilson."\n\nConsider the following question and answer with as few words as possible:\n\n Who played the character Dr. Zoe Hart? \n\nNow your answer is:',
  'We are writing some facts about "Rachel Bilson."\n\nConsider the following question and answer with as few words as possible:\n\n What series did she star in? \n\nNow your answer is:',
  'We are w

In [13]:
# Factoids of the first response (here, the input sentences passed to evaluate()).
result4_df["factoid"][0]

['Rachel Bilson is an American actress best known for her breakout role as Summer Roberts in the hit teen drama *The O.C.* (2003-2007).',
 'She later starred as Dr. Zoe Hart in the CW series *Hart of Dixie* (2011-2015).\n\n',
 'Bilson also appeared in films like *Jumper* and *The Last Kiss*.',
 "Beyond acting, she's recognized for her fashion sense and has been involved in various design collaborations.",
 'A mother, she continues to work in television and podcasting, maintaining a presence in Hollywood.']

In [14]:
# Answers to the generated questions for the first response, grouped by factoid.
result4_df["response_fact_questions_response"][0]

[['"The O.C."', 'Summer Roberts.', '2003 to 2007.'],
 ['Rachel Bilson.', 'The OC.', '2011 to 2015'],
 ['Jumper.', 'The Last Kiss.', '"Jumper"'],
 ['Fashion and lifestyle influence.',
  'Acting, fashion, producing.',
  'Fashion collaborations and television shows.'],
 ['Acting.', 'Television and film.', 'Los Angeles.']]

In [23]:
# Sampled answers for the second response: factoid -> question -> list of samples.
# NOTE(review): the saved output for this cell does not match row 1 of the table
# displayed above and its execution count is out of sequence — it looks like it
# came from a different run; re-run the notebook top to bottom to refresh it.
result4_df["response_fact_questions_sampled_response"][1]

[[['Miguel Ángel Félix Gallardo.',
   'Miguel Ángel Félix Gallardo.',
   'Miguel Ángel Félix Gallardo.',
   'Miguel Ángel Félix Gallardo.'],
  ['The Guadalajara Cartel.',
   'Gulf Cartel.',
   'Guadalajara Cartel.',
   'The Guadalajara Cartel.'],
  ['1980s.', '1980s.', '1980s', '1980s.']],
 [['Heroin and marijuana.',
   'Marijuana and cocaine.',
   'Marijuana and cocaine.',
   'Marijuana and cocaine.'],
  ['He controlled drug trafficking in México.',
   'He controlled drug trafficking operations in Mexico.',
   'The Guadalajara Cartel.',
   'He controlled major drug trafficking routes in Mexico.'],
  ['Mexico.', 'Mexico', 'Mexico.', 'Mexico.']],
 [['April 1989', 'April 1989.', '1989.', 'April 8, 1989.'],
  ['His arrest in 1989.',
   'His arrest in 1989.',
   'His arrest in 1989.',
   'His arrest in 1989.'],
  ['Drug trafficking.',
   'Drug trafficking.',
   'Drug trafficking.',
   'Drug trafficking.']],
 [['Maximum security prison.',
   'Maximum-security prison.',
   'Prison.',
   'Pri