# Claim-based Question-Answer Hallucination Detection

Import necessary packages.

In [1]:
import time
from uqlm.longform.black_box import ClaimQAScorer
from uqlm import BlackBoxUQ

#### Load LLM and device

In [2]:
# from langchain_google_vertexai import ChatVertexAI
# llm = ChatVertexAI(model="gemini-1.5-flash")

from dotenv import load_dotenv, find_dotenv
from langchain_openai import AzureChatOpenAI

# Load environment variables from the `.env` file located by `find_dotenv()`.
# NOTE(review): presumably this supplies the Azure OpenAI endpoint and API key,
# since neither is passed explicitly below — confirm against your setup.
load_dotenv(find_dotenv())
# Single LLM instance shared by every scorer constructed later in this notebook.
llm = AzureChatOpenAI(
    deployment_name="gpt-4o-mini",
    openai_api_type="azure",
    openai_api_version="2024-02-15-preview",
    temperature=1,  # User to set temperature
)

In [3]:
import torch

# Choose the torch backend in priority order: CUDA first, then MPS, then CPU.
# The conditional expression short-circuits, so the MPS check only runs when
# CUDA is unavailable.
device = torch.device(
    "cuda"  # NVIDIA GPU
    if torch.cuda.is_available()
    else "mps"  # macOS
    if torch.backends.mps.is_available()
    else "cpu"  # CPU
)
print(f"Using {device.type} device")

Using mps device


#### Set Up Prompts and Black Box Scorer

In [4]:
# One long-form prompt per subject; add a name to the tuple to score more prompts.
prompts = [
    f"write a paragraph about {subject}"
    for subject in ("Paul McCartney", "John Lennon")
]

In [5]:
# Black-box scorer built on the shared LLM and the torch device selected above,
# restricted to the "exact_match" scorer.
# NOTE(review): `bb_scorer` is not referenced by any later cell shown in this
# notebook — confirm whether this cell is still needed.
bb_scorer = BlackBoxUQ(
    llm=llm,
    max_calls_per_min=500,  # set value to avoid rate limit error
    device=device,
    scorers=["exact_match"],
)

#### Claim-QA class

There are three methods that can be used to compute Claim-QA scores.
- `generate_and_score`: If you only have prompts, call this method to generate a long response for each prompt, decompose each response into factoids, generate questions for each factoid, and compute question-level, factoid-level, and response-level scores.
- `score`: If you have already generated the long responses, call this method to decompose them into factoids, generate questions, and compute the scores.
- `evaluate`: If you have already decomposed the long responses into factoids, call this method to generate questions and compute the scores.

##### 1. `generate_and_score` method

In [6]:
claim_qa = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], response_template="atomic", max_calls_per_min=500, num_questions=2)
start_time = time.time()
result = await claim_qa.generate_and_score(prompts=prompts)

print(f"Computation time: {time.time() - start_time} seconds")

Number of factoids per response:  [28, 24]


Output()

Number of total questions:  104


Length of BB result:  104
tmp:  [1.0, 1.0, 0.0, 0.8, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.8, 0.0, 0.2, 0.8, 1.0, 0.6, 1.0, 0.0, 1.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.8, 0.6, 0.0, 0.0, 0.6, 0.2, 0.4, 0.6, 1.0, 1.0, 0.8, 0.8, 0.8, 0.6, 1.0, 0.6, 0.0, 0.4, 0.4, 0.2, 0.8, 1.0, 1.0, 0.2, 0.2, 0.0, 0.2, 0.0, 0.0, 0.0]
tmp_factoid_scores:  [1.0, 0.4, 0.5, 0.5, 1.0, 0.4, 0.5, 0.8, 0.5, 0.9, 0.0, 0.0, 0.0, 0.7, 0.0, 0.4, 0.5, 1.0, 0.8, 0.7, 0.8, 0.2, 0.30000000000000004, 0.9, 0.6, 0.1, 0.1, 0.0]
tmp:  [0.8, 0.8, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.8, 0.0, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.6, 0.6, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.4, 0.0, 0.0]
tmp_factoid_scores:  [0.8, 0.5, 1.0, 0.5, 0.5, 1.0, 0.9, 0.0, 0.1, 0.0, 0.0, 1.0, 0.6, 0.0, 0.5, 0.0, 0.0, 0.5, 0.0, 1.0, 0.5, 0.0, 0.2, 0.0]
Computation time: 77.72662901878357 seconds


In [7]:
# `factoid_scores_exact_match` holds one list per response with one score per
# factoid, so its length is the number of factoids — not factoids * questions
# (28 and 24 here, while two questions were asked per factoid).
factoid_scores = result.to_dict()["data"]["factoid_scores_exact_match"]
print(" Number of factoids for first response: ", len(factoid_scores[0]))
print(" Number of factoids for second response: ", len(factoid_scores[1]))

 Number of factoids*questions for first response:  28
 Number of factoids*questions for second response:  24


In [8]:
# Tabular view of the `generate_and_score` result: one row per prompt.
# NOTE(review): in the saved output the `response` column repeats the prompt
# text rather than showing a generated paragraph — confirm whether that is
# expected from the library.
result.to_df()

Unnamed: 0,prompt,response,response_scores_exact_match,factoid_scores_exact_match,factoid,response_fact_question,response_fact_questions_response,response_fact_questions_sampled_response
0,write a paragraph about Paul McCartney,write a paragraph about Paul McCartney,0.485714,"[1.0, 0.4, 0.5, 0.5, 1.0, 0.4, 0.5, 0.8, 0.5, ...",[Paul McCartney is a legendary British musicia...,"[[ Who is a legendary British musician? , Wha...","[[David Bowie., British.], [A British musician...","[[[David Bowie., David Bowie., David Bowie., D..."
1,write a paragraph about John Lennon,write a paragraph about John Lennon,0.4,"[0.8, 0.5, 1.0, 0.5, 0.5, 1.0, 0.9, 0.0, 0.1, ...","[John Lennon was an iconic British musician., ...","[[ Who was an iconic British musician? , What...","[[David Bowie., British], [Elvis Presley., Mus...","[[[David Bowie., David Bowie., David Bowie, Da..."


##### 2. `score` method

In [9]:
claim_qa2 = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], response_template="atomic", max_calls_per_min=500)
start_time = time.time()
result2 = await claim_qa2.score(prompts=claim_qa.prompts, responses=claim_qa.responses)
print(f"Computation time: {time.time() - start_time} seconds")

Number of factoids per response:  [26, 25]


Output()

Number of total questions:  51


Length of BB result:  51
tmp:  [1.0, 1.0, 1.0, 1.0, 0.8, 0.2, 0.0, 1.0, 0.8, 0.0, 0.2, 0.0, 0.0, 0.0, 0.6, 1.0, 0.8, 0.4, 0.6, 1.0, 0.0, 0.6, 0.4, 0.0, 0.0, 0.0]
tmp_factoid_scores:  [1.0, 1.0, 1.0, 1.0, 0.8, 0.2, 0.0, 1.0, 0.8, 0.0, 0.2, 0.0, 0.0, 0.0, 0.6, 1.0, 0.8, 0.4, 0.6, 1.0, 0.0, 0.6, 0.4, 0.0, 0.0, 0.0]
tmp:  [0.4, 1.0, 0.0, 1.0, 0.8, 1.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.8, 0.8, 0.6, 0.0, 0.0, 0.6, 0.0, 0.4, 0.0, 0.8, 1.0, 0.0, 0.0, 0.0]
tmp_factoid_scores:  [0.4, 1.0, 0.0, 1.0, 0.8, 1.0, 0.8, 0.0, 0.0, 0.0, 0.0, 0.8, 0.8, 0.6, 0.0, 0.0, 0.6, 0.0, 0.4, 0.0, 0.8, 1.0, 0.0, 0.0, 0.0]
Computation time: 17.740710973739624 seconds


In [10]:
# Tabular view of the `score` result: one row per prompt, same columns as the
# `generate_and_score` result.
result2.to_df()

Unnamed: 0,prompt,response,response_scores_exact_match,factoid_scores_exact_match,factoid,response_fact_question,response_fact_questions_response,response_fact_questions_sampled_response
0,write a paragraph about Paul McCartney,write a paragraph about Paul McCartney,0.476923,"[1.0, 1.0, 1.0, 1.0, 0.8, 0.2, 0.0, 1.0, 0.8, ...",[Paul McCartney is a legendary British musicia...,[[Who is a legendary British musician known fo...,"[[Paul McCartney.], [Musician.], [Paul McCartn...","[[[Paul McCartney., Paul McCartney., Paul McCa..."
1,write a paragraph about John Lennon,write a paragraph about John Lennon,0.4,"[0.4, 1.0, 0.0, 1.0, 0.8, 1.0, 0.8, 0.0, 0.0, ...","[John Lennon was an iconic British musician., ...",[[What was John Lennon's role in music that ma...,"[[Co-founder of The Beatles, influential songw...",[[[Co-founder of The Beatles; revolutionary mu...


##### 3. `evaluate` method

In [11]:
claim_qa3 = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], response_template="atomic", max_calls_per_min=500, num_questions=2)
start = time.time()
result3 = await claim_qa3.evaluate(prompts=claim_qa.prompts, responses=claim_qa.responses, factoids=claim_qa.factoids)
stop = time.time()
print(f"Computation time: {stop - start} seconds")

Number of factoids per response:  [28, 24]


Output()

Number of total questions:  104


Length of BB result:  104
tmp:  [0.8, 0.8, 0.0, 1.0, 1.0, 1.0, 1.0, 0.8, 1.0, 1.0, 0.8, 0.2, 0.8, 0.6, 0.0, 0.8, 0.2, 1.0, 1.0, 0.2, 0.0, 0.0, 0.0, 0.4, 0.0, 0.0, 0.8, 0.0, 0.0, 0.4, 0.8, 0.4, 0.6, 0.4, 0.8, 0.0, 0.4, 0.8, 1.0, 0.2, 1.0, 0.8, 1.0, 0.0, 0.2, 0.6, 0.6, 0.0, 1.0, 0.0, 0.6, 0.0, 0.0, 0.0, 1.0, 0.4]
tmp_factoid_scores:  [0.8, 0.5, 1.0, 0.9, 1.0, 0.5, 0.7, 0.4, 0.6, 0.6, 0.0, 0.2, 0.0, 0.4, 0.2, 0.6000000000000001, 0.5, 0.4, 0.6000000000000001, 0.6, 0.9, 0.5, 0.4, 0.3, 0.5, 0.3, 0.0, 0.7]
tmp:  [1.0, 0.8, 1.0, 1.0, 0.8, 0.4, 0.6, 0.8, 1.0, 0.0, 1.0, 0.2, 1.0, 0.6, 0.2, 0.0, 0.0, 0.0, 0.0, 0.6, 0.8, 0.4, 0.6, 1.0, 0.6, 0.8, 0.0, 0.0, 0.2, 1.0, 0.4, 0.6, 1.0, 0.0, 0.4, 0.2, 0.6, 0.2, 1.0, 1.0, 1.0, 0.0, 0.6, 0.0, 0.0, 0.0, 0.2, 0.0]
tmp_factoid_scores:  [0.9, 1.0, 0.6000000000000001, 0.7, 0.5, 0.6, 0.8, 0.1, 0.0, 0.3, 0.6000000000000001, 0.8, 0.7, 0.0, 0.6, 0.5, 0.5, 0.30000000000000004, 0.4, 1.0, 0.5, 0.3, 0.0, 0.1]
Computation time: 68.46951389312744 seconds


In [12]:
# Factoids held by the first scorer: one list of factoid strings per prompt.
# These are the factoids that were passed to `claim_qa3.evaluate`.
claim_qa.factoids

[['Paul McCartney is a legendary British musician.',
  'Paul McCartney is a singer.',
  'Paul McCartney is a songwriter.',
  'Paul McCartney is best known as a co-founder of The Beatles.',
  'Paul McCartney was born on June 18, 1942.',
  'Paul McCartney was born in Liverpool.',
  'Paul McCartney was born in England.',
  'Paul McCartney showed an early interest in music.',
  'Paul McCartney learned to play the guitar.',
  'Paul McCartney learned to play the piano.',
  "McCartney's songwriting is innovative.",
  "McCartney's songwriting is characterized by catchy melodies.",
  "McCartney's songwriting is characterized by insightful lyrics.",
  "McCartney played a pivotal role in The Beatles' success.",
  'McCartney contributed to timeless hits.',
  'McCartney contributed to the hit "Yesterday."',
  'McCartney contributed to the hit "Hey Jude."',
  'McCartney contributed to the hit "Let It Be."',
  "McCartney embarked on a successful solo career after the band's breakup.",
  'The Beatles 

In [13]:
# Questions generated by `claim_qa3`, nested as response -> factoid -> questions
# (two per factoid, matching `num_questions=2`).
claim_qa3.response_fact_questions

[[[' Who is a legendary British musician? ',
   ' What nationality is Paul McCartney?'],
  [' Who is a famous musician known for singing? ',
   " What is Paul McCartney's profession?"],
  [" What is Paul McCartney's occupation? ",
   ' Is Paul McCartney known for writing songs?'],
  [' Who is best known as a co-founder of The Beatles? ',
   ' What band is Paul McCartney famously associated with?'],
  [' When was Paul McCartney born? ',
   ' What is the birth date of Paul McCartney?'],
  [' Where was Paul McCartney born? ',
   " What city is associated with Paul McCartney's birth?"],
  [' Where was Paul McCartney born? ',
   ' What country is Paul McCartney from?'],
  [' What early interests did Paul McCartney have? ',
   ' In what field did Paul McCartney show an early interest?'],
  [' Who learned to play the guitar? ',
   ' What instrument did Paul McCartney learn to play?'],
  [' What instrument did Paul McCartney learn to play? ',
   ' Who is known for learning to play the piano?']

In [14]:
# Answers recorded for each generated question, with the same nesting as
# `response_fact_questions`.
# NOTE(review): questions that do not name the subject (e.g. "Who is a
# legendary British musician?") are answered with other people in the saved
# output ("David Bowie."); this presumably lowers exact-match scores — confirm.
claim_qa3.response_fact_questions_responses

[[['David Bowie.', 'British.'],
  ['Adele.', 'Musician.'],
  ['Musician.', 'Yes.'],
  ['John Lennon.', 'The Beatles.'],
  ['June 18, 1942.', 'June 18, 1942.'],
  ['Liverpool, England.', 'Liverpool'],
  ['Liverpool, England.', 'United Kingdom.'],
  ['Music and playing instruments.', 'Music.'],
  ['I did.', 'Bass guitar.'],
  ['Bass guitar.', 'Beethoven.'],
  ['Melodic innovation and emotional depth.',
   'Melodic, diverse, and collaborative.'],
  ['Melodic hooks.', 'Catchy and diverse.'],
  ['Melodic creativity.', 'Love, nostalgia, peace, introspection, nature.'],
  ['George Martin.',
   'Crucial; drove songwriting, pop appeal, and innovation.'],
  ['Songwriting, performing, producing.', 'Yes, many.'],
  ['Paul McCartney.', '"Yesterday"'],
  ['The Beatles.', '"Yesterday"'],
  ['The Beatles.', '"Hey Jude"'],
  ['Pursued a solo career.', 'Solo musician.'],
  ['1970.', '1970'],
  ['Paul McCartney.', 'The Beatles.'],
  ['Pop rock.', 'Seventeen.'],
  ['Awards and honors.', 'Over 600.'],
  ['

In [15]:
# Factoids held by `claim_qa3` after `evaluate`; the visible part of the saved
# output matches `claim_qa.factoids`, which was passed in as `factoids=`.
claim_qa3.factoids

[['Paul McCartney is a legendary British musician.',
  'Paul McCartney is a singer.',
  'Paul McCartney is a songwriter.',
  'Paul McCartney is best known as a co-founder of The Beatles.',
  'Paul McCartney was born on June 18, 1942.',
  'Paul McCartney was born in Liverpool.',
  'Paul McCartney was born in England.',
  'Paul McCartney showed an early interest in music.',
  'Paul McCartney learned to play the guitar.',
  'Paul McCartney learned to play the piano.',
  "McCartney's songwriting is innovative.",
  "McCartney's songwriting is characterized by catchy melodies.",
  "McCartney's songwriting is characterized by insightful lyrics.",
  "McCartney played a pivotal role in The Beatles' success.",
  'McCartney contributed to timeless hits.',
  'McCartney contributed to the hit "Yesterday."',
  'McCartney contributed to the hit "Hey Jude."',
  'McCartney contributed to the hit "Let It Be."',
  "McCartney embarked on a successful solo career after the band's breakup.",
  'The Beatles 

In [16]:
# NOTE(review): this repeats the expression from the earlier cell In [13] and
# shows the same output; consider dropping one of the two.
claim_qa3.response_fact_questions

[[[' Who is a legendary British musician? ',
   ' What nationality is Paul McCartney?'],
  [' Who is a famous musician known for singing? ',
   " What is Paul McCartney's profession?"],
  [" What is Paul McCartney's occupation? ",
   ' Is Paul McCartney known for writing songs?'],
  [' Who is best known as a co-founder of The Beatles? ',
   ' What band is Paul McCartney famously associated with?'],
  [' When was Paul McCartney born? ',
   ' What is the birth date of Paul McCartney?'],
  [' Where was Paul McCartney born? ',
   " What city is associated with Paul McCartney's birth?"],
  [' Where was Paul McCartney born? ',
   ' What country is Paul McCartney from?'],
  [' What early interests did Paul McCartney have? ',
   ' In what field did Paul McCartney show an early interest?'],
  [' Who learned to play the guitar? ',
   ' What instrument did Paul McCartney learn to play?'],
  [' What instrument did Paul McCartney learn to play? ',
   ' Who is known for learning to play the piano?']

In [17]:
# Tabular view of the `evaluate` result: one row per prompt.
result3.to_df()

Unnamed: 0,prompt,response,response_scores_exact_match,factoid_scores_exact_match,factoid,response_fact_question,response_fact_questions_response,response_fact_questions_sampled_response
0,write a paragraph about Paul McCartney,write a paragraph about Paul McCartney,0.503571,"[0.8, 0.5, 1.0, 0.9, 1.0, 0.5, 0.7, 0.4, 0.6, ...",[Paul McCartney is a legendary British musicia...,"[[ Who is a legendary British musician? , Wha...","[[David Bowie., British.], [Adele., Musician.]...","[[[David Bowie., David Bowie., David Bowie., D..."
1,write a paragraph about John Lennon,write a paragraph about John Lennon,0.491667,"[0.9, 1.0, 0.6000000000000001, 0.7, 0.5, 0.6, ...","[John Lennon was an iconic British musician., ...","[[ Who was an iconic British musician? , What...","[[David Bowie., British.], [Paul McCartney., M...","[[[David Bowie., David Bowie., David Bowie., D..."


In [18]:
import json

# Load pre-generated responses from disk. The encoding is pinned to UTF-8
# because the loaded data contains non-ASCII text (accented names), and
# `open()` otherwise falls back to a platform-dependent default encoding.
with open("responses_gemini_flash.json", "r", encoding="utf-8") as f:
    gemini_data = json.load(f)


In [19]:
# Preview the first two entries under "sentences"; each entry is a list of
# sentence strings (see the output).
gemini_data["sentences"][:2]

[['Queen Suthida Bajrasudhabimalalakshana, born June 3, 1978, is the current Queen of Thailand.',
  'Before her royal marriage, she had a diverse career, initially serving as a flight attendant for Thai Airways.',
  'She later joined the Royal Thai Army in 2010, rising to the rank of general.',
  "She commanded King Vajiralongkorn's personal security unit and was appointed acting commander of the Royal Thai Aide-de-Camp Department.",
  'She married King Vajiralongkorn on May 1, 2019, and was formally crowned Queen days later, on May 4, 2019.'],
 ['Miguel Ángel Félix Gallardo, known as "El Padrino," was a co-founder of the Guadalajara Cartel in the 1980s, becoming one of Mexico\'s most powerful drug lords.',
  'He controlled much of the cocaine and marijuana trade routes into the United States.',
  'His empire crumbled after his arrest in 1989 for the murder of DEA agent Kiki Camarena.',
  "Sentenced to decades in prison, he remains incarcerated, now in a medium-security facility due to

In [20]:
claim_qa4 = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], max_calls_per_min=500, num_questions=2)
start = time.time()
result4 = await claim_qa4.evaluate(factoids=gemini_data["sentences"][:2])
stop = time.time()
print(f"Computation time: {stop - start} seconds")

Number of factoids per response:  [5, 4]


Output()

Number of total questions:  18


Length of BB result:  18
tmp:  [0.4, 0.0, 1.0, 0.8, 0.0, 0.6, 0.0, 0.0, 1.0, 0.4]
tmp_factoid_scores:  [0.2, 0.9, 0.3, 0.0, 0.7]
tmp:  [0.0, 0.6, 0.2, 0.0, 0.4, 0.6, 0.0, 0.0]
tmp_factoid_scores:  [0.3, 0.1, 0.5, 0.0]
Computation time: 4.601781845092773 seconds


In [21]:
# Keep the DataFrame in a variable so later cells can index into it, then
# display it. No prompt/response columns appear in the output because
# `evaluate` was called with factoids only.
result4_df = result4.to_df()
result4_df

Unnamed: 0,response_scores_exact_match,factoid_scores_exact_match,factoid,response_fact_question,response_fact_questions_response,response_fact_questions_sampled_response
0,0.42,"[0.2, 0.9, 0.3, 0.0, 0.7]","[Queen Suthida Bajrasudhabimalalakshana, born ...","[[ Who is the current Queen of Thailand? , Wh...","[[Queen Suthida., Suthida was born on June 3, ...","[[[Suthida Tidjai., Queen Suthida., Suthida Ti..."
1,0.225,"[0.3, 0.1, 0.5, 0.0]","[Miguel Ángel Félix Gallardo, known as ""El Pad...","[[ Who was Miguel Ángel Félix Gallardo? , Wha...","[[Mexican drug lord, founding leader of the Gu...","[[[Mexican drug lord, founder of the Guadalaja..."


In [22]:
# Factoid-level exact-match scores for the first row (index label 0):
# one score per input sentence.
result4_df["factoid_scores_exact_match"][0]

[0.2, 0.9, 0.3, 0.0, 0.7]

In [23]:
# Answers to the generated questions for the second row (index label 1),
# grouped per factoid with two answers each.
result4_df["response_fact_questions_response"][1]

[['Mexican drug lord, founding leader of the Guadalajara Cartel.',
  'Guadalajara Cartel.'],
 ['Cartels.', 'Cocaine and marijuana.'],
 ['Fall of the Berlin Wall.', 'Kiki Camarena.'],
 ['Medical facility.', 'He destabilized cartels and increased violence.']]