# Claim-based Question-Answer Hallucination Detection

Import necessary packages.

In [1]:
import time
from uqlm.longform.black_box import ClaimQAScorer
from uqlm import BlackBoxUQ

#### Load LLM and device

In [2]:
# from langchain_google_vertexai import ChatVertexAI
# llm = ChatVertexAI(model="gemini-1.5-flash")

from dotenv import load_dotenv, find_dotenv
from langchain_openai import AzureChatOpenAI

# Load environment variables from the .env file located by find_dotenv()
# before constructing the client.
# NOTE(review): presumably the Azure endpoint/key are read from these
# variables, since none are passed explicitly here - confirm.
load_dotenv(find_dotenv())

# Chat model used for every call in this notebook.
llm = AzureChatOpenAI(
    openai_api_type="azure",
    openai_api_version="2024-02-15-preview",
    deployment_name="gpt-4o-mini",
    temperature=1,  # sampling temperature; adjust to suit your use case
)

In [3]:
import torch

# Choose the torch device in priority order:
# CUDA (NVIDIA GPU) -> MPS (macOS) -> CPU fallback.
# The chained conditional short-circuits, so the MPS check only runs
# when CUDA is unavailable.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device.type} device")

Using mps device


#### Setup Prompts and Black Box Scorer

In [4]:
# Prompts to be answered and scored; each asks for a paragraph-length
# answer. Add or replace entries to score other prompts.
prompts = [
    "write a paragraph about Paul McCartney",
    "write a paragraph about John Lennon",
]

In [5]:
# Stand-alone black-box scorer configured with the "exact_match" scorer.
# NOTE(review): no later cell visible in this notebook references
# `bb_scorer` (the ClaimQAScorer instances below are built directly from
# `llm`) - confirm whether this cell is still needed.
bb_scorer = BlackBoxUQ(
    llm=llm,
    scorers=["exact_match"],
    device=device,
    max_calls_per_min=500,  # throttle requests to avoid rate-limit errors
)

#### Claim-QA class

There are three methods that can be used to compute the Claim-QA score.
- `generate_and_score`: If you only have prompts, call this method to generate a long response for each prompt, decompose each response into factoids, generate questions for each factoid, and compute question-level, factoid-level, and response-level scores.
- `score`: If you have already generated the long responses, call this method.
- `evaluate`: If you have already decomposed the long responses into factoids, call this method.

#### 1. `generate_and_score` method

In [6]:
start_time = time.time()
claim_qa = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], response_template="atomic", max_calls_per_min=500)
result = await claim_qa.generate_and_score(prompts=prompts)
print(f"Computation time: {time.time() - start_time} seconds")

Computation time: 140.32325506210327 seconds


In [7]:
# Display the results as a DataFrame: one row per prompt, with the response,
# the response-level score and the list of factoid-level scores.
result.to_df()

Unnamed: 0,prompt,response,response_scores_exact_match,factoid_scores_exact_match
0,write a paragraph about Paul McCartney,Paul McCartney is a legendary British musician...,0.215517,"[0.0, 0.5, 0.25, 0.0, 0.0, 0.75, 0.0, 0.0, 0.7..."
1,write a paragraph about John Lennon,John Lennon was a quintessential figure in 20t...,0.152778,"[0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


#### 2. `score` method

In [8]:
claim_qa2 = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], response_template="atomic", max_calls_per_min=500)
result2 = await claim_qa2.score(prompts=claim_qa.prompts, responses=claim_qa.responses)

In [9]:
# Display the `score` results as a DataFrame (same columns as the
# `generate_and_score` output).
result2.to_df()

Unnamed: 0,prompt,response,response_scores_exact_match,factoid_scores_exact_match
0,write a paragraph about Paul McCartney,Paul McCartney is a legendary British musician...,0.163462,"[0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.75, 1.0, 0.25..."
1,write a paragraph about John Lennon,John Lennon was a quintessential figure in 20t...,0.2,"[0.0, 0.25, 0.5, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


#### 3. `evaluate` method

In [10]:
claim_qa3 = ClaimQAScorer(llm=llm, black_box_scorers=["exact_match"], response_template="atomic", max_calls_per_min=500)
result3 = await claim_qa3.evaluate(prompts=claim_qa.prompts, responses=claim_qa.responses, factoids=claim_qa.factoids)

In [11]:
# Display the `evaluate` results as a DataFrame (same columns as the
# `generate_and_score` output).
result3.to_df()

Unnamed: 0,prompt,response,response_scores_exact_match,factoid_scores_exact_match
0,write a paragraph about Paul McCartney,Paul McCartney is a legendary British musician...,0.284483,"[0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 0.5, 1.0, 1.0, ..."
1,write a paragraph about John Lennon,John Lennon was a quintessential figure in 20t...,0.180556,"[0.0, 0.25, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
