In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so that
# credentials (e.g. HUGGINGFACEHUB_API_TOKEN, read from os.environ further down)
# do not have to be written into the notebook itself.
# This call is the cell's last expression, so its return value is displayed.
load_dotenv()

True

# OpenAI

In [2]:
from nemoguardrails import LLMRails, RailsConfig

# Load the guardrails configuration stored in ./hallucination_config/openai
# (path is relative to the notebook's working directory).
config = RailsConfig.from_path("./hallucination_config/openai")

# Build the rails application from that configuration. `app` is the object the
# query cells below call. verbose=True presumably turns on detailed logging — confirm.
app = LLMRails(config, verbose=True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Single-turn conversation: one user question and no earlier messages.
history = [{"role": "user", "content": "How many CUDA cores does a 4090 have?"}]
# Top-level `await` is valid here because the notebook kernel runs an event loop.
bot_message = await app.generate_async(messages=history)
# The reply is indexed like a message dict; only its text is printed.
print(bot_message['content'])

The NVIDIA GeForce RTX 4090 has 5,376 CUDA cores.


## Adding the hallucination rail

In [4]:
# Re-load the guardrails configuration from ./hallucination_config/openai.
# NOTE(review): this is the same path as the earlier OpenAI setup cell, so the
# hallucination rail is presumably enabled by editing the files in that directory
# between the two runs — confirm. A Restart & Run All would load the same files
# in both cells.
config = RailsConfig.from_path("./hallucination_config/openai")

# Rebuild the rails application so `app` reflects the re-loaded configuration.
app = LLMRails(config, verbose=True)

In [5]:
# Ask the same single-turn question again, this time against the rebuilt `app`.
history = [{"role": "user", "content": "How many CUDA cores does a 4090 have?"}]
# Top-level `await` is valid here because the notebook kernel runs an event loop.
bot_message = await app.generate_async(messages=history)
# Print only the text of the returned message.
print(bot_message['content'])

The NVIDIA GeForce RTX 4090 has 8704 CUDA cores. However, this may be subject to change depending on the model and manufacturer.
The previous answer is prone to hallucination and may not be accurate. Please double check the answer using additional sources.


# Llama2

In [2]:
import huggingface_hub
# Log in to the Hugging Face Hub with the token taken from the environment.
# os.environ[...] raises KeyError if HUGGINGFACEHUB_API_TOKEN is not set.
huggingface_hub.login(token=os.environ["HUGGINGFACEHUB_API_TOKEN"])


from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline

# Tokenizer and model are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")

# Text-generation pipeline with sampling enabled. The keyword arguments below
# are fixed at construction time and apply to every call made through `llm`.
pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer,
    max_length=4096,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    logprobs=None,
    top_k=40,
    repetition_penalty=1.1
)

# Wrap the transformers pipeline as a LangChain LLM object.
llm = HuggingFacePipeline(pipeline=pipe)

  from .autonotebook import tqdm as notebook_tqdm


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/sofiaperez/.cache/huggingface/token
Login successful


Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.09s/it]


In [3]:
from nemoguardrails.llm.helpers import get_llm_instance_wrapper
from nemoguardrails.llm.providers import register_llm_provider
from nemoguardrails import LLMRails, RailsConfig

# Wrap the already-built `llm` instance in a provider class tagged
# "hf_pipeline_llama2".
HFPipeline = get_llm_instance_wrapper(
    llm_instance=llm, llm_type="hf_pipeline_llama2"
)

# Register the wrapper under that same name (presumably the engine name the
# llama2 config refers to — confirm against ./hallucination_config/llama2).
register_llm_provider("hf_pipeline_llama2", HFPipeline)

# Load the guardrails configuration stored in ./hallucination_config/llama2.
config = RailsConfig.from_path("./hallucination_config/llama2")

# Build the rails application; `app` is what the query cells call.
app = LLMRails(config, verbose = True)

In [8]:
# Single-turn conversation sent to the Llama 2-backed `app`.
history = [{"role": "user", "content": "How many CUDA cores does a 4090 have?"}]
# Top-level `await` is valid here because the notebook kernel runs an event loop.
bot_message = await app.generate_async(messages=history)
# Print only the text of the returned message.
print(bot_message['content'])

Parameter temperature does not exist for WrapperLLM
Parameter temperature does not exist for WrapperLLM


A 4090 GPU has 88 RT cores, 46 Tensor cores, and 160 CUDA cores.


## Adding the hallucination rail

In [6]:
from nemoguardrails.llm.helpers import get_llm_instance_wrapper
from nemoguardrails.llm.providers import register_llm_provider
from nemoguardrails import LLMRails, RailsConfig

# Wrap the current `llm` instance in a provider class tagged "hf_pipeline_llama2".
# NOTE(review): this cell repeats the earlier Llama 2 registration verbatim; any
# change in behavior presumably comes from edits to the files under
# ./hallucination_config/llama2 — confirm.
HFPipeline = get_llm_instance_wrapper(
    llm_instance=llm, llm_type="hf_pipeline_llama2"
)

# Register the wrapper under the provider name "hf_pipeline_llama2".
register_llm_provider("hf_pipeline_llama2", HFPipeline)

# Re-load the guardrails configuration stored in ./hallucination_config/llama2.
config = RailsConfig.from_path("./hallucination_config/llama2")

# Rebuild the rails application so `app` reflects the re-loaded configuration.
app = LLMRails(config, verbose = True)

In [7]:
# Same single-turn question, sent to the rebuilt Llama 2 `app`.
history = [{"role": "user", "content": "How many CUDA cores does a 4090 have?"}]
# Top-level `await` is valid here because the notebook kernel runs an event loop.
bot_message = await app.generate_async(messages=history)
# Print only the text of the returned message.
print(bot_message['content'])

Parameter temperature does not exist for WrapperLLM
Parameter temperature does not exist for WrapperLLM
Hallucination rail can only be used with OpenAI LLM engines.Current LLM engine is WrapperLLM.


Hallucination rail can only be used with OpenAI LLM engines.Current LLM engine is WrapperLLM.
According to my knowledge base, the Nvidia GeForce RTX 4090 has 48 CUDA cores.


## Changing the hallucination.py check to work with Llama 2

In [3]:
# Rebuild the text-generation pipeline with a shorter max_length (1024 rather
# than the 4096 used earlier) and a lower temperature (0.1 rather than 0.2).
# `model`, `tokenizer`, `pipeline` and `HuggingFacePipeline` are not defined in
# this cell; they must already exist in the kernel from the loading cell.
pipe = pipeline(
    "text-generation",
    model=model, 
    tokenizer=tokenizer,
    max_length=1024,
    do_sample=True,
    temperature=0.1,
    top_p=0.95,
    logprobs=None,
    top_k=40,
    repetition_penalty=1.1
)

# Replace the global `llm` with a LangChain wrapper around the new pipeline.
llm = HuggingFacePipeline(pipeline=pipe)

In [4]:
from nemoguardrails.llm.helpers import get_llm_instance_wrapper
from nemoguardrails.llm.providers import register_llm_provider
from nemoguardrails import LLMRails, RailsConfig


# Wrap the current global `llm` in a provider class tagged "hf_pipeline_llama2".
HFPipeline = get_llm_instance_wrapper(
    llm_instance=llm, llm_type="hf_pipeline_llama2"
)

# Register the wrapper under the provider name "hf_pipeline_llama2".
register_llm_provider("hf_pipeline_llama2", HFPipeline)


# Load the guardrails configuration stored in ./hallucination_config/llama2.
config = RailsConfig.from_path("./hallucination_config/llama2")

# Build the rails application; the custom action is registered on this `app`
# in a later cell.
app = LLMRails(config, verbose = True)

In [9]:
import logging
from typing import Optional

from langchain import LLMChain, PromptTemplate
from langchain.llms.base import BaseLLM

from nemoguardrails.actions.llm.utils import (
    get_multiline_response,
    llm_call,
    strip_quotes,
)
from nemoguardrails.llm.params import llm_params
from nemoguardrails.llm.taskmanager import LLMTaskManager
from nemoguardrails.llm.types import Task

# Module-level logger. Note that the check defined in this cell reports through
# print() rather than through this logger.
log = logging.getLogger(__name__)

# Number of additional completions sampled for the self-consistency comparison.
HALLUCINATION_NUM_EXTRA_RESPONSES = 2


async def check_hallucination_llama(
    llm_task_manager: LLMTaskManager,
    context: Optional[dict] = None,
    llm: Optional[BaseLLM] = None,
    use_llm_checking: bool = True,
):
    """Check the last bot response for hallucination via self-consistency.

    The last user message is sent to ``llm`` again
    ``HALLUCINATION_NUM_EXTRA_RESPONSES`` times. The extra completions are then
    passed, together with the original bot response, to the
    ``Task.CHECK_HALLUCINATION`` prompt, and the model's answer to that prompt
    decides the verdict.

    :param llm_task_manager: Used to render the ``Task.CHECK_HALLUCINATION`` prompt.
    :param context: Conversation context; ``last_bot_message`` and
        ``last_user_message`` are read from it. ``None`` is treated as empty.
    :param llm: LLM used both to sample the extra responses and to judge agreement.
    :param use_llm_checking: When False, no LLM call is made and False is returned
        (only the LLM-based agreement check is implemented).
    :return: True if hallucination is detected, False otherwise.
    """
    # `context` defaults to None, so it must not be dereferenced directly.
    context = context or {}
    bot_response = context.get("last_bot_message")
    last_user_message_string = context.get("last_user_message")

    if not bot_response or not last_user_message_string:
        print("No bot response or user message available for hallucination check.")
        return False

    # Only the LLM-based agreement check is supported; without it the result is
    # always False, so skip the sampling calls entirely.
    if not use_llm_checking:
        return False

    num_responses = HALLUCINATION_NUM_EXTRA_RESPONSES

    # Pass-through prompt: the user message is sent to the LLM as-is, once per
    # extra response (the completions are sampled one call at a time).
    last_bot_prompt = PromptTemplate(template="{text}", input_variables=["text"])
    chain = LLMChain(prompt=last_bot_prompt, llm=llm)

    extra_responses = []
    for _ in range(num_responses):
        completion = chain.run(last_user_message_string)
        completion = strip_quotes(get_multiline_response(completion))
        # Keep only non-empty completions so the count checks below are meaningful.
        if completion:
            extra_responses.append(completion)

    if len(extra_responses) == 0:
        print(f"No extra LLM responses were generated for '{bot_response}' hallucination check.")
        return False
    elif len(extra_responses) < num_responses:
        print(f"Requested {num_responses} extra LLM responses for hallucination check, "
            f"received {len(extra_responses)}.")

    prompt = llm_task_manager.render_task_prompt(
        task=Task.CHECK_HALLUCINATION,
        context={
            "statement": bot_response,
            "paragraph": " ".join(extra_responses),
        },
    )

    # No parameter overrides are requested here: a `temperature=0.0` override was
    # commented out in the original, presumably because the wrapped LLM does not
    # expose that parameter — confirm before re-adding it.
    with llm_params(llm):
        agreement = await llm_call(llm, prompt)

    agreement = agreement.lower().strip()
    print(f"Agreement result for looking for hallucination is {agreement}.")
    # Hallucination is reported when the judge's answer contains "no".
    # NOTE(review): this is a substring test, so answers such as "not sure" or
    # "unknown" also count as "no".
    return "no" in agreement

# Make the function available to the rails application under the action name
# "check_hallucination_llama" (presumably the name the llama2 flows call — confirm).
app.register_action(check_hallucination_llama, name="check_hallucination_llama") 

In [10]:
# Same single-turn question, sent to the `app` that now has the custom
# check_hallucination_llama action registered.
history = [{"role": "user", "content": "How many CUDA cores does a 4090 have?"}]
# Top-level `await` is valid here because the notebook kernel runs an event loop.
bot_message = await app.generate_async(messages=history)
# Print only the text of the returned message.
print(bot_message['content'])

Parameter temperature does not exist for WrapperLLM
Parameter temperature does not exist for WrapperLLM


Agreement result for looking for hallucination is no.
A GeForce RTX 4090 has 24 GB of GDDR6X memory and 5888 CUDA cores.
The previous answer is prone to hallucination and may not be accurate. Please double check the answer using additional sources.
