# Demonstration of the Granite certainty intrinsic

This notebook shows the usage of the IO processor for the Granite certainty intrinsic, 
also known as the [Granite 3.2 8B Instruct Uncertainty LoRA](
    https://huggingface.co/ibm-granite/granite-uncertainty-3.2-8b-lora
)

This notebook can run its own vLLM server to perform inference, or you can host the 
models on your own server. To use your own server, set the `run_server` variable below
to `False` and set appropriate values for the constants 
`openai_base_url`, `openai_base_model_name` and `openai_lora_model_name`.

In [None]:
# Imports go here
import os

from granite_io.io.granite_3_2.input_processors.granite_3_2_input_processor import (
    Granite3Point2Inputs,
)
from granite_io import make_io_processor, make_backend
from granite_io.backend.vllm_server import LocalVLLMServer
from granite_io.io.certainty import CertaintyIOProcessor, CertaintyCompositeIOProcessor
from granite_io.io.voting import MBRDMajorityVotingProcessor

In [None]:
# Load the PDL notebook extension; the `%%pdl` cells below depend on it.
%load_ext pdl.pdl_notebook_ext

In [None]:
# Notebook-wide configuration.

# When True the notebook launches its own local vLLM server; when False it
# connects to an already-running server instead.
run_server = False

# Identifiers of the base model and of the certainty (uncertainty) LoRA adapter.
base_model_name = "ibm-granite/granite-3.2-8b-instruct"
lora_model_name = "ibm-granite/granite-uncertainty-3.2-8b-lora"

In [None]:
# API key for the hosted endpoint. It is read from the RITS_API_KEY environment
# variable when that is set, so the key does not have to be written into the
# notebook; otherwise the placeholder is used and must be edited by hand.
# Defined outside the branches because the PDL cells below reference
# `${openai_api_key}` whether or not `run_server` is set.
openai_api_key = os.environ.get("RITS_API_KEY", "XXXXXXXX")

if run_server:
    # Start by firing up a local vLLM server and connecting a backend instance to it.
    # The LoRA adapter is registered under its own model name.
    server = LocalVLLMServer(
        base_model_name, lora_adapters=[(lora_model_name, lora_model_name)]
    )
    # presumably a startup timeout in seconds -- TODO confirm
    server.wait_for_startup(200)
    lora_backend = server.make_lora_backend(lora_model_name)
    backend = server.make_backend()
else:  # if not run_server
    # Use an existing server.
    # Modify the constants here as needed; the backends below are built from them.
    openai_base_url = (
        "https://inference-3scale-apicast-production.apps.rits.fmaas.res.ibm.com"
    )
    openai_base_model_name = base_model_name
    # NOTE(review): this is the name the original cell sent for the adapter. It
    # differs from `lora_model_name`; presumably it is the name under which the
    # hosted endpoint serves the adapter -- confirm for your deployment.
    openai_lora_model_name = "ibm-granite/granite-3.2-8b-lora-uncertainty"

    # Backend for the base model.
    backend = make_backend(
        "openai",
        {
            "model_name": openai_base_model_name,
            "openai_base_url": f"{openai_base_url}/granite-3-2-8b-instruct/v1/",
            "openai_api_key": openai_api_key,
        },
    )
    # Backend for the certainty LoRA adapter, served under its own URL path.
    lora_backend = make_backend(
        "openai",
        {
            "model_name": openai_lora_model_name,
            "openai_base_url": (
                f"{openai_base_url}/granite-3-2-8b-lora-uncertainty/v1/"
            ),
            "openai_api_key": openai_api_key,
        },
    )

In [None]:
%%pdl --reset-context
# Create an example chat: an assistant greeting followed by the user's question.
# The two documents are not part of this cell; they are passed later as
# parameters of the processor calls.
lastOf:
- role: assistant
  text: Welcome to pet questions!
- Which of my pets have fleas?

In [None]:
%%pdl
# Pass the example input through Granite 3.2 to get an answer.
# `${backend}` refers to the base-model backend created in the Python setup cell.
processor:
  model: Granite 3.2
  backend: ${backend}
parameters:
    # The two documents the question is asked against.
    documents:
    - text: My dog has fleas.
    - text: My cat does not have fleas.
    generate_inputs:
        temperature: 0.0
        max_tokens: 4096
        # Send the API key as a request header.
        extra_headers: 
            RITS_API_KEY: ${openai_api_key}

In [None]:
# Append the model's output to the chat.
# No code is needed here: this is done implicitly.

In [None]:
# Instantiate the I/O processor for the certainty intrinsic. It is given
# `lora_backend`, the backend that was set up for the certainty LoRA adapter.
io_proc = CertaintyIOProcessor(lora_backend)

In [None]:
%%pdl
# Pass our example input through the certainty I/O processor (`io_proc`, created
# in the previous Python cell) and retrieve the result.
processor: ${ io_proc }
parameters:
    # Same two documents that were given to the base model.
    documents:
    - text: My dog has fleas.
    - text: My cat does not have fleas.
    generate_inputs:
        # Set temperature to 0 because we are not sampling from the intrinsic's output.
        temperature: 0.0
        # Send the API key as a request header.
        extra_headers: 
            RITS_API_KEY: ${openai_api_key}
# Name under which the result is made available; the next cell reads it.
modelResponse: chat_result

In [None]:
%%pdl
# Report the certainty score stored in `chat_result` by the previous cell.
text: >
  Certainty score for the original response is
  ${chat_result.results[0].next_message.content}
# NOTE(review): presumably this keeps the text in the cell result without adding
# it to the chat context -- confirm against the PDL documentation.
contribute: [result]

In [None]:
%%pdl
# Try with an artificial poor-quality assistant response.
lastOf:
# Hand-written assistant answer that is not supported by either document.
- role: "assistant"
  text: Your iguana is absolutely covered in fleas.
# Score that answer with the certainty I/O processor.
- processor: ${ io_proc }
  parameters:
    documents:
    - text: My dog has fleas.
    - text: My cat does not have fleas.
    generate_inputs:
        temperature: 0.0
        extra_headers: 
            RITS_API_KEY: ${openai_api_key}
  # Name under which the result is made available to the text below.
  modelResponse: chat_result_2
# Report the score.
- >
  Certainty score for the low-quality response is
  ${chat_result_2.results[0].next_message.content}


In [None]:
# Use majority voting to get a second opinion
from granite_io.io.voting import MBRDMajorityVotingProcessor

voting_proc = MBRDMajorityVotingProcessor(io_proc)
next_chat_input.generate_inputs.temperature = 0.1
chat_result_3 = await voting_proc.acreate_chat_completion(
    next_chat_input.with_addl_generate_params({"n": 10})
)
print(
    f"Certainty score with majority voting is "
    f"{chat_result_3.results[0].next_message.content}"
)

In [None]:
# Use the composite processor to generate multiple completions and filter those that
# are below a certainty threshold
composite_proc = CertaintyCompositeIOProcessor(
    granite_io_proc, lora_backend, threshold=0.8, include_score=True
)
composite_results = await composite_proc.acreate_chat_completion(
    chat_input.with_addl_generate_params({"n": 5, "temperature": 1.0})
)
composite_results.results

In [None]:
# Change the certainty threshold and try again
composite_proc.update_threshold(0.9)
composite_results_2 = await composite_proc.acreate_chat_completion(
    chat_input.with_addl_generate_params({"n": 5, "temperature": 1.0})
)
composite_results_2.results

In [None]:
# Free up GPU resources.
# `server` only exists when this notebook started its own vLLM server, so check
# the notebook namespace before shutting it down.
if "server" in globals():
    server.shutdown()