# RAG

In [1]:
import torch
import os
import uuid
import datetime

from langchain_huggingface import HuggingFacePipeline, ChatHuggingFace
from langchain.tools import tool

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TransformersEngine, CodeAgent, BitsAndBytesConfig
from langchain_core.utils.function_calling import convert_to_openai_function
from langchain_core.messages import SystemMessage
from langchain.schema import AIMessage, HumanMessage

from lightning import Fabric
from peft import LoraConfig, get_peft_model, PeftModelForCausalLM, PeftModel

from IPython.display import display, Markdown, Image, SVG
from os import walk



### Set mixed precision

In [2]:
torch.set_float32_matmul_precision("medium")
fabric = Fabric(accelerator="cuda", devices=1, precision="bf16-mixed")
device = fabric.device
fabric.launch()

Using bfloat16 Automatic Mixed Precision (AMP)


In [3]:
!nvidia-smi

Sun Apr 27 15:27:38 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.86.15              Driver Version: 570.86.15      CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4080        Off |   00000000:01:00.0  On |                  N/A |
|  0%   54C    P5             27W /  340W |    9589MiB /  16376MiB |     57%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
model_name = "Salesforce/Llama-xLAM-2-8b-fc-r"
# model_name = "NousResearch/Hermes-3-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, load_in_4bit=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

https://python.langchain.com/docs/how_to/tools_prompting/

In [5]:
@tool
def get_weather(city: str) -> str:
    """
    Get the current weather for a city.
    Args:
        city (str): The name of the city.
        
    Returns:
        str: The current weather in the city.
    """
    return f"The weather in {city} is sunny."

@tool
def add(x: int, y: int) -> int:
    "Add two numbers."
    return x + y

@tool
def multiply(x: float, y: float) -> float:
    """Multiply two numbers together."""
    return x * y

tools = [
    get_weather,
    add,
    multiply,
]

for t in tools:
    print("--")
    print(t.name)
    print(t.description)
    print(t.args)

--
get_weather
Get the current weather for a city.
Args:
    city (str): The name of the city.

Returns:
    str: The current weather in the city.
{'city': {'title': 'City', 'type': 'string'}}
--
add
Add two numbers.
{'x': {'title': 'X', 'type': 'integer'}, 'y': {'title': 'Y', 'type': 'integer'}}
--
multiply
Multiply two numbers together.
{'x': {'title': 'X', 'type': 'number'}, 'y': {'title': 'Y', 'type': 'number'}}


In [6]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import render_text_description

rendered_tools = render_text_description(tools)
print(rendered_tools)

get_weather(city: str) -> str - Get the current weather for a city.
Args:
    city (str): The name of the city.

Returns:
    str: The current weather in the city.
add(x: int, y: int) -> int - Add two numbers.
multiply(x: float, y: float) -> float - Multiply two numbers together.


In [7]:
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=2048,
    top_k=10,
    device_map="auto"
)

llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cuda:0


In [8]:
system_prompt = SystemMessage(f"""\
You are an assistant that has access to the following set of tools. 
Here are the names and descriptions for each tool:

{rendered_tools}

Given the user input, return the name and input of the tool to use. 
Return your response as a JSON blob with 'name' and 'arguments' keys.

The `arguments` should be a dictionary, with keys corresponding 
to the argument names and the values corresponding to the requested values.
""")

chat = ChatHuggingFace(llm=llm, tokenizer=tokenizer).bind_tools(tools)

In [9]:
query = "what's 3 plus 1132?"

messages = [system_prompt, HumanMessage(query)]

message = chat.invoke(messages)

if isinstance(message, str):
    print(message)
else:  # Otherwise it's a chat model
    print(message.content)

{"name": "add", "arguments": {"x": 3, "y": 1132}}


In [10]:
from langchain_core.output_parsers import JsonOutputParser

chain = chat | JsonOutputParser()
chain.invoke(messages)

{'name': 'add', 'arguments': {'x': 3, 'y': 1132}}

In [11]:
from typing import Any, Dict, Optional, TypedDict

from langchain_core.runnables import RunnableConfig
from langchain_core.runnables import RunnablePassthrough

class ToolCallRequest(TypedDict):
    """A typed dict that shows the inputs into the invoke_tool function."""

    name: str
    arguments: Dict[str, Any]


def invoke_tool(
    tool_call_request: ToolCallRequest, config: Optional[RunnableConfig] = None
):
    """A function that we can use the perform a tool invocation.

    Args:
        tool_call_request: a dict that contains the keys name and arguments.
            The name must match the name of a tool that exists.
            The arguments are the arguments to that tool.
        config: This is configuration information that LangChain uses that contains
            things like callbacks, metadata, etc.See LCEL documentation about RunnableConfig.

    Returns:
        output from the requested tool
    """
    print("Tool call request:", tool_call_request)
    
    if isinstance(tool_call_request, list):
        tool_call_request = tool_call_request[-1]
        
    tool_name_to_tool = {tool.name: tool for tool in tools}
    name = tool_call_request["name"]
    requested_tool = tool_name_to_tool[name]
    return requested_tool.invoke(tool_call_request["arguments"], config=config)

In [12]:
chain = chat | JsonOutputParser() | invoke_tool

query = "what's the weather in San Francisco?"

messages = [system_prompt, HumanMessage(query)]

response = chain.invoke(messages)
print(response)

Tool call request: [{'name': 'get_weather', 'arguments': {'city': 'San Francisco'}}]
The weather in San Francisco is sunny.
