In [None]:
%pip install -q -U langchain langchain_openai langchain_core langgraph langfuse dotenv

## Langfuse Tracing

In [None]:
from my_utils import setup_langfuse_tracer
from my_config import MyConfig

# Handler object that later cells pass to LangChain/LangGraph calls for tracing.
langfuse_handler = setup_langfuse_tracer()
# Project configuration object; OPENAI_API_KEY is read from it in the next section.
my_config = MyConfig()

## OpenAI Authentication

In [None]:
import os

# Please set up your own key.
os.environ["OPENAI_API_KEY"] = my_config.OPENAI_API_KEY

## Get the LLM Models

In [None]:
from langchain_openai import ChatOpenAI
# Chat model that extract_text calls to read text out of an image.
vision_llm = ChatOpenAI(model="gpt-4o")
# Chat model that gets the tools bound to it and is invoked by the assistant node.
llm = ChatOpenAI(model="gpt-4o")
#.achat, .achatstream

## Define the Tools

In [None]:
import base64
import mimetypes

from langchain_core.messages import HumanMessage

def extract_text(img_path: str) -> str:
    """
    Extract text from an image file using a multimodal model.

    The file is read from disk, base64-encoded, wrapped in a data URL and sent
    to ``vision_llm`` together with an instruction to return only the text
    found in the image.

    Args:
        img_path: Path to a local image file.

    Returns:
        The model's reply with surrounding whitespace stripped, or an empty
        string if anything goes wrong (the error is printed, not raised).
    """
    try:
        # Read image and encode as base64
        with open(img_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

        # Label the payload with the file's own image type instead of always
        # claiming PNG. Fall back to PNG when the extension is unknown or does
        # not map to an image type, which keeps the previous behaviour.
        mime_type, _ = mimetypes.guess_type(img_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"

        # Prepare the prompt including the base64 image data
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Extract all the text from this image. "
                            "Return only the extracted text, no explanations."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision-capable model.
        # NOTE(review): "langfuse_handler" is not the config key the other
        # cells use; the graph invocations pass the handler as
        # config={"callbacks": [langfuse_handler]}. Confirm whether this call
        # should do the same or rely on callbacks inherited from the graph run.
        response = vision_llm.invoke(message, config={"langfuse_handler": langfuse_handler})

        return response.content.strip()
    except Exception as e:
        # Best-effort tool: report the problem and hand back an empty string
        # rather than raising.
        error_msg = f"Error extracting text: {str(e)}"
        print(error_msg)
        return ""
    

def divide(a: int, b: int) -> float:
    """Divide a by b and return the quotient (a / b)."""
    quotient = a / b
    return quotient


def sum(a: int, b: int) -> float:
    """Add a and b and return the total (a + b)."""
    # This name replaces the builtin sum() for the rest of the notebook; it is
    # kept because it is the name the tool is registered under.
    total = a + b
    return total


def multiply(a: int, b: int) -> float:
    """Multiply a by b and return the product (a * b)."""
    product = a * b
    return product


def substract(a: int, b: int) -> float:
    """Subtract b from a and return the difference (a - b)."""
    difference = a - b
    return difference

## Bind tools to LLM

In [None]:
# Every callable the agent may invoke: the four arithmetic helpers plus the
# image text-extraction tool.
tools = [divide, sum, multiply, substract, extract_text]

# Bind the tools to the chat model; parallel_tool_calls=False is forwarded to
# bind_tools along with the tool list.
llm_with_tools = llm.bind_tools(tools, parallel_tool_calls=False)

## Defining Agent's State
This state is a little more complex than the previous ones we have seen. `AnyMessage` is a LangChain type that covers every kind of chat message, and `add_messages` is a reducer that appends new messages to the existing list instead of overwriting it.

This is a new concept in LangGraph, where you can add operators in your state to define the way they should interact together.

In [None]:
from typing import TypedDict, Annotated, Optional
from langchain_core.messages import AnyMessage
from langgraph.graph.message import add_messages

class AgentState(TypedDict):
    """State dictionary used by the graph; the assistant node receives it as `state`."""
    # Path of the document supplied with the request, or None when there is none.
    input_file: Optional[str]  # Contains file path (PDF/PNG)
    # Conversation history. The add_messages annotation makes new messages get
    # added to the existing list rather than overwrite it.
    messages: Annotated[list[AnyMessage], add_messages]

## The nodes

In [31]:
from langchain_core.messages import SystemMessage

def assistant(state: AgentState):
    """
    Graph node that asks the tool-enabled model for the next message.

    A system prompt is built from a textual description of tools and the path
    of the currently loaded file, placed in front of the conversation history,
    and sent to ``llm_with_tools``.

    Args:
        state: Current agent state. ``messages`` is required; ``input_file``
            may be absent or None.

    Returns:
        A state update containing the model's response under ``messages`` and
        the (possibly None) ``input_file`` value.
    """
    # NOTE(review): this description covers only extract_text and divide, while
    # five tools are bound to the model; confirm whether the others should be
    # listed here as well.
    textual_description_of_tool="""
extract_text(img_path: str) -> str:
    Extract text from an image file using a multimodal model.

    Args:
        img_path: A local image file path (strings).

    Returns:
        A single string containing the concatenated text extracted from each image.

divide(a: int, b: int) -> float:
    Divide a and b
    Args:
        a: The numerator (int).
        b: The denominator (int).
    Returns:
        The result of the division (float).
"""
    # .get() instead of indexing: a caller that leaves input_file out of the
    # initial state should get the same prompt as one that passes None, not a
    # KeyError.
    image = state.get("input_file")
    sys_msg = SystemMessage(content=f"You are a helpful butler named Alfred that serves Mr. Wayne and Batman. You can analyse documents and run computations with provided tools:\n{textual_description_of_tool} \n You have access to some optional images. Currently the loaded image is: {image}")

    response = llm_with_tools.invoke([sys_msg] + state["messages"])
    return {
        "messages": [response],
        "input_file": image
    }

## The ReAct Pattern: How I Assist Mr. Wayne
Allow me to explain the approach in this agent. The agent follows what’s known as the ReAct pattern (Reason-Act-Observe):

1. **Reason** about his documents and requests
2. **Act** by using appropriate tools
3. **Observe** the results
4. **Repeat** as necessary until I’ve fully addressed his needs

In [None]:
from langgraph.graph import START, StateGraph
from langgraph.prebuilt import ToolNode, tools_condition
from IPython.display import Image, display

# Build the graph on top of the AgentState schema.
builder = StateGraph(AgentState)

# Define nodes: these do the work.
# "assistant" calls the tool-enabled model; "tools" wraps the tool list in a ToolNode.
builder.add_node("assistant", assistant)
builder.add_node("tools", ToolNode(tools))

# Define edges: these determine how the control flow moves.
# Every run starts at the assistant node.
builder.add_edge(START, "assistant")
builder.add_conditional_edges(
    "assistant",
    # If the latest message requires a tool, route to tools
    # Otherwise, provide a direct response
    tools_condition,
)
# After the tools node runs, control goes back to the assistant, forming the loop.
builder.add_edge("tools", "assistant")
react_graph = builder.compile()

# Render the compiled graph as a Mermaid PNG and display it inline.
display(Image(react_graph.get_graph(xray=True).draw_mermaid_png()))

We define a tools node with our list of tools. The assistant node is just our model with bound tools. We create a graph with assistant and tools nodes.

We add a `tools_condition` edge, which routes to END or to tools based on whether the assistant calls a tool.

Now, we add one new step:

We connect the tools node back to the assistant, forming a loop.

- After the assistant node executes, tools_condition checks if the model’s output is a tool call.
- If it is a tool call, the flow is directed to the tools node.
- The tools node connects back to assistant.
- This loop continues as long as the model decides to call tools.
- If the model response is not a tool call, the flow is directed to END, terminating the process.

## The Butler in Action
### Example 1: Simple Calculations

Here is an example to show a simple use case of an agent using a tool in LangGraph.

In [34]:
# Ask the agent to evaluate the expression for A; the Langfuse handler is
# attached as a callback for this run.
messages = react_graph.invoke(
    {
        "messages": [
            HumanMessage(
                content="Assign evaluated value of given solution to variable A: ((20 * 10) + (87 * 100) / 50)"
            )
        ],
        "input_file": None,
    },
    config={"callbacks": [langfuse_handler]},
)

# `messages` now holds the state returned by the graph; print the conversation.
for m in messages["messages"]:
    m.pretty_print()

content='' additional_kwargs={'tool_calls': [{'id': 'call_xL5nVaQhW9ph6ffEY5cHaEp4', 'function': {'arguments': '{"a":20,"b":10}', 'name': 'multiply'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 324, 'total_tokens': 341, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_a288987b44', 'id': 'chatcmpl-Bmm8wVRFIPzMLYNJrIkIELcOpzYvv', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='run--ef78811c-2e42-4fe0-a6fb-e17442fb1ae5-0' tool_calls=[{'name': 'multiply', 'args': {'a': 20, 'b': 10}, 'id': 'call_xL5nVaQhW9ph6ffEY5cHaEp4', 'type': 'tool_call'}] usage_metadata={'input_tokens': 324, 'output_tokens': 17, 'total_tokens': 341, 'input_token_details': {'audio': 0, 'cache_read': 0},

In [35]:
from functools import reduce
# Continue the same conversation: send the history returned by the previous run
# plus the new question. Plain list concatenation is enough here because the
# state's add_messages reducer merges the incoming list itself. The Langfuse
# handler is passed as a callback, matching the other example invocations.
messages = react_graph.invoke(
    {
        "messages": messages["messages"]
        + [HumanMessage(content="Assign evaluated value of given solution to variable B: ((20 * 10) + (87 * 100) / 100)")],
        "input_file": None,
    },
    config={"callbacks": [langfuse_handler]},
)

content='' additional_kwargs={'tool_calls': [{'id': 'call_n5ZKrRuleaUpBgbGBomPzQQ2', 'function': {'arguments': '{"a":8700,"b":100}', 'name': 'divide'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 18, 'prompt_tokens': 487, 'total_tokens': 505, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_a288987b44', 'id': 'chatcmpl-Bmm9VXF74Cuawr6HUCR2qe2DdnCv6', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='run--0641247d-8546-4ec8-9993-5df7174494d1-0' tool_calls=[{'name': 'divide', 'args': {'a': 8700, 'b': 100}, 'id': 'call_n5ZKrRuleaUpBgbGBomPzQQ2', 'type': 'tool_call'}] usage_metadata={'input_tokens': 487, 'output_tokens': 18, 'total_tokens': 505, 'input_token_details': {'audio': 0, 'cache_read': 0

In [36]:
# Third turn of the same conversation: send the accumulated history plus the
# final question. Plain list concatenation is enough here because the state's
# add_messages reducer merges the incoming list itself. The Langfuse handler is
# passed as a callback, matching the other example invocations.
messages = react_graph.invoke(
    {
        "messages": messages["messages"] + [HumanMessage(content="What is A + B")],
        "input_file": None,
    },
    config={"callbacks": [langfuse_handler]},
)

content='' additional_kwargs={'tool_calls': [{'id': 'call_wrqgK9iLIAxyxLGKlsdMrNUE', 'function': {'arguments': '{"a":374,"b":287}', 'name': 'sum'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 577, 'total_tokens': 594, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_a288987b44', 'id': 'chatcmpl-Bmm9pDrblKuUmpxzEWccnwe8Ts1QH', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='run--d49f3d50-d0c1-4322-bb0d-4c4352bd7bbc-0' tool_calls=[{'name': 'sum', 'args': {'a': 374, 'b': 287}, 'id': 'call_wrqgK9iLIAxyxLGKlsdMrNUE', 'type': 'tool_call'}] usage_metadata={'input_tokens': 577, 'output_tokens': 17, 'total_tokens': 594, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'outp

### Example 2: Analyzing Master Wayne’s Training Documents
When Master Wayne leaves his training and meal notes:

In [37]:
# with open("data/Batman_training_and_meals.png", "rb") as img_file:
#     img_base64 = base64.b64encode(img_file.read()).decode("utf-8")

# messages = [HumanMessage(
#     content=[
#         {
#             "type": "text",
#             "text": (
#                 "According to the note provided by Mr. Wayne in the provided images. What's the list of items I should buy for the dinner menu?"
#             ),
#         },
#         {
#             "type": "image_url",
#             "image_url": {
#                 "url": f"data:image/png;base64,{img_base64}"
#             },
#         },
#     ]
# )]
# messages = react_graph.invoke(
#     {
#         "messages": messages
#         , "input_file": None
#     }
#     , config={"callbacks": [langfuse_handler]}
# )

# The question goes in as a plain text message; the note's path travels in
# input_file, which the assistant node writes into its system prompt.
messages = [
    HumanMessage(
        content="According to the note provided by Mr. Wayne in the provided images. What's the list of items I should buy for the dinner menu?"
    )
]
response = react_graph.invoke(
    {"messages": messages, "input_file": "data/Batman_training_and_meals.png"},
    config={"callbacks": [langfuse_handler]},
)

# Print every message of the finished run.
for m in response["messages"]:
    m.pretty_print()

content='' additional_kwargs={'tool_calls': [{'id': 'call_VTMbGY8ZtGbHVyoQin9S8A4P', 'function': {'arguments': '{"img_path":"data/Batman_training_and_meals.png"}', 'name': 'extract_text'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 23, 'prompt_tokens': 332, 'total_tokens': 355, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_a288987b44', 'id': 'chatcmpl-BmmCimr2IsLxVykpBFNRHihex8RET', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='run--f794688b-2f48-4f90-8caf-1f4beefb1250-0' tool_calls=[{'name': 'extract_text', 'args': {'img_path': 'data/Batman_training_and_meals.png'}, 'id': 'call_VTMbGY8ZtGbHVyoQin9S8A4P', 'type': 'tool_call'}] usage_metadata={'input_tokens': 332, 'output_tokens': 23, 