In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment before any
# client is constructed.
# NOTE(review): presumably this supplies the API keys for the Gemini model and
# the Tavily client created later in the notebook — confirm against the .env file.
load_dotenv()

In [None]:
from langchain.chat_models import init_chat_model
from langchain.agents import create_agent
from langgraph.checkpoint.memory import InMemorySaver
from langchain.messages import HumanMessage
from langchain_core.tools import tool
from tavily import TavilyClient
from typing import Dict, Any
import base64

In [None]:
# NOTE: the docstring below is kept verbatim on purpose. `@tool` exposes it to the
# model as the tool description, so it behaves like a runtime string, and the
# parameter name `query` is the argument name the model fills in.
@tool
def web_search(query: str) -> Dict[str, Any]:
    """Search the web for information"""
    # Hand the query to the module-level Tavily client and pass its response
    # back to the agent unchanged.
    search_results = tavily_client.search(query)
    return search_results


# --- Configuration ---------------------------------------------------------
MODEL_NAME = "gemini-2.5-flash-lite"

# Both invoke cells pass this same config, so they run under thread_id "1".
config = {"configurable": {"thread_id": "1"}}

# Instructions handed to the agent as its system prompt.
system_prompt = """You are a personal chef assistant. The user will show you the contents of their fridge 
and give you voice instructions about what they'd like to eat. Based on the available ingredients 
and the user's request, suggest recipes and cooking instructions. Use web search to find recipes if needed."""

# --- Clients ---------------------------------------------------------------
# Search client used by the `web_search` tool; constructed with no arguments.
tavily_client = TavilyClient()

# Chat model served through the Google GenAI provider.
model = init_chat_model(model=MODEL_NAME, model_provider="google_genai")

# --- Agent -----------------------------------------------------------------
# One agent with a single tool and an in-memory checkpointer.
agent = create_agent(
    model=model,
    tools=[web_search],
    system_prompt=system_prompt,
    checkpointer=InMemorySaver(),
)

## Upload fridge image

In [None]:
from ipywidgets import FileUpload
from IPython.display import display

# Single-file picker restricted to .png files. The widget must be used
# interactively before the next cell can read `uploader.value`.
uploader = FileUpload(
    accept=".png",
    multiple=False,
)
display(uploader)

In [None]:
# The upload widget only holds a value after the user has picked a file. Fail
# with an actionable message instead of a bare IndexError from `value[0]`,
# which is what a fresh "Run All" would otherwise produce.
if not uploader.value:
    raise ValueError(
        "No image uploaded yet: choose a .png file in the upload widget above, "
        "then re-run this cell."
    )

# Only one file can be selected (multiple=False), so take the first entry.
uploaded_file = uploader.value[0]

# Normalise the payload to plain bytes before base64-encoding it.
# NOTE(review): ipywidgets 8 is expected to deliver a memoryview here — confirm
# for the installed version.
content_mv = uploaded_file["content"]
img_bytes = bytes(content_mv)
img_b64 = base64.b64encode(img_bytes).decode("utf-8")

# First user turn: a short text caption plus the fridge photo. The MIME type is
# fixed to PNG, matching the widget's .png accept filter.
multimodal_question = HumanMessage(content=[
    {"type": "text", "text": "Here are the contents of my fridge"},
    {"type": "image", "base64": img_b64, "mime_type": "image/png"}
])

## Record audio instruction

In [None]:
import sounddevice as sd
from scipy.io.wavfile import write
import io
import time
from tqdm import tqdm

duration = 5  # seconds
sample_rate = 44100  # Hz

print("Recording...")
# Record 16-bit PCM. sd.rec defaults to float32, which scipy would store as an
# IEEE-float WAV: twice the payload size and less widely decodable than PCM.
audio = sd.rec(
    int(duration * sample_rate),
    samplerate=sample_rate,
    channels=1,
    dtype="int16",
)
# sd.rec returns immediately, so tick a progress bar for roughly `duration`
# seconds while the capture runs. int() keeps this working for fractional
# durations, consistent with the frame count passed to sd.rec above.
for _ in tqdm(range(int(duration * 10))):
    time.sleep(0.1)
sd.wait()  # block until the recording has actually finished
print("Done.")

# Serialise the samples to a WAV file held entirely in memory.
buf = io.BytesIO()
write(buf, sample_rate, audio)
wav_bytes = buf.getvalue()

aud_b64 = base64.b64encode(wav_bytes).decode("utf-8")

# Second user turn: a text hint plus the spoken instruction as base64 WAV.
audio_question = HumanMessage(content=[
    {"type": "text", "text": "Follow the user's audio instruction about the fridge image shown earlier"},
    {"type": "audio", "base64": aud_b64, "mime_type": "audio/wav"}
])

## Invoke agent

In [None]:
# Turn 1: send the fridge photo message to the agent under the shared config.
image_turn_input = {"messages": [multimodal_question]}
userImageResponse = agent.invoke(image_turn_input, config)

# Print the content of the last message in the returned message list.
image_reply = userImageResponse["messages"][-1]
print(image_reply.content)

In [None]:
# Turn 2: send the recorded voice instruction, reusing the same config (and
# therefore the same thread_id) as the image turn.
instruction_turn_input = {"messages": [audio_question]}
userInstructionResponse = agent.invoke(instruction_turn_input, config)

# Print the content of the last message in the returned message list.
instruction_reply = userInstructionResponse["messages"][-1]
print(instruction_reply.content)