# Multimodal Agents Workshop with Progress Tracking

This enhanced version of the multimodal agents workshop includes comprehensive progress tracking, validation, and interactive guidance using the `jupyter-lab-progress` library.

**Workshop Overview:**
- Build a multimodal AI agent that can analyze documents and images
- Use MongoDB Atlas Vector Search for retrieval
- Implement function calling with Gemini 2.0 Flash
- Add memory and ReAct reasoning capabilities

## 🎯 Learning Objectives
By the end of this workshop, you will be able to:
- Process PDFs and extract images for multimodal search
- Set up MongoDB Atlas vector search indexes
- Build an AI agent with tool calling capabilities
- Implement session-based memory for conversational agents
- Create a ReAct (Reasoning + Acting) agent architecture

In [None]:
# Initialize progress tracking and lab utilities
import sys
import os

# Prefer a local development checkout of the library when one is present
dev_path = "/Users/michael.lynn/code/mongodb/developer-days/jupyter-utils/jupyter-lab-progress"
if dev_path not in sys.path and os.path.exists(dev_path):
    sys.path.insert(0, dev_path)

# Evict already-imported copies so the import below resolves the package afresh
modules_to_remove = [name for name in sys.modules if name.startswith('jupyter_lab_progress')]
for module in modules_to_remove:
    sys.modules.pop(module)

try:
    from jupyter_lab_progress import (
        LabProgress,
        LabValidator,
        show_error,
        show_hint,
        show_info,
        show_success,
        show_warning,
    )
    show_success("Progress tracking libraries loaded successfully! 🎉")
except ImportError as e:
    # Library unavailable: define print-based stand-ins for the show_* helpers.
    # LabProgress and LabValidator stay undefined in this case.
    print(f"Warning: Could not import progress tracking: {e}")
    print("Installing basic fallbacks...")

    def show_info(msg, title=None):
        """Print an informational message."""
        print(f"ℹ️ {title or 'Info'}: {msg}")

    def show_warning(msg, title=None):
        """Print a warning message."""
        print(f"⚠️ {title or 'Warning'}: {msg}")

    def show_success(msg, title=None):
        """Print a success message."""
        print(f"✅ {title or 'Success'}: {msg}")

    def show_error(msg, title=None):
        """Print an error message."""
        print(f"❌ {title or 'Error'}: {msg}")

    def show_hint(msg, title=None):
        """Print a hint message."""
        print(f"💡 {title or 'Hint'}: {msg}")

In [None]:
# Set up comprehensive lab progress tracking
try:
    # One entry per workshop step; later cells pass these same strings to
    # progress.show_step_tips() and progress.mark_done().
    progress = LabProgress(
        steps=[
            "Environment Setup",
            "PDF Processing", 
            "Data Ingestion",
            "Vector Index Creation",
            "Agent Tools Setup",
            "LLM Integration",
            "Basic Agent Testing",
            "Memory Implementation",
            "ReAct Agent Enhancement"
        ],
        lab_name="Multimodal Agents Workshop",
        persist=True
    )
    
    # Set up validation
    validator = LabValidator(progress_tracker=progress)
    
    show_success("Lab progress tracking initialized!")
    show_info(f"Workshop: {progress.lab_name}")
    show_info(f"Total steps: {len(progress.steps)}")
    
except NameError:
    # LabProgress / LabValidator are undefined when the jupyter_lab_progress
    # import failed (the fallback only defines the show_* helpers), so the
    # workshop continues without `progress` and `validator`.
    show_info("Running without progress tracking")

# Step 1: Environment Setup

Let's start by setting up our environment and connecting to MongoDB Atlas.

In [None]:
# Show step guidance
try:
    progress.show_step_tips("Environment Setup")
except (NameError, AttributeError):
    # NameError: `progress` was never created (tracking library unavailable).
    # AttributeError: presumably a library build without show_step_tips — TODO confirm.
    show_info("Setting up environment and connections...")

In [4]:
import os
from pymongo import MongoClient

# Environment variables the rest of the workshop reads
required_vars = ["MONGODB_URI", "SERVERLESS_URL"]
# Unset and empty values both count as missing
missing_vars = [name for name in required_vars if not os.getenv(name)]

if not missing_vars:
    show_success("All required environment variables are set!")
else:
    show_error(f"Missing environment variables: {missing_vars}")
    show_info("Please set the required environment variables before proceeding")

# Record the same checks with the lab validator; skipped when it is undefined
try:
    validator.validate_variable_exists(
        "MONGODB_URI", {"MONGODB_URI": os.getenv("MONGODB_URI")}, str
    )
    validator.validate_variable_exists(
        "SERVERLESS_URL", {"SERVERLESS_URL": os.getenv("SERVERLESS_URL")}, str
    )
except NameError:
    pass

In [None]:
# Connect to MongoDB Atlas
MONGODB_URI = os.getenv("MONGODB_URI")
SERVERLESS_URL = os.getenv("SERVERLESS_URL")
LLM_PROVIDER = "google"

# Initialize MongoDB client
try:
    # MongoClient(None) falls back to localhost:27017, which would either hang
    # until server selection times out or "succeed" against a local mongod.
    # Refuse to continue without an explicit connection string instead.
    if not MONGODB_URI:
        raise ValueError("MONGODB_URI environment variable is not set")

    mongodb_client = MongoClient(MONGODB_URI)
    # Test the connection with a round trip to the server
    result = mongodb_client.admin.command("ping")

    if result.get("ok") == 1:
        show_success("Successfully connected to MongoDB Atlas! 🎉")

        # Mark step as complete (skipped when progress tracking is unavailable)
        try:
            progress.mark_done("Environment Setup", score=100, notes="MongoDB connection successful")
        except NameError:
            pass
    else:
        show_error("MongoDB connection failed")

except Exception as e:
    # Top-level notebook boundary: report the problem instead of a traceback
    show_error(f"Connection error: {e}")
    show_hint("Check your connection string and network access settings", 
             "Connection Troubleshooting")

# Step 2: PDF Processing

Download a research paper and extract pages as images for multimodal processing.

In [None]:
# Show step guidance
try:
    progress.show_step_tips("PDF Processing")
except (NameError, AttributeError):
    # NameError: `progress` was never created (tracking library unavailable).
    # AttributeError: presumably a library build without show_step_tips — TODO confirm.
    show_info("Processing PDF and extracting images...")

In [None]:
import pymupdf
import requests
from pathlib import Path

# Create directory for images.
# parents=True also creates `data/` when needed; exist_ok=True makes
# re-running this cell harmless.
Path("data/images").mkdir(parents=True, exist_ok=True)

show_info("📚 Reference: https://pymupdf.readthedocs.io/en/latest/how-to-open-a-file.html#opening-remote-files")

In [None]:
# Download the DeepSeek paper
try:
    show_info("Downloading DeepSeek R1 research paper...")
    # requests never times out on its own; bound the wait so a stalled
    # connection surfaces as an error instead of hanging the cell forever.
    response = requests.get("https://arxiv.org/pdf/2501.12948", timeout=60)
    
    if response.status_code != 200:
        raise ValueError(f"Failed to download PDF. Status code: {response.status_code}")
    
    # Raw bytes of the PDF, kept in memory (nothing is written to disk here)
    pdf_stream = response.content
    show_success(f"PDF downloaded successfully! Size: {len(pdf_stream)} bytes")
    
    # TODO: Open the data in `pdf_stream` as a PDF document
    # HINT: Set the `filetype` argument to "pdf"
    pdf = pymupdf.Document(stream=pdf_stream, filetype="pdf")
    
    show_success(f"PDF loaded! Pages: {pdf.page_count}")
    
    # Validate PDF processing (skipped when the lab validator is unavailable)
    try:
        validator.validate_variable_exists('pdf', locals(), pymupdf.Document)
        validator.validate_custom(
            pdf.page_count > 0,
            "PDF has valid page count",
            "PDF appears to be empty or corrupted"
        )
    except NameError:
        pass
        
except Exception as e:
    # Covers network errors (including the timeout above) and PDF parse errors
    show_error(f"PDF processing failed: {e}")
    show_hint("Check your internet connection and try again", "Download Issue")

In [None]:
# Extract pages as images
from tqdm import tqdm

# One metadata dict per rendered page is collected here
docs = []
zoom = 3.0

show_info("📚 Reference: https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_pixmap")

try:
    # Same zoom factor on both axes
    mat = pymupdf.Matrix(zoom, zoom)

    total_pages = pdf.page_count
    show_info(f"Extracting {total_pages} pages as images...")

    # Render each page, save it as a PNG and record where it went
    for n in tqdm(range(total_pages), desc="Extracting pages"):
        # TODO: Use the `get_pixmap` method to render the PDF page
        # HINT: Access the PDF page as pdf[n]
        pix = pdf[n].get_pixmap(matrix=mat)

        # File names are 1-based page numbers
        key = f"data/images/{n+1}.png"
        pix.save(key)

        temp = {
            "key": key,
            "width": pix.width,
            "height": pix.height,
            "page_number": n + 1,
        }
        docs.append(temp)

    show_success(f"Successfully extracted {len(docs)} pages as images!")
    show_info("Images saved to: data/images/")

    # Mark step complete (skipped when progress tracking is unavailable)
    try:
        progress.mark_done(
            "PDF Processing",
            score=95,
            notes=f"Extracted {len(docs)} pages",
        )
    except (NameError, AttributeError):
        pass

except Exception as e:
    show_error(f"Image extraction failed: {e}")
    show_hint("Ensure the data/images directory exists and is writable", "File Access")

# Step 3: Data Ingestion

Load pre-generated embeddings and ingest them into MongoDB Atlas.

In [None]:
# Optional: Generate embeddings (requires Voyage AI API key).
# This cell only prints guidance; the generation code below stays commented
# out so the workshop can rely on the pre-generated embeddings file.
show_info("ℹ️ Embedding Generation", "Optional Step")
show_info("""
For this workshop, we'll use pre-generated embeddings to save time.
If you want to generate your own embeddings, uncomment the code below 
and add your Voyage AI API key.

Follow these steps to get an API key:
https://docs.voyageai.com/docs/api-key-and-installation#authentication-with-api-keys
""")

# Uncomment this section if you have a Voyage AI API key
# from voyageai import Client
# from PIL import Image
# 
# os.environ["VOYAGE_API_KEY"] = "your-api-key-here"
# voyageai_client = Client()
# 
# def get_embedding(data, input_type):
#     """Get Voyage AI embeddings for images and text."""
#     embedding = voyageai_client.multimodal_embed(
#         inputs=[[data]], model="voyage-multimodal-3", input_type=input_type
#     ).embeddings[0]
#     return embedding
# 
# embedded_docs = []
# for doc in tqdm(docs, desc="Generating embeddings"):
#     img = Image.open(doc['key'])
#     doc["embedding"] = get_embedding(img, "document")
#     embedded_docs.append(doc)

In [None]:
import json

# Database configuration: names of the database and collection that hold the
# page documents and their embeddings
DB_NAME = "mongodb_aiewf"
COLLECTION_NAME = "multimodal_workshop"

# Connect to the collection (requires `mongodb_client` from the connection cell)
collection = mongodb_client[DB_NAME][COLLECTION_NAME]

show_info(f"Connected to database: {DB_NAME}")
show_info(f"Using collection: {COLLECTION_NAME}")

In [None]:
# Load pre-generated embeddings
try:
    show_info("Loading pre-generated embeddings...")

    # Read the raw JSON text first, then parse it into `data`
    with open("data/embeddings.json", "r") as data_file:
        json_data = data_file.read()
    data = json.loads(json_data)

    show_success(f"Loaded {len(data)} documents with embeddings")

    # Structural checks are recorded only when the lab validator exists
    try:
        validator.validate_custom(
            len(data) > 0,
            "Embeddings data loaded successfully",
            "Embeddings file is empty or invalid",
        )

        # Spot-check the first document for the fields later cells read
        if data:
            required_fields = ['embedding', 'key']
            missing_fields = list(
                filter(lambda field: field not in data[0], required_fields)
            )

            validator.validate_custom(
                not missing_fields,
                "Document structure validation passed",
                f"Missing required fields: {missing_fields}",
            )
    except NameError:
        pass

except FileNotFoundError:
    show_error("Embeddings file not found: data/embeddings.json")
    show_hint(
        "Make sure the data/embeddings.json file exists in your working directory",
        "File Missing",
    )
except Exception as e:
    show_error(f"Failed to load embeddings: {e}")

In [None]:
# Ingest data into MongoDB
show_info("📚 Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.insert_many")

try:
    # Check `data` BEFORE deleting anything. If the embeddings failed to load
    # (name undefined) or the file was empty, stop here so the documents
    # already in the collection are left untouched.
    if not data:
        raise ValueError("No documents to ingest; load data/embeddings.json first")

    # Clear existing documents so re-running the cell does not duplicate them
    delete_result = collection.delete_many({})
    show_info(f"Deleted {delete_result.deleted_count} existing documents")
    
    # TODO: Bulk insert documents into the collection
    insert_result = collection.insert_many(data)
    
    # Verify insertion by counting what is actually stored
    doc_count = collection.count_documents({})
    
    show_success(f"Successfully ingested {doc_count} documents into {COLLECTION_NAME}! 🎉")
    
    # Validate ingestion (skipped when validator/progress are unavailable)
    try:
        validator.validate_custom(
            doc_count == len(data),
            "All documents ingested successfully",
            f"Document count mismatch: expected {len(data)}, got {doc_count}"
        )
        
        progress.mark_done("Data Ingestion", score=100, 
                          notes=f"Ingested {doc_count} documents")
    except NameError:
        pass
        
except Exception as e:
    show_error(f"Data ingestion failed: {e}")
    show_hint("Check your MongoDB connection and permissions", "Database Error")

# Step 4: Vector Search Index Creation

Create a vector search index to enable similarity search on our multimodal embeddings.

In [None]:
# Show step guidance
try:
    progress.show_step_tips("Vector Index Creation")
except (NameError, AttributeError):
    # NameError: `progress` was never created (tracking library unavailable).
    # AttributeError: presumably a library build without show_step_tips — TODO confirm.
    show_info("Creating vector search index...")

In [None]:
# Name shared by index creation, the status check and the $vectorSearch stage
VS_INDEX_NAME = "vector_index"

# Define vector index configuration (passed to create_search_index later)
model = {
    "name": VS_INDEX_NAME,
    "type": "vectorSearch",
    "definition": {
        "fields": [
            {
                "type": "vector",
                # Document field that holds the embedding vector
                "path": "embedding",
                # NOTE(review): presumably must equal the length of the vectors
                # in data/embeddings.json — confirm against the file
                "numDimensions": 1024,
                "similarity": "cosine",
            }
        ]
    },
}

show_info(f"Index configuration: {VS_INDEX_NAME}")
show_info("Vector field: embedding")
show_info("Dimensions: 1024 (Voyage multimodal)")
show_info("Similarity metric: cosine")

In [None]:
# Create the vector search index
show_info("📚 Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_search_index")

try:
    # Look for an index with our name before attempting to create one
    existing_indexes = list(collection.list_search_indexes())
    index_exists = VS_INDEX_NAME in [idx.get('name') for idx in existing_indexes]

    if not index_exists:
        show_info("Creating vector search index...")

        # TODO: Create the vector search index
        collection.create_search_index(model=model)

        show_success(f"Vector search index '{VS_INDEX_NAME}' created successfully! 🎉")
    else:
        show_info(f"Index '{VS_INDEX_NAME}' already exists")

    # Either way the step counts as done (skipped without progress tracking)
    try:
        progress.mark_done(
            "Vector Index Creation",
            score=100,
            notes=f"Index '{VS_INDEX_NAME}' ready",
        )
    except NameError:
        pass

except Exception as e:
    show_error(f"Index creation failed: {e}")
    show_hint(
        "Index creation may take a few minutes. Check Atlas UI to monitor progress",
        "Index Status",
    )

In [None]:
# Verify index status
try:
    indexes = list(collection.list_search_indexes())

    # Report every search index on the collection with its current status
    show_info("Current search indexes:")
    for idx in indexes:
        name = idx.get('name', 'Unknown')
        status = idx.get('status', 'Unknown')

        if status != 'READY':
            show_warning(f"⏳ {name}: {status}")
        else:
            show_success(f"✅ {name}: {status}")

    # Pick out the workshop's own index (None when it is not listed)
    our_index = next(
        filter(lambda candidate: candidate.get('name') == VS_INDEX_NAME, indexes),
        None,
    )

    if not our_index or our_index.get('status') != 'READY':
        show_warning(f"Index '{VS_INDEX_NAME}' is still building. Please wait...")
        show_hint(
            "Index creation can take several minutes. Check the Atlas UI for progress.",
            "Index Building",
        )
    else:
        show_success(f"Index '{VS_INDEX_NAME}' is ready for vector search! 🚀")

except Exception as e:
    show_error(f"Failed to check index status: {e}")

# Step 5: Agent Tools Setup

Create the vector search tool that our AI agent will use to retrieve relevant information.

In [None]:
from typing import List

show_info("📚 Reference: https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/#ann-examples")

In [None]:
def get_information_for_question_answering(user_query: str) -> List[str]:
    """
    Retrieve information using vector search to answer a user query.

    The query is embedded through the serverless endpoint, then matched against
    the `embedding` field of the collection with a `$vectorSearch` stage.

    Args:
        user_query (str): The user's query string.

    Returns:
        List[str]: List of image file paths retrieved from vector search.
            Empty when the embedding request or the search fails; the failure
            is reported with show_error rather than raised.
    """
    try:
        show_info(f"🔍 Searching for: {user_query}")
        
        # Embed the user query using our serverless endpoint.
        # requests has no default timeout, so bound the wait: an unresponsive
        # endpoint then becomes a reported error instead of a hung agent.
        response = requests.post(
            url=SERVERLESS_URL,
            json={
                "task": "get_embedding",
                "data": {"input": user_query, "input_type": "query"},
            },
            timeout=60,
        )
        
        if response.status_code != 200:
            show_error(f"Embedding API failed: {response.status_code}")
            return []
        
        # Extract the embedding from the response
        query_embedding = response.json()["embedding"]
        show_success(f"Generated query embedding: {len(query_embedding)} dimensions")

        # TODO: Define aggregation pipeline with $vectorSearch and $project stages
        pipeline = [
            {
                "$vectorSearch": {
                    "index": VS_INDEX_NAME,
                    "path": "embedding",
                    "queryVector": query_embedding,
                    "numCandidates": 150,
                    "limit": 2,
                }
            },
            {
                # Drop the (large) embedding itself; keep only what is displayed
                "$project": {
                    "_id": 0,
                    "key": 1,
                    "width": 1,
                    "height": 1,
                    "score": {"$meta": "vectorSearchScore"},
                }
            },
        ]

        # TODO: Execute the aggregation pipeline
        results = list(collection.aggregate(pipeline))
        
        # Extract image keys
        keys = [result["key"] for result in results]
        scores = [result["score"] for result in results]
        
        show_success(f"Found {len(keys)} relevant images")
        for rank, (key, score) in enumerate(zip(keys, scores), start=1):
            show_info(f"  {rank}. {key} (score: {score:.4f})")
        
        return keys
        
    except Exception as e:
        # Tool boundary: report and return "nothing found" so the agent can go on
        show_error(f"Vector search failed: {e}")
        return []

In [None]:
# Define function declaration for Gemini function calling
show_info("📚 Reference: https://ai.google.dev/gemini-api/docs/function-calling#step_1_define_function_declaration")

# TODO: Define the function declaration
# "name" matches the Python function above, and the single required property
# matches its `user_query` parameter, so the model's arguments can be passed
# straight through as keyword arguments.
get_information_for_question_answering_declaration = {
    "name": "get_information_for_question_answering",
    "description": "Retrieve information using vector search to answer a user query.",
    "parameters": {
        "type": "object",
        "properties": {
            "user_query": {
                "type": "string",
                "description": "Query string to use for vector search",
            }
        },
        "required": ["user_query"],
    },
}

show_success("Function declaration created for Gemini integration!")

# Mark step complete (skipped when progress tracking is unavailable)
try:
    progress.mark_done("Agent Tools Setup", score=100, 
                      notes="Vector search tool and function declaration ready")
except NameError:
    pass

# Step 6: LLM Integration

Set up Gemini 2.0 Flash with function calling capabilities.

In [None]:
from google import genai
from google.genai import types
from google.genai.types import FunctionCall

# Model name used by every generate_content call in this notebook
LLM = "gemini-2.0-flash"

try:
    # Get API key from serverless endpoint
    show_info("Obtaining Gemini API key...")
    
    # requests has no default timeout; bound the wait so an unreachable
    # endpoint is reported below instead of blocking the cell indefinitely.
    api_response = requests.post(
        url=SERVERLESS_URL, 
        json={"task": "get_api_key", "data": LLM_PROVIDER},
        timeout=60,
    )
    
    if api_response.status_code == 200:
        api_key = api_response.json()["api_key"]
        
        # Initialize Gemini client
        gemini_client = genai.Client(api_key=api_key)
        
        show_success(f"Gemini client initialized with model: {LLM}")
        
        # Validate client setup (skipped when the lab validator is unavailable)
        try:
            validator.validate_variable_exists('gemini_client', locals(), genai.Client)
        except NameError:
            pass
    else:
        show_error(f"Failed to get API key: {api_response.status_code}")
        
except Exception as e:
    # Covers network errors (including the timeout above) and a response
    # that has no "api_key" field
    show_error(f"LLM setup failed: {e}")
    show_hint("Check your SERVERLESS_URL and network connection", "API Key Error")

In [None]:
# Create generation configuration
try:
    # Wrap the declaration in a Tool, then attach it to the config that
    # select_tool passes to generate_content.
    tools = types.Tool(
        function_declarations=[get_information_for_question_answering_declaration]
    )
    tools_config = types.GenerateContentConfig(tools=[tools], temperature=0.0)
    
    show_success("Generation configuration created with function calling enabled!")
    show_info("Temperature: 0.0 (deterministic responses)")
    show_info("Available tools: get_information_for_question_answering")
    
    # Mark step complete (skipped when progress tracking is unavailable)
    try:
        progress.mark_done("LLM Integration", score=100, 
                          notes="Gemini 2.0 Flash configured with function calling")
    except NameError:
        pass
        
except Exception as e:
    show_error(f"Configuration failed: {e}")

# Step 7: Basic Agent Implementation

Create the core agent functions for tool selection and response generation.

In [None]:
from PIL import Image

show_info("📚 Reference: https://ai.google.dev/gemini-api/docs/function-calling#step_4_create_user_friendly_response")

In [None]:
def select_tool(messages: List) -> FunctionCall | None:
    """
    Ask the LLM whether a tool should be called for the given messages.

    Args:
        messages (List): Conversation content to show the model, as a list

    Returns:
        FunctionCall: The function call attached to the first part of the
            first candidate, or None when there is none or the request fails
    """
    instructions = (
        "You're an AI assistant. Based on the given information, decide which tool to use. "
        "If the user is asking to explain an image, don't call any tools unless that would help you better explain the image. "
        "Here is the provided information:\n"
    )

    try:
        # TODO: Generate response using Gemini
        # Instructions go first, followed by the caller's messages
        response = gemini_client.models.generate_content(
            model=LLM, contents=[instructions] + messages, config=tools_config
        )

        # Nothing to inspect when the model returned no candidates or no parts
        if not response.candidates:
            return None
        if not response.candidates[0].content.parts:
            return None

        return response.candidates[0].content.parts[0].function_call

    except Exception as e:
        show_error(f"Tool selection failed: {e}")
        return None

show_success("Tool selection function created!")

In [None]:
def generate_answer(user_query: str, images: list | None = None) -> str:
    """
    Execute any tools and generate a response.

    Args:
        user_query (str): User's query string
        images (list | None): Image file paths supplied by the caller. The
            list is not modified. Defaults to None (no images).

    Returns:
        str: LLM-generated response, or an apology message if anything fails
    """
    try:
        # Work on a private copy. Retrieved pages are appended below; doing
        # that on the caller's list (or on a shared default list) would carry
        # images from one question into every later one.
        context_images = list(images) if images else []

        # TODO: Use select_tool to determine if we need to call any tools
        tool_call = select_tool([user_query])
        
        # If a tool call is found and it's our vector search function
        if (
            tool_call is not None
            and tool_call.name == "get_information_for_question_answering"
        ):
            show_info(f"🛠️ Agent calling tool: {tool_call.name}")
            
            # TODO: Call the tool with the extracted arguments
            tool_images = get_information_for_question_answering(**tool_call.args)
            
            # Add retrieved images to the input images
            context_images.extend(tool_images)

        # Prepare system prompt
        system_prompt = (
            "Answer the questions based on the provided context only. "
            "If the context is not sufficient, say I DON'T KNOW. "
            "DO NOT use any other information to answer the question."
        )
        
        # Prepare contents for the LLM: instructions, question, then the images
        contents = (
            [system_prompt]
            + [user_query]
            + [Image.open(image) for image in context_images]
        )

        # Get the response from the LLM
        response = gemini_client.models.generate_content(
            model=LLM,
            contents=contents,
            config=types.GenerateContentConfig(temperature=0.0),
        )
        
        answer = response.text
        return answer
        
    except Exception as e:
        # Agent boundary: report the failure and return a user-facing message
        show_error(f"Answer generation failed: {e}")
        return "I apologize, but I encountered an error while processing your question."

show_success("Answer generation function created!")

In [None]:
def execute_agent(user_query: str, images: list | None = None) -> None:
    """
    Execute the agent and display the response.

    Args:
        user_query (str): User query
        images (list | None, optional): Image file paths. The list is not
            modified. Defaults to None (no images).
    """
    try:
        show_info(f"🤖 Processing query: {user_query}")
        
        # Hand over a fresh list on every call so nothing downstream can grow
        # the caller's list (or a shared default) between queries.
        response = generate_answer(user_query, list(images) if images else [])
        
        show_success("🤖 Agent Response:")
        print(f"\n{response}\n")
        
    except Exception as e:
        show_error(f"Agent execution failed: {e}")

show_success("Agent execution function created!")

# Mark step complete (skipped when progress tracking is unavailable)
try:
    progress.mark_done("Basic Agent Testing", score=100, 
                      notes="Agent functions implemented and ready for testing")
except NameError:
    pass

In [None]:
# Test the agent with different types of queries
show_info("🧪 Testing the agent with sample queries...")

# Test 1: Text-based query requiring vector search.
# No images are passed, so any image context has to come from the tool call.
show_info("Test 1: Factual question requiring document search")
execute_agent("What is the Pass@1 accuracy of Deepseek R1 on the MATH500 benchmark?")

In [None]:
# Test 2: Image explanation (if test image exists)
import os

if not os.path.exists("data/test.png"):
    # No sample image available: fall back to the first extracted PDF page
    show_warning("Test image not found: data/test.png")
    show_info("Test 2: Using extracted PDF page instead")
    if docs:
        execute_agent("What can you see in this document page?", [docs[0]['key']])
else:
    show_info("Test 2: Image analysis")
    execute_agent("Explain the graph in this image:", ["data/test.png"])

# Step 8: Memory Implementation

Add conversational memory to enable multi-turn conversations with context retention.

In [None]:
from datetime import datetime

# Set up history collection: chat messages go to a separate `history`
# collection in the same database as the workshop documents
history_collection = mongodb_client[DB_NAME]["history"]

show_info(f"Setting up conversation memory in: {DB_NAME}.history")
show_info("📚 Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_index")

In [None]:
# Create index for efficient session queries
# (retrieve_session_history filters the collection on session_id)
try:
    # TODO: Create index on session_id field
    history_collection.create_index("session_id")
    
    show_success("Session index created for conversation history!")
    
except Exception as e:
    show_error(f"Index creation failed: {e}")

In [None]:
def store_chat_message(session_id: str, role: str, type: str, content: str) -> None:
    """
    Persist one chat message for a session in the history collection.

    Failures are reported with show_error and not raised.

    Args:
        session_id (str): Session ID
        role (str): Message role, one of 'user' or 'agent'
        type (str): Type of message, one of 'text' or 'image'
        content (str): Content of the message (text or image path)
    """
    try:
        # TODO: Create message document and insert it into history collection
        # The timestamp is what retrieve_session_history sorts on
        history_collection.insert_one(
            {
                "session_id": session_id,
                "role": role,
                "type": type,
                "content": content,
                "timestamp": datetime.now(),
            }
        )
    except Exception as e:
        show_error(f"Failed to store chat message: {e}")

show_success("Chat message storage function created!")

In [None]:
def retrieve_session_history(session_id: str) -> List:
    """
    Load the stored messages of one session, oldest first.

    Args:
        session_id (str): Session ID

    Returns:
        List: Text contents as strings and image messages as opened images;
            empty when the lookup fails
    """
    try:
        show_info("📚 Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/cursor.html#pymongo.cursor.Cursor.sort")

        # TODO: Query history collection and sort by timestamp
        session_filter = {"session_id": session_id}
        cursor = history_collection.find(session_filter).sort("timestamp", 1)

        messages = []
        if not cursor:
            return messages

        for entry in cursor:
            kind = entry["type"]

            # Text is passed along unchanged
            if kind == "text":
                messages.append(entry["content"])
                continue

            # Any other non-image type is ignored
            if kind != "image":
                continue

            # Image messages store a path; an unreadable file is skipped with a warning
            try:
                messages.append(Image.open(entry["content"]))
            except Exception as e:
                show_warning(f"Could not load image {entry['content']}: {e}")

        return messages

    except Exception as e:
        show_error(f"Failed to retrieve session history: {e}")
        return []

show_success("Session history retrieval function created!")

In [None]:
# Enhanced generate_answer function with memory
def generate_answer_with_memory(session_id: str, user_query: str, images: list | None = None) -> str:
    """
    Execute tools and generate response with conversation memory.

    Args:
        session_id (str): Session ID for conversation tracking
        user_query (str): User's query string
        images (list | None): Image file paths supplied by the caller. The
            list is not modified. Defaults to None (no images).

    Returns:
        str: LLM-generated response, or an apology message if anything fails
    """
    try:
        # Work on a private copy. Retrieved pages are appended below and then
        # written to the session history; mutating the caller's list (or a
        # shared default) would replay images from earlier calls into every
        # later turn and store them again.
        turn_images = list(images) if images else []

        # TODO: Retrieve conversation history
        history = retrieve_session_history(session_id)
        
        show_info(f"Retrieved {len(history)} previous messages for session {session_id}")
        
        # Determine if tools need to be called (the model sees the history too)
        tool_call = select_tool(history + [user_query])
        
        if (
            tool_call is not None
            and tool_call.name == "get_information_for_question_answering"
        ):
            show_info(f"🛠️ Agent calling tool: {tool_call.name}")
            tool_images = get_information_for_question_answering(**tool_call.args)
            turn_images.extend(tool_images)

        # Generate response with history context
        system_prompt = (
            "Answer the questions based on the provided context only. "
            "If the context is not sufficient, say I DON'T KNOW. "
            "DO NOT use any other information to answer the question."
        )
        
        # Order: instructions, earlier turns, current question, current images
        contents = (
            [system_prompt]
            + history
            + [user_query]
            + [Image.open(image) for image in turn_images]
        )
        
        response = gemini_client.models.generate_content(
            model=LLM,
            contents=contents,
            config=types.GenerateContentConfig(temperature=0.0),
        )
        
        answer = response.text
        
        # Store conversation in memory
        # TODO: Store user query
        store_chat_message(session_id, "user", "text", user_query)
        
        # TODO: Store image references (paths only, for this turn)
        for image in turn_images:
            store_chat_message(session_id, "user", "image", image)
        
        # TODO: Store agent response
        store_chat_message(session_id, "agent", "text", answer)
        
        return answer
        
    except Exception as e:
        # Agent boundary: report the failure and return a user-facing message
        show_error(f"Memory-enabled answer generation failed: {e}")
        return "I apologize, but I encountered an error while processing your question."

show_success("Memory-enabled answer generation function created!")

In [None]:
# Enhanced execute_agent function with memory
def execute_agent_with_memory(session_id: str, user_query: str, images: list | None = None) -> None:
    """
    Execute the agent with conversation memory.

    Args:
        session_id (str): Session ID for conversation tracking
        user_query (str): User query
        images (list | None, optional): Image file paths. The list is not
            modified. Defaults to None (no images).
    """
    try:
        show_info(f"🧠 Session {session_id} - Processing: {user_query}")
        
        # Hand over a fresh list on every call so nothing downstream can grow
        # the caller's list (or a shared default) between turns.
        response = generate_answer_with_memory(
            session_id, user_query, list(images) if images else []
        )
        
        show_success("🤖 Agent Response:")
        print(f"\n{response}\n")
        
    except Exception as e:
        show_error(f"Memory-enabled agent execution failed: {e}")

show_success("Memory-enabled agent execution function created!")

# Mark step complete (skipped when progress tracking is unavailable)
try:
    progress.mark_done("Memory Implementation", score=100, 
                      notes="Conversation memory system implemented")
except NameError:
    pass

In [None]:
# Test memory-enabled agent
show_info("🧪 Testing memory-enabled agent...")

# First query in session: the question and answer are stored under
# "session_1" for the follow-up cell to read back
show_info("Test 1: Initial query")
execute_agent_with_memory(
    "session_1",
    "What is the Pass@1 accuracy of Deepseek R1 on the MATH500 benchmark?",
)

In [None]:
# Follow-up query to test memory: it reuses "session_1", so it can only be
# answered from the history stored by the previous cell
show_info("Test 2: Follow-up query to test memory")
execute_agent_with_memory(
    "session_1",
    "What did I just ask you?",
)

# Step 9: ReAct Agent Enhancement

Implement a ReAct (Reasoning + Acting) agent that can reason about whether it has enough information and iteratively gather more data if needed.

In [None]:
def generate_answer_react(user_query: str, images: "List | None" = None) -> str:
    """
    Answer a query with a ReAct (Reasoning + Acting) loop.

    On every iteration the LLM receives the reasoning prompt plus everything
    gathered so far and must reply with one of two directives:

    * ``ANSWER: <text>`` - the text after the marker is returned to the caller.
    * ``TOOL: <question>`` - the question is passed to ``select_tool``; if that
      yields a ``get_information_for_question_answering`` call, the images it
      returns are opened and appended to the context for the next iteration.

    Any other reply, a failed tool selection, or an empty retrieval result is
    recorded in the context as a short note, and the loop continues until the
    iteration budget is used up.

    Args:
        user_query (str): User's query string
        images (List, optional): List of image file paths. Defaults to None,
            which is treated as an empty list (no images).

    Returns:
        str: LLM-generated response, or an apology message when no final
        answer is reached within the iteration budget or an error occurs.
    """
    # Use a fresh list per call rather than a shared mutable default.
    if images is None:
        images = []

    try:
        show_info("🧠 Starting ReAct agent processing...")

        # Define reasoning prompt
        system_prompt = [
            (
                "You are an AI assistant. Based on the current information, decide if you have enough to answer the user query, or if you need more information. "
                "If you have enough information, respond with 'ANSWER: <your answer>'. "
                "If you need more information, respond with 'TOOL: <question for the tool>'. Keep the question concise. "
                f"User query: {user_query}\n"
                "Current information:\n"
            )
        ]

        # Cap the number of reasoning rounds to prevent infinite loops
        max_iterations = 3

        # Everything gathered so far (images and status notes); sent after the prompt
        current_information = []

        # If the user provided images, add them to current information
        if images:
            current_information.extend([Image.open(image) for image in images])
            show_info(f"Added {len(images)} user-provided images to context")

        # Run the reasoning → action loop
        for current_iteration in range(1, max_iterations + 1):
            show_info(f"🔄 ReAct Iteration {current_iteration}:")

            # Generate reasoning and decision
            response = gemini_client.models.generate_content(
                model=LLM,
                contents=system_prompt + current_information,
                config=types.GenerateContentConfig(temperature=0.0),
            )

            # `response.text` can be None when the model returns no text part;
            # treat that as an empty decision so the loop can record the
            # problem and retry instead of failing on the slice below.
            decision = response.text or ""
            show_info(f"💭 Agent decision: {decision[:100]}...")

            # If the agent has the final answer, return it
            if "ANSWER:" in decision:
                final_answer = decision.split("ANSWER:", 1)[1].strip()
                show_success(f"✅ Final answer reached in {current_iteration} iterations")
                return final_answer

            # Neither directive present: note it and let the model try again
            if "TOOL:" not in decision:
                show_warning("Agent response didn't contain ANSWER or TOOL directive")
                current_information.append("Unable to determine next action.")
                continue

            # The agent decided to use a tool
            tool_query = decision.split("TOOL:", 1)[1].strip()
            show_info(f"🛠️ Agent requesting tool with query: {tool_query}")

            # Use tool selection to get the function call
            tool_call = select_tool([tool_query])
            if tool_call is None or tool_call.name != "get_information_for_question_answering":
                show_warning("Tool selection failed or returned unexpected tool")
                current_information.append("Tool call failed.")
                continue

            show_info(f"📊 Calling vector search with: {tool_call.args}")

            # Call the tool and add results to current information
            tool_images = get_information_for_question_answering(**tool_call.args)
            if tool_images:
                new_images = [Image.open(image) for image in tool_images]
                current_information.extend(new_images)
                show_success(f"➕ Added {len(new_images)} retrieved images to context")
            else:
                show_warning("No relevant images found")
                current_information.append("No relevant visual information found for this query.")

        # If we've exhausted iterations without a final answer
        show_warning(f"⚠️ Reached maximum iterations ({max_iterations}) without final answer")
        return "I apologize, but I couldn't find a definitive answer after exploring the available information. Please try rephrasing your question or asking for more specific details."

    except Exception as e:
        # Notebook boundary: report the failure and return a user-facing apology.
        show_error(f"ReAct agent failed: {e}")
        return "I apologize, but I encountered an error while processing your question with the ReAct approach."

# Cell-level confirmation: shown once `generate_answer_react` has been defined.
show_success("ReAct agent implementation completed!")

In [None]:
def execute_react_agent(user_query: str, images: "List | None" = None) -> None:
    """
    Run the ReAct agent on a query and print its final response.

    The query and images are forwarded to ``generate_answer_react``; the
    returned text is printed under a success banner. Any exception is reported
    via ``show_error`` instead of being raised.

    Args:
        user_query (str): User query
        images (List, optional): List of image file paths. Defaults to None,
            which is treated as an empty list (no images).
    """
    # Use a fresh list per call rather than a shared mutable default, while
    # still handing downstream code the list it expects.
    if images is None:
        images = []

    try:
        show_info(f"🦸‍♀️ ReAct Agent Processing: {user_query}")

        response = generate_answer_react(user_query, images)

        show_success("🤖 ReAct Agent Final Response:")
        print(f"\n{response}\n")

    except Exception as e:
        # Notebook boundary: surface the failure as a message, not a traceback.
        show_error(f"ReAct agent execution failed: {e}")

show_success("ReAct agent execution function created!")

# Record the last workshop step, "ReAct Agent Enhancement", as finished.
try:
    progress.mark_done(
        "ReAct Agent Enhancement",
        score=100,
        notes="ReAct reasoning and acting agent implemented",
    )
except NameError:
    # No `progress` object exists in this session, so there is nothing to update.
    pass

In [None]:
# Exercise the ReAct agent
show_info("🧪 Testing ReAct agent with iterative reasoning...")

# Test 1: text-only question, no user-supplied images
show_info("Test 1: Complex factual question")
execute_react_agent(
    user_query="What is the Pass@1 accuracy of Deepseek R1 on the MATH500 benchmark?",
)

In [None]:
# Test 2: Image analysis (if available), otherwise analyse a processed page.
# Preference order: the sample image on disk, then the first entry of `docs`.
if os.path.exists("data/test.png"):
    show_info("Test 2: Image analysis with ReAct")
    execute_react_agent("Explain the graph in this image:", ["data/test.png"])
elif globals().get("docs"):
    # `docs` is looked up through globals() so that running this cell before
    # the document-processing cells does not raise NameError.
    show_info("Test 2: Document page analysis with ReAct")
    execute_react_agent("What technical concepts are discussed in this document page?", [docs[0]['key']])
else:
    # Say so explicitly instead of finishing the cell with no output.
    show_warning("Test 2 skipped: no test image at data/test.png and no processed document pages available.")

# 🎉 Workshop Complete!

Congratulations! You've successfully built a comprehensive multimodal AI agent system.

In [None]:
# Final progress summary
# Check for the tracker up front: the previous try/except printed the success
# banner before touching `progress`, so a missing tracker produced two banners.
if globals().get("progress") is not None:
    show_success("🎓 Workshop Completed Successfully!")

    # Display final progress
    progress.display_progress(detailed=True)

    # Show completion statistics
    completion_rate = progress.get_completion_rate()
    avg_score = progress.get_average_score()

    show_info(f"📊 Overall Completion: {completion_rate:.1f}%")
    # Compare against None so that a genuine average of 0 is still reported.
    # NOTE(review): assumes get_average_score() returns None when nothing has
    # been scored - confirm against the jupyter-lab-progress API.
    if avg_score is not None:
        show_info(f"📈 Average Score: {avg_score:.1f}/100")

    # Show what was accomplished
    show_success("""
    🚀 What You've Built:
    
    ✅ PDF processing pipeline for multimodal content
    ✅ MongoDB Atlas vector search integration
    ✅ AI agent with function calling capabilities
    ✅ Conversational memory system
    ✅ ReAct (Reasoning + Acting) agent architecture
    ✅ End-to-end multimodal AI application
    """)

    # Next steps
    show_info("""
    🎯 Next Steps:
    
    • Experiment with different types of documents and queries
    • Modify the agent to work with your own data
    • Add more sophisticated reasoning capabilities
    • Integrate with web interfaces or chat applications
    • Explore other multimodal models and embeddings
    """)
else:
    # No progress tracker in this session: show the short closing message only.
    show_success("🎓 Workshop completed successfully!")
    show_info("All agent implementations are ready for use.")

In [None]:
# Optional: Export progress analytics
# Best-effort step: if the tracker is missing or lacks the analytics methods,
# the cell skips the export silently and still prints the closing message.
try:
    # Export only when the tracker object provides an exporter.
    if hasattr(progress, 'export_analytics_json'):
        analytics_file = progress.export_analytics_json()
        show_success(f"📄 Progress analytics exported to: {analytics_file}")

        # Show summary
        # Note: get_analytics_summary is not guarded by hasattr; if it is
        # missing, the AttributeError is absorbed by the handler below.
        summary = progress.get_analytics_summary()
        if summary:
            # Missing keys fall back to the literal 'N/A'.
            show_info(f"⏱️ Total session time: {summary.get('session_duration', 'N/A')} seconds")
            show_info(f"📝 Total interactions: {summary.get('total_events', 'N/A')}")
except (NameError, AttributeError):
    # NameError: `progress` is not defined in this session.
    # AttributeError: the tracker lacks one of the analytics methods.
    pass

# Closing message, shown whether or not the export above ran.
show_success("Thank you for completing the Multimodal Agents Workshop! 🙏")