# Multimodal Agents Workshop with VoyageAI Python Client

This version of the multimodal agents workshop uses the new VoyageAI Python client library for simplified embedding generation and MongoDB Atlas Vector Search integration.

**Workshop Overview:**
- Build a multimodal AI agent that can analyze documents and images
- Use MongoDB Atlas Vector Search for retrieval
- Implement function calling with Gemini 2.0 Flash
- Add memory and ReAct reasoning capabilities
- **NEW**: Streamlined VoyageAI integration with the official Python client

## 🎯 Learning Objectives
By the end of this workshop, you will be able to:
- Process PDFs and extract images for multimodal search
- Set up MongoDB Atlas vector search indexes
- Build an AI agent with tool calling capabilities
- Implement session-based memory for conversational agents
- Create a ReAct (Reasoning + Acting) agent architecture
- **NEW**: Use the VoyageAI Python client for production-ready embeddings

In [None]:
# Initialize progress tracking and lab utilities
import sys
import os

# Prefer the rich notebook helpers from jupyter_lab_progress; when the package
# is unavailable, define plain-print stand-ins with the same call signature so
# the rest of the notebook can call show_* unconditionally.
try:
    from jupyter_lab_progress import (
        LabProgress,
        LabValidator,
        show_info,
        show_warning,
        show_success,
        show_error,
        show_hint,
    )
    show_success("Progress tracking libraries loaded successfully! 🎉")
except ImportError as import_err:
    print(f"Warning: Could not import progress tracking: {import_err}")
    print("Installing basic fallbacks...")

    def show_info(msg, title=None):
        """Print an informational message, labelled with *title* or 'Info'."""
        label = title or 'Info'
        print(f"ℹ️ {label}: {msg}")

    def show_warning(msg, title=None):
        """Print a warning message, labelled with *title* or 'Warning'."""
        label = title or 'Warning'
        print(f"⚠️ {label}: {msg}")

    def show_success(msg, title=None):
        """Print a success message, labelled with *title* or 'Success'."""
        label = title or 'Success'
        print(f"✅ {label}: {msg}")

    def show_error(msg, title=None):
        """Print an error message, labelled with *title* or 'Error'."""
        label = title or 'Error'
        print(f"❌ {label}: {msg}")

    def show_hint(msg, title=None):
        """Print a hint message, labelled with *title* or 'Hint'."""
        label = title or 'Hint'
        print(f"💡 {label}: {msg}")

In [None]:
# Set up comprehensive lab progress tracking
# Build the step tracker and validator used by later cells. LabProgress and
# LabValidator only exist when the jupyter_lab_progress import succeeded; if
# it did not, referencing them raises NameError and the notebook continues
# without tracking.
try:
    progress = LabProgress(
        # Step names must match the strings later passed to
        # progress.show_step_tips(...) and progress.mark_done(...).
        steps=[
            "Environment Setup",
            "VoyageAI Client Setup",
            "PDF Processing", 
            "Embedding Generation",
            "Data Ingestion",
            "Vector Index Creation",
            "Agent Tools Setup",
            "LLM Integration",
            "Basic Agent Testing",
            "Memory Implementation",
            "ReAct Agent Enhancement"
        ],
        lab_name="Multimodal Agents with VoyageAI Client",
        persist=True
    )
    
    # Set up validation (the validator is handed the tracker created above)
    validator = LabValidator(progress_tracker=progress)
    
    show_success("Lab progress tracking initialized!")
    show_info(f"Workshop: {progress.lab_name}")
    show_info(f"Total steps: {len(progress.steps)}")
    
except NameError:
    # LabProgress / LabValidator are undefined: fall back to plain messages.
    show_info("Running without progress tracking")

# Step 1: Environment Setup

Let's start by setting up our environment and connecting to MongoDB Atlas.

In [None]:
# Show step guidance
try:
    progress.show_step_tips("Environment Setup")
except (NameError, AttributeError):
    show_info("Setting up environment and connections...")

In [None]:
import os
from pymongo import MongoClient

# Load environment variables from .env file (required)
from pathlib import Path

# The workshop requires a .env file in the current working directory.
env_path = Path('.') / '.env'
if not env_path.exists():
    raise FileNotFoundError(
        "❌ .env file is required! Please create a .env file with:\n"
        "MONGODB_URI=your_mongodb_connection_string\n"
        "GOOGLE_API_KEY=your_google_api_key\n"
        "VOYAGE_API_KEY=your_voyage_api_key"
    )

# Load variables from .env. Each usable line has the form KEY=VALUE; blank
# lines, comment lines and lines without '=' are ignored. Values loaded here
# overwrite any variable of the same name already present in os.environ.
with open(env_path, encoding="utf-8") as f:
    for line in f:
        entry = line.strip()
        if not entry or entry.startswith('#') or '=' not in entry:
            continue
        key, value = entry.split('=', 1)
        # Tolerate "KEY = value": without stripping, the variable would be
        # registered under the name "KEY " and never be found by os.getenv.
        key = key.strip()
        if not key:
            continue
        # Trim whitespace first so surrounding quotes can then be removed.
        os.environ[key] = value.strip().strip('"\'')

show_info("Loaded environment variables from .env file")

# Check required environment variables (unset and empty both count as missing)
required_vars = ["MONGODB_URI", "GOOGLE_API_KEY", "VOYAGE_API_KEY"]
missing_vars = [var for var in required_vars if not os.getenv(var)]

if missing_vars:
    raise ValueError(
        f"❌ Missing required environment variables in .env file: {missing_vars}\n"
        "Please add these to your .env file:\n" + 
        "\n".join([f"{var}=your_{var.lower()}" for var in missing_vars])
    )

show_success("All required environment variables are set!")
show_info("✓ MONGODB_URI: Available")
show_info("✓ GOOGLE_API_KEY: Available") 
show_info("✓ VOYAGE_API_KEY: Available")

# Validate connection variables; skipped when the optional validator
# was never created (progress tracking unavailable).
try:
    validator.validate_variable_exists("MONGODB_URI", {"MONGODB_URI": os.getenv("MONGODB_URI")}, str)
except NameError:
    pass

In [None]:
# Connect to MongoDB Atlas
# Connection settings read from the environment.
MONGODB_URI = os.getenv("MONGODB_URI")
SERVERLESS_URL = os.getenv("SERVERLESS_URL")  # Optional fallback
# NOTE(review): SERVERLESS_URL and LLM_PROVIDER are not referenced in the
# visible part of this notebook — confirm whether later cells use them.
LLM_PROVIDER = "google"

# Initialize MongoDB client and verify the deployment is reachable.
# Any failure is reported via show_error/show_hint rather than raised, so
# later cells that use mongodb_client will fail if this cell did not succeed.
try:
    mongodb_client = MongoClient(MONGODB_URI)
    # Test the connection with a ping command against the admin database
    result = mongodb_client.admin.command("ping")
    
    if result.get("ok") == 1:
        show_success("Successfully connected to MongoDB Atlas! 🎉")
        
        # Mark step as complete (no-op when progress tracking is unavailable)
        try:
            progress.mark_done("Environment Setup", score=100, notes="MongoDB connection successful")
        except NameError:
            pass
    else:
        show_error("MongoDB connection failed")
        
except Exception as e:
    show_error(f"Connection error: {e}")
    show_hint("Check your connection string and network access settings", 
             "Connection Troubleshooting")

# Step 2: VoyageAI Client Setup

Initialize the VoyageAI Python client for embedding generation.

In [None]:
# Show step guidance
try:
    progress.show_step_tips("VoyageAI Client Setup")
except (NameError, AttributeError):
    show_info("Setting up VoyageAI client...")

In [None]:
import voyageai
import requests
import numpy as np

# Initialize VoyageAI client with required API key
VOYAGE_API_KEY = os.getenv("VOYAGE_API_KEY")

# Use direct API key (required)
voyage_client = voyageai.Client(api_key=VOYAGE_API_KEY)
show_success("VoyageAI client initialized with API key! 🚀")

# Normalize vector function (MongoDB doesn't auto-normalize)
def normalize_vector(v):
    """Normalize a vector to unit (L2) length.

    Args:
        v: Array-like of numbers (NumPy array, list, or tuple).

    Returns:
        numpy.ndarray: ``v`` scaled to unit Euclidean norm as a float array.
        A zero vector is returned unchanged (as a float array) to avoid
        dividing by zero.
    """
    # Coerce first: a plain Python list does not support `list / float`.
    arr = np.asarray(v, dtype=float)
    norm = np.linalg.norm(arr)
    return arr / norm if norm > 0 else arr

show_success("Vector normalization utility ready")

# Mark step complete
try:
    progress.mark_done("VoyageAI Client Setup", score=100, 
                      notes="VoyageAI client configured with direct API key")
except NameError:
    pass

# Step 3: PDF Processing

Download a research paper and extract pages as images for multimodal processing.

In [None]:
# Show step guidance
try:
    progress.show_step_tips("PDF Processing")
except (NameError, AttributeError):
    show_info("Processing PDF and extracting images...")

In [None]:
import pymupdf
from pathlib import Path

# Create directory for images
Path("data/images").mkdir(parents=True, exist_ok=True)

show_info("📚 Reference: https://pymupdf.readthedocs.io/en/latest/how-to-open-a-file.html#opening-remote-files")

In [None]:
# Download the DeepSeek paper
# Fetch the paper over HTTP and open it in memory with PyMuPDF. Failures are
# reported via show_error/show_hint instead of being raised, so `pdf` is only
# defined when this cell succeeds.
try:
    show_info("Downloading DeepSeek R1 research paper...")
    # Bound the request: without a timeout, requests waits indefinitely on a
    # stalled connection and the notebook cell never returns.
    response = requests.get("https://arxiv.org/pdf/2501.12948", timeout=60)
    
    if response.status_code != 200:
        raise ValueError(f"Failed to download PDF. Status code: {response.status_code}")
    
    # Get the content of the response (raw PDF bytes)
    pdf_stream = response.content
    show_success(f"PDF downloaded successfully! Size: {len(pdf_stream)} bytes")
    
    # Open the data in `pdf_stream` as a PDF document
    pdf = pymupdf.Document(stream=pdf_stream, filetype="pdf")
    
    show_success(f"PDF loaded! Pages: {pdf.page_count}")
    
    # Validate PDF processing (skipped when the validator is unavailable)
    try:
        validator.validate_variable_exists('pdf', locals(), pymupdf.Document)
        validator.validate_custom(
            pdf.page_count > 0,
            "PDF has valid page count",
            "PDF appears to be empty or corrupted"
        )
    except NameError:
        pass
        
except Exception as e:
    show_error(f"PDF processing failed: {e}")
    show_hint("Check your internet connection and try again", "Download Issue")

In [None]:
# Extract pages as images
from tqdm import tqdm

# Render every page of `pdf` to a PNG under data/images/ and collect one
# metadata record per page in `docs`.
docs = []
zoom = 3.0

show_info("📚 Reference: https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_pixmap")

try:
    # Scale both axes by `zoom` when rasterising a page
    mat = pymupdf.Matrix(zoom, zoom)

    show_info(f"Extracting {pdf.page_count} pages as images...")

    # Track partial progress
    total_pages = pdf.page_count

    for n in tqdm(range(pdf.page_count), desc="Extracting pages"):
        # Rasterise page n and write it to disk; files are numbered from 1
        pix = pdf[n].get_pixmap(matrix=mat)
        key = f"data/images/{n+1}.png"
        pix.save(key)

        # Record where the image lives plus its pixel size and page number
        temp = {
            "key": key,
            "width": pix.width,
            "height": pix.height,
            "page_number": n + 1,
        }
        docs.append(temp)

    show_success(f"Successfully extracted {len(docs)} pages as images!")
    show_info("Images saved to: data/images/")

    # Mark step complete (ignored when progress tracking is unavailable)
    try:
        progress.mark_done(
            "PDF Processing",
            score=95,
            notes=f"Extracted {len(docs)} pages",
        )
    except (NameError, AttributeError):
        pass

except Exception as e:
    show_error(f"Image extraction failed: {e}")
    show_hint("Ensure the data/images directory exists and is writable", "File Access")

# Step 4: Embedding Generation with VoyageAI Client

Generate multimodal embeddings using the VoyageAI Python client.

In [None]:
# Show step guidance
try:
    progress.show_step_tips("Embedding Generation")
except (NameError, AttributeError):
    show_info("Generating embeddings with VoyageAI client...")

In [None]:
from PIL import Image
import json
import base64
from io import BytesIO

def generate_embedding(data, input_type="document", model="voyage-multimodal-3"):
    """Generate a unit-length embedding with the VoyageAI multimodal model.

    Text and images are both embedded with the same ``model`` so that query
    vectors and document vectors live in the same embedding space; vectors
    produced by different models are not comparable in a similarity search.

    Args:
        data: PIL Image, or any other value (converted to text with ``str``).
        input_type: "document" or "query" (affects embedding optimization).
        model: Multimodal model used for both images and text.

    Returns:
        list | None: Normalized embedding vector, or None if generation
        failed (the error is reported through ``show_error``).
    """
    try:
        # The multimodal endpoint takes a list of inputs, each input being a
        # list of content pieces (text strings and/or PIL images).
        content = data if isinstance(data, Image.Image) else str(data)
        response = voyage_client.multimodal_embed(
            inputs=[[content]],
            model=model,
            input_type=input_type,
        )
        embedding = response.embeddings[0]

        # Log usage for cost tracking when the response exposes it
        if hasattr(response, 'usage'):
            show_info(f"Token usage: {response.usage}")

        # Normalize the embedding (MongoDB doesn't do this automatically)
        normalized_embedding = normalize_vector(np.array(embedding)).tolist()

        # Sanity check: the norm should be ~1.0 after normalization
        norm = np.linalg.norm(normalized_embedding)
        if abs(norm - 1.0) > 0.01:
            show_warning(f"Vector norm after normalization: {norm:.4f} (expected ~1.0)")

        return normalized_embedding

    except Exception as e:
        # Best-effort contract: callers check for None instead of catching.
        show_error(f"Embedding generation failed: {e}")
        return None

show_success("Embedding generation function ready!")

In [None]:
# Generate embeddings for all extracted images
# Embed every extracted page image and cache the results on disk.
embedded_docs = []

try:
    show_info(f"Generating embeddings for {len(docs)} images...")
    
    # Pages are walked in chunks so the progress bar advances per chunk;
    # each page is still embedded with its own API call.
    batch_size = 10  # Adjust based on API limits
    
    for i in tqdm(range(0, len(docs), batch_size), desc="Processing batches"):
        batch = docs[i:i+batch_size]
        
        for doc in batch:
            try:
                # Open inside a context manager so the file handle is
                # released as soon as the page has been embedded.
                with Image.open(doc['key']) as img:
                    embedding = generate_embedding(img, input_type="document")
                
                if embedding:
                    doc["embedding"] = embedding
                    embedded_docs.append(doc)
                    
                    # Validate embedding properties
                    if len(embedding) != 1024:
                        show_warning(f"Unexpected embedding dimension: {len(embedding)} (expected 1024)")
                else:
                    show_warning(f"Failed to generate embedding for {doc['key']}")
                    
            except Exception as e:
                show_error(f"Error processing {doc['key']}: {e}")
    
    # Do not report success (or overwrite a previously saved cache with an
    # empty list) when nothing could be embedded.
    if not embedded_docs:
        raise RuntimeError("No embeddings were generated")
    
    show_success(f"Successfully generated embeddings for {len(embedded_docs)} documents!")
    
    # Save embeddings to file for future use. A MongoDB `_id` (added to these
    # dicts in place by an earlier insert) is not JSON-serializable, so it is
    # left out of the cached copy.
    Path("data").mkdir(exist_ok=True)
    with open("data/embeddings_voyageai.json", "w") as f:
        json.dump(
            [{k: v for k, v in d.items() if k != "_id"} for d in embedded_docs],
            f,
        )
    
    show_info("Embeddings saved to data/embeddings_voyageai.json")
    
    # Mark step complete (no-op when progress tracking is unavailable)
    try:
        progress.mark_done("Embedding Generation", score=100, 
                          notes=f"Generated {len(embedded_docs)} embeddings")
    except NameError:
        pass
        
except Exception as e:
    show_error(f"Batch embedding generation failed: {e}")
    show_hint("Check your API key and rate limits", "API Error")

# Step 5: Data Ingestion

Ingest the generated embeddings into MongoDB Atlas.

In [None]:
# Database configuration
DB_NAME = "mongodb_aiewf"
COLLECTION_NAME = "multimodal_workshop_voyageai"

# Connect to the collection
collection = mongodb_client[DB_NAME][COLLECTION_NAME]

show_info(f"Connected to database: {DB_NAME}")
show_info(f"Using collection: {COLLECTION_NAME}")

In [None]:
# Ingest data into MongoDB
show_info("📚 Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.insert_many")

# Replace the collection contents with the freshly generated (or cached)
# embeddings. Errors are reported via show_error/show_hint, not raised.
try:
    # Resolve the data BEFORE touching the collection, so a missing or empty
    # source never leaves the collection wiped with nothing to put back.
    data_to_ingest = embedded_docs if embedded_docs else []
    
    if not data_to_ingest:
        # Try to load from saved file
        try:
            with open("data/embeddings_voyageai.json", "r") as f:
                data_to_ingest = json.load(f)
            show_info(f"Loaded {len(data_to_ingest)} documents from saved embeddings")
        except FileNotFoundError:
            show_error("No embeddings data available for ingestion")
            raise
    
    if not data_to_ingest:
        raise ValueError("No embeddings data available for ingestion")
    
    # Clear existing documents
    delete_result = collection.delete_many({})
    show_info(f"Deleted {delete_result.deleted_count} existing documents")
    
    # Bulk insert documents into the collection. insert_many adds an `_id`
    # to every dict it is given, so insert shallow copies (minus any stale
    # `_id`) to keep the in-memory records JSON-serializable and re-insertable.
    insert_result = collection.insert_many(
        [{k: v for k, v in doc.items() if k != "_id"} for doc in data_to_ingest]
    )
    
    # Verify insertion
    doc_count = collection.count_documents({})
    
    show_success(f"Successfully ingested {doc_count} documents into {COLLECTION_NAME}! 🎉")
    
    # Validate ingestion (skipped when validator/progress are unavailable)
    try:
        validator.validate_custom(
            doc_count == len(data_to_ingest),
            "All documents ingested successfully",
            f"Document count mismatch: expected {len(data_to_ingest)}, got {doc_count}"
        )
        
        progress.mark_done("Data Ingestion", score=100, 
                          notes=f"Ingested {doc_count} documents")
    except NameError:
        pass
        
except Exception as e:
    show_error(f"Data ingestion failed: {e}")
    show_hint("Check your MongoDB connection and permissions", "Database Error")

# Step 6: Vector Search Index Creation

Create a vector search index to enable similarity search on our multimodal embeddings.

In [None]:
# Show step guidance
try:
    progress.show_step_tips("Vector Index Creation")
except (NameError, AttributeError):
    show_info("Creating vector search index...")

In [None]:
VS_INDEX_NAME = "vector_index_voyageai"

# Define vector index configuration
model = {
    "name": VS_INDEX_NAME,
    "type": "vectorSearch",
    "definition": {
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 1024,
                "similarity": "cosine",
            }
        ]
    },
}

show_info(f"Index configuration: {VS_INDEX_NAME}")
show_info("Vector field: embedding")
show_info("Dimensions: 1024 (Voyage multimodal)")
show_info("Similarity metric: cosine")

In [None]:
# Create the vector search index
show_info("📚 Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_search_index")

# Create the vector search index described by `model` unless an index named
# VS_INDEX_NAME is already listed for the collection. Failures are reported
# via show_error/show_hint rather than raised.
try:
    # Check if index already exists (match on the index name only)
    existing_indexes = list(collection.list_search_indexes())
    index_exists = any(idx.get('name') == VS_INDEX_NAME for idx in existing_indexes)
    
    if index_exists:
        show_info(f"Index '{VS_INDEX_NAME}' already exists")
    else:
        show_info("Creating vector search index...")
        
        # Create the vector search index
        collection.create_search_index(model=model)
        
        show_success(f"Vector search index '{VS_INDEX_NAME}' created successfully! 🎉")
    
    # Mark step complete in both branches (existing or newly created index);
    # no-op when progress tracking is unavailable.
    try:
        progress.mark_done("Vector Index Creation", score=100, 
                          notes=f"Index '{VS_INDEX_NAME}' ready")
    except NameError:
        pass
        
except Exception as e:
    show_error(f"Index creation failed: {e}")
    show_hint("Index creation may take a few minutes. Check Atlas UI to monitor progress", 
             "Index Status")

In [None]:
# Verify index status
# List every search index on the collection with its status, then report
# whether the workshop's own index (VS_INDEX_NAME) is READY. This cell only
# reads and reports; it does not wait or retry.
try:
    indexes = list(collection.list_search_indexes())
    
    show_info("Current search indexes:")
    for idx in indexes:
        # Fall back to 'Unknown' when a field is absent from the listing
        name = idx.get('name', 'Unknown')
        status = idx.get('status', 'Unknown')
        
        if status == 'READY':
            show_success(f"✅ {name}: {status}")
        else:
            show_warning(f"⏳ {name}: {status}")
    
    # Check if our index is ready (None when no index has that name)
    our_index = next((idx for idx in indexes if idx.get('name') == VS_INDEX_NAME), None)
    
    if our_index and our_index.get('status') == 'READY':
        show_success(f"Index '{VS_INDEX_NAME}' is ready for vector search! 🚀")
    else:
        # Reached both when the index is missing and when it is not READY yet
        show_warning(f"Index '{VS_INDEX_NAME}' is still building. Please wait...")
        show_hint("Index creation can take several minutes. Check the Atlas UI for progress.", 
                 "Index Building")
        
except Exception as e:
    show_error(f"Failed to check index status: {e}")

# Step 7: Agent Tools Setup

Create the vector search tool using the VoyageAI client for query embeddings.

In [None]:
from typing import List

show_info("📚 Reference: https://www.mongodb.com/docs/atlas/atlas-vector-search/vector-search-stage/#ann-examples")

In [None]:
def get_information_for_question_answering(user_query: str) -> List[str]:
    """
    Run a vector search for a user query and return the matching page images.

    The query is embedded with ``generate_embedding`` (input type "query") and
    matched against the ``embedding`` field through the ``VS_INDEX_NAME``
    index; the two best hits are kept.

    Args:
        user_query (str): The user's query string.

    Returns:
        List[str]: Image file paths of the retrieved pages. Empty when the
        query could not be embedded or the search failed.
    """
    try:
        show_info(f"🔍 Searching for: {user_query}")

        # Embed the query; generate_embedding returns None on failure
        query_embedding = generate_embedding(user_query, input_type="query")
        if not query_embedding:
            show_error("Failed to generate query embedding")
            return []

        show_success(f"Generated query embedding: {len(query_embedding)} dimensions")

        # Stage 1: approximate nearest-neighbour search over the embeddings
        vector_search_stage = {
            "$vectorSearch": {
                "index": VS_INDEX_NAME,
                "path": "embedding",
                "queryVector": query_embedding,
                "numCandidates": 150,  # Higher for better recall
                "limit": 2,  # Top results
            }
        }
        # Stage 2: keep the page metadata and attach the similarity score
        project_stage = {
            "$project": {
                "_id": 0,
                "key": 1,
                "width": 1,
                "height": 1,
                "page_number": 1,
                "score": {"$meta": "vectorSearchScore"},
            }
        }

        hits = list(collection.aggregate([vector_search_stage, project_stage]))

        # Pair each image path with its score for reporting
        ranked = [(hit["key"], hit["score"]) for hit in hits]
        keys = [path for path, _ in ranked]

        show_success(f"Found {len(keys)} relevant images")
        for position, (path, score) in enumerate(ranked, start=1):
            show_info(f"  {position}. {path} (score: {score:.4f})")

        return keys

    except Exception as e:
        show_error(f"Vector search failed: {e}")
        return []

In [None]:
# Define function declaration for Gemini function calling
show_info("📚 Reference: https://ai.google.dev/gemini-api/docs/function-calling#step_1_define_function_declaration")

# Define the function declaration
get_information_for_question_answering_declaration = {
    "name": "get_information_for_question_answering",
    "description": "Retrieve information using vector search to answer a user query. Uses VoyageAI embeddings for enhanced similarity matching.",
    "parameters": {
        "type": "object",
        "properties": {
            "user_query": {
                "type": "string",
                "description": "Query string to use for vector search",
            }
        },
        "required": ["user_query"],
    },
}

show_success("Function declaration created for Gemini integration!")

# Mark step complete
try:
    progress.mark_done("Agent Tools Setup", score=100, 
                      notes="Vector search tool with VoyageAI client ready")
except NameError:
    pass

# Step 8: LLM Integration

Set up Gemini 2.0 Flash with function calling capabilities.

In [None]:
from google import genai
from google.genai import types
from google.genai.types import FunctionCall

LLM = "gemini-2.0-flash"

try:
    # Use GOOGLE_API_KEY from environment (required)
    api_key = os.getenv("GOOGLE_API_KEY")
    
    # Initialize Gemini client
    gemini_client = genai.Client(api_key=api_key)
    
    show_success(f"Gemini client initialized with model: {LLM}")
    show_info("Using GOOGLE_API_KEY from environment")
    
    # Validate client setup
    try:
        validator.validate_variable_exists('gemini_client', locals(), genai.Client)
    except NameError:
        pass
        
except Exception as e:
    show_error(f"LLM setup failed: {e}")
    show_hint("Check your GOOGLE_API_KEY in .env file", "API Key Error")

In [None]:
# Create generation configuration
try:
    tools = types.Tool(
        function_declarations=[get_information_for_question_answering_declaration]
    )
    tools_config = types.GenerateContentConfig(tools=[tools], temperature=0.0)
    
    show_success("Generation configuration created with function calling enabled!")
    show_info("Temperature: 0.0 (deterministic responses)")
    show_info("Available tools: get_information_for_question_answering")
    
    # Mark step complete
    try:
        progress.mark_done("LLM Integration", score=100, 
                          notes="Gemini 2.0 Flash configured with function calling")
    except NameError:
        pass
        
except Exception as e:
    show_error(f"Configuration failed: {e}")

# Step 9: Basic Agent Implementation

Create the core agent functions for tool selection and response generation.

In [None]:
show_info("📚 Reference: https://ai.google.dev/gemini-api/docs/function-calling#step_4_create_user_friendly_response")

In [None]:
def select_tool(messages: List) -> FunctionCall | None:
    """
    Use an LLM to decide which tool to call.

    Args:
        messages (List): Messages (text and/or images) to show the model.

    Returns:
        FunctionCall | None: The first function call found in the model's
        reply, or None when the model requested no tool or the request
        failed (failures are reported through ``show_error``).
    """
    try:
        system_prompt = (
            "You're an AI assistant. Based on the given information, decide which tool to use. "
            "If the user is asking to explain an image, don't call any tools unless that would help you better explain the image. "
            "Here is the provided information:\n"
        )

        # Input to the LLM: instructions first, then the caller's messages
        contents = [system_prompt, *messages]

        # Generate response using Gemini with the tool declarations attached
        response = gemini_client.models.generate_content(
            model=LLM, contents=contents, config=tools_config
        )

        candidates = response.candidates or []
        if not candidates:
            return None

        # A reply may carry no content at all, or a text part ahead of the
        # function-call part, so scan every part instead of only the first.
        content = candidates[0].content
        parts = (content.parts if content else None) or []
        for part in parts:
            if part.function_call:
                return part.function_call

        return None

    except Exception as e:
        show_error(f"Tool selection failed: {e}")
        return None

show_success("Tool selection function created!")

In [None]:
def generate_answer(user_query: str, images: List | None = None) -> str:
    """
    Execute any tools and generate a response.

    The caller's ``images`` list is never modified: retrieved images are
    collected in a private copy. (Extending a shared default list in place
    would make images from earlier calls leak into later ones.)

    Args:
        user_query (str): User's query string
        images (List | None): Image file paths supplied by the caller.
            Defaults to no images.

    Returns:
        str: LLM-generated response, or an apology message if anything fails.
    """
    try:
        # Work on a copy so neither the caller's list nor a default is mutated
        all_images = list(images) if images else []

        show_info("🔍 DEBUG: Starting generate_answer")
        show_info(f"🔍 DEBUG: User query: {user_query}")
        show_info(f"🔍 DEBUG: Initial images: {all_images}")

        # Use select_tool to determine if we need to call any tools
        tool_call = select_tool([user_query])

        show_info(f"🔍 DEBUG: Tool call result: {tool_call}")

        # If a tool call is found and it's our vector search function
        if (
            tool_call is not None
            and tool_call.name == "get_information_for_question_answering"
        ):
            show_info(f"🛠️ Agent calling tool: {tool_call.name}")
            show_info(f"🔍 DEBUG: Tool call args: {tool_call.args}")

            # Call the tool with the extracted arguments
            tool_images = get_information_for_question_answering(**tool_call.args)

            show_info(f"🔍 DEBUG: Tool returned {len(tool_images) if tool_images else 0} images")
            if tool_images:
                show_info(f"🔍 DEBUG: Image paths: {tool_images}")

            # Add retrieved images to the input images
            all_images.extend(tool_images)
        else:
            show_warning("🔍 DEBUG: No tool was called!")

        show_info(f"🔍 DEBUG: Total images to send to LLM: {len(all_images)}")
        show_info(f"🔍 DEBUG: Image paths: {all_images}")

        # Prepare system prompt
        system_prompt = (
            "Answer the questions based on the provided context only. "
            "If the context is not sufficient, say I DON'T KNOW. "
            "DO NOT use any other information to answer the question."
        )

        # Open each image; unreadable paths are reported and skipped
        valid_images = []
        for img_path in all_images:
            try:
                img = Image.open(img_path)
                valid_images.append(img)
                show_success(f"✅ DEBUG: Successfully opened image: {img_path}")
            except Exception as e:
                show_error(f"❌ DEBUG: Failed to open image {img_path}: {e}")

        show_info(f"🔍 DEBUG: Successfully opened {len(valid_images)} images")

        # Prepare contents for the LLM: instructions, question, then images
        contents = [system_prompt] + [user_query] + valid_images

        show_info(f"🔍 DEBUG: Sending to LLM - prompt + query + {len(valid_images)} images")

        # Get the response from the LLM (no tools on this second call)
        response = gemini_client.models.generate_content(
            model=LLM,
            contents=contents,
            config=types.GenerateContentConfig(temperature=0.0),
        )

        answer = response.text
        show_info(f"🔍 DEBUG: LLM response length: {len(answer)} characters")
        return answer

    except Exception as e:
        show_error(f"Answer generation failed: {e}")
        import traceback
        show_error(f"🔍 DEBUG: Full traceback:\n{traceback.format_exc()}")
        return "I apologize, but I encountered an error while processing your question."

show_success("Answer generation function with debugging created!")

In [None]:
def execute_agent(user_query: str, images: List | None = None) -> None:
    """
    Execute the agent and display the response.

    Args:
        user_query (str): User query
        images (List | None, optional): List of image file paths. Defaults to
            no images. The list is copied before use, so it is never modified.
    """
    try:
        show_info(f"🤖 Processing query: {user_query}")

        # Hand over a fresh list: answer generation may append retrieved
        # images to the list it receives, which must not leak back into the
        # caller's list or into a shared default across calls.
        response = generate_answer(user_query, list(images) if images else [])

        show_success("🤖 Agent Response:")
        print(f"\n{response}\n")

    except Exception as e:
        show_error(f"Agent execution failed: {e}")

show_success("Agent execution function created!")

# Mark step complete
try:
    progress.mark_done("Basic Agent Testing", score=100, 
                      notes="Agent functions with VoyageAI integration ready")
except NameError:
    pass

In [None]:
# Test the agent with different types of queries
show_info("🧪 Testing the agent with sample queries...")

# Test 1: Text-based query requiring vector search
# Expected: The Pass@1 accuracy of DeepSeek R1 on AIME 2024 is 79.8%.

show_info("Test 1: Factual question requiring document search")
execute_agent("What is the Pass@1 accuracy of DeepSeek R1 on AIME 2024?")

In [None]:
# DEBUG: Check MongoDB and index status
show_info("🔍 DEBUG: Checking MongoDB connection and index")

# Check collection
try:
    doc_count = collection.count_documents({})
    show_success(f"✅ Collection has {doc_count} documents")
    
    # Get a sample document to check structure
    sample_doc = collection.find_one()
    if sample_doc:
        show_info(f"Sample document keys: {list(sample_doc.keys())}")
        if 'embedding' in sample_doc:
            show_success(f"✅ Embedding field exists, length: {len(sample_doc['embedding'])}")
        else:
            show_error("❌ No embedding field in documents!")
            
        if 'key' in sample_doc:
            show_info(f"Sample image path: {sample_doc['key']}")
    else:
        show_error("❌ No documents found in collection")
        
except Exception as e:
    show_error(f"❌ MongoDB error: {e}")

# Check index status
try:
    indexes = list(collection.list_search_indexes())
    show_info(f"Search indexes: {indexes}")
    
    for idx in indexes:
        if idx.get('name') == VS_INDEX_NAME:
            status = idx.get('status', 'Unknown')
            if status == 'READY':
                show_success(f"✅ Index {VS_INDEX_NAME} is READY")
            else:
                show_error(f"❌ Index {VS_INDEX_NAME} status: {status}")
except Exception as e:
    show_error(f"❌ Index check error: {e}")

In [None]:
# DEBUG: Test vector search directly
show_info("🔍 DEBUG: Testing vector search function directly")
test_query = "What is the Pass@1 accuracy of DeepSeek R1 on AIME 2024?"
show_info(f"Test query: {test_query}")

results = get_information_for_question_answering(test_query)
show_info(f"🔍 DEBUG: Vector search returned: {results}")

if results:
    # Try to verify the images exist
    for img_path in results:
        import os
        if os.path.exists(img_path):
            show_success(f"✅ Image exists: {img_path}")
            # Try to open it
            try:
                test_img = Image.open(img_path)
                show_success(f"✅ Can open image: {img_path} - Size: {test_img.size}")
            except Exception as e:
                show_error(f"❌ Cannot open image: {e}")
        else:
            show_error(f"❌ Image does NOT exist: {img_path}")
else:
    show_warning("⚠️ No results returned from vector search")

In [None]:
# Test 2: Image explanation
import os

# Only run the page-analysis test when at least one processed page is available
if not docs or len(docs) <= 0:
    show_warning("No document pages available for testing")
else:
    show_info("Test 2: Document page analysis")
    # Use the first processed page as the image under test
    execute_agent("What can you see in this document page?", [docs[0]['key']])

# Step 10: Memory Implementation

Add conversational memory to enable multi-turn conversations with context retention.

In [None]:
from datetime import datetime

# Conversation history is kept in its own collection inside the workshop database
history_db = mongodb_client[DB_NAME]
history_collection = history_db["history_voyageai"]

show_info(f"Setting up conversation memory in: {DB_NAME}.history_voyageai")
show_info("📚 Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_index")

In [None]:
# Index the session identifier so per-session history lookups stay efficient
try:
    # Explicit (field, direction) form; 1 means ascending, which is what a bare
    # field name defaults to.
    history_collection.create_index([("session_id", 1)])

    show_success("Session index created for conversation history!")

except Exception as e:
    show_error(f"Index creation failed: {e}")

In [None]:
def store_chat_message(session_id: str, role: str, type: str, content: str) -> None:
    """
    Persist a single chat turn in the history collection.

    Failures are reported through show_error and are not re-raised.

    Args:
        session_id (str): Session the message belongs to
        role (str): Who produced the message, 'user' or 'agent'
        type (str): Kind of message, 'text' or 'image'
        content (str): The message text, or the image path for image messages
    """
    try:
        # Stamp the record at write time so history can be replayed in order
        written_at = datetime.now()
        record = {
            "session_id": session_id,
            "role": role,
            "type": type,
            "content": content,
            "timestamp": written_at,
        }
        history_collection.insert_one(record)
    except Exception as e:
        show_error(f"Failed to store chat message: {e}")

show_success("Chat message storage function created!")

In [None]:
def retrieve_session_history(session_id: str) -> List:
    """
    Load the stored conversation for one session, oldest message first.

    Text messages are returned as their stored content; image messages are
    opened from the stored path and returned as image objects. Images that
    cannot be opened are skipped with a warning.

    Args:
        session_id (str): Session ID

    Returns:
        List: Messages (text and images) in timestamp order, or an empty list
        if retrieval fails
    """
    try:
        show_info("📚 Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/cursor.html#pymongo.cursor.Cursor.sort")

        # Fetch this session's messages in ascending timestamp order
        session_filter = {"session_id": session_id}
        cursor = history_collection.find(session_filter).sort("timestamp", 1)

        history = []
        if not cursor:
            return history

        for entry in cursor:
            kind = entry["type"]

            # Text is replayed verbatim
            if kind == "text":
                history.append(entry["content"])
                continue

            # Anything that is neither text nor image is ignored
            if kind != "image":
                continue

            # Images are stored as paths and re-opened on retrieval
            try:
                history.append(Image.open(entry["content"]))
            except Exception as e:
                show_warning(f"Could not load image {entry['content']}: {e}")

        return history

    except Exception as e:
        show_error(f"Failed to retrieve session history: {e}")
        return []

show_success("Session history retrieval function created!")

In [None]:
# Enhanced generate_answer function with memory
def generate_answer_with_memory(session_id: str, user_query: str, images: List = None) -> str:
    """
    Execute tools and generate a response using the session's conversation memory.

    The stored history for the session is retrieved and passed both to tool
    selection and to the final generation call. After a successful generation
    the user query, every image used (user-supplied and tool-retrieved) and the
    answer are written back to the history collection.

    Args:
        session_id (str): Session ID for conversation tracking
        user_query (str): User's query string
        images (List): List of image file paths. Defaults to None, which is
            treated as an empty list. The caller's list is never modified.

    Returns:
        str: LLM-generated response, or an apology message if any step fails
    """
    try:
        # Work on a private copy. Extending the argument in place would leak
        # retrieved images into the caller's list, and into a shared default
        # list, so later unrelated calls would silently carry old images.
        images = list(images) if images else []

        # Retrieve conversation history
        history = retrieve_session_history(session_id)

        show_info(f"Retrieved {len(history)} previous messages for session {session_id}")

        # Determine if tools need to be called
        tool_call = select_tool(history + [user_query])

        if (
            tool_call is not None
            and tool_call.name == "get_information_for_question_answering"
        ):
            show_info(f"🛠️ Agent calling tool: {tool_call.name}")
            tool_images = get_information_for_question_answering(**tool_call.args)
            # Tolerate a tool that yields nothing
            images.extend(tool_images or [])

        # Generate response with history context
        system_prompt = (
            "Answer the questions based on the provided context only. "
            "If the context is not sufficient, say I DON'T KNOW. "
            "DO NOT use any other information to answer the question."
        )

        contents = (
            [system_prompt]
            + history
            + [user_query]
            + [Image.open(image) for image in images]
        )

        response = gemini_client.models.generate_content(
            model=LLM,
            contents=contents,
            config=types.GenerateContentConfig(temperature=0.0),
        )

        answer = response.text
        if answer is None:
            # Never persist an empty turn: a None entry in the history would be
            # replayed as model input on the next call. The handler below
            # reports it and returns the apology message.
            raise ValueError("Model returned no text response")

        # Store conversation in memory: the query, the images it used, then the answer
        store_chat_message(session_id, "user", "text", user_query)

        for image in images:
            store_chat_message(session_id, "user", "image", image)

        store_chat_message(session_id, "agent", "text", answer)

        return answer

    except Exception as e:
        show_error(f"Memory-enabled answer generation failed: {e}")
        return "I apologize, but I encountered an error while processing your question."

show_success("Memory-enabled answer generation function created!")

In [None]:
# Enhanced execute_agent function with memory
def execute_agent_with_memory(session_id: str, user_query: str, images: List = None) -> None:
    """
    Execute the agent with conversation memory and print its response.

    Errors are reported through show_error and are not re-raised.

    Args:
        session_id (str): Session ID for conversation tracking
        user_query (str): User query
        images (List, optional): List of image file paths. Defaults to None,
            which is treated as an empty list. The caller's list is never
            modified.
    """
    try:
        show_info(f"🧠 Session {session_id} - Processing: {user_query}")

        # Hand over a fresh list on every call. The answer generator may append
        # retrieved images to what it receives, and a shared default list would
        # accumulate them across otherwise unrelated calls.
        request_images = list(images) if images else []
        response = generate_answer_with_memory(session_id, user_query, request_images)

        show_success("🤖 Agent Response:")
        print(f"\n{response}\n")

    except Exception as e:
        show_error(f"Memory-enabled agent execution failed: {e}")

show_success("Memory-enabled agent execution function created!")

# Mark step complete
try:
    progress.mark_done("Memory Implementation", score=100, 
                      notes="Conversation memory system implemented")
except NameError:
    # Progress tracking is optional; skip it when `progress` was never created
    pass

In [None]:
# Exercise the memory-enabled agent
show_info("🧪 Testing memory-enabled agent...")

# Opening question of the session; the follow-up test reuses the same session ID
show_info("Test 1: Initial query")
execute_agent_with_memory(
    session_id="session_voyageai_1",
    user_query="What is the Pass@1 accuracy of Deepseek R1 on the MATH500 benchmark?",
)

In [None]:
# Ask about the previous turn in the same session to confirm history is recalled
show_info("Test 2: Follow-up query to test memory")
execute_agent_with_memory(
    session_id="session_voyageai_1",
    user_query="What did I just ask you?",
)

# Step 11: ReAct Agent Enhancement

Implement a ReAct (Reasoning + Acting) agent that can reason about whether it has enough information and iteratively gather more data if needed.

In [None]:
def generate_answer_react(user_query: str, images: List = None, max_iterations: int = 3) -> str:
    """
    Implement a ReAct (Reasoning + Acting) agent with VoyageAI embeddings.

    On each iteration the model is shown the reasoning prompt plus everything
    gathered so far and must reply with either 'ANSWER: <answer>' or
    'TOOL: <question>'. A TOOL reply triggers tool selection and, when the
    vector search tool is chosen, the retrieved images are added to the
    gathered information for the next iteration.

    Args:
        user_query (str): User's query string
        images (List): List of image file paths. Defaults to None, which is
            treated as an empty list.
        max_iterations (int): Maximum number of reasoning/acting rounds before
            giving up. Defaults to 3.

    Returns:
        str: The final answer, or an apology message if no answer was reached
        within max_iterations or an error occurred
    """
    try:
        show_info("🧠 Starting ReAct agent processing with VoyageAI embeddings...")

        # Define reasoning prompt
        system_prompt = [
            (
                "You are an AI assistant with access to high-quality VoyageAI embeddings for document search. "
                "Based on the current information, decide if you have enough to answer the user query, or if you need more information. "
                "If you have enough information, respond with 'ANSWER: <your answer>'. "
                "If you need more information, respond with 'TOOL: <question for the tool>'. Keep the question concise. "
                f"User query: {user_query}\n"
                "Current information:\n"
            )
        ]

        # Everything gathered so far; it is appended to the prompt on each round
        current_information = []

        # If the user provided images, add them to current information
        if images:
            current_information.extend([Image.open(image) for image in images])
            show_info(f"Added {len(images)} user-provided images to context")

        # Run the reasoning → action loop, bounded to prevent infinite loops
        for current_iteration in range(1, max_iterations + 1):
            show_info(f"🔄 ReAct Iteration {current_iteration}:")

            # Generate reasoning and decision
            response = gemini_client.models.generate_content(
                model=LLM,
                contents=system_prompt + current_information,
                config=types.GenerateContentConfig(temperature=0.0),
            )

            # The response may carry no text at all. Treat that as an
            # unrecognised directive so the loop can retry instead of crashing
            # on string operations against None.
            decision = response.text or ""
            show_info(f"💭 Agent decision: {decision[:100]}...")

            # If the agent has the final answer, return it
            if "ANSWER:" in decision:
                final_answer = decision.split("ANSWER:", 1)[1].strip()
                show_success(f"✅ Final answer reached in {current_iteration} iterations")
                return final_answer

            # If the agent decides to use a tool
            elif "TOOL:" in decision:
                tool_query = decision.split("TOOL:", 1)[1].strip()
                show_info(f"🛠️ Agent requesting tool with query: {tool_query}")

                # Use tool selection to get the function call
                tool_call = select_tool([tool_query])

                if (
                    tool_call is not None
                    and tool_call.name == "get_information_for_question_answering"
                ):
                    show_info(f"📊 Calling VoyageAI-powered vector search with: {tool_call.args}")

                    # Call the tool and add results to current information
                    tool_images = get_information_for_question_answering(**tool_call.args)

                    if tool_images:
                        new_images = [Image.open(image) for image in tool_images]
                        current_information.extend(new_images)
                        show_success(f"➕ Added {len(new_images)} retrieved images to context")
                    else:
                        show_warning("No relevant images found")
                        current_information.append("No relevant visual information found for this query.")
                else:
                    show_warning("Tool selection failed or returned unexpected tool")
                    current_information.append("Tool call failed.")
            else:
                show_warning("Agent response didn't contain ANSWER or TOOL directive")
                current_information.append("Unable to determine next action.")

        # If we've exhausted iterations without a final answer
        show_warning(f"⚠️ Reached maximum iterations ({max_iterations}) without final answer")
        return "I apologize, but I couldn't find a definitive answer after exploring the available information. Please try rephrasing your question or asking for more specific details."

    except Exception as e:
        show_error(f"ReAct agent failed: {e}")
        return "I apologize, but I encountered an error while processing your question with the ReAct approach."

show_success("ReAct agent with VoyageAI integration completed!")

In [None]:
def execute_react_agent(user_query: str, images: List = []) -> None:
    """
    Run the ReAct agent on a query and print the answer it produces.

    Errors are reported through show_error and are not re-raised.

    Args:
        user_query (str): User query
        images (List, optional): List of image file paths. Defaults to [].
    """
    try:
        show_info(f"🦸‍♀️ ReAct Agent Processing: {user_query}")

        final_response = generate_answer_react(user_query, images)

        show_success("🤖 ReAct Agent Final Response:")
        # Surround the answer with blank lines so it stands out in the output
        print(f"\n{final_response}\n")

    except Exception as e:
        show_error(f"ReAct agent execution failed: {e}")

show_success("ReAct agent execution function created!")

# Record completion of the last workshop step when progress tracking is available
try:
    progress.mark_done(
        "ReAct Agent Enhancement",
        score=100,
        notes="ReAct reasoning and acting agent with VoyageAI implemented",
    )
except NameError:
    # `progress` was never created; tracking is optional
    pass

In [None]:
# Exercise the ReAct agent
show_info("🧪 Testing ReAct agent with VoyageAI embeddings...")

# No images are supplied, so the agent has to retrieve document pages itself
show_info("Test 1: Complex factual question")
execute_react_agent(
    user_query="What is the Pass@1 accuracy of Deepseek R1 on the MATH500 benchmark?",
)

In [None]:
# Test 2: ask the ReAct agent about a page image supplied up front
if not docs or len(docs) <= 0:
    show_warning("No document pages available for ReAct testing")
else:
    show_info("Test 2: Document page analysis with ReAct")
    execute_react_agent(
        user_query="What technical concepts are discussed in this document page?",
        images=[docs[0]['key']],
    )

# 🎉 Workshop Complete!

Congratulations! You've successfully built a comprehensive multimodal AI agent system with VoyageAI integration.

In [None]:
# Final progress summary
# Shows the tracked progress when a `progress` object is available. If it is
# missing, or does not provide one of the methods used below, the cell falls
# back to a plain completion message instead of failing.
try:
    show_success("🎓 VoyageAI Workshop Completed Successfully!")
    
    # Display final progress
    progress.display_progress(detailed=True)
    
    # Show completion statistics
    completion_rate = progress.get_completion_rate()
    avg_score = progress.get_average_score()
    
    show_info(f"📊 Overall Completion: {completion_rate:.1f}%")
    if avg_score:
        show_info(f"📈 Average Score: {avg_score:.1f}/100")
    
    # Show what was accomplished
    show_success("""
    🚀 What You've Built with VoyageAI:
    
    ✅ PDF processing pipeline for multimodal content
    ✅ VoyageAI Python client for high-quality embeddings
    ✅ MongoDB Atlas vector search integration
    ✅ AI agent with function calling capabilities
    ✅ Conversational memory system
    ✅ ReAct (Reasoning + Acting) agent architecture
    ✅ Production-ready multimodal AI application
    ✅ Optimized query vs document embedding types
    ✅ Vector normalization and quality checks
    """)
    
    # Next steps
    show_info("""
    🎯 Next Steps with VoyageAI:
    
    • Experiment with different VoyageAI models (voyage-2 vs voyage-lite-02-instruct)
    • Implement batch processing for large document collections
    • Add VoyageAI reranking for improved search quality
    • Monitor usage and costs with response.usage logging
    • Integrate with production applications using proper API key management
    • Explore hybrid search combining keywords and vector similarity
    """)
    
except (NameError, AttributeError):
    # NameError: `progress` was never created.
    # AttributeError: `progress` exists but lacks one of the methods above.
    # The analytics export cell handles the same pair of exceptions.
    show_success("🎓 VoyageAI Workshop completed successfully!")
    show_info("All agent implementations with VoyageAI client are ready for use.")

In [None]:
# Optional: write the progress analytics to disk when the tracker supports it
try:
    if hasattr(progress, 'export_analytics_json'):
        exported_path = progress.export_analytics_json()
        show_success(f"📄 Progress analytics exported to: {exported_path}")

        # Report headline figures when a summary is available
        analytics = progress.get_analytics_summary()
        if analytics:
            session_seconds = analytics.get('session_duration', 'N/A')
            interaction_count = analytics.get('total_events', 'N/A')
            show_info(f"⏱️ Total session time: {session_seconds} seconds")
            show_info(f"📝 Total interactions: {interaction_count}")
except (NameError, AttributeError):
    # Best effort only: skip silently when progress tracking is unavailable
    pass

show_success("Thank you for completing the Multimodal Agents Workshop with VoyageAI! 🙏")
show_info("🔗 Learn more about VoyageAI: https://www.voyageai.com/")
show_info("📚 VoyageAI Documentation: https://docs.voyageai.com/")