#### Multi Model Agent (MMA)

In [None]:
import json
import logging
from typing import List, Dict, Any

from langchain_core.messages import HumanMessage
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI

# Unstructured for document parsing
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# Langchain components for vector store and LLM
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

<img src="../images/multi-model-agent_v2.png" alt="Multi Model Agent (MMA) Building Guide" width="600">

In [47]:
def partition_document(file_path: str) -> List[Document]:
    """Partition a PDF document into structured elements using Unstructured."""
    elements = partition_pdf(
        filename=file_path, # Path to the PDF document
        strategy="hi_res", # Use high resolution strategy for better text extraction
        infer_table_structure=True, # Infer tables as structured data
        extract_image_block_types=["Image"], # Extract images as blocks
        extract_image_block_to_payload=True # Store image as base64 in the payload
    )
    print(f"Partitioned document into {len(elements)} elements.")
    return elements

file_path = "../data/NIPS-2017.pdf"
elements = partition_document(file_path)

INFO: Reading PDF for file: ../data/NIPS-2017.pdf ...


Partitioned document into 183 elements.


In [48]:
print(f"Extracted {len(elements)} atomic elements from the document.")
elements

Extracted 183 atomic elements from the document.


[<unstructured.documents.elements.Title at 0x31f81c4b0>,
 <unstructured.documents.elements.NarrativeText at 0x31f81c0c0>,
 <unstructured.documents.elements.NarrativeText at 0x33fbbfc40>,
 <unstructured.documents.elements.Text at 0x33fbbfa80>,
 <unstructured.documents.elements.NarrativeText at 0x33fbbfd20>,
 <unstructured.documents.elements.NarrativeText at 0x33fbbfee0>,
 <unstructured.documents.elements.Text at 0x35272c910>,
 <unstructured.documents.elements.NarrativeText at 0x35272cbb0>,
 <unstructured.documents.elements.NarrativeText at 0x35272c980>,
 <unstructured.documents.elements.Text at 0x35272cad0>,
 <unstructured.documents.elements.NarrativeText at 0x35272c830>,
 <unstructured.documents.elements.Text at 0x35272c8a0>,
 <unstructured.documents.elements.NarrativeText at 0x35272cf30>,
 <unstructured.documents.elements.Title at 0x35272ce50>,
 <unstructured.documents.elements.NarrativeText at 0x35272d1d0>,
 <unstructured.documents.elements.Title at 0x35272cc90>,
 <unstructured.docum

In [49]:
st = set([str(type(el)) for el in elements])
print("Total unique element types:", len(st))
st

Total unique element types: 10


{"<class 'unstructured.documents.elements.FigureCaption'>",
 "<class 'unstructured.documents.elements.Footer'>",
 "<class 'unstructured.documents.elements.Formula'>",
 "<class 'unstructured.documents.elements.Header'>",
 "<class 'unstructured.documents.elements.Image'>",
 "<class 'unstructured.documents.elements.ListItem'>",
 "<class 'unstructured.documents.elements.NarrativeText'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.Text'>",
 "<class 'unstructured.documents.elements.Title'>"}

In [50]:
elements[0].to_dict()

{'type': 'Title',
 'element_id': '7d427e09b0d18f62624da1336e2231a0',
 'text': 'Attention Is All You Need',
 'metadata': {'detection_class_prob': 0.5234606266021729,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(1026.815673828125),
     np.float64(483.33842375000006)),
    (np.float64(1026.815673828125), np.float64(567.9808349609375)),
    (np.float64(1946.5042724609375), np.float64(567.9808349609375)),
    (np.float64(1946.5042724609375), np.float64(483.33842375000006))),
   'system': 'PixelSpace',
   'layout_width': 2975,
   'layout_height': 3850},
  'last_modified': '2026-02-27T01:05:38',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 1,
  'file_directory': '../data',
  'filename': 'NIPS-2017.pdf'}}

In [51]:
tables = [el for el in elements if el.category == "Table"]
print(f"Extracted {len(tables)} tables from the document.")
tables[0].to_dict()

Extracted 3 tables from the document.


{'type': 'Table',
 'element_id': '9922caf9a703b5d75eacdf3b76bc27bc',
 'text': 'Layer Type Complexity per Layer Sequential Maximum Path Length Operations Self-Attention O(n2 · d) O(1) O(1) Recurrent O(n · d2) O(n) O(n) Convolutional O(k · n · d2) O(1) O(logk(n)) Self-Attention (restricted) O(r · n · d) O(1) O(n/r)',
 'metadata': {'detection_class_prob': 0.9264091849327087,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(565.9496459960938),
     np.float64(553.2269897460938)),
    (np.float64(565.9496459960938), np.float64(907.312255859375)),
    (np.float64(2380.64453125), np.float64(907.312255859375)),
    (np.float64(2380.64453125), np.float64(553.2269897460938))),
   'system': 'PixelSpace',
   'layout_width': 2975,
   'layout_height': 3850},
  'last_modified': '2026-02-27T01:05:38',
  'text_as_html': '<table><thead><tr><th>Layer Type</th><th>Complexity per Layer</th><th>Sequential Operations</th><th>Maximum Path Length</th></tr></thead><tbody><tr><td>Self-Attention

In [52]:
def create_chunks_by_title(elements: List[Document]) -> List[Document]:
    """Chunk the document elements by title using Unstructured."""
    chunks = chunk_by_title(
        elements=elements, # List of document elements to chunk
        max_characters=3000, # Maximum characters per chunk
        new_after_n_chars=2500, # Force new chunk after this many characters
        combine_text_under_n_chars=500 # Combine small text elements into chunks
    )
    print(f"Created {len(chunks)} chunks by title.")
    return chunks

chunks = create_chunks_by_title(elements)
# print(f"First chunk content:\n{chunks[0].text[:500]}...")
chunks

Created 21 chunks by title.


[<unstructured.documents.elements.CompositeElement at 0x17f83cd70>,
 <unstructured.documents.elements.CompositeElement at 0x17f83d240>,
 <unstructured.documents.elements.CompositeElement at 0x16d104de0>,
 <unstructured.documents.elements.CompositeElement at 0x350f5d0f0>,
 <unstructured.documents.elements.CompositeElement at 0x17f83cc20>,
 <unstructured.documents.elements.CompositeElement at 0x350f5c280>,
 <unstructured.documents.elements.CompositeElement at 0x350f5d940>,
 <unstructured.documents.elements.CompositeElement at 0x350f5d470>,
 <unstructured.documents.elements.CompositeElement at 0x350f5fb60>,
 <unstructured.documents.elements.CompositeElement at 0x352226660>,
 <unstructured.documents.elements.CompositeElement at 0x352225ef0>,
 <unstructured.documents.elements.CompositeElement at 0x352225f60>,
 <unstructured.documents.elements.CompositeElement at 0x3522267b0>,
 <unstructured.documents.elements.CompositeElement at 0x3522266d0>,
 <unstructured.documents.elements.CompositeEleme

In [53]:
chunks[0].to_dict()

{'type': 'CompositeElement',
 'element_id': '7bd06068-eeba-487d-8b6a-a5cc5a73c649',
 'text': 'Attention Is All You Need\n\nAshish Vaswani∗ Google Brain avaswani@google.com\n\nNoam Shazeer∗ Google Brain noam@google.com\n\nNiki Parmar∗\n\nGoogle Research nikip@google.com\n\nJakob Uszkoreit∗ Google Research usz@google.com\n\nLlion Jones∗\n\nGoogle Research llion@google.com\n\nAidan N. Gomez∗ † University of Toronto aidan@cs.toronto.edu\n\nŁukasz Kaiser∗\n\nGoogle Brain lukaszkaiser@google.com\n\nIllia Polosukhin∗ ‡\n\nillia.polosukhin@gmail.com\n\nAbstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation ta

In [54]:
chunks[4].metadata.orig_elements

[<unstructured.documents.elements.Title at 0x35272d630>,
 <unstructured.documents.elements.NarrativeText at 0x35272d710>,
 <unstructured.documents.elements.Footer at 0x35272d7f0>,
 <unstructured.documents.elements.Image at 0x3527ace50>,
 <unstructured.documents.elements.FigureCaption at 0x345a5d6a0>,
 <unstructured.documents.elements.NarrativeText at 0x33fbbf8c0>,
 <unstructured.documents.elements.NarrativeText at 0x35272c9f0>]

In [55]:
chunks[4].metadata.orig_elements[3].to_dict()

{'type': 'Image',
 'element_id': '52dc7d4e03f111468c605cbb728eaf49',
 'text': 'Output Probabilities Add & Norm Feed Forward Add & Norm Multi-Head Attention a, Add & Norm Nx Add & Norm Feed Forward Nx | —-Casda Nom] Add & Norm VWEeea Multi-Head Multi-Head Attention Attention Sy ae, SE a, Positional CY Encoding ® Positional @ q Encoding Input Embedding Inputs Outputs (shifted right) Output Embedding',
 'metadata': {'coordinates': {'points': ((np.float64(955.4951388888888),
     np.float64(350.00972222222197)),
    (np.float64(955.4951388888888), np.float64(1917.309722222222)),
    (np.float64(2019.4951388888885), np.float64(1917.309722222222)),
    (np.float64(2019.4951388888885), np.float64(350.00972222222197))),
   'system': 'PixelSpace',
   'layout_width': 2975,
   'layout_height': 3850},
  'last_modified': '2026-02-27T01:05:38',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 3,
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLD

In [56]:
chunks[10].metadata.orig_elements

[<unstructured.documents.elements.Title at 0x35272ec10>,
 <unstructured.documents.elements.NarrativeText at 0x35272ecf0>,
 <unstructured.documents.elements.Footer at 0x35272edd0>,
 <unstructured.documents.elements.NarrativeText at 0x33fbbf700>,
 <unstructured.documents.elements.Table at 0x3527f3c50>,
 <unstructured.documents.elements.NarrativeText at 0x35272d5c0>,
 <unstructured.documents.elements.NarrativeText at 0x35272dbe0>,
 <unstructured.documents.elements.Formula at 0x350b224e0>,
 <unstructured.documents.elements.NarrativeText at 0x35272e120>,
 <unstructured.documents.elements.NarrativeText at 0x35272e430>]

In [57]:
chunks[10].metadata.orig_elements[4].to_dict()

{'type': 'Table',
 'element_id': '9922caf9a703b5d75eacdf3b76bc27bc',
 'text': 'Layer Type Complexity per Layer Sequential Maximum Path Length Operations Self-Attention O(n2 · d) O(1) O(1) Recurrent O(n · d2) O(n) O(n) Convolutional O(k · n · d2) O(1) O(logk(n)) Self-Attention (restricted) O(r · n · d) O(1) O(n/r)',
 'metadata': {'detection_class_prob': 0.9264091849327087,
  'is_extracted': 'true',
  'coordinates': {'points': ((np.float64(565.9496459960938),
     np.float64(553.2269897460938)),
    (np.float64(565.9496459960938), np.float64(907.312255859375)),
    (np.float64(2380.64453125), np.float64(907.312255859375)),
    (np.float64(2380.64453125), np.float64(553.2269897460938))),
   'system': 'PixelSpace',
   'layout_width': 2975,
   'layout_height': 3850},
  'last_modified': '2026-02-27T01:05:38',
  'text_as_html': '<table><thead><tr><th>Layer Type</th><th>Complexity per Layer</th><th>Sequential Operations</th><th>Maximum Path Length</th></tr></thead><tbody><tr><td>Self-Attention

In [58]:
# Configure logging for the module
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

In [59]:
def extract_chunk_content(chunk: Any) -> Dict[str, Any]:
    """Extract text, tables, and images from a document chunk."""
    content_data = {
        'text': chunk.text,
        'tables': [],
        'images': [],
        'types': {'text'}  # Using a set to naturally handle duplicates
    }
    
    if not (hasattr(chunk, 'metadata') and hasattr(chunk.metadata, 'orig_elements')):
        return content_data
        
    for element in chunk.metadata.orig_elements:
        element_type = type(element).__name__
        
        if element_type == 'Table':
            content_data['types'].add('table')
            table_html = getattr(element.metadata, 'text_as_html', element.text)
            content_data['tables'].append(table_html)
            
        elif element_type == 'Image':
            if hasattr(element, 'metadata') and hasattr(element.metadata, 'image_base64'):
                content_data['types'].add('image')
                content_data['images'].append(element.metadata.image_base64)
    
    content_data['types'] = list(content_data['types'])
    return content_data

def _create_fallback_summary(text: str, tables: List[str], images: List[str]) -> str:
    """Helper to create a basic text summary when AI generation fails."""
    summary = f"{text[:300]}..."
    if tables:
        summary += f" [Contains {len(tables)} table(s)]"
    if images:
        summary += f" [Contains {len(images)} image(s)]"
    return summary

def generate_content_summary(text: str, tables: List[str], images: List[str]) -> str:
    """Generate an AI-enhanced summary for chunks containing mixed media."""
    try:
        llm = ChatOpenAI(model="gpt-4o", temperature=0)
        
        prompt_text = (
            "You are creating a searchable description for document content retrieval.\n\n"
            "CONTENT TO ANALYZE:\n"
            f"TEXT CONTENT:\n{text}\n\n"
        )
        
        if tables:
            prompt_text += "TABLES:\n"
            for i, table in enumerate(tables):
                prompt_text += f"Table {i+1}:\n{table}\n\n"
        
        # This section is now guaranteed to append regardless of table presence
        prompt_text += (
            "YOUR TASK:\n"
            "Generate a comprehensive, searchable description that covers:\n"
            "1. Key facts, numbers, and data points from text and tables\n"
            "2. Main topics and concepts discussed\n"
            "3. Questions this content could answer\n"
            "4. Visual content analysis (charts, diagrams, patterns in images)\n"
            "5. Alternative search terms users might use\n\n"
            "Make it detailed and searchable - prioritize findability over brevity.\n"
            "SEARCHABLE DESCRIPTION:"
        )

        message_content: List[Dict[str, Any]] = [{"type": "text", "text": prompt_text}]
        
        for image_base64 in images:
            message_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
            })
        
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])
        
        return response.content
        
    except Exception as e:
        logger.error(f"AI summary failed: {e}")
        return _create_fallback_summary(text, tables, images)

def process_document_chunks(chunks: List[Any]) -> List[Document]:
    """Process all chunks and enrich them with AI summaries if needed."""
    logger.info(f"Processing {len(chunks)} chunks with AI Summaries...")
    
    langchain_documents = []
    
    for i, chunk in enumerate(chunks, 1):
        logger.info(f"Processing chunk {i}/{len(chunks)}")
        content_data = extract_chunk_content(chunk)
        
        if content_data['tables'] or content_data['images']:
            logger.info("Generating AI summary for mixed content...")
            enhanced_content = generate_content_summary(
                content_data['text'],
                content_data['tables'], 
                content_data['images']
            )
        else:
            enhanced_content = content_data['text']
        
        doc = Document(
            page_content=enhanced_content,
            metadata={
                "original_content": json.dumps({
                    "raw_text": content_data['text'],
                    "tables_html": content_data['tables'],
                    "images_base64": content_data['images']
                })
            }
        )
        langchain_documents.append(doc)
    
    logger.info(f"Successfully processed {len(langchain_documents)} chunks.")
    return langchain_documents

# Example usage:
processed_chunks = process_document_chunks(chunks)

INFO: Processing 21 chunks with AI Summaries...
INFO: Processing chunk 1/21
INFO: Processing chunk 2/21
INFO: Processing chunk 3/21
INFO: Processing chunk 4/21
INFO: Processing chunk 5/21
INFO: Generating AI summary for mixed content...
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: Processing chunk 6/21
INFO: Generating AI summary for mixed content...
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: Processing chunk 7/21
INFO: Processing chunk 8/21
INFO: Processing chunk 9/21
INFO: Processing chunk 10/21
INFO: Processing chunk 11/21
INFO: Generating AI summary for mixed content...
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: Processing chunk 12/21
INFO: Processing chunk 13/21
INFO: Processing chunk 14/21
INFO: Processing chunk 15/21
INFO: Generating AI summary for mixed content...
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTT

In [60]:
processed_chunks

[Document(metadata={'original_content': '{"raw_text": "Attention Is All You Need\\n\\nAshish Vaswani\\u2217 Google Brain avaswani@google.com\\n\\nNoam Shazeer\\u2217 Google Brain noam@google.com\\n\\nNiki Parmar\\u2217\\n\\nGoogle Research nikip@google.com\\n\\nJakob Uszkoreit\\u2217 Google Research usz@google.com\\n\\nLlion Jones\\u2217\\n\\nGoogle Research llion@google.com\\n\\nAidan N. Gomez\\u2217 \\u2020 University of Toronto aidan@cs.toronto.edu\\n\\n\\u0141ukasz Kaiser\\u2217\\n\\nGoogle Brain lukaszkaiser@google.com\\n\\nIllia Polosukhin\\u2217 \\u2021\\n\\nillia.polosukhin@gmail.com\\n\\nAbstract\\n\\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convoluti

In [61]:
def export_chunks_to_json(
    chunks: List[Document], 
    filename: str = "chunks_export.json"
) -> List[Dict[str, Any]]:
    """Export processed chunks to a formatted JSON file."""
    export_data = []
    
    for i, doc in enumerate(chunks, 1):
        # Safely parse metadata, falling back to an empty dict if decoding fails
        try:
            original_content = json.loads(doc.metadata.get("original_content", "{}"))
        except json.JSONDecodeError:
            logger.warning(f"Failed to parse original_content JSON for chunk {i}.")
            original_content = {}

        chunk_data = {
            "chunk_id": i,
            "enhanced_content": doc.page_content,
            "metadata": {
                "original_content": original_content
            }
        }
        export_data.append(chunk_data)
    
    # Safely write to the filesystem
    try:
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)
        logger.info(f"Successfully exported {len(export_data)} chunks to {filename}")
    except IOError as e:
        logger.error(f"Failed to write chunks to {filename}: {e}")
    
    return export_data

# Example usage:
json_data = export_chunks_to_json(processed_chunks)

INFO: Successfully exported 21 chunks to chunks_export.json


In [62]:
def create_vector_store(
    documents: List[Document], 
    persist_directory: str = "dbv1/chroma_db"
) -> Chroma:
    """Create and persist a ChromaDB vector store from processed documents."""
    logger.info("Initializing embedding model and creating ChromaDB vector store...")
    
    try:
        embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
        
        logger.info("Building vector store from documents...")
        vectorstore = Chroma.from_documents(
            documents=documents,
            embedding=embedding_model,
            persist_directory=persist_directory, 
            collection_metadata={"hnsw:space": "cosine"}
        )
        
        logger.info(f"Vector store successfully created and saved to '{persist_directory}'.")
        return vectorstore
        
    except Exception as e:
        logger.error(f"Failed to create or persist the vector store: {e}")
        raise  # Re-raise the exception since the pipeline cannot proceed without the DB

# Example usage:
db = create_vector_store(processed_chunks)

INFO: Initializing embedding model and creating ChromaDB vector store...
INFO: Building vector store from documents...
INFO: Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: Vector store successfully created and saved to 'dbv1/chroma_db'.


In [65]:
def retrieve_and_export_chunks(
    db: Chroma, 
    query: str, 
    k: int = 3, 
    output_filename: str = "rag_results.json"
) -> List[Document]:
    """Retrieve relevant chunks for a query and export them to a JSON file."""
    logger.info(f"Setting up retriever for query: '{query}'")
    
    try:
        # Initialize retriever with dynamic 'k'
        retriever = db.as_retriever(search_kwargs={"k": k})
        
        logger.info(f"Retrieving top {k} chunks from the vector store...")
        chunks = retriever.invoke(query)
        
        logger.info(f"Successfully retrieved {len(chunks)} chunks.")
        
        # Utilize the previously defined export function
        export_chunks_to_json(chunks, filename=output_filename)
        
        return chunks
        
    except Exception as e:
        logger.error(f"Failed during retrieval or export: {e}")
        return []

# Example usage:
query = "What are the two main components of the Transformer architecture?"
retrieved_chunks = retrieve_and_export_chunks(db, query, k=3)

INFO: Setting up retriever for query: 'What are the two main components of the Transformer architecture?'
INFO: Retrieving top 3 chunks from the vector store...
INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: Successfully retrieved 3 chunks.
INFO: Successfully exported 3 chunks to rag_results.json


In [66]:
def run_complete_ingestion_pipeline(
    pdf_path: str, 
    persist_directory: str = "dbv2/chroma_db"
) -> Chroma:
    """Run the complete RAG ingestion pipeline from PDF to vector store."""
    logger.info("Starting RAG Ingestion Pipeline")
    logger.info("-" * 50)
    
    try:
        # Step 1: Extract elements from the document
        logger.info(f"Partitioning document: {pdf_path}")
        elements = partition_document(pdf_path)
        
        # Step 2: Group elements into logical chunks
        logger.info("Creating chunks by title...")
        chunks = create_chunks_by_title(elements)
        
        # Step 3: Enhance chunks with AI-generated summaries
        # Note: Using the refactored function name from previous steps
        logger.info("Generating AI summaries for chunks...")
        processed_chunks = process_document_chunks(chunks)
        
        # Step 4: Embed and store in ChromaDB
        logger.info("Building and persisting vector store...")
        db = create_vector_store(processed_chunks, persist_directory=persist_directory)
        
        logger.info("Pipeline completed successfully.")
        return db
        
    except FileNotFoundError:
        logger.error(f"Could not find the file at path: {pdf_path}")
        raise
    except Exception as e:
        logger.error(f"Pipeline failed during execution: {e}")
        raise

# Example usage:
if __name__ == "__main__":
    pdf_file_path = "../data/NIPS-2017.pdf"
    vector_db = run_complete_ingestion_pipeline(pdf_file_path)

INFO: Starting RAG Ingestion Pipeline
INFO: --------------------------------------------------
INFO: Partitioning document: ../data/NIPS-2017.pdf
INFO: Reading PDF for file: ../data/NIPS-2017.pdf ...




INFO: Creating chunks by title...
INFO: Generating AI summaries for chunks...
INFO: Processing 21 chunks with AI Summaries...
INFO: Processing chunk 1/21
INFO: Processing chunk 2/21
INFO: Processing chunk 3/21
INFO: Processing chunk 4/21
INFO: Processing chunk 5/21
INFO: Generating AI summary for mixed content...


Partitioned document into 183 elements.
Created 21 chunks by title.


INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: Processing chunk 6/21
INFO: Generating AI summary for mixed content...
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: Processing chunk 7/21
INFO: Processing chunk 8/21
INFO: Processing chunk 9/21
INFO: Processing chunk 10/21
INFO: Processing chunk 11/21
INFO: Generating AI summary for mixed content...
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: Processing chunk 12/21
INFO: Processing chunk 13/21
INFO: Processing chunk 14/21
INFO: Processing chunk 15/21
INFO: Generating AI summary for mixed content...
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: Processing chunk 16/21
INFO: Processing chunk 17/21
INFO: Generating AI summary for mixed content...
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: Processing chunk 18/21
I

In [None]:
def generate_final_answer(
    chunks: List[Document], 
    query: str, 
    model_name: str = "gpt-4o"
) -> str:
    """Generate a synthesized answer using text, tables, and images from retrieved chunks."""
    logger.info(f"Generating final answer for query: '{query}'")
    
    try:
        llm = ChatOpenAI(model=model_name, temperature=0.0)
        
        prompt_text = (
            f"Based on the following documents, please answer this question: {query}\n\n"
            "CONTENT TO ANALYZE:\n"
        )
        
        # List to hold the multimodal message structure
        message_content: List[Dict[str, Any]] = []
        
        for i, chunk in enumerate(chunks, 1):
            prompt_text += f"--- Document {i} ---\n"
            
            # Safely parse the original content JSON
            try:
                original_data = json.loads(chunk.metadata.get("original_content", "{}"))
            except json.JSONDecodeError:
                logger.warning(f"Failed to parse JSON metadata for Document {i}. Skipping content.")
                original_data = {}
            
            # Append text
            raw_text = original_data.get("raw_text", "")
            if raw_text:
                prompt_text += f"TEXT:\n{raw_text}\n\n"
            
            # Append tables
            tables_html = original_data.get("tables_html", [])
            if tables_html:
                prompt_text += "TABLES:\n"
                for j, table in enumerate(tables_html, 1):
                    prompt_text += f"Table {j}:\n{table}\n\n"
            
            # Queue images to be added to the message payload
            images_base64 = original_data.get("images_base64", [])
            for image_base64 in images_base64:
                message_content.append({
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
                })

            prompt_text += "\n"
        
        prompt_text += (
            "Please provide a clear, comprehensive answer using the text, tables, and images above. "
            "If the documents don't contain sufficient information to answer the question, "
            "say 'I don't have enough information to answer that question based on the provided documents.'\n\n"
            "ANSWER:"
        )

        # Prepend the text prompt to the message content list
        message_content.insert(0, {"type": "text", "text": prompt_text})
        
        logger.info("Sending prompt and multimodal content to the LLM...")
        message = HumanMessage(content=message_content)
        response = llm.invoke([message])
        
        logger.info("Successfully generated the final answer.")
        return response.content
        
    except Exception as e:
        logger.error(f"Answer generation failed: {e}")
        return "Sorry, I encountered an error while generating the answer."

# Example usage:
if __name__ == "__main__":
    query = "What are the two main components of the Transformer architecture?"
    retriever = db.as_retriever(search_kwargs={"k": 3})
    retrieved_chunks = retriever.invoke(query)
    
    final_answer = generate_final_answer(retrieved_chunks, query)
    logger.info(f"Final Answer:\n{final_answer}")

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: Generating final answer for query: 'What are the two main components of the Transformer architecture?'
INFO: Sending prompt and multimodal content to the LLM...
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO: Successfully generated the final answer.
INFO: Final Answer:
The two main components of the Transformer architecture are the **Encoder** and the **Decoder**. Each consists of a stack of identical layers, with the encoder having two sub-layers (multi-head self-attention and feed-forward network) and the decoder having an additional third sub-layer for multi-head attention over the encoder's output.
