In [1]:
import json
from typing import List

# unstructured
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title

# langchain
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI,OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.messages import HumanMessage

# load .env
from dotenv import load_dotenv

load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


False

In [2]:
def partition_document(file_path: str):
    """Partition a PDF into unstructured elements.

    Calls ``partition_pdf`` with the "hi_res" strategy, table-structure
    inference switched on, and "Image" blocks extracted into the element
    payload.

    Args:
        file_path: Path of the PDF file to partition.

    Returns:
        Whatever ``partition_pdf`` returns for the file (the extracted
        elements).
    """
    print(f"Partitioning document : {file_path}")

    # Collected in one place so the partitioning options are easy to scan.
    pdf_options = {
        "filename": file_path,
        "strategy": "hi_res",
        "infer_table_structure": True,
        "extract_image_block_types": ["Image"],
        "extract_image_block_to_payload": True,
    }
    extracted = partition_pdf(**pdf_options)

    print(f"Extract {len(extracted)} elements")
    return extracted

# PDF to ingest; the relative path is resolved against the current working directory.
file_path = "./docs/EJ1172284.pdf"
elements = partition_document(file_path)

Partitioning document : ./docs/EJ1172284.pdf


The `max_size` parameter is deprecated and will be removed in v4.26. Please specify in `size['longest_edge'] instead`.


Extract 168 elements


In [None]:
# Peek at the category label of the first extracted element.
elements[0].category

False

In [None]:
# Keep only the elements whose category is "Image".
images = list(filter(lambda item: item.category == "Image", elements))
print(f"Found {len(images)} images")

Found 11 images


In [21]:
# Uncomment to print the text of every image element:
# for i in images:
#     print(i.text)

# Show the third image element as a plain dict (type, id, text and metadata).
images[2].to_dict()

{'type': 'Image',
 'element_id': '3900d9bcbb56ad0d4fc52b2e9eb12c22',
 'text': '20  ',
 'metadata': {'coordinates': {'points': ((np.float64(780.0),
     np.float64(2191.333333333333)),
    (np.float64(780.0), np.float64(2265.0)),
    (np.float64(873.3333333333334), np.float64(2265.0)),
    (np.float64(873.3333333333334), np.float64(2191.333333333333))),
   'system': 'PixelSpace',
   'layout_width': 1654,
   'layout_height': 2339},
  'last_modified': '2025-12-29T01:20:13',
  'filetype': 'application/pdf',
  'languages': ['eng'],
  'page_number': 3,
  'image_base64': '/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCABKAF0DASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqD

In [22]:
# Keep only the elements whose category is "Table".
tables = list(filter(lambda item: item.category == "Table", elements))
print(f"Found {len(tables)} tables")

Found 2 tables


In [None]:
# Show the first table element as a plain dict.
tables[0].to_dict()

In [24]:
# Chunking by title
def create_chucks_by_title(
    elements,
    max_characters: int = 3000,
    new_after_n_chars: int = 2400,
    combine_text_under_n_chars: int = 500,
):
    """Group partitioned elements into chunks using the title-based strategy.

    The size limits used to be hard-coded; they are now keyword parameters
    whose defaults reproduce the previous behaviour, so existing callers are
    unaffected while experiments can tune chunk sizes without editing the
    function.

    Args:
        elements: Elements to chunk (e.g. the output of ``partition_document``).
        max_characters: Forwarded to ``chunk_by_title`` (hard upper bound on
            chunk length, per the unstructured documentation).
        new_after_n_chars: Forwarded to ``chunk_by_title`` (soft limit after
            which a new chunk is started).
        combine_text_under_n_chars: Forwarded to ``chunk_by_title`` (small
            sections below this length are combined with neighbours).

    Returns:
        The chunks returned by ``chunk_by_title``.
    """
    print(" creating smart chunks...")

    chunks = chunk_by_title(
        elements,
        max_characters=max_characters,
        new_after_n_chars=new_after_n_chars,
        combine_text_under_n_chars=combine_text_under_n_chars,
    )

    print(f"created {len(chunks)} chunks")
    return chunks

# Chunk the partitioned elements with the default size limits.
chunks = create_chucks_by_title(elements)

 creating smart chunks...
created 20 chunks


In [36]:
# Inspect the source elements recorded on the sixth chunk's metadata.
chunks[5].metadata.orig_elements

[<unstructured.documents.elements.NarrativeText at 0x25b1951a210>,
 <unstructured.documents.elements.NarrativeText at 0x25b1950bed0>]

In [35]:
# Show the text of the sixth chunk.
chunks[5].text

'Given the importance attached to new technologies, and, in particular the potential role of mobile devices in autonomous language learning, the terms mobile learning and mobile devices (MobDs) need first to be explained. As for mobile learning, no single agreed-upon definition of the term exists in the literature (Oz, 2015). This is because some researchers define mobile learning as an extension of e-learning built upon mobile devices whereas some other researchers understand it as learning that happens anywhere and anytime (cf. Oz, 2015). As far as mobile devices are concerned, they can be defined as ‚Äúany device that is small, autonomous and unobtrusive enough to accompany us in every moment and can be used for educational purposes‚Äù (Trifanova Knapp, Ronchetti & Gamper, 2004, p. 3) or as ‚Äúhand held electronic devices that can be comfortably carried around in a pocket or bag, including MP3 players, digital recorders, e-readers, tablets, and smartphones‚Äù (Kukulska-Hulme, Norris

In [None]:
def separate_content_types(chunk):
    """Split a chunk into its text, table HTML and base64 image payloads.

    Walks ``chunk.metadata.orig_elements`` (when present) and classifies each
    element by its class name: ``Table`` elements contribute their HTML (or
    plain text as a fallback) and ``Image`` elements contribute their base64
    payload.

    Args:
        chunk: Object exposing ``.text`` and, optionally,
            ``.metadata.orig_elements``.

    Returns:
        A dict with keys:
            ``text``   - the chunk's text,
            ``tables`` - one HTML/text string per table element,
            ``images`` - one base64 string per image element that has a payload,
            ``types``  - the distinct content types found, in first-seen order
                         (always starting with ``'text'``).
    """
    content_data = {
        'text': chunk.text,
        'tables': [],
        'images': [],
        'types': ['text'],
    }

    # `hasattr` alone is not enough: the attribute can exist and still be
    # None, which would make the loop below raise TypeError.
    chunk_metadata = getattr(chunk, 'metadata', None)
    orig_elements = getattr(chunk_metadata, 'orig_elements', None) or []

    for element in orig_elements:
        element_type = type(element).__name__
        element_metadata = getattr(element, 'metadata', None)

        if element_type == 'Table':
            content_data['types'].append('table')
            # Fall back to plain text when the HTML is missing *or* None/empty;
            # getattr's default only covers the "missing" case.
            table_html = getattr(element_metadata, 'text_as_html', None) or element.text
            content_data['tables'].append(table_html)

        elif element_type == 'Image':
            # Skip images without a payload so no None ends up in a data URL.
            image_base64 = getattr(element_metadata, 'image_base64', None)
            if image_base64:
                content_data['types'].append('image')
                content_data['images'].append(image_base64)

    # De-duplicate while keeping a deterministic, first-seen order.
    content_data['types'] = list(dict.fromkeys(content_data['types']))
    return content_data

# Instructions appended exactly once to every summary prompt.
_SUMMARY_TASK_PROMPT = """YOUR TASK:
Generate a comprehensive, searchable description that covers:

1. Key facts, numbers, and data points from text and tables
2. Main topics and concepts discussed
3. Questions this content could answer
4. Visual content analysis (charts, diagrams, patterns in images)
5. Alternative search terms users might use

Make it detailed and searchable - prioritize findability over brevity.

SEARCHABLE DESCRIPTION:"""


def _build_summary_prompt(text: str, tables: List[str]) -> str:
    """Assemble the text part of the prompt: content, optional tables, then the task."""
    sections = [
        "You are creating a searchable description for document content retrieval.\n\n"
        "CONTENT TO ANALYZE:\n"
        "TEXT CONTENT:\n"
        f"{text}\n\n"
    ]

    if tables:
        sections.append("TABLES:\n")
        sections.extend(
            f"Table {index}:\n{table}\n\n"
            for index, table in enumerate(tables, start=1)
        )

    # The task must follow the content unconditionally. Previously it was
    # nested inside the table loop, so image-only chunks were sent without
    # any instructions and multi-table chunks repeated them per table.
    sections.append(_SUMMARY_TASK_PROMPT)
    return "".join(sections)


def create_ai_enhanced_summary(text: str, tables: List[str], images: List[str]) -> str:
    """Create an AI-generated searchable description for mixed content.

    Builds one multimodal message (prompt text followed by one ``image_url``
    part per image) and sends it to a ``gpt-4o`` chat model.

    Args:
        text: Raw text of the chunk.
        tables: Table contents (HTML or plain text), one string per table.
        images: Base64-encoded images; each is sent as a JPEG data URL.

    Returns:
        The model's response content. If anything in the request path raises,
        the error is printed and a fallback is returned instead: the first
        300 characters of ``text`` followed by ``...`` and bracketed notes
        with the table/image counts.
    """
    try:
        # Initialize LLM (needs vision model for images)
        llm = ChatOpenAI(model="gpt-4o", temperature=0)

        # Text part first, then every image as a data URL
        message_content = [{"type": "text", "text": _build_summary_prompt(text, tables)}]
        message_content.extend(
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
            }
            for image_base64 in images
        )

        # Send to AI and get response
        response = llm.invoke([HumanMessage(content=message_content)])
        return response.content

    except Exception as e:
        # Deliberate best-effort: a failed summary must not abort the run.
        print(f"     ‚ùå AI summary failed: {e}")
        # Fallback to simple summary
        summary = f"{text[:300]}..."
        if tables:
            summary += f" [Contains {len(tables)} table(s)]"
        if images:
            summary += f" [Contains {len(images)} image(s)]"
        return summary

def summarise_chunks(chunks):
    """Turn every chunk into a LangChain ``Document``.

    For each chunk the content is split with ``separate_content_types``.
    Chunks that contain tables or images get their page content from
    ``create_ai_enhanced_summary`` (falling back to the raw text if that
    raises); all other chunks use their raw text directly.

    Each document's ``metadata["original_content"]`` is a JSON string holding
    the raw text, the table HTML list and the base64 image list.

    Args:
        chunks: Sequence of chunks to process.

    Returns:
        A list with one ``Document`` per chunk, in input order.
    """
    print("üß† Processing chunks with AI Summaries...")

    chunk_count = len(chunks)
    documents = []

    for position, chunk in enumerate(chunks, start=1):
        print(f"   Processing chunk {position}/{chunk_count}")

        # Break the chunk down into text / tables / images
        parts = separate_content_types(chunk)

        # Debug prints
        print(f"     Types found: {parts['types']}")
        print(f"     Tables: {len(parts['tables'])}, Images: {len(parts['images'])}")

        has_rich_content = bool(parts['tables'] or parts['images'])
        if not has_rich_content:
            print(f"     ‚Üí Using raw text (no tables/images)")
            page_content = parts['text']
        else:
            print(f"     ‚Üí Creating AI summary for mixed content...")
            try:
                page_content = create_ai_enhanced_summary(
                    parts['text'], parts['tables'], parts['images']
                )
                print(f"     ‚Üí AI summary created successfully")
                print(f"     ‚Üí Enhanced content preview: {page_content[:200]}...")
            except Exception as error:
                print(f"     ‚ùå AI summary failed: {error}")
                page_content = parts['text']

        # Keep the untouched source material alongside the searchable text
        original_payload = json.dumps({
            "raw_text": parts['text'],
            "tables_html": parts['tables'],
            "images_base64": parts['images'],
        })
        documents.append(
            Document(
                page_content=page_content,
                metadata={"original_content": original_payload},
            )
        )

    print(f"‚úÖ Processed {len(documents)} chunks")
    return documents


# Process chunks with AI: every chunk becomes a Document; chunks that contain
# tables or images are summarised through create_ai_enhanced_summary.
processed_chunks = summarise_chunks(chunks)