In [1]:
import os
import asyncio
import json
import nest_asyncio
import dotenv
dotenv.load_dotenv()



GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
GOOGLE_SEARCH_KEY = os.getenv('GOOGLE_SEARCH_KEY')
GOOGLE_SEARCH_ENGINE = os.getenv('GOOGLE_SEARCH_ENGINE')

nest_asyncio.apply()

In [2]:
from pydantic import BaseModel, Field
from typing import List, Optional

class Books(BaseModel):
    """Summary"""
    title: str = Field(..., description="Title of the book")
    author: str = Field(..., description="Author of the book")
    yearPublished: int = Field(..., description="Year the book was published", alias="yearPublished")
    summary: str = Field(..., description="Brief summary of the book")


class BookResponseFormatJson(BaseModel):
    """Response Format"""
    Top10BoookingSelling: List[Books] = Field(..., description="List of top 10 best-selling books")

In [None]:
from llama_index.core.prompts import PromptTemplate

prompt_template = PromptTemplate("Generate a list of the top 10 best-selling books of all time.")
response_obj = llm.structured_predict(
  BookResponseFormatJson,
  prompt=prompt_template
)


In [38]:
response_obj.Top10BoookingSelling

[Books(title='Don Quixote', author='Miguel de Cervantes', yearPublished=1605, summary='A Spanish noble reads so many chivalric romances that he decides to revive chivalry himself and sets out on a series of adventures as a knight-errant.'),
 Books(title='A Tale of Two Cities', author='Charles Dickens', yearPublished=1859, summary='Set during the French Revolution, it tells the story of the French doctor Manette, his daughter Lucie, and her two suitors, the French aristocrat Charles Darnay and the English lawyer Sydney Carton.'),
 Books(title='The Lord of the Rings', author='J.R.R. Tolkien', yearPublished=1954, summary='A hobbit named Frodo Baggins inherits a magical ring that he discovers is the One Ring, an evil artifact created by the Dark Lord Sauron. He embarks on a quest to destroy the Ring in the fires of Mount Doom.'),
 Books(title='The Little Prince', author='Antoine de Saint-Exupéry', yearPublished=1943, summary='A pilot stranded in the desert meets a young prince who has fall

## Strucutred output from PDF + OpenAI + pdf2images

In [12]:
from huggingface_hub import hf_hub_download
file_path = hf_hub_download(repo_id="jaiganesan/ai_tutor_knowledge", filename="rag_research_paper.zip",repo_type="dataset",local_dir="./data")

In [36]:
# system_instruction_prompt ="""
# You are an expert in extracting structured data from research paper PDFs.

# Task Description:
# Your task is to process an entire research paper provided as a PDF document and extract comprehensive, structured information from it. This includes all text, headlines, and detailed descriptions of visual elements. The final output must be a single, well-structured JSON object.

# Must Follow Guidelines:
# 1.  Process the Entire PDF: Treat the input as a complete document, not as individual pages.
# 2.  Accurate Data Extraction: Extract all text and information with high precision. Do not summarize, paraphrase, or omit any details.
# 3.  Logical Structure: Organize the extracted content into logical sections based on the paper's structure (e.g., Abstract, Introduction, Methods, Results, Conclusion, Appendices).
# 4.  Complete Information: Ensure no information is fragmented. Merge text that spans columns or pages into coherent paragraphs and sentences.

# Content Requirements:

# 1.  Source Identification:
#     *   Accurately extract the arXiv ID (e.g., `arXiv:2405.07437v2`). Verify its correctness. If no arXiv ID is present, use `null`.

# 2.  Headlines and Sections:
#     *   Extract all headlines and subheadlines to define the structure (e.g., "1. Introduction," "2.1. System Architecture").
#     *   If a section of content has no visible headline, generate a descriptive title for it based on its content.
#     *   Each distinct section of the paper should become a separate object in the final JSON output.

# 3.  Text Content:
#     *   For each section, extract the complete and verbatim text. Preserve all technical details, equations, and specific terminology.

# 4.  Visual Elements (Figures, Tables, Graphs, Architectures):
#     *   Within the content of each section, when you encounter a visual element, provide a detailed analysis.
#     *   **Title/Caption:** Extract the exact title and caption.
#     *   **Detailed Description:** Describe the visual element's purpose and what it depicts.
#     *   **Key Information:** Detail the main trends, data points, comparisons, and conclusions shown. For architectures, describe the components, layers, and data flow.
#     *   **Contextual Insights:** Include any related insights or explanations from the surrounding text that refer to the visual element.

# You are a data extraction engine. CRITICAL: Do not include more than two consecutive newlines. If you encounter empty space in the PDF, ignore it. Do not hallucinate content. Output only the requested JSON fields. Be concise.

# Required Output Format (JSON):

# Your output must be a single JSON object that strictly adheres to the following structure:

# ```json
# {
#   "source_name": "Extract complete arXiv ID including prefix (e.g., arXiv:2405.07437v2). If none, use null.",
#   "source_id": "Extract complete arXiv ID including prefix (e.g., arXiv:2405.07437v2). If none, use null.",
#   "research_paper_data": [
#     {
#       "content_title": "The title of the first section (e.g., 'Abstract').",
#       "content": "The complete, verbatim text of the Abstract. Include descriptions of any visual elements if present."
#     },
#     {
#       "content_title": "The title of the second section (e.g., '1. Introduction').",
#       "content": "The complete, verbatim text of the Introduction. This section should include detailed descriptions of any figures or tables found within it, as per the 'Visual Elements' guidelines."
#     },
#     .
#     .
#     .
#   ]
# }
# ```
# Key Guidelines:
# - Extract exact content without summarization
# - Ensure accuracy in complex technical details
# - Maintain logical content organization
# - Include complete visual element analysis
# """

I reduce the system instruction, because this will be helpful in the model like `GPT-5`
Since, I'm using the Germini, I decided to minimize this, otherwise I was getting invalid json

In [28]:
system_instruction_prompt = """
You are a data extraction bot. 
Output ONLY valid JSON. 
CRITICAL: Do not use tabs (\\t) or multiple newlines (\\n). 
Keep the JSON compact. If you find headers or footers in the PDF, ignore them. 
Do not repeat the paper title in the output.
"""

In [4]:
from pydantic import BaseModel, Field
from typing import List, Optional

# The response format- JSON schema
class ResearchPaperData(BaseModel):
  content_title: Optional[str] = Field(..., description="Extract or generate headlines and subheadlines (e.g., Abstract, Introduction, Methods, etc). Include section titles and subsection headings.")
  content: Optional[str] = Field(..., description="For each section: - Complete text content - Visual element descriptions - Figure/graph details: * Title/caption * Description * Key trends/comparisons * Architecture details * Related insights, Don't Summarize, Extract all the Content in the section")

class ResearchPaperResponseFormatJSON(BaseModel):
  source_name: str = Field(..., description="Extract Research paper Title.")
  source_id: str = Field(..., description="Extract complete arXiv ID including prefix (e.g., arXiv:2405.07437v2). Verify ID accuracy multiple times. if there is no Arxiv ID return None")
  research_paper_data: List[ResearchPaperData] = Field(..., description="List of Extracted research paper data complete data without summarizing,")


In [29]:
import os
import glob
import base64
from pypdf import PdfReader, PdfWriter
import google.generativeai as genai

# Parameters
PDF_FOLDER = "./data/rag_research_paper"     
PAGES_PER_CHUNK = 2                  
SYSTEM_INSTRUCTION = system_instruction_prompt  
RESPONSE_FORMAT = ResearchPaperResponseFormatJSON

def split_pdf_by_pages(pdf_path, pages_per_chunk=PAGES_PER_CHUNK):
    """Split a single PDF into smaller page-range chunks."""
    reader = PdfReader(pdf_path)
    total_pages = len(reader.pages)
    chunks = []

    for start in range(0, total_pages, pages_per_chunk):
        end = min(start + pages_per_chunk, total_pages)
        writer = PdfWriter()
        for page_idx in range(start, end):
            writer.add_page(reader.pages[page_idx])
        chunk_filename = f"{os.path.splitext(os.path.basename(pdf_path))[0]}_" \
                         f"pages_{start+1}_{end}.pdf"
        chunk_path = os.path.join(PDF_FOLDER+"/chunks_data/", chunk_filename)
        with open(chunk_path, "wb") as f_out:
            writer.write(f_out)
        chunks.append({"path": chunk_path, "pages": f"{start+1}-{end}"})
    return chunks

In [30]:
def process_pdf_chunk(chunk, system_instructions, response_format):

    pdf_path = chunk['path']
    pages = chunk['pages']
    print(f"Processing chunk {pages} (Uploading {pdf_path})...")

    file_ref = genai.upload_file(pdf_path, mime_type="application/pdf")

    # 2. Wait for processing (Essential step)
    # Gemini needs a moment to process the file before it can be queried
    while file_ref.state.name == "PROCESSING":
        time.sleep(1)
        file_ref = genai.get_file(file_ref.name)

    if file_ref.state.name == "FAILED":
        print(f"Uploading Failed for {pdf_path}")

    try:

        config = genai.GenerationConfig(
            response_mime_type="application/json",
            response_schema=response_format,
            temperature=0.1
        )
        model = genai.GenerativeModel(
            model_name="gemini-2.5-flash-preview-09-2025",
            system_instruction=system_instructions
        )

        response = model.generate_content(
            [file_ref, f"Extract structured content from pages {chunk['pages']}."],
            generation_config=config
        )

        # Check if it was cut off again
        if response.candidates[0].finish_reason.name == "MAX_TOKENS":
            print(f"Warning: Chunk {pages} was truncated! Try reducing PAGES_PER_CHUNK further.")

        # --- DEBUG START ---
        # print(f"--- RAW OUTPUT FOR PAGES {pages} ---")
        # print("Last 200 characters of response.text:", response.text[-600:], "\n") # See the last 200 characters
        # print("First 200 characters of response.text:", response.text[:600], "\n") # See the first 200 characters
        
        # Check why the model stopped
        finish_reason = response.candidates[0].finish_reason.name
        print(f"Finish Reason: {finish_reason}")

        # The actual use of model_validate_json is to act as a bridge between a raw text string (which is what APIs send) and a Python object (which is what your code needs).
        # Think of it as a "parser + safety inspector" combined.
        # THE PROBLEM IT SOLVES - When you call any LLM (OpenAI, Gemini, Anthropic) directly, the model does not send back a Python object. It sends back a String
        parse_obj = response_format.model_validate_json(response.text)
        return parse_obj.research_paper_data
    
    except Exception as e:
        print(f"Error extracting from chunk {pages}: {e}")
        return None

    finally:
        # 6. Cleanup: Delete the file from Google's server
        # This prevents cluttering your Google Cloud storage
        try:
            file_ref.delete()
        except Exception:
            pass

In [31]:
def convert_to_dict(obj):
    """Convert Pydantic model or custom object to dictionary."""
    if hasattr(obj, 'model_dump'):
        # For Pydantic v2
        return obj.model_dump()
    elif hasattr(obj, 'dict'):
        # For Pydantic v1
        return obj.dict()
    elif hasattr(obj, '__dict__'):
        # For regular objects
        return obj.__dict__
    else:
        # If it's already a basic type
        return obj

In [32]:
# Main workflow
all_results = {}
pdf_paths = glob.glob(os.path.join(PDF_FOLDER, "*.pdf"))

for pdf_path in pdf_paths:
    pdf_name = os.path.basename(pdf_path)
    print(f"Processing {pdf_name}...")

    chunks = split_pdf_by_pages(pdf_path)
    results = []
    for chunk in chunks:
        # if count == 6:
        print(f"** Processing chunk: {chunk['pages']}")
        data = process_pdf_chunk(chunk, SYSTEM_INSTRUCTION, RESPONSE_FORMAT)
        print("** Data extracted.")
        print(data, "\n")
        # Convert each ResearchPaperData object to a dictionary
        for item in data:
            results.append(convert_to_dict(item))
        # else:
        #     pass
        # count += 1

    all_results[pdf_name] = results

    # Remove the break if you want to process all PDFs**
    break

Processing 2405.07437v2.pdf...
** Processing chunk: 1-2
Processing chunk 1-2 (Uploading ./data/rag_research_paper/chunks_data/2405.07437v2_pages_1_2.pdf)...
Finish Reason: STOP
** Data extracted.
[ResearchPaperData(content_title='Abstract', content='Retrieval-Augmented Generation (RAG) has recently gained traction in natural language processing. Numerous studies and real-world applications are leveraging its ability to enhance generative models through external information retrieval. Evaluating these RAG systems, however, poses unique challenges due to their hybrid structure and reliance on dynamic knowledge sources. To better understand these challenges, we conduct A Unified Evaluation Process of RAG (Auepora) and aim to provide a comprehensive overview of the evaluation and benchmarks of RAG systems. Specifically, we examine and compare several quantifiable metrics of the Retrieval and Generation components, such as relevance, accuracy, and faithfulness, within the current RAG benchm

In [33]:
all_results

{'2405.07437v2.pdf': [{'content_title': 'Abstract',
   'content': 'Retrieval-Augmented Generation (RAG) has recently gained traction in natural language processing. Numerous studies and real-world applications are leveraging its ability to enhance generative models through external information retrieval. Evaluating these RAG systems, however, poses unique challenges due to their hybrid structure and reliance on dynamic knowledge sources. To better understand these challenges, we conduct A Unified Evaluation Process of RAG (Auepora) and aim to provide a comprehensive overview of the evaluation and benchmarks of RAG systems. Specifically, we examine and compare several quantifiable metrics of the Retrieval and Generation components, such as relevance, accuracy, and faithfulness, within the current RAG benchmarks, encompassing the possible output and ground truth pairs. We then analyze the various datasets and metrics, discuss the limitations of current benchmarks, and suggest potential d

In [35]:
# Save combined results for each PDF
for pdf_name, data in all_results.items():
    out_file = f"{pdf_name.rsplit('.',1)[0]}_structured.json"
    with open(out_file, "w") as f_json:
        json.dump(data, f_json, indent=2)
    print(f"Saved structured data to {out_file}")

print("\nCleaning up temporary chunk files...")
for pdf_path in pdf_paths:
    pdf_base = os.path.splitext(os.path.basename(pdf_path))[0]
    chunk_pattern = os.path.join("/content", f"{pdf_base}_pages_*.pdf")
    for chunk_file in glob.glob(chunk_pattern):
        os.remove(chunk_file)
        print(f"Removed: {chunk_file}")

Saved structured data to 2405.07437v2_structured.json

Cleaning up temporary chunk files...


## List of available models in the Purchase Tier

In [None]:
import google.genai as genai
client = genai.Client(api_key=GOOGLE_API_KEY)
for model in client.models.list():
    print(model.name)



models/embedding-gecko-001
models/gemini-2.5-flash
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-exp-1206
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
models/gemma-3n-e2b-it
models/gemini-flash-latest
models/gemini-flash-lite-latest
models/gemini-pro-latest
models/gemini-2.5-flash-lite
models/gemini-2.5-flash-image-preview
models/gemini-2.5-flash-image
models/gemini-2.5-flash-preview-09-2025
models/gemini-2.5-flash-lite-preview-09-2025
models/gemini-3-pro-preview
models/gemini-3-flash-preview
models/gemini-3-pro-image-preview
models/nano-banana-pro-preview
models/gemini-robotics-er-1.5-preview
models/gemini-2.5-computer-use-preview-10-2025
models

Why we created `convert_to_dict`, this is the demo code that shows what is will be the use of the function

In [None]:
data = [ # THis is what process_pdf_chunk will return
    ResearchPaperData(
        content_title="Abstract",
        content="This paper introduces a new method called RAG..."
    ),
    ResearchPaperData(
        content_title="1. Introduction",
        content="Large Language Models have shown..."
    ),
    ResearchPaperData(
        content_title="2. Methodology",
        content="We used a dual-encoder architecture..."
    )
]

for item in data:
    print(convert_to_dict(item))

{'content_title': 'Abstract', 'content': 'This paper introduces a new method called RAG...'}
{'content_title': '1. Introduction', 'content': 'Large Language Models have shown...'}
{'content_title': '2. Methodology', 'content': 'We used a dual-encoder architecture...'}


## Testing with 1 PDF

In [26]:
system_instruction_prompt = """
You are a data extraction bot. 
Output ONLY valid JSON. 
CRITICAL: Do not use tabs (\\t) or multiple newlines (\\n). 
Keep the JSON compact. If you find headers or footers in the PDF, ignore them. 
Do not repeat the paper title in the output.
"""

In [27]:
file_ref = genai.upload_file("./data/rag_research_paper/chunks_data/2405.07437v2_pages_5_6.pdf", mime_type="application/pdf")
chunk = {'pages': '5-6'}

try:
    config = genai.GenerationConfig(
            response_mime_type="application/json",
            response_schema=RESPONSE_FORMAT,
            temperature=0.0)

    model = genai.GenerativeModel('gemini-2.5-flash-preview-09-2025',
        system_instruction=system_instruction_prompt
    )

    response = model.generate_content(
        [file_ref, f"Extract structured content from pages {chunk['pages']}."],
        generation_config=config
        )

    print("Gemma response:", response.text)
except Exception as e:
    print(f"Gemma doesn't support this: {e}")

Gemma response: {
  "research_paper_data": [
    {
      "content": "The combination of EOs and GTs in the RAG system can generate all possible targets, which is the fundamental concept of the Auepora (as shown in Figure 1). Once identified, these targets can be defined based on a specific pair of EOs or EO with GT, as illustrated in Figure 2, and used to analyze all aspects of current RAG benchmarks.Fig. 2: The Target modular of the Auepora. The figure illustrates the flow and relationships between Query, Result, Ground Truth, Retrieval, and Generation components, mapping them to specific evaluation targets (Relevance, Accuracy, Faithfulness, Correctness) and Additional Requirements (Latency, Noise Robustness, Negative Rejection, Diversity, etc.). Retrieval uses Query, Relevant Docs, and Docs Candidates to determine Relevance (Relevant Docs \tleftrightarrow Query) and Accuracy (Relevant Docs \tleftrightarrow Docs Candidates). Generation uses Response, Sample Response, Output, and Labe

In [34]:
RESPONSE_FORMAT.model_validate_json(response.text).research_paper_data

[ResearchPaperData(content_title='3.1 Evaluation Target (What to Evaluate?)', content='The combination of EOs and GTs in the RAG system can generate all possible targets, which is the fundamental concept of the Auepora (as shown in Figure 1). Once identified, these targets can be defined based on a specific pair of EOs or EO with GT, as illustrated in Figure 2, and used to analyze all aspects of current RAG benchmarks.Fig. 2: The Target modular of the Auepora. The figure illustrates the flow and relationships between Query, Result, Ground Truth, Retrieval, and Generation components, mapping them to specific evaluation targets (Relevance, Accuracy, Faithfulness, Correctness) and Additional Requirements (Latency, Noise Robustness, Negative Rejection, Diversity, etc.). Retrieval uses Query, Relevant Docs, and Docs Candidates to determine Relevance (Relevant Docs \tleftrightarrow Query) and Accuracy (Relevant Docs \tleftrightarrow Docs Candidates). Generation uses Response, Sample Response

In [58]:
response.candidates[0].finish_reason

<FinishReason.STOP: 1>