# Overview



# Document Ingestion

In [None]:
!pip install boto3==1.34.103

In [2]:
import os

import boto3
from botocore.config import Config
from botocore.exceptions import ClientError

# Environment variables read by this cell:
#   AWS_S3_ENDPOINT        – MinIO service DNS name (e.g. minio.minio.svc.cluster.local)
#   AWS_ACCESS_KEY_ID      – MinIO access key
#   AWS_SECRET_ACCESS_KEY  – MinIO secret key
#   AWS_DEFAULT_REGION     – Dummy value; boto3 still expects one
#   AWS_S3_BUCKET          – Default bucket to use for the Workspace data connection

# === Configuration ===
endpoint = os.getenv("AWS_S3_ENDPOINT")
access_key = os.getenv("AWS_ACCESS_KEY_ID")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
region = os.getenv("AWS_DEFAULT_REGION")
bucket_name = os.getenv("AWS_S3_BUCKET")
object_key = "2502.07835v1.pdf"  # The name of the PDF in the S3 bucket
download_dir = "downloads"

# Fail early with a readable message: an unset endpoint would otherwise be
# formatted into the literal URL "http://None" and only fail at download time.
_missing_vars = [
    var_name
    for var_name, var_value in (
        ("AWS_S3_ENDPOINT", endpoint),
        ("AWS_S3_BUCKET", bucket_name),
    )
    if not var_value
]
if _missing_vars:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_vars)}"
    )

# Accept the endpoint with or without a scheme; default to plain HTTP when
# only a host name is given (the original behaviour).
endpoint_url = endpoint if "://" in endpoint else f"http://{endpoint}"

# === Initialise S3 client ===
s3 = boto3.client(
    "s3",
    endpoint_url=endpoint_url,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region,
    config=Config(signature_version="s3v4"),
)

# === Ensure download directory exists ===
os.makedirs(download_dir, exist_ok=True)
local_path = os.path.join(download_dir, object_key)
print(f"Downloading from {bucket_name}::{object_key} to: {local_path}")

# === Download the file ===
try:
    s3.download_file(bucket_name, object_key, local_path)
    print(f"✅ Downloaded '{object_key}' to '{local_path}'")
except ClientError as e:
    # NOTE(review): download_file is expected to report a missing key as a
    # generic ClientError with code "404" (not s3.exceptions.NoSuchKey), so the
    # error code is inspected here; "NoSuchKey" is accepted as well.
    error_code = e.response.get("Error", {}).get("Code")
    if error_code in ("404", "NoSuchKey"):
        print(f"❌ File '{object_key}' not found in bucket '{bucket_name}'")
    else:
        print(f"❌ Error downloading file: {e}")
except Exception as e:
    # Best-effort reporting for anything else (connection errors, bad params, ...).
    print(f"❌ Error downloading file: {e}")


Downloading from rag-docs::2502.07835v1.pdf to: downloads/2502.07835v1.pdf
✅ Downloaded '2502.07835v1.pdf' to 'downloads/2502.07835v1.pdf'


# Embedding Generation

In [4]:
!pip install docling==2.39.0

Collecting docling==2.39.0
  Downloading docling-2.39.0-py3-none-any.whl.metadata (10 kB)
Collecting pydantic<3.0.0,>=2.0.0 (from docling==2.39.0)
  Downloading pydantic-2.11.7-py3-none-any.whl.metadata (67 kB)
Collecting docling-core<3.0.0,>=2.39.0 (from docling-core[chunking]<3.0.0,>=2.39.0->docling==2.39.0)
  Downloading docling_core-2.39.0-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-ibm-models<4.0.0,>=3.4.4 (from docling==2.39.0)
  Downloading docling_ibm_models-3.6.0-py3-none-any.whl.metadata (6.7 kB)
Collecting docling-parse<5.0.0,>=4.0.0 (from docling==2.39.0)
  Downloading docling_parse-4.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling==2.39.0)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2<5.0.0,>=4.30.0 (from docling==2.39.0)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
Collecting pydant

In [22]:
from utils import project_root

# Anchor the downloaded file's relative path at the project root so that the
# document lookup does not depend on where the notebook happens to be run.
# NOTE(review): assumes project_root() returns a pathlib.Path-like object,
# since `/` and .is_file() are used on the result — confirm in utils.
DOC_SOURCE = project_root() / local_path

# Report the resolved location, or stop here if the PDF is not on disk.
if DOC_SOURCE.is_file():
    print(f"Found {DOC_SOURCE}")
else:
    raise FileNotFoundError(f"{DOC_SOURCE} does not exist.")

Found /opt/app-root/src/rhoai-roadshow-v2/docs/2-rag/notebook/downloads/2502.07835v1.pdf


In [23]:
"""
Parse and chunk a PDF using Docling v2.x
"""
from docling.document_converter import DocumentConverter
from pathlib import Path

# Notebooks have no __file__, so the current working directory serves as the
# base for locating the downloaded PDF.
base_dir = Path.cwd().resolve()
doc_source = base_dir.joinpath(local_path)

# Stop before the (expensive) conversion step if the PDF is not where expected.
if not doc_source.is_file():
    raise FileNotFoundError(f"{doc_source} does not exist.")




In [27]:

# Convert the PDF with Docling and keep only the parsed document object.
converter = DocumentConverter()
conversion_result = converter.convert(source=doc_source)
doc = conversion_result.document


In [32]:
# Show the page-number -> page-item mapping recorded on the converted document.
page_items = doc.pages
print(page_items)

{1: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=1), 2: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=2), 3: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=3), 4: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=4), 5: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=5), 6: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=6), 7: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=7), 8: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=8), 9: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=9), 10: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=10), 11: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=11), 12: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=12), 13: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=13)}


In [33]:
from docling.chunking import HybridChunker

chunker = HybridChunker()

# chunker.chunk() is documented as returning an iterator, which would be
# exhausted after a single pass; materialise it so any cell that loops over the
# chunks can be re-run without silently printing nothing.
chunks = list(chunker.chunk(dl_doc=doc))

# Kept under the original name so existing cells that iterate `chunk_iter`
# continue to work unchanged.
chunk_iter = chunks

In [34]:
# Print a short preview of every chunk: first the raw chunk text, then the
# context-enriched text produced by the chunker. Each preview is the first
# 300 characters followed by an ellipsis, shown via repr() so that newlines
# are visible as "\n".
for i, chunk in enumerate(chunk_iter):
    print("=== " + str(i) + " ===")

    raw_preview = repr(f"{chunk.text[:300]}…")
    print("chunk.text:\n" + raw_preview)

    enriched_text = chunker.contextualize(chunk=chunk)
    enriched_preview = repr(f"{enriched_text[:300]}…")
    print("chunker.contextualize(chunk):\n" + enriched_preview)

    print()

=== 0 ===
chunk.text:
'ahilanp@gmail.com\nFebruary 13, 2025…'
chunker.contextualize(chunk):
'Ahilan Ayyachamy Nadar Ponnusamy\nahilanp@gmail.com\nFebruary 13, 2025…'

=== 1 ===
chunk.text:
'The rise of Large Language Models (LLMs) in software engineering, particularly in code generation, has garnered significant attention. However, assessing the quality of AI-generated code remains a challenge due to the inherent complexity of programming tasks and the lack of robust evaluation metrics…'
chunker.contextualize(chunk):
'Abstract\nThe rise of Large Language Models (LLMs) in software engineering, particularly in code generation, has garnered significant attention. However, assessing the quality of AI-generated code remains a challenge due to the inherent complexity of programming tasks and the lack of robust evaluatio…'

=== 2 ===
chunk.text:
'AI-assisted coding has been shown to be more beneficial for senior developers, as they possess the expertise to critically evaluate the generated co

# Vector Storage and Search

# Query-Time Retrieval

# Augmented Generation