# Overview



In [20]:
!pip install -q -r requirements.txt

!pip install openai==1.93.0      # Only for testing
# ! pip install --upgrade docling openai torch



# Document Ingestion

In [21]:
import boto3
from botocore.config import Config
from botocore.exceptions import ClientError
import os

# Download the source PDF from the S3-compatible (MinIO) bucket into a local
# directory so that later cells can parse it.
#
# Environment variables read by this cell:
#   OPENAI_API_KEY         – read into `open_ai_api_key`
#   AWS_S3_ENDPOINT        – MinIO service DNS name (e.g. minio.minio.svc.cluster.local)
#   AWS_ACCESS_KEY_ID      – MinIO access key
#   AWS_SECRET_ACCESS_KEY  – MinIO secret key
#   AWS_DEFAULT_REGION     – Dummy value; boto3 still expects one
#   AWS_S3_BUCKET          – Default bucket to use for the Workspace data connection

# === Configuration ===
open_ai_api_key = os.getenv("OPENAI_API_KEY")
endpoint = os.getenv("AWS_S3_ENDPOINT")
access_key = os.getenv("AWS_ACCESS_KEY_ID")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
region = os.getenv("AWS_DEFAULT_REGION")
bucket_name = os.getenv("AWS_S3_BUCKET")
object_key = "2502.07835v1.pdf"  # The name of the PDF in the S3 bucket
download_dir = "downloads"

# AWS_S3_ENDPOINT is normally a bare host name, so plain HTTP is assumed, but an
# endpoint that already carries a scheme is used unchanged instead of being
# turned into "http://http://...".
if endpoint and endpoint.startswith(("http://", "https://")):
    endpoint_url = endpoint
else:
    endpoint_url = f"http://{endpoint}"

# === Initialise S3 client ===
s3 = boto3.client(
    "s3",
    endpoint_url=endpoint_url,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=region,
    config=Config(signature_version="s3v4"),
)

# === Ensure download directory exists ===
os.makedirs(download_dir, exist_ok=True)
local_path = os.path.join(download_dir, object_key)
print(f"Downloading from {bucket_name}::{object_key} to: {local_path}")

# === Download the file ===
try:
    s3.download_file(bucket_name, object_key, local_path)
    print(f"✅ Downloaded '{object_key}' to '{local_path}'")
except ClientError as e:
    # download_file() reports a missing object as a generic ClientError whose
    # error code is "404" rather than as s3.exceptions.NoSuchKey, so the error
    # code has to be inspected to give the "not found" message.
    error_code = e.response.get("Error", {}).get("Code")
    if error_code in ("404", "NoSuchKey"):
        print(f"❌ File '{object_key}' not found in bucket '{bucket_name}'")
    else:
        print(f"❌ Error downloading file: {e}")
except Exception as e:
    # Anything else (connection problems, invalid parameters, ...) is reported
    # but not re-raised, matching the cell's original best-effort behaviour.
    print(f"❌ Error downloading file: {e}")


Downloading from rag-docs::2502.07835v1.pdf to: downloads/2502.07835v1.pdf
✅ Downloaded '2502.07835v1.pdf' to 'downloads/2502.07835v1.pdf'


# Embedding Generation

In [22]:
"""
Generate an embedding vector for a piece of text.
This helper wraps the embedding function to reduce boiler-plate when you frequently need sentence- or paragraph-level embeddings.

Parameters
----------
text : str
    The input text to embed.

Returns: A 1-D list of 1 536 floats representing the semantic embedding of *text*. 
The vector can be indexed, stored, or compared with other embeddings (e.g., via cosine similarity).
"""
def emb_text(text: str) -> list[float]:
    """
    Generate an embedding vector for a piece of text.

    Thin wrapper around the OpenAI embeddings endpoint, using the
    ``text-embedding-3-small`` model.

    Parameters
    ----------
    text : str
        The input text to embed.

    Returns
    -------
    list[float]
        The embedding of *text* (1536 values in this notebook's recorded run).

    Notes
    -----
    Uses the module-level ``openai_client``, which must have been created
    before this function is first called.
    """
    response = openai_client.embeddings.create(
        input=text,
        model="text-embedding-3-small",
    )
    # Only one input string is sent, so the first result is the one for *text*.
    return response.data[0].embedding

In [23]:
from openai import OpenAI

# Shared OpenAI client, used by `emb_text` for embeddings and by the
# chat-completion cell at the end of the notebook.
# NOTE(review): no api_key is passed explicitly; presumably the SDK reads
# OPENAI_API_KEY from the environment — confirm.
openai_client = OpenAI()

In [25]:
# Embed a short probe string to find out how many dimensions this embedding model
# generates; `embedding_dim` is used later when the Milvus collection is created.
test_embedding = emb_text("This is a test")
embedding_dim = len(test_embedding)
print(f"Embedding dimensions: {embedding_dim}")
# Show only the first ten values to keep the output short.
print(test_embedding[:10])

Embedding dimensions: 1536
[0.009873751550912857, -0.005582896992564201, 0.0068350606597959995, -0.038091305643320084, -0.018248096108436584, -0.041217729449272156, -0.00763660529628396, 0.032221291214227676, 0.018918044865131378, 0.00010168847802560776]


from sentence_transformers import SentenceTransformer

# Load the all-MiniLM-L6-v2 sentence-transformers model.
# NOTE(review): `embedding_transformer` is not referenced by any other cell
# visible in this notebook (embeddings come from `emb_text`) — confirm whether
# this cell is still needed.
embedding_transformer = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


In [26]:
from utils import project_root

# Assemble a complete path to the downloaded file, anchored at the project root,
# so the document import can reliably find it.
doc_source = project_root() / local_path

# Fail early with a clear message if the download cell did not produce the file.
# (The message previously referenced an undefined name, DOC_SOURCE, which turned
# this into a NameError instead of the intended FileNotFoundError.)
if not doc_source.is_file():
    raise FileNotFoundError(f"{doc_source} does not exist.")

print(f"Found {doc_source}")

Found /opt/app-root/src/rhoai-roadshow-v2/docs/2-rag/notebook/downloads/2502.07835v1.pdf


In [27]:
"""
Parse and chunk a PDF using Docling v2.x
"""
from pathlib import Path

from docling.document_converter import DocumentConverter

# Convert the PDF at `doc_source` and keep the resulting Docling document.
# Only parsing happens in this cell; chunking is done in a later cell.
doc = (
    DocumentConverter()
    .convert(source=doc_source)
    .document
)

In [28]:
# Inspect the parsed pages: `doc.pages` maps each page number to its PageItem
# (size, image, page_no), as the output below shows.
print(doc.pages)

{1: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=1), 2: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=2), 3: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=3), 4: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=4), 5: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=5), 6: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=6), 7: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=7), 8: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=8), 9: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=9), 10: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=10), 11: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=11), 12: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=12), 13: PageItem(size=Size(width=612.0, height=792.0), image=None, page_no=13)}


# Connect to Milvus

In [29]:
from pymilvus import MilvusClient

# Connect to the in-cluster Milvus service, using its "default" database.
milvus_client = MilvusClient(
    db_name="default",
    uri="http://milvus-service.milvus.svc.cluster.local:19530",
)

# Name of the collection that the following cells drop, create, fill and search.
collection_name = "my_rag_collection"

In [30]:
# Drop any existing collection with this name before it is recreated in the
# next cell.
if milvus_client.has_collection(collection_name):
    milvus_client.drop_collection(collection_name)

In [31]:
# List the collections before and after creation so the new one can be seen
# appearing in the output.
print(f"Collection list: {milvus_client.list_collections()}")

# Supported consistency levels are "Strong", "Session", "Bounded" and "Eventually";
# see https://milvus.io/docs/consistency.md#Consistency-Level for more details.
milvus_client.create_collection(
    consistency_level="Strong",
    metric_type="IP",  # Inner product distance
    dimension=embedding_dim,  # Vector size measured from the embedding model above
    collection_name=collection_name,
)

print(f"Collection list: {milvus_client.list_collections()}")

Collection list: ['animal_test']
Collection list: ['animal_test', 'my_rag_collection']


In [32]:
from docling_core.transforms.chunker import HierarchicalChunker

from docling.document_converter import DocumentConverter

# Split the parsed document into text chunks.
# `doc` is the Docling document produced from `doc_source` by the parsing cell
# earlier in this notebook; it is reused here instead of converting the same
# PDF a second time, which only repeated the conversion work.
chunker = HierarchicalChunker()

# Perform hierarchical chunking. This is faster than Hybrid chunking, but not as good.
texts = [chunk.text for chunk in chunker.chunk(doc)]

# Vector Storage and Search

In [33]:
from tqdm import tqdm

# Build one row per chunk: the chunk's index becomes its `id`, and its embedding
# is requested through `emb_text` (one call per chunk, tracked by a progress bar).
data = [
    {"id": chunk_id, "vector": emb_text(chunk_text), "text": chunk_text}
    for chunk_id, chunk_text in enumerate(tqdm(texts, desc="Processing chunks"))
]

# Insert all rows in a single call; the returned summary is the cell's output.
milvus_client.insert(collection_name=collection_name, data=data)

Processing chunks: 100%|██████████| 70/70 [00:20<00:00,  3.44it/s]


{'insert_count': 70, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69], 'cost': 0}

# Visualising how embeddings are stored in a vector database

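Each chunk is stored in the vector database as a high-dimensional embedding vector (1,536 dimensions here) alongside its text. To see how these vectors relate to one another, they can be exported and loaded into the TensorFlow Embedding Projector, which projects them down to two or three dimensions so that chunks with similar meaning appear close together.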

https://projector.tensorflow.org/

# Query-Time Retrieval

In [34]:
# The question put to the RAG pipeline: it is embedded for the Milvus search and
# inserted verbatim into the LLM prompt. (A duplicated word, "assessing
# assessing", has been removed from the query text.)
question = (
    "What are the challenges of assessing the quality of AI-generated code? What are some strategies for doing this"
)

In [35]:
# Embed the question and ask Milvus for the three best matches by inner product,
# returning the stored `text` field of each hit.
search_res = milvus_client.search(
    data=[emb_text(question)],
    collection_name=collection_name,
    output_fields=["text"],
    search_params={"metric_type": "IP", "params": {}},
    limit=3,
)

In [37]:
import json

# Only one query vector was searched, so the hits are in the first entry of
# `search_res`. Pair each hit's text with its distance and pretty-print them.
retrieved_lines_with_distances = [
    (hit["entity"]["text"], hit["distance"])
    for hit in search_res[0]
]
print(json.dumps(retrieved_lines_with_distances, indent=4))

[
    [
        "AI-assisted coding has been shown to be more beneficial for senior developers, as they possess the expertise to critically evaluate the generated code for correctness, completeness, and compliance. In contrast, junior developers may struggle to identify hallucinations, missing functionality, or incorrect logic in AI-generated code. To bridge this gap, This paper introduces a novel scoring mechanism called the SBC score , which is based on a reverse generation technique that leverages the natural language generation capabilities of LLMs. Unlike direct code analysis, our approach reconstructs system requirements from AI-generated code and compares them with the original specifications to quantify accuracy. The SBC score combines semantic similarity, BLEU, and completeness analysis , providing actionable insights to developers by highlighting missing features and hallucinations. This hybrid metric not only improves the evaluation of AI-generated code but also offers a rea

# Augmented Generation

In [38]:
# Join the retrieved chunk texts (dropping their distances) into one
# newline-separated context string for the prompt.
context = "\n".join(
    text for text, _distance in retrieved_lines_with_distances
)

In [39]:
# System message for the chat completion.
# NOTE(review): the text starts with "Human:" although it is sent with the
# "system" role — confirm this prefix is intended.
SYSTEM_PROMPT = """
Human: You are an AI assistant. You are able to find answers to the questions from the contextual passage snippets provided.
"""
# User message. This is an f-string, so the current values of `context` and
# `question` are interpolated when this cell runs; re-run the cell after
# changing either of them.
USER_PROMPT = f"""
Use the following pieces of information enclosed in <context> tags to provide an answer to the question enclosed in <question> tags.
<context>
{context}
</context>
<question>
{question}
</question>
"""

In [40]:
# Send the system and user prompts to the chat model, then print the text of
# the first returned choice.
response = openai_client.chat.completions.create(
    messages=[
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT},
    ],
    model="gpt-4o",
)
print(response.choices[0].message.content)

The challenges of assessing the quality of AI-generated code include:

1. **Inherent Complexity of Programming Tasks**: Programming tasks are complex, making it difficult to assess the quality of AI-generated code accurately.

2. **Lack of Robust Evaluation Metrics**: There is a lack of robust evaluation metrics that align well with human judgment. Traditional token-based metrics like BLEU and ROUGE show weak correlations with human assessments in code intelligence and verification tasks.

3. **Difficulty for Junior Developers**: Junior developers may find it hard to identify issues such as hallucinations, missing functionality, or incorrect logic in AI-generated code, while senior developers are generally more equipped to conduct such evaluations.

Strategies for assessing AI-generated code:

1. **SBC Score**: The paper introduces a novel scoring mechanism called the SBC score, which leverages a reverse generation technique to assess AI-generated code. It reconstructs system requireme