In [1]:
import random
import json
from tqdm import tqdm

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.chat_models import ChatOCIGenAI
from langchain.prompts import PromptTemplate
from langchain.schema import HumanMessage

from oci_aqua_embeddings import OCIAquaEmbeddings
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from config import (
    AUTH,
    SERVICE_ENDPOINT,
    MAX_TOKENS,
    TEMPERATURE,
    COMPARTMENT_ID,
)

# Reproducibility: seed Python's global RNG before anything is sampled
random.seed(42)

# Evaluation size: number of chunks to sample (one question is generated per chunk)
N_QUESTIONS = 50

# Retrieval depth: how many results the similarity search is asked for
TOP_K = 6

# Chunking parameters handed to the text splitter
CHUNK_SIZE, CHUNK_OVERLAP = 3000, 100

# Input document and destination file for the generated questions
PDF_PATH = "ai-4-italy.pdf"
OUTPUT_FILE = "questions.json"

In [3]:
# Chat model used to generate the evaluation questions
def get_llm():
    """
    Build the OCI GenAI chat model used by this notebook.

    The model id is fixed to "meta.llama-3.1-70b-instruct" and streaming is
    enabled; authentication type, service endpoint, compartment and the
    temperature / max-token settings come from the values imported from
    `config`.

    Returns:
        ChatOCIGenAI: the configured chat model instance.
    """
    sampling_options = {"temperature": TEMPERATURE, "max_tokens": MAX_TOKENS}

    return ChatOCIGenAI(
        model_id="meta.llama-3.1-70b-instruct",
        auth_type=AUTH,
        service_endpoint=SERVICE_ENDPOINT,
        compartment_id=COMPARTMENT_ID,
        is_stream=True,
        model_kwargs=sampling_options,
    )


# Scores a single (question, source chunk) pair against the vector store
def process_question_chunk_with_mrr(question, chunk, vectorstore, top_k=TOP_K):
    """
    Check whether the chunk a question was generated from is retrieved for it.

    Args:
        question: query text passed to the similarity search.
        chunk: the source document; its metadata["chunk_num"] is the expected id.
        vectorstore: object exposing similarity_search(query, k=...).
        top_k: number of results requested from the search.

    Returns:
        tuple: (True, 1 / rank) where rank is the 1-based position of the
        first result whose metadata["chunk_num"] matches the expected one,
        or (False, 0) when no result matches.
    """
    retrieved = vectorstore.similarity_search(question, k=top_k)
    expected_num = chunk.metadata["chunk_num"]

    # 1-based position of the first matching result, or None if absent
    hit_rank = next(
        (
            position
            for position, doc in enumerate(retrieved, start=1)
            if doc.metadata["chunk_num"] == expected_num
        ),
        None,
    )

    if hit_rank is None:
        return False, 0  # miss: contributes 0 to the reciprocal-rank sum

    return True, 1 / hit_rank  # hit and its reciprocal rank

#### Embeddings

In [4]:
# Host of the model deployment, and the full prediction URL assembled from
# the host, the deployment OCID and the "predict" path segment.
BASE_URL = "https://modeldeployment.eu-frankfurt-1.oci.customer-oci.com"
ENDPOINT = "/".join(
    (
        BASE_URL,
        "ocid1.datasciencemodeldeployment.oc1.eu-frankfurt-1.amaaaaaa2xxap7yagq4z62toy5toj6fijrzi6deswanqy2l3yik7mmifix2a",
        "predict",
    )
)

In [5]:
# Embedding client bound to the prediction endpoint; it is passed to FAISS
# below to embed the chunks when the vector store is built.
embed_model = OCIAquaEmbeddings(endpoint=ENDPOINT)

#### Load chunks

In [6]:
# Step 1: load the PDF at PDF_PATH into a list of documents
# (presumably one document per page, since the "page" metadata key is read later - TODO confirm)
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

In [7]:
# Step 2: split the loaded documents into chunks, using the CHUNK_SIZE and
# CHUNK_OVERLAP values from the configuration cell
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
)
chunks = text_splitter.split_documents(documents)

In [8]:
# Tag every chunk with its position in the list; this "chunk_num" is the id
# later used to decide whether a search result matches the expected chunk.
for position, doc in enumerate(chunks):
    doc.metadata["chunk_num"] = position

#### Load vector store

In [9]:
# Embed all chunks with embed_model and index them in an in-memory FAISS store
print("Embedding and loading vector store...")
vector_store = FAISS.from_documents(chunks, embed_model)

Embedding and loading vector store...


Processing batches...: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:33<00:00,  4.72s/it]


#### Generate questions

In [None]:
# Step 3: randomly select up to N_QUESTIONS chunks (all of them if fewer exist).
# A dedicated generator seeded with the same value as the configuration cell
# makes this cell idempotent: re-running it alone yields the same sample
# instead of advancing the shared global RNG state.
selected_chunks = random.Random(42).sample(chunks, min(N_QUESTIONS, len(chunks)))

In [None]:
# Language in which the LLM is asked to phrase the generated questions
LANG = "italian"

# The template has two placeholders, {language} and {content}, and both are
# supplied when the prompt is formatted. Both must therefore be declared:
# LangChain versions that validate templates reject a template whose
# placeholders do not match input_variables.
prompt_template = PromptTemplate(
    input_variables=["content", "language"],
    template="""Based only on the following content, generate a thoughtful question in {language} language:\n\n{content}. 
    Report only the question.""",
)

# Chat model used to generate one question per selected chunk
chat_model = get_llm()

In [None]:
# One generated question per selected chunk, kept in the same order as
# selected_chunks so the two lists can be zipped together afterwards.
questions = []

for chunk in tqdm(selected_chunks):
    # Fill the template with the chunk text and the target language
    prompt = prompt_template.format(language=LANG, content=chunk.page_content)

    # Send the prompt as a single human message and keep only the reply text
    response = chat_model.invoke([HumanMessage(content=prompt)])
    questions.append(response.content)

In [None]:
# Pair each question with the id and page number of the chunk it came from
data_to_save = [
    {
        "chunk_num": source.metadata["chunk_num"],
        "page_num": source.metadata["page"],
        "question": text,
    }
    for text, source in zip(questions, selected_chunks)
]

In [None]:
# Save in JSON

# ensure_ascii=False writes accented characters as readable UTF-8 text rather
# than \uXXXX escapes; the file is opened as UTF-8 and parses to the same data.
with open(OUTPUT_FILE, "w", encoding="utf-8") as file:
    json.dump(data_to_save, file, indent=4, ensure_ascii=False)

print(f"Data saved in {OUTPUT_FILE}")

#### Similarity Search and computation of metrics

In [None]:
# Accumulators: per-question reciprocal ranks and the number of hits
reciprocal_ranks = []
hit_at_top_k = 0

# Score every question against the chunk it was generated from
question_chunk_pairs = zip(questions, selected_chunks)
for question, chunk in tqdm(
    question_chunk_pairs, total=len(questions), desc="Processing Questions"
):
    hit, rr = process_question_chunk_with_mrr(question, chunk, vector_store)

    reciprocal_ranks.append(rr)
    if hit:
        hit_at_top_k += 1

In [None]:
# Aggregate the per-question results: share of questions whose source chunk
# was retrieved, and the mean of the reciprocal ranks (misses count as 0)
total_queries = len(questions)
mrr = sum(reciprocal_ranks) / total_queries
hit_ratio = hit_at_top_k / total_queries

print(
    f"Language: {LANG}",
    f"Hit Ratio @ Top-{TOP_K}: {hit_ratio:.3f}",
    f"Mean Reciprocal Rank (MRR): {mrr:.3f}",
    sep="\n",
)