In [None]:

from openai import OpenAI
import voyageai as vo  
import os

from dotenv import load_dotenv
load_dotenv()

# API clients. Keys are looked up in the process environment
# (load_dotenv() has been called above).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
client_vo = vo.Client(api_key=os.environ.get("VOYAGE_API_KEY"))

# Model selection.
chat_evaluator_model = "gpt-5-nano"   # chat model handed to response_llm to generate answers
chat_judge_model = "gpt-4.1-mini"     # chat model used as the judge during evaluation
embedding_model = "voyage-3-large"    # Voyage model used to embed the chunks

# Retrieval hyper-parameters.
k = 10                  # number of top retrieved chunks passed to the generator
a = 0.6                 # hybrid search weighting factor (dense vs. BM25 scores)
lambda_parameter = 0.7  # MMR diversity-relevance trade-off during retrieval

**Parameters**  
The generator model (`chat_evaluator_model`) is lightweight to keep token usage low while still producing good answers. The judge model (`chat_judge_model`) is intentionally stronger to ensure reliable, consistent scoring.  

- `k` = number of top retrieved chunks passed to the generator  
- `a` = hybrid search weighting factor (balances dense vs. BM25 scores)  
- `lambda_parameter` = MMR diversity–relevance tradeoff during retrieval 

In [None]:
import numpy as np
#from chunking_cts import chunk_texts
# from chunking_rcts import chunk_texts 
from chunking_semantic import chunk_texts 

# from chunking_contextual import build_contextual_chunks
# chunk_texts, chunk_embeddings = build_contextual_chunks( client_vo=client_vo, embedding_model=embedding_model)
# Report how many chunks the selected chunker produced and show one example.
print(f"Number of chunks created: {len(chunk_texts)}")
if len(chunk_texts) > 0:
    # Index 3 is the preferred sample; clamp it so a corpus that yields fewer
    # than four chunks still prints a sample instead of raising IndexError.
    sample_index = min(3, len(chunk_texts) - 1)
    print(f"Sample chunk: {chunk_texts[sample_index]}")


**CTS (Character Token Splitting)**  
Splits text strictly by character or token limits, producing uniform chunks without considering structure or meaning. Design choices include chunk size (evaluated later), chunk overlap, and the encoding model — here using the widely adopted and balanced `cl100k_base`.

**RCTS (Recursive Character Splitting)**  
Splits text using a hierarchical fallback approach that respects natural boundaries (headings, paragraphs, sentences) before defaulting to fixed-length cuts. Design choices mirror CTS, mainly chunk size, overlap, and encoding.

**Semantic Chunking**  
Uses embedding similarity to identify natural semantic boundaries, grouping sentences or small units that share meaning rather than relying on raw length. The key design choice is the threshold, which required experimentation; I found that 0.3 produces around 108 chunks. 

**Contextualized Chunking**  
Embeds groups of chunks together and expands each chunk by merging it with its top-k most similar neighbours, resulting in richer context and stronger embeddings. Design choices include the group size fed into the model and the number of neighbouring chunks used for expansion.

In [None]:
#note: no need to use embed endpoint for contextualized chunking
# resp = client_vo.contextualized_embed(
#     inputs=[[text] for text in chunk_texts],
#     model=embedding_model,
#     input_type="document",
# )
# chunk_embeddings = np.array([r.embeddings[0] for r in resp.results], dtype=np.float32)
# chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings, axis=1, keepdims=True)


# Embed every chunk with the standard (non-contextual) Voyage endpoint.
# The embeddings API limits how many texts (and total tokens) one request may
# carry, so the chunks are sent in fixed-size batches rather than all at once.
# Each text is embedded independently, so batching does not change the vectors.
embed_batch_size = 128
chunk_vectors = []
for batch_start in range(0, len(chunk_texts), embed_batch_size):
    resp = client_vo.embed(
        chunk_texts[batch_start:batch_start + embed_batch_size],
        model=embedding_model,
        input_type="document",
    )
    chunk_vectors.extend(resp.embeddings)

chunk_embeddings = np.array(chunk_vectors, dtype=np.float32)
# L2-normalise each row so all vectors share the same scale for similarity search.
chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings, axis=1, keepdims=True)

The contextualized embedding call (shown commented out above) sends each chunk in its own list to the Voyage model so the model can interpret it as a standalone document and generate richer, context-aware embeddings. After receiving the outputs, each embedding is extracted, converted to a NumPy array, and L2-normalised so all vectors share the same scale for similarity search.

The active call uses standard (non-contextual) embeddings, i.e. `voyage-3-large`, where each chunk is embedded independently without the additional context-conditioning step. The normalisation process remains the same, but the embeddings capture only the isolated text rather than context-enhanced meaning.

In [None]:
from rag import response_llm, embed_query
import json

# Load the evaluation questions. The encoding is given explicitly so the JSON
# is decoded as UTF-8 regardless of the platform's default locale encoding.
with open("qa_long_dataset.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)
questions = [q["question"] for q in qa_data]

# Run the RAG pipeline over every question using the configured generator
# model and retrieval hyper-parameters.
llm_results = response_llm(
    questions=questions,
    client=client,
    chat_model=chat_evaluator_model,
    chunk_embeddings=chunk_embeddings,
    chunk_texts=chunk_texts,
    embed_query=embed_query,
    k=k,
    alpha=a,
    lambda_param=lambda_parameter,
)

In [None]:
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness,
)

# Reload the reference Q&A pairs; decode explicitly as UTF-8 so the result does
# not depend on the platform's default locale encoding.
with open("qa_long_dataset.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

# zip() silently drops trailing items when its inputs differ in length, which
# would quietly shrink the evaluation set. Fail loudly instead.
if len(qa_data) != len(llm_results):
    raise ValueError(
        f"Expected one generated result per reference question, got "
        f"{len(llm_results)} results for {len(qa_data)} questions."
    )

# Pair each reference entry with the generated result at the same position and
# reshape it into the column layout handed to the evaluator.
ragas_data = [
    {
        "question": ref["question"],
        "answer": pred["answer"],
        # The retrieved context is supplied as a single-element list.
        "contexts": [pred["context_used"]],
        "ground_truth": ref["ground_truth_answer"],
    }
    for ref, pred in zip(qa_data, llm_results)
]
ragas_dataset = Dataset.from_list(ragas_data)

# Judge model; temperature 0 is requested for the scoring calls.
llm = ChatOpenAI(model_name=chat_judge_model, temperature=0)

evaluation_scores = evaluate(
    ragas_dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=llm,
)

print(evaluation_scores)
