In [1]:

from openai import OpenAI
import voyageai as vo  
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment
# before any key is looked up.
load_dotenv()

# Model identifiers used by this notebook.
chat_model = "gpt-3.5-turbo"
embedding_model = "voyage-context-3"

# One API client per provider; each key is read from the environment, never hardcoded.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
client_vo = vo.Client(api_key=os.environ.get("VOYAGE_API_KEY"))


In [2]:
# Retrieval settings; all three are forwarded to response_llm() in the QA cell.
# Passed as `k` - presumably the number of chunks retrieved per question (confirm in rag.py).
k = 3
# Passed as `alpha`: hybrid-search weight, 1.0 = semantic search only, 0.0 = keyword search only.
a = 0.6
# Passed as `lambda_param`: MMR-style weight. The original note reads
# "1.0 is all diverse, 0.0 means the chunks are the same".
# NOTE(review): in the textbook MMR formula lambda = 1.0 is pure relevance and
# lambda = 0.0 is maximum diversity - confirm which convention rag.py implements.
lambda_parameter = 0.7

In [3]:
import os
from ingestion import load_documents

# Resolve the parent of the current working directory and hand it to the loader.
base_path = os.path.abspath(os.pardir)
md_texts = load_documents(base_path)

# Sanity check: report how many documents were loaded.
print(f"Loaded {len(md_texts)} documents.")


Loaded 28 documents.


In [4]:

""" 
First we have to split the documents into chunks so that they can be grouped contextually later.
The first decision here is which type of chunking to use for now.
I tried CTS, but the issue with that is that it would split mid-paragraph/mid-sentence,
which meant we would sometimes lose coherence.
Using a recursive CTS meant we have more chunks as well, as it makes sure to preserve context.
Chunks are kept moderately sized because similar chunks are attached to each one when contextualising.
NOTE: this comment originally cited 4 attached chunks and around 640 tokens per final chunk; with the
current settings (chunk_size=512 here, TOP_K=2 in the embedding cell) the expanded text of a chunk
is up to roughly 3 x 512 tokens."""

from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive splitter whose chunk size and overlap are measured in cl100k_base tokens.
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=512,
    chunk_overlap=50,
)

# Split every document; each resulting chunk is tagged with the file it came from
# (create_documents copies the supplied metadata dict onto every chunk of that text).
all_chunks = []
for doc in md_texts:
    all_chunks += splitter.create_documents(
        [doc["text"]],
        metadatas=[{"source": doc["file"]}],
    )

print(len(all_chunks), "total chunks created")
#print(all_chunks[0].page_content)

73 total chunks created


In [5]:
"""
This script performs contextualized semantic embedding using Voyage's `voyage-context-3` model.

What it does:
- Splits all chunks into smaller pseudo-documents (default: 15 chunks per group)
- Sends each group to the `contextualized_embed` API to get document-aware embeddings
- For each chunk, finds its top-k most semantically similar chunks (within the same group)
- Builds an expanded version of the chunk by appending its similar neighbors
- Stores the original chunk, its expanded context, the source document ID, and the normalized embedding

Optimizations & decisions:
- We avoid grouping by actual document to allow flexibility and even similarity across unrelated chunks
- Group size (`GROUP_SIZE`) is tuned to stay below the 32k token limit of `voyage-context-3`
- Embeddings are normalized to enable cosine similarity and MMR-style ranking
- Top-k (`TOP_K`) controls how much semantic context is appended — small enough to keep expansion relevant
- Processing is batched efficiently to reduce API calls while respecting model constraints
"""


import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Number of most-similar neighbour chunks (from the same group) appended to each
# chunk when its expanded text is built.
TOP_K = 2
# Number of consecutive chunks sent to the embedding API together as one pseudo-document.
GROUP_SIZE = 15

def batch_chunks(chunks, size):
    """Yield consecutive slices of `chunks`, each holding at most `size` items.

    The final slice is shorter when `len(chunks)` is not a multiple of `size`.
    Each batch is produced by slicing, so it has the same sequence type as `chunks`.
    """
    total = len(chunks)
    for start in range(0, total, size):
        stop = start + size
        yield chunks[start:stop]

contextual_all_chunks = []

# Fail early with a clear message instead of sending an empty request to the API.
if not all_chunks:
    raise ValueError("all_chunks is empty; run the chunking cell before embedding.")

# Prepare all chunk groups for embedding: each group of consecutive chunks is sent
# to the API as one pseudo-document (a list of chunk strings).
chunk_groups = list(batch_chunks(all_chunks, GROUP_SIZE))
inputs = [[c.page_content for c in group] for group in chunk_groups]

response = client_vo.contextualized_embed(
    inputs=inputs,
    model=embedding_model,  # use the model configured in the setup cell, not a repeated literal
    input_type="document"
)

# zip() would silently drop data on a mismatch, so verify the alignment explicitly.
if len(response.results) != len(chunk_groups):
    raise RuntimeError(
        f"Expected {len(chunk_groups)} embedding results, got {len(response.results)}."
    )

for group, doc_result in zip(chunk_groups, response.results):
    texts = [c.page_content for c in group]
    if len(doc_result.embeddings) != len(texts):
        raise RuntimeError(
            f"Expected {len(texts)} embeddings for a group, got {len(doc_result.embeddings)}."
        )

    # Unit-normalise every embedding row of the group.
    group_matrix = np.asarray(doc_result.embeddings, dtype=np.float32)
    norms = np.linalg.norm(group_matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # keep an all-zero vector as is instead of producing NaNs
    normed_embeddings = group_matrix / norms

    # One similarity matrix per group instead of one sklearn call per chunk.
    sim_matrix = cosine_similarity(normed_embeddings)
    # A chunk must never be ranked as its own neighbour.
    np.fill_diagonal(sim_matrix, -np.inf)

    for i, (chunk_text, vec) in enumerate(zip(texts, normed_embeddings)):
        # Ascending sort with the chunk itself removed: the tail holds the most
        # similar chunks. Dropping index i explicitly matters for groups with
        # TOP_K or fewer chunks, where a sentinel score alone would still let
        # the chunk be selected as its own neighbour.
        ranked = np.argsort(sim_matrix[i])
        ranked = ranked[ranked != i]
        # Guard TOP_K == 0: a plain [-0:] slice would select every chunk.
        top_k_idx = ranked[-TOP_K:] if TOP_K > 0 else ranked[:0]
        similar_texts = [texts[j] for j in top_k_idx]
        # Joining a single list avoids a trailing space when there are no neighbours.
        expanded_text = " ".join([chunk_text, *similar_texts])

        contextual_all_chunks.append({
            "chunk": chunk_text,
            "expanded": expanded_text,
            "doc_id": group[i].metadata["source"],
            "embedding": vec
        })

# Parallel, index-aligned views over the records built above.
chunk_texts = [c["chunk"] for c in contextual_all_chunks]
expanded_texts = [c["expanded"] for c in contextual_all_chunks]
chunk_embeddings = np.array([c["embedding"] for c in contextual_all_chunks], dtype=np.float32)

print(f"\n Total contextualized chunks created: {len(contextual_all_chunks)}")



 Total contextualized chunks created: 73


In [6]:
from rag import response_llm, embed_query
import json

# Load the evaluation set. The file is decoded as UTF-8 explicitly so the result
# does not depend on the platform's default text encoding.
with open("qa_long_dataset.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)
questions = [q["question"] for q in qa_data]

# Run
# NOTE(review): only the raw `chunk_texts` are passed on; the `expanded_texts`
# built in the embedding cell are not used here - confirm that this is intended.
llm_results = response_llm(
    questions=questions,
    client=client,
    chat_model=chat_model,
    chunk_embeddings=chunk_embeddings,
    chunk_texts=chunk_texts,
    embed_query=embed_query,
    k=k,
    alpha=a,
    lambda_param=lambda_parameter
)




Q1: Write a 500 word report on what are the best practices for developing and implementing reproductive and fertility health policies in the workplace to ensure inclusivity and support for all employees?
Answer: Developing and implementing reproductive and fertility health policies in the workplace is crucial to ensure inclusivity and support for all employees. This report will outline the best practices for creating effective policies that address the diverse reproductive health needs of employees and provide them with the necessary support and resources.

1. Conduct a Needs Assessment: Before implementing any policies, it is essential to understand the specific reproductive health needs of your employees. This can be done through surveys, focus groups, or consultations with reproductive health experts. By gathering this data, organizations can tailor their policies to meet the unique needs of their workforce.

2. Establish Clear Policies and Benefits: Clearly outline the company's p

In [7]:
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness
)

# `qa_data` (references) and `llm_results` (predictions) both come from the QA cell.
# Reusing the already-loaded `qa_data` instead of re-reading the file guarantees the
# references are the same records whose questions were answered.
# zip() silently truncates to the shorter input, so a mismatch is rejected up front.
if len(qa_data) != len(llm_results):
    raise ValueError(
        f"Got {len(llm_results)} answers for {len(qa_data)} reference questions; "
        "re-run the QA cell before evaluating."
    )

# One record per question in the column layout the RAGAS metrics read.
# NOTE(review): `context_used` is wrapped in a one-element list, which assumes it is
# a single string; if it is already a list of chunks it should be passed as is - TODO confirm.
ragas_data = [
    {
        "question": ref["question"],
        "answer": pred["answer"],
        "contexts": [pred["context_used"]],
        "ground_truth": ref["ground_truth_answer"]
    }
    for ref, pred in zip(qa_data, llm_results)
]
ragas_dataset = Dataset.from_list(ragas_data)

# Evaluate
# temperature=0 keeps the judging model's output as repeatable as possible.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

evaluation_scores = evaluate(
    ragas_dataset,
    metrics=[answer_relevancy, context_precision, context_recall, faithfulness],
    llm=llm
)

print(evaluation_scores)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


{'answer_relevancy': 0.9232, 'context_precision': 1.0000, 'context_recall': 1.0000, 'faithfulness': 0.8903}
