In [1]:
from openai import OpenAI
import voyageai as vo  
import os
from dotenv import load_dotenv
load_dotenv()

# Model identifiers consumed by later cells: the answer-generation model, the
# Ragas judge model, and the Voyage embedding model.
chat_evaluator_model = "gpt-5-nano"
chat_judge_model = "gpt-4.1-mini"
embedding_model = "voyage-3-large"

# API clients. Keys are read from the process environment, which load_dotenv()
# has been asked to populate above; a missing variable yields None here.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
client_vo = vo.Client(api_key=os.environ.get("VOYAGE_API_KEY"))


In [2]:
# Retrieval settings forwarded to response_llm in a later cell.

# Passed as `k` (presumably the number of chunks retrieved per question --
# confirm against rag.response_llm).
k = 3

# Passed as `alpha`: blend between semantic search (1.0) and keyword search (0.0).
a = 0.6

# Passed as `lambda_param`: diversity control. Per the original note, 1.0 means
# fully diverse chunks and 0.0 means no diversity.
# NOTE(review): the conventional MMR formulation is the opposite (1.0 = pure
# relevance) -- confirm which convention rag.response_llm implements.
lambda_parameter = 0.7

In [3]:
import os
from ingestion import load_documents
# Absolute path of the parent of the current working directory; the result
# therefore depends on where the kernel was started.
base_path = os.path.abspath("..")
# Load the source documents via the project's ingestion module.
# A later cell reads doc["text"] from each item, so this presumably returns an
# iterable of dict-like records with a "text" key -- confirm in ingestion.py.
md_texts = load_documents(base_path)

In [4]:
from sentence_transformers import SentenceTransformer
import re

# Sentence-embedding model.
# NOTE(review): `model` is not referenced by any other cell visible in this
# notebook (chunk embeddings are produced by Voyage below) -- confirm it is
# still needed before paying the load cost on every run.
model = SentenceTransformer("all-MiniLM-L6-v2") #or "all-mpnet-base-v2"

# Concatenate documents with a blank line between them. Joining with a single
# "\n" would let the last paragraph of one document fuse with the first
# paragraph of the next whenever a document does not end in a blank line,
# producing a chunk that mixes two unrelated sources.
all_text = "\n\n".join(doc["text"] for doc in md_texts)

# Paragraphs at or below this many characters (after stripping) are dropped.
MIN_PARAGRAPH_CHARS = 100

# Split on blank lines (optionally containing whitespace), strip each piece
# once, and keep only the pieces long enough to be useful chunks.
paragraphs = [
    stripped
    for stripped in (piece.strip() for piece in re.split(r'\n\s*\n+', all_text))
    if len(stripped) > MIN_PARAGRAPH_CHARS
]

In [5]:
# Quick sanity check: report how many chunks exist and show the second one.
chunk_count = len(paragraphs)
print(chunk_count, "total chunks created")

sample_chunk = paragraphs[1]
print(sample_chunk)

352 total chunks created
Define reproductive health in the workplace: not just fertility, but encompassing menstrual health, menopause, and other reproductive challenges.


In [6]:
import numpy as np

# Disabled alternative kept for reference: Voyage contextualized embeddings,
# which take one inner list per document.
# NOTE(review): it reads `all_chunks`, which no visible cell defines.
#
# chunk_texts = [c.page_content for c in all_chunks]
#
# resp = client_vo.contextualized_embed(
#     inputs=[[text] for text in chunk_texts],  # list of lists
#     model=embedding_model,
#     input_type="document",
# )
# chunk_embeddings = np.array([r.embeddings[0] for r in resp.results], dtype=np.float32)
# chunk_embeddings = chunk_embeddings / np.linalg.norm(chunk_embeddings, axis=1, keepdims=True)

# Independent copy of the paragraph list; its order defines the row order of
# chunk_embeddings.
chunk_texts = list(paragraphs)

# The embeddings endpoint caps how many texts / tokens one request may carry,
# so send the corpus in fixed-size batches instead of one oversized call.
EMBED_BATCH_SIZE = 128

embedding_rows = []
for batch_start in range(0, len(chunk_texts), EMBED_BATCH_SIZE):
    resp = client_vo.embed(
        chunk_texts[batch_start:batch_start + EMBED_BATCH_SIZE],  # flat list of strings
        model=embedding_model,
        input_type="document",
    )
    embedding_rows.extend(resp.embeddings)

# One row per chunk. Rows are scaled to unit L2 norm; the raw matrix is kept
# under its own name so re-running the normalisation is idempotent.
raw_chunk_embeddings = np.array(embedding_rows, dtype=np.float32)
chunk_embeddings = raw_chunk_embeddings / np.linalg.norm(
    raw_chunk_embeddings, axis=1, keepdims=True
)


In [7]:
from rag import response_llm, embed_query
import json

# Load the evaluation questions. JSON is UTF-8 by specification, so decode it
# explicitly rather than relying on the platform's default locale encoding.
with open("qa_long_dataset.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)
questions = [q["question"] for q in qa_data]

# Run
# Generate an answer per question from the embedded chunks. A later cell reads
# "answer" and "context_used" from each returned item.
# NOTE(review): chunk_embeddings were produced with the Voyage model above;
# confirm that rag.embed_query embeds queries with the same model, otherwise
# query/chunk similarities are not comparable.
llm_results = response_llm(
    questions=questions,
    client=client,
    chat_model=chat_evaluator_model,
    chunk_embeddings=chunk_embeddings,
    chunk_texts=chunk_texts,
    embed_query=embed_query,
    k=k,
    alpha=a,
    lambda_param=lambda_parameter
)




Q1: Write a 500 word report on what are the best practices for developing and implementing reproductive and fertility health policies in the workplace to ensure inclusivity and support for all employees?
Answer: Best practices for developing and implementing reproductive and fertility health policies in the workplace

A well-structured reproductive and fertility health guide is foundational to an inclusive workplace. Such guidance helps reduce stigma around reproductive health, supports employee well-being, and contributes to higher retention. When policies are thoughtfully designed and implemented, they signal to all employees that their health needs are acknowledged and accommodated.

Key practices

- Cultural sensitivity: Develop policies by adapting them to regional attitudes toward reproductive healthcare. Recognize that perceptions and norms around reproductive health can vary by location, and tailor guidance to fit these contexts. This approach helps ensure that policies are re

In [8]:
from datasets import Dataset
from langchain_openai import ChatOpenAI
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness
)

# Reload the reference set so this cell does not depend on an earlier cell's
# qa_data; decode explicitly as UTF-8.
with open("qa_long_dataset.json", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

# zip() silently stops at the shorter input, which would drop questions from
# the evaluation without any warning. Fail loudly instead.
if len(qa_data) != len(llm_results):
    raise ValueError(
        f"Reference/prediction count mismatch: {len(qa_data)} references "
        f"vs {len(llm_results)} generated answers"
    )

# One record per question in the column layout passed to Ragas below.
# NOTE(review): context_used is wrapped as a single-element list. If it is one
# concatenated string, context_precision is computed over a single context
# (the recorded run reports 1.0000); if it is already a list of chunks this
# nests it. Confirm against rag.response_llm.
ragas_data = [
    {
        "question": ref["question"],
        "answer": pred["answer"],
        "contexts": [pred["context_used"]],
        "ground_truth": ref["ground_truth_answer"]
    }
    for ref, pred in zip(qa_data, llm_results)
]
ragas_dataset = Dataset.from_list(ragas_data)

# Evaluate
# Judge model for the LLM-based metrics; temperature 0 is requested to keep
# scoring as repeatable as the model allows.
llm = ChatOpenAI(model_name=chat_judge_model, temperature=0)

evaluation_scores = evaluate(
    ragas_dataset,
    metrics=[answer_relevancy, context_precision, context_recall, faithfulness],
    llm=llm
)

print(evaluation_scores)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Evaluating:   0%|          | 0/16 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.


{'answer_relevancy': 0.9282, 'context_precision': 1.0000, 'context_recall': 0.5417, 'faithfulness': 0.6475}
