In [71]:

from openai import OpenAI
import voyageai as vo  
import os
from dotenv import load_dotenv
load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
client_vo = vo.Client(api_key=os.getenv("VOYAGE_API_KEY"))  
chat_model = "gpt-3.5-turbo"
embedding_model = "voyage-context-3"
k = 5


In [72]:
""" Document Ingestion Stage 
%pip install pymupdf4llm docx2txt python-pptx """

import docx2txt
import pymupdf4llm
from pptx import Presentation

folders = ["../Module 1", "../Module 2", "../Module 4", "../Module 5"]

md_texts = []

def read_pptx(path):
    prs = Presentation(path)
    text = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text.append(shape.text)
    return "\n".join(text)


def load_files(folder):
    for root, dirs, files in os.walk(folder):  # recursive scan
        for filename in files:
            file_path = os.path.join(root, filename)
            ext = os.path.splitext(file_path)[-1].lower()

            try:
                if ext == ".pdf":
                    text = pymupdf4llm.to_markdown(file_path)
                elif ext == ".docx":
                    text = docx2txt.process(file_path)
                elif ext == ".pptx":
                    text = read_pptx(file_path)
                else:
                    continue

                md_texts.append({"file": file_path, "text": text})

            except Exception as e:
                print(f"Error reading {file_path}: {e}")


# run loader on all target folders
for folder in folders:
    load_files(folder)


print(f"\nTotal documents loaded: {len(md_texts)}")


Total documents loaded: 28


In [73]:
""" First we have to split the documents into chunks to be able to group them contextually together later.
The first decision here is to choose what type of chunking to use for now. 
I tried CTS but the issue with that is that it would split mid praragraph/sentence, which meant we would loose coherence sometimes. 
Using a recursive CTS meant we have more chunks as well, as it makes sure to perserve context. """
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=128,
    chunk_overlap=40
)
cts_all_chunks = []

for doc in md_texts:
    chunks = splitter.create_documents([doc["text"]])
    for c in chunks:
        c.metadata = {"source": doc["file"]}
    cts_all_chunks.extend(chunks)

In [74]:
print(len(cts_all_chunks), "total chunks created")
print(cts_all_chunks[0].page_content)

318 total chunks created
New script based on full curriculum

Module 1

The case for Reproductive Health at Work

Lesson 1: Understanding Reproductive and Fertility Health at Work

Lesson 2: Assessing Current Practices

Lesson 3: Compliance, Legal and Ethical Considerations













Module 2

Creating Your Reproductive and Fertility Health at Work Guide

Lesson 4: Developing a Tailored Guide

Lesson 5: Assessing Current Practices





Module 3

Engaging Executives and Stakeholders

Lesson 6: Identifying Key Stakeholders


In [75]:
""" 
Next is to contextualise the chunks in an anthropic style. 
From what i understood, anthropic style chunking means you iterate over each cts chunk, take the context window to left and right and concatenate them. 
You then embed the expanded version and keep track of original chunk text. Finally you store the resulting contextual embedding.
In terms of optimisation decision, theres three aspects here: should contexts be grouped across documents or one? 
Documents can have a lot of information but similar contexts.One sentence in a document can relate to another sentence in another document
And normalising embeddings before a similarity search - which is important especially if using a hybrid search or MMR (Maximal Marginal Relevance).
Next question is what is a good batch size? Well the good news is that it doesnt affect embeddings quality. What it affects is the cost and API reliability, 10-2 is pretty balanced for latency, throughput and error risk.
Finally, because the chunks are later grouped together, it means that smaller chunks should be produced first. 
"""
import numpy as np  

context_window = 2  # how many neighbor chunks to include (2 left + 2 right)

expanded_chunks = []
for i, c in enumerate(cts_all_chunks):
    left = max(0, i - context_window)
    right = min(len(cts_all_chunks), i + context_window + 1)

    expanded_text = " ".join(chunk.page_content for chunk in cts_all_chunks[left:right])

    expanded_chunks.append({
        "chunk": c.page_content,
        "expanded": expanded_text,
        "source": c.metadata["source"]
    })

print(len(expanded_chunks), "chunks prepared for contextual embedding")

# --- batching helper to embed ---
def batch_list(lst, batch_size):
    for i in range(0, len(lst), batch_size):
        yield lst[i:i + batch_size]

contextual_all_chunks = []
BATCH_SIZE = 20

for idx, batch in enumerate(batch_list(expanded_chunks, BATCH_SIZE)):
    print(f"Embedding batch {idx+1} with {len(batch)} chunks")

    resp = client_vo.contextualized_embed(
        inputs=[[c["expanded"]] for c in batch],
        model=embedding_model,
        input_type="document"
    )

    for c, r in zip(batch, resp.results):
        vec = np.array(r.embeddings[0], dtype=np.float32)

        # Normalize embedding vector (so cosine similarity works directly)
        normed_vec = vec / np.linalg.norm(vec)

        contextual_all_chunks.append({
            "chunk": c["chunk"],
            "expanded": c["expanded"],
            "doc_id": c["source"],
            "embedding": normed_vec
        })

print(f"{len(contextual_all_chunks)} contextual embeddings stored")
# Extract raw chunk texts and their embeddings
chunk_texts = [d["chunk"] for d in contextual_all_chunks]
chunk_embeddings = np.array([d["embedding"] for d in contextual_all_chunks], dtype=np.float32)


318 chunks prepared for contextual embedding
Embedding batch 1 with 20 chunks
Embedding batch 2 with 20 chunks
Embedding batch 3 with 20 chunks
Embedding batch 4 with 20 chunks
Embedding batch 5 with 20 chunks
Embedding batch 6 with 20 chunks
Embedding batch 7 with 20 chunks
Embedding batch 8 with 20 chunks
Embedding batch 9 with 20 chunks
Embedding batch 10 with 20 chunks
Embedding batch 11 with 20 chunks
Embedding batch 12 with 20 chunks
Embedding batch 13 with 20 chunks
Embedding batch 14 with 20 chunks
Embedding batch 15 with 20 chunks
Embedding batch 16 with 18 chunks
318 contextual embeddings stored


In [76]:
""" A deduplication step is added here and what it does is computes the cosine similarity between v and all previously kept vectors.
I believe this has a complexity of O(n^2), which would make it painfully slow for large number of chunks."""

import numpy as np
def dedupe_chunks(chunks, embeddings, threshold=0.95):
    """
    Remove near-duplicate chunks based on cosine similarity threshold.
    Greedy algorithm: keeps first occurrence and drops later duplicates.
    """
    # Normalize embeddings (so cosine similarity = dot product)
    embeddings = np.array(embeddings, dtype=np.float32)
    embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

    keep_idx = []
    keep_vecs = []

    for i, v in enumerate(embeddings):
        if not keep_vecs:
            keep_vecs.append(v)
            keep_idx.append(i)
            continue

        keep_mat = np.stack(keep_vecs)
        sims = np.dot(keep_mat, v)  # since already normalized

        if np.max(sims) < threshold:
            keep_vecs.append(v)
            keep_idx.append(i)

    keep_vecs = np.stack(keep_vecs)
    return [chunks[i] for i in keep_idx], keep_vecs

chunk_texts, chunk_embeddings = dedupe_chunks(chunk_texts, np.array(chunk_embeddings))  # run deduping
print(len(chunk_texts), "unique chunks after deduplication")


136 unique chunks after deduplication


Note: you may need to restart the kernel to use updated packages.


In [None]:
""" 
%pip install sentence-transformers

Reranking step here - used later in hybrid retrieval. Model chosen is fats and accurate 
this ones from hugging face"""
from sentence_transformers import CrossEncoder
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")


In [79]:
""" Maximal Marginal Relevance helps you pick the most relevant and least redundant chunks or documents for a query
lambda_param: trade-off between relevance and diversity (0 = diverse, 1 = purely relevant)."""
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def mmr(query_embedding, doc_embeddings, lambda_param=0.6, top_k=5):
    # Compute cosine similarity between query and all document embeddings
    query_sim = cosine_similarity([query_embedding], doc_embeddings)[0]
    # Compute pairwise cosine similarity between all documents (for redundancy check)
    doc_sim = cosine_similarity(doc_embeddings)

    selected = []  # store indices of selected (chosen) documents
    remaining = list(range(len(doc_embeddings)))  # list of remaining (unselected) document indices

    for _ in range(top_k):  # repeat until top_k documents are selected
        if not remaining:  # stop if no remaining documents
            break
        rel = query_sim[remaining]  # relevance scores of remaining documents to query
        # diversity penalty: similarity to already selected docs (0 if none selected yet)
        div = np.max(doc_sim[remaining][:, selected], axis=1) if selected else np.zeros(len(remaining))
        # MMR score = balance of relevance and diversity
        mmr_score = lambda_param * rel - (1 - lambda_param) * div
        # pick document with highest MMR score
        chosen_idx = remaining[np.argmax(mmr_score)]
        selected.append(chosen_idx)  # add chosen doc index to selected list
        remaining.remove(chosen_idx)  # remove chosen doc from remaining pool
    return selected  # return indices of selected (top-k diverse and relevant) documents


In [80]:
def embed_query(text: str):
    return client_vo.contextualized_embed(
        inputs=[[text]],
        model=embedding_model,
        input_type="query"
    ).results[0].embeddings[0]

In [81]:
from langchain_community.retrievers import BM25Retriever
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# initialize BM25 retriever for keyword-based matching
bm25 = BM25Retriever.from_texts(chunk_texts)

def hybrid_retrieve_mmr_rerank(
    query, bm25, chunk_embeddings, chunk_texts, embed_query,
    alpha=0.5, lambda_param=0.6, top_k=5
):
    """
    Hybrid retrieval pipeline combining:
    1. Vector search (semantic)
    2. BM25 keyword search
    3. Hybrid scoring
    4. MMR diversity selection
    5. Cross-encoder reranking
    """

    #  Vector search (semantic similarity) ---
    q = embed_query(query)  # embed query into same space as documents
    q = q / np.linalg.norm(q)  # normalize query embedding
    chunk_norms = chunk_embeddings / np.linalg.norm(chunk_embeddings, axis=1, keepdims=True)  # normalize doc embeddings
    vec_scores = np.dot(chunk_norms, q)  # cosine similarity between query and each chunk
    vec_top_idx = np.argsort(vec_scores)[-20:][::-1]  # get top 20 most similar chunks

    #  BM25 keyword search ---
    bm_docs = bm25.invoke(query)  # perform keyword search using BM25
    bm_texts = [d.page_content for d in bm_docs[:20]]  # take top 20 keyword-matched chunks
    bm_scores = np.zeros(len(chunk_texts))  # initialize BM25 scores vector
    for t in bm_texts:  # for each retrieved BM25 chunk
        if t in chunk_texts:  # if it's part of the indexed dataset
            bm_scores[chunk_texts.index(t)] = 1.0  # mark as relevant (binary relevance)

    # Hybrid weighted score 
    hybrid_scores = alpha * vec_scores + (1 - alpha) * bm_scores  # weighted combination of both signals
    hybrid_top_idx = np.argsort(hybrid_scores)[-10:][::-1]  # pick top 10 chunks by hybrid score
    candidate_embeds = chunk_embeddings[hybrid_top_idx]  # get their embeddings
    candidate_texts = [chunk_texts[i] for i in hybrid_top_idx]  # get their text contents

    # MMR select top_k diverse chunks (to avoid redundancy)
    selected_local = mmr(q, candidate_embeds, lambda_param=lambda_param, top_k=top_k)
    mmr_idx = [hybrid_top_idx[i] for i in selected_local]  # map MMR indices to original chunk indices
    mmr_texts = [chunk_texts[i] for i in mmr_idx]  # get selected text chunks

    # Cross-encoder reranking (semantic precision)
    pairs = [(query, doc) for doc in mmr_texts]  # create query-doc pairs for reranking
    scores = reranker.predict(pairs)  # reranker gives similarity/confidence scores
    reranked_idx = np.argsort(scores)[::-1][:top_k]  # pick final top_k chunks
    final_idx = [mmr_idx[i] for i in reranked_idx]  # map reranked indices to dataset indices


    return final_idx  # return indices of final top_k selected chunks


In [82]:
def build_context(idx, chunks):
    # Ensure idx contains valid indices
    valid_idx = [i for i in idx if i < len(chunks)]  # Filter out any invalid indices
    if not valid_idx:
        return "No valid context available."
    return "\n\n".join(f"- {chunks[i]}" for i in valid_idx)

In [83]:
questions = [
    # --- Core Coverage & Access ---
    "What reproductive health benefits does the company insurance include?",
    "How do employees access reproductive health resources?",
    "Are fertility treatments like IVF or egg freezing covered?",
    "Does our plan include support for menopause or menstrual health?",

    # --- Policy & Rights ---
    "How is employee privacy protected when using reproductive health services?",
    "What rights do employees have regarding reproductive health decisions?",
    "Does our policy support all genders and family structures?",
    "Are there policies against discrimination related to reproductive health?",

    # --- Leave & Flexibility ---
    "Do employees get paid leave for fertility treatments?",
    "Can staff request flexible work for reproductive health reasons?",
    "What accommodations are provided during pregnancy or postpartum?",
    "Is there return-to-work support after childbirth?",

    # --- Programs & Culture ---
    "Are there internal support groups for reproductive health challenges?",
    "Do managers receive training about reproductive and fertility health?",
    "What wellness programs address reproductive health needs?",
    "Are there awareness campaigns or education programs?",

    # --- Legal & Compliance ---
    "Do our reproductive health policies meet legal standards?",
    "How are workplace complaints or violations handled?",
    "What steps can employees take if their reproductive rights are overlooked?",

    # --- Strategic Impact ---
    "How do reproductive health policies impact retention and performance?",
    "Does leadership actively promote reproductive health inclusion?"
]


In [84]:
"""questions = [
    "What reproductive health benefits does the company insurance include?",
    "How do employees access reproductive health resources?"
] """


'questions = [\n    "What reproductive health benefits does the company insurance include?",\n    "How do employees access reproductive health resources?"\n] '

In [85]:
def build_prompt(context, query):
    return f"""

You are an expert assistant that uses provided company documents and training materials 
to answer questions about reproductive and fertility health in the workplace.
Use ONLY the provided information from the context below. 

Documents:
{context}

Query:
{query}

Answer:
"""


def answer_all_questions_with_hybrid(questions, k=5, alpha=0.6, lambda_param=0.7, output_file=None):
    """
    Runs hybrid (vector + keyword) retrieval + MMR reranking + LLM generation
    for a list of user queries (questions or analytical prompts).
    """
    results = []

    for i, query in enumerate(questions):
        print(f"\n Q{i+1}: {query}")

        # Hybrid retrieval with MMR & reranking
        try:
            top_idx = hybrid_retrieve_mmr_rerank(
                query=query,
                bm25=bm25,
                chunk_embeddings=chunk_embeddings,
                chunk_texts=chunk_texts,
                embed_query=embed_query,
                alpha=alpha,
                lambda_param=lambda_param,
                top_k=k
            )
        except Exception as e:
            print(f"Retrieval failed for Q{i+1}: {e}")
            top_idx = []


        context = build_context(top_idx, chunk_texts) if top_idx else "No relevant context found."
        prompt = build_prompt(context, query)


        try:
            response = client.chat.completions.create(
                model=chat_model,
                messages=[{"role": "user", "content": prompt}]
            )
            ans = response.choices[0].message.content.strip()
        except Exception as e:
            ans = f" Model error: {e}"

        results.append({
            "query": query,
            "answer": ans,
            "context_used": context
        })

        print(" Answer:", ans)


    return results




In [86]:
answers_cts = answer_all_questions_with_hybrid(
    questions=questions,
    k=k,
    alpha=0.6,
    lambda_param=0.7
)



 Q1: What reproductive health benefits does the company insurance include?
 Answer: The company insurance at Multinational Leader in Womenâ€™s Health (Company Z) includes benefits such as subsidized fertility treatments, on-site counseling, and digital support tools.

 Q2: How do employees access reproductive health resources?
 Answer: Employees can access reproductive health resources through various means, including:
- Participating in monthly well-being webinars on fertility, menopause, and reproductive health.
- Taking advantage of free consultations with external reproductive health specialists.
- Seeking support and guidance from managers who have been trained to recognize and respond to reproductive health needs with sensitivity.
- Utilizing the confidential HR support system to access help without disclosing their situation to direct managers.
- Engaging in mental health and peer support groups provided by the company.
- Referring to the Reproductive & Fertility Health Guide t

In [87]:
#print("Total chunks:", len(chunk_texts))


#print("Sample answer entry:", answers_cts[0])
#print("Context used:", answers_cts[0].get("context_used", "MISSING"))

i = 1  # example question index
ctx = answers_cts[i]["context_used"]

print(f"Length: {len(ctx)} characters")
print(f"Contains non-whitespace? {bool(ctx.strip())}")



Length: 2465 characters
Contains non-whitespace? True


In [88]:
def generate_ground_truth(
    questions, 
    bm25, 
    chunk_embeddings, 
    chunk_texts, 
    embed_query, 
    k=10, 
    alpha=0.6, 
    lambda_param=0.7
):
    """
    Generate rigid, reproducible ground truth answers for evaluation (RAGAS).
    """
    gt_answers = []
    gt_contexts = []

    for i, q in enumerate(questions):
        print(f"\nðŸ§© Generating GT for Q{i+1}: {q}")

        try:
            # Retrieve context
            top_idx = hybrid_retrieve_mmr_rerank(
                query=q,
                bm25=bm25,
                chunk_embeddings=chunk_embeddings,
                chunk_texts=chunk_texts,
                embed_query=embed_query,
                alpha=alpha,
                lambda_param=lambda_param,
                top_k=k
            )
            context = build_context(top_idx, chunk_texts)
            gt_contexts.append(context)
        except Exception as e:
            print(f"Retrieval failed: {e}")
            gt_contexts.append("No context retrieved.")
            gt_answers.append("No answer generated.")
            continue

        # --- Build prompt ---
        prompt = f"""
You are an HR compliance and workplace policy expert.
Your task is to provide an authoritative, factual answer based strictly on the provided documents.

Guidelines:
- Only use information that is directly supported or logically implied by the context.
- Do not speculate or introduce external knowledge.


Context:
{context}

Question:
{q}

Answer:
"""

        try:
            res = client.chat.completions.create(
                model="gpt-4o",
                temperature=0,
                messages=[{"role": "user", "content": prompt}]
            )
            ans = res.choices[0].message.content.strip()
        except Exception as e:
            ans = f"Model error: {e}"

        gt_answers.append(ans)
        print("GT Answer:", ans[:200], "..." if len(ans) > 200 else "")

    return gt_answers, gt_contexts


In [89]:
ground_truths, gt_contexts = generate_ground_truth(
    questions=questions,
    bm25=bm25,
    chunk_embeddings=chunk_embeddings,
    chunk_texts=chunk_texts,
    embed_query=embed_query,
    k=12,
    alpha=0.6,
    lambda_param=0.7
)



ðŸ§© Generating GT for Q1: What reproductive health benefits does the company insurance include?
GT Answer: The company insurance includes health coverage and financial support for fertility treatments, consultations, and therapy. Additionally, it provides subsidized access to reproductive health services,  ...

ðŸ§© Generating GT for Q2: How do employees access reproductive health resources?
GT Answer: Employees can access reproductive health resources through several initiatives outlined in the provided context. These include:

1. **Monthly Well-being Webinars**: Employees can participate in webina ...

ðŸ§© Generating GT for Q3: Are fertility treatments like IVF or egg freezing covered?
GT Answer: Yes, fertility treatments such as IVF and egg freezing are covered under Google's workplace policy. The company offers fertility benefits that include IVF coverage, egg freezing, and surrogacy support ...

ðŸ§© Generating GT for Q4: Does our plan include support for menopause or menstrual

In [90]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    LLMContextPrecisionWithReference
)
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper
import os
import numpy as np
import pandas as pd



questions_list = [d["query"] for d in answers_cts]
answers_list   = [d["answer"] for d in answers_cts]
contexts = [
    [c.strip("- ").strip() for c in d["context_used"].split("\n\n") if c.strip()]
    for d in answers_cts
]
reference_contexts = [
    [c.strip("- ").strip() for c in gt_contexts[i].split("\n\n") if c.strip()]
    for i in range(len(gt_contexts))
]


os.environ["RAGAS_DISABLE_PARALLEL_EVAL"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

lc_llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    max_retries=3,
    request_timeout=240
)
llm = LangchainLLMWrapper(lc_llm)
context_precision_metric = LLMContextPrecisionWithReference(llm=llm)

ragas_dataset = Dataset.from_dict({
    "question": questions_list,
    "contexts": contexts,
    "answer": answers_list,
    "ground_truth": ground_truths,
    "reference_contexts": reference_contexts
})


results = evaluate(
    dataset=ragas_dataset,
    metrics=[
        faithfulness,
        answer_relevancy,
        context_recall,
        context_precision_metric
    ],
    llm=llm
)


df_scores = results.to_pandas()





  llm = LangchainLLMWrapper(lc_llm)


Evaluating:   0%|          | 0/84 [00:00<?, ?it/s]

LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
LLM returned 1 generations instead of requested 3. Proceeding with 1 generations.
Exception raised in Job[3]: TimeoutError()
Exception raised in Job[7]: TimeoutError()
Exception raised in Job[11]: TimeoutError()
Exception raised in Job[15]: TimeoutError()
LLM re

In [91]:

def custom_nan_handler(row):
    """
    Custom rule-based replacement for NaN and empty cases.
    99  = no retrieved context
    0   = no overlap between answer & ground truth
    999 = empty model answer
    """
    # Case 1: No retrieved context
    if not row.get("contexts") or len(row["contexts"]) == 0:
        return 99

    # Case 2: Empty model answer
    if not row.get("answer") or len(str(row["answer"]).strip()) == 0:
        return 999

    # Case 3: No overlap between model answer & ground truth
    if row.get("ground_truth") and row.get("answer"):
        gt = str(row["ground_truth"]).lower()
        ans = str(row["answer"]).lower()
        overlap = any(word in gt for word in ans.split())
        if not overlap:
            return 0

    # Otherwise, keep original metric
    return np.nan

# Apply to all numeric metrics
for metric in [
    "faithfulness",
    "answer_relevancy",
    "context_recall",
    "llm_context_precision_with_reference"
]:
    if metric in df_scores.columns:
        df_scores[metric] = df_scores.apply(
            lambda row: custom_nan_handler(row)
            if pd.isna(row[metric])
            else row[metric],
            axis=1
        )
        


In [92]:

display(df_scores.round(5))
numeric_means = df_scores.select_dtypes(include=[np.number]).mean().round(5)
print("\n=== Mean Metric Scores ===")
print(numeric_means)

Unnamed: 0,user_input,retrieved_contexts,reference_contexts,response,reference,faithfulness,answer_relevancy,context_recall,llm_context_precision_with_reference
0,What reproductive health benefits does the com...,"[Ethical Considerations (10 minutes), The need...",[Section 2b: Core Policies to Include (5-7 min...,The company insurance at Multinational Leader ...,The company insurance includes health coverage...,0.0,0.893,1.0,99.0
1,How do employees access reproductive health re...,"[Ensure the guide is inclusive of all genders,...","[âœ… Awareness & Employee Support, Launched mont...",Employees can access reproductive health resou...,Employees can access reproductive health resou...,1.0,0.99591,0.85714,99.0
2,Are fertility treatments like IVF or egg freez...,"[Company: Google (Technology, US)\nPolicy High...","[Company: Google (Technology, US)\nPolicy High...","Yes, fertility treatments like IVF and egg fre...","Yes, fertility treatments such as IVF and egg ...",1.0,0.93568,1.0,99.0
3,Does our plan include support for menopause or...,[Offers menopause-specific workplace adjustmen...,[Offers menopause-specific workplace adjustmen...,"Yes, our plan does include support for menopau...","Yes, the plan includes support for menopause a...",1.0,0.98483,1.0,99.0
4,How is employee privacy protected when using r...,"[Ethical Considerations (10 minutes), The need...","[Ethical Considerations (10 minutes), The need...",Employee privacy when using reproductive healt...,Employee privacy is protected when using repro...,1.0,1.0,1.0,99.0
5,What rights do employees have regarding reprod...,[The Health and Safety at Work Act 1974 (emplo...,[The Health and Safety at Work Act 1974 (emplo...,Employees have the right to request flexible w...,Employees in the UK have specific rights regar...,0.8,0.98247,0.875,99.0
6,Does our policy support all genders and family...,[Challenge:\nGoogle recognized that traditiona...,[Challenge:\nGoogle recognized that traditiona...,"Yes, your policy supports all genders and fami...","Yes, the policy supports all genders and famil...",1.0,0.97843,1.0,99.0
7,Are there policies against discrimination rela...,"[Ethical Considerations (10 minutes), The need...","[redmans.co.uk, These cases highlight the crit...","Yes, the ethical considerations in the provide...","Yes, there are policies against discrimination...",0.85714,0.86917,1.0,99.0
8,Do employees get paid leave for fertility trea...,"[Workplace Adjustments & Support, Quiet spaces...",[Case Study: How Company X Implemented a Repro...,"Yes, Google offers paid leave for fertility tr...","Yes, employees at TechInnovate Ltd. receive pa...",0.0,0.9318,0.0,99.0
9,Can staff request flexible work for reproducti...,[The Health and Safety at Work Act 1974 (emplo...,[The Health and Safety at Work Act 1974 (emplo...,"Yes, staff can request flexible work for repro...","Yes, staff can request flexible work arrangeme...",0.8,0.9077,1.0,99.0



=== Mean Metric Scores ===
faithfulness                             0.75748
answer_relevancy                         0.94472
context_recall                           0.83486
llm_context_precision_with_reference    89.65899
dtype: float64
