# Install required Packages

In [1]:
!pip install chromadb datasets evaluate rouge_score python-dotenv

Collecting chromadb
  Downloading chromadb-1.0.8-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-4.0.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.me

# Print Python and packages version

In [1]:
!python --version

Python 3.11.12


In [2]:
!pip list

Package                                  Version
---------------------------------------- -------------------
absl-py                                  1.4.0
accelerate                               1.6.0
aiohappyeyeballs                         2.6.1
aiohttp                                  3.11.15
aiosignal                                1.3.2
alabaster                                1.0.0
albucore                                 0.0.24
albumentations                           2.0.6
ale-py                                   0.11.0
altair                                   5.5.0
annotated-types                          0.7.0
anyio                                    4.9.0
argon2-cffi                              23.1.0
argon2-cffi-bindings                     21.2.0
array_record                             0.7.2
arviz                                    0.21.0
asgiref                                  3.8.1
astropy                                  7.0.1
astropy-iers-data                    

# Mount Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Download the data

In [None]:
# !wget http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json -O hotpot_train.json

In [4]:
!ln -s /content/drive/MyDrive/python_projects/rag_evaluation/ /rag_evaluation_dir

In [5]:
!ls /rag_evaluation_dir

datasets  embeddings  evaluation_results  outputs


In [6]:
base_path = '/rag_evaluation_dir/'

# Import required packages

In [7]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from dotenv import load_dotenv
import shutil
import os
import json
import pandas as pd
pd.set_option('display.max_columns', None)
from langchain.text_splitter import RecursiveCharacterTextSplitter
import chromadb
from more_itertools import chunked
from langchain.prompts import PromptTemplate
from datasets import Dataset
import evaluate
from collections import Counter
import re
import string
import unicodedata
import numpy as np

# load env variables

In [8]:
dotenv_path = f"{base_path}.env"
# Load the .env file
load_dotenv(dotenv_path)

os.environ["HF_TOKEN"] = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Save data in Google Drive

In [9]:
# Source file path
source_path = '/content/hotpot_train.json'

# Destination directory
destination_folder = f'{base_path}datasets/HotpotQA'

# Create destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Move the file
# shutil.move(source_path, destination_folder)

# Load HotpotQA JSON and Convert to DataFrame

In [None]:
with open(f'{destination_folder}/hotpot_train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
len(data)

90447

In [None]:
type(data)

list

In [None]:
data[0].keys()

dict_keys(['supporting_facts', 'level', 'question', 'context', 'answer', '_id', 'type'])

In [None]:
data[0]

{'supporting_facts': [["Arthur's Magazine", 0], ['First for Women', 0]],
 'level': 'medium',
 'question': "Which magazine was started first Arthur's Magazine or First for Women?",
 'context': [['Radio City (Indian radio station)',
   ["Radio City is India's first private FM radio station and was started on 3 July 2001.",
    ' It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).',
    ' It plays Hindi, English and regional songs.',
    ' It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.',
    ' Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.',
    ' The Radio station currently plays a mix of Hindi and Regional music.',
    ' Abraham Thomas is the CEO of the company.']]

In [None]:
samples = []
for item in data:
    context = ' '.join([' '.join(p[1]) for p in item['context']])
    samples.append({
        "id": item.get("_id"),
        "question": item.get("question"),
        "answer": item.get("answer"),
        "type": item.get("type"),
        "level": item.get("level"),
        "supporting_facts": json.dumps(item.get("supporting_facts")),
        "context": context
    })
df = pd.DataFrame(samples)
df.head()

Unnamed: 0,id,question,answer,type,level,supporting_facts,context
0,5a7a06935542990198eaf050,Which magazine was started first Arthur's Maga...,Arthur's Magazine,comparison,medium,"[[""Arthur's Magazine"", 0], [""First for Women"",...",Radio City is India's first private FM radio s...
1,5a879ab05542996e4f30887e,The Oberoi family is part of a hotel company t...,Delhi,bridge,medium,"[[""Oberoi family"", 0], [""The Oberoi Group"", 0]]",The Ritz-Carlton Jakarta is a hotel and skyscr...
2,5a8d7341554299441c6b9fe5,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,bridge,hard,"[[""Allie Goertz"", 0], [""Allie Goertz"", 1], [""A...",Lisa Marie Simpson is a fictional character in...
3,5a82171f5542990a1d231f4a,What nationality was James Henry Miller's wife?,American,bridge,medium,"[[""Peggy Seeger"", 0], [""Peggy Seeger"", 1], [""E...","Moloch: or, This Gentile World is a semi-autob..."
4,5a84dd955542997b5ce3ff79,Cadmium Chloride is slightly soluble in this c...,alcohol,bridge,medium,"[[""Cadmium chloride"", 1], [""Ethanol"", 0]]",Cadmium chloride is a white crystalline compou...


In [None]:
df.shape

(90447, 7)

In [None]:
df["context"][0]

'Radio City is India\'s first private FM radio station and was started on 3 July 2001.  It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003).  It plays Hindi, English and regional songs.  It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007.  Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features.  The Radio station currently plays a mix of Hindi and Regional music.  Abraham Thomas is the CEO of the company. Football in Albania existed before the Albanian Football Federation (FSHF) was created.  This was evidenced by the team\'s registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania eventually had pressure from the teams because of compe

# sample data and save

In [None]:
df.head()

Unnamed: 0,id,question,answer,type,level,supporting_facts,context
0,5a7a06935542990198eaf050,Which magazine was started first Arthur's Maga...,Arthur's Magazine,comparison,medium,"[[""Arthur's Magazine"", 0], [""First for Women"",...",Radio City is India's first private FM radio s...
1,5a879ab05542996e4f30887e,The Oberoi family is part of a hotel company t...,Delhi,bridge,medium,"[[""Oberoi family"", 0], [""The Oberoi Group"", 0]]",The Ritz-Carlton Jakarta is a hotel and skyscr...
2,5a8d7341554299441c6b9fe5,Musician and satirist Allie Goertz wrote a son...,President Richard Nixon,bridge,hard,"[[""Allie Goertz"", 0], [""Allie Goertz"", 1], [""A...",Lisa Marie Simpson is a fictional character in...
3,5a82171f5542990a1d231f4a,What nationality was James Henry Miller's wife?,American,bridge,medium,"[[""Peggy Seeger"", 0], [""Peggy Seeger"", 1], [""E...","Moloch: or, This Gentile World is a semi-autob..."
4,5a84dd955542997b5ce3ff79,Cadmium Chloride is slightly soluble in this c...,alcohol,bridge,medium,"[[""Cadmium chloride"", 1], [""Ethanol"", 0]]",Cadmium chloride is a white crystalline compou...


In [None]:
# Compute proportions from the actual distribution
level_proportions = df["level"].value_counts(normalize=True).to_dict()

level_proportions

{'medium': 0.6281468705429699,
 'easy': 0.19870200227757692,
 'hard': 0.17315112717945316}

In [None]:
# sample sizes
total_samples = [100, 1000, 10000]

for total_sample in total_samples:
    # Perform proportional stratified sampling
    stratified_sample = pd.concat([
        df[df["level"] == level].sample(
            n=int(total_sample * prop),
            random_state=42
        )
        for level, prop in level_proportions.items()
    ])
    print(f"sample size: {total_sample}")
    print(stratified_sample["level"].value_counts()*100/stratified_sample.shape[0])
    stratified_sample.to_parquet(f"{destination_folder}/sampled_hotpot_train_{total_sample}.parquet")


sample size: 100
level
medium    63.265306
easy      19.387755
hard      17.346939
Name: count, dtype: float64
sample size: 1000
level
medium    62.862863
easy      19.819820
hard      17.317317
Name: count, dtype: float64
sample size: 10000
level
medium    62.816282
easy      19.871987
hard      17.311731
Name: count, dtype: float64


# load sampled data

In [10]:
sample_size = 100

stratified_sample = pd.read_parquet(f"{destination_folder}/sampled_hotpot_train_{sample_size}.parquet")

stratified_sample.head()

Unnamed: 0,id,question,answer,type,level,supporting_facts,context
10075,5a8b8e1b5542997f31a41d6f,were Black Stone Cherry and Gene Loves Jezebel...,no,comparison,medium,"[[""Black Stone Cherry"", 0], [""Gene Loves Jezeb...","""Hell & High Water"" is the second promo single..."
74109,5abb09705542996cc5e49f67,The monarchy of New Zealand has a leader who t...,1952,bridge,medium,"[[""Monarchy of New Zealand"", 1], [""George VI"",...","John Gethin Hughes {'1': "", '2': "", '3': "", '4..."
39557,5ae7d2165542993210983f5c,"The actress that plays Paikea ""Pai"" Apirana in...",2009,bridge,medium,"[[""Piece of My Heart (film)"", 0], [""Keisha Cas...",Annie Whittle is a British-born New Zealand si...
57477,5a7e1c2b55429965cec5ea73,FilmNation Entertainment acquired the rights t...,A Slight Trick of the Mind,bridge,medium,"[[""FilmNation Entertainment"", 2], [""Mr. Holmes...","Case Closed: The Last Wizard Of The Century, k..."
49743,5a78aa44554299148911f904,Which band was formed in Edinburgh and had the...,Swamptrash,bridge,medium,"[[""Swamptrash"", 0], [""Bluegrass music"", 1]]",Brit funk is a musical style that has its orig...


In [None]:
stratified_sample.shape

(98, 7)

# Chunk the Data using LangChain

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap=50)

documents = []
metadatas = []

for _, row in stratified_sample.iterrows():
    chunks = text_splitter.split_text(row["context"])
    for i, chunk in enumerate(chunks):
        documents.append(chunk)
        metadatas.append({
            "id": row["id"],
            "question": row["question"],
            "answer": row["answer"],
            "chunk_index": i
        })

In [None]:
len(documents)

12386

# Generate and Save Embeddings

In [12]:
MAX_CHROMA_BATCH = 5000

# Define embedding models
embedding_models = {
    "MiniLM": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet": "sentence-transformers/all-mpnet-base-v2",
    "E5-small": "intfloat/e5-small-v2",
    "E5-base": "intfloat/e5-base-v2"
}

# Overwrite existing embeddings if True
overwrite = False

embeddings_save_path = f"{base_path}embeddings/{sample_size}"
os.makedirs(embeddings_save_path, exist_ok=True)


In [None]:

for label, model_name in tqdm(embedding_models.items()):
    persist_dir = os.path.join(embeddings_save_path, label)
    sqlite_path = os.path.join(persist_dir, "chroma.sqlite3")

    if os.path.exists(sqlite_path):
        if overwrite:
            print(f"🧹 Overwriting {label} — removing old files...")
            for item in os.listdir(persist_dir):
                item_path = os.path.join(persist_dir, item)
                if os.path.isfile(item_path):
                    os.remove(item_path)
                elif os.path.isdir(item_path):
                    shutil.rmtree(item_path)
        else:
            print(f"✅ Skipping {label} — already saved.")
            continue

    print(f"🔹 Loading model: {model_name}")
    model = SentenceTransformer(model_name)

    print(f"🔸 Generating embeddings for: {label}")
    embeddings = model.encode(documents, show_progress_bar=True, batch_size=512)

    print(f"💾 Saving to ChromaDB at: {persist_dir}")
    chroma_client = chromadb.PersistentClient(path=persist_dir)

    collection = chroma_client.get_or_create_collection(name="hotpotqa_chunks")



    # Precompute IDs
    ids = [f"{meta['id']}_chunk{meta['chunk_index']}" for meta in metadatas]

    # Chunk everything into batches of MAX_CHROMA_BATCH
    for doc_batch, emb_batch, meta_batch, id_batch in zip(
        chunked(documents, MAX_CHROMA_BATCH),
        chunked(embeddings.tolist(), MAX_CHROMA_BATCH),
        chunked(metadatas, MAX_CHROMA_BATCH),
        chunked(ids, MAX_CHROMA_BATCH),
    ):
        collection.add(
            documents=doc_batch,
            embeddings=emb_batch,
            metadatas=meta_batch,
            ids=id_batch,
        )

    print(f"✅ Saved {label} to ChromaDB\n")


  0%|          | 0/4 [00:00<?, ?it/s]

🔹 Loading model: sentence-transformers/all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔸 Generating embeddings for: MiniLM


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

💾 Saving to ChromaDB at: /rag_evaluation_dir/embeddings/1000/MiniLM
✅ Saved MiniLM to ChromaDB

🔹 Loading model: sentence-transformers/all-mpnet-base-v2


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔸 Generating embeddings for: MPNet


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

💾 Saving to ChromaDB at: /rag_evaluation_dir/embeddings/1000/MPNet
✅ Saved MPNet to ChromaDB

🔹 Loading model: intfloat/e5-small-v2


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

🔸 Generating embeddings for: E5-small


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

💾 Saving to ChromaDB at: /rag_evaluation_dir/embeddings/1000/E5-small
✅ Saved E5-small to ChromaDB

🔹 Loading model: intfloat/e5-base-v2


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

🔸 Generating embeddings for: E5-base


Batches:   0%|          | 0/25 [00:00<?, ?it/s]

💾 Saving to ChromaDB at: /rag_evaluation_dir/embeddings/1000/E5-base
✅ Saved E5-base to ChromaDB



# Load Embeddings from Disk

In [None]:
label = "E5-small"

persist_dir = os.path.join(embeddings_save_path, label)

# Initialize ChromaDB client
chroma_client = chromadb.PersistentClient(path=persist_dir)

# Load the collection (must match name used when saving)
collection = chroma_client.get_or_create_collection(name="hotpotqa_chunks")

# You can now query or inspect the collection
print(f"✅ Loaded collection: {collection.name}")
print(f"📦 Number of items: {collection.count()}")

✅ Loaded collection: hotpotqa_chunks
📦 Number of items: 1243


In [None]:
stratified_sample.shape

(98, 7)

In [None]:
stratified_sample.columns

Index(['id', 'question', 'answer', 'type', 'level', 'supporting_facts',
       'context'],
      dtype='object')

In [None]:
idx = 0
question = stratified_sample["question"].iloc[idx]
answer = stratified_sample["answer"].iloc[idx]
context = stratified_sample["context"].iloc[idx]

question, answer, context

('were Black Stone Cherry and Gene Loves Jezebel both british bands?',
 'no',
 '"Hell & High Water" is the second promo single from Black Stone Cherry\'s self-titled debut "Black Stone Cherry. " It follows the first successful single, Lonely Train.  This song reached <nowiki>#</nowiki>30 on the Mainstream Rock Tracks chart.  It was supported by a video directed by JB Carlin. Tim Palmer is a British music producer, audio engineer, guitarist and songwriter of rock and alternative music.  He mixed Pearl Jam\'s debut album "Ten" (1991) and tracks on U2\'s comeback album \'All that you can\'t leave behind\' in 2000 (GRAMMY nominated for \'Album of the Year\') Tim has produced Top Ten albums over 4 decades now and has worked with U2, Robert Plant, Ozzy Osbourne the Mission UK, Mighty Lemon Drops, Gene Loves Jezebel, Pearl Jam, David Bowie’s Tin Machine, HIM, Blue October, Jason Mraz, The Polyphonic Spree, The House of Love, Texas, Tarja Turunen, The Cure, Cutting Crew, Porcupine Tree, Faith 

# Get LLM response and save them

In [None]:
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="""Use the following retrieved context to answer the question.
    If you don't know, say "I don't know." Keep it precise and concise.
    {context}

    Question: {question}
    Answer:"""
)

In [None]:
# Batch generation function
def generate_llm_responses_batch(questions, embedding_model, collection, generator, k=3):
    # Embed all questions at once
    query_embeddings = embedding_model.encode(questions, show_progress_bar=False)

    # Retrieve relevant documents for each question
    contexts = []
    for query_embedding in query_embeddings:
        results = collection.query(query_embeddings=[query_embedding], n_results=k)
        context = " ".join(results['documents'][0]) if results['documents'][0] else ""
        contexts.append(context)

    # Prepare prompts
    prompts = [
        prompt_template.format(context=contexts[i], question=questions[i])
        for i in range(len(questions))
    ]

    # Batch generate responses
    outputs = generator(prompts, max_length=300, truncation=True)
    responses = [output["generated_text"].strip() for output in outputs]
    return responses


In [None]:
# Parameters
models = ["t5-large", "google/flan-t5-large", "google/flan-t5-base", "t5-base"]
sample_sizes = [10000, 1000, 100]
embedding_models = {
    "MiniLM": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet": "sentence-transformers/all-mpnet-base-v2",
    "E5-small": "intfloat/e5-small-v2",
    "E5-base": "intfloat/e5-base-v2"
}

# Main loop with batching
for sample_size in sample_sizes:
    stratified_sample = pd.read_parquet(f"{destination_folder}/sampled_hotpot_train_{sample_size}.parquet")

    for model in tqdm(models):
        generator = pipeline("text2text-generation", model=model)

        for label in embedding_models.keys():
            print(f"llm model: {model}, sample size: {sample_size}, embedding model key: {label}")
            save_folder = f"{base_path}outputs/sample={sample_size}/label={label}/model={model.replace('/', '_')}"
            save_path = f"{save_folder}/out.parquet"
            print(f"save path: {save_path}")

            if os.path.exists(save_path):
                print("Output already exists. Skipping...")
                continue

            embeddings_save_path = f"{base_path}embeddings/{sample_size}"
            persist_dir = os.path.join(embeddings_save_path, label)
            print(f"loading embeddings from {persist_dir}")

            chroma_client = chromadb.PersistentClient(path=persist_dir)
            collection = chroma_client.get_or_create_collection(name="hotpotqa_chunks")
            embedding_model = SentenceTransformer(embedding_models[label])

            # Batch processing using datasets
            dataset = Dataset.from_pandas(stratified_sample)
            batched_data = dataset.map(
                lambda batch: {
                    "llm_response": generate_llm_responses_batch(
                        batch["question"], embedding_model, collection, generator, k=3
                    )
                },
                batched=True,
                batch_size=128
            )

            os.makedirs(save_folder, exist_ok=True)
            batched_data.select_columns(["id", "llm_response"]).to_parquet(save_path)

  0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


llm model: t5-large, sample size: 10000, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=10000/label=MiniLM/model=t5-large/out.parquet
Output already exists. Skipping...
llm model: t5-large, sample size: 10000, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=10000/label=MPNet/model=t5-large/out.parquet
Output already exists. Skipping...
llm model: t5-large, sample size: 10000, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=10000/label=E5-small/model=t5-large/out.parquet
Output already exists. Skipping...
llm model: t5-large, sample size: 10000, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=10000/label=E5-base/model=t5-large/out.parquet
Output already exists. Skipping...


Device set to use cuda:0


llm model: google/flan-t5-large, sample size: 10000, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=10000/label=MiniLM/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-large, sample size: 10000, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=10000/label=MPNet/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-large, sample size: 10000, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=10000/label=E5-small/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-large, sample size: 10000, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=10000/label=E5-base/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...


Device set to use cuda:0


llm model: google/flan-t5-base, sample size: 10000, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=10000/label=MiniLM/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-base, sample size: 10000, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=10000/label=MPNet/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-base, sample size: 10000, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=10000/label=E5-small/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-base, sample size: 10000, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=10000/label=E5-base/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...


Device set to use cuda:0


llm model: t5-base, sample size: 10000, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=10000/label=MiniLM/model=t5-base/out.parquet
Output already exists. Skipping...
llm model: t5-base, sample size: 10000, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=10000/label=MPNet/model=t5-base/out.parquet
Output already exists. Skipping...
llm model: t5-base, sample size: 10000, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=10000/label=E5-small/model=t5-base/out.parquet
Output already exists. Skipping...
llm model: t5-base, sample size: 10000, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=10000/label=E5-base/model=t5-base/out.parquet
Output already exists. Skipping...


  0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


llm model: t5-large, sample size: 1000, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=1000/label=MiniLM/model=t5-large/out.parquet
Output already exists. Skipping...
llm model: t5-large, sample size: 1000, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=1000/label=MPNet/model=t5-large/out.parquet
Output already exists. Skipping...
llm model: t5-large, sample size: 1000, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=1000/label=E5-small/model=t5-large/out.parquet
Output already exists. Skipping...
llm model: t5-large, sample size: 1000, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=1000/label=E5-base/model=t5-large/out.parquet
Output already exists. Skipping...


Device set to use cuda:0


llm model: google/flan-t5-large, sample size: 1000, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=1000/label=MiniLM/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-large, sample size: 1000, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=1000/label=MPNet/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-large, sample size: 1000, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=1000/label=E5-small/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-large, sample size: 1000, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=1000/label=E5-base/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...


Device set to use cuda:0


llm model: google/flan-t5-base, sample size: 1000, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=1000/label=MiniLM/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-base, sample size: 1000, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=1000/label=MPNet/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-base, sample size: 1000, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=1000/label=E5-small/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-base, sample size: 1000, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=1000/label=E5-base/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...


Device set to use cuda:0


llm model: t5-base, sample size: 1000, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=1000/label=MiniLM/model=t5-base/out.parquet
Output already exists. Skipping...
llm model: t5-base, sample size: 1000, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=1000/label=MPNet/model=t5-base/out.parquet
Output already exists. Skipping...
llm model: t5-base, sample size: 1000, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=1000/label=E5-small/model=t5-base/out.parquet
Output already exists. Skipping...
llm model: t5-base, sample size: 1000, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=1000/label=E5-base/model=t5-base/out.parquet
Output already exists. Skipping...


  0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


llm model: t5-large, sample size: 100, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=100/label=MiniLM/model=t5-large/out.parquet
Output already exists. Skipping...
llm model: t5-large, sample size: 100, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=100/label=MPNet/model=t5-large/out.parquet
Output already exists. Skipping...
llm model: t5-large, sample size: 100, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=100/label=E5-small/model=t5-large/out.parquet
Output already exists. Skipping...
llm model: t5-large, sample size: 100, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=100/label=E5-base/model=t5-large/out.parquet
Output already exists. Skipping...


Device set to use cuda:0


llm model: google/flan-t5-large, sample size: 100, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=100/label=MiniLM/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-large, sample size: 100, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=100/label=MPNet/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-large, sample size: 100, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=100/label=E5-small/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-large, sample size: 100, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=100/label=E5-base/model=google_flan-t5-large/out.parquet
Output already exists. Skipping...


Device set to use cuda:0


llm model: google/flan-t5-base, sample size: 100, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=100/label=MiniLM/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-base, sample size: 100, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=100/label=MPNet/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-base, sample size: 100, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=100/label=E5-small/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...
llm model: google/flan-t5-base, sample size: 100, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=100/label=E5-base/model=google_flan-t5-base/out.parquet
Output already exists. Skipping...


Device set to use cuda:0


llm model: t5-base, sample size: 100, embedding model key: MiniLM
save path: /rag_evaluation_dir/outputs/sample=100/label=MiniLM/model=t5-base/out.parquet
Output already exists. Skipping...
llm model: t5-base, sample size: 100, embedding model key: MPNet
save path: /rag_evaluation_dir/outputs/sample=100/label=MPNet/model=t5-base/out.parquet
Output already exists. Skipping...
llm model: t5-base, sample size: 100, embedding model key: E5-small
save path: /rag_evaluation_dir/outputs/sample=100/label=E5-small/model=t5-base/out.parquet
Output already exists. Skipping...
llm model: t5-base, sample size: 100, embedding model key: E5-base
save path: /rag_evaluation_dir/outputs/sample=100/label=E5-base/model=t5-base/out.parquet
Output already exists. Skipping...


# LLM Response Evaluation

In [13]:
# Normalize text for EM/F1
def normalize_text(text):
    def remove_articles(s):
        return re.sub(r'\b(a|an|the)\b', ' ', s)

    def white_space_fix(s):
        return ' '.join(s.split())

    def remove_punc(s):
        return ''.join(ch for ch in s if ch not in string.punctuation)

    def lower(s):
        return s.lower()

    def unicode_to_ascii(s):
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

    return white_space_fix(remove_articles(remove_punc(lower(unicode_to_ascii(text)))))

# Exact Match
def exact_match_score(prediction, ground_truth):
    return int(normalize_text(prediction) == normalize_text(ground_truth))

# F1 Score
def f1_score(prediction, ground_truth):
    pred_tokens = normalize_text(prediction).split()
    gt_tokens = normalize_text(ground_truth).split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)


In [14]:
# Load Hugging Face metrics
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [15]:
def get_metrics(df):

    answers = df['answer'].fillna("").astype(str).tolist()
    predictions = df['llm_response'].fillna("").astype(str).tolist()

    em_scores = [exact_match_score(a, p) for a, p in zip(answers, predictions)]
    f1_scores = [f1_score(a, p) for a, p in zip(answers, predictions)]

    # BLEU
    bleu_result = bleu.compute(predictions=predictions, references=[[a] for a in answers])
    # ROUGE
    rouge_result = rouge.compute(predictions=predictions, references=answers)

    return em_scores, f1_scores, bleu_result, rouge_result


In [16]:
# Parameters
models = ["google/flan-t5-large", "google/flan-t5-base", "t5-base", "t5-large"]
sample_sizes = [100, 1000, 10000]
embedding_models = {
    "MiniLM": "sentence-transformers/all-MiniLM-L6-v2",
    "MPNet": "sentence-transformers/all-mpnet-base-v2",
    "E5-small": "intfloat/e5-small-v2",
    "E5-base": "intfloat/e5-base-v2"
}
results = []
# Main loop with batching
for sample_size in tqdm(sample_sizes):
    stratified_sample = pd.read_parquet(f"{destination_folder}/sampled_hotpot_train_{sample_size}.parquet")
    for model in models:
        for label in embedding_models.keys():
            print(f"sample size: {sample_size}, embedding model key: {label}, llm model: {model}")
            save_folder = f"{base_path}outputs/sample={sample_size}/label={label}/model={model.replace('/', '_')}"
            save_path = f"{save_folder}/out.parquet"
            try:
                save_df = pd.read_parquet(save_path)
                df = pd.merge(stratified_sample, save_df, on='id', how='inner')
                em_scores, f1_scores, bleu_result, rouge_result = get_metrics(df)
                result = {
                    "sample size": sample_size,
                    "LLM": model,
                    "Embedding Model": embedding_models[label],
                    "difficulty level": "all",
                    "exact match": np.mean(em_scores),
                    "f1": np.mean(f1_scores),
                    "bleu": bleu_result["bleu"],
                    "rouge1": rouge_result["rouge1"],
                    "rouge2": rouge_result["rouge2"],
                    "rougeL": rouge_result["rougeL"],
                    "rougeLsum": rouge_result["rougeLsum"]
                }
                results.append(result)

                for level in df["level"].unique():
                    level_df = df[df["level"] == level]
                    em_scores, f1_scores, bleu_result, rouge_result = get_metrics(level_df)
                    result = {
                        "sample size": sample_size,
                        "LLM": model,
                        "Embedding Model": embedding_models[label],
                        "difficulty level": level,
                        "exact match": np.mean(em_scores),
                        "f1": np.mean(f1_scores),
                        "bleu": bleu_result["bleu"],
                        "rouge1": rouge_result["rouge1"],
                        "rouge2": rouge_result["rouge2"],
                        "rougeL": rouge_result["rougeL"],
                        "rougeLsum": rouge_result["rougeLsum"]
                    }
                    results.append(result)
            except Exception as e:
                pass

  0%|          | 0/3 [00:00<?, ?it/s]

sample size: 100, embedding model key: MiniLM, llm model: google/flan-t5-large
sample size: 100, embedding model key: MPNet, llm model: google/flan-t5-large
sample size: 100, embedding model key: E5-small, llm model: google/flan-t5-large
sample size: 100, embedding model key: E5-base, llm model: google/flan-t5-large
sample size: 100, embedding model key: MiniLM, llm model: google/flan-t5-base
sample size: 100, embedding model key: MPNet, llm model: google/flan-t5-base
sample size: 100, embedding model key: E5-small, llm model: google/flan-t5-base
sample size: 100, embedding model key: E5-base, llm model: google/flan-t5-base
sample size: 100, embedding model key: MiniLM, llm model: t5-base
sample size: 100, embedding model key: MPNet, llm model: t5-base
sample size: 100, embedding model key: E5-small, llm model: t5-base
sample size: 100, embedding model key: E5-base, llm model: t5-base
sample size: 100, embedding model key: MiniLM, llm model: t5-large
sample size: 100, embedding model k

In [17]:
results_df = pd.DataFrame(results)
results_df.head()

Unnamed: 0,sample size,LLM,Embedding Model,difficulty level,exact_match,f1,bleu,rouge1,rouge2,rougeL,rougeLsum
0,100,google/flan-t5-large,sentence-transformers/all-MiniLM-L6-v2,all,0.459184,0.568707,0.432618,0.566448,0.257823,0.567177,0.565549
1,100,google/flan-t5-large,sentence-transformers/all-MiniLM-L6-v2,medium,0.483871,0.532258,0.279567,0.532258,0.264516,0.52957,0.534562
2,100,google/flan-t5-large,sentence-transformers/all-MiniLM-L6-v2,easy,0.421053,0.578446,0.37606,0.568421,0.252632,0.571429,0.581454
3,100,google/flan-t5-large,sentence-transformers/all-MiniLM-L6-v2,hard,0.411765,0.690756,0.524877,0.681793,0.243697,0.679552,0.682353
4,100,google/flan-t5-large,sentence-transformers/all-mpnet-base-v2,all,0.44898,0.546404,0.427329,0.547911,0.262391,0.549684,0.548534


In [18]:
base_path

'/rag_evaluation_dir/'

In [19]:
results_df.to_excel(f"{base_path}evaluation_results/results.xlsx", index = None)