In [None]:
!pip install -q sentence-transformers faiss-cpu pandas numpy

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
!pip install faiss-cpu


from typing import List, Dict
import pickle
import faiss
from sentence_transformers import SentenceTransformer


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m102.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [8]:
# Module 1: retrieval_engine.py
class RetrievalEngine:
    def __init__(self, model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.documents = []
        self.doc_ids = []

    def add_documents(self, documents: List[Dict[str, str]]):
        self.documents = documents
        texts = [doc["text"] for doc in documents]
        self.doc_ids = [doc["id"] for doc in documents]

        embeddings = self.model.encode(texts, show_progress_bar=True)
        dim = embeddings.shape[1]

        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings.astype("float32"))

    def retrieve(self, query: str, top_k: int = 5) -> List[Dict]:
        query_embedding = self.model.encode([query]).astype("float32")
        distances, indices = self.index.search(query_embedding, top_k)

        results = []
        for rank, (idx, distance) in enumerate(zip(indices[0], distances[0])):
            if idx == -1:
                continue

            results.append({
                "doc_id": self.doc_ids[idx],
                "text": self.documents[idx]["text"],
                "score": float(distance),
                "rank": rank + 1
            })

        return results

    def save(self, path: str):
        with open(f"{path}_data.pkl", "wb") as f:
            pickle.dump(
                {
                    "documents": self.documents,
                    "doc_ids": self.doc_ids
                },
                f
            )
        faiss.write_index(self.index, f"{path}_index.faiss")

    def load(self, path: str):
        with open(f"{path}_data.pkl", "rb") as f:
            data = pickle.load(f)
            self.documents = data["documents"]
            self.doc_ids = data["doc_ids"]

        self.index = faiss.read_index(f"{path}_index.faiss")


In [9]:
class TextChunker:
    def __init__(self, chunk_size: int = 512, overlap: int = 50):
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_document(self, doc_id: str, text: str) -> List[Dict[str, str]]:
        words = text.split()
        chunks = []

        for i in range(0, len(words), self.chunk_size - self.overlap):
            chunk_words = words[i : i + self.chunk_size]
            if len(chunk_words) < 10:
                continue

            chunk_text = ' '.join(chunk_words)
            chunk_id = f"{doc_id}_chunk_{i // (self.chunk_size - self.overlap)}"

            chunks.append({
                'id': chunk_id,
                'text': chunk_text,
                'parent_doc_id': doc_id,
                'start_word': i,
                'end_word': i + len(chunk_words)
            })

        return chunks

    def chunk_documents(self, documents: List[Dict[str, str]]) -> List[Dict[str, str]]:
        all_chunks = []
        for doc in documents:
            chunks = self.chunk_document(doc['id'], doc['text'])
            all_chunks.extend(chunks)
        return all_chunks

In [10]:
import numpy as np
import pandas as pd
from typing import List, Dict

class RAGEvaluator:
    def __init__(self):
        self.results = []

    def calculate_mrr(self, retrieved_ids: List[str], relevant_ids: List[str]) -> float:
        for rank, doc_id in enumerate(retrieved_ids, 1):
            if doc_id in relevant_ids:
                return 1.0 / rank
        return 0.0

    def calculate_recall_at_k(self, retrieved_ids: List[str], relevant_ids: List[str], k: int) -> float:
        retrieved_at_k = set(retrieved_ids[:k])
        relevant_set = set(relevant_ids)
        if len(relevant_set) == 0:
            return 0.0
        found = len(retrieved_at_k & relevant_set)
        return found / len(relevant_set)

    def calculate_precision_at_k(self, retrieved_ids: List[str], relevant_ids: List[str], k: int) -> float:
        if k == 0: return 0.0
        retrieved_at_k = retrieved_ids[:k]
        relevant_set = set(relevant_ids)
        relevant_count = sum(1 for doc_id in retrieved_at_k if doc_id in relevant_set)
        return relevant_count / k

    def evaluate_query(self, question: str, retrieved_ids: List[str], relevant_ids: List[str]) -> Dict:
        metrics = {
            'question': question,
            'mrr': self.calculate_mrr(retrieved_ids, relevant_ids),
            'recall@5': self.calculate_recall_at_k(retrieved_ids, relevant_ids, 5),
            'recall@10': self.calculate_recall_at_k(retrieved_ids, relevant_ids, 10),
            'precision@5': self.calculate_precision_at_k(retrieved_ids, relevant_ids, 5),
            'num_relevant': len(relevant_ids),
            'num_retrieved': len(retrieved_ids)
        }
        self.results.append(metrics)
        return metrics

    def get_summary(self) -> Dict:
        if not self.results: return {}
        df = pd.DataFrame(self.results)
        return {
            'avg_mrr': df['mrr'].mean(),
            'avg_recall@5': df['recall@5'].mean(),
            'avg_recall@10': df['recall@10'].mean(),
            'avg_precision@5': df['precision@5'].mean(),
            'num_queries': len(self.results)
        }

    def to_dataframe(self) -> pd.DataFrame:
        return pd.DataFrame(self.results)

    def save_results(self, filepath: str):
        df = self.to_dataframe()
        df.to_csv(filepath, index=False)



In [12]:
import time
class ExperimentRunner:
    def __init__(self):
        self.experiment_results = []

    def run_experiment(self,
                       documents: List[Dict],
                       eval_questions: List[Dict],
                       model_name: str,
                       chunk_size: int,
                       overlap: int,
                       top_k: int,
                       experiment_name: str = None) -> Dict:

        print(f"\n{'='*60}")
        print(f"Running: {experiment_name or 'Unnamed Experiment'}")
        print(f"Model: {model_name}, Chunk: {chunk_size}, K: {top_k}")
        print(f"{'='*60}")

        start_time = time.time()

        chunker = TextChunker(chunk_size=chunk_size, overlap=overlap)
        chunks = chunker.chunk_documents(documents)
        print(f"Created {len(chunks)} chunks from {len(documents)} documents")

        engine = RetrievalEngine(model_name=model_name)
        engine.add_documents(chunks)
        print(f"Built index with {len(chunks)} chunks")

        evaluator = RAGEvaluator()
        for q in eval_questions:
            results = engine.retrieve(q['question'], top_k=top_k)
            retrieved_ids = [r['doc_id'] for r in results]

            retrieved_parent_ids = [cid.rsplit('_chunk_', 1)[0] for cid in retrieved_ids]

            evaluator.evaluate_query(
                question=q['question'],
                retrieved_ids=retrieved_parent_ids,
                relevant_ids=q['relevant_doc_ids']
            )

        summary = evaluator.get_summary()
        elapsed = time.time() - start_time

        experiment_result = {
            'experiment_name': experiment_name,
            'model_name': model_name,
            'chunk_size': chunk_size,
            'overlap': overlap,
            'top_k': top_k,
            'num_chunks': len(chunks),
            'avg_mrr': summary.get('avg_mrr', 0),
            'avg_recall@5': summary.get('avg_recall@5', 0),
            'avg_recall@10': summary.get('avg_recall@10', 0),
            'avg_precision@5': summary.get('avg_precision@5', 0),
            'elapsed_time': elapsed
        }

        self.experiment_results.append(experiment_result)

        print(f"\nResults:")
        print(f" MRR:        {summary.get('avg_mrr',0):.3f}")
        print(f" Recall@5:   {summary.get('avg_recall@5',0):.3f}")
        print(f" Recall@10:  {summary.get('avg_recall@10',0):.3f}")
        print(f" Time:       {elapsed:.2f}s")

        return experiment_result

    def run_grid_search(self,
                        documents: List[Dict],
                        eval_questions: List[Dict],
                        models: List[str],
                        chunk_sizes: List[int],
                        top_ks: List[int],
                        overlap: int = 50):

        for model in models:
            for chunk_size in chunk_sizes:
                for top_k in top_ks:
                    exp_name = f"{model.split('/')[-1]}_c{chunk_size}_k{top_k}"
                    self.run_experiment(
                        documents=documents,
                        eval_questions=eval_questions,
                        model_name=model,
                        chunk_size=chunk_size,
                        overlap=overlap,
                        top_k=top_k,
                        experiment_name=exp_name
                    )

        return self.get_results_df()

    def get_results_df(self) -> pd.DataFrame:
        return pd.DataFrame(self.experiment_results)

    def save_results(self, filepath: str):
        df = self.get_results_df()
        df.to_csv(filepath, index=False)
        print(f"\nSaved results to {filepath}")

    def get_best_config(self, metric: str = 'avg_mrr') -> Dict:
        df = self.get_results_df()
        if df.empty: return {}
        best_idx = df[metric].idxmax()
        return df.loc[best_idx].to_dict()

# ──────────────────────────────────────────────────────────────
# Sample Finance Data: 10-K style excerpts (mini version)
# ──────────────────────────────────────────────────────────────

documents = [
    {
        "id": "tsla_2023",
        "text": """Tesla Inc. 2023 Annual Report excerpt. Revenue grew 19% to $96.8 billion driven by increased vehicle deliveries and higher average selling prices. Automotive segment revenue was $81.5 billion. Energy generation and storage revenue increased 54% to $6.0 billion. We delivered approximately 1.81 million vehicles in 2023. Operating income was $8.9 billion. Net income attributable to common stockholders was $15.0 billion. Cybertruck production began in 2023. We opened new Megapack factory in Shanghai."""
    },
    {
        "id": "aapl_2023",
        "text": """Apple Inc. Fiscal 2023 Form 10-K summary. Total net sales were $383.3 billion, down 3% year over year. iPhone sales $200.6 billion, Mac $29.4 billion, iPad $28.3 billion, Wearables $39.8 billion, Services $85.2 billion. Gross margin 44.1%. Operating income $114.3 billion. Net income $97.0 billion. Cash, cash equivalents and marketable securities $143.6 billion. We returned $27.5 billion via share repurchases and dividends in Q4."""
    },
    {
        "id": "nvda_2024",
        "text": """NVIDIA Corporation Fiscal 2024 excerpt. Revenue reached $60.9 billion, up 126% YoY, driven by Data Center segment which grew to $47.5 billion. Gaming revenue $10.4 billion. Professional Visualization $1.6 billion. Automotive $1.1 billion. Gross margin 72.7%. Operating income $32.97 billion. Net income $29.76 billion. Demand for Hopper and upcoming Blackwell architecture was very strong."""
    }
]

eval_questions = [
    {
        "question": "Which company had the highest revenue growth percentage in their latest reported year?",
        "relevant_doc_ids": ["nvda_2024"]
    },
    {
        "question": "What was Tesla's vehicle delivery number in 2023?",
        "relevant_doc_ids": ["tsla_2023"]
    },
    {
        "question": "Which company reported services revenue over 80 billion?",
        "relevant_doc_ids": ["aapl_2023"]
    },
    {
        "question": "Who started Cybertruck production?",
        "relevant_doc_ids": ["tsla_2023"]
    },
    {
        "question": "Which company had operating income above 30 billion recently?",
        "relevant_doc_ids": ["nvda_2024"]
    }
]

# ──────────────────────────────────────────────────────────────
# Run grid search experiment
# ──────────────────────────────────────────────────────────────

runner = ExperimentRunner()

results_df = runner.run_grid_search(
    documents=documents,
    eval_questions=eval_questions,
    models=[
        "sentence-transformers/all-MiniLM-L6-v2",
        "sentence-transformers/all-mpnet-base-v2"
    ],
    chunk_sizes=[80, 150, 300],
    top_ks=[3, 6],
    overlap=25
)


print("\nFinal Experiment Results:")
display(results_df.sort_values("avg_mrr", ascending=False))


best = runner.get_best_config('avg_mrr')
print("\nBest configuration:")
print(best)


Running: all-MiniLM-L6-v2_c80_k3
Model: sentence-transformers/all-MiniLM-L6-v2, Chunk: 80, K: 3
Created 4 chunks from 3 documents


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 4 chunks

Results:
 MRR:        0.900
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       8.85s

Running: all-MiniLM-L6-v2_c80_k6
Model: sentence-transformers/all-MiniLM-L6-v2, Chunk: 80, K: 6
Created 4 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 4 chunks

Results:
 MRR:        0.900
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.23s

Running: all-MiniLM-L6-v2_c150_k3
Model: sentence-transformers/all-MiniLM-L6-v2, Chunk: 150, K: 3
Created 3 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 3 chunks

Results:
 MRR:        0.900
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.25s

Running: all-MiniLM-L6-v2_c150_k6
Model: sentence-transformers/all-MiniLM-L6-v2, Chunk: 150, K: 6
Created 3 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 3 chunks

Results:
 MRR:        0.900
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.17s

Running: all-MiniLM-L6-v2_c300_k3
Model: sentence-transformers/all-MiniLM-L6-v2, Chunk: 300, K: 3
Created 3 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 3 chunks

Results:
 MRR:        0.900
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.19s

Running: all-MiniLM-L6-v2_c300_k6
Model: sentence-transformers/all-MiniLM-L6-v2, Chunk: 300, K: 6
Created 3 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 3 chunks

Results:
 MRR:        0.900
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.21s

Running: all-mpnet-base-v2_c80_k3
Model: sentence-transformers/all-mpnet-base-v2, Chunk: 80, K: 3
Created 4 chunks from 3 documents


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 4 chunks

Results:
 MRR:        0.800
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       9.63s

Running: all-mpnet-base-v2_c80_k6
Model: sentence-transformers/all-mpnet-base-v2, Chunk: 80, K: 6
Created 4 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 4 chunks

Results:
 MRR:        0.800
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.44s

Running: all-mpnet-base-v2_c150_k3
Model: sentence-transformers/all-mpnet-base-v2, Chunk: 150, K: 3
Created 3 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 3 chunks

Results:
 MRR:        0.800
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.37s

Running: all-mpnet-base-v2_c150_k6
Model: sentence-transformers/all-mpnet-base-v2, Chunk: 150, K: 6
Created 3 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 3 chunks

Results:
 MRR:        0.800
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.41s

Running: all-mpnet-base-v2_c300_k3
Model: sentence-transformers/all-mpnet-base-v2, Chunk: 300, K: 3
Created 3 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 3 chunks

Results:
 MRR:        0.800
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.35s

Running: all-mpnet-base-v2_c300_k6
Model: sentence-transformers/all-mpnet-base-v2, Chunk: 300, K: 6
Created 3 chunks from 3 documents


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Built index with 3 chunks

Results:
 MRR:        0.800
 Recall@5:   1.000
 Recall@10:  1.000
 Time:       2.54s

Final Experiment Results:


Unnamed: 0,experiment_name,model_name,chunk_size,overlap,top_k,num_chunks,avg_mrr,avg_recall@5,avg_recall@10,avg_precision@5,elapsed_time
0,all-MiniLM-L6-v2_c80_k3,sentence-transformers/all-MiniLM-L6-v2,80,25,3,4,0.9,1.0,1.0,0.28,8.846389
1,all-MiniLM-L6-v2_c80_k6,sentence-transformers/all-MiniLM-L6-v2,80,25,6,4,0.9,1.0,1.0,0.28,2.229379
2,all-MiniLM-L6-v2_c150_k3,sentence-transformers/all-MiniLM-L6-v2,150,25,3,3,0.9,1.0,1.0,0.2,2.25131
3,all-MiniLM-L6-v2_c150_k6,sentence-transformers/all-MiniLM-L6-v2,150,25,6,3,0.9,1.0,1.0,0.2,2.173828
4,all-MiniLM-L6-v2_c300_k3,sentence-transformers/all-MiniLM-L6-v2,300,25,3,3,0.9,1.0,1.0,0.2,2.190474
5,all-MiniLM-L6-v2_c300_k6,sentence-transformers/all-MiniLM-L6-v2,300,25,6,3,0.9,1.0,1.0,0.2,2.212576
6,all-mpnet-base-v2_c80_k3,sentence-transformers/all-mpnet-base-v2,80,25,3,4,0.8,1.0,1.0,0.28,9.632159
7,all-mpnet-base-v2_c80_k6,sentence-transformers/all-mpnet-base-v2,80,25,6,4,0.8,1.0,1.0,0.28,2.435251
8,all-mpnet-base-v2_c150_k3,sentence-transformers/all-mpnet-base-v2,150,25,3,3,0.8,1.0,1.0,0.2,2.371199
9,all-mpnet-base-v2_c150_k6,sentence-transformers/all-mpnet-base-v2,150,25,6,3,0.8,1.0,1.0,0.2,2.405707



Best configuration:
{'experiment_name': 'all-MiniLM-L6-v2_c80_k3', 'model_name': 'sentence-transformers/all-MiniLM-L6-v2', 'chunk_size': 80, 'overlap': 25, 'top_k': 3, 'num_chunks': 4, 'avg_mrr': 0.9, 'avg_recall@5': 1.0, 'avg_recall@10': 1.0, 'avg_precision@5': 0.28, 'elapsed_time': 8.846388816833496}
