In [None]:
# Cell 1: Installation
# Installs torch/torchvision/torchaudio from the PyTorch CUDA 11.8 wheel index.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Installs the retrieval/data libraries used by the cells below.
# NOTE(review): later cells call `faiss.*` directly, but faiss is only requested here
# through the `[faiss-gpu]` extra of sentence-transformers — confirm that this extra
# actually installs faiss; otherwise add an explicit faiss package to this line.
!pip install -q sentence-transformers[faiss-gpu] rank-bm25 datasets pandas tqdm

In [None]:
# Cell 2: GPU Verification
import torch
# Fail fast when no CUDA device is visible: later cells move the model to 'cuda'.
if not torch.cuda.is_available():
    raise AssertionError("GPU not detected! Go to Runtime → Change runtime type → GPU")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"CUDA available: {torch.cuda.is_available()}")


In [None]:
# Cell 3: Imports
import gc
import multiprocessing as mp
import os
import pickle
import random
from collections import defaultdict
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from datasets import load_dataset
from rank_bm25 import BM25Okapi, BM25Plus
from sentence_transformers import CrossEncoder, InputExample, SentenceTransformer, losses, util
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [None]:
# Cell 4: Load Dataset
# Load the HotpotQA "fullwiki" training split and print one record as a sanity check.
train_ds = load_dataset("hotpot_qa", "fullwiki", split="train")
print(f"Train examples: {len(train_ds):,}")

example = train_ds[0]
print(
    f"\nQuestion: {example['question']}\n"
    f"Answer: {example['answer']}\n"
    f"Supporting Facts - Titles: {example['supporting_facts']['title']}\n"
    f"Supporting Facts - Sent IDs: {example['supporting_facts']['sent_id']}"
)

In [None]:
# Cell 5: Flatten Dataset to Sentence-Level
def dataset_to_dataframe(ds):
    """Flatten question-level records into one DataFrame row per context sentence.

    Each record in ``ds`` must expose ``id``, ``question``, ``answer``,
    ``supporting_facts`` (parallel ``title`` / ``sent_id`` lists) and ``context``
    (parallel ``title`` / ``sentences`` lists). A sentence is labelled 1 when its
    (title, position) pair is listed in the supporting facts, else 0.

    Returns:
        pd.DataFrame with columns query_id, query, passage_text, passage_title,
        sent_id, label, answer.
    """
    flattened = []
    for record in tqdm(ds, desc="Flattening to sentences"):
        question = record["question"]
        record_id = record["id"]
        gold_answer = record["answer"]

        facts = record["supporting_facts"]
        gold_pairs = set(zip(facts["title"], facts["sent_id"]))

        context = record["context"]
        flattened.extend(
            {
                "query_id": record_id,
                "query": question,
                "passage_text": text,
                "passage_title": doc_title,
                "sent_id": position,
                "label": int((doc_title, position) in gold_pairs),
                "answer": gold_answer,
            }
            for doc_title, doc_sentences in zip(context["title"], context["sentences"])
            for position, text in enumerate(doc_sentences)
        )

    return pd.DataFrame(flattened)

# Materialise the sentence-level table and report the label balance.
train_df = dataset_to_dataframe(train_ds)
print(
    f"\nTotal sentence rows: {len(train_df):,}\n"
    f"Positive sentences: {train_df['label'].sum():,}\n"
    f"Negative sentences: {(train_df['label'] == 0).sum():,}\n"
    f"Avg positives per query: {train_df['label'].sum() / train_df['query_id'].nunique():.2f}"
)

In [None]:
# Cell 6: Build BM25 Index
# Sparse index over the distinct sentences; tokenisation is lower-case + whitespace split.
unique_passages = train_df["passage_text"].drop_duplicates().to_list()
print(f"\nUnique passages: {len(unique_passages):,}")
print("Tokenizing for BM25...")
tokenized_corpus = list(map(lambda passage: passage.lower().split(), tqdm(unique_passages)))
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Cell 7: BM25 Search Function
def search_bm25(query: str, top_k: int = 10, index=None, passages=None):
    """Return the ``top_k`` best BM25 matches for ``query``, best first.

    Args:
        query: Free-text query; tokenised by lower-casing and whitespace splitting,
            the same way the corpus was tokenised.
        top_k: Maximum number of hits. Clamped to the corpus size; values <= 0
            yield an empty list instead of an error.
        index: Object exposing ``get_scores(tokens)``. Defaults to the notebook-level
            ``bm25`` index.
        passages: Sequence of passage texts aligned with ``index``. Defaults to the
            notebook-level ``unique_passages``.

    Returns:
        List of dicts with keys ``rank`` (1-based), ``score`` (float) and ``passage``.
    """
    index = bm25 if index is None else index
    passages = unique_passages if passages is None else passages

    scores = np.asarray(index.get_scores(query.lower().split()))
    k = max(0, min(top_k, len(scores)))
    if k == 0:
        return []

    if k < len(scores):
        # Partial selection: only the k best candidates need to be ordered.
        candidates = np.argpartition(-scores, k - 1)[:k]
    else:
        candidates = np.arange(len(scores))
    ranked = candidates[np.argsort(-scores[candidates], kind="stable")]

    return [
        {"rank": position, "score": float(scores[idx]), "passage": passages[idx]}
        for position, idx in enumerate(ranked, 1)
    ]

# Smoke test: show the five best BM25 hits for a known training question.
print("\n=== BM25 Test ===")
for hit in search_bm25("Which magazine was started first Arthur's Magazine or First for Women?", top_k=5):
    print(f"[{hit['rank']}] {hit['score']:.2f} | {hit['passage'][:150]}...")

In [None]:
# Cell 8: Create Training Examples
# One (question, supporting sentence) InputExample per gold sentence; records
# without supporting facts contribute nothing.
print("\nBuilding training examples...")
train_examples = []

for ex in tqdm(train_ds, desc="Creating training pairs"):
    facts = ex["supporting_facts"]
    gold_pairs = set(zip(facts["title"], facts["sent_id"]))
    if len(gold_pairs) == 0:
        continue

    context = ex["context"]
    train_examples.extend(
        InputExample(texts=[ex["question"], sentence])
        for title, sentences in zip(context["title"], context["sentences"])
        for sent_id, sentence in enumerate(sentences)
        if (title, sent_id) in gold_pairs
    )

print(f"Training pairs: {len(train_examples):,}")
print(f"Average positives per query: {len(train_examples) / len(train_ds):.2f}")

In [None]:
# Cell 9: Create Validation Set
print("\nBuilding validation set...")
# NOTE(review): these pairs are drawn from train_ds, the same split the training
# pairs are built from, so the evaluator measures fit on seen data. Consider a
# held-out split before trusting the scores for model selection.
val_queries = []
val_positives = []  # candidate sentence of every pair: gold sentences AND sampled negatives
val_scores = []
seen_queries = set()      # every distinct question scanned (accepted or not)
accepted_queries = 0      # questions that actually contributed pairs
val_rng = random.Random(42)  # dedicated seeded RNG so the sampled negatives are reproducible

for record in tqdm(train_ds, desc="Validation set"):
    query = record["question"]

    if query in seen_queries:
        continue
    seen_queries.add(query)

    supporting_facts = set(
        zip(record["supporting_facts"]["title"], record["supporting_facts"]["sent_id"])
    )

    if supporting_facts:
        positives = []
        negatives = []
        for title, sentences in zip(record["context"]["title"], record["context"]["sentences"]):
            for sent_id, sentence in enumerate(sentences):
                if (title, sent_id) in supporting_facts:
                    positives.append(sentence)
                else:
                    negatives.append(sentence)

        # Only keep questions with at least one gold sentence and ten distractors.
        if positives and len(negatives) >= 10:
            for pos_sentence in positives:
                val_queries.append(query)
                val_positives.append(pos_sentence)
                val_scores.append(1.0)

            for neg_sentence in val_rng.sample(negatives, 10):
                val_queries.append(query)
                val_positives.append(neg_sentence)
                val_scores.append(0.0)

            accepted_queries += 1

    # Scan budget: stop after 5000 distinct questions have been looked at.
    if len(seen_queries) >= 5000:
        break

# Averages are per *accepted* question; dividing by every scanned question
# would under-report them whenever some questions were rejected.
stats_denominator = max(accepted_queries, 1)
print(f"Final validation pairs: {len(val_queries)}")
print(f"Unique queries: {accepted_queries}")
print(f"Avg positives per query: {val_scores.count(1.0) / stats_denominator:.2f}")
print(f"Avg negatives per query: {val_scores.count(0.0) / stats_denominator:.2f}")

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_queries,
    sentences2=val_positives,
    scores=val_scores,
    batch_size=128,
    name="hotpotqa-val",
    show_progress_bar=True
)

In [None]:
# Cell 10: Training Configuration
# Base model, loss, data loader and the hand-written per-epoch LR schedule.
print("\nLoading base model on GPU...")
model = SentenceTransformer('all-MiniLM-L6-v2').to('cuda')

train_loss = losses.MultipleNegativesRankingLoss(model=model, scale=20.0)
dataloader = DataLoader(train_examples, shuffle=True, batch_size=64)

# Early-stopping bookkeeping consumed by the training loop.
best_score, patience, patience_counter = -1.0, 5, 0
max_epochs = 20
# 7 epochs at 2e-5, then 6 at 1e-5, then 7 at 2e-6 (one entry per epoch).
learning_rates = [2e-5] * 7 + [1e-5] * 6 + [2e-6] * 7

os.makedirs("models", exist_ok=True)

print(f"\nTotal epochs: {max_epochs}")
print(f"Batch size: 64")
print(f"Patience: {patience}")
print(f"LR schedule: 2e-5 (epochs 1-7) → 1e-5 (epochs 8-13) → 2e-6 (epochs 14-20)\n")

In [None]:
# Cell 11: Training Loop
def _metric_from_eval(eval_result, suffixes):
    """Return the first metric whose key equals, or ends with ``_<suffix>``, one of ``suffixes``.

    Dict-returning evaluators may prefix metric keys with the evaluator name
    (e.g. ``hotpotqa-val_spearman_cosine``), so a bare ``dict.get("spearman_cosine")``
    misses them. Suffixes are tried in order; ``None`` means nothing matched.
    A value of 0.0 is a valid result and is returned as such.
    """
    for suffix in suffixes:
        for key, value in eval_result.items():
            if key == suffix or key.endswith("_" + suffix):
                return float(value)
    return None


# One fit() call per epoch so the learning rate can follow `learning_rates`.
# NOTE(review): every fit() call receives its own warmup_steps and optimizer_params,
# so warmup appears to restart each epoch — confirm this is intended.
for epoch in range(1, max_epochs + 1):
    current_lr = learning_rates[epoch - 1]
    print(f"\n=== Epoch {epoch}/{max_epochs} (LR: {current_lr}) ===")

    model.fit(
        train_objectives=[(dataloader, train_loss)],
        epochs=1,
        warmup_steps=int(0.1 * len(dataloader)),
        optimizer_params={'lr': current_lr},
        show_progress_bar=True,
    )

    print("Evaluating on validation set...")
    eval_result = evaluator(model)

    if isinstance(eval_result, dict):
        score = _metric_from_eval(
            eval_result,
            ("cosine_pearson", "pearson_cosine", "cosine_spearman", "spearman_cosine"),
        )
        if score is None:
            # Unknown key layout: fall back to the first reported metric.
            score = float(next(iter(eval_result.values())))
        spearman = _metric_from_eval(eval_result, ("cosine_spearman", "spearman_cosine"))
        if spearman is None:
            spearman = score
        print(f"Validation → Main metric: {score:.5f} | Spearman: {spearman:.5f}")
    else:
        score = eval_result
        print(f"Validation score: {score:.5f}")

    # Periodic checkpoint, independent of whether this epoch improved.
    if epoch % 5 == 0:
        checkpoint_path = f"models/minilm-hotpotqa-epoch{epoch}-score{score:.4f}"
        model.save(checkpoint_path)
        print(f"✓ Checkpoint saved: {checkpoint_path}")

    # Require a 0.001 margin so metric noise does not reset the patience counter.
    if score > best_score + 0.001:
        best_score = score
        patience_counter = 0
        best_model_path = f"models/minilm-hotpotqa-best-score{best_score:.4f}"
        model.save(best_model_path)
        print(f"✓ New best model saved: {best_model_path}")
    else:
        patience_counter += 1
        print(f"No improvement ({patience_counter}/{patience})")

    if patience_counter >= patience:
        print("Early stopping triggered!")
        break

In [None]:
# Cell 12: Load Best Model and Save Final
# Reload the best checkpoint (directory names embed the score, so the
# lexicographically largest name is selected) and write it to a fixed final path.
print("\n=== Loading best model for final save ===")
best_models = list(filter(lambda d: d.startswith("minilm-hotpotqa-best"), os.listdir("models")))
if len(best_models) > 0:
    best_model_dir = max(best_models)
    model = SentenceTransformer(f"models/{best_model_dir}").to('cuda')
    print(f"Loaded: {best_model_dir}")

final_path = "models/minilm-hotpotqa-finetuned-final"
model.save(final_path)

for summary_line in (
    f"\n{'='*60}",
    f"Training finished!",
    f"Best validation score: {best_score:.5f}",
    f"Total epochs completed: {epoch}",
    f"Model saved to: {final_path}",
    f"{'='*60}",
):
    print(summary_line)

# Cell 13: Performance Test
# Qualitative check: rank a random sample of passages against a few queries.
print("\n=== Quick Performance Test ===")
test_queries = [
    "Which magazine was started first Arthur's Magazine or First for Women?",
    "What is the capital of France?",
    "Who invented the telephone?"
]

# The sample and its embeddings do not depend on the query, so build them once
# instead of re-encoding up to 1000 passages for every query.
corpus_sample = random.sample(unique_passages, min(1000, len(unique_passages)))
corpus_embs = model.encode(corpus_sample, convert_to_tensor=True, show_progress_bar=False)
top_n = min(5, len(corpus_sample))  # topk raises if k exceeds the number of candidates

for test_q in test_queries:
    print(f"\nQuery: {test_q}")
    query_emb = model.encode(test_q, convert_to_tensor=True)

    scores = util.cos_sim(query_emb, corpus_embs)[0]
    best = torch.topk(scores, k=top_n)

    for i, (score, idx) in enumerate(zip(best.values.tolist(), best.indices.tolist())):
        print(f"  [{i+1}] {score:.4f} | {corpus_sample[idx][:100]}...")


In [None]:
# Cell 14: Save to Google Drive
# Colab-only: mount Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Copy the whole local `models` directory into a timestamped folder under
# "MyDrive/Colab Models" (the target parent directory is created if missing).
!mkdir -p "/content/drive/MyDrive/Colab Models"
!cp -r models "/content/drive/MyDrive/Colab Models/hotpotqa-minilm-$(date +%Y%m%d-%H%M%S)"
print("\n All models copied to your Google Drive!")

In [None]:
# Cell 15: Load Model
# Locations of the fine-tuned checkpoint and the pre-flattened dataset on Drive.
MODEL_PATH = "/content/drive/MyDrive/Interview/minilm-hotpotqa-epoch15-score0.5296"
DATASET_PATH = "/content/drive/MyDrive/Interview/HotpotQA_Dataset_20251128"

USE_GPU = torch.cuda.is_available()
BATCH_SIZE = 256 if USE_GPU else 128
USE_FP16 = USE_GPU
NUM_WORKERS = min(4, mp.cpu_count())

# Raise instead of calling exit(): in a notebook exit() tears down the kernel,
# which hides the reason and discards all state. An exception stops "Run All"
# at this cell with a readable message.
if not os.path.exists(MODEL_PATH):
    print(f"ERROR: Model not found at {MODEL_PATH}")
    print("Available models in Interview folder:")
    interview_path = "/content/drive/MyDrive/Interview"
    if os.path.exists(interview_path):
        for item in os.listdir(interview_path):
            print(f"  - {item}")
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}")

if not os.path.exists(DATASET_PATH):
    print(f"ERROR: Dataset not found at {DATASET_PATH}")
    raise FileNotFoundError(f"Dataset not found at {DATASET_PATH}")

print(f" Model path verified: {MODEL_PATH}")
print(f" Dataset path verified: {DATASET_PATH}")
model = SentenceTransformer(MODEL_PATH)
device = 'cuda' if USE_GPU else 'cpu'
model = model.to(device)

if USE_GPU:
    # Half precision is only enabled on GPU.
    model.half()
    print(f" Model loaded on GPU with FP16: {torch.cuda.get_device_name(0)}")
    print(f"  GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print(f" Model loaded on CPU")

model.eval()

In [None]:
# Cell 16: Load Dataset
# Load the sentence-level validation table, preferring parquet over CSV.
print("\n[2/7] Loading validation dataset...")
validation_parquet = f"{DATASET_PATH}/validation_sentences.parquet"
validation_csv = f"{DATASET_PATH}/validation_sentences.csv"

if os.path.exists(validation_parquet):
    val_df = pd.read_parquet(validation_parquet)
    print(f" Loaded from parquet: {len(val_df):,} sentences")
elif os.path.exists(validation_csv):
    val_df = pd.read_csv(validation_csv)
    print(f" Loaded from CSV: {len(val_df):,} sentences")
else:
    # Raise rather than exit(): exit() would shut the notebook kernel down.
    print("ERROR: Could not find validation_sentences.parquet or .csv")
    raise FileNotFoundError(
        f"Could not find validation_sentences.parquet or .csv in {DATASET_PATH}"
    )

print(f"  Supporting sentences: {val_df['is_supporting'].sum():,}")
print(f"  Unique questions: {val_df['id'].nunique():,}")

In [None]:
# Cell 17: Prepare and encode Corpus
# Retrieval corpus = distinct sentence texts; the first occurrence keeps its metadata.
print("\n[3/7] Preparing corpus with deduplication...")
corpus_df = (
    val_df[['context_title', 'context_sentence', 'sent_id']]
    .drop_duplicates(subset=['context_sentence'], keep='first')
    .reset_index(drop=True)
)

corpus = corpus_df['context_sentence'].tolist()
print(f" Unique sentences in corpus: {len(corpus):,}")
print(f"  Deduplication ratio: {len(corpus) / len(val_df) * 100:.1f}% of original")

print("\n[4/7] Encoding corpus with trained model...")
print(f"Settings: Batch size={BATCH_SIZE}, FP16={USE_FP16}, Device={device}")

def encode_corpus_optimized(model, corpus: List[str], batch_size: int, device: str) -> np.ndarray:
    """Encode ``corpus`` in batches and return the embeddings as a float32 array.

    Args:
        model: Object with a SentenceTransformer-style ``encode`` method that
            returns a tensor when called with ``convert_to_tensor=True``.
        corpus: Sentences to embed.
        batch_size: Sentences handed to ``model.encode`` per call.
        device: Device string the model lives on. Any value starting with
            ``'cuda'`` enables the periodic CUDA cache release.

    Returns:
        Array of shape ``(len(corpus), dim)`` with dtype float32, not normalised.
        For an empty corpus the result has shape ``(0, dim)``, where ``dim`` comes
        from ``model.get_sentence_embedding_dimension()`` when available, else 0.
    """
    on_gpu = str(device).startswith('cuda')
    batch_embeddings = []

    with torch.no_grad():
        for batch_no, start in enumerate(
            tqdm(range(0, len(corpus), batch_size), desc="Encoding batches")
        ):
            embeddings = model.encode(
                corpus[start:start + batch_size],
                convert_to_tensor=True,
                show_progress_bar=False,
                batch_size=batch_size,
                normalize_embeddings=False
            )

            # .cpu() is a no-op for CPU tensors, so it is safe for every device
            # string (including 'cuda:0') and frees the batch from GPU memory.
            batch_embeddings.append(embeddings.detach().cpu())

            # Release cached GPU blocks every 10 batches.
            if on_gpu and batch_no % 10 == 0:
                torch.cuda.empty_cache()

    if not batch_embeddings:
        # torch.cat([]) raises, so build the empty result explicitly.
        dim_getter = getattr(model, "get_sentence_embedding_dimension", None)
        dim = (dim_getter() if callable(dim_getter) else 0) or 0
        return np.zeros((0, dim), dtype='float32')

    return torch.cat(batch_embeddings, dim=0).numpy().astype('float32')

# Embed the deduplicated corpus, report its size, then release transient memory.
corpus_embeddings_np = encode_corpus_optimized(model, corpus, BATCH_SIZE, device)
print(
    f"✓ Corpus encoded: {corpus_embeddings_np.shape}\n"
    f"  Embedding dimension: {corpus_embeddings_np.shape[1]}\n"
    f"  Total size: {corpus_embeddings_np.nbytes / 1e6:.2f} MB"
)

if USE_GPU:
    torch.cuda.empty_cache()
gc.collect()

In [None]:
# Cell 18: Build FAISS Index
# faiss is used from this cell onwards but is not imported in the import cell.
# It is imported here so that a missing faiss install fails at the first cell
# that needs it rather than blocking the training cells.
import faiss


def _train_ivf(target_index, vectors, max_train_vectors=100000):
    """Train an IVF index on a random subset of at most ``max_train_vectors`` rows."""
    train_size = min(len(vectors), max_train_vectors)
    train_indices = np.random.choice(len(vectors), train_size, replace=False)
    target_index.train(vectors[train_indices])


print("\n[5/7] Building optimized FAISS index...")
dimension = corpus_embeddings_np.shape[1]
num_vectors = corpus_embeddings_np.shape[0]

print(f"  Dimension: {dimension}")
print(f"  Vectors: {num_vectors:,}")

# Exact search below 100k vectors; otherwise IVF, with more lists from 1M upwards.
if num_vectors < 100000:
    index_type = "IndexFlatIP"
    print(f"  Index type: {index_type} (Exact search)")
    index = faiss.IndexFlatIP(dimension)
else:
    index_type = "IndexIVFFlat"
    if num_vectors < 1000000:
        nlist = min(4096, int(np.sqrt(num_vectors)))
    else:
        nlist = min(16384, int(np.sqrt(num_vectors) * 2))
    print(f"  Index type: {index_type} (nlist={nlist})")
    quantizer = faiss.IndexFlatIP(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_INNER_PRODUCT)

uses_ivf = 'IVF' in index_type

# Inner product on L2-normalised vectors equals cosine similarity.
# normalize_L2 works in place on corpus_embeddings_np.
print("  Normalizing embeddings for cosine similarity...")
faiss.normalize_L2(corpus_embeddings_np)

if USE_GPU:
    try:
        res = faiss.StandardGpuResources()
        res.setTempMemory(1024 * 1024 * 512)

        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        print("  ✓ Using GPU for FAISS indexing")

        if uses_ivf:
            print("  Training IVF index...")
            _train_ivf(gpu_index, corpus_embeddings_np)

        print("  Adding vectors to index...")
        gpu_index.add(corpus_embeddings_np)

        # Convert back to a CPU index so it can be written to disk and searched
        # without GPU resources.
        index = faiss.index_gpu_to_cpu(gpu_index)
        print(f"  ✓ GPU indexing complete")

    except Exception as e:
        # `index` is still the untouched CPU index here, so it can be built from scratch.
        print(f"  ⚠ GPU indexing failed ({e}), falling back to CPU")
        if uses_ivf:
            print("  Training IVF index on CPU...")
            _train_ivf(index, corpus_embeddings_np)
        index.add(corpus_embeddings_np)
else:
    print("  Using CPU for FAISS indexing")
    if uses_ivf:
        print("  Training IVF index...")
        _train_ivf(index, corpus_embeddings_np)
    index.add(corpus_embeddings_np)

print(f"✓ FAISS index built with {index.ntotal:,} vectors")

if uses_ivf:
    # Probe a quarter of the lists, capped at 64, and never fewer than one.
    index.nprobe = max(1, min(64, index.nlist // 4))
    print(f"  Set nprobe={index.nprobe} for search")

In [None]:
# Cell 19: Save Index and Corpus
# Persist the FAISS index and the corpus metadata next to the dataset.
os.makedirs(DATASET_PATH, exist_ok=True)

index_path = f"{DATASET_PATH}/faiss_index.bin"
faiss.write_index(index, index_path)
index_size_mb = os.path.getsize(index_path) / 1e6
print(f"✓ FAISS index saved: {index_path}")
print(f"  File size: {index_size_mb:.2f} MB")

corpus_path = f"{DATASET_PATH}/corpus.pkl"
corpus_data = dict(
    corpus=corpus,
    corpus_df=corpus_df,
    model_name=MODEL_PATH,
    embedding_dim=dimension,
    num_vectors=num_vectors,
    index_type=index_type,
)

# Written with pickle; only load this file back from a trusted location.
with open(corpus_path, 'wb') as corpus_file:
    pickle.dump(corpus_data, corpus_file, protocol=pickle.HIGHEST_PROTOCOL)
corpus_size_mb = os.path.getsize(corpus_path) / 1e6
print(f"✓ Corpus metadata saved: {corpus_path}")
print(f"  File size: {corpus_size_mb:.2f} MB")


In [None]:
# Cell 20: Test Index
# Query the freshly built index with a few questions, then print a build summary.
print("\n[7/7] Running comprehensive test...")
print("="*80)

test_queries = [
    "Which magazine was started first Arthur's Magazine or First for Women?",
    "What is the capital of France?",
    "Who won the Nobel Prize in Physics?"
]

k = 5
for test_query in test_queries:
    print(f"\nTest query: {test_query}")

    with torch.no_grad():
        query_emb = model.encode(test_query, convert_to_tensor=True, show_progress_bar=False)
        # .cpu() is a no-op when the tensor already lives on the CPU.
        query_emb_np = query_emb.cpu().numpy().astype('float32').reshape(1, -1)

    # The corpus vectors were L2-normalised, so the query must be as well.
    faiss.normalize_L2(query_emb_np)

    scores, indices = index.search(query_emb_np, k)

    print(f"Top {k} retrieved sentences:")
    for rank, (idx, score) in enumerate(zip(indices[0], scores[0]), 1):
        if idx < 0:
            # FAISS pads with -1 when fewer than k neighbours are found;
            # corpus[-1] would silently print an unrelated sentence.
            continue
        print(f"  [{rank}] Score: {score:.4f}")
        print(f"      {corpus[idx][:120]}...")

# The original line here had a 2-space indent matching no enclosing block
# (IndentationError); the closing rule belongs to the summary banner below.
print("\n" + "="*80)
print(" INDEX BUILD COMPLETE!")
print(f"\nOptimization Summary:")
print(f"  Device: {device.upper()}")
print(f"  FP16 enabled: {USE_FP16}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Index type: {index_type}")
if 'IVF' in index_type:
    print(f"  Search nprobe: {index.nprobe}")

print(f"\nFiles saved:")
print(f"  1. {index_path} ({index_size_mb:.2f} MB)")
print(f"  2. {corpus_path} ({corpus_size_mb:.2f} MB)")

print(f"\nPerformance Stats:")
print(f"  Total vectors indexed: {num_vectors:,}")
print(f"  Embedding dimension: {dimension}")
print(f"  Index memory usage: ~{index_size_mb:.1f} MB")

print(f"\n✓ You can now run the evaluation script!")

In [None]:
# Cell 22: Advanced Hybrid Retriever Class - Initialization
class AdvancedHybridRetriever:
    """Sentence retriever combining a dense FAISS index, BM25 and cross-encoder reranking.

    The whole class lives in one cell on purpose: a notebook cell that starts with
    an indented ``def`` is a syntax error, so methods cannot be spread over cells.

    Every retrieval method returns a list of ``(sentence, score, corpus_index)``
    tuples ordered best first.
    """

    # ------------------------------------------------------------------
    # Cell 22: Initialization
    # ------------------------------------------------------------------
    def __init__(self, model_path: str, index_path: str, corpus_path: str,
                 use_bm25_plus: bool = True):
        """Load the encoder, FAISS index, corpus, BM25 index and cross-encoders.

        Args:
            model_path: Path or name accepted by ``SentenceTransformer``.
            index_path: File written by ``faiss.write_index``.
            corpus_path: Pickle holding a dict with ``corpus`` and, optionally,
                ``corpus_df``.
            use_bm25_plus: Use ``BM25Plus`` when True, else ``BM25Okapi``.
        """
        print("Initializing Advanced Hybrid Retriever...")

        self.model = SentenceTransformer(model_path)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = self.model.to(self.device)
        print(f" Fine-tuned model loaded on {self.device}")

        self.dense_index = faiss.read_index(index_path)
        self._gpu_resources = None  # kept on the instance for as long as the GPU index is used
        if torch.cuda.is_available() and hasattr(faiss, 'StandardGpuResources'):
            try:
                res = faiss.StandardGpuResources()
                self.dense_index = faiss.index_cpu_to_gpu(res, 0, self.dense_index)
                self._gpu_resources = res
                print(f" FAISS index moved to GPU")
            except Exception as e:
                # Best effort: keep the CPU index, but say why the GPU move failed.
                print(f" FAISS index on CPU (GPU transfer failed: {e})")
        print(f"  Index size: {self.dense_index.ntotal:,} vectors")

        # SECURITY: pickle.load executes arbitrary code from the file; only load
        # corpus files produced by this notebook or another trusted source.
        with open(corpus_path, 'rb') as f:
            data = pickle.load(f)
            self.corpus = data['corpus']
            self.corpus_df = data.get('corpus_df', None)
        print(f" Corpus loaded: {len(self.corpus):,} sentences")

        print(f"Building BM25{'Plus' if use_bm25_plus else ''} index...")
        tokenized_corpus = [doc.lower().split() for doc in tqdm(self.corpus, desc="Tokenizing")]

        if use_bm25_plus:
            self.bm25 = BM25Plus(tokenized_corpus, k1=1.2, b=0.75, delta=1.0)
        else:
            self.bm25 = BM25Okapi(tokenized_corpus, k1=1.5, b=0.75)
        print(" BM25 index built")

        self.cross_encoders = []
        cross_encoder_models = [
            'cross-encoder/ms-marco-MiniLM-L-6-v2',
            'cross-encoder/ms-marco-MiniLM-L-12-v2'
        ]

        # Each cross-encoder is optional; reranking is disabled if none loads.
        for ce_model in cross_encoder_models:
            try:
                ce = CrossEncoder(ce_model, device=self.device, max_length=512)
                self.cross_encoders.append(ce)
                print(f" Loaded cross-encoder: {ce_model}")
            except Exception as e:
                print(f" Could not load {ce_model}: {e}")

        self.use_reranking = len(self.cross_encoders) > 0
        # Maps "dense_<query>" -> (search depth, results); see _dense_retrieval.
        self.query_cache = {}
        self.cache_enabled = True

    # ------------------------------------------------------------------
    # Cell 23: Main Retrieve Method
    # ------------------------------------------------------------------
    def retrieve(self,
                 query: str,
                 top_k: int = 12,
                 method: str = "rrf",
                 dense_k: int = 100,
                 sparse_k: int = 100,
                 rrf_k: int = 60,
                 rerank: bool = True,
                 rerank_top_k: int = 50,
                 query_expansion: bool = False,
                 context_aware: bool = True) -> List[Tuple[str, float, int]]:
        """Retrieve the ``top_k`` best sentences for ``query``.

        Args:
            query: Question text.
            top_k: Number of results to return.
            method: ``"dense"``, ``"bm25"``, ``"hybrid"`` or ``"rrf"``.
            dense_k, sparse_k: Candidate depth per retriever (RRF only).
            rrf_k: Rank-smoothing constant of reciprocal rank fusion.
            rerank: Rerank with the cross-encoder ensemble when any is loaded.
            rerank_top_k: Size of the candidate pool handed to the reranker.
            query_expansion: Append frequent terms from the top dense hits first.
            context_aware: Accepted for interface compatibility; currently unused.

        Raises:
            ValueError: If ``method`` is not one of the supported names.
        """
        if method not in ("dense", "bm25", "hybrid", "rrf"):
            raise ValueError(f"Unknown method: {method}")

        if query_expansion:
            query = self._expand_query(query, top_n=5)

        will_rerank = rerank and self.use_reranking

        if will_rerank and top_k < rerank_top_k:
            # The reranker needs a deeper pool than top_k, so fetch that pool
            # directly instead of retrieving top_k first and discarding it.
            # Non-RRF methods draw their pool from the hybrid retriever.
            if method == "rrf":
                candidates = self._rrf_retrieval(query, rerank_top_k, dense_k, sparse_k, rrf_k)
            else:
                candidates = self._hybrid_retrieval(query, rerank_top_k, alpha=0.7)
            return self._ensemble_rerank(query, candidates, top_k)[:top_k]

        if method == "dense":
            results = self._dense_retrieval(query, top_k)
        elif method == "bm25":
            results = self._bm25_retrieval(query, top_k)
        elif method == "hybrid":
            results = self._hybrid_retrieval(query, top_k, alpha=0.7)
        else:
            results = self._rrf_retrieval(query, top_k, dense_k, sparse_k, rrf_k)

        if will_rerank:
            results = self._ensemble_rerank(query, results[:rerank_top_k], top_k)

        return results[:top_k]

    # ------------------------------------------------------------------
    # Cell 24: Dense Retrieval Method
    # ------------------------------------------------------------------
    def _dense_retrieval(self, query: str, top_k: int) -> List[Tuple[str, float, int]]:
        """Nearest-neighbour search in the FAISS index using the fine-tuned encoder.

        Results are cached per query together with the depth they were searched
        at. A cached entry is reused only when that depth covers the request;
        otherwise the search is repeated at the larger depth, so a shallow early
        call can never truncate a later, deeper one.
        """
        if top_k <= 0:
            return []

        total = self.dense_index.ntotal
        cache_key = f"dense_{query}"

        if self.cache_enabled:
            cached = self.query_cache.get(cache_key)
            if cached is not None:
                cached_depth, cached_results = cached
                if cached_depth >= min(top_k, total):
                    return cached_results[:top_k]

        query_emb = self.model.encode(query, convert_to_tensor=True, show_progress_bar=False)
        query_emb_np = query_emb.cpu().numpy().astype('float32').reshape(1, -1)
        faiss.normalize_L2(query_emb_np)

        # Over-fetch 2x so slightly deeper follow-up requests hit the cache.
        depth = min(top_k * 2, total)
        scores, indices = self.dense_index.search(query_emb_np, depth)

        results = [
            (self.corpus[idx], float(score), int(idx))
            for idx, score in zip(indices[0], scores[0])
            if idx >= 0  # FAISS pads missing neighbours with -1
        ]

        if self.cache_enabled:
            self.query_cache[cache_key] = (depth, results)

        return results[:top_k]

    # ------------------------------------------------------------------
    # Cell 25: BM25 Retrieval Method
    # ------------------------------------------------------------------
    def _bm25_retrieval(self, query: str, top_k: int) -> List[Tuple[str, float, int]]:
        """Score every corpus sentence with BM25 and return the ``top_k`` best."""
        scores = np.asarray(self.bm25.get_scores(query.lower().split()))
        k = max(0, min(top_k, len(scores)))
        if k == 0:
            return []

        if k < len(scores):
            # Select the k best without sorting the whole corpus.
            candidates = np.argpartition(-scores, k - 1)[:k]
        else:
            candidates = np.arange(len(scores))
        top_idx = candidates[np.argsort(-scores[candidates], kind="stable")]

        return [(self.corpus[idx], float(scores[idx]), int(idx)) for idx in top_idx]

    # ------------------------------------------------------------------
    # Cell 26: Hybrid Retrieval Method
    # ------------------------------------------------------------------
    @staticmethod
    def _min_max_normalize(score_by_idx: Dict[int, float]) -> Dict[int, float]:
        """Rescale the scores to [0, 1]; the epsilon avoids 0/0 when all are equal."""
        if not score_by_idx:
            return score_by_idx
        low, high = min(score_by_idx.values()), max(score_by_idx.values())
        return {idx: (value - low) / (high - low + 1e-10) for idx, value in score_by_idx.items()}

    def _hybrid_retrieval(self, query: str, top_k: int, alpha: float) -> List[Tuple[str, float, int]]:
        """Weighted sum of min-max normalised dense and BM25 scores.

        ``alpha`` weights the dense score and ``1 - alpha`` the BM25 score. A
        sentence found by only one retriever scores 0 for the other.
        """
        dense_dict = self._min_max_normalize(
            {idx: score for _, score, idx in self._dense_retrieval(query, top_k * 3)}
        )
        bm25_dict = self._min_max_normalize(
            {idx: score for _, score, idx in self._bm25_retrieval(query, top_k * 3)}
        )

        combined = {
            idx: alpha * dense_dict.get(idx, 0) + (1 - alpha) * bm25_dict.get(idx, 0)
            for idx in set(dense_dict) | set(bm25_dict)
        }

        sorted_results = sorted(combined.items(), key=lambda x: x[1], reverse=True)[:top_k]
        return [(self.corpus[idx], float(score), int(idx)) for idx, score in sorted_results]

    # ------------------------------------------------------------------
    # Cell 27: RRF Retrieval Method
    # ------------------------------------------------------------------
    def _rrf_retrieval(self, query: str, top_k: int,
                      dense_k: int, sparse_k: int, rrf_k: int) -> List[Tuple[str, float, int]]:
        """Reciprocal rank fusion: sum of ``1 / (rrf_k + rank)`` over both rankings."""
        dense_results = self._dense_retrieval(query, dense_k)
        bm25_results = self._bm25_retrieval(query, sparse_k)

        rrf_scores = defaultdict(float)
        for ranking in (dense_results, bm25_results):
            for rank, (_, _, idx) in enumerate(ranking, 1):
                rrf_scores[idx] += 1.0 / (rrf_k + rank)

        sorted_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
        return [(self.corpus[idx], float(score), int(idx)) for idx, score in sorted_results]

    # ------------------------------------------------------------------
    # Cell 28: Ensemble Reranking Method
    # ------------------------------------------------------------------
    def _ensemble_rerank(self, query: str, candidates: List[Tuple[str, float, int]],
                        top_k: int) -> List[Tuple[str, float, int]]:
        """Reorder ``candidates`` by the mean score of all loaded cross-encoders.

        Without any cross-encoder, or with no candidates, the input order is kept.
        """
        if not self.cross_encoders or not candidates:
            return candidates[:top_k]

        sentences = [sent for sent, _, _ in candidates]
        indices = [idx for _, _, idx in candidates]
        pairs = [[query, sent] for sent in sentences]

        all_scores = [
            ce.predict(pairs, show_progress_bar=False, batch_size=32)
            for ce in self.cross_encoders
        ]
        ensemble_scores = np.mean(all_scores, axis=0)

        reranked = [(sentences[i], float(ensemble_scores[i]), indices[i])
                   for i in range(len(sentences))]
        reranked.sort(key=lambda x: x[1], reverse=True)

        return reranked[:top_k]

    # ------------------------------------------------------------------
    # Cell 29: Query Expansion Method
    # ------------------------------------------------------------------
    def _expand_query(self, query: str, top_n: int = 5) -> str:
        """Append up to two frequent terms from the ``top_n`` dense hits to ``query``.

        Candidate terms are whitespace tokens longer than three characters that are
        not stop words. The three most frequent are considered, and those already
        contained in the lower-cased query (substring match) are dropped.
        """
        initial_results = self._dense_retrieval(query, top_n)

        word_freq = defaultdict(int)
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
                     'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be', 'been'}

        for sent, _, _ in initial_results:
            for word in sent.lower().split():
                if word not in stop_words and len(word) > 3:
                    word_freq[word] += 1

        top_terms = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:3]
        lowered_query = query.lower()
        expansion_terms = [term for term, _ in top_terms if term not in lowered_query]

        if expansion_terms:
            return query + " " + " ".join(expansion_terms[:2])

        return query

    # ------------------------------------------------------------------
    # Cell 30: Batch Retrieve Method
    # ------------------------------------------------------------------
    def batch_retrieve(self, queries: List[str], **kwargs) -> List[List[Tuple[str, float, int]]]:
        """Run :meth:`retrieve` for each query in turn, forwarding ``kwargs`` unchanged."""
        return [self.retrieve(query, **kwargs) for query in tqdm(queries, desc="Batch retrieval")]

In [None]:
# Cell 31: Evaluation Function
def evaluate_retrieval_comprehensive(retriever: "AdvancedHybridRetriever",
                                    test_df: pd.DataFrame,
                                    num_questions: int = 500,
                                    configs: List[Dict] = None) -> Tuple[Dict, Dict]:
    """Score several retrieval configurations against supporting sentences.

    Args:
        retriever: Object whose ``retrieve(question, **config)`` returns a
            ranked list of ``(sentence, score, index)`` tuples.
        test_df: Sentence-level frame with the columns ``id``, ``question``,
            ``context_sentence`` and ``is_supporting`` (1 marks ground truth).
        num_questions: How many distinct question ids (the first N after
            grouping by ``id``) are evaluated.
        configs: One dict per configuration. ``name`` labels the results; every
            other key is forwarded to ``retriever.retrieve``. When ``None``,
            five built-in configurations are used.

    Returns:
        ``(results, summary)``. ``results`` maps config name -> metric name ->
        list of per-question values. ``summary`` maps config name -> metric
        name -> mean value, for the reported metrics that have at least one
        value (precision@5 and precision@10 are collected but not summarised).

    Questions without any supporting sentence are skipped. If a configuration
    raises for a question, the error is printed and nothing is recorded for
    that (question, configuration) pair.
    """
    if configs is None:
        configs = [
            {"name": "Dense", "method": "dense", "top_k": 12, "rerank": False},
            {"name": "BM25", "method": "bm25", "top_k": 12, "rerank": False},
            {"name": "Hybrid (α=0.7)", "method": "hybrid", "top_k": 12, "rerank": False},
            {"name": "RRF", "method": "rrf", "top_k": 12, "rerank": False},
            {"name": "RRF + Rerank", "method": "rrf", "top_k": 12, "rerank": True, "rerank_top_k": 50},
        ]

    print(f"\n{'='*80}")
    print(f"COMPREHENSIVE EVALUATION ON {num_questions} QUESTIONS")
    print(f"{'='*80}\n")

    test_questions = test_df.groupby('id').first().reset_index().head(num_questions)

    # Build the ground-truth lookup once instead of filtering the whole frame
    # for every question.
    supporting_rows = test_df[test_df['is_supporting'] == 1]
    ground_truth_by_id = {
        qid: set(sentences.tolist())
        for qid, sentences in supporting_rows.groupby('id')['context_sentence']
    }

    results = {config['name']: {
        'recall@5': [], 'recall@10': [], 'recall@12': [],
        'precision@5': [], 'precision@10': [], 'precision@12': [],
        'mrr': [], 'map': [], 'ndcg@10': []
    } for config in configs}

    for idx, row in tqdm(test_questions.iterrows(), total=len(test_questions), desc="Evaluating"):
        question = row['question']
        question_id = row['id']

        ground_truth = ground_truth_by_id.get(question_id, set())
        if not ground_truth:
            continue

        for config in configs:
            try:
                retrieve_kwargs = {key: value for key, value in config.items() if key != 'name'}
                retrieved = retriever.retrieve(question, **retrieve_kwargs)
                retrieved_sentences = [sent for sent, _, _ in retrieved]

                # Collect everything locally first so that a failure part-way
                # through cannot leave the per-metric lists with unequal lengths.
                question_metrics = {}

                for cutoff in [5, 10, 12]:
                    topk_sentences = retrieved_sentences[:cutoff]
                    hits = sum(sent in ground_truth for sent in topk_sentences)
                    question_metrics[f'recall@{cutoff}'] = hits / len(ground_truth)
                    # Precision divides by the cutoff, not by the number returned.
                    question_metrics[f'precision@{cutoff}'] = hits / cutoff

                # Reciprocal rank of the first relevant sentence (0 if none).
                mrr = 0
                for rank, sent in enumerate(retrieved_sentences, 1):
                    if sent in ground_truth:
                        mrr = 1.0 / rank
                        break
                question_metrics['mrr'] = mrr

                # Average precision over the full returned list, normalised by
                # the number of ground-truth sentences.
                ap = 0
                hits = 0
                for rank, sent in enumerate(retrieved_sentences, 1):
                    if sent in ground_truth:
                        hits += 1
                        ap += hits / rank
                question_metrics['map'] = ap / len(ground_truth)

                # Binary-relevance NDCG over the first 10 results.
                dcg = 0
                for rank, sent in enumerate(retrieved_sentences[:10], 1):
                    if sent in ground_truth:
                        dcg += 1.0 / np.log2(rank + 1)
                idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(ground_truth), 10)))
                question_metrics['ndcg@10'] = dcg / idcg if idcg > 0 else 0

            except Exception as e:
                print(f"Error with config {config['name']}: {e}")
                continue

            for metric_name, value in question_metrics.items():
                results[config['name']][metric_name].append(value)

    print("EVALUATION RESULTS")

    summary = {}
    for config_name in results:
        print(f"{config_name}:")
        metrics_summary = {}
        for metric in ['recall@5', 'recall@10', 'recall@12', 'precision@12', 'mrr', 'map', 'ndcg@10']:
            if results[config_name][metric]:
                avg = np.mean(results[config_name][metric])
                metrics_summary[metric] = avg
                if 'recall' in metric or 'precision' in metric or 'ndcg' in metric or 'map' in metric:
                    print(f"  {metric:15s}: {avg:.2%}")
                else:
                    print(f"  {metric:15s}: {avg:.4f}")
        print()
        summary[config_name] = metrics_summary

    return results, summary

In [None]:
# Cell 32: Path Configuration
MODEL_PATH = "/content/drive/MyDrive/Interview/minilm-hotpotqa-epoch15-score0.5296"
DATASET_PATH = "/content/drive/MyDrive/Interview/HotpotQA_Dataset_20251128"

# Fail fast, model first, if either location is missing.
_required_paths = (
    (MODEL_PATH, f"Model path not found: {MODEL_PATH}"),
    (DATASET_PATH, f"Dataset path not found: {DATASET_PATH}"),
)
for _required_path, _missing_message in _required_paths:
    if not os.path.exists(_required_path):
        raise FileNotFoundError(_missing_message)

print("ADVANCED HYBRID RETRIEVAL SYSTEM")
print(f"Model: {MODEL_PATH}")
print(f"Dataset: {DATASET_PATH}")

# Cell 33: Initialize Retriever
# The FAISS index and the pickled corpus are both read from DATASET_PATH.
_index_file = f"{DATASET_PATH}/faiss_index.bin"
_corpus_file = f"{DATASET_PATH}/corpus.pkl"
retriever = AdvancedHybridRetriever(
    model_path=MODEL_PATH,
    index_path=_index_file,
    corpus_path=_corpus_file,
    use_bm25_plus=True,
)

In [None]:
# Cell 34: Load Test Data
_validation_file = f"{DATASET_PATH}/validation_sentences.parquet"
test_df = pd.read_parquet(_validation_file)
print(f" Loaded {len(test_df):,} validation sentences")

# Cell 35: Run Evaluation
# `results` holds per-question metric lists, `summary` the per-config means.
_evaluation_kwargs = {
    "retriever": retriever,
    "test_df": test_df,
    "num_questions": 500,
}
results, summary = evaluate_retrieval_comprehensive(**_evaluation_kwargs)

# Cell 36: Save Results
# Only the averaged summary is persisted, next to the dataset.
print("\nSaving evaluation results...")
output_path = f"{DATASET_PATH}/advanced_evaluation_results.json"
with open(output_path, 'w') as results_file:
    json.dump(summary, results_file, indent=2)
print(f" Results saved to {output_path}")

In [None]:
# Cell 37: Best Configuration
print("BEST CONFIGURATION RECOMMENDATION")
# Rank configurations by mean Recall@12; a config without that metric counts as 0.
best_config = max(summary.items(), key=lambda item: item[1].get('recall@12', 0))
_best_name, _best_metrics = best_config
print(f"\nBest method: {_best_name}")
print(f"Recall@12: {_best_metrics['recall@12']:.2%}")
print(f"MRR: {_best_metrics['mrr']:.4f}")

In [None]:
# Cell 37: Graphs for Best Configuration and Performance Overview
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Styling and colour palette shared by every figure in this cell.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']

# Hard-coded metric values used by all plots and tables below; the axes that
# display them are labelled in percent.
# NOTE(review): these look like numbers copied from an earlier evaluation run —
# confirm they still match the `summary` computed above.
# This assignment rebinds `results`, replacing the per-question metric lists
# returned by evaluate_retrieval_comprehensive earlier in the notebook.
results = {
    "Dense": {
        "recall@5": 50.68, "recall@10": 58.49, "recall@12": 60.90,
        "precision@12": 8.29, "mrr": 51.15, "map": 40.90, "ndcg@10": 47.71
    },
    "BM25": {
        "recall@5": 52.33, "recall@10": 60.27, "recall@12": 61.33,
        "precision@12": 8.23, "mrr": 57.62, "map": 43.38, "ndcg@10": 51.01
    },
    "Hybrid (α=0.7)": {
        "recall@5": 56.86, "recall@10": 67.68, "recall@12": 71.04,
        "precision@12": 9.57, "mrr": 57.38, "map": 46.55, "ndcg@10": 54.31
    },
    "RRF": {
        "recall@5": 61.45, "recall@10": 70.38, "recall@12": 72.36,
        "precision@12": 9.79, "mrr": 62.90, "map": 50.78, "ndcg@10": 58.61
    },
    "RRF + Rerank": {
        "recall@5": 73.10, "recall@10": 78.15, "recall@12": 78.54,
        "precision@12": 10.72, "mrr": 78.74, "map": 66.90, "ndcg@10": 72.69
    }
}

# Method names in insertion order; this order fixes the bar/row order of every plot.
methods = list(results.keys())

# Main comprehensive figure: one 3x3 grid shared by all panels.
fig = plt.figure(figsize=(20, 12))
gs = fig.add_gridspec(3, 3, hspace=0.3, wspace=0.3)

# 1. Recall at different k — three side-by-side bars per method.
ax1 = fig.add_subplot(gs[0, :2])
x = np.arange(len(methods))
width = 0.25

recall_series = (
    (x - width, "recall@5", 'Recall@5', colors[0]),
    (x, "recall@10", 'Recall@10', colors[1]),
    (x + width, "recall@12", 'Recall@12', colors[2]),
)
for bar_positions, metric_key, legend_label, bar_color in recall_series:
    ax1.bar(bar_positions, [results[m][metric_key] for m in methods], width,
            label=legend_label, color=bar_color, alpha=0.8)

ax1.set_title('Recall Comparison at Different k Values', fontsize=14, fontweight='bold', pad=20)
ax1.set_xlabel('Method', fontsize=12, fontweight='bold')
ax1.set_ylabel('Recall (%)', fontsize=12, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(methods, rotation=15, ha='right')
ax1.set_ylim(0, 85)
ax1.grid(axis='y', alpha=0.3)
ax1.legend(loc='upper left')

# Write each bar's value just above it.
for patch in ax1.patches:
    bar_height = patch.get_height()
    if bar_height <= 0:
        continue
    ax1.text(patch.get_x() + patch.get_width()/2, bar_height + 1,
             f'{bar_height:.1f}', ha='center', va='bottom', fontsize=9)

# 2. Best method summary — horizontal bars for the headline metrics.
ax2 = fig.add_subplot(gs[0, 2])
best = "RRF + Rerank"
metrics_best = ['Recall@12', 'MRR', 'MAP', 'NDCG@10']
values_best = [results[best][metric_key]
               for metric_key in ('recall@12', 'mrr', 'map', 'ndcg@10')]

bars = ax2.barh(metrics_best, values_best, color=colors[4], alpha=0.8)
ax2.set_title(f'Best Method Performance\n({best})', fontsize=12, fontweight='bold', pad=15)
ax2.set_xlabel('Score (%)')
ax2.set_xlim(0, 85)
ax2.grid(axis='x', alpha=0.3)

for rect, score in zip(bars, values_best):
    ax2.text(score + 1, rect.get_y() + rect.get_height()/2, f'{score:.1f}',
             va='center', fontsize=11, fontweight='bold')

# 3–5. Individual metric bars
def bar_plot(ax, values, title, ylabel, ylim):
    """Draw one value-labelled bar per retrieval method on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        values: One number per entry of the module-level ``methods`` list,
            in the same order.
        title: Axes title.
        ylabel: Y-axis label.
        ylim: Upper limit of the y-axis (the lower limit is 0).

    Reads the module-level ``methods`` (tick labels) and ``colors`` (palette).
    """
    positions = np.arange(len(methods))
    # One palette colour per method, cycling if there are more methods than
    # colours. The previous lookup searched `methods` for a slice of the axes
    # title, which is still empty at this point, and raised ValueError.
    bar_colors = [colors[i % len(colors)] for i in range(len(methods))]
    bars = ax.bar(positions, values, color=bar_colors,
                  alpha=0.8, edgecolor='black', linewidth=1.2)
    ax.set_ylabel(ylabel)
    ax.set_title(title, fontsize=13, fontweight='bold', pad=15)
    # Fix the tick positions before assigning labels so labels and bars align.
    ax.set_xticks(positions)
    ax.set_xticklabels(methods, rotation=15, ha='right')
    ax.grid(axis='y', alpha=0.3)
    ax.set_ylim(0, ylim)
    for bar in bars:
        h = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2, h + 1, f'{h:.1f}',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

# Middle row: one bar panel per ranking metric.
ax3 = fig.add_subplot(gs[1, 0])
bar_plot(ax3, [results[m]["mrr"] for m in methods], "Mean Reciprocal Rank (MRR)", "MRR (%)", 85)

ax4 = fig.add_subplot(gs[1, 1])
bar_plot(ax4, [results[m]["map"] for m in methods], "Mean Average Precision (MAP)", "MAP (%)", 75)

ax5 = fig.add_subplot(gs[1, 2])
bar_plot(ax5, [results[m]["ndcg@10"] for m in methods], "NDCG@10", "NDCG@10 (%)", 80)

# 6. Heatmap: rows are methods, columns are metrics.
ax6 = fig.add_subplot(gs[2, :2])
metrics_list = ['Recall@5', 'Recall@10', 'Recall@12', 'Precision@12', 'MRR', 'MAP', 'NDCG@10']
# Dictionary keys in the same column order as the display labels above.
metric_keys = ['recall@5', 'recall@10', 'recall@12', 'precision@12', 'mrr', 'map', 'ndcg@10']
data = np.array([[results[m][key] for key in metric_keys] for m in methods])

im = ax6.imshow(data, cmap='RdYlGn', aspect='auto', vmin=0, vmax=80)
ax6.set_xticks(np.arange(len(metrics_list)))
ax6.set_yticks(np.arange(len(methods)))
ax6.set_xticklabels(metrics_list, fontsize=11)
ax6.set_yticklabels(methods, fontsize=11)
ax6.set_title('Comprehensive Metrics Heatmap', fontsize=14, fontweight='bold', pad=15)

# Print every cell's value on top of its colour.
for i in range(len(methods)):
    for j in range(len(metrics_list)):
        ax6.text(j, i, f'{data[i,j]:.1f}', ha='center', va='center',
                 color='black', fontsize=10, fontweight='bold')

plt.colorbar(im, ax=ax6, label='Score (%)')

# 7. Relative Recall@12 improvement of every other method over Dense.
ax7 = fig.add_subplot(gs[2, 2])
baseline_recall = results['Dense']['recall@12']
improvements = [(results[m]['recall@12'] - baseline_recall) / baseline_recall * 100
                for m in methods[1:]]

bars = ax7.barh(methods[1:], improvements, color=colors[1:], alpha=0.8, edgecolor='black', linewidth=1.2)
ax7.set_xlabel('Improvement over Dense Baseline (%)')
ax7.set_title('Recall@12 Improvement\nover Dense', fontsize=12, fontweight='bold', pad=15)
ax7.grid(axis='x', alpha=0.3)
ax7.axvline(0, color='black', linewidth=1)

# Place each label just beyond the end of its bar, on the side the bar points to.
for bar, val in zip(bars, improvements):
    ax7.text(val + (1 if val > 0 else -1), bar.get_y() + bar.get_height()/2,
             f'{val:+.1f}%', va='center', fontsize=10, fontweight='bold')

# The original line here was not valid Python and stopped the cell from parsing.
fig.suptitle('HotpotQA Retrieval Performance Analysis', fontsize=18, fontweight='bold', y=0.98)
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Interview/retrieval_performance_analysis.png',
            dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

# Progressive improvement chart: Recall@12 after each pipeline stage.
fig2, ax = plt.subplots(figsize=(12, 7))
stages = ['Baseline\n(Dense)', 'Add BM25\n(Sparse)', 'Hybrid\nFusion', 'RRF\nFusion', 'Add Cross-\nEncoder']
stage_methods = ['Dense', 'BM25', 'Hybrid (α=0.7)', 'RRF', 'RRF + Rerank']
recall_progress = [results[m]['recall@12'] for m in stage_methods]

ax.plot(stages, recall_progress, marker='o', linewidth=3, markersize=12, color=colors[2])
ax.fill_between(range(len(stages)), recall_progress, alpha=0.3, color=colors[2])

# Label each point with its value and each segment with its absolute gain.
previous_recall = None
for position, current_recall in enumerate(recall_progress):
    ax.text(position, current_recall + 1.5, f'{current_recall:.1f}%',
            ha='center', fontsize=12, fontweight='bold')
    if previous_recall is not None:
        gain = current_recall - previous_recall
        ax.annotate(f'+{gain:.1f}%',
                    xy=(position - 0.5, (current_recall + previous_recall) / 2),
                    ha='center', fontsize=10, color='green', fontweight='bold')
    previous_recall = current_recall

ax.set_title('Progressive Improvement from Baseline to Final System', fontsize=16, fontweight='bold', pad=20)
ax.set_ylabel('Recall@12 (%)', fontsize=14, fontweight='bold')
ax.set_ylim(50, 85)
ax.grid(axis='y', alpha=0.3)

# Dashed reference levels, drawn in the order they appear in the legend.
reference_levels = (
    (65, 'red', 'DPR Baseline (~65%)'),
    (80, 'orange', 'ColBERT (~80%)'),
    (85, 'green', 'SOTA (~85%)'),
)
for level, line_color, line_label in reference_levels:
    ax.axhline(level, color=line_color, linestyle='--', linewidth=2, alpha=0.7, label=line_label)
ax.legend(loc='lower right')

plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Interview/progressive_improvement.png',
            dpi=300, bbox_inches='tight', facecolor='white')
plt.show()

# Summary table: text version of the figures above.
separator = "=" * 80
print(f"\n{separator}")
print("SUMMARY STATISTICS")
print(separator)

# One row per method, one column per metric.
df = pd.DataFrame(results).T.round(2)
print("\nComplete Results Table:")
print(df.to_string())

print("\nImprovements over Dense Baseline (Recall@12):")
baseline = results['Dense']['recall@12']
for method in methods[1:]:
    imp = results[method]['recall@12'] - baseline
    rel = imp / baseline * 100
    print(f"{method:20s} +{imp:5.2f}%  ({rel:+5.1f}% relative)")

print("\nBest Scores:")
headline_metrics = (('Recall@12', 'recall@12'), ('MRR', 'mrr'), ('MAP', 'map'), ('NDCG@10', 'ndcg@10'))
for metric, key in headline_metrics:
    # The first method with the highest value wins ties.
    best_method = max(methods, key=lambda m: results[m][key])
    best_val = results[best_method][key]
    print(f"  {metric:12s}: {best_val:5.2f}%  ({best_method})")

print("\nAll visualizations saved to /content/drive/MyDrive/Interview/")

![Chart](/home/too_geeky/interview/chart.png)
 

![Improvement](/home/too_geeky/interview/imporvement.png)
 

In [None]:
# Cell 38: Imports and Setup
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, CrossEncoder
from tqdm.auto import tqdm
import faiss
import pickle
from typing import List, Tuple, Dict
import json
from collections import defaultdict, Counter
import re
import string
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from rank_bm25 import BM25Plus
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Cell 39: Configuration
# Directory holding validation_sentences.parquet (read when the dataset is loaded below).
DATASET_INPUT_PATH = "/kaggle/input/hotpotqa/HotpotQA_Dataset_20251128"
# Sentence-embedding model directory passed to HybridRetriever as model_path.
MODEL_INPUT_PATH = "/kaggle/input/minilm/pytorch/default/1/minilm-hotpotqa-epoch15-score0.5296"
# Directory the later cells read faiss_index.bin / corpus.pkl from and write result files to.
WORKING_PATH = "/kaggle/working"
# Number of distinct questions passed to evaluate_system as num_questions.
NUM_EVAL_QUESTIONS = 100

In [None]:
# Cell 40: Hybrid Retriever Class
class HybridRetriever:
    """Dense + BM25 retriever fused by reciprocal rank fusion, with optional reranking.

    Dense search uses a SentenceTransformer encoder and a FAISS index, sparse
    search uses BM25Plus over whitespace tokens, and the two rankings are
    merged with reciprocal rank fusion (RRF). When a cross-encoder could be
    loaded, the fused candidates are rescored with it.

    All retrieval methods return ``(sentence, score, corpus_index)`` tuples.
    """

    def __init__(self, model_path: str, index_path: str, corpus_path: str):
        """Load the encoder, FAISS index, corpus, BM25 index and cross-encoder.

        Args:
            model_path: SentenceTransformer model directory or name.
            index_path: FAISS index file readable by ``faiss.read_index``.
            corpus_path: Pickle file containing a dict with a ``'corpus'`` entry
                (the list of sentences the index positions refer to).
        """
        self.model = SentenceTransformer(model_path)
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = self.model.to(self.device)
        if self.device == 'cuda':
            self.model.half()
        self.model.eval()

        self.dense_index = faiss.read_index(index_path)
        if torch.cuda.is_available():
            # Best effort: keep the CPU index if the GPU transfer is not possible
            # (for example with a CPU-only faiss build), but say so.
            try:
                res = faiss.StandardGpuResources()
                self.dense_index = faiss.index_cpu_to_gpu(res, 0, self.dense_index)
            except Exception as exc:
                print(f" FAISS GPU transfer failed, using CPU index: {exc}")

        # NOTE(security): pickle.load can execute arbitrary code; only load
        # corpus files that were produced by this project.
        with open(corpus_path, 'rb') as f:
            data = pickle.load(f)
            self.corpus = data['corpus']

        tokenized_corpus = [doc.lower().split() for doc in tqdm(self.corpus, desc="Building BM25", leave=False)]
        self.bm25 = BM25Plus(tokenized_corpus)

        # Reranking is optional: without the cross-encoder the fused ranking is used as is.
        try:
            self.cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2',
                                             device=self.device, max_length=512)
        except Exception as exc:
            print(f" Cross-encoder unavailable, reranking disabled: {exc}")
            self.cross_encoder = None

        print(f" Retriever ready on {self.device}")

    def retrieve(self, query: str, top_k: int = 2, min_score: float = 0.5) -> List[Tuple[str, float, int]]:
        """Return up to ``top_k`` results whose score exceeds ``min_score``.

        Candidates come from RRF fusion (``top_k * 5`` of them) and are
        reranked by the cross-encoder when one is loaded. If nothing passes
        the threshold, the single best candidate is returned instead, so the
        result is empty only when there are no candidates at all.

        NOTE(review): ``min_score`` is compared against cross-encoder scores
        when reranking is active, but against RRF scores otherwise. With
        ``rrf_k=60`` an RRF score cannot exceed 2/61, so without a
        cross-encoder the default threshold rejects every candidate and only
        the top one is returned — confirm this is intended.
        """
        candidates = self._rrf_retrieval(query, top_k * 5, dense_k=100, sparse_k=100, rrf_k=60)

        if self.cross_encoder and len(candidates) > 0:
            candidates = self._rerank(query, candidates[:50], top_k * 3)

        filtered = [(sent, score, idx) for sent, score, idx in candidates if score > min_score]

        if filtered:
            return filtered[:top_k]
        return candidates[:1]

    def _dense_retrieval(self, query: str, top_k: int) -> List[Tuple[str, float, int]]:
        """Search the FAISS index with the L2-normalised query embedding."""
        with torch.no_grad():
            query_emb = self.model.encode(query, convert_to_tensor=True, show_progress_bar=False)
            if self.device == 'cuda':
                query_emb = query_emb.cpu()
            query_emb_np = query_emb.numpy().astype('float32').reshape(1, -1)
        faiss.normalize_L2(query_emb_np)
        scores, indices = self.dense_index.search(query_emb_np, min(top_k, self.dense_index.ntotal))
        # FAISS pads missing neighbours with label -1; indexing the corpus with
        # it would silently return the last sentence, so drop those entries.
        return [
            (self.corpus[idx], float(score), int(idx))
            for idx, score in zip(indices[0], scores[0])
            if idx >= 0
        ]

    def _bm25_retrieval(self, query: str, top_k: int) -> List[Tuple[str, float, int]]:
        """Return the ``top_k`` corpus entries with the highest BM25 score."""
        tokens = query.lower().split()
        scores = self.bm25.get_scores(tokens)
        k = min(top_k, len(scores))
        if k <= 0:
            return []
        # Partition so the k best scores come first, then order them best-first.
        top_idx = np.argpartition(-scores, np.arange(k))[:k]
        top_idx = top_idx[np.argsort(-scores[top_idx])]
        return [(self.corpus[idx], float(scores[idx]), int(idx)) for idx in top_idx]

    def _rrf_retrieval(self, query: str, top_k: int, dense_k: int,
                      sparse_k: int, rrf_k: int) -> List[Tuple[str, float, int]]:
        """Fuse the dense and BM25 rankings with reciprocal rank fusion.

        Each list contributes ``1 / (rrf_k + rank)`` (rank starting at 1) to
        the score of every corpus index it contains; the ``top_k`` highest
        totals are returned.
        """
        dense_results = self._dense_retrieval(query, dense_k)
        bm25_results = self._bm25_retrieval(query, sparse_k)

        rrf_scores = defaultdict(float)
        for ranking in (dense_results, bm25_results):
            for rank, (_, _, idx) in enumerate(ranking, 1):
                rrf_scores[idx] += 1.0 / (rrf_k + rank)

        sorted_results = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
        return [(self.corpus[idx], float(score), int(idx)) for idx, score in sorted_results]

    def _rerank(self, query: str, candidates: List[Tuple[str, float, int]],
               top_k: int) -> List[Tuple[str, float, int]]:
        """Rescore ``candidates`` with the cross-encoder and keep the best ``top_k``.

        The returned scores are the cross-encoder outputs, replacing the
        incoming scores. Without a cross-encoder the input order is kept.
        """
        if not self.cross_encoder:
            return candidates[:top_k]

        sentences = [sent for sent, _, _ in candidates]
        indices = [idx for _, _, idx in candidates]
        pairs = [[query, sent] for sent in sentences]

        scores = self.cross_encoder.predict(pairs, show_progress_bar=False, batch_size=32)
        reranked = [(sentences[i], float(scores[i]), indices[i]) for i in range(len(sentences))]
        reranked.sort(key=lambda x: x[1], reverse=True)
        return reranked[:top_k]

In [None]:
# Cell 41: QA System Class
class QwenQASystem:
    """Short-answer question answering with a causal language model.

    Wraps a Hugging Face causal LM and tokenizer and produces one-line answers
    either from the question alone or from the question plus retrieved context.
    """

    def __init__(self, model_name: str = "Qwen/Qwen2.5-7B-Instruct",
                 lora_path: str = None, load_in_8bit: bool = True):
        """Load the tokenizer and model, optionally 8-bit quantised and LoRA-merged.

        Args:
            model_name: Hugging Face model id or local path.
            lora_path: Optional PEFT adapter to load and merge into the model.
            load_in_8bit: Load with bitsandbytes 8-bit quantisation when True,
                otherwise in float16.
        """
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        if load_in_8bit:
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0,
                llm_int8_enable_fp32_cpu_offload=True
            )
        else:
            quantization_config = None

        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name, trust_remote_code=True, padding_side='left'
        )
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            quantization_config=quantization_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16 if not load_in_8bit else None,
            low_cpu_mem_usage=True
        )

        if lora_path:
            from peft import PeftModel
            self.model = PeftModel.from_pretrained(self.model, lora_path)
            self.model = self.model.merge_and_unload()

        self.model.eval()
        print(f"✓ Model loaded on {self.device}")

    def generate_answer(self, question: str, context: str = None) -> str:
        """Generate a one-line answer, using ``context`` when it is non-empty.

        Args:
            question: The question to answer.
            context: Retrieved text to answer from. ``None`` or an empty string
                selects the context-free prompt.

        Returns:
            The first line of the model's completion, stripped. A leading
            ``"Answer:"`` echoed by the model is removed.
        """
        if context:
            prompt = f"""Extract ONLY the answer from the context. Give the shortest possible answer - just the name, number, or phrase needed. Do not explain.

Context: {context}

Question: {question}

Answer (2-5 words max):"""
        else:
            prompt = f"""Question: {question}

Answer (2-5 words max):"""

        inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=30,
                temperature=1.0,
                do_sample=False,
                num_beams=1,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id
            )

        # The generated sequence starts with the prompt tokens. Decode only the
        # new tokens: searching the decoded prompt+completion for "Answer:" (or
        # slicing it by len(prompt)) returns prompt text whenever the context or
        # question itself contains "Answer:" or does not round-trip exactly.
        prompt_length = inputs["input_ids"].shape[1]
        completion = self.tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        answer = completion.strip()
        if "Answer:" in answer:
            answer = answer.split("Answer:")[-1].strip()

        answer = answer.split('\n')[0].strip()

        return answer

    def answer_with_strategy(self, question: str, context: str, retrieval_score: float,
                             min_confidence: float = 0.6) -> Tuple[str, str]:
        """Choose between the context-free and the context-based answer.

        The context-free answer is returned when the retrieval score is below
        ``min_confidence``, when the context-based answer is more than 2.5
        times as long as the context-free one, or when the context-based
        answer contains a phrase signalling that the context had no answer.

        Returns:
            ``(answer, strategy)`` where strategy is one of
            ``"no_rag_low_confidence"``, ``"no_rag_verbose"``,
            ``"no_rag_uncertain"`` or ``"rag"``.
        """
        answer_no_rag = self.generate_answer(question, context=None)

        if retrieval_score < min_confidence:
            return answer_no_rag, "no_rag_low_confidence"

        answer_with_rag = self.generate_answer(question, context=context)

        if len(answer_with_rag) > len(answer_no_rag) * 2.5:
            return answer_no_rag, "no_rag_verbose"

        low_confidence_phrases = ["no answer", "not mentioned", "not provided",
                                 "cannot answer", "no information"]
        if any(phrase in answer_with_rag.lower() for phrase in low_confidence_phrases):
            return answer_no_rag, "no_rag_uncertain"

        return answer_with_rag, "rag"

In [None]:
# Cell 42: Evaluation Metrics
def normalize_answer(s: str) -> str:
    """Normalise an answer string for comparison.

    Applies, in order: lowercasing, removal of ASCII punctuation, replacement
    of the standalone articles ``a``/``an``/``the`` with a space, and collapsing
    of all whitespace runs to single spaces (with no leading/trailing space).
    """
    punctuation = set(string.punctuation)
    text = s.lower()
    text = ''.join(ch for ch in text if ch not in punctuation)
    text = re.sub(r'\b(a|an|the)\b', ' ', text)
    return ' '.join(text.split())

def compute_exact_match(prediction: str, ground_truth: str) -> float:
    """Return 1.0 when both strings are equal after ``normalize_answer``, else 0.0."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return float(normalized_prediction == normalized_truth)

def compute_f1(prediction: str, ground_truth: str) -> float:
    """Token-level F1 between the normalised prediction and ground truth.

    Both strings are passed through ``normalize_answer`` and split on
    whitespace. If either side has no tokens, the score is 1.0 when both are
    empty and 0.0 otherwise. Every path returns a ``float``, as annotated.
    """
    pred_tokens = normalize_answer(prediction).split()
    truth_tokens = normalize_answer(ground_truth).split()

    if not pred_tokens or not truth_tokens:
        return float(pred_tokens == truth_tokens)

    # Multiset intersection: a token counts as many times as it occurs on both sides.
    common = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(common.values())

    if num_same == 0:
        return 0.0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    return (2 * precision * recall) / (precision + recall)

In [None]:
# Cell 43: Evaluation Function
def evaluate_system(retriever, qa_system, test_df, num_questions=100):
    """Compare no-RAG, always-RAG and hybrid answering on the first questions.

    Args:
        retriever: Object with ``retrieve(question, top_k=..., min_score=...)``
            returning ``(sentence, score, index)`` tuples.
        qa_system: Object with ``generate_answer(question, context=...)`` and
            ``answer_with_strategy(question, context, retrieval_score)``.
        test_df: Frame with at least the columns ``id``, ``question`` and
            ``answer``; the first row of each ``id`` is used.
        num_questions: Number of distinct questions to evaluate.

    Returns:
        Dict with ``metrics`` (EM, F1 and average answer length per method),
        ``question_details`` (one record per question) and
        ``strategy_counts`` (how often each hybrid strategy was chosen).

    A failing generation is reported and scored as an empty answer with EM and
    F1 of 0, so a single bad question does not stop the run. Errors from
    ``retriever.retrieve`` are not caught.
    """
    print(f"\nEvaluating {num_questions} questions...")
    print("Fixes applied:")
    print("   Reduced context: top_k = 2")
    print("   Confidence filtering: min_score = 0.5")
    print("   Fixed prompts: max_tokens = 30, temp = 0.0")
    print("   Hybrid decision strategy\n")

    unique_questions = test_df.groupby('id').first().reset_index().head(num_questions)

    results = {
        'original_no_rag': {'predictions': [], 'ground_truths': []},
        'fixed_with_rag': {'predictions': [], 'ground_truths': []},
        'smart_hybrid': {'predictions': [], 'ground_truths': []}
    }

    question_details = []
    strategy_counts = defaultdict(int)

    for idx, row in tqdm(unique_questions.iterrows(), total=len(unique_questions)):
        question = row['question']
        ground_truth = row['answer']

        # 1) Baseline: answer from the question alone.
        try:
            answer_no_rag = qa_system.generate_answer(question, context=None)
            em_no_rag = compute_exact_match(answer_no_rag, ground_truth)
            f1_no_rag = compute_f1(answer_no_rag, ground_truth)
        except Exception as exc:
            # Catch Exception only, so Ctrl-C still interrupts the run.
            print(f"Error in no-RAG answer: {exc}")
            answer_no_rag = ""
            em_no_rag = f1_no_rag = 0

        results['original_no_rag']['predictions'].append(answer_no_rag)
        results['original_no_rag']['ground_truths'].append(ground_truth)

        retrieved = retriever.retrieve(question, top_k=2, min_score=0.5)

        if not retrieved:
            context = ""
            avg_score = 0.0
        else:
            # Number the passages so the prompt shows them as separate sources.
            context = "\n\n".join([f"[{i+1}] {sent}" for i, (sent, _, _) in enumerate(retrieved)])
            avg_score = np.mean([score for _, score, _ in retrieved])

        # 2) Always-RAG: answer from the retrieved context.
        try:
            answer_rag = qa_system.generate_answer(question, context=context)
            em_rag = compute_exact_match(answer_rag, ground_truth)
            f1_rag = compute_f1(answer_rag, ground_truth)
        except Exception as exc:
            print(f"Error in RAG answer: {exc}")
            answer_rag = ""
            em_rag = f1_rag = 0

        results['fixed_with_rag']['predictions'].append(answer_rag)
        results['fixed_with_rag']['ground_truths'].append(ground_truth)

        # 3) Hybrid: let the QA system pick between the two answers.
        try:
            answer_hybrid, strategy = qa_system.answer_with_strategy(question, context, avg_score)
            strategy_counts[strategy] += 1
            em_hybrid = compute_exact_match(answer_hybrid, ground_truth)
            f1_hybrid = compute_f1(answer_hybrid, ground_truth)
        except Exception as exc:
            print(f"Error in hybrid answer: {exc}")
            answer_hybrid = ""
            strategy = "error"
            em_hybrid = f1_hybrid = 0

        results['smart_hybrid']['predictions'].append(answer_hybrid)
        results['smart_hybrid']['ground_truths'].append(ground_truth)

        question_details.append({
            'question': question,
            'ground_truth': ground_truth,
            'answer_no_rag': answer_no_rag,
            'answer_fixed_rag': answer_rag,
            'answer_smart': answer_hybrid,
            'strategy': strategy,
            'retrieval_score': avg_score,
            'num_contexts': len(retrieved),
            'em_no_rag': em_no_rag,
            'f1_no_rag': f1_no_rag,
            'em_fixed_rag': em_rag,
            'f1_fixed_rag': f1_rag,
            'em_smart': em_hybrid,
            'f1_smart': f1_hybrid
        })

    # Aggregate metrics are recomputed from the stored predictions.
    metrics = {}
    for method_name in ['original_no_rag', 'fixed_with_rag', 'smart_hybrid']:
        preds = results[method_name]['predictions']
        truths = results[method_name]['ground_truths']

        em_scores = [compute_exact_match(p, t) for p, t in zip(preds, truths)]
        f1_scores = [compute_f1(p, t) for p, t in zip(preds, truths)]

        metrics[method_name] = {
            'em': np.mean(em_scores),
            'f1': np.mean(f1_scores),
            'avg_length': np.mean([len(p) for p in preds])
        }

    print("\nRESULTS COMPARISON")

    print(f"{'Method':<25} {'EM':>10} {'F1':>10} {'Avg Length':>12}")


    for method_name, method_label in [
        ('original_no_rag', 'Original (No RAG)'),
        ('fixed_with_rag', 'Fixed (Always RAG)'),
        ('smart_hybrid', 'Smart Hybrid')
    ]:
        m = metrics[method_name]
        print(f"{method_label:<25} {m['em']:>9.1%} {m['f1']:>9.1%} {m['avg_length']:>11.1f}")

    print("\nIMPROVEMENTS vs ORIGINAL NO-RAG")

    baseline_em = metrics['original_no_rag']['em']
    baseline_f1 = metrics['original_no_rag']['f1']

    for method_name, method_label in [('fixed_with_rag', 'Fixed RAG'), ('smart_hybrid', 'Smart Hybrid')]:
        # The small epsilon avoids division by zero when the baseline scores 0.
        em_improvement = (metrics[method_name]['em'] - baseline_em) / (baseline_em + 1e-10) * 100
        f1_improvement = (metrics[method_name]['f1'] - baseline_f1) / (baseline_f1 + 1e-10) * 100

        print(f"\n{method_label}:")
        print(f"  EM: {em_improvement:+.1f}%")
        print(f"  F1: {f1_improvement:+.1f}%")

    print("\nSMART HYBRID STRATEGY BREAKDOWN")
    for strategy, count in sorted(strategy_counts.items(), key=lambda x: -x[1]):
        print(f"  {strategy}: {count} ({count/len(question_details)*100:.1f}%)")

    return {
        'metrics': metrics,
        'question_details': question_details,
        'strategy_counts': dict(strategy_counts)
    }

In [None]:
# Cell 44: Load Dataset
print("Loading dataset...")
_validation_file = f"{DATASET_INPUT_PATH}/validation_sentences.parquet"
val_df = pd.read_parquet(_validation_file)
_row_count = len(val_df)
_question_count = val_df['id'].nunique()
print(f" Loaded: {_row_count:,} rows, {_question_count:,} questions")

# Cell 45: Initialize Retriever
# NOTE(review): the index and corpus are read from WORKING_PATH, not from
# DATASET_INPUT_PATH — confirm they were generated there before this cell runs.
print("Initializing retriever...")
_retriever_settings = {
    "model_path": MODEL_INPUT_PATH,
    "index_path": f"{WORKING_PATH}/faiss_index.bin",
    "corpus_path": f"{WORKING_PATH}/corpus.pkl",
}
retriever = HybridRetriever(**_retriever_settings)

In [None]:
# Cell 46: Initialize QA System
print("Initializing QA system...")
# Base model only (no LoRA adapter), loaded with 8-bit quantisation.
_qa_settings = {
    "model_name": "Qwen/Qwen2.5-7B-Instruct",
    "lora_path": None,
    "load_in_8bit": True,
}
qa_system = QwenQASystem(**_qa_settings)

In [None]:
# Cell 47: Run Evaluation
print("Running evaluation...")
# Rebinds `results` to the dict returned by evaluate_system
# (metrics, question_details, strategy_counts).
_qa_evaluation_kwargs = {
    "retriever": retriever,
    "qa_system": qa_system,
    "test_df": val_df,
    "num_questions": NUM_EVAL_QUESTIONS,
}
results = evaluate_system(**_qa_evaluation_kwargs)

In [None]:
# Cell 48: Save Results
# Aggregate metrics and strategy counts go to JSON; per-question records to CSV.
output_path = f"{WORKING_PATH}/fixed_rag_results.json"
_summary_payload = {
    'metrics': results['metrics'],
    'strategy_counts': results['strategy_counts']
}
with open(output_path, 'w') as results_file:
    json.dump(_summary_payload, results_file, indent=2)
print(f"\n Results saved: {output_path}")

detailed_csv = f"{WORKING_PATH}/fixed_detailed_results.csv"
detailed_df = pd.DataFrame(results['question_details'])
detailed_df.to_csv(detailed_csv, index=False)
print(f" Detailed results: {detailed_csv}")

![RAG-Performance](/home/too_geeky/interview/rag_analysis_plots.png)