# Install Libraries

In [1]:
!pip install pymilvus "pymilvus[milvus_lite]" evaluate openai ragas langchain-openai

Collecting pymilvus
  Downloading pymilvus-2.6.2-py3-none-any.whl.metadata (6.5 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting ragas
  Downloading ragas-0.3.6-py3-none-any.whl.metadata (21 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.3.34-py3-none-any.whl.metadata (2.4 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.11.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (9.4 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus[milvus_lite])
  Downloading milvus_lite-2.5.1-py3-none-manylinux2014_x86_64.whl.metadata (10.0 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting instructor (from ragas)
  Downloading instructor-1.11.3-py3-none-any.whl.metadata (11 kB)
Collecting scikit-network (from ragas)
  Downloading scikit_

In [2]:
pip install --upgrade --force-reinstall torch torchvision torchaudio


Collecting torch
  Downloading torch-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting torchvision
  Downloading torchvision-0.23.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading torchaudio-2.8.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (7.2 kB)
Collecting filelock (from torch)
  Downloading filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.15.0-py3-none-any.whl.metadata (3.3 kB)
Collecting setuptools (from torch)
  Downloading setuptools-80.9.0-py3-none-any.whl.metadata (6.6 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.9.0-py3-none-

# Import Libraries

In [3]:
import pandas as pd
import numpy as np
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from pymilvus import MilvusClient, FieldSchema, CollectionSchema, DataType
import os
import json
import gc
from typing import List, Dict, Any, Tuple

from sklearn.metrics.pairwise import cosine_similarity
import evaluate
import glob

KeyboardInterrupt: 

# Naive RAG

### Configuration

In [None]:
class Config:
    # Model settings - MULTIPLE EMBEDDING MODELS
    EMBEDDING_CONFIGS = [
        {
            "name": "all-MiniLM-L6-v2",
            "dim": 384,
            "collection_suffix": "384d"
        },
        {
            "name": "all-mpnet-base-v2",  # 768 dimensions - more powerful
            "dim": 768,
            "collection_suffix": "768d"
        }
    ]

    LLM_MODEL = "google/flan-t5-base"
    MAX_PASSAGE_LENGTH = 2000

    # Database settings
    DB_NAME = "rag_wikipedia_multi_embed.db"
    COLLECTION_BASE_NAME = "rag_mini"
    MAX_K = 10
    TOP_K_VALUES = [1, 3, 5]
    MAX_CONTEXT_LENGTH = 400  # words

    # Generation settings
    MAX_NEW_TOKENS = 200
    BATCH_SIZE = 16

    # Output settings
    OUTPUT_DIR = "results"

### Data Loading and Preprocessing

In [None]:
def load_and_clean_passages(parquet_path: str) -> pd.DataFrame:
    """Load passages and perform EDA"""
    print("Loading passages...")
    passages = pd.read_parquet(parquet_path)
    passages = passages.dropna()

    # Add length statistics
    passages["char_len"] = passages["passage"].str.len()
    passages["word_len"] = passages["passage"].apply(lambda x: len(str(x).split()))

    print(f"\nPassages loaded: {len(passages)}")
    print(f"Character length - Mean: {passages['char_len'].mean():.0f}, "
          f"Min: {passages['char_len'].min()}, Max: {passages['char_len'].max()}")
    print(f"Word length - Mean: {passages['word_len'].mean():.0f}, "
          f"Min: {passages['word_len'].min()}, Max: {passages['word_len'].max()}")

    return passages

In [None]:
def load_and_clean_queries(parquet_path: str) -> pd.DataFrame:
    """Load and clean queries"""
    print("\nLoading queries...")
    queries = pd.read_parquet(parquet_path)
    queries = queries.dropna()
    print(f"Queries loaded: {len(queries)}")
    return queries

In [None]:
def generate_embeddings(texts: List[str], model_name: str,
                       batch_size: int = 32) -> np.ndarray:
    """Generate embeddings for texts"""
    print(f"\nGenerating embeddings using {model_name}...")
    model = SentenceTransformer(model_name)
    embeddings = model.encode(texts, batch_size=batch_size, show_progress_bar=True)
    print(f"Embedding shape: {embeddings.shape}")
    return embeddings

### Vector Database

In [None]:
def create_milvus_schema(embedding_dim: int) -> CollectionSchema:
    """Create Milvus collection schema with specified embedding dimension"""
    fields = [
        FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
        FieldSchema(name="passage", dtype=DataType.VARCHAR, max_length=Config.MAX_PASSAGE_LENGTH),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=embedding_dim)
    ]
    return CollectionSchema(
        fields=fields,
        description=f"RAG Mini Wikipedia passages with {embedding_dim}D embeddings"
    )

In [None]:
def setup_vector_db(passages: pd.DataFrame, embeddings: np.ndarray,
                   collection_name: str, embedding_dim: int) -> MilvusClient:
    """Setup Milvus database and insert data"""
    print(f"\nSetting up vector database: {collection_name}...")

    # Initialize client
    client = MilvusClient(Config.DB_NAME)

    # Create collection
    schema = create_milvus_schema(embedding_dim)

    try:
        client.drop_collection(collection_name)
        print(f"Dropped existing collection: {collection_name}")
    except:
        pass

    client.create_collection(
        collection_name=collection_name,
        schema=schema
    )
    print(f"Created collection: {collection_name}")

    # Prepare data
    passages = passages.reset_index().rename(columns={"index": "id"})
    rag_data = [
        {
            "id": int(row["id"]),
            "passage": row["passage"][:Config.MAX_PASSAGE_LENGTH],
            "embedding": embeddings[idx].tolist()
        }
        for idx, row in passages.iterrows()
    ]

    # Insert data
    res = client.insert(collection_name=collection_name, data=rag_data)
    print(f"Inserted {res['insert_count']} records")

    # Create index
    index_params = MilvusClient.prepare_index_params()
    index_params.add_index(
        field_name="embedding",
        index_type="IVF_FLAT",
        metric_type="IP",
        params={"nlist": 128}
    )
    client.create_index(collection_name=collection_name, index_params=index_params)
    print("Index created")

    # Load collection
    client.load_collection(collection_name)
    print("Collection loaded into memory")

    return client

### Prompting Strategies

In [None]:
PROMPTING_STRATEGIES = [
    {
        "name": "instruction",
        "template": "You are a helpful assistant. Answer the question based on the context.\nContext: {context}\nQuestion: {question}\nAnswer:"
    },
    {
        "name": "cot",
        "template": "You are a helpful assistant. Think step by step.\nContext: {context}\nQuestion: {question}\nLet's think step by step:\nAnswer:"
    },
    {
        "name": "persona",
        "template": "You are an expert historian and researcher.\nContext: {context}\nQuestion: {question}\nExpert Answer:"
    }
]

### Retrieve Context

In [None]:
def retrieve_all_contexts(client: MilvusClient, collection_name: str,
                         queries_df: pd.DataFrame,
                         embedding_model: SentenceTransformer,
                         max_k: int = Config.MAX_K,
                         max_passage_length: int = Config.MAX_CONTEXT_LENGTH) -> List[Dict]:
    """Retrieve contexts for all queries (one-time operation)"""
    print(f"\nRetrieving top-{max_k} contexts for {len(queries_df)} queries from {collection_name}...")
    all_query_contexts = []

    for idx, row in tqdm(queries_df.iterrows(), total=len(queries_df), desc="Retrieving contexts"):
        query_text = row["question"]
        query_id = row.get("id", idx)

        # Encode query
        query_embedding = embedding_model.encode(query_text)

        # Search
        search_results = client.search(
            collection_name=collection_name,
            data=[query_embedding.tolist()],
            limit=max_k,
            search_params={"metric_type": "IP", "params": {"nprobe": 10}},
            output_fields=["id", "passage"]
        )

        # Extract and truncate passages
        passages = []
        for result in search_results[0]:
            passage = result["entity"]["passage"]
            words = passage.split()
            if len(words) > max_passage_length:
                passage = " ".join(words[:max_passage_length])
            passages.append(passage)

        all_query_contexts.append({
            "id": query_id,
            "question": query_text,
            "passages": passages
        })

    print(f"Retrieved contexts for {len(all_query_contexts)} queries")
    return all_query_contexts

### Generate Answers

In [None]:
def generate_answers_with_topk(all_query_contexts: List[Dict],
                               rag_pipeline,
                               top_k: int = 1,
                               prompt_template: str = None,
                               strategy_name: str = None,
                               batch_size: int = Config.BATCH_SIZE) -> pd.DataFrame:
    """Generate answers using top-k passages"""
    if strategy_name is None:
        strategy_name = f"top_{top_k}"

    if prompt_template is None:
        prompt_template = "Context: {context}\nQuestion: {question}\nAnswer:"

    results = []

    for i in tqdm(range(0, len(all_query_contexts), batch_size),
                  desc=f"Generating answers: {strategy_name}"):
        batch = all_query_contexts[i:i+batch_size]

        for item in batch:
            # Get top_k passages
            if top_k == 1:
                context = item["passages"][0]
            else:
                context = " ".join(item["passages"][:top_k])

            # Build prompt
            prompt = prompt_template.format(context=context, question=item["question"])

            # Generate answer
            try:
                response = rag_pipeline(prompt, max_new_tokens=Config.MAX_NEW_TOKENS,
                                       do_sample=False, truncation=True)
                answer = response[0]['generated_text']
            except Exception as e:
                print(f"\nError for query {item['id']}: {str(e)[:100]}")
                answer = ""

            results.append({
                "id": item["id"],
                "question": item["question"],
                "context": context,
                "answer": answer,
                "strategy": strategy_name
            })

    return pd.DataFrame(results)

### Evaluate

In [None]:
def evaluate_rag_results(results_df: pd.DataFrame,
                        original_queries: pd.DataFrame) -> Dict[str, float]:
    """Evaluate RAG results using SQuAD metrics"""
    squad_metric = evaluate.load("squad")

    if "id" not in results_df.columns:
        results_df = results_df.reset_index().rename(columns={"index": "id"})
    if "id" not in original_queries.columns:
        original_queries = original_queries.reset_index().rename(columns={"index": "id"})

    predictions, references = [], []

    for idx, row in results_df.iterrows():
        predictions.append({
            "id": str(row["id"]),
            "prediction_text": str(row["answer"])
        })

        query_id = row["id"]
        matching_rows = original_queries[original_queries["id"] == query_id]

        if len(matching_rows) > 0:
            gold_answer = str(matching_rows["answer"].iloc[0])
            references.append({
                "id": str(row["id"]),
                "answers": {"text": [gold_answer], "answer_start": [0]}
            })

    min_len = min(len(predictions), len(references))
    predictions = predictions[:min_len]
    references = references[:min_len]

    try:
        metrics = squad_metric.compute(predictions=predictions, references=references)
        return metrics
    except Exception as e:
        print(f"Error computing metrics: {e}")
        return {"exact_match": 0.0, "f1": 0.0}

In [None]:
def evaluate_and_save_strategy(results_df: pd.DataFrame,
                               queries: pd.DataFrame,
                               strategy_name: str,
                               output_dir: str = Config.OUTPUT_DIR) -> Dict:
    """Evaluate a strategy and save results"""
    os.makedirs(output_dir, exist_ok=True)

    metrics = evaluate_rag_results(results_df, queries)

    print(f"\n{strategy_name}:")
    print(f"  Exact Match: {metrics['exact_match']:.4f}")
    print(f"  F1 Score: {metrics['f1']:.4f}")

    csv_path = os.path.join(output_dir, f"{strategy_name}_results.csv")
    results_df.to_csv(csv_path, index=False)
    print(f"  Saved results: {csv_path}")

    metrics_dict = {
        "strategy": strategy_name,
        "exact_match": float(metrics['exact_match']),
        "f1_score": float(metrics['f1']),
        "num_queries": len(results_df)
    }
    json_path = os.path.join(output_dir, f"{strategy_name}_metrics.json")
    with open(json_path, 'w') as f:
        json.dump(metrics_dict, f, indent=2)
    print(f"  Saved metrics: {json_path}")

    return metrics

### Create comparision summary

In [None]:
def create_comparison_summary(all_results: Dict,
                             output_path: str = "results/final_comparison_with_enhanced.csv"):
    """Create summary comparing all embedding configurations"""
    summary_data = []

    for strategy_name, data in all_results.items():
        metrics = data['metrics']
        summary_data.append({
            'strategy': strategy_name,
            'embedding_model': data.get('embedding_model', 'unknown'),
            'embedding_dim': data.get('embedding_dim', 0),
            'exact_match': metrics['exact_match'],
            'f1_score': metrics['f1'],
            'num_queries': data['num_queries']
        })

    summary_df = pd.DataFrame(summary_data)
    summary_df = summary_df.sort_values('f1_score', ascending=False)

    # Save
    summary_df.to_csv(output_path, index=False)

    # Print
    print("\n" + "="*80)
    print("FINAL RESULTS SUMMARY - ALL EMBEDDINGS + ENHANCED")
    print("="*80)
    print(summary_df.to_string(index=False))
    print("="*80)

    # Find best for each type
    best_overall = summary_df.iloc[0]
    naive_strategies = summary_df[~summary_df['strategy'].str.contains('enhanced')]
    enhanced_strategies = summary_df[summary_df['strategy'].str.contains('enhanced')]

    print(f"\nBest Overall: {best_overall['strategy']} (F1: {best_overall['f1_score']:.4f})")

    if len(naive_strategies) > 0:
        best_naive = naive_strategies.iloc[0]
        print(f"Best Naive: {best_naive['strategy']} (F1: {best_naive['f1_score']:.4f})")

    if len(enhanced_strategies) > 0:
        best_enhanced = enhanced_strategies.iloc[0]
        print(f"Best Enhanced: {best_enhanced['strategy']} (F1: {best_enhanced['f1_score']:.4f})")

        if len(naive_strategies) > 0:
            improvement = ((best_enhanced['f1_score'] - best_naive['f1_score']) / best_naive['f1_score'] * 100)
            print(f"\nEnhanced Improvement over Best Naive: {improvement:.2f}%")

    return summary_df

### Run Experiments

In [None]:
def run_experiments_for_embedding(embed_config: Dict,
                                 all_contexts: List[Dict],
                                 queries: pd.DataFrame,
                                 rag_pipeline,
                                 top_k_values: List[int] = Config.TOP_K_VALUES,
                                 prompting_strategies: List[Dict] = PROMPTING_STRATEGIES) -> Dict:
    """Run all experiments for a single embedding configuration"""
    embed_name = embed_config['collection_suffix']
    print(f"\n{'='*80}")
    print(f"RUNNING EXPERIMENTS FOR: {embed_config['name']} ({embed_config['dim']}D)")
    print(f"{'='*80}")

    all_results = {}
    total_experiments = len(top_k_values) * len(prompting_strategies)
    current_experiment = 0

    for top_k in top_k_values:
        for prompt_strategy in prompting_strategies:
            current_experiment += 1
            strategy_name = f"{embed_name}_top{top_k}_{prompt_strategy['name']}"

            print(f"\n{'='*80}")
            print(f"Experiment {current_experiment}/{total_experiments}: {strategy_name}")
            print(f"{'='*80}")

            # Generate answers
            results_df = generate_answers_with_topk(
                all_contexts,
                rag_pipeline,
                top_k=top_k,
                prompt_template=prompt_strategy['template'],
                strategy_name=strategy_name
            )

            # Evaluate
            metrics = evaluate_and_save_strategy(results_df, queries, strategy_name)

            # Store results
            all_results[strategy_name] = {
                'metrics': metrics,
                'num_queries': len(results_df),
                'embedding_model': embed_config['name'],
                'embedding_dim': embed_config['dim']
            }

            # Clean up
            del results_df
            gc.collect()

    return all_results

# Enhanced RAG

### Configuration

In [None]:
class Config:
    LLM_MODEL = "google/flan-t5-base"
    EMBEDDING_MODEL_384D = "all-MiniLM-L6-v2"

    DB_NAME = "rag_wikipedia_multi_embed.db"
    COLLECTION_BASE_NAME = "rag_mini"
    MAX_K = 10
    MAX_CONTEXT_LENGTH = 400

    MAX_NEW_TOKENS = 200
    BATCH_SIZE = 16
    OUTPUT_DIR = "results"

### Rewrite Query

In [None]:
def rewrite_query(query: str, rewriter_pipeline) -> str:
    """Rewrite query to be more specific/focused"""
    rewrite_prompt = f"Rewrite this question to be more clear and specific: {query}\nRewritten question:"

    try:
        response = rewriter_pipeline(rewrite_prompt, max_new_tokens=50,
                                     do_sample=False, truncation=True)
        rewritten = response[0]['generated_text'].strip()
        return rewritten if rewritten else query
    except Exception as e:
        print(f"Error rewriting query: {str(e)[:100]}")
        return query

### Rerank

In [None]:
def rerank_passages(query: str,
                   passages: List[str],
                   embedding_model: SentenceTransformer,
                   top_k: int = 1) -> List[str]:
    """Rerank passages using cosine similarity with query embedding"""
    query_emb = embedding_model.encode([query])
    passage_embs = embedding_model.encode(passages)

    scores = cosine_similarity(query_emb, passage_embs)[0]
    ranked_idx = np.argsort(scores)[::-1]  # descending
    reranked = [passages[i] for i in ranked_idx[:top_k]]
    return reranked

### Create Answers

In [None]:
def generate_enhanced_answers(all_query_contexts: List[Dict],
                             rag_pipeline,
                             rewriter_pipeline,
                             embedding_model: SentenceTransformer,
                             top_k: int = 1,
                             strategy_name: str = "enhanced_rerank") -> pd.DataFrame:
    """Generate answers with query rewriting + reranking"""
    results = []

    for item in tqdm(all_query_contexts, desc=f"Enhanced generation: {strategy_name}"):
        # 1. Rewrite query
        rewritten_query = rewrite_query(item["question"], rewriter_pipeline)

        # 2. Rerank passages using rewritten query
        reranked_passages = rerank_passages(rewritten_query, item["passages"],
                                           embedding_model, top_k)
        context = " ".join(reranked_passages)

        # 3. Standard prompt
        prompt = f"Context: {context}\nQuestion: {rewritten_query}\nAnswer:"

        try:
            response = rag_pipeline(prompt, max_new_tokens=Config.MAX_NEW_TOKENS,
                                   do_sample=False, truncation=True)
            answer = response[0]['generated_text']
        except Exception as e:
            print(f"\nError for query {item['id']}: {str(e)[:100]}")
            answer = ""

        results.append({
            "id": item["id"],
            "question": item["question"],
            "rewritten_question": rewritten_query,
            "context": context,
            "answer": answer,
            "strategy": strategy_name
        })

    return pd.DataFrame(results)

# Load Dataset

In [None]:
passages = load_and_clean_passages("hf://datasets/rag-datasets/rag-mini-wikipedia/data/passages.parquet/part.0.parquet")

queries = load_and_clean_queries("hf://datasets/rag-datasets/rag-mini-wikipedia/data/test.parquet/part.0.parquet")

Loading passages...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



Passages loaded: 3200
Character length - Mean: 390, Min: 1, Max: 2515
Word length - Mean: 62, Min: 1, Max: 425

Loading queries...
Queries loaded: 918


# Load LLM

In [None]:
# 2. Load LLM (shared across all embeddings)
print("\nLoading language model...")
tokenizer = AutoTokenizer.from_pretrained(Config.LLM_MODEL)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(Config.LLM_MODEL)
rag_pipeline = pipeline("text2text-generation", model=llm_model, tokenizer=tokenizer)
print("LLM loaded successfully")


Loading language model...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cuda:0


LLM loaded successfully


### Run

In [None]:
print("="*80)
print("RAG SYSTEM: MULTI-EMBEDDING SIZE COMPARISON")
print("="*80)

# 3. Process each embedding configuration
all_results = {}

for embed_config in Config.EMBEDDING_CONFIGS:
  print(f"\n{'#'*80}")
  print(f"PROCESSING EMBEDDING: {embed_config['name']}")
  print(f"{'#'*80}")

  # Generate embeddings
  embeddings = generate_embeddings(
      passages["passage"].tolist(),
      embed_config['name']
  )

  # Setup vector database
  collection_name = f"{Config.COLLECTION_BASE_NAME}_{embed_config['collection_suffix']}"
  client = setup_vector_db(passages, embeddings, collection_name, embed_config['dim'])

  # Load embedding model for retrieval
  embedding_model = SentenceTransformer(embed_config['name'])

  # Retrieve contexts
  all_contexts = retrieve_all_contexts(
      client, collection_name, queries, embedding_model
  )

  # Run experiments
  results = run_experiments_for_embedding(
      embed_config,
      all_contexts,
      queries,
      rag_pipeline
  )

  # Add to overall results
  all_results.update(results)

  # Clean up
  del embeddings, embedding_model, all_contexts
  gc.collect()

RAG SYSTEM: MULTI-EMBEDDING SIZE COMPARISON

################################################################################
PROCESSING EMBEDDING: all-MiniLM-L6-v2
################################################################################

Generating embeddings using all-MiniLM-L6-v2...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/100 [00:00<?, ?it/s]

Embedding shape: (3200, 384)

Setting up vector database: rag_mini_384d...
Dropped existing collection: rag_mini_384d
Created collection: rag_mini_384d
Inserted 3200 records
Index created
Collection loaded into memory

Retrieving top-10 contexts for 918 queries from rag_mini_384d...


Retrieving contexts: 100%|██████████| 918/918 [00:07<00:00, 120.81it/s]


Retrieved contexts for 918 queries

RUNNING EXPERIMENTS FOR: all-MiniLM-L6-v2 (384D)

Experiment 1/9: 384d_top1_instruction


Generating answers: 384d_top1_instruction:   0%|          | 0/58 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating answers: 384d_top1_instruction: 100%|██████████| 58/58 [05:10<00:00,  5.36s/it]


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]


384d_top1_instruction:
  Exact Match: 34.6405
  F1 Score: 42.1265
  Saved results: results/384d_top1_instruction_results.csv
  Saved metrics: results/384d_top1_instruction_metrics.json

Experiment 2/9: 384d_top1_cot


Generating answers: 384d_top1_cot: 100%|██████████| 58/58 [23:22<00:00, 24.18s/it]



384d_top1_cot:
  Exact Match: 0.0000
  F1 Score: 10.2225
  Saved results: results/384d_top1_cot_results.csv
  Saved metrics: results/384d_top1_cot_metrics.json

Experiment 3/9: 384d_top1_persona


Generating answers: 384d_top1_persona: 100%|██████████| 58/58 [09:14<00:00,  9.57s/it]



384d_top1_persona:
  Exact Match: 26.9063
  F1 Score: 34.0904
  Saved results: results/384d_top1_persona_results.csv
  Saved metrics: results/384d_top1_persona_metrics.json

Experiment 4/9: 384d_top3_instruction


Generating answers: 384d_top3_instruction: 100%|██████████| 58/58 [07:15<00:00,  7.51s/it]



384d_top3_instruction:
  Exact Match: 32.4619
  F1 Score: 41.2115
  Saved results: results/384d_top3_instruction_results.csv
  Saved metrics: results/384d_top3_instruction_metrics.json

Experiment 5/9: 384d_top3_cot


Generating answers: 384d_top3_cot: 100%|██████████| 58/58 [30:52<00:00, 31.94s/it]



384d_top3_cot:
  Exact Match: 0.0000
  F1 Score: 9.7309
  Saved results: results/384d_top3_cot_results.csv
  Saved metrics: results/384d_top3_cot_metrics.json

Experiment 6/9: 384d_top3_persona


Generating answers: 384d_top3_persona: 100%|██████████| 58/58 [13:38<00:00, 14.11s/it]



384d_top3_persona:
  Exact Match: 24.5098
  F1 Score: 32.7006
  Saved results: results/384d_top3_persona_results.csv
  Saved metrics: results/384d_top3_persona_metrics.json

Experiment 7/9: 384d_top5_instruction


Generating answers: 384d_top5_instruction: 100%|██████████| 58/58 [12:19<00:00, 12.76s/it]



384d_top5_instruction:
  Exact Match: 23.6383
  F1 Score: 30.0920
  Saved results: results/384d_top5_instruction_results.csv
  Saved metrics: results/384d_top5_instruction_metrics.json

Experiment 8/9: 384d_top5_cot


Generating answers: 384d_top5_cot: 100%|██████████| 58/58 [40:50<00:00, 42.24s/it]



384d_top5_cot:
  Exact Match: 0.0000
  F1 Score: 7.4768
  Saved results: results/384d_top5_cot_results.csv
  Saved metrics: results/384d_top5_cot_metrics.json

Experiment 9/9: 384d_top5_persona


Generating answers: 384d_top5_persona: 100%|██████████| 58/58 [18:00<00:00, 18.63s/it]



384d_top5_persona:
  Exact Match: 18.1917
  F1 Score: 24.8797
  Saved results: results/384d_top5_persona_results.csv
  Saved metrics: results/384d_top5_persona_metrics.json

################################################################################
PROCESSING EMBEDDING: all-mpnet-base-v2
################################################################################

Generating embeddings using all-mpnet-base-v2...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/100 [00:00<?, ?it/s]

Embedding shape: (3200, 768)

Setting up vector database: rag_mini_768d...
Dropped existing collection: rag_mini_768d
Created collection: rag_mini_768d
Inserted 3200 records
Index created
Collection loaded into memory

Retrieving top-10 contexts for 918 queries from rag_mini_768d...


Retrieving contexts: 100%|██████████| 918/918 [00:12<00:00, 74.26it/s]


Retrieved contexts for 918 queries

RUNNING EXPERIMENTS FOR: all-mpnet-base-v2 (768D)

Experiment 1/9: 768d_top1_instruction


Generating answers: 768d_top1_instruction: 100%|██████████| 58/58 [05:39<00:00,  5.85s/it]



768d_top1_instruction:
  Exact Match: 33.9869
  F1 Score: 41.6693
  Saved results: results/768d_top1_instruction_results.csv
  Saved metrics: results/768d_top1_instruction_metrics.json

Experiment 2/9: 768d_top1_cot


Generating answers: 768d_top1_cot: 100%|██████████| 58/58 [23:45<00:00, 24.57s/it]



768d_top1_cot:
  Exact Match: 0.0000
  F1 Score: 10.3962
  Saved results: results/768d_top1_cot_results.csv
  Saved metrics: results/768d_top1_cot_metrics.json

Experiment 3/9: 768d_top1_persona


Generating answers: 768d_top1_persona: 100%|██████████| 58/58 [10:11<00:00, 10.55s/it]



768d_top1_persona:
  Exact Match: 27.3420
  F1 Score: 35.1569
  Saved results: results/768d_top1_persona_results.csv
  Saved metrics: results/768d_top1_persona_metrics.json

Experiment 4/9: 768d_top3_instruction


Generating answers: 768d_top3_instruction: 100%|██████████| 58/58 [08:39<00:00,  8.96s/it]



768d_top3_instruction:
  Exact Match: 31.8083
  F1 Score: 39.9307
  Saved results: results/768d_top3_instruction_results.csv
  Saved metrics: results/768d_top3_instruction_metrics.json

Experiment 5/9: 768d_top3_cot


Generating answers: 768d_top3_cot: 100%|██████████| 58/58 [32:53<00:00, 34.03s/it]



768d_top3_cot:
  Exact Match: 0.0000
  F1 Score: 9.5667
  Saved results: results/768d_top3_cot_results.csv
  Saved metrics: results/768d_top3_cot_metrics.json

Experiment 6/9: 768d_top3_persona


Generating answers: 768d_top3_persona: 100%|██████████| 58/58 [15:50<00:00, 16.39s/it]



768d_top3_persona:
  Exact Match: 25.0545
  F1 Score: 33.0150
  Saved results: results/768d_top3_persona_results.csv
  Saved metrics: results/768d_top3_persona_metrics.json

Experiment 7/9: 768d_top5_instruction


Generating answers: 768d_top5_instruction: 100%|██████████| 58/58 [13:34<00:00, 14.04s/it]



768d_top5_instruction:
  Exact Match: 21.7865
  F1 Score: 27.8582
  Saved results: results/768d_top5_instruction_results.csv
  Saved metrics: results/768d_top5_instruction_metrics.json

Experiment 8/9: 768d_top5_cot


Generating answers: 768d_top5_cot: 100%|██████████| 58/58 [43:21<00:00, 44.85s/it]



768d_top5_cot:
  Exact Match: 0.0000
  F1 Score: 7.2701
  Saved results: results/768d_top5_cot_results.csv
  Saved metrics: results/768d_top5_cot_metrics.json

Experiment 9/9: 768d_top5_persona


Generating answers: 768d_top5_persona: 100%|██████████| 58/58 [19:51<00:00, 20.54s/it]



768d_top5_persona:
  Exact Match: 17.2113
  F1 Score: 23.2195
  Saved results: results/768d_top5_persona_results.csv
  Saved metrics: results/768d_top5_persona_metrics.json


In [None]:
print("="*80)
print("RUNNING ENHANCED RAG EXPERIMENTS (384D)")
print("="*80)

# 2. Load models
print("\nLoading models...")
tokenizer = AutoTokenizer.from_pretrained(Config.LLM_MODEL)
llm_model = AutoModelForSeq2SeqLM.from_pretrained(Config.LLM_MODEL)
rag_pipeline = pipeline("text2text-generation", model=llm_model, tokenizer=tokenizer)
rewriter_pipeline = pipeline("text2text-generation", model=llm_model, tokenizer=tokenizer)
print("LLM loaded successfully")

# 3. Load 384D embedding model
embedding_model_384d = SentenceTransformer(Config.EMBEDDING_MODEL_384D)
print(f"Loaded embedding model: {Config.EMBEDDING_MODEL_384D}")

# 4. Initialize Milvus client and retrieve contexts
client = MilvusClient(Config.DB_NAME)
collection_name_384d = f"{Config.COLLECTION_BASE_NAME}_384d"

all_contexts_384d = retrieve_all_contexts(
    client,
    collection_name_384d,
    queries,
    embedding_model_384d
)

RUNNING ENHANCED RAG EXPERIMENTS (384D)

Loading models...


Device set to use cuda:0
Device set to use cuda:0


LLM loaded successfully
Loaded embedding model: all-MiniLM-L6-v2

Retrieving top-10 contexts for 918 queries from rag_mini_384d...


Retrieving contexts: 100%|██████████| 918/918 [00:07<00:00, 126.73it/s]

Retrieved contexts for 918 queries





In [None]:
# 5. Run enhanced experiments
print("\n" + "="*80)
print("GENERATING ENHANCED ANSWERS WITH RERANKING")
print("="*80)

strategy_name = "384d_enhanced_top1"

results_df = generate_enhanced_answers(
    all_contexts_384d,
    rag_pipeline,
    rewriter_pipeline,
    embedding_model_384d,
    top_k=1,
    strategy_name=strategy_name
)

# 6. Evaluate
metrics = evaluate_and_save_strategy(results_df, queries, strategy_name)


GENERATING ENHANCED ANSWERS WITH RERANKING


Enhanced generation: 384d_enhanced_top1: 100%|██████████| 918/918 [12:30<00:00,  1.22it/s]



384d_enhanced_top1:
  Exact Match: 28.9760
  F1 Score: 36.4054
  Saved results: results/384d_enhanced_top1_results.csv
  Saved metrics: results/384d_enhanced_top1_metrics.json


In [None]:
# Store enhanced results
enhanced_results = {
    strategy_name: {
        'metrics': metrics,
        'num_queries': len(results_df),
        'embedding_model': Config.EMBEDDING_MODEL_384D,
        'embedding_dim': 384
    }
}

# 7. Load existing results and combine
print("\n" + "="*80)
print("COMBINING WITH EXISTING RESULTS")
print("="*80)

existing_results = {}


COMBINING WITH EXISTING RESULTS


In [None]:
# Load all existing result JSONs
for json_file in glob.glob(os.path.join(Config.OUTPUT_DIR, "*_metrics.json")):
    if "enhanced" not in json_file:  # Don't reload the enhanced we just created
        with open(json_file, 'r') as f:
            data = json.load(f)
            strategy_name_loaded = data['strategy']

            # Infer embedding info from strategy name
            if '384d' in strategy_name_loaded:
                embed_dim = 384
                embed_model = 'all-MiniLM-L6-v2'
            elif '768d' in strategy_name_loaded:
                embed_dim = 768
                embed_model = 'all-mpnet-base-v2'
            else:
                embed_dim = 0
                embed_model = 'unknown'

            existing_results[strategy_name_loaded] = {
                'metrics': {'exact_match': data['exact_match'], 'f1': data['f1_score']},
                'num_queries': data['num_queries'],
                'embedding_model': embed_model,
                'embedding_dim': embed_dim
            }

print(f"Loaded {len(existing_results)} existing results")

Loaded 18 existing results


In [None]:
# 8. Combine all results
all_combined_results = {**existing_results, **enhanced_results}

# 9. Create final comparison
create_comparison_summary(all_combined_results,
                         output_path=os.path.join(Config.OUTPUT_DIR, "final_comparison_with_enhanced.csv"))

print("\n" + "="*80)
print("ENHANCED EXPERIMENTS COMPLETED!")
print("="*80)

# Clean up
del results_df, embedding_model_384d, all_contexts_384d
gc.collect()


FINAL RESULTS SUMMARY - ALL EMBEDDINGS + ENHANCED
             strategy   embedding_model  embedding_dim  exact_match  f1_score  num_queries
384d_top1_instruction  all-MiniLM-L6-v2            384    34.640523 42.126472          918
768d_top1_instruction all-mpnet-base-v2            768    33.986928 41.669301          918
384d_top3_instruction  all-MiniLM-L6-v2            384    32.461874 41.211498          918
768d_top3_instruction all-mpnet-base-v2            768    31.808279 39.930657          918
   384d_enhanced_top1  all-MiniLM-L6-v2            384    28.976035 36.405409          918
    768d_top1_persona all-mpnet-base-v2            768    27.342048 35.156856          918
    384d_top1_persona  all-MiniLM-L6-v2            384    26.906318 34.090438          918
    768d_top3_persona all-mpnet-base-v2            768    25.054466 33.015015          918
    384d_top3_persona  all-MiniLM-L6-v2            384    24.509804 32.700608          918
384d_top5_instruction  all-MiniLM-L6-v2

4262

In [None]:
# 4. Create final comparison
create_comparison_summary(all_results)

print("\n" + "="*80)
print("ALL EXPERIMENTS COMPLETED!")
print("="*80)


FINAL RESULTS SUMMARY - ALL EMBEDDINGS + ENHANCED
             strategy   embedding_model  embedding_dim  exact_match  f1_score  num_queries
384d_top1_instruction  all-MiniLM-L6-v2            384    34.640523 42.126472          918
768d_top1_instruction all-mpnet-base-v2            768    33.986928 41.669301          918
384d_top3_instruction  all-MiniLM-L6-v2            384    32.461874 41.211498          918
768d_top3_instruction all-mpnet-base-v2            768    31.808279 39.930657          918
    768d_top1_persona all-mpnet-base-v2            768    27.342048 35.156856          918
    384d_top1_persona  all-MiniLM-L6-v2            384    26.906318 34.090438          918
    768d_top3_persona all-mpnet-base-v2            768    25.054466 33.015015          918
    384d_top3_persona  all-MiniLM-L6-v2            384    24.509804 32.700608          918
384d_top5_instruction  all-MiniLM-L6-v2            384    23.638344 30.091996          918
768d_top5_instruction all-mpnet-base-v2

# RAGAS Evaluation

### Setup API Key

In [2]:
os.environ["OPENAI_API_KEY"] = "api key"

if "OPENAI_API_KEY" not in os.environ or not os.environ["OPENAI_API_KEY"].startswith("sk-"):
    raise ValueError("OPENAI_API_KEY not set or invalid. Please paste your key above.")

In [3]:
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# ARES Evaluation

In [None]:
import os
import pandas as pd
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# ----------------------------------------------------------------------------
# ARES-STYLE EVALUATION WITH CONTEXT PRECISION & RECALL (Colab compatible)
# ----------------------------------------------------------------------------

print("="*80)
print("ARES-STYLE EVALUATION WITH CONTEXT PRECISION & RECALL")
print("="*80)

# ----------------------------------------------------------------------------
# 1. Load CSVs (Both Naive and Enhanced)
# ----------------------------------------------------------------------------
print("\nLoading CSV files...")
naive_top1 = pd.read_csv("results/384d_top1_instruction_results.csv")
enhanced_top1 = pd.read_csv("results/384d_enhanced_top1_results.csv")

print(f"Loaded Naive: {len(naive_top1)} rows")
print(f"Loaded Enhanced: {len(enhanced_top1)} rows")
print(f"\nNaive Columns: {naive_top1.columns.tolist()}")
print(f"Enhanced Columns: {enhanced_top1.columns.tolist()}")

# Preview
print(f"\nNaive Preview:\n{naive_top1.head(2)}")
print(f"\nEnhanced Preview:\n{enhanced_top1.head(2)}")

# ----------------------------------------------------------------------------
# 2. Load Embedding Model
# ----------------------------------------------------------------------------
print("\n" + "="*80)
print("Loading embedding model...")
print("="*80)

embed_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Model loaded: all-MiniLM-L6-v2")

# ----------------------------------------------------------------------------
# 3. Define Evaluation Metrics (Including Precision & Recall)
# ----------------------------------------------------------------------------

def compute_context_relevance(question: str, context: str, model) -> float:
    """
    Context Relevance: How relevant is the retrieved context to the question?
    Measures: cosine similarity between question and context embeddings
    """
    q_emb = model.encode([question])
    c_emb = model.encode([context])
    score = cosine_similarity(q_emb, c_emb)[0][0]
    return float(score)

def compute_answer_relevance(question: str, answer: str, model) -> float:
    """
    Answer Relevance: How relevant is the answer to the question?
    Measures: cosine similarity between question and answer embeddings
    """
    q_emb = model.encode([question])
    a_emb = model.encode([answer])
    score = cosine_similarity(q_emb, a_emb)[0][0]
    return float(score)

def compute_answer_faithfulness(context: str, answer: str, model) -> float:
    """
    Answer Faithfulness: Is the answer grounded in the context?
    Measures: cosine similarity between context and answer embeddings
    """
    c_emb = model.encode([context])
    a_emb = model.encode([answer])
    score = cosine_similarity(c_emb, a_emb)[0][0]
    return float(score)

def compute_context_precision(question: str, context: str, answer: str, model, threshold: float = 0.5) -> float:
    """
    Context Precision: What proportion of the retrieved context is relevant/useful?

    Approximation: Split context into sentences, check how many are relevant to the answer.
    Higher score = more precise/focused context (less noise)
    """
    # Split context into sentences
    sentences = [s.strip() for s in context.replace('!', '.').replace('?', '.').split('.') if s.strip()]

    if len(sentences) == 0:
        return 0.0

    # Encode answer
    answer_emb = model.encode([answer])

    # Count relevant sentences (similar to answer)
    relevant_count = 0
    for sentence in sentences:
        if len(sentence.split()) < 3:  # Skip very short fragments
            continue
        sent_emb = model.encode([sentence])
        similarity = cosine_similarity(answer_emb, sent_emb)[0][0]
        if similarity > threshold:
            relevant_count += 1

    precision = relevant_count / len(sentences) if len(sentences) > 0 else 0.0
    return float(precision)

def compute_context_recall(context: str, ground_truth: str, model, threshold: float = 0.5) -> float:
    """
    Context Recall: Does the context contain all necessary information from ground truth?

    Approximation: Split ground truth into key phrases, check if they appear in context.
    Higher score = context captures more of the gold answer information
    """
    # Split ground truth into sentences/phrases
    gt_sentences = [s.strip() for s in ground_truth.replace('!', '.').replace('?', '.').split('.') if s.strip()]

    if len(gt_sentences) == 0:
        return 0.0

    # Encode context
    context_emb = model.encode([context])

    # Count how many ground truth sentences are covered in context
    covered_count = 0
    for gt_sent in gt_sentences:
        if len(gt_sent.split()) < 3:  # Skip very short fragments
            continue
        gt_emb = model.encode([gt_sent])
        similarity = cosine_similarity(context_emb, gt_emb)[0][0]
        if similarity > threshold:
            covered_count += 1

    recall = covered_count / len(gt_sentences) if len(gt_sentences) > 0 else 0.0
    return float(recall)

def compute_context_f1(precision: float, recall: float) -> float:
    """
    Context F1: Harmonic mean of precision and recall
    """
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)

# ----------------------------------------------------------------------------
# 4. Compute Scores for All Rows (Both Naive and Enhanced)
# ----------------------------------------------------------------------------
print("\n" + "="*80)
print("Computing metrics for NAIVE approach...")
print("="*80)

naive_results_list = []

for idx, row in tqdm(naive_top1.iterrows(), total=len(naive_top1), desc="Evaluating Naive"):
    question = str(row['question'])
    context = str(row['context'])
    answer = str(row['answer'])

    # Get ground truth if available
    ground_truth = str(row.get('ground_truth', row.get('reference', answer)))

    # Compute all metrics
    context_rel = compute_context_relevance(question, context, embed_model)
    answer_rel = compute_answer_relevance(question, answer, embed_model)
    faithfulness = compute_answer_faithfulness(context, answer, embed_model)

    # New: Precision and Recall
    precision = compute_context_precision(question, context, answer, embed_model)
    recall = compute_context_recall(context, ground_truth, embed_model)
    f1 = compute_context_f1(precision, recall)

    naive_results_list.append({
        'id': row.get('id', idx),
        'question': question,
        'context': context,
        'answer': answer,
        'ground_truth': ground_truth,
        'context_relevance': context_rel,
        'answer_relevance': answer_rel,
        'answer_faithfulness': faithfulness,
        'context_precision': precision,
        'context_recall': recall,
        'context_f1': f1
    })

# Create results DataFrame for naive
naive_results_df = pd.DataFrame(naive_results_list)

print("Naive scoring completed!")

# Now compute for Enhanced
print("\n" + "="*80)
print("Computing metrics for ENHANCED approach...")
print("="*80)

enhanced_results_list = []

for idx, row in tqdm(enhanced_top1.iterrows(), total=len(enhanced_top1), desc="Evaluating Enhanced"):
    question = str(row['question'])
    context = str(row['context'])
    answer = str(row['answer'])

    # Get ground truth if available
    ground_truth = str(row.get('ground_truth', row.get('reference', answer)))

    # Compute all metrics
    context_rel = compute_context_relevance(question, context, embed_model)
    answer_rel = compute_answer_relevance(question, answer, embed_model)
    faithfulness = compute_answer_faithfulness(context, answer, embed_model)

    # New: Precision and Recall
    precision = compute_context_precision(question, context, answer, embed_model)
    recall = compute_context_recall(context, ground_truth, embed_model)
    f1 = compute_context_f1(precision, recall)

    enhanced_results_list.append({
        'id': row.get('id', idx),
        'question': question,
        'context': context,
        'answer': answer,
        'ground_truth': ground_truth,
        'context_relevance': context_rel,
        'answer_relevance': answer_rel,
        'answer_faithfulness': faithfulness,
        'context_precision': precision,
        'context_recall': recall,
        'context_f1': f1
    })

# Create results DataFrame for enhanced
enhanced_results_df = pd.DataFrame(enhanced_results_list)

print("Enhanced scoring completed!")

# ----------------------------------------------------------------------------
# 5. Calculate Average Scores (Both Approaches)
# ----------------------------------------------------------------------------
print("\n" + "="*80)
print("EVALUATION RESULTS (ALL METRICS)")
print("="*80)

naive_avg_scores = {
    'context_relevance': naive_results_df['context_relevance'].mean(),
    'answer_relevance': naive_results_df['answer_relevance'].mean(),
    'answer_faithfulness': naive_results_df['answer_faithfulness'].mean(),
    'context_precision': naive_results_df['context_precision'].mean(),
    'context_recall': naive_results_df['context_recall'].mean(),
    'context_f1': naive_results_df['context_f1'].mean(),
    'num_queries': len(naive_results_df)
}

enhanced_avg_scores = {
    'context_relevance': enhanced_results_df['context_relevance'].mean(),
    'answer_relevance': enhanced_results_df['answer_relevance'].mean(),
    'answer_faithfulness': enhanced_results_df['answer_faithfulness'].mean(),
    'context_precision': enhanced_results_df['context_precision'].mean(),
    'context_recall': enhanced_results_df['context_recall'].mean(),
    'context_f1': enhanced_results_df['context_f1'].mean(),
    'num_queries': len(enhanced_results_df)
}

print("\n" + "="*80)
print("NAIVE Average Scores:")
print("="*80)
print(f"  Context Relevance:     {naive_avg_scores['context_relevance']:.4f}")
print(f"  Answer Relevance:      {naive_avg_scores['answer_relevance']:.4f}")
print(f"  Answer Faithfulness:   {naive_avg_scores['answer_faithfulness']:.4f}")
print(f"  Context Precision:     {naive_avg_scores['context_precision']:.4f}")
print(f"  Context Recall:        {naive_avg_scores['context_recall']:.4f}")
print(f"  Context F1:            {naive_avg_scores['context_f1']:.4f}")
print(f"  Number of Queries:     {naive_avg_scores['num_queries']}")

print("\n" + "="*80)
print("ENHANCED Average Scores:")
print("="*80)
print(f"  Context Relevance:     {enhanced_avg_scores['context_relevance']:.4f}")
print(f"  Answer Relevance:      {enhanced_avg_scores['answer_relevance']:.4f}")
print(f"  Answer Faithfulness:   {enhanced_avg_scores['answer_faithfulness']:.4f}")
print(f"  Context Precision:     {enhanced_avg_scores['context_precision']:.4f}")
print(f"  Context Recall:        {enhanced_avg_scores['context_recall']:.4f}")
print(f"  Context F1:            {enhanced_avg_scores['context_f1']:.4f}")
print(f"  Number of Queries:     {enhanced_avg_scores['num_queries']}")

# Calculate improvements
print("\n" + "="*80)
print("IMPROVEMENT (Enhanced vs Naive):")
print("="*80)
improvements = {}
for metric in ['context_relevance', 'answer_relevance', 'answer_faithfulness',
               'context_precision', 'context_recall', 'context_f1']:
    diff = enhanced_avg_scores[metric] - naive_avg_scores[metric]
    pct = (diff / naive_avg_scores[metric] * 100) if naive_avg_scores[metric] != 0 else 0
    improvements[metric] = {'absolute': diff, 'percent': pct}
    print(f"  {metric.replace('_', ' ').title():25s}: {diff:+.4f} ({pct:+.2f}%)")

# Distribution stats
print("\n" + "="*80)
print("Score Distributions (NAIVE):")
print("="*80)

metrics_to_show = ['context_relevance', 'answer_relevance', 'answer_faithfulness',
                   'context_precision', 'context_recall', 'context_f1']

for metric in metrics_to_show:
    print(f"\n{metric.replace('_', ' ').title()}:")
    print(naive_results_df[metric].describe())

print("\n" + "="*80)
print("Score Distributions (ENHANCED):")
print("="*80)

for metric in metrics_to_show:
    print(f"\n{metric.replace('_', ' ').title()}:")
    print(enhanced_results_df[metric].describe())

# ----------------------------------------------------------------------------
# 6. Save Results (Both Approaches)
# ----------------------------------------------------------------------------
print("\n" + "="*80)
print("Saving results...")
print("="*80)

output_dir = "results"
os.makedirs(output_dir, exist_ok=True)

# Save detailed per-question results for NAIVE
naive_detailed_csv = os.path.join(output_dir, "ares_naive_top1_detailed.csv")
naive_results_df.to_csv(naive_detailed_csv, index=False)
print(f"Saved naive detailed results: {naive_detailed_csv}")

# Save detailed per-question results for ENHANCED
enhanced_detailed_csv = os.path.join(output_dir, "ares_enhanced_top1_detailed.csv")
enhanced_results_df.to_csv(enhanced_detailed_csv, index=False)
print(f"Saved enhanced detailed results: {enhanced_detailed_csv}")

# Save average scores for NAIVE
naive_avg_df = pd.DataFrame([naive_avg_scores])
naive_avg_csv = os.path.join(output_dir, "ares_naive_top1_averages.csv")
naive_avg_df.to_csv(naive_avg_csv, index=False)
print(f"Saved naive averages: {naive_avg_csv}")

naive_avg_json = os.path.join(output_dir, "ares_naive_top1_averages.json")
with open(naive_avg_json, 'w') as f:
    json.dump(naive_avg_scores, f, indent=2)
print(f"Saved naive JSON: {naive_avg_json}")

# Save average scores for ENHANCED
enhanced_avg_df = pd.DataFrame([enhanced_avg_scores])
enhanced_avg_csv = os.path.join(output_dir, "ares_enhanced_top1_averages.csv")
enhanced_avg_df.to_csv(enhanced_avg_csv, index=False)
print(f"Saved enhanced averages: {enhanced_avg_csv}")

enhanced_avg_json = os.path.join(output_dir, "ares_enhanced_top1_averages.json")
with open(enhanced_avg_json, 'w') as f:
    json.dump(enhanced_avg_scores, f, indent=2)
print(f"Saved enhanced JSON: {enhanced_avg_json}")

# ----------------------------------------------------------------------------
# 7. Create Comprehensive Summary and Comparison
# ----------------------------------------------------------------------------
print("\n" + "="*80)
print("Summary Statistics")
print("="*80)

# Naive summary
naive_summary = pd.DataFrame({
    'Metric': [
        'Context Relevance',
        'Answer Relevance',
        'Answer Faithfulness',
        'Context Precision',
        'Context Recall',
        'Context F1'
    ],
    'Mean': [
        naive_results_df['context_relevance'].mean(),
        naive_results_df['answer_relevance'].mean(),
        naive_results_df['answer_faithfulness'].mean(),
        naive_results_df['context_precision'].mean(),
        naive_results_df['context_recall'].mean(),
        naive_results_df['context_f1'].mean()
    ],
    'Std': [
        naive_results_df['context_relevance'].std(),
        naive_results_df['answer_relevance'].std(),
        naive_results_df['answer_faithfulness'].std(),
        naive_results_df['context_precision'].std(),
        naive_results_df['context_recall'].std(),
        naive_results_df['context_f1'].std()
    ],
    'Min': [
        naive_results_df['context_relevance'].min(),
        naive_results_df['answer_relevance'].min(),
        naive_results_df['answer_faithfulness'].min(),
        naive_results_df['context_precision'].min(),
        naive_results_df['context_recall'].min(),
        naive_results_df['context_f1'].min()
    ],
    'Max': [
        naive_results_df['context_relevance'].max(),
        naive_results_df['answer_relevance'].max(),
        naive_results_df['answer_faithfulness'].max(),
        naive_results_df['context_precision'].max(),
        naive_results_df['context_recall'].max(),
        naive_results_df['context_f1'].max()
    ]
})

print("\nNAIVE Summary:")
print(naive_summary.to_string(index=False))

# Enhanced summary
enhanced_summary = pd.DataFrame({
    'Metric': [
        'Context Relevance',
        'Answer Relevance',
        'Answer Faithfulness',
        'Context Precision',
        'Context Recall',
        'Context F1'
    ],
    'Mean': [
        enhanced_results_df['context_relevance'].mean(),
        enhanced_results_df['answer_relevance'].mean(),
        enhanced_results_df['answer_faithfulness'].mean(),
        enhanced_results_df['context_precision'].mean(),
        enhanced_results_df['context_recall'].mean(),
        enhanced_results_df['context_f1'].mean()
    ],
    'Std': [
        enhanced_results_df['context_relevance'].std(),
        enhanced_results_df['answer_relevance'].std(),
        enhanced_results_df['answer_faithfulness'].std(),
        enhanced_results_df['context_precision'].std(),
        enhanced_results_df['context_recall'].std(),
        enhanced_results_df['context_f1'].std()
    ],
    'Min': [
        enhanced_results_df['context_relevance'].min(),
        enhanced_results_df['answer_relevance'].min(),
        enhanced_results_df['answer_faithfulness'].min(),
        enhanced_results_df['context_precision'].min(),
        enhanced_results_df['context_recall'].min(),
        enhanced_results_df['context_f1'].min()
    ],
    'Max': [
        enhanced_results_df['context_relevance'].max(),
        enhanced_results_df['answer_relevance'].max(),
        enhanced_results_df['answer_faithfulness'].max(),
        enhanced_results_df['context_precision'].max(),
        enhanced_results_df['context_recall'].max(),
        enhanced_results_df['context_f1'].max()
    ]
})

print("\nENHANCED Summary:")
print(enhanced_summary.to_string(index=False))

# Save summaries
naive_summary_path = os.path.join(output_dir, "ares_naive_top1_summary.csv")
naive_summary.to_csv(naive_summary_path, index=False)
print(f"\nSaved naive summary: {naive_summary_path}")

enhanced_summary_path = os.path.join(output_dir, "ares_enhanced_top1_summary.csv")
enhanced_summary.to_csv(enhanced_summary_path, index=False)
print(f"Saved enhanced summary: {enhanced_summary_path}")

# Create side-by-side comparison
comparison_df = pd.DataFrame({
    'Metric': [
        'Context Relevance',
        'Answer Relevance',
        'Answer Faithfulness',
        'Context Precision',
        'Context Recall',
        'Context F1'
    ],
    'Naive_Mean': [
        naive_results_df['context_relevance'].mean(),
        naive_results_df['answer_relevance'].mean(),
        naive_results_df['answer_faithfulness'].mean(),
        naive_results_df['context_precision'].mean(),
        naive_results_df['context_recall'].mean(),
        naive_results_df['context_f1'].mean()
    ],
    'Enhanced_Mean': [
        enhanced_results_df['context_relevance'].mean(),
        enhanced_results_df['answer_relevance'].mean(),
        enhanced_results_df['answer_faithfulness'].mean(),
        enhanced_results_df['context_precision'].mean(),
        enhanced_results_df['context_recall'].mean(),
        enhanced_results_df['context_f1'].mean()
    ]
})

comparison_df['Improvement'] = comparison_df['Enhanced_Mean'] - comparison_df['Naive_Mean']
comparison_df['Improvement_Pct'] = (comparison_df['Improvement'] / comparison_df['Naive_Mean'] * 100).round(2)

print("\n" + "="*80)
print("NAIVE vs ENHANCED COMPARISON")
print("="*80)
print(comparison_df.to_string(index=False))

comparison_path = os.path.join(output_dir, "ares_naive_vs_enhanced_comparison.csv")
comparison_df.to_csv(comparison_path, index=False)
print(f"\n Saved comparison: {comparison_path}")


print("\n" + "="*80)
print("EVALUATION COMPLETE!")
print("="*80)
print(f"\nAll results saved in: {output_dir}/")
print("\nFiles created:")
print("  NAIVE:")
print("    - ares_naive_top1_detailed.csv (per-question scores)")
print("    - ares_naive_top1_averages.csv (average scores)")
print("    - ares_naive_top1_averages.json (JSON format)")
print("    - ares_naive_top1_summary.csv (statistics)")
print("\n  ENHANCED:")
print("    - ares_enhanced_top1_detailed.csv (per-question scores)")
print("    - ares_enhanced_top1_averages.csv (average scores)")
print("    - ares_enhanced_top1_averages.json (JSON format)")
print("    - ares_enhanced_top1_summary.csv (statistics)")
print("\n  COMPARISON:")
print("    - ares_naive_vs_enhanced_comparison.csv (side-by-side)")

🚀 ARES-STYLE EVALUATION WITH CONTEXT PRECISION & RECALL

Loading CSV files...
✅ Loaded Naive: 918 rows
✅ Loaded Enhanced: 918 rows

Naive Columns: ['id', 'question', 'context', 'answer', 'strategy']
Enhanced Columns: ['id', 'question', 'rewritten_question', 'context', 'answer', 'strategy']

Naive Preview:
   id                                           question  \
0   0  Was Abraham Lincoln the sixteenth President of...   
1   2  Did Lincoln sign the National Banking Act of 1...   

                                             context answer  \
0                              Young Abraham Lincoln     no   
1  Lincoln believed in the Whig theory of the pre...    Yes   

                strategy  
0  384d_top1_instruction  
1  384d_top1_instruction  

Enhanced Preview:
   id                                           question  \
0   0  Was Abraham Lincoln the sixteenth President of...   
1   2  Did Lincoln sign the National Banking Act of 1...   

                                  rewritt

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded: all-MiniLM-L6-v2

🔍 Computing metrics for NAIVE approach...


Evaluating Naive: 100%|██████████| 918/918 [01:17<00:00, 11.91it/s]


✅ Naive scoring completed!

🔍 Computing metrics for ENHANCED approach...


Evaluating Enhanced: 100%|██████████| 918/918 [01:17<00:00, 11.85it/s]

✅ Enhanced scoring completed!

📊 EVALUATION RESULTS (ALL METRICS)

NAIVE Average Scores:
  Context Relevance:     0.6675
  Answer Relevance:      0.3393
  Answer Faithfulness:   0.3460
  Context Precision:     0.2152
  Context Recall:        0.2692
  Context F1:            0.1813
  Number of Queries:     918

ENHANCED Average Scores:
  Context Relevance:     0.6626
  Answer Relevance:      0.3440
  Answer Faithfulness:   0.3495
  Context Precision:     0.2109
  Context Recall:        0.2336
  Context F1:            0.1662
  Number of Queries:     918

📈 IMPROVEMENT (Enhanced vs Naive):
  Context Relevance        : -0.0049 (-0.73%)
  Answer Relevance         : +0.0047 (+1.39%)
  Answer Faithfulness      : +0.0034 (+0.99%)
  Context Precision        : -0.0043 (-2.01%)
  Context Recall           : -0.0355 (-13.20%)
  Context F1               : -0.0151 (-8.33%)

📈 Score Distributions (NAIVE):

Context Relevance:
count    918.000000
mean       0.667495
std        0.115232
min        0.27728




# RAGAS testing

In [7]:
import os
import pandas as pd
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, context_precision, context_recall, answer_relevancy
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import json

# ----------------------------------------------------------------------------
# 0. Load CSVs
# ----------------------------------------------------------------------------
print("Loading CSV files...")
naive_top1_instruction = pd.read_csv("results/384d_top1_instruction_results.csv")
enhanced_top1 = pd.read_csv("results/384d_enhanced_top1_results.csv")

print(f"Naive rows (before sample): {len(naive_top1_instruction)}")
print(f"Enhanced rows (before sample): {len(enhanced_top1)}")

# ----------------------------------------------------------------------------
# 0a. Subsample for quick testing (30 rows)
# ----------------------------------------------------------------------------
naive_top1_instruction = naive_top1_instruction.sample(n=30, random_state=42).reset_index(drop=True)
enhanced_top1 = enhanced_top1.sample(n=30, random_state=42).reset_index(drop=True)

print(f"Naive rows (after sample): {len(naive_top1_instruction)}")
print(f"Enhanced rows (after sample): {len(enhanced_top1)}")

# ----------------------------------------------------------------------------
# 1. Convert to RAGAS format
# ----------------------------------------------------------------------------
def to_ragas_format(df: pd.DataFrame) -> Dataset:
    ragas_data = {
        "question": df["question"].tolist(),
        "answer": df["answer"].tolist(),
        "contexts": df["context"].apply(lambda x: [x]).tolist(),
        "ground_truth": df["answer"].tolist()
    }
    ragas_df = pd.DataFrame(ragas_data)
    print(f"\nRAGAS format preview:\n{ragas_df.head(2)}")
    return Dataset.from_pandas(ragas_df)

print("\nConverting naive data to RAGAS format...")
ragas_naive = to_ragas_format(naive_top1_instruction)

print("\nConverting enhanced data to RAGAS format...")
ragas_enhanced = to_ragas_format(enhanced_top1)

# ----------------------------------------------------------------------------
# 2. Setup LLM + embeddings
# ----------------------------------------------------------------------------
print("\nSetting up OpenAI models...")
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    api_key=os.environ["OPENAI_API_KEY"],
    n=3   # avoid warnings: ragas expects multiple generations
)

embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    api_key=os.environ["OPENAI_API_KEY"]
)

metrics = [faithfulness, context_precision, context_recall, answer_relevancy]

# ----------------------------------------------------------------------------
# 3. Run evaluation, save per-row + summary
# ----------------------------------------------------------------------------
def run_evaluation(dataset, name="Dataset"):
    print(f"\n{name} - Running evaluation on {len(dataset)} rows...")
    try:
        result = evaluate(
            dataset,
            metrics=metrics,
            llm=llm,
            embeddings=embeddings
        )

        # Convert to DataFrame
        df = result.to_pandas()

        # Save per-row results (includes questions + metrics)
        out_csv = f"results/{name.lower()}_per_row.csv"
        df.to_csv(out_csv, index=False)
        print(f"{name}: per-row results saved → {out_csv}")

        # Compute averages on only metric columns
        metric_cols = [c for c in df.columns if c in ["faithfulness", "context_precision", "context_recall", "answer_relevancy"]]
        summary = df[metric_cols].mean().to_dict()

        # Save summary
        out_json = f"results/{name.lower()}_summary.json"
        with open(out_json, "w") as f:
            json.dump(summary, f, indent=4)
        print(f"{name}: summary saved → {out_json}")

        return summary

    except Exception as e:
        print(f"{name} evaluation failed: {e}")
        return None

# ----------------------------------------------------------------------------
# 4. Run Naive + Enhanced
# ----------------------------------------------------------------------------
print("\n" + "="*80)
print("Running RAGAS Evaluation on NAIVE...")
print("="*80)
results_naive = run_evaluation(ragas_naive, "Naive")

print("\n" + "="*80)
print("Running RAGAS Evaluation on ENHANCED...")
print("="*80)
results_enhanced = run_evaluation(ragas_enhanced, "Enhanced")

# ----------------------------------------------------------------------------
# 5. Show results
# ----------------------------------------------------------------------------
print("\n" + "="*80)
print("FINAL RESULTS")
print("="*80)

if results_naive:
    print("\nNaive Results (averages):")
    for k, v in results_naive.items():
        print(f"{k}: {v:.4f}")
else:
    print("Naive evaluation failed completely")

if results_enhanced:
    print("\nEnhanced Results (averages):")
    for k, v in results_enhanced.items():
        print(f"{k}: {v:.4f}")
else:
    print("Enhanced evaluation failed completely")


Loading CSV files...
Naive rows (before sample): 918
Enhanced rows (before sample): 918
Naive rows (after sample): 30
Enhanced rows (after sample): 30

Converting naive data to RAGAS format...

RAGAS format preview:
                           question                             answer  \
0  What is actually black in color?  A melanistic morph of the leopard   
1   When did he become a professor?                               1833   

                                            contexts  \
0  [A melanistic morph of the leopard occurs part...   
1  [He was elected a member of the Royal Society ...   

                        ground_truth  
0  A melanistic morph of the leopard  
1                               1833  

Converting enhanced data to RAGAS format...

RAGAS format preview:
                           question   answer  \
0  What is actually black in color?  leopard   
1   When did he become a professor?     1833   

                                            contexts ground_tr

Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[10]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[16]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[24]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[42]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[51]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[52]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[64]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[82]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[84]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[88]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[90]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[95]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[96]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[105]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[113]: TimeoutError()
ERROR:ragas.executor:Ex

Naive: per-row results saved → results/naive_per_row.csv
Naive: summary saved → results/naive_summary.json

Running RAGAS Evaluation on ENHANCED...

Enhanced - Running evaluation on 30 rows...


Evaluating:   0%|          | 0/120 [00:00<?, ?it/s]

ERROR:ragas.executor:Exception raised in Job[9]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[10]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[8]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[16]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[32]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[43]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[45]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[52]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[56]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[62]: TimeoutError()
ERROR:ragas.executor:Exception raised in Job[115]: TimeoutError()


Enhanced: per-row results saved → results/enhanced_per_row.csv
Enhanced: summary saved → results/enhanced_summary.json

FINAL RESULTS

Naive Results (averages):
faithfulness: 0.8261
context_precision: 0.7778
context_recall: 0.6538
answer_relevancy: 0.3043

Enhanced Results (averages):
faithfulness: 0.6667
context_precision: 0.7500
context_recall: 0.6429
answer_relevancy: 0.2168
