# 4 ‚Äì Advanced Retrieval: Hybrid, Reranking, Finetuning


In [None]:
# ‚öôÔ∏è Global Config & Services (using centralized modules)

import json
import sys
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv

# Add parent directory to path and change to project root
import os

# Get the notebook's current directory and find project root
notebook_dir = Path.cwd()
if notebook_dir.name == "notebooks":
    project_root = notebook_dir.parent
else:
    project_root = notebook_dir

# Change to project root and add to path
os.chdir(project_root)
sys.path.insert(0, str(project_root))

print(f"üìÇ Working directory: {os.getcwd()}")

from src.services.llm_services import (
    load_config,
    get_llm,
    get_text_embeddings,
    validate_api_keys,
    print_config_summary
)

# Load environment variables
load_dotenv()

# Load configuration from config.yaml (now we're in project root)
config = load_config("src/config/config.yaml")

# Validate API keys
validate_api_keys(config, verbose=True)

# Print summary
print_config_summary(config)


‚úÖ Config loaded:
  LLM: openrouter (openai/gpt-4o-mini)
  Embeddings: sbert / sentence-transformers/all-MiniLM-L6-v2
  Temperature: 0.2
  Artifacts: ./artifacts




In [2]:
# Initialize LLM, Embeddings, and Reranker
from sentence_transformers import CrossEncoder

llm = get_llm(config)
embeddings = get_text_embeddings(config)

# Reranker (specific to this notebook)
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

print(f"‚úÖ LLM: {config['llm_provider']} / {config.get('openrouter_model', config.get('llm_model'))}")
print(f"‚úÖ Embeddings: {config['text_emb_model']}")
print(f"‚úÖ Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2")

# Verify API key with test completion
print("\nüîç Testing LLM API connection...")
try:
    test_response = llm.invoke("Say 'API working!' if you can read this.")
    test_msg = test_response.content if hasattr(test_response, 'content') else str(test_response)
    print(f"‚úÖ LLM API verified: {test_msg[:50]}")
except Exception as e:
    print(f"‚ùå LLM API test failed: {e}")
    print("‚ö†Ô∏è  Please check your .env file and API key configuration.")


  return HuggingFaceEmbeddings(


‚úÖ LLM: openrouter / gpt-4o-mini
‚úÖ Embeddings: sentence-transformers/all-MiniLM-L6-v2
‚úÖ Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2

üîç Testing LLM API connection...
‚úÖ LLM API verified: API working!


---

## Step 1: Load or Create Data


In [3]:
from langchain.schema import Document
import re

# Load corpus dynamically from raw_text files
text_dir = Path(config["data_root"]) / "raw_text"

def load_and_chunk_text_files(directory: Path, chunk_size: int = 500):
    """Load text files and chunk them into manageable paragraphs."""
    corpus = []
    
    for txt_file in directory.glob("*.txt"):
        print(f"  Loading: {txt_file.name}")
        content = txt_file.read_text(encoding='utf-8')
        
        # Split by double newlines (paragraphs) or section markers
        paragraphs = re.split(r'\n\n+|‚∏ª', content)
        
        for para in paragraphs:
            # Clean and normalize
            para = para.strip()
            
            # Skip very short paragraphs, headers, or empty lines
            if len(para) < 50 or para.startswith('‚Ä¢') or para.startswith('#'):
                continue
            
            # Remove excessive whitespace and bullet points
            para = re.sub(r'\s+', ' ', para)
            para = re.sub(r'^\s*[‚Ä¢\-]\s*', '', para)
            
            # Skip if still too short after cleaning
            if len(para) < 100:
                continue
                
            corpus.append(para)
    
    return corpus

print("üìö Loading dermatology corpus from raw_text files...")
corpus = load_and_chunk_text_files(text_dir)

# Create documents with metadata
documents = [
    Document(
        page_content=text, 
        metadata={
            'doc_id': i, 
            'source': 'dermatology_corpus',
            'length': len(text)
        }
    ) 
    for i, text in enumerate(corpus)
]

print(f"‚úÖ Loaded {len(documents)} dermatology documents from text files")

# Check if loaded successfully - add fallback if empty
if len(documents) == 0:
    print("‚ö†Ô∏è  No documents from raw_text. Creating sample corpus...")
    sample_texts = [
        "Eczema (atopic dermatitis) is a chronic inflammatory skin condition. Treatment includes daily moisturizing, topical corticosteroids during flare-ups, and avoiding triggers.",
        "Psoriasis is an autoimmune condition causing rapid skin cell turnover, resulting in thick, silvery scales. Common treatments include topical corticosteroids, phototherapy, and systemic medications.",
        "Fungal infections (tinea) such as ringworm are caused by dermatophytes. Treatment involves topical antifungal creams like terbinafine applied for 2-4 weeks.",
        "Acne vulgaris occurs when hair follicles become clogged. Treatment options include topical retinoids, benzoyl peroxide, and oral antibiotics for severe cases.",
        "Contact dermatitis results from skin exposure to irritants or allergens. Management involves identifying and avoiding triggers.",
        "Rosacea causes facial redness and visible blood vessels. Treatment includes avoiding triggers and topical medications like metronidazole.",
        "Seborrheic dermatitis causes scaly patches on the scalp. Treatment includes medicated shampoos containing ketoconazole.",
        "Vitiligo causes loss of skin pigmentation. Management includes sun protection, topical corticosteroids, and phototherapy.",
    ]
    documents = [
        Document(page_content=text, metadata={"doc_id": i, "source": "sample_corpus", "length": len(text)})
        for i, text in enumerate(sample_texts)
    ]
    print(f"‚úÖ Created {len(documents)} sample documents")

if len(documents) > 0:
    avg_len = sum(len(d.page_content) for d in documents) // len(documents)
    print(f"  Average length: {avg_len} chars")
    print(f"  Topics: eczema, psoriasis, fungal infections, treatments")
    print(f"\nSample: {documents[0].page_content[:120]}...")


üìö Loading dermatology corpus from raw_text files...
  Loading: Understanding Skin Diseases.txt
  Loading: skin-care habits.txt
‚úÖ Loaded 34 dermatology documents from text files
  Average length: 348 chars
  Topics: eczema, psoriasis, fungal infections, treatments

Sample: Sure ‚Äî here‚Äôs a detailed and comprehensive overview of skin diseases, written in an informative, medically accurate styl...


---

## Step 2: Dense Retrieval (ChromaDB)

Build a vector store using dense embeddings.


In [4]:
from langchain_chroma import Chroma

chroma_root = Path(config["artifacts_root"]) / "chroma"
chroma_root.mkdir(parents=True, exist_ok=True)

print("üîµ Building dense vector store...")

dense_vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    collection_name="advanced_dense",
    persist_directory=str(chroma_root / "advanced_dense"),
)

print(f"‚úÖ Dense index built: {len(documents)} docs")

# Test dense retrieval
query = "What are treatments for eczema?"
dense_results = dense_vectorstore.similarity_search(query, k=3)

print(f"\nüîç Dense search: '{query}'")
for i, doc in enumerate(dense_results, 1):
    print(f"  [{i}] {doc.page_content[:100]}...")

üîµ Building dense vector store...
‚úÖ Dense index built: 34 docs

üîç Dense search: 'What are treatments for eczema?'
  [1] Eczema (atopic dermatitis) ‚Ä¢ Core remedies: daily emollients, short lukewarm baths/showers, fragranc...
  [2] Eczema (atopic dermatitis) ‚Ä¢ Core remedies: daily emollients, short lukewarm baths/showers, fragranc...
  [3] Treatment depends on the underlying cause and may include: ‚Ä¢ Topical medications: Corticosteroids, a...


---

## Step 3: Sparse Retrieval (BM25)

Use BM25 for keyword-based retrieval.


In [None]:
from rank_bm25 import BM25Okapi
import numpy as np

print("üü† Building BM25 index...")

# Tokenize corpus
tokenized_corpus = [doc.page_content.lower().split() for doc in documents]
bm25 = BM25Okapi(tokenized_corpus)

print(f"‚úÖ BM25 index built")

def bm25_search(query: str, top_k: int = 3):
    """
    Search using BM25 (sparse retrieval algorithm).
    
    Args:
        query: Search query string
        top_k: Number of top results to return
        
    Returns:
        List of dictionaries with doc, score, and doc_id
    """
    ### START CODE HERE ### (‚âà 12-15 lines)
    # YOUR CODE HERE
    # HINTS:
    # 1. Tokenize the query: query.lower().split()
    # 2. Get BM25 scores for all documents: bm25.get_scores(tokenized_query)
    # 3. Find top-k indices using np.argsort():
    #    - np.argsort(scores) gives indices sorted ascending
    #    - [::-1] reverses to descending order
    #    - [:top_k] takes first top_k indices
    # 4. Create an empty results list
    # 5. Loop through top_indices
    # 6. For each idx, append a dictionary with:
    #    - "doc": documents[idx]
    #    - "score": float(scores[idx])
    #    - "doc_id": idx
    # 7. Return the results list
    
    raise NotImplementedError("Complete the bm25_search function")
    ### END CODE HERE ###

# Test BM25
bm25_results = bm25_search(query, top_k=3)

print(f"\nüîç BM25 search: '{query}'")
for i, res in enumerate(bm25_results, 1):
    print(f"  [{i}] (score: {res['score']:.2f}) {res['doc'].page_content[:100]}...")


üü† Building BM25 index...
‚úÖ BM25 index built

üîç BM25 search: 'What are treatments for eczema?'
  [1] (score: 3.76) Actinic keratoses, BCC, SCC, melanoma ‚Ä¢ What helps: prevention & early detection. Follow the ABCDE s...
  [2] (score: 2.61) Urticaria (hives) ‚Ä¢ What helps: for most, second-generation oral antihistamines (non-sedating) are f...
  [3] (score: 2.21) 2) Fungal (tinea/ringworm, athlete‚Äôs foot, jock itch) ‚Ä¢ What helps at home: OTC antifungals (creams,...


---

## Step 4: Hybrid Fusion (Dense + BM25)

Combine dense and sparse retrieval using Reciprocal Rank Fusion (RRF).


In [None]:
from typing import List

def rrf_fusion(dense_docs: List, bm25_results: List, k: int = 60) -> List:
    """
    Reciprocal Rank Fusion - combines dense and sparse retrieval.
    
    Args:
        dense_docs: Results from dense (vector) retrieval
        bm25_results: Results from BM25 (sparse) retrieval
        k: Constant for RRF formula (default 60)
        
    Returns:
        Fused results sorted by RRF score
    """
    ### START CODE HERE ### (‚âà 20-25 lines)
    # YOUR CODE HERE
    # HINTS:
    # 1. Initialize an empty dictionary: rrf_scores = {}
    #
    # 2. Add dense scores:
    #    - Loop through dense_docs with enumerate(dense_docs, 1) to get rank and doc
    #    - Get doc_id from doc.metadata["doc_id"]
    #    - Calculate RRF score: 1.0 / (k + rank)
    #    - Add to rrf_scores[doc_id] (initialize to 0.0 if not exists)
    #    - Use .get(doc_id, 0.0) to handle missing keys
    #
    # 3. Add BM25 scores:
    #    - Loop through bm25_results with enumerate(bm25_results, 1) to get rank and res
    #    - Get doc_id from res["doc_id"]
    #    - Calculate RRF score: 1.0 / (k + rank)
    #    - Add to rrf_scores[doc_id]
    #
    # 4. Sort by RRF score:
    #    - Use sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    #    - This gives list of (doc_id, score) tuples sorted by score
    #
    # 5. Build results list:
    #    - Loop through sorted_ids
    #    - For each (doc_id, score), append dictionary with:
    #      - "doc": documents[doc_id]
    #      - "score": score
    #      - "doc_id": doc_id
    #
    # 6. Return fused_docs list
    
    raise NotImplementedError("Complete the rrf_fusion function")
    ### END CODE HERE ###

# Test hybrid fusion
fused_results = rrf_fusion(dense_results, bm25_results)[:3]

print(f"üîÄ Hybrid (RRF) search: '{query}'")
for i, res in enumerate(fused_results, 1):
    print(f"  [{i}] (RRF: {res['score']:.3f}) {res['doc'].page_content[:100]}...")


üîÄ Hybrid (RRF) search: 'What are treatments for eczema?'
  [1] (RRF: 0.033) Eczema (atopic dermatitis) ‚Ä¢ Core remedies: daily emollients, short lukewarm baths/showers, fragranc...
  [2] (RRF: 0.016) Actinic keratoses, BCC, SCC, melanoma ‚Ä¢ What helps: prevention & early detection. Follow the ABCDE s...
  [3] (RRF: 0.016) Urticaria (hives) ‚Ä¢ What helps: for most, second-generation oral antihistamines (non-sedating) are f...


---

## Step 5: Reranking with Cross-Encoder

Refine results using a cross-encoder for more accurate relevance scoring.


In [None]:
def rerank(query: str, results: List, top_k: int = 3):
    """
    Rerank results using a cross-encoder for more accurate relevance scoring.
    
    Args:
        query: Search query
        results: List of initial results to rerank
        top_k: Number of top results to return after reranking
        
    Returns:
        Reranked results with rerank_score added
    """
    ### START CODE HERE ### (‚âà 12-15 lines)
    # YOUR CODE HERE
    # HINTS:
    # 1. Prepare query-document pairs for the cross-encoder:
    #    - Create list of [query, document_content] pairs
    #    - Use list comprehension: [[query, res["doc"].page_content] for res in results]
    #    - Store in variable: pairs
    #
    # 2. Get scores from the cross-encoder:
    #    - Use: reranker.predict(pairs)
    #    - This returns array of relevance scores
    #
    # 3. Add rerank scores to results:
    #    - Loop through results with enumerate to get index i and result res
    #    - Add new field: res["rerank_score"] = float(scores[i])
    #
    # 4. Sort by rerank_score:
    #    - Use: sorted(results, key=lambda x: x["rerank_score"], reverse=True)
    #    - Take top_k: [:top_k]
    #
    # 5. Return the reranked results
    
    raise NotImplementedError("Complete the rerank function")
    ### END CODE HERE ###

# Test reranking
reranked_results = rerank(query, fused_results[:6], top_k=3)

print(f"üèÜ Reranked results: '{query}'")
for i, res in enumerate(reranked_results, 1):
    print(f"  [{i}] (rerank: {res['rerank_score']:.3f}) {res['doc'].page_content[:100]}...")


üèÜ Reranked results: 'What are treatments for eczema?'
  [1] (rerank: 4.700) Eczema (atopic dermatitis) ‚Ä¢ Core remedies: daily emollients, short lukewarm baths/showers, fragranc...
  [2] (rerank: -1.724) Actinic keratoses, BCC, SCC, melanoma ‚Ä¢ What helps: prevention & early detection. Follow the ABCDE s...
  [3] (rerank: -4.781) Urticaria (hives) ‚Ä¢ What helps: for most, second-generation oral antihistamines (non-sedating) are f...


---

## Bonus Exercise: Complete Hybrid RAG Pipeline

**Challenge:** Combine all techniques into a single end-to-end pipeline!

In [None]:
def hybrid_rag_pipeline(query: str, dense_top_n: int = 10, bm25_top_n: int = 10, rerank_top_n: int = 6, final_top_k: int = 3):
    """
    Complete hybrid RAG pipeline combining all advanced techniques.
    
    Pipeline: Dense Retrieval ‚Üí BM25 Retrieval ‚Üí RRF Fusion ‚Üí Reranking ‚Üí LLM Generation
    
    Args:
        query: User question
        dense_top_n: Number of results from dense retrieval
        bm25_top_n: Number of results from BM25
        rerank_top_n: Number of fused results to rerank
        final_top_k: Final number of chunks to use for generation
        
    Returns:
        Dictionary with query, answer, and retrieved_docs
    """
    ### START CODE HERE ### (‚âà 25-30 lines)
    # YOUR CODE HERE
    # HINTS:
    # 1. Dense retrieval:
    #    - Use: dense_vectorstore.similarity_search(query, k=dense_top_n)
    #    - Store in: dense_results
    #
    # 2. Sparse (BM25) retrieval:
    #    - Use: bm25_search(query, top_k=bm25_top_n)
    #    - Store in: bm25_results
    #
    # 3. Fusion:
    #    - Use: rrf_fusion(dense_results, bm25_results)
    #    - Take top rerank_top_n: [:rerank_top_n]
    #    - Store in: fused_results
    #
    # 4. Reranking:
    #    - Use: rerank(query, fused_results, top_k=final_top_k)
    #    - Store in: reranked_results
    #
    # 5. Build context from reranked results:
    #    - Extract text from each result: res["doc"].page_content
    #    - Join with "\n\n"
    #    - Store in: context
    #
    # 6. Build RAG prompt:
    #    - System instruction + context + question
    #    - Example: f"""Use the context to answer the question...
    #                  Context: {context}
    #                  Question: {query}
    #                  Answer:"""
    #
    # 7. Generate answer:
    #    - Use: llm.invoke(prompt)
    #    - Extract text: response.content if hasattr(response, 'content') else str(response)
    #
    # 8. Return dictionary with:
    #    - "query": query
    #    - "answer": answer
    #    - "retrieved_docs": reranked_results
    #    - "num_dense": len(dense_results)
    #    - "num_bm25": len(bm25_results)
    #    - "num_fused": len(fused_results)
    #    - "num_final": len(reranked_results)
    
    raise NotImplementedError("Complete the hybrid_rag_pipeline function")
    ### END CODE HERE ###

# Test the complete pipeline
print("üöÄ Testing Complete Hybrid RAG Pipeline\n")
test_query = "What are treatments for eczema?"
result = hybrid_rag_pipeline(test_query)

print(f"Query: {test_query}")
print(f"\nPipeline stats:")
print(f"  Dense retrieval: {result.get('num_dense', 0)} docs")
print(f"  BM25 retrieval: {result.get('num_bm25', 0)} docs")
print(f"  After fusion: {result.get('num_fused', 0)} docs")
print(f"  After reranking: {result.get('num_final', 0)} docs")
print(f"\nFinal Answer:\n{result['answer']}")

---

## Summary

**What we covered:**

‚úÖ **Dense Retrieval** - Vector similarity search with ChromaDB  
‚úÖ **Sparse Retrieval** - BM25 keyword matching  
‚úÖ **Hybrid Fusion** - Reciprocal Rank Fusion (RRF) combining both  
‚úÖ **Reranking** - Cross-encoder for refined relevance scoring  
‚úÖ **Dynamic Loading** - Corpus loaded from raw_text files  

**Retrieval Pipeline:**
```
Query ‚Üí Dense (top-N) + BM25 (top-N) ‚Üí Fusion (RRF) ‚Üí Rerank (cross-encoder) ‚Üí Final top-k
```

**When to use each:**
- **BM25**: Keyword/exact match queries
- **Dense**: Semantic/paraphrase queries  
- **Hybrid**: Best of both worlds
- **Reranking**: Highest precision (slower but more accurate)

**Trade-offs:**
- Hybrid + reranking: Better accuracy, higher latency
- Dense only: Fast, good for semantic search
- BM25 only: Fast, good for keyword search

**Artifacts:**
- `./artifacts/chroma/advanced_dense/`
