# 03. RAG Librarian (Advanced)

This notebook sets up the Knowledge Base (Weaviate), performs Hybrid Search (Dense + Sparse), Reranking, and Evaluates the pipeline.

In [1]:
# Install dependencies
!pip install -q -U weaviate-client langchain-weaviate langchain-community sentence-transformers rank_bm25 tiktoken
!pip install -q -U langchain-openai langchain-groq langchain-google-genai python-dotenv

In [2]:
import os
import json
import sys
from pathlib import Path
from dotenv import load_dotenv
import weaviate
from weaviate.classes.init import Auth
import weaviate.classes.config as wn
import weaviate.classes.query as wq
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import CrossEncoder

# Add parent directory to path
notebook_dir = Path.cwd()
project_root = notebook_dir.parent if notebook_dir.name == "notebooks" else notebook_dir
sys.path.insert(0, str(project_root))

from src.services.llm_services import get_llm, load_config
from src.utils.cost_tracker import get_accurate_cost

load_dotenv()
config = load_config(os.path.join(project_root, "src/config/config.yaml"))
print("Config loaded.")


Config loaded.


## 1. Connect to Weaviate (v4)

Trying Embedded first, falling back to Local Docker.

In [3]:
print("Attempting to connect to Weaviate (v4)...")

headers = {
    "X-HuggingFace-Api-Key": os.getenv("HF_TOKEN", "")
}

client = None

# 1. Try Embedded
try:
    print("1. Trying Embedded...")
    client = weaviate.connect_to_embedded(headers=headers)
    print(" Weaviate Embedded Started!")
except Exception as e:
    print(f"   Embedded failed/skipped: {e}")
    client = None

# 2. Try Local Docker (Preferred for Windows users with Docker)
if not client:
    print("2. Trying Local Docker (localhost:8080)...")
    try:
        client = weaviate.connect_to_local(
            port=8080,
            grpc_port=50051,
            headers=headers
        )
        print(" Connected to Local Docker!")
    except Exception as e:
        print(f"   Local Docker failed: {e}")

# 3. Try Cloud (WCS) from .env
if not client:
    wcd_url = os.getenv("WEAVIATE_URL")
    wcd_api_key = os.getenv("WEAVIATE_API_KEY")
    if wcd_url and wcd_api_key:
        print(f"3. Trying Weaviate Cloud: {wcd_url}...")
        try:
            client = weaviate.connect_to_wcs(
                cluster_url=wcd_url,
                auth_credentials=Auth.api_key(wcd_api_key),
                headers=headers
            )
            print(" Connected to Weaviate Cloud!")
        except Exception as e:
            print(f"   Cloud connection failed: {e}")

if not client or not client.is_ready():
    raise ConnectionError("CRITICAL: Could not connect to any Weaviate instance (Embedded, Docker, or Cloud). Please check your setup.")

print("Client Ready.")

Attempting to connect to Weaviate (v4)...
1. Trying Embedded...
   Embedded failed/skipped: Windows is not supported with EmbeddedDB. Please upvote this feature request if you want
                 this: https://github.com/weaviate/weaviate/issues/3315
2. Trying Local Docker (localhost:8080)...
 Connected to Local Docker!
Client Ready.


## 2. Prepare Schema & Load Data

In [4]:
# Define Schema
class_name = "FinancialReport"

if client.collections.exists(class_name):
    client.collections.delete(class_name)

# Create Collection with properties
client.collections.create(
    name=class_name,
    vectorizer_config=wn.Configure.Vectorizer.none(), # We embed manually
    properties=[
        wn.Property(name="text", data_type=wn.DataType.TEXT),
        wn.Property(name="source", data_type=wn.DataType.TEXT),
        wn.Property(name="chunk_id", data_type=wn.DataType.INT),
        # Add other metadata fields as needed
        wn.Property(name="category", data_type=wn.DataType.TEXT),
        wn.Property(name="page_number", data_type=wn.DataType.INT),
    ]
)
print(f"Schema '{class_name}' created.")

Schema 'FinancialReport' created.


            Use the `vector_config` argument instead.
            


In [5]:
# Load Embeddings Model
embedding_model_name = config.get('text_emb_model', "sentence-transformers/all-MiniLM-L6-v2")
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
print(f"Loaded Embeddings: {embedding_model_name}")

# Load Data from Processed Directory
CHUNKS_PATH = os.path.join(project_root, config.get('chunks_json_path', 'data/processed/qa_dataset_full.json'))

with open(CHUNKS_PATH, 'r', encoding='utf-8') as f:
    raw_chunks = json.load(f)

collection = client.collections.get(class_name)
print(f"Loading {len(raw_chunks)} chunks...")

with collection.batch.fixed_size(batch_size=100) as batch:
    for i, chunk in enumerate(raw_chunks):
        # Handle both flat list of dicts or list of objects
        # Fix: Look for 'context' (from 01_data_factory) as well as 'chunk_content'/'page_content'
        text = chunk.get("context", chunk.get("chunk_content", chunk.get("page_content", "")))
        
        # Extract metadata if available, otherwise default
        meta = chunk.get("metadata", {})
        source_val = meta.get("source", chunk.get("source", "Unknown"))
        
        if not text: 
            if i < 3: print(f"DEBUG: Skipping empty chunk {i}. Keys: {list(chunk.keys())}")
            continue
        
        vector = embeddings.embed_query(text)
        
        batch.add_object(
            properties={
                "text": text,
                "source": source_val,
                "chunk_id": i,
                "category": meta.get("category", "Text"),
                "page_number": meta.get("page_number", 0)
            },
            vector=vector
        )
        if i % 100 == 0:
            print(f"Imported {i}...")

print(f"Ingestion Complete. Total: {collection.aggregate.over_all(total_count=True).total_count}")

  embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)


Loaded Embeddings: sentence-transformers/all-MiniLM-L6-v2
Loading 3000 chunks...
Imported 0...
Imported 100...
Imported 200...
Imported 300...
Imported 400...
Imported 500...
Imported 600...
Imported 700...
Imported 800...
Imported 900...
Imported 1000...
Imported 1100...
Imported 1200...
Imported 1300...
Imported 1400...
Imported 1500...
Imported 1600...
Imported 1700...
Imported 1800...
Imported 1900...
Imported 2000...
Imported 2100...
Imported 2200...
Imported 2300...
Imported 2400...
Imported 2500...
Imported 2600...
Imported 2700...
Imported 2800...
Imported 2900...
Ingestion Complete. Total: 3000


## 3. Retrieval Strategy: Hybrid + Reranking

In [6]:
# 1. Hybrid Search Function (Weaviate)
def hybrid_search(query, limit=20):
    collection = client.collections.get(class_name)
    query_vector = embeddings.embed_query(query)
    
    response = collection.query.hybrid(
        query=query,
        vector=query_vector,
        alpha=0.5, # 0.5 = Equal weight to Dense and Sparse
        limit=limit,
        fusion_type=wq.HybridFusion.RELATIVE_SCORE,
        return_metadata=wq.MetadataQuery(score=True)
    )
    
    results = []
    for o in response.objects:
        res = o.properties
        res['score'] = o.metadata.score
        results.append(res)
    return results

# 2. Reranker Setup
rerank_model_name = config.get('rerank_settings', {}).get('model', "cross-encoder/ms-marco-MiniLM-L-6-v2")
reranker = CrossEncoder(rerank_model_name)
print(f"Loaded Reranker: {rerank_model_name}")

# 3. Rerank Function
def rerank_results(query, retrieved_docs, top_k=5):
    if not retrieved_docs: return []
    
    pairs = [[query, doc['text']] for doc in retrieved_docs]
    scores = reranker.predict(pairs)
    
    for i, doc in enumerate(retrieved_docs):
        doc['rerank_score'] = float(scores[i])
    
    # Sort by rerank score
    return sorted(retrieved_docs, key=lambda x: x['rerank_score'], reverse=True)[:top_k]

Loaded Reranker: cross-encoder/ms-marco-MiniLM-L-6-v2


## 4. Generation & Evaluation Loop

Using the Golden Test Set to evaluate performance and cost.

In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Initialize LLM
llm = get_llm(config)
print(f"LLM Initialized: {config['llm_provider']}")

# Prompt Template
template = """You are a financial analyst. Answer based ONLY on the context provided.

Context:
{context}

Question: {question}

Answer:"""
prompt = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    return "\n\n".join([d['text'] for d in docs])

def query_system(question):
    # 1. Retrieve & Rerank
    initial_docs = hybrid_search(question, limit=20)
    top_docs = rerank_results(question, initial_docs, top_k=5)
    
    # 2. Generate
    context = format_docs(top_docs)
    chain = prompt | llm | StrOutputParser()
    answer = chain.invoke({"context": context, "question": question})
    
    return answer, top_docs

LLM Initialized: groq


In [8]:
import time

# Load Golden Test Set
TEST_SET_PATH = os.path.join(project_root, config.get('golden_test_set_path', 'data/processed/golden_test_set.jsonl'))
test_data = []

if os.path.exists(TEST_SET_PATH):
    with open(TEST_SET_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            test_data.append(json.loads(line))
else:
    print("Test set not found. Skipping evaluation.")

# Run Evaluation
results = []
print(f"Running Evaluation on {len(test_data[:5])} questions (preview)...")

for item in test_data[:5]: # Cap at 5 for quick test
    q = item['question']
    actual = item['answer']
    
    start = time.time()
    gen_answer, sources = query_system(q)
    latency = time.time() - start
    
    # Calculate Cost
    cost = get_accurate_cost(sources, q, gen_answer, prompt, model_name=config.get('llm_model', 'gpt-4o-mini'))
    
    results.append({
        "question": q,
        "generated": gen_answer,
        "actual": actual,
        "latency": latency,
        "cost": cost
    })
    
    
print("Evaluation Complete.")
print(json.dumps(results[0], indent=2)) # Show sample

Running Evaluation on 5 questions (preview)...


AttributeError: 'dict' object has no attribute 'page_content'

In [None]:
# Save Results
RESULTS_PATH = os.path.join(project_root, config.get('eval_results_path', 'data/results/rag_evaluation_results.json'))
os.makedirs(os.path.dirname(RESULTS_PATH), exist_ok=True)

with open(RESULTS_PATH, 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4)

print(f"Results saved to {RESULTS_PATH}")

Results saved to c:\Development\financial-intelligence-engine\data/results/rag_evaluation_results.json
