# Notebook 6: Graph RAG

This notebook demonstrates:
1. Query rephrasing for better retrieval
2. Anchor document retrieval
3. Neighboring chunk expansion
4. Answer composition with citations

In [None]:
import sys
sys.path.append('..')

from src.graph_rag import (
    rephrase_query,
    retrieve_anchors,
    get_neighboring_chunks,
    compose_answer_with_citations,
    GraphRAG,
    create_graph_rag
)
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment so that the
# os.getenv("OPENAI_API_KEY") calls in later cells can pick up the key.
load_dotenv()
# Reached only if every import above succeeded.
print("✓ Imports successful")

## Step 1: Query Rephrasing

In [None]:
# Rephrase a sample question and list the variations that come back.
original_query = "What are the key principles of GDPR?"
rephrased = rephrase_query(original_query)

print(f"Original: {original_query}")
print("\nRephrased variations:")
# Number the variations from 1 for display.
for position, variant in enumerate(rephrased, start=1):
    print(f"  {position}. {variant}")

## Step 2: Anchor Retrieval

In [None]:
# Retrieve anchor documents for the rephrased queries from Step 1.
# vectorstore=None is passed on purpose; per the original note this runs
# against a mock vectorstore -- TODO confirm in src.graph_rag.
anchors = retrieve_anchors(rephrased, vectorstore=None, top_k=2)

print(f"Retrieved {len(anchors)} anchor documents")
for rank, doc in enumerate(anchors, start=1):
    print(f"\nAnchor {rank}:")
    print(f"  Score: {doc['score']:.2f}")
    print(f"  Query: {doc['query']}")
    # Show only the first 100 characters of the content.
    preview = doc['content'][:100]
    print(f"  Content: {preview}...")

## Step 3: Neighbor Expansion

In [None]:
# Fetch the chunks neighboring the Step 2 anchors (context_window=1).
neighbors = get_neighboring_chunks(anchors, context_window=1)

print(f"Retrieved {len(neighbors)} neighboring chunks")
# Preview at most the first three neighbors.
for rank, chunk in enumerate(neighbors[:3], start=1):
    print(f"\nNeighbor {rank}:")
    # 'neighbor_offset' may be absent from the metadata; fall back to 'N/A'.
    offset = chunk['metadata'].get('neighbor_offset', 'N/A')
    print(f"  Offset: {offset}")
    preview = chunk['content'][:80]
    print(f"  Content: {preview}...")

## Step 4: Complete Graph RAG Pipeline

In [None]:
# Build the Graph RAG system from the index path and the API key read from
# the environment.
# NOTE(review): os.getenv returns None when OPENAI_API_KEY is unset -- confirm
# create_graph_rag tolerates that.
graph_rag = create_graph_rag("../faiss_index", os.getenv("OPENAI_API_KEY"))

# Send one question through the whole pipeline with verbose output enabled.
query = "What are the key principles of GDPR?"
result = graph_rag.query(query, verbose=True)

# Report the answer followed by the summary fields of the result dict.
separator = "=" * 60
print(f"\n{separator}")
print(f"Answer:\n{result['answer']}")
print(f"\nConfidence: {result['confidence']:.2%}")
print(f"Anchors: {result['num_anchors']}")
print(f"Neighbors: {result['num_neighbors']}")

## Step 5: Citation Analysis

In [None]:
# Analyze the citations attached to the Step 4 pipeline result.
citations = result['citations']

print(f"Total citations: {len(citations)}")
# A citation without an 'is_anchor' key is treated as a context citation.
anchor_citations = [c for c in citations if c.get('is_anchor')]
print(f"Anchor citations: {len(anchor_citations)}")
print(f"Context citations: {len(citations) - len(anchor_citations)}")

print("\nTop citations:")
# Show at most the first three citations, numbered from 1.
for rank, citation in enumerate(citations[:3], start=1):
    # Read 'is_anchor' with .get(), matching the count above, so a citation
    # that lacks the key is labelled CONTEXT instead of raising KeyError.
    citation_type = 'ANCHOR' if citation.get('is_anchor') else 'CONTEXT'
    print(f"\n{rank}. Source: {citation['source']}")
    print(f"   Type: {citation_type}")
    print(f"   Snippet: {citation['snippet'][:80]}...")

## Step 6: Compare with Baseline

In [None]:
# Run the same question through the baseline RAG and the Graph RAG system
# built in Step 4, then print a few figures for each.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# imports cell at the top.
from src.rag_baseline import BaselineRAG

baseline_rag = BaselineRAG("../faiss_index", os.getenv("OPENAI_API_KEY"))

query = "What rights do individuals have under GDPR?"

print("Baseline RAG:")
baseline_answer = baseline_rag.query(query)
# len() is applied directly to the baseline result -- presumably a string;
# verify against BaselineRAG.query.
baseline_length = len(baseline_answer)
print(f"  Length: {baseline_length}")

print("\nGraph RAG:")
# Reuses the graph_rag instance from Step 4, this time without verbose output.
graph_result = graph_rag.query(query, verbose=False)
graph_length = len(graph_result['answer'])
print(f"  Length: {graph_length}")
print(f"  Citations: {len(graph_result['citations'])}")
print(f"  Confidence: {graph_result['confidence']:.2%}")

## Summary

- ✓ Implemented query rephrasing
- ✓ Retrieved anchor documents
- ✓ Expanded to neighboring chunks
- ✓ Composed answers with citations
- ✓ Compared with baseline RAG

Next: `07_responsible_ai_and_tests.ipynb` for testing and validation