# Week 3 Feature 4: Performance Optimization & Caching

This notebook demonstrates:
1. **BM25 Index Caching**: Pre-built index loaded from disk
2. **Query Result Caching**: LRU cache with TTL for repeated queries
3. **Performance Monitoring**: Detailed timing metrics
4. **Cache Effectiveness**: Hit rate analysis

## Step 1: Imports and Setup

In [None]:
import sys
import time
from pathlib import Path

sys.path.append('..')

import matplotlib.pyplot as plt
import numpy as np

from src.vector_store import initialize_chroma_db
from src.qa_chain import ask_question
from src.cache_manager import get_query_cache, get_performance_monitor

print("‚úÖ Imports complete")

## Step 2: Initialize System

In [None]:
# Cache configuration, named here so the limits are easy to find and tune.
CACHE_MAX_SIZE = 1000      # passed as max_size to get_query_cache
CACHE_TTL_SECONDS = 3600   # passed as ttl_seconds; 3600 s = 1 hour

# Load ChromaDB
client, collection = initialize_chroma_db(
    persist_directory="../chroma_db",
    collection_name="documents"
)

doc_count = collection.count()
print(f"üìö Loaded {doc_count:,} documents")

# An empty collection gives the queries below nothing to retrieve, so flag it
# up front rather than letting the timing comparison run on no data.
if doc_count == 0:
    print("WARNING: collection is empty - the benchmark results below will not be meaningful")

# Get cache and performance monitor
cache = get_query_cache(max_size=CACHE_MAX_SIZE, ttl_seconds=CACHE_TTL_SECONDS)
monitor = get_performance_monitor()

print("\n‚úÖ System initialized")

## Step 3: Test Queries (First Run - Cache Miss)

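Run each test query once. On a fresh kernel the cache is empty, so every query is expected to miss and its result is stored for the second run.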

In [None]:
# Test queries as (question, domain) pairs; the domain is passed to
# ask_question as a metadata filter.
test_queries = [
    ("What is the CAN protocol?", "automotive"),
    ("What dresses are available?", "fashion"),
    ("Explain OBD-II diagnostic system", "automotive"),
    ("What are the shirt options?", "fashion"),
    ("What is engine control unit?", "automotive")
]

print("üî• Running queries for the FIRST TIME (cache miss expected)\n")
print("="*80)

# Elapsed seconds per query, in the same order as test_queries.
first_run_times = []

for query, domain in test_queries:
    print(f"\nüìù Query: {query}")
    print(f"üè∑Ô∏è  Domain: {domain}")

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock is
    # adjusted and may be too coarse for short durations.
    start = time.perf_counter()
    result = ask_question(
        collection,
        query,
        n_results=3,
        filter_metadata={"domain": domain},
        use_cache=True
    )
    elapsed = time.perf_counter() - start
    first_run_times.append(elapsed)

    print(f"\n‚è±Ô∏è  Time: {elapsed:.3f}s")
    print("="*80)

print(f"\n‚úÖ First run average: {np.mean(first_run_times):.3f}s")

## Step 4: Cache Statistics After First Run

In [None]:
# Snapshot the cache counters after the first pass and print them as one report.
stats = cache.get_stats()
print(
    "‚ö° Cache Statistics (After First Run):",
    "="*60,
    f"Cache size: {stats['size']}/{stats['max_size']}",
    f"Cache hits: {stats['hits']}",
    f"Cache misses: {stats['misses']}",
    f"Hit rate: {stats['hit_rate_percent']}%",
    f"Total requests: {stats['total_requests']}",
    sep="\n",
)

## Step 5: Test Same Queries Again (Cache Hit)

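Run the same queries a second time with identical arguments. Each one is now expected to be served from the cache, which demonstrates the cache's effectiveness.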

In [None]:
print("‚ö° Running SAME queries again (cache hit expected)\n")
print("="*80)

# Elapsed seconds per query, in the same order as test_queries.
second_run_times = []

for query, domain in test_queries:
    print(f"\nüìù Query: {query}")
    print(f"üè∑Ô∏è  Domain: {domain}")

    # perf_counter() is a monotonic, high-resolution clock. This matters most
    # here: cache hits are expected to be very fast, and time.time() may be
    # too coarse to resolve such short intervals.
    start = time.perf_counter()
    # Same arguments as the first run, so these calls are expected to be
    # answered from the query cache.
    result = ask_question(
        collection,
        query,
        n_results=3,
        filter_metadata={"domain": domain},
        use_cache=True
    )
    elapsed = time.perf_counter() - start
    second_run_times.append(elapsed)

    print(f"\n‚è±Ô∏è  Time: {elapsed:.3f}s")
    print("="*80)

print(f"\n‚úÖ Second run average: {np.mean(second_run_times):.3f}s")

## Step 6: Cache Statistics After Second Run

In [None]:
# Snapshot the cache counters after the repeat pass and print them as one report.
stats = cache.get_stats()
print(
    "‚ö° Cache Statistics (After Second Run):",
    "="*60,
    f"Cache size: {stats['size']}/{stats['max_size']}",
    f"Cache hits: {stats['hits']}",
    f"Cache misses: {stats['misses']}",
    f"Hit rate: {stats['hit_rate_percent']}%",
    f"Total requests: {stats['total_requests']}",
    sep="\n",
)

## Step 7: Performance Comparison Visualization

In [None]:
# Create comparison chart: per-query timings on the left, run averages on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Chart 1: Individual query times (grouped bars, one pair per query)
x = np.arange(len(test_queries))
width = 0.35

axes[0].bar(x - width/2, first_run_times, width, label='First Run (Cache Miss)', color='#e74c3c', alpha=0.8)
axes[0].bar(x + width/2, second_run_times, width, label='Second Run (Cache Hit)', color='#2ecc71', alpha=0.8)

axes[0].set_xlabel('Query Index', fontsize=11, fontweight='bold')
axes[0].set_ylabel('Response Time (seconds)', fontsize=11, fontweight='bold')
axes[0].set_title('Query Response Times: Cache Miss vs Cache Hit', fontsize=12, fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels([f'Q{i+1}' for i in range(len(test_queries))])
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)

# Chart 2: Average comparison
avg_first = np.mean(first_run_times)
avg_second = np.mean(second_run_times)
# A cache-hit average of exactly zero would make the ratio a division by
# zero; report an infinite speedup in that case instead.
speedup = avg_first / avg_second if avg_second > 0 else float('inf')

axes[1].bar(['Cache Miss\n(First Run)', 'Cache Hit\n(Second Run)'], 
           [avg_first, avg_second],
           color=['#e74c3c', '#2ecc71'],
           alpha=0.8)

axes[1].set_ylabel('Average Response Time (seconds)', fontsize=11, fontweight='bold')
axes[1].set_title(f'Average Performance\nSpeedup: {speedup:.1f}x faster', fontsize=12, fontweight='bold')
axes[1].grid(axis='y', alpha=0.3)

# Add value labels on bars. Anchoring the text at the bar top with
# va='bottom' keeps the label just above the bar at any y-scale (a fixed
# data-unit offset is far too large when times are a few milliseconds).
# Three decimals so fast cache hits do not all render as "0.00s".
for i, v in enumerate([avg_first, avg_second]):
    axes[1].text(i, v, f'{v:.3f}s', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
comparison_path = Path('../visualizations/performance_comparison.png')
comparison_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(comparison_path, dpi=300, bbox_inches='tight')
plt.show()

print(f"\nüìä Speedup: {speedup:.1f}x faster with cache")
print(f"üíæ Average cache hit time: {avg_second:.3f}s")
print(f"üî• Average cache miss time: {avg_first:.3f}s")

## Step 8: Performance Monitoring Breakdown

In [None]:
# Pull the aggregated timings recorded by the performance monitor.
perf_stats = monitor.get_stats()

print("‚è±Ô∏è  Performance Breakdown:")
print("="*60)
print(f"Total queries: {perf_stats['queries_count']}")
print(f"\nAverage times (including cached queries):")
print(f"  Total: {perf_stats['avg_total_time']:.3f}s")
print(f"  Search: {perf_stats['avg_search_time']:.3f}s")
print(f"  Reranking: {perf_stats['avg_rerank_time']:.3f}s")
print(f"  Generation: {perf_stats['avg_generation_time']:.3f}s")

# Create breakdown visualization: one bar per pipeline stage
fig, ax = plt.subplots(figsize=(10, 6))

components = ['Search', 'Reranking', 'Generation']
times = [
    perf_stats['avg_search_time'],
    perf_stats['avg_rerank_time'],
    perf_stats['avg_generation_time']
]
colors = ['#3498db', '#9b59b6', '#f39c12']

bars = ax.bar(components, times, color=colors, alpha=0.8)
ax.set_ylabel('Time (seconds)', fontsize=12, fontweight='bold')
ax.set_title('RAG Pipeline Component Performance', fontsize=13, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Add value labels, centered horizontally on each bar and sitting on its top edge
for bar, time_val in zip(bars, times):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{time_val:.3f}s',
            ha='center', va='bottom', fontweight='bold')

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
breakdown_path = Path('../visualizations/performance_breakdown.png')
breakdown_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(breakdown_path, dpi=300, bbox_inches='tight')
plt.show()

## Step 9: Summary and Key Metrics

In [None]:
# Final report. Each print call below emits one section of the summary; the
# values come from the cache statistics and timing averages computed in the
# earlier cells (stats, speedup, avg_first, avg_second).

# Banner
print(
    "\n" + "="*80,
    "üìä PERFORMANCE OPTIMIZATION SUMMARY",
    "="*80,
    sep="\n",
)

# Headline improvements
print(
    "\nüéØ Key Improvements:",
    f"  1. BM25 Index Caching: Pre-built index loads instantly from disk",
    f"  2. Query Result Caching: {speedup:.1f}x faster for repeated queries",
    f"  3. Performance Monitoring: Detailed timing for all components",
    sep="\n",
)

# Cache counters from the most recent stats snapshot
print(
    "\n‚ö° Cache Performance:",
    f"  Cache size: {stats['size']} items",
    f"  Hit rate: {stats['hit_rate_percent']}%",
    f"  Total requests: {stats['total_requests']}",
    sep="\n",
)

# Average latency of each run and the resulting ratio
print(
    "\n‚è±Ô∏è  Response Times:",
    f"  Cache miss (first run): {avg_first:.3f}s",
    f"  Cache hit (second run): {avg_second:.3f}s",
    f"  Speedup: {speedup:.1f}x",
    sep="\n",
)

# Qualitative takeaways and closing rule
print(
    "\nüí° Benefits:",
    "  ‚úÖ Faster responses for repeated queries",
    "  ‚úÖ Reduced LLM API calls (cached results)",
    "  ‚úÖ Better user experience",
    "  ‚úÖ Scalable for high query volumes",
    "\n" + "="*80,
    sep="\n",
)

## Conclusion

Week 3 Feature 4 demonstrates significant performance improvements:

1. **BM25 Index Caching**: The BM25 index is built once and saved to disk, eliminating rebuild time on subsequent runs
2. **Query Result Caching**: LRU cache with 1-hour TTL provides instant responses for repeated queries
3. **Performance Monitoring**: Detailed metrics help identify bottlenecks
4. **Scalability**: System can handle high query volumes efficiently

The caching system provides a significant speedup for repeated queries (typically 5-10x; the value measured in this run is reported in Step 7) while keeping results fresh through TTL expiration.