# Improved RAG: Code-Aware Hybrid Retrieval
## Contribution: Hybrid retrieval strategy for WordPress documentation

**Problem:** Baseline RAG treats all content equally, but WordPress queries often need:
- Function names/signatures (exact match)
- Code examples vs concepts (different retrieval strategies)

**Solution:** Hybrid retrieval combining:
1. Semantic search (embeddings) for concepts
2. Keyword/exact match for code/functions
3. Document type boosting (code vs concept)

In [None]:
# Same imports as baseline
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import json
import time
from typing import List, Dict, Tuple
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Documentation corpus (the same data the baseline uses), defined inline
# rather than read from disk.
# Every record is a dict with four string fields:
#   "id"      - unique document identifier ("doc_001" ... "doc_010")
#   "title"   - short heading
#   "content" - body text
#   "type"    - "code" for function-reference entries, "concept" for overviews
wp_docs = [
    {"id": "doc_001", "title": "add_action Function", "content": "The add_action() function is used to hook a function to a specific action. Actions are triggered at specific times during WordPress execution. Syntax: add_action($hook, $function_to_add, $priority, $accepted_args);", "type": "code"},
    {"id": "doc_002", "title": "WordPress Hooks Overview", "content": "Hooks are a way for one piece of code to interact with another piece of code. They make up the foundation for how plugins and themes interact with WordPress Core. There are two types of hooks: Actions and Filters.", "type": "concept"},
    {"id": "doc_003", "title": "wp_enqueue_script Function", "content": "wp_enqueue_script() is the proper way to add JavaScript files to a WordPress site. It prevents conflicts and ensures scripts load in the correct order. Syntax: wp_enqueue_script($handle, $src, $deps, $ver, $in_footer);", "type": "code"},
    {"id": "doc_004", "title": "The Loop in WordPress", "content": "The Loop is PHP code used by WordPress to display posts. Using The Loop, WordPress processes each post to be displayed on the current page and formats it according to specified criteria. The Loop extracts data from each post.", "type": "concept"},
    {"id": "doc_005", "title": "get_post_meta Function", "content": "Retrieve post meta field for a post. Returns the value of a custom field for the specified post. Syntax: get_post_meta($post_id, $key, $single); Returns an array of values if $single is false, or the value itself if true.", "type": "code"},
    {"id": "doc_006", "title": "Custom Post Types", "content": "WordPress can hold and display many different types of content. A Post Type is a way to define the structure and characteristics of different content types. Custom Post Types allow you to create content types beyond posts and pages.", "type": "concept"},
    {"id": "doc_007", "title": "register_post_type Function", "content": "Creates a custom post type. Syntax: register_post_type($post_type, $args); The $args array can contain labels, public visibility, menu position, supports features, and more configuration options.", "type": "code"},
    {"id": "doc_008", "title": "WordPress Security Best Practices", "content": "Always validate and sanitize user input. Use nonces to prevent CSRF attacks. Escape output data. Use prepared statements for database queries. Keep WordPress, themes, and plugins updated.", "type": "concept"},
    {"id": "doc_009", "title": "wp_insert_post Function", "content": "Inserts or updates a post in the database. Syntax: wp_insert_post($postarr, $wp_error); Returns the post ID on success. The $postarr parameter is an array of post data including post_title, post_content, post_status, etc.", "type": "code"},
    {"id": "doc_010", "title": "WordPress REST API", "content": "The WordPress REST API provides an interface for applications to interact with WordPress sites by sending and receiving data as JSON objects. It enables developers to create, read, update, and delete WordPress content from external applications.", "type": "concept"},
]

print("✓ Loaded WordPress docs")

## Improved RAG with Hybrid Retrieval

In [None]:
class ImprovedRAG:
    """Code-aware hybrid retriever over an in-memory list of documents.

    Improvements over the baseline retriever:
    1. Query classification (code vs. concept query).
    2. Hybrid scoring: embedding distance combined with keyword match.
    3. Document-type boosting when the document type matches the query type.

    Documents are plain dicts; the methods below read their ``title``,
    ``content`` and ``type`` keys.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """Load the sentence-embedding model; no documents are indexed yet.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        print(f"Initializing Improved RAG with {model_name}...")
        self.encoder = SentenceTransformer(model_name)
        self.index = None
        self.documents = []
        self.embeddings = None

        # Code function patterns for WordPress.
        # NOTE: the prefix alternation is a *capturing* group, so
        # ``findall`` would return only the prefix ("wp_", "add_", ...).
        # Use ``_function_names`` to get the full identifiers.
        self.wp_function_pattern = re.compile(r'\b(wp_|get_|add_|register_|the_)\w+\b')

    def _function_names(self, text: str) -> set:
        """Return the full WordPress-style function names found in *text*, lowercased."""
        return {m.group(0).lower() for m in self.wp_function_pattern.finditer(text)}

    @staticmethod
    def _tokenize(text: str) -> set:
        """Return the set of lowercase word tokens in *text*, punctuation stripped."""
        return set(re.findall(r'\w+', text.lower()))

    def is_code_query(self, query: str) -> bool:
        """Classify whether *query* is asking about code/functions.

        A query counts as a code query when it contains a WordPress-style
        function name, the words "function" or "syntax", a literal ``()``,
        or a "how to" phrasing together with "enqueue" or "insert".
        """
        lowered = query.lower()
        if self.wp_function_pattern.search(query) is not None:
            return True
        if 'function' in lowered or 'syntax' in lowered or '()' in query:
            return True
        return 'how to' in lowered and ('enqueue' in lowered or 'insert' in lowered)

    def index_documents(self, documents: List[Dict]):
        """Embed *documents* and build the FAISS index used by ``retrieve``.

        Args:
            documents: Non-empty list of document dicts with ``title`` and
                ``content`` keys.

        Raises:
            ValueError: If *documents* is empty (there is nothing to take an
                embedding dimension from).
        """
        if not documents:
            raise ValueError("documents must contain at least one document")

        print("Creating embeddings...")
        start = time.perf_counter()

        self.documents = documents
        texts = [f"{doc['title']} {doc['content']}" for doc in documents]

        self.embeddings = self.encoder.encode(texts, show_progress_bar=True)

        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dimension)
        self.index.add(np.array(self.embeddings).astype('float32'))

        elapsed = time.perf_counter() - start
        print(f"✓ Indexed {len(documents)} documents in {elapsed:.2f}s")

    def keyword_score(self, query: str, doc: Dict) -> float:
        """Calculate the keyword match score of *doc* for *query*.

        The score is the sum of:
        * 2.0 for each distinct function name in the query that appears as a
          whole token in the document's title or content, and
        * the fraction of distinct query tokens that also occur in the
          document (0.0 - 1.0).

        Matching is case-insensitive and ignores punctuation, so
        ``"add_action?"`` in a query matches ``"add_action()"`` in a document.
        """
        doc_tokens = self._tokenize(f"{doc['title']} {doc['content']}")
        query_tokens = self._tokenize(query)

        # Exact function name match (high weight). Full names are compared,
        # not just the "wp_"/"get_" prefix, so unrelated functions that share
        # a prefix do not receive the bonus.
        score = 2.0 * len(self._function_names(query) & doc_tokens)

        # Keyword overlap; max(..., 1) keeps an empty query from dividing by 0.
        score += len(query_tokens & doc_tokens) / max(len(query_tokens), 1)

        return score

    def retrieve(self, query: str, k: int = 3) -> List[Dict]:
        """IMPROVED: Hybrid retrieval with query classification.

        Fetches up to ``2 * k`` nearest neighbours from the vector index,
        rescores them with a weighted mix of semantic and keyword scores
        (keywords dominate for code queries, semantics for concept queries),
        applies a boost when the document type matches the query type, and
        returns the best *k*.

        Args:
            query: Free-text user query.
            k: Maximum number of documents to return.

        Returns:
            Copies of the selected documents, best first, each extended with
            ``score`` (raw index distance), ``final_score`` (hybrid score)
            and ``is_code_query``. Empty when ``k <= 0``.

        Raises:
            RuntimeError: If ``index_documents`` has not been called.
        """
        if self.index is None:
            raise RuntimeError("No documents indexed; call index_documents() first")
        if k <= 0 or not self.documents:
            return []

        # Step 1: Classify query type
        is_code = self.is_code_query(query)
        if is_code:
            # For code queries, weight keywords more heavily and boost code docs
            semantic_weight, keyword_weight = 0.4, 0.6
            boosted_type, boost = 'code', 1.5
        else:
            # For concept queries, weight semantic more heavily and boost concept docs
            semantic_weight, keyword_weight = 0.7, 0.3
            boosted_type, boost = 'concept', 1.3

        # Step 2: Semantic search. Over-fetch for reranking, but never ask for
        # more neighbours than exist: FAISS pads missing results with index -1,
        # which would silently alias to the last document.
        n_candidates = min(k * 2, len(self.documents))
        query_embedding = self.encoder.encode([query])
        distances, indices = self.index.search(
            np.array(query_embedding).astype('float32'),
            n_candidates
        )

        # Step 3: Hybrid scoring
        candidates = []
        for idx, semantic_dist in zip(indices[0], distances[0]):
            if idx < 0:  # defensive: padding entry, no real neighbour
                continue
            doc = self.documents[int(idx)].copy()
            distance = float(semantic_dist)

            # Semantic score (lower distance = better, so invert)
            semantic_score = 1.0 / (1.0 + distance)
            keyword_score = self.keyword_score(query, doc)
            type_boost = boost if doc['type'] == boosted_type else 1.0

            final_score = (
                semantic_weight * semantic_score + keyword_weight * keyword_score
            ) * type_boost

            doc['score'] = distance  # Keep original for logging
            doc['final_score'] = float(final_score)
            doc['is_code_query'] = is_code
            candidates.append(doc)

        # Step 4: Rerank by final score
        candidates.sort(key=lambda x: x['final_score'], reverse=True)

        return candidates[:k]

    def generate_answer(self, query: str, retrieved_docs: List[Dict]) -> str:
        """Build an answer by concatenating the retrieved docs as context.

        *query* is accepted for interface symmetry but not used; the result is
        a fixed header followed by one ``[title]: content`` entry per
        document, separated by blank lines.
        """
        context = "\n\n".join(
            f"[{doc['title']}]: {doc['content']}"
            for doc in retrieved_docs
        )
        return f"Based on the documentation:\n{context}"

print("✓ ImprovedRAG class defined")

## Test Improved RAG

In [None]:
# Initialize improved RAG
# Constructing ImprovedRAG loads the sentence-embedding model; index_documents
# then embeds every entry of wp_docs and builds the FAISS index that
# retrieve() searches.
improved_rag = ImprovedRAG()
improved_rag.index_documents(wp_docs)

In [None]:
# Test same queries as baseline
test_queries = [
    "How do I add a JavaScript file to WordPress?",
    "What is add_action function?",
    "How to create custom post types?",
    "What are WordPress hooks?",
]

# Smoke test: for each query show how it was classified and the top-2
# reranked documents with their hybrid score.
# NOTE: only the first two queries are exercised here (test_queries[:2]);
# the remaining two are defined but not run in this cell.
print("Testing Improved RAG:\n")
for query in test_queries[:2]:
    print(f"Query: {query}")
    is_code = improved_rag.is_code_query(query)
    print(f"  Classified as: {'CODE' if is_code else 'CONCEPT'} query")
    results = improved_rag.retrieve(query, k=2)
    # Ranks are printed 1-based.
    for i, doc in enumerate(results, 1):
        print(f"  {i}. {doc['title']} (final_score: {doc['final_score']:.3f})")
    print()

## Evaluate Improved Performance

In [None]:
def evaluate_retrieval(rag_system, queries_with_expected):
    """Evaluate retrieval quality over a labelled query set.

    Args:
        rag_system: Object exposing ``retrieve(query, k=...)`` that returns a
            ranked list of dicts, each with an ``id`` key.
        queries_with_expected: Iterable of ``(query, expected_ids)`` pairs,
            where ``expected_ids`` holds the ids considered relevant.

    Returns:
        Dict of plain floats:
        ``avg_precision_at_1`` - fraction of queries whose top hit is relevant;
        ``avg_precision_at_3`` - mean of (relevant hits in top 3) / 3;
        ``avg_retrieval_time`` - mean wall-clock seconds per ``retrieve`` call.
        All three are 0.0 when no queries are supplied.
    """
    precision_at_1 = []
    precision_at_3 = []
    retrieval_times = []

    for query, expected_ids in queries_with_expected:
        # perf_counter is monotonic and high-resolution, which matters for
        # calls that finish in a few milliseconds.
        start = time.perf_counter()
        retrieved = rag_system.retrieve(query, k=3)
        elapsed = time.perf_counter() - start

        retrieved_ids = [doc['id'] for doc in retrieved]

        # An empty result list is a miss, not an error.
        top_is_relevant = bool(retrieved_ids) and retrieved_ids[0] in expected_ids
        precision_at_1.append(1.0 if top_is_relevant else 0.0)

        # Denominator stays at k=3 even if fewer documents came back.
        hits = sum(1 for rid in retrieved_ids if rid in expected_ids)
        precision_at_3.append(hits / 3.0)

        retrieval_times.append(elapsed)

    def _mean(values):
        # np.mean([]) yields nan plus a RuntimeWarning; report 0.0 instead.
        return float(np.mean(values)) if values else 0.0

    return {
        'avg_precision_at_1': _mean(precision_at_1),
        'avg_precision_at_3': _mean(precision_at_3),
        'avg_retrieval_time': _mean(retrieval_times),
    }

# Same test dataset
# Each entry is (query, ids of the documents counted as relevant for it).
eval_queries = [
    ("How to enqueue JavaScript in WordPress?", ["doc_003"]),
    ("What is add_action?", ["doc_001"]),
    ("How to create custom post types?", ["doc_006", "doc_007"]),
    ("What are hooks in WordPress?", ["doc_002"]),
    ("How to insert a post?", ["doc_009"]),
]

improved_metrics = evaluate_retrieval(improved_rag, eval_queries)
print("\nImproved RAG Performance:")
print(f"  Precision@1: {improved_metrics['avg_precision_at_1']:.2%}")
print(f"  Precision@3: {improved_metrics['avg_precision_at_3']:.2%}")
# avg_retrieval_time is in seconds; scale to milliseconds for display.
print(f"  Avg Retrieval Time: {improved_metrics['avg_retrieval_time']*1000:.1f}ms")

## Compare Results

In [None]:
# Load baseline results (written by the baseline notebook; must exist on disk).
# Only the 'metrics' entry is read, and it is expected to carry the same three
# keys that evaluate_retrieval() returns.
with open('baseline_results.json', 'r', encoding='utf-8') as f:
    baseline_data = json.load(f)
    baseline_metrics = baseline_data['metrics']

# Create comparison: one row per system, retrieval time converted from
# seconds to milliseconds.
comparison = pd.DataFrame([
    {
        'System': 'Baseline RAG',
        'Precision@1': baseline_metrics['avg_precision_at_1'],
        'Precision@3': baseline_metrics['avg_precision_at_3'],
        'Avg Time (ms)': baseline_metrics['avg_retrieval_time'] * 1000
    },
    {
        'System': 'Improved RAG',
        'Precision@1': improved_metrics['avg_precision_at_1'],
        'Precision@3': improved_metrics['avg_precision_at_3'],
        'Avg Time (ms)': improved_metrics['avg_retrieval_time'] * 1000
    }
])

print("\n" + "="*60)
print("PERFORMANCE COMPARISON")
print("="*60)
print(comparison.to_string(index=False))
print("\nImprovement:")
# The '+' format flag prints the real sign of the delta, so a regression
# shows as "-5.0%" instead of the misleading "+-5.0%" a hard-coded '+' gives.
# Deltas are absolute differences in precision, scaled by 100.
print(f"  Precision@1: {(improved_metrics['avg_precision_at_1'] - baseline_metrics['avg_precision_at_1'])*100:+.1f}%")
print(f"  Precision@3: {(improved_metrics['avg_precision_at_3'] - baseline_metrics['avg_precision_at_3'])*100:+.1f}%")

## Visualize Results

In [None]:
# Side-by-side bar charts: retrieval precision on the left, latency on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
precision_ax, time_ax = axes

# Left panel: Precision@1 / Precision@3 per system
metrics_df = comparison.set_index('System')[['Precision@1', 'Precision@3']]
metrics_df.plot(kind='bar', ax=precision_ax, color=['#2E86AB', '#A23B72'])
precision_ax.set_title('Retrieval Precision Comparison', fontsize=14, fontweight='bold')
precision_ax.set_ylabel('Precision Score')
precision_ax.set_ylim(0, 1.0)  # precision lives in [0, 1]
precision_ax.legend(title='Metric')
precision_ax.grid(axis='y', alpha=0.3)

# Right panel: average retrieval time per system
time_df = comparison.set_index('System')[['Avg Time (ms)']]
time_df.plot(kind='bar', ax=time_ax, color='#F18F01', legend=False)
time_ax.set_title('Average Retrieval Time', fontsize=14, fontweight='bold')
time_ax.set_ylabel('Time (milliseconds)')
time_ax.grid(axis='y', alpha=0.3)

# Write the figure to disk before displaying it.
plt.tight_layout()
plt.savefig('rag_comparison.png', dpi=300, bbox_inches='tight')
print("\n✓ Visualization saved to rag_comparison.png")
plt.show()

## Save Results

In [None]:
# Bundle the improved system's metrics with a record of which features were
# enabled and when the run happened (local time).
improved_results = dict(
    system='Improved RAG (Code-Aware Hybrid)',
    metrics=improved_metrics,
    improvements=dict(
        query_classification=True,
        hybrid_scoring=True,
        document_type_boosting=True,
    ),
    timestamp=time.strftime('%Y-%m-%d %H:%M:%S'),
)

# Persist the metrics as JSON ...
with open('improved_results.json', 'w') as f:
    f.write(json.dumps(improved_results, indent=2))

# ... and the baseline-vs-improved table as CSV.
comparison.to_csv('comparison_results.csv', index=False)

# List every artifact this notebook produces, including the chart saved earlier.
print("✓ Results saved!")
print("\n".join([
    "  - improved_results.json",
    "  - comparison_results.csv",
    "  - rag_comparison.png",
]))