# Proxy Pattern for LLM Systems: Access Control & Optimization

**Pattern Focus**: Controlling access to LLM services while adding functionality like caching, authentication, and cost tracking.

**Real-World Example**: Enterprise LLM gateway that manages API access, costs, and security across multiple providers.

## 1. Core Proxy Pattern Implementation

In [None]:
from abc import ABC, abstractmethod
import time
import hashlib
from typing import Dict, Optional

# Abstract LLM Service Interface
class LLMService(ABC):
    """Common interface for LLM backends and for the proxies that wrap them."""

    @abstractmethod
    def complete(self, prompt: str, **kwargs) -> str:
        """Return the completion text for ``prompt``.

        Any extra keyword arguments are options interpreted by the concrete
        implementation.
        """
        ...

# Real LLM Service (expensive to call)
class OpenAIService(LLMService):
    """Simulated paid LLM backend that tracks its own call count and spend."""

    def __init__(self):
        # Running totals over every call made through this instance.
        self.call_count = 0
        self.total_cost = 0.0

    def complete(self, prompt: str, **kwargs) -> str:
        """Fake a remote completion call and return a descriptive string.

        Sleeps for half a second to imitate network latency, bumps the call
        counter and charges $0.001 per prompt character. Keyword arguments
        are accepted but not used.
        """
        time.sleep(0.5)  # stand-in for the round trip to the provider

        self.call_count += 1
        request_cost = 0.001 * len(prompt)  # $0.001 per character
        self.total_cost += request_cost

        preview = prompt[:50]
        return f"AI Response to: '{preview}...' (Call #{self.call_count}, Cost: ${request_cost:.3f})"


print("✅ Core components defined")

## 2. Smart Caching Proxy - Up to 80% Cost Reduction on Repeated Prompts

In [None]:
class CachingLLMProxy(LLMService):
    """Proxy that memoizes completions of a wrapped ``LLMService``.

    Results are kept in an in-memory dict keyed by a hash of the prompt and
    the keyword arguments. Entries are never evicted, so the cache grows with
    the number of distinct requests.
    """

    def __init__(self, real_service: LLMService):
        """Wrap ``real_service`` and start with an empty cache and zeroed counters."""
        self._real_service = real_service
        self._cache: Dict[str, str] = {}
        self.cache_hits = 0
        self.cache_misses = 0

    @staticmethod
    def _make_cache_key(prompt: str, kwargs: dict) -> str:
        """Build a cache key that does not depend on keyword-argument order.

        The kwargs are sorted by name before hashing so that
        ``complete(p, a=1, b=2)`` and ``complete(p, b=2, a=1)`` share one
        entry. MD5 is used only as a compact fingerprint, not for security.
        """
        canonical = repr((prompt, sorted(kwargs.items())))
        return hashlib.md5(canonical.encode()).hexdigest()

    def complete(self, prompt: str, **kwargs) -> str:
        """Return a cached completion if available, otherwise call the real service.

        Cache hits are returned with a ``[CACHED] `` prefix; misses return the
        wrapped service's result unchanged and store it for later calls.
        """
        cache_key = self._make_cache_key(prompt, kwargs)

        # Serve from the cache when this exact request has been seen before.
        if cache_key in self._cache:
            self.cache_hits += 1
            return f"[CACHED] {self._cache[cache_key]}"

        # Cache miss - pay for a real call and remember the answer.
        self.cache_misses += 1
        result = self._real_service.complete(prompt, **kwargs)
        self._cache[cache_key] = result

        return result

    def get_cache_stats(self):
        """Return a one-line summary of the hit rate (0% when no calls were made)."""
        total = self.cache_hits + self.cache_misses
        hit_rate = (self.cache_hits / total * 100) if total > 0 else 0
        return f"Cache Hit Rate: {hit_rate:.1f}% ({self.cache_hits}/{total})"


print("✅ Caching proxy ready")

## 3. Demo: Cache Performance Impact

In [None]:
# Create the simulated backend and put the caching proxy in front of it
real_llm = OpenAIService()
cached_llm = CachingLLMProxy(real_llm)

# Test prompts (some repeated so the cache has something to hit)
test_prompts = [
    "Explain machine learning",
    "What is Python?",
    "Explain machine learning",  # Repeat
    "Best practices for API design",
    "What is Python?",  # Repeat
    "Explain machine learning",  # Repeat
]

print("🚀 Testing cached vs direct calls...\n")

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump.
start_time = time.perf_counter()
for i, prompt in enumerate(test_prompts, 1):
    response = cached_llm.complete(prompt)
    print(f"{i}. {response[:80]}...")

duration = time.perf_counter() - start_time

print("\n📊 Results:")
print(f"Total time: {duration:.2f}s")
print(f"Real API calls: {real_llm.call_count}")
print(f"Total cost: ${real_llm.total_cost:.3f}")
print(cached_llm.get_cache_stats())
# Savings = share of requests that never reached the real backend
print(f"\n💰 Cost savings: {(1 - real_llm.call_count/len(test_prompts)) * 100:.1f}%")

## 4. Authentication & Rate Limiting Proxy

In [None]:
class SecureLLMProxy(LLMService):
    """Proxy that adds API-key authentication, rate limiting and audit logging.

    Every request is checked against a fixed set of API keys and a per-key
    fixed-window request counter before it is forwarded to the wrapped
    service. Each outcome, accepted or rejected, is written to the audit log.
    """

    def __init__(self, real_service: LLMService):
        """Wrap ``real_service`` with the built-in demo keys and a 5/minute limit."""
        self._real_service = real_service
        self.valid_api_keys = {"user123", "team456", "admin789"}
        self.rate_limits = {}  # api_key -> {'count', 'last_reset'}
        self.max_requests_per_minute = 5

    def complete(self, prompt: str, api_key: Optional[str] = None, **kwargs) -> str:
        """Authenticate, rate-limit, forward and audit a completion request.

        Returns the wrapped service's result prefixed with ``[AUTHORIZED] ``.

        Raises:
            Exception: if ``api_key`` is not a valid key, or if the key has
                used up its requests for the current window. Exceptions from
                the wrapped service are logged as failures and re-raised.
        """
        # 1. Authentication
        if not self._authenticate(api_key):
            self._log_request(api_key, prompt, success=False)
            raise Exception("❌ Authentication failed: Invalid API key")

        # 2. Rate limiting
        if not self._check_rate_limit(api_key):
            self._log_request(api_key, prompt, success=False)
            raise Exception(f"❌ Rate limit exceeded: Max {self.max_requests_per_minute}/minute")

        # 3. Execute request; a backend failure is audited before propagating
        try:
            result = self._real_service.complete(prompt, **kwargs)
        except Exception:
            self._log_request(api_key, prompt, success=False)
            raise

        # 4. Audit logging
        self._log_request(api_key, prompt, success=True)

        return f"[AUTHORIZED] {result}"

    def _authenticate(self, api_key: Optional[str]) -> bool:
        """Return True only when ``api_key`` is one of the configured keys."""
        # The isinstance guard makes None and unhashable values fail cleanly
        # instead of raising TypeError from the set lookup.
        return isinstance(api_key, str) and api_key in self.valid_api_keys

    def _check_rate_limit(self, api_key: str) -> bool:
        """Count this request against ``api_key``'s window; False when over the limit.

        The counter resets once at least 60 seconds have passed since the
        last reset. Rejected requests do not increase the counter.
        """
        # Monotonic clock: the window length must not be affected by
        # wall-clock adjustments.
        current_time = time.monotonic()

        if api_key not in self.rate_limits:
            self.rate_limits[api_key] = {'count': 0, 'last_reset': current_time}

        user_limits = self.rate_limits[api_key]

        # Reset counter if a minute has passed
        if current_time - user_limits['last_reset'] >= 60:
            user_limits['count'] = 0
            user_limits['last_reset'] = current_time

        # Check limit
        if user_limits['count'] >= self.max_requests_per_minute:
            return False

        user_limits['count'] += 1
        return True

    def _log_request(self, api_key: Optional[str], prompt: str, success: bool):
        """Print one audit line with the key, outcome and first 30 prompt characters."""
        status = "SUCCESS" if success else "FAILED"
        print(f"🔍 AUDIT: {api_key} | {status} | '{prompt[:30]}...'")


print("✅ Secure proxy ready")

## 5. Demo: Authentication & Rate Limiting

In [None]:
# Put the secure proxy in front of a fresh simulated backend
real_service = OpenAIService()
secure_proxy = SecureLLMProxy(real_service)

print("🔐 Testing authentication and rate limiting...\n")

# Scenario 1: a known API key is accepted
try:
    response = secure_proxy.complete("Hello AI", api_key="user123")
except Exception as e:
    print(f"❌ Error: {e}")
else:
    print(f"✅ Valid auth: {response[:60]}...")

# Scenario 2: an unknown API key is rejected
try:
    response = secure_proxy.complete("Hello AI", api_key="invalid")
except Exception as e:
    print(f"❌ Expected error: {e}")
else:
    print(f"✅ Response: {response[:60]}...")

# Scenario 3: six back-to-back requests against a limit of five per minute
print("\n🚀 Testing rate limiting (max 5/minute):")
for i in range(6):
    request_no = i + 1
    try:
        response = secure_proxy.complete(f"Request {request_no}", api_key="user123")
    except Exception as e:
        print(f"❌ Request {request_no}: {e}")
    else:
        print(f"✅ Request {request_no}: Allowed")

## 6. Multi-Provider Load Balancing Proxy

In [None]:
class LoadBalancingProxy(LLMService):
    """Proxy that routes each request to one of several simulated providers."""

    def __init__(self):
        # Every provider is backed by the simulated OpenAIService; only the
        # cost and latency figures used for routing decisions differ.
        self.providers = {
            'openai': {'service': OpenAIService(), 'cost_per_char': 0.001, 'latency': 0.5},
            'anthropic': {'service': OpenAIService(), 'cost_per_char': 0.0008, 'latency': 0.3},
            'google': {'service': OpenAIService(), 'cost_per_char': 0.0006, 'latency': 0.4}
        }
        # Number of completed requests per provider name.
        self.usage_stats = dict.fromkeys(self.providers, 0)

    def complete(self, prompt: str, optimize_for="cost", **kwargs) -> str:
        """Forward ``prompt`` to the provider chosen for ``optimize_for``.

        The result is prefixed with the upper-cased provider name in square
        brackets. Usage is counted only after the provider call returns.
        """
        chosen = self._select_provider(prompt, optimize_for)
        backend = self.providers[chosen]['service']

        answer = backend.complete(prompt, **kwargs)
        self.usage_stats[chosen] += 1

        return f"[{chosen.upper()}] {answer}"

    def _select_provider(self, prompt: str, strategy: str) -> str:
        """Pick a provider name: cheapest, fastest, or (any other value) least used."""
        catalog = self.providers

        if strategy == "cost":
            # Lowest cost per character wins
            return min(catalog, key=lambda name: catalog[name]['cost_per_char'])

        if strategy == "speed":
            # Lowest latency wins
            return min(catalog, key=lambda name: catalog[name]['latency'])

        # Anything else: spread load by picking the least-used provider so far
        return min(catalog, key=lambda name: self.usage_stats[name])

    def get_usage_stats(self):
        """Return per-provider request counts and percentages as one line of text."""
        grand_total = sum(self.usage_stats.values())
        if grand_total == 0:
            return "No requests yet"

        return " | ".join(
            f"{name}: {hits} ({(hits / grand_total) * 100:.1f}%)"
            for name, hits in self.usage_stats.items()
        )


print("✅ Load balancing proxy ready")

## 7. Demo: Multi-Provider Optimization

In [None]:
# Build the multi-provider proxy
lb_proxy = LoadBalancingProxy()

print("⚖️ Testing multi-provider load balancing...\n")

# Run the same prompts under every routing strategy
strategies = ["cost", "speed", "balanced"]
test_prompts = ["Explain AI", "Code review tips", "System design"]

for strategy in strategies:
    print(f"📊 Strategy: {strategy.upper()}")
    for prompt in test_prompts:
        response = lb_proxy.complete(prompt, optimize_for=strategy)
        # The response starts with "[PROVIDER]"; keep what sits between the brackets
        provider = response.partition(']')[0][1:]
        print(f"  → {provider} handled: '{prompt}'")
    print()

usage_summary = lb_proxy.get_usage_stats()
print(f"📈 Final usage distribution: {usage_summary}")

## 8. Enterprise Proxy: All Features Combined

In [None]:
class EnterpriseLLMProxy(LLMService):
    """Proxy combining API-key authentication, response caching and metrics.

    The cache is an in-memory dict that is never evicted. Metrics are kept in
    ``self.metrics`` and rendered by ``get_dashboard``.
    """

    def __init__(self, real_service: LLMService):
        """Wrap ``real_service`` with an empty cache, one demo key and zeroed metrics."""
        self._real_service = real_service
        self._cache = {}  # cache key -> completion returned by the real service
        self.valid_keys = {"enterprise_key_123"}
        self.metrics = {
            'total_requests': 0,
            'cache_hits': 0,
            'auth_failures': 0,
            'total_cost_saved': 0.0
        }

    def complete(self, prompt: str, api_key: Optional[str] = None, **kwargs) -> str:
        """Authenticate the caller, then serve from cache or the real service.

        Cache hits are returned with a ``⚡ [CACHED] `` prefix, live results
        with ``🚀 [LIVE] ``. Every call, including rejected ones, counts
        towards ``total_requests``.

        Raises:
            Exception: if ``api_key`` is not one of the valid keys.
        """
        self.metrics['total_requests'] += 1

        # 1. Authentication
        if api_key not in self.valid_keys:
            self.metrics['auth_failures'] += 1
            raise Exception("🔒 Access denied")

        # 2. Caching check
        # kwargs are forwarded to the real service, so they are part of the
        # key; they are sorted so keyword order does not matter.
        canonical = repr((prompt, sorted(kwargs.items())))
        cache_key = hashlib.md5(canonical.encode()).hexdigest()
        if cache_key in self._cache:
            self.metrics['cache_hits'] += 1
            self.metrics['total_cost_saved'] += 0.001  # Estimated savings
            return f"⚡ [CACHED] {self._cache[cache_key]}"

        # 3. Real service call
        result = self._real_service.complete(prompt, **kwargs)
        self._cache[cache_key] = result

        return f"🚀 [LIVE] {result}"

    def get_dashboard(self):
        """Return a multi-line text dashboard built from the current metrics.

        The hit rate is cache hits over total requests; the status line shows
        HEALTHY only when that rate is above 50%.
        """
        # max(1, ...) avoids dividing by zero before the first request
        hit_rate = (self.metrics['cache_hits'] / max(1, self.metrics['total_requests'])) * 100

        dashboard = f"""
🏢 ENTERPRISE LLM PROXY DASHBOARD
=====================================
📊 Total Requests: {self.metrics['total_requests']}
⚡ Cache Hit Rate: {hit_rate:.1f}%
🔒 Auth Failures: {self.metrics['auth_failures']}
💰 Cost Saved: ${self.metrics['total_cost_saved']:.3f}
🎯 Status: {'✅ HEALTHY' if hit_rate > 50 else '⚠️ LOW CACHE EFFICIENCY'}
        """
        return dashboard


print("✅ Enterprise proxy ready")

## 9. Demo: Enterprise Dashboard & ROI

In [None]:
# Put the enterprise proxy in front of a fresh simulated backend
real_service = OpenAIService()
enterprise_proxy = EnterpriseLLMProxy(real_service)

print("🏢 Enterprise LLM Proxy Demo\n")

# Sample workload: three distinct prompts, three repeats that should hit the cache
enterprise_requests = [
    "What is machine learning?",
    "Explain cloud computing", 
    "What is machine learning?",  # Repeat for cache
    "Best database practices",
    "Explain cloud computing",     # Repeat for cache
    "What is machine learning?",  # Repeat for cache
]

# Send each request with the valid key and show a truncated response
for i, prompt in enumerate(enterprise_requests, 1):
    try:
        response = enterprise_proxy.complete(prompt, api_key="enterprise_key_123")
    except Exception as e:
        print(f"{i}. ❌ {e}")
    else:
        print(f"{i}. {response[:70]}...")

# Finish with the metrics dashboard
dashboard_text = enterprise_proxy.get_dashboard()
print(dashboard_text)

## 10. Key Takeaways & Business Impact

### 🎯 **Proxy Pattern Benefits in LLM Systems**

| Feature | Business Impact | Implementation |
|---------|----------------|----------------|
| **Smart Caching** | Up to 60-80% cost reduction on highly repetitive workloads (savings track the cache hit rate; 50% in the demo above) | Cache frequently requested prompts |
| **Authentication** | Security & access control | API key validation |
| **Rate Limiting** | Prevent abuse & cost overruns | Request throttling |
| **Load Balancing** | Optimize cost & performance | Multi-provider routing |
| **Monitoring** | Operational visibility | Real-time metrics |

### 💡 **Real-World Applications**
- **Enterprise AI Gateways**: Single control point for all LLM access
- **Cost Optimization**: Intelligent caching and provider selection  
- **Security**: Content filtering and audit logging
- **Compliance**: Access controls and usage tracking

### 🚀 **Next Steps**
1. Implement circuit breakers for resilience
2. Add semantic caching for better hit rates
3. Integrate with monitoring systems
4. Scale to production with async/await patterns