# 💰 Lab 9: Cost Optimization
## Module 9 - Managing Azure OpenAI Costs

**Duration:** 10 minutes

**Objectives:**
- Implement intelligent model routing
- Use Batch API for non-urgent tasks
- Monitor token usage

In [None]:
!pip install openai -q

In [None]:
import os, json

# Notebook-wide configuration. Other cells read these names:
#   DEMO_MODE  - True when no usable Azure credentials were found.
#   client     - AzureOpenAI client, or None in demo mode.
#   MODEL_NAME - deployment used for complex queries.
#   MODEL_MINI - deployment used for simple queries.
DEMO_MODE = False
client = None
MODEL_NAME = "gpt-4o"
MODEL_MINI = "gpt-4o-mini"  # Optional: add AZURE_OPENAI_DEPLOYMENT_MINI secret

try:
    from google.colab import userdata

    AZURE_OPENAI_KEY = userdata.get('AZURE_OPENAI_KEY')
    AZURE_OPENAI_ENDPOINT = userdata.get('AZURE_OPENAI_ENDPOINT')

    # Deployment names are optional secrets: a failed or empty lookup must not
    # abort setup, so each one is read best-effort with its own fallback.
    try:
        MODEL_NAME = userdata.get('AZURE_OPENAI_DEPLOYMENT') or MODEL_NAME
    except Exception:
        pass  # keep the default main deployment name
    try:
        MODEL_MINI = userdata.get('AZURE_OPENAI_DEPLOYMENT_MINI') or MODEL_NAME
    except Exception:
        MODEL_MINI = MODEL_NAME  # Fall back to main model if mini not available

    if not (AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT):
        raise ValueError("AZURE_OPENAI_KEY and AZURE_OPENAI_ENDPOINT secrets are required")

    # Accept a bare host name by prepending the scheme.
    if not AZURE_OPENAI_ENDPOINT.startswith('http'):
        AZURE_OPENAI_ENDPOINT = 'https://' + AZURE_OPENAI_ENDPOINT
    print(f"✅ Loaded. Main: {MODEL_NAME}, Mini: {MODEL_MINI}")
    if MODEL_NAME == MODEL_MINI:
        print("   (Using same model for both - add AZURE_OPENAI_DEPLOYMENT_MINI for cost routing)")
except Exception as exc:
    # Any setup failure (not running in Colab, secret lookup error, missing
    # credentials) switches the lab to canned responses. `Exception` rather
    # than a bare `except` so Ctrl-C / kernel interrupts still propagate.
    print("⚠️ DEMO MODE")
    print(f"   ({type(exc).__name__}: {exc})")
    DEMO_MODE = True

if not DEMO_MODE:
    from openai import AzureOpenAI
    client = AzureOpenAI(api_key=AZURE_OPENAI_KEY, api_version="2024-06-01", azure_endpoint=AZURE_OPENAI_ENDPOINT)

## Azure OpenAI Pricing

| Model | Input/1M | Output/1M | Best For |
|-------|----------|-----------|----------|
| GPT-4o | $5.00 | $15.00 | Complex reasoning |
| GPT-4o-mini | $0.15 | $0.60 | Simple tasks |
| Batch API | 50% off | 50% off | Non-urgent |

## Part 1: Intelligent Router

In [None]:
class CostOptimizedRouter:
    """Send each query to the main or the mini deployment and log its cost.

    Queries containing any ``COMPLEX`` keyword go to the main deployment;
    everything else goes to the mini deployment. Every call appends a
    ``{"model", "tokens", "cost"}`` record to ``usage_log``.

    Args:
        client: Client exposing ``chat.completions.create`` (e.g. an
            ``AzureOpenAI`` instance), or ``None`` to return canned responses.
        main_model: Deployment for complex queries. Defaults to the
            module-level ``MODEL_NAME``, read at call time.
        mini_model: Deployment for simple queries. Defaults to the
            module-level ``MODEL_MINI``, read at call time.
        demo_mode: Force demo behaviour on or off. Defaults to the
            module-level ``DEMO_MODE``, read at call time.
    """

    # USD per 1M tokens, keyed by base model name.
    PRICING = {"gpt-4o": {"input": 5.0, "output": 15.0}, "gpt-4o-mini": {"input": 0.15, "output": 0.60}}
    # NOTE(review): SIMPLE is not consulted by select_model; any query that
    # is not COMPLEX is treated as simple.
    SIMPLE = ["balance", "hours", "location", "hello", "status"]
    COMPLEX = ["fraud", "investigate", "analyze", "recommend", "should i"]

    def __init__(self, client, main_model=None, mini_model=None, demo_mode=None):
        self.client = client
        self.usage_log = []
        # None means "use the notebook-level setting".
        self._main_model = main_model
        self._mini_model = mini_model
        self._demo_mode = demo_mode

    def _main(self):
        """Return the deployment name used for complex queries."""
        return MODEL_NAME if self._main_model is None else self._main_model

    def _mini(self):
        """Return the deployment name used for simple queries."""
        return MODEL_MINI if self._mini_model is None else self._mini_model

    def _demo(self):
        """Return True when canned responses should be used."""
        return DEMO_MODE if self._demo_mode is None else self._demo_mode

    @classmethod
    def _pricing_for(cls, *names):
        """Return the PRICING entry matching the first recognisable name.

        Deployment names are user-chosen, so an exact key lookup usually
        misses. Each candidate is matched by substring instead, longest key
        first so that "gpt-4o-mini" is not mistaken for "gpt-4o". Falls back
        to GPT-4o pricing (the conservative estimate) when nothing matches.
        """
        keys_longest_first = sorted(cls.PRICING, key=len, reverse=True)
        for name in names:
            if not isinstance(name, str):
                continue
            lowered = name.lower()
            for key in keys_longest_first:
                if key in lowered:
                    return cls.PRICING[key]
        return cls.PRICING["gpt-4o"]

    def select_model(self, query):
        """Return the deployment name that should handle ``query``."""
        q = query.lower()
        if any(p in q for p in self.COMPLEX):
            return self._main()
        # NOTE(review): in demo mode every query is reported against the main
        # model, so the demo "mini" cost below only applies when the main
        # model's name itself contains "mini". Kept as-is; confirm intent.
        return self._main() if self._demo() else self._mini()

    def invoke(self, query):
        """Answer ``query`` with the selected model and record the spend.

        Returns:
            dict with keys ``response`` (str), ``model`` (deployment name),
            ``tokens`` (total tokens) and ``cost`` (estimated USD).
        """
        model = self.select_model(query)

        if self._demo() or not self.client:
            # Fixed illustrative figures; no API call is made.
            cost = 0.0001 if "mini" in model else 0.001
            self.usage_log.append({"model": model, "tokens": 100, "cost": cost})
            return {"response": f"Response to: {query[:30]}... [DEMO]", "model": model, "tokens": 100, "cost": cost}

        response = self.client.chat.completions.create(model=model, messages=[{"role": "user", "content": query}])
        usage = response.usage
        # Prefer the model name reported by the service (presumably the base
        # model behind the deployment - verify), then the deployment name.
        pricing = self._pricing_for(getattr(response, "model", None), model)
        cost = (usage.prompt_tokens * pricing["input"] + usage.completion_tokens * pricing["output"]) / 1_000_000

        self.usage_log.append({"model": model, "tokens": usage.total_tokens, "cost": cost})
        return {"response": response.choices[0].message.content, "model": model, "tokens": usage.total_tokens, "cost": cost}

# Router instance used by the routing demo cell; `client` is None in demo mode.
router = CostOptimizedRouter(client)

In [None]:
# Sample questions: two simple lookups and two that hit COMPLEX keywords.
queries = [
    "What's my account balance?",
    "Investigate this suspicious transaction pattern",
    "What are your branch hours?",
    "Should I refinance my mortgage?",
]

print("Cost-Optimized Routing")
print("=" * 50)
# Route every sample query and report which model handled it and at what cost.
for question in queries:
    outcome = router.invoke(question)
    chosen_model = outcome['model']
    estimated_cost = outcome['cost']
    print(f"\n{question[:40]}...")
    print(f"  Model: {chosen_model} | Cost: ${estimated_cost:.6f}")

## Part 2: Batch API (50% Savings)

In [None]:
def prepare_batch_request(tasks, model=None):
    """Build the JSONL payload for a Batch API chat-completions job.

    Args:
        tasks: Iterable of prompt strings; each becomes one request line.
        model: Deployment name to put in every request body. Defaults to the
            module-level ``MODEL_NAME``.

    Returns:
        A string with one JSON object per line (no trailing newline). Each
        line carries a ``custom_id`` of the form ``task-<index>`` so results
        can be matched back to their task. Empty ``tasks`` yields ``""``.
    """
    deployment = MODEL_NAME if model is None else model
    return "\n".join(
        json.dumps({
            "custom_id": f"task-{i}",
            "method": "POST",
            "url": "/chat/completions",
            "body": {"model": deployment, "messages": [{"role": "user", "content": task}]},
        })
        for i, task in enumerate(tasks)
    )

# Three non-urgent compliance summaries, suitable for batch processing.
batch_tasks = [
    "Summarize account C-001 for compliance",
    "Summarize account C-002 for compliance",
    "Summarize account C-003 for compliance",
]

# Show only the first 300 characters of the generated JSONL payload.
batch_jsonl = prepare_batch_request(batch_tasks)
print("Batch API Request (JSONL):")
print(f"{batch_jsonl[:300]}...")

## Part 3: Cost Comparison

In [None]:
DAILY = 10000   # requests per day
TOKENS = 500    # tokens per request

# Blended USD per 1M tokens for each tier: the mean of the input and output
# prices (i.e. an even input/output split). Batch is GPT-4o at the 50% discount.
BLENDED_RATE = {
    "gpt4o": (5.00 + 15.00) / 2,
    "mini": (0.15 + 0.60) / 2,
}
BLENDED_RATE["batch"] = BLENDED_RATE["gpt4o"] * 0.5

# Share of daily traffic sent to each tier; shares in a scenario sum to 1.
scenarios = {
    "All GPT-4o": {"gpt4o": 1.0, "mini": 0.0, "batch": 0.0},
    "Mixed (20/80)": {"gpt4o": 0.2, "mini": 0.8, "batch": 0.0},
    "Optimized": {"gpt4o": 0.1, "mini": 0.6, "batch": 0.3}
}


def estimate_daily_cost(mix, daily_requests=DAILY, tokens_per_request=TOKENS):
    """Return the estimated daily spend in USD for a traffic mix.

    Args:
        mix: Mapping of tier name (a key of ``BLENDED_RATE``) to the fraction
            of requests routed to that tier.
        daily_requests: Number of requests per day.
        tokens_per_request: Tokens consumed by each request.

    Raises:
        KeyError: If ``mix`` names a tier that has no blended rate.
    """
    return sum(
        daily_requests * share * tokens_per_request * BLENDED_RATE[tier] / 1_000_000
        for tier, share in mix.items()
    )


print(f"Cost Comparison: {DAILY:,} Daily Requests")
print("="*50)
for name, mix in scenarios.items():
    daily = estimate_daily_cost(mix)
    print(f"\n{name}: ${daily:.2f}/day, ${daily*30:.2f}/month")

## ✅ Lab 9 Complete!

**Key Takeaways:**
- Route simple queries to GPT-4o-mini (about 96–97% cheaper per token than GPT-4o)
- Use Batch API for non-urgent tasks (50% savings)
- Monitor token usage to optimize

**Day 2 Labs Complete!**