# Model-agnostic Gateway + Fallback + Feature Flags + Koszty (realny przykład)


Ten notebook pokazuje produkcyjny (ale uproszczony) wzorzec:
- Model-agnostic gateway — wybór modelu po nagłówku `X-Model` i/lub przez feature flags.
- Fallback — timeout lub błąd -> automatyczne przełączenie na model zapasowy (np. `oss:mock`).
- Feature flags (canary / versioning) — procentowe kierowanie części ruchu na nowy model / prompt.
- Monitoring kosztów i metryk — mierzymy latency, liczymy koszty per zapytanie / dziennie, progi alarmowe.

Notebook nie wymaga zewnętrznych kluczy API — używamy lekkich mocków modeli.


In [1]:
# === Imports & helpers ===
from __future__ import annotations
from fastapi import FastAPI, Header
from fastapi.testclient import TestClient
from pydantic import BaseModel
from datetime import datetime, date
import time, random, hashlib
from typing import Dict, Any, Optional, Tuple

# Simple wall-clock helper (milliseconds)
def now_ms():
    """Return the current Unix time as a whole number of milliseconds.

    The value comes from ``time.time()`` and is truncated to an ``int``.
    """
    seconds_since_epoch = time.time()
    return int(seconds_since_epoch * 1000)


## Konfiguracja: feature flags i stawki kosztów

In [2]:
# === Feature flags ===
# - canary_rollout_percent: share of traffic (in %) routed to the canary model
# - canary_model / stable_model: model identifiers used by the canary routing
# - prompt_versions: named system prompts, so the prompt can be switched quickly
# - prompt_default: prompt version used when no valid override is supplied

FEATURE_FLAGS: Dict[str, Any] = {
    'canary_rollout_percent': 15,
    'canary_model': 'openai:gpt-4o-mini-canary',
    'stable_model': 'openai:gpt-4o-mini',
    'prompt_versions': {
        'v1': 'You are a concise assistant. Answer briefly.',
        'v2': 'You are a structured assistant. Answer in bullet points with 2-4 bullets.',
    },
    'prompt_default': 'v1',
}

# === Cost rates (mock) ===
# USD per estimated token, keyed by model identifier.
COST_RATES: Dict[str, float] = {
    'openai:gpt-4o-mini': 2.5e-6,
    'openai:gpt-4o-mini-canary': 3.0e-6,
    'anthropic:sonnet': 3.2e-6,
    'oss:mock': 0.0,
}

# Accumulated daily cost (USD) above which log_request() prints an alert.
DAILY_ALERT_THRESHOLD_USD: float = 50.0


## Monitoring: koszty, latency, progi alarmowe

In [3]:
# Process-local metrics store:
#   'requests'    - one tuple per handled request, appended by log_request()
#   'daily_costs' - ISO date string -> accumulated cost in USD for that day
METRICS: Dict[str, Any] = dict(requests=[], daily_costs={})

def _hash_to_percent(s: str) -> int:
    h = hashlib.sha256(s.encode('utf-8')).hexdigest()
    return int(h[:8], 16) % 100

def pick_model_for_user(user_id: str) -> str:
    """Choose the canary or the stable model for a user.

    The user id (or 'anonymous' when it is empty) is hashed into a 0..99
    bucket; buckets below ``canary_rollout_percent`` get the canary model,
    all others get the stable one.
    """
    # Canary rollout: the bucket is deterministic per user id.
    user_bucket = _hash_to_percent(user_id or 'anonymous')
    in_canary = user_bucket < int(FEATURE_FLAGS['canary_rollout_percent'])
    flag_key = 'canary_model' if in_canary else 'stable_model'
    return FEATURE_FLAGS[flag_key]

def pick_prompt_version(user_id: str, override: Optional[str] = None) -> str:
    """Return the prompt version key to use for a request.

    ``override`` is honoured only when it names an entry of
    ``FEATURE_FLAGS['prompt_versions']``; otherwise the configured default
    version is returned. ``user_id`` is accepted but not used here.
    """
    if not override or override not in FEATURE_FLAGS['prompt_versions']:
        return FEATURE_FLAGS['prompt_default']
    return override

def estimate_tokens(text: str) -> int:
    """Roughly estimate a token count as one token per 4 characters.

    The result is never below 1, even for an empty string.
    """
    approx = len(text) // 4
    return approx if approx > 1 else 1

def compute_cost(model_name: str, prompt: str, response: str) -> float:
    """Estimate the USD cost of one exchange with ``model_name``.

    Prompt and response token estimates are summed and multiplied by the
    model's per-token rate from ``COST_RATES``. A model without a rate
    entry is charged 0.000003 per token. The result is rounded to 6
    decimal places.
    """
    per_token_usd = COST_RATES.get(model_name, 0.000003)
    total_tokens = sum(estimate_tokens(part) for part in (prompt, response))
    return round(total_tokens * per_token_usd, 6)

def log_request(user_id: str, model: str, latency_ms: int, cost_usd: float, ok: bool, prompt_ver: str):
    """Record one handled request and update today's running cost.

    Appends ``(timestamp_ms, user_id, model, latency_ms, cost_usd, ok,
    prompt_ver)`` to ``METRICS['requests']``, adds ``cost_usd`` to the entry
    for today's date in ``METRICS['daily_costs']``, and prints an alert
    whenever that running total is above ``DAILY_ALERT_THRESHOLD_USD``.
    """
    record = (now_ms(), user_id, model, latency_ms, cost_usd, ok, prompt_ver)
    METRICS['requests'].append(record)

    today = date.today().isoformat()
    costs_by_day = METRICS['daily_costs']
    running_total = costs_by_day.get(today, 0.0) + cost_usd
    costs_by_day[today] = running_total

    # The alert fires on every request once the total is above the threshold.
    if running_total > DAILY_ALERT_THRESHOLD_USD:
        print('ALERT: Daily cost exceeded', DAILY_ALERT_THRESHOLD_USD, 'USD! Current:', round(running_total, 2))

def get_daily_cost(day: Optional[str] = None) -> float:
    """Return the accumulated cost in USD for ``day`` (ISO date string).

    When ``day`` is omitted or empty, today's date is used. Days without
    any recorded cost yield 0.0.
    """
    if not day:
        day = date.today().isoformat()
    return METRICS['daily_costs'].get(day, 0.0)


## Mocki modeli (OpenAI / Anthropic / OSS) z kontrolą timeoutu

In [4]:
class LLMProvider:
    """Mock LLM backend with randomised latency and a simulated timeout rate."""

    def __init__(self, name: str, mean_latency_ms: int = 120, p_timeout: float = 0.02):
        # name:            identifier echoed in answers and error messages
        # mean_latency_ms: mean of the exponential latency distribution
        # p_timeout:       probability that a call ends in TimeoutError
        self.name = name
        self.mean_latency_ms = mean_latency_ms
        self.p_timeout = p_timeout

    def invoke(self, prompt: str) -> str:
        """Sleep for a random latency, then answer or raise ``TimeoutError``.

        The answer echoes at most the first 120 characters of ``prompt``.
        """
        # Exponentially distributed latency; the mean is clamped to >= 1 ms.
        mean_ms = max(1, self.mean_latency_ms)
        delay_ms = random.expovariate(1.0 / mean_ms)
        # 5% of calls are stretched to model a long tail.
        in_long_tail = random.random() < 0.05
        if in_long_tail:
            delay_ms *= 3.5
        time.sleep(delay_ms / 1000.0)

        # The simulated timeout is decided only after the latency has elapsed.
        timed_out = random.random() < self.p_timeout
        if timed_out:
            raise TimeoutError(f'Model {self.name} timeout')

        prefix = f'[{self.name}] Answer to: '
        return prefix + prompt[:120]

# Mock providers; they differ only in name, mean latency and timeout probability.
OPENAI_MINI = LLMProvider(name='openai:gpt-4o-mini', mean_latency_ms=140, p_timeout=0.04)
OPENAI_CANARY = LLMProvider(name='openai:gpt-4o-mini-canary', mean_latency_ms=160, p_timeout=0.05)
ANTHROPIC_SONNET = LLMProvider(name='anthropic:sonnet', mean_latency_ms=170, p_timeout=0.03)

class OSSMock:
    """Local fallback provider: fixed short delay, never raises, echoes the prompt."""

    def __init__(self):
        self.name = 'oss:mock'

    def invoke(self, prompt: str) -> str:
        """Return an echo of at most the first 160 characters of ``prompt``."""
        time.sleep(0.02)
        echoed = prompt[:160]
        return f'(OSS fallback) Echo: {echoed}'

# Single shared instance of the local fallback provider.
OSS = OSSMock()

# Maps a public model identifier to the provider object that serves it.
# Each key equals the `name` attribute of the provider it points to.
# 'oss:mock' is the entry call_with_fallback() uses when the primary fails.
MODEL_REGISTRY: Dict[str, Any] = {
    'openai:gpt-4o-mini': OPENAI_MINI,
    'openai:gpt-4o-mini-canary': OPENAI_CANARY,
    'anthropic:sonnet': ANTHROPIC_SONNET,
    'oss:mock': OSS,
}


## FastAPI: endpoint `/chat` z nagłówkami `X-Model`, `X-Prompt-Version`, `X-User-Id`

In [5]:
# FastAPI application object; the /chat route below is registered on it.
app = FastAPI(title='Model-agnostic Gateway (demo)')

# Request body of POST /chat.
class ChatRequest(BaseModel):
    # The user's message; it is appended to the system prompt by build_prompt().
    message: str

def build_prompt(base_system: str, user_message: str) -> str:
    """Combine the system prompt and the user message into one prompt string.

    The user message goes on its own line, prefixed with ``User: ``.
    """
    pieces = (base_system, '\nUser: ', user_message)
    return ''.join(pieces)

def call_with_fallback(model_name: str, prompt: str, timeout_ms: int = 2000) -> Tuple[str, str, bool, int]:
    """Invoke the requested model; on any failure or timeout answer via ``oss:mock``.

    Args:
        model_name: Key into ``MODEL_REGISTRY``. An unregistered name is
            treated like any other primary failure and triggers the fallback.
        prompt: Full prompt text passed to the provider's ``invoke``.
        timeout_ms: Deadline for the primary call in milliseconds. ``None``
            or a non-positive value disables the deadline.

    Returns:
        ``(response, final_model, ok, latency_ms)``. ``ok`` is ``True`` only
        when the primary model answered within the deadline; otherwise
        ``final_model`` is ``'oss:mock'``. ``latency_ms`` covers the whole
        call, including the time spent in the fallback.

    Raises:
        Exception: whatever the fallback provider itself raises; only errors
            of the primary call are absorbed.
    """
    # Function-scope import: the deadline needs a worker thread and the file's
    # import cell does not bring in concurrent.futures.
    from concurrent.futures import ThreadPoolExecutor

    # Monotonic clock: latency must not jump when the wall clock is adjusted.
    started = time.monotonic()

    def _elapsed_ms() -> int:
        return int((time.monotonic() - started) * 1000)

    deadline_s = timeout_ms / 1000.0 if timeout_ms and timeout_ms > 0 else None
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        provider = MODEL_REGISTRY[model_name]
        # Run the primary in a worker so the deadline can actually be enforced;
        # result() raises a timeout error once the deadline has passed.
        response = executor.submit(provider.invoke, prompt).result(timeout=deadline_s)
    except Exception:
        # Any primary failure (provider error, deadline exceeded, unknown
        # model name) is answered by the local fallback instead.
        response = MODEL_REGISTRY['oss:mock'].invoke(prompt)
        return response, 'oss:mock', False, _elapsed_ms()
    finally:
        # Do not wait for a primary call that overran its deadline; the
        # abandoned worker finishes in the background and its result is dropped.
        executor.shutdown(wait=False)
    return response, model_name, True, _elapsed_ms()

# POST /chat: pick a model and a prompt version, call the model with fallback,
# record cost and latency, and return the answer together with those metrics.
# Headers: X-Model forces a registered model, X-Prompt-Version overrides the
# prompt version, X-User-Id drives the canary routing.
@app.post('/chat')
def chat_endpoint(
    req: ChatRequest,
    x_model: Optional[str] = Header(default=None, alias='X-Model'),
    x_prompt_version: Optional[str] = Header(default=None, alias='X-Prompt-Version'),
    x_user_id: Optional[str] = Header(default='anonymous', alias='X-User-Id'),
):
    # An empty user id is treated the same as a missing one.
    caller_id = x_user_id or 'anonymous'

    # 1) Model selection: an explicit X-Model must be registered; without it
    #    the canary routing decides.
    if x_model and x_model not in MODEL_REGISTRY:
        return {'error': f'Unknown model: {x_model}'}
    requested_model = x_model or pick_model_for_user(caller_id)

    # 2) Prompt version, then 3) the final prompt sent to the model.
    prompt_ver = pick_prompt_version(caller_id, override=x_prompt_version)
    final_prompt = build_prompt(FEATURE_FLAGS['prompt_versions'][prompt_ver], req.message)

    # 4) Call the model; `served_by` is the fallback model when the primary failed.
    answer, served_by, ok, latency_ms = call_with_fallback(requested_model, final_prompt)

    # 5) Cost is charged at the rate of the model that actually answered.
    cost_usd = compute_cost(served_by, final_prompt, answer)
    log_request(
        user_id=caller_id,
        model=served_by,
        latency_ms=latency_ms,
        cost_usd=cost_usd,
        ok=ok,
        prompt_ver=prompt_ver,
    )

    payload = {
        'model': served_by,
        'ok': ok,
        'latency_ms': latency_ms,
        'cost_usd': cost_usd,
        'prompt_version': prompt_ver,
        'answer': answer,
        'daily_cost_usd': round(get_daily_cost(), 4),
    }
    return payload

# Test client bound to `app`; the demo cells use it to call /chat.
client = TestClient(app)


## Demo: wywołania z różnymi nagłówkami (model, prompt, user_id)

In [6]:
def demo_call(message: str, user_id: str, model: Optional[str] = None, pver: Optional[str] = None):
    """POST ``message`` to ``/chat`` through the test client and return the JSON body.

    ``X-User-Id`` is always sent; ``X-Model`` and ``X-Prompt-Version`` are
    sent only when ``model`` / ``pver`` are non-empty.
    """
    optional_headers = {'X-Model': model, 'X-Prompt-Version': pver}
    headers = {'X-User-Id': user_id}
    headers.update({key: value for key, value in optional_headers.items() if value})
    response = client.post('/chat', json={'message': message}, headers=headers)
    return response.json()

# 1) No X-Model header -> canary routing decides (some users land on the canary model)
out1 = demo_call('Give me 3 key benefits of a model-agnostic gateway.', user_id='alice')
out2 = demo_call('Give me 3 key benefits of a model-agnostic gateway.', user_id='bob')
out3 = demo_call('Give me 3 key benefits of a model-agnostic gateway.', user_id='carol')

# 2) Force the stable OpenAI model together with prompt version v2
out4 = demo_call('Short plan for adding fallback to my API.', user_id='dave', model='openai:gpt-4o-mini', pver='v2')

# 3) Force an unregistered model -> the endpoint returns an error payload
#    (no X-User-Id header is sent, so the endpoint's 'anonymous' default applies)
bad = client.post('/chat', json={'message': 'test'}, headers={'X-Model': 'unknown:model'}).json()

# 4) Force the Anthropic mock - it has a different cost rate
out5 = demo_call('List 2 risks of vendor lock-in.', user_id='erin', model='anthropic:sonnet')

# Summary: the 'model' printed is the one that actually answered, so it reads
# 'oss:mock' with ok False whenever the fallback was used.
print('1) alice ->', out1['model'], 'cost:', out1['cost_usd'], 'ok:', out1['ok'])
print('2) bob   ->', out2['model'], 'cost:', out2['cost_usd'], 'ok:', out2['ok'])
print('3) carol ->', out3['model'], 'cost:', out3['cost_usd'], 'ok:', out3['ok'])
print('4) dave  ->', out4['model'], 'cost:', out4['cost_usd'], 'ok:', out4['ok'], 'prompt:', out4['prompt_version'])
print('5) erin  ->', out5['model'], 'cost:', out5['cost_usd'], 'ok:', out5['ok'])
print('bad model ->', bad)
# Total cost accumulated today (USD), rounded to 4 decimal places.
print('Dzisiejszy koszt (USD):', round(get_daily_cost(), 4))


1) alice -> openai:gpt-4o-mini-canary cost: 0.00018 ok: True
2) bob   -> openai:gpt-4o-mini-canary cost: 0.00018 ok: True
3) carol -> openai:gpt-4o-mini cost: 0.000145 ok: True
4) dave  -> openai:gpt-4o-mini cost: 0.00017 ok: True prompt: v2
5) erin  -> anthropic:sonnet cost: 0.000154 ok: True
bad model -> {'error': 'Unknown model: unknown:model'}
Dzisiejszy koszt (USD): 0.0008


## Demo progu kosztów (alert)

In [7]:
# Send 30 requests, rotating over three user ids, then print today's total cost.
# NOTE(review): with DAILY_ALERT_THRESHOLD_USD at 50.0 and per-request costs
# around 1e-4 USD, this loop is not expected to print an ALERT line; lower the
# threshold to see one - confirm whether that was intended.
for run_idx in range(30):
    demo_call(f'Run #{run_idx}: summarize gateway pattern.', user_id=f'user{run_idx % 3}')
cost_after_loop = round(get_daily_cost(), 4)
print('Dzisiejszy koszt po petli:', cost_after_loop, 'USD')


Dzisiejszy koszt po petli: 0.0043 USD


## Metryki: podgląd ostatnich 5 próśb

In [8]:
from collections import deque

# Print the five most recent request records, one line each.
# A deque with maxlen=5 keeps only the last five entries of the request log.
tail = deque(METRICS['requests'], maxlen=5)
for ts, uid, model, lat, cost, ok, pver in tail:
    # Stored timestamps are epoch milliseconds; fromtimestamp() expects seconds.
    logged_at = datetime.fromtimestamp(ts / 1000)
    ts_iso = logged_at.isoformat(timespec='seconds')
    print(f"{ts_iso} | user={uid:8s} | model={model:24s} | latency={lat:4d}ms | cost=${cost:.6f} | ok={ok} | prompt={pver}")


2025-10-24T08:06:07 | user=user1    | model=openai:gpt-4o-mini       | latency=  65ms | cost=$0.000125 | ok=True | prompt=v1
2025-10-24T08:06:07 | user=user2    | model=openai:gpt-4o-mini       | latency= 156ms | cost=$0.000125 | ok=True | prompt=v1
2025-10-24T08:06:07 | user=user0    | model=openai:gpt-4o-mini       | latency= 100ms | cost=$0.000125 | ok=True | prompt=v1
2025-10-24T08:06:08 | user=user1    | model=openai:gpt-4o-mini       | latency= 516ms | cost=$0.000125 | ok=True | prompt=v1
2025-10-24T08:06:08 | user=user2    | model=openai:gpt-4o-mini       | latency= 137ms | cost=$0.000125 | ok=True | prompt=v1


---

### Tekst do odcinka (skrót)

W tym odcinku przechodzimy z etapu eksperymentów do wdrożeń produkcyjnych... Dodaj tu swój pełny skrypt do telepromptera.