<a href="https://colab.research.google.com/github/pberlizov/pberlizov/blob/main/SLAI_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Hypothetical (2025) performance ratings for a handful of Bedrock models.
# Layout: {model_id: {metric: value}} with these metrics and units:
#   size     - GB
#   energy   - kWh per inference
#   cost     - $ per 1K tokens
#   latency  - seconds
#   accuracy - fraction between 0 and 1
#
# Mid-term idea: collect these ratings automatically (e.g. with a scraper)
# instead of hand-entering them. Three models are enough for this basic
# prototype.
MODELS = {
    "meta.llama3-8b": {
        "size": 16.0,       # GB; ~8B parameters
        "energy": 0.0001,   # kWh/inference; low-energy estimate
        "cost": 0.0003,     # $/1K tokens; guessed Bedrock pricing
        "latency": 0.5,     # seconds; quick for a small model
        "accuracy": 0.85,   # decent accuracy
    },
    "mistral.mixtral-8x7b": {
        "size": 90.0,       # GB; ~56B parameters (8 x 7B, nominal)
        "energy": 0.0003,   # kWh/inference; highest of the three
        "cost": 0.0005,     # $/1K tokens; most expensive of the three
        "latency": 0.7,     # seconds; slower because of its size
        "accuracy": 0.90,   # highest accuracy of the three
    },
    "amazon.titan-text": {
        "size": 30.0,       # GB; mid-size proprietary model
        "energy": 0.00015,  # kWh/inference; moderate
        "cost": 0.0004,     # $/1K tokens; mid-range
        "latency": 0.4,     # seconds; optimized for speed
        "accuracy": 0.80,   # lowest accuracy of the three
    },
}

# Per-metric weights for the weighted-sum score. All metrics count equally
# for now and the weights add up to 1.0. A plain weighted sum was chosen
# because the exact scoring function was agreed to be immaterial at this stage.
WEIGHTS = {
    "size": 0.2,      # model footprint
    "energy": 0.2,    # energy per inference
    "cost": 0.2,      # price per 1K tokens
    "latency": 0.2,   # response time
    "accuracy": 0.2,  # applied to the inverted accuracy (higher accuracy is better)
}

# Complexity is estimated from query length only for now. Once ratings are
# available this should switch to subject-matter ratings and semantic meaning,
# which would require calling an LLM API.
def classify_query(query: str, max_words: int = 50) -> float:
    """Estimate query complexity from its word count, on a 0-1 scale.

    Args:
        query: The raw query text; words are split on whitespace.
        max_words: Word count at which complexity saturates at 1.0.
            Defaults to 50, the value the prototype was built with.

    Returns:
        ``word_count / max_words`` capped at 1.0. An empty or
        whitespace-only query yields 0.0.

    Raises:
        ValueError: If ``max_words`` is not positive.
    """
    if max_words <= 0:
        # Guard against division by zero and negative "complexity" values.
        raise ValueError("max_words must be positive")
    word_count = len(query.split())
    return min(word_count / max_words, 1.0)

# Ratings are normalized here rather than entered pre-normalized, which is
# more convenient when adding raw numbers to the catalogue.
def normalize_metric(value: float, min_val: float, max_val: float) -> float:
    """Min-max normalize ``value`` against the range ``[min_val, max_val]``.

    Args:
        value: Raw metric value.
        min_val: Value that maps to 0.0.
        max_val: Value that maps to 1.0.

    Returns:
        ``(value - min_val) / (max_val - min_val)``. Values outside the range
        are not clamped, so the result can fall outside 0-1. When the range
        is empty or inverted (``max_val <= min_val``) the result is 0.0.
    """
    if max_val > min_val:
        return (value - min_val) / (max_val - min_val)
    # Degenerate range: return a float (not int 0) to honour the annotation.
    return 0.0

def _metric_range(metric: str, fallback: float) -> tuple:
    """Return the (min, max) of ``metric`` across the MODELS catalogue."""
    # Fall back to the candidate's own value if the catalogue is empty, so
    # min()/max() never see an empty sequence.
    observed = [entry[metric] for entry in MODELS.values()] or [fallback]
    return min(observed), max(observed)

def score_model(model_metrics: dict, complexity: float) -> float:
    """Calculate a weighted-sum score for a model (lower is better).

    Each metric is min-max normalized against the range observed across
    ``MODELS`` so the ranges cannot drift out of sync with the catalogue,
    then multiplied by its entry in ``WEIGHTS``. Accuracy is inverted
    because a higher accuracy should lower the score.

    Args:
        model_metrics: Mapping with the keys "size", "energy", "cost",
            "latency" and "accuracy".
        complexity: Query complexity. Currently unused: weights are static
            for now; adjusting them with complexity is left for later work.

    Returns:
        The weighted sum of the normalized metrics.

    Raises:
        KeyError: If ``model_metrics`` lacks one of the expected keys.
    """
    # Fixed order keeps the floating-point summation order stable.
    metric_order = ("size", "energy", "cost", "latency", "accuracy")

    score = 0.0
    for metric in metric_order:
        raw = model_metrics[metric]
        normalized = normalize_metric(raw, *_metric_range(metric, raw))
        if metric == "accuracy":
            # Higher accuracy is better, so flip it to "lower is better".
            normalized = 1 - normalized
        score += WEIGHTS[metric] * normalized
    return score

# Routing in its simplest form: score every model, take the cheapest.
def route_query(query: str) -> str:
    """Return the id of the model with the lowest score for ``query``.

    The query's complexity is estimated with ``classify_query`` and handed to
    ``score_model`` for every entry in ``MODELS``. On a tie the model listed
    first in ``MODELS`` wins.
    """
    complexity = classify_query(query)

    def model_score(model_id: str) -> float:
        return score_model(MODELS[model_id], complexity)

    # Lowest score wins.
    return min(MODELS, key=model_score)

# Stand-in until a real backend is wired up.
def simulate_api_call(model_id: str, query: str) -> str:
    """Return a canned response string in place of a real model call.

    To be replaced with a Bedrock API call later; nothing is sent anywhere.
    """
    template = "Response from {}: Processed '{}'"
    return template.format(model_id, query)

# Smoke test for the prototype: route two sample queries and print the result.
if __name__ == "__main__":
    queries = [
        # Short, simple question.
        "What’s the capital of Brazil?",
        # Asks for long, complex output (the prompt itself is only 8 words).
        "Write a 500-word essay on renewable energy solutions.",
    ]

    for query in queries:
        model_id = route_query(query)
        response = simulate_api_call(model_id, query)
        # One print per query; the trailing "\n" leaves a blank separator line.
        print(f"Query: {query}\nRouted to: {model_id}\nResponse: {response}\n")

Query: What’s the capital of Brazil?
Routed to: meta.llama3-8b
Response: Response from meta.llama3-8b: Processed 'What’s the capital of Brazil?'

Query: Write a 500-word essay on renewable energy solutions.
Routed to: meta.llama3-8b
Response: Response from meta.llama3-8b: Processed 'Write a 500-word essay on renewable energy solutions.'

