# A tiny pricing engine

The following module turns usage dicts into dollars. It understands:
- Regular input tokens
- Cached-input tokens (discounted)
- Output tokens
- Both the modern Responses API usage keys and classic prompt/completion keys

In [8]:
from dataclasses import dataclass
from decimal import Decimal, ROUND_HALF_UP
from typing import Optional, Dict, Tuple, Any, Mapping

MTOK = Decimal("1000000")  # per-million denominator


@dataclass(frozen=True)
class ModelPricing:
    """USD rates for one model, each expressed per one million tokens."""

    # USD per 1M non-cached input tokens.
    input_per_mtok: Decimal
    # USD per 1M output tokens.
    output_per_mtok: Decimal
    # USD per 1M cached-read input tokens; None if not listed/applicable.
    cached_input_per_mtok: Optional[Decimal] = None

    def cost_from_tokens(
        self,
        input_tokens: int,
        output_tokens: int,
        cache_read_input_tokens: int = 0
    ) -> Decimal:
        """
        Compute the USD cost of one call.

        - Non-cached input (input_tokens minus cache_read_input_tokens) is
          billed at the input rate.
        - Cached-read input is billed at the cached-input rate when one is
          configured, otherwise at the regular input rate (a model with no
          listed discount must not get its cached tokens for free).
        - All output is billed at the output rate.

        Negative counts are treated as 0.

        Args:
            input_tokens: Total input tokens, including any cached reads.
            output_tokens: Total output tokens.
            cache_read_input_tokens: The part of input_tokens served from cache.

        Returns:
            The total cost, rounded half-up to 7 decimal places.
        """
        output_tokens = max(output_tokens, 0)
        cache_read_input_tokens = max(cache_read_input_tokens, 0)
        non_cached_input = max(input_tokens - cache_read_input_tokens, 0)

        # Without a discounted rate, cached reads are just ordinary input.
        cached_rate = self.cached_input_per_mtok
        if cached_rate is None:
            cached_rate = self.input_per_mtok

        cost_input = (Decimal(non_cached_input) / MTOK) * self.input_per_mtok
        cost_cached = (Decimal(cache_read_input_tokens) / MTOK) * cached_rate
        cost_output = (Decimal(output_tokens) / MTOK) * self.output_per_mtok

        total = cost_input + cost_cached + cost_output
        return total.quantize(Decimal("0.0000001"), rounding=ROUND_HALF_UP)


# Official GPT-5 prices (USD per 1M tokens) as of Aug 2025 — verify against pricing page.
# Each tier is defined once so an alias and its dated snapshot cannot drift apart.
_GPT5_PRICING = ModelPricing(
    input_per_mtok=Decimal("1.25"),
    output_per_mtok=Decimal("10.00"),
    cached_input_per_mtok=Decimal("0.125"),
)
_GPT5_MINI_PRICING = ModelPricing(
    input_per_mtok=Decimal("0.25"),
    output_per_mtok=Decimal("2.00"),
    cached_input_per_mtok=Decimal("0.025"),
)
_GPT5_NANO_PRICING = ModelPricing(
    input_per_mtok=Decimal("0.05"),
    output_per_mtok=Decimal("0.40"),
    cached_input_per_mtok=Decimal("0.005"),
)

# Keyed by both the alias you request and the dated snapshot name, because a
# caller may look up either the requested name or the name a response reports.
PRICING: Dict[str, ModelPricing] = {
    "gpt-5": _GPT5_PRICING,
    "gpt-5-2025-08-07": _GPT5_PRICING,
    "gpt-5-mini": _GPT5_MINI_PRICING,
    "gpt-5-mini-2025-08-07": _GPT5_MINI_PRICING,
    "gpt-5-nano": _GPT5_NANO_PRICING,
    "gpt-5-nano-2025-08-07": _GPT5_NANO_PRICING,
    # Some apps surface a chat-aliased variant:
    "gpt-5-chat-latest": _GPT5_PRICING,
}

def _to_mapping(u: Any) -> Mapping[str, Any]:
    """
    Convert OpenAI SDK usage objects (e.g., ResponseUsage) or dict-like things to a mapping.

    Tries, in order: already-a-Mapping (returned as-is) -> model_dump() ->
    to_dict() -> copy of __dict__ -> scan of public attributes.
    None yields an empty dict.
    """
    if u is None:
        return {}
    if isinstance(u, Mapping):
        return u
    for dumper_name in ("model_dump", "to_dict"):
        dumper = getattr(u, dumper_name, None)
        if callable(dumper):
            return dumper()
    if hasattr(u, "__dict__"):
        return dict(u.__dict__)
    public_names = [k for k in dir(u) if not k.startswith("_")]
    return {k: getattr(u, k) for k in public_names if hasattr(u, k)}


def _nested_cached_tokens(m: Mapping[str, Any]) -> Any:
    """Return the first truthy ``cached_tokens`` found in a nested details entry, else None."""
    # Responses API nests it under input_tokens_details; Chat Completions
    # under prompt_tokens_details.
    for details_key in ("input_tokens_details", "prompt_tokens_details"):
        details = m.get(details_key)
        if details is None:
            continue
        # The details entry may itself be an SDK object rather than a dict.
        cached = _to_mapping(details).get("cached_tokens")
        if cached:
            return cached
    return None


def parse_usage(u: Any) -> Tuple[int, int, int]:
    """
    Extract token counts from a usage payload.

    Tolerant to both Responses API keys (input_tokens / output_tokens) and
    older Chat Completions keys (prompt_tokens / completion_tokens), and to
    the different places prompt-cache reads are reported: the flat keys
    cache_read_input_tokens, prompt_cache_read_input_tokens and
    cached_input_tokens, or the nested input_tokens_details.cached_tokens /
    prompt_tokens_details.cached_tokens. Flat keys win over nested ones.

    Args:
        u: A mapping, an SDK usage object, or None.

    Returns:
        (input_tokens, output_tokens, cache_read_input_tokens); anything
        missing is reported as 0.
    """
    m = _to_mapping(u)

    # Modern Responses API names, with back-compat fallbacks
    # (older docs/help center often say "prompt"/"completion"):
    input_tokens = m.get("input_tokens")
    if input_tokens is None:
        input_tokens = m.get("prompt_tokens", 0)
    output_tokens = m.get("output_tokens")
    if output_tokens is None:
        output_tokens = m.get("completion_tokens", 0)

    # `or` chaining on purpose: a 0 under one key falls through to the next.
    cache_read = (
        m.get("cache_read_input_tokens")
        or m.get("prompt_cache_read_input_tokens")
        or m.get("cached_input_tokens")
        or _nested_cached_tokens(m)
        or 0
    )
    return int(input_tokens or 0), int(output_tokens or 0), int(cache_read or 0)

def cost_for_response(model: str, usage: Any) -> Decimal:
    """
    Price a single API response in USD.

    Args:
        model: Model name; must be a key of PRICING.
        usage: The response's usage payload (mapping or SDK object), as
            accepted by parse_usage().

    Returns:
        The cost computed by the model's ModelPricing entry.

    Raises:
        KeyError: If ``model`` has no entry in PRICING.
    """
    pricing = PRICING.get(model)
    if pricing is None:
        raise KeyError(f"Unknown model '{model}'. Add it to PRICING.")
    input_count, output_count, cached_count = parse_usage(usage)
    return pricing.cost_from_tokens(input_count, output_count, cached_count)

## Example: compute cost from a live API call

Below is a minimal **Responses API** example using the official Python SDK. We pull `response.usage` and feed it to the pricing function.

In [9]:
from dotenv import load_dotenv
import os
from decimal import Decimal
from openai import OpenAI

load_dotenv()  # load variables from a local .env file into the environment
api_key=os.getenv("OPENAI_API_KEY")

client = OpenAI(
  api_key=api_key
)

resp = client.responses.create(
    model="gpt-5",  # or gpt-5-mini / gpt-5-nano
    input="Explain the difference between optimistic and pessimistic concurrency control in databases.",
    # Optional cost-control knobs:
    # NOTE: the output cap counts reasoning tokens too. In the recorded run
    # below all 384 output tokens were reasoning tokens, so raise the cap or
    # lower the reasoning effort if the visible answer comes back empty.
    max_output_tokens=400,   # cap outputs to bound costs
    # text={"verbosity": "low"},    # Responses API form; values: low / medium / high
    # reasoning={"effort": "low"},  # Responses API form; supported across the GPT-5 family
)

print(resp.output_text)  # your answer
# Price by the model name the API reports back, which can be a dated snapshot
# rather than the alias requested above.
usd = cost_for_response(resp.model, resp.usage)
print(f"Estimated cost for this call: ${usd:.6f}")


Estimated cost for this call: $0.003864


## Token details

In [11]:
# Dump the SDK usage object to a plain dict so every raw counter is visible.
u = resp.usage.model_dump()  # or to_dict()
print(u)

{'input_tokens': 19, 'input_tokens_details': {'cached_tokens': 0}, 'output_tokens': 384, 'output_tokens_details': {'reasoning_tokens': 384}, 'total_tokens': 403}


## Worked cost math (so you can sanity-check numbers)
Suppose the response usage is (replace with your numbers from the output above):

In [None]:
# Example usage payload for the worked math below. The flat
# "prompt_cache_read_input_tokens" key is one of the cache-read spellings
# that parse_usage() accepts.
{
  "input_tokens": 19,
  "output_tokens": 384,
  "prompt_cache_read_input_tokens": 0
}

## For gpt-5:

`Non-cached input` = 19 ÷ 1,000,000 × $1.25 = $0.00002375

`Cached-input` = 0 ÷ 1,000,000 × $0.125 = $0.00000000

`Output` = 384 ÷ 1,000,000 × $10.00 = $0.00384000

`Total` = $0.00002375 + $0.00000000 + $0.00384000 = **$0.00386375 (~0.386¢)**

## Pre-estimate cost before you call the API (tiktoken)

If you need a budget check ahead of time, estimate tokens locally. The recommended approach is to use OpenAI’s tiktoken tokenizer (or the browser-based Tokenizer tool for quick checks).

In [10]:
from decimal import Decimal
import tiktoken  # pip install tiktoken

def count_tokens(text: str, model: str = "gpt-5-2025-08-07") -> int:
    """
    Count the tokens ``text`` encodes to, for a local pre-call estimate.

    Args:
        text: The text to tokenize.
        model: Model name used to pick the tiktoken encoding.

    Returns:
        The number of tokens in ``text`` under the chosen encoding.
    """
    try:
        enc = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError when the installed version has no mapping
        # for this model name. Fall back to o200k_base, the encoding used by
        # recent OpenAI models; cl100k_base would miscount for them.
        try:
            enc = tiktoken.get_encoding("o200k_base")
        except ValueError:
            # Very old tiktoken builds do not ship o200k_base.
            enc = tiktoken.get_encoding("cl100k_base")
    # disallowed_special=() makes special-token text such as "<|endoftext|>"
    # count as ordinary text instead of raising ValueError.
    return len(enc.encode(text, disallowed_special=()))

def estimate_cost(model: str, prompt: str, expected_output_tokens: int) -> Decimal:
    """
    Estimate the USD cost of a call before making it.

    Input tokens are counted locally with count_tokens(); output tokens are
    whatever the caller expects. No cache hits are assumed, so this is a
    first-order estimate.

    Args:
        model: Model name; must be a key of PRICING.
        prompt: The prompt text to be sent.
        expected_output_tokens: Anticipated number of output tokens.

    Returns:
        The estimated cost, rounded the same way as actual-usage costs
        because both go through ModelPricing.cost_from_tokens().

    Raises:
        KeyError: If ``model`` has no entry in PRICING.
    """
    if model not in PRICING:
        raise KeyError(f"Unknown model '{model}'. Add it to PRICING.")
    input_tokens = count_tokens(prompt, model)
    # Delegate instead of repeating the math, so estimates and billed costs
    # share one formula and one rounding mode.
    return PRICING[model].cost_from_tokens(input_tokens, expected_output_tokens)

# Demo: estimate the cost of one prompt with an assumed 300-token reply.
if __name__ == "__main__":
    prompt = "Summarize this product spec into a 5-bullet executive summary..."
    print(
        "Estimated cost:",
        estimate_cost(
            model="gpt-5-2025-08-07",
            prompt=prompt,
            expected_output_tokens=300,
        ),
    )


Estimated cost: 0.0030188
