In [3]:
from importlib.metadata import version

pkgs = [
    "thop", 
    "torch"
]

for p in pkgs:
    print(f"{p} version: {version(p)}")

thop version: 0.1.1-2209072238
torch version: 2.9.1


In [4]:
import torch
from thop import profile 

from modules import GPTModel

BASE_CONFIG = {
    "vocab_size": 50257,
    "context_length": 1024,
    "drop_rate": 0.0,
    "qkv_bias": True
}

model_configs = {
    "gpt-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

In [6]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"[INFO] Using {device}")

[INFO] Using mps


In [None]:
batch_size = 2
input_tensor = torch.randint(0, 50257, (batch_size, 1024)).to(device) # (batch_size, context_length)

In [None]:
for size in model_configs:
    cfg = {**BASE_CONFIG, **model_configs[size]}

    model = GPTModel(cfg).bfloat16()
    model.to(device)

    # MACS = multiply-accumulate operations
    # MACS are typically counted as two FLOPS (one multiply and one accumulate)
    macs, params = profile(model, inputs=(input_tensor,), verbose=False)
    flops = 2 * macs
    print(f"{size:18}: {flops:.1e} FLOPS")

    del model
    torch.cuda.empty_cache()

gpt-small (124M)  : 5.1e+11 FLOPS
gpt-medium (355M) : 1.4e+12 FLOPS
gpt-large (774M)  : 3.2e+12 FLOPS
gpt-xl (1558M)    : 6.4e+12 FLOPS


In [None]:
cfg = {**BASE_CONFIG, **model_configs["gpt-small (124M)"]}
model = GPTModel(cfg).bfloat16().to(device)

profile(model, inputs=(input_tensor,), verbose=False) # returns (macs, params)

(252993601536.0, 123614976.0)

Now, we can use these measured FLOPS to estimate training FLOPS and the training time and cost on some popular chips.

In [15]:
# Config

# forward pass model FLOPS
FORWARD_FLOPS = {
    "gpt-small (124M)": 5.1e11,
    "gpt-medium (355M)": 1.4e12,
    "gpt-large (774M)": 3.2e12,
    "gpt-xl (1558M)": 6.4e12,
}

# hardware profiles - sustained TFLOPS + $/hour
HARDWARE = {
    "CPU":        {"tflops": 0.3,  "cost_per_hour": 0.05},
    "M1":         {"tflops": 2.6,  "cost_per_hour": 0.00},  # my device
    "RTX 3090":   {"tflops": 30.0, "cost_per_hour": 1.00},
    "RTX 4090":   {"tflops": 60.0, "cost_per_hour": 1.50},
    "A100 80GB":  {"tflops": 250.0,"cost_per_hour": 2.50},
    "H100":       {"tflops": 500.0,"cost_per_hour": 4.50},
}

# training set up
NUM_TOKENS = 10_000_000
CONTEXT_LENGTH = 1024
BATCH_SIZE = 2

TRAINING_MULTIPLIER = 3         # training = forward + backward + update = 3 * forward

In [16]:
def training_steps(num_tokens, context_length, batch_size):
    return num_tokens // (context_length * batch_size)

def total_training_flops(forward_flops, steps):
    return TRAINING_MULTIPLIER * forward_flops * steps

def flops_to_time_seconds(total_flops, tflops):
    return total_flops / (tflops * 1e12)

def time_to_cost(hours, cost_per_hour):
    return hours * cost_per_hour

In [17]:
steps = training_steps(NUM_TOKENS, CONTEXT_LENGTH, BATCH_SIZE)

print(f"\nDataset tokens      : {NUM_TOKENS:,}")
print(f"Context length      : {CONTEXT_LENGTH}")
print(f"Batch size          : {BATCH_SIZE}")
print(f"Training steps      : {steps:,}")
print("-" * 80)

for model_name, fwd_flops in FORWARD_FLOPS.items():
    total_flops = total_training_flops(fwd_flops, steps)

    print(f"\nMODEL: {model_name}")
    print(f"Total training FLOPs: {total_flops:.2e}")

    for hw_name, hw in HARDWARE.items():
        seconds = flops_to_time_seconds(
            total_flops, hw["tflops"]
        )

        hours = seconds / 3600
        cost = time_to_cost(hours, hw["cost_per_hour"])

        print(
            f"  {hw_name:10} | "
            f"{hours:8.3f} hrs | "
            f"${cost:6.2f}"
        )

    print("-" * 80)


Dataset tokens      : 10,000,000
Context length      : 1024
Batch size          : 2
Training steps      : 4,882
--------------------------------------------------------------------------------

MODEL: gpt-small (124M)
Total training FLOPs: 7.47e+15
  CPU        |    6.916 hrs | $  0.35
  M1         |    0.798 hrs | $  0.00
  RTX 3090   |    0.069 hrs | $  0.07
  RTX 4090   |    0.035 hrs | $  0.05
  A100 80GB  |    0.008 hrs | $  0.02
  H100       |    0.004 hrs | $  0.02
--------------------------------------------------------------------------------

MODEL: gpt-medium (355M)
Total training FLOPs: 2.05e+16
  CPU        |   18.986 hrs | $  0.95
  M1         |    2.191 hrs | $  0.00
  RTX 3090   |    0.190 hrs | $  0.19
  RTX 4090   |    0.095 hrs | $  0.14
  A100 80GB  |    0.023 hrs | $  0.06
  H100       |    0.011 hrs | $  0.05
--------------------------------------------------------------------------------

MODEL: gpt-large (774M)
Total training FLOPs: 4.69e+16
  CPU        |   43

In [18]:
# inference set up
CONTEXT_LENGTH = 1024
BATCH_SIZE = 1              # typical inference
GENERATED_TOKENS = 200      # per request

In [19]:
def inference_flops_per_request(
    forward_flops,
    context_length,
    generated_tokens,
    batch_size=1
):
    flops_per_token = forward_flops / context_length
    return flops_per_token * generated_tokens * batch_size


def flops_to_time_seconds(total_flops, tflops):
    return total_flops / (tflops * 1e12)


def time_to_cost(hours, cost_per_hour):
    return hours * cost_per_hour


In [27]:
print(f"\nInference setup:")
print(f"Context length     : {CONTEXT_LENGTH}")
print(f"Generated tokens   : {GENERATED_TOKENS}")
print(f"Batch size         : {BATCH_SIZE}")
print("-" * 110)

for model_name, fwd_flops in FORWARD_FLOPS.items():
    inf_flops = inference_flops_per_request(
        fwd_flops,
        CONTEXT_LENGTH,
        GENERATED_TOKENS,
        BATCH_SIZE
    )

    print(f"\nMODEL: {model_name}")
    print(f"Inference FLOPs per request: {inf_flops:.2e}")

    for hw_name, hw in HARDWARE.items():
        seconds = flops_to_time_seconds(inf_flops, hw["tflops"])
        hours = seconds / 3600
        cost = time_to_cost(hours, hw["cost_per_hour"])

        tokens_per_sec = GENERATED_TOKENS / seconds if seconds > 0 else float("inf")
        cost_per_million = cost * (1_000_000 / GENERATED_TOKENS)

        print(
            f"  {hw_name:10} | "
            f"{seconds*1000:7.2f} ms | "
            f"{tokens_per_sec:8.1f} tok/s | "
            f"${cost:.6f} per req | "
            f"${cost_per_million:.4f} per 1M tok"
        )

    print("-" * 110)


Inference setup:
Context length     : 1024
Generated tokens   : 200
Batch size         : 1
--------------------------------------------------------------------------------------------------------------

MODEL: gpt-small (124M)
Inference FLOPs per request: 9.96e+10
  CPU        |  332.03 ms |    602.4 tok/s | $0.000005 per req | $0.0231 per 1M tok
  M1         |   38.31 ms |   5220.4 tok/s | $0.000000 per req | $0.0000 per 1M tok
  RTX 3090   |    3.32 ms |  60235.3 tok/s | $0.000001 per req | $0.0046 per 1M tok
  RTX 4090   |    1.66 ms | 120470.6 tok/s | $0.000001 per req | $0.0035 per 1M tok
  A100 80GB  |    0.40 ms | 501960.8 tok/s | $0.000000 per req | $0.0014 per 1M tok
  H100       |    0.20 ms | 1003921.6 tok/s | $0.000000 per req | $0.0012 per 1M tok
--------------------------------------------------------------------------------------------------------------

MODEL: gpt-medium (355M)
Inference FLOPs per request: 2.73e+11
  CPU        |  911.46 ms |    219.4 tok/s | $0.000013