
# Inference Latency Benchmarks (BERT-base vs FinBERT)

This notebook measures end-to-end **latency** and **throughput** for single-sample requests (micro-batch size = 1) — the regime relevant to low-latency pipelines.

**It records:**
- p50 / p95 / p99 latency (ms) over N runs
- Mean throughput (samples/sec)
- Optional: CPU vs CUDA (if available)


## 0. Install (Colab-friendly)

In [None]:

# If in Colab (use --extra-index-url so transformers/tqdm still resolve from PyPI):
# !pip install -q transformers==4.43.3 tqdm==4.66.4 torch --extra-index-url https://download.pytorch.org/whl/cu121


## 1. Imports & Config

In [None]:

import time, statistics, random
from pathlib import Path
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import torch

# Fixed seed so the randomly sampled benchmark inputs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# (display name, Hugging Face model id) pairs to benchmark.
MODELS = [
    ("bert-base-uncased", "bert-base-uncased"),
    ("finbert-prosus", "ProsusAI/finbert"),
]

# Pipeline device index: 0 when CUDA is available, otherwise -1.
_HAS_CUDA = torch.cuda.is_available()
DEVICE = 0 if _HAS_CUDA else -1

N_WARMUP = 10
N_RUNS = 200  # increase to 1000 for more stable tails

# Pool of short headlines that benchmark inputs are drawn from.
TEXTS = [
    "Apple beats earnings expectations and raises guidance for Q1.",
    "Regulators open antitrust investigation into major oil company operations.",
    "Rumors suggest upcoming product delay; analysts remain cautious.",
    "Company misses revenue estimates; shares fall after hours.",
    "Chevron to acquire smaller competitor, deal valued at $10B.",
]

print("CUDA available:", _HAS_CUDA)


## 2. Benchmark Helper

In [None]:

def build_pipeline(model_id: str, device: int):
    """Load a tokenizer and sequence-classification model and wrap them in a pipeline.

    Args:
        model_id: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        device: Device argument forwarded unchanged to the pipeline.

    Returns:
        A ``TextClassificationPipeline`` configured with ``top_k=None``,
        ``truncation=True`` and ``max_length=64``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    # `top_k=None` is the supported way to request scores for every label.
    # The deprecated `return_all_scores=True` was redundant alongside it and
    # has been dropped.
    return TextClassificationPipeline(
        model=model,
        tokenizer=tokenizer,
        device=device,
        top_k=None,
        truncation=True,
        max_length=64,
    )

def sample_texts(n: int, texts=None) -> list:
    """Draw ``n`` texts uniformly at random, with replacement.

    Args:
        n: Number of texts to draw; values <= 0 yield an empty list.
        texts: Optional sequence to sample from. Defaults to the module-level
            ``TEXTS`` pool when omitted.

    Returns:
        A list of ``n`` items, each chosen independently via ``random.choice``.

    Raises:
        IndexError: If the pool is empty and ``n`` is positive.
    """
    pool = TEXTS if texts is None else texts
    # One `random.choice` call per draw keeps the seeded sequence the same as
    # the original append loop.
    return [random.choice(pool) for _ in range(n)]

def benchmark(pipe, n_runs=200, warmup=10):
    """Time single-sample calls to ``pipe`` and summarise the latencies.

    Args:
        pipe: Callable invoked with a list of texts.
        n_runs: Number of timed single-text calls; must be at least 1.
        warmup: Number of texts sent in one untimed call before measuring.
            The warmup call is skipped when this is <= 0.

    Returns:
        Dict of Python floats with keys ``p50_ms``, ``p95_ms``, ``p99_ms``,
        ``mean_ms`` and ``throughput_sps`` (1000 / mean latency in ms).

    Raises:
        ValueError: If ``n_runs`` is less than 1.
    """
    if n_runs < 1:
        # Percentiles and the mean are undefined without measurements.
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    # Warmup (untimed).
    if warmup > 0:
        pipe(sample_texts(warmup))

    times = []
    for _ in range(n_runs):
        batch = sample_texts(1)  # input sampling stays outside the timed region
        t0 = time.perf_counter()
        pipe(batch)
        t1 = time.perf_counter()
        times.append((t1 - t0) * 1000.0)  # ms

    # Cast to plain floats so every value in the result has the same type.
    p50, p95, p99 = (float(v) for v in np.percentile(times, [50, 95, 99]))
    mean = float(np.mean(times))
    # samples/sec in single-sample mode; guard against a zero mean.
    thr = 1000.0 / mean if mean > 0 else float("inf")
    return {"p50_ms": p50, "p95_ms": p95, "p99_ms": p99, "mean_ms": mean, "throughput_sps": thr}


## 3. Run Benchmarks

In [None]:

# Benchmark every configured model and collect one metrics dict per model.
backend = "CPU" if DEVICE < 0 else "CUDA"  # same label for every row
rows = []
for name, mid in MODELS:
    print(f"\n=== {name} ({mid}) ===")
    pipe = build_pipeline(mid, DEVICE)
    m = benchmark(pipe, n_runs=N_RUNS, warmup=N_WARMUP)
    # Tag the metrics with the model name and the backend they ran on.
    m.update(model=name, backend=backend)
    rows.append(m)
rows


## 4. Save Results

In [None]:

import pandas as pd
from pathlib import Path

# Write the collected benchmark rows to ./tables/bench_latency.csv.
TAB_DIR = Path("./tables")
TAB_DIR.mkdir(parents=True, exist_ok=True)

df = pd.DataFrame(rows)
csv_path = TAB_DIR / "bench_latency.csv"
df.to_csv(csv_path, index=False)
df
