# Endpoint benchmark

We run a benchmark on a xgboost model to see how much latency / throughput we can squeeze out of the MLServe.com server.

We incrementally go from 1, 2, 4 and up to 32 rps achieving **32 rps at avg latency of 100ms** which is the equivalent of serving 2.8 million requests daily!

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import numpy as np
from mlserve_sdk.client import MLServeClient

import asyncio
import time
import statistics

import os
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
# Load a larger dataset from OpenML (wine-quality-red ~1600 samples, 11 features)
X, y = fetch_openml(name="wine-quality-red", version=1, return_X_y=True, as_frame=True)
y=y.astype(int)
classes = np.unique(y)
class_mapping = {c: i for i, c in enumerate(classes)}
y = np.array([class_mapping[val] for val in y])

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost model
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="mlogloss"
)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Example test data (first two samples from dataset)
TEST_DATA = {
    "inputs": [
        X_test.iloc[0,:].values.tolist()
    ]
}
print(TEST_DATA)

Accuracy: 0.678125
{'inputs': [[7.7, 0.56, 0.08, 2.5, 0.114, 14.0, 46.0, 0.9971, 3.24, 0.66, 9.6]]}


In [6]:
USERNAME = os.getenv("USERNAME")
TOKEN = os.getenv("TOKEN")

client = MLServeClient()
client.login(USERNAME, TOKEN)

In [7]:
client.deploy(
    model=model,
    name="xgb",
    version="v1",
    features=list(X),
    background_df=X.sample(100).reset_index(drop=True),
    metrics={'accuracy':accuracy_score(y_test, y_pred)},
    task_type='classification'
)

{'predict_url': 'https://mlserve.com/api/v1/predict/xgb/v1'}

In [8]:
MODEL_NAME = "xgb"
MODEL_VERSION = "v1"


async def worker(latencies):
    start = time.perf_counter()
    try:
        # run sync predict() in a thread
        preds = await asyncio.to_thread(client.predict, MODEL_NAME, MODEL_VERSION, TEST_DATA)
    except Exception as e:
        print("Request failed:", e)
    end = time.perf_counter()
    latencies.append((end - start) * 1000)

async def run_benchmark(rps: int, duration: int = 10):
    n_requests = rps * duration
    latencies = []
    tasks = []
    interval = 1.0 / rps

    start = time.perf_counter()
    for _ in range(n_requests):
        tasks.append(asyncio.create_task(worker(latencies)))
        await asyncio.sleep(interval)

    await asyncio.gather(*tasks)
    elapsed = time.perf_counter() - start
    return latencies, elapsed

def summarize(latencies, elapsed, rps):
    if not latencies:
        print("No successful requests")
        return
    print(f"\n=== RPS: {rps} ===")
    print(f"Total requests: {len(latencies)} in {elapsed:.2f}s")
    print(f"Achieved throughput: {len(latencies)/elapsed:.2f} req/s")
    print(f"Avg latency: {statistics.mean(latencies):.2f} ms")
    print(f"P50 latency: {statistics.median(latencies):.2f} ms")
    print(f"P95 latency: {statistics.quantiles(latencies, n=20)[-1]:.2f} ms")
    print(f"P99 latency: {statistics.quantiles(latencies, n=100)[-1]:.2f} ms")

async def benchmark():
    for rps in [1, 2, 4, 8, 16, 32]:
        latencies, elapsed = await run_benchmark(rps, duration=10)
        summarize(latencies, elapsed, rps)

# Run inside notebook
await benchmark()


=== RPS: 1 ===
Total requests: 10 in 10.01s
Achieved throughput: 1.00 req/s
Avg latency: 168.53 ms
P50 latency: 170.39 ms
P95 latency: 240.90 ms
P99 latency: 252.38 ms

=== RPS: 2 ===
Total requests: 20 in 10.03s
Achieved throughput: 1.99 req/s
Avg latency: 153.41 ms
P50 latency: 146.57 ms
P95 latency: 231.60 ms
P99 latency: 271.62 ms

=== RPS: 4 ===
Total requests: 40 in 10.05s
Achieved throughput: 3.98 req/s
Avg latency: 165.26 ms
P50 latency: 157.89 ms
P95 latency: 238.44 ms
P99 latency: 281.19 ms

=== RPS: 8 ===
Total requests: 80 in 10.14s
Achieved throughput: 7.89 req/s
Avg latency: 152.33 ms
P50 latency: 154.81 ms
P95 latency: 172.18 ms
P99 latency: 202.95 ms

=== RPS: 16 ===
Total requests: 160 in 10.23s
Achieved throughput: 15.64 req/s
Avg latency: 127.28 ms
P50 latency: 124.16 ms
P95 latency: 149.52 ms
P99 latency: 245.66 ms

=== RPS: 32 ===
Total requests: 320 in 10.38s
Achieved throughput: 30.84 req/s
Avg latency: 101.24 ms
P50 latency: 99.20 ms
P95 latency: 116.06 ms
P99 