In [None]:
import time
import os
import subprocess
import aiohttp
import asyncio

In [None]:
# Model and dataset shared by the benchmark commands and the API server below.
model = 'meta-llama/Llama-2-7b-hf'
dataset = 'ShareGPT_V3_unfiltered_cleaned_split.json'
block_size = 32  # forwarded as --block-size to benchmark_serving.py

prefix_len = 32  # forwarded as --prefix-len to the latency/throughput benchmarks

# Number of distinct prefixes (--num-prefixes) for each benchmark variant.
# Feel free to tweak these values. The comment below lists the prefix counts
# the values were chosen against:
# - 16 GPU prefixes
# - 32 CPU prefixes
# - 64 Disk prefixes
# NOTE(review): presumably those are per-tier cache capacities, so 8 fits in
# GPU, 32 spills over into CPU, and 256 exceeds all three tiers combined --
# confirm against the actual cache configuration.
prefixes_gpu = 8
prefixes_gpu_cpu = 32
prefixes_all = 256

In [None]:
# Ensure the output directory exists (no-op when it is already there).
os.makedirs("results", exist_ok=True)

# Header columns of the results CSV created for this run.
# NOTE(review): presumably the benchmark scripts append one row per run to the
# file passed via --output-csv -- confirm in the scripts.
csv_columns = [
    "benchmark",
    "latency (s)",
    "throughput (req/s)",
    "hits_gpu",
    "hits_cpu",
    "hits_disk",
    "misses",
    "swaps_gpu_cpu",
    "swaps_gpu_disk",
    "swaps_cpu_gpu",
    "swaps_cpu_disk",
    "swaps_disk_gpu",
    "swaps_disk_cpu",
    "util_gpu",
    "util_cpu",
    "util_disk",
]

# One CSV per notebook run, named after the current Unix timestamp.
filename = f"results/{int(time.time())}.csv"
with open(filename, "w") as csv_file:
    csv_file.write(",".join(csv_columns) + "\n")

print(f"Saving results to {filename}")

## Benchmark Latency

In [None]:
# Shell command template for benchmark_latency.py. The five positional
# placeholders are, in order: model, number of prefixes, prefix length,
# output CSV path and benchmark name. Fragments are separated by five spaces.
latency_cmd = (" " * 5).join([
    "python benchmark_latency.py",
    '--model "{}"',
    "--batch-size 256",
    "--input-len 64",
    "--num-prefixes {}",
    "--prefix-len {}",
    '--output-csv "{}"',
    '--benchmark-name "{}"',
])

In [None]:
# Baseline latency run: --num-prefixes 1 and --prefix-len 0.
subprocess.run(
    latency_cmd.format(model, 1, 0, filename, 'latency_baseline'),
    shell=True,
)

In [None]:
# Latency run with `prefixes_gpu` prefixes of length `prefix_len`.
subprocess.run(
    latency_cmd.format(model, prefixes_gpu, prefix_len, filename, 'latency_gpu'),
    shell=True,
)

In [None]:
# Latency run with `prefixes_gpu_cpu` prefixes of length `prefix_len`.
subprocess.run(
    latency_cmd.format(model, prefixes_gpu_cpu, prefix_len, filename, 'latency_gpu_cpu'),
    shell=True,
)

In [None]:
# Latency run with `prefixes_all` prefixes of length `prefix_len`.
subprocess.run(
    latency_cmd.format(model, prefixes_all, prefix_len, filename, 'latency_all'),
    shell=True,
)

## Benchmark Throughput

In [None]:
# Shell command template for benchmark_throughput.py. The five positional
# placeholders are, in order: model, number of prefixes, prefix length,
# output CSV path and benchmark name. Fragments are separated by five spaces.
throughput_cmd = (" " * 5).join([
    "python benchmark_throughput.py",
    '--model "{}"',
    "--input-len 512",
    "--output-len 128",
    "--num-prompts 256",
    "--num-prefixes {}",
    "--prefix-len {}",
    '--output-csv "{}"',
    '--benchmark-name "{}"',
])

In [None]:
# Baseline throughput run: --num-prefixes 1 and --prefix-len 0.
subprocess.run(
    throughput_cmd.format(model, 1, 0, filename, 'throughput_baseline'),
    shell=True,
)

In [None]:
# Throughput run with `prefixes_gpu` prefixes of length `prefix_len`.
subprocess.run(
    throughput_cmd.format(model, prefixes_gpu, prefix_len, filename, 'throughput_gpu'),
    shell=True,
)

In [None]:
# Throughput run with `prefixes_gpu_cpu` prefixes of length `prefix_len`.
subprocess.run(
    throughput_cmd.format(model, prefixes_gpu_cpu, prefix_len, filename, 'throughput_gpu_cpu'),
    shell=True,
)

In [None]:
# Throughput run with `prefixes_all` prefixes of length `prefix_len`.
subprocess.run(
    throughput_cmd.format(model, prefixes_all, prefix_len, filename, 'throughput_all'),
    shell=True,
)

## Benchmark Serving

In [None]:
# Start the vLLM API server in the background through the shell. The handle is
# kept in `server_proc` so that later cells can stop the server again.
server_cmd = f'python -m vllm.entrypoints.api_server --model "{model}"'
server_proc = subprocess.Popen(server_cmd, shell=True)

In [None]:
print("waiting for API server to start...")
while True:
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(f"http://0.0.0.0:8000/health") as response:
                if response.status == 200:
                    break
    except:
        pass
    await asyncio.sleep(1)
print("server started!")

In [None]:
# Shell command template for benchmark_serving.py. The six positional
# placeholders are, in order: tokenizer, dataset, block size, use-prefix flag,
# output CSV path and benchmark name. Fragments are separated by five spaces.
serving_cmd = (" " * 5).join([
    "python benchmark_serving.py",
    '--tokenizer "{}"',
    '--dataset "{}"',
    "--num-prompts 256",
    '--system-prompt "system_prompt.txt"',
    "--block-size {}",
    "--use-prefix {}",
    '--output-csv "{}"',
    '--benchmark-name "{}"',
])

In [None]:
# Serving benchmark with --use-prefix false (baseline).
subprocess.run(
    serving_cmd.format(model, dataset, block_size, 'false', filename, 'serving_baseline'),
    shell=True,
)

In [None]:
# Serving benchmark with --use-prefix true.
subprocess.run(
    serving_cmd.format(model, dataset, block_size, 'true', filename, 'serving_prefix'),
    shell=True,
)

In [None]:
print("Stopping API server")
# `server_proc` was started with shell=True, so `server_proc.pid` is the PID of
# the intermediate shell and the server itself may be a child of that shell.
# Guessing `pid + 1` is unreliable: PIDs are not guaranteed to be consecutive,
# so it can miss the server or SIGKILL an unrelated process. Instead, kill the
# shell's direct children by parent PID first (while the parent link still
# exists), then the shell itself.
try:
    subprocess.run(["pkill", "-KILL", "-P", str(server_proc.pid)])
except FileNotFoundError:
    # pkill (procps) is not installed; still kill the process we hold a handle to.
    pass
server_proc.kill()
# Reap the process so it does not linger as a zombie of the kernel.
server_proc.wait()