In [96]:
import pandas as pd
import yaml

from pathlib import Path

In [97]:
OUTPUT_DIR = f"{Path.cwd()}/benchmark_output"
MODEL_NAME = "01-ai/Yi-34B-200K"
TP_DIMENSION = 2
MAX_BATCH_SIZE = 4
NUM_REQUESTS = 2 * MAX_BATCH_SIZE
PREFILL_LENGTH = 1024
DECODE_LENGTH = 16
SEQUENCE_LENGTH = PREFILL_LENGTH + DECODE_LENGTH


In [98]:
def generate_runs():
    devices = ','.join([str(x) for x in range(TP_DIMENSION)])
    commands = []
    base_command = (
        f"CUDA_VISIBLE_DEVICES={devices} python sarathi/benchmark/main.py \\\n"
        f"--output_dir {OUTPUT_DIR} \\\n"
        f"--model_name {MODEL_NAME} \\\n"
        f"--model_max_model_len {SEQUENCE_LENGTH} \\\n"
        f"--cluster_num_replicas 1 \\\n"
        f"--model_tensor_parallel_degree {TP_DIMENSION} \\\n"
        f"--model_pipeline_parallel_degree 1 \\\n"
        f"--request_generator_provider synthetic \\\n"
        f"--synthetic_request_generator_interval_provider static \\\n"
        f"--synthetic_request_generator_num_requests {NUM_REQUESTS} \\\n"
        f"--synthetic_request_generator_length_provider fixed \\\n"
        f"--fixed_request_length_generator_prefill_tokens {PREFILL_LENGTH} \\\n"
        f"--fixed_request_length_generator_decode_tokens {DECODE_LENGTH} \\\n"
        f"--metrics_store_keep_individual_batch_metrics true \\\n"
    )
    vllm_command = base_command + (
        f"--replica_scheduler_provider vllm \\\n"
        f"--replica_scheduler_max_batch_size {MAX_BATCH_SIZE} \\\n"
        f"--vllm_scheduler_max_tokens_in_batch {SEQUENCE_LENGTH} \\\n"
    )
    sarathi_command = base_command + (
        f"--replica_scheduler_provider sarathi \\\n"
        f"--replica_scheduler_max_batch_size {MAX_BATCH_SIZE} \\\n"
        f"--sarathi_scheduler_chunk_size {PREFILL_LENGTH} \\\n"
        f"--sarathi_scheduler_enable_rolling_prefills true \\\n"
        f"--sarathi_scheduler_enable_dynamic_chunking_schedule false \\\n"
    )
    commands += [
        vllm_command + f"--metrics_store_enable_op_level_metrics false\n",
        vllm_command + f"--metrics_store_enable_op_level_metrics true\n",
        sarathi_command + f"--metrics_store_enable_op_level_metrics false\n",
        sarathi_command + f"--metrics_store_enable_op_level_metrics true\n",
    ]
    with open(f"runs.sh", "w") as f:
        f.write("#!/bin/bash\n")
        f.write("set -x\n")
        for command in commands:
            f.write(command + "\n")


In [99]:
def _get_run_directories():
    rootdir = Path(OUTPUT_DIR)
    subdirectory_list = [
        directory for directory in rootdir.iterdir() if directory.is_dir()
    ]
    return subdirectory_list

In [100]:
def _filter_df (df: pd.DataFrame):
    df = df[
        ((df["batch_num_prefill_tokens"] == PREFILL_LENGTH) & (df["batch_num_decode_tokens"] == 0) & (df["batch_size"] == 1))
        | ((df["batch_num_prefill_tokens"] == 0) & (df["batch_num_decode_tokens"] == MAX_BATCH_SIZE) & (df["batch_size"] == MAX_BATCH_SIZE))
        | ((df["batch_num_prefill_tokens"] == PREFILL_LENGTH - MAX_BATCH_SIZE + 1) & (df["batch_num_decode_tokens"] == MAX_BATCH_SIZE - 1) & (df["batch_size"] == MAX_BATCH_SIZE))
    ]
    df = df.groupby(["batch_num_prefill_tokens", "batch_num_decode_tokens", "batch_size"]).median().reset_index()
    return df

In [101]:
def get_df():
    run_directories = _get_run_directories()
    run_directories.sort()

    non_profiled_dfs = []
    profiled_dfs = [] 
    for run_dir in run_directories:
        try:
            with open(
                f"{run_dir}/benchmark_config.yml", "r"
            ) as benchmark_config_file, open(
                f"{run_dir}/replica_0/batch_metrics.csv", "r"
            ) as batch_metrics_file:
                benchmark_config = yaml.safe_load(benchmark_config_file)
                batch_metrics = pd.read_csv(batch_metrics_file)

                if benchmark_config["metrics_store_enable_op_level_metrics"]:
                    operation_metrics = pd.read_csv(f"{run_dir}/replica_0/operation_metrics.csv")
                    operation_metrics = pd.merge(batch_metrics, operation_metrics, on=["Batch Id"], how="inner")
                    profiled_dfs.append(operation_metrics)

                non_profiled_dfs.append(batch_metrics)
        except FileNotFoundError as e:
            print(f"WARN: Skipping {run_dir} due to {e}")
    
    non_profiled_df = _filter_df(pd.concat(non_profiled_dfs))
    non_profiled_df["batch_execution_time"] *= 1000
    non_profiled_df["per_token_time"] = non_profiled_df["batch_execution_time"] / non_profiled_df["batch_num_tokens"]
    profiled_df = _filter_df(pd.concat(profiled_dfs))
    profiled_df["linear"] = sum([profiled_df[x] for x in ["mlp_up_proj", "mlp_down_proj", "attn_pre_proj", "attn_post_proj"]])
    profiled_df["attention"] = sum([profiled_df[x] for x in ["attn"]])
    profiled_df = profiled_df[["batch_num_prefill_tokens", "batch_num_decode_tokens", "batch_size", "linear", "attention"]]

    return pd.merge(non_profiled_df, profiled_df, on=["batch_num_prefill_tokens", "batch_num_decode_tokens", "batch_size"], how="inner")


In [102]:
# generate_runs()

In [103]:
df = get_df()
df.to_csv("results.csv", index=False)
df

Unnamed: 0,batch_num_prefill_tokens,batch_num_decode_tokens,batch_size,Batch Id,batch_num_tokens,batch_execution_time,inter_batch_delay,per_token_time,linear,attention
0,0,4,4,20.0,4.0,68.122095,7.4e-05,17.030524,25.500416,2.9952
1,1021,3,4,17.0,1024.0,217.649878,0.0004,0.212549,158.99648,3.9936
2,1024,0,1,3.0,1024.0,217.235959,6.9e-05,0.212144,160.926719,13.6704
