# Analysis Notebook
This notebook runs the analyses listed here: https://microsoftapc-my.sharepoint.com/personal/mtremeer_microsoft_com/_layouts/15/doc.aspx?sourcedoc={00cb3c27-ee54-43f4-87ba-1017f94635da}&action=edit

In [4]:
import os
import enum
from pathlib import Path

import numpy as np
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

from helpers import MODEL, DEPLOYMENT_TYPE

# Never truncate DataFrames when they are displayed: show every column and every row
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

# Stays None unless the optional "Combine Log files" cell below is run
combined_logs_save_path = None

# Optional: Combine Log files
If logs have not yet been processed from separate JSON files into a single CSV, run this code below

In [2]:
# import hashlib
# from pathlib import Path

# log_dir = Path("logs")

# # Create hash of all files in directory so we can generate a unique combined_logs file
# all_filenames = ",".join(str(path) for path in log_dir.rglob("*.log"))
# dir_hash = hashlib.md5(all_filenames.encode()).hexdigest()[:8]
# combined_logs_save_path = f"../../logs/all_runs_combined_{dir_hash}.csv"

# # Combine all logs
# !cd .. && python -m combine_logs ../../logs $combined_logs_save_path --load-recursive

# Define constants

In [31]:
PTU_COST_PER_UNIT = 123 # Enter the PTU cost per unit here (multiplied by the configured PTU units to estimate the monthly PTU cost)

# Create a mapper to convert model info to their key info. Make sure to check that the paygo cost per 1,000 prompt and cost per 1,000 generation tokens fields are correct (paygo_cp1kpt and paygo_cp1kgt)
MODEL_CORE_INFO = {
    MODEL.GPT_35_TURBO_0613_4K: {"human_name": "GPT-3.5 Turbo 4K 0613", "paygo_cp1kpt": 0.0015, "paygo_cp1kgt": 0.002,},
    MODEL.GPT_35_TURBO_0613_16K: {"human_name": "GPT-3.5 Turbo 16K 0613", "paygo_cp1kpt": 0.003, "paygo_cp1kgt": 0.004,},
    # Generation price corrected from 0.0002 to 0.002: the old value made generation 5x cheaper than prompt
    # tokens, unlike every other model here - verify against the current price list
    MODEL.GPT_35_TURBO_1106_16K: {"human_name": "GPT-3.5 Turbo 1106", "paygo_cp1kpt": 0.001, "paygo_cp1kgt": 0.002,},
    MODEL.GPT_4_0613_8K: {"human_name": "GPT-4 8K 0613", "paygo_cp1kpt": 0.03, "paygo_cp1kgt": 0.06,},
    MODEL.GPT_4_0613_32K: {"human_name": "GPT-4 32K 0613", "paygo_cp1kpt": 0.06, "paygo_cp1kgt": 0.12,},
    MODEL.GPT_4_TURBO_1106_128K: {"human_name": "GPT-4 Turbo 1106", "paygo_cp1kpt": 0.01, "paygo_cp1kgt": 0.03,},
    MODEL.GPT_4_TURBO_VISION_1106: {"human_name": "GPT-4 Turbo Vision 1106", "paygo_cp1kpt": 0.01, "paygo_cp1kgt": 0.03,},
}
# Sanity check: one MODEL_CORE_INFO entry is expected per MODEL member
if len(MODEL_CORE_INFO) != len(MODEL):
    print("Warning: MODEL_CORE_INFO is missing models. See helpers.py for a list of models.")

# Map resource name to region. This helps us know the local datetime during the test
RESOURCE_INFO = {
    "aoai-aueast": {"region": "australiaeast", "deployment_type": DEPLOYMENT_TYPE.PAYGO},
    "aoai-sweden-mt": {"region": "swedencentral", "deployment_type": DEPLOYMENT_TYPE.PAYGO},
    "gbb-ea-openai-swedencentral-02": {"region": "swedencentral", "deployment_type": DEPLOYMENT_TYPE.PTU},
}

# Map deployment name to key variables that we'll need later
DEPLOYMENT_INFO = {
    ### Example deployment
    # "gpt-4-ptu": { # Name of deployment
    #     "model": MODEL.GPT_4_0613_8K, # Model name - see MODEL definition in helpers.py for more options
    #     "configured_paygo_tpm": None, # If deployment is PayGO, enter the configured Max TPM here
    #     "configured_ptu_units": 100, # If deployment is PTU, enter the Number of PTU units here
    #     "paygo_dynamic_quota": False, # Whether PayGO Dynamic Quota was enabled
    #     "content_filtering_enabled": False, # Whether content filtering was enabled
    # },
}

## Load logs

In [15]:
# Resolve the combined-logs CSV to load. The result is always a pathlib.Path because a later
# cell reads `combined_logs_path.name`, which a plain string does not have.
if combined_logs_save_path:
    # Use this branch if logs have just been combined in the optional cell above
    combined_logs_path = Path(combined_logs_save_path)
else:
    # Use this branch if loading manually: point it at an existing combined-logs CSV
    combined_logs_path = Path("~/Work Downloads/Customedata_AIA/aia_testing.csv")
# Expand a leading "~" so the path also works for consumers that do not expand it themselves
combined_logs_path = combined_logs_path.expanduser()

In [16]:
# Set output directory for all data and plots generated in this notebook:
# <three levels above the working directory>/analysis_outputs/<logs file name up to its first ".">
REPO_BASE_DIR = Path.cwd().parent.parent.parent
DEFAULT_ANALYSIS_SAVE_DIR = REPO_BASE_DIR.joinpath("analysis_outputs")
combined_logs_name = combined_logs_path.name.partition(".")[0]
CURRENT_ANALYSIS_SAVE_DIR = DEFAULT_ANALYSIS_SAVE_DIR.joinpath(combined_logs_name)

In [17]:
# Read the combined logs CSV
all_logs = pd.read_csv(combined_logs_path)
print(f"{len(all_logs)} logs loaded")

# Keep only runs that were not early-terminated ...
finished_runs = all_logs.loc[~all_logs["early_terminated"]]
# ... and that have TPM data (it can be missing when the aggregation window was too small)
all_logs = finished_runs.loc[finished_runs["tpm_context"].notna()]
print(f"{len(all_logs)} logs remaining after removing early-terminations and those missing data. ")

# Drop columns that are not needed for the analysis
unneeded_cols = ["early_terminated", "api_version", "frequency_penalty", "presence_penalty", "temperature", "top_p", "output_format"]
all_logs = all_logs.drop(columns=unneeded_cols)


def _resource_name(endpoint):
    """Return the part of the endpoint's first dot-separated segment that follows "//"."""
    return endpoint.split(".")[0].split("//")[1]


def _token_profile(row):
    """Return "<context tokens> / <max tokens>"; replayed runs use the average context tokens per request."""
    if row["context_generation_method"] == "generate":
        context_tokens = row["context_tokens"]
    else:
        context_tokens = row["context_tpr_avg"]
    return f"{int(context_tokens)} / {int(row['max_tokens'])}"


def _replay_name(row):
    """Return the replay file's base name up to its first "."; empty string for non-replay runs."""
    if row["context_generation_method"] != "replay":
        return ""
    return os.path.basename(row["replay_path"]).split(".")[0]


def _workload_label(row, gen_col, separator):
    """Return the token profile for generated runs, else "<replay name><separator>(<context> / <gen_col value>)"."""
    if row["context_generation_method"] == "generate":
        return row["token_profile"]
    return f'{row["replay_name"]}{separator}({row["context_tpr_avg"]} / {row[gen_col]})'


# Resource name extracted from the endpoint URL
all_logs["aoai_resource"] = all_logs["api_base_endpoint"].apply(_resource_name)

# Token profile / replay / combined strings for easier groupby
all_logs["token_profile"] = all_logs.apply(_token_profile, axis=1)
all_logs["replay_name"] = all_logs.apply(_replay_name, axis=1)

# "config" labels show the requested max_tokens, "observed" labels the average generated tokens;
# the "_with_break" variants put an HTML line break before the bracket instead of a space
workload_label_specs = {
    "workload_name_config": ("max_tokens", " "),
    "workload_name_config_with_break": ("max_tokens", "<br>"),
    "workload_name_observed": ("gen_tpr_avg", " "),
    "workload_name_observed_with_break": ("gen_tpr_avg", "<br>"),
}
for label_col, (gen_col, separator) in workload_label_specs.items():
    all_logs[label_col] = all_logs.apply(_workload_label, axis=1, gen_col=gen_col, separator=separator)


14 logs loaded
14 logs remaining after removing early-terminations and those missing data. 


In [18]:
# Add date of run to DF: the first 10 characters of the log file's base name
all_logs["run_date"] = all_logs["filename"].apply(lambda x: x.split("/")[-1][:10])
# Flag runs whose deployment name contains "ptu" (case-sensitive)
all_logs["is_ptu"] = all_logs["deployment"].apply(lambda x: "ptu" in x)
# Number of runs per (is_ptu, run_date). size() counts rows per group directly, without calling a
# Python function on every group frame as apply(len) did
all_logs.groupby(["is_ptu", "run_date"]).size()

is_ptu  run_date  
True    2024-01-19    14
dtype: int64

# Overview of runs

In [None]:
# Count the runs for every distinct combination of these columns, ordered by the combination
overview_cols = ["context_generation_method", "deployment", "run_date", "workload_name_config", "duration"]
all_logs[overview_cols].value_counts().sort_index()

# Check that all required data has been loaded into the Mappers

We'll use these mappers to make sure all models have associated information, and to make it easy to enrich our DF and make readable graphs later on

In [None]:
# Check that every Azure OpenAI resource seen in the logs has a RESOURCE_INFO entry
resources_missing = False
for resource in all_logs["aoai_resource"].unique():
    if resource not in RESOURCE_INFO:
        print(f"WARNING: Azure AOAI resource {resource} has no entry in RESOURCE_INFO")
        resources_missing = True
if not resources_missing:
    print("Great! All Azure AOAI resources in logs have a RESOURCE_INFO entry.")

print("")

# Check that every deployment seen in the logs has a DEPLOYMENT_INFO entry
models_missing = False
for deployment in all_logs["deployment"].unique():
    if deployment not in DEPLOYMENT_INFO:
        print(f"Warning: Model {deployment} has no entry in DEPLOYMENT_INFO")
        # Must set the same flag that is tested below; a differently named flag here meant the
        # success message was printed even when deployments were missing
        models_missing = True
if not models_missing:
    print("Great! All deployments logs have a DEPLOYMENT_INFO entry.")
print("")

Great! All Azure AOAI resources in logs have a RESOURCE_INFO entry.

Great! All deployments logs have a DEPLOYMENT_INFO entry.



In [11]:
# Suggested Max PTU RPM for each workload, one row per workload:
# (first key element, second key element, suggested max RPM).
# The mapper is looked up with (context tokens per request, max_tokens).
_suggested_ptu_rpm_rows = [
    (3500, 300, 10),
    (500, 100, 49),
    (3800, 5, 16),
    (3000, 5, 20),
    (1000, 5, 62),
    (1000, 100, 35),
    (1000, 250, 21),
    (1000, 500, 12),
    (1000, 1000, 4),
    (500, 1000, 5),
    (250, 1000, 5),
    (100, 1000, 5),
    (5950, 1000, 4),
    (5950, 150, 8),
    (6311, 1000, 4),
    (6311, 150, 7),
]
max_ptu_rpm_by_workload_mapper = {
    (context_tokens, max_tokens): max_rpm
    for context_tokens, max_tokens, max_rpm in _suggested_ptu_rpm_rows
}

# TODO: Turn this into a mapper and apply it to the DF

# Preprocess data - add common fields/transformations

* Merge info from mappers
* Add fields for observed values from model vs what was requested for the benchmark
* Calculate required fields for TCO analysis

In [33]:
## Helpers

def format_val_to_k(val):
    """Express ``val`` in whole thousands (truncated toward zero) with a "K" suffix, e.g. 12500 -> "12K"."""
    thousands = int(val / 1000)
    return str(thousands) + "K"


def int_to_thousands(val):
    """Format a number compactly for display.

    Returns "inf" / "NaN" for those special values, the truncated integer below 1,000,
    thousands with one decimal below 10,000 ("1.5K"), whole thousands below 1,000,000 ("25K"),
    and millions with two decimals from 1,000,000 up ("2.5M").
    """
    if val == np.inf:
        label = "inf"
    elif np.isnan(val):
        label = "NaN"
    elif val < 1000:
        label = str(int(val))
    elif val < 10000:
        label = f"{round(val / 1000, 1)}K"
    elif val >= 1000000:
        label = f"{round(val / 1000000, 2)}M"
    else:
        label = f"{int(val / 1000)}K"
    return label

def pretty_str(s):
    """Turn a snake_case name into a display label.

    Underscores become spaces and the text is title-cased; the acronyms RPM, TPM, TPR, PTU,
    TTFT, TBT and GPT are then upper-cased and "Paygo" is spelled "PayGO".
    """
    label = s.replace("_", " ").title()
    for acronym in ("Rpm", "Tpm", "Tpr", "Ptu", "Ttft", "Tbt", "Gpt"):
        if acronym in label:
            label = label.replace(acronym, acronym.upper())
    return label.replace("Paygo", "PayGO")

# Display label for each deployment type; used to render the "deployment_type" column when
# a DataFrame is pretty-formatted
deployment_type_str_mapper = {
    DEPLOYMENT_TYPE.PAYGO: "PayGO",
    DEPLOYMENT_TYPE.PTU: "PTU",
}

def round_floats(val):
    """Format a numeric value as a short string for display; pass anything else through unchanged.

    * Non-numeric values (str, None, ...) are returned as-is.
    * inf -> "inf", NaN -> "NaN".
    * Python ints/floats are rounded to 3 decimals below 1, 2 below 10, 1 below 100 and
      0 below 1,000; larger values are delegated to ``int_to_thousands``.
    * NumPy scalars that are not Python ints/floats are returned as-is once the inf/NaN checks pass.
    """
    # This guard has to run before np.isnan(), which raises TypeError on strings and None
    if not isinstance(val, (int, float, np.number)):
        return val
    if val == np.inf:
        return "inf"
    if np.isnan(val):
        return "NaN"
    if not isinstance(val, (int, float)):
        return val
    if val < 1:
        return f"{round(val, 3)}"
    if val < 10:
        return f"{round(val, 2)}"
    if val < 100:
        return f"{round(val, 1)}"
    if val < 1000:
        return f"{round(val)}"
    return int_to_thousands(val)

def apply_pretty_formatting(df: pd.DataFrame, cols_to_ignore = None) -> pd.DataFrame:
    """Return a display-ready copy of ``df``; ``df`` itself is not modified.

    Values (every column not listed in ``cols_to_ignore``):
      * "deployment_type" is mapped through ``deployment_type_str_mapper``;
      * the known ratio columns are rendered as percentages;
      * columns whose name (or first level, for tuple names) contains "cost" are rendered as dollars;
      * everything else goes through ``round_floats``.
    A column whose formatting raises is reported with a message and left as it was.

    Names: string column names and index names are passed through ``pretty_str``
    (``cols_to_ignore`` only affects values, not names).

    Args:
        df: frame to format.
        cols_to_ignore: column names whose values must be left untouched.

    Returns:
        The formatted copy.
    """
    if not cols_to_ignore:
        cols_to_ignore = list()
    percent_cols = ["total_tpm_relative_to_expected", "ptu_rpm_relative_to_expected", "observed_prompt_vs_gen_token_ratio", "Observed Context/Generation Token Ratio"]

    def format_percent(x):
        # Fewer decimals as the percentage grows: 2 below 100%, 1 below 1000%, none above
        pct = x * 100
        if pct < 100:
            return "{:,.2f}%".format(pct)
        if pct < 1000:
            return "{:,.1f}%".format(pct)
        return "{:,.0f}%".format(pct)

    def is_cost_col(col):
        # For tuple (multi-level) column names only the first level is inspected
        name = col[0] if isinstance(col, tuple) else col
        return isinstance(name, str) and "cost" in name

    def prettify(name):
        # Only strings can be prettified; None / numeric names are kept as they are
        return pretty_str(name) if isinstance(name, str) else name

    out = df.copy()
    numeric_cols = out.select_dtypes(include=np.number).columns
    out.loc[:, numeric_cols] = out.loc[:, numeric_cols].apply(lambda x: pd.to_numeric(x, downcast='integer'))
    # Format values
    for col in [c for c in out.columns if c not in cols_to_ignore]:
        try:
            # One branch per column: "deployment_type" must not fall through to round_floats
            if col == "deployment_type":
                out[col] = out[col].apply(lambda x: deployment_type_str_mapper[x])
            elif col in percent_cols:
                out[col] = out[col].apply(format_percent)
            elif is_cost_col(col):
                out[col] = out[col].apply(lambda x: "${:,.3f}".format(x))
            else:
                out[col] = out[col].apply(round_floats)
        except Exception as e:
            # Best effort: report the column and carry on with the others
            print(f"Transform of col '{col}' failed: {e}")

    # Format column names
    new_cols = []
    for col in out.columns.tolist():
        if isinstance(col, tuple):
            new_cols.append(tuple(prettify(sub_col) for sub_col in col))
        else:
            new_cols.append(prettify(col))
    if out.columns.nlevels == 1:
        out.columns = new_cols
    else:
        out.columns = pd.MultiIndex.from_tuples(new_cols, names=out.columns.names)

    # Index names (works for both single-level and multi-level indexes)
    out.index = out.index.set_names([prettify(name) for name in out.index.names])
    return out

In [34]:
### Preprocess all data
def _merge_mapper_info(row, key, mapper):
    """Copy every field of ``mapper[key]`` onto ``row``; return ``row`` unchanged when ``key`` has no entry."""
    for field, value in mapper.get(key, {}).items():
        row[field] = value
    return row


# Merge info from RESOURCE_INFO
def merge_resource_info(row):
    """Add the RESOURCE_INFO fields of the row's "aoai_resource" to the row."""
    return _merge_mapper_info(row, row["aoai_resource"], RESOURCE_INFO)
all_logs = all_logs.apply(merge_resource_info, axis=1)

# Merge info from DEPLOYMENT_INFO
def merge_deployment_info(row):
    """Add the DEPLOYMENT_INFO fields of the row's "deployment" to the row."""
    return _merge_mapper_info(row, row["deployment"], DEPLOYMENT_INFO)
all_logs = all_logs.apply(merge_deployment_info, axis=1)

# Merge info from MODEL_CORE_INFO
def merge_model_core_info(row):
    """Add the MODEL_CORE_INFO fields of the row's "model" to the row."""
    # .get(): "model" is only present when the previous step found a DEPLOYMENT_INFO entry that
    # defines it; a missing column then means "no entry" instead of a KeyError
    return _merge_mapper_info(row, row.get("model"), MODEL_CORE_INFO)
all_logs = all_logs.apply(merge_model_core_info, axis=1)

# Share of each request's tokens that are prompt tokens (uses the observed generated-token count)
tokens_per_request = all_logs["context_tpr_avg"] + all_logs["gen_tpr_avg"]
all_logs["observed_prompt_vs_gen_token_ratio"] = all_logs["context_tpr_avg"] / tokens_per_request

# Time between prompt tokens: time-to-first-token divided by the number of context tokens
all_logs["tbt_prompt_avg"] = all_logs["ttft_avg"].div(all_logs["context_tokens"])
all_logs["tbt_prompt_95th"] = all_logs["ttft_95th"].div(all_logs["context_tokens"])

# Forecast the e2e latency had the model generated the full max_tokens:
# add tbt_avg for every token the average response fell short of max_tokens
missing_gen_tokens = all_logs["max_tokens"] - all_logs["gen_tpr_avg"]
all_logs["e2e_avg_assuming_max_tokens"] = all_logs["e2e_avg"] + missing_gen_tokens * all_logs["tbt_avg"]

# Total time from first token to last token
all_logs["tfft_to_lt_avg"] = all_logs["e2e_avg"].sub(all_logs["ttft_avg"])
all_logs["tfft_to_lt_95th"] = all_logs["e2e_95th"].sub(all_logs["ttft_95th"])
all_logs["tfft_to_lt_avg_assuming_max_tokens"] = all_logs["e2e_avg_assuming_max_tokens"].sub(all_logs["ttft_avg"])

# Estimate average concurrency: seconds of request time per minute (avg e2e latency x RPM),
# divided by the 60 seconds in a minute and rounded to one decimal
all_logs["minutely_gpu_time"] = all_logs["e2e_avg"] * all_logs["rpm"]
all_logs["avg_concurrency"] = (all_logs["minutely_gpu_time"] / 60).apply(lambda concurrency: round(concurrency, 1))

# TPM while in context or generation (to prevent TPM for prompt being reduced while waiting for generation)
context_tokens_per_second = all_logs["context_tpr_avg"] / all_logs["ttft_avg"]
all_logs["tpm_context_exclusive"] = context_tokens_per_second * 60 * all_logs["avg_concurrency"]
gen_tokens_per_second = all_logs["gen_tpr_avg"] / all_logs["tfft_to_lt_avg"]
all_logs["tpm_gen_exclusive_per_request"] = gen_tokens_per_second * 60
all_logs["tpm_gen_exclusive"] = all_logs["tpm_gen_exclusive_per_request"] * all_logs["avg_concurrency"]

# Successful requests, and throttled requests scaled from the aggregation window to a per-minute rate
all_logs["successful"] = all_logs["completed"].sub(all_logs["failures"])
throttled_per_second = all_logs["throttled"] / all_logs["aggregation_window"]
all_logs["throttled_rpm"] = throttled_per_second * 60

# Other helper cols: human-readable deployment labels
def _label_with_capacity(row):
    """Model name plus the configured capacity: PTU units for PTU rows, max TPM (in thousands) otherwise."""
    if row["deployment_type"] == DEPLOYMENT_TYPE.PTU:
        return f"{row['human_name']} - ({int(row['configured_ptu_units'])} PTUs)"
    return f"{row['human_name']} - PayGO ({format_val_to_k(row['configured_paygo_tpm'])} Max TPM)"


def _label_with_deployment_type(row):
    """Model name plus the deployment type (the non-PTU label keeps its trailing space)."""
    suffix = "PTU" if row["deployment_type"] == DEPLOYMENT_TYPE.PTU else "PayGO "
    return f"{row['human_name']} - {suffix}"


all_logs["human_name_w_tpm"] = all_logs.apply(_label_with_capacity, axis=1)
all_logs["human_name_w_deployment_type"] = all_logs.apply(_label_with_deployment_type, axis=1)

### Estimate cost over time
MINUTES_PER_MONTH = 60 * 24 * 365.25 / 12


def _paygo_cost_per_request(row):
    """PayGO price of one average request: prompt and generated tokens at their per-1K prices, rounded to 4 decimals."""
    prompt_cost = row["context_tpr_avg"] * row["paygo_cp1kpt"] / 1000
    generation_cost = row["gen_tpr_avg"] * row["paygo_cp1kgt"] / 1000
    return round(prompt_cost + generation_cost, 4)


def _est_monthly_cost(row):
    """PTU rows: configured units x PTU_COST_PER_UNIT (rounded); other rows: their PayGO monthly cost."""
    if row["deployment_type"] is DEPLOYMENT_TYPE.PTU:
        return round(row["configured_ptu_units"] * PTU_COST_PER_UNIT)
    return row["paygo_cost_per_month"]


def _est_hourly_cost(row):
    """Monthly cost converted to an hourly cost, rounded to 3 decimals."""
    return round(row["est_monthly_cost"] / MINUTES_PER_MONTH * 60, 3)


def _est_cost_per_request(row):
    """PTU rows: hourly cost spread over the requests served per hour; other rows: the PayGO cost per request."""
    if row["deployment_type"] is DEPLOYMENT_TYPE.PTU:
        return round(row["est_hourly_cost"] / 60 / row["rpm"], 4)
    return row["paygo_cost_per_request"]


# PayGO cost per request, and per month / hour at the observed RPM
all_logs["paygo_cost_per_request"] = all_logs.apply(_paygo_cost_per_request, axis=1)
paygo_cost_per_minute = all_logs["paygo_cost_per_request"] * all_logs["rpm"]
all_logs["paygo_cost_per_month"] = paygo_cost_per_minute * MINUTES_PER_MONTH
all_logs["paygo_cost_per_hour"] = paygo_cost_per_minute * 60

# Estimated monthly and hourly cost based on deployment type
all_logs["est_monthly_cost"] = all_logs.apply(_est_monthly_cost, axis=1)
all_logs["est_hourly_cost"] = all_logs.apply(_est_hourly_cost, axis=1)

# Estimated cost per request and per 1K requests
all_logs["est_cost_per_request"] = all_logs.apply(_est_cost_per_request, axis=1)
all_logs["est_cost_per_1k_requests"] = all_logs["est_cost_per_request"] * 1000
all_logs["paygo_cost_per_1k_requests"] = all_logs["paygo_cost_per_request"] * 1000

# Compare PTU and PayGO cost
all_logs["ptu_cost_vs_paygo_cost"] = all_logs["est_cost_per_request"] / all_logs["paygo_cost_per_request"]

# Add expected RPM/TPM and comparison
# Workloads without an entry in max_ptu_rpm_by_workload_mapper get NaN rather than None: when no
# workload matches, a column made only of None cannot be multiplied or divided (TypeError),
# whereas NaN simply propagates through the calculations below
all_logs["exp_ptu_rpm"] = all_logs.apply(lambda row: max_ptu_rpm_by_workload_mapper.get((row["context_tpr_avg"], row["max_tokens"]), np.nan), axis=1)
# Expected PTU TPM = expected RPM x (context tokens per request + max_tokens); PTU rows only
all_logs["exp_ptu_tpm"] = all_logs.apply(lambda row: row["exp_ptu_rpm"] * (row["context_tpr_avg"] + row["max_tokens"]) if row["deployment_type"] is DEPLOYMENT_TYPE.PTU else np.nan, axis=1)
all_logs["ptu_rpm_relative_to_expected"] = all_logs["rpm"] / all_logs["exp_ptu_rpm"]
# Expected maximum TPM: the PTU expectation for PTU rows, the configured max TPM otherwise
all_logs["exp_max_tpm"] = all_logs.apply(lambda row: row["exp_ptu_tpm"] if row["deployment_type"] is DEPLOYMENT_TYPE.PTU else row["configured_paygo_tpm"], axis=1)
all_logs["total_tpm_relative_to_expected"] = all_logs["tpm_total"] / all_logs["exp_max_tpm"]

# Add float representation of utilisation for PTU
def _percent_str_to_float(value):
    """Strip "%" from a string and convert it to float; non-string values are returned unchanged."""
    return float(value.replace("%", "")) if isinstance(value, str) else value


all_logs["util_avg_float"] = all_logs["util_avg"].apply(_percent_str_to_float)
all_logs["util_95th_float"] = all_logs["util_95th"].apply(_percent_str_to_float)

# Add col to switch between PTU groups but still include PayGO runs, based on date of run:
# PayGO rows belong to every run date ("<date> or <date> ..."), other rows to their own run date.
# The joined label is the same for every row, so it is built once instead of once per PayGO row
all_run_dates_label = " or ".join(all_logs["run_date"].unique().tolist())
all_logs["run_group"] = all_logs.apply(lambda row: all_run_dates_label if row["deployment_type"] is DEPLOYMENT_TYPE.PAYGO else row["run_date"], axis=1)

# General findings

## Stats for the best run: By model and workload

Metrics and group-by to get these:

Across all runs
- e2e avg & p95
- TTFT avg & p95
- Latency avg & p95
- TPM
- RPM
- Max concurrency
- Cost per request

Across runs where gen tokens >= 100:
- Min/Max Gen TPM/request

In [35]:
# Column name in all_logs -> heading used in the result tables
COL_RENAME_MAPPER = dict(
    context_tpr_avg="Average Context Tokens/Request",
    gen_tpr_avg="Average Generation Tokens/Request",
    observed_prompt_vs_gen_token_ratio="Observed Context/Generation Token Ratio",
    tpm_total="TPM - Total",
    tpm_context_exclusive="TPM - Context Exclusive",
    tpm_gen_exclusive="TPM - Generation Exclusive",
    tpm_gen_exclusive_per_request="TPM - Generation Exclusive Per Request",
    est_cost_per_1k_requests="Cost Per 1K Requests",
    tpm_context="TPM - Context",
    tpm_gen="TPM - Generation",
    ttft_avg="Time-to-First-Token - Average",
    ttft_95th="Time-to-First-Token - P95",
    e2e_avg="End-to-End Latency - Average",
    e2e_95th="End-to-End Latency - P95",
    avg_concurrency="Average Concurrent Requests",
)

In [None]:
group_cols = ["human_name_w_tpm", "workload_name_config"]
cols_for_agg = ["deployment_type", "rate", "clients", "context_tpr_avg", "gen_tpr_avg", "observed_prompt_vs_gen_token_ratio", "tpm_total", "tpm_context", "tpm_context_exclusive", "tpm_gen", "tpm_gen_exclusive", "tpm_gen_exclusive_per_request", "ttft_avg", "ttft_95th", "e2e_avg", "e2e_95th", "rpm", "est_cost_per_1k_requests", "paygo_cost_per_1k_requests", "ptu_cost_vs_paygo_cost"]
min_100t_cols_for_agg = ["tpm_gen_exclusive", "tpm_gen_exclusive_per_request"]

# Filter to best runs for each deployment and workload.
# Eligible runs: every PayGO run, and PTU runs in which nothing was throttled
is_paygo = all_logs["deployment_type"] == DEPLOYMENT_TYPE.PAYGO
is_unthrottled_ptu = (all_logs["deployment_type"] == DEPLOYMENT_TYPE.PTU) & (all_logs["throttled"] == 0)
min_max_agg_mask = is_paygo | is_unthrottled_ptu
# Best run = the single row with the highest tpm_total in each group. drop_duplicates keeps that
# whole row; groupby(...).first() skips missing values column by column, so it could combine
# values from different runs whenever the top run has a missing value
best_runs_df = (
    all_logs[min_max_agg_mask]
    .sort_values(["tpm_total"], ascending=False)
    .drop_duplicates(subset=group_cols, keep="first")
    .set_index(group_cols)
    .sort_index()
)
runs_with_low_completion_tokens_mask = best_runs_df["gen_tpr_avg"] < 100
runs_to_include_df = best_runs_df[cols_for_agg].rename(columns=COL_RENAME_MAPPER)

out = apply_pretty_formatting(runs_to_include_df, cols_to_ignore=["context_tpr_avg", "gen_tpr_avg", "ptu_cost_vs_paygo_cost", "Cost Per 1K Requests", "paygo_cost_per_1k_requests"])

# Ignore the generation speed TPM for runs with less than 100 completion tokens
for generation_speed_col in ("TPM - Generation Exclusive", "TPM - Generation Exclusive Per Request"):
    out.loc[runs_with_low_completion_tokens_mask, generation_speed_col] = "N/A"

# Format the cost columns of the PTU rows as dollars with one decimal
ptu_mask = out["Deployment Type"] == "PTU"
for cost_col in ("Cost Per 1K Requests", "PayGO Cost Per 1K Requests"):
    out.loc[ptu_mask, cost_col] = out.loc[ptu_mask, cost_col].apply(lambda x: f"${round(x, 1)}")

# Get cost for different usage patterns. These must be derived while the 24/7 ratio is still
# numeric, i.e. before it is turned into an "N.NNx" string below
out.loc[ptu_mask, "PTU Cost Vs PayGO Cost - Weekdays"] = out.loc[ptu_mask, "PTU Cost Vs PayGO Cost"] * 7 / 5
out.loc[ptu_mask, "PTU Cost Vs PayGO Cost - Business Hours (8AM-6PM)"] = out.loc[ptu_mask, "PTU Cost Vs PayGO Cost"] * 7 * 24 / 5 / 10

for ratio_col in ("PTU Cost Vs PayGO Cost", "PTU Cost Vs PayGO Cost - Weekdays", "PTU Cost Vs PayGO Cost - Business Hours (8AM-6PM)"):
    out.loc[ptu_mask, ratio_col] = out.loc[ptu_mask, ratio_col].apply(lambda x: f"{round(x, 2)}x")
out.rename(columns={"PTU Cost Vs PayGO Cost": "PTU Cost Vs PayGO Cost - 24/7"}, inplace=True)

# Save the table next to the other analysis outputs and display it
outpath = CURRENT_ANALYSIS_SAVE_DIR / "data" / "best_run_stats_by_deployment_and_workload.csv"
outpath.parent.mkdir(exist_ok=True, parents=True)
out.to_csv(outpath, index=True)
out

## Plot: Latency by workload

In [43]:
# Best runs with the group keys turned back into ordinary columns
plot_df = best_runs_df.reset_index()

# One subplot per distinct observed workload, in order of first appearance
workloads_to_plot = plot_df["workload_name_observed"].drop_duplicates().tolist()
# Vertical gap between legend groups in the latency plot
legend_gap = 150

In [None]:
## Create plot: one subplot per workload; every deployment gets a stacked "P95" bar and a stacked "Avg" bar
fig = make_subplots(
    rows=len(workloads_to_plot), cols=1,
    vertical_spacing=0.1,
    subplot_titles=workloads_to_plot,
)
x_axis_max = plot_df["e2e_95th"].max() * 1.1

# (trace name, key of the data series, legend rank, marker), in the order the traces are added
latency_trace_specs = [
    ("Avg TTFT", "ttft_avg", 4, {"color": "green", "opacity": 0.8}),
    ("Avg Generation", "gen_avg", 3, {"color": "blue", "opacity": 0.8}),
    ("P95 TTFT", "ttft_95th", 2, {"color": "darkorange", "opacity": 0.9}),
    ("P95 Generation", "gen_95th", 1, {"color": "red", "opacity": 0.8}),
]

for i, workload_name in enumerate(workloads_to_plot, start=1):
    temp_df = plot_df[plot_df["workload_name_observed"] == workload_name].sort_values(["deployment"], ascending=False)
    x_max = temp_df["e2e_95th"].max()
    dtick = round(x_max * 1.03 / 25 * 4) / 4

    # Every run contributes two y positions: (deployment, "P95") then (deployment, "Avg").
    # Each series is zero at the position that belongs to the other statistic
    model_labels = []
    stat_labels = []
    bar_values = {"ttft_avg": [], "gen_avg": [], "ttft_95th": [], "gen_95th": []}
    for _idx, run in temp_df.iterrows():
        deployment_label = run["human_name_w_deployment_type"]
        model_labels += [deployment_label, deployment_label]
        stat_labels += ["P95", "Avg"]
        bar_values["ttft_95th"] += [run["ttft_95th"], 0]
        bar_values["gen_95th"] += [run["e2e_95th"] - run["ttft_95th"], 0]
        bar_values["ttft_avg"] += [0, run["ttft_avg"]]
        bar_values["gen_avg"] += [0, run["e2e_avg"] - run["ttft_avg"]]
    axis_labels = [model_labels, stat_labels]

    for trace_name, series_key, legend_rank, marker in latency_trace_specs:
        fig.add_trace(
            go.Bar(
                x=bar_values[series_key],
                y=axis_labels,
                name=trace_name,
                legendgroup=i,
                legendrank=legend_rank,
                marker=marker,
                orientation='h',
            ),
            row=i,
            col=1,
        )
    fig.update_xaxes(title_text="Seconds", row=i, col=1, dtick=dtick)

title = "Request Latency Metrics by Model and Workload"
height = 1000
fig.update_layout(title_text=title, title_x=0.5, height=height, width=1600, barmode='stack', legend_tracegroupgap=legend_gap)
fig.show()

# Save to disk
outpath = CURRENT_ANALYSIS_SAVE_DIR / "plots/latency_by_workload.png"
outpath.parent.mkdir(exist_ok=True, parents=True)
fig.write_image(outpath)

print("Saved to", outpath)

# Plots by Workload: Maximum performance stats with common workloads

In [46]:
# https://pythondatascience.plavox.info/wp-content/uploads/2016/06/colorpalette.png

# Bar colour for every plotted series; all bars share the same opacity
_BAR_COLORS = {
    "latency_gen": "blueviolet",
    "latency_prompt": "blue",
    "ttft_prompt": "grey",
    "ttft_gen": "red",
    "tpm_prompt": "darkorange",
    "tpm_gen": "maroon",
    "rpm": "green",
    "throttled_rpm": "red",
}
BAR_MARKER_SETTINGS = {
    series: {"color": color, "opacity": 0.8}
    for series, color in _BAR_COLORS.items()
}

In [None]:
def _add_workload_bar(fig, x, y, name, marker_key, row, col, **bar_kwargs):
    """Add one bar trace to subplot (row, col) of ``fig``.

    ``y`` is a pandas Series, ``marker_key`` selects the marker from BAR_MARKER_SETTINGS and
    ``bar_kwargs`` (e.g. legendgroup, showlegend) are passed straight to ``go.Bar``.
    """
    fig.add_trace(
        go.Bar(x=x, y=y.values.tolist(), name=name, marker=BAR_MARKER_SETTINGS[marker_key], **bar_kwargs),
        row=row,
        col=col,
    )


group_set = plot_df["deployment"].drop_duplicates().values.tolist()

# One 2x2 figure per deployment: avg latency, P95 latency, RPM and TPM of its best run per workload
for deployment in group_set:
    temp_df = plot_df[(plot_df["deployment"] == deployment)].sort_values(["workload_name_observed", "deployment", "clients", "rate"])
    x_axis_title = "Workload Name"
    title = f"{temp_df['human_name_w_tpm'].iloc[0]} - Best Run Stats by Workload"

    fig = make_subplots(
        rows=2, cols=2, row_heights=[400, 400],
        subplot_titles=("Average End-to-End Request Latency", "P95 End-to-End Request Latency", "Successful & Throttled Requests Per Minute (RPM)", "Total Tokens Per Minute (TPM)")
    )
    workload_labels = temp_df["workload_name_observed_with_break"].values.tolist()
    # Both latency subplots share one y range so they can be compared directly
    latency_y_max = temp_df["e2e_95th"].max() * 1.1

    # Average Latencies (these two traces provide the legend entries for both latency subplots)
    _add_workload_bar(fig, workload_labels, temp_df["ttft_avg"], "Time to First Token", "latency_prompt", 1, 1, legendgroup=0)
    _add_workload_bar(fig, workload_labels, temp_df["e2e_avg"] - temp_df["ttft_avg"], "Token Generation", "latency_gen", 1, 1, legendgroup=0)
    fig.update_yaxes(title_text="Seconds", range=[0, latency_y_max], row=1, col=1)
    fig.update_xaxes(title_text=x_axis_title, row=1, col=1)

    # P95 Latencies (hidden from the legend). The TTFT trace is named for what it plots, so
    # hover text no longer labels time-to-first-token bars as "Token Generation"
    _add_workload_bar(fig, workload_labels, temp_df["ttft_95th"], "Time to First Token", "latency_prompt", 1, 2, showlegend=False)
    _add_workload_bar(fig, workload_labels, temp_df["e2e_95th"] - temp_df["ttft_95th"], "Token Generation", "latency_gen", 1, 2, showlegend=False)
    fig.update_yaxes(title_text="Seconds", range=[0, latency_y_max], row=1, col=2)
    fig.update_xaxes(title_text=x_axis_title, row=1, col=2)

    # RPM: successful and throttled requests, stacked
    _add_workload_bar(fig, workload_labels, temp_df["rpm"], deployment, "rpm", 2, 1, legendgroup=False, showlegend=False)
    _add_workload_bar(fig, workload_labels, temp_df["throttled_rpm"], deployment, "throttled_rpm", 2, 1, legendgroup=False, showlegend=False)
    fig.update_yaxes(title_text="RPM - Log Scale", type="log", row=2, col=1)
    fig.update_xaxes(title_text=x_axis_title, row=2, col=1)

    # TPM: context and generation tokens, stacked
    _add_workload_bar(fig, workload_labels, temp_df["tpm_context"], "Context TPM", "tpm_prompt", 2, 2, legendgroup=2, showlegend=True)
    _add_workload_bar(fig, workload_labels, temp_df["tpm_gen"], "Generation TPM", "tpm_gen", 2, 2, legendgroup=2, showlegend=True)
    fig.update_yaxes(title_text="TPM - Log Scale", type="log", row=2, col=2)
    fig.update_xaxes(title_text=x_axis_title, row=2, col=2)

    # Save short and tall versions. The loop variable is deliberately not called `legend_gap`:
    # that notebook-level name is used by the latency plot and must not be overwritten here
    for height, tracegroup_gap in [(600, 220), (800, 350)]:
        fig.update_layout(title_text=title, title_x=0.5, height=height, width=1600, coloraxis=dict(colorscale='Bluered_r'), barmode='stack', legend_tracegroupgap=tracegroup_gap)
        # Only the tall version is displayed inline
        if height == 800:
            fig.show()

        # Save to disk
        outpath = CURRENT_ANALYSIS_SAVE_DIR / f"plots/workload_stats_2x2_{deployment}_{height}px.png"
        outpath.parent.mkdir(exist_ok=True, parents=True)
        fig.write_image(outpath)

        print("Saved to", outpath)

In [49]:
# group_set = common_workload_df[["deployment", "run_date"]].drop_duplicates().values.tolist()

# for deployment, run_date in group_set:
#     temp_df = common_workload_df[(common_workload_df["deployment"] == deployment)].sort_values(["workload_name", "deployment", "clients", "rate"])
#     # Determine whether to use clients or rate as the x-axis
#     num_clients = temp_df["clients"].nunique()
#     num_rates = temp_df["rate"].nunique()
#     if num_clients > 1 and num_rates > 1:
#         print(f"Skipping {deployment} as it has multiple clients and rates")
#         continue
#     elif num_clients > 1:
#         group_col = "clients"
#         x_axis_title = "Workload Profile and Number of Clients (No Rate Limit)"
#         title = f"{temp_df['human_name_w_tpm'].iloc[0]} - Common Workloads with Various Levels of Concurrency"
#     else:
#         group_col = "rate"
#         x_axis_title = "Workload Profile and Max Request RPM"
#         title = f"{temp_df['human_name_w_tpm'].iloc[0]} - Common Workloads with Various Request Rates"
#     fig = make_subplots(
#         rows=2, cols=2, row_heights=[400, 400], 
#         subplot_titles=("Average End-to-End Request Latency", "P95 End-to-End Request Latency", "Successful & Throttled Requests Per Minute (RPM)", "Total Tokens Per Minute (TPM)")
#     )
#     workload_labels = [
#         temp_df["workload_name"].values.tolist(),
#         temp_df[group_col].values.tolist(),
#     ]
#     latency_y_max = temp_df["e2e_95th"].max() * 1.1
#     # Average Latencies
#     fig.add_trace(
#         go.Bar(
#             x=workload_labels,
#             y=temp_df["ttft_avg"].values.tolist(),
#             name="Time to First Token",
#             legendgroup=0,
#             marker=BAR_MARKER_SETTINGS["latency_prompt"],
#             # orientation='h',
#             ),
#         row = 1, 
#         col = 1
#     )
#     fig.add_trace(
#         go.Bar(
#             x=workload_labels, 
#             y=(temp_df["e2e_avg"] - temp_df["ttft_avg"]).values.tolist(),
#             name="Token Generation",
#             legendgroup=0,
#             marker=BAR_MARKER_SETTINGS["latency_gen"],
#             # orientation='h',
#             ),
#         row = 1, 
#         col = 1
#     )
#     fig.update_yaxes(title_text="Seconds", range=[0, latency_y_max], row=1, col=1)
#     fig.update_xaxes(title_text=x_axis_title, row=1, col=1)

#     # P95 Latencies
#     fig.add_trace(
#         go.Bar(
#             x=workload_labels, 
#             y=temp_df["ttft_95th"].values.tolist(),
#             name="Token Generation",
#             # legendgroup=1,
#             showlegend = False,
#             marker=BAR_MARKER_SETTINGS["latency_prompt"],
#             # orientation='h',
#             ),
#         row = 1, 
#         col = 2
#     )
#     fig.add_trace(
#         go.Bar(
#             x=workload_labels, 
#             y=(temp_df["e2e_95th"] - temp_df["ttft_95th"]).values.tolist(),
#             name="Token Generation",
#             # legendgroup=1,
#             showlegend = False,
#             marker=BAR_MARKER_SETTINGS["latency_gen"],
#             # orientation='h',
#             ),
#         row = 1, 
#         col = 2
#     )
#     fig.update_yaxes(title_text="Seconds", range=[0, latency_y_max], row=1, col=2)
#     fig.update_xaxes(title_text=x_axis_title, row=1, col=2)

#     # RPM
#     fig.add_trace(
#         go.Bar(
#             x=workload_labels, 
#             y=temp_df["rpm"].values.tolist(),
#             name=deployment,
#             legendgroup=False,
#             showlegend = False,
#             marker=BAR_MARKER_SETTINGS["rpm"],
#         ),
#         row = 2, 
#         col = 1
#     )
#     fig.add_trace(
#         go.Bar(
#             x=workload_labels, 
#             y=temp_df["throttled_rpm"].values.tolist(),
#             name=deployment,
#             legendgroup=False,
#             showlegend = False,
#             marker=BAR_MARKER_SETTINGS["throttled_rpm"],
#         ),
#         row = 2, 
#         col = 1
#     )
#     fig.update_yaxes(title_text="RPM - Log Scale", type="log", row=2, col=1)
#     fig.update_xaxes(title_text=x_axis_title, row=2, col=1)

#     # TPM
#     # if temp_df["deployment_type"].iloc[0] is DEPLOYMENT_TYPE.PAYGO:
#     #     # Add dotted line for configured PayGO TPM. Use both a plot and a shape to get a legend entry and a full-width hline
#     #     configured_tpm = temp_df["configured_paygo_tpm"].iloc[0]
#     #     fig.add_trace(
#     #         go.Scatter(
#     #             y=[configured_tpm] * len(workload_labels),
#     #             x=workload_labels,
#     #             name=f"Configured PayGO TPM ({int_to_thousands(int(configured_tpm))})",
#     #             legendgroup=2,
#     #             line=dict(color='green', width=2, dash='5px'),
#     #             marker=dict(color='rgba(255, 0, 0, 0)')
#     #             ),
#     #         row = 2, 
#     #         col = 2
#     #     )
#     #     fig.add_hline(
#     #         y=configured_tpm,
#     #         line_dash="dot",
#     #         showlegend=False,
#     #         row = 2, 
#     #         col = 2,
#     #         line=dict(color='green', width=2, dash='5px')
#     #     )
#     # else:
#     #     # Add dotted lines for configured PTU TPM (min and max). Use both a plot and a shape to get a legend entry and a full-width hline
#     #     exp_min_ptu_tpm = temp_df["exp_min_ptu_tpm"].iloc[0]
#     #     exp_max_ptu_tpm = temp_df["exp_max_ptu_tpm"].iloc[0]
#     #     fig.add_trace(
#     #         go.Scatter(
#     #             y=[exp_max_ptu_tpm] * len(workload_labels),
#     #             x=workload_labels,
#     #             name=f"Min Expected PTU TPM ({int_to_thousands(int(exp_min_ptu_tpm))})",
#     #             legendgroup=2,
#     #             line=dict(color='green', width=2, dash='5px'),
#     #             marker=dict(color='rgba(255, 0, 0, 0)')
#     #             ),
#     #         row = 2, 
#     #         col = 2
#     #     )
#     #     fig.add_hline(
#     #         y=exp_min_ptu_tpm,
#     #         line_dash="dot",
#     #         showlegend=False,
#     #         row = 2, 
#     #         col = 2,
#     #         line=dict(color='green', width=2, dash='5px'),
#     #     )
#     #     fig.add_trace(
#     #         go.Scatter(
#     #             y=[exp_max_ptu_tpm] * len(workload_labels),
#     #             x=workload_labels,
#     #             name=f"Max Expected PTU TPM ({int_to_thousands(int(exp_max_ptu_tpm))})",
#     #             legendgroup=2,
#     #             line=dict(color='purple', width=2, dash='5px'),
#     #             marker=dict(color='rgba(255, 0, 0, 0)')
#     #             ),
#     #         row = 2, 
#     #         col = 2
#     #     )
#     #     fig.add_hline(
#     #         y=exp_max_ptu_tpm,
#     #         line_dash="dot",
#     #         showlegend=False,
#     #         row = 2, 
#     #         col = 2,
#     #         line=dict(color='purple', width=2, dash='5px'),
#     #     )
#     fig.add_trace(
#         go.Bar(
#             x=workload_labels, 
#             y=temp_df["tpm_context"].values.tolist(),
#             name="Context TPM",
#             legendgroup=2,
#             showlegend = True,
#             marker=BAR_MARKER_SETTINGS["tpm_prompt"],
#             ),
#         row = 2, 
#         col = 2
#     )
#     fig.add_trace(
#         go.Bar(
#             x=workload_labels, 
#             y=temp_df["tpm_gen"].values.tolist(),
#             name="Generation TPM",
#             legendgroup=2,
#             showlegend = True,
#             marker=BAR_MARKER_SETTINGS["tpm_gen"],
#             ),
#         row = 2, 
#         col = 2
#     )
#     fig.update_yaxes(title_text="TPM - Log Scale", type="log", row=2, col=2)
#     fig.update_xaxes(title_text=x_axis_title, row=2, col=2)


#     # Save short and tall versions
#     for height, legend_gap in [(600, 220), (800, 350)]:
#         fig.update_layout(title_text=title, title_x=0.5, height=height, width=1600, coloraxis=dict(colorscale='Bluered_r'), barmode='stack', legend_tracegroupgap=legend_gap)
#         if height == 800:
#             fig.show()

#         # Save to disk
#         outpath = CURRENT_ANALYSIS_SAVE_DIR / f"workload_stats_2x2/workload_stats_2x2_{deployment}_{run_date}_{height}px.png"
#         outpath.parent.mkdir(exist_ok=True, parents=True)
#         fig.write_image(outpath)

#         print("Saved to", outpath)

# PTU Stats vs expected

In [56]:
# Restrict the logs to runs whose deployment type is DEPLOYMENT_TYPE.PTU.
# The explicit copy keeps the derived columns below off `all_logs`.
is_ptu_run = all_logs["deployment_type"] == DEPLOYMENT_TYPE.PTU
ptu_only_df = all_logs.loc[is_ptu_run].copy()

# For every (deployment, token_profile) group, broadcast the best RPM and the best
# total TPM seen across the group's runs back onto each run in that group.
per_workload = ptu_only_df.groupby(["deployment", "token_profile"])
ptu_only_df = ptu_only_df.assign(
    max_ptu_rpm_achieved_for_workload=per_workload["rpm"].transform("max"),
    max_ptu_tpm_achieved_for_workload=per_workload["tpm_total"].transform("max"),
)

In [None]:
## Max TPM vs expected
# Runs whose total TPM equals the best total TPM recorded for their workload group
hit_max_tpm = ptu_only_df["tpm_total"] == ptu_only_df["max_ptu_tpm_achieved_for_workload"]

# Columns kept for the table, in display order (renamed afterwards via COL_RENAME_MAPPER)
tpm_table_cols = [
    "context_tpr_avg",
    "max_tokens",
    "gen_tpr_avg",
    "observed_prompt_vs_gen_token_ratio",
    "exp_ptu_tpm",
    "tpm_total",
    "total_tpm_relative_to_expected",
]

df = (
    ptu_only_df.loc[hit_max_tpm]
    .copy()
    # Collapse duplicate runs down to a single row per observed workload name.
    # NOTE(review): GroupBy.first() takes the first non-null value of each column
    # independently, so a row may combine values from several runs if any column
    # contains nulls - confirm that is acceptable here.
    .groupby("workload_name_observed")
    .first()
    .reset_index()
    .set_index("human_name")
    .sort_values(
        ["observed_prompt_vs_gen_token_ratio", "context_tokens", "max_tokens"],
        ascending=[True, True, True],
    )
    .loc[:, tpm_table_cols]
    .rename(columns=COL_RENAME_MAPPER)
)

# Apply display formatting, leaving the listed token-count columns untouched
out = apply_pretty_formatting(
    df,
    cols_to_ignore=["Average Context Tokens/Request", "Max Tokens", "Average Generation Tokens/Request"],
)
out

In [None]:
## Max Concurrency vs expected
# Runs whose RPM equals the best RPM recorded for their workload group
hit_max_rpm = ptu_only_df["max_ptu_rpm_achieved_for_workload"] == ptu_only_df["rpm"]

# Columns kept for the table, in display order (renamed afterwards via COL_RENAME_MAPPER)
rpm_table_cols = [
    "context_tpr_avg",
    "max_tokens",
    "gen_tpr_avg",
    "observed_prompt_vs_gen_token_ratio",
    "exp_ptu_rpm",
    "max_ptu_rpm_achieved_for_workload",
    "ptu_rpm_relative_to_expected",
]

df = (
    ptu_only_df.loc[hit_max_rpm]
    .copy()
    # Collapse duplicate runs down to a single row per observed workload name.
    # NOTE(review): GroupBy.first() takes the first non-null value of each column
    # independently, so a row may combine values from several runs if any column
    # contains nulls - confirm that is acceptable here.
    .groupby("workload_name_observed")
    .first()
    .reset_index()
    .set_index("human_name")
    .sort_values(
        ["observed_prompt_vs_gen_token_ratio", "context_tokens", "max_tokens"],
        ascending=[True, True, True],
    )
    .loc[:, rpm_table_cols]
    .rename(columns=COL_RENAME_MAPPER)
)

# Apply display formatting, leaving the listed token-count columns untouched
out = apply_pretty_formatting(
    df,
    cols_to_ignore=["Average Context Tokens/Request", "Max Tokens", "Average Generation Tokens/Request"],
)
out

# PTU TPM by ratio of prompt to generation tokens

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Scatter the best total TPM achieved per token profile against the observed
# prompt:generation token ratio, overlaying a fitted trendline and the expected PTU TPM,
# then show the figure and save it as a PNG.

# Keep exactly one *whole* run per token profile: the run with the highest total TPM.
# - Sorting on tpm_total alone guarantees the max-TPM run comes first in every profile
#   (sorting on context_tpr_avg/max_tokens first would select by those columns instead).
# - drop_duplicates keeps that row intact. GroupBy.first() takes the first non-null value
#   per column, so it could borrow e.g. another run's `rate` when the best run's rate is NaN.
# - Runs without a token profile are dropped, as a groupby on token_profile would do.
temp_df = (
    ptu_only_df
    .dropna(subset=["token_profile"])
    .sort_values("tpm_total", ascending=False, kind="stable")
    .drop_duplicates(subset="token_profile", keep="first")
    # Order from highest ratio to lowest; a monotonic x order stops the line traces zig-zagging
    .sort_values("observed_prompt_vs_gen_token_ratio", ascending=False, kind="stable")
    .reset_index(drop=True)
)

# Axis limits with 10% headroom above the largest plotted value
total_tpm_plot_limit = max(temp_df["exp_ptu_tpm"].max() * 1.1, temp_df["tpm_total"].max() * 1.1)
# The relative axis always reaches at least 110% so the 100% (expected) level is visible
tpm_vs_expected_plot_limit = max(temp_df["total_tpm_relative_to_expected"].max() * 1.1, 1.1)

# Format values that are only used in the hover text
# NOTE(review): throttled_rpm is multiplied by 100 here while rpm is shown unscaled - confirm
# this scaling is intended and not a leftover from percentage formatting.
temp_df["throttled_rpm"] = temp_df["throttled_rpm"].apply(lambda x: round(x * 100))
# Runs with no rate limit have a NaN rate; show them as "None"
temp_df["rate"] = temp_df["rate"].apply(lambda x: "None" if pd.isna(x) else str(int(x)))

# Hover text lists each column as "Title Cased Name: value", one per line
custom_data_cols = ["workload_name", "token_profile", "rate", "clients", "rpm", "throttled_rpm", "avg_concurrency", "e2e_avg"]
hover_template_str = "<br>".join([f"{col.replace('_', ' ').title()}: %{{customdata[{i}]}}" for i, col in enumerate(custom_data_cols)])

fig = make_subplots(specs=[[{"secondary_y": True}]])
# Add secondary (hidden) trace with % of expected TPM on right
fig.add_trace(
    go.Scatter(
        x=temp_df["observed_prompt_vs_gen_token_ratio"].values.tolist(),
        y=temp_df["total_tpm_relative_to_expected"].values.tolist(),
        mode="markers",
        # Hide from plot so we only get the y-axis labels
        marker_opacity=0,
        showlegend=False,
        ),
    secondary_y=True
)
# Add main trace with actual TPM values on left (last, so it is on top)
fig.add_trace(
    go.Scatter(
        x=temp_df["observed_prompt_vs_gen_token_ratio"].values.tolist(),
        y=temp_df["tpm_total"].values.tolist(),
        mode="markers+text",
        name="Max TPM by Workload",
        customdata=temp_df[custom_data_cols].values,
        hovertemplate=hover_template_str,
        text=temp_df["token_profile"].values.tolist(),
        textposition="bottom center",
        marker=dict(size=8, color="red"),
    ),
    secondary_y=False
)

# Add trendline: regress total TPM on polynomial terms of the token ratio
time_size_regr = LinearRegression()
poly_features = PolynomialFeatures(degree=3, include_bias=False)
X = poly_features.fit_transform(temp_df[["observed_prompt_vs_gen_token_ratio"]])
# With include_bias=False the columns are [x, x^2, x^3], so this keeps only x^2 and x^3.
# NOTE(review): the linear term is dropped - confirm this is deliberate and not written
# assuming column 0 is the bias term.
X = X[:, [1,2]]

time_size_regr.fit(X, temp_df["tpm_total"].values.reshape(-1, 1))
# Evaluate the fit at the observed ratios only (the line is drawn between those points)
trendline_values = time_size_regr.predict(X).reshape(-1).tolist()

fig.add_trace(
    go.Scatter(
        x=temp_df["observed_prompt_vs_gen_token_ratio"].values.tolist(),
        y=trendline_values,
        mode="lines",
        name="TPM Best Fit",
        line=dict(color='blue', width=2, dash='5px'),
    ),
    secondary_y=False
)

# Add dashed line for the expected PTU TPM of each selected run (left axis)
fig.add_trace(
    go.Scatter(
        y=temp_df["exp_ptu_tpm"],
        x=temp_df["observed_prompt_vs_gen_token_ratio"].values.tolist(),
        name="Expected PTU TPM",
        mode="lines",
        line=dict(color='purple', width=2, dash='5px'),
        marker=dict(size=8, color='rgba(255, 0, 0, 0)')
        ),
    secondary_y=False
)


fig.update_xaxes(title_text="Observed Prompt vs Gen Token Ratio")
fig.update_yaxes(title_text="Max Total TPM Achieved", range = [0, total_tpm_plot_limit], showgrid=False, secondary_y=False)
fig.update_yaxes(title_text="TPM Relative to Expected", range = [0, tpm_vs_expected_plot_limit], secondary_y=True)

# The title takes the model name from the first selected run
title = f"TPM by Prompt:Generation Token Ratio. Model: {temp_df['human_name_w_tpm'].iloc[0]}"
# The x-axis and the right-hand y-axis are rendered as percentages
fig.update_layout(title_text=title, title_x=0.5, height=500, width=1000, coloraxis=dict(colorscale='Bluered_r'), xaxis_tickformat = '.0%', yaxis2_tickformat = '.0%')
fig.show()

# Save to disk
outpath = CURRENT_ANALYSIS_SAVE_DIR / "plots/ptu_max_tpm_vs_expected.png"
outpath.parent.mkdir(exist_ok=True, parents=True)
fig.write_image(outpath)

print("Saved to", outpath)