In [None]:
# Enable IPython's autoreload extension in mode 2 so that edited modules
# are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import altair as alt

In [None]:
!pip install pyathena

In [None]:
# SQL text that is passed to pd.read_sql by the data-loading cell.
# It reads every row of "default"."run_reports" and returns, per row:
#   - scenario, model (from model_id) and region,
#   - executed_at plus its year/month/day/hour/minute parts,
#   - input/output token counts taken from the `metrics` struct,
#   - server-side first-byte and invocation latencies divided by 1000
#     (presumably milliseconds -> seconds, given the `_s` aliases; confirm),
#   - the client-measured invocation and time-to-first-token latencies.
query = '''
SELECT 
    scenario,
    model_id as model,
    region,
    
    executed_at, 
    year(executed_at) as year,
    month(executed_at) as month,
    day(executed_at) as day,
    hour(executed_at) as hour,
    minute(executed_at) as minute,
    
    metrics.inputtokencount   as input_token_count,
    metrics.outputtokencount as output_token_count,
    
    metrics.firstbytelatency/1000. as server_first_byte_latency_s,
    metrics.invocationlatency/1000. as server_invocation_latency_s,
    
    client_measured_latency_s as client_invocation_latency_s,
    client_measured_time_to_first_token_s as client_first_token_latency_s
     
FROM "default"."run_reports"
'''

In [None]:
%%time
import pandas as pd
from pyathena import connect

pd.set_option("display.precision", 4)

conn = connect(s3_staging_dir='s3://mkamp-aws-dub/athena/tmp/')
all_runs_df = pd.read_sql(query, conn)
conn.close()

all_runs_df

In [None]:
# Mean input/output token counts per scenario, cast to integers.
(
    all_runs_df
    .groupby('scenario')[['input_token_count', 'output_token_count']]
    .mean()
    .astype(int)
)

In [None]:
# Max tokens per scenario:
# Long prompt, long completion:   12_000
# Long prompt, short completion:  12_000
# Short prompt, long completion:   1_500
# Short prompt, short completion:    150

In [None]:
# Median client-side latencies for every (scenario, model) pair.
(
    all_runs_df
    .groupby(['scenario', 'model'])[
        ['client_invocation_latency_s', 'client_first_token_latency_s']
    ]
    .median()
)

In [None]:
# Side-by-side view of the server-reported and client-measured
# time-to-first-byte/token columns.
all_runs_df.loc[:, ['server_first_byte_latency_s', 'client_first_token_latency_s']]

In [None]:
# Mean absolute difference between server first-byte latency and client
# first-token latency, relative to the client measurement.
(
    all_runs_df['server_first_byte_latency_s']
    .sub(all_runs_df['client_first_token_latency_s'])
    .div(all_runs_df['client_first_token_latency_s'])
    .abs()
    .mean()
)

In [None]:
# Mean absolute difference between server and client invocation latency,
# relative to the client measurement.
(
    all_runs_df['server_invocation_latency_s']
    .sub(all_runs_df['client_invocation_latency_s'])
    .div(all_runs_df['client_invocation_latency_s'])
    .abs()
    .mean()
)

In [None]:
def p50(x):
    """Return the 0.5 quantile (50th percentile) of `x`."""
    return x.quantile(q=0.5)
def p90(x):
    """Return the 0.9 quantile (90th percentile) of `x`."""
    return x.quantile(q=0.9)
def rstd(x):
    """Return the standard deviation of `x` divided by its median."""
    spread = x.std()
    center = x.median()
    return spread / center

# Server-side latency statistics for every (scenario, model, region) group:
# count/min/p50/p90/max/std/rstd for first-byte latency and the same set
# without the count for full invocation latency.
(
    all_runs_df
    .groupby(['scenario', 'model', 'region'])
    .agg({
        'server_first_byte_latency_s': ['count', 'min', p50, p90, 'max', 'std', rstd],
        'server_invocation_latency_s': ['min', p50, p90, 'max', 'std', rstd],
    })
)

In [None]:
# Same server-side latency statistics, aggregated over regions:
# one row per (scenario, model) pair.
(
    all_runs_df
    .groupby(['scenario', 'model'])
    .agg({
        'server_first_byte_latency_s': ['count', 'min', p50, p90, 'max', 'std', rstd],
        'server_invocation_latency_s': ['min', p50, p90, 'max', 'std', rstd],
    })
)

In [None]:
def latency_chart(field):
    """Build a per-model latency chart of `field`, faceted by scenario.

    Three layers are drawn from the notebook-level `all_runs_df`:
    orange bars at the per-model median, black error bars with a
    standard-deviation extent, and small black circles at the per-model mean.

    Args:
        field: Name of the quantitative column of `all_runs_df` to plot;
            it is also used as the chart title.

    Returns:
        The layered Altair chart, faceted by `scenario`.
    """
    # Every layer shares the same data and the nominal model axis.
    per_model = alt.Chart(all_runs_df).encode(x=alt.X("model:N"))

    median_bars = per_model.mark_bar(color="orange", opacity=0.8).encode(
        y=alt.Y(f"median({field})"),
    )
    stdev_whiskers = per_model.mark_errorbar(
        color='black', opacity=0.8, extent="stdev"
    ).encode(
        y=alt.Y(f"{field}:Q"),
    )
    mean_dots = per_model.mark_circle(color='black', opacity=0.8, size=15).encode(
        y=alt.Y(f"mean({field}):Q"),
    )

    layered = median_bars + stdev_whiskers + mean_dots
    return layered.facet('scenario:N').properties(title=field)
latency_chart('client_first_token_latency_s')

In [None]:
# Same chart for the client-measured end-to-end invocation latency.
latency_chart(field='client_invocation_latency_s')

In [None]:
# Restrict the runs to a single scenario for the per-model views below.
# Swap in the commented-out line to look at the other scenario instead.
scenario_df = all_runs_df.loc[all_runs_df['scenario'] == 'Long prompt, short completion']
#scenario_df = all_runs_df.loc[all_runs_df['scenario'] == 'Short prompt, long completion']

# Server-side latency statistics per model for the selected scenario.
(
    scenario_df
    .groupby(['model'])
    .agg({
        'server_first_byte_latency_s': ['count', 'min', p50, p90, 'max', 'std', rstd],
        'server_invocation_latency_s': ['min', p50, p90, 'max', 'std', rstd],
    })
)

In [None]:
def over_time_chart(field):
    """Plot `field` against execution time for the selected scenario.

    Each run in the notebook-level `scenario_df` is drawn as an orange
    circle with a tooltip showing the value and the execution timestamp.
    A grey LOESS line (bandwidth 0.5) derived from the same chart is layered
    on top, and the result is faceted by model (columns) and region (rows).

    Args:
        field: Name of the quantitative column of `scenario_df` to plot;
            it is also used as the chart title.

    Returns:
        The layered, faceted Altair chart.
    """
    points = (
        alt.Chart(scenario_df)
        .mark_circle(color="orange", size=50, opacity=0.8)
        .encode(
            x=alt.X("executed_at:T"),
            y=alt.Y(f'{field}:Q'),
            tooltip=[
                f'{field}:Q',
                alt.Tooltip('executed_at:T', format="%B %d, %Y %H:%M:%S"),
            ],
        )
    )
    # The trend line reuses the scatter chart's encodings on smoothed data.
    trend = points.transform_loess('executed_at', field, bandwidth=0.5).mark_line(
        color='grey', size=2, opacity=0.8
    )

    layered = points + trend
    return layered.facet(column='model:N', row='region:N').properties(title=field)
over_time_chart('client_first_token_latency_s')

In [None]:
# Same time-series view for the client-measured invocation latency.
over_time_chart(field='client_invocation_latency_s')

In [None]:
# Runs of a single model only.
# NOTE(review): nothing in the visible cells reads one_model_df; the
# chart below is built from all_runs_df. Confirm whether it is still needed.
one_model_df = all_runs_df.loc[all_runs_df['model'] == 'anthropic.claude-v2:1']
def latency_histogram(field):
    """Histogram of `field` for every model/scenario combination.

    This helper was previously also named `over_time_chart`, which silently
    replaced the time-series function defined earlier in the notebook:
    re-running an earlier cell after this one produced histograms instead
    of time series. It now has its own name that matches what it draws.

    Values from the notebook-level `all_runs_df` are binned in steps of 0.5
    on the x axis, with the number of runs per bin on the y axis. The chart
    is faceted by model (rows) and scenario (columns).

    Args:
        field: Name of the quantitative column of `all_runs_df` to bin;
            it is also used as the chart title, like the other chart helpers.

    Returns:
        The faceted Altair bar chart.
    """
    histogram = alt.Chart(all_runs_df).mark_bar().encode(
        x=alt.X(f"{field}:Q", bin=alt.Bin(step=0.5)),
        y='count()',
    )

    return histogram.facet(row='model:N', column='scenario:N').properties(title=field)
latency_histogram('client_first_token_latency_s')