<a href="https://colab.research.google.com/github/michellepace/anthropic-model-compare/blob/main/Anthropic_Model_Compare_(simple).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<div class="markdown-google-sans">
<font size="6">
<b>Anthropic Models: A Surprising Measured Comparison</b>
</font></div>

<br>

<br>

<div class="markdown-google-sans">
<b>Speed • Capability • Cost</b>
</div>

---

<div class="markdown-google-sans">
<font size="4">
How five Anthropic models compare as measured by two simple prompts. Visually compares speed, capability, and costs. You need an Anthropic API key and $0.98 to run this notebook, or you can load <a href="https://github.com/michellepace/anthropic-model-compare/tree/main" target="_blank">these</a> historical results. This notebook is the base analysis for <a href="https://ailearnlog.com/pushing-aside-the-bench-for-the-mark/" target="_blank">Pushing Aside the Bench for the Mark</a> (short article).
</font>
</div>

<br>

<div class="markdown-google-sans">
  <font size="3">Contents</font>
  <ul>
  <li><a href="#id-setup">Setup</a></li>
  <li><a href="#id-run-tests">Run Tests</a></li>
  <li><a href="#id-speed-results">Speed Results</a></li>
  <li><a href="#id-capability-results">Capability Results</a></li>
  <li><a href="#id-costs">Costs</a></li>
  <li><a href="#id-results-summary">Results Summary</a></li>
  </ul>
</div>

<br>

<br><a name="id-setup"></a>
# **SETUP**

## Libraries

In [None]:
# Install libraries that aren't pre-installed.
# NOTE(review): `!pip` is a shell escape; a failed install is shown in the cell
# output but presumably does not raise a Python exception, so the `except`
# branch below is unlikely to ever run — confirm.
# NOTE(review): `langchain` is installed here but is not imported by the import
# block below — confirm it is still needed.
try:
    !pip install --upgrade-strategy only-if-needed --quiet anthropic langchain
    print("Success! All required libraries are installed.")
except Exception as e:
    print(f"An error occurred during library installation: {str(e)}")


# Import all used libraries.
# Any import failure is printed rather than raised, so later cells will fail
# with NameError if something here could not be imported.
try:
    from google.colab import userdata   # To access your Colab Secret: ANTHROPIC_API_KEY
    from google.colab import drive      # Saving and loading results from your Google Drive
    from pathlib import Path
    from anthropic import Anthropic, APIError
    import pandas as pd
    import plotly.express as px
    import textwrap
    print("Success! Library imports are complete.")

except Exception as e:
    print(f"An error occurred during library importation: {str(e)}")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/222.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m194.6/222.8 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.8/222.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hSuccess! All required libraries are installed.
Success! Library imports are complete.


## Anthropic API

In [None]:
# @title Set API key ⭐ { vertical-output: true, display-mode: "form" }
#@markdown **Instructions:**<br>

#@markdown **Step 1:** Create an [Anthropic API key](https://console.anthropic.com/settings/keys).<br>
#@markdown **Step 2:** Then run this cell by clicking the 'play' icon just under the title.<br>
#@markdown **Step 3:** you will be guided to setup a Colab secret if you don't have one.<br>

#@markdown

anthropic_api_secret_name = 'ANTHROPIC_API_KEY'  # @param {type: "string"}

def test_anthropic_connection(anthropic_client: Anthropic) -> None:
    """Send one short prompt to Claude to confirm the API key works.

    On success the prompt and Claude's reply are printed. On any failure the
    error is printed and a KeyboardInterrupt (chained to the original error)
    is raised to stop notebook execution.
    """
    my_test_prompt = "Hello Claude, have I connected to you? (answer briefly!)"
    request_args = dict(
        model="claude-3-5-sonnet-latest",
        max_tokens=20,
        messages=[{"role": "user", "content": my_test_prompt}],
    )

    print()
    try:
        message = anthropic_client.messages.create(**request_args)
        for line in (
            "Success!\nAPI key is valid and working.",
            f"• My prompt was:  {my_test_prompt}",
            f"• Claude responded: {message.content[0].text}",
        ):
            print(line)

    except Exception as e:
        # API errors and everything else differ only in the printed prefix.
        if isinstance(e, APIError):
            print(f"API error occurred: {e}")
        else:
            print(f"Unexpected error occurred: {e}")
        raise KeyboardInterrupt("Connection test failed. Stopping execution 🛑.") from e


def validate_anthropic_api_key_format(api_key):
    """Sanity-check the shape of an Anthropic API key.

    Only the format is checked (type, prefix, whitespace, length); this does
    not prove the key is accepted by the API.

    Args:
        api_key: The key value to check.

    Raises:
        ValueError: If the key is not a string, does not start with "sk-",
            contains any whitespace character, or is 100 characters or shorter.
    """
    if not isinstance(api_key, str):
        raise ValueError("Anthropic API key must be a string.")
    if not api_key.startswith('sk-'):
        raise ValueError("Anthropic API keys start with \"sk-\"")
    # Reject every kind of whitespace (e.g. a tab or trailing newline from
    # copy/paste), not only the plain space character.
    if any(ch.isspace() for ch in api_key):
        raise ValueError("Anthropic API keys don't have white spaces.")
    if len(api_key) <= 100:
        raise ValueError("Anthropic API keys are longer than 100 characters.")


def get_anthropic_api_key(secret_name):
    """Read an Anthropic API key from a Colab secret and check its format.

    Prints step-by-step fix instructions for each failure mode, then re-raises
    the original exception so the cell stops.

    Args:
        secret_name: Name of the Colab secret that holds the API key.

    Returns:
        The API key string stored in the secret.

    Raises:
        userdata.SecretNotFoundError: The secret does not exist.
        userdata.NotebookAccessError: The notebook was denied access to the secret.
        ValueError: The secret's value fails the key format check.
        Exception: Any other error, re-raised unchanged.
    """
    try:
        api_key = userdata.get(secret_name)
        validate_anthropic_api_key_format(api_key)
        print("Success!")
        print(f'Your Colab secret "{secret_name}" was found.')
        print("• If the secret contains a valid API Key, we can connect to Claude.")
        print(f'• To change API Key: Click the "key" icon in left handside panel, delete "{secret_name}", rerun this block.')
        return api_key

    except userdata.SecretNotFoundError:
        print(f"🛑 Error: Colab secret '{secret_name}' not found in your Colab environment")
        print(" To fix:")
        print(" 1. Click the \"key\" icon on the left of this Notebook")
        print(f" 2. Add new secret with name '{secret_name}'")
        print(" 3. Set value to Anthropic API key from: https://console.anthropic.com/settings/keys")
        print(" 4. Rerun this block and follow next instructions")
        print(" About Colab secrets: https://bit.ly/4cad0v7")
        print("🛑🛑🛑\n")
        raise
    except userdata.NotebookAccessError:
        print(f"🛑 Error: You denied this Notebook access to your Colab secret '{secret_name}'")
        print(" To fix:")
        print(" 1. Rerun this block and click \"Grant access\"")
        print(" About Colab secrets: https://bit.ly/4cad0v7")
        print(" Worried about safety? Save your own copy of this Notebook and run that.")
        print("🛑🛑🛑\n")
        raise
    except ValueError as ve:
        print(f"🛑 Error: Invalid format, {str(ve)}")
        print(" To fix:")
        print(" 1. Click the \"key\" icon on the left of this Notebook")
        # Use the parameter (not the notebook-level form variable) so the
        # message always names the secret that was actually read.
        print(f" 2. Delete '{secret_name}'")
        print(" 3. Rerun this block and follow next instructions")
        print("🛑🛑🛑\n")
        raise
    except Exception:
        print("🛑 Unexpected error occurred")
        print(" Please check:")
        print(f" 1. '{secret_name}' secret exists in Colab (click \"key\" icon on the left)")
        print(" 2. Secret value is a valid Anthropic API key")
        print(" Get API key: https://console.anthropic.com/settings/keys")
        print(" About Colab secrets: https://bit.ly/4cad0v7")
        print("🛑🛑🛑\n")
        raise


### Do the work
# Read the API key from the Colab secret named in the form field above.
MY_ANTHROPIC_API_KEY = get_anthropic_api_key(anthropic_api_secret_name)

# Notebook-level Anthropic client built from that key.
ANTHROPIC_CLIENT = Anthropic(
    api_key=MY_ANTHROPIC_API_KEY, # From: "Set Secret API key ⭐"
    max_retries=2, # Maximum retry attempts
    timeout=10, # Request timeout — presumably seconds per request; confirm against the Anthropic SDK docs
)

# Send one short prompt so a bad key is detected here, before any tests run.
test_anthropic_connection(ANTHROPIC_CLIENT)

Success!
Your Colab secret "ANTHROPIC_API_KEY" was found.
• If the secret contains a valid API Key, we can connect to Claude.
• To change API Key: Click the "key" icon in left handside panel, delete "ANTHROPIC_API_KEY", rerun this block.

Success!
API key is valid and working.
• My prompt was:  Hello Claude, have I connected to you? (answer briefly!)
• Claude responded: Yes, I'm here and ready to help!


In [None]:
# @title Set Google Drive ⭐ { vertical-output: true, display-mode: "form" }
#@markdown **Instructions:**

#@markdown **Step 1:** Specify where to save RESULTS in Google Drive (see Example below)<br>
#@markdown **Step 2:** Run this cell by clicking the little "play" icon just under the title<br>

#@markdown **Example:**<br>
#@markdown - Google Drive paths ALWAYS start with: <font color='#FF1493'>/content/​drive/​MyDrive/</font><br>
#@markdown - For example: `/content/drive/MyDrive/my-output-folder/`<br><br>

output_drive_path = '/content/drive/MyDrive/my-output-folder/' #@param {type:"string", placeholder:"(here is an example)   /content/drive/MyDrive/MyOutputFolder/"}

def map_to_drive_path(drive_path: str) -> Path:
    """Map to a Google Drive folder, creating it if it doesn't exist.

    Mounts Google Drive at /content/drive first when that mount point is not
    present yet.

    Args:
        drive_path: String path to a folder on Google Drive. Surrounding
            whitespace is ignored.

    Returns:
        Path object of the mapped drive folder.

    Raises:
        ValueError: If the path is empty, points at something that exists but
            is not a folder, or its parent directory does not exist.
    """
    # Validate and use the same, stripped value so a stray space from the form
    # field cannot end up inside the folder name.
    cleaned_path = drive_path.strip()
    if not cleaned_path:
        raise ValueError("No drive path provided.")

    if not Path('/content/drive').exists():
        drive.mount('/content/drive')

    path = Path(cleaned_path)

    # An existing file at this location would otherwise surface as a
    # FileExistsError from mkdir(); report it as the documented ValueError.
    if path.exists() and not path.is_dir():
        raise ValueError(f"'{path}' exists but is not a folder.")

    if not path.is_dir():
        # Only the final folder is created; a missing parent is treated as a
        # likely typo in the path rather than silently created.
        if not path.parent.exists():
            raise ValueError(f"Parent directory '{path.parent}' does not exist.")
        path.mkdir(parents=True, exist_ok=True)
        print("\nSuccess!")
        print(f"• Created folder: {path.absolute()}")

    print("Success!")
    print(f"• Mapped to Google Drive path: {path.absolute()}")
    return path

# Map to the specified drive path.
# On success `output_drive_path` is rebound from the form's string to the
# returned Path. On failure the error is only printed (not re-raised), so
# `output_drive_path` keeps its original string value and the cell completes.
try:
    output_drive_path = map_to_drive_path(output_drive_path)
except ValueError as e:
    print(f"🛑 Error: {e}")
    print("Ensure the path is a valid folder in your Google Drive.")
except Exception as e:
    print(f"🛑 Unexpected error: {e}")

Mounted at /content/drive

Success!
• Created folder: /content/drive/MyDrive/my-output-folder
Success!
• Mapped to Google Drive path: /content/drive/MyDrive/my-output-folder


## Test Set

### Runs & Output

In [None]:
# Number of times each test prompt is sent to each model.
TOTAL_RUNS = 50

# Results file name; it is placed inside the Google Drive folder set earlier.
OUTPUT_FILE = 'RESULTS.csv'
full_output_path = Path(output_drive_path) / OUTPUT_FILE

print(f'Success!')
print(f'• Test prompts will be run (x{TOTAL_RUNS}) times for every model.')
print(f'• Test results will be saved to: {full_output_path}')

Success!
• Test prompts will be run (x50) times for every model.
• Test results will be saved to: /content/drive/MyDrive/my-output-folder/RESULTS.csv


### Test Prompts

In [None]:
# Open-ended prompt used for the speed (tokens per second) test.
SPEED_PROMPT = 'Explain photosynthesis in a concise paragraph.'

# Math prompt used for the capability test, plus its expected numeric answer.
# The geometric mean of 24, 15, 7, 16, 31, 23 is (28,748,160)^(1/6) ≈ 17.50
# (just above 17.5), which rounds to 18.
# Inside the string a trailing backslash joins the source lines, so the only
# real line break in the prompt is the explicit "\n" before **IMPORTANT!!**.
CAPABILITY_PROMPT = {
'prompt': """What is the geometric monthly fecal coliform mean of a distribution \
system with the following FC counts: 24, 15, 7, 16, 31 and 23? The result will be \
inputted into a NPDES DMR, therefore, round to the nearest whole number. \n\
**IMPORTANT!!:** Respond only with a number and strictly nothing else!!""",

'number_solution': 18
}

def wrap_my_prompt(text_to_wrap: str, subtext: str = '', width: int = 80) -> str:
    """Format a prompt for printing: underlined heading, then the wrapped text.

    Args:
        text_to_wrap: Prompt text; existing newlines are kept (whitespace is
            not replaced while wrapping).
        subtext: Label shown in brackets in the heading.
        width: Maximum line width passed to textwrap.fill.

    Returns:
        The heading line, a row of "~" of the same length, the wrapped text,
        and a trailing newline.
    """
    heading = f'Test Prompt ({subtext}):'
    underline = '~' * len(heading)
    body = textwrap.fill(text_to_wrap, width=width, replace_whitespace=False)
    return '\n'.join((heading, underline, body)) + '\n'


print('Success! Test prompts are setup:\n')
print(wrap_my_prompt(SPEED_PROMPT, 'speed'))
print(wrap_my_prompt(CAPABILITY_PROMPT['prompt'], 'capability'))

Success! Test prompts are setup:

Test Prompt (speed):
~~~~~~~~~~~~~~~~~~~~
Explain photosynthesis in a concise paragraph.

Test Prompt (capability):
~~~~~~~~~~~~~~~~~~~~~~~~~
What is the geometric monthly fecal coliform mean of a distribution system with
the following FC counts: 24, 15, 7, 16, 31 and 23? The result will be inputted
into a NPDES DMR, therefore, round to the nearest whole number.
**IMPORTANT!!:** Respond only with a number and strictly nothing else!!



### Test Models
Source [Anthropic Docs](https://docs.anthropic.com/en/docs/about-claude/models), 1st Feb, 2025.

In [None]:
# One dict per model under test. Costs are US$ per million tokens; release and
# knowledge-cutoff dates are stored as the first day of the month.
# 'max_output_tokens' for the 3.5 models is 8192 (previously mistyped as 8196).
MODELS = [
    {
        'snapshot': 'current',    # As of 1st Feb, 2025
        'model_family': 'Opus',
        'model': 'claude-3-opus-20240229',
        'model_name': 'Opus 3 (Feb 2024)',
        'cost_input_mtok': 15.00,   # US$ per million tokens
        'cost_output_mtok': 75.00,  # US$ per million tokens
        'context_window_tokens': 200000,
        'max_output_tokens': 4096,
        'knowledge_cutoff': pd.Timestamp('2023-08-01').date(),
        'model_release': pd.Timestamp('2024-02-01').date(),
    },
    {
        'snapshot': 'prior',    # As of 1st Feb, 2025
        'model_family': 'Haiku',
        'model': 'claude-3-haiku-20240307',
        'model_name': 'Haiku 3 (Mar 2024)',
        'cost_input_mtok': 0.25,
        'cost_output_mtok': 1.25,
        'context_window_tokens': 200000,
        'max_output_tokens': 4096,
        'knowledge_cutoff': pd.Timestamp('2023-08-01').date(),
        'model_release': pd.Timestamp('2024-03-01').date(),
    },
    {
        'snapshot': 'prior',    # As of 1st Feb, 2025
        'model_family': 'Sonnet',
        'model': 'claude-3-5-sonnet-20240620',
        'model_name': 'Sonnet 3.5 (Jun 2024)',
        'cost_input_mtok': 3.00,
        'cost_output_mtok': 15.00,
        'context_window_tokens': 200000,
        'max_output_tokens': 8192,
        'knowledge_cutoff': pd.Timestamp('2024-04-01').date(),
        'model_release': pd.Timestamp('2024-06-01').date(),
    },
    {
        'snapshot': 'current',    # As of 1st Feb, 2025
        'model_family': 'Sonnet',
        'model': 'claude-3-5-sonnet-20241022',
        'model_name': 'Sonnet 3.5 (Oct 2024)',
        'cost_input_mtok': 3.00,
        'cost_output_mtok': 15.00,
        'context_window_tokens': 200000,
        'max_output_tokens': 8192,
        'knowledge_cutoff': pd.Timestamp('2024-04-01').date(),
        'model_release': pd.Timestamp('2024-10-01').date(),
    },
    {
        'snapshot': 'current',    # As of 1st Feb, 2025
        'model_family': 'Haiku',
        'model': 'claude-3-5-haiku-20241022',
        'model_name': 'Haiku 3.5 (Oct 2024)',
        'cost_input_mtok': 0.80,
        'cost_output_mtok': 4.00,
        'context_window_tokens': 200000,
        'max_output_tokens': 8192,
        'knowledge_cutoff': pd.Timestamp('2024-07-01').date(),
        'model_release': pd.Timestamp('2024-10-01').date(),
    },
]

print(f'Success! (x{len(MODELS)}) models are setup for testing.')

Success! (x5) models are setup for testing.


## Visuals

- Easy model sort order(s)

In [None]:
# Display order for model families (used as a plotly category order).
MODEL_FAMILY_SORT = ['Opus', 'Haiku', 'Sonnet']

# Display order for individual models, grouped by family.
MODEL_NAME_SORT = [
    'Opus 3 (Feb 2024)',
    'Haiku 3 (Mar 2024)', 'Haiku 3.5 (Oct 2024)',
    'Sonnet 3.5 (Jun 2024)','Sonnet 3.5 (Oct 2024)'
]

# Display order for individual models, by release date.
# Every entry needs its trailing comma: without one, Python silently joins two
# adjacent string literals into a single (non-existent) model name.
MODEL_NAME_SORT_BY_RELEASE = [
    'Opus 3 (Feb 2024)',     # 2024-02-01
    'Haiku 3 (Mar 2024)',    # 2024-03-01
    'Sonnet 3.5 (Jun 2024)', # 2024-06-01
    'Sonnet 3.5 (Oct 2024)', # 2024-10-01
    'Haiku 3.5 (Oct 2024)',  # 2024-10-01
]

print('Success! These settings make it easier to sort models.')

Success! These settings make it easier to sort models.


- Consistent Look & Feel. (But better to use Plotly Express <a href="https://plotly.com/python/templates/" target="_blank">templates</a> for theming.)

In [None]:
## For look and feel
# Shared colours/template so every figure and styled table looks consistent.
# NOTE(review): the last two entries of 'anthrop_colours' are the same colour
# ('#D09B74') — confirm this is intentional.
viz_custom = {
    'template': 'plotly_white',     # plotly template name
    'font_colour': '#141413',
    'axis_colour': '#525252',
    'grid_colour': '#F0E4DB',
    'runs_colour': '#D4A375',
    'capbars_colour': '#41668C',
    'plot_bgcolour': '#F0EEE6',
    'anthrop_colours': ['#41668C', '#4A366F', '#8778AB', '#656565', '#D09B74', '#D09B74']
}

print('Success! Shared settings for visuals are setup.')

Success! Shared settings for visuals are setup.


<br><a name="id-run-tests"></a>
# **RUN TESTS**

## a. core code

- Helper function for test prompts that expect number answers

In [None]:
def check_math_answer(response_text: str, expected_solution: float) -> float:
    """
    Helper function for test prompts that expect number answers.

    Args:
        response_text: The model's reply, expected to contain only a number.
        expected_solution: The correct numeric answer.

    Returns: 1 if correct, 0 if wrong, NaN if the reply is not a number.
    """
    try:
        response_num = float(response_text)
    except (ValueError, TypeError):
        # ValueError: text that is not a number; TypeError: e.g. a None reply.
        # NaN (rather than 0) keeps unparseable replies out of the mean.
        return float('nan')
    return 1 if response_num == expected_solution else 0

- Heavy lifting here (Anthropic API etc)

In [None]:
def collect_model_test_results(
    model_config: dict,
    prompt: str,
    anthrop_client: Anthropic = ANTHROPIC_CLIENT,
    temperature: float = 0.0,
    max_tokens: int = 500,
    total_test_runs: int = 1,
    math_solution: float | None = None,
    ) -> list[dict]:
    """
    Collect Anthropic API responses and metrics across all test runs.
    Supports optional mathematical answer validation.

    :param model_config: One entry of MODELS; the keys 'snapshot',
        'model_family', 'model', 'model_name', 'cost_input_mtok' and
        'cost_output_mtok' are read.
    :param prompt: The user message sent on every run.
    :param anthrop_client: Client used for the API calls.
    :param temperature: Sampling temperature passed to the API.
    :param max_tokens: Output token limit passed to the API.
    :param total_test_runs: How many times the prompt is sent.
    :param math_solution: Expected numeric answer. When given, the run is
        recorded as a 'capability' test and the reply is checked against it;
        when None the run is recorded as a 'speed' test.
    :return: List of dicts: each contains result of 1 test run and model info.
        Runs that hit an APIError are reported and left out of the list.
    """
    is_capability_test = math_solution is not None

    # Loop-invariant values: cost is stored per token, NOT per million tokens.
    usd_per_input_token = model_config['cost_input_mtok'] / 1_000_000
    usd_per_output_token = model_config['cost_output_mtok'] / 1_000_000

    model_test_results = []

    for run in range(total_test_runs):
        start_time = pd.Timestamp.now('UTC')
        try:
            response = anthrop_client.messages.create(
                model=model_config['model'],
                max_tokens=max_tokens,
                temperature=temperature,
                messages=[{'role': 'user', 'content': prompt}]
            )
        except APIError as e:
            print(f"Run {run + 1}: API error - {str(e)}")
            continue

        end_time = pd.Timestamp.now('UTC')

        execution_seconds = (end_time - start_time).total_seconds()
        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens

        # A reply with no content blocks is recorded as an empty string rather
        # than raising IndexError and discarding the runs collected so far.
        response_text = response.content[0].text.strip() if response.content else ''

        # Key order is kept stable: it defines the column order of the results
        # DataFrame/CSV built from these dicts.
        run_result = {
            # About the model
            'snapshot': model_config['snapshot'],
            'model_family': model_config['model_family'],
            'model': model_config['model'],
            'model_name': model_config['model_name'],

            # About the test run
            'model_test_type': 'capability' if is_capability_test else 'speed',
            'prompt': prompt,
            'input_tokens': input_tokens,
            'run_timestamp_utc': start_time,
            'run': run + 1,
            'temperature': temperature,

            # About the test run result
            'output_tokens': output_tokens,
            'execution_seconds': execution_seconds,
            'tokens_per_second': output_tokens/execution_seconds,
            'prompt_response': response_text,

            # For capability prompt tests that expect a number
            'prompt_solution': math_solution,
            'is_correct': check_math_answer(response_text, math_solution) if is_capability_test else None,

            # About cost (expressed in tokens NOT 'millions of tokens')
            'USD_per_input_token': usd_per_input_token,
            'USD_per_output_token': usd_per_output_token,
        }

        model_test_results.append(run_result)

    return model_test_results


print('Success! Core functions to run models tests are loaded.')

Success! Core functions to run models tests are loaded.


## b. run tests (now)

In [None]:
# RUN SPEED TEST
# Every model in MODELS is sent SPEED_PROMPT TOTAL_RUNS times. No expected
# answer is passed, so the rows are recorded with model_test_type 'speed'.
# temperature=1 overrides the helper's default of 0.0.
print(f"Running speed tests...\n")
print(wrap_my_prompt(SPEED_PROMPT, f"run {TOTAL_RUNS} times per model"))

speed_results = []
for model in MODELS:
    model_results = collect_model_test_results(
        model_config=model,
        temperature=1,
        prompt=SPEED_PROMPT,
        total_test_runs=TOTAL_RUNS,
    )
    speed_results.extend(model_results)
    print(f" • ✅ {model['model_name']}")


# RUN CAPABILITY TEST
# Same loop with the math prompt; passing math_solution makes the helper record
# the rows as 'capability' and score each reply against the expected number.
print("\n\nRunning capability tests...\n")
print(wrap_my_prompt(CAPABILITY_PROMPT['prompt'], f"run {TOTAL_RUNS} times per model"))

capability_results = []
for model in MODELS:
    model_results = collect_model_test_results(
        model_config=model,
        temperature=1,
        prompt=CAPABILITY_PROMPT['prompt'],
        math_solution=CAPABILITY_PROMPT['number_solution'],
        total_test_runs=TOTAL_RUNS,
    )
    capability_results.extend(model_results)
    print(f" • ✅ {model['model_name']}")


# SAVE RESULTS AS DATA FRAME
# One row per completed run (runs that hit an API error are not in the lists).
speed_results_df = pd.DataFrame(speed_results)
cap_results_df = pd.DataFrame(capability_results)
print("\n\nDone!")

Running speed tests...

Test Prompt (run 50 times per model):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Explain photosynthesis in a concise paragraph.

 • ✅ Opus 3 (Feb 2024)
 • ✅ Haiku 3 (Mar 2024)
 • ✅ Sonnet 3.5 (Jun 2024)
 • ✅ Sonnet 3.5 (Oct 2024)
 • ✅ Haiku 3.5 (Oct 2024)


Running capability tests...

Test Prompt (run 50 times per model):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
What is the geometric monthly fecal coliform mean of a distribution system with
the following FC counts: 24, 15, 7, 16, 31 and 23? The result will be inputted
into a NPDES DMR, therefore, round to the nearest whole number.
**IMPORTANT!!:** Respond only with a number and strictly nothing else!!

 • ✅ Opus 3 (Feb 2024)
 • ✅ Haiku 3 (Mar 2024)
 • ✅ Sonnet 3.5 (Jun 2024)
 • ✅ Sonnet 3.5 (Oct 2024)
 • ✅ Haiku 3.5 (Oct 2024)


Done!


## c. save results

In [None]:
def concat_and_save_csv(df1: pd.DataFrame, df2: pd.DataFrame, output_file: str | Path) -> pd.DataFrame:
    """Concatenates two dataframes and saves to CSV in Google Drive, appending if file exists."""
    combined_df = pd.concat([df1, df2], axis=0, ignore_index=True)
    num_records = len(combined_df)

    output_file = Path(output_file) # incase its a string

    if output_file.exists():
        existing_records = len(pd.read_csv(output_file))
        total_records = existing_records + num_records
        combined_df.to_csv(output_file, mode='a', header=False, index=False, encoding='utf-8')
        print(f'Success!\n • {output_file}\n • Records appended: {num_records} (total records {total_records})')
    else:
        combined_df.to_csv(output_file, index=False, encoding='utf-8')
        print(f'Success!\n • {output_file}\n • Records saved: {num_records} (total records {num_records})')

    return combined_df


# Usage
all_results_df = concat_and_save_csv(speed_results_df, cap_results_df, full_output_path)

Success!
 • /content/drive/MyDrive/my-output-folder/RESULTS.csv
 • Records saved: 500 (total records 500)


## d. load results

Comment out (b) and (c) above and you can just load and use the existing RESULTS.csv from the Google Drive folder you set earlier.

In [None]:
def load_csv_to_df(file_path: str | Path) -> pd.DataFrame:
    data_df = pd.read_csv(
        file_path,
        parse_dates=['run_timestamp_utc'],
        encoding='utf-8'
    )

    print(f"Success! Loaded {len(data_df)} records from {file_path}")
    return data_df

# Load the entire CSV
# This rebinds all_results_df (and the two frames below) to what is stored in
# the file, which may hold rows appended by earlier executions as well.
all_results_df = load_csv_to_df(full_output_path)

# Split into two dataframes
speed_results_df = all_results_df[all_results_df['model_test_type'] == 'speed']
cap_results_df = all_results_df[all_results_df['model_test_type'] == 'capability']

# Verify the split
print(f"\n Combined test results:\t\t{len(all_results_df)} runs")
print(f" • Speed test results:\t\t{len(speed_results_df)} runs")
print(f" • Capability test results:\t{len(cap_results_df)} runs")

Success! Loaded 500 records from /content/drive/MyDrive/my-output-folder/RESULTS.csv

 Combined test results:		500 runs
 • Speed test results:		250 runs
 • Capability test results:	250 runs


<br><a name="id-speed-results"></a>
# **SPEED RESULTS**

## Speed Over Time

In [None]:
# VISUAL: Speed Over Time Figure
# One panel (facet row) per model family, one line per model, showing
# tokens per second against the run number.
speed_line_fig_title = f'Model Speed Over Time (tokens per second)'

# 1. Create Base Visualization
speed_line_fig = px.line(
    data_frame=speed_results_df, x='run', y='tokens_per_second',
    title=speed_line_fig_title,
    facet_row='model_family',
    color='model_name',
    color_discrete_sequence=viz_custom['anthrop_colours'],
    category_orders={'model_family': MODEL_FAMILY_SORT,'model_name': MODEL_NAME_SORT},
    markers=True,
    width=900, height=500,
    # NOTE(review): 'speed' is not a column used by this figure (the y column is
    # 'tokens_per_second'), so that label entry presumably has no effect — confirm.
    labels={'speed': 'Model Speed','run': f'Run Number (each model was prompted {TOTAL_RUNS} times in sequence)'},
)

# 2. Style & Format
# Axis styling
speed_line_fig.update_yaxes(title_text='Speed')
# Facet Labels: remove "model_family="
speed_line_fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

# Layout styling
speed_line_fig.update_layout(
    legend_title_text=None,
    showlegend=True,
    margin=dict(t=100, l=80, r=60, b=80),
    font=dict(color=viz_custom['font_colour']),
    plot_bgcolor=viz_custom['plot_bgcolour'],
)

# 3. Output & Display
# speed_line_fig.write_image('speed_line_fig.svg')
print(wrap_my_prompt(SPEED_PROMPT, "speed"))
speed_line_fig.show()

Test Prompt (speed):
~~~~~~~~~~~~~~~~~~~~
Explain photosynthesis in a concise paragraph.



## Individual Runs

In [None]:
# VISUAL: Speed Individuals Figure
# Strip plot: one point per speed-test run, one column of points per model.
speed_strip_fig_title = f'Model Speed (tokens per second)  — higher is faster'

# 1. Create Base Visualization
speed_strip_fig = px.strip(
    speed_results_df, x='model_name', y='tokens_per_second',
    title=speed_strip_fig_title,
    template=viz_custom['template'],
    width=900, height=700,
    # color='model_family',
    color_discrete_sequence=viz_custom['anthrop_colours'],
    category_orders={'model_family': MODEL_FAMILY_SORT,'model_name': MODEL_NAME_SORT},
)

# 2. Add Supplementary Box Plot — OPTIONAL
# speed_strip_fig.add_traces(
#     px.box(speed_results_df, x='model_name', y='tokens_per_second',
#     color_discrete_sequence=[viz_custom['runs_colour']]).data,
# )

# Adjust strip plot points
speed_strip_fig.update_traces(
    marker_size=5,
    jitter=0.8,
    offsetgroup=0
)

# 3. Style & Format
# Axis styling
speed_strip_fig.update_xaxes(title=None,
    showgrid=False, ticklabelstandoff=15,
    showline=True, linewidth=1.5, linecolor=viz_custom['axis_colour'],

    # Wrap model name: put the "(Mon YYYY)" part on its own line. The tick
    # positions 0..n-1 are paired with MODEL_NAME_SORT, i.e. this relies on the
    # x categories being drawn in that same order.
    ticktext=[label.replace(" (", "<br>(") for label in MODEL_NAME_SORT],
    tickvals=list(range(len(MODEL_NAME_SORT)))
)
speed_strip_fig.update_yaxes(title=f'Individual Speeds ({TOTAL_RUNS} runs)',
    gridcolor=viz_custom['grid_colour'], ticklabelstandoff=10,
    # rangemode='tozero',
    showline=True, linewidth=1.5, linecolor=viz_custom['axis_colour'],
)

# Layout styling
speed_strip_fig.update_layout(showlegend=False,
    margin=dict(t=100, l=90, r=60, b=80),
    font=dict(color=viz_custom['font_colour']),
)

# 4. Output & Display
# speed_strip_fig.write_image('speed_strip_fig.svg')
print(wrap_my_prompt(SPEED_PROMPT, 'speed'))
speed_strip_fig.show()

Test Prompt (speed):
~~~~~~~~~~~~~~~~~~~~
Explain photosynthesis in a concise paragraph.



## Summary Table

In [None]:
# 1. Constants/Configuration
speed_summary_table_title = 'Model Speed (tokens per second) — higher is faster'

# 2. Data Processing
# Per model: number of runs plus mean / median / standard deviation of
# tokens_per_second, with rows put in MODEL_NAME_SORT order.
speed_summary_df = (
    speed_results_df.groupby('model_name', sort=False)
    .agg({
        'run': 'count',
        'tokens_per_second': ['mean', 'median', 'std']
    })
    # Replace the two-level column index produced by agg() with flat names.
    .set_axis(['Runs', 'Mean Speed', 'Median Speed', 'Std.Dev'], axis=1)
    .reindex(MODEL_NAME_SORT)
)

# 3. Styling
speed_summary_styled = (
    speed_summary_df.style
    .set_caption(f'Summary: {speed_summary_table_title}')
    .set_table_styles([{'selector': 'caption', 'props': [('text-align', 'left')]}])
    .format(precision=2)
    # Same fixed background for every cell of the 'Runs' column.
    .map(lambda x: f'background-color: {viz_custom["runs_colour"]}; color: black;', subset=['Runs'])
    .background_gradient(cmap='Purples', subset=['Median Speed'], low=0, high=0.3)
)

# 4. Output
display(speed_summary_styled)

Unnamed: 0_level_0,Runs,Mean Speed,Median Speed,Std.Dev
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Opus 3 (Feb 2024),50,25.27,25.51,1.71
Haiku 3 (Mar 2024),50,96.05,97.37,8.3
Haiku 3.5 (Oct 2024),50,46.2,46.22,3.5
Sonnet 3.5 (Jun 2024),50,55.61,56.16,6.32
Sonnet 3.5 (Oct 2024),50,54.07,53.98,6.23


<br><a name="id-capability-results"></a>
# **CAPABILITY RESULTS**

## Summary Table

In [None]:
# 1. Constants/Configuration
prompt_solution = CAPABILITY_PROMPT['number_solution']
cap_summary_table_title = f'Summary: Model Capability by Correct Response ({prompt_solution} = Correct)'

def format_number_to_3dp(x):
    """Render numeric column labels with 3 decimals; leave other labels as they are."""
    return f"{float(x):.3f}" if isinstance(x, (int, float)) else x

# 2. Data Processing
model_cat = pd.Categorical(
    cap_results_df['model_name'], categories=MODEL_NAME_SORT, ordered=True)

# Replies that are not a number become NaN instead of crashing the cell (a bare
# astype(float) raises on the first non-numeric reply). The final astype(float)
# keeps the column labels as floats so they are still rendered as e.g. "18.000".
numeric_responses = (
    pd.to_numeric(cap_results_df['prompt_response'], errors='coerce')
    .astype(float)
)

# Build summary table: one row per model, one column per distinct numeric
# answer (counts), plus the run total and the share of correct answers.
cap_summary_df = (
    pd.crosstab(model_cat, numeric_responses)
    .rename_axis('model_name')
    .assign(
        Runs=lambda df: df.sum(axis=1),
        **{'Correct %': cap_results_df.groupby('model_name', observed=True)['is_correct'].mean()}
    )
    .rename(columns=format_number_to_3dp)
)

# 3. Styling
cap_summary_styled = (
    cap_summary_df.style
    .set_caption(cap_summary_table_title)
    .set_table_styles([{'selector': 'caption', 'props': [('text-align', 'left')]}])
    .format({
        'Runs': '{:.0f}',
        'Correct %': '{:.0%}' # Format decimal as percent
    })
    .map(lambda x: f'background-color: {viz_custom["runs_colour"]}; color: black;',
         subset=['Runs'])
    .background_gradient(cmap='Purples', subset=['Correct %'], low=0, high=0.2)
)

# 4. Output
print(wrap_my_prompt(CAPABILITY_PROMPT['prompt'], 'capability',width=68))
print()
display(cap_summary_styled)

Test Prompt (capability):
~~~~~~~~~~~~~~~~~~~~~~~~~
What is the geometric monthly fecal coliform mean of a distribution
system with the following FC counts: 24, 15, 7, 16, 31 and 23? The
result will be inputted into a NPDES DMR, therefore, round to the
nearest whole number. 
**IMPORTANT!!:** Respond only with a number
and strictly nothing else!!




prompt_response,17.000,18.000,19.000,Runs,Correct %
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Opus 3 (Feb 2024),1,47,2,50,94%
Haiku 3 (Mar 2024),6,44,0,50,88%
Haiku 3.5 (Oct 2024),37,13,0,50,26%
Sonnet 3.5 (Jun 2024),13,37,0,50,74%
Sonnet 3.5 (Oct 2024),46,4,0,50,8%


## Barchart

In [None]:
# 1. Create Base Visualization
# Horizontal bar chart of the share of correct answers per model.
cap_bar_fig_title = 'Model Capability: Correct Math Solution Responses (%)'
data_for_plot = cap_summary_df['Correct %'].reset_index()

cap_bar_fig = px.bar(data_for_plot, x='Correct %', y='model_name',
    title=cap_bar_fig_title,
    template=viz_custom['template'],
    width=600, height=350,
    text='Correct %',
)

# 2. Add Supplementary Elements
# Update text position and format
cap_bar_fig.update_traces(
    marker_color=viz_custom['runs_colour'], textfont=dict(color='white'),
    texttemplate='%{text:.0%} ', textposition='auto',
)

# 3. Style & Format
# Axis styling
# The run count comes from TOTAL_RUNS (as in the other figures) so the label
# cannot drift from the configured number of runs.
cap_bar_fig.update_xaxes(title=f'Correct Responses % (same prompt run {TOTAL_RUNS} times per model)', showgrid=False,
    gridcolor=viz_custom['grid_colour'],
    showline=False, showticklabels=False,
)
cap_bar_fig.update_yaxes(title='', showgrid=False,
    categoryarray=MODEL_NAME_SORT[::-1],  # Reverse order
    categoryorder='array',
    ticklabelstandoff=10,
    showline=True, linewidth=1, linecolor=viz_custom['axis_colour'],
)

# Layout styling
cap_bar_fig.update_layout(showlegend=False,
    margin=dict(t=100, l=80, r=80, b=60),
    font=dict(color=viz_custom['font_colour']),
)

# 4. Output & Display
# cap_bar_fig.write_html('cap_bar_fig.html')
# cap_bar_fig.write_image('cap_bar_fig.svg')
print(wrap_my_prompt(CAPABILITY_PROMPT['prompt'], 'capability'))
cap_bar_fig.show()

Test Prompt (capability):
~~~~~~~~~~~~~~~~~~~~~~~~~
What is the geometric monthly fecal coliform mean of a distribution system with
the following FC counts: 24, 15, 7, 16, 31 and 23? The result will be inputted
into a NPDES DMR, therefore, round to the nearest whole number.
**IMPORTANT!!:** Respond only with a number and strictly nothing else!!



<br><a name="id-costs"></a>
# **COSTS**

## Quoted Costs

In [None]:
import pandas as pd

# Display labels for the two price columns (reused for renaming, formatting and bars)
input_price_col = 'Cost per Million Input Tokens'
output_price_col = 'Cost per Million Output Tokens'
price_cols = [input_price_col, output_price_col]

# Pricing columns taken from MODELS and relabelled for presentation
df = (
    pd.DataFrame(MODELS)[['model_name', 'cost_input_mtok', 'cost_output_mtok']]
    .rename(columns={
        'model_name': 'Model Name',
        'cost_input_mtok': input_price_col,
        'cost_output_mtok': output_price_col,
    })
)

# Highest price found in either column: the shared upper bound for the in-cell bars
max_value = max(df[col].max() for col in price_cols)

# Dollar formatting plus bars scaled on one common 0..max_value range (axis=None)
styled_df = (
    df.style
    .format({col: '${:.2f}' for col in price_cols})
    .bar(subset=price_cols, vmin=0, vmax=max_value, align='left', axis=None)
    .hide(axis='index')
    .set_caption('Summary: Anthropic Model Pricing ($US/M tokens) • Feb 2025')
    .set_table_styles([
        {'selector': 'caption', 'props': [('text-align', 'left')]}
    ])
)

# Render the styled table
display(styled_df)

Model Name,Cost per Million Input Tokens,Cost per Million Output Tokens
Opus 3 (Feb 2024),$15.00,$75.00
Haiku 3 (Mar 2024),$0.25,$1.25
Sonnet 3.5 (Jun 2024),$3.00,$15.00
Sonnet 3.5 (Oct 2024),$3.00,$15.00
Haiku 3.5 (Oct 2024),$0.80,$4.00


## Cost Calcs

- Without totals

In [None]:
# Per-run columns needed to price each API call
cost_source_cols = [
    'model_test_type', 'model_name',
    'input_tokens', 'output_tokens',
    'USD_per_input_token', 'USD_per_output_token',
]

# Internal cost column -> label used in the displayed table
cost_labels = {
    'input_cost': 'Input Token Cost',
    'output_cost': 'Output Token Cost',
    'total_cost': 'Total Cost',
}

# Cost per run (tokens x unit price), summed per test type and model
costs_df = (
    all_results_df[cost_source_cols]
    .assign(
        # Ordered categorical so grouped rows follow MODEL_NAME_SORT
        model_name=lambda runs: pd.Categorical(runs['model_name'], categories=MODEL_NAME_SORT, ordered=True),
        input_cost=lambda runs: runs['input_tokens'] * runs['USD_per_input_token'],
        output_cost=lambda runs: runs['output_tokens'] * runs['USD_per_output_token'],
        # assign() evaluates keywords in order, so the two columns above already exist here
        total_cost=lambda runs: runs['input_cost'] + runs['output_cost'],
    )
    .groupby(['model_test_type', 'model_name'], observed=True)[list(cost_labels)]
    .sum()
    .rename(columns=cost_labels)
)

display(costs_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,Input Token Cost,Output Token Cost,Total Cost
model_test_type,model_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
capability,Opus 3 (Feb 2024),0.06525,0.01875,0.084
capability,Haiku 3 (Mar 2024),0.001087,0.000313,0.0014
capability,Haiku 3.5 (Oct 2024),0.00348,0.001,0.00448
capability,Sonnet 3.5 (Jun 2024),0.01305,0.00375,0.0168
capability,Sonnet 3.5 (Oct 2024),0.01305,0.00375,0.0168
speed,Opus 3 (Feb 2024),0.01425,0.522075,0.536325
speed,Haiku 3 (Mar 2024),0.000237,0.007695,0.007933
speed,Haiku 3.5 (Oct 2024),0.00076,0.029596,0.030356
speed,Sonnet 3.5 (Jun 2024),0.00285,0.110685,0.113535
speed,Sonnet 3.5 (Oct 2024),0.00285,0.111225,0.114075


- With totals

In [None]:
# 2. Calculate subtotals and grand total
# Subtotals: one row per first-level index value of costs_df
subtotals = costs_df.groupby(level=0).sum()
# Grand total: column sums over every row of costs_df
grand_total = costs_df.sum()

# 3. Combine everything with proper indexing
# Subtotal rows get 'Subtotal' as their second index level
subtotal_rows = subtotals.assign(model_name='Subtotal').set_index('model_name', append=True)

# The grand total becomes a single row keyed ('Grand Total', '')
grand_total_index = pd.MultiIndex.from_tuples([('Grand Total', '')])
grand_total_row = pd.DataFrame([grand_total], index=grand_total_index)

# Order: original rows, then subtotals, then the grand total
final_df = pd.concat([costs_df, subtotal_rows, grand_total_row])

# 4. Styling with enhanced formatting
def style_table(styler):
    """Apply number formats, a cost gradient and totals highlighting to a cost table.

    The row mask is built from the frame attached to ``styler`` (``styler.data``),
    so the function works for whichever Styler it is given rather than depending
    on a notebook-global DataFrame.

    Args:
        styler: pandas Styler over a frame with a two-level row index and the
            columns 'Input Token Cost', 'Output Token Cost' and 'Total Cost'.
            Rows whose second index level is 'Subtotal' or '' are treated as
            totals rows.

    Returns:
        The same ``styler`` object, with the styling registered on it.
    """
    row_index = styler.data.index

    # Format numbers
    styler.format({
        'Input Token Cost': '{:.4f}',
        'Output Token Cost': '{:.4f}',
        'Total Cost': '$ {:.2f}',
    })

    # Mask of regular rows (everything that is not a subtotal / grand-total row)
    regular_rows = ~row_index.get_level_values(1).isin(['Subtotal', ''])

    # Apply background gradient only to regular rows, so totals do not skew the colour scale
    if regular_rows.any():
        styler.background_gradient(
            cmap='YlOrRd',
            subset=pd.IndexSlice[regular_rows, ['Total Cost']]
        )

    # Style totals rows
    def highlight_totals(x):
        """Return a frame of CSS strings shaped like ``x``; empty for regular rows."""
        css = pd.DataFrame('', index=x.index, columns=x.columns)
        for idx in x.index:
            if idx[1] == 'Subtotal':
                css.loc[idx] = 'background-color: #806FC3; font-weight: bold'
            elif idx[0] == 'Grand Total':
                css.loc[idx] = 'background-color: #5C48AD; font-weight: bold'
        return css

    # axis=None hands the whole frame to highlight_totals in one call
    styler.apply(highlight_totals, axis=None)

    # Add caption and table styles
    styler.set_caption('Summary: Costs by Test Type and Model (US$)')
    styler.set_table_styles([
        {'selector': 'caption', 'props': [('text-align', 'left')]},
        {'selector': 'th', 'props': [('text-align', 'left')]},
        # Separator lines at fixed row positions (6, 11 and the last row).
        # NOTE(review): these positions are hard-coded; confirm they still match
        # the table layout if the number of models changes.
        {'selector': 'tr:nth-child(6)', 'props': [('border-top', '2px solid #666')]},
        {'selector': 'tr:nth-child(11)', 'props': [('border-top', '2px solid #666')]},
        {'selector': 'tr:last-child', 'props': [('border-top', '2px solid #666')]}
    ])

    # Force black text in the 'Total Cost' column
    styler.set_properties(subset=['Total Cost'], **{'color': 'black'})
    return styler

# 5. Apply styling: pipe the table's Styler through style_table
costs_df_styled = final_df.style.pipe(style_table)

# 6. Display
display(costs_df_styled)

Unnamed: 0,Unnamed: 1,Input Token Cost,Output Token Cost,Total Cost
capability,Opus 3 (Feb 2024),0.0653,0.0187,$ 0.08
capability,Haiku 3 (Mar 2024),0.0011,0.0003,$ 0.00
capability,Haiku 3.5 (Oct 2024),0.0035,0.001,$ 0.00
capability,Sonnet 3.5 (Jun 2024),0.0131,0.0038,$ 0.02
capability,Sonnet 3.5 (Oct 2024),0.0131,0.0038,$ 0.02
speed,Opus 3 (Feb 2024),0.0142,0.5221,$ 0.54
speed,Haiku 3 (Mar 2024),0.0002,0.0077,$ 0.01
speed,Haiku 3.5 (Oct 2024),0.0008,0.0296,$ 0.03
speed,Sonnet 3.5 (Jun 2024),0.0029,0.1107,$ 0.11
speed,Sonnet 3.5 (Oct 2024),0.0029,0.1112,$ 0.11


<br><a name="id-results-summary"></a>
# **RESULTS SUMMARY**

## Combine all

In [None]:
# Builds one summary row per model from the speed, capability and cost results.
# Relies on costs_df, speed_summary_df, cap_summary_df and MODEL_NAME_SORT from earlier cells.

# 1. Remove test_type from costs
# Sum the per-test-type cost rows into a single row per model
costs_df_simple = costs_df.groupby('model_name', observed=True).sum()

# 2. Combine speed, capability, costs
# Start from the full model list and left-join each result set, so every model in
# MODEL_NAME_SORT keeps a row (with NaN) even if one of the result frames lacks it.
full_summary_df = pd.DataFrame({
    'model_name': MODEL_NAME_SORT
}).set_index('model_name').join(
    speed_summary_df[['Median Speed']],
    how='left'
).join(
    # Multiplied by 100 so the column can be shown with the '{:.0f} %' format below
    cap_summary_df[['Correct %']] * 100,
    how='left'
).join(
    costs_df_simple[['Input Token Cost', 'Output Token Cost', 'Total Cost']],
    how='left'
).reset_index()

# Add Styling
full_summary_df_styled = (
    full_summary_df.style
    .set_caption("Complete Model Performance Summary")
    .set_table_styles([{'selector': 'caption', 'props': [('text-align', 'left')]}])
    .format({
        'Median Speed': '{:.2f}',
        'Correct %': '{:.0f} %',
        # Four decimals, matching how these two columns are shown in the costs table
        'Input Token Cost': '{:.4f}',
        'Output Token Cost': '{:.4f}',
        'Total Cost': '$ {:.2f}',
    })
    .background_gradient(cmap='YlOrRd', subset=['Total Cost'])
    .hide(axis='index')
)

# Total spend across all models and both tests, as display text
all_costs = f"${costs_df_simple['Total Cost'].sum():.2f} US"


display(full_summary_df_styled)
print(f'Total Cost of Experiment: {all_costs}')

model_name,Median Speed,Correct %,Input Token Cost,Output Token Cost,Total Cost
Opus 3 (Feb 2024),25.51,94 %,0.0795,0.540825,$ 0.62
Haiku 3 (Mar 2024),97.37,88 %,0.001325,0.008008,$ 0.01
Haiku 3.5 (Oct 2024),46.22,26 %,0.00424,0.030596,$ 0.03
Sonnet 3.5 (Jun 2024),56.16,74 %,0.0159,0.114435,$ 0.13
Sonnet 3.5 (Oct 2024),53.98,8 %,0.0159,0.114975,$ 0.13


Total Cost of Experiement: $0.93 US


## Visualise all

In [None]:
# VISUAL: Capability v Speed v Cost
# Bubble chart: x = median speed, y = correct-response percentage, bubble size = total cost.
# Relies on full_summary_df, all_costs, MODELS, TOTAL_RUNS, px and viz_custom from earlier cells.
full_summary_fig_title = 'All Results: Capability vs Speed vs Total Cost'

# Side-note text. Run counts are derived from MODELS and TOTAL_RUNS so the
# arithmetic shown always matches the configuration instead of a hard-coded 50 / 500.
n_test_types = 2  # speed test (prompt 1) + capability test (prompt 2)
total_runs_all = len(MODELS) * n_test_types * TOTAL_RUNS
full_summary_notes = '<br>'.join([
    'Notes',
    '• x-axis: Speed test results (prompt 1)',
    '• y-axis: Capability results (prompt 2)',
    f'• Each test run x{TOTAL_RUNS} times per model',
    '',  # blank line between the two sections
    'Experiment Summary',
    f'• Total Runs: {len(MODELS)} x {n_test_types} x {TOTAL_RUNS} = {total_runs_all}',
    '• Prompt 1: concise text explanation',
    '• Prompt 2: math calculation',
    f'• Total Cost: {all_costs}',
])

# 1. Create Base Visualization
full_summary_fig = px.scatter(
    # Core Data Parameters
    full_summary_df, x='Median Speed', y='Correct %',
    # Visual Identity Parameters
    color='model_name',
    size='Total Cost',
    size_max = 50,
    color_discrete_sequence=viz_custom['anthrop_colours'],
    # Add text display
    text='Total Cost',
    # Plot Structure Parameters
    title=full_summary_fig_title,
    labels={
        'Median Speed': 'Median Speed (tokens per second)',
        'Correct %': 'Correct Responses (%)'
    },
    width=900, height=580
)

# Simple layout update with only essential parameters
full_summary_fig.update_layout(
    # scaleanchor/scaleratio tie the y-axis scale to the x-axis (one unit of each is drawn the same length)
    yaxis=dict( scaleanchor='x', scaleratio=1, ticksuffix='% ', range=[0, 100] ),
    font=dict(color=viz_custom['font_colour']),
    plot_bgcolor=viz_custom['plot_bgcolour'],
    # Wide right margin leaves room for the legend and the notes annotation
    margin=dict(t=100, l=90, r=300, b=80),
    legend=dict(title=None),
    annotations=[
        dict(
            text=full_summary_notes,
            # Position: paper coordinates; x > 1 places the text to the right of the plot area
            xref="paper", yref="paper", x=1.52, y=0.0, showarrow=False, align="left", font=dict(size=12)
        )
    ]
)

# Add text positioning configuration: show each bubble's cost as dollars, right of the marker
full_summary_fig.update_traces(
    texttemplate='$%{text:.2f}',
    textposition='middle right',
    textfont=dict(size=13)
)

# Output
# full_summary_fig.write_html('speed_cap_cost_fig.html')
# full_summary_fig.write_image('speed_cap_cost_fig.svg')
full_summary_fig.show()

# Viz Check

In [None]:
# Optional visual check. As written, this cell only prints the hint below.
# The commented-out lines, when uncommented, re-display the two test prompts,
# the styled summary tables and the figures by name, one after another with
# blank lines in between. They presumably all come from earlier cells; run
# those first before uncommenting.
print('(uncomment to see everything together.)\n')

# print(wrap_my_prompt(SPEED_PROMPT, 'speed'))
# print("\n")
# print(wrap_my_prompt(CAPABILITY_PROMPT['prompt'], 'capability'))
# print("\n\n")

# display(speed_summary_styled)
# print("\n\n")
# display(cap_summary_styled)
# print("\n\n")
# display(costs_df_styled)
# print("\n\n")
# display(full_summary_df_styled)
# print("\n\n")
# speed_line_fig.show()
# print("\n\n")
# speed_strip_fig.show()
# print("\n\n")
# cap_bar_fig.show()
# print("\n\n")
# full_summary_fig.show()
# print("\n\n")



(uncomment to see everything together.)

