In [1]:
# Configure environment for anywidget.
# This runs before the inspect_viz imports below; presumably the variable is
# read when the widget libraries are imported, so keep it first — TODO confirm.
# NOTE(review): confirm `ANYWIDGET_HUB` is the intended variable name.
import os
os.environ['ANYWIDGET_HUB'] = 'false'

# packages and libraries
from inspect_ai.log import read_eval_log
from inspect_ai.analysis import log_viewer, prepare, evals_df, samples_df, EvalModel, EvalResults, EvalScores, EvalInfo, EvalTask, SampleSummary, model_info, task_info
from inspect_viz import Data
from inspect_viz.plot import plot
from inspect_viz.view.beta import scores_by_task, scores_heatmap, scores_radar_by_task, scores_by_model, scores_radar_by_task_df, scores_by_factor, scores_timeline, scores_by_limit_df, scores_by_limit
import pandas as pd

In [2]:
def samples_df_prep(logs):
    """Read `logs` into a sample-level dataframe for plotting.

    Thin wrapper around `samples_df` that requests the `SampleSummary`,
    `EvalInfo` and `EvalModel` column groups.

    Parameters:
    - logs: Passed through unchanged as the `logs` argument of `samples_df`.

    Returns: The dataframe produced by `samples_df`.
    """
    sample_columns = SampleSummary + EvalInfo + EvalModel
    return samples_df(logs=logs, columns=sample_columns)

def evals_df_prep(logs):
    """Read `logs` into an eval-level dataframe for plotting.

    Thin wrapper around `evals_df` that requests the `EvalInfo`, `EvalTask`,
    `EvalModel`, `EvalResults` and `EvalScores` column groups.

    Parameters:
    - logs: Passed through unchanged as the `logs` argument of `evals_df`.

    Returns: The dataframe produced by `evals_df`.
    """
    eval_columns = EvalInfo + EvalTask + EvalModel + EvalResults + EvalScores
    return evals_df(logs=logs, columns=eval_columns)

def load_experiment_logs(experiment_path, subdirs, log_viewer_url, metadata_extractors, samples=False):
    """
    Load evaluation logs from an experiment with multiple subdirectories.

    Parameters:
    - experiment_path: Path to main experiment directory
    - subdirs: List of subdirectory names; logs are read from
      <experiment_path>/<subdir>/eval/logs
    - log_viewer_url: URL for log viewer (e.g., "http://localhost:8000/my_logs/")
    - metadata_extractors: Dict mapping column names to functions that take the
      dataframe and return the values for that column. Applied in dict order,
      after `model_path` has been added, so an extractor may read `model_path`
      and any column written by an earlier extractor. May be None or empty.
      E.g., {"model": lambda df: df['model_path'].str.extract(r'...', expand=False)}
    - samples: If True, build a sample-level dataframe (samples_df_prep);
      otherwise an eval-level dataframe (evals_df_prep).

    Returns: Prepared dataframe ready for visualization

    Raises:
    - OSError (e.g. FileNotFoundError) from os.listdir if a log directory is missing
    - KeyError if a log's model_args has no 'model_path' entry
    """
    # Build paths and the log-directory -> viewer-URL mapping
    log_paths = [os.path.join(experiment_path, subdir, "eval", "logs") for subdir in subdirs]
    log_dirs = {path: log_viewer_url for path in log_paths}

    # Collect log files. os.listdir order is arbitrary, so sort for a
    # reproducible row order, and skip anything that is not an Inspect log
    # (.eval / .json) so stray files do not break read_eval_log.
    logs = [
        os.path.join(path, name)
        for path in log_paths
        for name in sorted(os.listdir(path))
        if name.endswith((".eval", ".json"))
    ]

    # Read each log header once (header_only avoids loading every sample) and
    # key model_path by eval_id rather than by position in `logs`.
    model_path_by_eval = {}
    for log in logs:
        header = read_eval_log(log, header_only=True)
        model_path_by_eval[header.eval.eval_id] = header.eval.model_args['model_path']

    logs_df = samples_df_prep(logs) if samples else evals_df_prep(logs)

    # Map by eval_id: a sample-level frame has many rows per log, so assigning
    # the per-log list positionally would fail on length (and would silently
    # depend on row order for eval-level frames).
    logs_df['model_path'] = logs_df['eval_id'].map(model_path_by_eval)
    for col_name, extractor in (metadata_extractors or {}).items():
        logs_df[col_name] = extractor(logs_df)

    # Prepare with log viewer
    return prepare(logs_df, [log_viewer("eval", log_dirs)])



In [3]:
# Experiment 1 (model x word length): load the eval-level logs.
# The new model naming convention already identifies 4 different model types;
# the extractors below further separate the runs by task and epoch.

def _exp1_model_label(df):
    """Cleaned model name `{size}_epoch{epoch}` (same information as the default, but easier to read)."""
    model_col = df['model']
    size = model_col.str.extract(r'Llama-3\.2-(?P<size>\d+B)', expand=False)
    epoch = model_col.str.extract(r'epoch_(?P<epoch>\d+)', expand=False)
    return size + '_epoch' + epoch


def _exp1_word_length(df):
    """Word-length label taken from `task_arg_data_path`; used to overwrite `task_name`."""
    return df['task_arg_data_path'].str.extract(r'words_(?P<task_name>\d+L)', expand=False)


logs_df = load_experiment_logs(
    experiment_path="/home/st0898/MSALGANIK/st0898/ck-experiments/cap_wordlen_2026-01-12",
    subdirs=[
        "Llama-3.2-1B-Instruct_5L",
        "Llama-3.2-3B-Instruct_5L",
    ],
    log_viewer_url="http://localhost:8000/cap_wordlen_logs_viewer/",
    metadata_extractors={
        "model": _exp1_model_label,
        "task_name": _exp1_word_length,
    },
    samples=False,
)

In [4]:
# Experiment 1 (model x word length): wrap the dataframe for inspect_viz.
evals = Data.from_dataframe(logs_df)

# Match accuracy per task; ci=0.95 is passed alongside the stderr column.
scores_by_task(
    evals,
    task_name="task_name",
    score_value="score_match_accuracy",
    score_stderr="score_match_stderr",
    score_label="Match Accuracy",
    ci=0.95,
)

Component(spec='{"hconcat":[{"plot":[{"mark":"barY","data":{"from":"Jon2CgnbyiZksF8qRz534H","filterBy":"$selec…

In [5]:
# Includes accuracy per task (same `evals` data as the match-accuracy plot).
scores_by_task(
    evals,
    task_name="task_name",
    score_value="score_includes_accuracy",
    score_stderr="score_includes_stderr",
    score_label="Includes Accuracy",
    ci=0.95,
)

Component(spec='{"hconcat":[{"plot":[{"mark":"barY","data":{"from":"Jon2CgnbyiZksF8qRz534H","filterBy":"$selec…

In [6]:
# Experiment 1 (model x word length): match accuracy heatmap.
scores_heatmap(
    evals,
    task_name="task_name",
    model_name="model_display_name",
    model_label="Model",
    score_value="score_match_accuracy",
    tip=True,
    title="",
)

Component(spec='{"hconcat":[{"plot":[{"mark":"text","text":[""],"dy":-20,"frameAnchor":"top","fontSize":16,"fa…

In [7]:
# Experiment 1 (model x word length): includes accuracy heatmap, horizontal orientation.
scores_heatmap(
    evals,
    task_name="task_name",
    model_name="model_display_name",
    model_label="Model",
    score_value="score_includes_accuracy",
    tip=True,
    title="",
    orientation="horizontal",
)

Component(spec='{"hconcat":[{"plot":[{"mark":"text","text":[""],"dy":-20,"frameAnchor":"top","fontSize":16,"fa…

In [None]:
# Build the radar-plot dataframe with min-max normalization over the (0, 1) domain.
# Reads `logs_df`, which later cells in this notebook reassign, so run this
# before those cells.
evals_scores = scores_radar_by_task_df(logs_df, normalization="min_max", domain=(0, 1))

Component(spec='{"hconcat":[{"plot":[{"mark":"line","data":{"from":"3sFS4ndsgsAjxnuPRm6ibW","filterBy":"$selec…

In [None]:
# Radar plot by task, drawn from the normalized `evals_scores` dataframe.
evals_radar = Data.from_dataframe(evals_scores)

scores_radar_by_task(evals_radar)

In [9]:
# Experiment 2 (model x system prompt): load the eval-level logs.
logs_df_2 = load_experiment_logs(
    experiment_path="/home/st0898/MSALGANIK/st0898/ck-experiments/cap_model_prompt_2025-11-21",
    subdirs=[
        "Llama-3.2-1B-Instruct_no_prompt",
        "Llama-3.2-3B-Instruct_no_prompt",
        "Llama-3.2-1B-Instruct_with_prompt",
        "Llama-3.2-3B-Instruct_with_prompt",
    ],
    log_viewer_url="http://localhost:8000/cap_prompts_logs_viewer/",
    metadata_extractors={
        "model": lambda df: df['model_path'].str.extract(r'(Llama-3\.2-\d+B)', expand=False),
        "prompt_type": lambda df: df['task_arg_config_path'].str.extract(r'Instruct_(?P<prompt_type>with_prompt|no_prompt)', expand=False),
    },
    samples=False,
)

# Recode prompt_type as a boolean: True for with_prompt, False for no_prompt.
# Rows where the pattern did not match also compare unequal and become False.
logs_df_2['prompt_type'] = logs_df_2['prompt_type'].eq('with_prompt')

In [10]:
# Experiment 2 (model x system prompt): scores_by_factor plot.
# NOTE: the log viewer does not appear here (scores_by_factor does not link to
# logs) - needs investigation. The same thing happens in the example docs.
# This cell reassigns `evals`, which the experiment 1 plot cells also use.
evals = Data.from_dataframe(logs_df_2)

scores_by_factor(evals, "prompt_type", ("No Prompt", "Prompt"))


Component(spec='{"hconcat":[{"plot":[{"mark":"frame","anchor":"left","insetTop":5,"insetBottom":5},{"mark":"ru…

In [9]:
# Experiment 4 (cross-organization model comparison): load the eval-level logs.
logs_df_4 = load_experiment_logs(
    experiment_path="/home/st0898/MSALGANIK/st0898/ck-experiments/cap_cross_org_models_2025-11-21",
    subdirs=[
        "Google-Gemma-2B",
        "Meta-Llama-3.2-1B-Instruct",
    ],
    log_viewer_url="http://localhost:8000/cap_crossorg_logs_viewer/",
    metadata_extractors={
        # model label is the matching directory-style name found in model_path
        "model": lambda df: df['model_path'].str.extract(r'(Google-Gemma-2B|Meta-Llama-3\.2-1B-Instruct)', expand=False),
    },
    samples=False,
)

In [10]:
# Experiment 4 (cross-organization model comparison): scores_by_model plot.
evals_4 = Data.from_dataframe(logs_df_4)

scores_by_model(evals_4, height=200)

Component(spec='{"hconcat":[{"plot":[{"mark":"ruleY","data":{"from":"HJfDdzcFxf38KGZCVoXRHM","filterBy":"$sele…

In [None]:
""" # load experiment 3 data (model x token limit)
logs_df_3 = load_experiment_logs(
    experiment_path="/home/st0898/MSALGANIK/st0898/ck-experiments/cap_token_limit_2026-01-06",
    subdirs=["Llama-3.2-1B-Instruct", "Llama-3.2-1B-Instruct"],
    log_viewer_url="http://localhost:8000/cap_tokenlim_viewer/",
    metadata_extractors={
        "model": lambda df: df['model_path'].str.extract(r'(Llama-3\.2-\d+B)', expand=False)

    },
    samples=True
  )  """


# Experiment 3 (model x token limit): sample-level data for scores_by_limit.
# Built inline because, besides model_path, each sample also needs a `limit`
# value taken from its eval's task args.

# Build paths and LOG_DIRS mapping
log_paths = [os.path.join("/home/st0898/MSALGANIK/st0898/ck-experiments/cap_token_limit_2026-01-06", subdir, "eval", "logs") for subdir in ["Llama-3.2-1B-Instruct", "Llama-3.2-3B-Instruct"]]
LOG_DIRS = {path: "http://localhost:8000/cap_tokenlim_viewer/" for path in log_paths}

# Collect all log files. Sorted for a reproducible order; only Inspect log
# files (.eval / .json) are kept so stray directory entries are not read.
logs = []
for path in log_paths:
    logs.extend(
        os.path.join(path, f)
        for f in sorted(os.listdir(path))
        if f.endswith((".eval", ".json"))
    )

# Read each log header once (header_only skips the samples) and pull the
# eval id, model path and token limit from the same header.
headers = [read_eval_log(log, header_only=True) for log in logs]
eval_ids = [header.eval.eval_id for header in headers]
model_paths = [header.eval.model_args['model_path'] for header in headers]
token_limits = [header.eval.task_args['max_tokens'] for header in headers]

# Read into sample-level dataframe
logs_df = samples_df(
    logs=logs,
    columns=(SampleSummary + EvalInfo + EvalModel)
)

# Map eval_id -> value using ids taken from the log headers themselves, so the
# pairing does not depend on the row order of the dataframe.
eval_to_model = dict(zip(eval_ids, model_paths))
eval_to_limit = dict(zip(eval_ids, token_limits))
logs_df['model_path'] = logs_df['eval_id'].map(eval_to_model)
logs_df['limit'] = logs_df['eval_id'].map(eval_to_limit)

# NOTE(review): `limit` is used as read from task_args; apply pd.to_numeric
# here if the logged max_tokens values turn out not to be numeric.

# Replace "model" field with the Llama model that was used; rows whose
# model_path does not match the pattern stay missing.
logs_df['model'] = logs_df['model_path'].str.extract(r'cap-token-limit-(1B|3B)', expand=False).map(
    lambda size: f'Llama-3.2-{size}-Instruct', na_action='ignore'
)

# Convert score_match to numeric (1 for correct, 0 for incorrect)
logs_df['score_match'] = (logs_df['score_match'] == 'C').astype(int)

logs_df = scores_by_limit_df(
    logs_df,
    score="score_match",
    limit="limit"
)

logs_df_3 = prepare(logs_df, [
    model_info(),
    log_viewer("eval", LOG_DIRS)
])


  "model": lambda df: df['model_path'].str.extract(r'(Llama-3\.2-\d+B)', expand=False)


: 

In [12]:
# Experiment 3 (model x token limit): scores_by_limit plot over the `limit` column.
evals_3 = Data.from_dataframe(logs_df_3)

scores_by_limit(evals_3, limit="limit")

Component(spec='{"hconcat":[{"plot":[{"mark":"line","data":{"from":"3LV3V3EhRLixpeSeELk4ft","filterBy":"$selec…

In [None]:
# new scores_by_limit plot
# NOTE(review): this cell reassigns `logs_df` and `logs_df_3` and reuses the
# "cap_tokenlim_viewer" URL from the experiment 3 cell - confirm that is intended.

# Build paths and LOG_DIRS mapping
log_paths = [os.path.join("/home/st0898/MSALGANIK/st0898/ck-experiments/cap_5L_scores_limit_2026-01-16", subdir, "eval", "logs") for subdir in ["Llama-3.2-1B-Instruct", "Llama-3.2-3B-Instruct"]]
LOG_DIRS = {path: "http://localhost:8000/cap_tokenlim_viewer/" for path in log_paths}

# Collect all log files. Sorted for a reproducible order; only Inspect log
# files (.eval / .json) are kept so stray directory entries are not read.
logs = []
for path in log_paths:
    logs.extend(
        os.path.join(path, f)
        for f in sorted(os.listdir(path))
        if f.endswith((".eval", ".json"))
    )

# Read each log header once (header_only skips the samples) and pull the
# eval id and model path from the same header.
headers = [read_eval_log(log, header_only=True) for log in logs]
eval_ids = [header.eval.eval_id for header in headers]
model_paths = [header.eval.model_args['model_path'] for header in headers]

# Read into sample-level dataframe
logs_df = samples_df(
    logs=logs,
    columns=(SampleSummary + EvalInfo + EvalModel)
)

# Map eval_id -> model_path using ids taken from the log headers themselves,
# so the pairing does not depend on the row order of the dataframe.
eval_to_model = dict(zip(eval_ids, model_paths))
logs_df['model_path'] = logs_df['eval_id'].map(eval_to_model)

# No `limit` column is added and the "model" field is left as loaded;
# scores_by_limit_df is called without `limit`, so it uses its own default.

# Convert score_match to numeric (1 for correct, 0 for incorrect)
logs_df['score_match'] = (logs_df['score_match'] == 'C').astype(int)

logs_df = scores_by_limit_df(
    logs_df,
    score="score_match"
)

logs_df_3 = prepare(logs_df, [
    model_info(),
    log_viewer("eval", LOG_DIRS)
])

: 

: 

In [None]:
# TODO: create scores_radar_by_metric plot (not implemented yet)

In [None]:
# TODO: create sample_heatmap plot (not implemented yet)

In [None]:
# Check the model name shown for the logs from Mattie's experiment.
# NOTE: this cell reassigns `logs_df`, `evals`, `logs`, `log_paths` and LOG_DIRS.

# Log directories and the LOG_DIRS mapping (every directory -> the same viewer URL)
log_paths = []
for subdir in ["Llama-3.2-1B-Instruct_base", "Llama-3.2-1B-Instruct_rank4"]:
    log_paths.append(
        os.path.join("/home/st0898/MSALGANIK/niznik/ck-sanity-checks/workflow_test_base_2026-01-07", subdir, "eval", "logs")
    )
LOG_DIRS = dict.fromkeys(log_paths, "http://localhost:8000/test_viewer/")

# Every entry of every log directory, in directory order
logs = [os.path.join(path, f) for path in log_paths for f in os.listdir(path)]

# Eval-level dataframe
eval_columns = EvalInfo + EvalTask + EvalModel + EvalResults + EvalScores
logs_df = evals_df(logs=logs, columns=eval_columns)

evals = Data.from_dataframe(logs_df)

scores_by_model(evals, height=200)

Component(spec='{"hconcat":[{"plot":[{"mark":"ruleY","data":{"from":"7q9zvVoVrDvUbLijLb7hnN","filterBy":"$sele…

: 