In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import re

from collections import defaultdict, Counter
from pathlib import Path

import numpy as np
import pandas as pd

In [6]:
from lic.constants import (
    PROJECT_ROOT,
    LOGS_DIR,
    GPT_4O_MINI,
    GPT_5_MINI,
)
from collabmem.constants import REPO_ROOT

In [8]:
LOGS_DIR = REPO_ROOT / "src/lic/logs"
# !ls {LOGS_DIR / "actions/full"}

In [9]:
def load_jsonl(file_path: Path) -> list:
    data = []
    with open(file_path, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data

In [10]:
# log_file_parsing
# full_actions_gpt-5-mini_2025-08-07.jsonl
# {setting}_{task}_{model}.jsonl
# log_file_pattern = re.compile(r'^(?P<setting>.+)_(?P<task>.+)_(?P<model>.+)\.jsonl$')
# need to rework the regex: only "model" can have underscores
log_file_pattern = re.compile(
    r'^(?P<setting>[^_]+)_(?P<task>[^_]+)_(?P<model>.+)\.jsonl$'
)

def parse_log_directory(log_dir: Path, testing: bool = False) -> defaultdict:
    # task, setting, model -> outputs
    # all_outputs = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    all_outputs: dict[str, dict[str, dict[str, list]]] = {}
    for task_dir in log_dir.iterdir():
        if not task_dir.is_dir():
            continue
        task = task_dir.name
        if task not in all_outputs:
            all_outputs[task] = {}
        for setting_dir in task_dir.iterdir():
            if not setting_dir.is_dir():
                continue
            setting = setting_dir.name
            if setting not in all_outputs[task]:
                all_outputs[task][setting] = {}
            for file_path in setting_dir.iterdir():
                if file_path.suffix != '.jsonl':
                    continue
                # file path format:
                # full_actions_gpt-5-mini_2025-08-07.jsonl
                # {setting}_{task}_{model}.jsonl
                

            
                match = log_file_pattern.match(file_path.name)
                if not match:
                    print(f"Skipping unrecognized file: {file_path}")
                    continue
                if testing:
                    print(file_path.name)
                    print(f"groups: {match.groupdict()}")
                    return
                model = match.group('model')
                outputs = load_jsonl(file_path)
                all_outputs[task][setting][model] = outputs
    return all_outputs

In [11]:
# parsed_logs = parse_log_directory(LOGS_DIR, testing=True)
parsed_logs = parse_log_directory(LOGS_DIR)

In [12]:
parsed_logs.keys()

dict_keys(['math', 'code'])

In [13]:
parsed_logs["math"].keys()

dict_keys(['full', 'sharded'])

In [14]:
parsed_logs["math"]["full"].keys()

dict_keys(['local_cl_math', 'local_base_llama'])

In [18]:
dummy_model = "local_base_llama"

In [19]:
dummy = parsed_logs["math"]["full"][dummy_model]
print(f"type: {type(dummy)},\nlength: {len(dummy)},\nfirst entry:\n{dummy[0].keys()}")

type: <class 'list'>,
length: 24,
first entry:
dict_keys(['conv_id', 'conv_type', 'task', 'task_id', 'dataset_fn', 'assistant_model', 'system_model', 'user_model', 'git_version', 'trace', 'is_correct', 'score'])


## Scoring Script

In [20]:
# score dummy first
total_entries = len(dummy)
total_score = 0
for entry in dummy:
    total_score += entry.get("score", 0)
average_score = total_score / total_entries
print(f"Average score for {dummy_model} on math/full: {average_score:.2f} over {total_entries} entries")

Average score for local_base_llama on math/full: 0.71 over 24 entries


In [21]:
# scoring function for list[traces]
def score_task_entries(
    entries: list[dict],
    score_key: str = "score"
) -> tuple[float, int, int]:
    total_entries = len(entries)
    if total_entries == 0:
        return 0.0
    total_score = 0
    none_count = 0
    for entry in entries:
        score = entry.get(score_key, 0)
        if score is None:
            none_count += 1
        else:
            total_score += score
    average_score = total_score / total_entries
    return average_score, none_count, total_entries

#### Initial GPT-5-mini Baseline

In [None]:
# create a pandas DataFrame of scores
# columns: task
# rows: setting
# for now, fix model to GPT_5_MINI

df = pd.DataFrame()
for task, settings in parsed_logs.items():
    for setting, models in settings.items():
        entries = models.get(GPT_5_MINI, [])
        avg_score, none_counts, total_entries = score_task_entries(entries)
        df.loc[setting, task] = avg_score
df.index.name = "Setting"
df.columns.name = "Task"
df = df.astype(float)
df

Task,summary,math,data2text,code,actions
Setting,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sharded,0.080628,0.627451,0.274894,0.367347,0.52381
full,0.109325,0.883495,0.37626,0.49,0.885714


In [27]:
print(df.to_markdown())

| Setting   |   collabllm_math |   base_llama |
|:----------|-----------------:|-------------:|
| full      |         0.708333 |     0.708333 |
| sharded   |         0.416667 |     0.458333 |


#### Pretrained CollabLLM Baselines

In [None]:
# results I need to visualize this time:
# (code, math) x (full, sharded) x (base_llama, cl_*) where * corresponds to the (code, math) selection
# let's do 2 tables, one for code and one for math
# let's keep setting as rows and then replace columns task->model
# again let's make dataframes for easier visualization

for task in ["code", "math"]:
    print(f"Results for task: {task}")
    df = pd.DataFrame()
    for setting, models in parsed_logs[task].items():
        for model_name, entries in models.items():
            avg_score, none_counts, total_entries = score_task_entries(entries)
            # print none counts
            if none_counts > 0:
                print(f"Warning: {none_counts} entries with None score for {model_name} on {task}/{setting}")
            model_name = model_name.replace("local_", "").replace("cl_", "collabllm_")
            df.loc[setting, model_name] = avg_score
    df.index.name = "Setting"
    df.columns.name = "Model"
    df = df.astype(float)
    display(df)
    # also print df to markdown to copy paste into docsA
    print(df.to_markdown())

Results for task: code


Model,collabllm_code,base_llama
Setting,Unnamed: 1_level_1,Unnamed: 2_level_1
full,0.541667,0.521739
sharded,0.25,0.416667


| Setting   |   collabllm_code |   base_llama |
|:----------|-----------------:|-------------:|
| full      |         0.541667 |     0.521739 |
| sharded   |         0.25     |     0.416667 |
Results for task: math


Model,collabllm_math,base_llama
Setting,Unnamed: 1_level_1,Unnamed: 2_level_1
full,0.708333,0.708333
sharded,0.416667,0.458333


| Setting   |   collabllm_math |   base_llama |
|:----------|-----------------:|-------------:|
| full      |         0.708333 |     0.708333 |
| sharded   |         0.416667 |     0.458333 |


In [31]:
def prep_serve_olmo_adapters():
    adapter_directory = Path("/home/v-homatthew/collabmem/outputs/offline_dpo_from_base")
    # !ls {Path("/home/v-homatthew/collabmem/outputs/offline_dpo_from_base")}
    # collabllm-multiturn-bfcl      collabllm-multiturn-lic-code
    # collabllm-multiturn-gsm8k     collabllm-multiturn-spider
    # collabllm-multiturn-gsm8k-r3  collabllm-multiturn-totto

    # mapping {math: gsm8k-r3, code: lic-code, actions: bfcl, database: spider, data2text: totto}
    # create actual python mapping of these keys to the corresponding path
    task2adapter_path = {
        "math": adapter_directory / "collabllm-multiturn-gsm8k-r3",
        "code": adapter_directory / "collabllm-multiturn-lic-code",
        "actions": adapter_directory / "collabllm-multiturn-bfcl",
        "database": adapter_directory / "collabllm-multiturn-spider",
        "data2text": adapter_directory / "collabllm-multiturn-totto",
    }

    # need a string containing
    entries = [f'dpo_{k}="{v}"' for k, v in task2adapter_path.items()]
    print(" ".join(entries))

prep_serve_olmo_adapters()


dpo_math="/home/v-homatthew/collabmem/outputs/offline_dpo_from_base/collabllm-multiturn-gsm8k-r3" dpo_code="/home/v-homatthew/collabmem/outputs/offline_dpo_from_base/collabllm-multiturn-lic-code" dpo_actions="/home/v-homatthew/collabmem/outputs/offline_dpo_from_base/collabllm-multiturn-bfcl" dpo_database="/home/v-homatthew/collabmem/outputs/offline_dpo_from_base/collabllm-multiturn-spider" dpo_data2text="/home/v-homatthew/collabmem/outputs/offline_dpo_from_base/collabllm-multiturn-totto"
