In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import sys
from pathlib import Path

In [3]:
from collabmem.constants import LIC_DATA_PATH, LIC_EVAL_SUBSET_PATH, REPO_ROOT

In [4]:
# Put the repository's examples/ directory on the import path (once), so that
# modules stored there can be imported — presumably `single_turn_ds`, which is
# imported right after this cell.
tgt_path = REPO_ROOT / "examples"
tgt_path_str = str(tgt_path)
already_importable = tgt_path_str in sys.path
if not already_importable:
    sys.path.append(tgt_path_str)

In [5]:
from single_turn_ds import (
    GSM8K,
    BFCLSingleTurnDataset,
    LiCCode,
    SpiderDatabaseSingleTurn,
    ToTToSingleTurn,
    datasets_info,
)

2026-01-21 05:13:43,867 [INFO] collabllm: CollabLLM logging enabled.


2026-01-21 05:13:45,689 [INFO] collabllm: Disable LiteLLM cache and logging by default. 


In [6]:
# Load the full list of sharded instruction entries.
# JSON interchange files are UTF-8, so decode explicitly instead of relying on
# the platform's locale encoding (which is not UTF-8 on every system).
with open(LIC_DATA_PATH, "r", encoding="utf-8") as f:
    lic_data = json.load(f)

In [7]:
# Dataset keys to process, in the order they are handled everywhere below.
lic_dataset_names = ["gsm8k", "lic-code", "bfcl", "spider", "totto"]

# Instantiate each dataset wrapper through the class registered in datasets_info.
lic_datasets = {}
for dataset_name in lic_dataset_names:
    dataset_cls = datasets_info[dataset_name]["class"]
    lic_datasets[dataset_name] = dataset_cls()

# Convert every wrapper to its Hugging Face representation, keyed the same way.
hf_datasets = {}
for dataset_name, dataset in lic_datasets.items():
    hf_datasets[dataset_name] = dataset.to_hf_dataset()

In [8]:
# Inspect the converted "totto" dataset (its splits, feature columns and row counts).
hf_datasets["totto"]

DatasetDict({
    train: Dataset({
        features: ['single_turn_prompt', 'single_turn_completion', 'single_turn_metadata'],
        num_rows: 60
    })
    test: Dataset({
        features: ['single_turn_prompt', 'single_turn_completion', 'single_turn_metadata'],
        num_rows: 60
    })
})

In [9]:
# Shorthand for the name of the per-row metadata column, used in the row lookups below.
stm = "single_turn_metadata"

In [10]:
# Peek at the metadata attached to the first row of the "lic-code" test split.
hf_datasets["lic-code"]["test"][0][stm]

{'extraction_requirement': 'Return executable Python code only. Provide a valid function definition (def ...) that satisfies all test cases. Do not include explanations.',
 'func_name': 'by_length',
 'humaneval_raw_prompt': '\ndef by_length(arr):\n    """\n    Given an array of integers, sort the integers that are between 1 and 9 inclusive,\n    reverse the resulting array, and then replace each digit by its corresponding name from\n    "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine".\n\n    For example:\n      arr = [2, 1, 1, 4, 5, 8, 2, 3]   \n            -> sort arr -> [1, 1, 2, 2, 3, 4, 5, 8] \n            -> reverse arr -> [8, 5, 4, 3, 2, 2, 1, 1]\n      return ["Eight", "Five", "Four", "Three", "Two", "Two", "One", "One"]\n    \n      If the array is empty, return an empty array:\n      arr = []\n      return []\n    \n      If the array has any strange number ignore it:\n      arr = [1, -1 , 55] \n            -> sort arr -> [-1, 1, 55]\n            -> reve

In [11]:
# Index the raw entries by task_id for direct lookup.
# If a task_id occurs more than once, the later entry replaces the earlier one.
lic_data_dict = dict((entry["task_id"], entry) for entry in lic_data)

In [12]:
# Collect, for every row of each dataset's evaluation split, the matching entry
# from lic_data_dict. The "eval" split is used when present, otherwise "test".
lic_eval_data = []
for ds_name, hf_ds in hf_datasets.items():
    print(f"handling dataset: {ds_name}")
    split_name = "eval" if "eval" in hf_ds else "test"
    eval_split = hf_ds[split_name]
    for row in eval_split:
        row_meta = row[stm]
        if ds_name == "gsm8k":
            # gsm8k rows carry `source_task_id`; the lookup key is its upper-cased
            # form with a "sharded-" prefix.
            task_id = "sharded-" + row_meta["source_task_id"].upper()
        else:
            task_id = row_meta["task_id"]
        # A missing task_id raises KeyError here rather than being skipped.
        lic_eval_data.append(lic_data_dict[task_id])
print(f"Total eval examples collected: {len(lic_eval_data)}")

handling dataset: gsm8k
handling dataset: lic-code
handling dataset: bfcl
handling dataset: spider
handling dataset: totto
Total eval examples collected: 268


In [13]:
# Train split: every entry of lic_data_dict whose task_id was not selected for eval.
eval_task_ids = {example["task_id"] for example in lic_eval_data}
lic_train_data = [
    example
    for task_id, example in lic_data_dict.items()
    if task_id not in eval_task_ids
]
print(f"Total train examples collected: {len(lic_train_data)}")

Total train examples collected: 359


In [14]:
# Show where the full data file lives; the split files below are written into the same directory.
print(LIC_DATA_PATH)

/home/v-homatthew/collabmem/src/lic/data/sharded_instructions_600.json


In [15]:
# Persist both splits as JSON next to the full data file.
data_dir = LIC_DATA_PATH.parent
eval_split_path = data_dir / "sharded_eval_subset.json"
# Sanity check before writing anything: the eval file must land exactly where
# the project constant says the eval subset lives.
assert eval_split_path == LIC_EVAL_SUBSET_PATH
train_split_path = data_dir / "sharded_train_subset.json"

splits_to_write = (
    (eval_split_path, lic_eval_data),
    (train_split_path, lic_train_data),
)
for split_path, split_data in splits_to_write:
    with open(split_path, "w") as f:
        json.dump(split_data, f, indent=2)

### Make a mini eval subset for faster iteration

In [7]:
# Read the eval subset back from disk (the file written by the split-saving cell above).
eval_subset = json.loads(LIC_EVAL_SUBSET_PATH.read_text())

In [11]:
# Mini eval subset: the first `examples_per_task` examples of each task, in the
# order they appear in eval_subset, saved next to the full data file.
mini_eval_path = LIC_DATA_PATH.parent / "lic_mini_eval.json"
examples_per_task = 24
task_order = ("math", "code", "actions", "database", "data2text")
mini_eval_subset = [
    example
    for task in task_order
    for example in [e for e in eval_subset if e["task"] == task][:examples_per_task]
]
with open(mini_eval_path, "w") as out_file:
    json.dump(mini_eval_subset, out_file, indent=2)
    print(f"wrote mini eval subset to {mini_eval_path}")

wrote mini eval subset to /home/v-homatthew/collabmem/src/lic/data/lic_mini_eval.json


In [18]:
# Reload the mini eval subset from the file it was saved to.
mini_eval_path = LIC_DATA_PATH.parent / "lic_mini_eval.json"
mini_eval_subset = json.loads(mini_eval_path.read_text())

In [19]:
# Show the first math example from mini_eval_subset, then its field names.
# The match is bound explicitly instead of relying on a loop variable surviving
# the loop: with a for/break, a subset without any math example would leave the
# variable pointing at the last (non-math) entry and the keys shown below would
# silently describe the wrong example.
first_math_example = next((ex for ex in mini_eval_subset if ex["task"] == "math"), None)
assert first_math_example is not None, "mini_eval_subset contains no 'math' example"
print(first_math_example)

first_math_example.keys()

{'question': '90 single use contacts come in 1 box and will last Pete 45 days.  Each box is $100.00 and currently 10% off.  If he buys 2 boxes of contact, how much will each pair of contacts cost?', 'answer': "Each box of contacts is $100.00 with 10% off so that's 100*.10 = $<<100*.10=10.00>>10.00 off\nSo the box was $100.00 and he has $10.00 off so each box will cost 100-10 = $<<100-10=90.00>>90.00\nHe buys 2 boxes and each box costs $90.00 so that's 2*90= $<<2*90=180.00>>180.00\n1 box of contacts has 90 contacts and he buys 2 boxes so that's 90*2 = <<90*2=180>>180 contacts\nHe needs 1 contact for each eye and he has 180 contacts so that's 180/2 = <<180/2=90>>90 pairs of contacts\nHe spent $180.00 to have 90 pairs of contacts so each pair of contacts costs 180/90 = $<<180/90=2.00>>2.00 a pair\n#### 2", 'task_id': 'sharded-GSM8K/584', 'shards': [{'shard_id': 1, 'shard': "what's the cost per pair of contacts after Pete's purchase?"}, {'shard_id': 2, 'shard': 'each box contains 90 single

dict_keys(['question', 'answer', 'task_id', 'shards', 'task', 'full_spec_q', 'ground_truth_a'])

In [16]:
from collections import Counter

In [20]:
# Yet another split, taken from the train split and guaranteed to be disjoint
# from the mini eval split. Only code examples whose task_id contains
# "HumanEval" are kept (i.e. not the livecodebench ones).
# NOTE(review): `per_task` caps the total number of collected examples, and only
# the "code" task is ever collected — confirm this matches the intended split.
per_task = 30
mini_eval_task_ids = {e["task_id"] for e in mini_eval_subset}

humaneval_candidates = [
    e
    for e in lic_train_data
    if e["task_id"] not in mini_eval_task_ids
    and e["task"] == "code"
    and "HumanEval" in e["task_id"]
]
# Keep the first `per_task` matches, in train-split order.
mini_train_with_humaneval = humaneval_candidates[:per_task]
print(f"Total mini train with HumanEval examples collected: {len(mini_train_with_humaneval)}")

# Per-task breakdown of what was collected.
task_counts = Counter(e["task"] for e in mini_train_with_humaneval)
print("Task counts in mini train with HumanEval subset:")
for task_name, n_examples in task_counts.items():
    print(f"{task_name}: {n_examples}")

mini_train_path = LIC_DATA_PATH.parent / "lic_mini_train_with_humaneval.json"
with open(mini_train_path, "w") as out_file:
    json.dump(mini_train_with_humaneval, out_file, indent=2)
    print(f"wrote mini train with HumanEval subset to {mini_train_path}")

Total mini train with HumanEval examples collected: 22
Task counts in mini train with HumanEval subset:
code: 22
wrote mini train with HumanEval subset to /home/v-homatthew/collabmem/src/lic/data/lic_mini_train_with_humaneval.json


### Also make a mini train set

In [15]:
# Mini train set: the first `examples_per_task` training examples of each task,
# written next to the full data file. It is drawn from lic_train_data only, and
# lic_train_data excludes every task_id that was selected for eval.
mini_train_path = LIC_DATA_PATH.parent / "lic_mini_train.json"
examples_per_task = 20
mini_train_subset = []
for task in ["math", "code", "actions", "database", "data2text"]:
    task_examples = [e for e in lic_train_data if e["task"] == task]
    mini_train_subset.extend(task_examples[:examples_per_task])
with open(mini_train_path, "w") as f:
    json.dump(mini_train_subset, f, indent=2)
# Report only after the file handle has been closed, i.e. once the data is flushed.
print(f"wrote mini train subset to {mini_train_path}")

wrote mini train subset to /home/v-homatthew/collabmem/src/lic/data/lic_mini_train.json
