In [2]:
import json
import os
from typing import Optional


def evaluate_dataset_dims(folder: str = "data/re_arc", cutoff_dim: Optional[int] = None) -> None:
    """Print a histogram of the largest grid dimension per pair for every task file.

    Every ``*.json`` file in ``folder`` is read as a list of objects carrying an
    ``"input"`` and an ``"output"`` grid (a list of rows). For each pair the largest of
    the two heights and the two first-row widths is computed, and the number of pairs
    per such dimension is printed for each file.

    Args:
        folder: Directory containing one JSON file per task.
        cutoff_dim: When given, additionally print per file, and in total, how many
            pairs have a largest dimension ``<= cutoff_dim`` ("Below") versus
            ``> cutoff_dim`` ("Above").
    """
    total_below_cutoff = 0
    total_above_cutoff = 0

    for filename in sorted(os.listdir(folder)):
        # Hidden files (e.g. macOS ".DS_Store") and other non-task files live in the
        # same directory; trying to open "<name>.json" for them would raise.
        if filename.startswith(".") or not filename.endswith(".json"):
            continue
        key = os.path.splitext(filename)[0]
        with open(os.path.join(folder, filename), "r") as fp:
            generated_task = json.load(fp)

        count_by_dim: dict[int, int] = {}
        for pair in generated_task:
            grid_in, grid_out = pair["input"], pair["output"]
            max_dim = max(
                len(grid_in),
                len(grid_out),
                # Width is taken from the first row; an empty grid has width 0.
                len(grid_in[0]) if grid_in else 0,
                len(grid_out[0]) if grid_out else 0,
            )
            count_by_dim[max_dim] = count_by_dim.get(max_dim, 0) + 1

        print(key)
        if cutoff_dim is not None:
            below_cutoff = sum(v for k, v in count_by_dim.items() if k <= cutoff_dim)
            above_cutoff = sum(v for k, v in count_by_dim.items() if k > cutoff_dim)
            total_below_cutoff += below_cutoff
            total_above_cutoff += above_cutoff
            print("Cutoff", cutoff_dim, "Below", below_cutoff, "Above", above_cutoff)

        print(count_by_dim)

    if cutoff_dim is not None:
        print("Total cutoff", cutoff_dim, "Below", total_below_cutoff, "Above", total_above_cutoff)


In [9]:
import random

from arc_prize.synth_data.utils import ChallengeTask, DatasetTasks, GridInput, GridPair

def split_list(input_list: list, min_size: int = 2, max_size: int = 5) -> list[list]:
    """Cut ``input_list`` into consecutive chunks of random length.

    Each chunk length is drawn with ``random.randint(min_size, max_size)`` (global
    ``random`` state) and capped at the number of items still unassigned. Splitting
    stops as soon as fewer than ``min_size`` items remain; those trailing items are
    not included in any chunk. The caller's list is left untouched.

    Args:
        input_list: Items to split, consumed front to back.
        min_size: Smallest allowed chunk length; must be at least 1.
        max_size: Largest allowed chunk length.

    Returns:
        The chunks, in order, each a new list.

    Raises:
        ValueError: If ``min_size`` is below 1 (the loop could never finish), or,
            from ``random.randint``, if ``max_size < min_size`` once a chunk is drawn.
    """
    if min_size < 1:
        raise ValueError(f"min_size must be >= 1, got {min_size}")

    result = []
    total = len(input_list)
    start = 0
    # Advance an offset instead of re-slicing the remainder on every iteration.
    while total - start >= min_size:
        size = min(random.randint(min_size, max_size), total - start)
        result.append(input_list[start:start + size])
        start += size
    return result

def make_challenge_and_solution(task_pairs: list[GridPair]) -> tuple[ChallengeTask, list[list]]:
    """Build a challenge and its solution from a group of grid pairs.

    The final pair is held out: only its input goes into the challenge's single test
    entry, and its output is returned as a one-element solution list. Every pair
    before it becomes a train example.
    """
    held_out = task_pairs[-1]
    challenge = ChallengeTask(
        train=task_pairs[:-1],
        test=[GridInput(input=held_out.input)],
    )
    solution = [held_out.output]
    return challenge, solution

def create_examples_with_dataset_dim(folder: str = "data/re_arc", cutoff_dim: int = 15, eval_split: float = 0.25, max_per_task: Optional[int] = None, output_folder: str = "data/re_arc_pruned") -> None:
    """Build training/evaluation challenge files from per-task pair files, filtered by grid size.

    For every ``*.json`` file in ``folder`` (a list of ``{"input", "output"}`` pairs):
    keep the pairs whose largest dimension is ``<= cutoff_dim``, shuffle them, reserve
    the first ``eval_split`` fraction for evaluation, and cut both portions into small
    groups with ``split_list``. Each group becomes one challenge keyed
    ``"<task>_<group index>"`` via ``make_challenge_and_solution``. Four JSON files
    (training/evaluation x challenges/solutions) are written to ``output_folder``.

    Args:
        folder: Directory with one JSON file per task.
        cutoff_dim: Largest allowed height/width for any grid in a kept pair.
        eval_split: Fraction of each task's kept pairs assigned to evaluation.
        max_per_task: If given, only the first ``max_per_task`` pairs of each file are
            considered (applied before the size filter).
        output_folder: Destination directory; created if it does not exist.
    """
    train_challenges: dict[str, ChallengeTask] = {}
    train_solutions: dict[str, list] = {}
    eval_challenges: dict[str, ChallengeTask] = {}
    eval_solutions: dict[str, list] = {}

    for filename in sorted(os.listdir(folder)):
        # Same guard as the HTML variant, extended to any non-JSON entry: hidden files
        # such as ".DS_Store" are not task files and must not be opened.
        if filename.startswith(".") or not filename.endswith(".json"):
            continue
        key = os.path.splitext(filename)[0]
        print(f"Starting {key}")
        with open(os.path.join(folder, filename), "r") as fp:
            generated_task = json.load(fp)
        if max_per_task is not None:
            generated_task = generated_task[:max_per_task]

        valid_pairs: list[GridPair] = []
        for pair in generated_task:
            grid_in, grid_out = pair["input"], pair["output"]
            max_dim = max(
                len(grid_in),
                len(grid_out),
                # Width is read from the first row; an empty grid counts as width 0.
                len(grid_in[0]) if grid_in else 0,
                len(grid_out[0]) if grid_out else 0,
            )
            if max_dim <= cutoff_dim:
                valid_pairs.append(GridPair(input=grid_in, output=grid_out))

        eval_pairs_cutoff = int(len(valid_pairs) * eval_split)
        random.shuffle(valid_pairs)
        eval_groups = split_list(valid_pairs[:eval_pairs_cutoff])
        train_groups = split_list(valid_pairs[eval_pairs_cutoff:])

        for group_idx, group in enumerate(train_groups):
            challenge, solution = make_challenge_and_solution(group)
            train_challenges[f"{key}_{group_idx}"] = challenge
            train_solutions[f"{key}_{group_idx}"] = solution
        for group_idx, group in enumerate(eval_groups):
            challenge, solution = make_challenge_and_solution(group)
            eval_challenges[f"{key}_{group_idx}"] = challenge
            eval_solutions[f"{key}_{group_idx}"] = solution

    print("Train challenges", len(train_challenges), "Eval challenges", len(eval_challenges))

    # Serialise each dataset once and reuse the result for both output files.
    training_dict = DatasetTasks(challenges=train_challenges, solutions=train_solutions).to_dict()
    eval_dict = DatasetTasks(challenges=eval_challenges, solutions=eval_solutions).to_dict()

    os.makedirs(output_folder, exist_ok=True)
    outputs = (
        ("training_challenges.json", training_dict["challenges"]),
        ("training_solutions.json", training_dict["solutions"]),
        ("evaluation_challenges.json", eval_dict["challenges"]),
        ("evaluation_solutions.json", eval_dict["solutions"]),
    )
    for out_name, payload in outputs:
        with open(os.path.join(output_folder, out_name), "w") as f:
            json.dump(payload, f)

# Generate the RE-ARC split with grids up to 30 cells per side.
# NOTE: these are absolute paths on the author's machine; adjust before re-running.
create_examples_with_dataset_dim(
    folder="/Users/pfh/work/arc-data/re_arc_5k",
    cutoff_dim=30,
    max_per_task=2500,
    eval_split=0.2,
    output_folder="/Users/pfh/work/arc-data/re_arc_dim_30",
)

Starting 007bbfb7
Starting 00d62c1b
Starting 017c7c7b
Starting 025d127b
Starting 045e512c
Starting 0520fde7
Starting 05269061
Starting 05f2a901
Starting 06df4c85
Starting 08ed6ac7
Starting 09629e4f
Starting 0962bcdd
Starting 0a938d79
Starting 0b148d64
Starting 0ca9ddb6
Starting 0d3d703e
Starting 0dfd9992
Starting 0e206a2e
Starting 10fcaaa3
Starting 11852cab
Starting 1190e5a7
Starting 137eaa0f
Starting 150deff5
Starting 178fcbfb
Starting 1a07d186
Starting 1b2d62fb
Starting 1b60fb0c
Starting 1bfc4729
Starting 1c786137
Starting 1caeab9d
Starting 1cf80156
Starting 1e0a9b12
Starting 1e32b0e9
Starting 1f0c79e5
Starting 1f642eb9
Starting 1f85a75f
Starting 1f876c06
Starting 1fad071e
Starting 2013d3e2
Starting 2204b7a8
Starting 22168020
Starting 22233c11
Starting 2281f1f4
Starting 228f6490
Starting 22eb0ac0
Starting 234bbc79
Starting 23581191
Starting 239be575
Starting 23b5c85d
Starting 253bf280
Starting 25d487eb
Starting 25d8a9c8
Starting 25ff71a9
Starting 264363fd
Starting 272f95fa
Starting 2

In [10]:
from arc_prize.synth_data.utils import ChallengeTask, GridInput, GridPair


def make_challenge_and_solution_from_html(task_pairs: list[list]) -> tuple[ChallengeTask, list[list]]:
    """Build a challenge and its solution from a task given as ``[input, output]`` entries.

    All entries but the last are wrapped as train ``GridPair`` objects. The last entry
    is held out: its input becomes the single test input and its output is returned
    as a one-element solution list.
    """
    train_pairs = []
    for grid_in, grid_out in task_pairs[:-1]:
        train_pairs.append(GridPair(input=grid_in, output=grid_out))

    final_entry = task_pairs[-1]
    held_out = GridPair(input=final_entry[0], output=final_entry[1])

    challenge = ChallengeTask(train=train_pairs, test=[GridInput(input=held_out.input)])
    return challenge, [held_out.output]

def create_examples_from_html(folder: str = "data/html", max_dim: int = 30, eval_split: float = 0.25, output_folder: str = "data/re_arc_pruned") -> None:
    """Build training/evaluation challenge files from files that each hold a list of tasks.

    Every ``*.json`` file in ``folder`` is read as a list of tasks, each task being a
    list of ``[input_grid, output_grid]`` entries. A task is kept when it has at least
    one entry and no grid in it exceeds ``max_dim`` in height or first-row width. Kept
    tasks are shuffled, the first ``eval_split`` fraction goes to evaluation, and each
    task becomes one challenge keyed ``"<file>_<index>"`` via
    ``make_challenge_and_solution_from_html``. Four JSON files (training/evaluation x
    challenges/solutions) are written to ``output_folder``.

    Args:
        folder: Directory with the source JSON files.
        max_dim: Largest allowed height/width for any grid in a kept task.
        eval_split: Fraction of each file's kept tasks assigned to evaluation.
        output_folder: Destination directory; created if it does not exist.
    """
    train_challenges: dict[str, ChallengeTask] = {}
    train_solutions: dict[str, list] = {}
    eval_challenges: dict[str, ChallengeTask] = {}
    eval_solutions: dict[str, list] = {}

    for filename in sorted(os.listdir(folder)):
        # Skip hidden files (e.g. ".DS_Store") and anything else that is not JSON;
        # opening "<name>.json" for such entries raises FileNotFoundError.
        if filename.startswith(".") or not filename.endswith(".json"):
            continue
        key = os.path.splitext(filename)[0]
        print(f"Starting {key}")
        with open(os.path.join(folder, filename), "r") as fp:
            generated_tasks = json.load(fp)

        valid_tasks = []
        for task in generated_tasks:
            # A task without entries has no held-out pair to turn into a test case.
            if not task:
                continue
            task_dim = 0
            for pair in task:
                grid_in, grid_out = pair[0], pair[1]
                pair_dim = max(
                    len(grid_in),
                    len(grid_out),
                    # Width is read from the first row; an empty grid counts as width 0.
                    len(grid_in[0]) if grid_in else 0,
                    len(grid_out[0]) if grid_out else 0,
                )
                task_dim = max(task_dim, pair_dim)
            if task_dim <= max_dim:
                valid_tasks.append(task)

        random.shuffle(valid_tasks)
        eval_cutoff = int(len(valid_tasks) * eval_split)
        eval_task_list = valid_tasks[:eval_cutoff]
        train_task_list = valid_tasks[eval_cutoff:]

        for task_idx, task in enumerate(train_task_list):
            challenge, solution = make_challenge_and_solution_from_html(task)
            train_challenges[f"{key}_{task_idx}"] = challenge
            train_solutions[f"{key}_{task_idx}"] = solution
        for task_idx, task in enumerate(eval_task_list):
            challenge, solution = make_challenge_and_solution_from_html(task)
            eval_challenges[f"{key}_{task_idx}"] = challenge
            eval_solutions[f"{key}_{task_idx}"] = solution

    print("Train challenges", len(train_challenges), "Eval challenges", len(eval_challenges))

    # Serialise each dataset once and reuse the result for both output files.
    training_dict = DatasetTasks(challenges=train_challenges, solutions=train_solutions).to_dict()
    eval_dict = DatasetTasks(challenges=eval_challenges, solutions=eval_solutions).to_dict()

    os.makedirs(output_folder, exist_ok=True)
    outputs = (
        ("training_challenges.json", training_dict["challenges"]),
        ("training_solutions.json", training_dict["solutions"]),
        ("evaluation_challenges.json", eval_dict["challenges"]),
        ("evaluation_solutions.json", eval_dict["solutions"]),
    )
    for out_name, payload in outputs:
        with open(os.path.join(output_folder, out_name), "w") as f:
            json.dump(payload, f)

# Generate the HTML-derived split with grids up to 30 cells per side.
# NOTE: these are absolute paths on the author's machine; adjust before re-running.
create_examples_from_html(
    folder="/Users/pfh/work/arc-data/html",
    max_dim=30,
    eval_split=0.2,
    output_folder="/Users/pfh/work/arc-data/html_dim_30",
)

Starting .DS_Store


FileNotFoundError: [Errno 2] No such file or directory: '/Users/pfh/work/arc-data/html/.DS_Store.json'

In [16]:
# Merge the ARC training and evaluation files into two lookup tables keyed by task id.
challenges = {}
solutions = {}

train_challenges_file = "data/arc/training_challenges.json"
train_solutions_file = "data/arc/training_solutions.json"
eval_challenges_file = "data/arc/evaluation_challenges.json"
eval_solutions_file = "data/arc/evaluation_solutions.json"

# Training files are read first, so an evaluation entry wins on a duplicate task id.
for source_path, target in (
    (train_challenges_file, challenges),
    (train_solutions_file, solutions),
    (eval_challenges_file, challenges),
    (eval_solutions_file, solutions),
):
    with open(source_path, "r") as f:
        target.update(json.load(f))

task_ids = list(challenges.keys())

In [20]:
# Attach every known solution to its test entry so each task carries its own answers,
# then write all tasks to a single file.
combined = {}
for task_id in task_ids:
    task = challenges[task_id]
    answers = solutions[task_id]
    test_entries = task["test"]
    # NOTE: this writes into the dicts held by `challenges` (no copy is made).
    for idx in range(len(test_entries)):
        test_entries[idx]["output"] = answers[idx]
    combined[task_id] = task

combined_file = "data/arc/combined.json"
with open(combined_file, "w") as f:
    json.dump(combined, f)