In [None]:
# Puzzle ids that the cells below operate on (checked for o1.html and passed to
# the remote generator). Ids that are commented out are excluded from the list;
# some carry a short note ("hard", "skip") recording why.
# NOTE(review): these look like ARC task ids — confirm against the dataset.
valid_puzzles = [
  "00576224",
  # "009d5c81", hard
  "00dbd492",
  "03560426",
  # "05a7bcf2", skip
  "0607ce86",
  "0692e18c",
  "070dd51e",
  "08573cc6",
  "0934a4d8",
  # "09c534e7", # hard
  "0a1d4ef5",
  # "0a2355a6", # hard
  "0b17323b",
  "0bb8deee",
  "0becf7df", 
  "0c786b71",
  "0c9aba6e",
  "0d87d2a6",
  "0e671a1a",
  "0f63c0b9",
  "103eff5b",
  "11e1fe23", 
  "12422b43",
  "12997ef3", 
  "12eac192",
  # "136b0064", # hard
  "13713586",
  "137f0df0", 
  "140c817e", 
  "14754a24",
  # "15113be4", # hard
  # "15663ba9", # hard
  "15696249", 
  # "16b78196", # hard
  "17b80ad2", 
  "17cae0c1", 
  # "18419cfa", # hard
  "184a9768", 
  "195ba7dc", 
  # "1990f7a8",
  # "19bb5feb",
  # "1a2e2828",
  # "1a6449f1",
  # "1acc24af",
  # "1c02dbbe",
  # "1c0d0a4b",
  # "1c56ad9f",
  # "1d0a4b61",
  "1d398264",
  # Skipped many
  "2753e76c",
  "292dd178", 
  "32e9702f",
  "332efdb3", 
  "351d6448", 
  "4364c1c4",
  # "4852f2fa",
  # "5207a7b5",
  "5ffb2104", # generating
  # "695367ec",
  # "69889d6e",
  # "6ad5bdfd",
  # "7039b2d7",
  # "712bf12e",
  # "759f3fd3",
  # "8ee62060",
  # "917bccba",
  # "93b4f4b3",
  # "992798f6",
  # "9b4c17c4",
  # "9c1e755f",
  # "9def23fe",
  # "a406ac07",
  # "b457fec5",
  # "b7999b51",
  # "b7fb29bc",
  # "ba9d41b8",
  # "bcb3040b",
  "c6e1b8da",
]

# Show how many puzzle ids are currently enabled.
print(len(valid_puzzles))

In [None]:
import os

def check_o1_html(puzzle_ids, html_dir="html"):
    """Report, per puzzle id, whether ``<html_dir>/<puzzle_id>/o1.html`` exists.

    Args:
        puzzle_ids: Iterable of puzzle id strings; each id is used as the name
            of a sub-directory of ``html_dir``.
        html_dir: Directory that holds one sub-directory per puzzle. Defaults
            to ``"html"``, which is resolved against the current working
            directory, so pass an explicit path when running from elsewhere.

    Returns:
        A dict mapping each puzzle id to ``True`` when the path exists and
        ``False`` otherwise. Repeated ids collapse into a single key.
    """
    return {
        puzzle_id: os.path.exists(os.path.join(html_dir, puzzle_id, "o1.html"))
        for puzzle_id in puzzle_ids
    }

# Example usage: print one status line per enabled puzzle id.
existence_results = check_o1_html(valid_puzzles)

for folder in existence_results:
    status = "exists" if existence_results[folder] else "does not exist"
    print(f"o1.html {status} in {folder}")

In [11]:
import modal

# Look up the deployed Modal function and start one remote call per puzzle id.
fn = modal.Function.lookup("arc-generator", "generate_data_from_html")

# Keep every call handle so that results or failures of individual jobs can be
# inspected later; `fn_call` alone only ever holds the most recent one.
fn_calls = []
for puzzle_id in valid_puzzles:
  # NOTE(review): the meaning of the positional arguments 100 and 20 is not
  # visible here — confirm against generate_data_from_html.
  fn_call = fn.spawn(puzzle_id, 100, 20, "html_dim_20_small")
  fn_calls.append(fn_call)

In [None]:
import json
import os
import time
from arc_prize.synth_data.html import capture_html_screenshot, get_web_driver, process_screenshot

# Number of screenshot attempts made per puzzle id by the loop below; attempts
# that fail or do not validate are not retried.
num_puzzles = 9000
# Directory the generated JSON files are written to.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dataset_dir = "/Users/pfh/work/arc-data/html"
# Suffix of the output file name: <puzzle_id>_<edition>.json.
edition = 1

os.makedirs(dataset_dir, exist_ok=True)

# NOTE(review): the meaning of the two arguments (60, 150) is not visible
# here — confirm against get_web_driver.
driver = get_web_driver(60, 150)

# Puzzle ids to generate data for; swap in the commented line for the full list.
# puzzles = valid_puzzles
puzzles = ["4364c1c4"]

def validate_puzzle(puzzle: list[list[int]]) -> bool:
    """Check the outer structure of a generated puzzle.

    A puzzle is accepted when it has between 2 and 5 entries (inclusive) and
    every entry has exactly two elements. The contents of the entries are not
    inspected.

    NOTE(review): each entry presumably is an (input, output) pair — confirm
    against what process_screenshot returns.

    Args:
        puzzle: Sequence of entries to check.

    Returns:
        True when the structure is acceptable, False otherwise.
    """
    entry_count = len(puzzle)
    if not 2 <= entry_count <= 5:
        return False
    return all(len(entry) == 2 for entry in puzzle)

# For every puzzle id, render its HTML page `num_puzzles` times, keep the
# results that pass validate_puzzle, and dump them to one JSON file.
# The run is wrapped in try/finally so the web driver is always shut down,
# including when a puzzle raises or the cell is interrupted.
try:
    for puzzle_id in puzzles:
        html_file = f"html/{puzzle_id}.html"
        # abspath resolves the relative path against the current working directory.
        full_path = os.path.abspath(html_file)

        start_time = time.time()
        # Separate name from `puzzles` so the list of ids being iterated is
        # not rebound while (or after) the loop runs.
        generated_puzzles = []

        print("Starting", puzzle_id)

        for i in range(num_puzzles):
            try:
                raw_screenshot = capture_html_screenshot(driver, full_path)
                arc_puzzle_data, _ = process_screenshot(raw_screenshot)
            except Exception as e:
                # Best effort: a failed capture/parse only costs this attempt.
                print("Skipping", e)
                continue

            if validate_puzzle(arc_puzzle_data):
                generated_puzzles.append(arc_puzzle_data)

            # Progress report; skipped for iterations that hit the except branch.
            if i % 100 == 0:
                duration = time.time() - start_time
                # max(..., 1) avoids dividing by zero before the first valid puzzle.
                per_puzzle_time = duration / max(len(generated_puzzles), 1)
                print(
                    f"Iteration {i}, done {len(generated_puzzles)} puzzles. Time elapsed: {duration:.2f}s ({(duration / 60):.2f}m). Per puzzle: {per_puzzle_time:.2f}s"
                )

        duration = time.time() - start_time
        print("Finished", puzzle_id, f"Total time: {duration:.2f}s ({(duration / 60):.2f}m)")

        with open(f"{dataset_dir}/{puzzle_id}_{edition}.json", "w") as f:
            json.dump(generated_puzzles, f)
finally:
    driver.quit()



In [None]:
# Imported here so this cell also runs on a fresh kernel without first
# executing the (long-running) generation cell.
import json
import os

dataset_dir = "/Users/pfh/work/arc-data/html"

# Group the JSON files by puzzle id, i.e. the part of the file name before the
# first "." or "_" ("abc.json" and "abc_1.json" both belong to "abc").
# Sorting makes the merge order deterministic and puts an existing "abc.json"
# ahead of its "abc_<n>.json" siblings.
file_groups = {}

for filename in sorted(os.listdir(dataset_dir)):
    if filename.endswith('.json'):
        prefix = filename.split('.')[0].split('_')[0]
        file_groups.setdefault(prefix, []).append(filename)

# Process each group of files
for puzzle_id, files in file_groups.items():
    output_filename = f"{puzzle_id}.json"
    if len(files) == 1 and files[0] == output_filename:
        print(f"Skipping {puzzle_id} because there are only {len(files)} files")
        continue

    combined_data = []
    files_to_delete = []  # files whose content was merged successfully
    for file in files:
        with open(os.path.join(dataset_dir, file), 'r') as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                print(f"Error decoding JSON in file: {file}")
                continue
        if isinstance(data, list):
            combined_data.extend(data)
            files_to_delete.append(file)
        else:
            print("Malformed file", file)

    if not files_to_delete:
        # Nothing usable: do not create an empty combined file.
        print(f"Skipping {puzzle_id} because no file could be merged")
        continue
    if output_filename in files and output_filename not in files_to_delete:
        # The existing combined file could not be read; writing the new one
        # would silently discard whatever it contains.
        print(f"Skipping {puzzle_id} because existing {output_filename} could not be merged")
        continue

    # Write the combined data first, via a temporary file that is swapped into
    # place, so a failed write never leaves the data only half on disk. The
    # temporary name does not end in ".json", so it is never picked up above.
    output_path = os.path.join(dataset_dir, output_filename)
    temp_path = output_path + ".tmp"
    with open(temp_path, 'w') as f:
        json.dump(combined_data, f)
    os.replace(temp_path, output_path)

    # Only now remove the merged inputs; the combined file itself was replaced
    # above and must be kept.
    for file in files_to_delete:
        if file == output_filename:
            continue
        file_path = os.path.join(dataset_dir, file)
        os.remove(file_path)
        print(f"Deleted: {file}")

    print(f"Combined {len(files_to_delete)} files with {len(combined_data)} puzzles into {output_filename}")


In [None]:
import os
from arc_prize.synth_data.html import get_html, make_prompt

# Puzzle to build a prompt for; its output directory is created up front.
puzzle_id = "12422b43"
puzzle_html_dir = f"html/{puzzle_id}"
os.makedirs(puzzle_html_dir, exist_ok=True)

# Show the prompt text produced for this puzzle.
prompt = make_prompt(puzzle_id)
print(prompt)

# Disabled: request HTML from a model and save it under html/<puzzle_id>/.
# for model in ["claude"]:
#   for i in range(1):
#     response = get_html(puzzle_id, model, output_file_path=f"html/{puzzle_id}/{model}_{i}.html")
