In [1]:
from openai import OpenAI
import os

# Shared API client: the OpenAI SDK is pointed at the OpenRouter endpoint.
# The key comes from the environment; os.getenv yields None when
# OPENROUTER_API_KEY is not set.
client = OpenAI(api_key=os.getenv("OPENROUTER_API_KEY"), base_url="https://openrouter.ai/api/v1")

def ask(messages):
    """Send a chat transcript to the model and return the text of its reply.

    Args:
        messages: Chat messages in the chat-completions format; passed
            through unchanged to ``client.chat.completions.create``.

    Returns:
        str: The content of the first returned choice. When the API reports
        no text content (``None``), an empty string is returned so callers
        can run substring checks on the result without a ``TypeError``.
    """
    # Generation stops when the model is about to emit one of these closing
    # tags; callers re-append the closing tag they expect.
    stop_tags = [
        "</COLLECT>",
        "</EXPLORE>",
        "</COLLECT_ADD>",
        "</EXPLORE_ADD>",
        "</VALIDATE_SOLUTION>",
        "</SOLUTION>",
    ]

    completion = client.chat.completions.create(
        model="anthropic/claude-3.5-sonnet",
        messages=messages,
        temperature=0.5,
        max_tokens=2048,
        top_p=0.95,
        stop=stop_tags,
    )

    content = completion.choices[0].message.content
    return content if content is not None else ""

In [2]:
from datetime import datetime
import traceback
import builtins

timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
log_filename = f'attempts/attempt-{timestamp}.txt'

# Make sure the log directory exists before the first write; opening a file
# in append mode does not create missing parent directories.
os.makedirs(os.path.dirname(log_filename), exist_ok=True)


# Shadow the built-in print so that everything shown in the notebook is also
# appended to this run's log file.
def print(*args, **kwargs):
    """Print as usual and append the same text to ``log_filename``.

    Args:
        *args: Values to print; each is converted with ``str``.
        **kwargs: Forwarded unchanged to ``builtins.print``. ``sep`` and
            ``end`` are also honoured for the log copy, so the file matches
            what was printed.
    """
    builtins.print(*args, **kwargs)

    # builtins.print treats None as "use the default" for both separators.
    sep = kwargs.get("sep")
    if sep is None:
        sep = " "
    end = kwargs.get("end")
    if end is None:
        end = "\n"

    # Explicit UTF-8: the logged text contains emoji, which a platform
    # default encoding may be unable to represent.
    with open(log_filename, 'a', encoding="utf-8") as f:
        f.write(sep.join(map(str, args)) + end)

In [3]:
import json
# Read the whole challenge file, then keep only the single example puzzle
# this notebook works on.
with open('puzzles/arc-agi_test_challenges.json', 'r') as challenges_file:
    data = json.load(challenges_file)

data = data["1190e5a7"]

In [4]:
# Lookup table from a grid cell value (0-9) to the emoji used to draw it.
emoji_map = {
    0: '⚫',
    1: '🔵',
    2: '🔴',
    3: '🟢',
    4: '🟡',
    5: '⚪',
    6: '🟣',
    7: '🟠',
    8: '🔹',
    9: '🟥',
}


def grid_to_emoji(grid):
    """Render a grid as emoji text.

    Args:
        grid: Iterable of rows, each row an iterable of keys of ``emoji_map``.

    Returns:
        str: One line per row (joined with newlines, no trailing newline),
        each cell replaced by its emoji.

    Raises:
        KeyError: If a cell value is not a key of ``emoji_map``.
    """
    rendered_rows = []
    for row in grid:
        rendered_rows.append(''.join(emoji_map[value] for value in row))
    return '\n'.join(rendered_rows)

# Build the puzzle section of the prompt: every training pair with its
# expected output, followed by every test input with a placeholder where the
# model has to supply the output.
prompt_parts = []

for i, pair in enumerate(data["train"]):
    prompt_parts.append(
        f"Train {i}\n"
        "Input:\n"
        f"{grid_to_emoji(pair['input'])}\n"
        "Output:\n"
        f"{grid_to_emoji(pair['output'])}\n\n"
    )

for i, case in enumerate(data["test"]):
    prompt_parts.append(
        f"Test {i}\n"
        "Input:\n"
        f"{grid_to_emoji(case['input'])}\n"
        "Output:\n"
        "<find transformation to predict the test output>\n\n"
    )

add_to_prompt = "".join(prompt_parts)

print(add_to_prompt)

Train 0
Input:
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟠🟠🟠🟠🟠🟠🟠🟠🟠🟠🟠🟠🟠🟠🟠
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
🟢🟠🟢🟢🟢🟢🟢🟢🟢🟢🟠🟢🟢🟠🟢
Output:
🟢🟢🟢🟢
🟢🟢🟢🟢

Train 1
Input:
🔵🔵🔵🔵🔹🔵🔵🔵🔵🔵🔵
🔵🔵🔵🔵🔹🔵🔵🔵🔵🔵🔵
🔵🔵🔵🔵🔹🔵🔵🔵🔵🔵🔵
🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹
🔵🔵🔵🔵🔹🔵🔵🔵🔵🔵🔵
🔵🔵🔵🔵🔹🔵🔵🔵🔵🔵🔵
🔵🔵🔵🔵🔹🔵🔵🔵🔵🔵🔵
🔵🔵🔵🔵🔹🔵🔵🔵🔵🔵🔵
🔵🔵🔵🔵🔹🔵🔵🔵🔵🔵🔵
🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹🔹
🔵🔵🔵🔵🔹🔵🔵🔵🔵🔵🔵
Output:
🔵🔵
🔵🔵
🔵🔵

Train 2
Input:
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵🔵
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🟢🔵🟢🔵🟢🔵🟢
🟢🟢🟢🟢🟢🟢🔵🟢🟢🟢🟢🟢🟢🟢

In [5]:
with open('prompts/bank_prompt.txt', 'r') as prompt_file:
    prompt_template = prompt_file.read()

# Keep the template text that precedes the opening tag and the segment that
# follows the first closing tag; whatever sat between the tags is replaced by
# the rendered puzzle.
text_before_puzzle = prompt_template.split("<PUZZLE_DATA>")[0]
text_after_puzzle = prompt_template.split("</PUZZLE_DATA>")[1]

initial_prompt = f"{text_before_puzzle}<PUZZLE_DATA>\n{add_to_prompt}\n</PUZZLE_DATA>{text_after_puzzle}"

In [6]:
def _splice_history(prompt, tag, items, empty_note):
    """Replace the body of the ``<tag>...</tag>`` section of ``prompt``.

    The new body is one ``<tag_item>`` element per entry of ``items``, or
    ``empty_note`` when ``items`` is empty. Text before the opening tag and
    all text after the matching closing tag is kept untouched.
    """
    open_tag, close_tag = f"<{tag}>", f"</{tag}>"

    head, found_open, rest = prompt.partition(open_tag)
    # Search for the closing tag only after the opening tag, and keep the
    # complete remainder so later mentions of the tag are not cut off.
    _, found_close, tail = rest.partition(close_tag)
    if not (found_open and found_close):
        raise ValueError(f"prompt has no {open_tag}...{close_tag} section")

    if items:
        body = "".join(f"<{tag}_item>{item}</{tag}_item>\n" for item in items)
    else:
        body = empty_note

    return f"{head}{open_tag}\n{body}{close_tag}{tail}"


def compute_initial_prompt(initial_prompt, explore_history, collect_history):
    """Return ``initial_prompt`` with both history sections re-rendered.

    Args:
        initial_prompt: Prompt text containing a
            ``<collect_history>...</collect_history>`` section and an
            ``<explore_history>...</explore_history>`` section.
        explore_history: Entries to render inside ``<explore_history>``.
        collect_history: Entries to render inside ``<collect_history>``.

    Returns:
        str: The prompt with each section's previous body replaced by one
        ``<..._item>`` element per entry, or by a "no history" note when the
        corresponding list is empty.

    Raises:
        ValueError: If either section is missing from ``initial_prompt``.
    """
    with_collect = _splice_history(
        initial_prompt, "collect_history", collect_history, "-- no collect history --\n"
    )
    return _splice_history(
        with_collect, "explore_history", explore_history, "-- no explore history --\n"
    )

In [7]:
# Fresh run state: nothing recorded yet and no solution so far.
solution_found = False
explore_history, collect_history = [], []

# Render the (still empty) history sections into the prompt, then open the
# conversation with that prompt as the only user turn.
initial_prompt = compute_initial_prompt(initial_prompt, explore_history, collect_history)
opening_turn = {"role": "user", "content": [{"type": "text", "text": initial_prompt}]}
messages = [opening_turn]

# Action tags recognised in a model reply, in the order they are checked.
# Only the first tag found is handled, so one reply yields at most one action.
ACTION_TAGS = ("EXPLORE", "VALIDATE_SOLUTION", "SOLUTION", "COLLECT", "EXPLORE_ADD", "COLLECT_ADD")

# For the actions that carry DSL code: the guidance appended after the
# <DSL_OUTPUT> block that is sent back to the model.
DSL_FOLLOW_UP = {
    "EXPLORE": "\n\n Usually, you would now perform a explore add step. But, if you have an idea of the solution, you should perform a validate solution step.",
    "VALIDATE_SOLUTION": "\nUsually, you would attempt to validate a solution based on the information and patterns that you recognised. If you are not sure, you would perform an explore add step.",
    "SOLUTION": "",
    "COLLECT": "\n\n Usually, you would attempt to validate a solution based on the information and patterns that you recognised. If you are not sure, you would perform an collect add step.",
}

# For the *_ADD actions: the history list the extracted note is appended to.
HISTORY_BY_ACTION = {"EXPLORE_ADD": explore_history, "COLLECT_ADD": collect_history}


def _text_message(role, text):
    """Build one chat message holding a single text part."""
    return {"role": role, "content": [{"type": "text", "text": text}]}


def _run_dsl_placeholder():
    """Stand-in used when the executed snippet does not define ``run_dsl``."""
    return "< run_dsl function not implemented >"


def _run_dsl_section(response, tag):
    """Execute the ``<DSL>`` snippet found inside ``<tag>...</tag>``.

    Returns:
        tuple: ``(output, failed)``. On success ``output`` is whatever the
        snippet's ``run_dsl()`` returned; on any error it is an error report
        string (message plus traceback) and ``failed`` is True.
    """
    try:
        section = response.split(f"<{tag}>")[1].split(f"</{tag}>")[0]
        if "<DSL>" not in section:
            raise ValueError("no <DSL>...</DSL> block found in the response")
        code = section.split("<DSL>")[1].split("</DSL>")[0].strip()

        # SECURITY: this executes model-generated code with full access to
        # the notebook's globals. Only run it in an environment where that
        # is acceptable.
        namespace = globals()
        # Reset run_dsl first so a snippet that fails to define it cannot
        # silently reuse the function left over from an earlier step.
        namespace["run_dsl"] = _run_dsl_placeholder
        exec(code, namespace)
        return namespace["run_dsl"](), False
    except Exception as e:
        return f"Error: {str(e)}\n\nFull traceback:\n{traceback.format_exc()}", True


for i in range(100):
    # Guard against a reply without text so the tag checks below cannot fail.
    response = ask(messages) or ""
    print(response)

    action = next((tag for tag in ACTION_TAGS if f"<{tag}>" in response), None)

    if action is not None:
        # Re-append the closing tag (not part of the reply when ask() uses it
        # as a stop sequence) so the section can be sliced out.
        response += f"</{action}>"
        if action in ("COLLECT", "EXPLORE_ADD"):
            print(f"</{action}>")

    if action in DSL_FOLLOW_UP:
        output, dsl_failed = _run_dsl_section(response, action)

        if action == "SOLUTION" and not dsl_failed:
            print("SOLUTION FOUND:")
            print(output)
            solution_found = True
            break

        print(f"\n\nOutput: {output}\n\n")
        # Record the model's turn and feed the DSL result back as the next user turn.
        messages.extend([
            _text_message("assistant", response),
            _text_message("user", "<DSL_OUTPUT>\n" + str(output) + "\n</DSL_OUTPUT>" + DSL_FOLLOW_UP[action]),
        ])

        if action == "SOLUTION":
            # A failed solution attempt goes straight to the next request.
            continue

    elif action in HISTORY_BY_ACTION:
        note = response.split(f"<{action}>")[1].split(f"</{action}>")[0]
        HISTORY_BY_ACTION[action].append(note)
        # Start a fresh conversation whose prompt embeds the updated histories.
        initial_prompt = compute_initial_prompt(initial_prompt, explore_history, collect_history)
        messages = [_text_message("user", initial_prompt)]

    # When no action tag was recognised the transcript is left unchanged and
    # the next iteration simply asks again.

    print("\n\n")
    print("-"*100)
    print("\n\n")

Let me start by examining the grid patterns systematically. I notice there are horizontal lines of a different color (orange, light blue, and blue) running across each input grid. Let me verify this pattern first.

<COLLECT>
Let's verify the presence and pattern of horizontal lines in each input grid
<DSL>
def run_dsl():
    results = []
    for idx in range(len(data["train"])):
        input_grid = data["train"][idx]["input"]
        
        # Find rows that are all the same color
        line_rows = []
        for i, row in enumerate(input_grid):
            if all(x == row[0] for x in row):
                line_rows.append((i, row[0]))
        
        results.append({
            "example": idx,
            "line_rows": line_rows,
            "grid_height": len(input_grid),
            "grid_width": len(input_grid[0])
        })
    return results
</DSL>

</COLLECT>


Output: [{'example': 0, 'line_rows': [(2, 7)], 'grid_height': 15, 'grid_width': 15}, {'example': 1, 'line_rows': [

KeyboardInterrupt: 

In [8]:
# Report how the search loop ended; on success also draw the first grid of
# the returned output as emoji.
if not solution_found:
    print("Solution not found.")
else:
    print("Solution found!")
    print(output)
    print(grid_to_emoji(output[0]))

Solution found!
[[[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
🔵🔵🔵🔵
🔵🔵🔵🔵
🔵🔵🔵🔵
🔵🔵🔵🔵
🔵🔵🔵🔵


In [9]:
# Display the notes that the solving loop stored from <EXPLORE_ADD> replies.
explore_history

['\nThe height of the output grid equals the number of sections created by horizontal dividing lines. However, the width calculation is more complex - my assumption about dividing by 3 was incorrect. For example, in Train 0: input sections are 15 wide → output is 4 wide, Train 1: input sections are 11 wide → output is 2 wide, Train 2: input sections are 27 wide → output is 5 wide.\n']

In [10]:
# Display the notes that the solving loop stored from <COLLECT_ADD> replies.
collect_history

['\n1. Each input grid contains exactly 2 colors, while output grids contain only 1 color\n2. The output color is always the "majority" color from the input (🟢, 🔵, 🟢 respectively)\n3. The output dimensions are significantly smaller than input dimensions\n']