In [2]:
import logging
import os
import re
import typing as t
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from jinja2 import Template
from openai import OpenAI
from tqdm.auto import tqdm


# Openrouter constants
# The key is taken from the OPENROUTER_API_KEY environment variable so the real secret
# never has to be written into the notebook; the placeholder is kept as a fallback.
OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY') or '<openrouter-api-key>'
OPENROUTER_BASE_URL = 'https://openrouter.ai/api/v1'
# Backward-compatible alias: the original, misspelled name is still used by create_client().
OPENTROUTER_BASE_URL = OPENROUTER_BASE_URL


# Experiment constants
EXPERIMENT_NAME = 'exp2.0'  # summary
MODEL_NAME = 'openai/gpt-4o-mini'
TEMPERATURE = 0.0

RANDOM_SEED = 20250402

N_ARMS = 5      # number of buttons (arms) in the bandit
DELTA = 0.2     # the best arm has mean 0.5 + DELTA / 2, every other arm 0.5 - DELTA / 2
N_TRIALS = 10   # number of interaction rounds per experiment


# Button colour <-> arm index lookups used to translate the model's answers.
ARM_NAME_TO_IDX = {'blue': 0, 'green': 1, 'red': 2, 'yellow': 3, 'purple': 4}
ARM_IDX_TO_NAME = {v: k for k, v in ARM_NAME_TO_IDX.items()}
# The prompts and the bandit both rely on one colour per arm; fail early if they drift apart.
assert len(ARM_NAME_TO_IDX) == N_ARMS, 'N_ARMS must match the number of named buttons'

# Output location for this experiment; created eagerly so later cells can write into it.
DATA_DIR = Path('../data')
EXPERIMENT_DIR = DATA_DIR / EXPERIMENT_NAME
EXPERIMENT_DIR.mkdir(exist_ok=True, parents=True)

def bootstrap() -> None:
    """Seed NumPy's global random state and set the logging message format."""
    np.random.seed(RANDOM_SEED)
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')


bootstrap()

  from .autonotebook import tqdm as notebook_tqdm


In [38]:
# System prompt: describes the bandit task. Rendered with `n_arms` and `n_trials`.
SYSTEM_PROMPT_TEMPLATE = Template("""\
You are a bandit algorithm in a room with {{n_arms}} buttons labeled blue, green, red, yellow, purple. \
Each button is associated with a Bernoulli distribution with a fixed but unknown mean; the means \
for the buttons could be different. For each button, when you press it, you will get a reward \
that is sampled from the button's associated distribution. You have {{n_trials}} time steps and, on each \
time step, you can choose any button and receive the reward. Your goal is to maximize the total \
reward over the {{n_trials}} time steps.

At each time step, I will show you a summary of your past choices and rewards. Then you must \
make the next choice, which must be exactly one of blue, green, red, yellow, purple. Let's think \
step by step to make sure we make a good choice. You must provide your final answer within the \
tags <Answer>COLOR</Answer> where COLOR is one of blue, green, red, yellow, purple.
""")


# Per-step user prompt. Here `n_trials` is the number of rounds played so far (not the
# total budget) and `history` is the model-written summary carried over from the last step.
USER_PROMPT_TEMLATE = Template("""\
So far you have played {{n_trials}} times. Here is what you know about the environment so far:

{{history}}

Which button will you choose next? Remember, YOU MUST provide your final answer within the tags \
<Answer>COLOR</Answer> where COLOR is one of blue, green, red, yellow, purple. Let's think step \
by step to make sure we make a good choice.
""")

# Follow-up user prompt: reports the observed `reward` and asks the model to rewrite its
# summary inside <HISTORY></HISTORY> tags.
USER_PROMPT_TEMLATE_EXT = Template("""\
You got the reward equal to {{reward}}.
Now refine your knowledge about the environment based on your previous knowledge, \
your last action and the reward you got after it. \
Feel free to add any useful information, e.g. instructions, \
summary of your interactions with the environment. \
Try to make it as useful for you as possible and put it between the brackets <HISTORY></HISTORY>.
""")

In [19]:
# Rendering example: show what the per-step user prompt looks like once filled in.
HISTORY_PROMPT = "There's no information about the buttons currently. You need to explore more!"

# Sample interaction log rendered as one "action/reward" line per step.
actions = ["green"] * 4
rewards = [0, 0, 0, 1]
last_steps = "\n".join(
    f"action: {name}, reward: {value}" for name, value in zip(actions, rewards)
)

example_prompt_inputs = dict(n_trials=10, history=HISTORY_PROMPT)
user_prompt = USER_PROMPT_TEMLATE.render(**example_prompt_inputs)
print(user_prompt)

So far you have played 10 times. Here is what you know about the environment so far:

There's no information about the buttons currently. You need to explore more!

Which button will you choose next? Remember, YOU MUST provide your final answer within the tags <Answer>COLOR</Answer> where COLOR is one of blue, green, red, yellow, purple. Let's think step by step to make sure we make a good choice.


In [20]:
# Rendering example for the follow-up prompt that reports the reward.
example_prompt_inputs = dict(reward=1)
user_prompt_ext = USER_PROMPT_TEMLATE_EXT.render(**example_prompt_inputs)
print(user_prompt_ext)

You got the reward equal to 1.
Now refine your knowledge about the environment based on your previous knowledge, your last action and the reward you got after it. Feel free to add any useful information, e.g. instructions, summary of your interactions with the environemnt. Try to make it as useful for you as possible and put it between the brackets <HISTORY></HISTORY>.


In [21]:
class MultiArmedBandit:
    """Bernoulli bandit in which a single arm is better than all the others.

    One arm succeeds with probability ``0.5 + delta / 2``; every other arm with
    ``0.5 - delta / 2``. The position of the better arm is decided by shuffling
    with NumPy's global random state when the object is created.
    """

    def __init__(self, n_arms: int, delta: float) -> None:
        self.n_arms = n_arms
        self.delta = delta

        half_gap = self.delta / 2
        arm_means = [0.5 + half_gap] + [0.5 - half_gap] * (self.n_arms - 1)
        # In-place shuffle driven by the global NumPy RNG.
        np.random.shuffle(arm_means)

        self.means = arm_means
        self.best_arm = np.argmax(arm_means)

    def pull(self, arm: int) -> int:
        """Draw a single 0/1 reward from the Bernoulli distribution of ``arm``."""
        assert 0 <= arm < self.n_arms, f"Arm {arm} doesn't exist"
        return np.random.binomial(1, p=self.means[arm])

In [33]:
def create_client() -> OpenAI:
    """Return an ``OpenAI`` client built from the notebook's OpenRouter base URL and API key."""
    return OpenAI(api_key=OPENROUTER_API_KEY, base_url=OPENTROUTER_BASE_URL)


def get_prediction(
        client: OpenAI,
        system_prompt: str,
        user_prompt: str,
        answer: t.Optional[str] = None,
        user_prompt_ext: t.Optional[str] = None,
        **kwargs
        ) -> t.Union[str, None]:
    """Request one chat completion and return the text of its first choice.

    The conversation always starts with ``system_prompt`` and ``user_prompt``.
    When ``answer`` is non-empty, it is appended as the assistant's previous
    turn, followed by ``user_prompt_ext`` as the next user turn.

    Args:
        client: Client used to issue the chat completion request.
        system_prompt: Content of the system message.
        user_prompt: Content of the first user message.
        answer: Earlier assistant reply to replay; the follow-up turn is only
            added when this is non-empty.
        user_prompt_ext: Follow-up user message sent after ``answer``.
        **kwargs: Extra keyword arguments forwarded to
            ``client.chat.completions.create``.

    Returns:
        The message content of the first choice, or ``None`` if the request
        (or reading its result) raised an exception.
    """
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt},
    ]

    if answer:
        messages.extend([
            {'role': 'assistant', 'content': answer},
            {'role': 'user', 'content': user_prompt_ext},
        ])

    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=TEMPERATURE,
            **kwargs
        )
        return completion.choices[0].message.content
    except Exception as exc:
        # Deliberately best-effort: the caller receives None instead of an exception,
        # but the cause is logged so that failures can be diagnosed afterwards.
        logging.warning('Request failed: %r', exc)
        return None


def extract_arm_color(
        prediction: t.Optional[str],
        arm_names: t.Optional[t.Iterable[str]] = None,
        ) -> t.Union[str, None]:
    """Return the arm name found inside the first ``<Answer>...</Answer>`` tag pair.

    Args:
        prediction: Raw model response. ``None`` or an empty string (e.g. after
            a failed request) is accepted and yields ``None``.
        arm_names: Names that count as valid answers. Defaults to the keys of
            ``ARM_NAME_TO_IDX``.

    Returns:
        The matched arm name, or ``None`` when no tag with a valid name is present.
    """
    if not prediction:
        return None

    if arm_names is None:
        arm_names = ARM_NAME_TO_IDX.keys()

    # Escape the names so they are always matched literally inside the pattern.
    names = '|'.join(re.escape(name) for name in arm_names)
    match = re.search(f'<Answer>({names})</Answer>', prediction)

    return match.group(1) if match else None


def extract_hystory(prediction: t.Optional[str]) -> str:
    """Return the text enclosed by the first ``<HISTORY>...</HISTORY>`` pair.

    Args:
        prediction: Raw model response. ``None`` or an empty string (e.g. after
            a failed request) is accepted.

    Returns:
        The enclosed text, including any surrounding newlines, or ``""`` when
        the response has no complete ``<HISTORY>`` block.
    """
    if not prediction:
        return ""

    # A single lazy `.*?` with DOTALL matches the same text as the former nested
    # `(?:.*?|\n)*?` group, but scans linearly when the closing tag is missing
    # instead of backtracking over every way to split the text.
    match = re.search(r"<HISTORY>(.*?)</HISTORY>", prediction, flags=re.DOTALL)
    return match.group(1) if match else ""

In [32]:
# Sample model reply used to try out the <HISTORY> extraction pattern.
prediction = """
<HISTORY>
- Total plays: 1
- Button pressed: blue
- Reward received: 1
- Current knowledge: The blue button has a reward of 1 from the Bernoulli distribution. This is my first data point, but I still need to explore other buttons to understand their distributions and compare them to the blue button.
</HISTORY>
"""

# Displays the list of captured blocks as the cell output.
re.compile("<HISTORY>((?:.*?|\\n)*?)</HISTORY>").findall(prediction)

['\n- Total plays: 1\n- Button pressed: blue\n- Reward received: 1\n- Current knowledge: The blue button has a reward of 1 from the Bernoulli distribution. This is my first data point, but I still need to explore other buttons to understand their distributions and compare them to the blue button.\n']

In [23]:
# The environment (its arm order comes from the global NumPy RNG) and the API client.
bandit = MultiArmedBandit(delta=DELTA, n_arms=N_ARMS)
client = create_client()

In [39]:
# Run the experiment: each trial asks the model for an arm, pulls it, then asks the
# model to rewrite its own summary, which becomes the history of the next trial.
stats = []

rewards = {k: 0 for k in ARM_NAME_TO_IDX.keys()}
occurrences = {k: 0 for k in ARM_NAME_TO_IDX.keys()}
HISTORY_PROMPT = "There's no information about the buttons currently. You need to explore more."

# The system prompt does not depend on the trial, so it is rendered once.
system_prompt = SYSTEM_PROMPT_TEMPLATE.render({'n_arms': N_ARMS, 'n_trials': N_TRIALS})

for trial in tqdm(range(N_TRIALS), total=N_TRIALS):
    user_prompt_inputs = {
        # Number of rounds actually played; equals `trial` unless a round was skipped.
        'n_trials': len(stats),
        'history': HISTORY_PROMPT,
    }
    user_prompt = USER_PROMPT_TEMLATE.render(user_prompt_inputs)
    prediction = get_prediction(client, system_prompt, user_prompt)

    # get_prediction returns None when the request fails, and the model may leave out
    # the <Answer> tags; either way there is no arm to pull, so the trial is skipped
    # instead of crashing the whole run.
    arm_name = extract_arm_color(prediction) if prediction else None
    if arm_name is None:
        logging.warning('Trial %d: no valid arm in the model response, skipping', trial)
        continue
    arm_idx = ARM_NAME_TO_IDX[arm_name]

    reward = bandit.pull(arm_idx)

    user_prompt_inputs_ext = {
        'reward': reward,
    }
    user_prompt_ext = USER_PROMPT_TEMLATE_EXT.render(user_prompt_inputs_ext)
    prediction_history = get_prediction(client, system_prompt, user_prompt, prediction, user_prompt_ext)

    # debug
    print(f"Round {trial}\n")
    print(f"User: {user_prompt}\n")
    print(f"AI: {prediction}\n")
    print(f"User: {user_prompt_ext}\n")
    print(f"AI: {prediction_history}")
    print("-----\n")

    # Keep the previous summary if the follow-up request failed or came back without a
    # <HISTORY> block; overwriting it with an empty string would erase everything learned.
    new_history = extract_hystory(prediction_history) if prediction_history else ""
    if new_history:
        HISTORY_PROMPT = new_history
    else:
        logging.warning('Trial %d: no <HISTORY> block returned, keeping the previous summary', trial)

    occurrences[arm_name] += 1
    rewards[arm_name] += reward

    cumulative_reward = sum(rewards.values())
    cumulative_reward_per_arm = {f'cumulative_reward_{k}': v for k, v in rewards.items()}
    cumulative_occurrences_per_arm = {f'cumulative_occurrence_{k}': v for k, v in occurrences.items()}
    stats.append({
        'trial': trial,
        'arm_name': arm_name,
        'arm_idx': arm_idx,
        'reward': reward,
        'cumulative_reward': cumulative_reward,
        'system_prompt': system_prompt,
        'user_prompt': user_prompt,
        'raw_prediction': prediction,
        'history_summary': HISTORY_PROMPT,
        'best_arm': ARM_IDX_TO_NAME[bandit.best_arm],
        **cumulative_reward_per_arm,
        **cumulative_occurrences_per_arm
    })


# One row per completed trial.
df = pd.DataFrame(stats)

 10%|█         | 1/10 [00:06<00:54,  6.07s/it]

Round 0

User: So far you have played 0 times. Here is what you know about the environment so far:

There's no information about the buttons currently. You need to explore more.

Which button will you choose next? Remember, YOU MUST provide your final answer within the tags <Answer>COLOR</Answer> where COLOR is one of blue, green, red, yellow, purple. Let's think step by step to make sure we make a good choice.

AI: Since I have not played any buttons yet, I need to explore the options available to me. Each button is associated with a Bernoulli distribution, and I have no prior information about their means. 

To maximize my chances of learning about the rewards associated with each button, I will choose one button to press. Since there is no prior information, I can select any button at random. 

For simplicity, I will choose the first button in the list, which is blue. This will allow me to gather information about the reward distribution associated with the blue button.

Therefore, 

 20%|██        | 2/10 [00:12<00:50,  6.26s/it]

Round 1

User: So far you have played 1 times. Here is what you know about the environment so far:


- I have pressed the blue button once and received a reward of 0.
- This indicates that the mean reward for the blue button is likely low, but it is not definitive since I have only one data point.
- I need to explore other buttons to gather more information about their reward distributions.
- I will continue to explore the buttons to identify which ones may yield higher rewards.


Which button will you choose next? Remember, YOU MUST provide your final answer within the tags <Answer>COLOR</Answer> where COLOR is one of blue, green, red, yellow, purple. Let's think step by step to make sure we make a good choice.

AI: Given that I have only pressed the blue button once and received a reward of 0, it is prudent to explore other buttons to gather more information about their reward distributions. Since I have no prior data on the other buttons, I should choose one of them to maximize my c

 30%|███       | 3/10 [00:21<00:54,  7.73s/it]

Round 2

User: So far you have played 2 times. Here is what you know about the environment so far:


- I have pressed the blue button once and received a reward of 0.
- I have pressed the green button once and also received a reward of 0.
- So far, both the blue and green buttons have yielded a reward of 0, suggesting that their mean rewards are likely low. However, with only two data points, this is not definitive.
- I still have three buttons left to explore: red, yellow, and purple.
- It is important to continue exploring these remaining buttons to gather more information about their reward distributions.


Which button will you choose next? Remember, YOU MUST provide your final answer within the tags <Answer>COLOR</Answer> where COLOR is one of blue, green, red, yellow, purple. Let's think step by step to make sure we make a good choice.

AI: Given the information provided, I have already explored the blue and green buttons, both yielding a reward of 0. This suggests that their mea

 40%|████      | 4/10 [00:28<00:44,  7.44s/it]

Round 3

User: So far you have played 3 times. Here is what you know about the environment so far:


So far, I have pressed the following buttons and received the corresponding rewards:

1. Blue button: Pressed once, reward = 0
2. Green button: Pressed once, reward = 0
3. Red button: Pressed once, reward = 0

Summary of interactions:
- The blue and green buttons have both yielded a reward of 0, suggesting that their mean rewards are likely low.
- The red button has also yielded a reward of 0, indicating that it may also have a low mean reward.
- I have not yet explored the yellow and purple buttons, which remain untested.

Current knowledge:
- Blue, green, and red buttons likely have low mean rewards (possibly 0).
- I still need to explore the yellow and purple buttons to gather more information about their reward distributions.

Next steps:
- I should prioritize exploring the yellow and purple buttons in the next time steps to maximize my chances of finding a button with a higher mean

 50%|█████     | 5/10 [00:41<00:46,  9.26s/it]

Round 4

User: So far you have played 4 times. Here is what you know about the environment so far:


Summary of interactions:
1. Blue button: Pressed once, reward = 0
2. Green button: Pressed once, reward = 0
3. Red button: Pressed once, reward = 0
4. Yellow button: Pressed once, reward = 1

Current knowledge:
- The blue, green, and red buttons have all yielded a reward of 0, indicating that their mean rewards are likely low (possibly 0).
- The yellow button has yielded a reward of 1, suggesting that it has a higher mean reward compared to the previously tested buttons.
- The purple button remains untested, and I still need to explore it to gather more information about its reward distribution.

Next steps:
- I should continue to explore the purple button in the next time step to determine if it has a higher mean reward or if it also has a low mean reward like the blue, green, and red buttons.


Which button will you choose next? Remember, YOU MUST provide your final answer within the 

 50%|█████     | 5/10 [00:45<00:45,  9.07s/it]


KeyboardInterrupt: 

In [35]:
# Show the per-trial statistics table built by the experiment loop.
# NOTE(review): the saved output of this cell lists 10 completed trials, while the saved
# loop output above was interrupted mid-run and carries a later execution count, so this
# table presumably comes from an earlier run. Re-run after the loop to refresh it.
df

Unnamed: 0,trial,arm_name,arm_idx,reward,cumulative_reward,system_prompt,user_prompt,raw_prediction,history_summary,best_arm,cumulative_reward_blue,cumulative_reward_green,cumulative_reward_red,cumulative_reward_yellow,cumulative_reward_purple,cumulative_occurrence_blue,cumulative_occurrence_green,cumulative_occurrence_red,cumulative_occurrence_yellow,cumulative_occurrence_purple
0,0,blue,0,0,0,You are a bandit algorithm in a room with 5 bu...,So far you have played 0 times. Here is what y...,"Since I have not played any buttons yet, I nee...",\n- Total time steps: 1\n- Buttons pressed: 1 ...,purple,0,0,0,0,0,1,0,0,0,0
1,1,green,1,1,1,You are a bandit algorithm in a room with 5 bu...,So far you have played 1 times. Here is what y...,Since I have only pressed the blue button and ...,\n- Total time steps: 2\n- Buttons pressed: 2 ...,purple,0,1,0,0,0,1,1,0,0,0
2,2,red,2,1,2,You are a bandit algorithm in a room with 5 bu...,So far you have played 2 times. Here is what y...,Given the information from the first two time ...,\n- Total time steps: 3\n- Buttons pressed: 3 ...,purple,0,1,1,0,0,1,1,1,0,0
3,3,yellow,3,0,2,You are a bandit algorithm in a room with 5 bu...,So far you have played 3 times. Here is what y...,Given the information from the previous three ...,\n- Total time steps: 4\n- Buttons pressed: 4 ...,purple,0,1,1,0,0,1,1,1,1,0
4,4,purple,4,0,2,You are a bandit algorithm in a room with 5 bu...,So far you have played 4 times. Here is what y...,Given the information from the previous rounds...,\n- Total time steps: 5\n- Buttons pressed: 5 ...,purple,0,1,1,0,0,1,1,1,1,1
5,5,green,1,1,3,You are a bandit algorithm in a room with 5 bu...,So far you have played 5 times. Here is what y...,"Given the information gathered so far, the blu...",\n- Total time steps: 6\n- Buttons pressed: 6 ...,purple,0,2,1,0,0,1,2,1,1,1
6,6,green,1,0,3,You are a bandit algorithm in a room with 5 bu...,So far you have played 6 times. Here is what y...,"Based on the information provided, the green b...",\n- Total time steps: 7\n- Buttons pressed: 7 ...,purple,0,2,1,0,0,1,3,1,1,1
7,7,red,2,0,3,You are a bandit algorithm in a room with 5 bu...,So far you have played 7 times. Here is what y...,"Given the information provided, we have the fo...",\n- Total time steps: 8\n- Buttons pressed: 8 ...,purple,0,2,1,0,0,1,3,2,1,1
8,8,green,1,0,3,You are a bandit algorithm in a room with 5 bu...,So far you have played 8 times. Here is what y...,"Given the information provided, we have the fo...",\n- Total time steps: 9\n- Buttons pressed: 9 ...,purple,0,2,1,0,0,1,4,2,1,1
9,9,green,1,1,4,You are a bandit algorithm in a room with 5 bu...,So far you have played 9 times. Here is what y...,"Given the information provided, we have the fo...",\n- Total time steps: 10\n- Buttons pressed: 1...,purple,0,3,1,0,0,1,5,2,1,1


In [None]:
# Persist the per-trial statistics as a CSV named after the run configuration.
# Characters that are awkward in file names are replaced with underscores.
model_name = MODEL_NAME.replace('/', '_').replace(':', '_').replace('-', '_')
# `strftime` is an instance method: it has to be called on an actual timestamp.
# Calling it on the `datetime` class with only a format string raises TypeError.
version = datetime.now().strftime('%Y%m%d%H%M%S')

filename = f'{EXPERIMENT_NAME}_{model_name}_arms-{N_ARMS}_delta-{DELTA}_trials-{N_TRIALS}_v{version}.csv'
filepath = EXPERIMENT_DIR / filename

df.to_csv(filepath, index=False)