In [2]:

import logging
import re
import typing as t
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from jinja2 import Template
from openai import OpenAI
from tqdm.auto import tqdm

from concurrent.futures import ThreadPoolExecutor, as_completed

# Openrouter constants
OPENROUTER_API_KEY = '<openrouter-api-key>'
OPENTROUTER_BASE_URL = 'https://openrouter.ai/api/v1'


# Experiment constants
EXPERIMENT_NAME = 'exp1.0'  # baseline
MODEL_NAME = 'openai/gpt-4o-mini'
# MODEL_NAME = 'openai/gpt-4'
TEMPERATURE = 0.0

RANDOM_SEED = 20250402

N_ARMS = 5
DELTA = 0.2
N_TRIALS = 100

N_EXPERIMENT_RUNS = 20
N_WORKERS = 20


COLORS = [
    'blue', 'red', 'green', 'yellow', 'purple',
    'orange', 'brown', 'pink', 'black', 'white',
    'gray', 'cyan', 'magenta', 'violet', 'indigo'
]


DATA_DIR = Path('../data')
EXPERIMENT_DIR = DATA_DIR / EXPERIMENT_NAME
EXPERIMENT_DIR.mkdir(exist_ok=True, parents=True)


# def bootstrap() -> None:
#     logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
#     np.random.seed(RANDOM_SEED)


# bootstrap()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SYSTEM_PROMPT_TEMPLATE = """\
You are a bandit algorithm in a room with {{n_arms}} buttons labeled {{colors|join(", ")}}. \
Each button is associated with a Bernoulli distribution with a fixed but unknown mean; the means \
for the buttons could be different. For each button, when you press it, you will get a reward \
that is sampled from the button's associated distribution. You have {{n_trials}} time steps and, on each \
time step, you can choose any button and receive the reward. Your goal is to maximize the total \
reward over the {{n_trials}} time steps.

At each time step, I will show you a summary of your past choices and rewards. Then you must \
make the next choice, which must be exactly one of {{colors|join(", ")}}. Let's think \
step by step to make sure we make a good choice. You must provide your final answer within the \
tags <Answer>COLOR</Answer> where COLOR is one of {{colors|join(", ")}}.
"""
SYSTEM_PROMPT_TEMPLATE = Template(SYSTEM_PROMPT_TEMPLATE)


USER_PROMPT_TEMLATE = """\
So far you have played {{n_trials}} times. Here is what you know about the environment so far:

{{history}}

Which button will you choose next? Remember, YOU MUST provide your final answer within the tags \
<Answer>COLOR</Answer> where COLOR is one of {{colors|join(", ")}}. Let's think step \
by step to make sure we make a good choice.
"""
USER_PROMPT_TEMLATE = Template(USER_PROMPT_TEMLATE)

USER_PROMPT_TEMLATE_EXT = """\
You got the reward equal to {{reward}}.
Now refine your knowledge about the environment based on your previous knowledge, \
your last action and the reward you got after it. \
Feel free to add any useful information, e.g. instructions, \
summary of your interactions with the environemnt. \
Try to make it as useful for you as possible and put it between the brackets <HISTORY></HISTORY>.
"""
USER_PROMPT_TEMLATE_EXT = Template(USER_PROMPT_TEMLATE_EXT)

In [4]:
# Rendering example
HISTORY_PROMPT = "There's no information about the buttons currently. You need to explore more!"

actions = ["green", "green", "green", "green"]
rewards = [0, 0, 0, 1]
last_steps = "\n".join([f"action: {a}, reward: {r}" for a, r in zip(actions, rewards)])

example_prompt_inputs = {
    'n_trials': 10,
    'history': HISTORY_PROMPT,
}
user_prompt = USER_PROMPT_TEMLATE.render(**example_prompt_inputs)
print(user_prompt)

So far you have played 10 times. Here is what you know about the environment so far:

There's no information about the buttons currently. You need to explore more!

Which button will you choose next? Remember, YOU MUST provide your final answer within the tags <Answer>COLOR</Answer> where COLOR is one of . Let's think step by step to make sure we make a good choice.


In [5]:
example_prompt_inputs = {
    'reward': 1,
}
user_prompt_ext = USER_PROMPT_TEMLATE_EXT.render(**example_prompt_inputs)
print(user_prompt_ext)

You got the reward equal to 1.
Now refine your knowledge about the environment based on your previous knowledge, your last action and the reward you got after it. Feel free to add any useful information, e.g. instructions, summary of your interactions with the environemnt. Try to make it as useful for you as possible and put it between the brackets <HISTORY></HISTORY>.


In [6]:
class MultiArmedBandit:
    def __init__(self, n_arms: int, delta: float) -> None:
        self.n_arms = n_arms
        self.delta = delta

        means = [0.5 + self.delta / 2] + [0.5 - self.delta / 2] * (self.n_arms - 1)
        np.random.shuffle(means)

        self.means = means
        self.best_arm = np.argmax(means)

    def pull(self, arm: int) -> int:
        assert 0 <= arm < self.n_arms, f"Arm {arm} doesn't exist"

        p = self.means[arm]
        reward = np.random.binomial(1, p=p)

        return reward

In [7]:
def create_client() -> OpenAI:
    client = OpenAI(
        base_url=OPENTROUTER_BASE_URL,
        api_key=OPENROUTER_API_KEY,
    )
    return client


def get_prediction(
        client: OpenAI,
        system_prompt: str,
        user_prompt: str,
        answer: str = None,
        user_prompt_ext: str = None,
        **kwargs
        ) -> t.Union[str, None]:
    messages = [
        {
            'role': 'system',
            'content': system_prompt
        },
        {
            'role': 'user',
            'content': user_prompt
        },
    ]

    if answer:
        messages.extend([
            {
                'role': 'assistant',
                'content': answer
            },
            {
                'role': 'user',
                'content': user_prompt_ext
            }
        ])

    prediction = None
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=TEMPERATURE,
            **kwargs
        )
        prediction = completion.choices[0].message.content
    except Exception as _:
        logging.warning('Request failed')

    return prediction


def extract_arm_color(prediction: str, arm_name_to_idx: t.Dict[str, int]) -> t.Union[str, None]:
    names = '|'.join(arm_name_to_idx.keys())
    pattern = f'<Answer>({names})</Answer>'
    match = re.search(pattern, prediction)

    color = None
    if match:
        color = match.group(1)

    return color


def extract_hystory(prediction: str) -> str:
    hystory = re.findall("<HISTORY>((?:.*?|\\n)*?)</HISTORY>", prediction)
    if hystory:
        return hystory[0]
    else:
        return ""

In [12]:
def run_experiment(n_trials: int,
                   n_arms: int,
                   delta: float,
                   system_prompt_template: Template,
                   user_prompt_template: Template,
                   user_prompt_template_ext: Template,
                   history_summary: str = "There's no information about the buttons currently. You need to explore more.",
                   ) -> pd.DataFrame:
    assert n_arms <= len(COLORS)
    
    arm_name_to_idx = {name: idx for name, idx in zip(COLORS, range(n_arms))}
    arm_idx_to_name = {idx: name for name, idx in arm_name_to_idx.items()}

    colors = arm_name_to_idx.keys()
    
    client = create_client()
    bandit = MultiArmedBandit(n_arms=n_arms, delta=delta)

    rewards = {k: 0 for k in arm_name_to_idx.keys()}
    occurrences = {k: 0 for k in arm_name_to_idx.keys()}

    stats = []
    for trial in tqdm(range(n_trials), total=n_trials):
        system_prompt = system_prompt_template.render({'n_arms': n_arms, 'n_trials': n_trials, 'colors': colors})

        user_prompt_inputs = user_prompt_inputs = {
            'n_trials': trial,
            'history': history_summary,
            'colors': colors,
        }   
        user_prompt = user_prompt_template.render(user_prompt_inputs)
        
        prediction = get_prediction(client, system_prompt, user_prompt)

        arm_name = None
        arm_idx = None
        reward = None
        if prediction is not None:
            arm_name = extract_arm_color(prediction, arm_name_to_idx)
            arm_idx = arm_name_to_idx[arm_name]

            reward = bandit.pull(arm_idx)

            occurrences[arm_name] += 1
            rewards[arm_name] += reward

            user_prompt_inputs_ext = {
                'reward': reward,
            }
            user_prompt_ext = user_prompt_template_ext.render(user_prompt_inputs_ext)
            prediction_history = get_prediction(client, system_prompt, user_prompt, prediction, user_prompt_ext)

            history_summary = extract_hystory(prediction_history)
        
        cumulative_reward = sum(rewards.values())
        cumulative_reward_per_arm = {f'cumulative_reward_{k}': v for k, v in rewards.items()}
        cumulative_occurrences_per_arm = {f'cumulative_occurrence_{k}': v for k, v in occurrences.items()}

        stats.append({
            'trial': trial,
            'arm_name': arm_name,
            'arm_idx': arm_idx,
            'reward': reward,
            'cumulative_reward': cumulative_reward,
            'system_prompt': system_prompt,
            'user_prompt': user_prompt,
            'history_summary': history_summary,
            'raw_history_summary': prediction_history,
            'raw_prediction': prediction,
            'best_arm': arm_idx_to_name[bandit.best_arm],
            **cumulative_reward_per_arm,
            **cumulative_occurrences_per_arm
        })

    df = pd.DataFrame(stats)

    return df

In [13]:
def save_dataframe(df: pd.DataFrame, experiment_name: str, model_name: str, n_trials: int, n_arms: int, delta: float):
    model_name = model_name.replace('/', '_').replace(':', '_').replace('-', '_')
    delta = str(delta).replace('.', '')
    version = datetime.now().strftime('%Y%m%d%H%M%S')

    filename = f'{experiment_name}_{model_name}_trials-{n_trials}-arms-{n_arms}_delta-{delta}_v{version}.csv'
    filepath = EXPERIMENT_DIR / filename

    df.to_csv(filepath, index=False)

In [14]:
configs = [(run_id, EXPERIMENT_NAME, N_TRIALS, N_ARMS, DELTA, MODEL_NAME, SYSTEM_PROMPT_TEMPLATE, USER_PROMPT_TEMLATE, USER_PROMPT_TEMLATE_EXT) for run_id in range(N_EXPERIMENT_RUNS)]

def run_experiment_and_save_results(run_id: str,
                                    experiment_name: str,
                                    n_trials: int,
                                    n_arms: int,
                                    delta: float,
                                    model_name: str,
                                    system_prompt_template: Template,
                                    user_prompt_template: Template,
                                    user_prompt_template_ext: Template) -> None:
    df = run_experiment(n_trials=n_trials, n_arms=n_arms, delta=delta, system_prompt_template=system_prompt_template, user_prompt_template=user_prompt_template, user_prompt_template_ext=user_prompt_template_ext)
    df['run_id'] = run_id

    save_dataframe(df=df, experiment_name=experiment_name, model_name=model_name, n_trials=n_trials, n_arms=n_arms, delta=delta)


In [15]:
# Do a test run:
run_experiment_and_save_results(*configs[1])

100%|██████████| 100/100 [20:18<00:00, 12.19s/it]


In [None]:
with ThreadPoolExecutor(max_workers=N_WORKERS-1) as executor:
    futures = [executor.submit(run_experiment_and_save_results, *config) for config in configs[:2]]

    for future in tqdm(as_completed(futures), total=len(futures), desc="Running experiments"):
            try:
                future.result()  # Retrieve result to catch exceptions
            except Exception as e:
                print(f"An error occurred during experiment execution: {e}")