In [1]:
import warnings
warnings.filterwarnings("ignore")

from dotenv import load_dotenv, find_dotenv

import logging
import os
import re
import typing as t

import numpy as np
import pandas as pd
from jinja2 import Template
from openai import OpenAI
from tqdm.auto import tqdm

load_dotenv(find_dotenv())

# Openrouter constants
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENTROUTER_BASE_URL = 'https://openrouter.ai/api/v1'


# Experiment constants
EXPERIMENT_NAME = 'exp1.0'  # baseline
MODEL_NAME = 'openai/gpt-4o-mini'
TEMPERATURE = 0.0

RANDOM_SEED = 20250402

N_ARMS = 5
DELTA = 0.2
N_TRIALS = 3


ARM_NAME_TO_IDX = {'blue': 0, 'green': 1, 'red': 2, 'yellow': 3, 'purple': 4}
ARM_IDX_TO_NAME = {v: k for k, v in ARM_NAME_TO_IDX.items()}


def bootstrap() -> None:
    # logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
    np.random.seed(RANDOM_SEED)


bootstrap()

In [2]:
SYSTEM_PROMPT_TEMPLATE = """\
You are a bandit algorithm in a room with {{n_arms}} buttons labeled blue, green, red, yellow, purple. \
Each button is associated with a Bernoulli distribution with a fixed but unknown mean; the means \
for the buttons could be different. For each button, when you press it, you will get a reward \
that is sampled from the button's associated distribution. You have {{n_trials}} time steps and, on each \
time step, you can choose any button and receive the reward. Your goal is to maximize the total \
reward over the {{n_trials}} time steps.

At each time step, I will show you a summary of your past choices and rewards. Then you must \
make the next choice, which must be exactly one of blue, green, red, yellow, purple. Let's think \
step by step to make sure we make a good choice. You must provide your final answer within the \
tags <Answer>COLOR</Answer> where COLOR is one of blue, green, red, yellow, purple.
"""
SYSTEM_PROMPT_TEMPLATE = Template(SYSTEM_PROMPT_TEMPLATE)


USER_PROMPT_TEMLATE = """\
So far you have played {{n_trials}} times with your past choices and rewards summarized
as follows:
- blue button: pressed {{blue_n_occur}} times. Reward: {{blue_good_occur}} times gave good outcomes and {{blue_bad_occur}} times gave bad outcomes 
- green button: pressed {{green_n_occur}} times. Reward: {{green_good_occur}} times gave good outcomes and {{green_bad_occur}} times gave bad outcomes 
- red button: pressed {{red_n_occur}} times. Reward:{{red_good_occur}} times gave good outcomes and {{red_bad_occur}} times gave bad outcomes
- yellow button: pressed {{yellow_n_occur}} times. Reward:{{yellow_good_occur}} times gave good outcomes and {{yellow_bad_occur}} times gave bad outcomes
- purple button: pressed {{purple_n_occur}} times. Reward:{{purple_good_occur}} times gave good outcomes and {{purple_bad_occur}} times gave bad outcomes

Which button will you choose next? Remember, YOU MUST provide your final answer within the tags \
<Answer>COLOR</Answer> where COLOR is one of blue, green, red, yellow, purple. Let's think step \
by step to make sure we make a good choice.
"""
USER_PROMPT_TEMLATE = Template(USER_PROMPT_TEMLATE)

In [3]:
# Rendering example
example_prompt_inputs = {
    'n_trials': 10,
    'blue_n_occur': 4,
    'blue_good_occur':1,
    "blue_bad_occur":2,
    'blue_avg_reward': 0.4,
    'green_n_occur': 0,
    'green_avg_reward': 0,
    'red_n_occur': 3,
    'red_avg_reward': 0.1,
    'yellow_n_occur': 0,
    'yellow_avg_reward': 0,
    'purple_n_occur': 3,
    'purple_avg_reward': 0.2
}
print(USER_PROMPT_TEMLATE.render(**example_prompt_inputs))

So far you have played 10 times with your past choices and rewards summarized
as follows:
- blue button: pressed 4 times. Reward: 1 times gave good outcomes and 2 times gave bad outcomes 
- green button: pressed 0 times. Reward:  times gave good outcomes and  times gave bad outcomes 
- red button: pressed 3 times. Reward: times gave good outcomes and  times gave bad outcomes
- yellow button: pressed 0 times. Reward: times gave good outcomes and  times gave bad outcomes
- purple button: pressed 3 times. Reward: times gave good outcomes and  times gave bad outcomes

Which button will you choose next? Remember, YOU MUST provide your final answer within the tags <Answer>COLOR</Answer> where COLOR is one of blue, green, red, yellow, purple. Let's think step by step to make sure we make a good choice.


In [4]:
class MultiArmedBandit:
    def __init__(self, n_arms: int, delta: float) -> None:
        self.n_arms = n_arms
        self.delta = delta

        means = [0.5 + self.delta / 2] + [0.5 - self.delta / 2] * (self.n_arms - 1)
        np.random.shuffle(means)

        self.means = means
        self.best_arm = np.argmax(means)

    def pull(self, arm: int) -> int:
        assert 0 <= arm < self.n_arms, f"Arm {arm} doesn't exist"

        p = self.means[arm]
        reward = np.random.binomial(1, p=p)
        
        return reward

In [None]:
def create_client() -> OpenAI:
    client = OpenAI(
        base_url=OPENTROUTER_BASE_URL,
        api_key=OPENROUTER_API_KEY,
    )
    return client


def get_prediction(client: OpenAI, system_prompt: str, user_prompt: str, **kwargs) -> str | None:
    messages = [
        {
            'role': 'system',
            'content': system_prompt
        },
        {
            'role': 'user',
            'content': user_prompt
        }
    ]
    
    prediction = None
    try:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=TEMPERATURE,
            **kwargs
        )

        prediction = completion.choices[0].message.content
        
    except Exception as _:
        logging.warning('Request failed')
 
    return prediction


def build_prompt_inputs(trial: int, rewards: t.Dict[str, int], occurrences: t.Dict[str, int]) -> t.Dict[str, float]:
    
    prompt_inputs = {
        'n_trials': trial,

        'blue_n_occur': occurrences['blue']["count"],
        "blue_good_occur": occurrences["blue"]["Good"],
        "blue_bad_occur": occurrences["blue"]["Bad"],

        'red_n_occur': occurrences['red']["count"],
        "red_good_occur": occurrences["red"]["Good"],
        "red_bad_occur": occurrences["red"]["Bad"],

        'green_n_occur': occurrences['green']["count"],
        "green_good_occur": occurrences["green"]["Good"],
        "green_bad_occur": occurrences["green"]["Bad"],

        'yellow_n_occur': occurrences['yellow']["count"],
        "yellow_good_occur": occurrences["yellow"]["Good"],
        "yellow_bad_occur": occurrences["yellow"]["Bad"],

        'purple_n_occur': occurrences['purple']["count"],
        "purple_good_occur": occurrences["purple"]["Good"],
        "purple_bad_occur": occurrences["purple"]["Bad"],
    }

    return prompt_inputs


def extract_arm_color(prediction: str) -> str | None:    
    names = '|'.join(ARM_NAME_TO_IDX.keys())
    pattern = f'<Answer>({names})</Answer>'
    match = re.search(pattern, prediction)

    color = None
    if match:
        color = match.group(1)

    return color

In [6]:
client = create_client()
bandit = MultiArmedBandit(n_arms=N_ARMS, delta=DELTA)

In [7]:
def extract_verbalised_reward(reward:int):
    return "Good" if reward == 1 else "Bad"


In [8]:
stats = []
history = ''

rewards = {k: 0 for k in ARM_NAME_TO_IDX.keys()}
occurrences = {k: {"count": 0, "Good": 0, "Bad":0} for k in ARM_NAME_TO_IDX.keys()}
for trial in tqdm(range(N_TRIALS), total=N_TRIALS):
    system_prompt = SYSTEM_PROMPT_TEMPLATE.render({'n_arms': N_ARMS, 'n_trials': N_TRIALS})

    user_prompt_inputs = build_prompt_inputs(trial, rewards, occurrences)
    user_prompt = USER_PROMPT_TEMLATE.render(user_prompt_inputs)
    history += f'{user_prompt}\n'
    prediction = get_prediction(client, system_prompt, user_prompt)
    history += f"{prediction}\n\n"

    arm_name = extract_arm_color(prediction)
    arm_idx = ARM_NAME_TO_IDX[arm_name]

    reward = bandit.pull(arm_idx)
    reward_valence = extract_verbalised_reward(reward=reward)

    occurrences[arm_name]["count"] += 1
    occurrences[arm_name][reward_valence] += 1
    rewards[arm_name] += reward

    cumulative_reward = sum(rewards.values())
    cumulative_reward_per_arm = {f'cumulative_reward_{k}': v for k, v in rewards.items()}
    cumulative_occurrences_per_arm = {f'cumulative_occurrence_{k}': v["count"] for k, v in occurrences.items()}
    stats.append({
        'trial': trial,
        'arm_name': arm_name,
        'arm_idx': arm_idx,
        'reward': reward,
        'cumulative_reward': cumulative_reward,
        'system_prompt': system_prompt,
        'user_prompt': user_prompt,
        'raw_prediction': prediction,
        'best_arm': ARM_IDX_TO_NAME[bandit.best_arm],
        **cumulative_reward_per_arm,
        **cumulative_occurrences_per_arm
    })

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [10:13<00:00,  6.13s/it]


In [9]:
from pathlib import Path
from datetime import datetime

df = pd.DataFrame(stats)

DATA_DIR = Path('../data')
EXPERIMENT_DIR = DATA_DIR / EXPERIMENT_NAME
EXPERIMENT_DIR.mkdir(exist_ok=True, parents=True)

model_name = MODEL_NAME.replace('/', '_').replace(':', '_').replace('-', '_')
version = datetime.now().strftime('%Y%m%d%H%M%S')

filename = f'{EXPERIMENT_NAME}_{model_name}_arms-{N_ARMS}_delta-{DELTA}_trials-{N_TRIALS}_v{version}.csv'
filepath = EXPERIMENT_DIR / filename

df.to_csv(filepath, index=False)