In [116]:
import logging
import re
import typing as t
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from jinja2 import Template
from openai import OpenAI
from tqdm.auto import tqdm


# Experiment constants
EXPERIMENT_NAME = 'exp1.0'  # baseline
MODEL_NAME = 'openai/gpt-4o-mini'
# MODEL_NAME = 'openai/gpt-4'
TEMPERATURE = 0.0

RANDOM_SEED = 20250402

N_ARMS = 5
DELTA = 0.2
N_TRIALS = 100


ARM_NAME_TO_IDX = {'blue': 0, 'green': 1, 'red': 2, 'yellow': 3, 'purple': 4}
ARM_IDX_TO_NAME = {v: k for k, v in ARM_NAME_TO_IDX.items()}



# Computing evalutation metrics

In [241]:
import pandas as pd
import glob
import os

def load_csv_files(path):
    # Construct the full pattern to match CSV files in the given directory.
    csv_pattern = os.path.join(path, '*.csv')
    
    # Retrieve all CSV file paths.
    csv_files = glob.glob(csv_pattern)
    dataframes = []

    # Read each CSV, add 'replicate' column, and store in a list.
    for i, file in enumerate(csv_files, start=0):
        temp_df = pd.read_csv(file)
        temp_df["replicate"] = i
        dataframes.append(temp_df)
    
    # Return the first DataFrame separately, plus the full list of DataFrames.
    return dataframes[0], dataframes

# Example usage:
path = '/home/josephj/data/exp2.5'  # Update this path to your CSV directory
path = '/home/josephj/data/exp1.0_detailed'
#path = '/home/josephj/data/exp1.0_dichotomous'
#path = '/home/josephj/data/exp6.5'
path = '/home/josephj/data/exp1.0'
df, dataframes = load_csv_files(path)

print(f"Loaded {len(dataframes)} CSV files.")


Loaded 17 CSV files.


In [242]:
pwd

'/home/josephj/mini_project_alps'

First we will need to combine the different replicate results together into one dataframe

In [243]:


# Merged dataframe
df_combined = pd.concat(dataframes, ignore_index=True)

# Print result
print(df_combined.head())

   trial arm_name  arm_idx  reward  cumulative_reward  \
0      0     blue        0       1                  1   
1      1      red        1       0                  1   
2      2    green        2       0                  1   
3      3   yellow        3       1                  2   
4      4   purple        4       0                  2   

                                       system_prompt  \
0  You are a bandit algorithm in a room with 5 bu...   
1  You are a bandit algorithm in a room with 5 bu...   
2  You are a bandit algorithm in a room with 5 bu...   
3  You are a bandit algorithm in a room with 5 bu...   
4  You are a bandit algorithm in a room with 5 bu...   

                                         user_prompt  \
0  So far you have played 0 times. Here is what y...   
1  So far you have played 1 times. Here is what y...   
2  So far you have played 2 times. Here is what y...   
3  So far you have played 3 times. Here is what y...   
4  So far you have played 4 times. Here 

# Metric 1: SuffFailFreq(T/2) quantifies exploration failures

For an experiment replicate R and round t, let SuffFail(t, R) be a binary variable that is 1 if the best arm is never chosen in rounds [t, T ]. Then let SuffFailFreq(t) := mean({SuffFail(t, R) : replicates R}).
SuffFailFreq(T /2)




In [244]:
df_combined

Unnamed: 0,trial,arm_name,arm_idx,reward,cumulative_reward,system_prompt,user_prompt,history_summary,raw_history_summary,raw_prediction,...,cumulative_reward_green,cumulative_reward_yellow,cumulative_reward_purple,cumulative_occurrence_blue,cumulative_occurrence_red,cumulative_occurrence_green,cumulative_occurrence_yellow,cumulative_occurrence_purple,run_id,replicate
0,0,blue,0,1,1,You are a bandit algorithm in a room with 5 bu...,So far you have played 0 times. Here is what y...,\n- Total time steps played: 1\n- Button chose...,<HISTORY>\n- Total time steps played: 1\n- But...,"Since I have not played any buttons yet, I nee...",...,0,0,0,1,0,0,0,0,12,0
1,1,red,1,0,1,You are a bandit algorithm in a room with 5 bu...,So far you have played 1 times. Here is what y...,\n- Total time steps played: 2\n- Button chose...,<HISTORY>\n- Total time steps played: 2\n- But...,Given that I have only played the blue button ...,...,0,0,0,1,1,0,0,0,12,0
2,2,green,2,0,1,You are a bandit algorithm in a room with 5 bu...,So far you have played 2 times. Here is what y...,\n- Total time steps played: 3\n- Button chose...,<HISTORY>\n- Total time steps played: 3\n- But...,Given the information from the first two steps...,...,0,0,0,1,1,1,0,0,12,0
3,3,yellow,3,1,2,You are a bandit algorithm in a room with 5 bu...,So far you have played 3 times. Here is what y...,\n- Total time steps played: 4\n- Button choic...,<HISTORY>\n- Total time steps played: 4\n- But...,"Given the information from the previous steps,...",...,0,1,0,1,1,1,1,0,12,0
4,4,purple,4,0,2,You are a bandit algorithm in a room with 5 bu...,So far you have played 4 times. Here is what y...,\n- Total time steps played: 5\n- Button choic...,<HISTORY>\n- Total time steps played: 5\n- But...,"Given the information from the previous steps,...",...,0,1,0,1,1,1,1,1,12,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,95,green,2,0,42,You are a bandit algorithm in a room with 5 bu...,So far you have played 95 times. Here is what ...,\n- Total plays: 96\n- Button pressed:\n - Bl...,<HISTORY>\n- Total plays: 96\n- Button pressed...,"Given the current situation, I have the follow...",...,35,0,0,8,6,75,5,2,1,16
1696,96,red,1,1,43,You are a bandit algorithm in a room with 5 bu...,So far you have played 96 times. Here is what ...,\n- Total plays: 97\n- Button pressed:\n - Bl...,<HISTORY>\n- Total plays: 97\n- Button pressed...,"Given the current situation, we have the follo...",...,35,0,0,8,7,75,5,2,1,16
1697,97,red,1,0,43,You are a bandit algorithm in a room with 5 bu...,So far you have played 97 times. Here is what ...,\n- Total plays: 98\n- Button pressed:\n - Bl...,<HISTORY>\n- Total plays: 98\n- Button pressed...,"Based on the information provided, we have the...",...,35,0,0,8,8,75,5,2,1,16
1698,98,green,2,1,44,You are a bandit algorithm in a room with 5 bu...,So far you have played 98 times. Here is what ...,\n- Total plays: 99\n- Button pressed:\n - Bl...,<HISTORY>\n- Total plays: 99\n- Button pressed...,"Given the current situation, let's analyze the...",...,36,0,0,8,8,76,5,2,1,16


In [233]:
df["SuffFail"] = 0

T = df["trial"].max()
seen_best_arm = False

for t in reversed(range(T + 1)):
    if df.loc[t, "arm_name"] == df.loc[t, "best_arm"]:
        #print(f"Chosen arm was best arm at t = {t} ")
        seen_best_arm = True  # The best arm was chosen at least once
    df.loc[t, "SuffFail"] = int(not seen_best_arm)  # 1 if best_arm was never chosen from t to T

print(df["SuffFail"])

0     0
1     0
2     0
3     0
4     0
     ..
95    0
96    0
97    0
98    0
99    0
Name: SuffFail, Length: 100, dtype: int64


In [234]:
print("For this replicate, SuffFail(T/2) = ", df.loc[df["trial"] == 50, "SuffFail"])

For this replicate, SuffFail(T/2) =  50    0
Name: SuffFail, dtype: int64


In [235]:
## CALCULATION FOR DF_COMBINED
# Sort
df_combined = df_combined.sort_values(["replicate", "trial"], ascending=[True, False])

# Identify T (max trial number)
T = df_combined["trial"].max()
T_half = T // 2  # T/2
print(T_half) #should be 49 = index of the 50th run

# Step 1: Check if the best arm was ever chosen from t to T
df_combined["seen_best_arm"] = (
    df_combined.groupby("replicate", group_keys=False)
    .apply(lambda g: g["arm_name"].eq(g["best_arm"]).iloc[::-1].cumsum().iloc[::-1] > 0)
)

# Step 2: Compute SuffFail
df_combined["SuffFail"] = (~df_combined["seen_best_arm"]).astype(int)

# Step 3: Compute SuffFailFreq(T/2)
suff_fail_freq_T_half = df_combined[df_combined["trial"] == T_half].groupby("replicate")["SuffFail"].mean().mean()

print("SuffFailFreq(T/2):", suff_fail_freq_T_half)

49
SuffFailFreq(T/2): 0.0


  .apply(lambda g: g["arm_name"].eq(g["best_arm"]).iloc[::-1].cumsum().iloc[::-1] > 0)


# Metric 2: K · MinFrac(T ) quantifies exploitation failures

uniform-like failures (expressed via K · MinFrac(T )) = exploitation fails = measures uniform-like failures

LLM selects arms in roughly equal proportions for the entirety of the T rounds and fails to exploit the acquired information to focus on the better arms.

For a particular experiment replicate R and round t, let fa(t, R) be the fraction of rounds in [1, t] in which a given arm a is chosen, MinFrac(t, R) := mina fa(t, R), and MinFrac(t) := mean({MinFrac(t, R) : replicates R}). Since MinFrac(t) ≤ 1/K, ∀t ∈ [T ], we always plot K · MinFrac(t), so as to rescale the range to [0, 1].
  


In [236]:
import pandas as pd

# Suppose df_combined is your DataFrame with columns "replicate", "trial", "arm_name", etc.
K = df_combined["arm_name"].unique()  # e.g., 5 arms
replica_numbs = df_combined["replicate"].unique()

# Ensure the data is sorted by replicate and trial if the order matters
df_combined = df_combined.sort_values(["replicate", "trial"])

replicate_cumulative_counts = {}

# Group df_combined by "replicate"
grouped = df_combined.groupby("replicate", as_index=False)

# Iterate through each replicate's data
for replicate_value, group in grouped:
    # 'group' is now a normal DataFrame filtered to a single replicate
    replicate_cumulative_counts[replicate_value] = {}
    for arm in K:
        # Boolean Series for this arm, then cumulative sum over rows in that replicate
        cumsum_series = (group["arm_name"] == arm).cumsum()
        replicate_cumulative_counts[replicate_value][arm] = cumsum_series

# ---------------------------------------------
# Example of how you might compute MinFrac and KMinFrac for the entire DataFrame
# (rather than separately per replicate):

# 1) For the entire dataset, get the overall cumsum for each arm
cumulative_counts = {a: (df_combined["arm_name"] == a).cumsum() for a in K}

# 2) Add "MinFrac" and "KMinFrac" columns to df_combined
#    assuming "trial" is 0-based (0, 1, 2, ...)
df_combined["MinFrac"] = df_combined["trial"].apply(
    lambda t: min(cumulative_counts[a][t] / (t + 1) for a in K)
)
df_combined["KMinFrac"] = df_combined["MinFrac"] * 5

print(df_combined[["trial", "KMinFrac"]].head())


   trial  KMinFrac
0      0       0.0
1      1       0.0
2      2       0.0
3      3       0.0
4      4       1.0


In [237]:
K = df_combined["arm_name"].unique() # number of arms = 5
print(K)

replica_numbs = df_combined["replicate"].unique()
print(replica_numbs) # R
replicate_cumulative_counts = {}

for r in replica_numbs:
    mask = df_combined["replicate"] == r
    for a in K:
        df_combined.loc[mask, f"cumulative_count_{a}"] = (df_combined.loc[mask, "arm_name"] == a).cumsum()

    # Compute MinFrac for this replicate
    df_combined.loc[mask, "MinFrac"] = df_combined.loc[mask, "trial"].apply(
        lambda t: min(df_combined.loc[mask & (df_combined["trial"] == t), [f"cumulative_count_{a}" for a in K]].min(axis=1) / (t + 1))
    )

    # Compute KMinFrac
    df_combined.loc[mask, "KMinFrac"] = df_combined.loc[mask, "MinFrac"] * len(K)

print(df_combined[["trial", "KMinFrac"]])

['blue' 'red' 'green' 'yellow' 'purple']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]
      trial  KMinFrac
0         0  0.000000
1         1  0.000000
2         2  0.000000
3         3  0.000000
4         4  1.000000
...     ...       ...
1695     95  0.104167
1696     96  0.103093
1697     97  0.102041
1698     98  0.101010
1699     99  0.100000

[1700 rows x 2 columns]


In [238]:
# Compute MinFrac(t) as the mean MinFrac(t, R) over all replicates R
MinFrac_t = df_combined.groupby("trial")["MinFrac"].mean().reset_index(name="MinFrac_t")

# Display the result
print(MinFrac_t)

    trial  MinFrac_t
0       0   0.000000
1       1   0.000000
2       2   0.000000
3       3   0.000000
4       4   0.200000
..    ...        ...
95     95   0.025123
96     96   0.024864
97     97   0.024610
98     98   0.024361
99     99   0.024118

[100 rows x 2 columns]


# Metric 3: MedianReward

(Note: if Reward is None, filter it out, as it reports system failures)

MedianReward  = the rescaled median (over replicates) of the time-averaged total reward.

More precisely, let Φ(R) be the time-averaged total reward for a given replicate R. Then E [Φ(R)] ranges in the interval [1/2 − ∆/2, 1/2 + ∆/2]. We rescale Φ(R), by translating and multiplying, so that E [Φ(R)] ranges in [0, 1].


In [239]:
T = 100
# Compute Time-Averaged Total Reward per Replicate
df_filtered = df_combined.dropna(subset=["reward"])  # Ignore system failures
time_avg_reward_per_r = df_filtered.groupby("replicate")["reward"].sum() / T

# Compute Median of Time-Averaged Rewards
median_reward = time_avg_reward_per_r.median()

# Rescale the Median Reward
Delta = 0.2  # In our setting of the "hard" instance with K=5, then ∆=0.2
min_val = (1/2) - (Delta/2)
rescaled_median_reward = (median_reward - min_val) / Delta

print(f"Rescaled MedianReward: {rescaled_median_reward}")

Rescaled MedianReward: 0.3


# Metric 4: GreedyFrac

The fraction of greedy rounds, averaged over the replicates. A greedy round is one in which an arm with a largest average reward is selected. This is one way to quantify the extent to which a configuration behaves like Greedy.


In [240]:
df_filtered = df_combined.dropna(subset=["reward"])

# Step 2: Compute cumulative reward & count per arm per replicate
df_filtered["cumulative_reward"] = df_filtered.groupby(["replicate", "arm_name"])["reward"].cumsum()
df_filtered["cumulative_count"] = df_filtered.groupby(["replicate", "arm_name"]).cumcount() + 1

# Step 3: Compute empirical mean reward for each arm (avoid division by zero)
df_filtered["mean_reward"] = df_filtered["cumulative_reward"] / df_filtered["cumulative_count"]

# Step 4: Find the arm with the highest mean reward at each time step
max_reward_per_trial = df_filtered.groupby(["replicate", "trial"])["mean_reward"].transform("max")

# Step 5: Check if the chosen arm was greedy (has the max reward)
df_filtered["greedy_round"] = (df_filtered["mean_reward"] == max_reward_per_trial).astype(int)

# Step 6: Compute GreedyFrac per replicate and then average
greedy_frac_per_replicate = df_filtered.groupby("replicate")["greedy_round"].mean()
greedy_frac = greedy_frac_per_replicate.mean()

print(f"GreedyFrac: {greedy_frac}")


GreedyFrac: 1.0
