In [None]:
%load_ext autoreload
%autoreload 2


from IPython.display import display, HTML
from agents import banditagents
from environments import bandits
import utils

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
# Set the seaborn theme once, up front: "whitegrid" style with the "pastel"
# colour palette is used for the figures drawn later in the notebook.
sns.set_theme(style="whitegrid", palette="pastel")

##  Run Solvers

In [None]:
# --- Experiment configuration ------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths. They are kept
# unchanged because the following cell reads results.csv from this exact
# location; consider deriving both from one configurable project root.
configs_dir = '/Users/juliomartinez/Documents/PhD/socialmotivation/configs'
results_csv_path = '/Users/juliomartinez/Documents/PhD/socialmotivation/results.csv'
num_episodes = 500

solvers = ['GreedyAgent', 'EpsilonGreedyAgent', 'UCBAgent']
#solvers = ['UCBAgent']
environment = 'TwoArmBandit'

bandit_config_filename = os.path.join(configs_dir, environment + '.yaml')
bandit_config = utils.get_config(bandit_config_filename)
env = bandits.Bandit(bandit_config)
# Placeholder only: overwritten inside the episode loop with the length of the
# demonstrator's reward history.
num_iterations = 1000

results_column_names = [
    'episode',
    'iteration',
    'solver',
    'reward_demonstrator',
    'reward_learner',
    'chosen_arm_id_demonstrator',
    'chosen_arm_id_learner',
]

# --- Run every solver --------------------------------------------------------
# Per-episode and per-solver frames are collected in lists and concatenated
# once after the loops; growing a DataFrame with pd.concat on every episode
# re-copies all earlier rows each time.
trial_frames = []
solver_frames = []

for solver in solvers:
    # Look up the agent class by name and load its YAML config, which holds
    # separate 'demonstrator' and 'learner' sections.
    agentClass = getattr(banditagents, solver)
    agents_config_filename = os.path.join(configs_dir, solver + '.yaml')
    agents_config = utils.get_config(agents_config_filename)

    # Per-arm tally of how often each arm ended an episode as the agent's
    # best_arm_id.
    demonstrator_best_arm_distribution = np.zeros(env.num_arms)
    learner_best_arm_distribution = np.zeros(env.num_arms)

    for episode_j in range(num_episodes):
        # Fresh agents every episode. The demonstrator is run on the
        # environment alone; the learner is run with access to the demonstrator.
        demonstrator = agentClass(agents_config['demonstrator'])
        demonstrator(env)

        learner = agentClass(agents_config['learner'])
        learner(env, demonstrator)

        # Increment the tallies. (The previous `=+1` *assigned* +1, so counts
        # never exceeded 1 and the normalised distribution was meaningless.)
        demonstrator_best_arm_distribution[demonstrator.best_arm_id] += 1
        learner_best_arm_distribution[learner.best_arm_id] += 1

        # One row per iteration of this episode. zip() keeps the original
        # behaviour of truncating to the shortest history.
        num_iterations = len(demonstrator.reward_history)
        iterations = list(range(num_iterations))
        episodes = [episode_j] * num_iterations
        solvers_ = [solver] * num_iterations
        trial_df = pd.DataFrame(
            list(zip(
                episodes,
                iterations,
                solvers_,
                demonstrator.reward_history,
                learner.reward_history,
                demonstrator.arm_id_history,
                learner.arm_id_history,
            )),
            columns=results_column_names,
        )
        trial_frames.append(trial_df)

    # Convert tallies to the fraction of episodes in which each arm was best.
    demonstrator_best_arm_distribution = (
        demonstrator_best_arm_distribution / np.sum(demonstrator_best_arm_distribution)
    )
    learner_best_arm_distribution = (
        learner_best_arm_distribution / np.sum(learner_best_arm_distribution)
    )
    # NOTE(review): the summary columns index arms 0 and 1 explicitly, so this
    # assumes the environment has at least two arms.
    solver_df = pd.DataFrame({
        'solver': solver,
        'num_episodes': num_episodes,
        'num_iterations': num_iterations,
        'demonstrator_prob_arm0': demonstrator_best_arm_distribution[0],
        'demonstrator_prob_arm1': demonstrator_best_arm_distribution[1],
        'learner_prob_arm0': learner_best_arm_distribution[0],
        'learner_prob_arm1': learner_best_arm_distribution[1],
    }, index=[0])
    solver_frames.append(solver_df)

# Assemble the final tables in a single pass.
results_df = pd.concat(trial_frames, join='inner', ignore_index=True)
arm_dist_df = pd.concat(solver_frames, axis=0)

# Persist the raw per-iteration results and show a preview.
results_df.to_csv(results_csv_path)
display(HTML(results_df.head().to_html()))

## Format Data and Compute Helper Variables

In [None]:
# Reload the raw per-iteration results and order the rows so the cumulative
# sums below run over iterations within each (solver, episode).
results_df = pd.read_csv(
    '/Users/juliomartinez/Documents/PhD/socialmotivation/results.csv',
    index_col=0,
).sort_values(by=['solver', 'episode', 'iteration'], ignore_index=True)

# Add cumulative reward (over iterations) to results_df
per_episode = results_df.groupby(['solver', 'episode'])
results_df['cumulative_reward_demonstrator'] = per_episode['reward_demonstrator'].cumsum()
results_df['cumulative_reward_learner'] = per_episode['reward_learner'].cumsum()

# Compute average cumulative reward (average across episodes)
per_iteration = results_df.groupby(['solver', 'iteration'])
avg_cumulative_reward_demonstrator = (
    per_iteration['cumulative_reward_demonstrator']
    .mean()
    .rename('avg_cumulative_reward_demonstrator')
)
avg_cumulative_reward_learner = (
    per_iteration['cumulative_reward_learner']
    .mean()
    .rename('avg_cumulative_reward_learner')
)
avg_cumulative_reward_df = pd.concat(
    [avg_cumulative_reward_demonstrator, avg_cumulative_reward_learner],
    axis=1,
)
# Learner minus demonstrator, computed on the episode-averaged curves.
avg_cumulative_reward_df['delta_of_avg_cumulative_reward'] = (
    avg_cumulative_reward_df['avg_cumulative_reward_learner']
    - avg_cumulative_reward_df['avg_cumulative_reward_demonstrator']
)
avg_cumulative_reward_df = (
    avg_cumulative_reward_df
    .reset_index()
    .sort_values(by=['solver', 'iteration'], ignore_index=True)
)

# Differences in cumulative reward for each iteration (learner minus
# demonstrator, per episode). The explicit .copy() makes delta_df an
# independent frame, so adding a column does not raise SettingWithCopyWarning.
delta_df = results_df[['episode', 'iteration', 'solver']].copy()
delta_df['delta_of_cumulative_reward'] = (
    results_df['cumulative_reward_learner'] - results_df['cumulative_reward_demonstrator']
)
delta_df = delta_df.sort_values(by=['solver', 'episode', 'iteration'], ignore_index=True)

# Change to long format: the "<stub>_demonstrator" / "<stub>_learner" column
# pairs are stacked into one column per stub plus an 'agent' column.
results_long_df = pd.wide_to_long(
    results_df,
    stubnames=['reward', 'chosen_arm_id', 'cumulative_reward'],
    i=['episode', 'iteration', 'solver'],
    j='agent',
    sep='_',
    suffix=r'\w+',
).reset_index().sort_values(by=['solver', 'episode', 'iteration'], ignore_index=True)
print('Results Pivot Longer')
display(HTML(results_long_df.head().to_html()))


print('\n\nResults')
display(HTML(results_df.head().to_html()))

print('\n\nAverage Cumulative Reward')
display(HTML(avg_cumulative_reward_df.head().to_html()))

print('\n\nDelta of Each Iteration')
display(HTML(delta_df.head().to_html()))

# results_long_df is the long-format table produced by wide_to_long above.
print('\n\nResults in Long Format')
display(HTML(results_long_df.head().to_html()))


## Display Plots

In [None]:
# Two side-by-side panels:
#   left  - cumulative reward per solver, demonstrator vs. learner
#   right - learner-minus-demonstrator cumulative reward per solver
# seaborn's lineplot aggregates the repeated (solver, iteration) observations
# across episodes when drawing each line.
ncols = 2
nrows = 1
# x-position of the dotted vertical marker drawn on both panels; it is labelled
# 'number of initial iterations' below.
initial_iterations_marker = 100

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6*ncols,6))

sns.lineplot(data=results_long_df, x="iteration", y="cumulative_reward", hue="solver", style="agent", ax=axes[0])
axes[0].set_ylabel('Average Cumulative Reward')

#sns.lineplot(data=avg_cumulative_reward_df, x="iteration", y="delta_of_avg_cumulative_reward", hue="solver", ax=axes[1])
#axes[1].set_ylabel(r'$\Delta$(Average(Cumulative Reward))')
#axes[1].set_xlabel('Iteration')
#axes[1].axvline(x=100, ls=':', lw=2, label='number of initial iterations', color='grey')

sns.lineplot(data=delta_df, x="iteration", y="delta_of_cumulative_reward", hue="solver", ax=axes[1])
# Raw string: '\D' is not a valid Python escape sequence, so the non-raw form
# emits an invalid-escape warning. The resulting text is unchanged.
axes[1].set_ylabel(r'Avg( $\Delta$(Cumulative Reward) )')

# Decorations shared by both panels.
for ax in axes:
    ax.set_xlabel('Iteration')
    ax.axvline(x=initial_iterations_marker, ls=':', lw=2, label='number of initial iterations', color='grey')

# Title for the experimental condition currently being plotted; the
# alternatives are kept for switching between conditions.
#plt.suptitle('Observe Current Iter')
#plt.suptitle('Observe Best Arm ($p_{exclude}=0$) for Greedy and Epsilon Greedy')
#plt.suptitle('Observe Best Arm ($p_{exclude}=1$) for Greedy and Epsilon Greedy')
#plt.suptitle('Observe Best Arm ($p_{exclude}=0.5$) for Greedy and Epsilon Greedy')
#plt.suptitle('Observe Simulatenously and Current Iter')
#plt.suptitle('Observe Best Arm ($p_{exclude}=0$) and Observe Simultaneously')
plt.suptitle('Observe Best Arm ($p_{exclude}=1$) and Observe Simultaneously')

plt.show()
