In [0]:
%load_ext autoreload
%autoreload 2

from agents import banditagents
from environments import bandits
import utils

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
# Set the seaborn theme once, up front, so every figure drawn later in the
# notebook uses the "whitegrid" style and the "pastel" colour palette.
sns.set_theme(style="whitegrid", palette="pastel")

In [1]:
# --- Experiment configuration ------------------------------------------------
# Directory with one YAML file per environment and per solver. The original
# absolute path is kept as the default; set SOCIALMOTIVATION_CONFIGS_DIR to run
# the notebook on another machine without editing this cell.
configs_dir = os.environ.get(
    'SOCIALMOTIVATION_CONFIGS_DIR',
    '/Users/juliomartinez/Documents/PhD/socialmotivation/configs',
)
num_episodes = 50
# Each entry must name a class in `banditagents` and have a matching
# `<solver>.yaml` in `configs_dir`. Use a single-element list to run one solver.
solvers = ['GreedyAgent', 'EpsilonGreedyAgent']
environment = 'TwoArmBandit'

bandit_config_filename = os.path.join(configs_dir, environment + '.yaml')
bandit_config = utils.get_config(bandit_config_filename)
env = bandits.Bandit(bandit_config)
# Placeholder only: overwritten below by each solver's demonstrator config.
num_iterations = 1000

results_column_names = [
    'episode',
    'iteration',
    'solver',
    'demonstrator_rewards',
    'learner_rewards',
    'demonstrator_arm_id',
    'learner_arm_id',
]

# NOTE(review): no random seed is set in this cell; if the agents/environment
# are stochastic, seed their RNG before this loop for reproducible results.

# Collect per-episode and per-solver frames in lists and concatenate once at the
# end. This avoids quadratic concat-in-a-loop and, more importantly, avoids
# resetting `results_df` whenever a new solver starts at episode 0.
trial_frames = []
solver_frames = []

for solver in solvers:
    # Look up the agent class by name and load its configuration.
    agentClass = getattr(banditagents, solver)
    agents_config_filename = os.path.join(configs_dir, solver + '.yaml')
    agents_config = utils.get_config(agents_config_filename)

    # Loop-invariant for a given solver, so read it once.
    num_iterations = agents_config['demonstrator']['num_iterations']
    iterations = list(range(num_iterations))
    solvers_ = [solver] * num_iterations

    # Tally of how often each arm ended an episode as the agent's best arm.
    demonstrator_best_arm_distribution = np.zeros(env.num_arms)
    learner_best_arm_distribution = np.zeros(env.num_arms)

    for episode_j in range(num_episodes):
        # Run the demonstrator on its own.
        demonstrator = agentClass(agents_config['demonstrator'])
        demonstrator(env)

        # Run the learner, which is given the finished demonstrator.
        learner = agentClass(agents_config['learner'])
        learner(env, demonstrator)

        # Increment the tallies (`+= 1`). The previous `=+1` assigned the value
        # +1, so counts never rose above 1 and the distributions were wrong.
        demonstrator_best_arm_distribution[demonstrator.best_arm_id] += 1
        learner_best_arm_distribution[learner.best_arm_id] += 1

        # One row per iteration; zip truncates to the shortest history.
        episodes = [episode_j] * num_iterations
        trial_df = pd.DataFrame(
            list(zip(
                episodes,
                iterations,
                solvers_,
                demonstrator.reward_history,
                learner.reward_history,
                demonstrator.arm_id_history,
                learner.arm_id_history,
            )),
            columns=results_column_names,
        )
        trial_frames.append(trial_df)

    # Convert tallies to empirical probabilities over arms.
    demonstrator_best_arm_distribution = demonstrator_best_arm_distribution / np.sum(demonstrator_best_arm_distribution)
    learner_best_arm_distribution = learner_best_arm_distribution / np.sum(learner_best_arm_distribution)
    solver_df = pd.DataFrame(
        {
            'solver': solver,
            'num_episodes': num_episodes,
            'num_iterations': num_iterations,
            'prob_arm0': demonstrator_best_arm_distribution[0],
            'prob_arm1': demonstrator_best_arm_distribution[1],
        },
        index=[0],
    )
    solver_frames.append(solver_df)

# Single concatenation with a fresh index, so rows from different episodes and
# solvers do not share index labels. Fall back to empty frames when nothing ran.
if trial_frames:
    results_df = pd.concat(trial_frames, axis=0, ignore_index=True)
else:
    results_df = pd.DataFrame(columns=results_column_names)

if solver_frames:
    arm_dist_df = pd.concat(solver_frames, axis=0, ignore_index=True)
else:
    arm_dist_df = pd.DataFrame(
        columns=['solver', 'num_episodes', 'num_iterations', 'prob_arm0', 'prob_arm1']
    )


In [None]:
# Running totals within each (solver, episode) group. Selecting several columns
# needs a list (double brackets); a bare tuple of names raises KeyError.
# The grouping keys are dropped from the result, so the cumulative sum is taken
# over `iteration`, `demonstrator_rewards` and `learner_rewards`.
results_df[['solver', 'episode', 'iteration', 'demonstrator_rewards', 'learner_rewards']].groupby(['solver', 'episode']).cumsum()

In [None]:
# Distinct solver names present in the collected results.
results_df['solver'].unique()

In [None]:
# Instantiate banditagents.Person with three positional arguments.
julio = banditagents.Person(
    'Julio',
    "Martinez",
    32,
)

In [None]:
# Instantiate banditagents.Student with four positional arguments.
heather = banditagents.Student(
    'Heather',
    'Martinez',
    30,
    'hat',
)