In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import os
import torch 
os.chdir("../..")

from aexgym.env import PersSyntheticEnv
from aexgym.model import PersonalizedLinearModel
from aexgym.agent import LinearTS, LinearUniform, LinearUCB, LinearRho
from aexgym.objectives import contextual_best_arm, contextual_simple_regret
from scripts.setup_script import make_uniform_prior

In [2]:
# Experiment dimensions.
n_days, n_arms, context_len = 5, 10, 5
n_steps = n_days  # the horizon is set equal to the number of days
batch_size = 100

# Per-day value of 0.2 stored as an (n_days, 1) column; passed as `s2` to the model.
s2 = torch.ones((n_days, 1)) * 0.2

# Use the first CUDA device when one is available, otherwise stay on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)


cuda:0


In [3]:
# Personalization setup: prior, linear model and synthetic environment.

# Prior parameters, sized by context_len * n_arms and scaled by 1 / (10 * batch_size).
n_objs = 1
scaling = 1 / (10 * batch_size)
pers_beta, pers_sigma = make_uniform_prior(
    context_len * n_arms,
    scaling,
    n_objs=n_objs,
)

# Context mean (all ones) and context covariance (identity) handed to the environment.
context_mu = torch.ones(context_len)
context_var = torch.eye(context_len)

# Linear model built from the prior; the same object is passed to the
# environment here and to the agent elsewhere in the notebook.
model = PersonalizedLinearModel(
    beta_0=pers_beta,
    sigma_0=pers_sigma,
    n_arms=n_arms,
    s2=s2,
    n_objs=n_objs,
)

# Synthetic environment wrapping the model.
env = PersSyntheticEnv(
    model=model,
    context_mu=context_mu,
    context_var=context_var,
    context_len=context_len,
    batch_size=batch_size,
    n_steps=n_steps,
)





In [22]:
# Initialize the agent under evaluation.
# Exactly one constructor should be active; the alternatives stay commented
# out so the agent can be switched by moving the comment marker.
#agent = LinearUniform(model, "Linear Uniform")
agent = LinearTS(model, "Linear TS", toptwo=False, n_samples = 100)
#agent = LinearTS(model, "Linear TS", toptwo=True, n_samples = 100)
#agent = LinearRho(model, "Linear Rho", lr=0.4, weights= (0,1))

In [23]:
# Simulation / evaluation loop.
#
# Each run resets the environment and the agent's model state, plays the
# experiment step by step (train agent -> get probabilities -> sample actions ->
# step environment -> update posterior), then scores the final posterior on
# freshly sampled evaluation contexts. Running averages are printed periodically.

# Run configuration.
print_probs = False      # set True to print the agent's probabilities at every step
n_runs = 10000           # number of independent simulated experiments
train_repeats = 10000    # forwarded to agent.train_agent as `repeats`
log_every = 10           # print running averages every `log_every` runs

torch.manual_seed(0)
torch.set_printoptions(sci_mode=False)

# objective.weights[0] multiplies the average within-experiment regret and
# objective.weights[1] multiplies the post-experiment evaluation regret
# (see the combination at the end of each run).
objective = contextual_simple_regret()
objective.weights = (0, 1)

regret_list = []
percent_arms_correct_list = []

for i in range(n_runs):
    cumul_regret = 0

    # One reset per run; its return value provides the initial contexts and step.
    all_contexts, cur_step = env.reset()
    beta, sigma = agent.model.reset()
    beta, sigma = beta.to(device), sigma.to(device)

    while env.n_steps - cur_step > 0:

        # Move the current contexts to the compute device.
        state_contexts, action_contexts, eval_contexts = tuple(contexts.to(device) for contexts in all_contexts)

        # Train the agent for the current step.
        agent.train_agent(
            beta = beta,
            sigma = sigma,
            cur_step = cur_step,
            n_steps = n_steps,
            train_context_sampler = env.sample_train_contexts,
            eval_contexts = eval_contexts,
            eval_action_contexts = action_contexts,
            real_batch = batch_size,
            print_losses=False,
            objective=objective,
            repeats=train_repeats
        )

        # Query the agent for its probabilities over actions.
        probs = agent(
            beta = beta,
            sigma = sigma,
            contexts = state_contexts,
            action_contexts = action_contexts,
            objective = objective
        )

        if print_probs:
            print(agent.name, env.n_steps - cur_step, probs)

        # Sample actions from the returned probabilities.
        actions = torch.distributions.Categorical(probs).sample()

        # Advance the environment; cur_step is the step index after this move.
        all_contexts, sampled_rewards, sampled_features, cur_step  = env.step(
            state_contexts = state_contexts,
            action_contexts = action_contexts,
            actions = actions
        )

        # Score the sampled actions against the true rewards for this step.
        rewards = objective(
            agent_actions = actions,
            true_rewards = env.get_true_rewards(state_contexts, action_contexts)
        )

        cumul_regret += rewards['regret']

        # Posterior update with the observed rewards; idx refers to the step
        # that was just played, hence cur_step - 1.
        beta, sigma = agent.model.update_posterior(
            beta = beta,
            sigma = sigma,
            rewards = sampled_rewards,
            features = agent.model.feature_map(actions, state_contexts, action_contexts),
            idx = cur_step-1
        )

    # Post-experiment evaluation on freshly sampled evaluation contexts.
    eval_contexts = env.sample_eval_contexts(access=True).to(device)
    true_eval_rewards = env.get_true_rewards(eval_contexts, action_contexts)

    # Pick, per evaluation context, the arm with the highest fantasized reward.
    fantasy_rewards = agent.fantasize(beta, eval_contexts, action_contexts).to(device)
    agent_actions = torch.argmax(fantasy_rewards.squeeze(), dim=1)

    results_dict = objective(
        agent_actions = agent_actions,
        true_rewards = true_eval_rewards.to(device)
    )

    # Combine average within-experiment regret with evaluation regret.
    cumul_regret = cumul_regret / n_days
    results_dict['regret'] = objective.weights[0] * cumul_regret + objective.weights[1] * results_dict['regret']

    percent_arms_correct_list.append(results_dict['percent_arms_correct'])
    regret_list.append(results_dict['regret'])

    # Periodic progress report: running means over all completed runs.
    if i % log_every == 0:
        print(i, "Regret: ", np.mean(regret_list))
        print("Percent Arms Correct: ", np.mean(percent_arms_correct_list))

0 Regret:  0.060665540397167206
Percent Arms Correct:  0.36
10 Regret:  0.05018237927420573
Percent Arms Correct:  0.3918181818181818
20 Regret:  0.045457759650335425
Percent Arms Correct:  0.40523809523809523
30 Regret:  0.04660983699103517
Percent Arms Correct:  0.41580645161290325
40 Regret:  0.048848049397148735
Percent Arms Correct:  0.40292682926829265
50 Regret:  0.04746605415700698
Percent Arms Correct:  0.4098039215686275
60 Regret:  0.046968172960838335
Percent Arms Correct:  0.41081967213114756
70 Regret:  0.048052471603306245
Percent Arms Correct:  0.4019718309859155
80 Regret:  0.046316861715397714
Percent Arms Correct:  0.4154320987654321
90 Regret:  0.04750283258956867
Percent Arms Correct:  0.40571428571428575
100 Regret:  0.04676293094854544
Percent Arms Correct:  0.40584158415841587
110 Regret:  0.047612878082840294
Percent Arms Correct:  0.40432432432432436
120 Regret:  0.046184684388524244
Percent Arms Correct:  0.41752066115702474
130 Regret:  0.04615981278599787
P

KeyboardInterrupt: 