In [6]:
%load_ext autoreload
%autoreload 2

import numpy as np
import os
import torch 
os.chdir("../..")

from aexgym.env import ConstraintPersSyntheticEnv
from aexgym.model import PersonalizedLinearModel
from aexgym.agent import LinearTS, LinearUniform, LinearUCB, LinearRho
from aexgym.objectives import contextual_best_arm, contextual_simple_regret
from scripts.setup_script import make_uniform_prior

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
n_days = 5
n_arms = 10
context_len = 5
n_steps = n_days 
batch_size = 100
s2 = 0.2 * torch.ones((n_days, 1))

if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)


cpu


In [101]:
#personalization 

#initialize parameterss
n_objs = 1
scaling = 1 / (batch_size*10)
pers_beta, pers_sigma = make_uniform_prior(context_len*n_arms, scaling, n_objs=n_objs)
context_mu, context_var = torch.ones(context_len), 1*torch.eye(context_len)
constraint_mu, constraint_var = torch.zeros(n_arms), 1*torch.eye(n_arms)
print(pers_beta)
pers_beta = 1*torch.ones_like(pers_beta)
#initialize synthetic and agent model 
model = PersonalizedLinearModel(
    beta_0 = pers_beta, 
    sigma_0 = pers_sigma, 
    n_arms = n_arms, 
    s2 = s2,  
    n_objs=n_objs,
)

#initialize synthetic environment
env = ConstraintPersSyntheticEnv(
    model = model, 
    context_mu = context_mu, 
    context_var = context_var, 
    context_len = context_len, 
    batch_size = batch_size, 
    n_steps = n_steps,
    constraint_mu = constraint_mu,
    constraint_var = constraint_var
)

tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]])


In [102]:
#initialize agent 
agent = LinearUniform(model, "Linear Uniform")
#agent = LinearTS(model, "Linear TS", toptwo=False, n_samples = 100, constraint=True, cost_weight=0.01)
#agent = LinearTS(model, "Linear TS", toptwo=True, n_samples = 100)
#agent = LinearRho(model, "Linear Rho", lr=0.4, weights= (0,1), cost_weight = 0)

In [103]:
'''REMEMBER STANDARDIZE
'''

print_probs = False
torch.manual_seed(0)
objective = contextual_simple_regret()
objective.weights = (0, 1)
torch.set_printoptions(sci_mode=False)
regret_list = []
percent_arms_correct_list = []
cost_list = []


for i in range(10000):
    env.reset()
    cost = 0
    #print(env.mean_matrix)
    cumul_regret = 0
    all_contexts, cur_step = env.reset()
    beta, sigma = agent.model.reset()
    #print(beta, sigma)
    beta, sigma = beta.to(device), sigma.to(device)
    
    while env.n_steps - cur_step > 0:

        #move to device 
        state_contexts, action_contexts, eval_contexts, costs = tuple(contexts.to(device) for contexts in all_contexts)
        #train agent 
        agent.train_agent( 
            beta = beta, 
            sigma = sigma, 
            cur_step = cur_step, 
            n_steps = n_steps, 
            train_context_sampler = env.sample_train_contexts, 
            eval_contexts = eval_contexts,
            eval_action_contexts = action_contexts, 
            real_batch = batch_size, 
            print_losses=False, 
            objective=objective,
            costs=costs,
            repeats=10000
        )    
        #get probabilities
        probs = agent(
            beta = beta, 
            sigma = sigma, 
            contexts = state_contexts, 
            action_contexts = action_contexts, 
            objective = objective,
            costs = costs 
        )
     
        #print probabilities 
        if print_probs == True:
            print(agent.name, env.n_steps - cur_step, probs)
        
        #get actions and move to new state
        actions = torch.distributions.Categorical(probs).sample()
        cost += (torch.mean(costs[actions]) - torch.min(costs)).item()
        #move to next environment state 
        all_contexts, sampled_rewards, sampled_features, cur_step  = env.step(
            state_contexts = state_contexts, 
            action_contexts = action_contexts, 
            actions = actions
        )

        rewards = objective(
            agent_actions = actions,
            true_rewards = env.get_true_rewards(state_contexts, action_contexts)
        )

        cumul_regret += rewards['regret']
        
        #update model state 
        beta, sigma = agent.model.update_posterior(
            beta = beta, 
            sigma = sigma, 
            rewards = sampled_rewards, 
            features = agent.model.feature_map(actions, state_contexts, action_contexts), 
            idx = cur_step-1
        )

    #get evaluation contexts and true rewards 
    eval_contexts = env.sample_eval_contexts(access=True).to(device)
    true_eval_rewards = env.get_true_rewards(eval_contexts, action_contexts)
    
    fantasy_rewards = agent.fantasize(beta, eval_contexts, action_contexts).to(device)
    agent_actions = torch.argmax(fantasy_rewards.squeeze(), dim=1)

    #calculate results from objective 
    results_dict = objective(
        agent_actions = agent_actions, 
        true_rewards = true_eval_rewards.to(device)
    )

    cumul_regret = cumul_regret / n_days
    results_dict['regret'] = objective.weights[0] * cumul_regret + objective.weights[1] * results_dict['regret']
    
    #append results 
    percent_arms_correct_list.append(results_dict['percent_arms_correct'])
    regret_list.append(results_dict['regret'])
    cost_list.append(cost)

    #print results 
    if i % 10 == 0:
        
        print(i, "Regret: ", np.mean(regret_list))
        print("Percent Arms Correct: ", np.mean(percent_arms_correct_list))
        print('cost', np.mean(cost_list))

0 Regret:  0.04553588479757309
Percent Arms Correct:  0.38
cost 3.5489953756332397
10 Regret:  0.04459281494332985
Percent Arms Correct:  0.48181818181818187
cost 3.4576949693939905
20 Regret:  0.048434684540899026
Percent Arms Correct:  0.4723809523809523
cost 3.4240952900477817
30 Regret:  0.049373984411959684
Percent Arms Correct:  0.44032258064516133
cost 3.384240520577277
40 Regret:  0.050387714819119474
Percent Arms Correct:  0.43341463414634146
cost 3.418499570794222
50 Regret:  0.04974864623235429
Percent Arms Correct:  0.41745098039215683
cost 3.4343196083517635
60 Regret:  0.04969234093565677
Percent Arms Correct:  0.4186885245901639
cost 3.450667718394858
70 Regret:  0.04832845013982184
Percent Arms Correct:  0.4191549295774648
cost 3.4663860008750165
80 Regret:  0.04917505866974408
Percent Arms Correct:  0.4090123456790123
cost 3.4499665594395297
90 Regret:  0.049067592089688715
Percent Arms Correct:  0.41021978021978023
cost 3.4371942704195506
100 Regret:  0.04826767406527

KeyboardInterrupt: 