In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import os
import torch 
os.chdir("../..")

from aexgym.env import PersSyntheticEnv
from aexgym.model import PersonalizedLinearModel
from aexgym.agent import LinearTS, LinearUniform, LinearUCB, LinearRho
from aexgym.objectives import contextual_best_arm, contextual_simple_regret
from scripts.setup_script import make_uniform_prior

In [2]:
n_days = 3
n_arms = 10
context_len = 5
n_steps = n_days 
batch_size = 100
s2 = 0.2 * torch.ones((n_days, 1))

if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
print(device)


cpu


In [3]:
#personalization 

#initialize parameterss
n_objs = 1
scaling = 1 / (batch_size*10)
pers_beta, pers_sigma = make_uniform_prior(context_len*n_arms, scaling, n_objs=n_objs)
context_mu, context_var = torch.ones(context_len), 5*torch.eye(context_len)

#initialize synthetic and agent model 
model = PersonalizedLinearModel(
    beta_0 = pers_beta, 
    sigma_0 = pers_sigma, 
    n_arms = n_arms, 
    s2 = s2,  
    n_objs=n_objs
)

#initialize synthetic environment
env = PersSyntheticEnv(
    model = model, 
    context_mu = context_mu, 
    context_var = context_var, 
    context_len = context_len, 
    batch_size = batch_size, 
    n_steps = n_steps
)





In [8]:
#initialize agent 
agent = LinearUniform(model, "Linear Uniform")
agent = LinearTS(model, "Linear TS", toptwo=False, n_samples = 100)
#agent = LinearTS(model, "Linear TS", toptwo=True, n_samples = 100)
#agent = LinearRho(model, "Linear Rho", lr=0.4)

In [9]:
print_probs = True
torch.manual_seed(0)
objective = contextual_simple_regret()
torch.set_printoptions(sci_mode=False)
regret_list = []
percent_arms_correct_list = []



for i in range(10000):
    env.reset()
    #print(env.mean_matrix)
    all_contexts, cur_step = env.reset()
    beta, sigma = agent.model.reset()
    #print(beta, sigma)
    beta, sigma = beta.to(device), sigma.to(device)
    
    while env.n_steps - cur_step > 0:

        #move to device 
        state_contexts, action_contexts, eval_contexts = tuple(contexts.to(device) for contexts in all_contexts)
        
        #train agent 
        agent.train_agent( 
            beta = beta, 
            sigma = sigma, 
            cur_step = cur_step, 
            n_steps = n_steps, 
            train_context_sampler = env.sample_train_contexts, 
            eval_contexts = eval_contexts,
            eval_action_contexts = action_contexts, 
            real_batch = batch_size, 
            print_losses=False, 
            objective=objective,
            repeats=10000
        )    
        #get probabilities
        probs = agent(
            beta = beta, 
            sigma = sigma, 
            contexts = state_contexts, 
            action_contexts = action_contexts, 
            objective = objective
        )
     
        #print probabilities 
        if print_probs == True:
            print(agent.name, env.n_steps - cur_step, torch.mean(probs, dim=0))
        
        #get actions and move to new state
        actions = torch.distributions.Categorical(probs).sample()
        
        #move to next environment state 
        all_contexts, sampled_rewards, sampled_features, cur_step  = env.step(
            state_contexts = state_contexts, 
            action_contexts = action_contexts, 
            actions = actions
        )
        
        #update model state 
        beta, sigma = agent.model.update_posterior(
            beta = beta, 
            sigma = sigma, 
            rewards = sampled_rewards, 
            features = agent.model.feature_map(actions, state_contexts, action_contexts), 
            idx = cur_step-1
        )

    #get evaluation contexts and true rewards 
    eval_contexts = env.sample_eval_contexts(access=True).to(device)
    true_eval_rewards = env.get_true_rewards(eval_contexts, action_contexts)
    
    #calculate results from objective 
    results_dict = objective(
        fantasy_rewards = agent.fantasize(beta, eval_contexts, action_contexts).to(device), 
        true_rewards = true_eval_rewards.to(device)
    )
    
    #append results 
    percent_arms_correct_list.append(results_dict['percent_arms_correct'])
    regret_list.append(results_dict['regret'])

    #print results 
    if i % 1 == 0:
        
        print(i, "Regret: ", np.mean(regret_list))
        print("Percent Arms Correct: ", np.mean(percent_arms_correct_list))

Linear TS 3 tensor([0.1042, 0.0933, 0.0996, 0.0993, 0.1019, 0.1044, 0.0993, 0.0971, 0.0966,
        0.1043])
Linear TS 2 tensor([0.1217, 0.1197, 0.0700, 0.0832, 0.1057, 0.0988, 0.0985, 0.0911, 0.1304,
        0.0809])
Linear TS 1 tensor([0.1269, 0.1494, 0.0694, 0.0739, 0.0789, 0.1019, 0.1125, 0.0676, 0.1600,
        0.0595])
0 Regret:  0.13528600335121155
Percent Arms Correct:  0.36
Linear TS 3 tensor([0.0993, 0.1042, 0.0995, 0.1018, 0.1013, 0.1000, 0.0977, 0.1000, 0.0986,
        0.0976])
Linear TS 2 tensor([0.1186, 0.0959, 0.1204, 0.0600, 0.1298, 0.1123, 0.0820, 0.1035, 0.1175,
        0.0600])
Linear TS 1 tensor([0.1987, 0.1239, 0.1364, 0.0550, 0.0932, 0.0953, 0.0708, 0.0664, 0.1046,
        0.0557])
1 Regret:  0.11222052946686745
Percent Arms Correct:  0.375
Linear TS 3 tensor([0.0976, 0.1024, 0.0973, 0.0994, 0.0977, 0.1037, 0.0978, 0.0998, 0.1070,
        0.0973])
Linear TS 2 tensor([0.1188, 0.1381, 0.1146, 0.1063, 0.0862, 0.0993, 0.0811, 0.1059, 0.0831,
        0.0666])
Linear TS

KeyboardInterrupt: 