In [1]:
import os
from pathlib import Path

import numpy as np

import config.config as config
from Agent import *
from Model import *

In [2]:
class Env():
    """Batched box-pushing task with gamma-distributed reward intervals.

    Every trial in the batch has ``gamma_mus.size`` boxes. Each box becomes rewarding once its
    timer reaches a sampled interval; pushing a box resets its timer and resamples its interval.
    One extra action index (``gamma_mus.size``) means "push nothing": it costs nothing and its
    interval is pinned at 10000.

    Parameters
    ----------
    batch_size : int, optional
        Number of parallel trials. When omitted, the notebook-level ``arg.batch_size`` is used.
    """

    def __init__(self, batch_size=None):
        self.gamma_shape = 10    # defines Gamma dist.
        self.gamma_mus = np.array([3, 6, 9])
        # Only touch the global config when the caller did not supply a batch size.
        self.batch_size = arg.batch_size if batch_size is None else batch_size

    def _sample_intervals(self, mus):
        """Draw rounded gamma intervals with mean ``mus``, clipped to [1, 10000]."""
        return np.round(np.random.gamma(shape=self.gamma_shape,
                                        scale=mus / self.gamma_shape)).clip(1, 10000)

    def reset(self):
        """Start a new rollout: zero all timers, reshuffle box means and resample intervals."""
        num_boxes = self.gamma_mus.size
        # one extra column for the "push nothing" action
        self.box_time_count = np.zeros((self.batch_size, num_boxes + 1))
        self.box_mus = np.ones((self.batch_size, num_boxes))
        # 10000 in the extra column keeps the "push nothing" option effectively never rewarding
        self.box_sampled_interval = np.ones_like(self.box_time_count) * 10000
        for trial_idx in range(self.batch_size):
            self.box_mus[trial_idx] = np.random.permutation(self.gamma_mus)    # randomize mus per trial

        self.box_sampled_interval[:, :num_boxes] = self._sample_intervals(self.box_mus)
        # Sample a std of observation noise from the range [0, 1)
        self.obs_std = np.random.rand(self.batch_size, 1) * 1

    def get_obs(self):
        """Return noisy observations of shape (batch_size, gamma_mus.size + 1).

        The first columns are each box's timer divided by its interval, clipped to [0, 1]
        (1 means the reward is available) and perturbed by multiplicative Gaussian noise; the
        last column is the noise std itself.
        """
        obs = (self.box_time_count / self.box_sampled_interval)[:, :self.gamma_mus.size].clip(0, 1)
        # noise scales with the observation, so a zero ratio is always observed exactly
        obs += obs * np.random.randn(*obs.shape) * self.obs_std
        # provide extra observation uncertainty as agent input
        obs = np.concatenate([obs, self.obs_std], axis=-1)
        return obs

    def push_button(self, box_idx):
        """Apply one action per trial and advance time by one step.

        Parameters
        ----------
        box_idx : integer array
            Chosen option for every trial; ``gamma_mus.size`` means "push nothing". A 0-d index
            (as produced by squeezing a batch of one) is accepted.

        Returns
        -------
        tuple of two float arrays
            ``(reward_shaped, reward)``. Reward shaping is currently disabled, so both hold the
            same values: 1 if the pushed box was ready else 0, minus a 0.01 cost for pushing a
            real box.
        """
        box_idx = np.atleast_1d(box_idx)
        rows = np.arange(box_idx.size)
        null_idx = self.gamma_mus.size    # index of the "push nothing" option

        # reward = 1 if the pushed box's timer has reached its interval, otherwise 0
        reward = (self.box_time_count[rows, box_idx] >=
                  self.box_sampled_interval[rows, box_idx]).astype(float)

        # cost of pushing a button; not pushing any box is free
        cost = -np.ones_like(reward) * 0.01
        cost[box_idx == null_idx] = 0

        # time step update: every timer advances, the pushed one restarts from zero
        self.box_time_count += 1
        self.box_time_count[rows, box_idx] = 0

        # resample the interval of every real box that was pushed
        pushed = box_idx != null_idx
        self.box_sampled_interval[pushed, box_idx[pushed]] = self._sample_intervals(
            self.box_mus[pushed, box_idx[pushed]])

        net = reward + cost
        # hand out two independent arrays so callers may modify one without affecting the other
        return net, net.copy()

In [3]:
# params
seed_number = 2
# Root folder of the project outputs. The original location stays the default so existing runs
# are unaffected; set the ML_PROJECT_DIR environment variable to run the notebook elsewhere
# without editing this cell.
default_project_dir = r'C:\Users\Panos\OneDrive - nyu.edu\Documents\PhD Classes\Machine Learning\ML_Project\Code'
# the config object receives <project dir>/seed<seed_number>
datapath = Path(os.environ.get('ML_PROJECT_DIR', default_project_dir)) / f'seed{seed_number}'
arg = config.ConfigCore(datapath)
arg.SEED_NUMBER = seed_number
arg.save()

# reproducibility: seed PyTorch (CPU and every CUDA device) and NumPy's global generator,
# which Env draws all of its random numbers from
torch.manual_seed(seed_number)
torch.cuda.manual_seed_all(seed_number)
np.random.seed(seed_number)
# request deterministic cuDNN kernels and switch off the cuDNN autotuner
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [4]:
# Build the task environment and the learning agent. By default Env takes its batch size from
# the global `arg`, so the config cell must have been run first.
# NOTE(review): Agent and ActorCritic are not defined in this notebook; presumably they come
# from the star imports of Agent / Model — confirm.
env = Env()
agent = Agent(arg, ActorCritic)

In [5]:
# number of training episodes to run; convergence is usually seen within 100-200 of them
num_trial = 201

# per-episode running statistics, all starting from zero
reward_log = 0
reward_num = 0
done_num = 0

# make sure the agent's buffer holds nothing from an earlier run of this cell
agent.buffer.clear()

def get_init_variables():
    """Return the all-zero agent input and hidden state used at the start of a rollout.

    Returns
    -------
    state : tensor of shape (1, arg.batch_size, 7)
        Initial agent input. NOTE(review): 7 presumably equals the width of the state assembled
        in the training loop (4 observation values + action + reward + time fraction) — confirm.
    hiddenin : tuple of two tensors, each of shape (1, arg.batch_size, arg.RNNSELF_SIZE)
        Initial hidden state handed to the agent.
    """
    def _zeros(width):
        # every tensor shares the leading (1, batch) layout and lives on the configured device
        return torch.zeros(1, arg.batch_size, width, device=arg.device)

    initial_state = _zeros(7)
    initial_hidden = (_zeros(arg.RNNSELF_SIZE), _zeros(arg.RNNSELF_SIZE))
    return initial_state, initial_hidden


# Number of truncated segments that make up one full rollout. Integer arithmetic keeps the reset
# test below exact; a float ratio would almost never divide the episode index evenly when
# full_len is not a multiple of truncated_len, so the environment would stop being reset.
segments_per_rollout = max(1, arg.full_len // arg.truncated_len)

for i_epi in range(num_trial):
    # start a new rollout: zero the agent input / hidden state and re-randomise the boxes
    if i_epi % segments_per_rollout == 0:
        state, hiddenin = get_init_variables()
        env.reset()

    # remember the hidden state this segment starts from
    agent.buffer.actorhidden0s.append(hiddenin)

    states = []; actions = []; actionlogprobs = []; rewards = []

    for t in range(arg.truncated_len):
        action, action_logprob, hiddenout, dist = agent.select_action(state, hiddenin,
                                                                      enable_noise=True, return_dist=True)
        # One index per trial. reshape(-1) stays 1-D even for a batch of one, where squeeze()
        # would collapse the array to a 0-d scalar.
        action_cpu = action.cpu().reshape(-1).numpy()
        reward_shaped, reward = env.push_button(action_cpu)
        reward_shaped = torch.tensor(reward_shaped, device=arg.device,
                                     dtype=torch.float).reshape(1, -1, 1) * arg.REWARD_SCALE
        reward = torch.tensor(reward, device=arg.device, dtype=torch.float).reshape(1, -1, 1)
        # store data
        reward_log += reward.mean()
        reward_num += reward.sum()

        states.append(state)
        actions.append(action)
        actionlogprobs.append(action_logprob)
        rewards.append(reward_shaped)

        # next step: the agent sees the new observation, its own previous action (scaled by 1/3),
        # the reward it just received and how far into the segment it is
        next_obs = env.get_obs()
        next_obs = torch.tensor(next_obs, device=arg.device, dtype=torch.float).unsqueeze(0)
        next_state = torch.cat([next_obs, action / 3, reward, torch.ones_like(reward) * t / arg.truncated_len], dim=-1)

        # update variables
        hiddenin = hiddenout
        state = next_state

    # the state reached after the last action is stored too, so `states` holds
    # truncated_len + 1 entries while the other lists hold truncated_len
    states.append(state)
    states = torch.cat(states); agent.buffer.states.append(states)
    actions = torch.cat(actions); agent.buffer.actions.append(actions)
    actionlogprobs = torch.cat(actionlogprobs); agent.buffer.actionlogprobs.append(actionlogprobs)
    rewards = torch.cat(rewards); agent.buffer.rewards.append(rewards)


    # update model
    losses = agent.learn()
    agent.save(i_epi)

    # print
    entropy = losses[2].cpu().numpy()
    # reward_num is summed over truncated_len steps and zeroed every episode, so the per-step
    # fraction is normalised by the segment length rather than the full rollout length
    reward_frac = (reward_num.item()) / (arg.batch_size * arg.truncated_len)

    print(f'epi {i_epi},', f'reward_avg {reward_log.item(): .3f},', f'reward_num {reward_num.item(): .0f},',
          f'reward_frac {reward_frac: .3f},', f'entropy {np.round(entropy.item(), 3)}')

    agent.buffer.clear()

    reward_log = 0
    reward_num = 0
    done_num = 0

epi 0, reward_avg  177.330, reward_num  88665, reward_frac  0.177, entropy 1.385
epi 1, reward_avg  181.420, reward_num  90710, reward_frac  0.181, entropy 1.377
epi 2, reward_avg  190.140, reward_num  95070, reward_frac  0.190, entropy 1.35
epi 3, reward_avg  202.938, reward_num  101469, reward_frac  0.203, entropy 1.307
epi 4, reward_avg  216.467, reward_num  108234, reward_frac  0.216, entropy 1.244
epi 5, reward_avg  229.560, reward_num  114780, reward_frac  0.230, entropy 1.16
epi 6, reward_avg  245.090, reward_num  122545, reward_frac  0.245, entropy 1.065
epi 7, reward_avg  258.137, reward_num  129068, reward_frac  0.258, entropy 0.999
epi 8, reward_avg  268.704, reward_num  134352, reward_frac  0.269, entropy 0.887
epi 9, reward_avg  282.699, reward_num  141349, reward_frac  0.283, entropy 0.813
epi 10, reward_avg  291.293, reward_num  145646, reward_frac  0.291, entropy 0.709
epi 11, reward_avg  303.393, reward_num  151697, reward_frac  0.303, entropy 0.682
epi 12, reward_avg 

KeyboardInterrupt: 