In [None]:
# --- Imports: standard library, third-party, then project-local -------------
import datetime
import random
import time
from collections import deque

import numpy as np
import pandas as pd
import torch

from DDPGv2Agent import Agent, Noise
from Config import Config

# Bounded buffer for rewards: holds at most 100 entries.
rewards = deque(maxlen=100)

# Read configuration parameters.
arg = Config()

# Fix the random seed of every generator in use (Python, NumPy, PyTorch CPU
# and, when a GPU is present, PyTorch CUDA), all from the configured seed.
random.seed(arg.SEED_NUMBER)
torch.manual_seed(arg.SEED_NUMBER)
if torch.cuda.is_available():
    torch.cuda.manual_seed(arg.SEED_NUMBER)
np.random.seed(arg.SEED_NUMBER)

# cuDNN reproducibility flags: deterministic algorithms on, benchmarking off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Identifier of the saved agent; it is used to locate the agent's training log.
filename = '20191016-205855' # agent information
df = pd.read_csv('../firefly-inverse-data/data/' + filename + '_log.csv',
                 usecols=['discount_factor','process gain forward', 'process gain angular', 'process noise std forward',
                          'process noise std angular', 'obs gain forward', 'obs gain angular', 'obs noise std forward',
                          'obs noise std angular', 'goal radius'])

# Settings recovered from the log.
# The discount factor is taken from the first logged row; `.iloc[0]` selects it
# by position, so it does not depend on the frame's index labels.
DISCOUNT_FACTOR = df['discount_factor'].iloc[0]

# [forward min, forward max, angular min, angular max] of the logged process
# gains, widened outward to whole numbers with floor/ceil.
gains_range = [np.floor(df['process gain forward'].min()), np.ceil(df['process gain forward'].max()),
               np.floor(df['process gain angular'].min()), np.ceil(df['process gain angular'].max())]

# [forward min, forward max, angular min, angular max] of the logged process
# noise standard deviations (no rounding applied).
std_range = [df['process noise std forward'].min(), df['process noise std forward'].max(),
             df['process noise std angular'].min(), df['process noise std angular'].max()]

# [min, max] of the logged goal radius.
goal_radius_range = [df['goal radius'].min(), df['goal radius'].max()]

# Every consumer further down (env.reset, Agent, agent.Bstep.reset) reads these
# settings from `arg`, not from the local names above. Without writing them
# back, the values recovered from the log were silently ignored and the
# Config defaults were used instead.
# Values are converted to built-in floats so `arg` does not carry NumPy scalars.
# NOTE(review): assumes Config keeps these as plain sequences of numbers and a
# scalar discount factor -- confirm against Config.
arg.DISCOUNT_FACTOR = float(DISCOUNT_FACTOR)
arg.gains_range = [float(v) for v in gains_range]
arg.std_range = [float(v) for v in std_range]
arg.goal_radius_range = [float(v) for v in goal_radius_range]


# Build the environment from the configuration.
# NOTE(review): `Model` is not imported in any cell visible here -- confirm
# where it comes from and import it at the top of the notebook.
env = Model(arg)

# Reset the environment; it returns the initial state together with the
# process gains, process noise stds and goal radius it is using.
(x, pro_gains, pro_noise_stds, goal_radius) = env.reset(
    arg.gains_range,
    arg.std_range,
    arg.goal_radius_range,
)

# Dimensions the agent needs, as reported by the environment.
state_dim, action_dim = env.state_dim, env.action_dim

MAX_EPISODE = 100

# Action noise: zero mean with a very small std (0.05 is the alternative
# value noted next to the original assignment).
std = 0.00001
noise = Noise(action_dim, mean=0.0, std=std)

# Create the agent and load the saved one identified by `filename`.
agent = Agent(
    state_dim,
    action_dim,
    arg,
    filename,
    hidden_dim=128,
    gamma=arg.DISCOUNT_FACTOR,
    tau=0.001,
)
agent.load(filename)

# Float counters, both starting at zero.
tot_t = 0.0
episode = 0.0

# Reset the monkey's internal model (belief), starting at time zero.
(b, state, obs_gains, obs_noise_stds) = agent.Bstep.reset(
    x,
    torch.zeros(1),
    pro_gains,
    pro_noise_stds,
    goal_radius,
    arg.gains_range,
    arg.std_range,
)



In [None]:
# Counter for the number of time steps taken to catch a firefly.
t = torch.zeros(1)

# Bundle the process and observation parameters with the goal radius; the
# tuple is later handed to the belief model when it builds the policy state.
theta = (
    pro_gains,
    pro_noise_stds,
    obs_gains,
    obs_noise_stds,
    goal_radius,
)

In [None]:
# Ask the agent for an action in the current state, with action noise applied.
action = agent.select_action(
    state,
    action_noise=noise,
    param=None,
)


In [None]:

# Step the environment with the flattened action to track the monkey's true
# next state, and whether the target was reached.
next_x, reached_target = env(x, action.view(-1))

# Observation of the new true state.
next_ox = agent.Bstep.observations(next_x)

# Belief update. Per the original note, info['stop'] marks a terminal step
# and the reward depends only on the belief.
next_b, info = agent.Bstep(b, next_ox, action, env.box)

# The state fed to the policy is a reshaped form of the belief, not the
# belief itself.
next_state = agent.Bstep.Breshape(next_b, t, theta)