In [57]:
import torch
from agents.debn_ps import *
from environments.env_gridworld import *
from tqdm import tqdm 

In [19]:
envs = TaskEnvironment((10,10))

In [20]:
envs.num_percepts_list

(10, 10)

In [21]:

print(envs.position)
envs.step(1)
print(envs.position)

[0 0]
[0 1]


In [22]:
envs.step(2), envs.position

((array([0, 1]), 0.0, False, None), array([0, 1]))

#### Experiments

In [23]:
# INTERACTION PARAMETERS #
#------------------------#
EPISODES = 1200  # number of episodes for each agent 2000
MAX_STEPS_PER_TRIAL = 10  # number of allowed timesteps before reset 20000

#agent parameters
AGENT_NUMBER = 1
DEVICE = 'cpu'
AGENT_NAME = 'ebm'

GAMMA = 0.99 # discount factor
hidden_layers = 1 # number of hidden layers
hidden_units_layer = 64 # number of hidden units
NUM_HIDDEN = [int(round(hidden_units_layer))]*hidden_layers # list of hidden unit numbers list
DROPOUT = [0.]*hidden_layers # dropout rate list
LEARNING_RATE = 0.001 # learning rate
CAPACITY = 5000 # size of the memory
BATCH_SIZE = 100 # size of the training batch for experience replay
REPLAY_TIME = 100 # the time interval between each experience replay
TARGET_UPDATE = 100 # update interval for the target network
SAVE_MODEL = False #set to true to save state dict

BETA_i = 0.001 # initial beta parameter for schedule
BETA_f = 0.8 # final beta parameter for schedule
SCHEDULE = 'htan' # name of the schedule
if SCHEDULE == 'htan':
    beta = np.tanh(np.linspace(BETA_i, BETA_f, EPISODES)) # tanh schedule
elif SCHEDULE =='lin':
    beta = np.linspace(BETA_i, BETA_f, EPISODES) # linear scchedule

# ENVIRONMENT PARAMETERS
ENV_NAME = 'gridworld' # environment name
DIMENSIONS = [100,100] # 2D grid of size [100,100]
env = TaskEnvironment(DIMENSIONS) #generate environment
percept_size = DIMENSIONS[0]+DIMENSIONS[1] #size of the percept space
action_size = 4 # size of the action space

#action encoding
all_actions = torch.empty(0)
for i in range(action_size):
    a = torch.zeros((1, 1, action_size))
    a = a.new_full((1, 1, action_size), 0.)
    a[0, 0, i] = 1.
    all_actions = torch.cat((all_actions, a))

#percept encoding
def to_two_hot(percept, dim):
    """
    Two-hot encodes the 2D percept of positions.
    """
    one_hot = np.zeros(dim[0]+dim[1])
    one_hot[percept[0]] = 1
    one_hot[dim[0]+percept[1]] = 1
    return one_hot


In [24]:
# DEFINE AGENT           #
#------------------------#
agent = DEBNAgent(percept_size, action_size, all_actions, dim_hidden=NUM_HIDDEN, dropout_rate=DROPOUT,
                    device = DEVICE, learning_rate=LEARNING_RATE, capacity=CAPACITY, batch_size=BATCH_SIZE, replay_time=REPLAY_TIME,
                    target_update = TARGET_UPDATE, gamma = GAMMA, train_output_weights = False)


In [33]:
agent.all_actions, agent.target_update, agent.gamma, agent._optimizer, agent._target_net ;

In [37]:
counter = 0
#reset the environment
percept = env.reset()
percept = to_two_hot(percept,DIMENSIONS)
percept = np.reshape(percept, [1, percept_size])
percept = torch.Tensor(percept)
reward = 0.
done = False

In [42]:
agent.deliberate(percept, 1)

tensor([[0., 1., 0., 0.]])

In [54]:
action = agent.deliberate_and_learn(percept, None, reward, GAMMA, done, 1.00)
action = (action[0] == 1).nonzero().item()

In [55]:
env.step(action)

(array([0, 1]), 0.0, False, None)

In [56]:
# RUN TRAINING STEPS     #
#------------------------#
timesteps = []
for e in range(EPISODES):
    counter = 0
    #reset the environment
    percept = env.reset()
    percept = to_two_hot(percept,DIMENSIONS)
    percept = np.reshape(percept, [1, percept_size])
    percept = torch.Tensor(percept)
    reward = 0.
    done = False
    for t in tqdm(range(1, MAX_STEPS_PER_TRIAL + 1), desc= 'runnnig episode . .. '+str(e) ):
        action = agent.deliberate_and_learn(percept, None, reward, GAMMA, done, beta[e])
        action = (action[0] == 1).nonzero().item()
        print('action : ', action)
        percept, reward, done, _ = env.step(action)
        print('percept : ', percept)
        print('reward : ', reward)
        percept = to_two_hot(percept, DIMENSIONS)
        percept = np.reshape(percept, [1, percept_size])
        percept = torch.Tensor(percept)
        
        if t==MAX_STEPS_PER_TRIAL:
            reward = -1
            done = True
        if done:
            agent.deliberate_and_learn(percept, None, reward, GAMMA, done, beta[e])
            timesteps.append(t)
            break

    if e%100 == 0:
        print("Average last 100 scores (timesteps per episode) the agent achieved at " + str(e) + ": ", np.mean(timesteps[-100:]))
        # save data to file


action :  1
percept :  tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])
reward :  0.0
action :  0
percept :  tensor([[0., 1., 0., 0., 0., 0

KeyboardInterrupt: 