In [1]:
import numpy as np

from src.wrapper import RestrictionWrapper
from examples.agents.td3 import TD3
from examples.envs.navigation import NavigationEnvironment
from examples.restrictors.navigation_restrictor import NavigationRestrictor
from examples.utils import ReplayBuffer

In [2]:
env_config = {
    'STEPS_PER_EPISODE': 40,
    'ACTION_RANGE': 220,
    'DT': 1.0,
    'REWARD': {
        'TIMESTEP_PENALTY_COEFFICIENT': 0.05,
        'REWARD_COEFFICIENT': 5.0,
        'GOAL': 50,
        'COLLISION': -20.0
    },
    'MAP': {
        'HEIGHT': 15.0,
        'WIDTH': 15.0,
        'AGENT': {'x': 1.0, 'y': 1.0, 'angle': 90.0, 'step_size': 1.0, 'radius': 0.4},
        'GOAL': {'x': 12.0, 'y': 12.0, 'radius': 0.5}
    }
}
restrictor = NavigationRestrictor(4, [[5.0, 0.0], [0.0, 5.0]], 1.0, 0.2, 0.5, 50, 8, -110.0, 110.0)
environment = NavigationEnvironment(env_config)

restricted_environment = RestrictionWrapper(environment, restrictor)

In [3]:
td3_config = {
    'state_dim': 9,
    'action_dim': 1,
    'max_action': 220.0,
    'discount': 0.99,
    'tau': 0.005,
    'policy_noise': 0.2,
    'noise_clip:': 0.5,
    'policy_freq': 2,
    'exploration_noise': 0.05,
    'batch_size': 64,
    'train_after_timesteps': 2000
}

td3 = TD3(**td3_config)
replay_buffer = ReplayBuffer(state_dim=9, action_dim=1)

In [23]:
episode_reward = 0
episode_timesteps = 0
episode_num = 0
training_timesteps = 0

restricted_environment.reset()

for agent in restricted_environment.agent_iter():
    next_observation, reward, termination, truncation, info = restricted_environment.last()

    # None action if episode is done
    if termination or truncation:
        action = None
    # Turn of the agent
    elif agent == 'agent_0':
        episode_reward += reward
        episode_timesteps += 1
        if episode_timesteps > 1:
            replay_buffer.add(observation, action, next_observation, reward, termination)
        observation = next_observation

        training_timesteps += 1
        if training_timesteps < td3_config['train_after_timesteps']:
                action = restricted_environment.action_space(agent).sample()
        else:
            action = (
                td3.select_action(np.array(observation))
                + np.random.normal(0, td3_config['max_action'] * td3_config['exploration_noise'], size=td3_config['action_dim'])
            ).clip(-td3_config['max_action'], td3_config['max_action'])

        if training_timesteps >= td3_config['train_after_timesteps']:
            td3.train(replay_buffer, td3_config['batch_size'])
    # Or restrictor
    else:
        action = restrictor.act(next_observation)

    restricted_environment.step(action)

restricted_environment.close()

TypeError: float() argument must be a string or a real number, not 'dict'