In [1]:
import numpy as np
import torch

from src.utils import flatten
from src.wrapper import RestrictionWrapper
from examples.agents.td3 import TD3
from examples.envs.navigation import NavigationEnvironment
from examples.restrictors.navigation_restrictor import NavigationRestrictor
from examples.utils import ReplayBuffer

In [45]:
# Settings dictionary that is handed to NavigationEnvironment below.
env_config = {
    'HEIGHT': 15.0,
    'WIDTH': 15.0,
    'STEPS_PER_EPISODE': 40,
    'ACTION_RANGE': 220,
    'DT': 1.0,
    'TIMESTEP_PENALTY_COEFFICIENT': 0.05,
    'REWARD_COLLISION': -1.0,
    'REWARD_GOAL': 5.0,
    'REWARD_COEFFICIENT': 5.0,
    'AGENT_RADIUS': 0.5,
    'AGENT_PERSPECTIVE': 90,
    'AGENT_STEP_SIZE': 1.0,
    'AGENT_X': 1.5,
    'AGENT_Y': 1.5,
    'GOAL_RADIUS': 1.0,
    'GOAL_X': 12.0,
    # 'GOAL_Y' matches the upper-case spelling of every other key
    # ('AGENT_X', 'AGENT_Y', 'GOAL_X'). The original lower-case 'GOAL_y'
    # spelling is kept alongside it because the environment's key lookup is not
    # visible from this notebook.
    # TODO(review): drop 'GOAL_y' once the environment is confirmed to read 'GOAL_Y'.
    'GOAL_Y': 12.0,
    'GOAL_y': 12.0
}
# Create the restrictor and the raw navigation environment, then couple both
# through the RestrictionWrapper that the training loop interacts with.
# NOTE(review): the restrictor takes its settings positionally and their meaning
# is not visible here; -110.0 / 110.0 presumably mirror the +/-110 action bound
# used for the agent (ACTION_RANGE = 220) -- confirm against NavigationRestrictor.
restrictor = NavigationRestrictor(
    0,
    [[5.0, 0.0], [0.0, 5.0]],
    1.0,
    0.2,
    0.5,
    50,
    8,
    -110.0,
    110.0,
)
environment = NavigationEnvironment(env_config)

restricted_environment = RestrictionWrapper(environment, restrictor)

In [54]:
# Hyperparameters for the TD3 agent. The whole dictionary is unpacked into the
# TD3 constructor as keyword arguments, and individual entries are also read
# directly by the training loop, so every key must be a plain identifier.
td3_config = {
    'state_dim': 22,
    'action_dim': 1,
    'max_action': 110.0,
    'discount': 0.99,
    'tau': 0.005,
    'policy_noise': 0.2,
    # Was spelled 'noise_clip:' (stray colon inside the string), which can never
    # match a keyword parameter, so the configured value was not applied.
    'noise_clip': 0.5,
    'policy_freq': 2,
    'exploration_noise': 0.05,
    'batch_size': 256,
    'train_after_timesteps': 2000,
    'learning_rate_actor': 1e-5,
    'learning_rate_critic': 1e-5
}

# Budget of agent turns for the training loop.
total_timesteps = 50_000

# The agent receives the complete configuration as keyword arguments.
td3 = TD3(**td3_config)

# The replay buffer is dimensioned with the same state/action sizes as the agent.
buffer_dims = {key: td3_config[key] for key in ('state_dim', 'action_dim')}
replay_buffer = ReplayBuffer(**buffer_dims)

In [7]:
# Seed the global PyTorch and NumPy random number generators with one shared value.
# NOTE(review): in file order this cell comes after the TD3 agent is constructed,
# so any random initialisation performed during construction is not covered by
# this seed when the notebook is run top to bottom.
seed = 42
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(seed)

In [56]:
# Train the TD3 agent inside the restricted environment.
#
# Each episode walks through the environment's turn-based agent_iter() /
# last() / step() cycle. On the learning agent's turn the previous transition
# is stored in the replay buffer and an action is chosen (uniformly sampled
# during the warm-up phase, afterwards the TD3 policy plus Gaussian exploration
# noise, followed by one training step). On every other turn the restrictor acts.
episode_num = 0
training_timesteps = 0

# Loop-invariant settings, looked up once instead of on every step.
learning_agent = 'agent_0'
max_action = td3_config['max_action']
exploration_std = max_action * td3_config['exploration_noise']
warmup_timesteps = td3_config['train_after_timesteps']

# try/finally guarantees that the environment is closed even when training is
# interrupted (e.g. by a KeyboardInterrupt from stopping the cell).
try:
    while training_timesteps < total_timesteps:
        restricted_environment.reset()
        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1
        observation = None
        last_td3_action = None

        for agent in restricted_environment.agent_iter():
            next_observation, reward, termination, truncation, info = restricted_environment.last()
            episode_over = termination or truncation

            # Turn of the agent
            if agent == learning_agent:
                episode_reward += reward
                episode_timesteps += 1

                flattened_next_observation = flatten(
                    restricted_environment.observation_space(learning_agent),
                    next_observation, max_len=8, raise_error=False)

                # The first turn of an episode has no preceding observation/action,
                # so a transition can only be stored from the second turn onwards.
                # NOTE(review): the last argument is presumably the buffer's `done`
                # flag; passing truncation as well treats time-limit cut-offs like
                # true terminal states -- confirm this is intended.
                if episode_timesteps > 1:
                    replay_buffer.add(observation,
                                      last_td3_action,
                                      flattened_next_observation,
                                      reward,
                                      episode_over)
                observation = flattened_next_observation

                training_timesteps += 1
                if training_timesteps < warmup_timesteps:
                    # Warm-up phase: sample from the action space, no training yet.
                    action = restricted_environment.action_space(agent).sample()
                else:
                    # Policy action plus exploration noise, clipped to the action bounds.
                    det_action = td3.select_action(observation)
                    noise = np.random.normal(0, exploration_std, size=td3_config['action_dim'])
                    action = (det_action + noise).clip(-max_action, max_action)
                    td3.train(replay_buffer, td3_config['batch_size'])
                # Remember the chosen action before it is possibly replaced by None below.
                last_td3_action = action
            # Or restrictor
            else:
                action = restrictor.act(next_observation)

            # None action if episode is done
            if episode_over:
                action = None

            restricted_environment.step(action)

        print(f'Finished episode {episode_num} with reward {episode_reward}')
finally:
    restricted_environment.close()

Finished episode 1 with reward -5.540116347695885
Finished episode 2 with reward 0.136429468781885
Finished episode 3 with reward -4.6367047009087905
Finished episode 4 with reward -3.35873181805932
Finished episode 5 with reward -5.02028808058135
Finished episode 6 with reward 4.807869526307469
Finished episode 7 with reward -1.41570669758535
Finished episode 8 with reward 7.910575588219789
Finished episode 9 with reward 14.34745782524136
Finished episode 10 with reward 0.8409076993986
Finished episode 11 with reward 4.8931458339053
Finished episode 12 with reward 13.732347654674715
Finished episode 13 with reward -2.93021658550393
Finished episode 14 with reward 5.974999307724801
Finished episode 15 with reward -5.01428928047424
Finished episode 16 with reward 5.1952677421321365
Finished episode 17 with reward -6.6107308745175395
Finished episode 18 with reward 0.8966686065987299
Finished episode 19 with reward -2.7616962254593798
Finished episode 20 with reward 0.8777850908455651
Fi

KeyboardInterrupt: 