In [1]:
import numpy as np
import torch
import random

from src.utils import flatten
from src.wrapper import RestrictionWrapper
from examples.agents.td3 import TD3
from examples.envs.navigation import NavigationEnvironment
from examples.restrictors.navigation_restrictor import NavigationRestrictor
from examples.utils import ReplayBuffer

In [2]:
# Tested with seeds 44, 45, 46, 47, 48
seed = 49
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [3]:
env_config = {
    'HEIGHT': 15.0,
    'WIDTH': 15.0,
    'STEPS_PER_EPISODE': 40,
    'ACTION_RANGE': 220,
    'DT': 1.0,
    'TIMESTEP_PENALTY_COEFFICIENT': 0.05,
    'REWARD_COLLISION': -1.0,
    'REWARD_GOAL': 5.0,
    'REWARD_COEFFICIENT': 10.0,
    'AGENT_RADIUS': 0.5,
    'AGENT_PERSPECTIVE': 90,
    'AGENT_STEP_SIZE': 1.0,
    'AGENT_X': 1.5,
    'AGENT_Y': 1.5,
    'GOAL_RADIUS': 1.0,
    'GOAL_X': 12.0,
    'GOAL_y': 12.0
}
restrictor = NavigationRestrictor(0, [[5.0, 0.0], [0.0, 5.0]], 1.0, 0.2, 0.5, 50, 8, -110.0, 110.0)
environment = NavigationEnvironment(env_config)

restricted_environment = RestrictionWrapper(environment, restrictor)

In [4]:
td3_config = {
    'state_dim': 22,
    'action_dim': 1,
    'max_action': 110.0,
    'discount': 0.99,
    'tau': 0.005,
    'policy_noise': 0.2,
    'noise_clip:': 0.5,
    'policy_freq': 2,
    'exploration_noise': 0.2,
    'batch_size': 256,
    'train_after_timesteps': 2000,
    'learning_rate_actor': 1e-5,
    'learning_rate_critic': 1e-5
}

total_timesteps = 50000
td3 = TD3(**td3_config)
replay_buffer = ReplayBuffer(state_dim=td3_config['state_dim'], action_dim=td3_config['action_dim'])

In [5]:
from src.restrictors import Restrictor


def evaluate(eval_policy: TD3, eval_restrictor: Restrictor):
    eval_env = RestrictionWrapper(NavigationEnvironment(env_config),
                                  NavigationRestrictor(0, [[5.0, 0.0], [0.0, 5.0]], 1.0, 0.2, 0.5, 50, 8, -110.0, 110.0))
    eval_reward = 0.0

    eval_env.reset()
    for eval_agent in eval_env.agent_iter():
        obs, rew, term, trunc, inf = eval_env.last()
        if eval_agent == 'agent_0':
            eval_reward += rew
            eval_action = eval_policy.select_action(flatten(eval_env.observation_space('agent_0'),
                                                            obs, max_len=8, raise_error=False))
        else:
            eval_action = eval_restrictor.act(obs)

        if term or trunc:
            eval_action = None

        eval_env.step(eval_action)

    return eval_reward

In [6]:
episode_num = 0
training_timesteps = 0

while training_timesteps < total_timesteps:
    restricted_environment.reset()
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1
    observation = None
    action = None
    last_td3_action = None

    for agent in restricted_environment.agent_iter():
        next_observation, reward, termination, truncation, info = restricted_environment.last()

        # Turn of the agent
        if agent == 'agent_0':
            episode_reward += reward
            episode_timesteps += 1

            flattened_next_observation = flatten(restricted_environment.observation_space('agent_0'),
                                                 next_observation, max_len=8, raise_error=False)

            if episode_timesteps > 1:
                replay_buffer.add(observation,
                                  last_td3_action,
                                  flattened_next_observation,
                                  reward,
                                  termination or truncation)
            observation = flattened_next_observation

            training_timesteps += 1
            if training_timesteps < td3_config['train_after_timesteps']:
                    action = np.random.uniform(-110.0, 110.0, (1,))
            else:
                det_action = td3.select_action(observation)
                noise = np.random.normal(0, td3_config['max_action'] * td3_config['exploration_noise'], size=td3_config['action_dim'])
                action = (det_action + noise).clip(-td3_config['max_action'], td3_config['max_action'])
            if training_timesteps >= td3_config['train_after_timesteps']:
                td3.train(replay_buffer, td3_config['batch_size'])
            last_td3_action = action
        # Or restrictor
        else:
            action = restrictor.act(next_observation)

        # None action if episode is done
        if termination or truncation:
            action = None

        restricted_environment.step(action)

        if training_timesteps % 50 == 0:
            print(evaluate(td3, restrictor))

    print(f'Finished episode {episode_num} with reward {episode_reward}')

restricted_environment.close()

2.73786972304547
Finished episode 1 with reward -3.975812803695323
Finished episode 2 with reward -3.79130942107132
Finished episode 3 with reward 11.11701922710497
Finished episode 4 with reward -9.45519746738855
Finished episode 5 with reward -3.897859587698931
Finished episode 6 with reward -8.16969296115076
Finished episode 7 with reward 2.121757654473627
2.73786972304547
2.73786972304547
Finished episode 8 with reward 3.5556299252238386
Finished episode 9 with reward 1.81254924725854
Finished episode 10 with reward 5.13586021148406
Finished episode 11 with reward 2.58650682504495
Finished episode 12 with reward 0.1509789682882301
Finished episode 13 with reward -7.3088956545113
2.73786972304547
2.73786972304547
Finished episode 14 with reward 53.644894987402616
Finished episode 15 with reward 15.544251510981411
2.73786972304547
Finished episode 16 with reward 2.48394841219308
2.73786972304547
Finished episode 17 with reward 3.639142850750213
Finished episode 18 with reward 8.44833

KeyboardInterrupt: 