In [1]:
import numpy as np
import torch
import random

from src.restrictions import IntervalUnionRestriction
from src.wrapper import RestrictionWrapper
from examples.agents.td3 import TD3
from examples.envs.navigation import NavigationEnvironment
from examples.restrictors.navigation_restrictor import NavigationRestrictor
from examples.utils import ReplayBuffer

In [2]:
# Tested with seeds 45, 46, 47, 48, 49
seed = 49
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [3]:
env_config = {
    'HEIGHT': 15.0,
    'WIDTH': 15.0,
    'STEPS_PER_EPISODE': 40,
    'ACTION_RANGE': 220,
    'DT': 1.0,
    'TIMESTEP_PENALTY_COEFFICIENT': 0.05,
    'REWARD_COLLISION': -1.0,
    'REWARD_GOAL': 5.0,
    'REWARD_COEFFICIENT': 10.0,
    'AGENT_RADIUS': 0.5,
    'AGENT_PERSPECTIVE': 90,
    'AGENT_STEP_SIZE': 1.0,
    'AGENT_X': 1.5,
    'AGENT_Y': 1.5,
    'GOAL_RADIUS': 1.0,
    'GOAL_X': 12.0,
    'GOAL_y': 12.0
}
environment = NavigationEnvironment(env_config)

def projection_fn(env, action, restriction: IntervalUnionRestriction):
    return np.array([restriction.nearest_element(action[0])], dtype=np.float32)
restrictor = NavigationRestrictor(obstacle_count=4,
                                  obstacle_position_covariance=[[5.0, 0.0], [0.0, 5.0]],
                                  obstacle_mean_size=1.0,
                                  obstacle_variance_size=0.2,
                                  obstacle_size_range=0.5,
                                  start_seed=50,
                                  safety_angle=8,
                                  min_angle=-110.0,
                                  max_angle=110.0)
restricted_environment = RestrictionWrapper(environment, restrictor,
                                            restriction_violation_fns=projection_fn)

In [4]:
td3_config = {
    'state_dim': 6,
    'action_dim': 1,
    'max_action': 110.0,
    'discount': 0.99,
    'tau': 0.005,
    'policy_noise': 0.2,
    'noise_clip:': 0.5,
    'policy_freq': 2,
    'exploration_noise': 0.2,
    'batch_size': 256,
    'train_after_timesteps': 2000,
    'learning_rate_actor': 1e-5,
    'learning_rate_critic': 1e-5
}

total_timesteps = 50000
td3 = TD3(**td3_config)
replay_buffer = ReplayBuffer(state_dim=td3_config['state_dim'], action_dim=td3_config['action_dim'])

In [5]:
from src.restrictors import Restrictor


def evaluate(eval_policy: TD3, eval_restrictor: Restrictor):
    eval_env = RestrictionWrapper(NavigationEnvironment(env_config),
                                  NavigationRestrictor(obstacle_count=4,
                                                       obstacle_position_covariance=[[5.0, 0.0], [0.0, 5.0]],
                                                       obstacle_mean_size=1.0,
                                                       obstacle_variance_size=0.2,
                                                       obstacle_size_range=0.5,
                                                       start_seed=1,
                                                       safety_angle=8,
                                                       min_angle=-110.0,
                                                       max_angle=110.0),
                                  restriction_violation_fns=projection_fn)
    eval_reward = 0.0

    eval_env.reset()
    for eval_agent in eval_env.agent_iter():
        obs, rew, term, trunc, inf = eval_env.last()
        if eval_agent == 'agent_0':
            eval_reward += rew
            eval_action = np.array([
                obs['restriction'].nearest_element(eval_policy.select_action(obs['observation'])[0])
            ], dtype=np.float32)
        else:
            eval_action = eval_restrictor.act(obs)

        if term or trunc:
            eval_action = None

        eval_env.step(eval_action)

    return eval_reward

In [6]:
episode_num = 0
training_timesteps = 0
sample_from_restricted_space = True

while training_timesteps < total_timesteps:
    restricted_environment.reset()
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1
    observation = None
    action = None
    last_td3_action = None

    for agent in restricted_environment.agent_iter():
        next_observation, reward, termination, truncation, info = restricted_environment.last()

        # Turn of the agent
        if agent == 'agent_0':
            episode_reward += reward
            episode_timesteps += 1

            flattened_next_observation = next_observation['observation']

            if episode_timesteps > 1:
                replay_buffer.add(observation,
                                  last_td3_action,
                                  flattened_next_observation,
                                  reward,
                                  termination or truncation)
            observation = flattened_next_observation

            training_timesteps += 1
            if training_timesteps < td3_config['train_after_timesteps']:
                if sample_from_restricted_space:
                    action = next_observation['restriction'].sample()
                else:
                    action = np.random.uniform(-110.0, 110.0, (1,))
            else:
                det_action = td3.select_action(observation)
                noise = np.random.normal(0, td3_config['max_action'] * td3_config['exploration_noise'], size=td3_config['action_dim'])
                action = (det_action + noise).clip(-td3_config['max_action'], td3_config['max_action'])

            if training_timesteps >= td3_config['train_after_timesteps']:
                td3.train(replay_buffer, td3_config['batch_size'])
            last_td3_action = action
        # Or restrictor
        else:
            action = restrictor.act(next_observation)

        # None action if episode is done
        if termination or truncation:
            # print(f'{action} for {next_observation["restriction"]}, {restricted_environment.env.agent.x, restricted_environment.env.agent.y}')
            action = None

        restricted_environment.step(action)

        if training_timesteps % 50 == 0:
            print(f'Evaluation result: {evaluate(td3, restrictor)}')

    print(f'Finished episode {episode_num} with reward {episode_reward} in {episode_timesteps} steps - goal reached: {termination}')

restricted_environment.close()

Evaluation result: -10.12133494725474
Finished episode 1 with reward 17.102499987244716 in 24 steps - goal reached: True
Evaluation result: -10.12133494725474
Evaluation result: -10.12133494725474
Finished episode 2 with reward -7.000763217582208 in 41 steps - goal reached: False
Evaluation result: -10.12133494725474
Evaluation result: -10.12133494725474
Finished episode 3 with reward 9.183637545550695 in 41 steps - goal reached: False
Finished episode 4 with reward -19.8016873915515 in 41 steps - goal reached: False
Evaluation result: -10.12133494725474
Evaluation result: -10.12133494725474
Finished episode 5 with reward 35.22606793419044 in 41 steps - goal reached: False
Evaluation result: -10.12133494725474
Evaluation result: -10.12133494725474
Finished episode 6 with reward 30.676594723818763 in 41 steps - goal reached: False
Evaluation result: -10.12133494725474
Evaluation result: -10.12133494725474
Finished episode 7 with reward 53.44931874593792 in 41 steps - goal reached: False

KeyboardInterrupt: 