In [None]:
%load_ext autoreload
%autoreload

In [None]:
import torch
import numpy as np

from collections import deque
from unityagents import UnityEnvironment
from deeprl_cc.agent import PPOAgent

In [None]:
# Training configuration used by the main loop further down in this notebook.

# Number of iterations of the outer training loop.
MAX_EPISODES = 20
# Number of environment steps taken in each outer-loop iteration.
ROLLOUT_LENGTH = 250
# Maximum number of recent episode scores kept for the moving average.
SCORE_WINDOW_SIZE = 100
# Moving-average score at which the environment is reported as solved.
MEAN_SCORE_WINDOW_TARGET = 30

In [None]:
def interact(action):
    """Send one batch of actions to the environment and collect the outcome.

    Relies on the notebook-level names ``env``, ``brain_name``, ``num_agents``
    and ``action_size``.

    Args:
        action: Array-like object exposing ``reshape``; it is reshaped to
            ``(num_agents, action_size)`` before being passed to ``env.step``.

    Returns:
        A ``(next_state, reward, done)`` tuple. Each element is reshaped to
        ``(num_agents, -1)``, so rewards and done flags come back as
        two-dimensional arrays with one row per agent.
    """
    batched_action = action.reshape(num_agents, action_size)
    step_info = env.step(batched_action)[brain_name]

    observations = step_info.vector_observations
    raw_rewards = step_info.rewards
    raw_dones = step_info.local_done

    next_state = observations.reshape(num_agents, -1)
    reward = np.array(raw_rewards).reshape(num_agents, -1)
    done = np.array(raw_dones).reshape(num_agents, -1)
    return next_state, reward, done

def reset():
    """Reset the environment and return its first observations.

    Relies on the notebook-level names ``env``, ``brain_name`` and
    ``num_agents``.

    Returns:
        The ``vector_observations`` of the freshly reset environment, reshaped
        to ``(num_agents, -1)`` so that each row belongs to one agent.
    """
    fresh_info = env.reset()[brain_name]
    return fresh_info.vector_observations.reshape(num_agents, -1)

In [None]:
# --- Launch the Unity build and select its default (first) brain -------------
env = UnityEnvironment(file_name="Reacher.app")
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

# --- Reset in training mode and take the initial batch of observations -------
env_info = env.reset(train_mode=True)[brain_name]
states = env_info.vector_observations

# --- Problem dimensions reused by the helper functions and the agent ---------
num_agents = len(env_info.agents)                  # number of agents in the environment
action_size = brain.vector_action_space_size       # size of each action
state_size = states.shape[1]                       # length of one agent's observation

# --- Summary of what the environment looks like ------------------------------
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

In [None]:
# Build the PPO agent from the dimensions reported by the environment.
agent = PPOAgent(state_dim=state_size, action_dim=action_size, num_agents=num_agents)

# Bookkeeping for the episode in progress: current observations, the return
# accumulated by each agent, and the number of steps taken so far.
states = reset()
episode_return = np.zeros(num_agents)
episode_length = 0

In [None]:
# My implementation
# Main loop: collect experience in the environment, hand every transition to
# the agent, and log a score whenever an episode finishes.
#
# An episode is considered finished when any agent reports `done` or when the
# rollout reaches ROLLOUT_LENGTH steps; the environment is reset in both cases.
# The environment counts as solved once the score window is full and its mean
# reaches MEAN_SCORE_WINDOW_TARGET; training stops at that point.

scores = []                                            # mean return of every finished episode
scores_window = deque(maxlen=SCORE_WINDOW_SIZE)        # most recent SCORE_WINDOW_SIZE scores
episodes_solved_in = None                              # stays None if the target is never reached

for i_episode in range(MAX_EPISODES):
    if episodes_solved_in is not None:
        # The target was reached during the previous rollout: stop training.
        break

    for t in range(ROLLOUT_LENGTH):
        actions, values, logp = agent.propose_action(torch.as_tensor(states, dtype=torch.float32))

        next_states, rewards, dones = interact(actions)
        episode_return += rewards.squeeze()
        episode_length += 1

        # save and log
        agent.train(states, actions, rewards.squeeze(), values.squeeze(), logp.squeeze(), dones.squeeze())

        # Update obs (critical!)
        states = next_states

        terminal = dones.any()
        episode_ended = t == ROLLOUT_LENGTH - 1

        if not (terminal or episode_ended):
            continue

        # Record the score first so the window is never empty when averaged
        # (np.mean of an empty deque is nan and emits a RuntimeWarning).
        episode_score = np.mean(episode_return)
        scores_window.append(episode_score)                 # save most recent score
        scores.append(episode_score)                        # save most recent score
        window_mean = np.mean(scores_window)
        print(f'Episode: {i_episode}, Reward this episode:{episode_score}, Scores: {window_mean}')
        states, episode_return, episode_length = reset(), np.zeros(num_agents), 0

        # Evaluate the stopping criterion once per finished episode (not once
        # per step) and only on a full window, so the reported episode count
        # can never be negative and one lucky episode cannot end training.
        if len(scores_window) == SCORE_WINDOW_SIZE and window_mean >= MEAN_SCORE_WINDOW_TARGET:
            episodes_solved_in = len(scores) - SCORE_WINDOW_SIZE
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episodes_solved_in, window_mean))
            break