In [1]:
%load_ext autoreload
%autoreload 2

# Udacity Deep Reinforcement Learning Nanodegree - Project 3: Collaboration and Competition

In [2]:
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
from unityagents import UnityEnvironment

from agent import Agent, ReplayBuffer

## Set everything up

* Parameters
* Unity environment
* Agent

## Parameters

In [3]:
# Single source of truth for every tunable value in this notebook.
# Keys that are not read in the notebook itself are forwarded to
# Agent / ReplayBuffer through `params=params`.
params = {
    # --- replay buffer ---
    # NOTE(review): stored as a float; confirm ReplayBuffer casts it to int.
    'buffer_size': 1e5,
    'batch_size': 128,

    # --- training schedule ---
    'n_episodes': 10000,          # upper bound on training episodes
    'max_steps': 500,             # upper bound on steps per episode
    'update_step': 5,             # presumably the learn-every-N-steps interval -- confirm in agent.py

    # --- evaluation ---
    'solution_threshold': 0.5,    # moving-average score that counts as "solved"
    'eval_window_length': 100,    # number of episodes in the moving average

    # --- environment / seeding ---
    'num_agents': 2,
    'agent_seed': 33,
    'env_seed': 33,
    'buffer_seed': 33,

    # --- learning hyper-parameters (consumed outside this notebook) ---
    'gamma': 1.0,
    'tau': 0.001,
    'lr_actor': 0.0001,
    'lr_critic': 0.0001,
    'critic_weight_decay': 0,
    'noise_sigma': 0.2,
}

In [4]:
# Launch the Tennis build, seeded from the shared parameter dict.
env = UnityEnvironment(
    file_name="Tennis.app",
    seed=params['env_seed'],
    no_graphics=False,
)

# The first registered brain is treated as the default one.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [5]:
# Reset once (train_mode=False) and collect everything we want to inspect
# before printing it.
env_info = env.reset(train_mode=False)[brain_name]
states = env_info.vector_observations          # one row per agent

num_agents = len(env_info.agents)              # agents present in the scene
action_size = brain.vector_action_space_size   # length of one agent's action vector
state_size = states.shape[1]                   # length of one agent's observation

# Report the dimensions and show a sample observation.
print('Number of agents:', num_agents)
print('Size of each action:', action_size)
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.93938923 -1.5
 -0.          0.         -7.46356201  6.         -0.          0.        ]


In [6]:
# Network input/output dimensions, re-derived from the brain and the most
# recent environment info so this cell does not depend on the inspection
# cell above having been run.
state_size = env_info.vector_observations.shape[1]
action_size = brain.vector_action_space_size

## Instantiate Multiple DDPG Agents and a Shared Replay Buffer

In [7]:
# One Agent per player, keyed by its index; all of them are built from the
# same state/action dimensions and the same parameter dict.
agents = {
    agent_no: Agent(state_size=state_size, action_size=action_size,
                    agent_no=agent_no, params=params)
    for agent_no in range(params['num_agents'])
}

# A single buffer that every agent's experience is written to.
replay_buffer = ReplayBuffer(params=params)

## Train the Agent with DDPG (Deep Deterministic Policy Gradients)

In [8]:
# --- bookkeeping -------------------------------------------------------------
eval_window_lengths = params['eval_window_length']
env_is_solved = False
scores = []                                         # per-episode score (max over agents)
scores_window = deque(maxlen=eval_window_lengths)   # most recent scores for the moving average
best_score = 0                                      # best single-episode score seen after solving

# `scale` is handed to Agent.act() and decayed once per policy step until it
# reaches `min_scale` (presumably it scales exploration noise -- confirm in agent.py).
scale = 3.0
min_scale = 0.2
scale_decay = 0.9995

# Episodes before this one act uniformly at random and only fill the replay
# buffer; from this episode on the agents act and learn. Overridable through
# `params` without changing the default behaviour.
first_policy_episode = params.get('first_policy_episode', 700)

for i_episode in range(1, params['n_episodes'] + 1):
    episode_scores = np.zeros(params['num_agents'])
    for agent in agents.values():
        agent.reset()
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    is_warmup = i_episode < first_policy_episode

    for step in range(params['max_steps']):
        if is_warmup:
            # purely random actions in [-1, 1) to seed the buffer
            actions = np.random.uniform(-1, 1, (params['num_agents'], action_size))
        else:
            # each agent acts on its own row of the current observations
            actions = [agent.act(states[no_agent], scale=scale)
                       for no_agent, agent in enumerate(agents.values())]
            actions = np.concatenate(actions, axis=0).reshape((params['num_agents'], action_size))

        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done

        replay_buffer.add(states, actions, rewards, next_states, dones)

        if not is_warmup:
            for agent in agents.values():
                agent.step(replay_buffer, agents)
            scale = max(min_scale, scale * scale_decay)

        episode_scores += rewards
        states = next_states

        # the episode ends as soon as any agent reports done
        if np.any(dones):
            break

    # The episode score is the better of the agents' accumulated rewards.
    score = max(episode_scores)
    scores.append(score)
    scores_window.append(score)
    window_mean = np.mean(scores_window)

    # Overwrite the progress line every episode; keep one line per full window.
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, window_mean), end="")
    if i_episode % params['eval_window_length'] == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, window_mean))

    # Only a completely filled window counts, so a lucky early episode cannot
    # trigger the "solved" message.
    window_is_full = len(scores_window) == eval_window_lengths
    if window_is_full and window_mean >= params['solution_threshold'] and not env_is_solved:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode,
                                                                                     window_mean))
        env_is_solved = True

    # best_score is initialised once before the loop, so it really tracks the
    # best post-solution episode instead of being reset every episode.
    if env_is_solved and score > best_score:
        best_score = score

# Summary of the last episode that was run (`step` is 0-based, hence the +1).
print('Score (max over agents) from episode {} ({} steps): {}'.format(i_episode, step + 1, score))

Episode 100	Average Score: 0.02
Episode 200	Average Score: 0.02
Episode 300	Average Score: 0.02
Episode 400	Average Score: 0.01
Episode 500	Average Score: 0.01
Episode 600	Average Score: 0.02
Episode 700	Average Score: 0.02
Episode 800	Average Score: 0.01
Episode 900	Average Score: 0.00
Episode 1000	Average Score: 0.00
Episode 1100	Average Score: 0.00
Episode 1200	Average Score: 0.00
Episode 1300	Average Score: 0.00
Episode 1400	Average Score: 0.00
Episode 1500	Average Score: 0.00
Episode 1600	Average Score: 0.00
Episode 1700	Average Score: 0.00
Episode 1800	Average Score: 0.00
Episode 1900	Average Score: 0.00
Episode 2000	Average Score: 0.00
Episode 2100	Average Score: 0.00
Episode 2200	Average Score: 0.00
Episode 2300	Average Score: 0.00
Episode 2400	Average Score: 0.00
Episode 2500	Average Score: 0.00
Episode 2600	Average Score: 0.00
Episode 2700	Average Score: 0.00
Episode 2800	Average Score: 0.00
Episode 2900	Average Score: 0.00
Episode 3000	Average Score: 0.00
Episode 3100	Averag

In [None]:
# Shut down the Unity environment; the cells below only use the recorded scores.
env.close()

## Performance Plot

In [None]:
# Plot the per-episode scores recorded by the training cell (`scores`), their
# moving average, and the solution threshold.
window = params['eval_window_length']
steps = len(scores)
episodes = np.arange(1, steps + 1)   # 1-based episode numbers, as printed during training

# average_scores[i] is the mean over episodes i+1 .. i+window, i.e. the moving
# average as it stood at the end of episode i+window. The `+ 1` keeps the last
# full window; the result is empty when fewer than `window` episodes were run.
average_scores = np.array([np.mean(scores[i:(i + window)])
                           for i in range(steps - window + 1)])

fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(episodes, scores, linewidth=1.5)
# each moving-average value is drawn at the last episode of its window
ax.plot(np.arange(window, steps + 1), average_scores, 'g-')
ax.plot(episodes, [params['solution_threshold']] * steps, 'r-')
ax.set_ylabel('Score', fontsize=16)
ax.set_xlabel('Episode #', fontsize=16)
ax.legend(['Score', 'Average Score (w = {})'.format(window), 'Solution Threshold'])
ax.grid(True)
plt.show()

In [None]:
# average_scores[i] covers the window that ends at episode i + window, so the
# first index at or above the threshold maps to that episode number.
# np.argmax returns 0 for an all-False mask, so the "never solved" case has to
# be handled explicitly instead of being reported as solved at episode `window`.
solved_mask = np.asarray(average_scores) >= params['solution_threshold']
if solved_mask.any():
    print("Environment was solved in Episode {}!".format(
          np.argmax(solved_mask)+params['eval_window_length']))
else:
    print("Environment was not solved: the {}-episode average never reached {}.".format(
          params['eval_window_length'], params['solution_threshold']))

In [None]:
# First single episode whose score (max over agents) reached the threshold.
# `scores` is the per-episode list filled by the training cell; index k holds
# episode k + 1. np.argmax returns 0 for an all-False mask, so the case where
# no episode qualifies is reported separately.
above_threshold = np.array(scores) >= params['solution_threshold']
if above_threshold.any():
    print("First Episode with Score (max over agents) >= {}: {}!".format(
        params['solution_threshold'], np.argmax(above_threshold) + 1))
else:
    print("No episode reached a score >= {}.".format(params['solution_threshold']))