In [1]:
import time
from env import UnityEnvWrapper
import numpy as np
from collections import deque
from maddpg import MADDPG

In [2]:
from bokeh.plotting import figure 
from bokeh.models import Legend
from bokeh.layouts import column
from bokeh.io import output_notebook, show
# Configure Bokeh so that later show() calls render inline in the notebook.
output_notebook()

In [3]:
# Create the environment through the project's wrapper around the Unity build.
# NOTE(review): 'Tennis.app' is a relative path to what looks like a macOS
# build — presumably it must be changed on other platforms; confirm.
env  = UnityEnvWrapper('Tennis.app')

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [4]:
# Reset the wrapped Unity environment in training mode and keep the info
# record belonging to the brain the wrapper exposes.
env_info = env._env.reset(train_mode=True)[env.brain_name]

# The info record lists one entry per agent.
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# Action size as reported by the brain.
action_size = env.brain.vector_action_space_size
print('Size of each action:', action_size)

# Observations are indexed by agent first; the second dimension is the
# per-agent state length.
states = env_info.vector_observations
state_size = states.shape[1]
summary_template = 'There are {} agents. Each observes a state with length: {}'
print(summary_template.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [5]:
# for i in range(1, 6):                                      # play game for 5 episodes
#     states = env.reset()     # reset the environment    
#     scores = np.zeros(num_agents)                          # initialize the score (for each agent)
#     while True:
#         actions = np.random.randn(num_agents, action_size) # select an action (for each agent)
#         actions = np.clip(actions, -1, 1)                  # all actions between -1 and 1
# #         env_info = env.step(actions)[brain_name]           # send all actions to the environment
# #         next_states = env_info.vector_observations         # get next state (for each agent)
# #         rewards = env_info.rewards                         # get reward (for each agent)
# #         dones = env_info.local_done                        # see if episode finished
#         next_states, rewards, dones = env.step(actions)
#         scores += rewards                         # update the score (for each agent)
#         states = next_states                               # roll over states to next time step
#         if np.any(dones):                                  # exit loop if episode finished
#             break
#     print('Score (max over agents) from episode {}: {}'.format(i, np.max(scores)))


In [6]:
# Build the multi-agent learner from the agent count and the state/action
# sizes read from the environment.
# NOTE(review): the trailing positional argument (10) is not explained in this
# notebook — presumably a random seed; confirm against maddpg.MADDPG.
agent = MADDPG(num_agents, state_size, action_size, 10)

In [7]:
def train(n_episodes=1000):
    """Train the global ``agent`` on the global ``env`` and record episode scores.

    Every episode runs until any agent reports ``done``. The episode score is
    the maximum of the per-agent cumulative rewards, and the running mean is
    taken over the most recent 100 episode scores.

    ``agent.saveCheckPoints()`` is called whenever the running mean reaches at
    least 105% of the last best value (which starts at 0.5). The first time the
    running mean reaches 0.5 a "solved" banner is printed, and training stops
    early once the running mean reaches 0.75.

    Args:
        n_episodes: Maximum number of episodes to run.

    Returns:
        A ``(current_score, running_mean)`` tuple of lists with one entry per
        completed episode.
    """
    # Shared by the in-place progress line and the line kept every 100 episodes.
    progress_fmt = '\rEpisode {}\tAverage Score: {:.3f}\tCurrent Score: {:.3f}\tLast Best Score: {:.3f}'

    current_score = []
    running_mean = []
    scores_deque = deque(maxlen=100)
    solved = False
    best_avg_score = 0.5
    start_time = time.time()

    for i_episode in range(1, n_episodes+1):
        states = env.reset()
        scores = np.zeros(num_agents)
        while True:
            actions = agent.act(states)
            next_states, rewards, dones = env.step(actions)
            agent.step(states, actions, rewards, next_states, dones)
            scores += rewards
            states = next_states
            if np.any(dones):
                break

        episode_score = np.max(scores)
        scores_deque.append(episode_score)
        current_score.append(episode_score)
        running_mean.append(np.mean(scores_deque))
        avg_score = running_mean[-1]

        # end="" together with the leading '\r' overwrites the same console line.
        print(progress_fmt.format(i_episode, avg_score, episode_score, best_avg_score), end="")

        # Only checkpoint on a >=5% improvement to avoid saving on every episode.
        if avg_score >= best_avg_score*1.05:
            agent.saveCheckPoints()
            best_avg_score = avg_score

        if i_episode % 100 == 0:
            # Without end="", this line is kept in the output as a milestone.
            print(progress_fmt.format(i_episode, avg_score, episode_score, best_avg_score))

        if avg_score >= 0.5 and not solved:
            solved = True
            print("\x1b[31m\n************ ENVIRONMENT_SOLVED ************\x1b[0m")
            # '\t' replaces the former invalid escape '\I', which printed a literal backslash.
            print('\nsolved in {:d} episodes!\tIt took {:.3f} Minutes to solve the task'.format(i_episode, (time.time()-start_time)/60))

        if avg_score >= 0.75:
            break

    return current_score, running_mean

In [8]:
# Run training for at most 1000 episodes; keep the per-episode scores and
# their running mean for the plot below.
scores, running_mean = train(n_episodes = 1000)

Episode 100	Average Score: 0.000	Current Score: 0.000	Last Best Score: 0.500
Episode 200	Average Score: 0.038	Current Score: 0.100	Last Best Score: 0.500
Episode 300	Average Score: 0.130	Current Score: 0.100	Last Best Score: 0.500
Episode 400	Average Score: 0.259	Current Score: 0.300	Last Best Score: 0.500
Episode 429	Average Score: 0.517	Current Score: 2.600	Last Best Score: 0.500[31m
************ ENVIRONMENT_SOLVED ************[0m

solved in 429 episodes!\It took 22.062 Minutes to solve the task
Episode 441	Average Score: 0.751	Current Score: 1.900	Last Best Score: 0.735

In [9]:
# Geometry shared by every figure built with get_figure().
PLOT_WIDTH = 900
PLOT_HEIGHT = 300
LINE_WIDTH = 2


def get_figure(args, x_axis_label, y_axis_label):
    """Build a Bokeh figure containing one line per entry of ``args``.

    Args:
        args: Iterable of 4-tuples ``(data, x_label, legend_text, color)``.
            ``data`` is plotted against its own index, ``legend_text`` becomes
            the legend entry and ``color`` the line color. The second element
            is unpacked but not used.
        x_axis_label: Label for the figure's x axis.
        y_axis_label: Label for the figure's y axis.

    Returns:
        The figure with all lines added.
    """
    # NOTE(review): `plot_width`/`plot_height` and the `legend` keyword are
    # deprecated in newer Bokeh releases — confirm against the installed
    # version before upgrading.
    figure_options = dict(
        plot_width=PLOT_WIDTH,
        plot_height=PLOT_HEIGHT,
        y_axis_label=y_axis_label,
        x_axis_label=x_axis_label,
    )
    fig = figure(**figure_options)

    # Distinct loop names keep the axis-label parameters from being shadowed.
    for series, _series_x_label, legend_text, line_color in args:
        fig.line(
            range(len(series)),
            series,
            legend=legend_text,
            line_width=LINE_WIDTH,
            color=line_color,
        )
    return fig

# Plot the per-episode scores, their running mean, and a flat 0.5 reference
# line on a single figure.
plots = []

plots.append(get_figure(
    [
        (scores, 'Episodes', 'Episodic reward', 'skyblue'),
        # Legend text previously read "deward" (typo).
        (running_mean, 'Episodes', 'Rolling mean of Episodic reward', 'slateblue'),
        ([0.5] * len(scores), 'Episodes', 'Project completion threshold', 'tomato')
    ],
    'Episodes', ''
))


main_row = column(*plots)
show(main_row)

In [10]:
# Build a second MADDPG instance with the same constructor arguments as the
# training agent, then call loadCheckPoints() on it.
# NOTE(review): presumably this restores what agent.saveCheckPoints() wrote
# during training — confirm the file locations in maddpg.py.
trained_agent = MADDPG(num_agents, state_size, action_size, 10)
trained_agent.loadCheckPoints()

In [11]:
def test(n_episodes=10):
    current_score = []
    running_mean = []
    scores_deque = deque(maxlen=100)
    for i_episode in range(1, n_episodes+1):
        env_info = env._env.reset(train_mode=False)[env.brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        while True:
            actions = trained_agent.act(states)
            next_states, rewards, dones = env.step(actions)
            scores += rewards
            states = next_states
            if np.any(dones):
                break
        scores_deque.append(np.max(scores))
        current_score.append(np.max(scores))
        running_mean.append(np.mean(scores_deque))
    
        print('\rEpisode {}\tAverage Score: {:.2f}\tCurrent Score: {:.2f}'.format(i_episode, np.mean(scores_deque),np.max(scores)))
    return current_score, running_mean


In [12]:
# Evaluate for the default 10 episodes.
# NOTE: this rebinds `scores` and `running_mean`, replacing the values
# returned by train(); the plot cell that follows therefore shows evaluation
# results, not the training curve.
scores, running_mean = test()

Episode 1	Average Score: 2.60	Current Score: 2.60
Episode 2	Average Score: 2.60	Current Score: 2.60
Episode 3	Average Score: 1.73	Current Score: 0.00
Episode 4	Average Score: 1.40	Current Score: 0.40
Episode 5	Average Score: 1.64	Current Score: 2.60
Episode 6	Average Score: 1.80	Current Score: 2.60
Episode 7	Average Score: 1.56	Current Score: 0.10
Episode 8	Average Score: 1.69	Current Score: 2.60
Episode 9	Average Score: 1.78	Current Score: 2.50
Episode 10	Average Score: 1.75	Current Score: 1.50


In [13]:
# Plot the current `scores` / `running_mean` pair together with a flat 0.5
# reference line on a single figure.
plots = []
plots.append(get_figure(
    [
        (scores, 'Episodes', 'Episodic reward', 'skyblue'),
        # Legend text previously read "deward" (typo).
        (running_mean, 'Episodes', 'Rolling mean of Episodic reward', 'slateblue'),
        ([0.5] * len(scores), 'Episodes', 'Project completion threshold', 'tomato')
    ],
    'Episodes', ''
))


main_row = column(*plots)
show(main_row)

In [14]:
# Close the underlying Unity environment object held by the wrapper.
env._env.close()