In [1]:
from unityagents import UnityEnvironment
import numpy as np

from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from ddpg_agent_2 import Agent

In [2]:
env = UnityEnvironment(file_name="Tennis_Linux/Tennis.x86_64")

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


In [3]:
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [4]:
# reset the environment
env_info = env.reset(train_mode=True)[brain_name]

# number of agents
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space 
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

Number of agents: 2
Size of each action: 2
There are 2 agents. Each observes a state with length: 24
The state for the first agent looks like: [ 0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -6.65278625 -1.5
 -0.          0.          6.83172083  6.         -0.          0.        ]


In [5]:
seed = 42
agent = Agent(state_size=state_size, action_size=action_size, n_agents=num_agents, seed=seed)

In [6]:
def ddpg(n_episodes=1000, max_t = 1000, window_size=100, score_threshold=0.5, 
         print_interval=100, epochs=10000):

    scores_deque = deque(maxlen=window_size) 
    scores = []        
    best_average_score = -np.inf
    print("Training on {} started...".format(agent.device))
    
    for i_episode in range(1, epochs+1):
        
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        
        agent.reset()
        episode_scores = np.zeros(num_agents) 

        for t in range(max_t):
            actions = agent.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done

            agent.step(states=states, actions=actions, rewards=rewards, next_states=next_states, dones=dones)
            episode_scores += np.array(rewards)
            states = next_states
            if np.any(dones):
                break

        episode_score = np.max(episode_scores) # Summary of scores for this episode
        scores_deque.append(episode_score)
        scores.append(episode_score)
        average_score = np.mean(scores_deque)

        print('\rEpisode: {}\tAverage Score: {:.2f}\tCurrent Score: {:.2f}'.format(i_episode, average_score, episode_score), end="")
        if i_episode % print_interval == 0:
            print('\rEpisode: {}\tAverage Score: {:.2f}\tCurrent Score: {:.2f}'.format(i_episode, average_score, episode_score))

        if average_score >= score_threshold:
            print('\nEnvironment solved in {} episodes!\tAverage Score: {:.2f}'.format(i_episode-window_size, average_score))
            torch.save(agent.actor_local.state_dict(), 'output/ddpg_checkpoint_actor.pth')
            torch.save(agent.critic_local.state_dict(), 'output/ddpg_checkpoint_critic.pth')
            break

    np.save('output/ddpg_scores.npy', scores)
    return scores

In [7]:
scores = ddpg()

Training on cuda:0 started...
Episode: 100	Average Score: -0.00	Current Score: -0.00
Episode: 200	Average Score: -0.00	Current Score: -0.00
Episode: 300	Average Score: -0.00	Current Score: -0.00
Episode: 400	Average Score: -0.00	Current Score: -0.00
Episode: 500	Average Score: 0.01	Current Score: 0.0500
Episode: 600	Average Score: 0.03	Current Score: -0.00
Episode: 700	Average Score: 0.01	Current Score: -0.00
Episode: 800	Average Score: -0.00	Current Score: -0.00
Episode: 900	Average Score: -0.00	Current Score: -0.00
Episode: 1000	Average Score: -0.00	Current Score: -0.00
Episode: 1100	Average Score: -0.00	Current Score: 0.050
Episode: 1200	Average Score: 0.01	Current Score: 0.0500
Episode: 1300	Average Score: 0.00	Current Score: -0.00
Episode: 1400	Average Score: 0.02	Current Score: 0.050
Episode: 1500	Average Score: 0.04	Current Score: 0.050
Episode: 1600	Average Score: 0.03	Current Score: 0.050
Episode: 1700	Average Score: 0.03	Current Score: 0.050
Episode: 1800	Average Score: 0.04	

KeyboardInterrupt: 

In [None]:
f = plt.figure()
ax = f.add_subplot(111)
plt.plot(range(1, len(scores) + 1), scores)
plt.xlabel('# Episodes')
plt.ylabel('Scores')
plt.savefig('output/ddpg_scores_plot.png')
plt.show()