In [1]:
from unityagents import UnityEnvironment
import numpy as np

from collections import deque
import torch
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()
from IPython import display

from ddpg_agent import Agents
env = UnityEnvironment(file_name="Soccer_Linux/Soccer.x86_64")


g_brain_name = env.brain_names[0]
g_brain = env.brains[g_brain_name]

s_brain_name = env.brain_names[1]
s_brain = env.brains[s_brain_name]

env_info = env.reset(train_mode=True)

g_num_agents = len(env_info[g_brain_name].agents)
s_num_agents = len(env_info[s_brain_name].agents)

g_action_size = g_brain.vector_action_space_size
s_action_size = s_brain.vector_action_space_size
g_states = env_info[g_brain_name].vector_observations
g_state_size = g_states.shape[1]
s_states = env_info[s_brain_name].vector_observations
s_state_size = s_states.shape[1]

print(env.brain_names)
print('Number of goalie agents:', g_num_agents)
print('Number of striker agents:', s_num_agents)
print('Number of goalie actions:', g_action_size)
print('Number of striker actions:', s_action_size)
print('There are {} goalie agents. Each receives a state with length: {}'.format(g_states.shape[0], g_state_size))
print('There are {} striker agents. Each receives a state with length: {}'.format(s_states.shape[0], s_state_size))

g_params = {
    'state_size':g_state_size, 
    'action_size':g_action_size, 
    'num_agents':g_num_agents, 
    'random_seed':42,
    'fc1_units':400,
    'fc2_units':300,
    'BUFFER_SIZE':1e5,
    'BATCH_SIZE':64,
    'GAMMA':0.99,
    'TAU':1e-3,
    'LR_ACTOR':1e-4,
    'LR_CRITIC':1e-4,
    'CRITIC_WEIGHT_DECAY':0
}

s_params = {
    'state_size':s_state_size, 
    'action_size':s_action_size, 
    'num_agents':s_num_agents, 
    'random_seed':42,
    'fc1_units':400,
    'fc2_units':300,
    'BUFFER_SIZE':1e5,
    'BATCH_SIZE':64,
    'GAMMA':0.99,
    'TAU':1e-3,
    'LR_ACTOR':1e-4,
    'LR_CRITIC':1e-4,
    'CRITIC_WEIGHT_DECAY':0
}

g_agent = Agents(**g_params)
s_agent = Agents(**s_params)

print (g_agent.critic_local)
print (g_agent.actor_local)
print (s_agent.critic_local)
print (s_agent.actor_local)

def ddpg(n_episodes=2, max_t=100):
    g_scores_deque = deque(maxlen=100)
    g_scores = []
    
    s_scores_deque = deque(maxlen=100)
    s_scores = []
    
    for i_episode in range(1, n_episodes+1):
        env_info = env.reset(train_mode=False)
        
        g_states = env_info[g_brain_name].vector_observations
        s_states = env_info[s_brain_name].vector_observations
        
        g_agent.reset()
        s_agent.reset()
        
        g_score = np.zeros(g_num_agents)
        s_score = np.zeros(s_num_agents)
        
        for t in range(max_t):
            g_actions = g_agent.act(g_states)
            s_actions = s_agent.act(s_states)

            
            g_actions = np.argmax(g_actions, axis=1)
            s_actions = np.argmax(s_actions, axis=1)

            
            g_actions = np.expand_dims(g_actions, axis=1)
            s_actions = np.expand_dims(s_actions, axis=1)
            
            
            actions = dict(zip([g_brain_name, s_brain_name],
                               [g_actions, s_actions]))
                        
            env_info = env.step(actions)
            
            g_next_state = env_info[g_brain_name].vector_observations
            s_next_state = env_info[s_brain_name].vector_observations
            
            g_rewards = env_info[g_brain_name].rewards  
            s_rewards = env_info[s_brain_name].rewards
            
            g_dones = env_info[g_brain_name].local_done
            s_dones = env_info[s_brain_name].local_done
            
            s_agent.step(s_states, s_actions, s_rewards, s_next_state, s_dones)
            g_agent.step(g_states, g_actions, g_rewards, g_next_state, g_dones)
            
            g_states = g_next_state
            s_states = s_next_state
            
            g_score += g_rewards
            s_score += s_rewards
            
            if np.any(g_dones) or np.any(s_dones):
                break
        
        g_scores_deque.append(np.mean(g_score))
        s_scores_deque.append(np.mean(s_score))
        
        g_scores.append(np.mean(g_score))
        s_scores.append(np.mean(s_score))
        
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score - g:{:.2f} s:{:.2f}'.format(i_episode, np.mean(g_scores_deque), np.mean(s_scores_deque)))
            
            torch.save(g_agent.actor_local.state_dict(), 'checkpoint_actor_soccer_g.pth')
            torch.save(g_agent.critic_local.state_dict(), 'checkpoint_critic_soccer_g.pth')   
    
            torch.save(s_agent.actor_local.state_dict(), 'checkpoint_actor_soccer_s.pth')
            torch.save(s_agent.critic_local.state_dict(), 'checkpoint_critic_soccer_s.pth')   
    
    return g_scores, s_scores
            

g_scores, s_scores = ddpg()


    
for i in range(5):                                         # play game for 2 episodes
    env_info = env.reset(train_mode=False)                 # reset the environment    
    g_states = env_info[g_brain_name].vector_observations  # get initial state (goalies)
    s_states = env_info[s_brain_name].vector_observations  # get initial state (strikers)
    g_scores = np.zeros(g_num_agents)                      # initialize the score (goalies)
    s_scores = np.zeros(s_num_agents)                      # initialize the score (strikers)
    while True:
        # select actions and send to environment
        g_actions = np.random.randint(g_action_size, size=g_num_agents)
        s_actions = np.random.randint(s_action_size, size=s_num_agents)
        
        actions = dict(zip([g_brain_name, s_brain_name], 
                           [g_actions, s_actions]))
        env_info = env.step(actions)                       
        
        # get next states
        g_next_states = env_info[g_brain_name].vector_observations         
        s_next_states = env_info[s_brain_name].vector_observations
        
        # get reward and update scores
        g_rewards = env_info[g_brain_name].rewards  
        s_rewards = env_info[s_brain_name].rewards
        g_scores += g_rewards
        s_scores += s_rewards
        
        # check if episode finished
        done = np.any(env_info[g_brain_name].local_done)  
        
        # roll over states to next time step
        g_states = g_next_states
        s_states = s_next_states
        
        # exit loop if episode finished
        if done:                                           
            break
    print('Scores from episode {}: {} (goalies), {} (strikers)'.format(i+1, g_scores, s_scores))

env.close()

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 2
        Number of External Brains : 2
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: GoalieBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 
Unity brain name: StrikerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 6
        Vector Action descriptions: , , , , , 


['GoalieBrain', 'StrikerBrain']
Number of goalie agents: 2
Number of striker agents: 2
Number of goalie actions: 4
Number of striker actions: 6
There are 2 goalie agents. Each receives a state with length: 336
There are 2 striker agents. Each receives a state with length: 336
Critic(
  (fc1): Linear(in_features=336, out_features=400, bias=True)
  (fc2): Linear(in_features=404, out_features=300, bias=True)
  (fc3): Linear(in_features=300, out_features=1, bias=True)
)
Actor(
  (fc1): Linear(in_features=336, out_features=400, bias=True)
  (fc2): Linear(in_features=400, out_features=300, bias=True)
  (fc3): Linear(in_features=300, out_features=4, bias=True)
)
Critic(
  (fc1): Linear(in_features=336, out_features=400, bias=True)
  (fc2): Linear(in_features=406, out_features=300, bias=True)
  (fc3): Linear(in_features=300, out_features=1, bias=True)
)
Actor(
  (fc1): Linear(in_features=336, out_features=400, bias=True)
  (fc2): Linear(in_features=400, out_features=300, bias=True)
  (fc3): Li

RuntimeError: size mismatch, m1: [64 x 401], m2: [406 x 300] at /pytorch/aten/src/THC/generic/THCTensorMathBlas.cu:249