# Initialize the environment

In [1]:
import random
from collections import deque

import numpy as np
import torch
from unityagents import UnityEnvironment

In [2]:
# Start the Unity environment from the `Tennis.app` build.
env = UnityEnvironment(file_name="Tennis.app")

# Use the first brain reported by the environment as the default one;
# `brain_name` is the key used below to index reset()/step() results.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


# Create the Agent (Actor + Critic)

In [3]:
from model import Actor, Critic
from randomProcess import OrnsteinUhlenbeckNoise
from replayBuffer import ReplayBuffer

In [8]:
import torch.optim as optim
import torch.nn.functional as F

class DDPG_Agent():
    """A single DDPG learner (actor + critic, each with a target copy).

    The agent is meant to be one member of a multi-agent group: its critic is
    evaluated on the concatenation of every agent's states and actions, while
    its actor only sees the agent's own state.

    The class reads the module-level names ``device``, ``LR_ACTOR``,
    ``LR_CRITIC``, ``GAMMA`` and ``TAU`` at call time, so they must be defined
    before an instance is created or trained.

    NOTE(review): the critic is constructed with the per-agent ``state_size``
    and ``action_size`` but is called with tensors concatenated over all
    agents -- confirm that ``model.Critic`` sizes its input accordingly.
    """

    def __init__(self,state_size,action_size,index):
        """Build the local/target networks, their optimizers and the noise process.

        Args:
            state_size: size of one agent's state vector.
            action_size: size of one agent's action vector.
            index: position of this agent inside the per-agent experience lists.
        """
        self.action_size = action_size
        self.state_size = state_size
        self.index = index

        self.actor_local = Actor(state_size,action_size).to(device)
        self.actor_target = Actor(state_size,action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),lr = LR_ACTOR)
        self.critic_local = Critic(state_size,action_size).to(device)
        self.critic_target = Critic(state_size,action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),lr = LR_CRITIC)

        # Start the target networks as exact copies of the local ones.
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target,self.critic_local)

        self.noise = OrnsteinUhlenbeckNoise(action_size)
        self.timesteps = 0

    def hard_update(self,target,source):
        """Copy every parameter of ``source`` into ``target`` in place."""
        for target_params,source_params in zip(target.parameters(),source.parameters()):
            target_params.data.copy_(source_params.data)

    def act(self,state,add_noise = True):
        """Return the local actor's action for ``state``, clipped to [-1, 1].

        Args:
            state: NumPy array holding this agent's observation.
            add_noise: when True, a sample from ``self.noise`` is added before
                clipping.

        Returns:
            NumPy array with the (optionally noisy) action.
        """
        state = torch.from_numpy(state).float().to(device)
        # Evaluate in eval mode without tracking gradients, then switch the
        # actor back to train mode for subsequent learning steps.
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action,-1,1)

    def reset(self):
        """Reset the exploration-noise process."""
        self.noise.reset()

    def learn(self,experiences):
        """Run one critic update, one actor update and a soft target update.

        Args:
            experiences: 5-tuple ``(states, actions, rewards, next_states,
                dones)``. Each element is a sequence indexed by agent;
                ``states``, ``actions`` and ``next_states`` hold tensors that
                can be concatenated along ``dim=1``.
        """
        states,actions,rewards,next_states,dones = experiences
        # Take the agent count from the batch itself so the class does not
        # depend on a notebook-level `num_agents` variable.
        n_agents = len(actions)

        whole_states = torch.cat(states, dim=1).to(device)
        whole_next_states = torch.cat(next_states, dim=1).to(device)
        whole_actions = torch.cat(actions, dim=1).to(device)

        # ---- critic update -------------------------------------------------
        # The TD target is a constant for the regression below; computing it
        # without autograd keeps gradients out of the target networks.
        with torch.no_grad():
            # NOTE(review): the other agents' *stored* actions are reused as
            # their next actions; only this agent's entry comes from its
            # target actor. Confirm this approximation is intended.
            next_actions = [actions[i].clone() for i in range(n_agents)]
            next_actions[self.index] = self.actor_target(next_states[self.index])
            whole_next_actions = torch.cat(next_actions, dim=1).to(device)

            Q_target_next = self.critic_target(whole_next_states,whole_next_actions)
            Q_target = rewards[self.index] + GAMMA * Q_target_next *(1-dones[self.index])

        Q_exp = self.critic_local(whole_states,whole_actions)
        critic_loss = F.mse_loss(Q_exp,Q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---- actor update --------------------------------------------------
        # Replace only this agent's action with the differentiable actor
        # output; the other agents' actions stay as sampled.
        actions_pred = [actions[i].clone() for i in range(n_agents)]
        actions_pred[self.index] = self.actor_local(states[self.index])
        whole_actions_pred = torch.cat(actions_pred, dim=1).to(device)

        self.actor_optimizer.zero_grad()
        actor_loss = -self.critic_local(whole_states, whole_actions_pred).mean()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---- target networks -----------------------------------------------
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self,local,target,tau):
        """Blend ``local`` into ``target``: target = tau*local + (1-tau)*target."""
        for target_params,local_params in zip(target.parameters(),local.parameters()):
            target_params.data.copy_(tau * local_params.data + (1.0 - tau) * target_params.data)

# Multi-agent coordinator (MADDPG)

In [9]:
class MADDPG():
    """Coordinator that owns one ``DDPG_Agent`` per player and a replay buffer shared by all of them."""

    def __init__(self, num_agents, state_size, action_size):
        """Store the problem sizes, then create the shared buffer and the agents.

        Args:
            num_agents: how many agents to create.
            state_size: size of one agent's state vector.
            action_size: size of one agent's action vector.
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size

        # One replay memory shared by every agent.
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE)

        # One DDPG learner per agent; each is told its own slot index.
        self.agents = []
        for slot in range(num_agents):
            self.agents.append(DDPG_Agent(state_size, action_size, slot))

    def reset(self):
        """Forward ``reset()`` to every agent."""
        for member in self.agents:
            member.reset()

    def act(self,state,add_noise = True):
        """Ask each agent for an action on its own row of ``state``.

        Returns:
            Array of shape ``(num_agents, action_size)`` with one action per row.
        """
        joint_action = np.zeros([self.num_agents, self.action_size])
        for slot in range(len(self.agents)):
            member = self.agents[slot]
            joint_action[slot,:] = member.act(state[slot], add_noise = add_noise)
        return joint_action

    def step(self,states,actions,rewards,next_states,dones):
        """Store one transition and, once enough are stored, train every agent.

        A single batch is sampled and handed to each agent in turn.
        """
        self.memory.add(states, actions, rewards, next_states, dones)

        # Wait until the memory holds more than one batch worth of samples.
        if len(self.memory) <= BATCH_SIZE:
            return

        batch = self.memory.sample()
        for member in self.agents:
            member.learn(batch)

In [10]:
# Problem dimensions. 24 matches the environment banner above:
# 8 observation values per agent, stacked 3 times.
num_agents = 2
state_size = 24
action_size = 2

# Optimizer step sizes. These were written as `10e-3`, which is 0.01 -- the
# value is kept, just spelled unambiguously.
# NOTE(review): if 1e-3 was intended, change both literals.
LR_ACTOR = 1e-2
LR_CRITIC = 1e-2
TAU = 0.001                     # soft-update mixing factor for the target networks
GAMMA = 0.99                    # discount factor
BUFFER_SIZE = int(1e5)          # replay buffer capacity
BATCH_SIZE = 128                # minibatch size
RANDOM_SEED = 42
UPDATE_EVERY = 10               # NOTE(review): not referenced by any code visible in this notebook

n_episodes = 2000

# Seed every random number generator available here *before* the networks are
# created, so weight initialisation and sampling are repeatable.
# NOTE(review): the Unity environment itself may need its own seed -- confirm.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

multi_agent = MADDPG(num_agents, state_size, action_size)

In [11]:
# Stopping rule: mean of the per-episode maximum score over the last
# SCORE_WINDOW episodes must reach TARGET_SCORE.
# NOTE(review): a 10-episode window is short; confirm it matches the intended
# success criterion for this environment.
SCORE_WINDOW = 10
TARGET_SCORE = 0.5

scores_deque = deque(maxlen=SCORE_WINDOW)
scores_list = []

for i_episode in range(1,n_episodes+1):
    env_info = env.reset(train_mode=True)[brain_name]
    states = env_info.vector_observations
    multi_agent.reset()
    scores = np.zeros(num_agents)
    while True:
        actions = multi_agent.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        multi_agent.step(states,actions,rewards,next_states,dones)
        scores += rewards
        states = next_states
        # The episode ends as soon as any agent reports done.
        if np.any(dones):
            break

    # An episode is scored by the better of the agents.
    max_scores = np.max(scores)
    scores_deque.append(max_scores)
    scores_list.append(max_scores)
    average_score = np.mean(scores_deque)
    print('\rEpisode {}\tAverage Score: {:.4f}'.format(i_episode, average_score), end="")

    if average_score >= TARGET_SCORE:
        # Persist the trained local networks of every agent. The previous
        # code called `maddpg.save()`, but no `maddpg` object exists and
        # MADDPG has no `save` method, so success ended in a NameError.
        for agent in multi_agent.agents:
            torch.save(agent.actor_local.state_dict(),
                       'checkpoint_actor_{}.pth'.format(agent.index))
            torch.save(agent.critic_local.state_dict(),
                       'checkpoint_critic_{}.pth'.format(agent.index))
        print('\nTarget average score reached in {} episodes; checkpoints saved.'.format(i_episode))
        break
    if i_episode % 10 == 0:
        print('\rEpisode {}\tAverage Score: {:.4f}'.format(i_episode, average_score))

Episode 10	Average Score: 0.0000
Episode 20	Average Score: 0.0000
Episode 30	Average Score: 0.0100
Episode 40	Average Score: 0.0000
Episode 50	Average Score: 0.0000
Episode 60	Average Score: 0.0100
Episode 70	Average Score: 0.0100
Episode 80	Average Score: 0.0000
Episode 90	Average Score: 0.0000
Episode 100	Average Score: 0.0000
Episode 110	Average Score: 0.0000
Episode 120	Average Score: 0.0000
Episode 130	Average Score: 0.0000
Episode 140	Average Score: 0.0000
Episode 150	Average Score: 0.0000
Episode 160	Average Score: 0.0000
Episode 170	Average Score: 0.0000
Episode 180	Average Score: 0.0000
Episode 190	Average Score: 0.0000
Episode 200	Average Score: 0.0000
Episode 210	Average Score: 0.0000
Episode 220	Average Score: 0.0000
Episode 230	Average Score: 0.0000
Episode 240	Average Score: 0.0000
Episode 250	Average Score: 0.0000
Episode 260	Average Score: 0.0000
Episode 270	Average Score: 0.0000
Episode 280	Average Score: 0.0000
Episode 290	Average Score: 0.0000
Episode 300	Average Sco

KeyboardInterrupt: 

In [12]:
# NOTE(review): `s` is not defined anywhere in this notebook, so this cell raises NameError; probably a truncated `states`.
s

NameError: name 's' is not defined

In [14]:
# Show the observations left over from the interrupted training run (one row per agent).
states

array([[ -9.23119068,  -1.55886006, -30.        ,  -0.98100001,
          7.5908556 ,   5.91759634, -30.        ,  -0.98100001,
        -10.90504646,  -1.71581995,  -0.        ,  -1.96200001,
          7.5908556 ,   5.74101639,  -0.        ,  -1.96200001,
        -10.89979172,  -1.85235918,  -0.        ,   0.        ,
          7.5908556 ,   5.46633625,  -0.        ,   0.        ],
       [-10.67537212,  -0.98316395, -30.        ,   6.21520042,
         -7.5908556 ,   5.91759634, -30.        ,   6.21520042,
        -10.90028286,  -0.42050385,   0.        ,   5.23420095,
         -7.5908556 ,   5.74101639,   0.        ,   5.23420095,
        -10.89978886,   0.0440563 ,   0.        ,   4.25320148,
         -7.5908556 ,   5.46633625,   0.        ,   4.25320148]])

In [15]:
# Show the last joint action produced by multi_agent.act (one row per agent, clipped to [-1, 1]).
actions

array([[-1.        , -0.73484945],
       [-0.92873263,  1.        ]])