In [1]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from time import sleep
from IPython.display import clear_output
from collections import namedtuple, deque

from unityagents import UnityEnvironment

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

In [3]:
from buffer import ReplayBuffer
from model import ActorNetwork, CriticNetwork

In [4]:
import warnings
warnings.simplefilter('ignore', UserWarning)

In [5]:
# Apply matplotlib's built-in 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')

## Set Configs

In [6]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is the module-level handle that PPOAgent moves networks/tensors to.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

In [7]:
# --- PPO hyperparameters (read by PPOAgent.__init__) ---
BATCH_SIZE = 32          # minibatch size for each optimisation step
GAMMA = 0.995            # reward discount factor
ENTROPY_WEIGHT = 0.001   # weight of the entropy bonus in the policy loss
EPS_CLIP = 0.1           # clipping range for the probability ratio
GRAD_CLIP = 0.5          # max gradient norm for actor and critic

# --- Learning rates, selected by the agent's mode ---
GOALIE_LR = 8e-5
STRIKER_LR = 1e-4

In [8]:
# Integer keys for the actor and the critic.
# NOTE(review): not referenced by any cell visible in this notebook —
# presumably used to index (actor, critic) pairs elsewhere; confirm.
ACTOR_KEY, CRITIC_KEY = 0, 1

## Set Environment

In [9]:
# Start the Unity Soccer build without a rendering window.
env = UnityEnvironment(no_graphics=True, file_name='./Soccer.app')

# The environment exposes two brains: index 0 is the goalie and index 1 the
# striker (matches the order reported in the startup log below).
goalie_brain_name, striker_brain_name = env.brain_names[0], env.brain_names[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 2
        Number of External Brains : 2
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: GoalieBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 
Unity brain name: StrikerBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 112
        Number of stacked Vector Observation: 3
        Vector Action space type: discrete
        Vector Action space size (per agent): 6
        Vector Action descriptions: , , , , , 


## Define [PPO](https://arxiv.org/pdf/1707.06347.pdf) Agent

In [10]:
class PPOAgent():
    """PPO actor-critic agent that interacts with and learns from the environment.

    One instance drives a single Unity brain ('Goalie' or 'Striker'). The actor
    and the critic are separate networks trained by one shared Adam optimizer
    with the clipped-surrogate PPO objective.
    """

    # Multiplicative decays applied once at the end of every `learn` call.
    EPS_CLIP_DECAY = 1.0     # 1.0 keeps the clipping range constant
    ENTROPY_DECAY = 0.995    # slowly reduces the exploration bonus

    def __init__(self, env, mode='Goalie', seed=90):
        """Initialize an Agent object.

        Args:
            env: Unity environment; it is reset in training mode to read the
                observation size of the selected brain.
            mode: 'Goalie' (brain index 0) or 'Striker' (brain index 1).
            seed: seed forwarded to the networks and to the buffer.

        Raises:
            ValueError: if `mode` is neither 'Goalie' nor 'Striker'.
        """

        # Validate the mode first so that a typo fails loudly instead of
        # leaving `brain_name` / `self.lr` undefined further down.
        if mode == 'Goalie':
            brain_index, self.lr = 0, GOALIE_LR
        elif mode == 'Striker':
            brain_index, self.lr = 1, STRIKER_LR
        else:
            raise ValueError(f"mode must be 'Goalie' or 'Striker', got {mode!r}")

        self.env_info = env.reset(train_mode=True)
        brain_name = env.brain_names[brain_index]

        self.state_size = self.env_info[brain_name].vector_observations.shape[1]
        self.action_size = env.brains[brain_name].vector_action_space_size

        self.batch_size = BATCH_SIZE
        self.gamma = GAMMA
        self.entropy_weight = ENTROPY_WEIGHT
        self.eps_clip = EPS_CLIP
        self.grad_clip = GRAD_CLIP

        self.actor = ActorNetwork(self.state_size, self.action_size, seed).to(device)
        self.critic = CriticNetwork(self.state_size, seed).to(device)
        self.optimizer = torch.optim.Adam(
            list(self.actor.parameters()) + list(self.critic.parameters()),
            lr=self.lr,
        )

        self.buffer = ReplayBuffer(self.batch_size, seed)

    def act(self, states):
        """Sample an action for a single observation.

        Args:
            states: one observation vector (array-like); a batch dimension of
                size 1 is added before it is passed to the actor.

        Returns:
            (action, log_prob) as plain Python scalars.
        """

        states = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=device).unsqueeze(0)

        self.actor.eval()
        with torch.no_grad():
            actions, action_log_probs, _ = self.actor(states)
        # Restore training mode on the actor (the original referenced a
        # non-existent `self.policy`).
        self.actor.train()

        return actions.item(), action_log_probs.item()

    def memorize(self, actor_state, critic_state, action, log_prob, reward):
        """Store one transition in the agent's buffer."""

        self.buffer.add(actor_state, critic_state, action, log_prob, reward)

    def learn(self, gamma=None):
        """Run one pass of PPO minibatch updates over the buffered experiences.

        Args:
            gamma: discount factor; defaults to `self.gamma` when omitted.

        Returns:
            Mean total loss over the minibatches as a float, or None when the
            buffer holds no experiences.
        """

        if gamma is None:
            gamma = self.gamma

        (actor_states, critic_states, actions,
         old_log_probs, rewards, num_experiences) = self.buffer.get_experiences()

        if num_experiences == 0:
            return None

        # returns[t] = sum_{k >= t} gamma**k * r_k, i.e. the reward-to-go with
        # discounting measured from the first buffered step.
        # NOTE(review): this treats the whole buffer as one trajectory — confirm
        # against how the training loop fills and clears the buffer.
        discount = gamma ** np.arange(num_experiences)
        discounted = np.asarray(rewards, dtype=np.float64).reshape(-1) * discount
        returns = discounted[::-1].cumsum(axis=0)[::-1]

        actor_states = torch.as_tensor(np.asarray(actor_states), dtype=torch.float32, device=device)
        critic_states = torch.as_tensor(np.asarray(critic_states), dtype=torch.float32, device=device)
        actions = torch.as_tensor(np.asarray(actions), dtype=torch.long, device=device).reshape(-1)
        old_log_probs = torch.as_tensor(np.asarray(old_log_probs), dtype=torch.float32, device=device).reshape(-1)
        # .copy() removes the negative stride left by the [::-1] view.
        returns = torch.as_tensor(returns.copy(), dtype=torch.float32, device=device)

        # Baseline values are computed once, without gradients, for the advantages.
        self.critic.eval()
        with torch.no_grad():
            baseline_values = self.critic(critic_states).reshape(-1)
        self.critic.train()

        advantages = returns - baseline_values
        if num_experiences > 1:
            # std() of a single sample is NaN, so only normalise real batches.
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)

        batches = BatchSampler(SubsetRandomSampler(range(0, num_experiences)), self.batch_size, drop_last=False)

        losses = []

        for batch_indices in batches:

            batch_indices = torch.as_tensor(batch_indices, dtype=torch.long, device=device)

            # Slice into batch-local names; the full tensors must stay intact
            # because every later minibatch indexes into them again.
            batch_actor_states = actor_states[batch_indices]
            batch_critic_states = critic_states[batch_indices]
            batch_actions = actions[batch_indices]
            batch_old_log_probs = old_log_probs[batch_indices]
            batch_returns = returns[batch_indices]
            batch_advantages = advantages[batch_indices]

            _, log_probs, dist_entropies = self.actor(batch_actor_states, batch_actions)
            state_values = self.critic(batch_critic_states).reshape(-1)

            # Flatten so the ratio stays element-wise even if the actor returns (B, 1).
            ratios = torch.exp(log_probs.reshape(-1) - batch_old_log_probs)

            surrogate1 = ratios * batch_advantages
            surrogate2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * batch_advantages

            policy_loss = -torch.min(surrogate1, surrogate2).mean() - self.entropy_weight * dist_entropies.mean()
            value_loss = 0.5 * (batch_returns - state_values).pow(2).mean()

            total_loss = policy_loss + value_loss

            self.optimizer.zero_grad()
            total_loss.backward()
            nn.utils.clip_grad_norm_(self.actor.parameters(), self.grad_clip)
            nn.utils.clip_grad_norm_(self.critic.parameters(), self.grad_clip)
            self.optimizer.step()

            losses.append(total_loss.item())

        self.eps_clip *= self.EPS_CLIP_DECAY
        self.entropy_weight *= self.ENTROPY_DECAY

        return float(np.mean(losses))

    def save(self, actor_path, critic_path):
        """Write the actor and critic weights to the given paths.

        Parent directories of both paths are created when missing.
        """

        for path in (actor_path, critic_path):
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)

        torch.save(self.actor.state_dict(), actor_path)
        torch.save(self.critic.state_dict(), critic_path)

    def load(self, actor_path, critic_path):
        """Load actor and critic weights saved by `save`.

        Tensors are mapped onto the current `device`, so checkpoints written on
        a GPU machine can be restored on a CPU-only one.
        """

        self.actor.load_state_dict(torch.load(actor_path, map_location=device))
        self.critic.load_state_dict(torch.load(critic_path, map_location=device))

    def watch(self, num_episodes=10, max_time=2000):
        """Placeholder: not implemented; does nothing and returns None."""
        pass

In [11]:
# One PPO agent per brain, both built with the same seed.
goalie = PPOAgent(env, mode='Goalie', seed=90)
striker = PPOAgent(env, mode='Striker', seed=90)

## Train The Agent

In [12]:
def train_agent(num_episodes=2000, max_time=2000):
    """Placeholder for the training loop.

    The body is intentionally empty: the function accepts the episode count
    and the per-episode step limit, does nothing, and returns None.
    """
    return None

## Evaluate The Agent

In [None]:
# Plot the score obtained in each episode and store the figure on disk.
# NOTE(review): `scores` must already exist in the kernel; no cell visible in
# this notebook defines it — confirm where it is produced.
fig, ax = plt.subplots(figsize=(10, 5))
episode_axis = np.arange(len(scores))
ax.plot(episode_axis, scores, color='green')
ax.set_xlabel('Num of episodes')
ax.set_ylabel('Score')

images_dir = './images/'
if not os.path.exists(images_dir):
    os.makedirs(images_dir)
fig.savefig('./images/plot_of_ppo_evaluation.png')
plt.show()

## 🎬 Watch The Smart Agent

In [None]:
# Restore the trained weights of both agents. The notebook defines `goalie` /
# `striker` and their brain names; `agent` and `brain_name` do not exist here.
# NOTE(review): assumes checkpoints were saved as Actor_<brain>.pth /
# Critic_<brain>.pth per brain — confirm against the training loop.
goalie.load(f'./agents/Actor_{goalie_brain_name}.pth', f'./agents/Critic_{goalie_brain_name}.pth')
striker.load(f'./agents/Actor_{striker_brain_name}.pth', f'./agents/Critic_{striker_brain_name}.pth')

In [None]:
# Watch the trained agents. `agent` is never defined in this notebook, so the
# two agents created above are used instead.
goalie.watch(num_episodes=10)
striker.watch(num_episodes=10)

---