# Proximal Policy Optimization

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

from collections import namedtuple, deque

In [2]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.distributions import MultivariateNormal

In [3]:
from buffer import BasicBuffer
from model import PolicyNetwork

In [4]:
import warnings
warnings.simplefilter('ignore', UserWarning)

In [5]:
plt.style.use('ggplot')

## Set Configs

In [6]:
is_cuda = torch.cuda.is_available()

if is_cuda: device = torch.device('cuda')
else: device = torch.device('cpu')

In [7]:
GAMMA = 0.99
PRINT_EVERY = 20

LR = 3e-4
BETAS = (0.9, 0.999)
EPS_CLIP = 0.2
ACTION_STD = 0.5
K_EPOCHS = 80
UPDATE_TIME_STEP = 4000

## Set Environment

In [8]:
ENV_NAME = 'BipedalWalker-v2'
env = gym.make(ENV_NAME).unwrapped; env.seed(90);

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [9]:
print('Environment Display:')
env.reset() # reset environment to a new, random state
env.render()

print('State space {}'.format(env.observation_space))
print('Action space {}'.format(env.action_space))

Environment Display:
State space Box(24,)
Action space Box(4,)


## Define [PPO](https://arxiv.org/pdf/1707.06347.pdf) Agent

In [10]:
class PPOAgent():
    """The Agent that will interacts with and learns from the environment."""
    
    def __init__(self, env, seed):
        """Initialize an Agent object."""
        
        self.env = env
        
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        
        self.gamma = GAMMA
        self.lr = LR
        self.betas = BETAS
        self.eps_clip = EPS_CLIP
        self.action_std = ACTION_STD
        self.K_epochs = K_EPOCHS

        self.policy = PolicyNetwork(self.state_size, self.action_size, self.action_std, seed).to(device)
        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.lr, betas=self.betas)
        
        self.policy_old = PolicyNetwork(self.state_size, self.action_size, self.action_std, seed).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        # set buffer
        self.buffer = BasicBuffer()
        
        # set loss function
        self.mse_loss = nn.MSELoss()
    
    def calculate_action(self, state, action_mean, action_var):
    
        cov_mat = torch.diag(action_var).to(device)
        dist = MultivariateNormal(action_mean, cov_mat)
        
        action = dist.sample()
        action_log_prob = dist.log_prob(action)
    
        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.log_probs.append(action_log_prob)
        
        return action.detach()
    
    def act(self, state):
        
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        
        action_mean, action_var = self.policy_old.act(state)
        action = self.calculate_action(state, action_mean, action_var)
        action = action.cpu().data.numpy().flatten()
        
        return action
    
    def evaluate_policy(self, state, action):
        
        action_mean, action_var = self.policy.act(state)
        action_var = action_var.expand_as(action_mean)
        
        cov_mat = torch.diag_embed(action_var).to(device)
        dist = MultivariateNormal(action_mean, cov_mat)
        
        action_log_prob = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.policy.evaluate(state)
        
        return action_log_prob, torch.squeeze(state_value), dist_entropy
        
    def learn(self):
        
        rewards = []
        discounted_reward = 0
        
        for reward, done in zip(reversed(self.buffer.rewards), reversed(self.buffer.dones)):
            if done:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)
            
        # normalize the rewards
        rewards = torch.tensor(rewards).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
        
        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states).to(device), 1).detach()
        old_actions = torch.squeeze(torch.stack(self.buffer.actions).to(device), 1).detach()
        old_log_probs = torch.squeeze(torch.stack(self.buffer.log_probs).to(device), 1).detach()
        
        # optimize policy for K epochs without generating new trajectories
        for _ in range(self.K_epochs):
            
            # evaluate old actions and values to learn surrogate function
            log_probs, state_values, dist_entropy = self.evaluate_policy(old_states, old_actions)
            
            # find the ratio of pi_theta and pi_theta_old
            ratios = torch.exp(log_probs - old_log_probs.detach())
            
            # compute surrogate function (clip it to ensure that the new policy remains close to the old one)
            advantages = rewards - state_values.detach()
            surrogate1 = ratios * advantages
            surrogate2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            loss = -torch.min(surrogate1, surrogate2) + 0.5 * self.mse_loss(state_values, rewards) - 0.01 * dist_entropy
            loss = loss.mean()
            
            # backpropagate gradient loss
            self.policy_optimizer.zero_grad()
            loss.backward()
            self.policy_optimizer.step()
        
        # copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())
        
    def save(self, policy_path):
        
        if not os.path.exists('./agents/'): os.makedirs('./agents/')
        torch.save(self.policy.state_dict(), policy_path)
        
    def watch(self, num_episodes):
        
        for i_episode in range(1, num_episodes+1):
    
            state = env.reset()
            rewards = []
            for time_step in range(1000):

                env.render() # render the screen

                action = self.act(state) # select an action
                next_state, reward, done, _ = env.step(action)

                state = next_state
                rewards.append(reward)
                if done:
                    break

            print(f'\rEpisode: {i_episode}, Average Score: {sum(rewards):.2f}')

        env.close()

In [11]:
agent = PPOAgent(env, seed=90)

## Train The Agent

In [12]:
def train_agent(num_episodes=10000, max_time=1500):
    
    scores = []
    scores_window = deque(maxlen=PRINT_EVERY)
    
    time_to_update = 0
    average_length = 0
    
    # training loop
    for i_episode in range(1, num_episodes+1):

        state = env.reset()
        score = 0
        
        for time_step in range(max_time):
            
            time_to_update +=1
            
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            
            # saving reward and done
            agent.buffer.rewards.append(reward)
            agent.buffer.dones.append(done)
            
            # update agent
            if time_to_update % UPDATE_TIME_STEP == 0:
                agent.learn()
                agent.buffer.clear_memory()
                time_to_update = 0
                
            state = next_state
            score += reward
            if done:
                break
                
        average_length += time_step
                
        scores_window.append(score)
        scores.append(score)
        
        print(f'\rEpisode: {i_episode}, Average Length: {average_length/PRINT_EVERY}, Average Score: {np.mean(scores_window):.2f}', end='')
        
        if i_episode % PRINT_EVERY == 0:
            print(f'\rEpisode: {i_episode}, Average Length: {average_length/PRINT_EVERY}, Average Score: {np.mean(scores_window):.2f}')
            average_length = 0
            
        if np.mean(scores_window) >= 300.0:
            print(f'\nEnvironment solved in {i_episode:d} episodes!, Average Length: {average_length}, Average Score: {np.mean(scores_window):.2f}')
            agent.save(f'./agents/PPO_{ENV_NAME}.pth')
            break
            
    print('Training completed.')
    
    return scores

In [None]:
scores = train_agent(num_episodes=10000, max_time=1500)

Episode: 20, Average Length: 517.1, Average Score: -92.231
Episode: 40, Average Length: 662.05, Average Score: -86.91
Episode: 60, Average Length: 958.15, Average Score: -70.63
Episode: 80, Average Length: 1213.1, Average Score: -57.173
Episode: 100, Average Length: 1118.55, Average Score: -59.94
Episode: 120, Average Length: 1257.5, Average Score: -52.783
Episode: 140, Average Length: 1304.4, Average Score: -30.335
Episode: 160, Average Length: 1307.1, Average Score: -17.603
Episode: 180, Average Length: 1287.0, Average Score: 3.8276
Episode: 200, Average Length: 1459.15, Average Score: 31.44
Episode: 220, Average Length: 1499.0, Average Score: 54.105
Episode: 240, Average Length: 1499.0, Average Score: 69.402
Episode: 260, Average Length: 1432.05, Average Score: 73.85
Episode: 280, Average Length: 1499.0, Average Score: 98.005
Episode: 300, Average Length: 1499.0, Average Score: 109.505
Episode: 320, Average Length: 1367.45, Average Score: 95.63
Episode: 340, Average Length: 1306.25,

## Evaluate The Agent

In [None]:
plt.figure(figsize=(10,5))
plt.plot(np.arange(len(scores)), scores, color='green')
plt.xlabel('Num of episodes')
plt.ylabel('Score')
if not os.path.exists('./images/'): os.makedirs('./images/')
plt.savefig('./images/plot_of_ppo_evaluation.png')
plt.show()

## 🎬 Watch The Smart Agen

In [None]:
agent.watch(num_episodes=10)

---