# PPO

## We are going to develop the PPO algorithm, which will be the base for other algorithms such as:
## 1) Playing Montezuma's Revenge from a single demonstration
## 2) Go-Explore

In [1]:
# Importing Section

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import gym
import numpy as np
import random as rand

from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display
from Parallel_env import parallelEnv

## Device Selection

In [2]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

## Hyper-Parameters

In [3]:
# Gym id of the game; passed to parallelEnv when the training environments are built.
environment = 'PongDeterministic-v4'

# --- Training schedule ---
num_episodes = 2000     # iterations of the outer training loop
tmax = 400              # upper bound on the time steps collected per trajectory
recycling_traj = 4      # gradient steps taken on each collected batch

# --- Objective ---
discount_rate = 0.99    # per-step discount applied to the rewards
epsilon = 0.1           # half-width of the clipping interval for the probability ratio
epsilon_decay = 0.999   # epsilon is multiplied by this after every episode
beta = 0.01             # weight of the entropy-style term (the training cell re-assigns it)
beta_decay = 0.995      # beta is multiplied by this after every episode

# Mean total reward of each episode; the training loop appends to this list.
mean_rewards = []

# Action ids sent to env.step for the two moves the policy chooses between.
RIGHT, LEFT = 4, 5

## Pre-Processing Images

In [4]:
def preprocess_single(image, bkg_color = np.array([144, 72, 17])):
    """Convert one RGB frame into a cropped, downsampled grayscale image.

    The frame loses its first 34 and last 16 rows, then every second row and
    every second column is kept. For a 210x160x3 frame this gives an 80x80
    image.

    Args:
        image: array of shape (height, width, 3).
        bkg_color: RGB colour subtracted from every pixel before averaging,
            so that pixels of exactly this colour map to 0.

    Returns:
        2-D float array: the channel mean of the background-subtracted
        pixels, divided by 255.
    """
    cropped = image[34:-16:2, ::2]
    # Average over the colour channel (last axis). Averaging over axis 1
    # would collapse the image columns and return an (80, 3) array instead.
    return np.mean(cropped - bkg_color, axis=-1) / 255.

def preprocess_batch(images, bkg_color = np.array([144, 72, 17])):
    """Turn a stack of raw RGB frames into a float tensor for the policy.

    Args:
        images: sequence of frames. Axis 0 indexes the consecutive frames
            (callers in this file pass two). When frames come from parallel
            environments, axis 1 indexes the environment; otherwise that
            axis is missing and is inserted here.
        bkg_color: RGB colour subtracted from every pixel before averaging.

    Returns:
        Float tensor on `device` with the first two axes swapped, i.e.
        (environments, frames, rows, cols); 210x160 frames become 80x80.
    """
    frames = np.asarray(images)

    # Single-environment input has no environment axis: add one at position 1.
    if frames.ndim < 5:
        frames = np.expand_dims(frames, axis = 1)

    # Crop rows 34..-16, keep every second row/column, then grayscale by
    # averaging the background-subtracted colour channels.
    cropped = frames[:, :, 34:-16:2, ::2]
    gray = np.mean(cropped - bkg_color, axis = -1) / 255.0

    # Move the environment axis to the front so it acts as the batch axis.
    env_first = np.swapaxes(gray, 0, 1)
    return torch.from_numpy(env_first).float().to(device)

## Collect Trajectory function

In [5]:
def collectT(envs, policy, tmax = 320, nrandom = 5):
    """Roll out `policy` in the parallel environments and record a trajectory.

    Each recorded time step consists of two environment steps: one with the
    sampled action and one with action 0. The rewards of both are summed.

    Args:
        envs: parallel environment object exposing `ps`, `reset()` and
            `step(actions)`; the number of environments is `len(envs.ps)`.
        policy: callable mapping the preprocessed frame batch to one
            probability per environment (used as the probability of RIGHT).
        tmax: maximum number of time steps to record.
        nrandom: number of random (RIGHT/LEFT, then action 0) step pairs
            performed before recording starts.

    Returns:
        Tuple `(prob_list, state_list, action_list, reward_list)`, each a
        list with one entry per recorded time step:
        probability of the action actually taken, the policy input tensor,
        the sampled actions and the summed rewards.
    """
    n = len(envs.ps)

    state_list = []
    reward_list = []
    prob_list = []
    action_list = []

    envs.reset()

    # Send action 1 once in every environment before acting.
    start_frame, _, _, _ = envs.step([1]*n)

    # Fallback so that nrandom == 0 still has a frame pair to start from
    # (previously this case raised NameError).
    frame1 = frame2 = start_frame

    # Performing some random steps
    for _ in range(nrandom):
        frame1, _, _, _ = envs.step(np.random.choice([RIGHT, LEFT], n))
        frame2, _, _, _ = envs.step([0]*n)

    for _ in range(tmax):
        # Two stacked frames, preprocessed into the policy's input tensor.
        batch_input = preprocess_batch([frame1, frame2])

        # Only the numbers are needed here, so skip building the autograd graph.
        # The Policy network in this file returns one value per environment in
        # a trailing axis of size 1; flatten it so it lines up with the
        # per-environment random draws below.
        with torch.no_grad():
            probs = policy(batch_input).reshape(-1).cpu().detach().numpy()

        # Sample RIGHT with probability `probs`, then keep the probability of
        # whichever action was actually taken.
        actions = np.where(np.random.rand(n) < probs, RIGHT, LEFT)
        probs = np.where(actions == RIGHT, probs, 1.0 - probs)

        frame1, reward1, done1, _ = envs.step(actions)
        frame2, reward2, done2, _ = envs.step([0]*n)

        reward = reward1 + reward2

        state_list.append(batch_input)
        reward_list.append(reward)
        prob_list.append(probs)
        action_list.append(actions)

        # Stop as soon as any environment finished on EITHER of the two steps;
        # checking only the second step would miss an episode that ended on
        # the action step.
        if np.logical_or(done1, done2).any():
            break

    return prob_list, state_list, action_list, reward_list
        

## Single Play function

In [11]:
def play(env, policy, preprocess=None, nrand=5):
    """Play one game in a single environment using `policy`.

    The environment is not reset here; the caller is expected to call
    `env.reset()` beforehand. Nothing is returned: the function only drives
    the environment until it reports that the episode is done.

    Args:
        env: single environment whose `step(action)` returns
            `(frame, reward, done, info)`.
        policy: callable mapping the preprocessed frame pair to a single
            probability (used as the probability of RIGHT).
        preprocess: unused; kept so existing callers keep working.
        nrand: number of random (RIGHT/LEFT, then action 0) step pairs
            performed before the policy takes over.
    """
    # Start the game.
    start_frame, _, _, _ = env.step(1)

    # Fallback so that nrand == 0 still has a frame pair to start from
    # (previously this case raised NameError).
    frame1 = frame2 = start_frame

    # Perform nrand random steps in the beginning.
    for _ in range(nrand):
        frame1, _, _, _ = env.step(np.random.choice([RIGHT,LEFT]))
        frame2, _, _, _ = env.step(0)

    while True:
        frame_input = preprocess_batch([frame1, frame2])

        # Inference only: no autograd graph is needed, and the single-element
        # output is converted to a plain float for the comparison below.
        with torch.no_grad():
            prob = float(policy(frame_input))

        # RIGHT = 4, LEFT = 5
        action = RIGHT if rand.random() < prob else LEFT

        frame1, _, done, _ = env.step(action)
        if done:
            break

        frame2, _, done, _ = env.step(0)
        if done:
            break

## Surrogate function

In [12]:
def surrogate(policy, old_probs, states, actions, rewards, discount, epsilon = 0.1, beta = 0.1):
    """Clipped surrogate objective plus an entropy-style regularisation term.

    The list arguments are expected in the form returned by `collectT`: one
    entry per time step, each entry holding one value per environment.

    Args:
        policy: network evaluated on the stored states; its output is used
            as the probability of RIGHT.
        old_probs: probabilities of the taken actions under the policy that
            collected the data.
        states: list of policy-input tensors, one per time step.
        actions: list of sampled actions (RIGHT or LEFT).
        rewards: list of rewards.
        discount: per-step discount factor.
        epsilon: the probability ratio is clamped to [1 - epsilon, 1 + epsilon].
        beta: weight of the regularisation term.

    Returns:
        Scalar tensor: mean over all time steps and environments of
        `clipped_surrogate + beta * entropy`. The caller maximises it (the
        training loop negates it to obtain a loss).
    """
    # Discount every reward by discount**t, then accumulate from the end so
    # that row t holds the discounted reward collected from step t onwards.
    discount_factors = discount**np.arange(len(rewards))
    discounted = np.asarray(rewards) * discount_factors[:, np.newaxis]
    future = discounted[::-1].cumsum(axis = 0)[::-1]

    # Normalise across environments at every time step; the small constant
    # guards against division by zero when all environments agree.
    mean = np.mean(future, axis = 1)
    std = np.std(future, axis = 1) + 1.0e-10
    normalized = (future - mean[:, np.newaxis])/(std[:, np.newaxis])

    # Convert the Python lists of arrays through one contiguous ndarray first:
    # handing torch.tensor a list of ndarrays is converted element by element
    # and is very slow. Tensors are created on the target device directly.
    actions = torch.tensor(np.asarray(actions), dtype = torch.int8, device = device)
    old_probs = torch.tensor(np.asarray(old_probs), dtype = torch.float, device = device)
    future_rewards = torch.tensor(normalized, dtype = torch.float, device = device)

    # Fold the leading (time, environment) axes into a single batch axis for
    # the forward pass, then restore them on the output.
    states = torch.stack(states).to(device)
    policy_input = states.view(-1, *states.shape[-3:])
    probs_new_policy = policy(policy_input).view(states.shape[:-3])
    # Probability, under the current policy, of the action that was taken.
    probs_new_policy = torch.where(actions == RIGHT, probs_new_policy, 1.0 - probs_new_policy)

    ratio = probs_new_policy/old_probs

    # Take the more pessimistic of the unclipped and clipped terms.
    clip = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)
    clipped_surrogate = torch.min(ratio * future_rewards, clip * future_rewards)

    # "Mixed" entropy: new probabilities weighted by the log of the old ones;
    # the small constant keeps the logarithm finite.
    entropy = -(probs_new_policy*torch.log(old_probs+1.e-10)+ \
        (1.0-probs_new_policy)*torch.log(1.0-old_probs+1.e-10))

    return torch.mean(clipped_surrogate + beta*entropy)

## Policy

In [13]:
class Policy(nn.Module):
    """Small convolutional network mapping a stacked frame pair to one probability.

    Input: tensor of shape (batch, 2, 80, 80).
    Output: tensor of shape (batch, 1) with values in (0, 1); the rest of
    this file treats it as the probability of choosing RIGHT.
    """

    def __init__(self):
        super().__init__()
        # Two strided convolutions: 80x80 -> 38x38 -> 9x9 spatially.
        self.conv1 = nn.Conv2d(in_channels = 2, out_channels = 4, kernel_size = 6, stride = 2, bias = False)
        self.conv2 = nn.Conv2d(in_channels = 4, out_channels = 16, kernel_size = 6, stride = 4)
        # Length of the flattened conv2 output for an 80x80 input.
        self.size = 9*9*16

        # Fully connected head ending in a single unit.
        self.linear1 = nn.Linear(in_features = self.size, out_features = 256)
        self.linear2 = nn.Linear(in_features = 256, out_features = 1)

        self.sig = nn.Sigmoid()

    def forward(self, state):
        """Return the network output for a batch of stacked frames."""
        features = F.relu(self.conv1(state))
        features = F.relu(self.conv2(features))
        flat = features.view(-1, self.size)
        hidden = F.relu(self.linear1(flat))
        logit = self.linear2(hidden)
        return self.sig(logit)

## Instantiating objects

In [14]:
# Policy network and its optimiser.
agent = Policy().to(device)
optimizer = optim.Adam(agent.parameters(), lr = 1e-4)

# Environments used to collect training trajectories (constructed with n = 8).
envs = parallelEnv(environment, n = 8, seed = 12345)

# Separate single environment, used only to record videos into ./vid.
# It is built from the same `environment` id as the training environments so
# the two cannot drift apart; video_callable is true for every 10th episode.
env = gym.make(environment)
env = gym.wrappers.Monitor(env, "./vid" , video_callable=lambda episode_id: (episode_id+1)%10==0, force = True)

Process Process-16:
Process Process-10:
Process Process-9:
Process Process-12:
Process Process-13:
Process Process-15:
Process Process-11:
Traceback (most recent call last):
Traceback (most recent call last):
Process Process-14:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiproce

## Training 

In [15]:
# Entropy-term weight used for this run (replaces the earlier value of beta).
beta = 0.1
for ep in range(num_episodes):
    # Collect a fresh batch of experience with the current policy.
    old_probs, states, actions, rewards = collectT(envs, agent, tmax = tmax)

    # Sum over time steps: one total reward per environment, then their mean.
    total_rewards = np.sum(rewards, axis = 0)
    episode_mean = np.mean(total_rewards)

    # Reuse the same batch for several gradient steps. The surrogate is
    # negated because the optimiser minimises.
    for _ in range(recycling_traj):
        L = - surrogate(agent, old_probs, states, actions, rewards, discount_rate, epsilon, beta)

        optimizer.zero_grad()
        L.backward()
        optimizer.step()

    # Shrink the clipping range and the entropy weight a little every episode.
    epsilon *= epsilon_decay
    beta *= beta_decay

    mean_rewards.append(episode_mean)

    if (ep + 1)% 10 == 0:
        print('Episode: %.2d , mean reward %.2f'%(ep + 1, episode_mean ))

    # Play one game in the monitored single environment.
    env.reset()
    play(env, agent)

Episode: 10 , mean reward -18.75
Episode: 20 , mean reward -16.88
Episode: 30 , mean reward -18.38
Episode: 40 , mean reward -17.62
Episode: 50 , mean reward -16.88
Episode: 60 , mean reward -17.12
Episode: 70 , mean reward -17.00
Episode: 80 , mean reward -16.88
Episode: 90 , mean reward -16.88
Episode: 100 , mean reward -15.62
Episode: 110 , mean reward -15.38
Episode: 120 , mean reward -13.88
Episode: 130 , mean reward -13.38
Episode: 140 , mean reward -14.12
Episode: 150 , mean reward -14.75
Episode: 160 , mean reward -11.88
Episode: 170 , mean reward -11.00
Episode: 180 , mean reward -12.25
Episode: 190 , mean reward -10.25
Episode: 200 , mean reward -12.00
Episode: 210 , mean reward -10.25
Episode: 220 , mean reward -11.00
Episode: 230 , mean reward -10.12
Episode: 240 , mean reward -10.75
Episode: 250 , mean reward -7.62
Episode: 260 , mean reward -6.00
Episode: 270 , mean reward -6.88
Episode: 280 , mean reward -7.25
Episode: 290 , mean reward -6.88
Episode: 300 , mean reward -

KeyboardInterrupt: 