# Welcome!
Below, we will learn to implement and train a policy to play Atari Pong, using only the pixels as input. We will use convolutional neural nets, multiprocessing, and PyTorch to implement and train our policy. Let's get started!

In [None]:
# install package for displaying animation
!pip install JSAnimation

# custom utilities for displaying animation, collecting rollouts and more
import pong_utils

%matplotlib inline

# check which device is being used.
# I recommend disabling gpu until you've made sure that the code runs
# (the device is chosen inside pong_utils; this cell only reports it)
device = pong_utils.device
print("using device: ",device)

In [None]:
# render ai gym environment
import gym
import time

# PongDeterministic has no random frameskip, which makes it
# faster to train on than the vanilla Pong-v4 environment
env = gym.make('PongDeterministic-v4')

# ask the underlying (unwrapped) environment for its action names
action_meanings = env.unwrapped.get_action_meanings()
print("List of available actions: ", action_meanings)

# only two actions are used: 'RIGHTFIRE' = 4 and 'LEFTFIRE' = 5
# the 'FIRE' part ensures that the game starts again after losing a life
# the actions are hard-coded in pong_utils.py

# Preprocessing
To speed up training, we can simplify the input by cropping the images and using every other pixel.



In [None]:
import matplotlib
import matplotlib.pyplot as plt

# Preview the preprocessing: grab one frame a little way into a game and
# display it next to its preprocessed version.
env.reset()
_obs, _reward, _done, _info = env.step(0)

# advance 20 steps with action 1 and keep the last frame returned
for _step in range(20):
    frame, _reward, _done, _info = env.step(1)

# left panel: the frame exactly as the environment returned it
plt.subplot(1, 2, 1)
plt.title('original image')
plt.imshow(frame)

# right panel: the same frame after pong_utils.preprocess_single
# (described by the notebook as an 80 x 80 black and white image)
plt.subplot(1, 2, 2)
plt.title('preprocessed image')
processed_frame = pong_utils.preprocess_single(frame)
plt.imshow(processed_frame, cmap='Greys')

plt.show()



# Policy

## Exercise 1: Implement your policy
 
Here, we define our policy. The input is the stack of two different frames (which captures the movement), and the output is a number $P_{\rm right}$, the probability of moving right. Note that $P_{\rm left}= 1-P_{\rm right}$

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F



# set up a convolutional neural net
# the output is the probability of moving right
# P(left) = 1-P(right)
class Policy(nn.Module):
    """Small convolutional policy network for Pong.

    The network takes a 2-channel image batch (the layer sizes below are
    worked out for 80x80 inputs) and returns one sigmoid-squashed value per
    sample.  The notebook interprets that value as P(right), with
    P(left) = 1 - P(right).
    """

    def __init__(self):
        super().__init__()

        # Spatial size after an unpadded convolution:
        #   out = floor((in - kernel_size + stride) / stride)
        # Both layers below divide evenly, so no rounding actually occurs.

        # 80x80 -> 19x19 : (80 - 8 + 4) / 4 = 19
        self.conv1 = nn.Conv2d(2, 4, kernel_size=8, stride=4)
        # 19x19 -> 8x8   : (19 - 5 + 2) / 2 = 8
        self.conv2 = nn.Conv2d(4, 12, kernel_size=5, stride=2)

        # number of features once the 12 channels of 8x8 maps are flattened
        self.size = 12 * 8 * 8

        # fully connected head ending in a single output unit
        self.fc1 = nn.Linear(self.size, 320)
        self.fc2 = nn.Linear(320, 120)
        self.fc3 = nn.Linear(120, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of stacked frames to one value in (0, 1) per sample."""
        features = F.relu(self.conv1(x))
        features = F.relu(self.conv2(features))

        # flatten everything except the batch dimension
        hidden = features.view(-1, self.size)
        for dense in (self.fc1, self.fc2):
            hidden = F.relu(dense(hidden))

        # the sigmoid squashes the final unit into a probability
        return self.sig(self.fc3(hidden))


# run your own policy!
# (swap in the commented line below to use the reference policy instead)
policy=Policy().to(device)
# policy=pong_utils.Policy().to(device)

# we use the adam optimizer with learning rate 1e-4
# optim.SGD is also possible
import torch.optim as optim
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

# Game visualization
pong_utils contains a `play` function that takes the environment and a policy. An optional preprocess function can be supplied. Here we use it to play a game and show learning progress.

In [None]:
# watch the policy play; at this point in the notebook it has not been trained yet
pong_utils.play(env, policy, time=200) 
# try to add the option "preprocess=pong_utils.preprocess_single"
# to see what the agent sees

# Function Definitions
Here you will define key functions for training. 

## Exercise 2: write your own function for training
(what I call scalar function is the same as policy_loss up to a negative sign)

### PPO
Later on, you'll implement the PPO algorithm as well, and the scalar function is given by
$\frac{1}{T}\sum^T_t \min\left\{R_{t}^{\rm future}\frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)},R_{t}^{\rm future}{\rm clip}_{\epsilon}\!\left(\frac{\pi_{\theta'}(a_t|s_t)}{\pi_{\theta}(a_t|s_t)}\right)\right\}$

the ${\rm clip}_\epsilon$ function is implemented in pytorch as ```torch.clamp(ratio, 1-epsilon, 1+epsilon)```

In [28]:
def clipped_surrogate(policy, old_probs, states, actions, rewards,
                      discount = 0.995, epsilon=0.1, beta=0.01):
    """Return the PPO clipped surrogate objective (a scalar to maximize).

    Args:
        policy: network whose output probabilities are being optimized.
        old_probs: probabilities of the taken actions under the policy
            that collected the trajectories.
        states: states visited; passed unchanged to
            ``pong_utils.states_to_prob``.
        actions: actions taken, compared against ``pong_utils.RIGHT``.
        rewards: per-step rewards, indexed ``[step][trajectory]``
            (axis 0 is time, axis 1 is the parallel trajectory).
        discount: per-step discount factor.
        epsilon: clipping half-width; the ratio is clamped to
            ``[1 - epsilon, 1 + epsilon]``.
        beta: weight of the regularization term.

    Returns:
        A scalar tensor: the mean over all steps and trajectories of the
        clipped surrogate plus ``beta`` times the regularizer.
    """
    # discount the reward collected at step t by discount**t
    discounts = discount**np.arange(len(rewards))
    discounted_reward = np.asarray(rewards) * discounts[:,np.newaxis]

    # reward-to-go: reversed cumulative sum along the *step* axis (axis 0).
    # Accumulating along axis 1 would mix rewards of different trajectories.
    future_discounted_reward = discounted_reward[::-1].cumsum(axis=0)[::-1]

    # normalize across trajectories at every step; the small constant
    # avoids division by zero when all trajectories agree
    mean_reward = np.mean(future_discounted_reward, axis=1)
    std_reward = np.std(future_discounted_reward, axis=1) + 1.0e-10

    normalized_reward = (future_discounted_reward - mean_reward[:,np.newaxis]) / std_reward[:,np.newaxis]

    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    rewards = torch.tensor(normalized_reward, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    # states_to_prob presumably returns P(right) per state -- confirm in pong_utils
    new_probs = pong_utils.states_to_prob(policy, states)
    # probability the current policy assigns to the action actually taken
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0-new_probs)

    # probability ratio pi_new / pi_old and its clipped counterpart
    ratio = new_probs / old_probs
    clip = torch.clamp(ratio, 1-epsilon, 1+epsilon)

    # pessimistic bound: element-wise minimum of the raw and clipped terms
    clipped_results = torch.min(rewards*ratio,rewards*clip)

    # include a regularization term
    # this steers new_policy towards 0.5
    # prevents policy to become exactly 0 or 1 helps exploration
    # add in 1.e-10 to avoid log(0) which gives nan
    # (as written, this is the cross-entropy between new and old probabilities)
    entropy = -(new_probs*torch.log(old_probs+1.e-10)+ \
        (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))

    return torch.mean(beta*entropy + clipped_results)


# Training
We are now ready to train our policy!
WARNING: make sure to turn on GPU, which also enables multicore processing. It may take up to 45 minutes even with GPU enabled, otherwise it will take much longer!

In [29]:
from parallelEnv import parallelEnv
import numpy as np
# keep track of how long training takes
# WARNING: running through all 800 episodes will take 30-45 minutes

# training loop max iterations
episode = 500

# widget bar to display progress
!pip install progressbar
import progressbar as pb
widget = ['training loop: ', pb.Percentage(), ' ', 
          pb.Bar(), ' ', pb.ETA() ]
timer = pb.ProgressBar(widgets=widget, maxval=episode).start()


envs = parallelEnv('PongDeterministic-v4', n=8, seed=1234)

discount_rate = .99
epsilon = 0.1
beta = .01
tmax = 320
SGD_epoch = 4

# keep track of progress
mean_rewards = []

for e in range(episode):

    # collect trajectories
    old_probs, states, actions, rewards = \
        pong_utils.collect_trajectories(envs, policy, tmax=tmax)
        
    total_rewards = np.sum(rewards, axis=0)


    # gradient ascent step
    for _ in range(SGD_epoch):
        
        # uncomment to utilize your own clipped function!
        # L = -clipped_surrogate(policy, old_probs, states, actions, rewards, epsilon=epsilon, beta=beta)

        L = -pong_utils.clipped_surrogate(policy, old_probs, states, actions, rewards,
                                          epsilon=epsilon, beta=beta)
        optimizer.zero_grad()
        L.backward()
        optimizer.step()
        del L
    
    # the clipping parameter reduces as time goes on
    epsilon*=.999
    
    # the regulation term also reduces
    # this reduces exploration in later runs
    beta*=.995
    
    # get the average reward of the parallel environments
    mean_rewards.append(np.mean(total_rewards))
    
    # display some progress every 20 iterations
    if (e+1)%20 ==0 :
        print("Episode: {0:d}, score: {1:f}".format(e+1,np.mean(total_rewards)))
        print(total_rewards)
        
    # update progress widget bar
    timer.update(e+1)
    
timer.finish()



training loop:   4% |#                                          | ETA:  0:34:15

Episode: 20, score: -11.250000
[-13. -13.  -7.  -8. -12. -14. -11. -12.]


training loop:   8% |###                                        | ETA:  0:32:14

Episode: 40, score: -10.875000
[ -9. -12.  -6. -14. -13. -11. -10. -12.]


training loop:  12% |#####                                      | ETA:  0:30:39

Episode: 60, score: -10.625000
[-10.  -8. -12.  -7. -10. -14. -13. -11.]


training loop:  16% |######                                     | ETA:  0:29:08

Episode: 80, score: -11.375000
[ -9. -13. -12. -10. -14.  -8. -14. -11.]


training loop:  20% |########                                   | ETA:  0:27:40

Episode: 100, score: -10.625000
[-11. -12. -14. -10. -10.  -8. -10. -10.]


training loop:  24% |##########                                 | ETA:  0:26:14

Episode: 120, score: -7.250000
[ -8.  -6.  -9.  -6. -11.  -6.  -6.  -6.]


training loop:  28% |############                               | ETA:  0:24:49

Episode: 140, score: -6.500000
[ -5.  -4.  -7.  -9.  -5.  -6.  -6. -10.]


training loop:  32% |#############                              | ETA:  0:23:25

Episode: 160, score: -6.750000
[-8. -9. -7. -7. -7. -6. -8. -2.]


training loop:  36% |###############                            | ETA:  0:22:01

Episode: 180, score: -5.625000
[-7. -4.  0. -8. -6. -7. -6. -7.]


training loop:  40% |#################                          | ETA:  0:20:38

Episode: 200, score: -3.375000
[-2. -9. -4. -2. -2. -3. -4. -1.]


training loop:  44% |##################                         | ETA:  0:19:15

Episode: 220, score: -3.750000
[-4.  0. -7. -6. -3. -1. -3. -6.]


training loop:  48% |####################                       | ETA:  0:17:52

Episode: 240, score: -4.125000
[-1. -3. -5. -4. -5. -2. -4. -9.]


training loop:  52% |######################                     | ETA:  0:16:30

Episode: 260, score: -1.500000
[-2.  0.  0.  0. -4.  0. -3. -3.]


training loop:  56% |########################                   | ETA:  0:15:07

Episode: 280, score: -2.875000
[-5. -3. -3. -1. -2. -1. -1. -7.]


training loop:  60% |#########################                  | ETA:  0:13:45

Episode: 300, score: -2.750000
[-5. -3. -3. -2. -3. -4. -1. -1.]


training loop:  64% |###########################                | ETA:  0:12:22

Episode: 320, score: -2.875000
[-3. -4. -3. -2. -2. -4. -4. -1.]


training loop:  68% |#############################              | ETA:  0:11:00

Episode: 340, score: -2.125000
[ 0. -2. -3. -3.  1. -4. -1. -5.]


training loop:  72% |##############################             | ETA:  0:09:37

Episode: 360, score: -2.250000
[-5. -1.  1. -3. -1. -4. -2. -3.]


training loop:  76% |################################           | ETA:  0:08:15

Episode: 380, score: -0.875000
[-1. -1. -1. -1. -1.  0. -1. -1.]


training loop:  80% |##################################         | ETA:  0:06:52

Episode: 400, score: -1.000000
[-1.  0. -1. -2. -1. -1. -1. -1.]


training loop:  84% |####################################       | ETA:  0:05:30

Episode: 420, score: -0.500000
[ 0.  0. -1.  0. -1. -1.  0. -1.]


training loop:  88% |#####################################      | ETA:  0:04:07

Episode: 440, score: -1.125000
[-2. -1. -1. -1. -1. -1. -1. -1.]


training loop:  92% |#######################################    | ETA:  0:02:45

Episode: 460, score: -1.375000
[ 0. -4. -1. -2. -2.  0. -1. -1.]


training loop:  96% |#########################################  | ETA:  0:01:22

Episode: 480, score: -1.000000
[-1. -1. -1. -1. -1. -1. -1. -1.]


training loop: 100% |###########################################| Time: 0:34:23

Episode: 500, score: -0.875000
[ 0. -1. -2. -1. -1. -1.  0. -1.]





In [30]:
# watch the policy play again, now that the training loop has run
pong_utils.play(env, policy, time=200) 

In [31]:
# save your policy!
# NOTE(review): this serializes the whole module object rather than its
# state_dict, so loading it back presumably needs the same Policy class
# definition to be available -- confirm against the PyTorch serialization docs
torch.save(policy, 'PPO.policy')

# load policy if needed
# policy = torch.load('PPO.policy')

# try and test out the solution
# make sure GPU is enabled, otherwise loading will fail
# (the PPO version can win more often than not)!
#
# policy_solution = torch.load('PPO_solution.policy')
# pong_utils.play(env, policy_solution, time=2000) 

  "type " + obj.__name__ + ". It won't be checked "
