<a href="https://colab.research.google.com/github/ounospanas/AIDL_B02/blob/main/AIDL_B02_REINFORCE_Pong_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install "gymnasium[atari,accept-rom-license]" ale-py shimmy[atari]

Collecting shimmy[atari]
  Downloading Shimmy-2.0.0-py3-none-any.whl.metadata (3.5 kB)
[0mDownloading Shimmy-2.0.0-py3-none-any.whl (30 kB)
Installing collected packages: shimmy
Successfully installed shimmy-2.0.0


In [21]:
""" Trains an agent with (stochastic) Policy Gradients on Pong. Uses OpenAI Gym. """
""" This is a pytorch implemetation of this https://gist.github.com/karpathy/a4166c7fe253700972fcbc77e4ea32c5"""

import numpy as np
import time
import gymnasium as gym
import ale_py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from matplotlib import pylab as plt
%matplotlib inline

In [22]:
#create policy network
class PolicyGradientNet(nn.Module):
    """Convolutional policy network that outputs action probabilities.

    Three 5x5, stride-2 convolutions (each followed by batch norm and ReLU)
    feed one linear layer; `forward` applies a softmax over dim 1.

    Args:
        n_actions: size of the output distribution. The default (6) matches
            the previously hard-coded head width.
        in_channels: number of channels of the input image (default 1).
        input_size: (height, width) of the input image, used to size the
            linear head. The default (80, 80) yields the previously
            hard-coded 1568 flattened features.
    """

    def __init__(self, n_actions=6, in_channels=1, input_size=(80, 80)):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm2d(32)
        self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
        self.bn3 = nn.BatchNorm2d(32)

        # Spatial size after the three unpadded convolutions; 32 is the
        # channel count produced by conv3.
        height, width = input_size
        for _ in range(3):
            height = self._conv_out(height)
            width = self._conv_out(width)
        self.head = nn.Linear(32 * height * width, n_actions)

    @staticmethod
    def _conv_out(size, kernel_size=5, stride=2):
        """Output length of one unpadded convolution along a single axis."""
        return (size - kernel_size) // stride + 1

    def forward(self, x):
        """Return a (batch, n_actions) tensor of action probabilities.

        `x` must be a 4D tensor (batch, in_channels, height, width) whose
        spatial size matches the `input_size` given at construction.
        """
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        # Flatten everything except the batch dimension before the linear head.
        return F.softmax(self.head(x.flatten(1)), dim=1)

In [23]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [24]:
# Build the policy network, then move its parameters to the selected device.
model = PolicyGradientNet()
model = model.to(device)

In [25]:
# def loss_fn(preds, r):
#     # pred is output from neural network, a is action index
#     # r is return (sum of rewards to end of episode), d is discount factor
#     r = torch.Tensor(r)
#     return torch.sum(-r * torch.log(preds)) # element-wise multiply, then sum

In [26]:
# hyperparameters
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
learning_rate = 0.0001
render = False

# decay_rate is passed as RMSprop's smoothing constant `alpha`; previously it
# was declared but never used, so changing it had no effect on training.
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate, alpha=decay_rate)

In [27]:
def prepro(I):
    """Preprocess a 210x160x3 uint8 frame into an 80x80 float64 array of 0/1.

    Rows 35..194 are kept and then both axes are downsampled by a factor of
    two, taking channel 0 only. Pixels equal to 144 or 109 (the two background
    values) or already 0 map to 0.0; every other pixel (paddles, ball) maps to
    1.0.

    The input frame is NOT modified: the mask is computed into a new array
    instead of writing through a view of the caller's frame.
    """
    # Crop and downsample in a single slice (equivalent to I[35:195][::2, ::2, 0]).
    pixels = I[35:195:2, ::2, 0]
    foreground = (pixels != 0) & (pixels != 144) & (pixels != 109)
    return foreground.astype(np.float64)

def discount_rewards(r, discount=None):
    """Compute discounted returns from an array of per-step rewards.

    Walks the rewards backwards accumulating `running * discount + r[t]`.
    The running sum is reset whenever a non-zero reward is seen, because in
    Pong a non-zero reward marks a game boundary.

    Args:
        r: rewards, either 1D or a column of shape (N, 1).
        discount: discount factor; when None the notebook-level `gamma`
            hyperparameter is used (the original behaviour).

    Returns:
        Array with the same shape as `r`. Floating inputs keep their dtype;
        integer inputs are promoted to float64 so that fractional discounted
        values are not truncated.
    """
    if discount is None:
        discount = gamma
    r = np.asarray(r)
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    flat_r = r.reshape(-1)
    discounted_r = np.zeros(flat_r.shape, dtype=out_dtype)
    running_add = 0.0
    for t in reversed(range(flat_r.size)):
        if flat_r[t] != 0:
            running_add = 0.0 # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * discount + flat_r[t]
        discounted_r[t] = running_add
    return discounted_r.reshape(r.shape)

In [None]:
# REINFORCE training loop: play one episode (or trajectory of at most MAX_DUR
# steps) with the current policy, then take one gradient step on the
# normalized discounted returns.
gym.register_envs(ale_py)
env = gym.make('PongNoFrameskip-v4', render_mode='rgb_array')

reward_sum = 0
super_reward_sum = 0

time_steps = []
training_epochs = 2000
MAX_DUR = 10000
load = False
count = 0

if load:
    # Copy the saved weights INTO the existing model instead of rebinding
    # `model`: the optimizer was built on this model's parameters, so a new
    # object would never be updated by optimizer.step().
    # SECURITY: the checkpoint is a fully pickled module (see torch.save below),
    # which executes arbitrary code on load -- only load files you trust.
    saved_model = torch.load('pong_pytorch.pt', map_location=device, weights_only=False)
    model.load_state_dict(saved_model.state_dict())

eps = np.finfo(np.float32).eps.item()

try:
    for _ in range(training_epochs):

        if render: env.render() # in case you run the code in a local machine
        prev_x = None # used in computing the difference frame
        # Start each episode from the freshly reset frame (previously the reset
        # result was stored under another name and the stale last frame of the
        # previous episode was preprocessed instead).
        observation, info = env.reset()
        transitions = [] # list of state, action probability, reward

        for t in range(MAX_DUR): #while in episode

            # preprocess the observation, set input to network to be difference image
            cur_x = prepro(observation)
            x = cur_x - prev_x if prev_x is not None else np.zeros_like(cur_x)
            prev_x = cur_x
            x = torch.from_numpy(x).view(1, 1, 80, 80)

            # forward the policy network and sample an action from the returned probability
            aprob = model(x.to(device, dtype=torch.float))
            m = Categorical(aprob) # distribution over actions built from the probabilities
            action = m.sample().item() # get only value

            # step the environment and get new measurements
            observation, reward, terminated, truncated, info = env.step(action)
            reward_sum += reward

            # keep the probability of the chosen action (still attached to the
            # autograd graph) together with the state and the reward
            transitions.append((x, aprob[0][action], reward))

            # you can comment this out to make the task based on trajectories not episodes
            if terminated or truncated:
                break

        # Optimize policy network with full episode/trajectory
        ep_len = len(transitions) # episode/trajectory length
        time_steps.append(ep_len)
        count += 1

        preds = torch.stack([step_prob for (_state, step_prob, _reward) in transitions])
        epr = np.vstack([step_reward for (_state, _prob, step_reward) in transitions])

        # normalize rewards
        discounted_rewards = discount_rewards(epr)
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards) + eps

        # compute loss; returns live on the same device as the probabilities, and
        # the clamp keeps log() finite if a sampled probability underflows to 0
        returns = torch.as_tensor(discounted_rewards, dtype=torch.float32,
                                  device=preds.device).reshape(-1)
        policy_loss = -torch.sum(returns * torch.log(preds.clamp_min(eps)))
        loss = policy_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print("For episode {} the reward is: {}".format(count,reward_sum))
        super_reward_sum += reward_sum
        reward_sum = 0

        if count%100==0:
            print('The mean of last 100 rewards is {}'.format(super_reward_sum/100))
            super_reward_sum = 0
            torch.save(model, 'pong_pytorch.pt')
finally:
    # release the emulator even if training is interrupted
    env.close()

For episode 1 the reward is: -21.0
For episode 2 the reward is: -21.0
For episode 3 the reward is: -21.0
For episode 4 the reward is: -20.0
For episode 5 the reward is: -20.0
For episode 6 the reward is: -20.0
For episode 7 the reward is: -20.0
For episode 8 the reward is: -20.0
For episode 9 the reward is: -21.0
For episode 10 the reward is: -21.0
For episode 11 the reward is: -20.0
For episode 12 the reward is: -21.0
For episode 13 the reward is: -19.0
For episode 14 the reward is: -19.0
For episode 15 the reward is: -18.0
For episode 16 the reward is: -20.0
For episode 17 the reward is: -20.0
For episode 18 the reward is: -19.0
For episode 19 the reward is: -20.0
For episode 20 the reward is: -21.0
For episode 21 the reward is: -18.0
For episode 22 the reward is: -19.0
For episode 23 the reward is: -18.0
For episode 24 the reward is: -20.0
For episode 25 the reward is: -19.0
For episode 26 the reward is: -20.0
For episode 27 the reward is: -18.0
For episode 28 the reward is: -21.0
F

In [None]:
!apt-get install -y xvfb x11-utils
!pip install pyvirtualdisplay==0.2.*



In [None]:
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display

In [None]:
# load the trained model from file
# The checkpoint written by torch.save(model, ...) is a fully pickled module,
# so it must be loaded with weights_only=False (recent PyTorch defaults to
# True and rejects such files). Unpickling executes arbitrary code: only load
# checkpoints you trust. map_location lets a GPU-trained file load on CPU.
model = torch.load('pong_pytorch.pt', map_location=device, weights_only=False)

In [None]:
# Play one episode with the loaded policy while recording it to an MP4 file.
before_training = "pong_trained.mp4"

from gym import wrappers
# render_mode is requested explicitly, mirroring the training cell.
# NOTE(review): presumably the recorder needs an rgb_array-capable env to
# capture frames -- confirm against the installed VideoRecorder version.
env = gym.make("PongDeterministic-v4", render_mode='rgb_array')
video = VideoRecorder(env, before_training)

try:
    for _ in range(1):
        state, info = env.reset()
        reward_sum = 0
        prev_x = None

        for t in range(10000):

            video.capture_frame()

            # same difference-frame input as used during training
            cur_x = prepro(state)
            x = cur_x - prev_x if prev_x is not None else np.zeros_like(cur_x)
            prev_x = cur_x
            x = torch.from_numpy(x).view(1, 1, 80, 80)

            with torch.no_grad():
                aprob = model(x.to(device, dtype=torch.float))
            #print(aprob)
            m = Categorical(aprob)
            action = m.sample().item()

            # gymnasium's step() returns five values; the episode is over when
            # it is either terminated or truncated
            state, reward, terminated, truncated, info = env.step(action)
            reward_sum += reward
            done = terminated or truncated

            if done:
                break
    print(reward_sum)
finally:
    # always finalize the video file and release the emulator
    video.close()
    env.close()

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  return I.astype(np.float)
  logger.deprecation(
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


20.0


In [None]:
from base64 import b64encode
def render_mp4(videopath: str) -> str:
  """
  Build an HTML <video> element that embeds the MP4 file at `videopath`.

  The file is read in binary mode, base64-encoded and placed in a
  `data:video/mp4;base64,...` source URL, so the returned markup is
  self-contained.

  Args:
    videopath: path of the MP4 file to embed.

  Returns:
    The HTML markup as a string.

  Raises:
    OSError: if the file cannot be opened or read.
  """
  # `with` closes the file handle deterministically (it used to be leaked).
  with open(videopath, 'rb') as video_file:
    mp4 = video_file.read()
  base64_encoded_mp4 = b64encode(mp4).decode()
  return f'<video width=400 controls><source src="data:video/mp4;' \
         f'base64,{base64_encoded_mp4}" type="video/mp4"></video>'

In [None]:
from IPython.display import HTML

# Embed the recorded episode inline; the trailing expression is what the
# notebook displays.
html = render_mp4(videopath=before_training)
HTML(data=html)