In [None]:
# This is an improvement on DQN that takes information from previous states into account.
# For example, a DQN trained to play Pong on single screens cannot tell which direction the ball is moving, because a single frame carries no motion information.
# So instead of training on single screens, we train on sequences of screens, and use an LSTM to carry information from previous states into the next state.
# To do this, we sample random "windows" of a game: a sequence of n consecutive states starting at a random point in the game, with the LSTM hidden state initialized to 0 at the start of the window.

In [None]:
# @title
!wget http://www.atarimania.com/roms/Roms.rar 
!unrar x -o+ /content/Roms.rar >/dev/nul
!python -m atari_py.import_roms /content/ROMS >/dev/nul
!sudo pip install pyvirtualdisplay
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [None]:
#@title
from pyvirtualdisplay import Display
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
import random as r
import copy
from google.colab import drive
import skimage
from skimage import io as io2
from skimage.transform import resize
drive.mount('gdrive')

Drive already mounted at gdrive; to attempt to forcibly remount, call drive.mount("gdrive", force_remount=True).


In [None]:
# This code is not original; it is taken from: https://colab.research.google.com/github/jeffheaton/t81_558_deep_learning/blob/master/t81_558_class_12_01_ai_gym.ipynb
# It allows me to record videos of the model playing Atari games in Google Colab

def query_environment(name):
    """Print a short summary of the Gym environment registered as `name`.

    Prints the action space, observation space and reward range of a freshly
    created environment, plus the episode limit, nondeterminism flag and
    reward threshold from its registry spec.

    The environment is created only for this query, so it is closed again
    before returning (the original left it open).
    """
    env = gym.make(name)
    try:
        spec = gym.spec(name)
        print(f"Action Space: {env.action_space}")
        print(f"Observation Space: {env.observation_space}")
        print(f"Max Episode Steps: {spec.max_episode_steps}")
        print(f"Nondeterministic: {spec.nondeterministic}")
        print(f"Reward Range: {env.reward_range}")
        print(f"Reward Threshold: {spec.reward_threshold}")
    finally:
        # Release the emulator even if one of the lookups above fails
        env.close()

# Create and start a pyvirtualdisplay `Display` (visible=0, 1400x900).
# NOTE(review): presumably needed so rendering works in Colab, which has no physical screen — confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

def show_video():
    """Embed the first MP4 found in ./video into the notebook output.

    The file is read, base64-encoded and displayed inline as an HTML5
    <video> element. Prints "Could not find video" when the folder contains
    no .mp4 file.
    """
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        # Read-only binary mode inside a context manager: the original opened
        # the file 'r+b' (read/write) and never closed the handle.
        with io.open(mp4, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")


def wrap_env(env):
    """Return `env` wrapped in a gym `Monitor` that records into ./video (force=True)."""
    return Monitor(env, './video', force=True)

In [None]:
# Name is name of Atari game in Gym. Usually this is [name]-v0

name = "Breakout-v0"  # Gym environment id; also used to build the checkpoint filename
query_environment(name)
# NOTE(review): hard-coded; presumably must equal the environment's action_space.n — confirm when changing `name`
num_action = 4 # Number of possible actions in game

Action Space: Discrete(2)
Observation Space: Box(-3.4028234663852886e+38, 3.4028234663852886e+38, (4,), float32)
Max Episode Steps: 200
Nondeterministic: False
Reward Range: (-inf, inf)
Reward Threshold: 195.0


In [None]:
# Converts array to tensor of floats
def toTensor(arr):
  """Build a new float32 torch tensor holding the values of `arr`."""
  float_tensor = torch.tensor(arr, dtype=torch.float32)
  return float_tensor

In [None]:
# Value model is the model of our target nets and value nets. Here we use LSTMs to keep track of previous state information. 
#Thus, we also plug in hidden states and cell states to the model as well as the current game screen

hidden_size = 256

class valueModel(nn.Module):
  """Convolutional encoder followed by an LSTM and a linear Q-value head.

  `n_out` is the number of outputs of the final linear layer (one per action)
  and `hidden_size` is the LSTM hidden size. The encoder's last layer is
  `Linear(3136, 512)`, which matches a 1x84x84 input (64 * 7 * 7 = 3136 after
  the three convolutions).
  """
  def __init__(self, n_out, hidden_size):
    super().__init__()
    # Layers are created in the same order as before so parameter names
    # (and therefore saved checkpoints) stay compatible.
    encoder_layers = [
      nn.Conv2d(1, 32, kernel_size=8, stride=4),
      nn.BatchNorm2d(32),
      nn.ReLU(),
      nn.Conv2d(32, 64, kernel_size=4, stride=2),
      nn.BatchNorm2d(64),
      nn.ReLU(),
      nn.Conv2d(64, 64, kernel_size=3, stride=1),
      nn.BatchNorm2d(64),
      nn.ReLU(),
      nn.Flatten(1),
      nn.Linear(3136, 512),
    ]
    self.value_net = nn.Sequential(*encoder_layers)
    self.lstm = nn.LSTM(512, hidden_size)
    self.linear = nn.Sequential(nn.Flatten(), nn.Linear(hidden_size, n_out))

  def forward(self, state, h, c):
    """Encode `state`, advance the LSTM from `(h, c)`, and return `(q_values, (new_h, new_c))`."""
    features = self.value_net(state)
    lstm_out, next_hidden = self.lstm(features, (h, c))
    q_values = self.linear(lstm_out)
    return (q_values, next_hidden)

value_net = valueModel(num_action, hidden_size)
target_net = valueModel(num_action,  hidden_size)

# Resume from a previous checkpoint when one exists. Loading stays best-effort
# (training starts from scratch on any failure), but a missing file is now
# reported separately from other problems such as a shape mismatch, which the
# original bare `except:` also reported as "not found".
try:
  value_net.load_state_dict(torch.load("gdrive/MyDrive/"+str(name)+"_bot_pompd"))
except FileNotFoundError:
  print("Previous models not found")
except Exception as load_error:
  print("Previous models could not be loaded: " + repr(load_error))

# The target net always starts as an exact copy of the value net. With a
# checkpoint this matches the original behaviour (both loaded the same file);
# without one it avoids bootstrapping from an unrelated random network.
target_net.load_state_dict(value_net.state_dict())

optimizer = torch.optim.RMSprop(params = 
            value_net.parameters(), lr=0.0001)

In [None]:
 # Returns the action that gives the best possible predicted score based on the target network 

def chooseActionTarget(state, h, c):
  """Return `(best_value, best_action)` for `state` according to the target network.

  `state` is expected to hold a single observation (leading dimension 1), and
  `h` / `c` are the LSTM hidden and cell states passed to the network.

  The network output has shape (1, n_actions). The original iterated over the
  first axis (length 1), so it always returned index 0 together with the whole
  row of values. The output is flattened here so the maximum is taken over
  the actions.
  """
  with torch.no_grad():
    vals, (_, _) = target_net(state, h, c)
  vals = vals.detach().numpy().reshape(-1)
  best = int(vals.argmax())
  return (vals[best], best)

# Returns the action that gives the best possible predicted score based on the value network 

def chooseAction(state, h, c):
  """Return `(best_value, best_action)` for `state` according to the value network.

  `state` is expected to hold a single observation (leading dimension 1), and
  `h` / `c` are the LSTM hidden and cell states passed to the network. The
  updated LSTM state is not returned.

  The network output has shape (1, n_actions). The original iterated over the
  first axis (length 1), so the chosen action was always 0. The output is
  flattened here so the maximum is taken over the actions.
  """
  with torch.no_grad():
    vals, (_, _) = value_net(state, h, c)
  vals = vals.detach().numpy().reshape(-1)
  best = int(vals.argmax())
  return (vals[best], best)

# Sets the target network's parameters to be equal to the value network 's parameters

def updateTarget():
  """Overwrite the target network's weights with the value network's current weights."""
  current_weights = value_net.state_dict()
  target_net.load_state_dict(current_weights)

In [None]:
maxsize = 10000 # Maximum size of replay memory
replay = []
gamma = 1 # Size of gamma used in DQN
window_size = 100 # Number of states in sequences we use to train our model
loss_func = nn.HuberLoss() # Loss function used

# Add a list of states, actions, rewards to our replay. Note here we're adding sequences of states, not individual states

def addReplay(states, actions, rewards, dones):
  """Store 50 random windows of one finished game in the replay memory.

  Each window starts at a uniformly random index of the game and contains at
  most `window_size` consecutive steps (fewer when it reaches the end of the
  game). A window is stored as a `(states, actions, rewards, dones)` tuple of
  equally sliced lists.

  Fixes relative to the original:
  * the window end used `max(...)`, so every window ran to the end of the
    game instead of being capped at `window_size` steps;
  * the size check ran once, before 50 appends, and tested `== maxsize`, so
    the memory grew without bound; the oldest entries are now dropped after
    appending so that `len(replay) <= maxsize` always holds;
  * an empty game is ignored instead of crashing in `randint`.
  """
  if len(states) == 0:
    return
  for i in range(50):
    left = r.randint(0, len(states)-1)
    right = min(left+window_size, len(states))
    replay.append((states[left:right], actions[left:right], rewards[left:right], dones[left:right]))
  # Trim in place (other code holds a reference to this same list)
  overflow = len(replay) - maxsize
  if overflow > 0:
    del replay[:overflow]

# Gets one stored entry (a sequence of states, actions, rewards and dones) at random from our replay memory

def getOneReplay():
  """Return one uniformly random entry of the replay memory."""
  last_index = len(replay) - 1
  return replay[r.randint(0, last_index)]

# Gets a sample of size n from replay memory

def sampleReplay(n):
  """Draw `n` replay entries, each picked independently via `getOneReplay` (repeats possible)."""
  batch = []
  for _ in range(n):
    batch.append(getOneReplay())
  return batch

def zero_if(x, b):
  """Return 0 when `b` is truthy, otherwise return `x` unchanged."""
  return 0 if b else x

# Computes the loss among all sequences of states in a sample from replay memory

def compute_loss(replay_sample):
  """Return the sum of `compute_loss_game` over every entry of `replay_sample`.

  The sum starts from a zero tensor, so an empty sample yields a zero tensor.
  """
  per_game_losses = map(compute_loss_game, replay_sample)
  return sum(per_game_losses, toTensor(0))

# Computes the loss of an entire sequence of states, actions, and rewards, where hidden and cell states are kept track of over the game

def compute_loss_game(game):
  """Sum the TD losses over one stored window, unrolling the LSTM through it.

  `game` is a `(states, actions, rewards, dones)` entry of the replay memory.
  The hidden and cell states start at zero and are carried from each step to
  the next. The original re-used the zero state for every step, so the LSTM
  never saw any history during training.

  The last step of the window has no successor state. It contributes to the
  loss only when it is terminal, in which case the bootstrap term is dropped
  and the successor is irrelevant.
  """
  h = torch.zeros(hidden_size).unsqueeze(0)
  c = torch.zeros(hidden_size).unsqueeze(0)
  states = game[0]
  actions = game[1]
  rewards = game[2]
  dones = game[3]
  loss = toTensor(0)
  last = len(states) - 1
  for i in range(len(states)):
    if i < last:
      observation = states[i+1]
    elif dones[i]:
      observation = states[i]
    else:
      continue
    step_loss, h, c = _td_step(states[i], actions[i], observation, rewards[i], dones[i], h, c)
    loss = loss + step_loss
  return loss

def _td_step(state, action, observation, reward, done, h, c):
  """Run one recurrent TD step and return `(loss, new_h, new_c)`.

  The prediction is the value network's Q-value of the action actually taken.
  The target is `reward` for terminal steps, otherwise
  `reward + gamma * max_a Q_target(observation, a)`, with the target network
  evaluated from the value network's updated LSTM state and no gradient.
  """
  q_vals, (h, c) = value_net(toTensor(state).unsqueeze(0), h, c)
  # Only the taken action's value is regressed; the original compared the
  # whole row of action values against the target.
  pred_val = q_vals.reshape(-1)[int(action)]
  if done:
    actual_val = toTensor(reward)
  else:
    with torch.no_grad():
      next_q, _ = target_net(toTensor(observation).unsqueeze(0), h, c)
    actual_val = toTensor(reward) + gamma * next_q.max()
  return loss_func(pred_val, actual_val), h, c

# Computes the loss of a single state, given hidden states and cell states

def compute_loss_state(state, action, observation, reward, done, h, c):
  """Return the TD loss of a single transition, starting from LSTM state `(h, c)`.

  Same computation as one step of `compute_loss_game`; the updated LSTM state
  is discarded so the return value stays a single loss tensor.
  """
  loss, _, _ = _td_step(state, action, observation, reward, done, h, c)
  return loss

# Trains our model on a batch of size bs

def learnFromReplay(bs = 1024):
  """Take one optimizer step on `bs` sampled replay entries and return the loss as a float.

  Nothing is trained (and 0 is returned) while the replay memory holds fewer
  than 1000 entries.
  """
  enough_data = len(replay) >= 1000
  if not enough_data:
    return 0
  batch = sampleReplay(bs)
  optimizer.zero_grad()
  batch_loss = compute_loss(batch)
  batch_loss.backward()
  optimizer.step()
  return batch_loss.item()

In [None]:
# Converts game screens to 84 x 84 black and white images

def preproc(image):
  """Resize `image` to 84 x 84, convert it to grayscale, transpose it and add a leading axis.

  The result has shape (1, 84, 84).
  """
  resized = resize(image, (84, 84))
  gray = skimage.color.rgb2gray(resized)
  transposed = np.transpose(gray)
  return np.expand_dims(transposed, 0)

# Plays the game once. Epsilon is the probability that, in a given state, we choose a random action ("explore") instead of the best predicted one ("exploit"); show says whether we want to display a video of the game afterwards

def playGame(show=False, epsilon = 0.1):
  """Play one game, store it in the replay memory, and run one training step.

  Args:
    show: when true, the environment is wrapped with `wrap_env` and
      `show_video()` is called after the game.
    epsilon: probability of taking a random action instead of the greedy one.

  Returns:
    `(score, loss)`: the total reward of the game and the value returned by
    the single `learnFromReplay(32)` call made after it.

  Fixes relative to the original:
  * the LSTM hidden and cell states are now carried from frame to frame; they
    used to stay at zero for the whole game, so the network never received
    any history while playing;
  * the greedy action is the argmax over the action values of the current
    frame;
  * the environment is closed even when the game loop raises.
  """
  score = 0
  losses = []

  states = []
  actions = []
  rewards = []
  dones = []

  if show:
    env = wrap_env(gym.make(name))
  else:
    env = gym.make(name)
  # NOTE(review): sets a private attribute on the outermost wrapper; confirm it actually limits episode length
  env._max_episode_steps = 1000
  h = torch.zeros(hidden_size).unsqueeze(0)
  c = torch.zeros(hidden_size).unsqueeze(0)

  try:
    observation = env.reset()
    observation = preproc(observation)

    while True:
      env.render()
      curr_state = observation

      # Feed every frame through the network, including frames where a random
      # action is taken, so the recurrent state reflects the whole game so far.
      # no_grad keeps the carried state from building an ever-growing graph.
      with torch.no_grad():
        q_vals, (h, c) = value_net(toTensor(curr_state).unsqueeze(0), h, c)

      if not r.random() < epsilon:
        action = q_vals.reshape(-1).argmax()
      else:
        action = env.action_space.sample()
      action = int(action)

      observation, reward, done, info = env.step(action)
      score += reward
      observation = preproc(observation)
      states.append(curr_state)
      actions.append(action)
      rewards.append(reward)
      dones.append(done)
      if done:
          break
  finally:
    # Closing also finalizes the recording before show_video() reads it
    env.close()
  addReplay(states, actions, rewards, dones) # Adds the entire list of states and actions and rewards to replay, as we need multiple states in order to train our model to get better at preserving info of previous states
  if show:
    show_video()
  losses.append(learnFromReplay(32))
  return (score, np.array(losses).mean())

In [None]:
# Bookkeeping for the training loop. Re-running this cell resets the counter and both histories.
# Note: this module-level `c` is unrelated to the local `c` (LSTM cell state) used inside the functions above.
c = 10 # Number of games played before we update target
count = 0 # Number of games played so far
scores = [] # Keeps track of scores and losses of games
losses = [] # One loss value per game, as returned by playGame

In [None]:
import matplotlib.pyplot as plt

# Main training loop: play a game (which also performs one training step),
# record its score and loss, sync the target network every `c` games, redraw
# the progress plots and save a checkpoint.
num_games = 1000 # Plays this many games and trains our model
for i in range(num_games):
  score, loss = playGame(epsilon = 0.1)
  scores.append(score)
  losses.append(loss)
  count += 1
  if count % c == 0:
    updateTarget()
  # Replace the previous figures instead of stacking two new ones per game
  # (2000 figures over a full run), which bloats the notebook output.
  ipythondisplay.clear_output(wait=True)
  plt.plot(scores)
  plt.title("Score per game")
  plt.xlabel("Game")
  plt.ylabel("Score")
  plt.show()
  plt.plot(losses)
  plt.title("Training loss per game")
  plt.xlabel("Game")
  plt.ylabel("Loss")
  plt.show()
  torch.save(value_net.state_dict(), "gdrive/MyDrive/"+str(name)+"_bot_pompd")