<a href="https://colab.research.google.com/github/nkasmanoff/DeepRL/blob/master/Load_CarRacing_In_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook trains a DQN on the CarRacing OpenAI Gym environment using a GPU. Most importantly, it shows how to get this environment to load in Colab!

# CoLab Preambles

Most of the required Python packages are already installed on Colab. To run Gym, you have to install prerequisites such as xvfb, OpenGL, and other python-dev packages using the following commands.

[](To be done next time: )
[](https://becominghuman.ai/lets-build-an-atari-ai-part-1-dqn-df57e8ff3b26)

In [None]:
#!pip install gym
#!apt-get install python-opengl -y
#!apt install xvfb -y

In [None]:
#!pip install gym[atari]

To render the environment, you can use pyvirtualdisplay, so install that as well:

In [None]:
#!pip install pyvirtualdisplay
#!pip install pyglet

To activate virtual display we need to run a script once for training an agent, as follows:

In [None]:
#from pyvirtualdisplay import Display
#display = Display(visible=0, size=(1400, 900))
#display.start()

In [None]:
# This code creates a virtual display to draw game images on. 
# If you are running locally, just ignore it
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

In [None]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) # error only
#import tensorflow as tf
import numpy as np
import random
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML

from IPython import display as ipythondisplay

In [None]:
"""
Utility functions to enable video recording of gym environment and displaying it
To enable video, just do "env = wrap_env(env)"
"""

def show_video():
  """
  Display the first recorded video found in ``video/*.mp4`` inline in the notebook.

  The file is read, base64-encoded and embedded in an HTML ``<video>`` tag.
  If no ``.mp4`` file matches, "Could not find video" is printed instead.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return

  # Read-only binary mode inside a context manager: the handle is always
  # closed, and no write permission on the recording is required.
  with open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a gym Monitor that records into ./video (created with force=True)."""
  return Monitor(env, './video', force=True)

In [None]:
from gym import envs
#print(envs.registry.all())

## Add your own environments


In [None]:
#!pip install gym[atari]

## Box2d

Box2d is a 2D physics engine. You can install it via `pip install gym[box2d]` and then get started as follows:

In [None]:
#!pip install gym[box2d]

In [None]:
def discretize(action):
    """
    Map a discrete action id chosen by the DQN to a 3-element control vector.

    There are 7 discrete actions (ids 0-6). Judging by the per-action labels,
    the vector components are (steering, gas, brake).

    Args:
        action: integer action id in the range 0..6.

    Returns:
        A new numpy array of length 3 for the requested action.

    Raises:
        ValueError: if `action` is not one of the 7 known ids.
    """
    controls = {
        0: (0, 0, 0),     # do nothing
        1: (-1, 0, 0),    # glide left
        2: (1, 0, 0),     # glide right
        3: (0, 1, 0),     # accelerate
        4: (0, 0, 1),     # brake
        5: (-1, .5, 0),   # turn left, accelerate a little
        6: (1, .5, 0),    # turn right, accelerate a little (previously steered left)
    }
    if action not in controls:
        raise ValueError("Unknown discrete action: {!r}".format(action))
    # Build a fresh array per call so callers may mutate the result freely.
    return np.array(controls[action])


# Environment successfully loaded in. 

# Next up, instantiate DQN Modules

In [None]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


%matplotlib inline

In [None]:
# One replay-buffer record: the state, the action taken in it, the state
# that followed, and the reward received.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])

In [None]:
class ReplayMemory:
    """
    Fixed-capacity ring buffer of Transition records.

    New transitions are appended until `capacity` is reached; after that the
    write position wraps around and the oldest entries are overwritten.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Store a transition built from `args` at the current write position."""
        slot = self.position
        if self.capacity > len(self.memory):
            # Still filling up: grow the list so `slot` is a valid index.
            self.memory.append(None)
        self.memory[slot] = Transition(*args)
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` distinct stored entries chosen at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of entries currently stored."""
        return len(self.memory)

In [None]:
class DeepQNetwork(nn.Module):
    """
    Convolutional Q-network: six conv layers followed by two linear layers.

    Args:
        lr: accepted by the constructor but not used inside this class; no
            optimizer or device is created here.
        in_ch: number of input channels (one per stacked frame).
        n_actions: size of the output layer, one value per discrete action.
        ch: channel-width multiplier applied to every conv layer.

    The first linear layer expects a 4x4 feature map after the last conv
    layer (e.g. what an 84x84 input produces).
    """
    def __init__(self, lr, in_ch=3, n_actions=7, ch=2):
        super().__init__()
        # Alternating 7x7 (stride 1) and 3x3 (stride 2) convolutions.
        self.conv1 = nn.Conv2d(in_ch, ch * 8, kernel_size=7)
        self.conv2 = nn.Conv2d(ch * 8, ch * 16, kernel_size=3, stride=2)
        self.conv3 = nn.Conv2d(ch * 16, ch * 16, kernel_size=7)
        self.conv4 = nn.Conv2d(ch * 16, ch * 32, kernel_size=3, stride=2)
        self.conv5 = nn.Conv2d(ch * 32, ch * 32, kernel_size=7)
        self.conv6 = nn.Conv2d(ch * 32, ch * 64, kernel_size=3, stride=2)
        self.fc1 = nn.Linear(64 * ch * 4 * 4, 256)
        self.fc2 = nn.Linear(256, n_actions)
        self.history_length = in_ch
        self.ch = ch

    def forward(self, x):
        """Return one value per action for each sample in `x`."""
        # Fold any leading dimensions into a batch of (history_length, H, W) inputs.
        height, width = x.shape[-2], x.shape[-1]
        x = x.view(-1, self.history_length, height, width)

        conv_stack = (self.conv1, self.conv2, self.conv3,
                      self.conv4, self.conv5, self.conv6)
        for conv in conv_stack:
            x = F.relu(conv(x))

        flat = x.view(-1, 64 * self.ch * 4 * 4)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)
  

# Input extraction

In [None]:
# That's what all these helper functions do. 

def rgb2gray(rgb):
    """
    Collapse the first three channels of the last axis into one grayscale
    channel using a fixed weighted sum, returned as float32.
    """
    channel_weights = [0.2125, 0.7154, 0.0721]
    color = rgb[..., :3]  # any channels beyond the third are ignored
    return np.dot(color, channel_weights).astype('float32')

def test_preprocess(state):
    """
    Prepare a raw frame for the network.

    Drops the bottom 12 rows and 6 columns on each side, converts the result
    to grayscale, then rescales it as value / 255 * 2 - 1 (which maps pixel
    values in 0..255 onto -1..1).
    """
    cropped = state[:-12, 6:-6]
    gray = rgb2gray(cropped)
    return gray / 255 * 2 - 1


def grab_test_history(state,state_history,history_length):
    """
    Stack the most recent frames into a single network input.

    Args:
        state: the current frame. Only consulted for its shape when
            `state_history` is empty.
        state_history: list of 2-D frames, oldest first.
        history_length: number of frames to stack.

    Returns:
        A float32 tensor of shape (1, history_length, H, W). When fewer than
        `history_length` frames are available, the missing (oldest) slots are
        filled with zeros.
    """
    recent = list(state_history[-history_length:])
    # Take the frame size from the data instead of hard-coding 84x84.
    frame_shape = tuple(np.shape(recent[-1] if recent else state))

    missing = history_length - len(recent)
    padding = [np.zeros(frame_shape, dtype=np.float32)] * missing
    stacked = np.stack(padding + recent)

    # reshape replaces the deprecated non-inplace Tensor.resize; the element
    # count is unchanged, so the result is the same.
    state_input = torch.from_numpy(stacked).reshape(1, history_length, *frame_shape).float()
    return state_input
            


def discretize(action):
    """
    Map a discrete action id chosen by the DQN to a 3-element control vector.

    There are 7 discrete actions (ids 0-6). Judging by the per-action labels,
    the vector components are (steering, gas, brake).

    Args:
        action: integer action id in the range 0..6.

    Returns:
        A new numpy array of length 3 for the requested action.

    Raises:
        ValueError: if `action` is not one of the 7 known ids.
    """
    controls = {
        0: (0, 0, 0),     # do nothing
        1: (-1, 0, 0),    # glide left
        2: (1, 0, 0),     # glide right
        3: (0, 1, 0),     # accelerate
        4: (0, 0, 1),     # brake
        5: (-1, .5, 0),   # turn left, accelerate a little
        6: (1, .5, 0),    # turn right, accelerate a little (previously steered left)
    }
    if action not in controls:
        raise ValueError("Unknown discrete action: {!r}".format(action))
    # Build a fresh array per call so callers may mutate the result freely.
    return np.array(controls[action])


# Training Loop

In [None]:
# Build two identically configured networks (policy first, then target),
# then start the target network as an exact copy of the policy network.
policy_net, target_net = (
    DeepQNetwork(lr=ALPHA, in_ch=HISTORY_LENGTH, n_actions=n_actions, ch=CH).to(device)
    for _ in range(2)
)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()


In [None]:
# Adam updates only the policy network. ALPHA is passed explicitly so the
# learning-rate hyperparameter actually reaches the optimizer instead of
# silently relying on Adam's default.
optimizer = optim.Adam(policy_net.parameters(), lr=ALPHA)
memory = ReplayMemory(10000)  # replay buffer holding up to 10000 transitions

In [None]:
# Counts select_action calls; drives the epsilon decay schedule.
steps_done = 0

def select_action(state):
    """
    Epsilon-greedy action selection.

    The exploration threshold decays exponentially from EPS_START towards
    EPS_END as `steps_done` grows. Returns a (1, 1) long tensor holding
    either a uniformly random action id or the policy network's argmax.
    """
    global steps_done
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    if draw <= eps_threshold:
        # Explore: pick any action uniformly at random.
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

    # Exploit: max(1) yields (values, indices) per row; the index of the
    # largest value is the greedy action.
    with torch.no_grad():
        q_values = policy_net(state.to(device))
        return q_values.max(1)[1].view(1, 1)


In [None]:
# Length, in steps, of every finished episode; the training loop appends to it.
episode_durations = []

def optimize_model():
    """
    Run one optimization step on the policy network.

    Samples BATCH_SIZE transitions from replay memory, computes Q(s, a) with
    policy_net and the target r + GAMMA * max_a' Q_target(s', a') with
    target_net (0 for final states), and takes one gradient step on the
    Huber loss between them. Does nothing until the replay memory holds at
    least BATCH_SIZE transitions.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # A next_state of None marks a final state (the episode ended there).
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    # Stored tensors may live on the CPU while the networks live on `device`;
    # move every batch there before use (a no-op when already on `device`).
    state_batch = torch.cat(batch.state).to(device)
    action_batch = torch.cat(batch.action).to(device)
    reward_batch = torch.cat(batch.reward).to(device)

    # Q(s_t, a): evaluate all actions, then pick the column of the action
    # that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the "older" target_net; stays 0 for final states.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_next_states:  # torch.cat would raise on an empty list
        with torch.no_grad():
            next_q = target_net(torch.cat(non_final_next_states).to(device))
            next_state_values[non_final_mask] = next_q.max(1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model, clamping every gradient element to [-1, 1].
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [None]:
# Hyperparameters and run settings used by the cells above and below.
BATCH_SIZE = 128  # transitions sampled from replay memory per optimization step
GAMMA = 0.999  # discount factor applied to next-state values
EPS_START = 0.9  # exploration threshold at step 0
EPS_END = 0.05  # value the exploration threshold decays towards
EPS_DECAY = 200  # scale (in select_action calls) of the exponential epsilon decay
TARGET_UPDATE = 10  # target_net is re-synced to policy_net every this many episodes
ALPHA = .001  # learning rate; passed as `lr` when the networks are constructed
HISTORY_LENGTH = 2  # number of stacked frames, i.e. the network's input channels
CH = 1 # model capacity: channel-width multiplier for the conv layers
RENDER = True  # call env.render() on every step
n_actions = 7 # number of discrete actions the DQN chooses from (set by hand, not read from the env's action space)

In [None]:
# Train the DQN on CarRacing for a fixed number of episodes.
env = gym.make('CarRacing-v0')

num_episodes = 50
for i_episode in range(num_episodes):
    # Start a new episode: the frame history restarts from the first observation.
    state = test_preprocess(env.reset())
    state_history = [state]
    state_input = grab_test_history(state,state_history,history_length=HISTORY_LENGTH)

    for t in count():
        if RENDER:
            env.render()

        # Select and perform an action
        action = select_action(state_input)
        next_state, reward, done, _ = env.step(discretize(action.item()))
        # Force float32 so the reward matches the network outputs' dtype.
        reward = torch.tensor([reward], device=device, dtype=torch.float32)

        # Observe the new frame. It must be the NEW frame that enters the
        # history, otherwise the agent keeps seeing the episode's first frame.
        state = test_preprocess(next_state)
        state_history.append(state)
        # Only the most recent HISTORY_LENGTH frames are ever read.
        del state_history[:-HISTORY_LENGTH]
        next_state_input = grab_test_history(state,state_history,history_length=HISTORY_LENGTH)

        # Store the transition in memory. A final state is stored as None so
        # optimize_model gives it a next-state value of 0.
        memory.push(state_input, action, None if done else next_state_input, reward)

        # Move to the next state
        state_input = next_state_input

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            # NOTE(review): plot_durations is not defined in the visible part
            # of this file - confirm it exists before running.
            plot_durations()
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

# Release the environment's resources (including any render window).
env.close()
print('Complete')
