Install dependencies

In [None]:
# Quiet install of gym and pyvirtualdisplay (stdout and stderr are redirected to /dev/null).
# NOTE(review): gym is installed unpinned here and then pinned to 0.17.* by the last
# command in this cell — confirm the unpinned install is still wanted.
!pip install gym pyvirtualdisplay > /dev/null 2>&1
# Quiet install of the system packages xvfb, python-opengl and ffmpeg.
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# install required system dependencies
# NOTE(review): xvfb is requested a second time here (already listed in the command above);
# x11-utils is the only new package and this command's output is not silenced.
!apt-get install -y xvfb x11-utils

# install required python dependencies (might need to install additional gym extras depending)
# Versions are pinned with wildcards: gym[box2d] 0.17.*, pyvirtualdisplay 0.2.*,
# PyOpenGL 3.1.* and PyOpenGL-accelerate 3.1.*.
!pip install gym[box2d]==0.17.* pyvirtualdisplay==0.2.* PyOpenGL==3.1.* PyOpenGL-accelerate==3.1.*

In [2]:
#@title Imports
import numpy as np
import matplotlib.pyplot as plt
import time
from matplotlib.pyplot import figure

import gym
import torch
from torch.utils import data
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import random
from gym import wrappers
import copy
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

# Create and start a pyvirtualdisplay Display (visible=0, 1400x900).
# Presumably this gives env.render() a display to draw to on headless machines — TODO confirm.
display = Display(visible=0, size=(1400, 900))
display.start()
# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Echo CUDA availability and the selected device so the cell output records where training runs.
print(torch.cuda.is_available())
print(device)

from PartI_lib import archs as archs
from PartI_lib import performance_evaluation as eval
from PartI_lib import train_loop as tl
from PartI_lib import my_tools as mt
# !mkdir Checkpoints


In [None]:
#@title Deep RL Replay Buffer

class ReplayBuffer:
    """Fixed-capacity circular store of (state, action, reward, new_state, done) transitions.

    Storage tensors are created with ``torch.zeros`` defaults and stay where they
    were allocated; every batch handed out by ``sample_buffer`` / ``last_buffer``
    is moved to ``dev``.
    """

    def __init__(self, size, input_shape, dev):
        """Allocate storage for ``size`` transitions.

        Args:
            size: Maximum number of transitions kept; older ones are overwritten.
            input_shape: Number of features in one (flat) state vector.
            dev: Device that sampled batches are moved to.
        """
        self.size = size
        self.counter = 0  # total number of transitions ever stored (not capped at size)
        self.state_buffer = torch.zeros((self.size, input_shape))
        self.action_buffer = torch.zeros(self.size, dtype=torch.int64)
        self.reward_buffer = torch.zeros(self.size)
        self.new_state_buffer = torch.zeros((self.size, input_shape))
        self.terminal_buffer = torch.zeros(self.size)
        # Tensor.to() returns a new tensor instead of moving in place, so calling it
        # without keeping the result (as this class used to) has no effect. The storage
        # is deliberately left as allocated and only the batches are moved to ``dev``.
        self.dev = dev

    def store_tuples(self, state, action, reward, new_state, done):
        """Write one transition into the next slot, overwriting the oldest when full."""
        idx = self.counter % self.size
        self.state_buffer[idx] = state
        self.action_buffer[idx] = action
        self.reward_buffer[idx] = reward
        self.new_state_buffer[idx] = new_state
        self.terminal_buffer[idx] = done
        self.counter += 1

    def _gather(self, indices):
        """Return the five buffers indexed by ``indices``, each moved to ``self.dev``."""
        return (
            self.state_buffer[indices].to(self.dev),
            self.action_buffer[indices].to(self.dev),
            self.reward_buffer[indices].to(self.dev),
            self.new_state_buffer[indices].to(self.dev),
            self.terminal_buffer[indices].to(self.dev),
        )

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen uniformly at random.

        Raises:
            ValueError: (from ``np.random.choice``) if fewer than ``batch_size``
                transitions are currently stored.
        """
        max_buffer = min(self.counter, self.size)
        batch = np.random.choice(max_buffer, batch_size, replace=False)
        return self._gather(batch)

    def last_buffer(self, batch_size):
        """Return the most recently stored transitions, oldest first.

        At most ``batch_size`` transitions are returned; fewer (possibly zero) when
        the buffer holds less than that. The circular write position is taken into
        account, so the result is correct after the buffer has wrapped around.
        """
        available = min(self.counter, self.size)
        count = max(0, min(batch_size, available))
        # Logical positions [counter - count, counter) mapped onto circular slots.
        indices = torch.arange(self.counter - count, self.counter) % self.size
        return self._gather(indices)


Default DQN architecture

In [None]:
# class DQN(nn.Module):

#     def __init__(self, inputs, outputs,dfactor):
#         super(DQN, self).__init__()
        
#         self.input_size=inputs;
#         self.output_size=outputs;
#         self.discount_factor=dfactor;
        
#         self.layers = nn.Sequential(
#             nn.Linear(in_features=self.input_size, out_features=128),
#             # nn.Linear(in_features=128, out_features=256),
#             # nn.Linear(in_features=256, out_features=512),
#             # nn.Linear(in_features=512, out_features=256),
#             nn.Linear(in_features=128, out_features=self.output_size)
#         )


#     # Called with either one element to determine next action, or a batch
#     # during optimization. Returns tensor([[left0exp,right0exp]...]).
#     def forward(self, x):
#         return self.layers(x)

#     def policy(self,state):
#        with torch.no_grad():
#             return self.__call__(state).argmax()
     
#     def getPolicy(self,state,eps_threshold):
#         sample = random.random()
#         if sample > eps_threshold:
#             with torch.no_grad():
 
#                 return self.__call__(state).argmax()
#         else:
#             return  torch.tensor([[random.randrange(self.output_size)]], device=device, dtype=torch.long)



 

OpenAI Environment CartPole-v0

###    Description:
        A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pendulum starts upright, and the goal is to prevent it from falling over by increasing and reducing the cart's velocity.
###    Source:
        This environment corresponds to the version of the cart-pole problem described by Barto, Sutton, and Anderson
###    Observation:
        Type: Box(4)
        Num     Observation               Min                     Max
        0       Cart Position             -4.8                    4.8
        1       Cart Velocity             -Inf                    Inf
        2       Pole Angle                -0.418 rad (-24 deg)    0.418 rad (24 deg)
        3       Pole Angular Velocity     -Inf                    Inf
###    Actions:
        Type: Discrete(2)
        Num   Action
        0     Push cart to the left
        1     Push cart to the right
        Note: The amount by which the velocity is reduced or increased is not fixed; it depends on the angle at which the pole is pointing. This is because the center of gravity of the pole increases the amount of energy needed to move the cart underneath it.
###    Reward:
        Reward is 1 for every step taken, including the termination step
###    Starting State:
        All observations are assigned a uniform random value in [-0.05..0.05]
###    Episode Termination:
        Pole Angle is more than 12 degrees.
        Cart Position is more than 2.4 (center of the cart reaches the edge of the display).
        Episode length is greater than 200.
###     Solved Requirements:
        Considered solved when the average return is greater than or equal to 195.0 over 20 consecutive trials.

In [None]:
#@title DQN movie generation (for visual evaluation in Google Colab)

def createMovie(Network, path, Filename):
    """Record one CartPole-v0 episode driven by ``Network`` and display the video.

    The episode is recorded by a gym ``Monitor`` into the directory
    ``path/Filename``; the first ``.mp4`` found there is then shown inline.

    Args:
        Network: Object exposing ``eval()``, ``train()`` and ``policy(state)``;
            ``policy`` must return something with ``.item()`` giving the action.
        path: Parent directory for the recording.
        Filename: Name of the sub-directory the monitor writes into.

    Notes:
        ``Network`` is switched to eval mode for the rollout and back to train
        mode afterwards, even if the rollout raises.
    """
    import os
    import sys
    from IPython.display import Video

    # Use one directory for both recording and lookup. Previously the monitor wrote
    # to path + '/' + Filename while the search looked in path + Filename, so the
    # video was never found unless ``path`` ended with a separator.
    video_dir = os.path.join(path, Filename)

    env = gym.make("CartPole-v0")
    envX = wrappers.Monitor(env, video_dir, force=True)

    Network.eval()
    try:
        observation = envX.reset()
        done = False
        while not done:
            envX.render()
            state = torch.Tensor(observation).to(device)
            action = Network.policy(state)
            observation, _, done, _ = envX.step(action.item())
    finally:
        # Always release the environments and restore training mode.
        envX.close()
        env.close()
        Network.train()

    mp4list = glob.glob(os.path.join(video_dir, '*.mp4'))
    if len(mp4list) == 0:
        print("Could not find video")
        return

    mp4 = mp4list[0]
    # Honour a notebook-level ``Running_in_colab`` flag when one is defined;
    # otherwise detect Colab by checking whether its package has been imported.
    in_colab = globals().get("Running_in_colab", "google.colab" in sys.modules)
    if in_colab:
        with io.open(mp4, 'rb') as video_file:
            encoded = base64.b64encode(video_file.read())
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                </video>'''.format(encoded.decode('ascii'))))
    else:
        # A bare Video(...) expression inside a function renders nothing; it has to
        # be passed to display(). embed=True inlines the data so the player does not
        # depend on the file being reachable from the notebook server.
        ipythondisplay.display(Video(mp4, embed=True))


Run the new network with a random policy

In [None]:
# randomnet=RandomNet(2)
# createMovie(randomnet,"random")

Initialization and Parameters:


In [None]:


# Sim configuration
# Environment used for training (handed to tl.train_loop in a later cell).
env = gym.make("CartPole-v0")
# NOTE(review): `spec` is not referenced anywhere else in the visible notebook.
spec = gym.spec("CartPole-v0")


# Network input/output sizes; these match the Box(4) observation and Discrete(2)
# action space described in the environment notes above.
inputs = 4
n_actions = 2

# Hyper-parameters. Most of these are only forwarded to tl.train_loop / mt.set_name,
# so their exact semantics live in PartI_lib — the notes below are assumptions to confirm.
TotalEpisodes = 2000
MaxSteps = 400
# presumably the interval at which the target network is refreshed — TODO confirm in train_loop
FreezeCounter = 25
# Number of transitions sampled per optimisation step in trainModel().
BatchSize = 128
# Initial and minimum exploration thresholds (presumably epsilon for epsilon-greedy — confirm).
exploration_threshold = 1
exploration_threshold_min = 0.01
# Previously tried value, kept for reference (the active value is set further down):
# exploration_decay=0.002
# Discount applied to the bootstrapped next-state value in trainModel().
discount_factor = 0.99
# presumably the checkpoint interval — TODO confirm in train_loop
SaveAtCounter = 200
# Used as `gamma` of the ExponentialLR scheduler created below.
LearningRateDecay = 0.99

# Architecture name passed to archs.archs(); trainModel() also branches on it
# ("DoubleDQN" / "DoubleDuelingDQN" select the double-Q target).
arch = "DoubleDuelingDQN"
n_layers = 4
LearningRate = 0.0001
exploration_decay = 0.02


# Alternative architecture, kept for reference:
# arch = "DQN"

# Replay memory and networks

# Replay buffer with capacity 1,000,000 transitions of `inputs` features each.
buffer = ReplayBuffer(1000000, inputs, device)

# print(policy_net)

# Results directory for this architecture (mt.create_dir's return value is reused as a path).
file_path__ = mt.create_dir("results", arch)

# Build the online (policy) and target networks, then start the target network
# from the same weights and put it in eval mode.
policy_net, target_net = archs.archs(
    arch, inputs, n_actions, discount_factor, device, n_layers)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
# Sub-directory named after the number of layers, e.g. "N4_Layers".
file_path_ = mt.create_dir(file_path__, "N"+str(n_layers)+"_Layers")

# Adam optimises only the policy network's parameters.
optimizer = torch.optim.Adam(
    policy_net.parameters(), lr=LearningRate)
# NOTE(review): `scheduler` is neither stepped in this notebook nor passed to
# tl.train_loop in the visible code, so the learning-rate decay may never be applied — confirm.
scheduler = optim.lr_scheduler.ExponentialLR(
    optimizer, gamma=LearningRateDecay)
# Mean-squared error between predicted and target Q-values (used by trainModel()).
loss = torch.nn.MSELoss()

def trainModel():
    """Run one optimisation step of the policy network on a replay-buffer batch.

    Reads the notebook-level ``buffer``, ``BatchSize``, ``policy_net``,
    ``target_net``, ``arch``, ``discount_factor``, ``loss`` and ``optimizer``.

    Returns:
        float: The batch loss, or ``0.0`` when the buffer holds fewer than
        ``BatchSize`` transitions (no update is performed in that case).
    """
    if buffer.counter < BatchSize:
        return 0.0

    state_batch, action_batch, reward_batch, new_state_batch, done_batch = buffer.sample_buffer(
        BatchSize)

    # Q(s, a) of the actions that were actually taken.
    q_actual = torch.gather(policy_net(state_batch), 1, action_batch.reshape(-1, 1))

    # The bootstrap target is a constant w.r.t. the policy parameters.
    with torch.no_grad():
        q_next_target = target_net(new_state_batch)
        if arch in ("DoubleDQN", "DoubleDuelingDQN"):
            # Double DQN: the online network picks the best action in the NEXT state
            # and the target network evaluates it. (The action used to be picked from
            # the current state, which made the bootstrap value meaningless.)
            best_next_action = torch.argmax(policy_net(new_state_batch), -1)
            q_max_next = q_next_target.gather(
                1, best_next_action.unsqueeze(-1)).squeeze(-1)
        else:
            q_max_next = q_next_target.max(1)[0]
        # Terminal transitions (done == 1) contribute the immediate reward only.
        q_target = (q_max_next * discount_factor) * (1 - done_batch) + reward_batch

    ll = loss(q_actual, q_target.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    ll.backward()
    # Clamp every gradient element to [-1, 1]; unlike a manual loop over
    # param.grad this skips parameters that received no gradient.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

    return ll.item()


# Build a run name from the architecture and hyper-parameters via mt.set_name, then
# create a directory for this run (plus a "Checkpoints" sub-directory) under the
# per-layer-count results folder. The printed name identifies the run in the cell output.
# NOTE(review): the exact name format and what mt.create_dir returns are defined in
# PartI_lib.my_tools — its return value is used here as a path.
file_name = mt.set_name(arch, n_layers, BatchSize, exploration_threshold, exploration_threshold_min,
                        exploration_decay, discount_factor, LearningRate, LearningRateDecay)
file_path = mt.create_dir(file_path_, file_name)
mt.create_dir(file_path, "Checkpoints")
print(file_name)


In [None]:
# Run the training loop from PartI_lib.train_loop with the networks, environment,
# hyper-parameters, replay buffer, trainModel step function and output directory
# configured above. The returned best network and history sequences are consumed by
# the evaluation and movie cells that follow.
# NOTE(review): the meaning of each returned history is defined in train_loop — confirm there.
bestNet, episodes, scores, events, avg_scores, avg_scores20, exploration, avg_scores100 = tl.train_loop(
    policy_net, target_net, env, device, TotalEpisodes, FreezeCounter, SaveAtCounter, createMovie, MaxSteps, exploration_threshold, exploration_decay, exploration_threshold_min, buffer, trainModel, file_path)


In [None]:
# Hand the training histories to PartI_lib.performance_evaluation, then to its report
# helper together with the hyper-parameters of this run.
# Presumably these write plots / a report under file_path — TODO confirm in PartI_lib.
# NOTE: `eval` here is the PartI_lib module alias from the imports cell, which shadows
# Python's built-in eval().
eval.performance_evaluation(
    file_path, episodes, scores, events, avg_scores, avg_scores20, exploration, avg_scores100)
eval.report(file_path, arch, BatchSize, exploration_threshold, exploration_threshold_min, exploration_decay, discount_factor,
            LearningRate, LearningRateDecay, episodes, scores, events, avg_scores, avg_scores20, exploration, n_layers, avg_scores100)


In [None]:
# Record and display one episode played by the best network returned by training;
# the recording goes into a 'bestNet' folder under this run's results directory.
createMovie(bestNet, file_path, 'bestNet')
