<a href="https://colab.research.google.com/github/meliksahb/Deep-RL/blob/main/DRL_Atari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import numpy as np
import tensorflow as tf

In [None]:
class DeepQNetwork(object):
    """Convolutional Q-network built with the TensorFlow 1.x graph API.

    The network maps a batch of stacked frames to one Q-value per action:
    three convolutional layers, a fully connected layer, and a linear output
    layer. Every instance owns its own ``tf.Session``, its own ``Saver`` and
    its own variable scope (named after ``name``), so two instances (one to
    select actions, one to evaluate them) can coexist.
    """

    def __init__(self, lr, n_actions, name, fc1_dims=256,
                 input_dims=(210, 160, 4), chkpt_dir='tmp/dqn'):
        """Build the graph, initialise the variables and prepare checkpointing.

        Args:
            lr: learning rate handed to the Adam optimizer.
            n_actions: number of discrete actions (size of the output layer).
            name: variable-scope name of this network. It matters because two
                networks are used side by side and their variables must be
                kept apart.
            fc1_dims: number of units in the first fully connected layer.
            input_dims: shape of a single observation, without the batch
                dimension. The default is four stacked 210x160 frames.
            chkpt_dir: directory in which the checkpoint file is stored.
        """
        self.lr = lr
        self.name = name
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.input_dims = input_dims

        # Each network runs in its own session.
        self.sess = tf.Session()

        # Add every op of this network to the graph ...
        self.build_network()

        # ... and only then initialise the variables that were created.
        self.sess.run(tf.global_variables_initializer())

        # Training takes a long time, so the model is checkpointed as we go.
        self.saver = tf.train.Saver()
        self.checkpoint_file = os.path.join(chkpt_dir, 'deepqnet.ckpt')

        # Trainable variables of this network's scope; kept so that one
        # network's parameters can later be copied into another.
        self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope=self.name)

    def build_network(self):
        """Create the placeholders, layers, loss and training op.

        Everything is created inside ``tf.variable_scope(self.name)``.
        Placeholders and convolutional layers are named explicitly because
        the names show up in TensorFlow error messages and ease debugging.
        """
        with tf.variable_scope(self.name):
            # The leading ``None`` dimension lets us feed batches of any size.
            self.input = tf.placeholder(tf.float32,
                                        shape=[None, *self.input_dims],
                                        name='inputs')

            # One-hot encoding of the actions that were taken.
            self.actions = tf.placeholder(tf.float32,
                                          shape=[None, self.n_actions],
                                          name='action_taken')

            # Target Q-values, one per action, for every sample in the batch.
            self.q_target = tf.placeholder(tf.float32,
                                           shape=[None, self.n_actions],
                                           name='q_target')

            he_init = tf.variance_scaling_initializer(scale=2)

            # Convolutional feature extractor, ReLU after every layer.
            conv1 = tf.layers.conv2d(inputs=self.input, filters=32,
                                     kernel_size=(8, 8), strides=4,
                                     name='conv1', kernel_initializer=he_init)
            conv1_activated = tf.nn.relu(conv1)

            conv2 = tf.layers.conv2d(inputs=conv1_activated, filters=64,
                                     kernel_size=(4, 4), strides=2,
                                     name='conv2', kernel_initializer=he_init)
            conv2_activated = tf.nn.relu(conv2)

            conv3 = tf.layers.conv2d(inputs=conv2_activated, filters=128,
                                     kernel_size=(3, 3), strides=1,
                                     name='conv3', kernel_initializer=he_init)
            conv3_activated = tf.nn.relu(conv3)

            # Flatten the feature maps and feed them to the dense layers.
            flat = tf.layers.flatten(conv3_activated)
            dense1 = tf.layers.dense(flat, units=self.fc1_dims,
                                     activation=tf.nn.relu,
                                     kernel_initializer=he_init)

            # One linear (non-activated) output per action: the Q-value of
            # each state-action pair.
            self.Q_values = tf.layers.dense(dense1, units=self.n_actions,
                                            kernel_initializer=he_init)

            # Q-value of the action actually taken, one value per sample.
            # ``axis=1`` is required: without it the sum collapses the whole
            # batch into a single scalar.
            self.q = tf.reduce_sum(tf.multiply(self.Q_values, self.actions),
                                   axis=1)

            # Mean squared error between the predicted Q-values and the
            # targets. ``q_target`` has shape [batch, n_actions], so it is
            # compared against ``Q_values`` (same shape) rather than against
            # the reduced ``q``, which would silently broadcast.
            self.loss = tf.reduce_mean(
                tf.square(self.Q_values - self.q_target))

            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)

    # Backward-compatible alias for the method's previous name.
    build_net = build_network

    def load_checkpoint(self):
        """Restore this network's session from ``self.checkpoint_file``.

        Training takes a long time, so we want to be able to stop and resume.
        """
        print('... loading checkpoint ...')
        self.saver.restore(self.sess, self.checkpoint_file)

    def save_checkpoint(self):
        """Write the current session's variables to ``self.checkpoint_file``."""
        print('... saving checkpoint ...')
        # Make sure the target directory exists before saving into it.
        checkpoint_dir = os.path.dirname(self.checkpoint_file)
        if checkpoint_dir:
            os.makedirs(checkpoint_dir, exist_ok=True)
        self.saver.save(self.sess, self.checkpoint_file)


# Summary
# The deep Q-network takes a batch of images from the environment (Breakout in
# this case), passes it through a CNN for feature extraction, then through a
# fully connected layer to determine the value of each action. The maximum
# value of the next action is used to build the loss, and the network is
# trained on that loss via backpropagation.


class Agent(object):
    """DQN agent: two Q-networks, a replay memory and an epsilon-greedy policy.

    ``q_eval`` is the network used to pick actions; ``q_next`` is the second
    network, meant to provide the value of the next action. Transitions
    (state, action, reward, new state, done flag) are stored in fixed-size
    NumPy arrays that are overwritten from the start once they are full.
    """

    def __init__(self, alpha, gamma, mem_size, n_actions, epsilon, batch_size,
                 replace_target=5000, input_dims=(210, 64, 4),
                 q_next='tmp/q_next', q_eval='tmp/q_eval'):
        """Create both networks and allocate the replay memory.

        Args:
            alpha: learning rate of both networks.
            gamma: discount factor applied to future rewards.
            mem_size: number of transitions the replay memory can hold.
            n_actions: number of discrete actions.
            epsilon: probability of taking a random action.
            batch_size: batch size, stored for later use.
            replace_target: how often the target network should be replaced,
                stored for later use.
            input_dims: shape of a single observation, without the batch
                dimension.
            q_next: checkpoint directory of the ``q_next`` network.
            q_eval: checkpoint directory of the ``q_eval`` network.
        """
        # NOTE(review): the default input_dims (210, 64, 4) differs from
        # DeepQNetwork's default (210, 160, 4); possibly a typo - confirm.

        # The action space is needed when sampling random actions.
        self.n_actions = n_actions
        self.action_space = list(range(self.n_actions))

        self.gamma = gamma
        self.mem_size = mem_size
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.mem_cntr = 0  # total number of transitions stored so far
        self.replace_target = replace_target

        # The directory parameters are called q_next / q_eval; the network
        # attributes of the same name are assigned right here.
        self.q_next = DeepQNetwork(alpha, n_actions, input_dims=input_dims,
                                   name='q_next', chkpt_dir=q_next)
        self.q_eval = DeepQNetwork(alpha, n_actions, input_dims=input_dims,
                                   name='q_eval', chkpt_dir=q_eval)

        # Replay memory: one row per stored transition.
        self.state_memory = np.zeros((self.mem_size, *input_dims))
        self.new_state_memory = np.zeros((self.mem_size, *input_dims))

        # One-hot encoded actions; int8 to save RAM.
        self.action_memory = np.zeros((self.mem_size, n_actions),
                                      dtype=np.int8)
        self.reward_memory = np.zeros(self.mem_size)

        # Holds ``1 - done`` for every transition; int8 to save RAM.
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int8)

    def store_transition(self, state, action, reward, state_, terminal):
        """Store one transition, overwriting the oldest entry when full.

        Args:
            state: observation before the action.
            action: integer index of the action taken.
            reward: reward received.
            state_: observation after the action.
            terminal: done flag; stored inverted as ``1 - terminal``.
        """
        # Wrap around so that the memory is reused from the beginning.
        index = self.mem_cntr % self.mem_size

        # One-hot encoding: all zeros except a one at the action's index.
        one_hot = np.zeros(self.n_actions, dtype=self.action_memory.dtype)
        one_hot[action] = 1

        self.state_memory[index] = state
        self.action_memory[index] = one_hot
        self.reward_memory[index] = reward
        self.new_state_memory[index] = state_
        self.terminal_memory[index] = 1 - int(terminal)

        self.mem_cntr += 1

    def choose_action(self, state):
        """Pick an action with an epsilon-greedy policy.

        With probability ``self.epsilon`` a random action from the action
        space is returned; otherwise the action with the highest Q-value
        according to ``q_eval``. Epsilon is meant to be decayed over time so
        that the agent starts out acting randomly and becomes increasingly
        greedy.

        Args:
            state: fed directly to the ``q_eval`` input placeholder, whose
                shape is ``[None, *input_dims]``, so it must already include
                the leading batch dimension.

        Returns:
            The index of the chosen action.
        """
        if np.random.random() < self.epsilon:
            # Explore: pick any action at random.
            return np.random.choice(self.action_space)

        # Exploit: evaluate the current state and take the best action.
        q_values = self.q_eval.sess.run(self.q_eval.Q_values,
                                        feed_dict={self.q_eval.input: state})
        return np.argmax(q_values)


# Agent used to be (accidentally) nested inside DeepQNetwork because of its
# indentation; keep that access path working.
DeepQNetwork.Agent = Agent

# Learning part
# Learning has several steps. First we check whether it is time to update the
# target network and, if it is, we go ahead and do that.
# Next we select a batch of random memories. The most important thing here is
# that these memories are non-sequential: if you choose sequential memories,
# the agent gets trapped in a small region of parameter space and what you get
# is oscillations in performance over time.
# For robust learning you want to sample different transitions from across the
# entire memory.
# Finally, you calculate the value of the current action as well as that of
# the maximal next action, plug both into the Bellman equation of the
# Q-learning algorithm, and run the update on the loss.


