# Assault (Atari 2600): which is the best agent?

### Install packages

In [None]:
# Install the third-party packages imported by the cells below.
# NOTE(review): versions are not pinned. The code below unpacks a 4-tuple from
# environment.step(...) and creates "Assault-v4" -- confirm the installed gym
# release still supports both.
!pip install imageio
!pip install gym
!pip install gym[all]
!pip install gym[atari]
!pip install imageio-ffmpeg
!pip3 install matplotlib
!pip install tensorflow_probability

In [None]:
# Download the ROM archive, unpack it in the working directory and register
# the ROMs found there with atari_py.
# NOTE(review): assumes wget, unrar and unzip are available on the host -- confirm.
! wget http://www.atarimania.com/roms/Roms.rar && unrar x Roms.rar && unzip Roms/ROMS.zip
! python3 -m atari_py.import_roms .

### Imports

In [3]:
import os
import numpy as np
import random
import imageio
import IPython
import base64
import gym

import cv2
from PIL import Image
from IPython.display import clear_output
from collections import deque
from datetime import datetime
import tensorflow as tf
# Compare the TensorFlow release numerically. A plain string comparison is
# lexicographic, so "2.10.0" > "2.4.0" is False and tensorflow_probability
# would silently not be imported on newer releases.
_tf_release = tuple(
    int(part)
    for part in tf.__version__.split("+")[0].split("-")[0].split(".")[:3]
    if part.isdigit()
)
if _tf_release > (2, 4, 0):
    import tensorflow_probability as tfp
import matplotlib.pyplot as plt

from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint

### Load Game

In [4]:
# Create the Assault environment shared by the cells below and show its
# observation space, action space and action names.
environment = gym.make("Assault-v4")

print(f'Number of states: {environment.observation_space}')
print(f'Number of actions: {environment.action_space}')
print(environment.unwrapped.get_action_meanings())

Number of states: Box(0, 255, (210, 160, 3), uint8)
Number of actions: Discrete(7)
['NOOP', 'FIRE', 'UP', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']


### Utils

In [23]:
def image_preprocess_observations(frame, shape=(200, 160)):
    """Convert a raw RGB frame into a cropped grayscale image.

    Args:
        frame: RGB image array with three channels, e.g. the (210, 160, 3)
            observation reported by the environment cell.
        shape: Desired (height, width) of the result. The default equals the
            size of the cropped (210, 160) frame, so no resampling happens.

    Returns:
        2-D ``np.uint8`` array of shape ``shape``.
    """
    frame = np.asarray(frame).astype(np.uint8)  # cv2 requires np.uint8, other dtypes will not work

    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    gray = gray[10:, :160]  # crop: drop the first 10 rows, keep at most 160 columns

    target = tuple(shape)
    if gray.shape != target:
        # cv2.resize takes dsize as (width, height), i.e. reversed with
        # respect to the numpy (rows, cols) shape.
        gray = cv2.resize(gray, (target[1], target[0]), interpolation=cv2.INTER_NEAREST)

    return gray


def create_video(env, model, video_filename='imageio', num_episodes=2):
    """Record the agent playing and return the video as embeddable HTML.

    Args:
        env: Environment to play in; ``reset()`` must return an RGB frame and
            ``step(action)`` a 4-tuple whose third item is the terminal flag.
        model: Agent exposing ``act(states)``; it receives a float32 batch of
            shape (1, height, width, 4) holding the last four frames.
        video_filename: Output path without extension; ".mp4" is appended.
        num_episodes: Number of episodes written to the video.

    Returns:
        ``IPython.display.HTML`` object embedding the written mp4 file, so a
        cell ending with this call displays the video.
    """
    def embed_mp4(filename):
        """Embeds an mp4 file in the notebook."""
        with open(filename, 'rb') as video_file:
            b64 = base64.b64encode(video_file.read())
        tag = '''
    <video width="640" height="480" controls>
      <source src="data:video/mp4;base64,{0}" type="video/mp4">
    Your browser does not support the video tag.
    </video>'''.format(b64.decode())

        return IPython.display.HTML(tag)

    video_filename = video_filename + ".mp4"
    with imageio.get_writer(video_filename, fps=60) as video:
        for _ in range(num_episodes):
            observation = env.reset()

            # Start with the first frame repeated four times.
            state = image_preprocess_observations(observation)
            states = [state, state, state, state]

            terminated = False
            video.append_data(observation)
            while not terminated:
                # Stack the frames along the last axis, matching the layout the
                # agents' play() methods build with np.stack(..., axis=2).
                # A reshape of (4, H, W) to (H, W, 4) would scramble pixels.
                stacked = np.stack(states, axis=2).astype(np.float32)
                action = model.act(np.expand_dims(stacked, axis=0))

                observation, _, terminated, _ = env.step(action)

                # Slide the window: drop the oldest frame, add the newest.
                states = states[1:] + [image_preprocess_observations(observation)]
                video.append_data(observation)

    return embed_mp4(video_filename)


def play_games(environment, agent, num_games=4):
    """Let the agent play several full games and return the summed reward.

    Args:
        environment: Environment whose ``step`` returns a 4-tuple
            (observation, reward, terminated, info).
        agent: Agent exposing ``act(states)``; it receives a float32 batch of
            shape (1, height, width, 4) holding the last four frames.
        num_games: Number of games to play.

    Returns:
        Sum of all rewards collected over every game.
    """
    print(agent)
    total_reward = 0
    for i in range(num_games):
        environment.reset()
        # Open every game with action 1 and use the resulting frame as the
        # first state.
        observation, _, _, _ = environment.step(1)

        state = image_preprocess_observations(observation)
        states = [state, state, state, state]

        terminated = False
        print("Start game {}".format(i))
        while not terminated:
            # Stack the frames along the last axis, matching the layout the
            # agents' play() methods build with np.stack(..., axis=2).
            # A reshape of (4, H, W) to (H, W, 4) would scramble pixels.
            stacked = np.stack(states, axis=2).astype(np.float32)
            env_action = agent.act(np.expand_dims(stacked, axis=0))

            observation, reward, terminated, _ = environment.step(env_action)

            # Slide the window: drop the oldest frame, add the newest.
            states = states[1:] + [image_preprocess_observations(observation)]

            total_reward += reward
    return total_reward


def self_play(best_agent_reward, current_agent, environment):
    """Evaluate ``current_agent`` and save its weights when it beats the best score.

    Args:
        best_agent_reward: Best total reward seen so far.
        current_agent: Agent to evaluate; must expose ``save_model``.
        environment: Environment handed to ``play_games``.

    Returns:
        Total reward obtained by ``current_agent``, whether or not it is a
        new best.
    """
    achieved_reward = play_games(environment, current_agent)
    print(f"Best reward {best_agent_reward}")

    is_improvement = achieved_reward > best_agent_reward
    if is_improvement:
        print("Better model found! Saving best_model")
        current_agent.save_model(0, "Break_best_model")

    return achieved_reward


def plot_bar(data, n, filename="Bar"):
    """Save a bar chart of per-action counts to ``filename + ".png"``.

    Args:
        data: Bar heights, one per position.
        n: Number of bars.
        filename: Output path without the ".png" extension.
    """
    action_names = ['NOOP', 'FIRE', "UP", 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

    fig, ax = plt.subplots()
    ax.bar(np.arange(n), data, align='center', alpha=0.5)
    ax.set_xlabel('Actions')
    ax.set_ylabel('Counts')
    ax.set_xticks(np.arange(7))
    ax.set_xticklabels(action_names)
    ax.set_title('Histogram of Actions choosen')
    ax.grid(True)
    fig.savefig(filename + ".png")
    plt.close(fig)
    
    
def plot_graph(data, filename="Graph"):
    """Save a line plot of per-episode rewards to ``filename + ".png"``.

    Args:
        data: Sequence of rewards, one entry per episode.
        filename: Output path without the ".png" extension.
    """
    fig, ax = plt.subplots()
    ax.plot(data)
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Rewards')
    ax.set_xticks(np.arange(len(data)))  # one tick per episode
    ax.set_title('Episode Rewards')
    ax.grid(True)
    fig.savefig(filename + ".png")
    plt.close(fig)
    
    
def plot_mult_bar(data1, data2, n, filename="MultiBar", labels=None):
    """Save a grouped bar chart comparing two sets of per-action counts.

    The two series are drawn side by side, each shifted by half a bar width
    around the tick. Drawing both at the same x position would make them
    overlap.

    Args:
        data1: Bar heights of the first series.
        data2: Bar heights of the second series.
        n: Number of bars per series.
        filename: Output path without the ".png" extension.
        labels: Optional pair of legend labels for (data1, data2); no legend
            is drawn when omitted.
    """
    w = 0.3
    positions = np.arange(n)

    fig, ax = plt.subplots()
    first = ax.bar(positions - w / 2, data1, width=w, align='center', alpha=0.5)
    second = ax.bar(positions + w / 2, data2, width=w, align='center', alpha=0.5)
    ax.set_xlabel('Actions')
    ax.set_ylabel('Counts')
    ax.set_xticks(np.arange(7))
    ax.set_xticklabels(['NOOP', 'FIRE', "UP", 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE'])
    ax.set_title('Histogram of Actions choosen')
    ax.grid(True)
    if labels is not None:
        ax.legend([first, second], list(labels))
    fig.savefig(filename + ".png")
    plt.close(fig)
    
    
def plot_multi_graph(data1, data2, filename="MultiGraph"):
    """Save a line plot overlaying two reward curves to ``filename + ".png"``.

    Args:
        data1: First reward sequence; its length sets the x ticks.
        data2: Second reward sequence.
        filename: Output path without the ".png" extension.
    """
    fig, ax = plt.subplots()
    for series in (data1, data2):
        ax.plot(series)
    ax.set_xlabel('Episodes')
    ax.set_ylabel('Rewards')
    ax.set_xticks(np.arange(len(data1)))
    ax.set_title('Episode Rewards')
    ax.grid(True)
    fig.savefig(filename + ".png")
    plt.close(fig)

### DQN Agent

In [6]:
class DQNAgent:
    """Deep Q-Network agent with experience replay and a target network.

    The agent observes stacks of four preprocessed frames (``state_size``) and
    chooses one of ``action_size`` discrete actions with an epsilon-greedy
    policy on top of ``q_network``.
    """

    def __init__(self, n_episode, greedy=False):
        """Create the networks, optimizer, replay buffer and epsilon schedule.

        Args:
            n_episode: Number of training episodes; length of the epsilon
                schedule stored in ``self.epsilon``.
            greedy: When True, ``play`` always acts with the fixed
                ``epsilon_final`` rate instead of the per-episode schedule.
        """
        # Four preprocessed frames stacked along the last axis.
        self.state_size = (200, 160, 4)  # environment.observation_space.shape*4
        self.action_size = 7  # environment.action_space.n
        self.greedy = greedy

        # 0.95 is passed as `rho`, RMSprop's squared-gradient discounting
        # factor. It used to be passed as `decay`, which Keras interprets as
        # learning-rate decay and which would shrink the step size towards
        # zero within a few updates.
        self.optimizer = RMSprop(learning_rate=0.00025,
                                 rho=0.95,
                                 momentum=0.0,
                                 epsilon=0.00001,
                                 centered=True)
        self.n_episode = n_episode
        self.batch_size = 32

        self.expirience_replay = deque(maxlen=5000)

        # Discount factor and exploration rates (probabilities in [0, 1]).
        self.gamma = 0.95
        self.epsilon_init = 0.1
        self.epsilon_final = 0.01
        self.epsilon = self._epsilon_schedule(self.epsilon_init, self.epsilon_final, self.n_episode)

        # Build networks
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model()

        self.best_agent_reward = 0

    @staticmethod
    def _epsilon_schedule(start, final, n_steps):
        """Return ``n_steps`` exploration rates decaying geometrically from ``start`` to ``final``.

        ``np.geomspace`` takes the endpoints themselves. ``np.logspace`` takes
        base-10 exponents, so (0.1, 0.01) would give rates above 1 and make
        every training action random.
        """
        return np.geomspace(start, final, max(int(n_steps), 0), endpoint=True)

    def store(self, state, action, reward, next_state, terminated):
        """Append one transition to the replay buffer."""
        self.expirience_replay.append((state, action, reward, next_state, terminated))

    def _build_compile_model(self):
        """Build and compile the convolutional Q-network (one output per action)."""
        model = Sequential()
        model.add(Conv2D(16, (8, 8), strides=(4, 4), padding='same', activation='relu', input_shape=self.state_size))
        model.add(Conv2D(32, (4, 4), strides=(2, 2), activation='relu'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        # The compiled loss is only used by train_on_single (via fit);
        # train_on_batch computes its own loss.
        model.compile(loss=tf.keras.losses.Huber(), optimizer=self.optimizer, metrics=['accuracy'])

        return model

    def align_target_model(self):
        """Copy the online network's weights into the target network."""
        print("Aligning models..")
        self.target_network.set_weights(self.q_network.get_weights())

    def save_model(self, episode, file_name=None):
        """Save the online network's weights to ``file_name + '.h5'``.

        When ``file_name`` is None a name is built from the episode, the best
        reward so far and the current timestamp.

        Returns:
            The file name used, without the ".h5" extension.
        """
        print("Saving weights...")
        if file_name is None:
            file_name = 'DQN_net_weights-' + str(episode) + '-' + str(self.best_agent_reward) + '-'+ datetime.now().strftime("%Y%m%d-%H%M%S")
        self.q_network.save_weights(file_name + '.h5')
        return file_name

    def load_model(self, filename):
        """Load weights from ``filename + '.h5'`` if that file exists; otherwise do nothing."""
        if (os.path.isfile(filename + '.h5')):
            print("Loading previous weights: {}".format(filename))
            self.q_network.load_weights(filename + '.h5')
            self.align_target_model()

    def act(self, state, episode=None):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Batch containing a single stacked state.
            episode: Training episode index selecting the exploration rate
                from ``self.epsilon``. None means play mode, which uses the
                fixed ``epsilon_final`` rate.

        Returns:
            Index of the chosen action.
        """
        # `is not None` rather than truthiness: episode 0 is a valid training
        # episode and must use the schedule as well.
        if episode is not None and len(self.epsilon) > 0:
            # Clamp so that playing more episodes than the schedule is long
            # keeps using the final scheduled rate.
            explore_prob = self.epsilon[min(episode, len(self.epsilon) - 1)]
        else:
            explore_prob = self.epsilon_final

        if np.random.rand() <= explore_prob:
            return random.randint(0, self.action_size-1)
        return np.argmax(self.q_network.predict(state, verbose=0)[0])

    def train_on_single(self):
        """Sample a minibatch and fit the online network one transition at a time.

        Returns None without training while the buffer holds fewer than
        ``batch_size`` transitions.
        """
        if len(self.expirience_replay) < self.batch_size:
            return None

        print("Train on single..")
        minibatch = random.sample(self.expirience_replay, self.batch_size)

        for state, action, reward, next_state, terminated in minibatch:
            state_batch = np.expand_dims(state, axis=0)
            target = self.q_network.predict(state_batch, verbose=0)

            if terminated:
                # Terminal transition: there is no future value to bootstrap from.
                target[0][action] = reward
            else:
                # Bootstrap from the target network's best action value.
                t = self.target_network.predict(np.expand_dims(next_state, axis=0), verbose=0)
                target[0][action] = reward + self.gamma * np.max(t)

            self.q_network.fit(state_batch, target, epochs=1, verbose=0)

    def get_arrays_from_batch(self, batch):
        """Split a list of transitions into one array per field.

        Args:
            batch: Iterable of (state, action, reward, next_state, terminated).

        Returns:
            Tuple (states, actions, rewards, next_states, terminateds) of
            numpy arrays, each with the batch as leading axis.
        """
        batch = list(batch)
        states = np.array([transition[0] for transition in batch])
        actions = np.array([transition[1] for transition in batch])
        rewards = np.array([transition[2] for transition in batch])
        next_states = np.array([transition[3] for transition in batch])
        terminateds = np.array([transition[4] for transition in batch])

        return states, actions, rewards, next_states, terminateds

    def train_on_batch(self):
        """Run one gradient step on a minibatch sampled from the replay buffer.

        Returns:
            The loss tensor, or None while the buffer holds fewer than
            ``batch_size`` transitions.
        """
        if len(self.expirience_replay) < self.batch_size:
            return None

        minibatch = random.sample(self.expirience_replay, self.batch_size)
        states, actions, rewards, next_states, terminateds = self.get_arrays_from_batch(minibatch)
        states = states.astype(np.float32)
        next_states = next_states.astype(np.float32)

        # Value of the best next action according to the target network.
        next_Q_values = self.target_network.predict(next_states, verbose=0)
        max_next_Q_values = np.max(next_Q_values, axis=1)

        # Bellman targets; terminal transitions contribute the reward only.
        not_terminal = 1.0 - terminateds.astype(np.float32)
        target_Q_values = (rewards + not_terminal * self.gamma * max_next_Q_values)
        target_Q_values = target_Q_values.reshape(-1, 1)

        mask = tf.one_hot(actions, self.action_size)

        with tf.GradientTape() as tape:
            all_Q_values = self.q_network(states)
            # Keep only the value of the action that was actually taken.
            Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
            loss = tf.reduce_mean(tf.keras.losses.MeanSquaredError()(target_Q_values, Q_values))

        grads = tape.gradient(loss, self.q_network.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.q_network.trainable_variables))

        return loss

    def play(self, environment, num_of_episodes, timesteps_per_episode, train=True):
        """Play episodes, optionally training, and collect statistics.

        Args:
            environment: Environment whose ``step`` returns a 4-tuple
                (observation, reward, terminated, info).
            num_of_episodes: Number of episodes to play.
            timesteps_per_episode: Maximum number of steps per episode.
            train: When True, train on replayed minibatches and periodically
                align the target network.

        Returns:
            Tuple (total_reward_list, total_actions_list): the reward of each
            episode (including episodes cut off by the timestep limit) and
            every action taken, in order.
        """
        steps_since_training = 0
        total_reward_list = []
        total_actions_list = []
        for e in range(0, num_of_episodes):
            print("Episode {} start...".format(e))
            first_frame = image_preprocess_observations(environment.reset())
            # Start with the first frame repeated four times.
            states = np.stack([first_frame] * 4, axis = 2)

            episode_reward = 0
            terminated = False

            for timestep in range(timesteps_per_episode):
                if self.greedy:
                    env_action = self.act(np.expand_dims(states, axis=0))
                else:
                    env_action = self.act(np.expand_dims(states, axis=0), e)

                next_state, reward, terminated, info = environment.step(env_action)
                next_state = image_preprocess_observations(next_state)
                # Slide the frame window: drop the oldest, append the newest.
                next_states = np.append(states[:, :, 1: ], np.expand_dims(next_state, 2), axis = 2)

                self.store(states, env_action, reward, next_states, terminated)

                # Training on every step is too expensive, so train only
                # after more than 100 steps since the last update.
                if steps_since_training > 100 and train:
                    self.train_on_batch()
                    steps_since_training = 0

                steps_since_training += 1
                states = next_states

                episode_reward += reward
                total_actions_list.append(env_action)

                if terminated:
                    print("The rewards for the episode {} after {} timestep is {}".format(e, timestep, episode_reward))
                    total_reward_list.append(episode_reward)

                    # DQN training at the end of the episode
                    if train:
                        loss = self.train_on_batch()
                        print("Terminated loss: " + str(loss))
                        steps_since_training = 0

                    print("____________END__________________")
                    break
            else:
                # Timestep limit reached without a terminal state: still
                # record the episode so the list has one entry per episode.
                total_reward_list.append(episode_reward)

            if (e + 1) % 20 == 0:
                print("***************ALIGN-MODEL*******************")
                if train:
                    self.align_target_model()
                print("**********************************")
            if (e + 1) % 100 == 0:
                print("***************SAVE-WEIGHTS*******************")
                new_best_agent_reward = self_play(self.best_agent_reward, self, environment)
                if new_best_agent_reward > self.best_agent_reward:
                    self.best_agent_reward = new_best_agent_reward
                    self.save_model(e)
                print("the score is " + str(new_best_agent_reward))
                print("**********************************")

        return total_reward_list, total_actions_list
      

### Actor-Critic Agent

In [20]:
class Model(tf.keras.Model):
    """Actor-critic network: a shared convolutional trunk feeding two heads.

    ``call`` returns the pair (value, action_probabilities).
    """

    def __init__(self, state_size, action_size):
        """Create the layers.

        Args:
            state_size: Shape of one input state, passed to the first
                convolution as ``input_shape``.
            action_size: Number of units of the softmax action head.
        """
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size

        # Shared trunk
        self.conv_1 = Conv2D(filters=16, kernel_size=(8, 8), strides=(4, 4),
                             padding='same', activation='relu', input_shape=state_size)
        self.conv_2 = Conv2D(filters=32, kernel_size=(4, 4), strides=(2, 2), activation='relu')
        self.dense = Dense(units=256, activation='relu')
        # Head-specific hidden layers
        self.dense_v = Dense(units=128, activation='relu')
        self.dense_a = Dense(units=128, activation='relu')
        # Outputs: a single linear value and a softmax over the actions
        self.out_v = Dense(units=1, activation=None)
        self.out_a = Dense(units=self.action_size, activation='softmax')

    def call(self, input_data):
        """Run the network and return ``(value, action_probabilities)``."""
        features = self.conv_2(self.conv_1(input_data))
        features = Flatten()(features)
        shared = self.dense(features)

        action_probs = self.out_a(self.dense_a(shared))
        state_value = self.out_v(self.dense_v(shared))
        return state_value, action_probs
    

class ACAgent():
    """Actor-critic agent built on ``Model`` (shared trunk, value and policy heads).

    Actions are sampled from the policy head's probabilities; training uses
    the one-step TD error as advantage (``train_on_single``) or discounted
    returns (``train_on_batch``).
    """

    def __init__(self, gamma = 0.99):
        """Build the network and optimizer.

        Args:
            gamma: Discount factor applied to future rewards.
        """
        self.state_size = (200, 160, 4)
        self.action_size = 7
        self.gamma = gamma
        self.optimizer = Adam(learning_rate=1e-6)
        m = Model(self.state_size, self.action_size)
        m.compile(loss=tf.keras.losses.Huber(), optimizer=self.optimizer, metrics=['accuracy'])
        m.build((None, 200, 160, 4))
        self.model = m
        # Read by save_model when building the default file name; without
        # this attribute the periodic save in play() raised AttributeError.
        self.best_agent_reward = 0

    def save_model(self, episode, file_name=None):
        """Save the network weights to ``file_name + '.h5'``.

        When ``file_name`` is None a name is built from the episode, the best
        episode reward so far and the current timestamp.

        Returns:
            The file name used, without the ".h5" extension.
        """
        print("Saving weights...")
        if file_name is None:
            file_name = 'AC_net_weights-' + str(episode) + '-'+ str(self.best_agent_reward) + '-'+ datetime.now().strftime("%Y%m%d-%H%M%S")
        self.model.save_weights(file_name + '.h5')
        return file_name

    def load_model(self, filename):
        """Load weights from ``filename + '.h5'`` if that file exists; otherwise do nothing."""
        if (os.path.isfile(filename + '.h5')):
            print("Loading previous weights: {}".format(filename))
            self.model.load_weights(filename + '.h5')

    def act(self, state, e=None):
        """Sample an action from the policy for a batch holding one state.

        ``e`` is accepted for signature compatibility with ``DQNAgent.act``
        and is not used.
        """
        _, prob = self.model(state)
        prob = prob.numpy()
        # Sample from the categorical distribution defined by the policy head.
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        action = dist.sample()
        return int(action.numpy()[0])

    def actor_loss_single(self, prob, action, td):
        """Policy-gradient loss ``-log pi(action) * td`` for one transition."""
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        log_prob = dist.log_prob(action)
        loss = -log_prob*td
        return loss

    def train_on_single(self, state, action, reward, next_state, done):
        """Run one gradient step on a single transition.

        Returns:
            The combined loss tensor (actor loss + 0.4 * critic loss).
        """
        # create batches of 1
        state = np.array([state])
        next_state = np.array([next_state])

        with tf.GradientTape() as tape:
            # value and action probabilities for the state St
            v, p =  self.model(state, training=True)
            # value for the state St+1
            vn, _ = self.model(next_state, training=True)
            # td error; the bootstrap term is dropped on terminal transitions
            td = reward + self.gamma * vn * (1 - int(done)) - v

            # The advantage is a constant for the actor: stop the gradient so
            # the policy loss does not also update the critic through td.
            # This matches actor_loss_on_batch, which detaches td via .numpy().
            a_loss = self.actor_loss_single(p, action, tf.stop_gradient(td))
            c_loss = td**2  # to reproduce a MSE with only one data
            total_loss = a_loss + 0.4 * c_loss

        grads = tape.gradient(total_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return total_loss

    def get_discounted_rewards(self, rewards):
        """Return the discounted return for every step of an episode.

        The input sequence is left untouched; reversing it in place would
        hand the caller back a reversed list.

        Args:
            rewards: Per-step rewards in chronological order.

        Returns:
            List with ``G_t = r_t + gamma * G_{t+1}`` for every step ``t``.
        """
        discounted_rewards = []
        sum_reward = 0
        for r in reversed(list(rewards)):
            sum_reward = r + self.gamma * sum_reward
            discounted_rewards.append(sum_reward)
        discounted_rewards.reverse()

        return discounted_rewards

    def actor_loss_on_batch(self, probs, actions, td):
        """Mean policy-gradient loss over a batch, minus a small entropy term.

        Args:
            probs: Action probabilities, one row per sample.
            actions: Actions taken, one per sample.
            td: Advantage tensor; converted to numpy, so no gradient flows
                through it.
        """
        probability = []
        log_probability= []
        for pb,a in zip(probs,actions):
            dist = tfp.distributions.Categorical(probs=pb, dtype=tf.float32)
            log_prob = dist.log_prob(a)
            prob = dist.prob(a)
            probability.append(prob)
            log_probability.append(log_prob)

        p_loss= []
        e_loss = []
        td = td.numpy()

        for pb, t, lpb in zip(probability, td, log_probability):
            t =  tf.constant(t)
            policy_loss = tf.math.multiply(lpb,t)
            entropy_loss = tf.math.negative(tf.math.multiply(pb,lpb))
            p_loss.append(policy_loss)
            e_loss.append(entropy_loss)
        p_loss = tf.stack(p_loss)
        e_loss = tf.stack(e_loss)
        p_loss = tf.reduce_mean(p_loss)
        e_loss = tf.reduce_mean(e_loss)

        loss = -p_loss - 0.0001 * e_loss
        return loss

    def train_on_batch(self, states, actions, discounted_rewards):
        """Run one gradient step on a whole episode using discounted returns.

        Returns:
            Tuple (actor_loss, critic_loss).
        """
        states = np.array(states, dtype=np.float32)
        actions = np.array(actions, dtype=np.int32)
        discounted_rewards = np.array(discounted_rewards, dtype=np.float32)
        discounted_rewards = tf.reshape(discounted_rewards, (len(discounted_rewards),))
        print(states.shape)

        with tf.GradientTape() as tape:
            v, p = self.model(states, training=True)
            v = tf.reshape(v, (len(v),))
            td = tf.math.subtract(discounted_rewards, v)
            a_loss = self.actor_loss_on_batch(p, actions, td)
            c_loss = 0.5*tf.keras.losses.mean_squared_error(discounted_rewards, v)
            total_loss = a_loss + 0.4 * c_loss

        grads = tape.gradient(total_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
        return a_loss, c_loss

    def play(self, environment, num_of_episodes, timesteps_per_episode, train=True):
        """Play episodes, optionally training after every step.

        Args:
            environment: Environment whose ``step`` returns a 4-tuple
                (observation, reward, terminated, info).
            num_of_episodes: Number of episodes to play.
            timesteps_per_episode: Accepted for signature compatibility with
                ``DQNAgent.play``; episodes always run until the environment
                reports termination.
            train: When True, call ``train_on_single`` after every step.

        Returns:
            Tuple (total_reward_list, total_actions_list): the reward of each
            episode and every action taken, in order.
        """
        total_reward_list = []
        total_actions_list = []
        for e in range(0, num_of_episodes):
            print("Episode {} start...".format(e))
            first_frame = image_preprocess_observations(environment.reset())
            # Start with the first frame repeated four times.
            states = tf.cast(np.stack([first_frame] * 4, axis = 2), tf.float32)

            episode_reward = 0
            terminated = False
            timestep = -1
            # Per-step states are deliberately not accumulated: nothing reads
            # them, and a list of them grows without bound over an episode.
            while True:
                timestep += 1
                env_action = self.act(np.expand_dims(states, axis=0))

                next_state, reward, terminated, info = environment.step(env_action)
                next_state = image_preprocess_observations(next_state)
                # Slide the frame window: drop the oldest, append the newest.
                next_states = tf.cast(np.append(states[:, :, 1: ], np.expand_dims(next_state, 2), axis = 2), tf.float32)

                if train:
                    self.train_on_single(states, env_action, reward, next_states, terminated)

                states = next_states
                episode_reward += reward
                total_actions_list.append(env_action)

                if terminated:
                    print("The rewards for the episode {} after {} timestep is {}".format(e, timestep, episode_reward))
                    total_reward_list.append(episode_reward)
                    # Track the best episode so saved file names carry it.
                    self.best_agent_reward = max(self.best_agent_reward, episode_reward)
                    print("____________END__________________")
                    break

            if (e + 1) % 50 == 0:
                print("***************SAVE-WEIGHTS*******************")
                self.save_model(e)
                print("**********************************")

        return total_reward_list, total_actions_list
    

## Training

In [None]:
# Training budget shared by both agents.
num_of_episodes = 50_000
timesteps_per_episode = 5_500
cont_e = 0

# DQN agent: load_model only restores weights when "<name>.h5" exists.
dqn_model_file = ""
dqn_agent = DQNAgent(n_episode=num_of_episodes, greedy=False)
dqn_agent.load_model(filename=dqn_model_file)

# Actor-critic agent, same optional warm start.
ac_agent = ACAgent()
ac_model_file = ""
ac_agent.load_model(filename=ac_model_file)

In [None]:
# Train the DQN agent, then the actor-critic agent, on the same environment.
# Each call returns the per-episode rewards and the list of every action taken.
total_reward_dqn, total_action_dqn = dqn_agent.play(environment, num_of_episodes, timesteps_per_episode, train=True)

total_reward_a2c, total_action_a2c = ac_agent.play(environment, num_of_episodes, timesteps_per_episode, train=True)

## Evaluation

In [None]:
# Reset the shared environment before evaluating the agents.
environment.reset()

In [None]:
# Evaluate a stored actor-critic agent without training.
num_of_evaluation_step = 10
timesteps_per_episode = 1500

ac_filename = "7"
ac_loaded_model = ACAgent()
ac_loaded_model.load_model(ac_filename)

total_reward_a2c, total_action_a2c = ac_loaded_model.play(
    environment, num_of_evaluation_step, timesteps_per_episode, train=False
)

# How often each of the 7 actions was chosen.
actions_count_a2c = [total_action_a2c.count(action) for action in range(7)]
plot_bar(actions_count_a2c, len(actions_count_a2c), f"ActionsBarA2C{num_of_evaluation_step}")
plot_graph(total_reward_a2c, filename=f"RewardGraphA2C{num_of_evaluation_step}")

In [None]:
# Evaluate a stored DQN agent without training; greedy=True makes play()
# call act() without an episode index.
num_of_evaluation_step = 10
timesteps_per_episode = 5500

dqn_model_file = ""
dqn_agent = DQNAgent(num_of_evaluation_step, greedy=True)
dqn_agent.load_model(dqn_model_file)

total_reward_dqn, total_action_dqn = dqn_agent.play(
    environment, num_of_evaluation_step, timesteps_per_episode, train=False
)

# How often each of the 7 actions was chosen.
actions_count_dqn = [total_action_dqn.count(action) for action in range(7)]
plot_bar(actions_count_dqn, len(actions_count_dqn), f"ActionsBarDQN{num_of_evaluation_step}")
plot_graph(total_reward_dqn, filename=f"RewardGraphDQN{num_of_evaluation_step}")


In [None]:
# Compare both agents using the rewards and action counts computed by the two
# evaluation cells above (both must have been run first).
plot_multi_graph(total_reward_dqn, total_reward_a2c, filename="RewardGraphCompare")
plot_mult_bar(actions_count_dqn, actions_count_a2c, len(actions_count_dqn), filename="ActionBarCompare")

## Video Creation

In [None]:
# Record the actor-critic agent; create_video appends ".mp4" to the given name.
# Weights are only loaded if "AC_best_weights.h5" exists.
ac_best_model = ACAgent()
ac_filename = "AC_best_weights"
ac_best_model.load_model(ac_filename)
create_video(environment, ac_best_model, "AC_Best")

In [None]:
# Record the DQN agent; create_video appends ".mp4" to the given name.
# Weights are only loaded if "DQN_best_weights.h5" exists.
dqn_best_agent = DQNAgent(1, True)
dqn_filename = "DQN_best_weights"
dqn_best_agent.load_model(dqn_filename)
create_video(environment, dqn_best_agent, "DQN_Best")