In [None]:
# import necessary packages:

import gym 
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import numpy as np
from cpprb import PrioritizedReplayBuffer

In [None]:
# have a look at the environment:
# build the Space Invaders environment registered under the "-ram" id
e = gym.make('ALE/SpaceInvaders-ram-v5')
# start an episode, then take one step with action 0 and one with action 1;
# only the result of the last expression is displayed as the cell output
e.reset()
e.step(0)
e.step(1)

In [None]:
# create wrapper to make it easier to interact with the environment:

class wrapper():
    """Thin convenience layer around a gym Atari environment.

    Keeps the most recent observation in ``self.state`` and reports on every
    step whether a life was lost, in addition to the episode-termination flag.

    Both gym calling conventions are accepted:

    * older: ``reset() -> obs`` and ``step() -> (obs, reward, done, info)``
    * newer: ``reset() -> (obs, info)`` and
      ``step() -> (obs, reward, terminated, truncated, info)``
    """

    def __init__(self, envName="ALE/SpaceInvaders-ram-v5"):
        """Create the environment registered under ``envName``."""
        self.env = gym.make(envName)
        self.state = None
        self.last_lives = 0

    def reset(self):
        """Start a new episode and store its first observation in ``self.state``.

        Returns:
            bool: always ``True`` (the "life lost" flag for a fresh episode).
        """
        result = self.env.reset()
        # The newer API returns (observation, info); the older one returns the
        # observation alone. Only the observation is kept.
        self.state = result[0] if isinstance(result, tuple) else result
        self.last_lives = 0
        return True

    def step(self, action):
        """Advance the environment by one step.

        Args:
            action: action passed straight to ``env.step``.

        Returns:
            tuple: ``(new_ram, reward, terminal, terminal_life_lost)`` where
            ``terminal`` is the end-of-episode flag and ``terminal_life_lost``
            is ``True`` when ``info['lives']`` dropped since the previous step,
            otherwise equal to ``terminal``.
        """
        result = self.env.step(action)
        if len(result) == 5:
            # Newer API: the episode is over when it terminated or was truncated.
            new_ram, reward, terminated, truncated, info = result
            terminal = terminated or truncated
        else:
            new_ram, reward, terminal, info = result

        lives = info['lives']
        terminal_life_lost = True if lives < self.last_lives else terminal
        self.last_lives = lives

        self.state = new_ram

        return new_ram, reward, terminal, terminal_life_lost

In [None]:
# create a class for our replay buffer using Prioritized Experience Replay from the cpprb package:

class ExperienceReplay():
    """Replay memory backed by cpprb's ``PrioritizedReplayBuffer``.

    Every stored transition has the fields ``obs``, ``act``, ``rew``,
    ``next_obs`` and ``done``. The default capacity is 100,000 transitions.
    """
    def __init__(self, size=100000, ram=128, batch_size=32):
        """Create the buffer.

        Args:
            size: capacity handed to ``PrioritizedReplayBuffer``.
            ram: shape entry used for ``obs`` and ``next_obs``.
            batch_size: number of transitions drawn by ``get_minibatch``.
        """
        self.size = size
        self.ram = ram
        self.batch_size = batch_size

        env_dict = {
            "obs": {"shape": ram},
            "act": {},
            "rew": {},
            "next_obs": {"shape": ram},
            "done": {},
        }
        self.prb = PrioritizedReplayBuffer(size, env_dict, alpha=0.5)

    def add_experience(self, action, ram, reward, new_ram, terminal):
        """Store one transition in the buffer."""
        transition = dict(obs=ram,
                          act=action,
                          rew=reward,
                          next_obs=new_ram,
                          done=terminal)
        self.prb.add(**transition)

    def get_minibatch(self):
        """Sample ``batch_size`` transitions.

        Returns:
            tuple: ``(obs, act, rew, next_obs, done)``. ``act``, ``rew`` and
            ``done`` are rebuilt as arrays holding the first element of each
            sampled row; ``obs`` and ``next_obs`` are returned as sampled.
        """
        batch = self.prb.sample(self.batch_size)
        actions, rewards, dones = (
            np.array([row[0] for row in batch[field]])
            for field in ('act', 'rew', 'done')
        )
        return batch['obs'], actions, rewards, batch['next_obs'], dones

In [None]:
# make a class for decreasing the exploration probability over time:

class Exploration():
    """Epsilon-greedy action selection with a linearly annealed epsilon.

    Schedule while training (``playing`` is False):

    * ``frame_number <= warmup_frames``: epsilon is ``eps_init``;
    * ``warmup_frames < frame_number < eps_anneal``: epsilon follows
      ``np.linspace(eps_init, eps_f, eps_anneal)``, indexed by the number of
      frames elapsed since the warm-up ended;
    * afterwards: epsilon is ``eps_f``.

    When ``playing`` is True epsilon is 0 for every frame number, so the
    network's best action is always taken.
    """
    def __init__(self, DQN, n_actions, eps_init=1.0, eps_f=0.01, eps_anneal=2000000,
                 warmup_frames=10000):
        """
        Args:
            DQN: object exposing ``best_action`` and ``input`` for ``session.run``.
            n_actions: number of discrete actions to sample random actions from.
            eps_init: epsilon used during the warm-up.
            eps_f: final epsilon.
            eps_anneal: frame number from which ``eps_f`` is used.
            warmup_frames: frames during which epsilon stays at ``eps_init``.
        """
        self.n_actions = n_actions
        self.eps_anneal = eps_anneal
        self.eps_f = eps_f
        self.warmup_frames = warmup_frames
        self.linspace = np.linspace(eps_init, eps_f, eps_anneal)
        self.DQN = DQN
        self.playing = False

    def _epsilon(self, frame_number):
        """Return the exploration probability for ``frame_number``."""
        # Evaluation mode wins over the schedule; checking it after the frame
        # tests would leave the agent exploring whenever a small frame number
        # is passed while playing.
        if self.playing:
            return 0.0
        if frame_number <= self.warmup_frames:
            return self.linspace[0]
        if frame_number < self.eps_anneal:
            return self.linspace[frame_number - self.warmup_frames]
        return self.eps_f

    def get_action(self, session, frame_number, state):
        """Pick an action for ``state``.

        With probability epsilon a uniformly random action in
        ``[0, n_actions)`` is returned; otherwise the first element of
        ``session.run(DQN.best_action, ...)`` evaluated on ``[state]``.
        """
        eps = self._epsilon(frame_number)

        if np.random.rand(1) < eps:
            return np.random.randint(0, self.n_actions)
        return session.run(self.DQN.best_action, feed_dict={self.DQN.input: [state]})[0]

In [None]:
# create class for our Dueling DQN agent:

class DQN_agent():
    """Dueling DQN graph built with the TF1 ``tf.layers`` API.

    Three ReLU dense layers (512, 256 and ``ram_size`` units) feed a value
    head (1 unit) and an advantage head (``n_actions`` units); the output of
    the third layer is split into two equal halves along axis 1, one per head.
    The graph also holds the placeholders, Huber loss and Adam update op used
    for training.
    """
    def __init__(self, n_actions, learning_rate=0.00025,
                 ram_size=128):
        """
        Args:
            n_actions: width of the advantage head / number of Q-values.
            learning_rate: learning rate passed to the Adam optimizer.
            ram_size: width of the input placeholder and of the third layer.
        """
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.ram_size = ram_size

        def make_initializer():
            # a separate initializer object for every layer
            return tf.variance_scaling_initializer(scale=2)

        def relu_dense(inputs, units, name):
            return tf.layers.dense(
                inputs=inputs, units=units,
                kernel_initializer=make_initializer(),
                activation=tf.nn.relu, name=name)

        # Input placeholder; the values are divided by 255 before the first layer.
        self.input = tf.placeholder(shape=[None, self.ram_size],
                                    dtype=tf.float32)
        self.inputscaled = self.input/255

        # Shared trunk.
        self.dense1 = relu_dense(self.inputscaled, 512, 'dense1')
        self.dense2 = relu_dense(self.dense1, 256, 'dense2')
        self.dense3 = relu_dense(self.dense2, self.ram_size, 'dense3')

        # Split the trunk output in two: first half -> value, second -> advantage.
        value_half, advantage_half = tf.split(self.dense3, 2, axis=1)
        self.valuestream = tf.layers.flatten(value_half)
        self.advantagestream = tf.layers.flatten(advantage_half)

        # Linear heads (no activation).
        self.advantage = tf.layers.dense(
            inputs=self.advantagestream, units=self.n_actions,
            kernel_initializer=make_initializer(), name="advantage")
        self.value = tf.layers.dense(
            inputs=self.valuestream, units=1,
            kernel_initializer=make_initializer(), name='value')

        # Q = V + (A - mean over actions of A)
        mean_advantage = tf.reduce_mean(self.advantage, axis=1, keepdims=True)
        centred_advantage = tf.subtract(self.advantage, mean_advantage)
        self.q_values = self.value + centred_advantage
        self.best_action = tf.argmax(self.q_values, 1)

        # Training inputs: target Q-value and the action taken, one per sample.
        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)

        # Q-value of the action actually taken, selected with a one-hot mask.
        action_mask = tf.one_hot(self.action, self.n_actions, dtype=tf.float32)
        self.Q = tf.reduce_sum(tf.multiply(self.q_values, action_mask), axis=1)

        huber = tf.losses.huber_loss(labels=self.target_q, predictions=self.Q)
        self.loss = tf.reduce_mean(huber)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.update = self.optimizer.minimize(self.loss)

In [None]:
# function to activate learning in the training process:

def learn(session, replay_memory, main_dqn, target_dqn, batch_size=32, gamma=0.99):
    """Run one Double-DQN training step on a minibatch from ``replay_memory``.

    The main network picks the best next action, the target network supplies
    that action's Q-value, and the main network is updated towards
    ``reward + gamma * Q_target(next_state, argmax_a Q_main) * (1 - done)``.

    Args:
        session: object whose ``run(fetches, feed_dict=...)`` evaluates the graph.
        replay_memory: object whose ``get_minibatch()`` returns
            ``(obs, act, rew, next_obs, done)``.
        main_dqn: network being trained.
        target_dqn: network providing the bootstrap Q-values.
        batch_size: kept for backward compatibility only; the number of rows is
            taken from the sampled minibatch, so it can no longer disagree with
            the replay memory's own batch size.
        gamma: discount factor.

    Returns:
        The loss value fetched from ``main_dqn.loss``.
    """
    # Draw a minibatch from the replay memory
    states, actions, rewards, next_states, dones = replay_memory.get_minibatch()[:5]

    arg_q_max = session.run(main_dqn.best_action, feed_dict={main_dqn.input: next_states})
    q_vals = session.run(target_dqn.q_values, feed_dict={target_dqn.input: next_states})

    # One row index per sampled transition, paired with the main network's
    # chosen action for that row.
    rows = np.arange(len(arg_q_max))
    double_q = q_vals[rows, arg_q_max]
    # (1 - done) removes the bootstrap term for transitions flagged as done.
    target_q = rewards + (gamma * double_q * (1 - dones))

    loss, _ = session.run(
        [main_dqn.loss, main_dqn.update],
        feed_dict={main_dqn.input: states,
                   main_dqn.target_q: target_q,
                   main_dqn.action: actions})

    return loss

In [None]:
# class to update the target network every number of steps:

class Update_Target():
    """Copies the main network's variables into the target network.

    The assign ops are created once, on first use, and reused afterwards.
    Building them anew on every call would add fresh ops to the graph at each
    target update.
    """
    def __init__(self, main_variables, target_variables):
        """
        Args:
            main_variables: variables to copy from.
            target_variables: variables to copy into, in matching order.
        """
        self.main_variables = main_variables
        self.target_variables = target_variables
        self._update_ops = None  # built lazily by update_target_variables()

    def update_target_variables(self):
        """Return the list of assign ops copying each main variable to its target."""
        if self._update_ops is None:
            self._update_ops = [
                self.target_variables[i].assign(var.value())
                for i, var in enumerate(self.main_variables)
            ]
        # hand out a copy so callers cannot alter the cached list
        return list(self._update_ops)

    def __call__(self, sess):
        """Run all copy ops in a single ``sess.run`` call."""
        sess.run(self.update_target_variables())

In [None]:
# function to clip rewards, to avoid large updates due to large magnitudes of rewards:

def clip_reward(reward):
    """Reduce a reward to its sign: -1, 0 or 1, returned as an ``int``."""
    direction = np.sign(reward)
    return int(direction)

In [None]:
# final definitions:

# start from an empty default graph so re-running this cell does not stack a
# second copy of the networks on top of the first
tf.compat.v1.reset_default_graph()

# Defining the environment:
atari = wrapper()

# Defining the networks:
# two DQN_agent instances with the same architecture, built under separate
# variable scopes so their variables can be collected separately below
with tf.variable_scope('main'):
    main = DQN_agent(atari.env.action_space.n)
with tf.variable_scope('target'):
    target = DQN_agent(atari.env.action_space.n)

# initialise variables and call saver to save our model throughout training:
# (both are created after the two networks exist in the graph)
init = tf.global_variables_initializer()
saver = tf.train.Saver()    
# trainable variables of each network; passed to Update_Target in the training cell
main_vs = tf.trainable_variables(scope='main')
target_vs = tf.trainable_variables(scope='target')

In [None]:
# now to train the model:

memory = ExperienceReplay()
update_networks = Update_Target(main_vs, target_vs)
explore = Exploration(main, atari.env.action_space.n)

# Training schedule (all counts are environment steps unless stated otherwise).
CHECKPOINT_DIR = "tmp/"
CHECKPOINT_PREFIX = 'tmp/spaceinvaders_dueldqn'
MAX_FRAMES = 10000000
MAX_EPISODE_FRAMES = 20000
REPLAY_START_FRAMES = 10000     # random-action warm-up before learning starts
TARGET_UPDATE_FRAMES = 5000     # target network sync interval
SAVE_EVERY_EPISODES = 100       # checkpoint / progress print interval

# The checkpoint directory must exist before saver.save() writes into it.
tf.io.gfile.makedirs(CHECKPOINT_DIR)

with tf.Session() as sess:
    # Initialise first and restore second: running `init` after a restore
    # would overwrite the restored weights with fresh random ones.
    sess.run(init)

    # Restore with the Saver that belongs to the graph built above. The graph
    # already exists, so it is not re-imported from a .meta file (that would
    # add a second copy of every node to the default graph).
    latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
    if latest_checkpoint is None:
        print('no previous training!')
    else:
        try:
            saver.restore(sess, latest_checkpoint)
        except (tf.errors.OpError, ValueError) as err:
            # Unreadable or incompatible checkpoint: report it and train from
            # scratch; re-initialise in case the restore was only partial.
            print('could not restore {}: {}'.format(latest_checkpoint, err))
            sess.run(init)

    frame_number = 0
    rewards = []
    loss_list = []

    while frame_number < MAX_FRAMES:

        atari.reset()
        episode_reward_sum = 0
        for _ in range(MAX_EPISODE_FRAMES):
            s = atari.state
            action = explore.get_action(sess, frame_number, s)
            new_ram, reward, terminal, terminal_life_lost = atari.step(action)
            frame_number += 1
            # the unclipped reward is what gets reported; the clipped one is stored
            episode_reward_sum += reward
            clipped_reward = clip_reward(reward)
            # a lost life is stored as `done`, while the episode itself only
            # ends on `terminal`
            memory.add_experience(action=action, ram=s, new_ram=new_ram,
                                  reward=clipped_reward, terminal=terminal_life_lost)

            # 10000 random actions before learning:
            if frame_number > REPLAY_START_FRAMES:
                loss = learn(sess, memory, main, target)
                loss_list.append(loss)

            # update target network every 5000 frames:
            if frame_number % TARGET_UPDATE_FRAMES == 0 and frame_number > REPLAY_START_FRAMES:
                update_networks(sess)

            if terminal:
                break

        rewards.append(episode_reward_sum)

        # save network and print rewards every 100 episodes
        if len(rewards) % SAVE_EVERY_EPISODES == 0:
            saver.save(sess, CHECKPOINT_PREFIX, global_step=len(rewards))
            print(rewards[-1])

In [None]:
# number of episodes recorded (one entry is appended to `rewards` per episode)
len(rewards)

In [None]:
import matplotlib.pyplot as plt

# raw (unclipped) total reward of every episode, in training order
plt.plot(rewards)

In [None]:
# total number of environment steps taken by the training loop
frame_number

In [None]:
# Mean episode reward of each consecutive, non-overlapping block of 100
# episodes. A trailing block with fewer than 100 episodes is left out.
means2 = []
m = 0
for i, episode_reward in enumerate(rewards):
    m += episode_reward
    # Close a block after its 100th episode (i is zero-based). Testing
    # `i % 100 == 0` would fire at i == 0 and record rewards[0] / 100 as a
    # bogus first entry.
    if (i + 1) % 100 == 0:
        means2.append(m / 100)
        m = 0

In [None]:
# Plot the per-100-episode means, leaving out the last two entries of means2.
fig, ax = plt.subplots()
ax.set_title('Rolling Means of Returns from 16,000 Episodes')
ax.set_xlabel('Epoch (each epoch contains 100 episodes)')
ax.set_ylabel('Mean Episodic Reward')
ax.plot(means2[:-2])
plt.show()

In [None]:
# largest value in means2
np.array(means2).max()

In [None]:
# second entry of means2
means2[1]

In [None]:
# last entry of means2
means2[-1]

In [None]:
# highest total reward obtained in a single episode
np.array(rewards).max()

In [None]:
import pandas as pd

# persist the training curves as single-column CSV files (no index column),
# written to the current working directory
df_means = pd.DataFrame(means2, columns=['Rolling Means'])
df_rewards = pd.DataFrame(rewards, columns=['Rewards'])
df_means.to_csv('means2.csv', index=False)
df_rewards.to_csv('rewards2.csv', index=False)

In [None]:
# evaluating the performance of the agent. i.e. seeing him play the game:

explore = Exploration(main, atari.env.action_space.n)
explore.playing = True
with tf.Session() as sess:
    # Restore the weights straight into the `main` network that already lives
    # in this graph. Importing the .meta file here would add a second copy of
    # the whole graph and tie the cell to one specific checkpoint number.
    checkpoint_path = tf.train.latest_checkpoint("tmp/")
    if checkpoint_path is None:
        raise ValueError('no checkpoint found in tmp/ - train the agent first')
    # only the main network's variables are needed to pick greedy actions
    tf.train.Saver(var_list=main_vs).restore(sess, checkpoint_path)

    environment = gym.make('ALE/SpaceInvaders-ram-v5', render_mode='human')
    try:
        # reset() returns either the observation or (observation, info)
        reset_result = environment.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        reward_sum = 0
        while not done:
            # 3000000 is past the default eps_anneal (2000000), which makes
            # Exploration use its `playing` branch (epsilon 0).
            action = explore.get_action(sess, 3000000, state)
            step_result = environment.step(action)
            state, reward = step_result[0], step_result[1]
            # Everything between the reward and the trailing info dict is an
            # end-of-episode flag: (done,) or (terminated, truncated).
            done = any(step_result[2:-1])
            reward_sum += reward
    finally:
        # close the render window even if the episode is interrupted
        environment.close()

In [None]:
# total reward collected in the evaluation episode above
reward_sum