In [1]:
import numpy as np
import tensorflow as tf
import random 
import gym

  from ._conv import register_converters as _register_converters


In [2]:
from skimage import transform
from skimage.color import rgb2gray

import matplotlib.pyplot as plt

from collections import deque

import warnings

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a global switch; it also hides deprecation warnings
# from the libraries imported in this notebook.
warnings.filterwarnings('ignore')

In [3]:
# Create the Space Invaders environment used by every later cell.
env = gym.make('SpaceInvaders-v0')

# Report the observation space and the number of discrete actions.
print("Frame Size: %s" % (env.observation_space,))
print("Actions: %d" % (env.action_space.n,))

# One-hot encoding of each action: row i is the vector for action index i.
possible_actions = np.identity(env.action_space.n, dtype=int)
print(possible_actions)

Frame Size: Box(210, 160, 3)
Actions: 6
[[1 0 0 0 0 0]
 [0 1 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 1 0 0]
 [0 0 0 0 1 0]
 [0 0 0 0 0 1]]


In [4]:
def preprocess_frame(frame):
    """Turn a raw game frame into a cropped, resized grayscale image.

    Steps:
      1. Convert the frame to grayscale with ``rgb2gray``.
      2. Crop rows ``8:-12`` and columns ``4:-12``.
      3. Resize the crop to 110x84 with ``transform.resize``.

    No extra division by 255 is applied: for uint8 RGB input ``rgb2gray``
    already returns floating-point luminance in [0, 1], so dividing again
    would squash all pixel values into [0, 1/255].

    Args:
        frame: image array as returned by the environment
            (presumably uint8 RGB of shape (210, 160, 3) -- see the
            observation space printed earlier in the notebook).

    Returns:
        2-D array of shape (110, 84).
    """
    gray = rgb2gray(frame)

    cropped_frame = gray[8:-12, 4:-12]

    preprocessed_frame = transform.resize(cropped_frame, [110, 84])

    return preprocessed_frame  # 110x84 frame


In [5]:
stack_size = 4  # number of consecutive frames that make up one state

# Initial frame stack: ``stack_size`` blank 110x84 frames. The deque drops its
# oldest frame automatically once ``stack_size`` frames are held.
stacked_frames = deque([np.zeros((110, 84), dtype=int) for _ in range(stack_size)],
                       maxlen=stack_size)


def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess ``state`` and combine it with recent frames into one stacked state.

    Args:
        stacked_frames: deque holding the most recent preprocessed frames.
            It is appended to in place unless ``is_new_episode`` is true.
        state: raw frame, passed to ``preprocess_frame``.
        is_new_episode: when true, a fresh deque is created and filled with
            ``stack_size`` references to the new frame.

    Returns:
        (stacked_state, stacked_frames): the frames stacked along a new last
        axis (``np.stack(..., axis=2)``) and the deque to pass to the next call.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # No history exists yet, so the first frame fills every slot.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # maxlen makes the append evict the oldest frame.
        stacked_frames.append(frame)

    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

In [6]:
### Hyperparameters

# --- Model ---
state_size = [110, 84, 4]          # shape of the network input placeholder
action_size = env.action_space.n   # one network output per discrete action
learning_rate = 2.5e-4             # passed to the Adam optimizer

# --- Training ---
total_episodes = 50                # episodes played by the training loop
max_steps = 50000                  # upper bound on steps within one episode
batch_size = 64                    # experiences sampled per training step

# --- Exploration (used by predict_action) ---
explore_start = 1.0                # exploration probability at decay_step 0
explore_end = 0.01                 # value the exploration probability decays towards
decay_rate = 1e-5                  # exponential decay rate per step

# --- Q-learning ---
gamma = 0.9                        # discount applied to the best next-state Q value

# --- Replay memory ---
pretrain_length = batch_size       # random-action experiences stored before training
memory_size = 10 ** 6              # maximum number of experiences kept

# --- Preprocessing ---
stack_size = 4                     # frames per stacked state

# --- Switches ---
training = True                    # run the training cell when True
episode_render = False             # call env.render() while collecting experience


In [7]:
class DQNetwork:
    """Convolutional Q-network built in a TensorFlow 1.x graph.

    Layers: three ELU-activated convolutions, a 512-unit ELU dense layer and a
    linear output layer with one unit per action.

    Attributes:
        inputs_: float32 placeholder of shape ``[None, *state_size]``.
        actions_: float32 placeholder of shape ``[None, action_size]``; it is
            multiplied element-wise with the output to pick one Q value per row.
        target_Q: float32 placeholder of shape ``[None]`` with the regression targets.
        output: Q-value estimates, one per action.
        Q: Q value of the action encoded in ``actions_``.
        loss: mean squared error between ``target_Q`` and ``Q``.
        optimizer: Adam training op that minimizes ``loss``.

    Args:
        state_size: input shape without the batch dimension.
        action_size: number of actions (width of the output layer).
        learning_rate: learning rate passed to the Adam optimizer.
        name: variable scope under which all variables are created.
    """

    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        # Keep the constructor argument; the previous code stored the global
        # ``stack_size`` here by mistake.
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            #  https://medium.com/understand-the-python/understanding-the-asterisk-of-python-8b9daaa4a558
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, self.action_size], name="actions")

            self.target_Q = tf.placeholder(tf.float32, [None], name="target")

            # Conv 1: 32 filters, 8x8 kernel, stride 4, no padding.
            self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                          filters=32,
                                          kernel_size=[8, 8],
                                          strides=[4, 4],
                                          padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv1"
                                          )
            self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")

            # Conv 2: 64 filters, 4x4 kernel, stride 2, no padding.
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                          filters=64,
                                          kernel_size=[4, 4],
                                          strides=[2, 2],
                                          padding="VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv2"
                                          )
            self.conv2_out = tf.nn.elu(self.conv2, name="conv2_out")

            # Conv 3: 64 filters, 3x3 kernel, stride 2, no padding.
            self.conv3 = tf.layers.conv2d(inputs=self.conv2_out,
                                          filters=64,
                                          kernel_size=[3, 3],
                                          strides=[2, 2],
                                          padding="VALID", 
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                          name="conv3"
                                          )

            self.conv3_out = tf.nn.elu(self.conv3, name="conv3_out")

            self.flatten = tf.contrib.layers.flatten(self.conv3_out)

            self.fc = tf.layers.dense(inputs=self.flatten,
                                      units = 512,
                                      activation=tf.nn.elu,
                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                      name="fc1")

            # Linear output layer: one Q-value estimate per action.
            self.output = tf.layers.dense(inputs=self.fc,
                                          kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          units = self.action_size,
                                          activation=None)

            # Multiplying by the action vector and summing keeps only the
            # Q value of the action encoded in ``actions_``.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)

            # Mean squared error between the targets and the selected Q values.
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

In [8]:
# Clear the default graph, then build the Q-network inside it.
tf.reset_default_graph()
network = DQNetwork(state_size=state_size,
                    action_size=action_size,
                    learning_rate=learning_rate)


In [9]:
class Memory:
    """Bounded experience store backed by a ``collections.deque``.

    When ``max_size`` items are already held, adding another one discards the
    oldest item.
    """

    def __init__(self, max_size):
        """Create an empty buffer that keeps at most ``max_size`` items."""
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        """Append one experience to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored experiences picked at random.

        Positions are drawn without replacement via ``np.random.choice``, so a
        ``batch_size`` larger than the number of stored items raises
        ``ValueError``.
        """
        picked = np.random.choice(len(self.buffer), size=batch_size, replace=False)
        return [self.buffer[position] for position in picked]


In [24]:
# Pre-fill the replay memory with `pretrain_length` transitions collected by
# taking random actions, so a first batch can be sampled when training starts.
memory = Memory(max_size=memory_size)

for pretrain_step in range(pretrain_length):
    if pretrain_step == 0:
        # First step only: start an episode and build the initial frame stack.
        state, stacked_frames = stack_frames(stacked_frames, env.reset(), True)

    # Random action index and its one-hot row.
    choice = random.randrange(len(possible_actions))
    action = possible_actions[choice]
    next_state, reward, done, _ = env.step(choice)

    if episode_render:
        env.render()

    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

    if done:
        # Terminal transition: the stored next state is all zeros.
        next_state = np.zeros(state.shape)

    memory.add((state, action, reward, next_state, done))

    if done:
        # Start the next episode with a fresh frame stack.
        state, stacked_frames = stack_frames(stacked_frames, env.reset(), True)
    else:
        state = next_state

In [16]:
# TensorBoard setup: event files are written under tb/dqn/1 and the network
# loss is recorded as the scalar "Loss".
writer = tf.summary.FileWriter(logdir="tb/dqn/1")
tf.summary.scalar(name="Loss", tensor=network.loss)
write_op = tf.summary.merge_all()

# Saver used to write and restore model checkpoints.
saver = tf.train.Saver()

In [17]:
"""
This function implements the epsilon-greedy action selection step:
with probability ϵ select a random action a_t, otherwise select a_t = argmax_a Q(s_t, a).
"""


def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, sess):
    """Choose an action index with an exponentially decaying epsilon-greedy rule.

    The exploration probability is
    ``explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)``.
    A uniform random number is drawn; if it is below that probability a random
    action index is returned, otherwise the index of the largest Q value that
    the global ``network`` outputs for ``state``.

    Args:
        explore_start: exploration probability at ``decay_step == 0``.
        explore_stop: value the exploration probability decays towards.
        decay_rate: exponential decay rate.
        decay_step: number of steps taken so far.
        state: single state array; a leading batch axis of size 1 is added
            before it is fed to the network.
        sess: TensorFlow session used to evaluate the network.

    Returns:
        (choice, explore_prob): the chosen action index and the exploration
        probability that was used.
    """
    # https://docs.scipy.org/doc/numpy-1.15.1/reference/generated/numpy.random.rand.html
    exp_exp_tradeoff = np.random.rand()

    epsilon_span = explore_start - explore_stop
    explore_prob = explore_stop + epsilon_span * np.exp(-decay_rate * decay_step)

    if explore_prob > exp_exp_tradeoff:
        # Explore: uniformly random action index.
        return random.randrange(len(possible_actions)), explore_prob

    # Exploit: evaluate the network on a batch containing only this state.
    Qs = sess.run(network.output, feed_dict={network.inputs_: state[np.newaxis, ...]})
    return np.argmax(Qs), explore_prob

In [27]:
# Training loop.
#
# For every environment step: pick an epsilon-greedy action, store the
# transition in the replay memory, sample a batch, build the Q-learning
# targets (reward for terminal transitions, reward + gamma * max Q(next state)
# otherwise) and run one optimizer step on that batch.
saver = tf.train.Saver()
loss = 0  # most recent training loss; shown in the end-of-episode report

if training:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # Steps taken across all episodes. Drives the exploration decay and is
        # used as the TensorBoard global step.
        decay_step = 0

        for episode in range(total_episodes):
            step = 0

            episode_rewards = []

            state = env.reset()

            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while step < max_steps:
                step += 1

                decay_step += 1

                choice, explore_probability = predict_action(explore_start, explore_end, decay_rate,
                                                             decay_step, state, sess)
                action = possible_actions[choice]
                next_state, reward, done, _ = env.step(choice)

                if episode_render:
                    env.render()

                episode_rewards.append(reward)

                if done:
                    # Terminal transition: store an all-zero next state with
                    # the same shape as the current stacked state (same
                    # convention as the memory pre-fill cell). Its Q values
                    # are never used because terminal targets are the reward.
                    next_state = np.zeros(state.shape)

                    # Makes the while condition fail after this iteration's
                    # training step.
                    step = max_steps

                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                                  'Total reward: {}'.format(total_reward),
                                  'Explore P: {:.4f}'.format(explore_probability),
                                'Training Loss {:.4f}'.format(loss))

                    memory.add((state, action, reward, next_state, done))

                else:
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    memory.add((state, action, reward, next_state, done))

                    state = next_state

                # ---- Learning step on a random batch from the replay memory ----
                batch = memory.sample(batch_size)

                # ndmin: minimum number of dimensions of the resulting array;
                # ones are prepended to the shape as needed.
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                # Feed dict: https://www.learningtensorflow.com/lesson4/
                Qs_next_state = sess.run(network.output, feed_dict= {network.inputs_: next_states_mb})

                # Target is the reward alone for terminal transitions,
                # otherwise reward + gamma * max_a Q(next_state, a).
                target_Qs_batch = [
                    rewards_mb[i] if dones_mb[i]
                    else rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                    for i in range(len(batch))
                ]

                targets_mb = np.array(target_Qs_batch)

                # One run yields the loss, applies the update and produces the
                # summary, so the logged loss is the one that was optimized and
                # the batch is not pushed through the network a second time.
                loss, _, summary = sess.run([network.loss, network.optimizer, write_op],
                                            feed_dict={
                                                network.inputs_: states_mb,
                                                network.target_Q: targets_mb,
                                                network.actions_: actions_mb
                                            })

                # Use the running step count as global step so each training
                # step gets its own point on the TensorBoard x axis.
                writer.add_summary(summary, decay_step)
                writer.flush()

            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

Episode: 0 Total reward: 180.0 Explore P: 0.9934 Training Loss 0.0365


Model Saved


Episode: 1 Total reward: 105.0 Explore P: 0.9884 Training Loss 0.2379


Episode: 2 Total reward: 55.0 Explore P: 0.9830 Training Loss 1.5771


Episode: 3 Total reward: 80.0 Explore P: 0.9788 Training Loss 0.0614


Episode: 4 Total reward: 50.0 Explore P: 0.9752 Training Loss 0.0221


Episode: 5 Total reward: 380.0 Explore P: 0.9682 Training Loss 13.4014


Model Saved


Episode: 6 Total reward: 110.0 Explore P: 0.9616 Training Loss 0.8318


Episode: 7 Total reward: 115.0 Explore P: 0.9534 Training Loss 2.0538


Episode: 8 Total reward: 210.0 Explore P: 0.9465 Training Loss 4.6265


Episode: 9 Total reward: 185.0 Explore P: 0.9406 Training Loss 0.3303


Episode: 10 Total reward: 325.0 Explore P: 0.9334 Training Loss 3.6408


Model Saved


Episode: 11 Total reward: 475.0 Explore P: 0.9239 Training Loss 1.8674


Episode: 12 Total reward: 185.0 Explore P: 0.9161 Training Loss 10.1320


Episode: 13 Total reward: 210.0 Explore P: 0.9091 Training Loss 5.9516


Episode: 14 Total reward: 110.0 Explore P: 0.9017 Training Loss 10.7422


Episode: 15 Total reward: 80.0 Explore P: 0.8965 Training Loss 0.9828


Model Saved


Episode: 16 Total reward: 125.0 Explore P: 0.8921 Training Loss 0.6173


Episode: 17 Total reward: 110.0 Explore P: 0.8866 Training Loss 7.2901


Episode: 18 Total reward: 190.0 Explore P: 0.8797 Training Loss 6.4219


Episode: 19 Total reward: 210.0 Explore P: 0.8729 Training Loss 4.5042


Episode: 20 Total reward: 365.0 Explore P: 0.8635 Training Loss 3.7347


Model Saved


Episode: 21 Total reward: 115.0 Explore P: 0.8589 Training Loss 7.2846


Episode: 22 Total reward: 270.0 Explore P: 0.8494 Training Loss 4.0792


Episode: 23 Total reward: 115.0 Explore P: 0.8427 Training Loss 1.1775


Episode: 24 Total reward: 120.0 Explore P: 0.8363 Training Loss 1.7999


Episode: 25 Total reward: 120.0 Explore P: 0.8314 Training Loss 13.6700


Model Saved


Episode: 26 Total reward: 255.0 Explore P: 0.8254 Training Loss 1.7583


Episode: 27 Total reward: 180.0 Explore P: 0.8200 Training Loss 14.2526


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/negativezero/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2963, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-27-fcd8af8a592c>", line 64, in <module>
    states_mb = np.array([each[0] for each in batch], ndmin=3)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/negativezero/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 1863, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/negativezero/anaconda3/lib/python3.6/site-packages/IPython/core/ultratb.py", line 1095, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_of

KeyboardInterrupt: 

In [21]:

# Evaluation: restore the saved checkpoint and play 10 episodes, always taking
# the action with the largest predicted Q value.
with tf.Session() as sess:
    total_test_rewards = []

    # Load the model
    saver.restore(sess, "./models/model.ckpt")

    for episode in range(10):
        total_rewards = 0

        state, stacked_frames = stack_frames(stacked_frames, env.reset(), True)

        print("****************************************************")
        print("EPISODE ", episode)

        done = False
        while not done:
            # Add the batch axis expected by the input placeholder.
            state = state.reshape((1, *state_size))

            # Q-value estimates for the current state; the largest one wins.
            Qs = sess.run(network.output, feed_dict = {network.inputs_: state})
            choice = np.argmax(Qs)
            action = possible_actions[choice]  # one-hot form; not used below

            # Perform the action and get the next_state, reward, and done information
            next_state, reward, done, _ = env.step(choice)
            env.render()

            total_rewards += reward

            if not done:
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                state = next_state

        # Episode finished: report and record its score.
        print ("Score", total_rewards)
        total_test_rewards.append(total_rewards)

    env.close()

INFO:tensorflow:Restoring parameters from ./models/model.ckpt


****************************************************
EPISODE  0


Score 75.0
****************************************************
EPISODE  1


Score 230.0
****************************************************
EPISODE  2


Score 145.0
****************************************************
EPISODE  3


Score 80.0
****************************************************
EPISODE  4


Score 190.0
****************************************************
EPISODE  5


Score 110.0
****************************************************
EPISODE  6


Score 130.0
****************************************************
EPISODE  7


Score 105.0
****************************************************
EPISODE  8


Score 200.0
****************************************************
EPISODE  9


Score 275.0
