In [1]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected
from collections import deque, Counter
import random

In [2]:
# Shell escape: install gym's Atari extras from inside the notebook.
! pip install gym[atari]
# Environment instance shared by the cells below (network sizing, stepping).
env = gym.make('MsPacman-v0')



[2019-02-10 13:23:37,159] Making new env: MsPacman-v0
  result = entry_point.load(False)


In [3]:
# Grey level of the RGB triple (210, 160, 73): the mean of its three channels.
color = np.array([210, 160, 73]).mean()
# Bare expression so the notebook displays the computed value.
color

147.66666666666666

In [4]:
# Grey level (channel mean) of the RGB triple (210, 160, 73); pixels with exactly
# this grey level are blanked out by preprocess_observation.
color = np.array([210, 160, 73]).mean()

def preprocess_observation(obs):
    """Convert a raw RGB frame into a normalised (88, 80, 1) greyscale array.

    Args:
        obs: array indexed as [row, column, channel]. The slicing below must
            yield 88 rows and 80 columns for the final reshape to succeed.

    Returns:
        Float array of shape (88, 80, 1) holding (grey - 128) / 128, where
        pixels whose grey level equals `color` were first set to 0.
    """
    # Keep every second row from index 1 up to 175 and every second column.
    downsampled = obs[1:176:2, ::2]

    # Collapse the channel axis by averaging.
    grey = downsampled.mean(axis=2)

    # Blank every pixel whose grey level matches `color` exactly.
    grey = np.where(grey == color, 0, grey)

    # Centre on 128 and scale by 128, then add a trailing channel axis.
    normalised = (grey - 128) / 128
    return normalised.reshape(88, 80, 1)

Building the Q-network

In [5]:
tf.reset_default_graph()

def q_network(env, X, name_scope):
    """Build a convolutional Q-value network inside its own variable scope.

    Three conv layers feed a flattened 128-unit dense layer, followed by a
    linear output layer with one unit per action.

    Args:
        env: environment; only `env.action_space.n` is read, to size the output.
        X: input tensor handed to the first convolution.
        name_scope: name of the variable scope that owns the layers.

    Returns:
        (params, output): `params` maps each trainable variable's name, with
        the scope name stripped from the front, to the variable; `output` is
        the tensor produced by the final linear layer.
    """
    weight_init = tf.contrib.layers.variance_scaling_initializer()

    # (num_outputs, kernel_size, stride) for each convolution, in order.
    # NOTE(review): the last layer strides by 4 with a 3x3 kernel, so it skips
    # input positions -- confirm this is intended.
    conv_specs = (
        (32, (8, 8), 4),
        (64, (4, 4), 2),
        (64, (3, 3), 4),
    )

    with tf.variable_scope(name_scope) as scope:
        hidden = X
        for n_filters, kernel, stride in conv_specs:
            hidden = conv2d(hidden, num_outputs=n_filters, kernel_size=kernel,
                            stride=stride, padding='SAME',
                            weights_initializer=weight_init)

        # dense head on top of the flattened feature maps
        hidden = fully_connected(flatten(hidden), num_outputs=128,
                                 weights_initializer=weight_init)

        # linear layer: one value per action
        output = fully_connected(hidden, num_outputs=env.action_space.n,
                                 activation_fn=None,
                                 weights_initializer=weight_init)

        # index this scope's trainable variables by their scope-relative name
        prefix_len = len(scope.name)
        trainable = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope.name)
        params = {variable.name[prefix_len:]: variable for variable in trainable}

    return params, output

In [6]:
# Exploration schedule: epsilon falls linearly from eps_max to eps_min over
# eps_decay_steps steps and is held at eps_min afterwards.
eps_min = 0.05
eps_max = 1
eps_decay_steps = 100_000

def epsilon_greedy(env, action, step):
    """Return either `action` or a uniformly random action.

    Args:
        env: environment; only `env.action_space.n` is read.
        action: the action to return when not exploring.
        step: step counter that drives the linear epsilon decay.

    Returns:
        A random integer in [0, env.action_space.n) with probability epsilon,
        otherwise `action` unchanged.
    """
    decayed = eps_max - (eps_max - eps_min) * step / eps_decay_steps
    epsilon = max(eps_min, decayed)

    explore = np.random.rand() < epsilon
    if not explore:
        return action
    return np.random.randint(env.action_space.n)


In [7]:
# create the replay buffer
buffer_len = 20_000
exp_buffer = deque(maxlen=buffer_len) 
# Bounded deque: once it holds buffer_len items, each append discards the
# oldest item from the opposite end, so only the most recent transitions remain.

In [8]:
def sample_memories(batch_size):
    """Draw up to `batch_size` distinct transitions from `exp_buffer`.

    Only the sampled transitions are touched. The buffer as a whole is never
    converted to an array, which was costly on every call and fails on current
    NumPy because each transition mixes arrays with scalars (ragged input).

    Args:
        batch_size: maximum number of transitions to return; fewer are
            returned when the buffer holds fewer.

    Returns:
        Five 1-D object arrays of equal length, in the order
        state, action, next_state, reward, done.
    """
    indices = np.random.permutation(len(exp_buffer))[:batch_size]
    batch = [exp_buffer[i] for i in indices.tolist()]

    columns = []
    for field in range(5):
        # Fill a pre-allocated object array element by element so NumPy never
        # tries to broadcast the image arrays into the column's own shape.
        column = np.empty(len(batch), dtype=object)
        for row, transition in enumerate(batch):
            column[row] = transition[field]
        columns.append(column)

    # state, action, next_state, reward, done
    return columns[0], columns[1], columns[2], columns[3], columns[4]

In [9]:
num_episodes = 1000  # number of episodes the training loop runs
batch_size = 32  # transitions requested from the replay buffer per training step
# (sic) misspelling of "input_shape"; the name is kept because the placeholder
# cell below refers to it. Leading None leaves the batch dimension open.
inupt_shape = (None, 88, 80, 1)
learning_rate = 0.001  # passed to the Adam optimizer
gamma = 0.9  # discount applied to the next-state value in the training target

global_step = 0  # environment steps taken so far, counted across all episodes
copy_steps = 100  # the weight-copy op runs every copy_steps steps
train_steps = 4 # a training step runs on every train_steps-th environment step
start_steps =  2000  # no training or weight copy until global_step exceeds this

In [10]:
# directory handed to the summary FileWriter created further down
logdir = 'logs'
# start graph construction from an empty default graph
tf.reset_default_graph()

# define placeholder for input: the game state
X = tf.placeholder(tf.float32, shape=inupt_shape)

# define a boolean to kick off training
# NOTE(review): this placeholder is fed by the training loop, but no op visible
# in this notebook reads it -- confirm whether it is still needed.
in_training_mode = tf.placeholder(tf.bool)

In [11]:
# we build the 2 networks
# Both calls build the same architecture, under separate variable scopes, on the
# shared placeholder X. Each returns (dict of trainable variables keyed by
# scope-relative name, output tensor).
mainQ, mainQ_outputs = q_network(env, X, 'mainQ')
targetQ, targetQ_outputs = q_network(env, X, 'targetQ')

In [12]:
# define a placeholder for our actions values
X_action = tf.placeholder(tf.int32, shape=(None, ))

# Q-value predicted for the action that was actually taken. It is read from the
# *main* network so that the loss built on Q_action trains the main network --
# the one the training loop uses to choose actions. (It was previously read
# from targetQ_outputs, which made the optimizer update the target network.)
# `keepdims` replaces the deprecated `keep_dims` keyword.
Q_action = tf.reduce_sum(mainQ_outputs * tf.one_hot(X_action,
                                                    env.action_space.n),
                         axis=-1, keepdims=True)

# defining the copy op to copy the weights of mainQ to the target Q network
# tf.assign(ref, value) overwrites `ref`, so the target variable must be the
# first argument. Both dicts are keyed by scope-relative variable name, which
# pairs each main variable with its counterpart in the target network.
copy_op = [tf.assign(targetQ[var_name], main_var)
           for var_name, main_var in mainQ.items()]
copy_main_to_target = tf.group(*copy_op)

In [14]:
# define a placeholder for the target
# one target value per sample; the trailing 1 matches Q_action's kept dimension
y = tf.placeholder(tf.float32, shape=(None, 1))

# compute the loss
# mean squared difference between the fed target and Q_action
loss = tf.reduce_mean(tf.square(y - Q_action))

# define the optimizer and the training_op
optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

# NOTE: created after `minimize` -- presumably so that variables the optimizer
# creates are initialised too; keep this ordering.
init = tf.global_variables_initializer()

# scalar summary of the loss, merged with any other summaries in the graph,
# plus a writer that stores the graph and later summaries under `logdir`
loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

Running the TensorFlow session to train the model

In [16]:
# Training loop: play `num_episodes` episodes with an epsilon-greedy policy,
# store every transition in `exp_buffer`, and once `start_steps` steps have
# passed run a training step on a sampled batch every `train_steps` steps and
# the weight-copy op every `copy_steps` steps.
with tf.Session() as sess:
    init.run()

    try:
        # for each episode
        for i in range(num_episodes):
            done = False
            # Preprocess each frame exactly once; the processed frame is what
            # is fed to the network and stored in the replay buffer.
            obs = preprocess_observation(env.reset())
            epoch = 0  # steps taken in this episode (printed as "Epoch")
            episodic_reward = 0
            episodic_loss = []

            # run episode until reaching terminal state
            while not done:
                # feed game screen and get Q_values for each action
                actions = mainQ_outputs.eval(feed_dict={X: [obs], in_training_mode: False})

                # The batch holds a single state: take its row and convert the
                # argmax to a plain int so env.step and the replay buffer always
                # receive a scalar action rather than a 1-element array.
                action = int(np.argmax(actions[0]))

                # select the action using epsilon-greedy policy
                action = epsilon_greedy(env, action, global_step)

                # perform action and move to next state
                # observe next_obs and receive reward
                next_obs, reward, done, _ = env.step(action)
                next_obs = preprocess_observation(next_obs)

                # store transition in replay buffer
                exp_buffer.append([obs, action, next_obs, reward, done])

                # start training after a certain number of steps
                # with samples from the experience replay buffer
                if global_step > start_steps and global_step % train_steps == 0:
                    # sample batch of transitions
                    o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)

                    # plain lists of per-sample arrays, as fed to X
                    o_obs = list(o_obs)
                    o_next_obs = list(o_next_obs)

                    # Q-values of the next states
                    next_act = targetQ_outputs.eval(feed_dict={X: o_next_obs,
                                                               in_training_mode: False})

                    # Training target: reward plus the discounted best
                    # next-state value, with the bootstrap term zeroed for
                    # terminal transitions. Cast explicitly so the fed batch is
                    # a float array rather than an object array.
                    rewards = o_rew.astype(np.float32)
                    not_done = 1.0 - o_done.astype(np.float32)
                    y_batch = rewards + gamma * np.max(next_act, axis=-1) * not_done

                    # One session call yields the summary, the loss and the
                    # optimisation step, instead of a second forward pass run
                    # only to produce the summary.
                    mrg_summary, train_loss, _ = sess.run(
                        [merge_summary, loss, training_op],
                        feed_dict={X: o_obs, y: np.expand_dims(y_batch, axis=-1),
                                   X_action: o_act, in_training_mode: True})
                    file_writer.add_summary(mrg_summary, global_step)
                    episodic_loss.append(train_loss)

                if (global_step + 1) % copy_steps == 0 and global_step > start_steps:
                    copy_main_to_target.run()

                obs = next_obs
                epoch += 1
                global_step += 1
                episodic_reward += reward

            print(f'Epoch: {epoch} - Reward: {episodic_reward}')
    finally:
        # Push buffered summaries to disk even when training is interrupted
        # (e.g. KeyboardInterrupt); the writer stays open for later re-runs.
        file_writer.flush()

            

Epoch: 658 - Reward: 230.0
Epoch: 663 - Reward: 280.0
Epoch: 534 - Reward: 170.0
Epoch: 759 - Reward: 240.0
Epoch: 610 - Reward: 140.0
Epoch: 777 - Reward: 260.0
Epoch: 674 - Reward: 260.0
Epoch: 733 - Reward: 350.0
Epoch: 578 - Reward: 240.0
Epoch: 706 - Reward: 270.0
Epoch: 843 - Reward: 290.0
Epoch: 670 - Reward: 190.0
Epoch: 705 - Reward: 290.0
Epoch: 1106 - Reward: 560.0
Epoch: 828 - Reward: 450.0
Epoch: 750 - Reward: 320.0
Epoch: 544 - Reward: 230.0
Epoch: 627 - Reward: 180.0
Epoch: 594 - Reward: 150.0
Epoch: 565 - Reward: 200.0
Epoch: 525 - Reward: 280.0
Epoch: 539 - Reward: 240.0
Epoch: 593 - Reward: 160.0
Epoch: 670 - Reward: 360.0
Epoch: 705 - Reward: 250.0
Epoch: 652 - Reward: 230.0
Epoch: 826 - Reward: 360.0
Epoch: 641 - Reward: 260.0
Epoch: 796 - Reward: 320.0
Epoch: 665 - Reward: 330.0
Epoch: 494 - Reward: 230.0
Epoch: 657 - Reward: 360.0
Epoch: 566 - Reward: 200.0
Epoch: 904 - Reward: 560.0
Epoch: 587 - Reward: 200.0
Epoch: 664 - Reward: 260.0
Epoch: 620 - Reward: 300.0


KeyboardInterrupt: 