# Reinforcement learning with TensorFlow

This code is supporting material for the book `Building Machine Learning Systems with Python` by [Willi Richert](https://www.linkedin.com/in/willirichert/), [Luis Pedro Coelho](https://www.linkedin.com/in/luispedrocoelho/) and [Matthieu Brucher](https://www.linkedin.com/in/matthieubrucher/) published by PACKT Publishing.

It is made available under the MIT License.

All code examples use Python in version...

In [None]:
import sys
sys.version

## Utility functions

In [None]:
import os

# Directory that receives every chart saved by this notebook
CHART_DIR = "charts"
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of testing os.path.exists() first
os.makedirs(CHART_DIR, exist_ok=True)

def save_png(name):
    """Save the current matplotlib figure as ``B09124_13_<name>.png`` in CHART_DIR.

    The fixed file-name prefix is a naming scheme requested by the publisher.
    """
    target_path = os.path.join(CHART_DIR, 'B09124_13_%s.png' % name)
    plt.savefig(target_path, bbox_inches="tight")

## Simple text games

In [None]:
import gym
import numpy as np

# Build the FrozenLake-v0 environment; the following cells read the sizes of its
# observation and action spaces and step through it
env = gym.make('FrozenLake-v0')

### Estimating the Q function the old-fashioned way

Let's make a table with some Q values for this environment

In [None]:
# Start with an all-zero table: one row per state, one column per action
Q = np.zeros((env.observation_space.n, env.action_space.n))
# Set learning hyperparameters
lr = .8  # Weight given to the new estimate in each table update
y = .95  # Discount applied to the best Q value of the next state
num_episodes = 2000

# Let's run!
for i in range(num_episodes):
    # Reset environment and get the first observation
    s = env.reset()
    # Play at most 100 steps per episode, updating the table after each one.
    # The step counter has its own name so that `i` keeps holding the episode
    # index used to scale the exploration noise below.
    for j in range(100):
        # Pick the best action of the table plus Gaussian noise. The noise is
        # divided (true division, not floor division) by the episode number so
        # that exploration fades smoothly as training progresses.
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) / (i + 1))
        # Get new state and reward from environment after chosen step
        s1, r, d, _ = env.step(a)
        # Move Q[s, a] towards the reward plus the discounted best value of the next state
        Q[s, a] = Q[s, a] + lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        s = s1
        # Stop the episode as soon as the environment reports it is done
        if d:
            break

In [None]:
# Display the learned table (one row per state, one column per action)
print("Final Q-Table Values")
print(Q)

### Test games with TF

In [None]:
import random
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

Let's create a new network.

In [None]:
y = 0.99  # Discount applied to the best Q value of the next state when building the target
e = 0.1  # Probability of replacing the network's action by a random one (initially 1 step in 10)
num_episodes = 2000  # Number of episodes played during training
learning_rate = 0.1  # Step size of the gradient descent optimizer

In [None]:
# Start from a fresh default graph so that re-running this cell does not
# collide with the ops created by a previous run
tf.reset_default_graph()

# A simple one layer network: rows of 16 inputs (the training loop feeds one-hot
# rows taken from a 16x16 identity matrix) are mapped to 4 outputs, without bias
inputs = tf.placeholder(shape=[None, 16], dtype=tf.float32, name="input")
Qout = tf.layers.dense(
    inputs=inputs,
    units=4,
    use_bias=False,
    name="dense",
    kernel_initializer=tf.random_uniform_initializer(minval=0, maxval=.0125)
)
# Index of the largest output for each input row
predict = tf.argmax(Qout, 1)

# The target Q values are fed through this placeholder; the loss is the sum of
# squared differences between the targets and the network output
nextQ = tf.placeholder(shape=[None, 4], dtype=tf.float32, name="target")
loss = tf.reduce_sum(tf.square(nextQ - Qout))

# Plain gradient descent on that loss; running updateModel performs one step
trainer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
updateModel = trainer.minimize(loss)

We can now train the network and check that it gets more and more successes as the training progresses.

In [None]:
# To keep track of our games and our results
jList = []
rList = []
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for i in range(num_episodes):
        s = env.reset()
        rAll = 0
        
        for j in range(100):
            a, targetQ = sess.run([predict, Qout], feed_dict={inputs:np.identity(16)[s:s+1]})
            # We randomly choose a new state that we may have not encountered before
            if np.random.rand(1) < e:
                a[0] = env.action_space.sample()

            s1, r, d, _ = env.step(a[0])
            
            # Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout, feed_dict={inputs:np.identity(16)[s1:s1+1]})
            # Obtain maxQ' and set our target value for chosen action.
            targetQ[0, a[0]] = r + y*np.max(Q1)
            
            # Train our network using target and predicted Q values
            sess.run(updateModel, feed_dict={inputs:np.identity(16)[s:s+1], nextQ:targetQ})
            rAll += r
            s = s1
            if d == True:
                # Reduce chance of random action as we train the model.
                e = 1 / ((i // 50) + 10)
                break
        jList.append(j)
        rList.append(rAll)
print("Percent of succesful episodes: %f%%" % (sum(rList) / num_episodes))

We now display the evolution of the reward with each episode

In [None]:
from scipy.signal import lfilter

# Smooth the per-episode rewards with a 20-tap averaging FIR filter, then plot
# the smoothed curve and save it
averaging_taps = np.ones(20) / 20
smoothed_rewards = lfilter(averaging_taps, [1], rList)
plt.plot(smoothed_rewards)
save_png("reward")

We can also see that the survival time increases, even if we take suboptimal paths:

In [None]:
# Plot the index of the last step played in each episode and save the chart
plt.plot(jList)
save_png("length")

## Atari games

We can now design a network that can tackle more or less any of the Atari games available on the gym platform.

In [None]:
import gym

import os
import six
import numpy as np
import tensorflow as tf
import random
from collections import deque

# Directory that receives the images saved by the Atari training loop
CHART_DIR = "charts"
# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of testing os.path.exists() first
os.makedirs(CHART_DIR, exist_ok=True)

We need a few helper functions: one to preprocess our images and shrink them, and two others to transpose the data. The transposition is needed because we use the past images as additional channels, so the axis order of the stored states does not match the one expected by the network.

In [None]:
def to_grayscale(img):
    """Average `img` over its third axis and return the result truncated to uint8."""
    channel_mean = np.mean(img, axis=2)
    return channel_mean.astype(np.uint8)

def downsample(img):
    """Keep every second element along the first two axes of `img`."""
    every_other = slice(None, None, 2)
    return img[every_other, every_other]

def preprocess(img):
    """Downsample `img`, convert it to grayscale and add a leading axis of length 1."""
    shrunk = downsample(img)
    gray = to_grayscale(shrunk)
    # The new leading axis lets single frames be stacked along axis 0
    return gray[np.newaxis, :, :]

def adapt_state(state):
    """Return a one-element list holding `state` with reversed axes, scaled to float32.

    The three axes of `state` are reversed, the values are divided by 255 and
    the float32 result is wrapped in a list so that it acts as a batch of one.
    """
    reordered = np.transpose(state, (2, 1, 0))
    scaled = reordered / 255.0
    return [np.float32(scaled)]

def adapt_batch_state(state):
    """Turn a batch of states into a float32 array with the last three axes reversed.

    Args:
        state: Sequence (or array) of equally shaped 3-D states.

    Returns:
        A float32 array in which axis 0 still indexes the batch, the other
        three axes are reversed and the values are divided by 255. The result
        is cast to float32 so it matches what ``adapt_state`` produces for a
        single state instead of the float64 a plain division would yield.
    """
    reordered = np.transpose(np.array(state), (0, 3, 2, 1))
    return np.float32(reordered / 255.0)


We add a bunch of hyperparameters and constants

In [None]:

env_name = "Breakout-v4"  # Gym environment id; also used in checkpoint, summary and tag names

width = 80  # Resized frame width (first spatial axis of the network input)
height = 105  # Resized frame height (second spatial axis of the network input)

n_episodes = 12000  # Number of episodes played by the agent
state_length = 4  # Number of frames stacked into one state and fed to the network as channels

gamma = 0.99  # Discount factor

exploration_steps = 1000000  # Number of steps over which epsilon is lowered from its initial to its final value
initial_epsilon = 1.0  # Initial value of epsilon in epsilon-greedy
final_epsilon = 0.1  # Final value of epsilon in epsilon-greedy

initial_random_search = 20000  # Number of purely random steps used to populate the replay memory before training starts
replay_memory_size = 400000  # Maximum number of transitions kept in the replay memory
batch_size = 32  # Number of transitions sampled for each training step
network_update_interval = 10000  # Number of steps between two copies of the Q network weights into the target network
train_skips = 4  # The network is trained once every 4 steps

learning_rate = 0.00025  # Learning rate used by RMSProp
momentum = 0.95  # momentum used by RMSProp
min_gradient = 0.01  # Passed as epsilon to RMSProp: constant added to the squared gradient in the denominator of the update

network_path = 'saved_networks/' + env_name  # Directory holding the saved checkpoints
tensorboard_path = 'summary/' + env_name  # Directory holding the TensorBoard summaries
save_interval = 300000  # Number of steps between two checkpoints of the network
initial_quiet_steps = 10  # Upper bound of the random number of "do nothing" steps played at the start of each episode

We use a class to train, save and restore our network.
get_trained_action() will be the method used to get a new action from the network.

In [None]:
class Agent():
    """Deep Q-learning agent owning the networks, the replay memory and the session.

    The constructor builds an online ("Q") network and a "Target" network with
    the same layers, the training operation, a TensorFlow session, a saver for
    the online weights and the TensorBoard summaries. Hyperparameters are read
    from the module-level constants (``initial_epsilon``, ``gamma``,
    ``replay_memory_size``, ...).
    """

    def __init__(self, num_actions, restore_network=False):
        """Build the graph, start the session and optionally restore saved weights.

        Args:
            num_actions: Number of discrete actions, i.e. the number of
                outputs of both networks.
            restore_network: When true, try to restore the latest checkpoint
                found in ``network_path``.
        """
        self.num_actions = num_actions
        self.epsilon = initial_epsilon
        # Amount removed from epsilon at each decaying step so that it goes
        # from initial_epsilon to final_epsilon in exploration_steps steps
        self.epsilon_step = (initial_epsilon - final_epsilon) / exploration_steps
        # Number of completed calls to run()
        self.t = 0

        # Per-episode accumulators used for the summary
        self.total_reward = 0
        self.total_q_max = 0
        self.total_loss = 0
        self.duration = 0
        self.episode = 0

        # Create replay memory; maxlen makes the deque drop its oldest
        # transition by itself once replay_memory_size entries are stored
        self.replay_memory = deque(maxlen=replay_memory_size)

        # Create q network
        self.s, self.q_values, q_network = self.build_network("Q")
        q_network_weights = q_network.trainable_weights

        # Create target network
        self.st, self.target_q_values, target_network = self.build_network("Target")
        target_network_weights = target_network.trainable_weights

        # Define target network update operation: copy every online weight
        # into the matching target weight
        self.update_target_network = [
            target_weight.assign(q_weight)
            for target_weight, q_weight in zip(target_network_weights, q_network_weights)
        ]

        # Define loss and gradient update operation
        self.a, self.y, self.loss, self.grads_update = self.build_training_op(q_network_weights)

        # Interactive session instead of the usual one just because it is simple to create
        # (it installs itself as default session, which the .eval() calls below rely on).
        # Would need some refactoring otherwise of this constructor
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver(q_network_weights)
        # The summary variables are created after the initializer above has run;
        # they receive their values from the assign ops in update_ops before
        # summary_op reads them
        self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
        self.summary_writer = tf.summary.FileWriter(tensorboard_path, self.sess.graph)

        if not os.path.exists(network_path):
            os.makedirs(network_path)

        # Initialize target network
        self.sess.run(self.update_target_network)

        if restore_network:
            self.load_network()

    def build_network(self, name):
        """Create a convolutional Q network and apply it to a new state placeholder.

        Args:
            name: Suffix used to give the model, its layers and the
                placeholder distinct names.

        Returns:
            A tuple ``(s, q_values, model)``: the float32 placeholder of shape
            ``[None, width, height, state_length]``, the model output for that
            placeholder (``num_actions`` values per row) and the Keras model.
        """
        model = tf.keras.Sequential(name=name)
        model.add(tf.keras.layers.Convolution2D(filters=32, kernel_size=8, strides=(4, 4), activation='relu', input_shape=(width, height, state_length), name="Layer1" + name))
        model.add(tf.keras.layers.Convolution2D(filters=64, kernel_size=4, strides=(2, 2), activation='relu', name="Layer2" + name))
        model.add(tf.keras.layers.Convolution2D(filters=64, kernel_size=3, strides=(1, 1), activation='relu', name="Layer3" + name))
        model.add(tf.keras.layers.Flatten(name="Flatten" + name))
        model.add(tf.keras.layers.Dense(512, activation='relu', name="Layer4" + name))
        model.add(tf.keras.layers.Dense(self.num_actions, name="Output" + name))

        s = tf.placeholder(tf.float32, [None, width, height, state_length], name="state" + name)
        q_values = model(s)

        return s, q_values, model

    def build_training_op(self, q_network_weights):
        """Create the loss and the RMSProp update of the online network.

        Args:
            q_network_weights: Variables the optimizer is allowed to update.

        Returns:
            A tuple ``(a, y, loss, grads_update)``: the int64 placeholder for
            the actions taken, the float32 placeholder for the target values,
            the scalar loss and the operation applying one optimizer step.
        """
        a = tf.placeholder(tf.int64, [None], name="actions")
        y = tf.placeholder(tf.float32, [None], name="qInput")

        # Convert action to one hot vector and use it to select, for every
        # row, the Q value of the action that was taken
        a_one_hot = tf.one_hot(a, self.num_actions, 1.0, 0.0)
        q_value = tf.reduce_sum(tf.multiply(self.q_values, a_one_hot), axis=1)

        # Clip the error, the loss is quadratic when the error is in (-1, 1), and linear outside of that region
        error = tf.abs(y - q_value)
        quadratic_part = tf.clip_by_value(error, 0.0, 1.0)
        linear_part = error - quadratic_part
        loss = tf.reduce_mean(0.5 * tf.square(quadratic_part) + linear_part)

        optimizer = tf.train.RMSPropOptimizer(learning_rate, momentum=momentum, epsilon=min_gradient)
        grads_update = optimizer.minimize(loss, var_list=q_network_weights)

        return a, y, loss, grads_update

    def get_initial_state(self, frame):
        """Build the first state of an episode from a single raw frame.

        The preprocessed frame is repeated ``state_length`` times and the
        copies are concatenated along axis 0.
        """
        processed_frame = preprocess(frame)
        state = [processed_frame for _ in range(state_length)]
        return np.concatenate(state)

    def get_action(self, state):
        """Choose an action for `state` with an epsilon-greedy policy.

        A random action is returned during the first ``initial_random_search``
        steps or with probability ``self.epsilon``; otherwise the action with
        the largest Q value is returned. Once the random phase is over, each
        call lowers epsilon by ``self.epsilon_step`` until ``final_epsilon``
        is reached.
        """
        if self.epsilon >= random.random() or self.t < initial_random_search:
            action = random.randrange(self.num_actions)
        else:
            action = np.argmax(self.q_values.eval(feed_dict={self.s: adapt_state(state)}))

        # Decay epsilon over time
        if self.epsilon > final_epsilon and self.t >= initial_random_search:
            self.epsilon -= self.epsilon_step

        return action

    def run(self, state, action, reward, terminal, frame):
        """Record one transition, train when due and return the next state.

        Args:
            state: State in which `action` was chosen.
            action: Action that was played.
            reward: Reward returned by the environment; clipped to [-1, 1].
            terminal: Whether the episode ended with this step.
            frame: Frame appended to the state along axis 0 (the training loop
                passes the output of ``preprocess``).

        Returns:
            The next state: `state` without its first frame, followed by `frame`.
        """
        next_state = np.append(state[1:, :, :], frame, axis=0)

        # Clip all positive rewards at 1 and all negative rewards at -1, leaving 0 rewards unchanged
        reward = np.clip(reward, -1, 1)

        # Store transition in replay memory (the oldest one is dropped automatically when full)
        self.replay_memory.append((state, action, reward, next_state, terminal))

        if self.t >= initial_random_search:
            # Train network
            if self.t % train_skips == 0:
                self.train_network()

            # Update target network
            if self.t % network_update_interval == 0:
                self.sess.run(self.update_target_network)

            # Save network
            if self.t % save_interval == 0:
                self.saver.save(self.sess, network_path + '/' + env_name, global_step=self.t)

        self.total_reward += reward
        self.total_q_max += np.max(self.q_values.eval(feed_dict={self.s: adapt_state(state)}))
        self.duration += 1

        if terminal:
            # Write summary: total reward, average max Q, duration and average
            # loss per training step of the episode that just ended
            stats = [self.total_reward, self.total_q_max / self.duration,
                    self.duration, self.total_loss / (self.duration / train_skips)]
            for update_op, placeholder, value in zip(self.update_ops, self.summary_placeholders, stats):
                self.sess.run(update_op, feed_dict={placeholder: float(value)})
            summary_str = self.sess.run(self.summary_op)
            self.summary_writer.add_summary(summary_str, self.episode + 1)

            # Reset the accumulators for the next episode
            self.total_reward = 0
            self.total_q_max = 0
            self.total_loss = 0
            self.duration = 0
            self.episode += 1

        self.t += 1

        return next_state

    def train_network(self):
        """Run one optimizer step on a random minibatch of stored transitions.

        Targets are ``reward + gamma * max(target network Q of next state)``,
        without the second term for terminal transitions. The loss of the step
        is added to ``self.total_loss``.
        """
        # Sample random minibatch of transition from replay memory and split
        # it into one sequence per field
        minibatch = random.sample(self.replay_memory, batch_size)
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(*minibatch)

        # Convert True to 1, False to 0
        terminal_batch = np.array(terminal_batch) + 0

        # The target network evaluates the NEXT states to build the targets ...
        target_q_values_batch = self.target_q_values.eval(feed_dict={self.st: adapt_batch_state(next_state_batch)})
        y_batch = np.array(reward_batch) + (1 - terminal_batch) * gamma * np.max(target_q_values_batch, axis=1)

        # ... while the online network must be trained on the states in which
        # the actions were actually taken, not on the next states
        loss, _ = self.sess.run([self.loss, self.grads_update], feed_dict={
            self.s: adapt_batch_state(state_batch),
            self.a: list(action_batch),
            self.y: y_batch
        })

        self.total_loss += loss

    def setup_summary(self):
        """Create the TensorBoard scalars and the ops used to update them.

        Returns:
            A tuple ``(summary_placeholders, update_ops, summary_op)``: one
            float32 placeholder and one assign op per summary variable (total
            reward, average max Q, duration, average loss, in that order) and
            the merged summary op.
        """
        episode_total_reward = tf.Variable(0., name="EpisodeTotalReward")
        tf.summary.scalar(env_name + '/Total_Reward/Episode', episode_total_reward)
        episode_avg_max_q = tf.Variable(0., name="EpisodeAvgMaxQ")
        tf.summary.scalar(env_name + '/Average_Max Q/Episode', episode_avg_max_q)
        episode_duration = tf.Variable(0., name="EpisodeDuration")
        tf.summary.scalar(env_name + '/Duration/Episode', episode_duration)
        episode_avg_loss = tf.Variable(0., name="EpisodeAverageLoss")
        tf.summary.scalar(env_name + '/Average_Loss/Episode', episode_avg_loss)
        summary_vars = [episode_total_reward, episode_avg_max_q, episode_duration, episode_avg_loss]
        summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
        update_ops = [
            summary_var.assign(placeholder)
            for summary_var, placeholder in zip(summary_vars, summary_placeholders)
        ]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

    def load_network(self):
        """Restore the latest checkpoint from ``network_path`` when one exists."""
        checkpoint = tf.train.get_checkpoint_state(network_path)
        if checkpoint and checkpoint.model_checkpoint_path:
            self.saver.restore(self.sess, checkpoint.model_checkpoint_path)
            print('Successfully loaded: ' + checkpoint.model_checkpoint_path)
        else:
            print('Training new network...')

    def get_trained_action(self, state):
        """Return the action with the largest Q value for `state`, without exploration."""
        action = np.argmax(self.q_values.eval(feed_dict={self.s: adapt_state(state)}))
        return action

We can now train our network (and save some final images from the trained network)

In [None]:
from tqdm import tqdm

env = gym.make(env_name)
# The agent gets one network output per action of the environment
agent = Agent(num_actions=env.action_space.n)

for i in tqdm(range(n_episodes)):
    terminal = False
    frame = env.reset()
    # Start every episode with a random number (between 1 and
    # initial_quiet_steps) of steps playing action 0
    for _ in range(random.randint(1, initial_quiet_steps)):
        frame, _, _, _ = env.step(0)  # Do nothing
    state = agent.get_initial_state(frame)
    while not terminal:
        action = agent.get_action(state)
        frame, reward, terminal, _ = env.step(action)

        # agent.run() stores the transition, trains the network when due and
        # returns the state to use for the next step
        processed_frame = preprocess(frame)
        state = agent.run(state, action, reward, terminal, processed_frame)
    # Save the emulator screen at the end of the episode as <CHART_DIR>/test_image_<episode>.png
    env.env.ale.saveScreenPNG(six.b('%s/test_image_%05i.png' % (CHART_DIR, i)))
