In [1]:
# Common imports
import numpy as np
import os

In [2]:
# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

In [3]:
##########  Using gym
import gym

# Scikit-Learn ≥0.20 is required
import sklearn

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
# Let's build the Double DQN. 
# Given a state, it will estimate, for each possible action, the sum of discounted future rewards it can expect after 
# it plays that action (but before it sees its outcome):

keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

env = gym.make("CartPole-v1")
input_shape = [4] # == env.observation_space.shape
n_outputs = 2 # == env.action_space.n

model = keras.models.Sequential([
    keras.layers.Dense(32, activation="elu", input_shape=[4]),
    keras.layers.Dense(32, activation="elu"),
    keras.layers.Dense(n_outputs)
])

target = keras.models.clone_model(model)
target.set_weights(model.get_weights())

In [5]:
batch_size = 32
discount_rate = 0.95
optimizer = keras.optimizers.Adam(learning_rate=6e-3)
loss_fn = keras.losses.Huber()

def training_step(batch_size):
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences
    next_Q_values = model.predict(next_states)
    best_next_actions = np.argmax(next_Q_values, axis=1)
    next_mask = tf.one_hot(best_next_actions, n_outputs).numpy()
    next_best_Q_values = (target.predict(next_states) * next_mask).sum(axis=1)
    target_Q_values = (rewards + 
                       (1 - dones) * discount_rate * next_best_Q_values)
    target_Q_values = target_Q_values.reshape(-1, 1)
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [6]:
# To select an action using this DQN, we just pick the action with the largest predicted Q-value. However, to ensure that 
# the agent explores the environment, we choose a random action with probability epsilon.

def epsilon_greedy_policy(state, epsilon=0):
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        print("newaxis")
        print(state[np.newaxis])
        Q_values = model.predict(state[np.newaxis])
        return np.argmax(Q_values[0])

In [7]:
# We will also need a replay memory. It will contain the agent's experiences, 
# in the form of tuples: (obs, action, reward, next_obs, done). 
# We can use the deque class for that (but make sure to check out DeepMind's excellent Reverb library for 
# a much more robust implementation of experience replay):

from collections import deque

replay_memory = deque(maxlen=2000)

In [8]:
# And let's create a function to sample experiences from the replay memory. It will return 5 NumPy arrays: 
# [obs, actions, rewards, next_obs, dones].

def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_memory), size=batch_size)
    batch = [replay_memory[index] for index in indices]
    states, actions, rewards, next_states, dones = [
        np.array([experience[field_index] for experience in batch])
        for field_index in range(5)]
    return states, actions, rewards, next_states, dones

In [9]:
# Now we can create a function that will use the DQN to play one step, and record its experience in the replay memory:
def play_one_step(env, state, epsilon):
    action = epsilon_greedy_policy(state, epsilon)
    next_state, reward, done, info = env.step(action)
    replay_memory.append((state, action, reward, next_state, done))
    return next_state, reward, done, info

In [10]:
# Lastly, let's create a function that will sample some experiences from the replay memory and perform a training step:

env.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

rewards = []
best_score = 0

for episode in range(600):
    obs = env.reset()    
    for step in range(200):
        epsilon = max(1 - episode / 500, 0.01)
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        if done:
            break
    rewards.append(step)
    if step >= best_score:
        best_weights = model.get_weights()
        best_score = step
    print("\rEpisode: {}, Steps: {}, eps: {:.3f}".format(episode, step + 1, epsilon), end="")
    if episode >= 50:
        training_step(batch_size)
        if episode % 50 == 0:
            target.set_weights(model.get_weights())
    # Alternatively, you can do soft updates at each step:
    #if episode >= 50:
        #target_weights = target.get_weights()
        #online_weights = model.get_weights()
        #for index in range(len(target_weights)):
        #    target_weights[index] = 0.99 * target_weights[index] + 0.01 * online_weights[index]
        #target.set_weights(target_weights)

model.set_weights(best_weights)

Episode: 8, Steps: 20, eps: 0.984newaxis
[[-0.11363989 -0.43542606  0.18042924  0.9705781 ]]
Episode: 9, Steps: 14, eps: 0.982newaxis
[[ 0.09258524 -0.02099489 -0.20233699 -0.48429964]]
Episode: 11, Steps: 16, eps: 0.978newaxis
[[ 0.02453447  0.72068768 -0.00443159 -0.96757257]]
Episode: 12, Steps: 24, eps: 0.976newaxis
[[ 0.05440586 -0.41088652 -0.00905702  0.45449593]]
Episode: 14, Steps: 19, eps: 0.972newaxis
[[ 0.00812975  0.56529223 -0.09194969 -0.97051382]]
Episode: 15, Steps: 19, eps: 0.970newaxis
[[-0.05106164 -0.35741562  0.07124481  0.7137997 ]]
newaxis
[[-0.06866832 -0.55585654  0.10966281  1.08356824]]
Episode: 16, Steps: 32, eps: 0.968newaxis
[[ 0.00176535 -0.34613522 -0.03111198  0.54523567]]
newaxis
[[-0.09156366  0.42454926  0.12271704 -0.41308112]]
Episode: 19, Steps: 10, eps: 0.962newaxis
[[ 0.12432678 -0.14430762 -0.18274025 -0.17023096]]
newaxis
[[ 0.1053042  -0.13396973 -0.19178688 -0.40259572]]
Episode: 21, Steps: 27, eps: 0.958newaxis
[[ 0.02003645  0.23526383  0

KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(8, 4))
plt.plot(rewards)
plt.xlabel("Episode", fontsize=14)
plt.ylabel("Sum of rewards", fontsize=14)
plt.show()

In [None]:
# Now show the animation:

def update_scene(num, frames, patch):
    patch.set_data(frames[num])
    return patch,

def plot_animation(frames, repeat=False, interval=40):
    fig = plt.figure()
    patch = plt.imshow(frames[0])
    plt.axis('off')
    anim = animation.FuncAnimation(
        fig, update_scene, fargs=(frames, patch),
        frames=len(frames), repeat=repeat, interval=interval)
    plt.close()
    return anim

In [None]:
env.seed(42)
state = env.reset()

frames = []

for step in range(200):
    action = epsilon_greedy_policy(state)
    state, reward, done, info = env.step(action)
    if done:
        break
    img = env.render(mode="rgb_array")
    frames.append(img)
    
plot_animation(frames)