In [None]:
# !pip install gym "gym[atari, accept-rom-license]" ale-py tf-agents==0.6.0 atari_py  gin-config==0.3.0

In [1]:
%load_ext tensorboard

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB or IS_KAGGLE:
    #!apt update && apt install -y libpq-dev libsdl2-dev swig xorg-dev xvfb
    #%pip install -U tf-agents pyvirtualdisplay
    #%pip install -U gym>=0.21.0
    #%pip install -U gym[box2d,atari,accept-rom-license]
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. CNNs can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

# Common imports
import numpy as np
import os
import datetime

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rl"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension) of the saved figure.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: DPI passed to ``savefig``.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)

2021-12-02 22:58:31.761658: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-12-02 22:58:34.608285: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2021-12-02 22:58:39.756298: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:04:00.0 name: Tesla K40m computeCapability: 3.5
coreClock: 0.745GHz coreCount: 15 deviceMemorySize: 11.17GiB deviceMemoryBandwidth: 268.58GiB/s
2021-12-02 22:58:39.756531: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-12-02 22:58:39.772449: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-12-02 22:58:39.781284: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcufft.so.10
202

## Creación del ambiente de ATARI

In [2]:
from tf_agents.environments import suite_atari
from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4
from tf_agents.environments.tf_py_environment import TFPyEnvironment

# Per-episode step cap passed to suite_atari.load() for the training environment.
max_episode_steps = 27000 # <=> 108k ALE frames since 1 step = 4 frames
# Gym id of the Atari game; reused for both the training and the visualization environments.
environment_name = "BreakoutNoFrameskip-v4"

class AtariPreprocessingWithAutoFire(AtariPreprocessing):
    """AtariPreprocessing wrapper that presses FIRE (action 1) automatically.

    FIRE is sent once right after every reset and once after every lost life
    (unless the episode ended), so the agent does not have to learn to launch
    the ball itself. The result of the automatic FIRE step is discarded: the
    caller always receives the values produced by the preceding reset/step.
    """

    def reset(self, **kwargs):
        """Reset the environment, press FIRE, and return the reset observation."""
        initial_obs = super().reset(**kwargs)
        super().step(1)  # FIRE to start; its outcome is intentionally ignored
        return initial_obs

    def step(self, action):
        """Apply `action`; press FIRE afterwards if a life was lost mid-episode.

        Returns:
            The (obs, reward, done, info) tuple produced by `action` itself,
            not by the automatic FIRE step.
        """
        lives_at_start = self.ale.lives()
        observation, reward, episode_over, info = super().step(action)
        life_lost = self.ale.lives() < lives_at_start
        if life_lost and not episode_over:
            super().step(1)  # FIRE to start after life lost
        return observation, reward, episode_over, info

# Build the training environment: the raw Atari game wrapped first with the
# auto-FIRE preprocessing defined above, then with FrameStack4.
env = suite_atari.load(
    environment_name,
    max_episode_steps=max_episode_steps,
    gym_env_wrappers=[AtariPreprocessingWithAutoFire, FrameStack4])

# Wrap the Python environment so it can be driven by TF-Agents TF drivers/policies.
tf_env = TFPyEnvironment(env)

A.L.E: Arcade Learning Environment (version +978d2ce)
[Powered by Stella]


## Creación de la Q Network


In [3]:
from tf_agents.networks.q_network import QNetwork

# Cast observations to float32 and divide by 255 before they reach the
# convolutional layers (assumes observations are 0-255 pixel values — TODO confirm).
preprocessing_layer = keras.layers.Lambda(
                          lambda obs: tf.cast(obs, np.float32) / 255.)
# One tuple per convolutional layer; per the QNetwork documentation each tuple
# is (filters, kernel_size, stride).
conv_layer_params=[(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]
# A single fully connected layer of 512 units after the convolutions.
fc_layer_params=[512]

# Q-network whose input/output specs are taken from the training environment.
q_net = QNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params)

## Creación del Agente

In [4]:
from tf_agents.agents.dqn.dqn_agent import DdqnAgent

# Counter handed to the agent as train_step_counter; it also drives the ε schedule below.
train_step = tf.Variable(0, dtype=tf.int64)
update_period = 4 # run a training step every 4 collect steps
optimizer = keras.optimizers.RMSprop(learning_rate=2.5e-4, rho=0.95, momentum=0.0,
                                     epsilon=0.00001, centered=True)
# PolynomialDecay is a learning-rate schedule class; it is reused here to decay
# the exploration rate ε from 1.0 down to 0.01 as a function of train_step.
epsilon_fn = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=1.0, # initial ε
    decay_steps=250000 // update_period, # <=> 1,000,000 ALE frames
    end_learning_rate=0.01) # final ε
# Double-DQN agent. epsilon_greedy is a callable so ε is re-evaluated from the
# current train_step each time it is called rather than fixed at construction.
agent = DdqnAgent(tf_env.time_step_spec(),
                 tf_env.action_spec(),
                 q_network=q_net,
                 optimizer=optimizer,
                 target_update_period=2000, # <=> 32,000 ALE frames
                 td_errors_loss_fn=keras.losses.Huber(reduction="none"),
                 gamma=0.99, # discount factor
                 train_step_counter=train_step,
                 epsilon_greedy=lambda: epsilon_fn(train_step))
agent.initialize()

2021-12-02 22:58:53.306039: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-12-02 22:58:53.334382: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2399755000 Hz
2021-12-02 22:58:53.351479: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x558cece9ac40 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2021-12-02 22:58:53.351573: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2021-12-02 22:58:53.499277: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x558cecf07220 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2021-12-02 22:58:53.499426: 

## Creación del Replay Buffer

In [5]:
from tf_agents.replay_buffers import tf_uniform_replay_buffer

# Uniform replay buffer storing the trajectories described by the agent's
# collect_data_spec. The captured output of this cell shows a ~2.8 GB
# allocation for max_length=100000, hence the note about lowering it.
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=tf_env.batch_size,
    max_length=100000) # reduce if OOM error

# Observer callable handed to the collect driver: each collected batch of
# trajectories is appended to the buffer.
replay_buffer_observer = replay_buffer.add_batch

2021-12-02 22:58:58.685593: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 2822400000 exceeds 10% of free system memory.


## Observadores

In [6]:
class ShowProgress:
    """Driver observer that prints a ``counter/total`` progress line.

    The counter is incremented for every non-boundary trajectory, and the line
    is (re)printed on each call where the counter is a multiple of 100.
    """

    def __init__(self, total):
        """Store the expected total number of steps shown after the slash."""
        self.total = total
        self.counter = 0

    def __call__(self, trajectory):
        """Count `trajectory` unless it is a boundary step, then maybe print."""
        is_real_step = not trajectory.is_boundary()
        if is_real_step:
            self.counter += 1
        if self.counter % 100 != 0:
            return
        # "\r" rewinds to the start of the line so the progress overwrites itself.
        print("\r{}/{}".format(self.counter, self.total), end="")

In [7]:
from tf_agents.metrics import tf_metrics

# Metrics updated by the collect driver (they are registered as observers)
# and written to TensorBoard from train_agent().
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

## Persistencia de métricas en TensorBoard

In [10]:
# Each run logs into its own timestamped sub-directory of tensorboard/ddqn/.
train_dir = "tensorboard/ddqn/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Summary writer used by train_agent() to persist the training metrics.
train_summary_writer = tf.summary.create_file_writer(train_dir, max_queue=1000, flush_millis=10000, name="DDQN")

## Definimos el driver

In [8]:
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

# Driver that steps the training environment with the agent's collect policy.
# Every collected trajectory is sent to the replay buffer and to each metric.
collect_driver = DynamicStepDriver(
    tf_env,
    agent.collect_policy,
    observers=[replay_buffer_observer] + train_metrics,
    num_steps=update_period) # collect 4 steps for each training iteration

## Precargamos el Replay Buffer con una política aleatoria

In [12]:
from tf_agents.policies.random_tf_policy import RandomTFPolicy

# Random policy used only to pre-fill the replay buffer before training starts.
initial_collect_policy = RandomTFPolicy(tf_env.time_step_spec(),
                                        tf_env.action_spec())
# One-off driver: runs 20,000 steps, storing each trajectory in the buffer and
# printing progress. The ShowProgress total matches num_steps.
init_driver = DynamicStepDriver(
    tf_env,
    initial_collect_policy,
    observers=[replay_buffer.add_batch, ShowProgress(20000)],
    num_steps=20000) # <=> 80,000 ALE frames
final_time_step, final_policy_state = init_driver.run()

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.while_loop(c, b, vars, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.while_loop(c, b, vars))
20000/20000

In [9]:
# Dataset that samples training batches from the replay buffer: 32 items per
# batch, each spanning 2 consecutive steps (presumably one transition for the
# DQN loss — confirm against the agent's requirements), with prefetching.
dataset = replay_buffer.as_dataset(
    sample_batch_size=32,
    num_steps=2,
    num_parallel_calls=3).prefetch(3)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


## Creamos un checkpoint

In [14]:
from tf_agents.utils import common

# Timestamped directory, so every run of this cell checkpoints into a new folder.
checkpoint_dir = "checkpoints/ddqn_"+ datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Checkpointer tracking the agent, its policy, the replay buffer and the step
# counter; at most 10 checkpoints are kept.
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=10,
    agent=agent,
    policy=agent.policy,
    replay_buffer=replay_buffer,
    global_step=train_step
)

# Write an initial checkpoint before training starts.
train_checkpointer.save(train_step)

2021-12-02 12:12:36.333862: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 2822400000 exceeds 10% of free system memory.


In [15]:
from tf_agents.policies.policy_saver import PolicySaver
# Saver used by the training loop to export agent.policy after each round.
tf_policy_saver = PolicySaver(agent.policy)

2021-12-02 12:13:01.008022: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


## Entrenamos nuestro agente

In [16]:
from tf_agents.utils.common import function

# Wrap the two hot-path methods with tf_agents.utils.common.function.
# Note: re-running this cell wraps the already-wrapped methods again.
collect_driver.run = function(collect_driver.run)
agent.train = function(agent.train)

In [17]:
def train_agent(n_iterations):
    """Run `n_iterations` collect-then-train iterations of the agent.

    Each iteration runs the collect driver once, takes one batch from the
    replay-buffer dataset, performs one training step on it, prints the loss
    on a self-overwriting line, and writes every training metric to the
    TensorBoard summary writer.

    Args:
        n_iterations: Number of iterations to run.
    """
    # Passing None as the first time step lets the driver start from its own
    # initial state on the first call.
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    batches = iter(dataset)
    for step_index in range(n_iterations):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        experience, _unused_info = next(batches)
        loss_info = agent.train(experience)
        loss_value = loss_info.loss.numpy()
        print("\r{} loss:{:.5f}".format(step_index, loss_value), end="")
        # Persist every metric for this iteration.
        with train_summary_writer.as_default():
            for metric in train_metrics:
                metric.tf_summaries(train_step=train_step)

In [None]:
## train_agent(n_iterations=50000)

In [None]:
for _ in range(8):
    # Run 8 rounds of 50,000 training iterations each; after every round,
    # export agent.policy (named after the current train step) and checkpoint.
    train_agent(n_iterations=50000)
    tf_policy_saver.save(f"modelos/ddqn_{train_step.numpy()}")
    train_checkpointer.save(train_step)

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: modelos/ddqn_50000/assets


INFO:tensorflow:Assets written to: modelos/ddqn_50000/assets
2021-12-02 13:30:51.438810: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 2822400000 exceeds 10% of free system memory.


49999 loss:0.00239INFO:tensorflow:Assets written to: modelos/ddqn_100000/assets


INFO:tensorflow:Assets written to: modelos/ddqn_100000/assets
2021-12-02 14:47:26.764639: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 2822400000 exceeds 10% of free system memory.


49999 loss:0.00266INFO:tensorflow:Assets written to: modelos/ddqn_150000/assets


INFO:tensorflow:Assets written to: modelos/ddqn_150000/assets
2021-12-02 16:04:31.488486: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 2822400000 exceeds 10% of free system memory.


41189 loss:0.00138

## Visualizamos el agente

In [14]:
# Do not cap the number of steps per episode for visualization.
# Separate environment instance, so watching the agent does not disturb the
# training environment's state.
env_vis = suite_atari.load(
    environment_name,
    max_episode_steps=None,
    gym_env_wrappers=[AtariPreprocessingWithAutoFire, FrameStack4])

tf_env_vis = TFPyEnvironment(env_vis)

In [15]:
def update_scene(num, frames, patch):
    """Animation callback: show frame number `num` on the image artist `patch`.

    Returns:
        A 1-tuple containing `patch`.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays `frames` in order.

    Args:
        frames: Sequence of images; the first one initializes the display.
        repeat: Passed to FuncAnimation as `repeat`.
        interval: Passed to FuncAnimation as `interval`.

    Returns:
        The FuncAnimation object.
    """
    figure = plt.figure()
    image = plt.imshow(frames[0])
    plt.axis('off')
    animator = animation.FuncAnimation(
        figure,
        update_scene,
        frames=len(frames),
        fargs=(frames, image),
        interval=interval,
        repeat=repeat,
    )
    # Close the current figure so only the returned animation is displayed.
    plt.close()
    return animator

# Module-level buffer of rendered RGB frames, filled by save_frames().
frames = []

def save_frames(trajectory):
    """Driver observer: render the visualization env and store the frame.

    The `trajectory` argument is ignored; the frame is rendered from the first
    wrapped Python environment of `tf_env_vis` and appended to the global
    `frames` list.
    """
    py_env = tf_env_vis.pyenv.envs[0]
    rendered = py_env.render(mode="rgb_array")
    frames.append(rendered)
    
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver

# Play 2 full episodes with the trained agent's policy on the visualization
# environment, recording one rendered frame per step via save_frames.
watch_driver = DynamicEpisodeDriver(
    tf_env_vis,
    agent.policy,
    observers=[save_frames],
    num_episodes=2)
final_time_step, final_policy_state = watch_driver.run()

# Last expression of the cell: the animation is displayed as the cell output.
plot_animation(frames)

In [22]:
# Launch TensorBoard inline on the "tensorboard" log directory (parent of the
# per-run DDQN log folders) using port 9990.
%tensorboard --logdir tensorboard --port 9990

## Guardamos los modelos

## Limpiamos 

In [23]:
# Close the TensorBoard summary writer once training is over.
train_summary_writer.close()

In [24]:
# Display the current value of the training step counter.
train_step

<tf.Variable 'Variable:0' shape=() dtype=int64, numpy=400000>

In [10]:
# Load the policy exported by the training loop at train step 400000.
saved_policy = tf.saved_model.load("modelos/ddqn_400000")

In [30]:
# Display the loaded object to confirm the SavedModel was restored.
saved_policy

<tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject at 0x7ff040332100>

In [13]:
# update_scene(), plot_animation() and save_frames() are defined once in the
# "Visualizamos el agente" section; they are reused here rather than redefined,
# so the notebook has a single copy of each helper to maintain.

# Start from an empty frame buffer. save_frames() looks up the global name
# `frames` on every call, so rebinding it here is enough to record a new clip.
frames = []

from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver

# Same playback as for the in-memory agent, but driven by the policy that was
# reloaded from disk.
watch_driver = DynamicEpisodeDriver(
    tf_env_vis,
    saved_policy,
    observers=[save_frames],
    num_episodes=2)
final_time_step, final_policy_state = watch_driver.run()

# Last expression of the cell: the animation is displayed as the cell output.
plot_animation(frames)