In [5]:
!pip install --user gym "gym[atari, accept-rom-license]" ale-py tf-agents==0.6.0 atari_py \
    gin-config==0.3.0 tensorflow-probability==0.11.0 opencv-python-headless imageio-ffmpeg \
    imageio

Collecting gym
  Using cached gym-0.21.0-py3-none-any.whl
Collecting ale-py
  Using cached ale_py-0.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
Collecting tf-agents==0.6.0
  Using cached tf_agents-0.6.0-py3-none-any.whl (1.1 MB)
Collecting atari_py
  Using cached atari_py-0.2.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
Collecting gin-config==0.3.0
  Using cached gin_config-0.3.0-py3-none-any.whl (44 kB)
Collecting opencv-python-headless
  Using cached opencv_python_headless-4.5.4.60-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (47.6 MB)
Collecting imageio-ffmpeg
  Using cached imageio_ffmpeg-0.4.5-py3-none-manylinux2010_x86_64.whl (26.9 MB)
Collecting cloudpickle==1.3
  Using cached cloudpickle-1.3.0-py2.py3-none-any.whl (26 kB)
Collecting autorom[accept-rom-license]~=0.4.2
  Using cached AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting AutoROM.accept-rom-license
  Using cached AutoROM.accept_rom_license-0.4.2-py3-none-a

In [None]:
# Experiment name; used further down to build the TensorBoard log directory, the
# checkpoint directory and the exported-policy paths.
nombre_experimento = "pg_atari"

In [1]:
%load_ext tensorboard

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if IS_COLAB or IS_KAGGLE:
    #!apt update && apt install -y libpq-dev libsdl2-dev swig xorg-dev xvfb
    #%pip install -U tf-agents pyvirtualdisplay
    #%pip install -U gym>=0.21.0
    #%pip install -U gym[box2d,atari,accept-rom-license]
    pass

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
# Compare (major, minor) numerically: a plain string comparison would order
# "10.0" before "2.0".
assert tuple(int(part) for part in tf.__version__.split(".")[:2]) >= (2, 0)

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  # Cap TensorFlow's allocation on the first GPU at 7 GiB (memory_limit is given in MB)
  try:
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=7*1024)])
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Virtual devices must be set before GPUs have been initialized
    print(e)

# Common imports
import numpy as np
import os
import datetime

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To get smooth animations
import matplotlib.animation as animation
mpl.rc('animation', html='jshtml')

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "rl"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to a file inside IMAGES_PATH.

    Args:
        fig_id: Base file name (without extension) of the saved figure.
        tight_layout: When true, `plt.tight_layout()` is applied before saving.
        fig_extension: File extension, also passed to `savefig` as the format.
        resolution: Value passed to `savefig` as `dpi`.
    """
    file_name = ".".join([fig_id, fig_extension])
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

2021-12-04 00:12:55.623651: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0


1 Physical GPUs, 1 Logical GPUs


2021-12-04 00:12:56.760830: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2021-12-04 00:12:57.400412: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-12-04 00:12:57.401027: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:00:04.0 name: Tesla T4 computeCapability: 7.5
coreClock: 1.59GHz coreCount: 40 deviceMemorySize: 14.75GiB deviceMemoryBandwidth: 298.08GiB/s
2021-12-04 00:12:57.401067: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.11.0
2021-12-04 00:12:57.406976: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.11
2021-12-04 00:12:57.411785: I tensorflow/stream_executor/platform/default/d

## Creación del ambiente de ATARI

In [2]:
from tf_agents.environments import suite_atari
from tf_agents.environments.atari_preprocessing import AtariPreprocessing
from tf_agents.environments.atari_wrappers import FrameStack4
from tf_agents.environments.tf_py_environment import TFPyEnvironment

max_episode_steps = 27000 # <=> 108k ALE frames since 1 step = 4 frames
environment_name = "BreakoutNoFrameskip-v4"

class AtariPreprocessingWithAutoFire(AtariPreprocessing):
    """AtariPreprocessing variant that sends action 1 (FIRE) automatically.

    FIRE is issued once right after every reset and once after every step in
    which a life was lost while the episode is still running, so the agent
    never has to learn to launch the ball itself.
    """

    def reset(self, **kwargs):
        """Reset the wrapped environment, then press FIRE once.

        Returns the observation produced by the reset itself; the result of
        the extra FIRE step is discarded.
        """
        initial_obs = super().reset(**kwargs)
        super().step(1)  # FIRE to start
        return initial_obs

    def step(self, action):
        """Apply `action`; press FIRE again if that step cost a life.

        Returns the `(obs, rewards, done, info)` tuple of the step for
        `action`; the result of the extra FIRE step is discarded.
        """
        lives_at_start = self.ale.lives()
        obs, rewards, done, info = super().step(action)
        life_was_lost = self.ale.lives() < lives_at_start
        if life_was_lost and not done:
            super().step(1)  # FIRE to start after life lost
        return obs, rewards, done, info

# Load the Atari environment named by environment_name, passing the episode step cap
# and stacking two gym wrappers on top: the auto-FIRE preprocessing and FrameStack4.
env = suite_atari.load(
    environment_name,
    max_episode_steps=max_episode_steps,
    gym_env_wrappers=[AtariPreprocessingWithAutoFire, FrameStack4])

# Wrap the Python environment in a TFPyEnvironment; the specs of this object are used
# below to build the network, the policy and the memory buffers.
tf_env = TFPyEnvironment(env)

  for external in metadata.entry_points().get(self.group, []):
A.L.E: Arcade Learning Environment (version +978d2ce)
[Powered by Stella]


## Creación de la Red Neuronal


In [3]:
from tf_agents.networks.actor_distribution_network import ActorDistributionNetwork
# Cast the raw observations to float32 and scale them by 1/255.
preprocessing_layer = keras.layers.Lambda(
                          lambda obs: tf.cast(obs, np.float32) / 255.)

# Passed to ActorDistributionNetwork as conv_layer_params; presumably each tuple is
# (filters, kernel_size, stride) — TODO confirm against the TF-Agents documentation.
conv_layer_params=[(32, (8, 8), 4), (64, (4, 4), 2), (64, (3, 3), 1)]

# Passed as fc_layer_params; presumably a single fully connected layer of 512 units —
# TODO confirm.
fc_layer_params = [512]

# Actor network built from the environment's observation and action specs.
network = ActorDistributionNetwork(
    tf_env.observation_spec(),
    tf_env.action_spec(),
    preprocessing_layers=preprocessing_layer,
    conv_layer_params=conv_layer_params,
    fc_layer_params=fc_layer_params
)

## Creación del Agente


In [4]:
# Create the policy

from tf_agents.policies.actor_policy import ActorPolicy


# The policy is backed by the actor network and uses the environment's time-step and
# action specs.
actor_policy = ActorPolicy(time_step_spec=tf_env.time_step_spec(),
                           action_spec=tf_env.action_spec(),
                           actor_network=network)


2021-12-04 00:13:04.208120: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudnn.so.8
2021-12-04 00:13:05.505302: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.11


## Creación del Replay Buffer

In [5]:
class Memory:
    """Growable buffer of the observations, actions and rewards seen by the driver.

    Each field lives in its own dynamically sized `tf.TensorArray` whose dtype
    comes from the matching `tf_env` spec. `count` is the next write index.
    """

    def __init__(self):
        self._allocate_buffers()

    def _allocate_buffers(self):
        """Create empty TensorArrays for every field and rewind the write index."""
        self.observations = self._empty_array(tf_env.observation_spec().dtype)
        self.actions = self._empty_array(tf_env.action_spec().dtype)
        self.rewards = self._empty_array(tf_env.reward_spec().dtype)
        self.count = 0

    @staticmethod
    def _empty_array(dtype):
        """Return a zero-length TensorArray of `dtype` that can grow on write."""
        return tf.TensorArray(dtype=dtype, size=0, dynamic_size=True)

    def clear(self):
        """Close the current TensorArrays and replace them with empty ones."""
        for buffer in (self.observations, self.actions, self.rewards):
            _ = buffer.close()
        self._allocate_buffers()

    def add_to_memory(self, trajectory):
        """Observer callback: append one trajectory to the buffers.

        Only element 0 along the leading axis of each field is stored
        (presumably the single batch entry — TODO confirm).
        """
        slot = self.count
        self.observations = self.observations.write(slot, trajectory.observation[0])
        self.actions = self.actions.write(slot, trajectory.action[0])
        self.rewards = self.rewards.write(slot, trajectory.reward[0])
        self.count = slot + 1

    def get_observations(self):
        """Return all stored observations stacked into one tensor."""
        return self.observations.stack()

    def get_actions(self):
        """Return all stored actions stacked into one tensor."""
        return self.actions.stack()

    def get_rewards(self):
        """Return the stacked rewards, with every reward equal to 0 replaced by -1."""
        stacked = self.rewards.stack()
        is_zero = tf.equal(stacked, 0)
        minus_one = -1 * tf.ones_like(stacked)
        return tf.where(is_zero, minus_one, stacked)


memoria = Memory()

## Observadores


In [6]:
from tf_agents.metrics import tf_metrics

# Metrics updated by the driver while it collects experience; their summaries are
# written to TensorBoard after every training step.
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
    tf_metrics.MaxReturnMetric()
]

## Persistencia de métricas al tensorboard

In [7]:
# Each run logs into its own timestamped sub-directory of tensorboard/<experiment name>/.
train_dir = f"tensorboard/{nombre_experimento}/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# Summary writer used as the default writer when the training metrics are logged.
train_summary_writer = tf.summary.create_file_writer(train_dir, max_queue=1000, flush_millis=10000, name="PG")

## Definimos el driver

In [8]:
# Driver

from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver

# The driver runs actor_policy in tf_env and feeds every trajectory to the metrics and
# to the memory buffer. num_episodes is not passed, so the driver's default applies
# (presumably one episode per run() call — TODO confirm).
driver = DynamicEpisodeDriver(
    tf_env,
    actor_policy,
    observers=train_metrics + [memoria.add_to_memory],
)

# Counter of completed training steps; used as the summary step and the checkpoint step.
train_step_var = tf.Variable(0, dtype=tf.int64, trainable=False)

## Creamos un checkpoint

In [9]:
from tf_agents.utils import common

# Checkpoints for this experiment are written under checkpoints/pg/<experiment name>.
checkpoint_dir = "checkpoints/pg/" + nombre_experimento
# The checkpointer tracks the policy and the training-step counter and is configured
# with max_to_keep=10.
train_checkpointer = common.Checkpointer(
    ckpt_dir=checkpoint_dir,
    max_to_keep=10,
    policy=actor_policy,
    global_step=train_step_var
)

In [10]:
from tf_agents.policies.policy_saver import PolicySaver
# Saver used by the training loop to export actor_policy to disk.
tf_policy_saver = PolicySaver(actor_policy)

2021-12-04 00:13:34.802531: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


## Entrenamos nuestro agente

In [11]:
from tf_agents.utils.common import function

# NOTE: `driver.run = function(driver.run)` is intentionally left disabled; it cannot
# be used until the memory observer is rewritten to work with `function`.

In [12]:
def normalizar(x):
    """Standardize `x` to zero mean and unit standard deviation.

    The input is never modified; a new float32 array is returned.

    Args:
        x: Array-like of numbers (any numeric dtype).

    Returns:
        A float32 numpy array with the same shape as `x`. An empty input yields
        an empty array. When the standard deviation is zero (a single element or
        all elements equal) an all-zero array is returned instead of NaN/inf.
    """
    values = np.asarray(x, dtype=np.float64)
    if values.size == 0:
        return values.astype(np.float32)

    centered = values - values.mean()
    spread = centered.std()
    if spread == 0:
        # No variation at all: dividing would produce NaN, so report "no signal".
        return np.zeros(centered.shape, dtype=np.float32)
    return (centered / spread).astype(np.float32)


def descuento(rewards, gamma=0.99):
    """Compute normalized discounted returns for one episode.

    For each step t the return is rewards[t] + gamma * return[t + 1], accumulated
    from the last step backwards; the result is then standardized by `normalizar`.

    Args:
        rewards: Per-step rewards, indexed by time along the first axis. Any
            array-like is accepted (list, numpy array, eager tensor), with
            integer or floating dtype.
        gamma: Discount factor applied to future rewards.

    Returns:
        A float32 numpy array of normalized discounted returns.
    """
    # Work in floating point regardless of the input dtype; inheriting an integer
    # dtype would silently truncate the discounted sums.
    step_rewards = np.asarray(rewards, dtype=np.float64)
    discounted_rewards = np.zeros_like(step_rewards)
    running_return = 0.0
    for t in reversed(range(len(step_rewards))):
        running_return = running_return * gamma + step_rewards[t]
        discounted_rewards[t] = running_return

    return normalizar(discounted_rewards)

In [13]:
def compute_loss(logits, actions, rewards):
    """Return the mean of the per-step cross-entropy weighted by `rewards`.

    Args:
        logits: Unnormalized action scores produced by the network.
        actions: Indices of the actions that were taken, used as sparse labels.
        rewards: Per-step weights multiplied into the cross-entropy terms.

    Returns:
        A scalar tensor: the mean of cross_entropy * rewards.
    """
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=actions, logits=logits)
    weighted_terms = cross_entropy * rewards
    return tf.reduce_mean(weighted_terms)

In [14]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025)

def train_step(network, observations, actions, discounted_rewards):
    """Apply one gradient update to `network` from a batch of experience.

    Args:
        network: Model called on the observations; its output must expose `.logits`.
        observations: Batch of observations fed to the network.
        actions: Actions taken, passed to `compute_loss` as labels.
        discounted_rewards: Per-step weights passed to `compute_loss`.
    """
    with tf.GradientTape() as tape:
        # Forward pass, then the loss on the resulting logits.
        action_distribution, _ = network(observations, step_type=(), network_state=())
        loss = compute_loss(action_distribution.logits, actions, discounted_rewards)

    variables = network.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

In [15]:
def correr_un_episodio_y_entrenar(i_episode):
    """Collect experience with the driver, train on it once and log the metrics.

    Args:
        i_episode: Iteration index; only used for the progress message.
    """
    print(f"\rIteración: {i_episode}", end="")

    # Empty the buffer, then let the driver fill it with new experience.
    memoria.clear()
    driver.run()

    # One gradient update on everything the driver just collected.
    collected_observations = memoria.get_observations()
    collected_actions = memoria.get_actions()
    normalized_returns = descuento(memoria.get_rewards())
    train_step(
        network,
        observations=collected_observations,
        actions=collected_actions,
        discounted_rewards=normalized_returns,
    )
    train_step_var.assign_add(1)

    # Write every metric, using the training-step counter as the summary step.
    with train_summary_writer.as_default():
        for metric in train_metrics:
            metric.tf_summaries(train_step=train_step_var)

In [16]:
for i_episode in range(10000):
    # Run 10000 collect-and-train iterations
    correr_un_episodio_y_entrenar(i_episode)
    # Export the policy every 500 training steps
    if train_step_var % 500 == 0:
        tf_policy_saver.save(f"modelos/{nombre_experimento}/pg_{train_step_var.numpy()}")
    # Write a checkpoint every 100 training steps
    if train_step_var % 100 == 0:
        train_checkpointer.save(train_step_var)





Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: modelos/pg/pg_500/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_500/assets


Iteración: 999INFO:tensorflow:Assets written to: modelos/pg/pg_1000/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_1000/assets


Iteración: 1499INFO:tensorflow:Assets written to: modelos/pg/pg_1500/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_1500/assets


Iteración: 1999INFO:tensorflow:Assets written to: modelos/pg/pg_2000/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_2000/assets


Iteración: 2499INFO:tensorflow:Assets written to: modelos/pg/pg_2500/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_2500/assets


Iteración: 2999INFO:tensorflow:Assets written to: modelos/pg/pg_3000/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_3000/assets


Iteración: 3499INFO:tensorflow:Assets written to: modelos/pg/pg_3500/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_3500/assets


Iteración: 3999INFO:tensorflow:Assets written to: modelos/pg/pg_4000/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_4000/assets


Iteración: 4499INFO:tensorflow:Assets written to: modelos/pg/pg_4500/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_4500/assets


Iteración: 4999INFO:tensorflow:Assets written to: modelos/pg/pg_5000/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_5000/assets


Iteración: 5499INFO:tensorflow:Assets written to: modelos/pg/pg_5500/assets


INFO:tensorflow:Assets written to: modelos/pg/pg_5500/assets


Iteración: 5612

KeyboardInterrupt: 

In [17]:
train_step_var

<tf.Variable 'Variable:0' shape=() dtype=int64, numpy=5612>

## Visualizamos el agente

In [17]:
# Do not limit the number of steps per episode for the visualization environment
env_vis = suite_atari.load(
    environment_name,
    max_episode_steps=None,
    gym_env_wrappers=[AtariPreprocessingWithAutoFire, FrameStack4])

# TFPyEnvironment wrapper used by the visualization driver.
tf_env_vis = TFPyEnvironment(env_vis)

In [18]:
def update_scene(num, frames, patch):
    """Animation callback: show frame number `num` on the image `patch`.

    Returns a one-element tuple containing `patch`.
    """
    current_frame = frames[num]
    patch.set_data(current_frame)
    return (patch,)

def plot_animation(frames, repeat=False, interval=40):
    """Build a matplotlib animation that plays `frames` in order.

    Args:
        frames: Sequence of images; the first one initializes the plot.
        repeat: Passed to `FuncAnimation` as `repeat`.
        interval: Passed to `FuncAnimation` as `interval`.

    Returns:
        The `FuncAnimation` object. The figure is closed before returning.
    """
    figure = plt.figure()
    image = plt.imshow(frames[0])
    plt.axis('off')
    movie = animation.FuncAnimation(
        figure,
        update_scene,
        frames=len(frames),
        fargs=(frames, image),
        repeat=repeat,
        interval=interval,
    )
    plt.close()
    return movie

frames = []

def save_frames(trajectory):
    """Driver observer: render the visualization env and keep the RGB frame.

    The `trajectory` argument is ignored; the frame is appended to the
    module-level `frames` list.
    """
    rendered = tf_env_vis.pyenv.envs[0].render(mode="rgb_array")
    frames.append(rendered)
    
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver

# Play two episodes with the in-memory actor_policy, recording a rendered frame for
# every trajectory the driver emits.
watch_driver = DynamicEpisodeDriver(
    tf_env_vis,
    actor_policy,
    observers=[save_frames],
    num_episodes=2)
final_time_step, final_policy_state = watch_driver.run()

# Leaving the animation as the last expression makes the notebook display it.
plot_animation(frames)

In [None]:
%tensorboard --logdir tensorboard --port 9990

## Guardamos los modelos

## Limpiamos 

In [None]:
train_summary_writer.close()

In [None]:
train_step

In [19]:
# NOTE(review): this path is hard-coded and does not use nombre_experimento. The
# training loop only exports a policy when train_step_var is a multiple of 500, so
# confirm that a "pg_1" export actually exists before running this cell.
saved_policy = tf.saved_model.load("modelos/pg/pg_1")

In [None]:
saved_policy

In [20]:
# update_scene, plot_animation and save_frames are already defined in the earlier
# visualization cell; they are reused here instead of being redefined as identical copies.

# Start from an empty frame list; save_frames appends to the module-level `frames`.
frames = []

from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver

# Play two episodes with the policy loaded from disk, recording a rendered frame for
# every trajectory the driver emits.
watch_driver = DynamicEpisodeDriver(
    tf_env_vis,
    saved_policy,
    observers=[save_frames],
    num_episodes=2)
final_time_step, final_policy_state = watch_driver.run()

# Leaving the animation as the last expression makes the notebook display it.
plot_animation(frames)