In [1]:
import tensorflow as tf
import shutil
import numpy as np
import tqdm
import pandas as pd
import os
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import random

GAMMA = 0.99
TAU = 0.01
BATCHSIZE = 128
N_BATCHES = 64
N_MULTI_ENVS = 64
EXPLORATION_RATE = 0.75
EPSILON_DECAY = 0.9993

N_NEW_SAMPLES = 500
N_SAMPLE_SETS = 100

CHECKPOINT_FREQ = 100
LOG_FREQ = 100
N_TEST_ENVS = 8

In [2]:
x = np.arange(3000)
y = EXPLORATION_RATE * (EPSILON_DECAY ** x)

plt.plot(x,y)
plt.title("Epsilon over time")
plt.xlabel("Outer Loop Iteration")
plt.show()

In [3]:
TMP_SAVE_TO_PATH = "ckpts/ckpt"
TMP_LOG_PATH = "logs/{}.json"
TB_LOGS = "tb_lobs/run"


os.makedirs(TMP_SAVE_TO_PATH.replace("/ckpt",""), exist_ok= True)
os.makedirs(TMP_LOG_PATH.replace("/{}.json",""), exist_ok= True)
os.makedirs(TB_LOGS, exist_ok= True)

# # get old checkpoint
# !cp /content/gdrive/MyDrive/DeepRL/HW4/checkpoint.zip checkpoint.zip
# !unzip -d {TMP_SAVE_TO_PATH} checkpoint.zip

LOSS = tf.keras.losses.Huber()
CNN_SHAPE = (84, 84)


In [4]:


def triple_conv_block_no_batchnorm(x, filters):

    x = tf.keras.layers.Conv2D(
        filters, 3, padding='same', activation='relu')(x)
    x = tf.keras.layers.Conv2D(
        filters, 3, padding='same', activation='relu')(x) + x
    x = tf.keras.layers.Conv2D(
        filters, 3, padding='same', activation='relu')(x) + x

    return x


def get_standard_dqn():
    tf.keras.backend.clear_session()

    inputs = tf.keras.Input((8,))
    x = tf.keras.layers.Dense(512, "relu")(inputs)
    x = tf.keras.layers.Dropout(0.2,)(x)
    x = tf.keras.layers.Dense(512, "relu")(x)
    x = tf.keras.layers.Dropout(0.2,)(x)
    x = tf.keras.layers.Dense(512, "relu")(x)
    outputs = tf.keras.layers.Dense(4, "linear")(x)

    model = tf.keras.Model(inputs, outputs, name="standard_dqn")

    return model


@tf.function
def sample_trajectory(dqn, state, epsilon=0.2):

    n_par = tf.shape(state)[0]

    mask = tf.random.uniform((n_par,), 0, 1, tf.float32) > epsilon

    predictions = dqn(state, training=False)
    max_actions = tf.math.argmax(predictions, axis=-1)

    random_choices = tf.random.uniform(
        shape=[n_par], minval=0, maxval=4, dtype=tf.int64)

    return tf.where(mask, max_actions, random_choices), tf.reduce_max(predictions, -1)


@tf.function
def preprocess_all(observation, next_observation, action, reward, terminated):

    observation = tf.cast(observation, tf.float32)

    next_observation = tf.cast(next_observation, tf.float32)

    action = tf.cast(action, tf.int64)
    reward = tf.cast(reward, tf.float32)
    terminated = tf.cast(terminated, tf.bool)

    return observation, next_observation, action, reward, terminated


@tf.function
def preprocess_obersvation(observation):

    observation = tf.cast(observation, tf.float32)

    return observation


@tf.function
def polyak_averaging(Q_target, Q_dqn, tau):
    """

    Args:
        Q_target (_type_): _description_
        Q_dqn (_type_): _description_
        tau (_type_): _description_
    """

    for old, new in zip(Q_target.trainable_variables, Q_dqn.trainable_variables):
        update = old * (1 - tau) + new * tau
        old.assign(update)


@tf.function
def update_q_network(data, dqn, q_target, optimizer, gamma):

    state, next_state, action, reward, terminated = data

    s_prime_values = q_target(next_state, training=False)
    s_prime_values = tf.reduce_max(s_prime_values, -1)
    
    labels = reward + gamma * s_prime_values * (1 - tf.cast(terminated, tf.float32))

    with tf.GradientTape() as tape:

        predictions = dqn(state, training=True)
        action_values = tf.gather(predictions, action, batch_dims=1)

        loss = LOSS(action_values, labels)

    gradients = tape.gradient(loss, dqn.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dqn.trainable_variables))
    return loss


class ENV_SAMPLER:
    """
    Class for sampling environment data using a DQN model.
    """

    def __init__(self, dqn, n_multi_envs) -> None:
        """
        Initialize the ENV_SAMPLER instance.

        Args:
            env: The environment to sample from.
            dqn: The DQN model for action selection.
            n_multi_envs: The number of parallel environments.
            preprocess_observation: Function to preprocess observations.
        """
        self.env = gym.vector.make("LunarLander-v2", num_envs=n_multi_envs)
        self.current_state = self.env.reset()[0]
        self.dqn = dqn
        self.n_multi_envs = n_multi_envs

    def reset_env(self):
        """
        Reset the environment to the initial state.
        """
        self.current_state = self.env.reset()[0]

    def sample(self, n_samples, epsilon=0.2):
        """
        Sample environment data.

        Args:
            n_samples: The number of samples to generate.
            epsilon: The exploration factor for action selection (default: 0.2).

        Returns:
            samples: List of sampled data tuples (current_state, next_state, action, reward, terminated).
        """
        samples = []

        n_steps = np.ceil(n_samples / self.n_multi_envs).astype(int)

        for _ in range(n_steps):
            oberservation_as_tensor = preprocess_obersvation(
                self.current_state)

            action, q_vals = map(lambda x: x.numpy(), sample_trajectory(
                self.dqn, oberservation_as_tensor, epsilon))

            observation, reward, terminated, truncated, info = self.env.step(
                action)

            for i in range(self.n_multi_envs):
                samples.append((self.current_state[i],
                                observation[i],
                                action[i],
                                reward[i],
                                terminated[i]))

            self.current_state = observation

        return samples[:n_samples]

    def measure_model_perforamnce(self, gamma: float, target_q, n_test_envs = 4):

        test_env = gym.vector.make("LunarLander-v2", num_envs=n_test_envs)

        current_state = test_env.reset()[0]


        rewards = np.zeros(n_test_envs)
        terminated_at = []
        q_values = []
        target_q_values = []

        allready_terminated = np.zeros(n_test_envs, bool)

        steps = 0

        while True:
            

            oberservation_as_tensor = preprocess_obersvation(
                current_state)

            action, q_vals = map(lambda x: x.numpy(), sample_trajectory(
                self.dqn, oberservation_as_tensor, 0.05))

            target_vals = tf.reduce_max(target_q(oberservation_as_tensor), -1)

            observation, reward, terminated, truncated, info = test_env.step(
                action)

            current_state = observation

            rewards += (gamma ** steps) * reward * (1 - allready_terminated)


            allready_terminated = np.logical_or(
                allready_terminated, terminated)

            for index,t in enumerate(terminated):

                if t:
                    terminated_at.append(steps)

            q_values.extend(q_vals.tolist())
            target_q_values.extend(target_vals.numpy().tolist())

            steps += 1

            if allready_terminated.all():

                break

        average_q_val = np.mean(q_values)
        average_target_q_val = np.mean(target_q_values)

        l2_diff = np.array(q_values) - np.array(target_q_values)
        l2_diff = np.sqrt(np.square(l2_diff).mean())

        average_rewards = np.mean(rewards)
        average_termination = np.mean(terminated_at)
        
        test_env.close()

        return average_rewards, average_termination, average_q_val, average_target_q_val, l2_diff


class ReplayBuffer:
    """
    Class for managing a replay buffer for reinforcement learning.
    """

    def __init__(self, ) -> None:
        """
        Initialize the ReplayBuffer instance.

        Args:
            preprocess_func: Function to preprocess examples.
        """
        self.saved_trajectories = []

    def add_new_trajectory(self, trajectory):
        """
        Add a new trajectory to the replay buffer.

        Args:
            trajectory: List of examples representing a trajectory.
        """
        self.saved_trajectories.append(trajectory)

    def drop_first_trajectory(self):
        """
        Remove the oldest trajectory from the replay buffer.
        """
        to_delete = self.saved_trajectories.pop(0)
        del to_delete

    def sample_singe_example(
        self,
    ):
        """
        Sample a single example from the replay buffer.

        Args:
            melt_stop_criteria: Boolean flag indicating whether to consider stop criteria (default: False).

        Returns:
            example: A single example from a randomly chosen trajectory.
        """
        trajectory = random.choice(self.saved_trajectories)
        example = random.choice(trajectory)

        states, next_states, actions, rewards, terminations, = example

        return states, next_states, actions, rewards, terminations

    def sample_n_examples(self, n_examples: int):
        """
        Sample multiple examples from the replay buffer.

        Args:
            n_examples: The number of examples to sample.

        Returns:
            states, next_states, actions, rewards, stop_criteria: Arrays of sampled examples.
        """
        trajectories = [self.sample_singe_example() for _ in range(n_examples)]

        states, next_states, actions, rewards, stop_criteria = map(
            np.array, zip(*trajectories)
        )

        return states, next_states, actions, rewards, stop_criteria

    def generate_tf_dataset(self, n_batches, batchsize):
        """
        Generate a TensorFlow dataset from the replay buffer.

        Args:
            n_batches: The number of batches to generate.
            batchsize: The size of each batch.

        Returns:
            ds: TensorFlow dataset containing the preprocessed examples.
        """
        n_steps = n_batches * batchsize

        ds = self.sample_n_examples(n_steps)
        ds = tf.data.Dataset.from_tensor_slices(ds)
        ds = ds.map(preprocess_all, tf.data.AUTOTUNE)
        ds = ds.batch(batchsize)

        return ds

In [5]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

q_net = get_standard_dqn()

target_net = tf.keras.models.clone_model(q_net)

env_sampler = ENV_SAMPLER(q_net, N_MULTI_ENVS)
replay_buffer = ReplayBuffer()

checkpoint = tf.train.Checkpoint(q_net = q_net, target_net = target_net, optimizer = optimizer)

writer = tf.summary.create_file_writer(TB_LOGS)

# restore
# checkpoint.restore(TMP_SAVE_TO_PATH



2023-06-19 17:14:38.424389: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-19 17:14:38.429327: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-19 17:14:38.429494: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-19 17:14:38.430042: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [6]:
for _ in tqdm.tqdm(range(N_SAMPLE_SETS), desc = "Fill Buffer initally"):

    new_samples = env_sampler.sample(N_NEW_SAMPLES, epsilon = 1)
    replay_buffer.add_new_trajectory(new_samples)

Fill Buffer initally: 100%|██████████| 100/100 [00:05<00:00, 19.95it/s]


In [7]:
outer_steps = 0

env_sampler.reset_env()



while True:
    

    outer_steps +=1
    EXPLORATION_RATE *= EPSILON_DECAY
    
    ds = replay_buffer.generate_tf_dataset(N_BATCHES, BATCHSIZE)
    
    bar = ds # tqdm.tqdm(ds)
    
    losses = []

    for data in bar:

        loss = update_q_network(data, q_net, target_net, optimizer, GAMMA)
        
        
        losses.append(loss)
        mean_loss = float(np.mean(losses))
        # bar.set_description(f"Loss {mean_loss:.6f}")

    mean_loss = float(np.mean(losses))
    
    with writer.as_default():

        tf.summary.scalar("loss", mean_loss, step=outer_steps)
        tf.summary.scalar("epsilon", EXPLORATION_RATE, step=outer_steps)


    new_samples = env_sampler.sample(N_NEW_SAMPLES, epsilon= EXPLORATION_RATE)
    replay_buffer.add_new_trajectory(new_samples)

    replay_buffer.drop_first_trajectory()

    polyak_averaging(target_net, q_net, TAU)
    
    
    if outer_steps % LOG_FREQ == 0:
        
        print("Logging Model Metrics")
        
        results = env_sampler.measure_model_perforamnce( GAMMA, target_net, N_TEST_ENVS)
        names = ["average_rewards", "average_termination", "average_q_val", "average_target_q_val", "l2_diff",]

        for val, name in zip(results, names):

            with writer.as_default():

                tf.summary.scalar(name, val, step=outer_steps)

        results = pd.Series(results, names)
        
        results["average_loss"] = np.mean(losses)
        
        results.to_json(TMP_LOG_PATH.format(outer_steps))
        
        print(results)
        
        
    if outer_steps % CHECKPOINT_FREQ == 0:
        
        print("Saving Checkpoint")
        
        checkpoint.save(TMP_SAVE_TO_PATH)

Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          -5.509539
average_termination     729.380952
average_q_val             0.279820
average_target_q_val      0.105209
l2_diff                   2.135489
average_loss              1.209387
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          -8.006633
average_termination     362.090909
average_q_val             0.510703
average_target_q_val      0.481198
l2_diff                   2.727932
average_loss              1.210004
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          -4.201870
average_termination     446.933333
average_q_val             0.861691
average_target_q_val      0.768567
l2_diff                   2.388226
average_loss              1.230002
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards         -10.041727
average_termination     535.466667
average_q_val             1.390231
average_target_q_val      1.282922
l2_diff                   1.897361
average_loss              1.210114
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           1.733727
average_termination     795.560000
average_q_val             2.073296
average_target_q_val      1.823520
l2_diff                   1.972179
average_loss              1.300634
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards            9.900088
average_termination     1906.565217
average_q_val              2.644400
average_target_q_val       2.541667
l2_diff                    1.189710
average_loss               1.147327
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           11.248574
average_termination     1376.235294
average_q_val              3.212788
average_target_q_val       3.139042
l2_diff                    0.939870
average_loss               0.970838
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          -5.185374
average_termination     686.047619
average_q_val             4.547650
average_target_q_val      4.676023
l2_diff                   0.562815
average_loss              0.924151
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          -3.503479
average_termination     866.562500
average_q_val             5.554807
average_target_q_val      5.273357
l2_diff                   0.707323
average_loss              0.779214
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           -3.779180
average_termination     1212.642857
average_q_val              7.369318
average_target_q_val       7.467192
l2_diff                    0.474111
average_loss               0.596702
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards            2.363268
average_termination     1589.000000
average_q_val              7.671534
average_target_q_val       7.655367
l2_diff                    0.461929
average_loss               0.547388
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards            9.604541
average_termination     1195.250000
average_q_val              6.965874
average_target_q_val       7.095036
l2_diff                    0.388867
average_loss               0.537916
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           2.054318
average_termination     643.117647
average_q_val             8.436354
average_target_q_val      8.467231
l2_diff                   0.472359
average_loss              0.504369
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           4.175084
average_termination     917.941176
average_q_val             8.070183
average_target_q_val      7.908861
l2_diff                   0.470864
average_loss              0.392556
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           32.252818
average_termination     1631.400000
average_q_val             11.047451
average_target_q_val      11.072432
l2_diff                    0.417911
average_loss               0.487034
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          37.252734
average_termination     913.666667
average_q_val            14.208448
average_target_q_val     14.174950
l2_diff                   0.434362
average_loss              0.353886
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           19.293715
average_termination     1179.111111
average_q_val             10.306548
average_target_q_val      10.179141
l2_diff                    0.438067
average_loss               0.331186
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          23.121853
average_termination     857.875000
average_q_val            13.122060
average_target_q_val     13.010412
l2_diff                   0.398680
average_loss              0.266479
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           37.604646
average_termination     1465.105263
average_q_val             12.428048
average_target_q_val      12.280682
l2_diff                    0.485072
average_loss               0.348653
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           39.144342
average_termination     1086.833333
average_q_val             13.548063
average_target_q_val      13.337428
l2_diff                    0.471011
average_loss               0.383810
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          39.906821
average_termination     967.875000
average_q_val            14.503660
average_target_q_val     14.404293
l2_diff                   0.405013
average_loss              0.389847
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          34.106997
average_termination     975.866667
average_q_val            13.940448
average_target_q_val     13.874635
l2_diff                   0.423767
average_loss              0.334109
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           31.616170
average_termination     1200.240000
average_q_val             17.699020
average_target_q_val      17.697432
l2_diff                    0.376134
average_loss               0.425781
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          33.267881
average_termination     493.444444
average_q_val            18.210022
average_target_q_val     18.085460
l2_diff                   0.546760
average_loss              0.362977
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           30.192509
average_termination     1145.600000
average_q_val             15.585725
average_target_q_val      15.457098
l2_diff                    0.455987
average_loss               0.431128
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           29.715247
average_termination     1825.423077
average_q_val             12.933905
average_target_q_val      12.866194
l2_diff                    0.412182
average_loss               0.424015
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards          43.005776
average_termination     854.894737
average_q_val            18.602579
average_target_q_val     18.461083
l2_diff                   0.617927
average_loss              0.452832
dtype: float64
Saving Checkpoint


KeyboardInterrupt: 