In [1]:
import tensorflow as tf
import shutil
import numpy as np
import tqdm
import pandas as pd
import os
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import random

GAMMA = 0.99
TAU = 0.01
BATCHSIZE = 128
N_BATCHES = 64
N_MULTI_ENVS = 64
EXPLORATION_RATE = 0.75
EPSILON_DECAY = 0.998

N_NEW_SAMPLES = 500
N_SAMPLE_SETS = 100

CHECKPOINT_FREQ = 40
LOG_FREQ = 30
N_TEST_ENVS = 8

In [2]:
TMP_SAVE_TO_PATH = "ckpts/ckpt"
TMP_LOG_PATH = "logs/{}.json"
TB_LOGS = "tb_lobs/run"


os.makedirs(TMP_SAVE_TO_PATH.replace("/ckpt",""), exist_ok= True)
os.makedirs(TMP_LOG_PATH.replace("/{}.json",""), exist_ok= True)
os.makedirs(TB_LOGS, exist_ok= True)

# # get old checkpoint
# !cp /content/gdrive/MyDrive/DeepRL/HW4/checkpoint.zip checkpoint.zip
# !unzip -d {TMP_SAVE_TO_PATH} checkpoint.zip

LOSS = tf.keras.losses.Huber()
CNN_SHAPE = (84, 84)


In [3]:


def triple_conv_block_no_batchnorm(x, filters):

    x = tf.keras.layers.Conv2D(
        filters, 3, padding='same', activation='relu')(x)
    x = tf.keras.layers.Conv2D(
        filters, 3, padding='same', activation='relu')(x) + x
    x = tf.keras.layers.Conv2D(
        filters, 3, padding='same', activation='relu')(x) + x

    return x


def get_standard_dqn():
    tf.keras.backend.clear_session()

    inputs = tf.keras.layers.Input(CNN_SHAPE + (3,))

    x = triple_conv_block_no_batchnorm(inputs, 16)
    x = tf.keras.layers.MaxPool2D(2)(x)
    x = triple_conv_block_no_batchnorm(x, 32)
    x = tf.keras.layers.MaxPool2D(2)(x)
    x = triple_conv_block_no_batchnorm(x, 64)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(512, "relu")(x)
    outputs = tf.keras.layers.Dense(4, "linear")(x)

    model = tf.keras.Model(inputs, outputs, name="standard_dqn")

    return model


def get_small_dqn():
    tf.keras.backend.clear_session()

    inputs = tf.keras.layers.Input(CNN_SHAPE + (3,))
    x = triple_conv_block_no_batchnorm(inputs, 10)
    x = tf.keras.layers.MaxPool2D(2)(x)
    x = triple_conv_block_no_batchnorm(x, 20)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(512, "relu")(x)
    outputs = tf.keras.layers.Dense(4, "linear")(x)

    model = tf.keras.Model(inputs, outputs, name="standard_dqn")

    return model


@tf.function
def sample_trajectory(dqn, state, epsilon=0.2):

    n_par = tf.shape(state)[0]

    mask = tf.random.uniform((n_par,), 0, 1, tf.float32) > epsilon

    predictions = dqn(state, training=False)
    max_actions = tf.math.argmax(predictions, axis=-1)

    random_choices = tf.random.uniform(
        shape=[n_par], minval=0, maxval=4, dtype=tf.int64)

    return tf.where(mask, max_actions, random_choices), tf.reduce_max(predictions, -1)


@tf.function
def preprocess_all(observation, next_observation, action, reward, terminated):

    observation = tf.cast(observation, tf.float32)/255
    observation = tf.image.resize(observation, CNN_SHAPE)

    next_observation = tf.cast(next_observation, tf.float32)/255
    next_observation = tf.image.resize(next_observation, CNN_SHAPE)

    action = tf.cast(action, tf.int64)
    reward = tf.cast(reward, tf.float32)
    terminated = tf.cast(terminated, tf.bool)

    return observation, next_observation, action, reward, terminated


@tf.function
def preprocess_obersvation(observation):

    observation = tf.cast(observation, tf.float32)/255

    return tf.image.resize(observation, CNN_SHAPE)


@tf.function
def polyak_averaging(Q_target, Q_dqn, tau):
    """

    Args:
        Q_target (_type_): _description_
        Q_dqn (_type_): _description_
        tau (_type_): _description_
    """

    for old, new in zip(Q_target.trainable_variables, Q_dqn.trainable_variables):
        update = old * (1 - tau) + new * tau
        old.assign(update)


@tf.function
def update_q_network(data, dqn, q_target, optimizer, gamma):

    state, next_state, action, reward, terminated = data

    s_prime_values = q_target(next_state, training=False)
    s_prime_values = tf.reduce_max(s_prime_values, -1)
    
    punishment = tf.ones_like(reward) * -1
    labels = reward + gamma * s_prime_values
    
    labels = tf.where(terminated, punishment, labels)

    with tf.GradientTape() as tape:

        predictions = dqn(state, training=True)
        action_values = tf.gather(predictions, action, batch_dims=1)

        loss = LOSS(action_values, labels)

    gradients = tape.gradient(loss, dqn.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dqn.trainable_variables))
    return loss


class ENV_SAMPLER:
    """
    Class for sampling environment data using a DQN model.
    """

    def __init__(self, dqn, n_multi_envs) -> None:
        """
        Initialize the ENV_SAMPLER instance.

        Args:
            env: The environment to sample from.
            dqn: The DQN model for action selection.
            n_multi_envs: The number of parallel environments.
            preprocess_observation: Function to preprocess observations.
        """
        self.env = gym.vector.make('ALE/Breakout-v5', num_envs=n_multi_envs)
        self.current_state = self.env.reset()[0]
        self.dqn = dqn
        self.n_multi_envs = n_multi_envs

    def reset_env(self):
        """
        Reset the environment to the initial state.
        """
        self.current_state = self.env.reset()[0]

    def sample(self, n_samples, epsilon=0.2):
        """
        Sample environment data.

        Args:
            n_samples: The number of samples to generate.
            epsilon: The exploration factor for action selection (default: 0.2).

        Returns:
            samples: List of sampled data tuples (current_state, next_state, action, reward, terminated).
        """
        samples = []

        n_steps = np.ceil(n_samples / self.n_multi_envs).astype(int)

        for _ in range(n_steps):
            oberservation_as_tensor = preprocess_obersvation(
                self.current_state)

            action, q_vals = map(lambda x: x.numpy(), sample_trajectory(
                self.dqn, oberservation_as_tensor, epsilon))

            observation, reward, terminated, truncated, info = self.env.step(
                action)

            for i in range(self.n_multi_envs):
                samples.append((self.current_state[i],
                                observation[i],
                                action[i],
                                reward[i],
                                terminated[i]))

            self.current_state = observation

        return samples[:n_samples]

    def measure_model_perforamnce(self, gamma: float, target_q, n_test_envs = 4):

        test_env = gym.vector.make('ALE/Breakout-v5', num_envs=n_test_envs)

        current_state = test_env.reset()[0]


        rewards = np.zeros(n_test_envs)
        terminated_at = []
        q_values = []
        target_q_values = []

        allready_terminated = np.zeros(n_test_envs, np.bool)

        steps = 0

        while True:
            

            oberservation_as_tensor = preprocess_obersvation(
                current_state)

            action, q_vals = map(lambda x: x.numpy(), sample_trajectory(
                self.dqn, oberservation_as_tensor, 0.05))

            target_vals = tf.reduce_max(target_q(oberservation_as_tensor), -1)

            observation, reward, terminated, truncated, info = test_env.step(
                action)

            current_state = observation

            rewards += (gamma ** steps) * reward * (1 - allready_terminated)


            allready_terminated = np.logical_or(
                allready_terminated, terminated)

            for index,t in enumerate(terminated):

                if t:
                    terminated_at.append(steps)

            q_values.extend(q_vals.tolist())
            target_q_values.extend(target_vals.numpy().tolist())

            steps += 1

            if allready_terminated.all():

                break

        average_q_val = np.mean(q_values)
        average_target_q_val = np.mean(target_q_values)

        l2_diff = np.array(q_values) - np.array(target_q_values)
        l2_diff = np.sqrt(np.square(l2_diff).mean())

        average_rewards = np.mean(rewards)
        average_termination = np.mean(terminated_at)
        
        test_env.close()

        return average_rewards, average_termination, average_q_val, average_target_q_val, l2_diff


class ReplayBuffer:
    """
    Class for managing a replay buffer for reinforcement learning.
    """

    def __init__(self, ) -> None:
        """
        Initialize the ReplayBuffer instance.

        Args:
            preprocess_func: Function to preprocess examples.
        """
        self.saved_trajectories = []

    def add_new_trajectory(self, trajectory):
        """
        Add a new trajectory to the replay buffer.

        Args:
            trajectory: List of examples representing a trajectory.
        """
        self.saved_trajectories.append(trajectory)

    def drop_first_trajectory(self):
        """
        Remove the oldest trajectory from the replay buffer.
        """
        to_delete = self.saved_trajectories.pop(0)
        del to_delete

    def sample_singe_example(
        self,
    ):
        """
        Sample a single example from the replay buffer.

        Args:
            melt_stop_criteria: Boolean flag indicating whether to consider stop criteria (default: False).

        Returns:
            example: A single example from a randomly chosen trajectory.
        """
        trajectory = random.choice(self.saved_trajectories)
        example = random.choice(trajectory)

        states, next_states, actions, rewards, terminations, = example

        return states, next_states, actions, rewards, terminations

    def sample_n_examples(self, n_examples: int):
        """
        Sample multiple examples from the replay buffer.

        Args:
            n_examples: The number of examples to sample.

        Returns:
            states, next_states, actions, rewards, stop_criteria: Arrays of sampled examples.
        """
        trajectories = [self.sample_singe_example() for _ in range(n_examples)]

        states, next_states, actions, rewards, stop_criteria = map(
            np.array, zip(*trajectories)
        )

        return states, next_states, actions, rewards, stop_criteria

    def generate_tf_dataset(self, n_batches, batchsize):
        """
        Generate a TensorFlow dataset from the replay buffer.

        Args:
            n_batches: The number of batches to generate.
            batchsize: The size of each batch.

        Returns:
            ds: TensorFlow dataset containing the preprocessed examples.
        """
        n_steps = n_batches * batchsize

        ds = self.sample_n_examples(n_steps)
        ds = tf.data.Dataset.from_tensor_slices(ds)
        ds = ds.map(preprocess_all, tf.data.AUTOTUNE)
        ds = ds.batch(batchsize)

        return ds

In [4]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

q_net = get_standard_dqn()

target_net = tf.keras.models.clone_model(q_net)

env_sampler = ENV_SAMPLER(q_net, N_MULTI_ENVS)
replay_buffer = ReplayBuffer()

checkpoint = tf.train.Checkpoint(q_net = q_net, target_net = target_net, optimizer = optimizer)

writer = tf.summary.create_file_writer(TB_LOGS)

# restore
# checkpoint.restore(TMP_SAVE_TO_PATH



2023-06-14 18:43:51.886423: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 18:43:51.893571: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 18:43:51.893792: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 18:43:51.894853: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [5]:
for _ in tqdm.tqdm(range(N_SAMPLE_SETS), desc = "Fill Buffer initally"):

    new_samples = env_sampler.sample(N_NEW_SAMPLES, epsilon = 1)
    replay_buffer.add_new_trajectory(new_samples)

Fill Buffer initally:   0%|          | 0/100 [00:00<?, ?it/s]2023-06-14 18:43:55.672811: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8100
2023-06-14 18:43:55.873037: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-06-14 18:43:55.873545: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-06-14 18:43:55.873566: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2023-06-14 18:43:55.874164: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-06-14 18:43:55.874212: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
Fi

In [6]:
outer_steps = 0

env_sampler.reset_env()



while True:
    

    outer_steps +=1
    EXPLORATION_RATE *= EPSILON_DECAY
    
    ds = replay_buffer.generate_tf_dataset(N_BATCHES, BATCHSIZE)
    
    bar = ds # tqdm.tqdm(ds)
    
    losses = []

    for data in bar:

        loss = update_q_network(data, q_net, target_net, optimizer, GAMMA)
        
        
        losses.append(loss)
        mean_loss = float(np.mean(losses))
        # bar.set_description(f"Loss {mean_loss:.6f}")

    mean_loss = float(np.mean(losses))
    
    with writer.as_default():

        tf.summary.scalar("loss", mean_loss, step=outer_steps)
        tf.summary.scalar("epsilon", EXPLORATION_RATE, step=outer_steps)


    new_samples = env_sampler.sample(N_NEW_SAMPLES, epsilon= EXPLORATION_RATE)
    replay_buffer.add_new_trajectory(new_samples)

    replay_buffer.drop_first_trajectory()

    polyak_averaging(target_net, q_net, TAU)
    
    
    if outer_steps % LOG_FREQ == 0:
        
        print("Logging Model Metrics")
        
        results = env_sampler.measure_model_perforamnce( GAMMA, target_net, N_TEST_ENVS)
        names = ["average_rewards", "average_termination", "average_q_val", "average_target_q_val", "l2_diff",]

        for val, name in zip(results, names):

            with writer.as_default():

                tf.summary.scalar(name, val, step=outer_steps)

        results = pd.Series(results, names)
        
        results["average_loss"] = np.mean(losses)
        
        results.to_json(TMP_LOG_PATH.format(outer_steps))
        
        print(results)
        
        
    if outer_steps % CHECKPOINT_FREQ == 0:
        
        print("Saving Checkpoint")
        
        checkpoint.save(TMP_SAVE_TO_PATH)

Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.253721
average_termination     141.500000
average_q_val             0.000774
average_target_q_val     -0.004742
l2_diff                   0.005517
average_loss              0.005713
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.195423
average_termination     583.666667
average_q_val             0.004131
average_target_q_val     -0.000913
l2_diff                   0.005043
average_loss              0.006462
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.351764
average_termination     160.250000
average_q_val             0.008559
average_target_q_val      0.001305
l2_diff                   0.007901
average_loss              0.005534
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.052525
average_termination     638.444444
average_q_val             0.016661
average_target_q_val      0.005800
l2_diff                   0.042027
average_loss              0.006093
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.485514
average_termination     601.375000
average_q_val             0.007984
average_target_q_val      0.009195
l2_diff                   0.062194
average_loss              0.002812
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.107003
average_termination     641.888889
average_q_val             0.006558
average_target_q_val      0.008148
l2_diff                   0.058013
average_loss              0.002725
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.332156
average_termination     536.375000
average_q_val            -0.000657
average_target_q_val      0.005374
l2_diff                   0.055771
average_loss              0.002870
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.190350
average_termination     511.333333
average_q_val             0.011376
average_target_q_val      0.005473
l2_diff                   0.062075
average_loss              0.002580
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.405670
average_termination     607.875000
average_q_val             0.009722
average_target_q_val      0.003610
l2_diff                   0.060603
average_loss              0.002228
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.135898
average_termination     589.300000
average_q_val            -0.009861
average_target_q_val     -0.002375
l2_diff                   0.050241
average_loss              0.001430
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.255425
average_termination     744.900000
average_q_val            -0.004450
average_target_q_val     -0.001276
l2_diff                   0.042399
average_loss              0.001633
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.064668
average_termination     690.333333
average_q_val            -0.010389
average_target_q_val     -0.006658
l2_diff                   0.032445
average_loss              0.002119
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.196148
average_termination     581.000000
average_q_val             0.005292
average_target_q_val      0.002223
l2_diff                   0.034593
average_loss              0.001827
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.471721
average_termination     346.750000
average_q_val             0.014248
average_target_q_val      0.010201
l2_diff                   0.047051
average_loss              0.001586
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.287925
average_termination     506.111111
average_q_val             0.004255
average_target_q_val      0.003542
l2_diff                   0.035024
average_loss              0.002324
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.889654
average_termination     196.000000
average_q_val             0.027793
average_target_q_val      0.016033
l2_diff                   0.056971
average_loss              0.001771
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.406913
average_termination     810.200000
average_q_val             0.001150
average_target_q_val     -0.000452
l2_diff                   0.033197
average_loss              0.002047
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.217609
average_termination     143.750000
average_q_val             0.011894
average_target_q_val      0.004687
l2_diff                   0.038790
average_loss              0.001738
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.920557
average_termination     208.375000
average_q_val             0.035200
average_target_q_val      0.023797
l2_diff                   0.057971
average_loss              0.001511
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.880860
average_termination     208.125000
average_q_val             0.037417
average_target_q_val      0.027688
l2_diff                   0.061922
average_loss              0.002321
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.291852
average_termination     502.000000
average_q_val             0.028076
average_target_q_val      0.020383
l2_diff                   0.042854
average_loss              0.001950
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.514936
average_termination     319.600000
average_q_val             0.031809
average_target_q_val      0.023262
l2_diff                   0.048263
average_loss              0.001964
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.304071
average_termination     149.125000
average_q_val             0.015367
average_target_q_val      0.016044
l2_diff                   0.049994
average_loss              0.002011
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.106524
average_termination     584.428571
average_q_val             0.018883
average_target_q_val      0.015216
l2_diff                   0.031544
average_loss              0.001777
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.879791
average_termination     202.000000
average_q_val             0.043686
average_target_q_val      0.033004
l2_diff                   0.058708
average_loss              0.001437
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.210834
average_termination     489.600000
average_q_val             0.035335
average_target_q_val      0.022807
l2_diff                   0.041349
average_loss              0.001328
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.302827
average_termination     585.428571
average_q_val             0.024851
average_target_q_val      0.020797
l2_diff                   0.030113
average_loss              0.001217
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.079064
average_termination     432.375000
average_q_val             0.023869
average_target_q_val      0.023750
l2_diff                   0.037354
average_loss              0.001256
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.241643
average_termination     353.888889
average_q_val             0.025695
average_target_q_val      0.019529
l2_diff                   0.034893
average_loss              0.001455
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.327072
average_termination     246.500000
average_q_val             0.034317
average_target_q_val      0.029612
l2_diff                   0.049801
average_loss              0.001134
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.360841
average_termination     404.000000
average_q_val             0.045025
average_target_q_val      0.032731
l2_diff                   0.052356
average_loss              0.001317
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.466059
average_termination     414.444444
average_q_val             0.048022
average_target_q_val      0.038944
l2_diff                   0.049281
average_loss              0.001167
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.095864
average_termination     512.500000
average_q_val             0.021795
average_target_q_val      0.019677
l2_diff                   0.028728
average_loss              0.001069
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.064185
average_termination     475.800000
average_q_val             0.038483
average_target_q_val      0.026478
l2_diff                   0.036815
average_loss              0.001180
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.449005
average_termination     453.250000
average_q_val             0.062097
average_target_q_val      0.044647
l2_diff                   0.052271
average_loss              0.001201
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.479425
average_termination     167.875000
average_q_val             0.038517
average_target_q_val      0.029620
l2_diff                   0.055210
average_loss              0.001197
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.986758
average_termination     220.375000
average_q_val             0.080006
average_target_q_val      0.067056
l2_diff                   0.062639
average_loss              0.001429
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.620202
average_termination     353.875000
average_q_val             0.054817
average_target_q_val      0.048707
l2_diff                   0.054238
average_loss              0.001174
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.839475
average_termination     198.750000
average_q_val             0.073313
average_target_q_val      0.062222
l2_diff                   0.060331
average_loss              0.001423
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.645600
average_termination     423.777778
average_q_val             0.067400
average_target_q_val      0.055147
l2_diff                   0.052091
average_loss              0.001042
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.624190
average_termination     196.111111
average_q_val             0.045313
average_target_q_val      0.044904
l2_diff                   0.045064
average_loss              0.000854
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.619890
average_termination     382.818182
average_q_val             0.023415
average_target_q_val      0.017703
l2_diff                   0.055702
average_loss              0.001005
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.584046
average_termination     175.250000
average_q_val             0.059027
average_target_q_val      0.046014
l2_diff                   0.055134
average_loss              0.000998
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.933040
average_termination     237.125000
average_q_val             0.124154
average_target_q_val      0.101742
l2_diff                   0.077748
average_loss              0.000937
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.444806
average_termination     441.500000
average_q_val             0.055307
average_target_q_val      0.047859
l2_diff                   0.046199
average_loss              0.001163
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.365564
average_termination     487.615385
average_q_val             0.076050
average_target_q_val      0.066637
l2_diff                   0.041351
average_loss              0.001170
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.883980
average_termination     221.125000
average_q_val             0.106000
average_target_q_val      0.099223
l2_diff                   0.058986
average_loss              0.001400
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.059449
average_termination     291.125000
average_q_val             0.016843
average_target_q_val      0.009707
l2_diff                   0.024148
average_loss              0.001427
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.098921
average_termination     276.250000
average_q_val             0.023925
average_target_q_val      0.018705
l2_diff                   0.026958
average_loss              0.001940
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.326796
average_termination     332.900000
average_q_val             0.064094
average_target_q_val      0.061996
l2_diff                   0.034881
average_loss              0.001478
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.604045
average_termination     294.000000
average_q_val             0.078870
average_target_q_val      0.062145
l2_diff                   0.058846
average_loss              0.001523
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.276503
average_termination     264.111111
average_q_val             0.052975
average_target_q_val      0.051759
l2_diff                   0.058301
average_loss              0.001439
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.164501
average_termination     478.200000
average_q_val             0.052058
average_target_q_val      0.042644
l2_diff                   0.033155
average_loss              0.001365
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.132390
average_termination     425.700000
average_q_val             0.056384
average_target_q_val      0.053453
l2_diff                   0.040304
average_loss              0.001454
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.841014
average_termination     291.375000
average_q_val             0.137861
average_target_q_val      0.123067
l2_diff                   0.063557
average_loss              0.001868
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.327428
average_termination     357.250000
average_q_val             0.066258
average_target_q_val      0.063099
l2_diff                   0.041048
average_loss              0.001992
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.908618
average_termination     210.125000
average_q_val             0.120701
average_target_q_val      0.112470
l2_diff                   0.060699
average_loss              0.001216
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.254490
average_termination     377.200000
average_q_val             0.060782
average_target_q_val      0.063719
l2_diff                   0.039048
average_loss              0.001233
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.732201
average_termination     273.500000
average_q_val             0.129236
average_target_q_val      0.117304
l2_diff                   0.055458
average_loss              0.001633
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.624593
average_termination     271.125000
average_q_val             0.101516
average_target_q_val      0.095469
l2_diff                   0.050083
average_loss              0.001812
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.686652
average_termination     273.375000
average_q_val             0.106131
average_target_q_val      0.096838
l2_diff                   0.049092
average_loss              0.001712
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.611095
average_termination     322.444444
average_q_val             0.129521
average_target_q_val      0.117567
l2_diff                   0.055674
average_loss              0.001769
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.274433
average_termination     482.000000
average_q_val             0.081893
average_target_q_val      0.075254
l2_diff                   0.044482
average_loss              0.001456
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.145982
average_termination     365.750000
average_q_val             0.085304
average_target_q_val      0.072378
l2_diff                   0.036552
average_loss              0.001779
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.320757
average_termination     415.000000
average_q_val             0.118281
average_target_q_val      0.101377
l2_diff                   0.046856
average_loss              0.001845
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.951364
average_termination     281.125000
average_q_val             0.185945
average_target_q_val      0.163917
l2_diff                   0.054658
average_loss              0.001067
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.359483
average_termination     437.444444
average_q_val             0.063397
average_target_q_val      0.067214
l2_diff                   0.041967
average_loss              0.001583
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.230986
average_termination     443.500000
average_q_val             0.088718
average_target_q_val      0.083093
l2_diff                   0.042856
average_loss              0.000711
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.657643
average_termination     358.250000
average_q_val             0.173073
average_target_q_val      0.153376
l2_diff                   0.064213
average_loss              0.000842
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.346544
average_termination     540.777778
average_q_val             0.134805
average_target_q_val      0.124811
l2_diff                   0.051826
average_loss              0.000799
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.695815
average_termination     502.727273
average_q_val             0.155535
average_target_q_val      0.139180
l2_diff                   0.070928
average_loss              0.000568
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.508430
average_termination     473.250000
average_q_val             0.140892
average_target_q_val      0.125771
l2_diff                   0.058201
average_loss              0.000577
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.605956
average_termination     506.200000
average_q_val             0.137676
average_target_q_val      0.127255
l2_diff                   0.053940
average_loss              0.000348
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.313424
average_termination     554.111111
average_q_val             0.128905
average_target_q_val      0.118838
l2_diff                   0.055889
average_loss              0.000816
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.544910
average_termination     411.250000
average_q_val             0.144377
average_target_q_val      0.130175
l2_diff                   0.055467
average_loss              0.000536
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.468945
average_termination     534.250000
average_q_val             0.134476
average_target_q_val      0.126211
l2_diff                   0.044254
average_loss              0.000505
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.385799
average_termination     534.875000
average_q_val             0.141458
average_target_q_val      0.139092
l2_diff                   0.057592
average_loss              0.000626
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.385050
average_termination     453.100000
average_q_val             0.117778
average_target_q_val      0.106581
l2_diff                   0.049608
average_loss              0.000494
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.099592
average_termination     707.000000
average_q_val             0.107440
average_target_q_val      0.099717
l2_diff                   0.041585
average_loss              0.000267
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.335256
average_termination     419.500000
average_q_val             0.185568
average_target_q_val      0.168927
l2_diff                   0.057162
average_loss              0.000314
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.623886
average_termination     614.700000
average_q_val             0.168537
average_target_q_val      0.161016
l2_diff                   0.055673
average_loss              0.000447
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.784931
average_termination     482.375000
average_q_val             0.175275
average_target_q_val      0.168238
l2_diff                   0.070901
average_loss              0.000488
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.531407
average_termination     512.875000
average_q_val             0.157935
average_target_q_val      0.151906
l2_diff                   0.056376
average_loss              0.000286
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.122069
average_termination     603.818182
average_q_val             0.133333
average_target_q_val      0.125407
l2_diff                   0.055890
average_loss              0.000497
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.681534
average_termination     383.375000
average_q_val             0.151264
average_target_q_val      0.145470
l2_diff                   0.056005
average_loss              0.000340
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.755777
average_termination     402.000000
average_q_val             0.196021
average_target_q_val      0.184924
l2_diff                   0.053046
average_loss              0.000327
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.526393
average_termination     534.375000
average_q_val             0.162909
average_target_q_val      0.157996
l2_diff                   0.047367
average_loss              0.000297
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.588630
average_termination     498.111111
average_q_val             0.157261
average_target_q_val      0.148688
l2_diff                   0.063119
average_loss              0.000458
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(n_test_envs, np.bool)


average_rewards           0.538445
average_termination     639.333333
average_q_val             0.149408
average_target_q_val      0.134036
l2_diff                   0.058777
average_loss              0.000550
dtype: float64
