In [1]:
import tensorflow as tf
import shutil
import numpy as np
import tqdm
import pandas as pd
import os
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import random

GAMMA = 0.98
TAU = 0.05
BATCHSIZE = 32
N_BATCHES = 100
N_MULTI_ENVS = 8
EXPLORATION_RATE = 0.2
EPSILON_DECAY = 0.999

N_NEW_SAMPLES = 500
N_SAMPLE_SETS = 100

CHECKPOINT_FREQ = 40
LOG_FREQ = 30


In [2]:
TMP_SAVE_TO_PATH = "ckpts/ckpt"
TMP_LOG_PATH = "logs/{}.json"
TB_LOGS = "tb_lobs/run"


os.makedirs(TMP_SAVE_TO_PATH.replace("/ckpt",""), exist_ok= True)
os.makedirs(TMP_LOG_PATH.replace("/{}.json",""), exist_ok= True)
os.makedirs(TB_LOGS, exist_ok= True)

# # get old checkpoint
# !cp /content/gdrive/MyDrive/DeepRL/HW4/checkpoint.zip checkpoint.zip
# !unzip -d {TMP_SAVE_TO_PATH} checkpoint.zip

LOSS = tf.keras.losses.Huber()
CNN_SHAPE = (84, 84)


In [3]:


def triple_conv_block_no_batchnorm(x, filters):

    x = tf.keras.layers.Conv2D(
        filters, 3, padding='same', activation='relu')(x)
    x = tf.keras.layers.Conv2D(
        filters, 3, padding='same', activation='relu')(x) + x
    x = tf.keras.layers.Conv2D(
        filters, 3, padding='same', activation='relu')(x) + x

    return x


def get_standard_dqn():
    tf.keras.backend.clear_session()

    inputs = tf.keras.layers.Input(CNN_SHAPE + (3,))

    x = triple_conv_block_no_batchnorm(inputs, 16)
    x = tf.keras.layers.MaxPool2D(2)(x)
    x = triple_conv_block_no_batchnorm(x, 32)
    x = tf.keras.layers.MaxPool2D(2)(x)
    x = triple_conv_block_no_batchnorm(x, 64)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(512, "relu")(x)
    outputs = tf.keras.layers.Dense(4, "linear")(x)

    model = tf.keras.Model(inputs, outputs, name="standard_dqn")

    return model


def get_small_dqn():
    tf.keras.backend.clear_session()

    inputs = tf.keras.layers.Input(CNN_SHAPE + (3,))
    x = triple_conv_block_no_batchnorm(inputs, 10)
    x = tf.keras.layers.MaxPool2D(2)(x)
    x = triple_conv_block_no_batchnorm(x, 20)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    x = tf.keras.layers.Dense(512, "relu")(x)
    outputs = tf.keras.layers.Dense(4, "linear")(x)

    model = tf.keras.Model(inputs, outputs, name="standard_dqn")

    return model


@tf.function
def sample_trajectory(dqn, state, epsilon=0.2):

    n_par = tf.shape(state)[0]

    mask = tf.random.uniform((n_par,), 0, 1, tf.float32) > epsilon

    predictions = dqn(state, training=False)
    max_actions = tf.math.argmax(predictions, axis=-1)

    random_choices = tf.random.uniform(
        shape=[n_par], minval=0, maxval=4, dtype=tf.int64)

    return tf.where(mask, max_actions, random_choices), tf.reduce_max(predictions, -1)


@tf.function
def preprocess_all(observation, next_observation, action, reward, terminated):

    observation = tf.cast(observation, tf.float32)/255
    observation = tf.image.resize(observation, CNN_SHAPE)

    next_observation = tf.cast(next_observation, tf.float32)/255
    next_observation = tf.image.resize(next_observation, CNN_SHAPE)

    action = tf.cast(action, tf.int64)
    reward = tf.cast(reward, tf.float32)
    terminated = tf.cast(terminated, tf.bool)

    return observation, next_observation, action, reward, terminated


@tf.function
def preprocess_obersvation(observation):

    observation = tf.cast(observation, tf.float32)/255

    return tf.image.resize(observation, CNN_SHAPE)


@tf.function
def polyak_averaging(Q_target, Q_dqn, tau):
    """

    Args:
        Q_target (_type_): _description_
        Q_dqn (_type_): _description_
        tau (_type_): _description_
    """

    for old, new in zip(Q_target.trainable_variables, Q_dqn.trainable_variables):
        update = old * (1 - tau) + new * tau
        old.assign(update)


@tf.function
def update_q_network(data, dqn, q_target, optimizer, gamma):

    state, next_state, action, reward, terminated = data

    s_prime_values = q_target(next_state, training=False)
    s_prime_values = tf.reduce_max(s_prime_values, -1)
    mask = 1 - tf.cast(terminated, tf.float32)

    labels = reward + gamma * mask * s_prime_values

    with tf.GradientTape() as tape:

        predictions = dqn(state, training=True)
        action_values = tf.gather(predictions, action, batch_dims=1)

        loss = LOSS(action_values, labels)

    gradients = tape.gradient(loss, dqn.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dqn.trainable_variables))
    return loss


class ENV_SAMPLER:
    """
    Class for sampling environment data using a DQN model.
    """

    def __init__(self, dqn, n_multi_envs) -> None:
        """
        Initialize the ENV_SAMPLER instance.

        Args:
            env: The environment to sample from.
            dqn: The DQN model for action selection.
            n_multi_envs: The number of parallel environments.
            preprocess_observation: Function to preprocess observations.
        """
        self.env = gym.vector.make('ALE/Breakout-v5', num_envs=n_multi_envs)
        self.current_state = self.env.reset()[0]
        self.dqn = dqn
        self.n_multi_envs = n_multi_envs

    def reset_env(self):
        """
        Reset the environment to the initial state.
        """
        self.current_state = self.env.reset()[0]

    def sample(self, n_samples, epsilon=0.2):
        """
        Sample environment data.

        Args:
            n_samples: The number of samples to generate.
            epsilon: The exploration factor for action selection (default: 0.2).

        Returns:
            samples: List of sampled data tuples (current_state, next_state, action, reward, terminated).
        """
        samples = []

        n_steps = np.ceil(n_samples / self.n_multi_envs).astype(int)

        for _ in range(n_steps):
            oberservation_as_tensor = preprocess_obersvation(
                self.current_state)

            action, q_vals = map(lambda x: x.numpy(), sample_trajectory(
                self.dqn, oberservation_as_tensor, epsilon))

            observation, reward, terminated, truncated, info = self.env.step(
                action)

            for i in range(self.n_multi_envs):
                samples.append((self.current_state[i],
                                observation[i],
                                action[i],
                                reward[i],
                                terminated[i]))

            self.current_state = observation

        return samples[:n_samples]

    def measure_model_perforamnce(self, gamma: float, target_q):

        self.reset_env()

        rewards = np.zeros(self.n_multi_envs)
        terminated_at = []
        q_values = []
        target_q_values = []

        allready_terminated = np.zeros(self.n_multi_envs, np.bool)

        steps = 0

        while True:
            

            oberservation_as_tensor = preprocess_obersvation(
                self.current_state)

            action, q_vals = map(lambda x: x.numpy(), sample_trajectory(
                self.dqn, oberservation_as_tensor, EXPLORATION_RATE))

            target_vals = tf.reduce_max(target_q(oberservation_as_tensor), -1)

            observation, reward, terminated, truncated, info = self.env.step(
                action)

            self.current_state = observation

            rewards += (gamma ** steps) * reward * (1 - allready_terminated)


            allready_terminated = np.logical_or(
                allready_terminated, terminated)

            for t in terminated:

                if t:
                    terminated_at.append(steps)

            q_values.extend(q_vals.tolist())
            target_q_values.extend(target_vals.numpy().tolist())

            steps += 1

            if allready_terminated.all():

                break

        average_q_val = np.mean(q_values)
        average_target_q_val = np.mean(target_q_values)

        l2_diff = np.array(q_values) - np.array(target_q_values)
        l2_diff = np.sqrt(np.square(l2_diff).mean())

        average_rewards = np.mean(rewards)
        average_termination = np.mean(terminated_at)

        return average_rewards, average_termination, average_q_val, average_target_q_val, l2_diff


class ReplayBuffer:
    """
    Class for managing a replay buffer for reinforcement learning.
    """

    def __init__(self, ) -> None:
        """
        Initialize the ReplayBuffer instance.

        Args:
            preprocess_func: Function to preprocess examples.
        """
        self.saved_trajectories = []

    def add_new_trajectory(self, trajectory):
        """
        Add a new trajectory to the replay buffer.

        Args:
            trajectory: List of examples representing a trajectory.
        """
        self.saved_trajectories.append(trajectory)

    def drop_first_trajectory(self):
        """
        Remove the oldest trajectory from the replay buffer.
        """
        to_delete = self.saved_trajectories.pop(0)
        del to_delete

    def sample_singe_example(
        self,
    ):
        """
        Sample a single example from the replay buffer.

        Args:
            melt_stop_criteria: Boolean flag indicating whether to consider stop criteria (default: False).

        Returns:
            example: A single example from a randomly chosen trajectory.
        """
        trajectory = random.choice(self.saved_trajectories)
        example = random.choice(trajectory)

        states, next_states, actions, rewards, terminations, = example

        return states, next_states, actions, rewards, terminations

    def sample_n_examples(self, n_examples: int):
        """
        Sample multiple examples from the replay buffer.

        Args:
            n_examples: The number of examples to sample.

        Returns:
            states, next_states, actions, rewards, stop_criteria: Arrays of sampled examples.
        """
        trajectories = [self.sample_singe_example() for _ in range(n_examples)]

        states, next_states, actions, rewards, stop_criteria = map(
            np.array, zip(*trajectories)
        )

        return states, next_states, actions, rewards, stop_criteria

    def generate_tf_dataset(self, n_batches, batchsize):
        """
        Generate a TensorFlow dataset from the replay buffer.

        Args:
            n_batches: The number of batches to generate.
            batchsize: The size of each batch.

        Returns:
            ds: TensorFlow dataset containing the preprocessed examples.
        """
        n_steps = n_batches * batchsize

        ds = self.sample_n_examples(n_steps)
        ds = tf.data.Dataset.from_tensor_slices(ds)
        ds = ds.map(preprocess_all, tf.data.AUTOTUNE)
        ds = ds.batch(batchsize)

        return ds

In [4]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

q_net = get_standard_dqn()

target_net = tf.keras.models.clone_model(q_net)

env_sampler = ENV_SAMPLER(q_net, N_MULTI_ENVS)
replay_buffer = ReplayBuffer()

checkpoint = tf.train.Checkpoint(q_net = q_net, target_net = target_net)

writer = tf.summary.create_file_writer(TB_LOGS)

# restore
# checkpoint.restore(TMP_SAVE_TO_PATH)

for _ in tqdm.tqdm(range(N_SAMPLE_SETS), desc = "Fill Buffer initally"):

    new_samples = env_sampler.sample(N_NEW_SAMPLES, epsilon = 1)
    replay_buffer.add_new_trajectory(new_samples)

2023-06-14 12:33:21.856434: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 12:33:21.861007: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 12:33:21.861171: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-14 12:33:21.861446: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [5]:
outer_steps = 0

env_sampler.reset_env()



while True:
    

    outer_steps +=1
    EXPLORATION_RATE *= EPSILON_DECAY
    
    ds = replay_buffer.generate_tf_dataset(N_BATCHES, BATCHSIZE)
    
    bar = ds # tqdm.tqdm(ds)
    
    losses = []

    for data in bar:

        loss = update_q_network(data, q_net, target_net, optimizer, GAMMA)
        
        
        losses.append(loss)
        mean_loss = float(np.mean(losses))
        # bar.set_description(f"Loss {mean_loss:.6f}")

    mean_loss = float(np.mean(losses))
    
    with writer.as_default():

        tf.summary.scalar("loss", mean_loss, step=outer_steps)
        tf.summary.scalar("epsilon", EXPLORATION_RATE, step=outer_steps)


    new_samples = env_sampler.sample(N_NEW_SAMPLES, epsilon= EXPLORATION_RATE)
    replay_buffer.add_new_trajectory(new_samples)

    replay_buffer.drop_first_trajectory()

    polyak_averaging(target_net, q_net, TAU)
    
    
    if outer_steps % LOG_FREQ == 0:
        
        print("Logging Model Metrics")
        
        results = env_sampler.measure_model_perforamnce( GAMMA, target_net)
        names = ["average_rewards", "average_termination", "average_q_val", "average_target_q_val", "l2_diff",]

        for val, name in zip(results, names):

            with writer.as_default():

                tf.summary.scalar(name, val, step=outer_steps)

        results = pd.Series(results, names)
        
        results["average_loss"] = np.mean(losses)
        
        results.to_json(TMP_LOG_PATH.format(outer_steps))
        
        print(results)
        
        
    if outer_steps % CHECKPOINT_FREQ == 0:
        
        print("Saving Checkpoint")
        
        checkpoint.save(TMP_SAVE_TO_PATH)

Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.194513
average_termination     172.000000
average_q_val             0.023442
average_target_q_val      0.010534
l2_diff                   0.012907
average_loss              0.003888
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.414502
average_termination     340.750000
average_q_val             0.028808
average_target_q_val      0.019719
l2_diff                   0.081521
average_loss              0.000336
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.163704
average_termination     163.750000
average_q_val             0.037773
average_target_q_val      0.026023
l2_diff                   0.090902
average_loss              0.002826
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.018561
average_termination     237.500000
average_q_val             0.040974
average_target_q_val      0.030144
l2_diff                   0.033221
average_loss              0.000997
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.245406
average_termination     210.916667
average_q_val             0.050226
average_target_q_val      0.044991
l2_diff                   0.034891
average_loss              0.001967
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.516666
average_termination     336.250000
average_q_val             0.104417
average_target_q_val      0.076443
l2_diff                   0.103081
average_loss              0.001483
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.123424
average_termination     273.000000
average_q_val             0.044767
average_target_q_val      0.037927
l2_diff                   0.040845
average_loss              0.001759
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.077794
average_termination     140.875000
average_q_val             0.060137
average_target_q_val      0.045890
l2_diff                   0.031729
average_loss              0.000954
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.065959
average_termination     143.750000
average_q_val             0.067474
average_target_q_val      0.055186
l2_diff                   0.047318
average_loss              0.001211
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.064485
average_termination     225.500000
average_q_val             0.036293
average_target_q_val      0.032417
l2_diff                   0.021680
average_loss              0.001357
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.089352
average_termination     147.125000
average_q_val             0.059297
average_target_q_val      0.053459
l2_diff                   0.035351
average_loss              0.001442
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.017974
average_termination     292.375000
average_q_val             0.035632
average_target_q_val      0.023619
l2_diff                   0.018496
average_loss              0.001916
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.495233
average_termination     215.625000
average_q_val             0.104473
average_target_q_val      0.095992
l2_diff                   0.023791
average_loss              0.002015
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.104187
average_termination     366.125000
average_q_val             0.093313
average_target_q_val      0.078430
l2_diff                   0.033365
average_loss              0.001655
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.017863
average_termination     275.625000
average_q_val             0.049643
average_target_q_val      0.039693
l2_diff                   0.025047
average_loss              0.002146
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.050805
average_termination     324.000000
average_q_val             0.072946
average_target_q_val      0.058825
l2_diff                   0.025929
average_loss              0.001950
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.202602
average_termination     161.000000
average_q_val             0.132635
average_target_q_val      0.111504
l2_diff                   0.034660
average_loss              0.001735
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.082017
average_termination     382.625000
average_q_val             0.059273
average_target_q_val      0.063946
l2_diff                   0.020368
average_loss              0.001175
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.045750
average_termination     375.250000
average_q_val             0.124022
average_target_q_val      0.118900
l2_diff                   0.057027
average_loss              0.001469
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.127605
average_termination     458.125000
average_q_val             0.105444
average_target_q_val      0.101717
l2_diff                   0.018828
average_loss              0.001416
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.194575
average_termination     468.625000
average_q_val             0.192715
average_target_q_val      0.171993
l2_diff                   0.095272
average_loss              0.001077
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.253008
average_termination     415.750000
average_q_val             0.190762
average_target_q_val      0.190345
l2_diff                   0.078210
average_loss              0.001652
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.306092
average_termination     458.555556
average_q_val             0.211341
average_target_q_val      0.200859
l2_diff                   0.089288
average_loss              0.001939
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.102206
average_termination     132.125000
average_q_val             0.069873
average_target_q_val      0.053961
l2_diff                   0.026289
average_loss              0.002856
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.042198
average_termination     386.500000
average_q_val             0.110447
average_target_q_val      0.106788
l2_diff                   0.039887
average_loss              0.001941
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.255379
average_termination     540.700000
average_q_val             0.175535
average_target_q_val      0.163654
l2_diff                   0.063246
average_loss              0.001863
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.156464
average_termination     626.615385
average_q_val             0.197385
average_target_q_val      0.163922
l2_diff                   0.063871
average_loss              0.001877
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.305908
average_termination     469.666667
average_q_val             0.184139
average_target_q_val      0.164265
l2_diff                   0.059807
average_loss              0.002551
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.348841
average_termination     465.750000
average_q_val             0.171453
average_target_q_val      0.150964
l2_diff                   0.047030
average_loss              0.002707
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.096522
average_termination     442.875000
average_q_val             0.023846
average_target_q_val      0.040578
l2_diff                   0.033535
average_loss              0.002852
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.465249
average_termination     398.444444
average_q_val             0.088486
average_target_q_val      0.079059
l2_diff                   0.045933
average_loss              0.003304
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.472520
average_termination     254.444444
average_q_val             0.180086
average_target_q_val      0.178246
l2_diff                   0.055523
average_loss              0.005053
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.696552
average_termination     300.375000
average_q_val             0.234284
average_target_q_val      0.205795
l2_diff                   0.066101
average_loss              0.003655
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.629697
average_termination     267.750000
average_q_val             0.241504
average_target_q_val      0.240395
l2_diff                   0.047799
average_loss              0.004293
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.570928
average_termination     235.250000
average_q_val             0.250506
average_target_q_val      0.233463
l2_diff                   0.038806
average_loss              0.004218
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.253008
average_termination     489.125000
average_q_val             0.169249
average_target_q_val      0.135676
l2_diff                   0.065214
average_loss              0.004321
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.571102
average_termination     243.818182
average_q_val             0.297499
average_target_q_val      0.259324
l2_diff                   0.054289
average_loss              0.005819
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.627602
average_termination     236.375000
average_q_val             0.315523
average_target_q_val      0.300393
l2_diff                   0.052066
average_loss              0.004712
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.442381
average_termination     176.250000
average_q_val             0.243394
average_target_q_val      0.250859
l2_diff                   0.045539
average_loss              0.003783
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.536062
average_termination     209.000000
average_q_val             0.324785
average_target_q_val      0.313894
l2_diff                   0.029191
average_loss              0.002258
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.228170
average_termination     190.875000
average_q_val             0.358659
average_target_q_val      0.350796
l2_diff                   0.045296
average_loss              0.004130
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.668769
average_termination     211.125000
average_q_val             0.348625
average_target_q_val      0.350955
l2_diff                   0.036146
average_loss              0.004013
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.410745
average_termination     188.250000
average_q_val             0.329389
average_target_q_val      0.328010
l2_diff                   0.038879
average_loss              0.004321
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.528416
average_termination     251.777778
average_q_val             0.358637
average_target_q_val      0.375001
l2_diff                   0.040519
average_loss              0.004130
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.504900
average_termination     283.700000
average_q_val             0.338412
average_target_q_val      0.306818
l2_diff                   0.042500
average_loss              0.002612
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.601218
average_termination     465.227273
average_q_val             0.359061
average_target_q_val      0.337312
l2_diff                   0.042430
average_loss              0.003728
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.077659
average_termination     145.500000
average_q_val             0.323982
average_target_q_val      0.334605
l2_diff                   0.035150
average_loss              0.004867
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.334772
average_termination     365.947368
average_q_val             0.286413
average_target_q_val      0.284280
l2_diff                   0.030565
average_loss              0.002956
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.173281
average_termination     167.111111
average_q_val             0.369708
average_target_q_val      0.355221
l2_diff                   0.028306
average_loss              0.002710
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.260306
average_termination     457.750000
average_q_val             0.316229
average_target_q_val      0.301167
l2_diff                   0.028091
average_loss              0.002439
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.055750
average_termination     318.700000
average_q_val             0.278758
average_target_q_val      0.284159
l2_diff                   0.027208
average_loss              0.002031
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.206565
average_termination     198.111111
average_q_val             0.336496
average_target_q_val      0.343974
l2_diff                   0.042647
average_loss              0.002353
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.305740
average_termination     656.928571
average_q_val             0.291693
average_target_q_val      0.310725
l2_diff                   0.044196
average_loss              0.001997
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.094638
average_termination     148.000000
average_q_val             0.327990
average_target_q_val      0.323147
l2_diff                   0.033198
average_loss              0.001644
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.515802
average_termination     200.250000
average_q_val             0.391468
average_target_q_val      0.377440
l2_diff                   0.025495
average_loss              0.001599
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.605172
average_termination     389.307692
average_q_val             0.328068
average_target_q_val      0.319106
l2_diff                   0.026597
average_loss              0.002157
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.586175
average_termination     192.125000
average_q_val             0.452969
average_target_q_val      0.438902
l2_diff                   0.035178
average_loss              0.003223
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.596350
average_termination     260.750000
average_q_val             0.356879
average_target_q_val      0.360678
l2_diff                   0.038399
average_loss              0.001713
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.536515
average_termination     213.750000
average_q_val             0.444276
average_target_q_val      0.407238
l2_diff                   0.058321
average_loss              0.002685
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.532352
average_termination     199.375000
average_q_val             0.435540
average_target_q_val      0.421407
l2_diff                   0.025216
average_loss              0.002420
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.523919
average_termination     215.250000
average_q_val             0.405889
average_target_q_val      0.388221
l2_diff                   0.031677
average_loss              0.004707
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.648571
average_termination     303.466667
average_q_val             0.389411
average_target_q_val      0.372440
l2_diff                   0.028692
average_loss              0.002916
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.169515
average_termination     590.909091
average_q_val             0.318066
average_target_q_val      0.300359
l2_diff                   0.038799
average_loss              0.004878
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.747047
average_termination     247.375000
average_q_val             0.376929
average_target_q_val      0.353941
l2_diff                   0.050366
average_loss              0.003393
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.553624
average_termination     449.444444
average_q_val             0.408769
average_target_q_val      0.418833
l2_diff                   0.029669
average_loss              0.003027
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.640518
average_termination     352.083333
average_q_val             0.395678
average_target_q_val      0.384874
l2_diff                   0.040864
average_loss              0.003325
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.455283
average_termination     634.500000
average_q_val             0.336874
average_target_q_val      0.341603
l2_diff                   0.034499
average_loss              0.001902
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.280403
average_termination     182.875000
average_q_val             0.374138
average_target_q_val      0.363443
l2_diff                   0.023922
average_loss              0.002742
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.241459
average_termination     639.666667
average_q_val             0.340354
average_target_q_val      0.341469
l2_diff                   0.034504
average_loss              0.001960
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.093812
average_termination     727.625000
average_q_val             0.367540
average_target_q_val      0.347305
l2_diff                   0.028934
average_loss              0.003528
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards            0.086492
average_termination     1169.300000
average_q_val              0.296105
average_target_q_val       0.295310
l2_diff                    0.045883
average_loss               0.001587
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.681347
average_termination     352.818182
average_q_val             0.415076
average_target_q_val      0.391392
l2_diff                   0.047477
average_loss              0.003276
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards            0.102934
average_termination     1536.312500
average_q_val              0.298075
average_target_q_val       0.302245
l2_diff                    0.020882
average_loss               0.002865
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.592953
average_termination     212.250000
average_q_val             0.428247
average_target_q_val      0.417261
l2_diff                   0.021872
average_loss              0.003920
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.178878
average_termination     924.000000
average_q_val             0.278208
average_target_q_val      0.276448
l2_diff                   0.020667
average_loss              0.002997
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.639456
average_termination     227.125000
average_q_val             0.453802
average_target_q_val      0.448048
l2_diff                   0.023235
average_loss              0.002425
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.484843
average_termination     609.750000
average_q_val             0.435127
average_target_q_val      0.418568
l2_diff                   0.030425
average_loss              0.002233
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.636157
average_termination     618.451613
average_q_val             0.418146
average_target_q_val      0.404216
l2_diff                   0.031434
average_loss              0.002655
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.638830
average_termination     238.000000
average_q_val             0.490335
average_target_q_val      0.484725
l2_diff                   0.022168
average_loss              0.006803
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.615475
average_termination     211.250000
average_q_val             0.497231
average_target_q_val      0.492159
l2_diff                   0.021358
average_loss              0.004805
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.649196
average_termination     322.500000
average_q_val             0.458144
average_target_q_val      0.445005
l2_diff                   0.026449
average_loss              0.003407
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.598234
average_termination     529.285714
average_q_val             0.332917
average_target_q_val      0.328546
l2_diff                   0.019561
average_loss              0.002791
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.605753
average_termination     539.500000
average_q_val             0.445772
average_target_q_val      0.450623
l2_diff                   0.028402
average_loss              0.001854
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.764184
average_termination     856.250000
average_q_val             0.380219
average_target_q_val      0.378684
l2_diff                   0.019917
average_loss              0.004570
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.579511
average_termination     232.875000
average_q_val             0.482449
average_target_q_val      0.477145
l2_diff                   0.022695
average_loss              0.002903
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.737930
average_termination     885.600000
average_q_val             0.349802
average_target_q_val      0.344751
l2_diff                   0.022630
average_loss              0.002466
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.754885
average_termination     242.000000
average_q_val             0.514891
average_target_q_val      0.480696
l2_diff                   0.041155
average_loss              0.002833
dtype: float64
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.741042
average_termination     435.466667
average_q_val             0.553810
average_target_q_val      0.550214
l2_diff                   0.025178
average_loss              0.002077
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.613689
average_termination     207.125000
average_q_val             0.558106
average_target_q_val      0.543784
l2_diff                   0.021840
average_loss              0.003634
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.742208
average_termination     538.888889
average_q_val             0.461655
average_target_q_val      0.461676
l2_diff                   0.030208
average_loss              0.002383
dtype: float64
Saving Checkpoint
Logging Model Metrics


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  allready_terminated = np.zeros(self.n_multi_envs, np.bool)


average_rewards           0.562692
average_termination     268.900000
average_q_val             0.487788
average_target_q_val      0.477911
l2_diff                   0.028137
average_loss              0.005207
dtype: float64
