In [1]:
import gym
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import matplotlib.pyplot as plt
import time
from gym import spaces

  import imp
  'nearest': pil_image.NEAREST,
  'bilinear': pil_image.BILINEAR,
  'bicubic': pil_image.BICUBIC,
  'hamming': pil_image.HAMMING,
  'box': pil_image.BOX,
  'lanczos': pil_image.LANCZOS,


In [2]:
# https://www.gymlibrary.ml/environments/box2d/lunar_lander/

In [3]:
# Using OU Noise
class OUActionNoise:
    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape)
        )
        self.x_prev = x
        return x

    def reset(self):
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)

In [4]:
def get_actor(num_states, num_actions, upper_bound, continuous=True, layer1=400, layer2=300):
    # Initialize weights between -3e-3 and 3-e3
    last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(layer1, activation="relu")(inputs)
    out = layers.Dense(layer2, activation="relu")(out)
    
    # Different output activation based on discrete or continous version
    if continuous:
        outputs = layers.Dense(num_actions, activation="tanh", kernel_initializer=last_init)(out)
    else:
        outputs = layers.Dense(num_actions, activation="softmax", kernel_initializer=last_init)(out)

    # Multiply to fill the whole action space which should be equal around 0
    outputs = outputs * upper_bound
    model = tf.keras.Model(inputs, outputs)
    return model

def get_critic(num_states, num_actions, layer1=400, layer2=300):
    # State as input
    state_input = layers.Input(shape=(num_states))
    state_out = layers.Dense(16, activation="relu")(state_input)
    state_out = layers.Dense(32, activation="relu")(state_out)

    # Action as input
    action_input = layers.Input(shape=(num_actions))
    action_out = layers.Dense(32, activation="relu")(action_input)

    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(layer1, activation="relu")(concat)
    out = layers.Dense(layer2, activation="relu")(out)

    outputs = layers.Dense(num_actions)(out)

    # Make it into a keras model
    model = tf.keras.Model([state_input, action_input], outputs)

    return model

# This updates the weights in a slow manner which keeps stability
@tf.function
def update_target(target_weights, weights, tau):
    for (a, b) in zip(target_weights, weights):
        a.assign(b * tau + a * (1 - tau))

In [5]:
class Agent:
    def __init__(self, num_states, num_actions, lower_bound, upper_bound, continuous=True,
            buffer_capacity=50000, batch_size=64, std_dev=0.2, critic_lr=0.002,
            actor_lr=0.001, gamma=0.99, tau=0.005):
        
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        
        # For methods
        self.lower_bound = lower_bound
        self.upper_bound = upper_bound
        self.continuous = continuous

        # This is used to make sure we only sample from used buffer space
        self.buffer_counter = 0

        self.state_buffer = np.zeros((self.buffer_capacity, num_states))
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions))
        self.reward_buffer = np.zeros((self.buffer_capacity, 1))
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states))
        
        self.std_dev = std_dev
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.gamma = gamma
        self.tau = tau
        
        self.actor_model = get_actor(num_states, num_actions, upper_bound, continuous=continuous, layer1=400, layer2=300)
        self.critic_model = get_critic(num_states, num_actions, layer1=400, layer2=300)

        self.target_actor = get_actor(num_states, num_actions, upper_bound, continuous=continuous, layer1=400, layer2=300)
        self.target_critic = get_critic(num_states, num_actions, layer1=400, layer2=300)
        
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr,beta_1=0.9,beta_2=0.999,epsilon=1e-07)
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr,beta_1=0.9,beta_2=0.999,epsilon=1e-07)
        
        # Making the weights equal initially
        self.target_actor.set_weights(self.actor_model.get_weights())
        self.target_critic.set_weights(self.critic_model.get_weights())
        
        self.ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1))
    
    # Makes a record of the outputted (s,a,r,s') obervation tuple
    def record(self, obs_tuple):
        # Reuse the same buffer replacing old entries
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]

        self.buffer_counter += 1
    
    # Move the update and learn function from buffer to Agent to "decrease" scope
    @tf.function
    def update(self, state_batch, action_batch, reward_batch, next_state_batch,):
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(next_state_batch, training=True)
            y = reward_batch + self.gamma * self.target_critic(
                [next_state_batch, target_actions], training=True
            )
            critic_value = self.critic_model([state_batch, action_batch], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value))

        critic_grad = tape.gradient(critic_loss, self.critic_model.trainable_variables)
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic_model.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions = self.actor_model(state_batch, training=True)
            critic_value = self.critic_model([state_batch, actions], training=True)

            actor_loss = -tf.math.reduce_mean(critic_value)

        actor_grad = tape.gradient(actor_loss, self.actor_model.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_model.trainable_variables)
        )

    # We compute the loss and update parameters
    def learn(self):
        # Sample only valid data
        record_range = min(self.buffer_counter, self.buffer_capacity)
        # Randomly sample indices
        batch_indices = np.random.choice(record_range, self.batch_size)

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])

        self.update(state_batch, action_batch, reward_batch, next_state_batch)
        
    def policy(self, state, noise_object=0, use_noise=True, noise_mult=1):
        # Default noise_object to 0 for when it is not needed
        # For doing actions without added noise
        if not use_noise:     
            sampled_actions = tf.squeeze(self.actor_model(state)).numpy()
            legal_action = np.clip(sampled_actions, self.lower_bound, self.upper_bound)

            return [np.squeeze(legal_action)]
        else:
            sampled_actions = tf.squeeze(self.actor_model(state))
            noise = noise_object()
            # Adding noise to action
            sampled_actions = sampled_actions.numpy() + noise * noise_mult

            # We make sure action is within bounds
            legal_action = np.clip(sampled_actions, self.lower_bound, self.upper_bound)

            return [np.squeeze(legal_action)]

In [6]:
def fixed(x, episode):
    return x

In [7]:
def run(total_trials=3, total_episodes=100, 
            buffer_capacity=50000, batch_size=64, std_dev=0.2, critic_lr=0.002, render=False,
            actor_lr=0.001, gamma=0.99, tau=0.005, noise_mult=1, save_weights=False, 
            directory='Weights/', actor_name='actor', critic_name='critic',
            gamma_func=fixed, tau_func=fixed, critic_lr_func=fixed, actor_lr_func=fixed,
            noise_mult_func=fixed, std_dev_func=fixed, mean_number=40, output=True,
            return_rewards=False, total_time=True, use_guide=False, solved=200,
            continuous=True, environment='LunarLander-v2', seed=1453, start_steps=0,
            gravity=-10.0, enable_wind=False, wind_power=15.0, turbulence_power=1.5):
    tot_time = time.time()
    
    if environment == 'LunarLander-v2':
        env = gym.make(
            "LunarLander-v2",
            continuous=continuous,
            gravity=gravity,
            enable_wind=enable_wind,
            wind_power=wind_power,
            turbulence_power=turbulence_power
        )
    else:
        env = gym.make(environment)
        
    # Apply the seed
    _ = env.reset(seed=seed)
    
    # Stepcount for random start
    step = 0
        
    # This is needed to get the input size for the NN
    num_states = env.observation_space.low.shape[0]
    num_actions = env.action_space.shape[0]

    # Normalize action space according to https://stable-baselines3.readthedocs.io/en/master/guide/rl_tips.html
    action_space = spaces.Box(low=-1, high=1, shape=(num_actions,), dtype='float32')

    # This is needed to clip the actions within the legal boundaries
    upper_bound = action_space.high[0]
    lower_bound = action_space.low[0]
    
    # To store reward history of each episode
    ep_reward_list = []
    # To store average reward history of last few episodes
    avg_reward_list = []
    # To separate assisted reward structures from the "true"
    true_reward_list = []
    true_avg_reward_list = []
    
    for trial in range(total_trials):

        # add sublists for each trial
        avg_reward_list.append([])
        ep_reward_list.append([])
        
        true_reward_list.append([])
        true_avg_reward_list.append([])
        
        agent = Agent(num_states=num_states, num_actions=num_actions, lower_bound=lower_bound, 
                upper_bound=upper_bound, continuous=continuous, buffer_capacity=buffer_capacity, 
                batch_size=batch_size, std_dev=std_dev, critic_lr=critic_lr, actor_lr=actor_lr, 
                gamma=gamma, tau=tau)

        for ep in range(total_episodes):
            # functions for different parameters
            agent.gamma = gamma_func(gamma, ep)
            agent.tau = tau_func(tau, ep)
            agent.critic_lr = critic_lr_func(critic_lr, ep)
            agent.actor_lr = actor_lr_func(actor_lr, ep)
            agent.noise_mult = noise_mult_func(noise_mult, ep)
            agent.std_dev = std_dev_func(std_dev, ep)
            
            # Used for time benchmarking
            before = time.time()

            prev_state = env.reset()
            episodic_reward = 0
            true_reward = 0

            while True:
                if render:
                    env.render()
                
                tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

                if step >= start_steps:
                    action = agent.policy(state=tf_prev_state, noise_object=agent.ou_noise, noise_mult=noise_mult)
                    # To get the right format
                    action = action[0]
                else:
                    action = env.action_space.sample()
                
                step += 1
                
                # Recieve state and reward from environment.
                state, reward, done, info = env.step(action)
                
                # Add this before eventual reward modification
                true_reward += reward
                
                # Reward modification
                if use_guide:
                    # giving penalty for straying far from flags and having high speed
                    # x max
#                     reward -= int(abs(state[0]) > 0.15) * 2 * abs(state[0])
#                     # y top
#                     reward -= int(state[1] > 1) * state[1] / 2
#                     # horizontal speed
#                     reward -= int(abs(state[2]) > 1) * abs(state[2])
#                     # down speed
#                     reward -= int(state[3] <  -1) * abs(state[3])
#                     # up speed
#                     reward -= int(state[3] > 0.1) * 3 * state[3]
                    reward -= abs(state[2]/2) + abs(state[3]) + (abs(state[0])) + (abs(state[1])/2)

                agent.record((prev_state, action, reward, state))
                episodic_reward += reward

                agent.learn()
                update_target(agent.target_actor.variables, agent.actor_model.variables, agent.tau)
                update_target(agent.target_critic.variables, agent.critic_model.variables, agent.tau)

                # End this episode if en episode is done
                if done:
                    break

                prev_state = state

            ep_reward_list[trial].append(episodic_reward)
            
            true_reward_list[trial].append(true_reward)
            
            true_avg_reward = np.mean(true_reward_list[trial][-mean_number:])
            true_avg_reward_list[trial].append(true_avg_reward)

            # Mean of last x episodes
            avg_reward = np.mean(ep_reward_list[trial][-mean_number:])
            if output:
                print("Ep {} * AvgReward {:.2f} * true AvgReward {:.2f} * Reward {:.2f} * True Reward {:.2f} * time {:.2f} * step {}"
                  .format(ep, avg_reward, true_avg_reward, episodic_reward, true_reward, (time.time() - before), step))
            avg_reward_list[trial].append(avg_reward)
            
            # stop if avg is solved
            if true_avg_reward >= solved:
                break

        if save_weights:
            agent.actor_model.save_weights(directory + actor_name + '-trial' + str(trial) + '.h5')
            agent.critic_model.save_weights(directory + critic_name + '-trial' + str(trial) + '.h5')
    
    # Plotting graph
    for idx, p in enumerate(true_avg_reward_list):
        plt.plot(p, label=str(idx))
    plt.xlabel("Episode")
    plt.ylabel("True Avg. Epsiodic Reward (" + str(mean_number) + ")")
    plt.legend()
    plt.show()
    
    print('total time:',time.time() - tot_time, 's')
    
    # Return to be able to make graphs etc. later, or use the data for other stuff
    if return_rewards:
        return true_reward_list

In [8]:
def test(total_episodes=10, actor_weights='Weights/actor-trial0.h5', render=False,
        environment="LunarLander-v2", continuous=False, gravity=-10.0, enable_wind=False,
        wind_power=15.0, turbulence_power=1.5, seed=1453):
    rewards = []
    
    env = gym.make(
        environment,
        continuous=continuous,
        gravity=gravity,
        enable_wind=enable_wind,
        wind_power=wind_power,
        turbulence_power=turbulence_power
    )
    
    # Apply the seed
    _ = env.reset(seed=seed)
    
    for ep in range(total_episodes):
        ep_reward = 0
        
        # Used for time benchmarking
        before = time.time()
        
        prev_state = env.reset()
        agent = Agent(buffer_capacity=0, batch_size=0, std_dev=0, 
                critic_lr=0, actor_lr=0, gamma=0, tau=0)
        agent.actor_model.load_weights(actor_weights)
        
        while True:
            if render:
                env.render()

            tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

            action = agent.policy(state=tf_prev_state, use_noise=False)
            action = action[0]
            state, reward, done, _ = env.step(action)
            
            print(state)
            
            ep_reward += reward

            if done:
                print(str(time.time() - before) + 's')
                rewards.append(ep_reward)
                break

            prev_state = state
            
    plt.plot(rewards)
    plt.xlabel("Episode")
    plt.ylabel("True reward")
    plt.show()

In [9]:
def random(total_episodes=10, render=False, environment="LunarLander-v2",continuous=False,
        gravity=-10.0, enable_wind=False, wind_power=15.0, turbulence_power=1.5, seed=1453):
    rewards = []
    
    env = gym.make(
        environment,
        continuous=continuous,
        gravity=gravity,
        enable_wind=enable_wind,
        wind_power=wind_power,
        turbulence_power=turbulence_power,
    )
    
    # Apply the seed
    _ = env.reset(seed=seed)
    
    for ep in range(total_episodes):
        ep_reward = 0
        
        # Used for time benchmarking
        before = time.time()
        
        prev_state = env.reset()
        
        while True:
            if render:
                env.render()
            action = env.action_space.sample()
            state, reward, done, _ = env.step(action)
            ep_reward += reward

            if done:
                print(str(time.time() - before) + 's')
                rewards.append(ep_reward)
                break

            prev_state = state
            
    plt.plot(rewards)
    plt.xlabel("Episode")
    plt.ylabel("True reward")
    plt.show()

---
# Runs and tests
---

In [10]:
def a(x, episode):
    if (episode % 2 == 1):
        return 0.2
    else:
        return 0.5

In [11]:
def b(x, episode):
    if episode < 600:
        return 0.015

In [12]:
#run(total_trials=3, total_episodes=500, save_weights=True)

In [13]:
#run(total_trials=2, total_episodes=1000, buffer_capacity=250000, std_dev_func=a, save_weights=True)

In [14]:
#test(render=True, total_episodes=1)

In [None]:
run(total_trials=2, total_episodes=500, start_steps=10000, save_weights=True)

  logger.warn(


Ep 0 * AvgReward -409.58 * true AvgReward -409.58 * Reward -409.58 * True Reward -409.58 * time 1.80 * step 180
Ep 1 * AvgReward -368.12 * true AvgReward -368.12 * Reward -326.66 * True Reward -326.66 * time 0.64 * step 293
Ep 2 * AvgReward -288.98 * true AvgReward -288.98 * Reward -130.71 * True Reward -130.71 * time 1.32 * step 407
Ep 3 * AvgReward -237.29 * true AvgReward -237.29 * Reward -82.22 * True Reward -82.22 * time 0.78 * step 476
Ep 4 * AvgReward -217.98 * true AvgReward -217.98 * Reward -140.73 * True Reward -140.73 * time 2.06 * step 656
Ep 5 * AvgReward -211.46 * true AvgReward -211.46 * Reward -178.86 * True Reward -178.86 * time 0.78 * step 727
Ep 6 * AvgReward -192.86 * true AvgReward -192.86 * Reward -81.28 * True Reward -81.28 * time 0.92 * step 804
Ep 7 * AvgReward -193.06 * true AvgReward -193.06 * Reward -194.46 * True Reward -194.46 * time 0.99 * step 890
Ep 8 * AvgReward -187.38 * true AvgReward -187.38 * Reward -141.90 * True Reward -141.90 * time 1.47 * step 

Ep 72 * AvgReward -962.62 * true AvgReward -962.62 * Reward -827.39 * True Reward -827.39 * time 0.93 * step 5000
Ep 73 * AvgReward -980.07 * true AvgReward -980.07 * Reward -794.43 * True Reward -794.43 * time 1.05 * step 5000
Ep 74 * AvgReward -991.64 * true AvgReward -991.64 * Reward -797.17 * True Reward -797.17 * time 0.98 * step 5000
Ep 75 * AvgReward -1000.53 * true AvgReward -1000.53 * Reward -752.55 * True Reward -752.55 * time 0.83 * step 5000
Ep 76 * AvgReward -1028.60 * true AvgReward -1028.60 * Reward -1347.48 * True Reward -1347.48 * time 1.62 * step 5000
Ep 77 * AvgReward -1043.33 * true AvgReward -1043.33 * Reward -758.29 * True Reward -758.29 * time 0.85 * step 5000
Ep 78 * AvgReward -1056.60 * true AvgReward -1056.60 * Reward -794.49 * True Reward -794.49 * time 0.95 * step 5000
Ep 79 * AvgReward -1071.08 * true AvgReward -1071.08 * Reward -796.47 * True Reward -796.47 * time 1.01 * step 5000
Ep 80 * AvgReward -1090.16 * true AvgReward -1090.16 * Reward -1881.76 * Tru

Ep 144 * AvgReward -364.31 * true AvgReward -364.31 * Reward -630.58 * True Reward -630.58 * time 1.14 * step 5000
Ep 145 * AvgReward -374.66 * true AvgReward -374.66 * Reward -641.78 * True Reward -641.78 * time 1.08 * step 5000
Ep 146 * AvgReward -383.74 * true AvgReward -383.74 * Reward -609.31 * True Reward -609.31 * time 0.90 * step 5000
Ep 147 * AvgReward -393.87 * true AvgReward -393.87 * Reward -625.69 * True Reward -625.69 * time 1.22 * step 5000
Ep 148 * AvgReward -401.63 * true AvgReward -401.63 * Reward -559.85 * True Reward -559.85 * time 1.06 * step 5000
Ep 149 * AvgReward -407.17 * true AvgReward -407.17 * Reward -428.65 * True Reward -428.65 * time 1.07 * step 5000
Ep 150 * AvgReward -408.89 * true AvgReward -408.89 * Reward -39.25 * True Reward -39.25 * time 1.13 * step 5000
Ep 151 * AvgReward -414.59 * true AvgReward -414.59 * Reward -465.05 * True Reward -465.05 * time 1.16 * step 5000
Ep 152 * AvgReward -425.45 * true AvgReward -425.45 * Reward -768.28 * True Reward

Ep 216 * AvgReward -551.43 * true AvgReward -551.43 * Reward -508.74 * True Reward -508.74 * time 1.17 * step 5000
Ep 217 * AvgReward -552.31 * true AvgReward -552.31 * Reward -712.83 * True Reward -712.83 * time 0.96 * step 5000
Ep 218 * AvgReward -549.79 * true AvgReward -549.79 * Reward -475.31 * True Reward -475.31 * time 1.36 * step 5000
Ep 219 * AvgReward -546.78 * true AvgReward -546.78 * Reward -637.18 * True Reward -637.18 * time 0.97 * step 5000
Ep 220 * AvgReward -544.83 * true AvgReward -544.83 * Reward -467.39 * True Reward -467.39 * time 1.26 * step 5000
Ep 221 * AvgReward -550.05 * true AvgReward -550.05 * Reward -639.08 * True Reward -639.08 * time 1.14 * step 5000
Ep 222 * AvgReward -553.16 * true AvgReward -553.16 * Reward -787.70 * True Reward -787.70 * time 1.10 * step 5000
Ep 223 * AvgReward -548.31 * true AvgReward -548.31 * Reward -751.99 * True Reward -751.99 * time 1.21 * step 5000
Ep 224 * AvgReward -554.31 * true AvgReward -554.31 * Reward -731.11 * True Rewa

Ep 287 * AvgReward -968.10 * true AvgReward -968.10 * Reward -791.01 * True Reward -791.01 * time 0.88 * step 5000
Ep 288 * AvgReward -962.40 * true AvgReward -962.40 * Reward -808.53 * True Reward -808.53 * time 1.25 * step 5000
Ep 289 * AvgReward -959.04 * true AvgReward -959.04 * Reward -977.43 * True Reward -977.43 * time 1.23 * step 5000
Ep 290 * AvgReward -963.21 * true AvgReward -963.21 * Reward -897.48 * True Reward -897.48 * time 1.11 * step 5000
Ep 291 * AvgReward -972.95 * true AvgReward -972.95 * Reward -1188.78 * True Reward -1188.78 * time 1.38 * step 5000
Ep 292 * AvgReward -985.30 * true AvgReward -985.30 * Reward -1422.90 * True Reward -1422.90 * time 1.69 * step 5000
Ep 293 * AvgReward -990.12 * true AvgReward -990.12 * Reward -1103.33 * True Reward -1103.33 * time 1.25 * step 5000
Ep 294 * AvgReward -988.56 * true AvgReward -988.56 * Reward -1040.65 * True Reward -1040.65 * time 1.36 * step 5000
Ep 295 * AvgReward -997.47 * true AvgReward -997.47 * Reward -953.48 * T

Ep 358 * AvgReward -941.12 * true AvgReward -941.12 * Reward -1283.19 * True Reward -1283.19 * time 1.52 * step 5000
Ep 359 * AvgReward -958.18 * true AvgReward -958.18 * Reward -1693.22 * True Reward -1693.22 * time 2.11 * step 5000
Ep 360 * AvgReward -955.78 * true AvgReward -955.78 * Reward -902.00 * True Reward -902.00 * time 1.15 * step 5000
Ep 361 * AvgReward -935.94 * true AvgReward -935.94 * Reward -697.68 * True Reward -697.68 * time 0.87 * step 5000
Ep 362 * AvgReward -942.83 * true AvgReward -942.83 * Reward -973.66 * True Reward -973.66 * time 1.46 * step 5000
Ep 363 * AvgReward -943.53 * true AvgReward -943.53 * Reward -747.82 * True Reward -747.82 * time 1.01 * step 5000
Ep 364 * AvgReward -946.70 * true AvgReward -946.70 * Reward -1045.61 * True Reward -1045.61 * time 1.37 * step 5000
Ep 365 * AvgReward -955.24 * true AvgReward -955.24 * Reward -1084.74 * True Reward -1084.74 * time 1.34 * step 5000
Ep 366 * AvgReward -962.84 * true AvgReward -962.84 * Reward -1215.77 * 

Ep 29 * AvgReward -290.79 * true AvgReward -290.79 * Reward -36.32 * True Reward -36.32 * time 1.92 * step 5000
Ep 30 * AvgReward -291.24 * true AvgReward -291.24 * Reward -304.64 * True Reward -304.64 * time 1.73 * step 5000
Ep 31 * AvgReward -288.95 * true AvgReward -288.95 * Reward -217.96 * True Reward -217.96 * time 2.59 * step 5000
Ep 32 * AvgReward -292.21 * true AvgReward -292.21 * Reward -396.51 * True Reward -396.51 * time 1.47 * step 5000
Ep 33 * AvgReward -291.07 * true AvgReward -291.07 * Reward -253.46 * True Reward -253.46 * time 4.45 * step 5000
Ep 34 * AvgReward -305.32 * true AvgReward -305.32 * Reward -789.71 * True Reward -789.71 * time 3.37 * step 5000
Ep 35 * AvgReward -308.51 * true AvgReward -308.51 * Reward -420.42 * True Reward -420.42 * time 1.20 * step 5000
Ep 36 * AvgReward -316.44 * true AvgReward -316.44 * Reward -601.61 * True Reward -601.61 * time 1.10 * step 5000
