In [None]:
import tensorflow as tf      # Deep Learning library
import numpy as np           # Handle matrices
from vizdoom import *        # Doom Environment

import random                # Handling random number generation
import time                  # Handling time calculation
from skimage import transform# Help us to preprocess the frames

from environment_creation import create_environment
from frame_preprocessing import preprocess_frame
from memory import Memory
from dddq_neural_network import DDDQNNet

from collections import deque# Ordered collection with ends
import matplotlib.pyplot as plt # Display graphs

import warnings # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore')

In [2]:
stack_size = 4  # Number of consecutive frames stacked into a single state

# Start from a deque of blank (all-zero) 100x120 frames, one per stack slot.
# The builtin `int` is used because the `np.int` alias was removed in NumPy 1.24
# (it was only ever an alias for `int`), and `maxlen` follows `stack_size` so the
# deque length and the configured stack size can never drift apart.
stacked_frames = deque(
    [np.zeros((100, 120), dtype=int) for _ in range(stack_size)],
    maxlen=stack_size,
)

In [3]:
# Build the ViZDoom game object and the list of action vectors the agent can
# choose from (the recorded output below shows 7 one-hot vectors of length 7).
game, possible_actions = create_environment()

(<vizdoom.vizdoom.DoomGame at 0x1f9364a1b20>,
 [[1, 0, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0],
  [0, 0, 1, 0, 0, 0, 0],
  [0, 0, 0, 1, 0, 0, 0],
  [0, 0, 0, 0, 1, 0, 0],
  [0, 0, 0, 0, 0, 1, 0],
  [0, 0, 0, 0, 0, 0, 1]])

In [4]:
def stack_frames(stacked_frames, state, is_new_episode, stack_size):
    """Preprocess a raw frame and fold it into the rolling stack of recent frames.

    Args:
        stacked_frames: deque of previously preprocessed frames. When
            ``is_new_episode`` is false it is mutated in place (the new frame is
            appended and, if the deque is bounded, the oldest frame drops out).
            When ``is_new_episode`` is true it is ignored and a fresh deque is
            returned instead.
        state: raw frame; passed unchanged to ``preprocess_frame``.
        is_new_episode: if true, the stack is rebuilt from ``stack_size`` copies
            of the preprocessed frame, since there is no history yet.
        stack_size: number of frames held in the stack.

    Returns:
        A ``(stacked_state, stacked_frames)`` tuple: the frames joined along a
        new last axis (``axis=2``) and the deque that now holds them.
    """
    frame = preprocess_frame(state)

    if is_new_episode:
        # No history yet: fill every slot with the first frame. Building the
        # deque directly avoids the throw-away zero buffer (and its removed
        # `np.int` dtype) and honours `stack_size` instead of a hard-coded 4.
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # Appending to a full bounded deque discards the oldest frame.
        stacked_frames.append(frame)

    # Frames become the channels of the state (last axis).
    stacked_state = np.stack(stacked_frames, axis=2)

    return stacked_state, stacked_frames

In [5]:
### MODEL HYPERPARAMETERS
state_size = [100,120,4]      # Network input shape: 4 stacked 100x120 frames (stack_frames joins them on the last axis).
                              # NOTE(review): the old comment said "(Width, height, channels)"; NumPy order is
                              # usually (rows, cols, channels) = (height, width, channels) - confirm against preprocess_frame.
action_size = game.get_available_buttons_size()              # 7 possible actions
learning_rate =  0.00025      # Alpha (aka learning rate)

### TRAINING HYPERPARAMETERS
total_episodes = 5000         # Total episodes for training
max_steps = 5000              # Max possible steps in an episode
batch_size = 64               # Number of experiences sampled from memory per training step

# FIXED Q TARGETS HYPERPARAMETERS
max_tau = 10000 # Number of training steps between copies of the DQNetwork weights into the TargetNetwork (the "C" step)

# EXPLORATION HYPERPARAMETERS for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability
decay_rate = 0.00005            # exponential decay rate for exploration prob (applied per decay_step)

# Q LEARNING hyperparameters
gamma = 0.95               # Discounting rate

### MEMORY HYPERPARAMETERS
## If you have GPU change to 1million
pretrain_length = 10000   # Number of random-action experiences stored in the Memory before training starts
memory_size = 10000       # Number of experiences the Memory can keep

### MODIFY THIS TO FALSE IF YOU JUST WANT TO SEE THE TRAINED AGENT
training = True

## TURN THIS TO TRUE IF YOU WANT TO RENDER THE ENVIRONMENT
# NOTE(review): this flag is not read by any of the cells visible here.
episode_render = True

In [7]:
# Start from an empty default graph so re-running this cell does not pile up
# duplicate variables.
tf.reset_default_graph()

# Build the online network first, then the target network. Both share the same
# hyperparameters and differ only in the name they are created under.
DQNetwork, TargetNetwork = [
    DDDQNNet(state_size, action_size, learning_rate, name=scope_name)
    for scope_name in ("DQNetwork", "TargetNetwork")
]

In [8]:
# Replay memory that the training loop will sample from
memory = Memory(memory_size)

# Start the first episode
game.new_episode()

# Pre-fill the memory with `pretrain_length` transitions gathered by acting randomly.
for pretrain_step in range(pretrain_length):
    if pretrain_step == 0:
        # Very first step: build the initial stacked state from the first frame
        first_frame = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, first_frame, True, stack_size)

    # Act randomly and observe the outcome
    action = random.choice(possible_actions)
    reward = game.make_action(action)
    done = game.is_episode_finished()

    if not done:
        # Regular transition: push the new frame onto the stack
        next_frame = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_frame, False, stack_size)

        memory.store((state, action, reward, next_state, done))

        # Advance: the next state becomes the current one
        state = next_state
        continue

    # Episode finished: there is no real next state, so store an all-zero one
    next_state = np.zeros(state.shape)
    memory.store((state, action, reward, next_state, done))

    # Restart and rebuild the stack from the first frame of the new episode
    game.new_episode()
    restart_frame = game.get_state().screen_buffer
    state, stacked_frames = stack_frames(stacked_frames, restart_frame, True, stack_size)

In [9]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Pick an action with an exponentially decaying epsilon-greedy policy.

    With probability epsilon a random action a_t is chosen; otherwise the action
    is a_t = argmax_a Q(s_t, a) as estimated by the online network.

    Epsilon is computed as
    ``explore_stop + (explore_start - explore_stop) * exp(-decay_rate * decay_step)``.

    Args:
        explore_start: exploration probability at ``decay_step == 0``.
        explore_stop: exploration probability the schedule decays towards.
        decay_rate: exponential decay rate of the schedule.
        decay_step: number of steps taken so far.
        state: current stacked state; only used on the exploitation path, where
            it is reshaped to a batch of one and fed to ``DQNetwork``.
        actions: sequence of candidate actions to choose from.

    Returns:
        ``(action, explore_probability)``: the chosen element of ``actions`` and
        the epsilon used for this decision.

    Note:
        The exploitation path relies on the module-level ``sess`` and
        ``DQNetwork`` being defined when the function is called.
    """
    # Uniform draw in [0, 1) that decides between exploring and exploiting
    exp_exp_tradeoff = np.random.rand()

    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > exp_exp_tradeoff:
        # Exploration: sample from the `actions` argument (previously the
        # parameter was ignored in favour of the global `possible_actions`).
        action = random.choice(actions)
    else:
        # Exploitation: estimate Q-values for this state and take the best one
        Qs = sess.run(DQNetwork.output, feed_dict={DQNetwork.inputs_: state.reshape((1, *state.shape))})
        choice = np.argmax(Qs)
        action = actions[int(choice)]

    return action, explore_probability

In [10]:
# Helper for copying one set of variables onto another; used here to copy the
# DQNetwork parameters into the TargetNetwork.
# Credit: based on Arthur Juliani's implementation, https://github.com/awjuliani

def update_target_graph():
    """Build the ops that copy DQNetwork's trainable variables into TargetNetwork.

    Variables are paired positionally, in the order each scope's
    TRAINABLE_VARIABLES collection returns them.

    Returns:
        A list of assign ops, one per variable pair. Nothing is copied until the
        caller runs them.
    """
    online_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "DQNetwork")
    target_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, "TargetNetwork")

    # Each op overwrites one target variable with its online counterpart
    return [target_var.assign(online_var) for online_var, target_var in zip(online_vars, target_vars)]

Training algorithm:

    Initialize the weights for DQN
    Initialize target value weights w- <- w
    Init the environment
    Initialize the decay rate (that will use to reduce epsilon)

    For episode to max_episode do
        Make new episode
        Set step to 0
        Observe the first state $s_0$

        While step < max_steps do:
            Increase decay_step
            With $\epsilon$ select a random action $a_t$, otherwise select $a_t = \mathrm{argmax}_a Q(s_t,a)$
            Execute action $a_t$ in simulator and observe reward $r_{t+1}$ and new state $s_{t+1}$

            Store transition $<s_t, a_t, r_{t+1}, s_{t+1}>$ in memory $D$

            Sample random mini-batch from $D$: $<s, a, r, s'>$
            Set target $\hat{Q} = r$ if the episode ends at $t+1$, otherwise set $\hat{Q} = r + \gamma Q(s', \mathrm{argmax}_{a'} Q(s', a', w), w^-)$
            Make a gradient descent step with loss $(\hat{Q} - Q(s, a))^2$
            Every C steps, reset: $w^- \leftarrow w$
        endfor

    endfor

In [12]:
# Setup TensorBoard Writer
# NOTE(review): "/tensorboard/dddqn/1" is an absolute path at the filesystem
# root, whereas the checkpoints are saved under the relative "./models/" -
# confirm this location is intended and writable.
writer = tf.summary.FileWriter("/tensorboard/dddqn/1")

## Losses
# Register the online network's loss as a scalar summary named "Loss"
tf.summary.scalar("Loss", DQNetwork.loss)

# Single op that evaluates every summary registered so far
write_op = tf.summary.merge_all()

In [None]:
# Saver will help us to save our model
saver = tf.train.Saver()

if training:
    with tf.Session() as sess:
        # Initialize the variables
        sess.run(tf.global_variables_initializer())

        # Global step counter that drives the epsilon decay schedule
        decay_step = 0

        # Number of training steps since the last target-network sync
        tau = 0

        # Placeholder so the end-of-episode log line cannot hit an unbound
        # `loss` if an episode ends before the first training step has run.
        loss = float("nan")

        # Init the game
        game.init()

        # Build the weight-copy ops ONCE and reuse them for every sync: calling
        # update_target_graph() again would add a fresh set of assign ops to
        # the graph each time.
        update_target = update_target_graph()
        sess.run(update_target)

        for episode in range(total_episodes):
            # Set step to 0
            step = 0

            # Initialize the rewards of the episode
            episode_rewards = []

            # Make a new episode and observe the first state
            game.new_episode()

            state = game.get_state().screen_buffer

            # stack_frames also runs the preprocessing on the raw frame
            state, stacked_frames = stack_frames(stacked_frames, state, True, stack_size)

            while step < max_steps:
                step += 1

                # Increase the C step
                tau += 1

                # Increase decay_step
                decay_step += 1

                # With probability epsilon select a random action a_t, otherwise a_t = argmax_a Q(s_t, a)
                action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)

                # Do the action
                reward = game.make_action(action)

                # Look if the episode is finished
                done = game.is_episode_finished()

                # Add the reward to total reward
                episode_rewards.append(reward)

                # If the game is finished
                if done:
                    # The episode ended, so there is no real next frame: push an
                    # all-zero raw frame through the usual preprocessing/stacking.
                    # `int` replaces the `np.int` alias removed in NumPy 1.24.
                    # NOTE(review): (120, 140) is presumably the raw screen-buffer
                    # size expected by preprocess_frame - confirm.
                    next_state = np.zeros((120,140), dtype=int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False, stack_size)

                    # Set step = max_steps to end the episode
                    step = max_steps

                    # Get the total reward of the episode
                    total_reward = np.sum(episode_rewards)

                    print('Episode: {}'.format(episode),
                              'Total reward: {}'.format(total_reward),
                              'Training loss: {:.4f}'.format(loss),
                              'Explore P: {:.4f}'.format(explore_probability))

                    # Add experience to memory
                    experience = state, action, reward, next_state, done
                    memory.store(experience)

                else:
                    # Get the next state
                    next_state = game.get_state().screen_buffer

                    # Stack the frame of the next_state
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False, stack_size)

                    # Add experience to memory
                    experience = state, action, reward, next_state, done
                    memory.store(experience)

                    # s_{t+1} is now our current state
                    state = next_state

                ### LEARNING PART
                # Sample a mini-batch from memory; it also returns the tree
                # indices (needed to update priorities afterwards) and the
                # importance-sampling weights fed to the loss.
                tree_idx, batch, ISWeights_mb = memory.sample(batch_size)

                # Each batch entry wraps one (state, action, reward, next_state, done) tuple at index 0
                states_mb = np.array([each[0][0] for each in batch], ndmin=3)
                actions_mb = np.array([each[0][1] for each in batch])
                rewards_mb = np.array([each[0][2] for each in batch])
                next_states_mb = np.array([each[0][3] for each in batch], ndmin=3)
                dones_mb = np.array([each[0][4] for each in batch])

                target_Qs_batch = []

                ### DOUBLE DQN Logic
                # Use DQNetwork to select the action to take at next_state (a') (action with the highest Q-value)
                # Use TargetNetwork to calculate the Q-value of Q(s', a')

                # Online-network Q-values for next_state (used to pick a')
                q_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})

                # Target-network Q-values for next_state (used to evaluate a')
                q_target_next_state = sess.run(TargetNetwork.output, feed_dict = {TargetNetwork.inputs_: next_states_mb})

                # Set Q_target = r if the episode ends at s+1, otherwise Q_target = r + gamma * Qtarget(s', a')
                for i in range(0, len(batch)):
                    terminal = dones_mb[i]

                    # a' chosen by the online network. Kept under its own name so
                    # it does not shadow the environment `action` taken above.
                    best_action = np.argmax(q_next_state[i])

                    if terminal:
                        # Terminal transition: the target is just the reward
                        target_Qs_batch.append(rewards_mb[i])
                    else:
                        # Evaluate a' with the target network
                        target = rewards_mb[i] + gamma * q_target_next_state[i][best_action]
                        target_Qs_batch.append(target)

                targets_mb = np.array(target_Qs_batch)

                _, loss, absolute_errors = sess.run([DQNetwork.optimizer, DQNetwork.loss, DQNetwork.absolute_errors],
                                    feed_dict={DQNetwork.inputs_: states_mb,
                                               DQNetwork.target_Q: targets_mb,
                                               DQNetwork.actions_: actions_mb,
                                               DQNetwork.ISWeights_: ISWeights_mb})

                # Update the priorities of the sampled experiences
                memory.batch_update(tree_idx, absolute_errors)

                # Write TF Summaries
                # NOTE(review): the summary is written on every training step but
                # indexed by `episode`, so all steps of one episode share the same
                # x-value in TensorBoard - confirm this is intended.
                summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                   DQNetwork.target_Q: targets_mb,
                                                   DQNetwork.actions_: actions_mb,
                                                   DQNetwork.ISWeights_: ISWeights_mb})
                writer.add_summary(summary, episode)
                writer.flush()

                if tau > max_tau:
                    # Copy the DQNetwork weights into the TargetNetwork using the
                    # ops built before the episode loop.
                    sess.run(update_target)
                    tau = 0
                    print("Model updated")

            # Save model every 5 episodes
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/model.ckpt")
                print("Model Saved")

Episode: 0 Total reward: -110.57624816894531 Training loss: 47.3862 Explore P: 0.9959
Model Saved
Episode: 1 Total reward: -115.95259094238281 Training loss: 37.8028 Explore P: 0.9935
Episode: 2 Total reward: -112.43696594238281 Training loss: 29.0424 Explore P: 0.9895
Episode: 3 Total reward: -58.77366638183594 Training loss: 35.3860 Explore P: 0.9855
Episode: 4 Total reward: -115.42689514160156 Training loss: 9.7779 Explore P: 0.9815
Episode: 5 Total reward: -84.46812438964844 Training loss: 8.8634 Explore P: 0.9775
Model Saved
Episode: 6 Total reward: -82.31275939941406 Training loss: 3.9465 Explore P: 0.9735
Episode: 7 Total reward: -67.365234375 Training loss: 3.7157 Explore P: 0.9696
Episode: 8 Total reward: -103.49986267089844 Training loss: 1.3084 Explore P: 0.9657
Episode: 9 Total reward: -111.79777526855469 Training loss: 0.9416 Explore P: 0.9603
Episode: 10 Total reward: -101.93240356445312 Training loss: 0.5273 Explore P: 0.9564
Model Saved
Episode: 11 Total reward: -44.430

Episode: 94 Total reward: -109.33488464355469 Training loss: 0.1681 Explore P: 0.6396
Episode: 95 Total reward: -86.29859924316406 Training loss: 0.6195 Explore P: 0.6370
Model Saved
Episode: 96 Total reward: -42.55110168457031 Training loss: 0.8006 Explore P: 0.6335
Episode: 97 Total reward: -111.52056884765625 Training loss: 0.2566 Explore P: 0.6320
Episode: 98 Total reward: -82.16763305664062 Training loss: 0.8413 Explore P: 0.6249
Episode: 99 Total reward: -67.3795166015625 Training loss: 0.0562 Explore P: 0.6224
Episode: 100 Total reward: -104.04939270019531 Training loss: 0.1029 Explore P: 0.6198
Model Saved
Episode: 101 Total reward: -45.13116455078125 Training loss: 0.3585 Explore P: 0.6173
Episode: 102 Total reward: -66.38711547851562 Training loss: 0.0875 Explore P: 0.6150
Episode: 103 Total reward: -75.03628540039062 Training loss: 0.0745 Explore P: 0.6128
Model updated
Episode: 104 Total reward: -67.68966674804688 Training loss: 8.9984 Explore P: 0.6103
Episode: 105 Total r

Episode: 187 Total reward: -115.94224548339844 Training loss: 0.0661 Explore P: 0.4162
Episode: 188 Total reward: -56.39424133300781 Training loss: 0.2189 Explore P: 0.4146
Episode: 189 Total reward: -96.15127563476562 Training loss: 0.5088 Explore P: 0.4129
Episode: 190 Total reward: -114.42535400390625 Training loss: 0.8223 Explore P: 0.4112
Model Saved
Episode: 191 Total reward: -81.76736450195312 Training loss: 0.4091 Explore P: 0.4091
Episode: 192 Total reward: -87.98257446289062 Training loss: 1.1431 Explore P: 0.4075
Episode: 193 Total reward: -78.02105712890625 Training loss: 0.1352 Explore P: 0.4058
Episode: 194 Total reward: -77.38835144042969 Training loss: 0.7857 Explore P: 0.4042
Episode: 195 Total reward: -103.32853698730469 Training loss: 0.1596 Explore P: 0.4028
Model Saved
Episode: 196 Total reward: -99.25189208984375 Training loss: 0.1789 Explore P: 0.4018
Episode: 197 Total reward: -62.856231689453125 Training loss: 0.3685 Explore P: 0.4002
Episode: 198 Total reward:

Episode: 280 Total reward: -114.20535278320312 Training loss: 0.2286 Explore P: 0.2814
Model Saved
Episode: 281 Total reward: -94.4296875 Training loss: 0.6993 Explore P: 0.2803
Episode: 282 Total reward: -83.23300170898438 Training loss: 0.7476 Explore P: 0.2797
Episode: 283 Total reward: -76.07620239257812 Training loss: 0.9076 Explore P: 0.2786
Episode: 284 Total reward: -96.82943725585938 Training loss: 0.3188 Explore P: 0.2775
Episode: 285 Total reward: -61.340728759765625 Training loss: 0.5037 Explore P: 0.2760
Model Saved
Episode: 286 Total reward: -115.99931335449219 Training loss: 0.2523 Explore P: 0.2749
Episode: 287 Total reward: -76.28294372558594 Training loss: 0.2646 Explore P: 0.2742
Episode: 288 Total reward: -76.76051330566406 Training loss: 0.3316 Explore P: 0.2731
Episode: 289 Total reward: -75.72247314453125 Training loss: 0.5217 Explore P: 0.2720
Episode: 290 Total reward: -86.41487121582031 Training loss: 0.2574 Explore P: 0.2710
Model Saved
Episode: 291 Total rew

Episode: 373 Total reward: -115.42623901367188 Training loss: 0.3167 Explore P: 0.1903
Episode: 374 Total reward: -88.26486206054688 Training loss: 0.4546 Explore P: 0.1896
Episode: 375 Total reward: -112.24098205566406 Training loss: 1.1993 Explore P: 0.1889
Model Saved
Episode: 376 Total reward: -115.9771728515625 Training loss: 0.2250 Explore P: 0.1880
Episode: 377 Total reward: -111.08822631835938 Training loss: 0.5546 Explore P: 0.1873
Episode: 378 Total reward: -103.59797668457031 Training loss: 0.2350 Explore P: 0.1858
Episode: 379 Total reward: -65.19097900390625 Training loss: 0.5464 Explore P: 0.1851
Episode: 380 Total reward: -90.09780883789062 Training loss: 0.5532 Explore P: 0.1843
Model Saved
Episode: 381 Total reward: -112.5201416015625 Training loss: 0.2041 Explore P: 0.1831
Episode: 382 Total reward: -115.99713134765625 Training loss: 0.7622 Explore P: 0.1824
Episode: 383 Total reward: -37.97607421875 Training loss: 0.2844 Explore P: 0.1817
Episode: 384 Total reward: -

Model Saved
Episode: 466 Total reward: 40.64173889160156 Training loss: 0.3309 Explore P: 0.1252
Episode: 467 Total reward: -83.92100524902344 Training loss: 1.1255 Explore P: 0.1247
Episode: 468 Total reward: 36.38462829589844 Training loss: 0.3237 Explore P: 0.1243
Episode: 469 Total reward: 84.2318115234375 Training loss: 0.3104 Explore P: 0.1238
Episode: 470 Total reward: 39.12879943847656 Training loss: 0.5808 Explore P: 0.1234
Model Saved
Episode: 471 Total reward: 41.696563720703125 Training loss: 0.7029 Explore P: 0.1229
Episode: 472 Total reward: 4.04364013671875 Training loss: 0.6262 Explore P: 0.1225
Episode: 473 Total reward: -88.25157165527344 Training loss: 0.7172 Explore P: 0.1222
Episode: 474 Total reward: 30.411834716796875 Training loss: 0.2918 Explore P: 0.1217
Episode: 475 Total reward: -56.63629150390625 Training loss: 0.4268 Explore P: 0.1213
Model Saved
Episode: 476 Total reward: -8.949920654296875 Training loss: 0.2119 Explore P: 0.1208
Episode: 477 Total reward

Episode: 559 Total reward: -53.74351501464844 Training loss: 0.4626 Explore P: 0.0905
Episode: 560 Total reward: -51.82545471191406 Training loss: 0.2924 Explore P: 0.0899
Model Saved
Episode: 561 Total reward: -67.82234191894531 Training loss: 0.3052 Explore P: 0.0896
Episode: 562 Total reward: -84.06192016601562 Training loss: 2.1924 Explore P: 0.0894
Episode: 563 Total reward: -55.214447021484375 Training loss: 0.3310 Explore P: 0.0891
Episode: 564 Total reward: -66.06192016601562 Training loss: 0.1552 Explore P: 0.0887
Episode: 565 Total reward: -77.30355834960938 Training loss: 0.5341 Explore P: 0.0883
Model Saved
Episode: 566 Total reward: -80.58245849609375 Training loss: 0.3213 Explore P: 0.0880
Episode: 567 Total reward: -29.642776489257812 Training loss: 0.5376 Explore P: 0.0877
Episode: 568 Total reward: -71.26087951660156 Training loss: 0.5381 Explore P: 0.0873
Episode: 569 Total reward: -69.63667297363281 Training loss: 1.2826 Explore P: 0.0870
Episode: 570 Total reward: -

Episode: 652 Total reward: -57.74993896484375 Training loss: 0.6254 Explore P: 0.0652
Episode: 653 Total reward: -37.06684875488281 Training loss: 0.3139 Explore P: 0.0649
Episode: 654 Total reward: -53.24530029296875 Training loss: 0.5504 Explore P: 0.0647
Episode: 655 Total reward: 29.661941528320312 Training loss: 1.0787 Explore P: 0.0645
Model Saved
Episode: 656 Total reward: -71.65933227539062 Training loss: 0.5385 Explore P: 0.0643
Episode: 657 Total reward: -52.538116455078125 Training loss: 0.5622 Explore P: 0.0641
Episode: 658 Total reward: 32.28216552734375 Training loss: 0.5283 Explore P: 0.0639
Episode: 659 Total reward: -70.91502380371094 Training loss: 0.4220 Explore P: 0.0636
Episode: 660 Total reward: -79.70916748046875 Training loss: 0.3784 Explore P: 0.0634
Model Saved
Episode: 661 Total reward: -106.28431701660156 Training loss: 0.7306 Explore P: 0.0633
Episode: 662 Total reward: -80.38011169433594 Training loss: 1.0219 Explore P: 0.0631
Episode: 663 Total reward: 5.

Episode: 745 Total reward: -83.429443359375 Training loss: 1.5945 Explore P: 0.0490
Model Saved
Episode: 746 Total reward: -64.96124267578125 Training loss: 0.5944 Explore P: 0.0488
Episode: 747 Total reward: -82.14738464355469 Training loss: 0.7562 Explore P: 0.0487
Episode: 748 Total reward: -82.09222412109375 Training loss: 0.5635 Explore P: 0.0486
Episode: 749 Total reward: -52.020782470703125 Training loss: 0.2289 Explore P: 0.0484
Episode: 750 Total reward: -64.74761962890625 Training loss: 1.1094 Explore P: 0.0483
Model Saved
Episode: 751 Total reward: -115.84693908691406 Training loss: 0.8680 Explore P: 0.0482
Episode: 752 Total reward: -28.061111450195312 Training loss: 0.5781 Explore P: 0.0480
Episode: 753 Total reward: -64.02908325195312 Training loss: 1.7329 Explore P: 0.0479
Episode: 754 Total reward: -102.50846862792969 Training loss: 0.5536 Explore P: 0.0479
Episode: 755 Total reward: -47.843780517578125 Training loss: 0.4629 Explore P: 0.0477
Model Saved
Episode: 756 To

Episode: 838 Total reward: -49.27473449707031 Training loss: 0.3268 Explore P: 0.0375
Episode: 839 Total reward: 52.02732849121094 Training loss: 0.3725 Explore P: 0.0374
Episode: 840 Total reward: 20.331863403320312 Training loss: 0.7094 Explore P: 0.0373
Model Saved
Episode: 841 Total reward: -90.11940002441406 Training loss: 0.2635 Explore P: 0.0372
Episode: 842 Total reward: -105.08660888671875 Training loss: 0.7023 Explore P: 0.0370
Episode: 843 Total reward: 1.126220703125 Training loss: 0.4947 Explore P: 0.0369
Episode: 844 Total reward: -62.209320068359375 Training loss: 0.5883 Explore P: 0.0368
Episode: 845 Total reward: -92.32077026367188 Training loss: 0.8467 Explore P: 0.0367
Model Saved
Episode: 846 Total reward: -99.07498168945312 Training loss: 0.8559 Explore P: 0.0366
Episode: 847 Total reward: -18.788543701171875 Training loss: 0.6990 Explore P: 0.0365
Episode: 848 Total reward: -68.82591247558594 Training loss: 0.2819 Explore P: 0.0363
Episode: 849 Total reward: 4.855

Model Saved
Episode: 931 Total reward: -29.962509155273438 Training loss: 1.0022 Explore P: 0.0291
Episode: 932 Total reward: -16.070220947265625 Training loss: 1.0140 Explore P: 0.0290
Episode: 933 Total reward: -26.640335083007812 Training loss: 1.4401 Explore P: 0.0289
Episode: 934 Total reward: -32.76564025878906 Training loss: 0.7647 Explore P: 0.0289
Episode: 935 Total reward: 91.84123229980469 Training loss: 0.2977 Explore P: 0.0288
Model Saved
Episode: 936 Total reward: 56.42646789550781 Training loss: 0.7188 Explore P: 0.0287
Episode: 937 Total reward: 73.69035339355469 Training loss: 0.5373 Explore P: 0.0286
Episode: 938 Total reward: 50.99821472167969 Training loss: 1.6067 Explore P: 0.0286
Episode: 939 Total reward: 43.45637512207031 Training loss: 1.0010 Explore P: 0.0285
Episode: 940 Total reward: 52.567626953125 Training loss: 0.6392 Explore P: 0.0284
Model Saved
Episode: 941 Total reward: -11.382965087890625 Training loss: 1.3194 Explore P: 0.0283
Episode: 942 Total rew

Episode: 1024 Total reward: -20.822830200195312 Training loss: 0.8533 Explore P: 0.0234
Episode: 1025 Total reward: -61.5177001953125 Training loss: 0.3766 Explore P: 0.0234
Model Saved
Episode: 1026 Total reward: -82.12954711914062 Training loss: 0.9077 Explore P: 0.0233
Episode: 1027 Total reward: 60.385162353515625 Training loss: 0.8783 Explore P: 0.0233
Episode: 1028 Total reward: 10.349716186523438 Training loss: 0.2890 Explore P: 0.0232
Episode: 1029 Total reward: -32.46330261230469 Training loss: 0.5167 Explore P: 0.0231
Episode: 1030 Total reward: 109.65664672851562 Training loss: 0.7882 Explore P: 0.0231
Model Saved
Episode: 1031 Total reward: -61.597686767578125 Training loss: 0.8768 Explore P: 0.0230
Episode: 1032 Total reward: -32.702911376953125 Training loss: 1.5530 Explore P: 0.0230
Episode: 1033 Total reward: -39.45594787597656 Training loss: 0.6420 Explore P: 0.0229
Episode: 1034 Total reward: -11.250625610351562 Training loss: 0.5990 Explore P: 0.0229
Episode: 1035 To