In [11]:
import tensorflow as tf
import random
import time
import numpy as np
from collections import deque

from environment_creation import create_environment
import policy_gradient_network as pgn
from frame_preprocessing import preprocess_frame
from frames_stacking import stack_frames 

import warnings
warnings.filterwarnings('ignore')

In [12]:
# Create the game instance and the list of actions the agent can choose from
# (both come from the project's environment_creation helper).
game, possible_actions = create_environment()

# Number of consecutive frames kept in the frame buffer.
stack_size = 4

# Frame buffer pre-filled with blank 84x84 frames.
# - `np.int` was removed in NumPy 1.24; the builtin `int` is the type it aliased.
# - `maxlen` follows `stack_size` so the two values cannot drift apart.
stacked_frames = deque(
    [np.zeros((84, 84), dtype=int) for _ in range(stack_size)],
    maxlen=stack_size,
)

In [13]:
def discount_and_normalize_rewards(episode_rewards, gamma):
    """Compute the standardized discounted returns of a single episode.

    For every timestep ``t`` the discounted return
    ``G_t = r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...`` is computed, then
    the whole vector is shifted to zero mean and scaled to unit standard
    deviation.

    Args:
        episode_rewards: 1-D sequence (list or array) of per-step rewards.
        gamma: Discount factor applied to future rewards.

    Returns:
        A float64 ``np.ndarray`` with the same length as ``episode_rewards``.
        An empty input yields an empty array. If every discounted return is
        identical (for example a one-step episode) the result is all zeros
        instead of NaN/inf.
    """
    # Always accumulate in floating point: with integer rewards, zeros_like()
    # would create an integer buffer and silently truncate the returns.
    rewards = np.asarray(episode_rewards, dtype=np.float64)
    discounted_episode_rewards = np.zeros_like(rewards)
    if len(rewards) == 0:
        return discounted_episode_rewards

    # Walk backwards so each return reuses the already-discounted tail.
    cumulative = 0.0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * gamma + rewards[i]
        discounted_episode_rewards[i] = cumulative

    mean = np.mean(discounted_episode_rewards)
    std = np.std(discounted_episode_rewards)
    centered = discounted_episode_rewards - mean

    # A zero spread means every centered value is already 0; dividing would
    # only turn them into NaN and poison the gradient step.
    if std == 0:
        return centered
    return centered / std

In [14]:
# ---------------------------------------------------------------------------
# Environment hyperparameters
stack_size = 4  # how many frames are stacked together
state_size = [84, 84, 4]  # network input: a stack of 4 frames, each 84x84
# One action per available button (3 here: turn left, turn right, go forward).
action_size = game.get_available_buttons_size()

# Deep learning model hyperparameters
gamma = 0.95  # discount rate
learning_rate = 0.002
batch_size = 2000
num_epochs = 1000

# Flag checked by the training cell before it starts optimising.
training = True
# ---------------------------------------------------------------------------
action_size

3

In [15]:
# Clear the default TensorFlow graph so that re-running this cell does not
# stack a second copy of the network on top of a previous one.
tf.reset_default_graph()

# Build the policy-gradient network from the hyperparameters defined earlier
# in the notebook (input shape, number of actions, optimizer learning rate).
PGNetwork = pgn.PGNetwork(state_size, action_size, learning_rate)

In [16]:
# Open a session on the default graph and run the variable initializer once,
# before any other op of the network is evaluated.
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)

In [17]:
# Set up the TensorBoard event writer.
# NOTE(review): the log directory is an absolute path at the filesystem root --
# confirm it is writable on the target machine, or switch to a relative path.
writer = tf.summary.FileWriter("/tensorboard/pg/test")

# Scalar summary tracking the training loss.
tf.summary.scalar("Loss", PGNetwork.loss)

# Scalar summary tracking the mean reward; mean_reward_ gets its value through
# feed_dict when the summaries are evaluated.
tf.summary.scalar("Reward_mean", PGNetwork.mean_reward_ )

# Single op that evaluates every summary registered above.
write_op = tf.summary.merge_all()

In [18]:
#now is the time to train the agent

Here we'll create batches.
Each batch contains whole episodes; how many depends on how many rewards (timesteps) each episode collects. For instance, if every episode yields only 10 rewards, a batch holds about batch_size/10 episodes.

    Make a batch
        For each step:
            Choose action a
            Perform action a
            Store s, a, r
            If done:
                Calculate sum reward
                Calculate gamma Gt



In [19]:
def make_batch(batch_size, stacked_frames):
    """Play whole episodes with the current policy until the batch is full.

    Episodes are never cut short (Monte Carlo returns need the full episode),
    so the size check only happens when an episode ends; the batch therefore
    holds the first set of complete episodes whose total number of timesteps
    exceeds ``batch_size``.

    Relies on the notebook-level names ``game``, ``sess``, ``PGNetwork``,
    ``possible_actions``, ``stack_size``, ``state_size`` and ``gamma``.

    Args:
        batch_size: Number of timesteps that must be exceeded before the batch
            is considered complete.
        stacked_frames: Frame buffer handed to ``stack_frames``.

    Returns:
        A 5-tuple ``(states, actions, rewards, discounted_rewards, episode_num)``:
        the stacked states and chosen actions as arrays (one row per timestep),
        the raw rewards and the per-episode normalized discounted rewards
        concatenated over all episodes, and the number of episodes played.
    """
    states, actions, rewards_of_batch, discounted_rewards = [], [], [], []
    rewards_of_episode = []

    # Running timestep count; avoids re-concatenating the whole reward history
    # at the end of every episode just to measure its length.
    steps_in_batch = 0

    # Number of episodes in the batch (needed for the mean reward per episode).
    episode_num = 1

    # Launch the first episode and build its initial stacked state.
    game.new_episode()
    state = game.get_state().screen_buffer
    state, stacked_frames = stack_frames(stacked_frames, state, True, stack_size)

    while True:
        # Run the state through the policy to get the action probabilities.
        action_probability_distribution = sess.run(
            PGNetwork.action_distribution,
            feed_dict={PGNetwork.inputs_: state.reshape(1, *state_size)})

        # Stochastic policy: SAMPLE the action from the distribution instead of
        # always taking the most probable one.
        action_index = np.random.choice(
            range(action_probability_distribution.shape[1]),
            p=action_probability_distribution.ravel())
        action = possible_actions[action_index]

        # Perform the action.
        reward = game.make_action(action)
        done = game.is_episode_finished()

        # Store the transition.
        states.append(state)
        actions.append(action)
        rewards_of_episode.append(reward)

        if not done:
            # The freshly observed frame becomes part of the next state.
            next_frame = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, next_frame, False, stack_size)
            continue

        # Episode finished: there is no next frame, so push a blank one.
        # (`np.int` was removed in NumPy 1.24 -- use the builtin `int`.)
        # NOTE(review): the resulting state is never used; the call is kept only
        # in case stack_frames has side effects on the buffer -- confirm.
        blank_frame = np.zeros((84, 84), dtype=int)
        _, stacked_frames = stack_frames(stacked_frames, blank_frame, False, stack_size)

        # Record the finished episode and its normalized discounted returns.
        rewards_of_batch.append(rewards_of_episode)
        discounted_rewards.append(discount_and_normalize_rewards(rewards_of_episode, gamma))
        steps_in_batch += len(rewards_of_episode)

        # Enough timesteps collected: stop on this episode boundary.
        if steps_in_batch > batch_size:
            break

        # Otherwise start a new episode with a fresh reward list and state.
        rewards_of_episode = []
        episode_num += 1
        game.new_episode()
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True, stack_size)

    return (np.array(states),
            np.array(actions),
            np.concatenate(rewards_of_batch),
            np.concatenate(discounted_rewards),
            episode_num)

- Create the Neural Network
- Init the weights
- Init the environment
- maxReward = 0 #keep track of max reward
- for epochs in range(num_epochs):
    - get batches
    - optimize

In [20]:
# Total reward of every batch seen so far
allRewards = []

total_rewards = 0
maximumRewardRecorded = 0
# Mean reward per episode of every batch seen so far
mean_reward_total = []
epoch = 1
average_reward = []

# Saver used to checkpoint the model during training
saver = tf.train.Saver()

if training:
    # To resume from a checkpoint, restore it here first:
    #   saver.restore(sess, "./models/model.ckpt")

    while epoch < num_epochs + 1:
        # Gather training data: one batch of complete episodes
        states_mb, actions_mb, rewards_of_batch, discounted_rewards_mb, nb_episodes_mb = make_batch(batch_size, stacked_frames)

        ### Analytics
        # Total reward of the batch
        total_reward_of_that_batch = np.sum(rewards_of_batch)
        allRewards.append(total_reward_of_that_batch)

        # Mean reward of the batch: total reward / number of episodes in it
        mean_reward_of_that_batch = np.divide(total_reward_of_that_batch, nb_episodes_mb)
        mean_reward_total.append(mean_reward_of_that_batch)

        # Average over all batches so far of the per-batch mean reward
        average_reward_of_all_training = np.divide(np.sum(mean_reward_total), epoch)

        # Best total batch reward recorded so far
        maximumRewardRecorded = np.amax(allRewards)

        print("==========================================")
        print("Epoch: ", epoch, "/", num_epochs)
        print("-----------")
        print("Number of training episodes: {}".format(nb_episodes_mb))
        print("Total reward: {}".format(total_reward_of_that_batch))
        print("Mean Reward of that batch {}".format(mean_reward_of_that_batch))
        print("Average Reward of all training: {}".format(average_reward_of_all_training))
        print("Max reward for a batch so far: {}".format(maximumRewardRecorded))

        # Inputs shared by the training step and the summary evaluation. The
        # shape comes from state_size so it cannot disagree with the shape used
        # when the states were collected.
        train_feed = {
            PGNetwork.inputs_: states_mb.reshape((len(states_mb), *state_size)),
            PGNetwork.actions: actions_mb,
            PGNetwork.discounted_episode_rewards_: discounted_rewards_mb,
        }

        # Feedforward, gradient and backpropagation
        loss_, _ = sess.run([PGNetwork.loss, PGNetwork.train_opt], feed_dict=train_feed)

        print("Training Loss: {}".format(loss_))

        # Write TF summaries (these additionally need the batch mean reward)
        summary_feed = dict(train_feed)
        summary_feed[PGNetwork.mean_reward_] = mean_reward_of_that_batch
        summary = sess.run(write_op, feed_dict=summary_feed)

        writer.add_summary(summary, epoch)
        writer.flush()

        # Save the model every 10 epochs
        if epoch % 10 == 0:
            saver.save(sess, "./models/model.ckpt")
            print("Model saved")
        epoch += 1

Epoch:  1 / 1000
-----------
Number of training episodes: 4
Total reward: 1840.0
Mean Reward of that batch 460.0
Average Reward of all training: 460.0
Max reward for a batch so far: 1840.0
Training Loss: -0.0076517038978636265
Epoch:  2 / 1000
-----------
Number of training episodes: 4
Total reward: 1680.0
Mean Reward of that batch 420.0
Average Reward of all training: 440.0
Max reward for a batch so far: 1840.0
Training Loss: -0.010891149751842022
Epoch:  3 / 1000
-----------
Number of training episodes: 4
Total reward: 1840.0
Mean Reward of that batch 460.0
Average Reward of all training: 446.6666666666667
Max reward for a batch so far: 1840.0
Training Loss: -0.014711610041558743
Epoch:  4 / 1000
-----------
Number of training episodes: 4
Total reward: 1968.0
Mean Reward of that batch 492.0
Average Reward of all training: 458.0
Max reward for a batch so far: 1968.0
Training Loss: -0.0057493350468575954
Epoch:  5 / 1000
-----------
Number of training episodes: 4
Total reward: 1904.0
M

Training Loss: 0.013952933251857758
Model saved
Epoch:  31 / 1000
-----------
Number of training episodes: 4
Total reward: 1712.0
Mean Reward of that batch 428.0
Average Reward of all training: 485.51397849462364
Max reward for a batch so far: 2360.0
Training Loss: -0.13070173561573029
Epoch:  32 / 1000
-----------
Number of training episodes: 4
Total reward: 2064.0
Mean Reward of that batch 516.0
Average Reward of all training: 486.4666666666667
Max reward for a batch so far: 2360.0
Training Loss: 0.007631264626979828
Epoch:  33 / 1000
-----------
Number of training episodes: 3
Total reward: 1780.0
Mean Reward of that batch 593.3333333333334
Average Reward of all training: 489.70505050505056
Max reward for a batch so far: 2360.0
Training Loss: 0.036599233746528625
Epoch:  34 / 1000
-----------
Number of training episodes: 4
Total reward: 2320.0
Mean Reward of that batch 580.0
Average Reward of all training: 492.3607843137256
Max reward for a batch so far: 2360.0
Training Loss: -0.0027

Training Loss: -0.028602292761206627
Epoch:  60 / 1000
-----------
Number of training episodes: 5
Total reward: 1516.0
Mean Reward of that batch 303.2
Average Reward of all training: 522.06
Max reward for a batch so far: 3400.0
Training Loss: -0.05140203982591629
Model saved
Epoch:  61 / 1000
-----------
Number of training episodes: 5
Total reward: 1580.0
Mean Reward of that batch 316.0
Average Reward of all training: 518.6819672131147
Max reward for a batch so far: 3400.0
Training Loss: -0.002965521765872836
Epoch:  62 / 1000
-----------
Number of training episodes: 5
Total reward: 1612.0
Mean Reward of that batch 322.4
Average Reward of all training: 515.516129032258
Max reward for a batch so far: 3400.0
Training Loss: -0.053374722599983215
Epoch:  63 / 1000
-----------
Number of training episodes: 5
Total reward: 1644.0
Mean Reward of that batch 328.8
Average Reward of all training: 512.552380952381
Max reward for a batch so far: 3400.0
Training Loss: -0.03388804942369461
Epoch:  64

Training Loss: 0.048488933593034744
Epoch:  89 / 1000
-----------
Number of training episodes: 3
Total reward: 1780.0
Mean Reward of that batch 593.3333333333334
Average Reward of all training: 479.62996254681656
Max reward for a batch so far: 3400.0
Training Loss: 0.0163747426122427
Epoch:  90 / 1000
-----------
Number of training episodes: 4
Total reward: 1680.0
Mean Reward of that batch 420.0
Average Reward of all training: 478.96740740740745
Max reward for a batch so far: 3400.0
Training Loss: -0.022535840049386024
Model saved
Epoch:  91 / 1000
-----------
Number of training episodes: 4
Total reward: 3080.0
Mean Reward of that batch 770.0
Average Reward of all training: 482.16556776556786
Max reward for a batch so far: 3400.0
Training Loss: -0.06412021070718765
Epoch:  92 / 1000
-----------
Number of training episodes: 3
Total reward: 2132.0
Mean Reward of that batch 710.6666666666666
Average Reward of all training: 484.64927536231886
Max reward for a batch so far: 3400.0
Training 

Training Loss: 0.12492426484823227
Epoch:  118 / 1000
-----------
Number of training episodes: 5
Total reward: 1932.0
Mean Reward of that batch 386.4
Average Reward of all training: 640.754802259887
Max reward for a batch so far: 3820.0
Training Loss: 0.12698735296726227
Epoch:  119 / 1000
-----------
Number of training episodes: 5
Total reward: 1900.0
Mean Reward of that batch 380.0
Average Reward of all training: 638.5635854341737
Max reward for a batch so far: 3820.0
Training Loss: 0.07102356106042862
Epoch:  120 / 1000
-----------
Number of training episodes: 4
Total reward: 1616.0
Mean Reward of that batch 404.0
Average Reward of all training: 636.6088888888888
Max reward for a batch so far: 3820.0
Training Loss: 0.05223986506462097
Model saved
Epoch:  121 / 1000
-----------
Number of training episodes: 4
Total reward: 1712.0
Mean Reward of that batch 428.0
Average Reward of all training: 634.8848484848485
Max reward for a batch so far: 3820.0
Training Loss: 0.06975892186164856
Ep

Training Loss: 0.00037287946906872094
Epoch:  147 / 1000
-----------
Number of training episodes: 5
Total reward: 1516.0
Mean Reward of that batch 303.2
Average Reward of all training: 589.4984126984127
Max reward for a batch so far: 3820.0
Training Loss: 0.007514607161283493
Epoch:  148 / 1000
-----------
Number of training episodes: 5
Total reward: 1708.0
Mean Reward of that batch 341.6
Average Reward of all training: 587.8234234234234
Max reward for a batch so far: 3820.0
Training Loss: 0.0008210219093598425
Epoch:  149 / 1000
-----------
Number of training episodes: 5
Total reward: 1708.0
Mean Reward of that batch 341.6
Average Reward of all training: 586.1709172259507
Max reward for a batch so far: 3820.0
Training Loss: 0.0057106320746243
Epoch:  150 / 1000
-----------
Number of training episodes: 5
Total reward: 1644.0
Mean Reward of that batch 328.8
Average Reward of all training: 584.4551111111111
Max reward for a batch so far: 3820.0
Training Loss: 0.0075381845235824585
Model 

Training Loss: 0.017114298418164253
Epoch:  176 / 1000
-----------
Number of training episodes: 5
Total reward: 1708.0
Mean Reward of that batch 341.6
Average Reward of all training: 546.4787878787879
Max reward for a batch so far: 3820.0
Training Loss: 0.04695768281817436
Epoch:  177 / 1000
-----------
Number of training episodes: 5
Total reward: 1804.0
Mean Reward of that batch 360.8
Average Reward of all training: 545.4297551789078
Max reward for a batch so far: 3820.0
Training Loss: 0.0816003754734993
Epoch:  178 / 1000
-----------
Number of training episodes: 5
Total reward: 1804.0
Mean Reward of that batch 360.8
Average Reward of all training: 544.3925093632959
Max reward for a batch so far: 3820.0
Training Loss: 0.012833541259169579
Epoch:  179 / 1000
-----------
Number of training episodes: 5
Total reward: 1548.0
Mean Reward of that batch 309.6
Average Reward of all training: 543.0808193668529
Max reward for a batch so far: 3820.0
Training Loss: -0.01204012706875801
Epoch:  180

Training Loss: 1.722204379106529e-09
Epoch:  205 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 511.6188617886179
Max reward for a batch so far: 3820.0
Training Loss: 4.1750458645850586e-10
Epoch:  206 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 510.5139158576052
Max reward for a batch so far: 3820.0
Training Loss: 1.356887602277368e-09
Epoch:  207 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 509.4196457326892
Max reward for a batch so far: 3820.0
Training Loss: 2.6441904221741197e-09
Epoch:  208 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 508.3358974358974
Max reward for a batch so far: 3820.0
Training Loss: 2.5746065279719232e-09


Training Loss: 1.913559088961847e-09
Epoch:  234 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 483.4096866096866
Max reward for a batch so far: 3820.0
Training Loss: 1.496056611927088e-09
Epoch:  235 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 482.56113475177307
Max reward for a batch so far: 3820.0
Training Loss: 5.075236675367023e-09
Epoch:  236 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 481.7197740112994
Max reward for a batch so far: 3820.0
Training Loss: 1.3220969874438993e-09
Epoch:  237 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 480.88551336146276
Max reward for a batch so far: 3820.0
Training Loss: 1.704804630797696e-09


Training Loss: 0.0
Epoch:  264 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 460.769696969697
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  265 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 460.102641509434
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  266 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 459.44060150375947
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  267 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 458.783520599251
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  268 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward 

Training Loss: 0.0
Epoch:  295 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 442.1938983050847
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  296 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 441.6594594594594
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  297 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 441.1286195286195
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  298 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 440.6013422818791
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  299 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Rewar

Training Loss: 0.0
Epoch:  326 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 427.1509202453988
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  327 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 426.7131498470948
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  328 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 426.27804878048784
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  329 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 425.84559270516723
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  330 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Rew

Training Loss: 0.0
Epoch:  357 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 414.7204481792717
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  358 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 414.35530726256985
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  359 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 413.9922005571031
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  360 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 413.6311111111111
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Model saved
Epoch:  361 / 1000
-----------
Number of training episodes: 6
Total reward: 1704

Training Loss: 0.0
Epoch:  388 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 404.27628865979386
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  389 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 403.9670951156813
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  390 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 403.6594871794872
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Model saved
Epoch:  391 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 403.35345268542204
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  392 / 1000
-----------
Number of training episodes: 6
Total reward: 170

Training Loss: 0.0
Epoch:  419 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 395.3775656324582
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  420 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 395.11238095238093
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Model saved
Epoch:  421 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 394.84845605700707
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  422 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 394.5857819905213
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  423 / 1000
-----------
Number of training episodes: 6
Total reward: 170

Training Loss: 0.0
Epoch:  450 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 387.7048888888889
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Model saved
Epoch:  451 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 387.4749445676275
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  452 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 387.24601769911504
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  453 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 387.0181015452539
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  454 / 1000
-----------
Number of training episodes: 6
Total reward: 1704

Training Loss: 0.0
Model saved
Epoch:  481 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 381.02120582120585
Max reward for a batch so far: 3820.0
Training Loss: 0.0022350987419486046
Epoch:  482 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 380.8199170124482
Max reward for a batch so far: 3820.0
Training Loss: 0.02094390243291855
Epoch:  483 / 1000
-----------
Number of training episodes: 4
Total reward: 1712.0
Mean Reward of that batch 428.0
Average Reward of all training: 380.9175983436853
Max reward for a batch so far: 3820.0
Training Loss: 0.3517810106277466
Epoch:  484 / 1000
-----------
Number of training episodes: 5
Total reward: 1708.0
Mean Reward of that batch 341.6
Average Reward of all training: 380.8363636363636
Max reward for a batch so far: 3820.0
Training Loss: 0.014548123814165592
Epoch:  485 / 

Training Loss: 0.0029644498135894537
Model saved
Epoch:  511 / 1000
-----------
Number of training episodes: 4
Total reward: 1648.0
Mean Reward of that batch 412.0
Average Reward of all training: 376.57455968688845
Max reward for a batch so far: 3820.0
Training Loss: 0.008004232309758663
Epoch:  512 / 1000
-----------
Number of training episodes: 5
Total reward: 1612.0
Mean Reward of that batch 322.4
Average Reward of all training: 376.46875
Max reward for a batch so far: 3820.0
Training Loss: 0.002963816514238715
Epoch:  513 / 1000
-----------
Number of training episodes: 5
Total reward: 1804.0
Mean Reward of that batch 360.8
Average Reward of all training: 376.43820662768036
Max reward for a batch so far: 3820.0
Training Loss: 0.007630860432982445
Epoch:  514 / 1000
-----------
Number of training episodes: 6
Total reward: 1768.0
Mean Reward of that batch 294.6666666666667
Average Reward of all training: 376.2791180285344
Max reward for a batch so far: 3820.0
Training Loss: -0.0043326

Training Loss: 0.00018088592332787812
Epoch:  540 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 373.08641975308643
Max reward for a batch so far: 3820.0
Training Loss: 0.0013839465100318193
Model saved
Epoch:  541 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 372.9217498459643
Max reward for a batch so far: 3820.0
Training Loss: 0.0037478746380656958
Epoch:  542 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 372.7576875768758
Max reward for a batch so far: 3820.0
Training Loss: 0.0004559890949167311
Epoch:  543 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 372.59422958870476
Max reward for a batch so far: 3820.0
Training Loss: 0.02854506

Training Loss: 0.0
Epoch:  570 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 368.6278362573099
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Model saved
Epoch:  571 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 368.4796263864565
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  572 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 368.3319347319347
Max reward for a batch so far: 3820.0
Training Loss: 3.47920720011885e-11
Epoch:  573 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 368.1847585805701
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  574 / 1000
-----------
Number of training episodes: 6
To

Training Loss: 0.0
Model saved
Epoch:  601 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 364.2626733222407
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  602 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 364.12934662236984
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  603 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 363.99646213377554
Max reward for a batch so far: 3820.0
Training Loss: 2.2614830841316547e-10
Epoch:  604 / 1000
-----------
Number of training episodes: 6
Total reward: 1704.0
Mean Reward of that batch 284.0
Average Reward of all training: 363.8640176600441
Max reward for a batch so far: 3820.0
Training Loss: 0.0
Epoch:  605 / 1000
-----------
Number of training episodes: 

ViZDoomUnexpectedExitException: Controlled ViZDoom instance exited unexpectedly.