In [108]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow import keras
import os
import datetime
from gym import wrappers

In [109]:
class MyModel(tf.keras.Model):
    def __init__(self, num_states, hidden_units, num_actions):
        super(MyModel, self).__init__()
        self.input_layer = tf.keras.layers.InputLayer(input_shape=(num_states,))
        self.hidden_layers = []
        for i in hidden_units:
            self.hidden_layers.append(Dense(i, activation='tanh', kernel_initializer='RandomNormal'))
        self.output_layer = Dense(num_actions, activation='linear', kernel_initializer='RandomNormal')
        
    @tf.function
    def call(self, inputs):
        z = self.input_layer(inputs)
        for layer in self.hidden_layers:
            z = layer(z)
        output = self.output_layer(z)
        return output

In [110]:
class DQN:
    def __init__(self, num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr):
        self.num_actions = num_actions
        self.batch_size = batch_size
        self.optimizer = tf.optimizers.Adam(lr)
        self.gamma = gamma
        self.model = MyModel(num_states, hidden_units, num_actions)
        self.experience = {'s': [], 'a': [], 'r': [], 's2': [], 'done': []}  # done (bool): is current state terminal
        self.max_experiences = max_experiences
        self.min_experiences = min_experiences
         
    def predict(self, inputs):
        return self.model(np.atleast_2d(inputs.astype('float32')))
    
    def train(self, TargetNet):
        if len(self.experience['s']) < self.min_experiences:
            return 0
        ids = np.random.randint(low=0, high=len(self.experience['s']), size=self.batch_size)
        states = np.asarray([self.experience['s'][i] for i in ids])
        actions = np.asarray([self.experience['a'][i] for i in ids])
        rewards = np.asarray([self.experience['r'][i] for i in ids])
        states_next = np.asarray([self.experience['s2'][i] for i in ids])
        dones = np.asarray([self.experience['done'][i] for i in ids])
        value_next = np.max(TargetNet.predict(states_next), axis=1)
        actual_values = np.where(dones, rewards, rewards + self.gamma*value_next)
        
        with tf.GradientTape() as tape:
            selected_action_values = tf.math.reduce_sum(
                self.predict(states) * tf.one_hot(actions, self.num_actions), axis=1)
            loss = tf.math.reduce_mean(tf.square(actual_values - selected_action_values))
        variables = self.model.trainable_variables
        gradients = tape.gradient(loss, variables)
        self.optimizer.apply_gradients(zip(gradients, variables))
        return loss
        
    def get_action(self, states, epsilon):
        if np.random.random() < epsilon:
            return np.random.choice(self.num_actions)
        else:
            return np.argmax(self.predict(np.atleast_2d(states))[0])


    def add_experience(self, exp):
        if len(self.experience['s']) >= self.max_experiences:
            for key in self.experience.keys():
                self.experience[key].pop(0)
        for key, value in exp.items():
            self.experience[key].append(value)

    def copy_weights(self, TrainNet):
        variables1 = self.model.trainable_variables
        variables2 = TrainNet.model.trainable_variables
        for v1, v2 in zip(variables1, variables2):
            v1.assign(v2.numpy())

In [111]:
def play_game(env, TrainNet, TargetNet, epsilon, copy_step):
    rewards = 0
    iter = 0
    done = False
    observations = env.reset()
    losses = list()
    while not done:
        action = TrainNet.get_action(observations, epsilon)
        prev_observations = observations
        observations, reward, done, _ = env.step(action)
        rewards += reward
        if done:
            reward = -200
            env.reset()

        exp = {'s': prev_observations, 'a': action, 'r': reward, 's2': observations, 'done': done}
        TrainNet.add_experience(exp)
        loss = TrainNet.train(TargetNet)
        if isinstance(loss, int):
            losses.append(loss)
        else:
            losses.append(loss.numpy())
        iter += 1
        if iter % copy_step == 0:
            TargetNet.copy_weights(TrainNet)
    return rewards, np.mean(losses)

In [112]:
def make_video(env, TrainNet):
    env = wrappers.Monitor(env, os.path.join(os.getcwd(), "videos"), force=True)
    rewards = 0
    steps = 0
    done = False
    observation = env.reset()
    while not done:
        action = TrainNet.get_action(observation, 0)
        observation, reward, done, _ = env.step(action)
        steps += 1
        rewards += reward
    print("Testing steps: {} rewards {}: ".format(steps, rewards))

In [113]:
def main():
    env = gym.make('CartPole-v0')
    gamma = 0.99
    copy_step = 25
    num_states = len(env.observation_space.sample())
    num_actions = env.action_space.n
    hidden_units = [200, 200]
    max_experiences = 10000
    min_experiences = 100
    batch_size = 32
    lr = 1e-2
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = 'logs/dqn/' + current_time
    summary_writer = tf.summary.create_file_writer(log_dir)

    TrainNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    TargetNet = DQN(num_states, num_actions, hidden_units, gamma, max_experiences, min_experiences, batch_size, lr)
    N = 50000
    total_rewards = np.empty(N)
    epsilon = 0.99
    decay = 0.9999
    min_epsilon = 0.1
    for n in range(N):
        epsilon = max(min_epsilon, epsilon * decay)
        total_reward, losses = play_game(env, TrainNet, TargetNet, epsilon, copy_step)
        total_rewards[n] = total_reward
        avg_rewards = total_rewards[max(0, n - 100):(n + 1)].mean()
        with summary_writer.as_default():
            tf.summary.scalar('episode reward', total_reward, step=n)
            tf.summary.scalar('running avg reward(100)', avg_rewards, step=n)
            tf.summary.scalar('average loss)', losses, step=n)
        if True:  # n % 100 == 0:
            print("episode:", n, "episode reward:", total_reward, "epsilon:", epsilon, "avg reward (last 100):", avg_rewards,
                  "episode loss: ", losses)
    print("avg reward for last 100 episodes:", avg_rewards)
    make_video(env, TrainNet)
    env.close()

In [None]:
if __name__ == '__main__':
    main()

episode: 0 episode reward: 21.0 epsilon: 0.989901 avg reward (last 100): 21.0 episode loss:  0.0
episode: 1 episode reward: 22.0 epsilon: 0.9898020099 avg reward (last 100): 21.5 episode loss:  0.0
episode: 2 episode reward: 31.0 epsilon: 0.98970302969901 avg reward (last 100): 24.666666666666668 episode loss:  0.0
episode: 3 episode reward: 22.0 epsilon: 0.9896040593960401 avg reward (last 100): 24.0 episode loss:  0.0
episode: 4 episode reward: 18.0 epsilon: 0.9895050989901005 avg reward (last 100): 22.8 episode loss:  821.255954530504
episode: 5 episode reward: 14.0 epsilon: 0.9894061484802015 avg reward (last 100): 21.333333333333332 episode loss:  1603.0071
episode: 6 episode reward: 11.0 epsilon: 0.9893072078653534 avg reward (last 100): 19.857142857142858 episode loss:  2103.9795
episode: 7 episode reward: 19.0 epsilon: 0.9892082771445669 avg reward (last 100): 19.75 episode loss:  1990.7671
episode: 8 episode reward: 14.0 epsilon: 0.9891093563168525 avg reward (last 100): 19.11

episode: 67 episode reward: 21.0 epsilon: 0.9832905026656819 avg reward (last 100): 22.897058823529413 episode loss:  626.6761
episode: 68 episode reward: 16.0 epsilon: 0.9831921736154153 avg reward (last 100): 22.797101449275363 episode loss:  761.53296
episode: 69 episode reward: 10.0 epsilon: 0.9830938543980539 avg reward (last 100): 22.614285714285714 episode loss:  510.57993
episode: 70 episode reward: 18.0 epsilon: 0.9829955450126141 avg reward (last 100): 22.549295774647888 episode loss:  639.3327
episode: 71 episode reward: 18.0 epsilon: 0.9828972454581129 avg reward (last 100): 22.48611111111111 episode loss:  580.90717
episode: 72 episode reward: 11.0 epsilon: 0.982798955733567 avg reward (last 100): 22.328767123287673 episode loss:  569.0018
episode: 73 episode reward: 14.0 epsilon: 0.9827006758379937 avg reward (last 100): 22.216216216216218 episode loss:  502.8875
episode: 74 episode reward: 17.0 epsilon: 0.9826024057704099 avg reward (last 100): 22.14666666666667 episode 

episode: 132 episode reward: 21.0 epsilon: 0.9769195239571736 avg reward (last 100): 23.693069306930692 episode loss:  290.49994
episode: 133 episode reward: 17.0 epsilon: 0.9768218320047779 avg reward (last 100): 23.742574257425744 episode loss:  392.43628
episode: 134 episode reward: 13.0 epsilon: 0.9767241498215774 avg reward (last 100): 23.702970297029704 episode loss:  263.12408
episode: 135 episode reward: 17.0 epsilon: 0.9766264774065953 avg reward (last 100): 23.475247524752476 episode loss:  242.25456
episode: 136 episode reward: 17.0 epsilon: 0.9765288147588546 avg reward (last 100): 23.504950495049506 episode loss:  302.56952
episode: 137 episode reward: 11.0 epsilon: 0.9764311618773787 avg reward (last 100): 23.366336633663366 episode loss:  193.82422
episode: 138 episode reward: 16.0 epsilon: 0.9763335187611909 avg reward (last 100): 23.207920792079207 episode loss:  374.81012
episode: 139 episode reward: 17.0 epsilon: 0.9762358854093148 avg reward (last 100): 23.217821782

episode: 196 episode reward: 11.0 epsilon: 0.9706868930610726 avg reward (last 100): 23.128712871287128 episode loss:  354.92273
episode: 197 episode reward: 32.0 epsilon: 0.9705898243717664 avg reward (last 100): 23.346534653465348 episode loss:  346.61514
episode: 198 episode reward: 24.0 epsilon: 0.9704927653893293 avg reward (last 100): 23.0 episode loss:  399.4546
episode: 199 episode reward: 12.0 epsilon: 0.9703957161127903 avg reward (last 100): 22.95049504950495 episode loss:  272.55792
episode: 200 episode reward: 11.0 epsilon: 0.9702986765411791 avg reward (last 100): 22.85148514851485 episode loss:  269.54056
episode: 201 episode reward: 21.0 epsilon: 0.9702016466735249 avg reward (last 100): 22.940594059405942 episode loss:  409.76996
episode: 202 episode reward: 18.0 epsilon: 0.9701046265088575 avg reward (last 100): 22.534653465346533 episode loss:  297.17114
episode: 203 episode reward: 16.0 epsilon: 0.9700076160462067 avg reward (last 100): 22.306930693069308 episode lo

episode: 261 episode reward: 14.0 epsilon: 0.9643975762095879 avg reward (last 100): 20.871287128712872 episode loss:  291.67145
episode: 262 episode reward: 40.0 epsilon: 0.9643011364519669 avg reward (last 100): 21.138613861386137 episode loss:  308.75153
episode: 263 episode reward: 14.0 epsilon: 0.9642047063383217 avg reward (last 100): 20.96039603960396 episode loss:  204.1534
episode: 264 episode reward: 17.0 epsilon: 0.9641082858676878 avg reward (last 100): 20.93069306930693 episode loss:  271.23706
episode: 265 episode reward: 40.0 epsilon: 0.964011875039101 avg reward (last 100): 21.14851485148515 episode loss:  223.88058
episode: 266 episode reward: 21.0 epsilon: 0.9639154738515971 avg reward (last 100): 21.198019801980198 episode loss:  215.32977
episode: 267 episode reward: 25.0 epsilon: 0.963819082304212 avg reward (last 100): 21.178217821782177 episode loss:  263.98935
episode: 268 episode reward: 41.0 epsilon: 0.9637227003959816 avg reward (last 100): 21.247524752475247

episode: 327 episode reward: 13.0 epsilon: 0.958053194473202 avg reward (last 100): 20.594059405940595 episode loss:  208.11038
episode: 328 episode reward: 13.0 epsilon: 0.9579573891537547 avg reward (last 100): 20.26732673267327 episode loss:  260.51105
episode: 329 episode reward: 13.0 epsilon: 0.9578615934148393 avg reward (last 100): 20.287128712871286 episode loss:  200.43805
episode: 330 episode reward: 30.0 epsilon: 0.9577658072554979 avg reward (last 100): 20.376237623762375 episode loss:  213.65788
episode: 331 episode reward: 16.0 epsilon: 0.9576700306747723 avg reward (last 100): 20.396039603960396 episode loss:  264.87793
episode: 332 episode reward: 24.0 epsilon: 0.9575742636717048 avg reward (last 100): 20.475247524752476 episode loss:  267.57916
episode: 333 episode reward: 11.0 epsilon: 0.9574785062453377 avg reward (last 100): 20.356435643564357 episode loss:  216.37071
episode: 334 episode reward: 28.0 epsilon: 0.9573827583947132 avg reward (last 100): 20.30693069306

episode: 393 episode reward: 24.0 epsilon: 0.9517505498591503 avg reward (last 100): 21.237623762376238 episode loss:  250.41585
episode: 394 episode reward: 16.0 epsilon: 0.9516553748041644 avg reward (last 100): 21.059405940594058 episode loss:  148.54337
episode: 395 episode reward: 21.0 epsilon: 0.9515602092666839 avg reward (last 100): 21.06930693069307 episode loss:  184.60544
episode: 396 episode reward: 15.0 epsilon: 0.9514650532457573 avg reward (last 100): 21.11881188118812 episode loss:  249.76009
episode: 397 episode reward: 21.0 epsilon: 0.9513699067404326 avg reward (last 100): 21.178217821782177 episode loss:  233.47327
episode: 398 episode reward: 14.0 epsilon: 0.9512747697497587 avg reward (last 100): 21.15841584158416 episode loss:  245.14511
episode: 399 episode reward: 15.0 epsilon: 0.9511796422727837 avg reward (last 100): 21.03960396039604 episode loss:  205.0399
episode: 400 episode reward: 15.0 epsilon: 0.9510845243085565 avg reward (last 100): 20.89108910891089

episode: 458 episode reward: 30.0 epsilon: 0.9455839261883985 avg reward (last 100): 22.514851485148515 episode loss:  252.85524
episode: 459 episode reward: 19.0 epsilon: 0.9454893677957796 avg reward (last 100): 22.485148514851485 episode loss:  225.30489
episode: 460 episode reward: 24.0 epsilon: 0.9453948188590001 avg reward (last 100): 22.336633663366335 episode loss:  263.86118
episode: 461 episode reward: 18.0 epsilon: 0.9453002793771142 avg reward (last 100): 22.326732673267326 episode loss:  233.0452
episode: 462 episode reward: 23.0 epsilon: 0.9452057493491766 avg reward (last 100): 22.217821782178216 episode loss:  205.87358
episode: 463 episode reward: 15.0 epsilon: 0.9451112287742417 avg reward (last 100): 21.81188118811881 episode loss:  228.49376
episode: 464 episode reward: 19.0 epsilon: 0.9450167176513643 avg reward (last 100): 21.683168316831683 episode loss:  213.40988
episode: 465 episode reward: 29.0 epsilon: 0.9449222159795991 avg reward (last 100): 21.82178217821

episode: 524 episode reward: 24.0 epsilon: 0.9393633118489179 avg reward (last 100): 24.376237623762375 episode loss:  330.58124
episode: 525 episode reward: 14.0 epsilon: 0.9392693755177329 avg reward (last 100): 24.425742574257427 episode loss:  259.37274
episode: 526 episode reward: 11.0 epsilon: 0.9391754485801812 avg reward (last 100): 24.14851485148515 episode loss:  320.36465
episode: 527 episode reward: 20.0 epsilon: 0.9390815310353232 avg reward (last 100): 23.940594059405942 episode loss:  322.7015
episode: 528 episode reward: 44.0 epsilon: 0.9389876228822197 avg reward (last 100): 24.059405940594058 episode loss:  324.30197
episode: 529 episode reward: 15.0 epsilon: 0.9388937241199315 avg reward (last 100): 24.06930693069307 episode loss:  162.57506
episode: 530 episode reward: 13.0 epsilon: 0.9387998347475196 avg reward (last 100): 24.03960396039604 episode loss:  239.08939
episode: 531 episode reward: 15.0 epsilon: 0.9387059547640448 avg reward (last 100): 24.0396039603960

episode: 590 episode reward: 12.0 epsilon: 0.933183620416108 avg reward (last 100): 22.574257425742573 episode loss:  426.37967
episode: 591 episode reward: 37.0 epsilon: 0.9330903020540664 avg reward (last 100): 22.683168316831683 episode loss:  295.00186
episode: 592 episode reward: 19.0 epsilon: 0.932996993023861 avg reward (last 100): 22.524752475247524 episode loss:  248.12932
episode: 593 episode reward: 44.0 epsilon: 0.9329036933245586 avg reward (last 100): 22.495049504950494 episode loss:  310.9648
episode: 594 episode reward: 48.0 epsilon: 0.9328104029552261 avg reward (last 100): 22.653465346534652 episode loss:  303.7687
episode: 595 episode reward: 27.0 epsilon: 0.9327171219149306 avg reward (last 100): 22.386138613861387 episode loss:  257.618
episode: 596 episode reward: 22.0 epsilon: 0.9326238502027391 avg reward (last 100): 22.465346534653467 episode loss:  238.99594
episode: 597 episode reward: 18.0 epsilon: 0.9325305878177188 avg reward (last 100): 22.247524752475247

episode: 656 episode reward: 15.0 epsilon: 0.9270445826747113 avg reward (last 100): 23.257425742574256 episode loss:  210.9772
episode: 657 episode reward: 14.0 epsilon: 0.9269518782164439 avg reward (last 100): 23.257425742574256 episode loss:  210.75989
episode: 658 episode reward: 44.0 epsilon: 0.9268591830286222 avg reward (last 100): 23.544554455445546 episode loss:  216.0598
episode: 659 episode reward: 77.0 epsilon: 0.9267664971103194 avg reward (last 100): 24.04950495049505 episode loss:  193.25436
episode: 660 episode reward: 55.0 epsilon: 0.9266738204606084 avg reward (last 100): 24.128712871287128 episode loss:  194.59918
episode: 661 episode reward: 13.0 epsilon: 0.9265811530785624 avg reward (last 100): 24.099009900990097 episode loss:  176.67633
episode: 662 episode reward: 11.0 epsilon: 0.9264884949632545 avg reward (last 100): 23.95049504950495 episode loss:  146.76927
episode: 663 episode reward: 15.0 epsilon: 0.9263958461137581 avg reward (last 100): 23.9108910891089

episode: 720 episode reward: 30.0 epsilon: 0.9211301479988263 avg reward (last 100): 23.059405940594058 episode loss:  214.90315
episode: 721 episode reward: 17.0 epsilon: 0.9210380349840265 avg reward (last 100): 22.95049504950495 episode loss:  271.47357
episode: 722 episode reward: 42.0 epsilon: 0.920945931180528 avg reward (last 100): 23.257425742574256 episode loss:  208.41211
episode: 723 episode reward: 11.0 epsilon: 0.9208538365874099 avg reward (last 100): 23.14851485148515 episode loss:  192.11124
episode: 724 episode reward: 22.0 epsilon: 0.9207617512037513 avg reward (last 100): 23.247524752475247 episode loss:  246.96524
episode: 725 episode reward: 51.0 epsilon: 0.9206696750286308 avg reward (last 100): 23.475247524752476 episode loss:  235.70921
episode: 726 episode reward: 18.0 epsilon: 0.920577608061128 avg reward (last 100): 23.514851485148515 episode loss:  198.20998
episode: 727 episode reward: 14.0 epsilon: 0.9204855503003219 avg reward (last 100): 23.4356435643564

episode: 785 episode reward: 19.0 epsilon: 0.9151619213712362 avg reward (last 100): 20.544554455445546 episode loss:  226.91866
episode: 786 episode reward: 15.0 epsilon: 0.9150704051790991 avg reward (last 100): 20.376237623762375 episode loss:  261.7423
episode: 787 episode reward: 16.0 epsilon: 0.9149788981385812 avg reward (last 100): 20.22772277227723 episode loss:  211.40009
episode: 788 episode reward: 61.0 epsilon: 0.9148874002487674 avg reward (last 100): 20.673267326732674 episode loss:  247.5358
episode: 789 episode reward: 21.0 epsilon: 0.9147959115087425 avg reward (last 100): 20.465346534653467 episode loss:  213.84875
episode: 790 episode reward: 21.0 epsilon: 0.9147044319175917 avg reward (last 100): 20.534653465346533 episode loss:  211.93076
episode: 791 episode reward: 30.0 epsilon: 0.9146129614743999 avg reward (last 100): 20.455445544554454 episode loss:  271.88007
episode: 792 episode reward: 36.0 epsilon: 0.9145215001782524 avg reward (last 100): 20.623762376237

episode: 849 episode reward: 55.0 epsilon: 0.9093232966675666 avg reward (last 100): 20.91089108910891 episode loss:  223.76784
episode: 850 episode reward: 46.0 epsilon: 0.9092323643378999 avg reward (last 100): 21.26732673267327 episode loss:  236.57878
episode: 851 episode reward: 15.0 epsilon: 0.9091414411014661 avg reward (last 100): 21.277227722772277 episode loss:  323.98242
episode: 852 episode reward: 12.0 epsilon: 0.909050526957356 avg reward (last 100): 21.306930693069308 episode loss:  233.53741
episode: 853 episode reward: 14.0 epsilon: 0.9089596219046603 avg reward (last 100): 21.26732673267327 episode loss:  332.89554
episode: 854 episode reward: 16.0 epsilon: 0.9088687259424698 avg reward (last 100): 21.217821782178216 episode loss:  213.49132
episode: 855 episode reward: 20.0 epsilon: 0.9087778390698755 avg reward (last 100): 21.297029702970296 episode loss:  186.27396
episode: 856 episode reward: 25.0 epsilon: 0.9086869612859686 avg reward (last 100): 21.3861386138613

episode: 913 episode reward: 20.0 epsilon: 0.9035219216982168 avg reward (last 100): 22.485148514851485 episode loss:  230.38203
episode: 914 episode reward: 33.0 epsilon: 0.903431569506047 avg reward (last 100): 22.603960396039604 episode loss:  237.43787
episode: 915 episode reward: 44.0 epsilon: 0.9033412263490964 avg reward (last 100): 22.821782178217823 episode loss:  215.1645
episode: 916 episode reward: 15.0 epsilon: 0.9032508922264615 avg reward (last 100): 22.85148514851485 episode loss:  257.28137
episode: 917 episode reward: 27.0 epsilon: 0.9031605671372388 avg reward (last 100): 22.85148514851485 episode loss:  276.81665
episode: 918 episode reward: 32.0 epsilon: 0.9030702510805251 avg reward (last 100): 22.940594059405942 episode loss:  281.95087
episode: 919 episode reward: 28.0 epsilon: 0.902979944055417 avg reward (last 100): 23.00990099009901 episode loss:  247.42517
episode: 920 episode reward: 11.0 epsilon: 0.9028896460610115 avg reward (last 100): 22.940594059405942

episode: 977 episode reward: 19.0 epsilon: 0.8977575588142918 avg reward (last 100): 22.633663366336634 episode loss:  291.54874
episode: 978 episode reward: 13.0 epsilon: 0.8976677830584103 avg reward (last 100): 22.643564356435643 episode loss:  234.14272
episode: 979 episode reward: 40.0 epsilon: 0.8975780162801045 avg reward (last 100): 22.900990099009903 episode loss:  201.918
episode: 980 episode reward: 17.0 epsilon: 0.8974882584784765 avg reward (last 100): 22.831683168316832 episode loss:  197.82913
episode: 981 episode reward: 14.0 epsilon: 0.8973985096526286 avg reward (last 100): 22.594059405940595 episode loss:  177.29646
episode: 982 episode reward: 16.0 epsilon: 0.8973087698016633 avg reward (last 100): 22.366336633663366 episode loss:  274.75684
episode: 983 episode reward: 30.0 epsilon: 0.8972190389246831 avg reward (last 100): 22.297029702970296 episode loss:  302.0249
episode: 984 episode reward: 14.0 epsilon: 0.8971293170207907 avg reward (last 100): 21.930693069306

episode: 1042 episode reward: 32.0 epsilon: 0.8919407688858804 avg reward (last 100): 20.940594059405942 episode loss:  228.92467
episode: 1043 episode reward: 22.0 epsilon: 0.8918515748089918 avg reward (last 100): 21.059405940594058 episode loss:  306.68503
episode: 1044 episode reward: 16.0 epsilon: 0.8917623896515109 avg reward (last 100): 21.059405940594058 episode loss:  228.41751
episode: 1045 episode reward: 30.0 epsilon: 0.8916732134125458 avg reward (last 100): 21.15841584158416 episode loss:  287.12265
episode: 1046 episode reward: 12.0 epsilon: 0.8915840460912046 avg reward (last 100): 21.15841584158416 episode loss:  327.08224
episode: 1047 episode reward: 30.0 epsilon: 0.8914948876865955 avg reward (last 100): 20.99009900990099 episode loss:  240.96135
episode: 1048 episode reward: 17.0 epsilon: 0.8914057381978268 avg reward (last 100): 20.97029702970297 episode loss:  328.7134
episode: 1049 episode reward: 24.0 epsilon: 0.891316597624007 avg reward (last 100): 20.9405940

episode: 1106 episode reward: 28.0 epsilon: 0.8862502923856952 avg reward (last 100): 20.178217821782177 episode loss:  299.24478
episode: 1107 episode reward: 24.0 epsilon: 0.8861616673564566 avg reward (last 100): 20.02970297029703 episode loss:  334.7255
episode: 1108 episode reward: 50.0 epsilon: 0.886073051189721 avg reward (last 100): 20.396039603960396 episode loss:  263.85876
episode: 1109 episode reward: 12.0 epsilon: 0.8859844438846021 avg reward (last 100): 20.287128712871286 episode loss:  333.39914
episode: 1110 episode reward: 10.0 epsilon: 0.8858958454402136 avg reward (last 100): 20.237623762376238 episode loss:  269.47446
episode: 1111 episode reward: 28.0 epsilon: 0.8858072558556696 avg reward (last 100): 20.396039603960396 episode loss:  232.84706
episode: 1112 episode reward: 27.0 epsilon: 0.885718675130084 avg reward (last 100): 20.396039603960396 episode loss:  243.71051
episode: 1113 episode reward: 28.0 epsilon: 0.885630103262571 avg reward (last 100): 20.594059

episode: 1170 episode reward: 31.0 epsilon: 0.8805961204518324 avg reward (last 100): 21.871287128712872 episode loss:  205.53369
episode: 1171 episode reward: 43.0 epsilon: 0.8805080608397873 avg reward (last 100): 21.99009900990099 episode loss:  177.55357
episode: 1172 episode reward: 25.0 epsilon: 0.8804200100337033 avg reward (last 100): 21.980198019801982 episode loss:  192.91875
episode: 1173 episode reward: 27.0 epsilon: 0.8803319680327 avg reward (last 100): 21.96039603960396 episode loss:  148.75014
episode: 1174 episode reward: 13.0 epsilon: 0.8802439348358967 avg reward (last 100): 21.97029702970297 episode loss:  223.15967
episode: 1175 episode reward: 18.0 epsilon: 0.8801559104424131 avg reward (last 100): 22.019801980198018 episode loss:  279.71252
episode: 1176 episode reward: 15.0 epsilon: 0.8800678948513689 avg reward (last 100): 21.96039603960396 episode loss:  264.67105
episode: 1177 episode reward: 22.0 epsilon: 0.8799798880618838 avg reward (last 100): 22.00990099

episode: 1234 episode reward: 26.0 epsilon: 0.8749780214654566 avg reward (last 100): 22.336633663366335 episode loss:  191.14223
episode: 1235 episode reward: 28.0 epsilon: 0.87489052366331 avg reward (last 100): 22.415841584158414 episode loss:  241.07568
episode: 1236 episode reward: 9.0 epsilon: 0.8748030346109437 avg reward (last 100): 22.14851485148515 episode loss:  182.4205
episode: 1237 episode reward: 41.0 epsilon: 0.8747155543074826 avg reward (last 100): 22.198019801980198 episode loss:  173.48074
episode: 1238 episode reward: 41.0 epsilon: 0.8746280827520518 avg reward (last 100): 22.376237623762375 episode loss:  134.87482
episode: 1239 episode reward: 10.0 epsilon: 0.8745406199437766 avg reward (last 100): 22.316831683168317 episode loss:  197.24268
episode: 1240 episode reward: 28.0 epsilon: 0.8744531658817822 avg reward (last 100): 22.396039603960396 episode loss:  222.14754
episode: 1241 episode reward: 10.0 epsilon: 0.874365720565194 avg reward (last 100): 22.0495049

episode: 1299 episode reward: 11.0 epsilon: 0.8693088257089054 avg reward (last 100): 22.683168316831683 episode loss:  264.21478
episode: 1300 episode reward: 12.0 epsilon: 0.8692218948263346 avg reward (last 100): 22.673267326732674 episode loss:  260.67133
episode: 1301 episode reward: 23.0 epsilon: 0.869134972636852 avg reward (last 100): 22.782178217821784 episode loss:  258.76917
episode: 1302 episode reward: 12.0 epsilon: 0.8690480591395884 avg reward (last 100): 22.752475247524753 episode loss:  249.61353
episode: 1303 episode reward: 27.0 epsilon: 0.8689611543336744 avg reward (last 100): 22.73267326732673 episode loss:  338.75464
episode: 1304 episode reward: 14.0 epsilon: 0.868874258218241 avg reward (last 100): 22.455445544554454 episode loss:  250.27942
episode: 1305 episode reward: 16.0 epsilon: 0.8687873707924192 avg reward (last 100): 22.425742574257427 episode loss:  339.34753
episode: 1306 episode reward: 48.0 epsilon: 0.86870049205534 avg reward (last 100): 22.772277

episode: 1363 episode reward: 19.0 epsilon: 0.86376273832658 avg reward (last 100): 23.396039603960396 episode loss:  253.57288
episode: 1364 episode reward: 13.0 epsilon: 0.8636763620527473 avg reward (last 100): 23.376237623762375 episode loss:  189.91745
episode: 1365 episode reward: 13.0 epsilon: 0.863589994416542 avg reward (last 100): 23.26732673267327 episode loss:  273.21075
episode: 1366 episode reward: 14.0 epsilon: 0.8635036354171004 avg reward (last 100): 23.18811881188119 episode loss:  198.9217
episode: 1367 episode reward: 62.0 epsilon: 0.8634172850535587 avg reward (last 100): 23.465346534653467 episode loss:  257.86133
episode: 1368 episode reward: 19.0 epsilon: 0.8633309433250534 avg reward (last 100): 23.534653465346533 episode loss:  225.06825
episode: 1369 episode reward: 10.0 epsilon: 0.863244610230721 avg reward (last 100): 23.10891089108911 episode loss:  141.75146
episode: 1370 episode reward: 26.0 epsilon: 0.8631582857696979 avg reward (last 100): 23.227722772

episode: 1427 episode reward: 28.0 epsilon: 0.8582520343251 avg reward (last 100): 23.316831683168317 episode loss:  251.2009
episode: 1428 episode reward: 16.0 epsilon: 0.8581662091216675 avg reward (last 100): 23.168316831683168 episode loss:  211.42828
episode: 1429 episode reward: 12.0 epsilon: 0.8580803925007554 avg reward (last 100): 23.04950495049505 episode loss:  202.80878
episode: 1430 episode reward: 50.0 epsilon: 0.8579945844615053 avg reward (last 100): 23.326732673267326 episode loss:  232.63896
episode: 1431 episode reward: 24.0 epsilon: 0.8579087850030592 avg reward (last 100): 23.415841584158414 episode loss:  173.20807
episode: 1432 episode reward: 9.0 epsilon: 0.857822994124559 avg reward (last 100): 23.336633663366335 episode loss:  221.26926
episode: 1433 episode reward: 9.0 epsilon: 0.8577372118251465 avg reward (last 100): 22.93069306930693 episode loss:  112.784676
episode: 1434 episode reward: 16.0 epsilon: 0.857651438103964 avg reward (last 100): 22.9603960396

episode: 1491 episode reward: 9.0 epsilon: 0.8527764879626851 avg reward (last 100): 20.673267326732674 episode loss:  228.15785
episode: 1492 episode reward: 20.0 epsilon: 0.8526912103138888 avg reward (last 100): 20.693069306930692 episode loss:  237.46768
episode: 1493 episode reward: 37.0 epsilon: 0.8526059411928575 avg reward (last 100): 20.89108910891089 episode loss:  247.76498
episode: 1494 episode reward: 20.0 epsilon: 0.8525206805987382 avg reward (last 100): 20.821782178217823 episode loss:  186.60532
episode: 1495 episode reward: 11.0 epsilon: 0.8524354285306783 avg reward (last 100): 20.821782178217823 episode loss:  160.6253
episode: 1496 episode reward: 13.0 epsilon: 0.8523501849878252 avg reward (last 100): 20.84158415841584 episode loss:  265.63895
episode: 1497 episode reward: 13.0 epsilon: 0.8522649499693264 avg reward (last 100): 20.485148514851485 episode loss:  229.28459
episode: 1498 episode reward: 12.0 epsilon: 0.8521797234743296 avg reward (last 100): 20.43564

episode: 1555 episode reward: 32.0 epsilon: 0.8473358749377602 avg reward (last 100): 20.435643564356436 episode loss:  259.24695
episode: 1556 episode reward: 62.0 epsilon: 0.8472511413502665 avg reward (last 100): 20.95049504950495 episode loss:  240.85716
episode: 1557 episode reward: 23.0 epsilon: 0.8471664162361314 avg reward (last 100): 20.940594059405942 episode loss:  189.37924
episode: 1558 episode reward: 45.0 epsilon: 0.8470816995945079 avg reward (last 100): 21.059405940594058 episode loss:  230.36653
episode: 1559 episode reward: 9.0 epsilon: 0.8469969914245484 avg reward (last 100): 20.96039603960396 episode loss:  197.07397
episode: 1560 episode reward: 12.0 epsilon: 0.8469122917254059 avg reward (last 100): 20.91089108910891 episode loss:  204.81226
episode: 1561 episode reward: 51.0 epsilon: 0.8468276004962334 avg reward (last 100): 21.14851485148515 episode loss:  227.0443
episode: 1562 episode reward: 17.0 epsilon: 0.8467429177361838 avg reward (last 100): 21.0099009

episode: 1620 episode reward: 9.0 epsilon: 0.8418457793825311 avg reward (last 100): 21.316831683168317 episode loss:  285.15375
episode: 1621 episode reward: 19.0 epsilon: 0.8417615948045929 avg reward (last 100): 21.207920792079207 episode loss:  271.27316
episode: 1622 episode reward: 12.0 epsilon: 0.8416774186451125 avg reward (last 100): 21.22772277227723 episode loss:  249.21367
episode: 1623 episode reward: 23.0 epsilon: 0.841593250903248 avg reward (last 100): 21.287128712871286 episode loss:  328.66507
episode: 1624 episode reward: 18.0 epsilon: 0.8415090915781577 avg reward (last 100): 21.356435643564357 episode loss:  297.0222
episode: 1625 episode reward: 26.0 epsilon: 0.8414249406689999 avg reward (last 100): 21.14851485148515 episode loss:  239.07326
episode: 1626 episode reward: 22.0 epsilon: 0.841340798174933 avg reward (last 100): 21.198019801980198 episode loss:  233.70499
episode: 1627 episode reward: 22.0 epsilon: 0.8412566640951156 avg reward (last 100): 21.1782178

episode: 1684 episode reward: 14.0 epsilon: 0.8364749029841582 avg reward (last 100): 20.455445544554454 episode loss:  250.84612
episode: 1685 episode reward: 13.0 epsilon: 0.8363912554938597 avg reward (last 100): 20.445544554455445 episode loss:  343.9719
episode: 1686 episode reward: 23.0 epsilon: 0.8363076163683103 avg reward (last 100): 20.534653465346533 episode loss:  333.28018
episode: 1687 episode reward: 8.0 epsilon: 0.8362239856066734 avg reward (last 100): 20.495049504950494 episode loss:  283.41193
episode: 1688 episode reward: 10.0 epsilon: 0.8361403632081128 avg reward (last 100): 20.455445544554454 episode loss:  229.8737
episode: 1689 episode reward: 18.0 epsilon: 0.836056749171792 avg reward (last 100): 20.504950495049506 episode loss:  288.4128
episode: 1690 episode reward: 14.0 epsilon: 0.8359731434968748 avg reward (last 100): 20.425742574257427 episode loss:  322.76978
episode: 1691 episode reward: 53.0 epsilon: 0.8358895461825251 avg reward (last 100): 20.663366

episode: 1749 episode reward: 35.0 epsilon: 0.8310551783120836 avg reward (last 100): 21.495049504950494 episode loss:  243.23653
episode: 1750 episode reward: 16.0 epsilon: 0.8309720727942524 avg reward (last 100): 21.455445544554454 episode loss:  254.32217
episode: 1751 episode reward: 51.0 epsilon: 0.830888975586973 avg reward (last 100): 21.792079207920793 episode loss:  280.87918
episode: 1752 episode reward: 18.0 epsilon: 0.8308058866894144 avg reward (last 100): 21.564356435643564 episode loss:  176.30653
episode: 1753 episode reward: 27.0 epsilon: 0.8307228061007454 avg reward (last 100): 21.623762376237625 episode loss:  223.55388
episode: 1754 episode reward: 23.0 epsilon: 0.8306397338201353 avg reward (last 100): 21.564356435643564 episode loss:  300.24857
episode: 1755 episode reward: 10.0 epsilon: 0.8305566698467534 avg reward (last 100): 21.554455445544555 episode loss:  243.6324
episode: 1756 episode reward: 10.0 epsilon: 0.8304736141797687 avg reward (last 100): 21.287

episode: 1814 episode reward: 14.0 epsilon: 0.8256705693564722 avg reward (last 100): 21.603960396039604 episode loss:  354.38153
episode: 1815 episode reward: 16.0 epsilon: 0.8255880022995365 avg reward (last 100): 21.663366336633665 episode loss:  233.25192
episode: 1816 episode reward: 29.0 epsilon: 0.8255054434993067 avg reward (last 100): 21.81188118811881 episode loss:  205.06026
episode: 1817 episode reward: 19.0 epsilon: 0.8254228929549567 avg reward (last 100): 21.722772277227723 episode loss:  265.98547
episode: 1818 episode reward: 16.0 epsilon: 0.8253403506656612 avg reward (last 100): 21.613861386138613 episode loss:  165.8576
episode: 1819 episode reward: 15.0 epsilon: 0.8252578166305947 avg reward (last 100): 21.237623762376238 episode loss:  144.91891
episode: 1820 episode reward: 17.0 epsilon: 0.8251752908489317 avg reward (last 100): 21.247524752475247 episode loss:  199.06357
episode: 1821 episode reward: 13.0 epsilon: 0.8250927733198469 avg reward (last 100): 21.257

episode: 1879 episode reward: 12.0 epsilon: 0.820320848594041 avg reward (last 100): 21.722772277227723 episode loss:  236.19646
episode: 1880 episode reward: 20.0 epsilon: 0.8202388165091816 avg reward (last 100): 21.831683168316832 episode loss:  150.14615
episode: 1881 episode reward: 12.0 epsilon: 0.8201567926275307 avg reward (last 100): 21.485148514851485 episode loss:  169.17357
episode: 1882 episode reward: 31.0 epsilon: 0.820074776948268 avg reward (last 100): 21.633663366336634 episode loss:  158.67496
episode: 1883 episode reward: 17.0 epsilon: 0.8199927694705732 avg reward (last 100): 21.693069306930692 episode loss:  160.61967
episode: 1884 episode reward: 24.0 epsilon: 0.8199107701936261 avg reward (last 100): 21.693069306930692 episode loss:  230.57513
episode: 1885 episode reward: 22.0 epsilon: 0.8198287791166068 avg reward (last 100): 21.73267326732673 episode loss:  258.54605
episode: 1886 episode reward: 19.0 epsilon: 0.8197467962386952 avg reward (last 100): 21.6831

episode: 1943 episode reward: 30.0 epsilon: 0.8150872987055585 avg reward (last 100): 21.06930693069307 episode loss:  149.1804
episode: 1944 episode reward: 20.0 epsilon: 0.815005789975688 avg reward (last 100): 21.15841584158416 episode loss:  187.71884
episode: 1945 episode reward: 12.0 epsilon: 0.8149242893966904 avg reward (last 100): 21.019801980198018 episode loss:  182.85876
episode: 1946 episode reward: 32.0 epsilon: 0.8148427969677507 avg reward (last 100): 21.198019801980198 episode loss:  142.2844
episode: 1947 episode reward: 12.0 epsilon: 0.814761312688054 avg reward (last 100): 21.128712871287128 episode loss:  149.96516
episode: 1948 episode reward: 20.0 epsilon: 0.8146798365567852 avg reward (last 100): 21.06930693069307 episode loss:  204.6127
episode: 1949 episode reward: 16.0 epsilon: 0.8145983685731295 avg reward (last 100): 20.85148514851485 episode loss:  193.88736
episode: 1950 episode reward: 16.0 epsilon: 0.8145169087362722 avg reward (last 100): 20.5841584158

episode: 2008 episode reward: 11.0 epsilon: 0.80980614953189 avg reward (last 100): 21.524752475247524 episode loss:  244.87706
episode: 2009 episode reward: 14.0 epsilon: 0.8097251689169368 avg reward (last 100): 21.455445544554454 episode loss:  194.9525
episode: 2010 episode reward: 41.0 epsilon: 0.8096441964000451 avg reward (last 100): 21.564356435643564 episode loss:  159.08664
episode: 2011 episode reward: 16.0 epsilon: 0.8095632319804051 avg reward (last 100): 21.504950495049506 episode loss:  175.76453
episode: 2012 episode reward: 13.0 epsilon: 0.8094822756572071 avg reward (last 100): 21.425742574257427 episode loss:  364.01117
episode: 2013 episode reward: 62.0 epsilon: 0.8094013274296414 avg reward (last 100): 21.84158415841584 episode loss:  221.18997
episode: 2014 episode reward: 11.0 epsilon: 0.8093203872968985 avg reward (last 100): 21.693069306930692 episode loss:  309.73303
episode: 2015 episode reward: 15.0 epsilon: 0.8092394552581688 avg reward (last 100): 21.67326

episode: 2072 episode reward: 45.0 epsilon: 0.8046396821784888 avg reward (last 100): 22.85148514851485 episode loss:  251.68471
episode: 2073 episode reward: 50.0 epsilon: 0.804559218210271 avg reward (last 100): 23.02970297029703 episode loss:  247.6065
episode: 2074 episode reward: 12.0 epsilon: 0.80447876228845 avg reward (last 100): 23.0 episode loss:  228.15962
episode: 2075 episode reward: 8.0 epsilon: 0.8043983144122211 avg reward (last 100): 22.643564356435643 episode loss:  310.27933
episode: 2076 episode reward: 13.0 epsilon: 0.8043178745807799 avg reward (last 100): 22.643564356435643 episode loss:  409.00638
episode: 2077 episode reward: 30.0 epsilon: 0.8042374427933218 avg reward (last 100): 22.742574257425744 episode loss:  317.51288
episode: 2078 episode reward: 45.0 epsilon: 0.8041570190490425 avg reward (last 100): 23.019801980198018 episode loss:  285.42148
episode: 2079 episode reward: 19.0 epsilon: 0.8040766033471376 avg reward (last 100): 23.06930693069307 episode

episode: 2138 episode reward: 17.0 epsilon: 0.7993462830349023 avg reward (last 100): 21.594059405940595 episode loss:  284.76038
episode: 2139 episode reward: 16.0 epsilon: 0.7992663484065988 avg reward (last 100): 21.594059405940595 episode loss:  229.87532
episode: 2140 episode reward: 38.0 epsilon: 0.7991864217717581 avg reward (last 100): 21.673267326732674 episode loss:  228.81805
episode: 2141 episode reward: 11.0 epsilon: 0.799106503129581 avg reward (last 100): 21.346534653465348 episode loss:  270.01962
episode: 2142 episode reward: 22.0 epsilon: 0.799026592479268 avg reward (last 100): 21.306930693069308 episode loss:  171.45201
episode: 2143 episode reward: 12.0 epsilon: 0.79894668982002 avg reward (last 100): 21.08910891089109 episode loss:  205.89363
episode: 2144 episode reward: 29.0 epsilon: 0.7988667951510381 avg reward (last 100): 21.277227722772277 episode loss:  246.62909
episode: 2145 episode reward: 10.0 epsilon: 0.798786908471523 avg reward (last 100): 21.2376237

episode: 2204 episode reward: 12.0 epsilon: 0.794087707024097 avg reward (last 100): 19.485148514851485 episode loss:  184.8466
episode: 2205 episode reward: 9.0 epsilon: 0.7940082982533946 avg reward (last 100): 19.356435643564357 episode loss:  247.85927
episode: 2206 episode reward: 33.0 epsilon: 0.7939288974235692 avg reward (last 100): 19.594059405940595 episode loss:  223.58131
episode: 2207 episode reward: 11.0 epsilon: 0.7938495045338269 avg reward (last 100): 19.455445544554454 episode loss:  250.15527
episode: 2208 episode reward: 35.0 epsilon: 0.7937701195833735 avg reward (last 100): 19.693069306930692 episode loss:  238.2671
episode: 2209 episode reward: 23.0 epsilon: 0.7936907425714151 avg reward (last 100): 19.643564356435643 episode loss:  206.58508
episode: 2210 episode reward: 11.0 epsilon: 0.793611373497158 avg reward (last 100): 19.613861386138613 episode loss:  199.48482
episode: 2211 episode reward: 20.0 epsilon: 0.7935320123598083 avg reward (last 100): 19.683168

episode: 2268 episode reward: 62.0 epsilon: 0.7890215214728407 avg reward (last 100): 22.801980198019802 episode loss:  170.69025
episode: 2269 episode reward: 12.0 epsilon: 0.7889426193206934 avg reward (last 100): 22.782178217821784 episode loss:  152.34169
episode: 2270 episode reward: 22.0 epsilon: 0.7888637250587613 avg reward (last 100): 22.861386138613863 episode loss:  177.98973
episode: 2271 episode reward: 27.0 epsilon: 0.7887848386862555 avg reward (last 100): 22.99009900990099 episode loss:  169.56526
episode: 2272 episode reward: 19.0 epsilon: 0.7887059602023868 avg reward (last 100): 22.95049504950495 episode loss:  186.61588
episode: 2273 episode reward: 9.0 epsilon: 0.7886270896063666 avg reward (last 100): 22.900990099009903 episode loss:  242.25409
episode: 2274 episode reward: 36.0 epsilon: 0.788548226897406 avg reward (last 100): 23.0 episode loss:  184.9446
episode: 2275 episode reward: 16.0 epsilon: 0.7884693720747162 avg reward (last 100): 23.03960396039604 episo

episode: 2332 episode reward: 14.0 epsilon: 0.7839876575855673 avg reward (last 100): 21.11881188118812 episode loss:  166.4089
episode: 2333 episode reward: 12.0 epsilon: 0.7839092588198088 avg reward (last 100): 20.84158415841584 episode loss:  159.10687
episode: 2334 episode reward: 23.0 epsilon: 0.7838308678939268 avg reward (last 100): 20.89108910891089 episode loss:  211.17119
episode: 2335 episode reward: 14.0 epsilon: 0.7837524848071373 avg reward (last 100): 20.92079207920792 episode loss:  229.75789
episode: 2336 episode reward: 19.0 epsilon: 0.7836741095586566 avg reward (last 100): 21.0 episode loss:  225.93127
episode: 2337 episode reward: 12.0 epsilon: 0.7835957421477007 avg reward (last 100): 20.722772277227723 episode loss:  168.6954
episode: 2338 episode reward: 17.0 epsilon: 0.7835173825734859 avg reward (last 100): 20.594059405940595 episode loss:  221.13739
episode: 2339 episode reward: 16.0 epsilon: 0.7834390308352286 avg reward (last 100): 20.643564356435643 episo

episode: 2397 episode reward: 31.0 epsilon: 0.7789080105629727 avg reward (last 100): 19.415841584158414 episode loss:  177.01086
episode: 2398 episode reward: 23.0 epsilon: 0.7788301197619164 avg reward (last 100): 19.534653465346533 episode loss:  190.44623
episode: 2399 episode reward: 12.0 epsilon: 0.7787522367499402 avg reward (last 100): 19.445544554455445 episode loss:  139.895
episode: 2400 episode reward: 13.0 epsilon: 0.7786743615262652 avg reward (last 100): 19.405940594059405 episode loss:  197.70598
episode: 2401 episode reward: 23.0 epsilon: 0.7785964940901127 avg reward (last 100): 19.26732673267327 episode loss:  217.69032
episode: 2402 episode reward: 15.0 epsilon: 0.7785186344407037 avg reward (last 100): 19.287128712871286 episode loss:  167.50153
episode: 2403 episode reward: 21.0 epsilon: 0.7784407825772596 avg reward (last 100): 19.366336633663366 episode loss:  162.37659
episode: 2404 episode reward: 20.0 epsilon: 0.7783629384990018 avg reward (last 100): 19.2574

episode: 2461 episode reward: 26.0 epsilon: 0.7739386696778702 avg reward (last 100): 20.93069306930693 episode loss:  376.9017
episode: 2462 episode reward: 15.0 epsilon: 0.7738612758109024 avg reward (last 100): 20.475247524752476 episode loss:  222.59453
episode: 2463 episode reward: 17.0 epsilon: 0.7737838896833212 avg reward (last 100): 20.415841584158414 episode loss:  275.61307
episode: 2464 episode reward: 12.0 epsilon: 0.7737065112943529 avg reward (last 100): 20.316831683168317 episode loss:  285.5065
episode: 2465 episode reward: 9.0 epsilon: 0.7736291406432234 avg reward (last 100): 20.26732673267327 episode loss:  275.63007
episode: 2466 episode reward: 16.0 epsilon: 0.7735517777291591 avg reward (last 100): 20.138613861386137 episode loss:  271.83838
episode: 2467 episode reward: 32.0 epsilon: 0.7734744225513862 avg reward (last 100): 20.247524752475247 episode loss:  226.12718
episode: 2468 episode reward: 24.0 epsilon: 0.773397075109131 avg reward (last 100): 19.7821782

episode: 2525 episode reward: 23.0 epsilon: 0.7690010325992473 avg reward (last 100): 19.455445544554454 episode loss:  236.52586
episode: 2526 episode reward: 11.0 epsilon: 0.7689241324959875 avg reward (last 100): 19.455445544554454 episode loss:  168.01253
episode: 2527 episode reward: 10.0 epsilon: 0.7688472400827379 avg reward (last 100): 19.405940594059405 episode loss:  217.35698
episode: 2528 episode reward: 24.0 epsilon: 0.7687703553587296 avg reward (last 100): 19.524752475247524 episode loss:  241.81548
episode: 2529 episode reward: 15.0 epsilon: 0.7686934783231938 avg reward (last 100): 19.168316831683168 episode loss:  204.59592
episode: 2530 episode reward: 15.0 epsilon: 0.7686166089753614 avg reward (last 100): 19.15841584158416 episode loss:  276.9655
episode: 2531 episode reward: 10.0 epsilon: 0.768539747314464 avg reward (last 100): 19.10891089108911 episode loss:  139.63348
episode: 2532 episode reward: 12.0 epsilon: 0.7684628933397325 avg reward (last 100): 19.09900

episode: 2590 episode reward: 26.0 epsilon: 0.7640184875708673 avg reward (last 100): 19.415841584158414 episode loss:  191.72923
episode: 2591 episode reward: 12.0 epsilon: 0.7639420857221102 avg reward (last 100): 19.425742574257427 episode loss:  180.36113
episode: 2592 episode reward: 29.0 epsilon: 0.763865691513538 avg reward (last 100): 19.603960396039604 episode loss:  183.76956
episode: 2593 episode reward: 17.0 epsilon: 0.7637893049443867 avg reward (last 100): 19.673267326732674 episode loss:  167.25473
episode: 2594 episode reward: 9.0 epsilon: 0.7637129260138923 avg reward (last 100): 19.504950495049506 episode loss:  150.63632
episode: 2595 episode reward: 20.0 epsilon: 0.7636365547212909 avg reward (last 100): 19.495049504950494 episode loss:  201.77286
episode: 2596 episode reward: 24.0 epsilon: 0.7635601910658187 avg reward (last 100): 19.623762376237625 episode loss:  201.34521
episode: 2597 episode reward: 22.0 epsilon: 0.7634838350467121 avg reward (last 100): 19.495

episode: 2655 episode reward: 23.0 epsilon: 0.759068225665535 avg reward (last 100): 21.584158415841586 episode loss:  248.24849
episode: 2656 episode reward: 14.0 epsilon: 0.7589923188429685 avg reward (last 100): 21.574257425742573 episode loss:  180.09085
episode: 2657 episode reward: 13.0 epsilon: 0.7589164196110841 avg reward (last 100): 21.584158415841586 episode loss:  202.44249
episode: 2658 episode reward: 14.0 epsilon: 0.758840527969123 avg reward (last 100): 21.485148514851485 episode loss:  212.86919
episode: 2659 episode reward: 35.0 epsilon: 0.7587646439163261 avg reward (last 100): 21.603960396039604 episode loss:  210.19589
episode: 2660 episode reward: 18.0 epsilon: 0.7586887674519345 avg reward (last 100): 21.099009900990097 episode loss:  238.42616
episode: 2661 episode reward: 20.0 epsilon: 0.7586128985751893 avg reward (last 100): 21.15841584158416 episode loss:  191.9423
episode: 2662 episode reward: 15.0 epsilon: 0.7585370372853318 avg reward (last 100): 21.16831

episode: 2719 episode reward: 45.0 epsilon: 0.7542254602590588 avg reward (last 100): 20.871287128712872 episode loss:  174.693
episode: 2720 episode reward: 40.0 epsilon: 0.7541500377130329 avg reward (last 100): 21.04950495049505 episode loss:  213.03409
episode: 2721 episode reward: 33.0 epsilon: 0.7540746227092616 avg reward (last 100): 21.22772277227723 episode loss:  198.38684
episode: 2722 episode reward: 12.0 epsilon: 0.7539992152469907 avg reward (last 100): 21.059405940594058 episode loss:  146.50638
episode: 2723 episode reward: 13.0 epsilon: 0.7539238153254659 avg reward (last 100): 20.96039603960396 episode loss:  162.89308
episode: 2724 episode reward: 15.0 epsilon: 0.7538484229439334 avg reward (last 100): 20.980198019801982 episode loss:  250.64774
episode: 2725 episode reward: 15.0 epsilon: 0.753773038101639 avg reward (last 100): 20.831683168316832 episode loss:  166.82767
episode: 2726 episode reward: 19.0 epsilon: 0.7536976607978289 avg reward (last 100): 20.9009900

episode: 2784 episode reward: 33.0 epsilon: 0.7493386497633826 avg reward (last 100): 21.06930693069307 episode loss:  239.1382
episode: 2785 episode reward: 10.0 epsilon: 0.7492637158984062 avg reward (last 100): 21.04950495049505 episode loss:  264.81015
episode: 2786 episode reward: 19.0 epsilon: 0.7491887895268164 avg reward (last 100): 21.099009900990097 episode loss:  193.90584
episode: 2787 episode reward: 18.0 epsilon: 0.7491138706478637 avg reward (last 100): 21.128712871287128 episode loss:  258.5317
episode: 2788 episode reward: 16.0 epsilon: 0.749038959260799 avg reward (last 100): 21.10891089108911 episode loss:  191.98438
episode: 2789 episode reward: 15.0 epsilon: 0.7489640553648729 avg reward (last 100): 21.138613861386137 episode loss:  240.27766
episode: 2790 episode reward: 12.0 epsilon: 0.7488891589593364 avg reward (last 100): 21.099009900990097 episode loss:  282.12885
episode: 2791 episode reward: 16.0 epsilon: 0.7488142700434405 avg reward (last 100): 21.1584158

episode: 2848 episode reward: 15.0 epsilon: 0.7445579578991856 avg reward (last 100): 21.871287128712872 episode loss:  188.91197
episode: 2849 episode reward: 15.0 epsilon: 0.7444835021033956 avg reward (last 100): 21.663366336633665 episode loss:  196.07314
episode: 2850 episode reward: 19.0 epsilon: 0.7444090537531853 avg reward (last 100): 21.633663366336634 episode loss:  237.61104
episode: 2851 episode reward: 20.0 epsilon: 0.74433461284781 avg reward (last 100): 21.683168316831683 episode loss:  206.27498
episode: 2852 episode reward: 12.0 epsilon: 0.7442601793865252 avg reward (last 100): 21.534653465346533 episode loss:  262.15622
episode: 2853 episode reward: 31.0 epsilon: 0.7441857533685866 avg reward (last 100): 21.603960396039604 episode loss:  184.64584
episode: 2854 episode reward: 14.0 epsilon: 0.7441113347932498 avg reward (last 100): 21.554455445544555 episode loss:  214.00749
episode: 2855 episode reward: 34.0 epsilon: 0.7440369236597705 avg reward (last 100): 21.742

episode: 2912 episode reward: 39.0 epsilon: 0.7398077662830506 avg reward (last 100): 19.425742574257427 episode loss:  151.34993
episode: 2913 episode reward: 27.0 epsilon: 0.7397337855064222 avg reward (last 100): 19.584158415841586 episode loss:  163.53409
episode: 2914 episode reward: 14.0 epsilon: 0.7396598121278716 avg reward (last 100): 19.603960396039604 episode loss:  245.02794
episode: 2915 episode reward: 12.0 epsilon: 0.7395858461466588 avg reward (last 100): 19.623762376237625 episode loss:  182.13261
episode: 2916 episode reward: 10.0 epsilon: 0.7395118875620441 avg reward (last 100): 19.584158415841586 episode loss:  216.78879
episode: 2917 episode reward: 31.0 epsilon: 0.7394379363732879 avg reward (last 100): 19.633663366336634 episode loss:  239.64398
episode: 2918 episode reward: 12.0 epsilon: 0.7393639925796506 avg reward (last 100): 19.584158415841586 episode loss:  98.917725
episode: 2919 episode reward: 12.0 epsilon: 0.7392900561803926 avg reward (last 100): 19.4

episode: 2977 episode reward: 10.0 epsilon: 0.7350143715389735 avg reward (last 100): 21.06930693069307 episode loss:  227.74704
episode: 2978 episode reward: 13.0 epsilon: 0.7349408701018196 avg reward (last 100): 20.96039603960396 episode loss:  228.16634
episode: 2979 episode reward: 9.0 epsilon: 0.7348673760148094 avg reward (last 100): 20.782178217821784 episode loss:  235.89862
episode: 2980 episode reward: 19.0 epsilon: 0.734793889277208 avg reward (last 100): 20.821782178217823 episode loss:  244.992
episode: 2981 episode reward: 25.0 epsilon: 0.7347204098882802 avg reward (last 100): 20.88118811881188 episode loss:  217.48074
episode: 2982 episode reward: 9.0 epsilon: 0.7346469378472914 avg reward (last 100): 20.85148514851485 episode loss:  204.04791
episode: 2983 episode reward: 25.0 epsilon: 0.7345734731535066 avg reward (last 100): 20.88118811881188 episode loss:  218.71576
episode: 2984 episode reward: 20.0 epsilon: 0.7345000158061913 avg reward (last 100): 20.96039603960

episode: 3041 episode reward: 35.0 epsilon: 0.7303250668738611 avg reward (last 100): 21.356435643564357 episode loss:  235.6212
episode: 3042 episode reward: 21.0 epsilon: 0.7302520343671738 avg reward (last 100): 21.405940594059405 episode loss:  206.66397
episode: 3043 episode reward: 11.0 epsilon: 0.7301790091637371 avg reward (last 100): 21.386138613861387 episode loss:  217.30821
episode: 3044 episode reward: 15.0 epsilon: 0.7301059912628207 avg reward (last 100): 21.366336633663366 episode loss:  196.7671
episode: 3045 episode reward: 11.0 epsilon: 0.7300329806636945 avg reward (last 100): 21.316831683168317 episode loss:  248.38939
episode: 3046 episode reward: 9.0 epsilon: 0.7299599773656281 avg reward (last 100): 21.277227722772277 episode loss:  208.80986
episode: 3047 episode reward: 22.0 epsilon: 0.7298869813678915 avg reward (last 100): 21.08910891089109 episode loss:  158.69533
episode: 3048 episode reward: 17.0 epsilon: 0.7298139926697548 avg reward (last 100): 20.99009

episode: 3105 episode reward: 41.0 epsilon: 0.725665679417301 avg reward (last 100): 21.18811881188119 episode loss:  232.79425
episode: 3106 episode reward: 18.0 epsilon: 0.7255931128493592 avg reward (last 100): 21.207920792079207 episode loss:  170.97957
episode: 3107 episode reward: 32.0 epsilon: 0.7255205535380743 avg reward (last 100): 21.386138613861387 episode loss:  228.2362
episode: 3108 episode reward: 44.0 epsilon: 0.7254480014827205 avg reward (last 100): 21.623762376237625 episode loss:  223.4452
episode: 3109 episode reward: 36.0 epsilon: 0.7253754566825722 avg reward (last 100): 21.722772277227723 episode loss:  199.71472
episode: 3110 episode reward: 14.0 epsilon: 0.7253029191369039 avg reward (last 100): 21.673267326732674 episode loss:  150.73203
episode: 3111 episode reward: 14.0 epsilon: 0.7252303888449902 avg reward (last 100): 21.653465346534652 episode loss:  193.25603
episode: 3112 episode reward: 11.0 epsilon: 0.7251578658061058 avg reward (last 100): 21.45544

episode: 3169 episode reward: 16.0 epsilon: 0.7210360183010446 avg reward (last 100): 21.603960396039604 episode loss:  205.70064
episode: 3170 episode reward: 18.0 epsilon: 0.7209639146992145 avg reward (last 100): 21.247524752475247 episode loss:  202.74619
episode: 3171 episode reward: 22.0 epsilon: 0.7208918183077446 avg reward (last 100): 21.376237623762375 episode loss:  155.6399
episode: 3172 episode reward: 17.0 epsilon: 0.7208197291259139 avg reward (last 100): 21.356435643564357 episode loss:  170.3893
episode: 3173 episode reward: 15.0 epsilon: 0.7207476471530013 avg reward (last 100): 21.396039603960396 episode loss:  257.09647
episode: 3174 episode reward: 36.0 epsilon: 0.7206755723882861 avg reward (last 100): 21.534653465346533 episode loss:  232.99124
episode: 3175 episode reward: 28.0 epsilon: 0.7206035048310472 avg reward (last 100): 21.594059405940595 episode loss:  187.39616
episode: 3176 episode reward: 20.0 epsilon: 0.720531444480564 avg reward (last 100): 21.5742

episode: 3233 episode reward: 27.0 epsilon: 0.7164358938745607 avg reward (last 100): 21.821782178217823 episode loss:  158.62537
episode: 3234 episode reward: 14.0 epsilon: 0.7163642502851733 avg reward (last 100): 21.782178217821784 episode loss:  158.5259
episode: 3235 episode reward: 13.0 epsilon: 0.7162926138601448 avg reward (last 100): 21.801980198019802 episode loss:  139.2495
episode: 3236 episode reward: 13.0 epsilon: 0.7162209845987587 avg reward (last 100): 21.475247524752476 episode loss:  194.45847
episode: 3237 episode reward: 10.0 epsilon: 0.7161493625002988 avg reward (last 100): 21.455445544554454 episode loss:  159.93872
episode: 3238 episode reward: 14.0 epsilon: 0.7160777475640487 avg reward (last 100): 21.475247524752476 episode loss:  188.75696
episode: 3239 episode reward: 13.0 epsilon: 0.7160061397892924 avg reward (last 100): 21.316831683168317 episode loss:  245.69434
episode: 3240 episode reward: 20.0 epsilon: 0.7159345391753135 avg reward (last 100): 21.425

episode: 3297 episode reward: 12.0 epsilon: 0.7118651176972655 avg reward (last 100): 19.782178217821784 episode loss:  278.7558
episode: 3298 episode reward: 42.0 epsilon: 0.7117939311854957 avg reward (last 100): 20.03960396039604 episode loss:  224.80785
episode: 3299 episode reward: 11.0 epsilon: 0.7117227517923772 avg reward (last 100): 20.019801980198018 episode loss:  224.30255
episode: 3300 episode reward: 42.0 epsilon: 0.7116515795171979 avg reward (last 100): 20.019801980198018 episode loss:  226.15353
episode: 3301 episode reward: 27.0 epsilon: 0.7115804143592462 avg reward (last 100): 19.95049504950495 episode loss:  258.076
episode: 3302 episode reward: 49.0 epsilon: 0.7115092563178104 avg reward (last 100): 20.26732673267327 episode loss:  246.38693
episode: 3303 episode reward: 37.0 epsilon: 0.7114381053921786 avg reward (last 100): 20.504950495049506 episode loss:  209.0243
episode: 3304 episode reward: 12.0 epsilon: 0.7113669615816394 avg reward (last 100): 20.54455445

episode: 3362 episode reward: 15.0 epsilon: 0.7072527701805509 avg reward (last 100): 20.22772277227723 episode loss:  169.60611
episode: 3363 episode reward: 25.0 epsilon: 0.7071820449035329 avg reward (last 100): 20.326732673267326 episode loss:  209.89754
episode: 3364 episode reward: 16.0 epsilon: 0.7071113266990425 avg reward (last 100): 20.336633663366335 episode loss:  240.75095
episode: 3365 episode reward: 33.0 epsilon: 0.7070406155663727 avg reward (last 100): 20.534653465346533 episode loss:  176.03645
episode: 3366 episode reward: 14.0 epsilon: 0.706969911504816 avg reward (last 100): 20.534653465346533 episode loss:  176.9419
episode: 3367 episode reward: 26.0 epsilon: 0.7068992145136656 avg reward (last 100): 20.653465346534652 episode loss:  288.15457
episode: 3368 episode reward: 14.0 epsilon: 0.7068285245922142 avg reward (last 100): 20.445544554455445 episode loss:  283.91257
episode: 3369 episode reward: 15.0 epsilon: 0.706757841739755 avg reward (last 100): 20.48514

episode: 3426 episode reward: 28.0 epsilon: 0.7027405812451465 avg reward (last 100): 20.762376237623762 episode loss:  278.83734
episode: 3427 episode reward: 37.0 epsilon: 0.702670307187022 avg reward (last 100): 20.871287128712872 episode loss:  205.41797
episode: 3428 episode reward: 11.0 epsilon: 0.7026000401563033 avg reward (last 100): 20.85148514851485 episode loss:  212.62135
episode: 3429 episode reward: 26.0 epsilon: 0.7025297801522876 avg reward (last 100): 20.980198019801982 episode loss:  197.30426
episode: 3430 episode reward: 18.0 epsilon: 0.7024595271742724 avg reward (last 100): 20.752475247524753 episode loss:  221.80443
episode: 3431 episode reward: 13.0 epsilon: 0.702389281221555 avg reward (last 100): 20.455445544554454 episode loss:  159.34319
episode: 3432 episode reward: 21.0 epsilon: 0.7023190422934328 avg reward (last 100): 20.435643564356436 episode loss:  185.8515
episode: 3433 episode reward: 14.0 epsilon: 0.7022488103892034 avg reward (last 100): 20.41584

episode: 3491 episode reward: 100.0 epsilon: 0.6981873538229553 avg reward (last 100): 22.653465346534652 episode loss:  248.8386
episode: 3492 episode reward: 18.0 epsilon: 0.698117535087573 avg reward (last 100): 22.544554455445546 episode loss:  298.0732
episode: 3493 episode reward: 17.0 epsilon: 0.6980477233340643 avg reward (last 100): 22.594059405940595 episode loss:  312.775
episode: 3494 episode reward: 11.0 epsilon: 0.6979779185617309 avg reward (last 100): 22.524752475247524 episode loss:  209.38759
episode: 3495 episode reward: 22.0 epsilon: 0.6979081207698747 avg reward (last 100): 22.306930693069308 episode loss:  182.7289
episode: 3496 episode reward: 18.0 epsilon: 0.6978383299577977 avg reward (last 100): 22.306930693069308 episode loss:  251.03432
episode: 3497 episode reward: 20.0 epsilon: 0.697768546124802 avg reward (last 100): 22.326732673267326 episode loss:  272.3925
episode: 3498 episode reward: 21.0 epsilon: 0.6976987692701895 avg reward (last 100): 22.43564356

episode: 3555 episode reward: 14.0 epsilon: 0.6937330011705723 avg reward (last 100): 19.415841584158414 episode loss:  261.1309
episode: 3556 episode reward: 36.0 epsilon: 0.6936636278704553 avg reward (last 100): 19.574257425742573 episode loss:  278.60126
episode: 3557 episode reward: 40.0 epsilon: 0.6935942615076682 avg reward (last 100): 19.85148514851485 episode loss:  249.32437
episode: 3558 episode reward: 12.0 epsilon: 0.6935249020815175 avg reward (last 100): 19.673267326732674 episode loss:  283.68646
episode: 3559 episode reward: 13.0 epsilon: 0.6934555495913094 avg reward (last 100): 19.693069306930692 episode loss:  234.04526
episode: 3560 episode reward: 9.0 epsilon: 0.6933862040363503 avg reward (last 100): 19.623762376237625 episode loss:  290.1216
episode: 3561 episode reward: 44.0 epsilon: 0.6933168654159467 avg reward (last 100): 19.861386138613863 episode loss:  271.1868
episode: 3562 episode reward: 15.0 epsilon: 0.6932475337294052 avg reward (last 100): 19.871287

episode: 3619 episode reward: 12.0 epsilon: 0.6893070667607186 avg reward (last 100): 19.712871287128714 episode loss:  307.68176
episode: 3620 episode reward: 19.0 epsilon: 0.6892381360540426 avg reward (last 100): 19.663366336633665 episode loss:  243.41542
episode: 3621 episode reward: 44.0 epsilon: 0.6891692122404373 avg reward (last 100): 19.871287128712872 episode loss:  290.62878
episode: 3622 episode reward: 15.0 epsilon: 0.6891002953192132 avg reward (last 100): 19.900990099009903 episode loss:  265.3363
episode: 3623 episode reward: 12.0 epsilon: 0.6890313852896813 avg reward (last 100): 19.89108910891089 episode loss:  291.955
episode: 3624 episode reward: 20.0 epsilon: 0.6889624821511524 avg reward (last 100): 19.95049504950495 episode loss:  259.3581
episode: 3625 episode reward: 14.0 epsilon: 0.6888935859029373 avg reward (last 100): 19.693069306930692 episode loss:  249.76096
episode: 3626 episode reward: 14.0 epsilon: 0.688824696544347 avg reward (last 100): 19.72277227

episode: 3683 episode reward: 15.0 epsilon: 0.684909369288371 avg reward (last 100): 20.376237623762375 episode loss:  206.18784
episode: 3684 episode reward: 23.0 epsilon: 0.6848408783514421 avg reward (last 100): 20.485148514851485 episode loss:  281.29355
episode: 3685 episode reward: 43.0 epsilon: 0.684772394263607 avg reward (last 100): 20.762376237623762 episode loss:  280.8338
episode: 3686 episode reward: 20.0 epsilon: 0.6847039170241807 avg reward (last 100): 20.495049504950494 episode loss:  223.2449
episode: 3687 episode reward: 9.0 epsilon: 0.6846354466324782 avg reward (last 100): 20.396039603960396 episode loss:  261.0834
episode: 3688 episode reward: 16.0 epsilon: 0.6845669830878149 avg reward (last 100): 20.26732673267327 episode loss:  210.2456
episode: 3689 episode reward: 32.0 epsilon: 0.6844985263895061 avg reward (last 100): 19.92079207920792 episode loss:  259.09802
episode: 3690 episode reward: 32.0 epsilon: 0.6844300765368672 avg reward (last 100): 20.0990099009

episode: 3747 episode reward: 34.0 epsilon: 0.6805397286052116 avg reward (last 100): 20.128712871287128 episode loss:  202.55713
episode: 3748 episode reward: 44.0 epsilon: 0.6804716746323511 avg reward (last 100): 20.326732673267326 episode loss:  307.0426
episode: 3749 episode reward: 27.0 epsilon: 0.6804036274648879 avg reward (last 100): 20.475247524752476 episode loss:  187.02208
episode: 3750 episode reward: 41.0 epsilon: 0.6803355871021415 avg reward (last 100): 20.465346534653467 episode loss:  181.02573
episode: 3751 episode reward: 24.0 epsilon: 0.6802675535434313 avg reward (last 100): 20.603960396039604 episode loss:  211.20473
episode: 3752 episode reward: 11.0 epsilon: 0.6801995267880769 avg reward (last 100): 20.633663366336634 episode loss:  208.18642
episode: 3753 episode reward: 18.0 epsilon: 0.6801315068353981 avg reward (last 100): 20.613861386138613 episode loss:  279.6638
episode: 3754 episode reward: 19.0 epsilon: 0.6800634936847146 avg reward (last 100): 20.594

episode: 3811 episode reward: 34.0 epsilon: 0.6761979657122487 avg reward (last 100): 19.346534653465348 episode loss:  289.35355
episode: 3812 episode reward: 18.0 epsilon: 0.6761303459156774 avg reward (last 100): 19.326732673267326 episode loss:  289.3897
episode: 3813 episode reward: 21.0 epsilon: 0.6760627328810859 avg reward (last 100): 19.376237623762375 episode loss:  234.56743
episode: 3814 episode reward: 10.0 epsilon: 0.6759951266077978 avg reward (last 100): 19.138613861386137 episode loss:  286.82938
episode: 3815 episode reward: 15.0 epsilon: 0.675927527095137 avg reward (last 100): 19.06930693069307 episode loss:  185.06659
episode: 3816 episode reward: 12.0 epsilon: 0.6758599343424275 avg reward (last 100): 18.99009900990099 episode loss:  277.09848
episode: 3817 episode reward: 12.0 epsilon: 0.6757923483489933 avg reward (last 100): 18.84158415841584 episode loss:  204.2486
episode: 3818 episode reward: 29.0 epsilon: 0.6757247691141584 avg reward (last 100): 18.8613861

episode: 3876 episode reward: 24.0 epsilon: 0.6718167143622049 avg reward (last 100): 18.386138613861387 episode loss:  186.99506
episode: 3877 episode reward: 41.0 epsilon: 0.6717495326907686 avg reward (last 100): 18.663366336633665 episode loss:  187.1708
episode: 3878 episode reward: 39.0 epsilon: 0.6716823577374996 avg reward (last 100): 18.95049504950495 episode loss:  196.96443
episode: 3879 episode reward: 40.0 epsilon: 0.6716151895017258 avg reward (last 100): 19.198019801980198 episode loss:  202.15018
episode: 3880 episode reward: 18.0 epsilon: 0.6715480279827757 avg reward (last 100): 19.198019801980198 episode loss:  281.204
episode: 3881 episode reward: 10.0 epsilon: 0.6714808731799774 avg reward (last 100): 19.019801980198018 episode loss:  229.82089
episode: 3882 episode reward: 14.0 epsilon: 0.6714137250926594 avg reward (last 100): 19.019801980198018 episode loss:  164.10178
episode: 3883 episode reward: 11.0 epsilon: 0.6713465837201501 avg reward (last 100): 18.98019

episode: 3941 episode reward: 39.0 epsilon: 0.6674638502069845 avg reward (last 100): 21.871287128712872 episode loss:  202.65063
episode: 3942 episode reward: 14.0 epsilon: 0.6673971038219638 avg reward (last 100): 21.900990099009903 episode loss:  206.0629
episode: 3943 episode reward: 64.0 epsilon: 0.6673303641115816 avg reward (last 100): 22.415841584158414 episode loss:  169.28583
episode: 3944 episode reward: 9.0 epsilon: 0.6672636310751705 avg reward (last 100): 22.376237623762375 episode loss:  195.63475
episode: 3945 episode reward: 61.0 epsilon: 0.6671969047120629 avg reward (last 100): 22.782178217821784 episode loss:  207.17606
episode: 3946 episode reward: 12.0 epsilon: 0.6671301850215917 avg reward (last 100): 22.801980198019802 episode loss:  254.46315
episode: 3947 episode reward: 10.0 epsilon: 0.6670634720030896 avg reward (last 100): 22.702970297029704 episode loss:  202.49747
episode: 3948 episode reward: 22.0 epsilon: 0.6669967656558893 avg reward (last 100): 22.792

episode: 4005 episode reward: 19.0 epsilon: 0.6632055098700252 avg reward (last 100): 20.455445544554454 episode loss:  157.31987
episode: 4006 episode reward: 18.0 epsilon: 0.6631391893190383 avg reward (last 100): 20.306930693069308 episode loss:  152.81831
episode: 4007 episode reward: 11.0 epsilon: 0.6630728754001064 avg reward (last 100): 20.277227722772277 episode loss:  122.498604
episode: 4008 episode reward: 10.0 epsilon: 0.6630065681125664 avg reward (last 100): 20.207920792079207 episode loss:  111.40226
episode: 4009 episode reward: 11.0 epsilon: 0.6629402674557552 avg reward (last 100): 20.217821782178216 episode loss:  134.76854
episode: 4010 episode reward: 14.0 epsilon: 0.6628739734290097 avg reward (last 100): 19.861386138613863 episode loss:  239.33269
episode: 4011 episode reward: 18.0 epsilon: 0.6628076860316667 avg reward (last 100): 19.93069306930693 episode loss:  184.93011
episode: 4012 episode reward: 10.0 epsilon: 0.6627414052630636 avg reward (last 100): 19.8

episode: 4069 episode reward: 20.0 epsilon: 0.6589743372402299 avg reward (last 100): 19.306930693069308 episode loss:  156.86906
episode: 4070 episode reward: 19.0 epsilon: 0.6589084398065059 avg reward (last 100): 19.376237623762375 episode loss:  175.85103
episode: 4071 episode reward: 28.0 epsilon: 0.6588425489625253 avg reward (last 100): 19.386138613861387 episode loss:  206.4556
episode: 4072 episode reward: 15.0 epsilon: 0.658776664707629 avg reward (last 100): 19.435643564356436 episode loss:  136.19978
episode: 4073 episode reward: 9.0 epsilon: 0.6587107870411583 avg reward (last 100): 19.396039603960396 episode loss:  179.41034
episode: 4074 episode reward: 10.0 epsilon: 0.6586449159624541 avg reward (last 100): 19.386138613861387 episode loss:  154.12706
episode: 4075 episode reward: 20.0 epsilon: 0.6585790514708578 avg reward (last 100): 19.465346534653467 episode loss:  187.06812
episode: 4076 episode reward: 11.0 epsilon: 0.6585131935657107 avg reward (last 100): 19.4554

episode: 4133 episode reward: 11.0 epsilon: 0.6547701589908439 avg reward (last 100): 21.663366336633665 episode loss:  161.75551
episode: 4134 episode reward: 19.0 epsilon: 0.6547046819749448 avg reward (last 100): 21.554455445544555 episode loss:  171.80937
episode: 4135 episode reward: 35.0 epsilon: 0.6546392115067473 avg reward (last 100): 21.613861386138613 episode loss:  193.76657
episode: 4136 episode reward: 42.0 epsilon: 0.6545737475855966 avg reward (last 100): 21.940594059405942 episode loss:  182.63358
episode: 4137 episode reward: 30.0 epsilon: 0.654508290210838 avg reward (last 100): 21.97029702970297 episode loss:  237.0206
episode: 4138 episode reward: 21.0 epsilon: 0.6544428393818169 avg reward (last 100): 21.99009900990099 episode loss:  244.29253
episode: 4139 episode reward: 15.0 epsilon: 0.6543773950978787 avg reward (last 100): 21.92079207920792 episode loss:  181.94922
episode: 4140 episode reward: 13.0 epsilon: 0.654311957358369 avg reward (last 100): 21.8811881

episode: 4197 episode reward: 11.0 epsilon: 0.6505928029009167 avg reward (last 100): 20.03960396039604 episode loss:  215.81738
episode: 4198 episode reward: 46.0 epsilon: 0.6505277436206266 avg reward (last 100): 20.366336633663366 episode loss:  203.18831
episode: 4199 episode reward: 12.0 epsilon: 0.6504626908462645 avg reward (last 100): 20.217821782178216 episode loss:  116.22483
episode: 4200 episode reward: 14.0 epsilon: 0.6503976445771799 avg reward (last 100): 20.18811881188119 episode loss:  225.89941
episode: 4201 episode reward: 9.0 epsilon: 0.6503326048127221 avg reward (last 100): 20.11881188118812 episode loss:  158.01349
episode: 4202 episode reward: 35.0 epsilon: 0.6502675715522409 avg reward (last 100): 20.11881188118812 episode loss:  197.81487
episode: 4203 episode reward: 36.0 epsilon: 0.6502025447950857 avg reward (last 100): 20.326732673267326 episode loss:  194.82217
episode: 4204 episode reward: 13.0 epsilon: 0.6501375245406061 avg reward (last 100): 20.168316

episode: 4262 episode reward: 18.0 epsilon: 0.646377453638462 avg reward (last 100): 19.633663366336634 episode loss:  167.71323
episode: 4263 episode reward: 23.0 epsilon: 0.6463128158930982 avg reward (last 100): 19.742574257425744 episode loss:  235.49251
episode: 4264 episode reward: 17.0 epsilon: 0.6462481846115089 avg reward (last 100): 19.455445544554454 episode loss:  118.83905
episode: 4265 episode reward: 24.0 epsilon: 0.6461835597930478 avg reward (last 100): 18.980198019801982 episode loss:  223.95752
episode: 4266 episode reward: 13.0 epsilon: 0.6461189414370685 avg reward (last 100): 18.99009900990099 episode loss:  248.21188
episode: 4267 episode reward: 17.0 epsilon: 0.6460543295429247 avg reward (last 100): 18.91089108910891 episode loss:  219.80959
episode: 4268 episode reward: 11.0 epsilon: 0.6459897241099705 avg reward (last 100): 18.831683168316832 episode loss:  221.2705
episode: 4269 episode reward: 28.0 epsilon: 0.6459251251375595 avg reward (last 100): 18.97029

episode: 4327 episode reward: 14.0 epsilon: 0.6421894166507901 avg reward (last 100): 18.504950495049506 episode loss:  181.52087
episode: 4328 episode reward: 19.0 epsilon: 0.642125197709125 avg reward (last 100): 18.554455445544555 episode loss:  222.0552
episode: 4329 episode reward: 9.0 epsilon: 0.6420609851893541 avg reward (last 100): 18.15841584158416 episode loss:  178.27325
episode: 4330 episode reward: 13.0 epsilon: 0.6419967790908351 avg reward (last 100): 18.15841584158416 episode loss:  186.02026
episode: 4331 episode reward: 16.0 epsilon: 0.641932579412926 avg reward (last 100): 18.128712871287128 episode loss:  194.99414
episode: 4332 episode reward: 20.0 epsilon: 0.6418683861549848 avg reward (last 100): 18.217821782178216 episode loss:  252.66824
episode: 4333 episode reward: 33.0 epsilon: 0.6418041993163692 avg reward (last 100): 18.415841584158414 episode loss:  251.95892
episode: 4334 episode reward: 34.0 epsilon: 0.6417400188964376 avg reward (last 100): 18.5643564

episode: 4392 episode reward: 30.0 epsilon: 0.6380285149750188 avg reward (last 100): 18.821782178217823 episode loss:  196.45226
episode: 4393 episode reward: 19.0 epsilon: 0.6379647121235212 avg reward (last 100): 18.81188118811881 episode loss:  229.77673
episode: 4394 episode reward: 15.0 epsilon: 0.6379009156523089 avg reward (last 100): 18.73267326732673 episode loss:  150.35487
episode: 4395 episode reward: 15.0 epsilon: 0.6378371255607437 avg reward (last 100): 18.73267326732673 episode loss:  196.2145
episode: 4396 episode reward: 19.0 epsilon: 0.6377733418481876 avg reward (last 100): 18.762376237623762 episode loss:  201.24525
episode: 4397 episode reward: 27.0 epsilon: 0.6377095645140027 avg reward (last 100): 18.93069306930693 episode loss:  225.44942
episode: 4398 episode reward: 12.0 epsilon: 0.6376457935575514 avg reward (last 100): 18.861386138613863 episode loss:  200.771
episode: 4399 episode reward: 34.0 epsilon: 0.6375820289781956 avg reward (last 100): 18.74257425

episode: 4457 episode reward: 22.0 epsilon: 0.6338945727948517 avg reward (last 100): 20.792079207920793 episode loss:  201.95366
episode: 4458 episode reward: 35.0 epsilon: 0.6338311833375723 avg reward (last 100): 20.96039603960396 episode loss:  261.587
episode: 4459 episode reward: 13.0 epsilon: 0.6337678002192385 avg reward (last 100): 20.762376237623762 episode loss:  215.19019
episode: 4460 episode reward: 33.0 epsilon: 0.6337044234392166 avg reward (last 100): 20.821782178217823 episode loss:  245.83461
episode: 4461 episode reward: 12.0 epsilon: 0.6336410529968727 avg reward (last 100): 20.683168316831683 episode loss:  280.62375
episode: 4462 episode reward: 10.0 epsilon: 0.633577688891573 avg reward (last 100): 20.594059405940595 episode loss:  325.59845
episode: 4463 episode reward: 44.0 epsilon: 0.6335143311226838 avg reward (last 100): 20.77227722772277 episode loss:  298.77563
episode: 4464 episode reward: 10.0 epsilon: 0.6334509796895715 avg reward (last 100): 20.732673

episode: 4522 episode reward: 19.0 epsilon: 0.6297874154331495 avg reward (last 100): 20.02970297029703 episode loss:  234.72081
episode: 4523 episode reward: 13.0 epsilon: 0.6297244366916063 avg reward (last 100): 20.04950495049505 episode loss:  177.76898
episode: 4524 episode reward: 33.0 epsilon: 0.6296614642479371 avg reward (last 100): 20.257425742574256 episode loss:  257.07602
episode: 4525 episode reward: 37.0 epsilon: 0.6295984981015124 avg reward (last 100): 20.495049504950494 episode loss:  210.96785
episode: 4526 episode reward: 24.0 epsilon: 0.6295355382517022 avg reward (last 100): 20.653465346534652 episode loss:  263.1234
episode: 4527 episode reward: 13.0 epsilon: 0.629472584697877 avg reward (last 100): 20.613861386138613 episode loss:  202.96384
episode: 4528 episode reward: 19.0 epsilon: 0.6294096374394073 avg reward (last 100): 20.663366336633665 episode loss:  237.48468
episode: 4529 episode reward: 13.0 epsilon: 0.6293466964756633 avg reward (last 100): 20.41584

episode: 4586 episode reward: 27.0 epsilon: 0.625769446289177 avg reward (last 100): 20.138613861386137 episode loss:  256.4745
episode: 4587 episode reward: 22.0 epsilon: 0.6257068693445481 avg reward (last 100): 20.198019801980198 episode loss:  258.80722
episode: 4588 episode reward: 10.0 epsilon: 0.6256442986576136 avg reward (last 100): 20.198019801980198 episode loss:  234.1713
episode: 4589 episode reward: 16.0 epsilon: 0.6255817342277479 avg reward (last 100): 20.18811881188119 episode loss:  191.2713
episode: 4590 episode reward: 11.0 epsilon: 0.625519176054325 avg reward (last 100): 20.059405940594058 episode loss:  189.52534
episode: 4591 episode reward: 41.0 epsilon: 0.6254566241367197 avg reward (last 100): 20.346534653465348 episode loss:  210.00377
episode: 4592 episode reward: 12.0 epsilon: 0.625394078474306 avg reward (last 100): 20.06930693069307 episode loss:  164.83653
episode: 4593 episode reward: 32.0 epsilon: 0.6253315390664586 avg reward (last 100): 20.168316831