In [1]:
from cyberbattle.simulation.model import *
import logging, sys, gym
import cyberbattle.agents.baseline.learner as learner
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_dql as dqla
from cyberbattle.agents.baseline.agent_wrapper import ActionTrackingStateAugmentation, AgentWrapper, Verbosity
# Route log records to stdout so they appear in the cell output; only records
# at ERROR level or above are emitted, each prefixed with its level name.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.ERROR,
    stream=sys.stdout,
)


In [2]:
# Gym environment ids to train on, in order: ActiveDirectory-v0 .. ActiveDirectory-v2.
gymids = ["ActiveDirectory-v" + str(version) for version in range(3)]

# Passed as `iteration_count` to the epsilon-greedy search in the training cell.
iteration_count = 50

# Passed as `episode_count` to the epsilon-greedy search in the training cell.
training_episode_count = 20

In [3]:
# Instantiate one gym environment per id, preserving the order of `gymids`.
envs = list(map(gym.make, gymids))

# Environment bounds shared by the policy and the training loop below.
# The identifiers are read from the first environment only.
# NOTE(review): presumably every ActiveDirectory variant exposes compatible
# identifiers, so the bounds derived from envs[0] apply to all — confirm.
ep = w.EnvironmentBounds.of_identifiers(
    identifiers=envs[0].identifiers,
    maximum_node_count=30,
    maximum_total_credentials=50_000,
)


In [4]:
# Evaluate the Deep Q-learning agent on each environment using transfer
# learning: a single policy is created once, trained on the first environment,
# and the learner returned by each run is fed into the run on the next one.
l = dqla.DeepQLearnerPolicy(
    ep=ep,
    gamma=0.015,
    # Original note: "torch default learning rate is 1e-2".
    # NOTE(review): the default depends on which torch optimizer the policy uses — confirm.
    learning_rate=0.01,
    batch_size=512,
    target_update=5,
    replay_memory_size=10_000,
)

for env in envs:
    dqn_learning_run = learner.epsilon_greedy_search(
        # What to train, and on which environment.
        cyberbattle_gym_env=env,
        environment_properties=ep,
        learner=l,
        title="DQL",
        # Run length, taken from the configuration cell.
        episode_count=training_episode_count,
        iteration_count=iteration_count,
        # Epsilon-greedy exploration schedule.
        epsilon=0.9,
        epsilon_minimum=0.1,
        epsilon_exponential_decay=50_000,
        # Reporting options.
        verbosity=Verbosity.Quiet,
        plot_episodes_length=False,
        render=False,
    )
    # Carry the learner returned by this run over to the next environment.
    l = dqn_learning_run["learner"]


###### DQL
Learning with: episode_count=20,iteration_count=50,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/20 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   12.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 21|reward:   18.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 21|reward:   18.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 1|Iteration 49|reward:   18.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 1|Iteration 50|reward:   18.0|last_reward_at:   21|Elapsed Time: 0:00:00||


  Episode 1 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 3/14 (0.18)
    explore-remote: 0/33 (0.00)
    explore-connect: 0/0 (NaN)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 3
  ## Episode: 2/20 'DQL' ϵ=0.8992, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   27.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 10|reward:   32.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 10|reward:   32.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 2|Iteration 15|reward:   33.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 2|Iteration 15|reward:   33.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 2|Iteration 19|reward:   39.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 2|I

  Episode 2 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/5 (0.62)
    explore-remote: 0/19 (0.00)
    explore-connect: 1/16 (0.06)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 3/20 'DQL' ϵ=0.8984, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   12.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 16|reward:   18.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 16|reward:   18.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 3|Iteration 18|reward:   33.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 3|Iteration 18|reward:   33.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 3|Iteration 19|reward:   38.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 3|Iteration 19|reward:   38.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 3|

  Episode 3 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/4 (0.64)
    explore-remote: 0/22 (0.00)
    explore-connect: 1/13 (0.07)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 3
  ## Episode: 4/20 'DQL' ϵ=0.8976, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   12.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 35|reward:   12.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 39|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 39|reward:   27.0|last_reward_at:   39|Elapsed Time: 0:00:00||
Episode 4|Iteration 44|reward:   32.0|last_reward_at:   39|Elapsed Time: 0:00:00||
Episode 4|Iteration 44|reward:   32.0|last_reward_at:   44|Elapsed Time: 0:00:00||
Episode 4|Iteration 47|reward:   33.0|last_reward_at:   44|Elapsed Time: 0:00:00||
Episode 4|

  Episode 4 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/15 (0.21)
    explore-remote: 0/25 (0.00)
    explore-connect: 0/4 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/0 (1.00)
  exploit deflected to exploration: 2
  ## Episode: 5/20 'DQL' ϵ=0.8968, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 24|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 40|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 40|reward:   18.0|last_reward_at:   40|Elapsed Time: 0:00:00||
Episode 5|Iteration 43|reward:   33.0|last_reward_at:   40|Elapsed Time: 0:00:00||
Episode 5|Iteration 43|reward:   33.0|last_reward_at:   43|Elapsed Time: 0:00:00||
Episode 5|Iteration 47|reward:   38.0|last_reward_at:   43|Elapsed Time: 0:00:00||
Episode 5|

  Episode 5 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/15 (0.21)
    explore-remote: 0/26 (0.00)
    explore-connect: 1/4 (0.20)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 2
  ## Episode: 6/20 'DQL' ϵ=0.8960, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   26.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   26.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   32.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   32.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 11|reward:   38.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 11|reward:   38.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 6|Iteration 17|reward:   39.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 6|Ite

  Episode 6 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/3 (0.75)
    explore-remote: 0/21 (0.00)
    explore-connect: 0/14 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 7/20 'DQL' ϵ=0.8952, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 8|reward:   26.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 8|reward:   26.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   27.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 7|Iteration 11|reward:   33.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 7|Iteration 11|reward:   33.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 7|Iteration 20|reward:   39.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 7|Ite

  Episode 7 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/9 (0.50)
    explore-remote: 0/14 (0.00)
    explore-connect: 0/14 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 1
  ## Episode: 8/20 'DQL' ϵ=0.8944, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   12.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 17|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 17|reward:   18.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 8|Iteration 26|reward:   33.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 8|Iteration 26|reward:   33.0|last_reward_at:   26|Elapsed Time: 0:00:00||
Episode 8|Iteration 38|reward:   38.0|last_reward_at:   26|Elapsed Time: 0:00:00||
Episode 8|Iteration 38|reward:   38.0|last_reward_at:   38|Elapsed Time: 0:00:00||
Episode 8|

  Episode 8 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/12 (0.29)
    explore-remote: 0/25 (0.00)
    explore-connect: 0/6 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/0 (1.00)
  exploit deflected to exploration: 3
  ## Episode: 9/20 'DQL' ϵ=0.8936, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 3|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 3|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   21.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 7|reward:   26.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 7|reward:   26.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 9|Iteration 8|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 9|Iteration 8|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 9|Iteration 12|reward:   28.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 9|Iteration 12|reward:   28.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 9|Iter

  Episode 9 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/8 (0.47)
    explore-remote: 0/16 (0.00)
    explore-connect: 0/14 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 3
  ## Episode: 10/20 'DQL' ϵ=0.8928, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 2|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 2|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   27.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   27.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   28.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   28.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 10|Iteration 15|reward:   33.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 10|Iteration 15|reward:   33.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 10|Iteration 17|reward:   39.0|last_reward_at:   15|Elapsed Time: 0:00:00||

  Episode 10 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/7 (0.53)
    explore-remote: 0/11 (0.00)
    explore-connect: 0/19 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 2
  ## Episode: 11/20 'DQL' ϵ=0.8921, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 11|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   15.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 9|reward:   15.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 11|reward:   15.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 13|reward:   15.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 13|reward:   21.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 13|reward:   21.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 11|Iteration 16|reward:   21.0|last_reward_at:   13|Elapsed Time: 0:00:00||

  Episode 11 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/4 (0.56)
    explore-remote: 0/22 (0.00)
    explore-connect: 0/15 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 12/20 'DQL' ϵ=0.8913, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 12|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 3|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 3|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 12|Iteration 4|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 12|Iteration 4|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 7|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 9|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 11|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 13|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 14|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 14|reward:   27.0|last_reward_at:   14|Elapsed Time: 0:00:00||


  Episode 12 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/10 (0.50)
    explore-remote: 0/11 (0.00)
    explore-connect: 0/13 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 13/20 'DQL' ϵ=0.8905, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 13|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 5|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 5|reward:   15.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 13|Iteration 7|reward:   21.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 13|Iteration 7|reward:   21.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 13|Iteration 10|reward:   21.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 13|Iteration 10|reward:   22.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 13|Iteration 10|reward:   22.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 13|Iteration 11|reward:   31.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 13|Iteration 11|reward:   31.0|last_reward_at:   11|Elapsed Time: 0:00:00||

  Episode 13 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/5 (0.55)
    explore-remote: 0/14 (0.00)
    explore-connect: 2/15 (0.12)
    exploit-local: 4/1 (0.80)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 14/20 'DQL' ϵ=0.8897, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 14|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 6|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 6|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 6|reward:    6.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 14|Iteration 9|reward:    6.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 14|Iteration 9|reward:   12.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 14|Iteration 9|reward:   12.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 14|Iteration 12|reward:   12.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 14|Iteration 14|reward:   12.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 14|Iteration 15|reward:   18.0|last_reward_at:    9|Elapsed Time: 0:00:00||
E

  Episode 14 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/11 (0.27)
    explore-remote: 0/20 (0.00)
    explore-connect: 0/11 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 15/20 'DQL' ϵ=0.8889, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 15|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 2|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 2|reward:   15.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 15|Iteration 5|reward:   15.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   15.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   20.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   20.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 10|reward:   20.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 12|reward:   20.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 12|reward:   21.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 12|reward:   21.0|last_reward_at:   12|Elapsed Time: 0:00:00||


  Episode 15 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/1 (0.86)
    explore-remote: 0/23 (0.00)
    explore-connect: 0/15 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 16/20 'DQL' ϵ=0.8881, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 16|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 16|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 16|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 16|Iteration 3|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 16|Iteration 3|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 16|Iteration 3|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 16|Iteration 4|reward:   18.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 16|Iteration 4|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 16|Iteration 5|reward:   33.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 16|Iteration 5|reward:   33.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 16|Iteration 8|reward:   33.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 16|Iteration 10|reward:   33.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Epi

  Episode 16 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/15 (0.35)
    explore-remote: 0/12 (0.00)
    explore-connect: 2/11 (0.15)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 17/20 'DQL' ϵ=0.8873, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 17|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 2|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 2|reward:   15.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 17|Iteration 3|reward:   16.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 17|Iteration 3|reward:   16.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 17|Iteration 6|reward:   16.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 17|Iteration 7|reward:   17.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 17|Iteration 7|reward:   17.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 17|Iteration 10|reward:   17.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 17|Iteration 10|reward:   18.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 17|Iteration 10|reward:   18.0|last_reward_at:   10|Elapsed Time: 0:00:00||
E

  Episode 17 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/5 (0.62)
    explore-remote: 0/14 (0.00)
    explore-connect: 0/12 (0.00)
    exploit-local: 5/1 (0.83)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 18/20 'DQL' ϵ=0.8865, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 18|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 18|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 18|Iteration 3|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 18|Iteration 3|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 18|Iteration 3|reward:   15.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 18|Iteration 6|reward:   15.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 18|Iteration 8|reward:   15.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 18|Iteration 10|reward:   15.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 18|Iteration 10|reward:   21.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 18|Iteration 10|reward:   21.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 18|Iteration 13|reward:   21.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 18|Iteration 13|reward:   26.0|last_reward_at:   10|Elapsed Time: 0:00:00||

  Episode 18 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/3 (0.67)
    explore-remote: 0/20 (0.00)
    explore-connect: 1/17 (0.06)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 19/20 'DQL' ϵ=0.8857, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 19|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 4|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 4|reward:   15.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 19|Iteration 6|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 19|Iteration 6|reward:   21.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 19|Iteration 9|reward:   21.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 19|Iteration 10|reward:   26.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 19|Iteration 10|reward:   26.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 19|Iteration 11|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 19|Iteration 11|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||


  Episode 19 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/5 (0.62)
    explore-remote: 0/21 (0.00)
    explore-connect: 0/11 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 20/20 'DQL' ϵ=0.8850, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 20|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 20|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 20|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 20|Iteration 5|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 20|Iteration 5|reward:    6.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 20|Iteration 8|reward:    6.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 20|Iteration 9|reward:    6.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 20|Iteration 10|reward:   21.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 20|Iteration 10|reward:   21.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 20|Iteration 11|reward:   22.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 20|Iteration 11|reward:   22.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 20|Iteration 12|reward:   23.0|last_reward_at:   11|Elapsed Time: 0:00:00||

  Episode 20 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/3 (0.75)
    explore-remote: 0/19 (0.00)
    explore-connect: 1/13 (0.07)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
simulation ended
###### DQL
Learning with: episode_count=20,iteration_count=50,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/20 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   24.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   24.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 11|reward:   30.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 11|reward:   30.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 1|Iteration 14|reward:   30.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 1|Iteration 16|reward:   30.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 1|Iteration 18|reward:   30.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 1|I

  Episode 1 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/18 (0.18)
    explore-remote: 0/12 (0.00)
    explore-connect: 0/16 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 0
  ## Episode: 2/20 'DQL' ϵ=0.8992, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   24.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   29.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   29.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 10|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 12|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 12|reward:   36.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Ite

  Episode 2 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/2 (0.82)
    explore-remote: 0/18 (0.00)
    explore-connect: 0/16 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 3/20 'DQL' ϵ=0.8984, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   30.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   35.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   35.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|Iteration 11|reward:   36.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|It

  Episode 3 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/3 (0.77)
    explore-remote: 0/19 (0.00)
    explore-connect: 2/13 (0.13)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 4/20 'DQL' ϵ=0.8976, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   24.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   29.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   29.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 11|reward:   29.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 13|reward:   29.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 15|reward:   29.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 17|reward:   29.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|It

  Episode 4 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/7 (0.42)
    explore-remote: 0/16 (0.00)
    explore-connect: 2/19 (0.10)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 5/20 'DQL' ϵ=0.8968, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   30.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   35.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   35.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   35.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   35.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   41.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   41.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 5|Ite

  Episode 5 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/6 (0.62)
    explore-remote: 0/14 (0.00)
    explore-connect: 1/14 (0.07)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 6/20 'DQL' ϵ=0.8960, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 5|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 5|reward:   12.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   12.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 11|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 11|reward:   41.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iter

  Episode 6 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/4 (0.67)
    explore-remote: 0/18 (0.00)
    explore-connect: 2/12 (0.14)
    exploit-local: 3/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 7/20 'DQL' ϵ=0.8952, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:   24.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 7|Iteration 5|reward:   29.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 7|Iteration 5|reward:   29.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   35.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   35.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 8|reward:   36.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 8|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   42.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   42.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|Iter

  Episode 7 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/9 (0.36)
    explore-remote: 0/17 (0.00)
    explore-connect: 0/13 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 8/20 'DQL' ϵ=0.8944, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   36.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 11|reward:   37.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 11|reward:   37.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   43.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 8|It

  Episode 8 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/7 (0.50)
    explore-remote: 0/19 (0.00)
    explore-connect: 1/9 (0.10)
    exploit-local: 1/1 (0.50)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 9/20 'DQL' ϵ=0.8936, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 7|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 9|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 12|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 12|reward:   18.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 9|Iteration 14|reward:   18.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 9|It

  Episode 9 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/5 (0.64)
    explore-remote: 0/19 (0.00)
    explore-connect: 0/12 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/1 (0.80)
  exploit deflected to exploration: 0
  ## Episode: 10/20 'DQL' ϵ=0.8928, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:   24.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 3|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:   25.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:   25.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   25.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   31.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   31.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   32.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   32.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 10|Iteration 11|reward:   33.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Epi

  Episode 10 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/5 (0.64)
    explore-remote: 0/13 (0.00)
    explore-connect: 0/18 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 11/20 'DQL' ϵ=0.8921, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 11|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 3|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 3|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 11|Iteration 4|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 11|Iteration 4|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   18.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 9|reward:   18.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 10|reward:   42.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 10|reward:   42.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 11|Iteration 12|reward:   43.0|last_reward_at:   10|Elapsed Time: 0:00:00||
E

  Episode 11 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/11 (0.45)
    explore-remote: 0/14 (0.00)
    explore-connect: 0/10 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 12/20 'DQL' ϵ=0.8913, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 12|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 6|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 7|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 7|reward:    6.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 12|Iteration 9|reward:   12.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 12|Iteration 9|reward:   12.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 12|Iteration 12|reward:   12.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 12|Iteration 14|reward:   12.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 12|Iteration 16|reward:   12.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 12|Iteration 18|reward:   12.0|last_reward_at:    9|Elapsed Time: 0:00:00||


  Episode 12 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 3/10 (0.23)
    explore-remote: 0/25 (0.00)
    explore-connect: 0/7 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 13/20 'DQL' ϵ=0.8905, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 13|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 2|reward:   24.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 13|Iteration 4|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 13|Iteration 7|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 13|Iteration 8|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 13|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 13|Iteration 10|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 13|Iteration 12|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 13|Iteration 14|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 13|Iteration 16|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||


  Episode 13 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/9 (0.40)
    explore-remote: 0/17 (0.00)
    explore-connect: 3/13 (0.19)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 14/20 'DQL' ϵ=0.8897, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 14|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:   24.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 3|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 5|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 6|reward:   30.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 14|Iteration 8|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 14|Iteration 8|reward:   31.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 14|Iteration 10|reward:   32.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 14|Iteration 10|reward:   32.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 14|Iteration 12|reward:   38.0|last_reward_at:   10|Elapsed Time: 0:00:00||
E

  Episode 14 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/3 (0.73)
    explore-remote: 0/19 (0.00)
    explore-connect: 0/13 (0.00)
    exploit-local: 2/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 15/20 'DQL' ϵ=0.8889, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 15|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:   24.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 4|reward:   25.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 4|reward:   25.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   25.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   26.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   26.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 8|reward:   32.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 8|reward:   32.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 15|Iteration 10|reward:   37.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Epi

  Episode 15 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/1 (0.89)
    explore-remote: 0/13 (0.00)
    explore-connect: 0/17 (0.00)
    exploit-local: 1/1 (0.50)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 7/2 (0.78)
  exploit deflected to exploration: 0
  ## Episode: 16/20 'DQL' ϵ=0.8881, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 16|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 16|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 16|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 16|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 16|Iteration 3|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 16|Iteration 3|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 16|Iteration 5|reward:   36.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 16|Iteration 5|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 16|Iteration 6|reward:   42.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 16|Iteration 6|reward:   42.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 16|Iteration 9|reward:   42.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 16|Iteration 9|reward:   47.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Epis

  Episode 16 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/6 (0.54)
    explore-remote: 0/16 (0.00)
    explore-connect: 2/15 (0.12)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 17/20 'DQL' ϵ=0.8873, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 17|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 17|Iteration 7|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 17|Iteration 8|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 17|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 17|Iteration 11|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 17|Iteration 13|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 17|Iteration 14|reward:   35.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 17|Iteration 14|reward:   35.0|last_reward_at:   14|Elapsed Time: 0:00:00||


  Episode 17 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/3 (0.57)
    explore-remote: 0/25 (0.00)
    explore-connect: 0/13 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 18/20 'DQL' ϵ=0.8865, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 18|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 18|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 18|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 18|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 5|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 7|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 7|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 7|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 18|Iteration 10|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 18|Iteration 12|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 18|Iteration 12|reward:   35.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 18|Iteration 12|reward:   35.0|last_reward_at:   12|Elapsed Time: 0:00:00||


  Episode 18 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/7 (0.46)
    explore-remote: 0/19 (0.00)
    explore-connect: 0/14 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 19/20 'DQL' ϵ=0.8857, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 19|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 1|reward:   24.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 1|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 19|Iteration 2|reward:   29.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 19|Iteration 2|reward:   29.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 19|Iteration 3|reward:   35.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 19|Iteration 3|reward:   35.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 19|Iteration 5|reward:   41.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 19|Iteration 5|reward:   41.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 19|Iteration 7|reward:   41.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 19|Iteration 7|reward:   47.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 19|Iteration 7|reward:   47.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Epis

  Episode 19 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/3 (0.77)
    explore-remote: 0/16 (0.00)
    explore-connect: 1/12 (0.08)
    exploit-local: 3/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 20/20 'DQL' ϵ=0.8850, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 20|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 20|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 20|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 20|Iteration 2|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 20|Iteration 2|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 20|Iteration 4|reward:   36.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 20|Iteration 4|reward:   36.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 20|Iteration 5|reward:   37.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 20|Iteration 5|reward:   37.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 20|Iteration 7|reward:   38.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 20|Iteration 7|reward:   38.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 20|Iteration 10|reward:   38.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Epi

  Episode 20 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/7 (0.42)
    explore-remote: 0/19 (0.00)
    explore-connect: 1/12 (0.08)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/0 (1.00)
  exploit deflected to exploration: 0
simulation ended
###### DQL
Learning with: episode_count=20,iteration_count=50,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/20 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 11|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 11|reward:   31.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 11|reward:   31.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 1|Ite

  Episode 1 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/7 (0.56)
    explore-remote: 0/16 (0.00)
    explore-connect: 1/11 (0.08)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/2 (0.60)
  exploit deflected to exploration: 0
  ## Episode: 2/20 'DQL' ϵ=0.8992, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 6|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   23.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   23.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   24.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   24.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 2|Iteration 13|reward:   25.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 2|Iteration 13|reward:   25.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 2|It

  Episode 2 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/7 (0.50)
    explore-remote: 0/15 (0.00)
    explore-connect: 1/12 (0.08)
    exploit-local: 3/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/2 (0.60)
  exploit deflected to exploration: 0
  ## Episode: 3/20 'DQL' ϵ=0.8984, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   23.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   23.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   29.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   29.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   29.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   29.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 11|reward:   29.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 12|reward:   29.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iter

  Episode 3 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/6 (0.45)
    explore-remote: 0/14 (0.00)
    explore-connect: 1/22 (0.04)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 4/20 'DQL' ϵ=0.8976, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   19.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   19.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   25.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   25.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 9|reward:   25.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 11|reward:   25.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iter

  Episode 4 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/8 (0.47)
    explore-remote: 0/18 (0.00)
    explore-connect: 1/9 (0.10)
    exploit-local: 4/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 5/20 'DQL' ϵ=0.8968, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 7|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   25.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   25.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   25.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 13|reward:   25.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iter

  Episode 5 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/7 (0.36)
    explore-remote: 0/16 (0.00)
    explore-connect: 0/18 (0.00)
    exploit-local: 2/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 6/20 'DQL' ϵ=0.8960, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 10|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 11|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iter

  Episode 6 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/8 (0.38)
    explore-remote: 0/18 (0.00)
    explore-connect: 0/13 (0.00)
    exploit-local: 3/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 7/20 'DQL' ϵ=0.8952, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   23.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   23.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   23.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   29.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   29.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|Iteration 12|reward:   29.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|Iteration 13|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|It

  Episode 7 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/7 (0.50)
    explore-remote: 0/14 (0.00)
    explore-connect: 2/13 (0.13)
    exploit-local: 5/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 8/20 'DQL' ϵ=0.8944, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:   19.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:   19.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   25.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   25.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   39.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   39.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 8|Iterat

  Episode 8 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/8 (0.50)
    explore-remote: 0/17 (0.00)
    explore-connect: 0/11 (0.00)
    exploit-local: 2/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 9/20 'DQL' ϵ=0.8936, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 5|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 5|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 5|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 9|Iteration 6|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 9|Iteration 6|reward:   36.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 9|Iteration 8|reward:   36.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 9|Iteration 9|reward:   36.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 9|Iterat

  Episode 9 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/7 (0.42)
    explore-remote: 0/16 (0.00)
    explore-connect: 2/14 (0.12)
    exploit-local: 2/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/2 (0.50)
  exploit deflected to exploration: 0
  ## Episode: 10/20 'DQL' ϵ=0.8928, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 2|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 2|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 10|Iteration 11|reward:   31.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 10|Iteration 11|reward:   31.0|last_reward_at:   11|Elapsed Time: 0:00:00||


  Episode 10 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/7 (0.53)
    explore-remote: 0/12 (0.00)
    explore-connect: 0/20 (0.00)
    exploit-local: 0/0 (NaN)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 11/20 'DQL' ϵ=0.8921, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 11|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 2|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 2|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 11|Iteration 3|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 11|Iteration 3|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   31.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 7|reward:   36.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 7|reward:   36.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Epis

  Episode 11 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/11 (0.39)
    explore-remote: 0/14 (0.00)
    explore-connect: 1/13 (0.07)
    exploit-local: 2/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 12/20 'DQL' ϵ=0.8913, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 12|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 12|Iteration 3|reward:   19.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 12|Iteration 3|reward:   19.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 12|Iteration 4|reward:   25.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 12|Iteration 4|reward:   25.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 5|reward:   31.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 5|reward:   31.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 12|Iteration 8|reward:   31.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 12|Iteration 8|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 12|Iteration 8|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Epis

  Episode 12 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/6 (0.45)
    explore-remote: 0/21 (0.00)
    explore-connect: 1/12 (0.08)
    exploit-local: 3/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 13/20 'DQL' ϵ=0.8905, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 13|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 13|Iteration 4|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 13|Iteration 4|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 13|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 6|reward:   25.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 6|reward:   25.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 13|Iteration 7|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 13|Iteration 7|reward:   31.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 13|Iteration 8|reward:   31.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Epis

  Episode 13 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/8 (0.50)
    explore-remote: 0/16 (0.00)
    explore-connect: 0/13 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 14/20 'DQL' ϵ=0.8897, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 14|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 2|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 5|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 7|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 9|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 10|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 10|reward:   24.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 14|Iteration 12|reward:   24.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 14|Iteration 12|reward:   24.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 14|Iteration 13|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||

  Episode 14 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/9 (0.40)
    explore-remote: 0/17 (0.00)
    explore-connect: 0/12 (0.00)
    exploit-local: 3/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 15/20 'DQL' ϵ=0.8889, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 15|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 6|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 6|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 6|reward:   24.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   29.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   29.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 10|reward:   29.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 12|reward:   29.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 13|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
E

  Episode 15 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/3 (0.70)
    explore-remote: 0/19 (0.00)
    explore-connect: 0/15 (0.00)
    exploit-local: 1/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/1 (0.80)
  exploit deflected to exploration: 0
  ## Episode: 16/20 'DQL' ϵ=0.8881, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 16|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 16|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 16|Iteration 2|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 16|Iteration 2|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 16|Iteration 3|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 16|Iteration 3|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 16|Iteration 4|reward:   29.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 16|Iteration 4|reward:   29.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 16|Iteration 7|reward:   29.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 16|Iteration 8|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 16|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 16|Iteration 11|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Epi

  Episode 16 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 3/6 (0.33)
    explore-remote: 0/20 (0.00)
    explore-connect: 1/10 (0.09)
    exploit-local: 3/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/4 (0.43)
  exploit deflected to exploration: 0
  ## Episode: 17/20 'DQL' ϵ=0.8873, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 17|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 17|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 17|Iteration 4|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 17|Iteration 4|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 17|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 17|Iteration 5|reward:   25.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 17|Iteration 5|reward:   25.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 17|Iteration 8|reward:   25.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 17|Iteration 8|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 17|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Epis

  Episode 17 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/7 (0.53)
    explore-remote: 0/15 (0.00)
    explore-connect: 0/12 (0.00)
    exploit-local: 2/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/1 (0.83)
  exploit deflected to exploration: 0
  ## Episode: 18/20 'DQL' ϵ=0.8865, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 18|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 18|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 18|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 18|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 18|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 5|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 7|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 9|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 11|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 13|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 13|reward:   25.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 18|Iteration 13|reward:   25.0|last_reward_at:   13|Elapsed Time: 0:00:00||


  Episode 18 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/5 (0.29)
    explore-remote: 0/15 (0.00)
    explore-connect: 0/18 (0.00)
    exploit-local: 7/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 19/20 'DQL' ϵ=0.8857, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 19|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 19|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 19|Iteration 7|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 19|Iteration 9|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 19|Iteration 11|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 19|Iteration 13|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 19|Iteration 15|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 19|Iteration 17|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 19|Iteration 18|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||

  Episode 19 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 2/6 (0.25)
    explore-remote: 0/29 (0.00)
    explore-connect: 1/7 (0.12)
    exploit-local: 3/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/0 (1.00)
  exploit deflected to exploration: 0
  ## Episode: 20/20 'DQL' ϵ=0.8850, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 20|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 20|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 20|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 20|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 20|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 20|Iteration 5|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 20|Iteration 6|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 20|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 20|Iteration 9|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 20|Iteration 11|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 20|Iteration 13|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 20|Iteration 13|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
E

  Episode 20 stopped at t=50 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/8 (0.33)
    explore-remote: 0/16 (0.00)
    explore-connect: 0/14 (0.00)
    exploit-local: 4/1 (0.80)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/0 (1.00)
  exploit deflected to exploration: 0
simulation ended


In [18]:
# Replay the trained DQL policy greedily (pure exploitation) on the tiny
# Active Directory environment, recording each decision along the way.
tiny = gym.make('ActiveDirectoryTiny-v0')
current_o = tiny.reset()
# The wrapper is seeded with the initial observation so that its action-tracking
# state augmentation starts from the freshly reset environment.
wrapped_env = AgentWrapper(tiny, ActionTrackingStateAugmentation(ep, current_o))
# Use the trained agent to run the steps one by one
max_steps = 8
# History of (explored-node-properties bitmap, action suggested by the DQL agent)
# pairs, one entry per exploit() call.
h = []
for i in range(max_steps):
    # Ask the trained policy for its next action given the current observation.
    _, next_action, _ = l.exploit(wrapped_env, current_o)
    h.append((tiny.get_explored_network_node_properties_bitmap_as_numpy(current_o), next_action))
    # The agent has no action to propose: nothing more to run.
    if next_action is None:
        break
    # Run the suggested action.
    # NOTE(review): assumes wrapped_env.step follows the classic Gym 4-tuple
    # (observation, reward, done, info) -- confirm against AgentWrapper.step.
    current_o, _, is_done, _ = wrapped_env.step(next_action)
    # Stop as soon as the episode terminates: stepping a finished Gym
    # environment without calling reset() is undefined behaviour.
    if is_done:
        break
tiny.render()


Unnamed: 0,id,status,properties,local_attacks,remote_attacks
0,workstation_0,owned,[breach_node],"[FindDomainControllers, AuthorizationSpoofAndC...",[]
1,workstation_1,owned,[admin],"[FindDomainControllers, ScanForCreds, Enumerat...",[]
2,domain_controller_1,owned,[domain_controller],"[FindDomainControllers, DumpNTDS, EnumerateFil...",[]
