# DQL agent running on the Active Directory sample environment

In [1]:
from cyberbattle.simulation.model import *
import logging, sys, gym
import cyberbattle.agents.baseline.learner as learner
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_dql as dqla
from cyberbattle.agents.baseline.agent_wrapper import ActionTrackingStateAugmentation, AgentWrapper, Verbosity
# Route log records to stdout so they show up in the cell output, and keep
# only records at ERROR level or above.
logging.basicConfig(
    level=logging.ERROR,
    format="%(levelname)s: %(message)s",
    stream=sys.stdout,
)


In [2]:
# Gym ids of the Active Directory sample environments: versions v0, v1 and v2.
gymids = ["ActiveDirectory-v" + str(version) for version in range(3)]

# Maximum number of iterations (steps) in one episode.
iteration_count = 800

# Number of training episodes run on each environment.
training_episode_count = 10

In [3]:
# Instantiate one gym environment per id listed in `gymids`.
envs = list(map(gym.make, gymids))

# Environment bounds shared by every run below; the identifiers are taken
# from the first environment only.
ep = w.EnvironmentBounds.of_identifiers(
    identifiers=envs[0].identifiers,
    maximum_node_count=30,
    maximum_total_credentials=50000,
)


In [4]:
# Train a single Deep Q-learning policy on every environment in turn (transfer
# learning): the learner returned by one run is the one handed to the next run.
l = dqla.DeepQLearnerPolicy(
    ep=ep,
    # Noted as the torch default learning rate (1e-2). That is the default of
    # torch.optim.RMSprop; torch.optim.Adam defaults to 1e-3 — confirm against
    # the optimizer used inside agent_dql.
    learning_rate=0.01,
    gamma=0.015,
    batch_size=512,
    replay_memory_size=10000,
    target_update=5,
)
for env in envs:
    dqn_learning_run = learner.epsilon_greedy_search(
        # What is trained, and on which environment.
        cyberbattle_gym_env=env,
        environment_properties=ep,
        learner=l,
        title="DQL",
        # Training budget for this environment.
        episode_count=training_episode_count,
        iteration_count=iteration_count,
        # Exploration schedule. In the recorded output below, epsilon starts
        # again at 0.9 for every environment and is still above 0.8 at the
        # start of the tenth episode.
        epsilon=0.90,
        epsilon_minimum=0.10,
        epsilon_exponential_decay=50000,
        # Reporting options.
        verbosity=Verbosity.Quiet,
        render=False,
        plot_episodes_length=False,
    )
    # Carry the learner from this run over to the next environment.
    l = dqn_learning_run["learner"]


###### DQL
Learning with: episode_count=10,iteration_count=800,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/10 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   24.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 11|reward:   29.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 11|reward:   29.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 1|Iteration 16|reward:   35.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 1|Iteration 16|reward:   35.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 1|Iteration 23|reward:   41.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 1|Iteration 23|reward:   41.0|last_reward_at:   23|Elapsed Time: 0:00:00||
Episode 1|

  Episode 1 stopped at t=800 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 23/190 (0.11)
    explore-remote: 0/262 (0.00)
    explore-connect: 1/251 (0.00)
    exploit-local: 2/24 (0.08)
    exploit-remote: 0/1 (0.00)
    exploit-connect: 5/41 (0.11)
  exploit deflected to exploration: 7
  ## Episode: 2/10 'DQL' ϵ=0.8873, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   25.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   25.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   31.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   31.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   37.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   37.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   37.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Itera

  Episode 2 stopped at t=800 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 16/183 (0.08)
    explore-remote: 0/239 (0.00)
    explore-connect: 7/273 (0.03)
    exploit-local: 16/55 (0.23)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/10 (0.09)
  exploit deflected to exploration: 1
  ## Episode: 3/10 'DQL' ϵ=0.8748, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   12.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   31.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|Iteration 13|reward:   37.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|Ite

  Episode 3 stopped at t=800 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/166 (0.05)
    explore-remote: 0/256 (0.00)
    explore-connect: 6/264 (0.02)
    exploit-local: 29/46 (0.39)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/22 (0.12)
  exploit deflected to exploration: 16
  ## Episode: 4/10 'DQL' ϵ=0.8625, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   18.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   29.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   29.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   35.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   35.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 4|Iteration 11|reward:   41.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 4|Iteration 11|reward:   41.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 4|It

  Episode 4 ended at t=403 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/77 (0.09)
    explore-remote: 0/137 (0.00)
    explore-connect: 3/119 (0.02)
    exploit-local: 28/14 (0.67)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 7/10 (0.41)
  exploit deflected to exploration: 1
  ## Episode: 5/10 'DQL' ϵ=0.8564, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   18.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 7|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 7|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 5|Iteration 13|reward:   36.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 5|Iteration 13|reward:   36.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 5|Iteration 16|reward:   36.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 5|It

  Episode 5 stopped at t=800 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/182 (0.02)
    explore-remote: 0/238 (0.00)
    explore-connect: 0/253 (0.00)
    exploit-local: 33/16 (0.67)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 9/65 (0.12)
  exploit deflected to exploration: 4
  ## Episode: 6/10 'DQL' ϵ=0.8444, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   23.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   23.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   29.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   29.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 10|reward:   35.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 10|reward:   35.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 6|Iteration 12|reward:   41.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 6|Iteration 12|reward:   41.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 6|It

  Episode 6 ended at t=399 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/89 (0.07)
    explore-remote: 0/118 (0.00)
    explore-connect: 6/120 (0.05)
    exploit-local: 29/7 (0.81)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/19 (0.17)
  exploit deflected to exploration: 0
  ## Episode: 7/10 'DQL' ϵ=0.8385, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 5|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 5|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 8|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 11|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 12|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 12|reward:   24.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 7|Iteration 14|reward:   30.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 7|Iteration 14|reward:   30.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 7|Iteration 17|reward:   36.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 7|

  Episode 7 ended at t=552 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/122 (0.05)
    explore-remote: 0/177 (0.00)
    explore-connect: 3/154 (0.02)
    exploit-local: 29/10 (0.74)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 7/43 (0.14)
  exploit deflected to exploration: 1
  ## Episode: 8/10 'DQL' ϵ=0.8304, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   29.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   29.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 8|Iteration 9|reward:   39.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 8|Iteration 9|reward:   39.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 8|Iteration 11|reward:   45.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 8|Itera

  Episode 8 ended at t=495 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/106 (0.04)
    explore-remote: 0/157 (0.00)
    explore-connect: 2/133 (0.01)
    exploit-local: 33/3 (0.92)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 8/49 (0.14)
  exploit deflected to exploration: 1
  ## Episode: 9/10 'DQL' ϵ=0.8232, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 3|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 5|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 8|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 8|reward:   18.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 9|Iteration 9|reward:   24.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 9|Iteration 9|reward:   24.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   30.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   36.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   36.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 9|It

  Episode 9 ended at t=539 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/115 (0.07)
    explore-remote: 0/162 (0.00)
    explore-connect: 2/157 (0.01)
    exploit-local: 28/4 (0.88)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 8/54 (0.13)
  exploit deflected to exploration: 0
  ## Episode: 10/10 'DQL' ϵ=0.8154, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 3|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 6|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 8|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 8|reward:   24.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
E

  Episode 10 ended at t=351 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/62 (0.07)
    explore-remote: 0/102 (0.00)
    explore-connect: 1/100 (0.01)
    exploit-local: 32/4 (0.89)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 9/36 (0.20)
  exploit deflected to exploration: 0
simulation ended
###### DQL
Learning with: episode_count=10,iteration_count=800,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/10 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 10|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 12|reward:   36.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 12|reward:   36.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 1|Iteration 13|reward:   37.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 1|It

  Episode 1 ended at t=184 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/40 (0.11)
    explore-remote: 0/58 (0.00)
    explore-connect: 2/50 (0.04)
    exploit-local: 17/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/8 (0.33)
  exploit deflected to exploration: 0
  ## Episode: 2/10 'DQL' ϵ=0.8971, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   18.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 10|reward:   18.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   18.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 13|reward:   18.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 16|reward:   18.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|It

  Episode 2 ended at t=495 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/108 (0.07)
    explore-remote: 0/191 (0.00)
    explore-connect: 2/140 (0.01)
    exploit-local: 14/3 (0.82)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/25 (0.14)
  exploit deflected to exploration: 6
  ## Episode: 3/10 'DQL' ϵ=0.8892, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:    6.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   12.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   12.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   36.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 3|Iteration 12|reward:   37.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 3|Itera

  Episode 3 ended at t=179 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/24 (0.31)
    explore-remote: 0/71 (0.00)
    explore-connect: 2/53 (0.04)
    exploit-local: 10/2 (0.83)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/2 (0.67)
  exploit deflected to exploration: 0
  ## Episode: 4/10 'DQL' ϵ=0.8864, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   30.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|Iteration 15|reward:   30.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|Iteration 18|reward:   30.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|I

  Episode 4 ended at t=268 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/59 (0.06)
    explore-remote: 0/90 (0.00)
    explore-connect: 2/82 (0.02)
    exploit-local: 18/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/9 (0.31)
  exploit deflected to exploration: 0
  ## Episode: 5/10 'DQL' ϵ=0.8822, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   18.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 6|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   31.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   37.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   37.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iter

  Episode 5 ended at t=179 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/36 (0.16)
    explore-remote: 0/50 (0.00)
    explore-connect: 3/58 (0.05)
    exploit-local: 14/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/8 (0.27)
  exploit deflected to exploration: 0
  ## Episode: 6/10 'DQL' ϵ=0.8794, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   24.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 11|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 13|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 16|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Ite

  Episode 6 ended at t=268 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 3/49 (0.06)
    explore-remote: 0/86 (0.00)
    explore-connect: 4/86 (0.04)
    exploit-local: 18/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/20 (0.09)
  exploit deflected to exploration: 0
  ## Episode: 7/10 'DQL' ϵ=0.8752, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 5|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 5|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 7|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 13|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 7|Iteration 13|reward:   36.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 7|Ite

  Episode 7 ended at t=371 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/82 (0.05)
    explore-remote: 0/126 (0.00)
    explore-connect: 3/103 (0.03)
    exploit-local: 18/1 (0.95)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/31 (0.09)
  exploit deflected to exploration: 0
  ## Episode: 8/10 'DQL' ϵ=0.8695, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:   36.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:   36.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 7|reward:   36.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:   36.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 13|reward:   36.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iter

  Episode 8 ended at t=237 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/58 (0.09)
    explore-remote: 0/72 (0.00)
    explore-connect: 2/69 (0.03)
    exploit-local: 16/1 (0.94)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/9 (0.31)
  exploit deflected to exploration: 0
  ## Episode: 9/10 'DQL' ϵ=0.8659, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 7|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 8|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 8|reward:   18.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   18.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 9|Iteration 13|reward:   18.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 9|Iteration 16|reward:   18.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 9|Ite

  Episode 9 ended at t=292 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/71 (0.12)
    explore-remote: 0/84 (0.00)
    explore-connect: 3/84 (0.03)
    exploit-local: 12/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/25 (0.11)
  exploit deflected to exploration: 2
  ## Episode: 10/10 'DQL' ϵ=0.8614, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 5|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 6|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   36.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   36.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   37.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   37.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Ep

  Episode 10 ended at t=367 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 3/74 (0.04)
    explore-remote: 0/104 (0.00)
    explore-connect: 1/131 (0.01)
    exploit-local: 19/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/30 (0.14)
  exploit deflected to exploration: 0
simulation ended
###### DQL
Learning with: episode_count=10,iteration_count=800,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/10 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 10|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 1|Iteration 10|reward:   24.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 1|Iteration 13|reward:   24.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 1|Iteration 16|reward:   24.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 1|Iteration 17|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 1|Iteration 17|reward:   30.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 1|

  Episode 1 ended at t=328 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/65 (0.08)
    explore-remote: 0/111 (0.00)
    explore-connect: 2/111 (0.02)
    exploit-local: 20/1 (0.95)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/7 (0.42)
  exploit deflected to exploration: 0
  ## Episode: 2/10 'DQL' ϵ=0.8948, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   25.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   25.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   25.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   31.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   31.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 2|Ite

  Episode 2 ended at t=530 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 14/110 (0.11)
    explore-remote: 0/181 (0.00)
    explore-connect: 5/166 (0.03)
    exploit-local: 12/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/40 (0.05)
  exploit deflected to exploration: 0
  ## Episode: 3/10 'DQL' ϵ=0.8864, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   24.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|Iteration 13|reward:   24.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|Iteration 15|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|Iteration 15|reward:   30.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 3|Iteration 17|reward:   36.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 3|

  Episode 3 ended at t=320 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/51 (0.14)
    explore-remote: 0/115 (0.00)
    explore-connect: 3/107 (0.03)
    exploit-local: 18/4 (0.82)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/10 (0.29)
  exploit deflected to exploration: 0
  ## Episode: 4/10 'DQL' ϵ=0.8814, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 9|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   36.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   36.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|Iter

  Episode 4 ended at t=730 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/174 (0.03)
    explore-remote: 0/231 (0.00)
    explore-connect: 5/235 (0.02)
    exploit-local: 19/5 (0.79)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/53 (0.04)
  exploit deflected to exploration: 0
  ## Episode: 5/10 'DQL' ϵ=0.8701, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 6|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 5|Iteration 12|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 5|Iter

  Episode 5 ended at t=773 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/163 (0.04)
    explore-remote: 0/269 (0.00)
    explore-connect: 2/224 (0.01)
    exploit-local: 19/0 (1.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/84 (0.06)
  exploit deflected to exploration: 0
  ## Episode: 6/10 'DQL' ϵ=0.8582, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 10|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 12|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 14|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 16|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:01||
Episode 6|Iteration 19|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:01||
Episode 6|I

  Episode 6 ended at t=403 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/90 (0.07)
    explore-remote: 0/129 (0.00)
    explore-connect: 4/120 (0.03)
    exploit-local: 19/10 (0.66)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/21 (0.12)
  exploit deflected to exploration: 0
  ## Episode: 7/10 'DQL' ϵ=0.8522, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 7|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 13|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 14|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 15|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 15|reward:   30.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 7|I

  Episode 7 stopped at t=800 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 5/166 (0.03)
    explore-remote: 0/261 (0.00)
    explore-connect: 1/262 (0.00)
    exploit-local: 17/4 (0.81)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/80 (0.05)
  exploit deflected to exploration: 0
  ## Episode: 8/10 'DQL' ϵ=0.8402, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 8|Iteration 9|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 8|Iteration 9|reward:   36.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   36.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 8|Iteration 15|reward:   37.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 8|Iter

  Episode 8 ended at t=363 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/61 (0.06)
    explore-remote: 0/123 (0.00)
    explore-connect: 1/106 (0.01)
    exploit-local: 22/1 (0.96)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/39 (0.13)
  exploit deflected to exploration: 0
  ## Episode: 9/10 'DQL' ϵ=0.8349, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 5|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 5|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 9|Iteration 7|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 9|Iteration 7|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 9|Iteration 12|reward:   36.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 9|Iteration 12|reward:   36.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 9|Ite

  Episode 9 stopped at t=800 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/182 (0.03)
    explore-remote: 0/256 (0.00)
    explore-connect: 3/231 (0.01)
    exploit-local: 20/6 (0.77)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/93 (0.03)
  exploit deflected to exploration: 0
  ## Episode: 10/10 'DQL' ϵ=0.8232, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 6|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 10|Iteration 6|reward:   24.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 13|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 16|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 19|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||


  Episode 10 ended at t=196 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/41 (0.15)
    explore-remote: 0/57 (0.00)
    explore-connect: 2/48 (0.04)
    exploit-local: 19/2 (0.90)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/15 (0.25)
  exploit deflected to exploration: 0
simulation ended


In [15]:
# Replay the trained agent, one exploit step at a time, on the tiny
# Active Directory environment and render the resulting network state.
tiny = gym.make('ActiveDirectoryTiny-v0')
current_o = tiny.reset()
wrapped_env = AgentWrapper(tiny, ActionTrackingStateAugmentation(ep, current_o))

max_steps = 1000  # upper bound on the number of steps replayed
h = []  # one (explored-node-properties bitmap, suggested action) pair per step
for i in range(max_steps):
    # Ask the agent which action it suggests for the current observation.
    _, next_action, _ = l.exploit(wrapped_env, current_o)

    # Record the bitmap for the observation the suggestion was made from,
    # together with the suggestion itself (which may be None).
    explored_bitmap = tiny.get_explored_network_node_properties_bitmap_as_numpy(current_o)
    h.append((explored_bitmap, next_action))

    if next_action is None:
        # Nothing was suggested, so there is no action to run.
        break

    # Run the suggested action through the wrapper and continue from the
    # observation it returns.
    current_o, _, is_done, _ = wrapped_env.step(next_action)
    if is_done:
        break
tiny.render()


Unnamed: 0_level_0,status,properties,local_attacks,remote_attacks
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
workstation_0,owned,[breach_node],"[AuthorizationSpoofAndCrack, FindDomainControl...",[]
workstation_1,owned,[admin],"[FindDomainControllers, ProbeAdmin, EnumerateF...",[]
domain_controller_1,discovered,,,[]
