# DQL agent running on the Active Directory sample environment

In [1]:
# The wildcard import stays first so that the explicit imports below take precedence over any same-named export.
from cyberbattle.simulation.model import *
import logging, sys, gym
import random
import numpy as np
import torch
import cyberbattle.agents.baseline.learner as learner
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_dql as dqla
from cyberbattle.agents.baseline.agent_wrapper import ActionTrackingStateAugmentation, AgentWrapper, Verbosity
# Root logger setup: records go to stdout, only ERROR and above are emitted,
# and each record is rendered as "<LEVEL>: <message>".
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.ERROR,
    stream=sys.stdout,
)


In [2]:
# Experiment configuration: which gym environments to train on and for how long.
gymids = [f"ActiveDirectory-v{i}" for i in range(0, 2)]
iteration_count = 1000  # forwarded as `iteration_count` to the training call (episodes stop at t=1000 at the latest)
training_episode_count = 15  # number of episodes run on each environment

# Seed every random number generator up front so a fresh-kernel run is repeatable.
# Python's `random`, PyTorch and NumPy each keep independent generator state, so
# seeding `random` alone leaves the other two unseeded.
# NOTE(review): assumes the learner draws from NumPy / PyTorch; seeding them is harmless otherwise.
SEED = 100
random.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)  # kept last: returns None, so the cell displays no output value

In [3]:
# Instantiate one gym environment per id listed in `gymids`.
envs = list(map(gym.make, gymids))

# Environment bounds built from the identifiers of the first environment,
# with explicit caps on node count and total credentials.
ep = w.EnvironmentBounds.of_identifiers(
    identifiers=envs[0].identifiers,
    maximum_total_credentials=50,
    maximum_node_count=30,
)


In [4]:
# Train a single Deep Q-learning policy on each environment in turn (transfer learning):
# the learner returned by one run is fed into the run on the next environment.
l = dqla.DeepQLearnerPolicy(
    ep=ep,
    gamma=0.015,
    replay_memory_size=10000,
    target_update=5,
    batch_size=512,
    learning_rate=0.01,  # torch default learning rate is 1e-2
)

# Search settings that are identical for every environment.
_search_options = dict(
    environment_properties=ep,
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    epsilon=0.90,
    epsilon_exponential_decay=50000,
    epsilon_minimum=0.40,
    verbosity=Verbosity.Quiet,
    render=False,
    plot_episodes_length=False,
    title="DQL",
)

for env in envs:
    dqn_learning_run = learner.epsilon_greedy_search(
        cyberbattle_gym_env=env,
        learner=l,
        **_search_options,
    )
    # Carry the trained learner over to the next environment.
    l = dqn_learning_run["learner"]


###### DQL
Learning with: episode_count=15,iteration_count=1000,ϵ=0.9,ϵ_min=0.4, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/15 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   15.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   21.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   21.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 15|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 15|reward:   27.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 1|Iteration 38|reward:   27.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 1|Iteration 39|reward:   33.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 1|Iteration 39|reward:   33.0|last_reward_at:   39|Elapsed Time: 0:00:00||
Episode 1|Iteration 64|reward:   33.0|last_reward_at:   39|Elapsed Time: 0:00:00||
Episode 1|

  Episode 1 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 20/216 (0.08)
    explore-remote: 0/346 (0.00)
    explore-connect: 6/309 (0.02)
    exploit-local: 5/92 (0.05)
    exploit-remote: 0/1 (0.00)
    exploit-connect: 0/5 (0.00)
  exploit deflected to exploration: 0
  ## Episode: 2/15 'DQL' ϵ=0.8901, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   15.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 10|reward:   16.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 10|reward:   16.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 2|Iteration 13|reward:   22.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 2|Iteration 13|reward:   22.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 2|Iteration 20|reward:   27.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 2|Iteration 20|reward:   27.0|last_reward_at:   20|Elapsed Time: 0:00:00||
Episode 2|Iteration 26|reward:   27.0|last_reward_at:   20|Elapsed Time: 0:00:00||
Episode 2|Iteration 27|reward:   33.0|last_reward_at:   20|Elapsed Time: 0:00:00||
Episode 

  Episode 2 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 14/225 (0.06)
    explore-remote: 0/330 (0.00)
    explore-connect: 4/310 (0.01)
    exploit-local: 8/89 (0.08)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/19 (0.05)
  exploit deflected to exploration: 1
  ## Episode: 3/15 'DQL' ϵ=0.8804, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   12.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 11|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 11|reward:   18.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 3|Iteration 17|reward:   18.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 3|Iteration 21|reward:   33.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 3|Iteration 21|reward:   33.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 3|Iteration 24|reward:   34.0|last_reward_at:   21|Elapsed Time: 0:00:00||
Episode 3|

  Episode 3 ended at t=609 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/141 (0.08)
    explore-remote: 0/192 (0.00)
    explore-connect: 5/185 (0.03)
    exploit-local: 13/40 (0.25)
    exploit-remote: 0/1 (0.00)
    exploit-connect: 2/17 (0.11)
  exploit deflected to exploration: 4
  ## Episode: 4/15 'DQL' ϵ=0.8746, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   21.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 4|Iteration 18|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 4|Iteration 24|reward:   28.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 4|Iteration 24|reward:   28.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 4|Iteration 26|reward:   34.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 4|Iteration 26|reward:   34.0|last_reward_at:   26|Elapsed Time: 0:00:00||
Episode 4

  Episode 4 ended at t=230 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/38 (0.24)
    explore-remote: 0/81 (0.00)
    explore-connect: 5/66 (0.07)
    exploit-local: 12/9 (0.57)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/5 (0.29)
  exploit deflected to exploration: 0
  ## Episode: 5/15 'DQL' ϵ=0.8724, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 12|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 5|Iteration 12|reward:   27.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 5|Iteration 18|reward:   27.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 5|Iteration 20|reward:   32.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 5|Iteration 20|reward:   32.0|last_reward_at:   20|Elapsed Time: 0:00:00||
Episode 5|Iteration 26|reward:   32.0|last_reward_at:   20|Elapsed Time: 0:00:00||
Episode 5

  Episode 5 ended at t=535 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/99 (0.07)
    explore-remote: 0/148 (0.00)
    explore-connect: 3/189 (0.02)
    exploit-local: 17/42 (0.29)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/25 (0.14)
  exploit deflected to exploration: 0
  ## Episode: 6/15 'DQL' ϵ=0.8674, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 6|reward:   18.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   33.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 13|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 17|reward:   38.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 17|reward:   38.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 6|Ite

  Episode 6 ended at t=582 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/118 (0.10)
    explore-remote: 0/205 (0.00)
    explore-connect: 4/163 (0.02)
    exploit-local: 12/56 (0.18)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/8 (0.27)
  exploit deflected to exploration: 6
  ## Episode: 7/15 'DQL' ϵ=0.8620, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   27.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 8|reward:   33.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 8|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 7|Iteration 13|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 7|Iteration 20|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 7|Iter

  Episode 7 ended at t=604 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/154 (0.06)
    explore-remote: 0/185 (0.00)
    explore-connect: 5/176 (0.03)
    exploit-local: 15/45 (0.25)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/13 (0.13)
  exploit deflected to exploration: 5
  ## Episode: 8/15 'DQL' ϵ=0.8564, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:    6.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 8|Iteration 13|reward:   21.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 8|Iteration 13|reward:   21.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 8|Iteration 17|reward:   27.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 8|Iteration 17|reward:   27.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 8|Iteration 19|reward:   33.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 8|Iteration 19|reward:   33.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 8|Iteration 25|reward:   33.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode

  Episode 8 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/241 (0.04)
    explore-remote: 0/316 (0.00)
    explore-connect: 2/314 (0.01)
    exploit-local: 5/68 (0.07)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/44 (0.02)
  exploit deflected to exploration: 8
  ## Episode: 9/15 'DQL' ϵ=0.8474, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 9|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   27.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 9|Iteration 17|reward:   33.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 9|Iteration 17|reward:   33.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 9|Iteration 22|reward:   38.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 9|Iteration 22|reward:   38.0|last_reward_at:   22|Elapsed Time: 0:00:00||
Episode 9|

  Episode 9 ended at t=475 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 14/110 (0.11)
    explore-remote: 0/136 (0.00)
    explore-connect: 6/141 (0.04)
    exploit-local: 11/42 (0.21)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/14 (0.07)
  exploit deflected to exploration: 7
  ## Episode: 10/15 'DQL' ϵ=0.8432, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 3|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 3|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   12.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 8|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 10|Iteration 8|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 10|Iteration 13|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 10|Iteration 13|reward:   32.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 10|Iteration 13|reward:   32.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 10|Iteration 15|reward:   38.0|last_reward_at:   13|Elapsed Time: 0:00:00||


  Episode 10 ended at t=798 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/157 (0.06)
    explore-remote: 0/257 (0.00)
    explore-connect: 4/255 (0.02)
    exploit-local: 13/56 (0.19)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/43 (0.07)
  exploit deflected to exploration: 8
  ## Episode: 11/15 'DQL' ϵ=0.8361, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 11|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 4|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 4|reward:   15.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   21.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 11|reward:   27.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 11|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 11|Iteration 12|reward:   32.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 11|Iteration 12|reward:   32.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 11|Iteration 15|reward:   38.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 11|Iteration 15|reward:   38.0|last_reward_at:   15|Elapsed Time: 0:00:00|

  Episode 11 ended at t=715 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/143 (0.05)
    explore-remote: 0/234 (0.00)
    explore-connect: 4/196 (0.02)
    exploit-local: 16/90 (0.15)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/21 (0.12)
  exploit deflected to exploration: 12
  ## Episode: 12/15 'DQL' ϵ=0.8299, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 12|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 12|Iteration 8|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 12|Iteration 9|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 12|Iteration 9|reward:   21.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 12|Iteration 11|reward:   27.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 12|Iteration 11|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 12|Iteration 16|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 12|Iteration 17|reward:   28.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 12|Iteration 17|reward:   28.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 12|Iteration 21|reward:   34.0|last_reward_at:   17|Elapsed Time: 0:00:00|

  Episode 12 ended at t=327 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/74 (0.10)
    explore-remote: 0/101 (0.00)
    explore-connect: 3/87 (0.03)
    exploit-local: 15/22 (0.41)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/13 (0.24)
  exploit deflected to exploration: 7
  ## Episode: 13/15 'DQL' ϵ=0.8271, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 13|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 13|Iteration 6|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 13|Iteration 6|reward:   12.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 13|Iteration 10|reward:   27.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 13|Iteration 10|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 13|Iteration 13|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 13|Iteration 16|reward:   33.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 13|Iteration 16|reward:   33.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 13|Iteration 23|reward:   33.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 13|Iteration 27|reward:   34.0|last_reward_at:   16|Elapsed Time: 0:00:00

  Episode 13 ended at t=404 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/96 (0.09)
    explore-remote: 0/135 (0.00)
    explore-connect: 3/101 (0.03)
    exploit-local: 15/27 (0.36)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/14 (0.22)
  exploit deflected to exploration: 5
  ## Episode: 14/15 'DQL' ϵ=0.8237, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 14|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 14|Iteration 8|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 14|Iteration 8|reward:   12.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 14|Iteration 9|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 14|Iteration 9|reward:   27.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 14|Iteration 16|reward:   27.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 14|Iteration 20|reward:   28.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 14|Iteration 20|reward:   28.0|last_reward_at:   20|Elapsed Time: 0:00:00||
Episode 14|Iteration 25|reward:   34.0|last_reward_at:   20|Elapsed Time: 0:00:00||


  Episode 14 ended at t=499 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/112 (0.07)
    explore-remote: 0/148 (0.00)
    explore-connect: 5/141 (0.03)
    exploit-local: 15/43 (0.26)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/25 (0.07)
  exploit deflected to exploration: 5
  ## Episode: 15/15 'DQL' ϵ=0.8195, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 15|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 15|Iteration 2|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 15|Iteration 2|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:   27.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:   27.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 4|reward:   28.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 4|reward:   28.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 9|reward:   28.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 13|reward:   28.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 15|reward:   33.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Ep

  Episode 15 ended at t=501 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/109 (0.10)
    explore-remote: 0/144 (0.00)
    explore-connect: 2/146 (0.01)
    exploit-local: 13/56 (0.19)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/14 (0.26)
  exploit deflected to exploration: 3
simulation ended
###### DQL
Learning with: episode_count=15,iteration_count=1000,ϵ=0.9,ϵ_min=0.4, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/15 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   24.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 13|reward:   24.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 18|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 18|reward:   30.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 1|Iteration 25|reward:   30.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 1|Iteration 29|reward:   31.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 1|Iteration 29|reward:   31.0|last_reward_at:   29|Elapsed Time: 0:00:00||
Episode 1|

  Episode 1 ended at t=400 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 16/81 (0.16)
    explore-remote: 0/135 (0.00)
    explore-connect: 4/123 (0.03)
    exploit-local: 11/19 (0.37)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/8 (0.27)
  exploit deflected to exploration: 3
  ## Episode: 2/15 'DQL' ϵ=0.8960, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   18.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iteration 14|reward:   24.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iteration 14|reward:   24.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 2|Iteration 21|reward:   24.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 2|Iteration 24|reward:   30.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 2|Iteration 24|reward:   30.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 2|Iteration 27|reward:   36.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 2|Iteration 27|reward:   36.0|last_reward_at:   27|Elapsed Time: 0:00:00||
Episode 2

  Episode 2 ended at t=297 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/54 (0.19)
    explore-remote: 0/103 (0.00)
    explore-connect: 2/95 (0.02)
    exploit-local: 12/8 (0.60)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/5 (0.50)
  exploit deflected to exploration: 0
  ## Episode: 3/15 'DQL' ϵ=0.8931, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   24.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 13|reward:   24.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 17|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 17|reward:   30.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 3|Iteration 18|reward:   31.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 3|Iteration 18|reward:   31.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 3|Iteration 25|reward:   31.0|last_reward_at:   18|Elapsed Time: 0:00:00||
Episode 3|

  Episode 3 ended at t=280 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/44 (0.20)
    explore-remote: 0/111 (0.00)
    explore-connect: 3/79 (0.04)
    exploit-local: 9/11 (0.45)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/8 (0.33)
  exploit deflected to exploration: 0
  ## Episode: 4/15 'DQL' ϵ=0.8903, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 5|reward:   12.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 19|reward:   36.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 19|reward:   36.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 4|Iteration 23|reward:   37.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 4|Iteration 23|reward:   37.0|last_reward_at:   23|Elapsed Time: 0:00:00||
Episode 4|I

  Episode 4 ended at t=216 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 16/32 (0.33)
    explore-remote: 0/70 (0.00)
    explore-connect: 5/71 (0.07)
    exploit-local: 9/9 (0.50)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/2 (0.50)
  exploit deflected to exploration: 0
  ## Episode: 5/15 'DQL' ϵ=0.8882, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   24.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 17|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 17|reward:   30.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 5|Iteration 22|reward:   36.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 5|Iteration 22|reward:   36.0|last_reward_at:   22|Elapsed Time: 0:00:00||
Episode 5|Iteration 24|reward:   41.0|last_reward_at:   22|Elapsed Time: 0:00:00||
Episode 5|Iteration 24|reward:   41.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 

  Episode 5 ended at t=278 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/44 (0.23)
    explore-remote: 0/100 (0.00)
    explore-connect: 4/86 (0.04)
    exploit-local: 11/13 (0.46)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/4 (0.43)
  exploit deflected to exploration: 0
  ## Episode: 6/15 'DQL' ϵ=0.8855, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 8|reward:   18.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 12|reward:   24.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 6|Iteration 12|reward:   24.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 6|Iteration 15|reward:   30.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 6|Iteration 15|reward:   30.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 6|Iteration 17|reward:   36.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 6|Iteration 17|reward:   36.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 6|Iteration 21|reward:   37.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 6

  Episode 6 ended at t=343 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/69 (0.16)
    explore-remote: 0/117 (0.00)
    explore-connect: 4/106 (0.04)
    exploit-local: 13/11 (0.54)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/7 (0.30)
  exploit deflected to exploration: 0
  ## Episode: 7/15 'DQL' ϵ=0.8822, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   31.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 7|Iteration 12|reward:   40.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 7|Iteration 12|reward:   40.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 7|Iter

  Episode 7 ended at t=259 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/46 (0.21)
    explore-remote: 0/82 (0.00)
    explore-connect: 5/81 (0.06)
    exploit-local: 11/8 (0.58)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/12 (0.14)
  exploit deflected to exploration: 1
  ## Episode: 8/15 'DQL' ϵ=0.8797, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 8|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 7|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 8|Iteration 7|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   36.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   36.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 8|Iteration 15|reward:   37.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 8|Iteration 15|reward:   37.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 8|Iteration 19|reward:   43.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 8|I

  Episode 8 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/215 (0.05)
    explore-remote: 0/331 (0.00)
    explore-connect: 1/315 (0.00)
    exploit-local: 7/35 (0.17)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/81 (0.04)
  exploit deflected to exploration: 1
  ## Episode: 9/15 'DQL' ϵ=0.8702, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   23.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   23.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 3|reward:   29.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 3|reward:   29.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   35.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   35.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 7|reward:   36.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 9|Iteration 7|reward:   36.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 9|Iteration 13|reward:   36.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 9|Itera

  Episode 9 ended at t=333 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 15/57 (0.21)
    explore-remote: 0/99 (0.00)
    explore-connect: 3/106 (0.03)
    exploit-local: 8/25 (0.24)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/16 (0.20)
  exploit deflected to exploration: 0
  ## Episode: 10/15 'DQL' ϵ=0.8671, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 6|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 6|reward:   12.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 10|Iteration 13|reward:   12.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 10|Iteration 15|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 10|Iteration 15|reward:   30.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 10|Iteration 19|reward:   31.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 10|Iteration 19|reward:   31.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 10|Iteration 25|reward:   32.0|last_reward_at:   19|Elapsed Time: 0:00:00||
Episode 10|Iteration 25|reward:   32.0|last_reward_at:   25|Elapsed Time: 0:00:00

  Episode 10 ended at t=310 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 18/46 (0.28)
    explore-remote: 0/105 (0.00)
    explore-connect: 2/98 (0.02)
    exploit-local: 5/13 (0.28)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/18 (0.22)
  exploit deflected to exploration: 2
  ## Episode: 11/15 'DQL' ϵ=0.8642, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 11|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 13|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 14|reward:   36.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 14|reward:   36.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 11|Iteration 19|reward:   37.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 11|Iteration 19|reward:   37.0|last_reward_at:   19|Elapsed Time: 0:00:00||

  Episode 11 ended at t=519 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 14/108 (0.11)
    explore-remote: 0/164 (0.00)
    explore-connect: 5/167 (0.03)
    exploit-local: 11/28 (0.28)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/20 (0.09)
  exploit deflected to exploration: 0
  ## Episode: 12/15 'DQL' ϵ=0.8594, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 12|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 12|Iteration 6|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 12|Iteration 6|reward:   12.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 12|Iteration 7|reward:   18.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 12|Iteration 7|reward:   18.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 12|Iteration 9|reward:   36.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 12|Iteration 9|reward:   36.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 12|Iteration 10|reward:   37.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 12|Iteration 10|reward:   37.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 12|Iteration 13|reward:   43.0|last_reward_at:   10|Elapsed Time: 0:00:00||
E

  Episode 12 ended at t=262 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/44 (0.23)
    explore-remote: 0/86 (0.00)
    explore-connect: 3/84 (0.03)
    exploit-local: 9/13 (0.41)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/6 (0.40)
  exploit deflected to exploration: 0
  ## Episode: 13/15 'DQL' ϵ=0.8570, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 13|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 13|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 13|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 13|Iteration 6|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 13|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 13|Iteration 11|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 13|Iteration 11|reward:   31.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 13|Iteration 13|reward:   37.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 13|Iteration 13|reward:   37.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 13|Iteration 21|reward:   37.0|last_reward_at:   13|Elapsed Time: 0:00:00||

  Episode 13 ended at t=296 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/52 (0.15)
    explore-remote: 0/86 (0.00)
    explore-connect: 2/93 (0.02)
    exploit-local: 15/26 (0.37)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/8 (0.38)
  exploit deflected to exploration: 2
  ## Episode: 14/15 'DQL' ϵ=0.8543, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 14|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 3|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 3|reward:   18.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 14|Iteration 4|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 14|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 14|Iteration 6|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 14|Iteration 6|reward:   30.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 14|Iteration 11|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 14|Iteration 11|reward:   31.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 14|Iteration 13|reward:   36.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 14|Iteration 13|reward:   36.0|last_reward_at:   13|Elapsed Time: 0:00:00||


  Episode 14 ended at t=456 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/99 (0.10)
    explore-remote: 0/149 (0.00)
    explore-connect: 2/122 (0.02)
    exploit-local: 12/42 (0.22)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/14 (0.26)
  exploit deflected to exploration: 1
  ## Episode: 15/15 'DQL' ϵ=0.8502, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 15|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 15|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 15|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 10|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 17|reward:   35.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 17|reward:   35.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 15|Iteration 21|reward:   36.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 15|Iteration 21|reward:   36.0|last_reward_at:   21|Elapsed Time: 0:00:00||

  Episode 15 ended at t=281 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/50 (0.19)
    explore-remote: 0/81 (0.00)
    explore-connect: 4/98 (0.04)
    exploit-local: 13/13 (0.50)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/7 (0.30)
  exploit deflected to exploration: 0
simulation ended


In [8]:
# Exploit-only rollout: step the trained learner `l` through a fresh
# ActiveDirectory-v1 environment, then render the resulting network state.
tiny = gym.make('ActiveDirectory-v1')
current_o = tiny.reset()
state_augmentation = ActionTrackingStateAugmentation(ep, current_o)
wrapped_env = AgentWrapper(tiny, state_augmentation)

# Upper bound on the number of steps taken in this rollout.
max_steps = 1500

# One (explored-node-properties bitmap, chosen action) pair per step.
h = []
for i in range(max_steps):
    # Ask the learner which action it would take from the current observation.
    _, next_action, _ = l.exploit(wrapped_env, current_o)

    # Record the bitmap of the observation the action was chosen from,
    # including the final entry where no action was proposed.
    explored_bitmap = tiny.get_explored_network_node_properties_bitmap_as_numpy(current_o)
    h.append((explored_bitmap, next_action))

    # Nothing left to try: the learner returned no action.
    if next_action is None:
        break

    # Apply the action and stop once the environment reports it is done.
    current_o, _, is_done, _ = wrapped_env.step(next_action)
    if is_done:
        break

tiny.render()


Unnamed: 0_level_0,status,properties,local_attacks,remote_attacks
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
workstation_0,owned,[breach_node],"[FindDomainControllers, ProbeAdmin, EnumerateF...",[]
workstation_1,owned,[admin],"[FindDomainControllers, ProbeAdmin, EnumerateF...",[]
workstation_2,owned,[admin],"[FindDomainControllers, ProbeAdmin, EnumerateF...",[]
workstation_5,owned,[],"[FindDomainControllers, ProbeAdmin, EnumerateF...",[]
workstation_6,owned,[admin],"[FindDomainControllers, ProbeAdmin, EnumerateF...",[]
domain_controller_1,owned,[domain_controller],"[DumpNTDS, FindDomainControllers, ProbeAdmin, ...",[]
workstation_4,owned,[admin],"[FindDomainControllers, ProbeAdmin, EnumerateF...",[]
share_0,discovered,,,[]
workstation_3,discovered,,,[]
workstation_7,discovered,,,[]
