# DQL agent running on the Active Directory sample environment

In [1]:
# Wildcard import kept first so the explicit imports below take precedence
# over any same-named symbols it brings in.
from cyberbattle.simulation.model import *

import logging
import random
import sys

import gym
import numpy as np
import torch

import cyberbattle.agents.baseline.learner as learner
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_dql as dqla
from cyberbattle.agents.baseline.agent_wrapper import ActionTrackingStateAugmentation, AgentWrapper, Verbosity
# Route log records to stdout so they appear in the cell output, and only
# surface ERROR-level (and above) messages, each prefixed with its level name.
logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format="%(levelname)s: %(message)s")


In [2]:
# Experiment configuration: everything a reader might want to tune lives here.

# Gym ids of the Active Directory environments to train on, in order
# (ActiveDirectory-v0 and ActiveDirectory-v1).
gymids = [f"ActiveDirectory-v{i}" for i in range(2)]

# Value passed as `iteration_count` to each epsilon-greedy training run.
iteration_count = 1000
# Value passed as `episode_count` to each epsilon-greedy training run.
training_episode_count = 15

# Seed every random number generator the run may draw from, not only Python's
# `random` module, so that Restart & Run All reproduces the same results.
# NOTE(review): the learner is torch-based (see the learning-rate note in the
# training cell); NumPy is seeded as well on the assumption that the
# epsilon-greedy search samples from it -- confirm against the library.
random_seed = 100
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

In [3]:
# Instantiate one gym environment per id listed in `gymids`, preserving order.
envs = list(map(gym.make, gymids))

# Environment bounds shared by all runs. The identifiers are taken from the
# first environment only; the numeric limits are fixed upper bounds.
# NOTE(review): presumably every environment in `envs` shares the same
# identifiers and fits within these limits -- confirm.
ep = w.EnvironmentBounds.of_identifiers(
    identifiers=envs[0].identifiers,
    maximum_total_credentials=50,
    maximum_node_count=30,
)


In [4]:
# Train one Deep Q-learning policy across every environment in `envs`
# (transfer learning): the learner returned by each run is fed into the next.
dql_hyperparameters = dict(
    gamma=0.015,
    replay_memory_size=10000,
    target_update=5,
    batch_size=512,
    learning_rate=0.01,  # original note: torch default learning rate is 1e-2
)
l = dqla.DeepQLearnerPolicy(ep=ep, **dql_hyperparameters)

# Keyword arguments that are identical for every epsilon-greedy run below;
# only the environment and the current learner change between runs.
search_settings = dict(
    environment_properties=ep,
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    epsilon=0.90,
    epsilon_exponential_decay=50000,
    epsilon_minimum=0.40,
    verbosity=Verbosity.Quiet,
    render=False,
    plot_episodes_length=False,
    title="DQL",
)

for env in envs:
    dqn_learning_run = learner.epsilon_greedy_search(
        cyberbattle_gym_env=env,
        learner=l,
        **search_settings,
    )
    # Continue with the learner produced by this run. Note that
    # `dqn_learning_run` is overwritten on each pass, so only the results of
    # the last environment remain available after the loop.
    l = dqn_learning_run["learner"]


###### DQL
Learning with: episode_count=15,iteration_count=1000,ϵ=0.9,ϵ_min=0.4, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/15 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   12.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 12|reward:   18.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 12|reward:   18.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 1|Iteration 17|reward:   33.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 1|Iteration 17|reward:   33.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 1|Iteration 24|reward:   38.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 1|Iteration 24|reward:   38.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 1|Iteration 26|reward:   44.0|last_reward_at:   24|Elapsed Time: 0:00:00||
Episode 1

  Episode 1 stopped at t=1000 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 16/255 (0.06)
    explore-remote: 0/327 (0.00)
    explore-connect: 3/312 (0.01)
    exploit-local: 2/79 (0.02)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/5 (0.17)
  exploit deflected to exploration: 26
  ## Episode: 2/15 'DQL' ϵ=0.8901, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:    6.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   12.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   27.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 15|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 19|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Ite

  Episode 2 ended at t=881 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/207 (0.06)
    explore-remote: 0/276 (0.00)
    explore-connect: 6/279 (0.02)
    exploit-local: 11/79 (0.12)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/9 (0.10)
  exploit deflected to exploration: 4
  ## Episode: 3/15 'DQL' ϵ=0.8815, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   21.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 3|Iteration 11|reward:   27.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 3|Iteration 11|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 3|Iteration 13|reward:   32.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 3|Iteration 13|reward:   32.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 3|Iteration 16|reward:   32.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 3|I

  Episode 3 ended at t=775 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/184 (0.06)
    explore-remote: 0/257 (0.00)
    explore-connect: 3/234 (0.01)
    exploit-local: 12/57 (0.17)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/12 (0.25)
  exploit deflected to exploration: 4
  ## Episode: 4/15 'DQL' ϵ=0.8741, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 2|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 15|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 16|reward:   33.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Ite

  Episode 4 ended at t=467 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/96 (0.12)
    explore-remote: 0/161 (0.00)
    explore-connect: 3/143 (0.02)
    exploit-local: 12/21 (0.36)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/14 (0.22)
  exploit deflected to exploration: 3
  ## Episode: 5/15 'DQL' ϵ=0.8697, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   16.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   16.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   21.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   21.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   21.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   27.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 12|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 14|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|It

  Episode 5 ended at t=549 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 14/127 (0.10)
    explore-remote: 0/177 (0.00)
    explore-connect: 3/168 (0.02)
    exploit-local: 12/28 (0.30)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/16 (0.20)
  exploit deflected to exploration: 2
  ## Episode: 6/15 'DQL' ϵ=0.8646, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 4|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 11|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 15|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 16|reward:   33.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 6|Iteration 16|reward:   33.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 6|Iteration 19|reward:   33.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 6|I

  Episode 6 ended at t=520 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/91 (0.12)
    explore-remote: 0/182 (0.00)
    explore-connect: 4/175 (0.02)
    exploit-local: 13/21 (0.38)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/19 (0.14)
  exploit deflected to exploration: 2
  ## Episode: 7/15 'DQL' ϵ=0.8598, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 2|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 4|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:   21.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   21.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 11|reward:   27.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 11|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 7|Iteration 14|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 7|Iteration 17|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 7|It

  Episode 7 ended at t=430 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/100 (0.09)
    explore-remote: 0/135 (0.00)
    explore-connect: 3/120 (0.02)
    exploit-local: 16/31 (0.34)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/11 (0.27)
  exploit deflected to exploration: 2
  ## Episode: 8/15 'DQL' ϵ=0.8559, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 2|reward:   15.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   15.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 8|Iteration 8|reward:   21.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:   21.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 8|Iteration 11|reward:   21.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   27.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 8|Iteration 16|reward:   27.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 8|I

  Episode 8 ended at t=718 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/154 (0.04)
    explore-remote: 0/237 (0.00)
    explore-connect: 3/231 (0.01)
    exploit-local: 17/38 (0.31)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/27 (0.13)
  exploit deflected to exploration: 4
  ## Episode: 9/15 'DQL' ϵ=0.8494, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 5|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 7|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 9|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   27.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 11|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 9|Iteration 14|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 9|Iteration 16|reward:   32.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 9|It

  Episode 9 ended at t=403 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/101 (0.10)
    explore-remote: 0/119 (0.00)
    explore-connect: 4/116 (0.03)
    exploit-local: 14/21 (0.40)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/14 (0.18)
  exploit deflected to exploration: 7
  ## Episode: 10/15 'DQL' ϵ=0.8458, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 3|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 5|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 10|reward:   21.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   21.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 10|Iteration 14|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 10|Iteration 14|reward:   27.0|last_reward_at:   14|Elapsed Time: 0:00:00||

  Episode 10 ended at t=762 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/157 (0.04)
    explore-remote: 0/244 (0.00)
    explore-connect: 5/248 (0.02)
    exploit-local: 17/42 (0.29)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/40 (0.05)
  exploit deflected to exploration: 1
  ## Episode: 11/15 'DQL' ϵ=0.8390, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 11|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 4|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 5|reward:   20.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 5|reward:   20.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   26.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 11|Iteration 6|reward:   26.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 7|reward:   32.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 11|Iteration 7|reward:   32.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 11|Iteration 9|reward:   38.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 11|Iteration 9|reward:   38.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Epis

  Episode 11 ended at t=631 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/134 (0.07)
    explore-remote: 0/189 (0.00)
    explore-connect: 2/208 (0.01)
    exploit-local: 13/35 (0.27)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/35 (0.12)
  exploit deflected to exploration: 8
  ## Episode: 12/15 'DQL' ϵ=0.8335, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 12|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 3|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 7|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 9|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 11|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 12|Iteration 11|reward:   21.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 12|Iteration 12|reward:   21.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 12|Iteration 15|reward:   21.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 12|Iteration 16|reward:   27.0|last_reward_at:   11|Elapsed Time: 0:00:00||

  Episode 12 ended at t=393 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/69 (0.15)
    explore-remote: 0/125 (0.00)
    explore-connect: 2/138 (0.01)
    exploit-local: 13/17 (0.43)
    exploit-remote: 0/1 (0.00)
    exploit-connect: 5/11 (0.31)
  exploit deflected to exploration: 10
  ## Episode: 13/15 'DQL' ϵ=0.8301, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 13|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 13|Iteration 4|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 13|Iteration 4|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 6|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 9|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 12|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 14|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 15|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 15|reward:   27.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 13|Iteration 17|reward:   27.0|last_reward_at:   15|Elapsed Time: 0:00:00||

  Episode 13 ended at t=437 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/85 (0.08)
    explore-remote: 0/153 (0.00)
    explore-connect: 2/121 (0.02)
    exploit-local: 17/25 (0.40)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/22 (0.19)
  exploit deflected to exploration: 13
  ## Episode: 14/15 'DQL' ϵ=0.8264, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 14|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 3|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 3|reward:   15.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 14|Iteration 6|reward:   21.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 14|Iteration 6|reward:   21.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 14|Iteration 8|reward:   22.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 14|Iteration 8|reward:   22.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 14|Iteration 11|reward:   28.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 14|Iteration 11|reward:   28.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 14|Iteration 14|reward:   34.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 14|Iteration 14|reward:   34.0|last_reward_at:   14|Elapsed Time: 0:00:00||


  Episode 14 ended at t=342 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/61 (0.14)
    explore-remote: 0/110 (0.00)
    explore-connect: 3/102 (0.03)
    exploit-local: 15/17 (0.47)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/20 (0.17)
  exploit deflected to exploration: 0
  ## Episode: 15/15 'DQL' ϵ=0.8235, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 15|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 15|Iteration 3|reward:   21.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 4|reward:   27.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 15|Iteration 4|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 10|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 11|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 13|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 15|Iteration 13|reward:   28.0|last_reward_at:    4|Elapsed Time: 0:00:00||


  Episode 15 ended at t=293 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/54 (0.07)
    explore-remote: 0/97 (0.00)
    explore-connect: 2/86 (0.02)
    exploit-local: 22/9 (0.71)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/14 (0.26)
  exploit deflected to exploration: 0
simulation ended
###### DQL
Learning with: episode_count=15,iteration_count=1000,ϵ=0.9,ϵ_min=0.4, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/15 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 6|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   30.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 12|reward:   30.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 15|reward:   30.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 16|reward:   31.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 16|reward:   31.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 1|It

  Episode 1 ended at t=406 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/84 (0.12)
    explore-remote: 0/149 (0.00)
    explore-connect: 3/125 (0.02)
    exploit-local: 15/6 (0.71)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/8 (0.33)
  exploit deflected to exploration: 0
  ## Episode: 2/15 'DQL' ϵ=0.8960, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 3|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 8|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   31.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   31.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 2|Iteration 14|reward:   31.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 2|Ite

  Episode 2 ended at t=515 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 13/101 (0.11)
    explore-remote: 0/171 (0.00)
    explore-connect: 3/177 (0.02)
    exploit-local: 14/9 (0.61)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/23 (0.15)
  exploit deflected to exploration: 0
  ## Episode: 3/15 'DQL' ϵ=0.8909, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   31.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   31.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   40.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   40.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 8|reward:   46.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iterat

  Episode 3 ended at t=375 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/68 (0.09)
    explore-remote: 0/127 (0.00)
    explore-connect: 5/119 (0.04)
    exploit-local: 18/8 (0.69)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/21 (0.09)
  exploit deflected to exploration: 0
  ## Episode: 4/15 'DQL' ϵ=0.8872, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 9|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   24.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|Iteration 15|reward:   30.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 4|Iteration 15|reward:   30.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 4|Iteration 17|reward:   30.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 4|Iteration 20|reward:   30.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 4|

  Episode 4 ended at t=672 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/149 (0.06)
    explore-remote: 0/214 (0.00)
    explore-connect: 3/231 (0.01)
    exploit-local: 17/12 (0.59)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/32 (0.11)
  exploit deflected to exploration: 5
  ## Episode: 5/15 'DQL' ϵ=0.8807, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   23.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   23.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   23.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   23.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 14|reward:   23.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 16|reward:   29.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 16|reward:   29.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 5|Iteration 18|reward:   35.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 5|I

  Episode 5 ended at t=763 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/143 (0.07)
    explore-remote: 0/249 (0.00)
    explore-connect: 4/250 (0.02)
    exploit-local: 16/11 (0.59)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/76 (0.04)
  exploit deflected to exploration: 2
  ## Episode: 6/15 'DQL' ϵ=0.8734, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 6|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 6|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 6|Iteration 2|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 5|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 7|reward:   12.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 10|reward:   18.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 6|Iteration 10|reward:   18.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 6|Iteration 13|reward:   18.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 6|Iteration 15|reward:   18.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 6|Iteration 18|reward:   18.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 6|I

  Episode 6 ended at t=975 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/186 (0.06)
    explore-remote: 0/343 (0.00)
    explore-connect: 3/301 (0.01)
    exploit-local: 15/32 (0.32)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/79 (0.05)
  exploit deflected to exploration: 3
  ## Episode: 7/15 'DQL' ϵ=0.8643, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 7|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 3|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 7|Iteration 6|reward:    6.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 7|reward:    6.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   24.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 7|Iteration 9|reward:   24.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   30.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 7|Iteration 10|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|Iteration 13|reward:   30.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|Iteration 13|reward:   36.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 7|It

  Episode 7 ended at t=558 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/122 (0.08)
    explore-remote: 0/174 (0.00)
    explore-connect: 4/177 (0.02)
    exploit-local: 17/23 (0.42)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/28 (0.10)
  exploit deflected to exploration: 0
  ## Episode: 8/15 'DQL' ϵ=0.8591, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 8|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 8|Iteration 3|reward:   18.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 5|reward:   18.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 7|reward:   18.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 10|reward:   18.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 8|Iteration 12|reward:   24.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 8|Iteration 15|reward:   24.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 8|Iteration 16|reward:   30.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 8|Iteration 16|reward:   30.0|last_reward_at:   16|Elapsed Time: 0:00:00||
Episode 8|

  Episode 8 ended at t=456 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 6/87 (0.06)
    explore-remote: 0/149 (0.00)
    explore-connect: 3/142 (0.02)
    exploit-local: 19/16 (0.54)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/30 (0.12)
  exploit deflected to exploration: 0
  ## Episode: 9/15 'DQL' ϵ=0.8550, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 9|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 9|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 9|Iteration 2|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 4|reward:   24.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 5|reward:   30.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 9|Iteration 5|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 9|Iteration 8|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 9|Iteration 10|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 9|Iteration 12|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 9|Iteration 12|reward:   36.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 9|Ite

  Episode 9 ended at t=263 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/46 (0.13)
    explore-remote: 0/83 (0.00)
    explore-connect: 2/77 (0.03)
    exploit-local: 18/8 (0.69)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/17 (0.23)
  exploit deflected to exploration: 0
  ## Episode: 10/15 'DQL' ϵ=0.8526, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 10|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 10|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 3|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 4|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 5|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 7|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 10|Iteration 9|reward:   24.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   30.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 10|Iteration 12|reward:   30.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 10|Iteration 15|reward:   30.0|last_reward_at:   12|Elapsed Time: 0:00:00||
E

  Episode 10 ended at t=354 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/97 (0.10)
    explore-remote: 0/95 (0.00)
    explore-connect: 3/107 (0.03)
    exploit-local: 16/9 (0.64)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/12 (0.25)
  exploit deflected to exploration: 0
  ## Episode: 11/15 'DQL' ϵ=0.8494, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 11|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 11|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 3|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 4|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 11|Iteration 4|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 11|Iteration 7|reward:   24.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 11|Iteration 8|reward:   30.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 11|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 11|Iteration 10|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 11|Iteration 10|reward:   36.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 11|Iteration 13|reward:   36.0|last_reward_at:   10|Elapsed Time: 0:00:00||
E

  Episode 11 ended at t=633 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/135 (0.06)
    explore-remote: 0/181 (0.00)
    explore-connect: 2/207 (0.01)
    exploit-local: 18/19 (0.49)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/58 (0.08)
  exploit deflected to exploration: 0
  ## Episode: 12/15 'DQL' ϵ=0.8437, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 12|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 12|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 12|Iteration 3|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 12|Iteration 3|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 12|Iteration 5|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 12|Iteration 5|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 12|Iteration 8|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 12|Iteration 11|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 12|Iteration 11|reward:   36.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 12|Iteration 13|reward:   36.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 12|Iteration 14|reward:   37.0|last_reward_at:   11|Elapsed Time: 0:00:00||


  Episode 12 ended at t=182 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/27 (0.21)
    explore-remote: 0/58 (0.00)
    explore-connect: 1/57 (0.02)
    exploit-local: 17/3 (0.85)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/6 (0.50)
  exploit deflected to exploration: 1
  ## Episode: 13/15 'DQL' ϵ=0.8421, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 13|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 13|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 13|Iteration 3|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 13|Iteration 4|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 13|Iteration 4|reward:   12.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 5|reward:   18.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 13|Iteration 5|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 13|Iteration 7|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 13|Iteration 10|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 13|Iteration 13|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 13|Iteration 14|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
E

  Episode 13 ended at t=370 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/71 (0.12)
    explore-remote: 0/125 (0.00)
    explore-connect: 1/101 (0.01)
    exploit-local: 16/13 (0.55)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/27 (0.18)
  exploit deflected to exploration: 0
  ## Episode: 14/15 'DQL' ϵ=0.8389, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 14|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 14|Iteration 1|reward:   18.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 3|reward:   24.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 14|Iteration 3|reward:   24.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 14|Iteration 5|reward:   30.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 14|Iteration 5|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 14|Iteration 7|reward:   30.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 14|Iteration 8|reward:   36.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 14|Iteration 8|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 14|Iteration 10|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 14|Iteration 11|reward:   37.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Ep

  Episode 14 ended at t=830 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/187 (0.06)
    explore-remote: 0/262 (0.00)
    explore-connect: 2/244 (0.01)
    exploit-local: 16/14 (0.53)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/89 (0.05)
  exploit deflected to exploration: 1
  ## Episode: 15/15 'DQL' ϵ=0.8316, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 15|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 2|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 5|reward:   18.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 15|Iteration 5|reward:   18.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   24.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 15|Iteration 7|reward:   24.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 8|reward:   30.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 15|Iteration 8|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 15|Iteration 10|reward:   30.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 15|Iteration 12|reward:   36.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Ep

  Episode 15 ended at t=157 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/17 (0.29)
    explore-remote: 0/62 (0.00)
    explore-connect: 1/40 (0.02)
    exploit-local: 19/3 (0.86)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 6/2 (0.75)
  exploit deflected to exploration: 0
simulation ended


In [28]:
# Replay the trained learner `l` on a newly created and reset
# ActiveDirectory-v1 environment, using only its `exploit` step,
# one action at a time.
tiny = gym.make("ActiveDirectory-v1")
current_o = tiny.reset()
state_augmentation = ActionTrackingStateAugmentation(ep, current_o)
wrapped_env = AgentWrapper(tiny, state_augmentation)

# Upper bound on the number of steps taken during this replay
max_steps = 1500

# History of (explored-network node-properties bitmap, chosen action) pairs;
# one entry is recorded per loop iteration, including a final `None` action
h = []

for i in range(max_steps):
    # Ask the learner which action it suggests for the current observation
    _, next_action, _ = l.exploit(wrapped_env, current_o)

    explored_bitmap = tiny.get_explored_network_node_properties_bitmap_as_numpy(current_o)
    h.append((explored_bitmap, next_action))

    # Stop as soon as the learner has no action to suggest
    if next_action is None:
        break

    # Run the suggested action; stop once the step reports `is_done`
    current_o, _, is_done, _ = wrapped_env.step(next_action)
    if is_done:
        break

# Render the environment as it stands after the replay
tiny.render()


Unnamed: 0_level_0,status,properties,local_attacks,remote_attacks
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
workstation_0,owned,[breach_node],"[EnumerateFileShares, AuthorizationSpoofAndCra...",[]
workstation_4,owned,[admin],"[EnumerateFileShares, ProbeAdmin, ScanForCreds...",[]
workstation_1,owned,[admin],"[EnumerateFileShares, ProbeAdmin, ScanForCreds...",[]
workstation_2,owned,[],"[EnumerateFileShares, ProbeAdmin, FindDomainCo...",[]
share_0,discovered,,,[]
domain_controller_1,discovered,,,[]
