# DQL agent running on the Active Directory sample environment

In [11]:
from cyberbattle.simulation.model import *
import logging, sys, gym
import cyberbattle.agents.baseline.learner as learner
import cyberbattle.agents.baseline.agent_wrapper as w
import cyberbattle.agents.baseline.agent_dql as dqla
from cyberbattle.agents.baseline.agent_wrapper import ActionTrackingStateAugmentation, AgentWrapper, Verbosity
# Route log records to stdout so they appear in the cell output, and only
# surface ERROR and above to keep the training output readable.
logging.basicConfig(
    format="%(levelname)s: %(message)s",
    level=logging.ERROR,
    stream=sys.stdout,
)


In [12]:
# Gym ids of the ActiveDirectory environment variants used for training (v0..v2).
gymids = [f"ActiveDirectory-v{i}" for i in range(3)]

# Number of training episodes per environment, and the iteration cap of each episode
# (both are forwarded to the epsilon-greedy search in the training cell).
training_episode_count = 5
iteration_count = 500

In [13]:
# Instantiate one gym environment per id listed in `gymids`.
envs = [gym.make(env_id) for env_id in gymids]

# Environment bounds shared by the learner and the training loop.
# NOTE(review): the identifiers are taken from the first environment only;
# presumably all ActiveDirectory variants share the same identifiers -- confirm.
ep = w.EnvironmentBounds.of_identifiers(
    identifiers=envs[0].identifiers,
    maximum_total_credentials=50000,
    maximum_node_count=30,
)


In [14]:
# Train one Deep Q-learning policy on every environment in turn: the learner
# returned by each run is fed into the next one (transfer learning).
l = dqla.DeepQLearnerPolicy(
    ep=ep,
    gamma=0.015,
    replay_memory_size=10000,
    target_update=5,
    batch_size=512,
    # 0.01 -- noted by the original author as torch's default learning rate
    # (presumably that of the optimizer used by the learner; TODO confirm)
    learning_rate=0.01
)

# Search settings that are identical for every environment.
search_settings = dict(
    environment_properties=ep,
    episode_count=training_episode_count,
    iteration_count=iteration_count,
    epsilon=0.90,
    epsilon_exponential_decay=50000,
    epsilon_minimum=0.10,
    verbosity=Verbosity.Quiet,
    render=False,
    plot_episodes_length=False,
    title="DQL"
)

for env in envs:
    dqn_learning_run = learner.epsilon_greedy_search(
        cyberbattle_gym_env=env,
        learner=l,
        **search_settings
    )
    # Carry the learner produced by this run over to the next environment.
    l = dqn_learning_run["learner"]


###### DQL
Learning with: episode_count=5,iteration_count=500,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/5 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 3|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   27.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   33.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 26|reward:   33.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 26|reward:   34.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 26|reward:   34.0|last_reward_at:   26|Elapsed Time: 0:00:00||
Episode 1|Ite

  Episode 1 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 17/92 (0.16)
    explore-remote: 0/185 (0.00)
    explore-connect: 4/146 (0.03)
    exploit-local: 0/56 (0.00)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 0/0 (NaN)
  exploit deflected to exploration: 1
  ## Episode: 2/5 'DQL' ϵ=0.8921, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 5|reward:   21.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   27.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 12|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 14|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 16|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 18|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|Iteration 21|reward:   27.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 2|I

  Episode 2 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/98 (0.10)
    explore-remote: 0/174 (0.00)
    explore-connect: 4/160 (0.02)
    exploit-local: 7/41 (0.15)
    exploit-remote: 0/1 (0.00)
    exploit-connect: 1/3 (0.25)
  exploit deflected to exploration: 0
  ## Episode: 3/5 'DQL' ϵ=0.8842, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 2|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 5|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   22.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   22.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   28.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   28.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   34.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   34.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 3|Iter

  Episode 3 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/105 (0.10)
    explore-remote: 0/158 (0.00)
    explore-connect: 3/154 (0.02)
    exploit-local: 12/44 (0.21)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/9 (0.25)
  exploit deflected to exploration: 2
  ## Episode: 4/5 'DQL' ϵ=0.8764, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   12.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   12.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   27.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   27.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 13|reward:   32.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 13|reward:   32.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 4|Iteration 16|reward:   38.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 4|It

  Episode 4 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 10/106 (0.09)
    explore-remote: 0/157 (0.00)
    explore-connect: 3/176 (0.02)
    exploit-local: 7/19 (0.27)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/21 (0.05)
  exploit deflected to exploration: 8
  ## Episode: 5/5 'DQL' ϵ=0.8686, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 4|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   21.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   21.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   27.0|last_reward_at:    5|Elapsed Time: 0:00:00||
Episode 5|Iteration 9|reward:   27.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 12|reward:   27.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 15|reward:   28.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 5|Iteration 15|reward:   28.0|last_reward_at:   15|Elapsed Time: 0:00:00||
Episode 5|Ite

  Episode 5 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/104 (0.04)
    explore-remote: 0/149 (0.00)
    explore-connect: 3/159 (0.02)
    exploit-local: 16/17 (0.48)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/46 (0.04)
  exploit deflected to exploration: 2
simulation ended
###### DQL
Learning with: episode_count=5,iteration_count=500,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/5 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:   15.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   20.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 1|Iteration 2|reward:   20.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 5|reward:   20.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   26.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:   26.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 10|reward:   26.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 13|reward:   26.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 13|reward:   32.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 1|Iteration 13|reward:   32.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 1|It

  Episode 1 ended at t=433 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 11/77 (0.12)
    explore-remote: 0/145 (0.00)
    explore-connect: 4/142 (0.03)
    exploit-local: 13/17 (0.43)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/21 (0.12)
  exploit deflected to exploration: 1
  ## Episode: 2/5 'DQL' ϵ=0.8931, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   15.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   15.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 7|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 10|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 12|reward:   26.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 12|reward:   26.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 2|Iteration 16|reward:   26.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 2|Iteration 19|reward:   26.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 2|I

  Episode 2 ended at t=348 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 9/77 (0.10)
    explore-remote: 0/122 (0.00)
    explore-connect: 5/97 (0.05)
    exploit-local: 16/11 (0.59)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 2/9 (0.18)
  exploit deflected to exploration: 0
  ## Episode: 3/5 'DQL' ϵ=0.8876, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 3|Iteration 3|reward:   21.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   27.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   33.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 7|reward:   33.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 10|reward:   33.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iteration 13|reward:   33.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 3|Iter

  Episode 3 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 4/114 (0.03)
    explore-remote: 0/159 (0.00)
    explore-connect: 1/159 (0.01)
    exploit-local: 7/15 (0.32)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/40 (0.02)
  exploit deflected to exploration: 0
  ## Episode: 4/5 'DQL' ϵ=0.8798, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:    6.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   21.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 4|reward:   21.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   27.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   27.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   33.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 4|Iteration 8|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 11|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iteration 12|reward:   34.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 4|Iter

  Episode 4 ended at t=401 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/104 (0.10)
    explore-remote: 0/128 (0.00)
    explore-connect: 4/105 (0.04)
    exploit-local: 13/18 (0.42)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/14 (0.18)
  exploit deflected to exploration: 4
  ## Episode: 5/5 'DQL' ϵ=0.8736, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:    6.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   21.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 2|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 5|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   21.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   27.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 5|Iteration 10|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 13|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 15|reward:   27.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|Iteration 17|reward:   33.0|last_reward_at:   10|Elapsed Time: 0:00:00||
Episode 5|I

  Episode 5 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/123 (0.06)
    explore-remote: 0/153 (0.00)
    explore-connect: 4/153 (0.03)
    exploit-local: 12/16 (0.43)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 1/30 (0.03)
  exploit deflected to exploration: 1
simulation ended
###### DQL
Learning with: episode_count=5,iteration_count=500,ϵ=0.9,ϵ_min=0.1, ϵ_expdecay=50000,γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5
  ## Episode: 1/5 'DQL' ϵ=0.9000, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 1|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 4|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 7|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 1|Iteration 8|reward:    6.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 1|Iteration 9|reward:   33.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 12|reward:   34.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 1|Iteration 12|reward:   34.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 1|Iteration 14|reward:   40.0|last_reward_at:   12|Elapsed Time: 0:00:00||
Episode 1|Iteration 14|reward:   40.0|last_reward_at:   14|Elapsed Time: 0:00:00||
Episode 1|It

  Episode 1 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/113 (0.10)
    explore-remote: 0/160 (0.00)
    explore-connect: 3/163 (0.02)
    exploit-local: 14/7 (0.67)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/25 (0.11)
  exploit deflected to exploration: 1
  ## Episode: 2/5 'DQL' ϵ=0.8921, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 2|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   27.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 2|Iteration 1|reward:   27.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   33.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 2|Iteration 2|reward:   33.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   34.0|last_reward_at:    2|Elapsed Time: 0:00:00||
Episode 2|Iteration 4|reward:   34.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 6|reward:   40.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 2|Iteration 6|reward:   40.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   46.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 2|Iteration 9|reward:   46.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Iteration 11|reward:   52.0|last_reward_at:    9|Elapsed Time: 0:00:00||
Episode 2|Itera

  Episode 2 ended at t=438 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 12/91 (0.12)
    explore-remote: 0/135 (0.00)
    explore-connect: 4/141 (0.03)
    exploit-local: 18/15 (0.55)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/17 (0.23)
  exploit deflected to exploration: 0
  ## Episode: 3/5 'DQL' ϵ=0.8851, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 3|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 1|reward:    0.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:    6.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 3|Iteration 4|reward:    6.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   33.0|last_reward_at:    4|Elapsed Time: 0:00:00||
Episode 3|Iteration 6|reward:   33.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 9|reward:   33.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 11|reward:   39.0|last_reward_at:    6|Elapsed Time: 0:00:00||
Episode 3|Iteration 11|reward:   39.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 3|Iteration 14|reward:   39.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 3|Iteration 17|reward:   40.0|last_reward_at:   11|Elapsed Time: 0:00:00||
Episode 3|Iteration 17|reward:   40.0|last_reward_at:   17|Elapsed Time: 0:00:00||
Episode 3|I

  Episode 3 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 14/110 (0.11)
    explore-remote: 0/153 (0.00)
    explore-connect: 1/172 (0.01)
    exploit-local: 12/19 (0.39)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 5/14 (0.26)
  exploit deflected to exploration: 0
  ## Episode: 4/5 'DQL' ϵ=0.8773, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 4|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   27.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 4|Iteration 1|reward:   27.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   33.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 4|Iteration 3|reward:   33.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 6|reward:   33.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   39.0|last_reward_at:    3|Elapsed Time: 0:00:00||
Episode 4|Iteration 7|reward:   39.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 10|reward:   39.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 13|reward:   39.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 15|reward:   39.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|Iteration 17|reward:   45.0|last_reward_at:    7|Elapsed Time: 0:00:00||
Episode 4|It

  Episode 4 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 7/105 (0.06)
    explore-remote: 0/162 (0.00)
    explore-connect: 3/150 (0.02)
    exploit-local: 19/17 (0.53)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 3/34 (0.08)
  exploit deflected to exploration: 1
  ## Episode: 5/5 'DQL' ϵ=0.8696, γ=0.015, lr=0.01, replaymemory=10000,
batch=512, target_update=5


Episode 5|Iteration 0|reward: ------|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   27.0|last_reward_at: ----|Elapsed Time: 0:00:00||
Episode 5|Iteration 1|reward:   27.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 3|reward:   27.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 6|reward:   27.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   33.0|last_reward_at:    1|Elapsed Time: 0:00:00||
Episode 5|Iteration 8|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 5|Iteration 11|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 5|Iteration 13|reward:   33.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 5|Iteration 13|reward:   39.0|last_reward_at:    8|Elapsed Time: 0:00:00||
Episode 5|Iteration 13|reward:   39.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 5|Iteration 15|reward:   45.0|last_reward_at:   13|Elapsed Time: 0:00:00||
Episode 5|I

  Episode 5 stopped at t=500 
  Breakdown [Reward/NoReward (Success rate)]
    explore-local: 8/112 (0.07)
    explore-remote: 0/156 (0.00)
    explore-connect: 2/158 (0.01)
    exploit-local: 18/14 (0.56)
    exploit-remote: 0/0 (NaN)
    exploit-connect: 4/28 (0.12)
  exploit deflected to exploration: 0
simulation ended


In [15]:
# Replay the trained agent one step at a time on the tiny Active Directory environment.
max_steps = 500
h = []  # one (explored-node property bitmap, suggested action) pair per step

tiny = gym.make('ActiveDirectoryTiny-v0')
current_o = tiny.reset()
wrapped_env = AgentWrapper(tiny, ActionTrackingStateAugmentation(ep, current_o))

for i in range(max_steps):
    # Ask the DQL agent for the next action given the current observation.
    _, next_action, _ = l.exploit(wrapped_env, current_o)
    explored_bitmap = tiny.get_explored_network_node_properties_bitmap_as_numpy(current_o)
    h.append((explored_bitmap, next_action))
    if next_action is None:
        # The agent has no action to suggest: stop the replay.
        break
    # Run the suggested action and move on to the resulting observation.
    current_o, _, is_done, _ = wrapped_env.step(next_action)
    if is_done:
        break

tiny.render()


Unnamed: 0_level_0,status,properties,local_attacks,remote_attacks
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
workstation_0,owned,[breach_node],"[AuthorizationSpoofAndCrack, FindDomainControl...",[]
workstation_1,owned,[admin],"[ScanForCreds, FindDomainControllers, ProbeAdm...",[]
domain_controller_1,discovered,,,[]
