In [1]:
!pip install tensorflow
!pip install gym
!pip install keras
!pip install keras-rl2

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.2.1-py3-none-any.whl.metadata (2.4 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.21.0,!

In [85]:
import inspect
import os
import random

from CybORG import CybORG
from CybORG.Agents import B_lineAgent
from CybORG.Agents.Wrappers.ChallengeWrapper import ChallengeWrapper

In [86]:
# Step cap per episode used by the random-action baseline loop.
# NOTE: the hyperparameter cell further down rebinds this name to 100 for training.
MAX_STEPS_PER_GAME = 30
# NOTE(review): MAX_EPS is not referenced by any visible cell — confirm whether it is still needed.
MAX_EPS = 100

In [87]:
print("Setup")

# Resolve the scenario file relative to the directory of the module that
# defines the CybORG class, rather than slicing a fixed number of characters
# off the module path.
cyborg_dir = os.path.dirname(inspect.getfile(CybORG))
path = os.path.join(cyborg_dir, 'Shared', 'Scenarios', 'Scenario2.yaml')

# Name of the agent this notebook controls through the wrapper.
agent_name = 'Blue'

# Build the environment: B_lineAgent is registered for 'Red', and the
# ChallengeWrapper exposes the environment from `agent_name`'s point of view.
env = ChallengeWrapper(env=CybORG(path, 'sim', agents={'Red': B_lineAgent}), agent_name=agent_name)

Setup


In [88]:
# Length of the observation vector and number of discrete actions exposed by
# the wrapped environment; both are used later to size the Q-network.
states, actions = env.observation_space.shape[0], env.action_space.n

print(f"States: {states}", f"Actions: {actions}", sep="\n")

States: 52
Actions: 145


In [89]:
# Baseline: play a handful of episodes with uniformly random actions and print
# the cumulative reward of each, as a reference point for the trained agent.
episodes = 15
for episode in range(1, episodes + 1):
    env.reset()
    done = False
    score = 0

    for _step in range(MAX_STEPS_PER_GAME):
        # randrange(actions) draws from 0..actions-1, same as randint(0, actions-1).
        action = random.randrange(actions)
        _next_state, reward, done, _info = env.step(action)
        score += reward
        # Stop as soon as the environment reports the episode is over instead
        # of continuing to step a finished episode.
        if done:
            break
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:-80.10000000000001
Episode:2 Score:-225.69999999999993
Episode:3 Score:-214.79999999999993
Episode:4 Score:-136.79999999999995
Episode:5 Score:-137.7
Episode:6 Score:-227.69999999999993
Episode:7 Score:-48.600000000000016
Episode:8 Score:-138.79999999999998
Episode:9 Score:-224.79999999999993
Episode:10 Score:-224.99999999999997
Episode:11 Score:-224.79999999999993
Episode:12 Score:-228.79999999999993
Episode:13 Score:-225.79999999999993
Episode:14 Score:-204.79999999999995
Episode:15 Score:-223.69999999999993


In [99]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
from collections import deque
import torch.nn.functional as F

In [91]:
class DuelingDQN(nn.Module):
    """Dueling Q-network with a shared trunk and separate value/advantage heads.

    The input passes through two shared fully connected layers and then splits
    into a value stream (one scalar per state) and an advantage stream (one
    scalar per action). The streams are recombined as
    ``Q = V + (A - mean(A))``, with the mean taken over the action dimension.

    Args:
        state_dim: Size of the input state vector.
        action_dim: Number of discrete actions (size of the output).
        hidden_dim: Width of the two shared layers. Defaults to 128.
        stream_dim: Width of the hidden layer inside each stream. Defaults to 64.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128, stream_dim=64):
        super().__init__()
        # Shared feature trunk
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

        # Value stream
        self.value_fc = nn.Linear(hidden_dim, stream_dim)
        self.value = nn.Linear(stream_dim, 1)

        # Advantage stream
        self.advantage_fc = nn.Linear(hidden_dim, stream_dim)
        self.advantage = nn.Linear(stream_dim, action_dim)

    def forward(self, x):
        """Return Q-values for ``x``.

        Args:
            x: Float tensor whose last dimension has size ``state_dim``; may be
                a single state ``(state_dim,)`` or a batch ``(batch, state_dim)``.

        Returns:
            Tensor shaped like ``x`` with the last dimension replaced by
            ``action_dim``.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))

        value = F.relu(self.value_fc(x))
        value = self.value(value)

        advantage = F.relu(self.advantage_fc(x))
        advantage = self.advantage(advantage)

        # Combine value and advantage streams. The mean is taken over the last
        # (action) dimension so that unbatched 1-D inputs work as well as
        # batched ones; for 2-D input this is identical to dim=1.
        q_vals = value + (advantage - advantage.mean(dim=-1, keepdim=True))
        return q_vals

# Replay Memory to store transitions
class ReplayMemory:
    """Bounded FIFO store of ``(state, action, reward, next_state, done)`` tuples.

    Backed by a ``deque`` with ``maxlen=capacity``, so once the buffer is full
    each new transition pushes out the oldest one.
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` transitions."""
        self.memory = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        transition = (state, action, reward, next_state, done)
        self.memory.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        stored = len(self.memory)
        return stored

In [96]:
# Hyperparameters for the dueling-DQN training run
BATCH_SIZE = 64  # transitions sampled from replay memory per gradient step
GAMMA = 0.99  # discount factor applied to the bootstrapped next-state value
EPSILON_START = 1.0  # initial exploration rate (every action is random)
EPSILON_END = 0.01  # lower bound for the exploration rate
EPSILON_DECAY = 0.995  # epsilon is multiplied by this once per episode
LR = 0.001  # learning rate passed to the Adam optimizer
TARGET_UPDATE = 10  # target network is synced with the policy network every this many episodes
MAX_EPISODES = 2000  # number of training episodes
MAX_STEPS_PER_GAME = 100  # step cap per training episode; NOTE: rebinds the value 30 set in an earlier cell
MEMORY_SIZE = 10000  # replay memory capacity; the oldest transitions are dropped once full

In [97]:
# Replay buffer that the training loop fills and samples from.
memory = ReplayMemory(MEMORY_SIZE)
# Two networks with the same architecture: policy_net is the one being
# optimized, target_net supplies the bootstrap targets.
policy_net = DuelingDQN(states, actions)
target_net = DuelingDQN(states, actions)
# Start the target network as an exact copy of the policy network.
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only policy_net's parameters are handed to the optimizer.
optimizer = optim.Adam(policy_net.parameters(), lr=LR)
criterion = nn.MSELoss()

# Current exploration rate; the training loop decays it after each episode.
epsilon = EPSILON_START

In [98]:
# Train policy_net with epsilon-greedy exploration and experience replay.
# Every LOG_INTERVAL episodes the mean episode score over that window is printed.
LOG_INTERVAL = 50
avg_score = 0
for episode in range(1, MAX_EPISODES + 1):
    state = np.array(env.reset())
    score = 0
    done = False

    for step in range(MAX_STEPS_PER_GAME):
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = random.randrange(actions)
        else:
            with torch.no_grad():
                state_tensor = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
                q_values = policy_net(state_tensor)
                action = torch.argmax(q_values).item()

        next_state, reward, done, _ = env.step(action)
        next_state = np.array(next_state)
        memory.push(state, action, reward, next_state, done)
        state = next_state
        score += reward

        # Training step: update policy network if enough samples in replay memory
        if len(memory) >= BATCH_SIZE:
            transitions = memory.sample(BATCH_SIZE)
            batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(*transitions)

            # Stack the per-transition arrays into one ndarray first; building
            # a tensor directly from a tuple of ndarrays is very slow.
            batch_state = torch.as_tensor(np.stack(batch_state), dtype=torch.float32)
            batch_action = torch.as_tensor(batch_action, dtype=torch.int64).unsqueeze(1)
            batch_reward = torch.as_tensor(batch_reward, dtype=torch.float32).unsqueeze(1)
            batch_next_state = torch.as_tensor(np.stack(batch_next_state), dtype=torch.float32)
            batch_done = torch.as_tensor(np.asarray(batch_done, dtype=np.float32)).unsqueeze(1)

            # Compute current Q values from policy network
            current_q = policy_net(batch_state).gather(1, batch_action)

            # Compute target Q values using target network. The target is a
            # constant for this update, so it is built without autograd;
            # otherwise backward() would also compute and accumulate gradients
            # on target_net's parameters.
            with torch.no_grad():
                next_q = target_net(batch_next_state).max(1)[0].unsqueeze(1)
                # (1 - done) removes the bootstrap term for terminal transitions.
                expected_q = batch_reward + GAMMA * next_q * (1 - batch_done)

            # Compute loss and perform backpropagation
            loss = criterion(current_q, expected_q)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Leave the episode only after the learning step so that the step
        # which ends an episode also contributes an update.
        if done:
            break

    avg_score += score

    # Decay epsilon
    epsilon = max(EPSILON_END, epsilon * EPSILON_DECAY)
    if episode % LOG_INTERVAL == 0:
        avg_score /= LOG_INTERVAL
        print(f"Episode: {episode} Avg Score: {avg_score:.2f}")
        avg_score = 0

    # Update target network periodically
    if episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

Episode: 50 Avg Score: -452.53
Episode: 100 Avg Score: -405.02
Episode: 150 Avg Score: -469.58
Episode: 200 Avg Score: -568.61
Episode: 250 Avg Score: -627.64
Episode: 300 Avg Score: -512.07
Episode: 350 Avg Score: -466.87
Episode: 400 Avg Score: -323.49
Episode: 450 Avg Score: -247.32
Episode: 500 Avg Score: -187.26
Episode: 550 Avg Score: -297.40
Episode: 600 Avg Score: -145.98
Episode: 650 Avg Score: -160.36
Episode: 700 Avg Score: -197.49
Episode: 750 Avg Score: -115.92
Episode: 800 Avg Score: -131.02
Episode: 850 Avg Score: -80.50
Episode: 900 Avg Score: -81.49
Episode: 950 Avg Score: -70.12
Episode: 1000 Avg Score: -73.70
Episode: 1050 Avg Score: -58.19
Episode: 1100 Avg Score: -63.99
Episode: 1150 Avg Score: -75.05
Episode: 1200 Avg Score: -133.21
Episode: 1250 Avg Score: -79.19
Episode: 1300 Avg Score: -60.96
Episode: 1350 Avg Score: -48.17
Episode: 1400 Avg Score: -70.49
Episode: 1450 Avg Score: -78.58
Episode: 1500 Avg Score: -75.53
Episode: 1550 Avg Score: -68.82


KeyboardInterrupt: 