In [1]:
from CybORG import CybORG
import inspect
from CybORG.Agents import B_lineAgent
from CybORG.Agents.Wrappers.ChallengeWrapper import ChallengeWrapper
import random

In [2]:
MAX_STEPS_PER_GAME = 30
MAX_EPS = 100

In [3]:
print("Setup")
path = str(inspect.getfile(CybORG))
path = path[:-10] + f'/Shared/Scenarios/Scenario2.yaml'

agent_name = 'Blue'
env = ChallengeWrapper(env=CybORG(path, 'sim', agents={'Red': B_lineAgent}), agent_name=agent_name)

Setup


In [16]:
states = env.observation_space.shape[0]
actions = env.action_space.n

print(f"States: {states}")
print(f"Actions: {actions}")

States: 52
Actions: 145


In [17]:
episodes = 15
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    for j in range(MAX_STEPS_PER_GAME):
        action = random.randint(0, actions-1)
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:-225.69999999999993
Episode:2 Score:-226.2
Episode:3 Score:-104.79999999999998
Episode:4 Score:-41.5
Episode:5 Score:-224.79999999999993
Episode:6 Score:-204.79999999999995
Episode:7 Score:-226.79999999999993
Episode:8 Score:-50.70000000000002
Episode:9 Score:-219.69999999999993
Episode:10 Score:-47.80000000000002
Episode:11 Score:-226.79999999999993
Episode:12 Score:-171.79999999999993
Episode:13 Score:-202.2
Episode:14 Score:-83.79999999999997
Episode:15 Score:-27.80000000000001


In [18]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

In [19]:
# PPO model with shared layers and separate actor and critic heads.
class PPO(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(PPO, self).__init__()

        # Actor Network
        self.actor = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh(),
            nn.Linear(32, action_dim),
            nn.Softmax(dim=-1)  # Outputs probabilities for discrete actions
        )

        # Critic Network
        self.critic = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.Tanh(),
            nn.Linear(64, 32),
            nn.Tanh(),
            nn.Linear(32, 1)  # Outputs state value
        )

    def act(self, state):
        action_probs = self.actor(state)
        dist = torch.distributions.Categorical(action_probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    def evaluate(self, state, action):
        action_probs = self.actor(state)
        dist = torch.distributions.Categorical(action_probs)
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.critic(state)
        return action_logprobs, state_value.squeeze(), dist_entropy

In [20]:
# Compute returns using Generalized Advantage Estimation (GAE)
def compute_returns(rewards, dones, values, next_value, gamma=0.99, lam=0.95):
    returns = []
    gae = 0
    values = values + [next_value]
    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * (1 - dones[step]) - values[step]
        gae = delta + gamma * lam * (1 - dones[step]) * gae
        returns.insert(0, gae + values[step])
    return returns

In [21]:
hidden_size = 128
lr = 0.002
max_episodes = 10000
max_steps = 100  # or as defined by your environment
update_timestep = 1000  # Number of timesteps to collect before an update
eps_clip = 0.2
gamma = 0.99
lam = 0.95
K_epochs = 6  # PPO update iterations

In [22]:
# Initialize PPO model and optimizer
ppo = PPO(states, actions)
optimizer = optim.Adam(ppo.parameters(), lr=lr)

timestep = 0
memory = []  # Store trajectories
avg_reward = 0

for episode in range(1, max_episodes + 1):
    state = env.reset()
    state = torch.FloatTensor(state)
    episode_reward = 0
    
    for t in range(max_steps):
        timestep += 1

        # Select an action using the current policy
        action, log_prob = ppo.act(state.unsqueeze(0))
        next_state, reward, done, _ = env.step(action)
        next_state = torch.FloatTensor(next_state)
        
        # Save transition in memory: (state, action, log_prob, reward, done)
        memory.append((state, action, log_prob, reward, done))
        episode_reward += reward
        
        state = next_state
        
        if done:
            break

        # PPO Update
        if timestep % update_timestep == 0:
            # Unpack memory
            states, actions, log_probs, rewards, dones = zip(*memory)
            states = torch.stack(states)
            actions = torch.LongTensor(actions)
            old_log_probs = torch.stack(log_probs).detach()

            # Compute state values
            with torch.no_grad():
                values = ppo.critic(states).squeeze().tolist()
                next_value = 0 if done else ppo.critic(state.unsqueeze(0)).item()
            
            # Compute advantages and returns
            returns = compute_returns(rewards, dones, values, next_value, gamma, lam)
            returns = torch.FloatTensor(returns)
            advantages = returns - torch.FloatTensor(values)
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
            
            # PPO update
            for _ in range(K_epochs):
                # Evaluate new policy
                new_log_probs, state_values, dist_entropy = ppo.evaluate(states, actions)
                ratios = torch.exp(new_log_probs - old_log_probs)

                # PPO loss
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
                loss = -torch.min(surr1, surr2).mean() + 0.5 * F.mse_loss(state_values, returns) - 0.01 * dist_entropy.mean()
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            # Reset memory
            memory = []
            timestep = 0
    
    avg_reward += episode_reward
    if episode % 50 == 0:
        avg_reward = avg_reward / 50
        print("Episode: {} Avg Reward: {}".format(episode, avg_reward))
        avg_reward = 0

Episode: 50 Avg Reward: -714.9020000000007
Episode: 100 Avg Reward: -613.5380000000001
Episode: 150 Avg Reward: -661.5920000000004
Episode: 200 Avg Reward: -637.3300000000004
Episode: 250 Avg Reward: -659.0760000000006
Episode: 300 Avg Reward: -573.1620000000003
Episode: 350 Avg Reward: -564.3980000000004
Episode: 400 Avg Reward: -567.8320000000004
Episode: 450 Avg Reward: -522.0720000000006
Episode: 500 Avg Reward: -577.2460000000002
Episode: 550 Avg Reward: -659.1340000000004
Episode: 600 Avg Reward: -661.6440000000007
Episode: 650 Avg Reward: -474.3800000000002
Episode: 700 Avg Reward: -426.03400000000033
Episode: 750 Avg Reward: -472.19400000000024
Episode: 800 Avg Reward: -474.3160000000002
Episode: 850 Avg Reward: -490.34600000000023
Episode: 900 Avg Reward: -632.7260000000002
Episode: 950 Avg Reward: -382.8120000000001
Episode: 1000 Avg Reward: -497.64200000000005
Episode: 1050 Avg Reward: -502.1620000000001
Episode: 1100 Avg Reward: -435.1420000000003
Episode: 1150 Avg Reward: 

KeyboardInterrupt: 