In [5]:
import sys
import os
# Add parent directory to Python path so it can find security_env.py
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)
    
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from security_env import SecurityEnv
import pandas as pd
from typing import List, Dict, Any
import copy

class PBTAgent:
    def __init__(self, params: Dict[str, Any]):
        self.params = params
        self.model = None
        self.reward = float('-inf')
        self.steps = 0
    
    def create_env(self):
        return SecurityEnv(
            alpha=self.params['alpha'],
            beta=self.params['beta'],
            s_min=self.params['s_min']
        )
    
    def train(self, n_steps: int):
        env = self.create_env()
        env = DummyVecEnv([lambda: env])
        
        if self.model is None:
            self.model = PPO(
                "MlpPolicy",
                env,
                learning_rate=self.params['learning_rate'],
                n_steps=int(self.params['n_steps']),  # Ensure integer
                batch_size=int(self.params['batch_size']),  # Ensure integer
                n_epochs=int(self.params['n_epochs']),  # Ensure integer
                gamma=self.params['gamma'],
                gae_lambda=self.params['gae_lambda'],
                clip_range=self.params['clip_range'],
                ent_coef=self.params['ent_coef'],
                verbose=0
            )
        
        try:
            self.model.learn(total_timesteps=n_steps)
            self.steps += n_steps
            
            # Evaluate
            eval_env = self.create_env()
            eval_env = DummyVecEnv([lambda: eval_env])
            self.reward, _ = evaluate_model(self.model, eval_env)
        except Exception as e:
            print(f"Error during training: {e}")
            self.reward = float('-inf')  # Assign worst reward on error
    
    def exploit(self, other_agent: 'PBTAgent'):
        """Copy parameters from better performing agent"""
        self.params = copy.deepcopy(other_agent.params)
        self.model = copy.deepcopy(other_agent.model)
    
    def explore(self):
        """Randomly perturb parameters"""
        for key in self.params:
            if isinstance(self.params[key], float):
                self.params[key] *= np.random.uniform(0.8, 1.2)
            elif isinstance(self.params[key], int):
                self.params[key] = int(self.params[key] * np.random.uniform(0.8, 1.2))

def evaluate_model(model, env, n_episodes=5):
    """Evaluate a trained model"""
    rewards = []
    for _ in range(n_episodes):
        obs = env.reset()  # Just get the observation without unpacking
        done = False
        episode_reward = 0
        step_count = 0
        
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            # Handle stable-baselines3 style returns (not gymnasium style)
            obs, reward, done_array, info = env.step(action)
            done = done_array[0]  # Extract scalar from array
            episode_reward += reward[0]  # Extract scalar from array
            step_count += 1
            
            if step_count > 1000:  # Safety to prevent infinite loops
                print("Warning: Episode exceeded 1000 steps, terminating")
                break
                
        rewards.append(episode_reward)
    return np.mean(rewards), np.std(rewards)

def run_pbt(
    population_size: int = 8,
    num_generations: int = 10,
    steps_per_generation: int = 10000,
    exploit_threshold: float = 0.2
):
    """
    Run Population Based Training with tracking of global best agent.
    """
    import copy

    # Initialize population
    population = []
    param_ranges = {
        'learning_rate': np.logspace(-5, -3, 100),
        'n_steps': [2048, 4096, 8192],
        'batch_size': [32, 64, 128],
        'n_epochs': [5, 10, 20],
        'gamma': [0.95, 0.99, 0.995],
        'gae_lambda': [0.9, 0.95, 0.98],
        'clip_range': [0.1, 0.2, 0.3],
        'ent_coef': [0.0, 0.01, 0.005],
        'alpha': [0.3, 0.5, 0.7],
        'beta': [0.3, 0.5, 0.7],
        's_min': [5.0, 8.0, 10.0]
    }

    for _ in range(population_size):
        params = {k: np.random.choice(v) for k, v in param_ranges.items()}
        population.append(PBTAgent(params))

    results = []
    global_best_agent = None
    global_best_reward = float('-inf')

    for generation in range(num_generations):
        print(f"\nGeneration {generation + 1}/{num_generations}")

        # Train all agents
        for i, agent in enumerate(population):
            print(f"Training agent {i + 1}/{population_size}...")
            agent.train(steps_per_generation)

        # Sort agents by reward
        population.sort(key=lambda x: x.reward, reverse=True)

        # Update global best if necessary
        if population[0].reward > global_best_reward:
            global_best_reward = population[0].reward
            global_best_agent = copy.deepcopy(population[0])

        # Record generation results
        results.append({
            'generation': generation,
            'best_reward': population[0].reward,
            'mean_reward': np.mean([a.reward for a in population]),
            'std_reward': np.std([a.reward for a in population]),
            'best_params': population[0].params.copy(),
            'global_best_so_far': global_best_reward
        })

        print(f"Best Reward: {population[0].reward:.2f}")
        print(f"Mean Reward: {np.mean([a.reward for a in population]):.2f}")
        print(f"Best params: learning_rate={population[0].params['learning_rate']:.6f}, alpha={population[0].params['alpha']}, beta={population[0].params['beta']}")

        # Exploit and explore
        for i in range(population_size):
            if i > 0 and np.random.random() < exploit_threshold:
                better_idx = np.random.randint(0, i)
                print(f"Agent {i + 1} exploiting from agent {better_idx + 1}")
                population[i].exploit(population[better_idx])
            population[i].explore()

    # Save full training history
    results_df = pd.DataFrame(results)
    results_df.to_csv('pbt_results.csv', index=False)

    # Re-evaluate global best agent and save
    eval_env = global_best_agent.create_env()
    eval_env = DummyVecEnv([lambda: eval_env])
    mean_reward, std_reward = evaluate_model(global_best_agent.model, eval_env)

    global_best_agent.model.save("best_pbt_model")
    pd.DataFrame([global_best_agent.params]).to_csv('best_pbt_hyperparams.csv', index=False)

    # Save combined summary of hyperparams and performance
    final_summary = {**global_best_agent.params, **{
        'mean_reward': mean_reward,
        'std_reward': std_reward,
        'total_steps': global_best_agent.steps
    }}
    pd.DataFrame([final_summary]).to_csv('best_pbt_summary.csv', index=False)

    print(f"\n🎯 Final Best Agent:")
    print(f"Mean Reward: {mean_reward:.2f} ± {std_reward:.2f}")
    print(f"Total Steps Trained: {global_best_agent.steps}")
    print("Saved to: best_pbt_model, best_pbt_summary.csv, best_pbt_hyperparams.csv")

    return results_df


In [6]:
# Run PBT with default parameters
results = run_pbt()

# Or specify custom parameters
results = run_pbt(
    population_size=8,
    num_generations=10,
    steps_per_generation=10000,
    exploit_threshold=0.2
)


Generation 1/10
Training agent 1/8...
Training agent 2/8...
Training agent 3/8...
Training agent 4/8...
Training agent 5/8...
Training agent 6/8...
Training agent 7/8...
Training agent 8/8...
Best Reward: 6007.22
Mean Reward: 3793.58
Best params: learning_rate=0.000040, alpha=0.7, beta=0.5

Generation 2/10
Training agent 1/8...
Training agent 2/8...
Training agent 3/8...
Training agent 4/8...
Training agent 5/8...
Training agent 6/8...
Training agent 7/8...
Training agent 8/8...
Best Reward: 7227.46
Mean Reward: 4157.92
Best params: learning_rate=0.000045, alpha=0.8373398002731287, beta=0.5877496051322343
Agent 6 exploiting from agent 3
Agent 7 exploiting from agent 3
Agent 8 exploiting from agent 7

Generation 3/10
Training agent 1/8...
Training agent 2/8...
Training agent 3/8...
Training agent 4/8...
Training agent 5/8...
Training agent 6/8...
Training agent 7/8...
Training agent 8/8...
Best Reward: 7586.95
Mean Reward: 5388.02
Best params: learning_rate=0.000045, alpha=0.8556284174

PermissionError: [Errno 13] Permission denied: 'pbt_results.csv'

In [7]:
# Add this as a new cell in your notebook

# Check if results variable exists in memory
if 'results' in globals():
    # Get the user's home directory where we should have permission to write
    home_dir = os.path.expanduser("~")
    
    # Create a safe path to save the results
    safe_path = os.path.join(home_dir, "pbt_results_recovered.csv")
    
    try:
        # Convert results to DataFrame if it's not already
        if not isinstance(results, pd.DataFrame):
            results_df = pd.DataFrame(results)
        else:
            results_df = results
            
        # Save to the new location
        results_df.to_csv(safe_path, index=False)
        print(f"Successfully saved results to: {safe_path}")
        
        # Display the results
        display(results_df)
        
        # Also try to save the best agent's parameters if available
        if 'global_best_agent' in globals() and global_best_agent is not None:
            best_params_path = os.path.join(home_dir, "best_pbt_params_recovered.csv")
            pd.DataFrame([global_best_agent.params]).to_csv(best_params_path, index=False)
            print(f"Saved best parameters to: {best_params_path}")
            
            # Display the best parameters
            print("\nBest Parameters:")
            for k, v in global_best_agent.params.items():
                print(f"{k}: {v}")
    except Exception as e:
        print(f"Error while saving to safe location: {e}")
        
        # If saving fails, at least display the data
        print("\nResults Data (copy this if needed):")
        if 'results_df' in locals():
            display(results_df)
        else:
            print(results)
else:
    print("No 'results' variable found in memory. The PBT run may not have stored results or the variable has been cleared.")

Successfully saved results to: C:\Users\Tuan Anh HSLU\pbt_results_recovered.csv


Unnamed: 0,generation,best_reward,mean_reward,std_reward,best_params
0,0,6310.609822,4156.689906,1424.844878,"{'learning_rate': 5.590810182512223e-05, 'n_st..."
1,1,5481.112669,3854.00788,1406.812136,"{'learning_rate': 6.459929952567174e-05, 'n_st..."
2,2,5738.796873,4295.577623,1612.455038,"{'learning_rate': 0.00048212644448820827, 'n_s..."
3,3,6208.992759,4298.972426,1586.563752,"{'learning_rate': 0.0004906612217360583, 'n_st..."
4,4,6953.402641,5775.737418,1215.222126,"{'learning_rate': 5.0790933463636486e-05, 'n_s..."
5,5,8442.626759,6481.892946,1686.520765,"{'learning_rate': 0.0004310231047012053, 'n_st..."
6,6,9951.201056,7874.880139,2303.658979,"{'learning_rate': 0.0004529411140663297, 'n_st..."
7,7,9453.611517,7536.380702,1958.068128,"{'learning_rate': 0.0004946896235870806, 'n_st..."
8,8,10050.793005,8379.626022,2310.942296,"{'learning_rate': 0.0004909639149261721, 'n_st..."
9,9,11925.588109,9363.934525,2630.018972,"{'learning_rate': 0.0005594100486622986, 'n_st..."
