In [2]:
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from security_env import SecurityEnv
import pandas as pd
from typing import List, Dict, Any
import copy

class PBTAgent:
    def __init__(self, params: Dict[str, Any]):
        self.params = params
        self.model = None
        self.reward = float('-inf')
        self.steps = 0
    
    def create_env(self):
        return SecurityEnv(
            rf_model_path="fatigue_model.joblib",
            alpha=self.params['alpha'],
            beta=self.params['beta'],
            s_min=self.params['s_min']
        )
    
    def train(self, n_steps: int):
        env = self.create_env()
        env = DummyVecEnv([lambda: env])
        
        if self.model is None:
            self.model = PPO(
                "MlpPolicy",
                env,
                learning_rate=self.params['learning_rate'],
                n_steps=self.params['n_steps'],
                batch_size=self.params['batch_size'],
                n_epochs=self.params['n_epochs'],
                gamma=self.params['gamma'],
                gae_lambda=self.params['gae_lambda'],
                clip_range=self.params['clip_range'],
                ent_coef=self.params['ent_coef'],
                verbose=0
            )
        
        self.model.learn(total_timesteps=n_steps)
        self.steps += n_steps
        
        # Evaluate
        eval_env = self.create_env()
        eval_env = DummyVecEnv([lambda: eval_env])
        self.reward, _ = evaluate_model(self.model, eval_env)
    
    def exploit(self, other_agent: 'PBTAgent'):
        """Copy parameters from better performing agent"""
        self.params = copy.deepcopy(other_agent.params)
        self.model = copy.deepcopy(other_agent.model)
    
    def explore(self):
        """Randomly perturb parameters"""
        for key in self.params:
            if isinstance(self.params[key], float):
                self.params[key] *= np.random.uniform(0.8, 1.2)
            elif isinstance(self.params[key], int):
                self.params[key] = int(self.params[key] * np.random.uniform(0.8, 1.2))

def evaluate_model(model, env, n_episodes=5):
    """Evaluate a trained model"""
    rewards = []
    for _ in range(n_episodes):
        obs = env.reset()[0]  # Get only the observation
        done = False
        episode_reward = 0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _, _ = env.step(action)  # Updated to handle new gymnasium API
            episode_reward += reward
        rewards.append(episode_reward)
    return np.mean(rewards), np.std(rewards)

def run_pbt(
    population_size: int = 8,
    num_generations: int = 10,
    steps_per_generation: int = 10000,
    exploit_threshold: float = 0.2
):
    """
    Run Population Based Training
    """
    # Initialize population
    population = []
    param_ranges = {
        'learning_rate': np.logspace(-5, -3, 100),
        'n_steps': [2048, 4096, 8192],
        'batch_size': [32, 64, 128],
        'n_epochs': [5, 10, 20],
        'gamma': [0.95, 0.99, 0.995],
        'gae_lambda': [0.9, 0.95, 0.98],
        'clip_range': [0.1, 0.2, 0.3],
        'ent_coef': [0.0, 0.01, 0.005],
        'alpha': [0.3, 0.5, 0.7],
        'beta': [0.3, 0.5, 0.7],
        's_min': [5.0, 8.0, 10.0]
    }
    
    for _ in range(population_size):
        params = {k: np.random.choice(v) for k, v in param_ranges.items()}
        population.append(PBTAgent(params))
    
    results = []
    
    for generation in range(num_generations):
        print(f"\nGeneration {generation + 1}/{num_generations}")
        
        # Train all agents
        for agent in population:
            agent.train(steps_per_generation)
        
        # Sort agents by reward
        population.sort(key=lambda x: x.reward, reverse=True)
        
        # Store results
        results.append({
            'generation': generation,
            'best_reward': population[0].reward,
            'mean_reward': np.mean([a.reward for a in population]),
            'std_reward': np.std([a.reward for a in population])
        })
        
        print(f"Best Reward: {population[0].reward:.2f}")
        print(f"Mean Reward: {np.mean([a.reward for a in population]):.2f}")
        
        # Exploit and explore
        for i in range(population_size):
            if i > 0 and np.random.random() < exploit_threshold:  # Only exploit if i > 0
                # Exploit from better performing agent
                better_agent = population[np.random.randint(0, i)]
                population[i].exploit(better_agent)
            # Explore
            population[i].explore()
    
    # Save results
    results_df = pd.DataFrame(results)
    results_df.to_csv('pbt_results.csv', index=False)
    return results_df


In [3]:
# Run PBT with default parameters
results = run_pbt()

# Or specify custom parameters
results = run_pbt(
    population_size=8,
    num_generations=10,
    steps_per_generation=10000,
    exploit_threshold=0.2
)


Generation 1/10


IndexError: invalid index to scalar variable.