In [4]:
import sys
import os
# Add parent directory to Python path so it can find security_env.py.
# NOTE: '..' is resolved against the current working directory, so this only
# finds security_env.py when the process is started from a directory whose
# parent contains that module.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

import numpy as np
import optuna
from optuna.samplers import TPESampler
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from security_env import SecurityEnv

def _make_vec_env(alpha, beta, s_min):
    """Return a one-environment ``DummyVecEnv`` wrapping a fresh ``SecurityEnv``."""
    # The environment is constructed inside the factory so the closure only
    # captures plain values, never a name that is later rebound to the wrapper.
    return DummyVecEnv(
        [lambda: SecurityEnv(alpha=alpha, beta=beta, s_min=s_min)]
    )


def objective(trial):
    """
    Objective function for Optuna optimization.

    Samples PPO hyperparameters and ``SecurityEnv`` settings from ``trial``,
    trains a PPO agent for a fixed budget, then scores it with
    ``evaluate_model`` on a second environment built with the same settings.

    Args:
        trial: The Optuna trial used to sample the parameters.

    Returns:
        float: Mean episode reward reported by ``evaluate_model``.
    """
    # Define hyperparameter ranges.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    # and samples from the same log-uniform range.
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True),
        'n_steps': trial.suggest_categorical('n_steps', [2048, 4096, 8192]),
        'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128]),
        'n_epochs': trial.suggest_categorical('n_epochs', [5, 10, 20]),
        'gamma': trial.suggest_categorical('gamma', [0.95, 0.99, 0.995]),
        'gae_lambda': trial.suggest_categorical('gae_lambda', [0.9, 0.95, 0.98]),
        'clip_range': trial.suggest_categorical('clip_range', [0.1, 0.2, 0.3]),
        'ent_coef': trial.suggest_float('ent_coef', 1e-5, 1e-2, log=True),
        'alpha': trial.suggest_categorical('alpha', [0.3, 0.5, 0.7]),
        'beta': trial.suggest_categorical('beta', [0.3, 0.5, 0.7]),
        's_min': trial.suggest_categorical('s_min', [5.0, 8.0, 10.0])
    }

    # NOTE(review): alpha/beta/s_min configure the environment used for both
    # training and evaluation. If they shape the reward, trial values are not
    # directly comparable across trials -- confirm against SecurityEnv.
    env_kwargs = {
        'alpha': params['alpha'],
        'beta': params['beta'],
        's_min': params['s_min'],
    }

    # Create environment, then create and train the model.
    train_env = _make_vec_env(**env_kwargs)
    try:
        model = PPO(
            "MlpPolicy",
            train_env,
            learning_rate=params['learning_rate'],
            n_steps=params['n_steps'],
            batch_size=params['batch_size'],
            n_epochs=params['n_epochs'],
            gamma=params['gamma'],
            gae_lambda=params['gae_lambda'],
            clip_range=params['clip_range'],
            ent_coef=params['ent_coef'],
            verbose=0
        )

        # NOTE(review): PPO gathers whole rollouts of n_steps, so the number
        # of environment steps actually run presumably rounds up to a multiple
        # of n_steps and differs between trials -- confirm if budgets matter.
        model.learn(total_timesteps=10000)
    finally:
        # Release the training environment even when training fails.
        train_env.close()

    # Evaluate model on a separate environment with identical settings.
    eval_env = _make_vec_env(**env_kwargs)
    try:
        mean_reward, _ = evaluate_model(model, eval_env)
    finally:
        eval_env.close()

    # Hand Optuna a plain Python float rather than a NumPy scalar.
    return float(mean_reward)

def run_bayesian_optimization(n_trials=50, seed=42,
                              results_path='bayesian_optimization_results.csv'):
    """
    Run Bayesian Optimization using Optuna.

    Creates an in-memory study that maximizes ``objective`` with a seeded TPE
    sampler, prints the best trial, and optionally writes every trial to CSV.

    Args:
        n_trials: Number of trials passed to ``study.optimize``.
        seed: Seed for the ``TPESampler``. It seeds the sampler only; nothing
            here seeds the training done inside ``objective``.
        results_path: Destination of the CSV produced from
            ``study.trials_dataframe()``. Pass ``None`` to skip saving.

    Returns:
        The Optuna study after optimization.
    """
    # Create study
    study = optuna.create_study(
        direction='maximize',
        sampler=TPESampler(seed=seed)
    )

    # Run optimization
    study.optimize(objective, n_trials=n_trials)

    # Print best results
    print("Best trial:")
    trial = study.best_trial
    print("  Value: ", trial.value)
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

    # Save results (one row per trial) unless the caller opted out.
    if results_path is not None:
        results_df = study.trials_dataframe()
        results_df.to_csv(results_path, index=False)

    return study

def evaluate_model(model, env, n_episodes=5):
    """Evaluate a trained model.

    Runs ``n_episodes`` episodes with deterministic actions and sums the
    reward of each episode.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns an ``(action, state)`` pair.
        env: Environment whose ``reset()`` returns an observation and whose
            ``step(action)`` returns ``(obs, reward, done, info)``. ``reward``
            and ``done`` may be scalars or one-element arrays (a vectorized
            wrapper around a single environment).
        n_episodes: Number of episodes to run.

    Returns:
        tuple[float, float]: Mean and standard deviation of the episode
        returns, as plain Python floats.

    Raises:
        ValueError: If ``reward`` or ``done`` holds more than one element,
            i.e. ``env`` wraps several environments.
    """
    episode_returns = []
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, info = env.step(action)
            # .item() unwraps both scalars and one-element arrays into Python
            # scalars, and raises ValueError for anything larger instead of
            # relying on the ambiguous truth value of an array.
            episode_reward += float(np.asarray(reward).item())
            done = bool(np.asarray(done).item())
        episode_returns.append(episode_reward)
    return float(np.mean(episode_returns)), float(np.std(episode_returns))

In [5]:
# Run the search with 50 trials and keep the returned Optuna study in `study`.
study = run_bayesian_optimization(n_trials=50)

[I 2025-04-29 22:38:49,389] A new study created in memory with name: no-name-e86f932e-c773-48b1-876a-86b8165c5671
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3),
  'ent_coef': trial.suggest_loguniform('ent_coef', 1e-5, 1e-2),




[I 2025-04-29 22:42:15,833] Trial 0 finished with value: 2512.673828125 and parameters: {'learning_rate': 5.6115164153345e-05, 'n_steps': 2048, 'batch_size': 32, 'n_epochs': 5, 'gamma': 0.99, 'gae_lambda': 0.9, 'clip_range': 0.2, 'ent_coef': 7.476312062252303e-05, 'alpha': 0.3, 'beta': 0.7, 's_min': 10.0}. Best is trial 0 with value: 2512.673828125.




[I 2025-04-29 22:46:00,342] Trial 1 finished with value: 3264.388671875 and parameters: {'learning_rate': 1.2385137298860926e-05, 'n_steps': 2048, 'batch_size': 64, 'n_epochs': 20, 'gamma': 0.995, 'gae_lambda': 0.95, 'clip_range': 0.1, 'ent_coef': 0.00043664735929796326, 'alpha': 0.5, 'beta': 0.3, 's_min': 5.0}. Best is trial 1 with value: 3264.388671875.




[I 2025-04-29 22:52:59,183] Trial 2 finished with value: 5492.89306640625 and parameters: {'learning_rate': 1.2315571723666024e-05, 'n_steps': 4096, 'batch_size': 32, 'n_epochs': 20, 'gamma': 0.99, 'gae_lambda': 0.98, 'clip_range': 0.3, 'ent_coef': 1.667761543019792e-05, 'alpha': 0.7, 'beta': 0.3, 's_min': 10.0}. Best is trial 2 with value: 5492.89306640625.




[I 2025-04-29 22:55:43,478] Trial 3 finished with value: 2487.403564453125 and parameters: {'learning_rate': 0.00018841476921545086, 'n_steps': 2048, 'batch_size': 64, 'n_epochs': 5, 'gamma': 0.95, 'gae_lambda': 0.95, 'clip_range': 0.2, 'ent_coef': 0.0001702741688676439, 'alpha': 0.3, 'beta': 0.7, 's_min': 10.0}. Best is trial 2 with value: 5492.89306640625.




[I 2025-04-29 23:00:37,484] Trial 4 finished with value: 3512.53759765625 and parameters: {'learning_rate': 0.00040489662225846743, 'n_steps': 4096, 'batch_size': 64, 'n_epochs': 20, 'gamma': 0.99, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 9.324140221663475e-05, 'alpha': 0.5, 'beta': 0.3, 's_min': 5.0}. Best is trial 2 with value: 5492.89306640625.




[I 2025-04-29 23:04:52,827] Trial 5 finished with value: 2673.98828125 and parameters: {'learning_rate': 1.1851515660043103e-05, 'n_steps': 2048, 'batch_size': 64, 'n_epochs': 20, 'gamma': 0.995, 'gae_lambda': 0.95, 'clip_range': 0.2, 'ent_coef': 1.865818136012483e-05, 'alpha': 0.3, 'beta': 0.7, 's_min': 8.0}. Best is trial 2 with value: 5492.89306640625.




[I 2025-04-29 23:09:53,479] Trial 6 finished with value: 5614.96435546875 and parameters: {'learning_rate': 0.0001951396765593445, 'n_steps': 4096, 'batch_size': 32, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0007930569433855138, 'alpha': 0.7, 'beta': 0.3, 's_min': 5.0}. Best is trial 6 with value: 5614.96435546875.




[I 2025-04-29 23:13:56,953] Trial 7 finished with value: 2154.410888671875 and parameters: {'learning_rate': 0.0006267702540388485, 'n_steps': 2048, 'batch_size': 32, 'n_epochs': 10, 'gamma': 0.99, 'gae_lambda': 0.95, 'clip_range': 0.1, 'ent_coef': 1.9099601385505377e-05, 'alpha': 0.3, 'beta': 0.3, 's_min': 8.0}. Best is trial 6 with value: 5614.96435546875.




[I 2025-04-29 23:19:14,974] Trial 8 finished with value: 2241.833251953125 and parameters: {'learning_rate': 0.0001424976719890158, 'n_steps': 8192, 'batch_size': 128, 'n_epochs': 20, 'gamma': 0.95, 'gae_lambda': 0.98, 'clip_range': 0.1, 'ent_coef': 0.00014298589094443974, 'alpha': 0.3, 'beta': 0.5, 's_min': 10.0}. Best is trial 6 with value: 5614.96435546875.




[I 2025-04-29 23:28:10,456] Trial 9 finished with value: 1719.550537109375 and parameters: {'learning_rate': 0.0009552294429449872, 'n_steps': 8192, 'batch_size': 32, 'n_epochs': 20, 'gamma': 0.995, 'gae_lambda': 0.98, 'clip_range': 0.3, 'ent_coef': 0.004677524010176147, 'alpha': 0.5, 'beta': 0.3, 's_min': 10.0}. Best is trial 6 with value: 5614.96435546875.




[I 2025-04-29 23:31:34,134] Trial 10 finished with value: 6611.9716796875 and parameters: {'learning_rate': 4.194601131160366e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0011746931059660698, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-29 23:34:53,044] Trial 11 finished with value: 6554.44384765625 and parameters: {'learning_rate': 5.1544513708209705e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.001326429162132363, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-29 23:38:09,963] Trial 12 finished with value: 6470.52099609375 and parameters: {'learning_rate': 4.597442765558493e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0020817252962872334, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-29 23:41:29,656] Trial 13 finished with value: 6581.8603515625 and parameters: {'learning_rate': 3.0416543930766183e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.00987698895540022, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-29 23:44:47,189] Trial 14 finished with value: 6025.552734375 and parameters: {'learning_rate': 2.6813163881195038e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.00867514565549096, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-29 23:48:06,789] Trial 15 finished with value: 6593.6796875 and parameters: {'learning_rate': 2.613488212282252e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.003067982340749333, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-29 23:51:01,788] Trial 16 finished with value: 6346.0009765625 and parameters: {'learning_rate': 2.3450390618273713e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.95, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.002441199575025527, 'alpha': 0.7, 'beta': 0.5, 's_min': 8.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-29 23:54:13,135] Trial 17 finished with value: 6581.16650390625 and parameters: {'learning_rate': 8.79518847133326e-05, 'n_steps': 8192, 'batch_size': 128, 'n_epochs': 5, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0005912148514718085, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-29 23:56:27,978] Trial 18 finished with value: 5809.78564453125 and parameters: {'learning_rate': 8.649547573079232e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.1, 'ent_coef': 0.003279149371929606, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-29 23:58:47,780] Trial 19 finished with value: 5630.9814453125 and parameters: {'learning_rate': 1.949148146843876e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.95, 'gae_lambda': 0.98, 'clip_range': 0.2, 'ent_coef': 0.0009911971439486177, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:03:16,317] Trial 20 finished with value: 4232.8798828125 and parameters: {'learning_rate': 3.817378946867669e-05, 'n_steps': 8192, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0003295341471817215, 'alpha': 0.5, 'beta': 0.5, 's_min': 8.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:06:06,209] Trial 21 finished with value: 6521.54833984375 and parameters: {'learning_rate': 3.0193351056396477e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.008003127813294037, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:08:20,248] Trial 22 finished with value: 6565.7724609375 and parameters: {'learning_rate': 1.8716367604105584e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.004877520703020233, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:10:37,869] Trial 23 finished with value: 5998.166015625 and parameters: {'learning_rate': 6.419021749635753e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0017941931798196057, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:12:51,132] Trial 24 finished with value: 6583.8369140625 and parameters: {'learning_rate': 3.741151293047051e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.004584691716843749, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:14:54,412] Trial 25 finished with value: 6559.7373046875 and parameters: {'learning_rate': 1.758103241698311e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 5, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.003990467410241338, 'alpha': 0.7, 'beta': 0.7, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:17:08,908] Trial 26 finished with value: 6333.99560546875 and parameters: {'learning_rate': 3.97505102322754e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0013440501663433827, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:19:31,654] Trial 27 finished with value: 6463.0439453125 and parameters: {'learning_rate': 6.876944848069872e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.005667454194803142, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:22:05,991] Trial 28 finished with value: 6329.97412109375 and parameters: {'learning_rate': 0.00012653398503317665, 'n_steps': 4096, 'batch_size': 64, 'n_epochs': 10, 'gamma': 0.95, 'gae_lambda': 0.98, 'clip_range': 0.2, 'ent_coef': 0.0025428643113326412, 'alpha': 0.7, 'beta': 0.5, 's_min': 8.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:24:55,862] Trial 29 finished with value: 4124.57568359375 and parameters: {'learning_rate': 6.359944267657934e-05, 'n_steps': 8192, 'batch_size': 128, 'n_epochs': 5, 'gamma': 0.99, 'gae_lambda': 0.95, 'clip_range': 0.1, 'ent_coef': 4.0813676113162666e-05, 'alpha': 0.5, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:27:52,940] Trial 30 finished with value: 2648.552978515625 and parameters: {'learning_rate': 3.420005507853586e-05, 'n_steps': 2048, 'batch_size': 32, 'n_epochs': 10, 'gamma': 0.99, 'gae_lambda': 0.9, 'clip_range': 0.2, 'ent_coef': 0.0008487587936472669, 'alpha': 0.3, 'beta': 0.7, 's_min': 10.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:30:30,753] Trial 31 finished with value: 6564.72802734375 and parameters: {'learning_rate': 2.6638123579685e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.008597272122416903, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:33:03,804] Trial 32 finished with value: 6597.96728515625 and parameters: {'learning_rate': 1.602245500434695e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.005890839942670577, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:35:29,996] Trial 33 finished with value: 6180.794921875 and parameters: {'learning_rate': 1.4270255906773317e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.003693418424542578, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:37:50,492] Trial 34 finished with value: 6381.3310546875 and parameters: {'learning_rate': 1.4795140691777777e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 5, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.005490674942417662, 'alpha': 0.7, 'beta': 0.7, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:40:10,046] Trial 35 finished with value: 6087.83349609375 and parameters: {'learning_rate': 1.0798952400295223e-05, 'n_steps': 2048, 'batch_size': 64, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.95, 'clip_range': 0.3, 'ent_coef': 0.0004829253823879077, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:42:56,167] Trial 36 finished with value: 4187.4326171875 and parameters: {'learning_rate': 2.4084528018850626e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 20, 'gamma': 0.99, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0013916822751076203, 'alpha': 0.5, 'beta': 0.5, 's_min': 10.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:45:45,295] Trial 37 finished with value: 5450.8251953125 and parameters: {'learning_rate': 4.437431367028616e-05, 'n_steps': 4096, 'batch_size': 64, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.1, 'ent_coef': 0.002979788242149263, 'alpha': 0.7, 'beta': 0.3, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:48:33,915] Trial 38 finished with value: 2672.09033203125 and parameters: {'learning_rate': 1.0106127883076524e-05, 'n_steps': 2048, 'batch_size': 32, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.95, 'clip_range': 0.2, 'ent_coef': 0.006164765539190423, 'alpha': 0.3, 'beta': 0.7, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:51:36,468] Trial 39 finished with value: 6217.26806640625 and parameters: {'learning_rate': 0.0003271233841491366, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 20, 'gamma': 0.95, 'gae_lambda': 0.98, 'clip_range': 0.3, 'ent_coef': 0.0018488958200118598, 'alpha': 0.7, 'beta': 0.3, 's_min': 8.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:53:52,723] Trial 40 finished with value: 2674.518798828125 and parameters: {'learning_rate': 2.07539727038721e-05, 'n_steps': 4096, 'batch_size': 64, 'n_epochs': 5, 'gamma': 0.99, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0006867690651771506, 'alpha': 0.3, 'beta': 0.5, 's_min': 10.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:56:14,144] Trial 41 finished with value: 6564.66259765625 and parameters: {'learning_rate': 3.310323558156226e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.009013754878022426, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 00:58:31,012] Trial 42 finished with value: 6481.1064453125 and parameters: {'learning_rate': 1.5837646577880925e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.006332830279893805, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 01:00:49,122] Trial 43 finished with value: 6386.5400390625 and parameters: {'learning_rate': 2.8560973865972813e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.003804928870370026, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 01:04:17,836] Trial 44 finished with value: 6220.2197265625 and parameters: {'learning_rate': 5.27191273817966e-05, 'n_steps': 4096, 'batch_size': 32, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.009952889706920943, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 01:06:41,094] Trial 45 finished with value: 4675.25341796875 and parameters: {'learning_rate': 1.3829502994569037e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.0001973209260592085, 'alpha': 0.5, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 01:10:17,904] Trial 46 finished with value: 5495.10205078125 and parameters: {'learning_rate': 2.2500984086582163e-05, 'n_steps': 8192, 'batch_size': 128, 'n_epochs': 20, 'gamma': 0.995, 'gae_lambda': 0.95, 'clip_range': 0.1, 'ent_coef': 1.099985223916306e-05, 'alpha': 0.7, 'beta': 0.3, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 01:12:11,086] Trial 47 finished with value: 4141.51318359375 and parameters: {'learning_rate': 4.336132568758978e-05, 'n_steps': 2048, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.98, 'clip_range': 0.3, 'ent_coef': 0.002803795683617072, 'alpha': 0.7, 'beta': 0.5, 's_min': 5.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 01:14:32,665] Trial 48 finished with value: 5917.8251953125 and parameters: {'learning_rate': 3.417449984662684e-05, 'n_steps': 4096, 'batch_size': 128, 'n_epochs': 10, 'gamma': 0.995, 'gae_lambda': 0.9, 'clip_range': 0.3, 'ent_coef': 0.006793773622155831, 'alpha': 0.7, 'beta': 0.5, 's_min': 10.0}. Best is trial 10 with value: 6611.9716796875.




[I 2025-04-30 01:17:58,542] Trial 49 finished with value: 2684.553955078125 and parameters: {'learning_rate': 7.855459970676167e-05, 'n_steps': 4096, 'batch_size': 32, 'n_epochs': 10, 'gamma': 0.95, 'gae_lambda': 0.9, 'clip_range': 0.2, 'ent_coef': 0.005092477796407556, 'alpha': 0.3, 'beta': 0.5, 's_min': 8.0}. Best is trial 10 with value: 6611.9716796875.


Best trial:
  Value:  6611.9716796875
  Params: 
    learning_rate: 4.194601131160366e-05
    n_steps: 4096
    batch_size: 128
    n_epochs: 10
    gamma: 0.995
    gae_lambda: 0.9
    clip_range: 0.3
    ent_coef: 0.0011746931059660698
    alpha: 0.7
    beta: 0.5
    s_min: 5.0
