In [None]:
!pip install imageio
!pip install SuperSuit==3.4.0
!pip install ray[rllib]==0.8.5
!pip install lz4
!pip install opencv-python==4.5.5.64
!pip install dm_tree
!pip install stable-baselines3
!pip install pettingzoo==1.18.1
!pip install sb3-contrib
!pip install optuna
!pip install gym==0.23.1
!pip install wandb

In [None]:
import numpy as np
import optuna
import supersuit as ss
from stable_baselines3 import A2C
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ppo import CnnPolicy

from social_dilemmas.envs.pettingzoo_env import MAX_CYCLES
from social_dilemmas.envs.pettingzoo_env import parallel_env

In [None]:
from stable_baselines3.common.vec_env import VecMonitor

# Sizing knobs for the environment and its vectorisation.
num_agents, number_of_envs, num_cpus = 2, 16, 8

# Base "harvest" environment with `num_agents` agents.
env = parallel_env(
    env="harvest",
    num_agents=num_agents,
    max_cycles=MAX_CYCLES,
    proportion=0.5,
)

# SuperSuit wrappers, applied in order: resize observations to 36x36,
# stack a single frame, expose the agents as a vectorised environment and
# finally concatenate `number_of_envs` copies for Stable-Baselines3.
env = ss.resize_v1(
    env,
    x_size=36,
    y_size=36,
    linear_interp=False,
)
env = ss.frame_stack_v1(env, 1)
env = ss.pettingzoo_env_to_vec_env_v1(env)
env = ss.concat_vec_envs_v1(
    env,
    number_of_envs,
    num_cpus=num_cpus,
    base_class="stable_baselines3",
)
# Optional monitoring wrapper, currently disabled:
# env = VecMonitor(env, info_keywords=('Utilitarian',), filename='logs/')

In [None]:
def ppo_params(trial:optuna.Trial):
    """Sample one PPO hyperparameter configuration from ``trial``.

    The suggestions are requested in a fixed order (batch_size, learning_rate,
    n_steps, gamma, ent_coef) so that a seeded sampler yields the same
    sequence of values from run to run.

    Args:
        trial: Optuna trial used to draw the values.

    Returns:
        dict with the keys ``learning_rate``, ``n_steps``, ``batch_size``,
        ``gamma`` and ``ent_coef``.
    """
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.
    # NOTE(review): the upper bound of 1 is far wider than the 1e-2 used in
    # a2c_params -- confirm this range is intended.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1, log=True)
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    ent_coef = trial.suggest_float("ent_coef", 1e-8, 0.1, log=True)
    return {
        "learning_rate": learning_rate,
        "n_steps": n_steps,
        "batch_size": batch_size,
        "gamma": gamma,
        "ent_coef": ent_coef,
    }

def a2c_params(trial:optuna.Trial):
    """Sample one A2C hyperparameter configuration from ``trial``.

    The suggestions are requested in a fixed order (gamma, learning_rate,
    ent_coef, vf_coef, max_grad_norm) so that a seeded sampler yields the
    same sequence of values from run to run.

    Args:
        trial: Optuna trial used to draw the values.

    Returns:
        dict with the keys ``learning_rate``, ``vf_coef``, ``gamma``,
        ``ent_coef`` and ``max_grad_norm``.
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    # `suggest_float` replaces the deprecated `suggest_loguniform` (log=True)
    # and `suggest_uniform` (default linear scale) helpers.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    ent_coef = trial.suggest_float("ent_coef", 1e-8, 0.1, log=True)
    vf_coef = trial.suggest_float("vf_coef", 0, 1)
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    return {
        "learning_rate": learning_rate,
        "vf_coef": vf_coef,
        "gamma": gamma,
        "ent_coef": ent_coef,
        "max_grad_norm": max_grad_norm,
    }

def calculate_reward(model, n_eval_episodes=100):
    """Return the mean reward of ``model`` evaluated on its own environment.

    Args:
        model: Trained model; it must provide ``get_env()`` and be accepted
            by ``evaluate_policy``.
        n_eval_episodes: Number of evaluation episodes forwarded to
            ``evaluate_policy``. Defaults to 100.

    Returns:
        The first element (mean reward) of the pair returned by
        ``evaluate_policy``; the second element is discarded.
    """
    mean_reward, _ = evaluate_policy(
        model, model.get_env(), n_eval_episodes=n_eval_episodes
    )
    return mean_reward


class LoggingCallback:
    """Optuna callback that stops a study whose best value has stalled.

    After every trial the study's current best value is compared with the
    best value recorded by the previous invocation (kept in the study's
    user attribute ``previous_best_value``). Once the trial number exceeds
    ``trial_number``, a trial is counted as stagnant when both values have
    the same sign and differ by less than ``threshold``. The study is
    stopped when more than ``patience`` trials have been counted; the
    count is cumulative and is not reset when the best value improves.

    Args:
        threshold: Minimum absolute change of the best value that counts
            as progress.
        trial_number: Trials numbered up to and including this value are
            never counted as stagnant.
        patience: Number of stagnant trials tolerated before stopping.
    """

    def __init__(self, threshold, trial_number, patience):
        self.threshold = threshold
        self.trial_number = trial_number
        self.patience = patience
        # Numbers of the trials that have been counted as stagnant so far.
        self.cb_list = []

    def __call__(self, study: optuna.study.Study, frozen_trial: optuna.trial.FrozenTrial):
        """Record the best value and stop ``study`` if it has stalled."""
        try:
            current_best_value = study.best_value
        except ValueError:
            # No trial has completed yet (e.g. the first trials failed),
            # so there is nothing to compare or record.
            return

        # Read the value stored by the previous invocation *before*
        # overwriting it; otherwise the two values are always identical.
        previous_best_value = study.user_attrs.get('previous_best_value', None)
        study.set_user_attr('previous_best_value', current_best_value)

        if previous_best_value is None:
            return
        if frozen_trial.number <= self.trial_number:
            return
        # Only compare values that share the same sign.
        if previous_best_value * current_best_value < 0:
            return
        if abs(previous_best_value - current_best_value) >= self.threshold:
            return

        self.cb_list.append(frozen_trial.number)
        if len(self.cb_list) > self.patience:
            print('The study stops now..')
            print('With number ', frozen_trial.number, 'and value ', frozen_trial.value)
            print('The previous and current values are {} and {} respectively'
                  .format(previous_best_value, current_best_value))
            study.stop()

def objective(trial: optuna.Trial):
    """Optuna objective: train an A2C model and return its mean reward.

    A hyperparameter set is drawn with ``a2c_params``, an ``A2C`` model with
    ``CnnPolicy`` is trained on the module-level ``env`` for 600,000
    timesteps, and the trained model is scored with ``calculate_reward``.

    Args:
        trial: Optuna trial supplying the hyperparameter suggestions.

    Returns:
        The value returned by ``calculate_reward`` for the trained model.
    """
    hyperparameters = a2c_params(trial)
    # The keys produced by a2c_params (learning_rate, gamma, ent_coef,
    # vf_coef, max_grad_norm) are forwarded as keyword arguments, so the
    # search space and the constructor call cannot drift apart.
    model_a2c = A2C(CnnPolicy, env, **hyperparameters)
    model_a2c.learn(total_timesteps=600_000)
    return calculate_reward(model_a2c)

In [None]:
# Seeded TPE sampler for the hyperparameter search.
sampler = optuna.samplers.TPESampler(seed=42)

# Create the maximising study 'a2c_study' in the SQLite database, or load it
# if a study with that name is already stored there (load_if_exists=True).
study = optuna.create_study(
    study_name='a2c_study',
    storage='sqlite:////storage/near.db',
    load_if_exists=True,
    direction='maximize',
    sampler=sampler,
    pruner=optuna.pruners.HyperbandPruner(),
)

In [None]:
# `study.best_params` raises ValueError while the study has no completed
# trial (e.g. a freshly created database), so report that case instead of
# aborting a top-to-bottom run of the notebook.
try:
    print(study.best_params)
except ValueError:
    print('The study has no completed trials yet; run the optimisation cell first.')

In [None]:
# Early-stopping callback; it is constructed here but not registered with
# the study (see the commented-out `callbacks` argument below).
logging_callback = LoggingCallback(
    threshold=3,
    trial_number=3,
    patience=3,
)

# Run four trials; ValueError is listed in `catch`.
study.optimize(
    objective,
    n_trials=4,
    catch=(ValueError,),
    # callbacks=[logging_callback],
)