In [1]:
import gym
import math
import psutil
import ray
from ray.rllib.algorithms import ppo
from ray.tune.registry import register_env

import cpr_gym

display(cpr_gym.engine.cpr_lib_version)

ray.init(ignore_reinit_error=True)

'local-v0.5.1-4-g837f00d'

2022-11-16 18:49:48,093	INFO worker.py:1528 -- Started a local Ray instance.


0,1
Python version:,3.9.15
Ray version:,2.1.0


In [2]:
def sparse_relative(
    protocol="nakamoto", protocol_args={}, alpha=1 / 3, gamma=0.5, episode_len=128
):
    protocol_fn = getattr(cpr_gym.protocols, protocol)
    env = gym.make(
        "cpr_gym:core-v0",
        proto=protocol_fn(**protocol_args),
        max_steps=episode_len,
        alpha=alpha,
        gamma=gamma,
        defenders=math.ceil((1 - alpha) / (1 - gamma)),
    )
    env = cpr_gym.wrappers.SparseRelativeRewardWrapper(env)
    return env


register_env("sparse_relative", lambda config: sparse_relative(**config))


def sparse_per_progress(
    protocol="nakamoto", protocol_args={}, alpha=1 / 3, gamma=0.5, episode_len=128
):
    protocol_fn = getattr(cpr_gym.protocols, protocol)
    env = gym.make(
        "cpr_gym:core-v0",
        proto=protocol_fn(**protocol_args),
        max_steps=episode_len,
        alpha=alpha,
        gamma=gamma,
        defenders=math.ceil((1 - alpha) / (1 - gamma)),
    )
    env = cpr_gym.wrappers.SparseRewardPerProgressWrapper(env)
    return env


register_env("sparse_per_progress", lambda config: sparse_per_progress(**config))


def dense_per_progress(
    protocol="nakamoto", protocol_args={}, alpha=1 / 3, gamma=0.5, episode_len=128
):
    protocol_fn = getattr(cpr_gym.protocols, protocol)
    env = gym.make(
        "cpr_gym:core-v0",
        proto=protocol_fn(**protocol_args),
        alpha=alpha,
        gamma=gamma,
        defenders=math.ceil((1 - alpha) / (1 - gamma)),
    )
    env = cpr_gym.wrappers.DenseRewardPerProgressWrapper(env, episode_len=episode_len)
    return env


register_env("dense_per_progress", lambda config: dense_per_progress(**config))

algo = ppo.PPO(
    env="sparse_relative",
    config=dict(
        env_config=dict(
            protocol="nakamoto",
            protocol_args=dict(),
            alpha=0.45,
            gamma=0.5,
            episode_len=128,
        ),
        framework="torch",
        num_gpus=0,
        num_workers=-1,
        model=dict(
            fcnet_hiddens=[32, 32, 32],
            fcnet_activation="relu",
        ),
        rollout_fragment_length=2048,
        train_batch_size=2048 * 6,
    ),
)

# for i in range(10):
# print(i)
# algo.train()

# algo.evaluate()

2022-11-16 18:49:49,216	INFO ppo.py:379 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-11-16 18:49:49,218	INFO algorithm.py:457 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


In [3]:
import random

from ray import air, tune
from ray.tune.schedulers import PopulationBasedTraining

# From https://docs.ray.io/en/latest/tune/examples/pbt_ppo_example.html

# Postprocess the perturbed config to ensure it's still valid
def explore(config):
    # ensure we collect enough timesteps to do sgd
    if config["train_batch_size"] < config["sgd_minibatch_size"] * 2:
        config["train_batch_size"] = config["sgd_minibatch_size"] * 2
    # ensure we run at least one sgd iter
    if config["num_sgd_iter"] < 1:
        config["num_sgd_iter"] = 1
    return config


pbt = PopulationBasedTraining(
    time_attr="time_total_s",
    perturbation_interval=120,
    resample_probability=0.25,
    # Specifies the mutations of these hyperparams
    hyperparam_mutations={
        "lambda": lambda: random.uniform(0.9, 1.0),
        "clip_param": lambda: random.uniform(0.01, 0.5),
        "lr": lambda: 10 ** (-random.uniform(2.5, 5)),
        "num_sgd_iter": lambda: random.randint(1, 30),
        "sgd_minibatch_size": lambda: random.randint(128, 16384),
        "train_batch_size": lambda: random.randint(2000, 160000),
    },
    custom_explore_fn=explore,
)

tuner = tune.Tuner(
    "PPO",
    tune_config=tune.TuneConfig(
        metric="episode_reward_mean",
        mode="max",
        scheduler=pbt,
        num_samples=-1,
        # max_concurrent_trials=6,
        # reuse_actors=True,
    ),
    param_space={
        "env": "sparse_relative",
        "env_config": dict(
            protocol="nakamoto",
            protocol_args=dict(),
            alpha=0.45,
            gamma=0.5,
            episode_len=128,
        ),
        "framework": "torch",
        "num_gpus": 0,
        "num_workers": 11,
        "model": dict(
            fcnet_hiddens=[32, 32, 32],
            fcnet_activation="relu",
        ),
        "kl_coeff": 1.0,
        # These params are tuned from a fixed starting value.
        "lambda": 0.95,
        "clip_param": 0.2,
        "lr": 1e-3,
        "train_batch_size": 10000,
        # These params start off randomly drawn from a set.
        "num_sgd_iter": tune.choice([10, 20, 30]),
        "sgd_minibatch_size": tune.choice([128, 256, 512, 1024, 2048]),
    },
)
# results = tuner.fit()

# print("best hyperparameters: ", results.get_best_result().config)

In [4]:
import random
import warnings

from ray import air, tune
from ray.tune.search.skopt import SkOptSearch

# From https://docs.ray.io/en/latest/tune/examples/pbt_ppo_example.html

bayesopt = SkOptSearch(
    metric="episode_reward_mean",
    mode="max",
)

base_config = {
    "env": "sparse_relative",
    "env_config": dict(
        protocol="nakamoto",
        protocol_args=dict(),
        alpha=0.45,
        gamma=0.5,
        episode_len=128,
    ),
    "framework": "torch",
    "num_gpus": 0,
    "num_workers": -1,
    "model": dict(
        fcnet_hiddens=[32, 32, 32],
        fcnet_activation="relu",
    ),
    "evaluation_duration": 64,
}


def my_ppo(config={}):
    config = config | base_config

    budget = config.pop("time_budget_s", 300)

    sms = config.get("sgd_minibatch_size", 128)
    tbsm = config.pop("train_batch_size_multiple", 1)
    tbs = sms * tbsm
    config["train_batch_size"] = tbs
    config["rollout_fragment_length"] = config["train_batch_size"]

    ed = config.get("evaluation_duration", 1)
    config["evaluation_duration_unit"] = "episodes"
    # it does something really weird instead of counting episodes ...
    config["evaluation_duration"] = max(
        1, int(ed / tbsm / (sms / config["env_config"]["episode_len"]))
    )

    # config["evaluation_duration"] = max(1, int(ed / tbsm))

    env = config.pop("env", "sparse_relative")

    algo = ppo.PPO(env=env, config=config)

    res = algo.train()
    while res["time_total_s"] < budget:
        res = algo.train()

    return algo.evaluate()["evaluation"]


# my_ppo(dict(train_batch_size_multiple = 2, time_budget_s = 10, sgd_minibatch_size=256))

In [5]:
tune_budget_h = 2
run_budget_s = 300
cores = 12
num_samples = int(tune_budget_h * 60 * 60 / run_budget_s * cores)
display(f"num_samples: {num_samples}")

tuner = tune.Tuner(
    my_ppo,
    tune_config=tune.TuneConfig(
        search_alg=bayesopt,
        num_samples=num_samples,
    ),
    param_space={
        "time_budget_s": run_budget_s,
        "kl_coeff": tune.uniform(0, 1.0),
        "lambda": tune.uniform(0.9, 1.0),
        "clip_param": tune.uniform(0.01, 0.5),
        "lr": tune.loguniform(1e-5, 1e-2),
        "num_sgd_iter": tune.randint(1, 30),
        "sgd_minibatch_size": tune.choice([128, 256, 512, 1024, 2048, 4096]),
        "train_batch_size_multiple": tune.choice([1, 2, 4, 8]),
    },
)
# results = tuner.fit()

print(
    "best hyperparameters: ",
    results.get_best_result(metric="episode_reward_mean", mode="max").config,
)

'num_samples: 288'

NameError: name 'results' is not defined