In [1]:
import cpr_gym

cpr_gym.engine.cpr_lib_version

'local-v0.5.4-4-g68df215'

In [2]:
import math
import gym


def env_fn(config={}):
    protocol_fn = getattr(cpr_gym.protocols, config.get("protocol", "nakamoto"))
    protocol_args = config.get("protocol_args", {})

    episode_len = config.get("episode_len", 128)

    alpha = config.get("alpha", 0.33)
    gamma = config.get("gamma", 0.5)
    if "defenders" in config:
        defenders = config["defenders"]
    else:
        defenders = math.ceil((1 - alpha) / (1 - gamma))

    rewards = dict(
        sparse_relative=(
            cpr_gym.wrappers.SparseRelativeRewardWrapper,
            dict(max_steps=episode_len),
        ),
        sparse_per_progress=(
            cpr_gym.wrappers.SparseRewardPerProgressWrapper,
            dict(max_steps=episode_len),
        ),
        dense_per_progress=(
            lambda env: cpr_gym.wrappers.DenseRewardPerProgressWrapper(
                env, episode_len=episode_len
            ),
            dict(),
        ),
    )

    reward_wrapper, env_args = rewards[config.get("reward", "sparse_relative")]

    env = gym.make(
        "cpr_gym:core-v0",
        proto=protocol_fn(**protocol_args),
        alpha=alpha,
        gamma=gamma,
        defenders=defenders,
        **env_args
    )

    env = reward_wrapper(env)

    env = cpr_gym.wrappers.ClearInfoWrapper(env)

    return env


from stable_baselines3.common.env_checker import check_env

test_env = env_fn()
check_env(test_env)
test_env.render()

print()
test_env = env_fn(
    dict(
        alpha=0.45,
        protocol="tailstorm",
        protocol_args=dict(k=8, reward="discount", subblock_selection="heuristic"),
    )
)
check_env(test_env)
test_env.render()

Nakamoto consensus; SSZ'16 attack space; α=0.33 attacker
public_blocks: 2
private_blocks: 1
diff_blocks: -1
event: `ProofOfWork
Actions: (0) Adopt | (1) Override | (2) Match | (3) Wait

Tailstorm with k=8, discount rewards, and heuristic sub-block selection; SSZ'16-like attack space; α=0.45 attacker
public_blocks: 1
private_blocks: 1
diff_blocks: 0
public_votes: 2
private_votes_inclusive: 1
private_votes_exclusive: 1
public_depth: 1
private_depth_inclusive: 0
private_depth_exclusive: 0
event: `Network
Actions: (0) Adopt_Prolong | (1) Override_Prolong | (2) Match_Prolong | (3) Wait_Prolong | (4) Adopt_Proceed | (5) Override_Proceed | (6) Match_Proceed | (7) Wait_Proceed


In [3]:
import stable_baselines3
import torch


def ppo(config={}):
    env_config = config.get("env_config", {})

    def configured_env_fn():
        return env_fn(env_config)

    # Vectorize
    n_vec = config.get("vectorize", 1)
    if n_vec > 1 and not config.get("vectorize_dummy", False):
        env = stable_baselines3.common.vec_env.SubprocVecEnv(
            [configured_env_fn] * n_vec
        )
    else:
        env = stable_baselines3.common.vec_env.DummyVecEnv([configured_env_fn] * n_vec)

    # Monitor
    env = stable_baselines3.common.vec_env.VecMonitor(env)

    # Learning-rate schedule
    lr_start = config.get("lr_start", 1e-3)
    lr_end = max(lr_start, config.get("lr_end", 1e-5))

    def lr_schedule(remaining):
        return lr_start * remaining + lr_end * (1 - remaining)

    batch_size = config.get("batch_size", 1024)
    n_steps_multiple = config.get("n_steps_multiple", 1)
    layer_size = config.get("layer_size", 64)
    n_layers = config.get("n_layers", 3)
    model = stable_baselines3.PPO(
        "MlpPolicy",
        env=env,
        verbose=config.get("verbose", 0),
        batch_size=batch_size,
        gamma=config.get("gamma", 0.999),
        n_steps=batch_size * n_steps_multiple,
        clip_range=config.get("clip_range", 0.1),
        learning_rate=lr_schedule,
        policy_kwargs=dict(
            activation_fn=torch.nn.ReLU,
            net_arch=[
                dict(
                    pi=[int(layer_size)] * n_layers,
                    vf=[int(layer_size)] * n_layers,
                )
            ],
        ),
    )
    return model


ppo()

<stable_baselines3.ppo.ppo.PPO at 0x7f15c3f03040>

In [4]:
import numpy as np
import skopt
import time

space = dict(
    lr_start=skopt.space.Real(
        name="lr_start", low=1e-5, high=1e-2, prior="log-uniform"
    ),
    lr_decay=skopt.space.Real(name="lr_decay", low=1e-3, high=1, prior="log-uniform"),
    clip_range=skopt.space.Real(name="clip_range", low=0.01, high=0.5, prior="uniform"),
    gamma=skopt.space.Real(name="gamma", low=0.9, high=1, prior="log-uniform"),
    n_vec=skopt.space.Integer(name="n_vec", low=6, high=12, prior="uniform"),
    batch_size=skopt.space.Integer(
        name="batch_size", low=128, high=8192, prior="log-uniform"
    ),
    n_steps_multiple=skopt.space.Integer(
        name="n_steps_multiple", low=1, high=4, prior="uniform"
    ),
    layer_size=skopt.space.Categorical([8, 16, 32, 64, 128], name="layer_size"),
    n_layers=skopt.space.Integer(name="n_layers", low=2, high=4, prior="uniform"),
)
dimensions = skopt.utils.dimensions_aslist(space)

env_config = dict(
    alpha=0.45,
    gamma=0.5,
    reward="sparse_relative",
    episode_len=128,
)


eval_n_episodes = 16
training_budget_steps = 5e5


@skopt.utils.use_named_args(dimensions=dimensions)
def objective(
    lr_start,
    lr_decay,
    n_vec,
    batch_size,
    n_steps_multiple,
    layer_size,
    n_layers,
    clip_range,
    gamma,
):
    alpha = env_config["alpha"]

    config = dict(
        env_config=env_config,
        lr_start=lr_start,
        lr_end=lr_start * lr_decay,
        vectorize=n_vec,
        batch_size=batch_size,
        n_steps_multiple=n_steps_multiple,
        layer_size=layer_size,
        n_layers=n_layers,
        clip_range=clip_range,
        gamma=gamma,
    )

    log = dict(
        config=config,
        time=dict(),
        eval_n_episodes=eval_n_episodes,
        training_budget_steps=training_budget_steps,
    )

    model = ppo(config)

    # Train
    start_training = time.time()
    model.learn(total_timesteps=training_budget_steps)
    log["time"]["training_s"] = time.time() - start_training

    # Eval
    start_eval = time.time()
    env = env_fn()
    obs = env.reset()
    rewards = []
    i = 0
    while i < eval_n_episodes:
        i += 1
        obs = env.reset()
        done = False
        reward = 0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, r, done, info = env.step(action)
            reward += r
        reward = reward / alpha - 1
        rewards.append(reward)

    log["time"]["eval_s"] = time.time() - start_eval

    mean_reward = np.mean(rewards)
    log["mean_reward"] = mean_reward

    del model

    return log


def point(**kwargs):
    return skopt.utils.point_aslist(space, dict(**kwargs))


# objective(point(lr=1e-3, n_vec=1))

In [5]:
import skopt.plots
import matplotlib.pyplot as plt
import IPython.display
import datetime
from collections import deque

optimizer = skopt.Optimizer(dimensions)

next_x = optimizer.ask()
last_r = -1.0
max_x = next_x
max_r = last_r
log = []
times = deque([], 10)
i = 0

In [12]:
x = point(
    lr_start=1e-3,
    lr_decay=1,
    clip_range=0.1,
    n_vec=1,
    batch_size=1024,
    gamma=0.999,
    n_layers=3,
    layer_size=16,
    n_steps_multiple=1,
)
objective(x)

{'config': {'env_config': {'alpha': 0.45,
   'gamma': 0.5,
   'reward': 'sparse_relative',
   'episode_len': 128},
  'lr_start': 0.001,
  'lr_end': 0.001,
  'vectorize': 1,
  'batch_size': 1024,
  'n_steps_multiple': 1,
  'layer_size': 16,
  'n_layers': 3,
  'clip_range': 0.1,
  'gamma': 0.999},
 'time': {'training_s': 177.15141940116882, 'eval_s': 0.46401000022888184},
 'eval_n_episodes': 16,
 'training_budget_steps': 500000.0,
 'mean_reward': -0.3005503266793349}

In [6]:
# for _ in range(10):
while True:
    i += 1
    IPython.display.clear_output()
    print(str(datetime.datetime.now()))
    print(f"iteration {i}")
    print(f"{3600 / np.mean(times):1.0f} iterations per hour")
    print()
    print(f"last r: {last_r}")
    print(f"next x: {dict(skopt.utils.point_asdict(space, next_x))}")
    print()
    print(f"best r: {max_r}")
    print(f"best x: {dict(skopt.utils.point_asdict(space, max_x))}")

    start = time.time()
    ret = objective(next_x)
    times.append(time.time() - start)
    log.append((next_x, ret))

    last_r = ret["mean_reward"]
    if last_r > max_r:
        max_r = last_r
        max_x = next_x
    optimizer.tell(next_x, -last_r)
    next_x = optimizer.ask()

2022-11-18 08:10:48.541088
iteration 521
28 iterations per hour

last r: -0.3409806410946399
next x: {'batch_size': 128, 'clip_range': 0.10812895857629526, 'gamma': 1.0, 'layer_size': 128, 'lr_decay': 1.0, 'lr_start': 0.0002882052349594924, 'n_layers': 4, 'n_steps_multiple': 1, 'n_vec': 6}

best r: -0.11839819027674037
best x: {'batch_size': 404, 'clip_range': 0.01, 'gamma': 1.0, 'layer_size': 128, 'lr_decay': 0.007729720805333112, 'lr_start': 0.0017582507769772183, 'n_layers': 4, 'n_steps_multiple': 1, 'n_vec': 9}


KeyboardInterrupt: 