In [1]:
import numpy
from barzur20aft import Bitcoin, map_params, mappable_params, ptmdp
from compiler import Compiler
from time import time
import sm
import bitcoin

In [2]:
# Compile the Bitcoin attack model (fork length capped at 25) into an MDP and
# bind the symbolic parameters to concrete values in one expression.
mdp = map_params(
    Compiler(Bitcoin(**mappable_params, maximum_fork_length=25)).mdp(),
    alpha=0.33,
    gamma=0.75,
)
mdp

MDP of size 1624 / 4 / 6318 / 3.9

In [3]:
if True:
    # Toggle (currently enabled): rebinds `mdp`, replacing the model built in
    # the previous cell with the generic selfish-mining model (height <= 6).
    mdp = sm.map_params(
        Compiler(
            sm.SelfishMining(bitcoin.Bitcoin(), **sm.mappable_params, maximum_height=6)
        ).mdp(),
        alpha=0.33,
        gamma=0.75,
    )
    mdp

In [4]:
# helper: get steady state in MDP space
def steady_state(mdp, policy, state):
    """Return the steady-state distribution of `policy`, indexed by MDP state.

    The Markov chain induced by `policy` (started in `state`) has its own
    state numbering; `mc["mdp_states"]` gives, for each chain state, the
    corresponding MDP state. The result has the shape of `policy` and is zero
    for every MDP state that does not appear in the chain.
    """
    chain = mdp.markov_chain(policy, start_state=state)
    chain_dist = mdp.steady_state(chain["prb"]).pop("ss")

    # Scatter the chain-indexed probabilities back into MDP indexing.
    mdp_dist = numpy.zeros(policy.shape, dtype=float)
    for chain_idx, mdp_idx in enumerate(chain["mdp_states"]):
        mdp_dist[mdp_idx] = chain_dist[chain_idx]

    return mdp_dist

In [5]:
# benchmark: PTO + value iteration
pmdp = ptmdp(mdp, horizon=100)
vi = pmdp.value_iteration(stop_delta=0.001)

# Start the induced Markov chain in the highest-valued state, the same
# convention the policy-iteration cells use. The argument must be the value
# array itself: passing the string literal "vi_value" to numpy.argmax would
# pick a start state that does not depend on the value function at all.
vi_ss = steady_state(mdp, vi["vi_policy"], numpy.argmax(vi["vi_value"]))

# Expected revenue: value function weighted by the steady-state distribution.
vi_revenue = vi["vi_value"].dot(vi_ss)
print("PTO revenue (value iteration):", vi_revenue)

PTO revenue (value iteration): 43.99737010919144


In [6]:
# wip: PTO + policy iteration (policy evaluation only)
def policy_evaluation(self, policy, *args, theta, discount=1, verbose=False):
    """Iteratively evaluate a fixed policy on the tabular model `self`.

    `self.tab[src][a]` is iterated as a list of transitions, each exposing
    `probability`, `reward` and `destination`. States whose policy entry is
    negative are skipped and keep the value 0. Sweeps are repeated until the
    largest absolute change between two consecutive sweeps drops below
    `theta`; the most recent value vector (length `self.n_states`) is
    returned.

    `*args` is accepted but not used. With `verbose`, the delta of every
    sweep is printed.
    """
    # Two value vectors used alternately: one is read, the other is written.
    buffers = numpy.zeros((2, self.n_states), dtype=float)

    i = 1
    while True:
        old = i % 2
        new = 1 - old

        for state, by_action in enumerate(self.tab):
            action = policy[state]
            if action < 0:
                continue
            acc = 0.0
            for tr in by_action[action]:
                acc += tr.probability * (
                    tr.reward + discount * buffers[old, tr.destination]
                )
            buffers[new, state] = acc

        delta = numpy.abs(buffers[new,] - buffers[old,]).max()

        if verbose:
            print(f"\riteration {i}: delta {delta:g}")

        if delta < theta:
            return buffers[new,]

        i += 1


# Re-evaluate the policy found by value iteration and weight the resulting
# values by the value-iteration steady state, so both revenues are comparable.
pe_value = policy_evaluation(pmdp, vi["vi_policy"], theta=0.001)
pe_revenue = pe_value.dot(vi_ss)
print("PTO revenue (value iteration):  ", vi_revenue)
print("PTO revenue (policy evaluation):", pe_revenue)

PTO revenue (value iteration):   43.99737010919144
PTO revenue (policy evaluation): 43.99687926382505


In [7]:
# wip: PTO + policy iteration (policy evaluation only) (reachable only)
def policy_evaluation_ro(
    self, policy, *args, theta, discount=1, verbose=False, start_state=None
):
    """Evaluate a fixed policy, updating only the reachable states.

    Works like plain iterative policy evaluation, but each sweep visits only
    the states returned by `self.reachable_states(policy,
    start_state=start_state)`. All other states, and states whose policy
    entry is negative, keep the value 0. Iteration stops once the largest
    absolute change between consecutive sweeps is below `theta`; the most
    recent value vector (length `self.n_states`) is returned.

    `*args` is accepted but not used. With `verbose`, the delta of every
    sweep is printed.
    """
    # Two value vectors used alternately: one is read, the other is written.
    buffers = numpy.zeros((2, self.n_states), dtype=float)

    visited = self.reachable_states(policy, start_state=start_state)

    i = 1
    while True:
        old = i % 2
        new = 1 - old

        for state in visited:
            action = policy[state]
            if action < 0:
                continue
            acc = 0.0
            for tr in self.tab[state][action]:
                acc += tr.probability * (
                    tr.reward + discount * buffers[old, tr.destination]
                )
            buffers[new, state] = acc

        delta = numpy.abs(buffers[new,] - buffers[old,]).max()

        if verbose:
            print(f"\riteration {i}: delta {delta:g}")

        if delta < theta:
            return buffers[new,]

        i += 1


# Same comparison as before, now with the reachable-only evaluation of the
# value-iteration policy (start_state is left at its default of None).
pero_value = policy_evaluation_ro(pmdp, vi["vi_policy"], theta=0.001)
pero_revenue = pero_value.dot(vi_ss)
print("PTO revenue (value iteration):     ", vi_revenue)
print("PTO revenue (policy evaluation):   ", pe_revenue)
print("PTO revenue (policy evaluation ro):", pero_revenue)

PTO revenue (value iteration):      43.99737010919144
PTO revenue (policy evaluation):    43.99687926382505
PTO revenue (policy evaluation ro): 43.99687926382505


In [8]:
# wip: PTO + policy iteration
def policy_iteration(
    self, *args, theta, discount=1, verbose=False, reachable_only=True
):
    """Policy iteration on the tabular model `self`.

    Starts from the all -1 policy ("no action") and alternates between
    evaluating the current policy and replacing it by the greedy policy with
    respect to the evaluated values, until a greedy step changes nothing.

    With `reachable_only`, evaluation uses `policy_evaluation_ro`, started
    from the highest-valued state of the previous round (None in the first
    round); otherwise `policy_evaluation` is used. Actions with a negative
    id are never selected; a state without eligible actions gets -1.

    Returns a dict with `pi_value`, `pi_policy`, `pi_iter` (number of rounds)
    and `pi_time` (elapsed seconds). `*args` and `verbose` are accepted but
    not used.
    """
    started = time()

    policy = numpy.full(self.n_states, -1, dtype=int)
    best_state = None  # only consulted in reachable-only mode

    rounds = 0
    changed = True
    while changed:
        rounds += 1

        # --- policy evaluation ---
        if reachable_only:
            value = policy_evaluation_ro(
                self, policy, theta=theta, discount=discount, start_state=best_state
            )
            best_state = numpy.argmax(value)
        else:
            value = policy_evaluation(self, policy, theta=theta, discount=discount)

        # --- greedy policy improvement ---
        changed = False
        for src, actions in enumerate(self.tab):
            greedy_a = -1  # no action possible
            greedy_v = float("-inf")
            for a, transitions in actions.items():
                if a < 0:
                    continue
                q = 0.0
                for t in transitions:
                    q += t.probability * (t.reward + discount * value[t.destination])
                if q > greedy_v:
                    greedy_v, greedy_a = q, a

            if policy[src] != greedy_a:
                changed = True

            policy[src] = greedy_a

    return dict(
        pi_value=value, pi_policy=policy, pi_iter=rounds, pi_time=time() - started
    )


# Full-state policy iteration: pop the value vector out of the result dict,
# then weight it by the steady state of the resulting policy on `mdp`.
pi = policy_iteration(pmdp, theta=0.001, reachable_only=False)
pi_value = pi.pop("pi_value")
pi_ss = steady_state(mdp, pi["pi_policy"], numpy.argmax(pi_value))
pi_revenue = pi_value.dot(pi_ss)

# Same procedure with evaluation restricted to reachable states.
piro = policy_iteration(pmdp, theta=0.001, reachable_only=True)
piro_value = piro.pop("pi_value")
piro_ss = steady_state(mdp, piro["pi_policy"], numpy.argmax(piro_value))
piro_revenue = piro_value.dot(piro_ss)
print("PTO revenue (value iteration):     ", vi_revenue)
print("PTO revenue (policy evaluation):   ", pe_revenue)
print("PTO revenue (policy evaluation ro):", pero_revenue)
print("PTO revenue (policy iteration):    ", pi_revenue)
print("PTO revenue (policy iteration ro): ", piro_revenue)

# Conclusion: restricting sweeps to reachable states works for evaluating a
# fixed policy, but not for optimizing one (policy iteration).

PTO revenue (value iteration):      43.99737010919144
PTO revenue (policy evaluation):    43.99687926382505
PTO revenue (policy evaluation ro): 43.99687926382505
PTO revenue (policy iteration):     43.99687926382505
PTO revenue (policy iteration ro):  12.624201898191922


In [9]:
# wip: PTO + policy iteration (reuse value function)
def policy_evaluation_rvf(
    self, policy, *args, theta, discount=1, verbose=False, init=None
):
    """Iterative policy evaluation with an optional warm start.

    Behaves like plain iterative policy evaluation over all states of
    `self.tab`, except that both internal value vectors are seeded with
    `init` when it is given (otherwise they start at 0). States whose policy
    entry is negative are never updated and therefore keep their initial
    value. Sweeps stop once the largest absolute change between consecutive
    sweeps is below `theta`; the most recent value vector is returned.

    `*args` is accepted but not used. With `verbose`, the delta of every
    sweep is printed.
    """
    # Two value vectors used alternately: one is read, the other is written.
    buffers = numpy.zeros((2, self.n_states), dtype=float)

    if init is not None:
        for row in (0, 1):
            buffers[row,] = init

    i = 1
    while True:
        old = i % 2
        new = 1 - old

        for state, by_action in enumerate(self.tab):
            action = policy[state]
            if action < 0:
                continue
            acc = 0.0
            for tr in by_action[action]:
                acc += tr.probability * (
                    tr.reward + discount * buffers[old, tr.destination]
                )
            buffers[new, state] = acc

        delta = numpy.abs(buffers[new,] - buffers[old,]).max()

        if verbose:
            print(f"\riteration {i}: delta {delta:g}")

        if delta < theta:
            return buffers[new,]

        i += 1


def policy_iteration_rvf(self, *args, theta, discount=1, verbose=False):
    """Policy iteration that warm-starts each evaluation with the last values.

    Starts from the all -1 policy ("no action") and its evaluation. Each
    round replaces the policy by the greedy one with respect to the current
    values; if that changed anything, the new policy is evaluated with
    `policy_evaluation_rvf`, seeded with the previous value vector. Actions
    with a negative id are never selected; a state without eligible actions
    gets -1.

    Returns a dict with `pi_value`, `pi_policy`, `pi_iter` (number of
    improvement rounds) and `pi_time` (elapsed seconds). `*args` and
    `verbose` are accepted but not used.
    """
    started = time()

    policy = numpy.full(self.n_states, -1, dtype=int)
    value = policy_evaluation_rvf(self, policy, theta=theta, discount=discount)

    rounds = 1
    while True:
        # --- greedy policy improvement ---
        changed = False
        for src, actions in enumerate(self.tab):
            greedy_a = -1  # no action possible
            greedy_v = float("-inf")
            for a, transitions in actions.items():
                if a < 0:
                    continue
                q = 0.0
                for t in transitions:
                    q += t.probability * (t.reward + discount * value[t.destination])
                if q > greedy_v:
                    greedy_v, greedy_a = q, a

            if policy[src] != greedy_a:
                changed = True

            policy[src] = greedy_a

        if not changed:
            break

        # --- policy evaluation, reusing the previous value function ---
        rounds += 1
        value = policy_evaluation_rvf(
            self, policy, theta=theta, discount=discount, init=value
        )

    return dict(
        pi_value=value, pi_policy=policy, pi_iter=rounds, pi_time=time() - started
    )


# Run the warm-started policy iteration and compute its revenue the same way
# as for the other variants; the timed variants also print their run time.
pirvf = policy_iteration_rvf(pmdp, theta=0.001)
pirvf_value = pirvf.pop("pi_value")
pirvf_ss = steady_state(mdp, pirvf["pi_policy"], numpy.argmax(pirvf_value))
pirvf_revenue = pirvf_value.dot(pirvf_ss)
print("PTO revenue (value iteration):     ", vi_revenue, vi["vi_time"])
print("PTO revenue (policy evaluation):   ", pe_revenue)
print("PTO revenue (policy evaluation ro):", pero_revenue)
print("PTO revenue (policy iteration):    ", pi_revenue, pi["pi_time"])
print("PTO revenue (policy iteration rvf):", pirvf_revenue, pirvf["pi_time"])

PTO revenue (value iteration):      43.99737010919144 15.032769203186035
PTO revenue (policy evaluation):    43.99687926382505
PTO revenue (policy evaluation ro): 43.99687926382505
PTO revenue (policy iteration):     43.99687926382505 75.58411908149719
PTO revenue (policy iteration rvf): 44.01740551368594 20.434382677078247


In [11]:
def reward_per_progress(mdp, policy, start_state):
    """Compute `mdp.reward_per_progress` for `policy` started in `start_state`.

    Builds the Markov chain induced by the policy, drops its `mdp_states`
    entry, computes the steady state from the chain's `prb` entry, and
    forwards the remaining chain entries together with the steady-state
    vector (`ss`) and `eps=0.001` to `mdp.reward_per_progress`. Whatever that
    call returns is returned unchanged.
    """
    chain = mdp.markov_chain(policy, start_state=start_state)
    # `mdp_states` is removed so it is not forwarded as a keyword argument.
    del chain["mdp_states"]
    stationary = mdp.steady_state(chain["prb"]).pop("ss")
    return mdp.reward_per_progress(policy, **chain, ss=stationary, eps=0.001)


# Compare reward per progress of the value-iteration policy with that of the
# warm-started policy iteration (`pirvf`); each chain starts in the state
# with the highest value under the respective value function.
rpp_vi = reward_per_progress(mdp, vi["vi_policy"], numpy.argmax(vi["vi_value"]))
rpp_pi = reward_per_progress(mdp, pirvf["pi_policy"], numpy.argmax(pirvf_value))
print("RPP (value iteration): ", rpp_vi["rpp"])
print("RPP (policy iteration):", rpp_pi["rpp"])

RPP (value iteration):  0.30038634434802663
RPP (policy iteration): 0.30038634434802663
