In [1]:
import numpy
from aft20barzur import BitcoinSM, map_params, mappable_params, ptmdp
from compiler import Compiler
from time import time
import sm
import bitcoin
import util

In [2]:
# Model selection. Flip the flag to compile the generic selfish-mining model
# from `sm`/`bitcoin` instead of the `aft20barzur` Bitcoin model used by default.
use_generic_model = False

if not use_generic_model:
    attack_model = BitcoinSM(**mappable_params, maximum_fork_length=25)
    mdp = map_params(Compiler(attack_model).mdp(), alpha=0.33, gamma=0.75)
else:
    attack_model = sm.SelfishMining(
        bitcoin.Bitcoin(), **sm.mappable_params, maximum_height=6
    )
    mdp = sm.map_params(Compiler(attack_model).mdp(), alpha=0.33, gamma=0.75)

# Last expression of the cell: display the compiled MDP.
mdp

MDP of size 1624 / 4 / 6318 / 3.9

In [3]:
# benchmark: PTO + value iteration
# Transform the MDP with `ptmdp` (horizon 100) and solve it by value iteration.
pmdp = ptmdp(mdp, horizon=100)
vi = pmdp.value_iteration(stop_delta=0.001)

# Start the steady-state computation from the state with the highest value.
# The previous code passed the literal string "vi_value" to numpy.argmax, so
# the chosen start state did not depend on the computed value function at all.
vi_start_state = numpy.argmax(vi["vi_value"])
vi_ss = mdp.steady_state(vi["vi_policy"], start_state=vi_start_state)["ss"]

# The value vector has one more entry than the steady-state vector of `mdp`
# (presumably the terminal state added by `ptmdp`); drop it before weighting.
vi_revenue = vi["vi_value"][:-1].dot(vi_ss)
print("PTO revenue (value iteration):", vi_revenue)

PTO revenue (value iteration): 41.20263153697032


In [4]:
# wip: PTO + policy iteration (policy evaluation only)
def policy_evaluation(self, policy, *args, theta, discount=1, verbose=False):
    """Evaluate a fixed policy by repeated synchronous sweeps over all states.

    Parameters
    ----------
    self : object exposing `n_states` and `tab`; `tab[src][action]` is an
        iterable of transitions with `probability`, `reward` and `destination`.
    policy : indexable giving the action for each state; a negative entry
        means "no action" and that state is skipped (its value stays 0).
    theta : sweeps stop once the largest absolute change is below this.
    discount : factor applied to the value of the destination state.
    verbose : print the per-sweep delta.

    Returns
    -------
    numpy array with one value per state (a view into the internal buffer).

    Note: there is no iteration cap; the loop only ends on convergence.
    """
    # Two rows used as a double buffer: one holds the previous sweep, the
    # other receives the current one; the roles swap with the parity of `i`.
    buffers = numpy.zeros((2, self.n_states), dtype=float)

    i = 1
    converged = False
    while not converged:
        old, new = i % 2, (i + 1) % 2
        before, after = buffers[old], buffers[new]

        for state, by_action in enumerate(self.tab):
            action = policy[state]
            if action < 0:
                continue
            backup = 0.0
            for tr in by_action[action]:
                backup += tr.probability * (
                    tr.reward + discount * before[tr.destination]
                )
            after[state] = backup

        delta = numpy.max(numpy.abs(after - before))

        if verbose:
            print(f"\riteration {i}: delta {delta:g}")

        converged = delta < theta
        if not converged:
            i += 1

    return after


# Re-evaluate the value-iteration policy with the hand-written evaluator and
# weight the result with the steady state computed for value iteration.
pe_value = policy_evaluation(pmdp, vi["vi_policy"], theta=0.001)
pe_revenue = numpy.dot(pe_value[:-1], vi_ss)

for report_row in (
    ("PTO revenue (value iteration):  ", vi_revenue),
    ("PTO revenue (policy evaluation):", pe_revenue),
):
    print(*report_row)

PTO revenue (value iteration):   41.20263153697032
PTO revenue (policy evaluation): 41.20285764729448


In [5]:
# wip: PTO + policy iteration (policy evaluation only) (reachable only)
def policy_evaluation_ro(
    self, policy, *args, theta, discount=1, verbose=False, start_state=None
):
    """Evaluate a fixed policy, sweeping only over the reachable states.

    Same scheme as a plain synchronous policy evaluation, except that the
    states updated in each sweep are those returned by
    `self.reachable_states(policy, start_state=start_state)`. All other
    states keep their initial value of 0.

    Parameters
    ----------
    self : object exposing `n_states`, `tab` and `reachable_states`;
        `tab[src][action]` is an iterable of transitions with `probability`,
        `reward` and `destination`.
    policy : indexable giving the action per state; negative entries are
        skipped.
    theta : sweeps stop once the largest absolute change is below this.
    discount : factor applied to the value of the destination state.
    verbose : print the per-sweep delta.
    start_state : forwarded unchanged to `self.reachable_states`.

    Returns
    -------
    numpy array with one value per state (a view into the internal buffer).
    """
    # Double buffer; which row is "previous" alternates with the parity of `i`.
    buffers = numpy.zeros((2, self.n_states), dtype=float)

    reachable = self.reachable_states(policy, start_state=start_state)

    i = 1
    converged = False
    while not converged:
        old, new = i % 2, (i + 1) % 2
        before, after = buffers[old], buffers[new]

        for state in reachable:
            action = policy[state]
            if action < 0:
                continue
            backup = 0.0
            for tr in self.tab[state][action]:
                backup += tr.probability * (
                    tr.reward + discount * before[tr.destination]
                )
            after[state] = backup

        # The delta is taken over the whole vector; untouched states add 0.
        delta = numpy.max(numpy.abs(after - before))

        if verbose:
            print(f"\riteration {i}: delta {delta:g}")

        converged = delta < theta
        if not converged:
            i += 1

    return after


# Same evaluation as before, restricted to the states reachable under the policy.
pero_value = policy_evaluation_ro(pmdp, vi["vi_policy"], theta=0.001)
pero_revenue = numpy.dot(pero_value[:-1], vi_ss)

for report_row in (
    ("PTO revenue (value iteration):     ", vi_revenue),
    ("PTO revenue (policy evaluation):   ", pe_revenue),
    ("PTO revenue (policy evaluation ro):", pero_revenue),
):
    print(*report_row)

PTO revenue (value iteration):      41.20263153697032
PTO revenue (policy evaluation):    41.20285764729448
PTO revenue (policy evaluation ro): 41.20285764729448


In [6]:
# wip: PTO + policy iteration
def policy_iteration(
    self, *args, theta, discount=1, verbose=False, reachable_only=True
):
    """Policy iteration: alternate policy evaluation and greedy improvement.

    Starts from the policy "no action anywhere" (all entries -1). Each round
    evaluates the current policy, then picks for every state the action with
    the highest one-step backup. The first such action wins ties, and -1 is
    kept where no non-negative action exists. Iteration stops after the first
    round in which no state changes its action.

    Parameters
    ----------
    self : object exposing `n_states` and `tab`; `tab[src]` is a mapping from
        action to an iterable of transitions with `probability`, `reward`
        and `destination`.
    theta, discount : forwarded to the evaluation routine.
    verbose : accepted but not used here.
    reachable_only : if true, evaluate with `policy_evaluation_ro`, starting
        from the arg-max state of the previous round's values (None in the
        first round); otherwise evaluate with `policy_evaluation`.

    Returns
    -------
    dict with `pi_value` (values of the last evaluation), `pi_policy`,
    `pi_iter` (number of rounds) and `pi_time` (elapsed seconds).
    """
    started_at = time()

    policy = numpy.full(self.n_states, -1, dtype=int)
    best_state = None  # only consulted by the reachable-only evaluation

    i = 1
    while True:
        # --- evaluation of the current policy ---
        if not reachable_only:
            value = policy_evaluation(self, policy, theta=theta, discount=discount)
        else:
            value = policy_evaluation_ro(
                self, policy, theta=theta, discount=discount, start_state=best_state
            )
            best_state = numpy.argmax(value)

        # --- greedy improvement ---
        changed = False
        for src, actions in enumerate(self.tab):
            greedy_a, greedy_v = -1, float("-inf")  # -1: no action possible
            for a, transitions in actions.items():
                if a < 0:
                    continue
                q = 0.0
                for t in transitions:
                    q += t.probability * (t.reward + discount * value[t.destination])
                if q > greedy_v:
                    greedy_a, greedy_v = a, q

            if policy[src] != greedy_a:
                changed = True
                policy[src] = greedy_a

        if not changed:
            break

        i += 1

    return {
        "pi_value": value,
        "pi_policy": policy,
        "pi_iter": i,
        "pi_time": time() - started_at,
    }


# Policy iteration with evaluation over the full state space.
pi = policy_iteration(pmdp, theta=0.001, reachable_only=False)
pi_value = pi.pop("pi_value")  # removes the value vector from the result dict
pi_start_state = numpy.argmax(pi_value)
pi_ss = mdp.steady_state(pi["pi_policy"], start_state=pi_start_state)["ss"]
pi_revenue = numpy.dot(pi_value[:-1], pi_ss)

# Policy iteration with evaluation restricted to reachable states.
piro = policy_iteration(pmdp, theta=0.001, reachable_only=True)
piro_value = piro.pop("pi_value")
piro_start_state = numpy.argmax(piro_value)
piro_ss = mdp.steady_state(piro["pi_policy"], start_state=piro_start_state)["ss"]
piro_revenue = numpy.dot(piro_value[:-1], piro_ss)

for report_row in (
    ("PTO revenue (value iteration):     ", vi_revenue),
    ("PTO revenue (policy evaluation):   ", pe_revenue),
    ("PTO revenue (policy evaluation ro):", pero_revenue),
    ("PTO revenue (policy iteration):    ", pi_revenue),
    ("PTO revenue (policy iteration ro): ", piro_revenue),
):
    print(*report_row)

# conclude: reachable-only does work for evaluation but not for optimization

PTO revenue (value iteration):      41.20263153697032
PTO revenue (policy evaluation):    41.20285764729448
PTO revenue (policy evaluation ro): 41.20285764729448
PTO revenue (policy iteration):     41.20285764729448
PTO revenue (policy iteration ro):  32.90149018623927


In [7]:
# wip: PTO + policy iteration (reuse value function)
def policy_evaluation_rvf(
    self, policy, *args, theta, discount=1, verbose=False, init=None
):
    """Evaluate a fixed policy, optionally warm-started from `init`.

    Synchronous sweeps over all states, as in a plain policy evaluation. If
    `init` is given, both rows of the internal double buffer are seeded with
    it, so skipped states (negative policy entry) keep their `init` value
    instead of 0.

    Parameters
    ----------
    self : object exposing `n_states` and `tab`; `tab[src][action]` is an
        iterable of transitions with `probability`, `reward` and `destination`.
    policy : indexable giving the action per state; negative entries are
        skipped.
    theta : sweeps stop once the largest absolute change is below this.
    discount : factor applied to the value of the destination state.
    verbose : print the per-sweep delta.
    init : initial value estimate (copied into the buffer), or None for zeros.

    Returns
    -------
    numpy array with one value per state (a view into the internal buffer).
    """
    buffers = numpy.zeros((2, self.n_states), dtype=float)

    if init is not None:
        # Seed both halves of the double buffer with the warm start.
        for row in buffers:
            row[...] = init

    i = 1
    converged = False
    while not converged:
        old, new = i % 2, (i + 1) % 2
        before, after = buffers[old], buffers[new]

        for state, by_action in enumerate(self.tab):
            action = policy[state]
            if action < 0:
                continue
            backup = 0.0
            for tr in by_action[action]:
                backup += tr.probability * (
                    tr.reward + discount * before[tr.destination]
                )
            after[state] = backup

        delta = numpy.max(numpy.abs(after - before))

        if verbose:
            print(f"\riteration {i}: delta {delta:g}")

        converged = delta < theta
        if not converged:
            i += 1

    return after


def policy_iteration_rvf(self, *args, theta, discount=1, verbose=False):
    """Policy iteration that warm-starts each evaluation from the last values.

    Starts from the policy "no action anywhere" (all entries -1) and its
    evaluation. Each round improves the policy greedily with respect to the
    current values. The first best action wins ties, and -1 is kept where no
    non-negative action exists. If any state changed, the new policy is
    re-evaluated with `policy_evaluation_rvf`, passing the previous values as
    `init`. Iteration stops after the first round without a change.

    Parameters
    ----------
    self : object exposing `n_states` and `tab`; `tab[src]` is a mapping from
        action to an iterable of transitions with `probability`, `reward`
        and `destination`.
    theta, discount : forwarded to `policy_evaluation_rvf`.
    verbose : accepted but not used here.

    Returns
    -------
    dict with `pi_value` (values of the last evaluation), `pi_policy`,
    `pi_iter` (number of improvement rounds) and `pi_time` (elapsed seconds).
    """
    started_at = time()

    policy = numpy.full(self.n_states, -1, dtype=int)
    value = policy_evaluation_rvf(self, policy, theta=theta, discount=discount)

    i = 1
    while True:
        changed = False

        for src, actions in enumerate(self.tab):
            greedy_a, greedy_v = -1, float("-inf")  # -1: no action possible
            for a, transitions in actions.items():
                if a < 0:
                    continue
                q = 0.0
                for t in transitions:
                    q += t.probability * (t.reward + discount * value[t.destination])
                if q > greedy_v:
                    greedy_a, greedy_v = a, q

            if policy[src] != greedy_a:
                changed = True
                policy[src] = greedy_a

        if not changed:
            break

        i += 1
        # Re-evaluate the improved policy, starting from the previous values.
        value = policy_evaluation_rvf(
            self, policy, theta=theta, discount=discount, init=value
        )

    return {
        "pi_value": value,
        "pi_policy": policy,
        "pi_iter": i,
        "pi_time": time() - started_at,
    }


# Policy iteration that reuses the value function between rounds.
pirvf = policy_iteration_rvf(pmdp, theta=0.001)
pirvf_value = pirvf.pop("pi_value")  # removes the value vector from the result dict
pirvf_start_state = numpy.argmax(pirvf_value)
pirvf_ss = mdp.steady_state(pirvf["pi_policy"], start_state=pirvf_start_state)["ss"]
pirvf_revenue = numpy.dot(pirvf_value[:-1], pirvf_ss)

# Rows with a third entry also report the solver's wall-clock time.
for report_row in (
    ("PTO revenue (value iteration):     ", vi_revenue, vi["vi_time"]),
    ("PTO revenue (policy evaluation):   ", pe_revenue),
    ("PTO revenue (policy evaluation ro):", pero_revenue),
    ("PTO revenue (policy iteration):    ", pi_revenue, pi["pi_time"]),
    ("PTO revenue (policy iteration rvf):", pirvf_revenue, pirvf["pi_time"]),
):
    print(*report_row)

PTO revenue (value iteration):      41.20263153697032 4.203214645385742
PTO revenue (policy evaluation):    41.20285764729448
PTO revenue (policy evaluation ro): 41.20285764729448
PTO revenue (policy iteration):     41.20285764729448 7.576725721359253
PTO revenue (policy iteration rvf): 41.20286480649146 3.887073516845703


In [8]:
# Does reward per progress do what it should?
def policy_evaluation_full(
    self,
    policy,
    *args,
    theta,
    discount=1,
    verbose=False,
    around_state=None,
    max_iter=None,
):
    """Evaluate a fixed policy for expected reward and expected progress.

    Runs synchronous sweeps that accumulate, per state, both the reward and
    the progress of the transitions taken under `policy`.

    Parameters
    ----------
    self : object exposing `n_states`, `tab` and `reachable_states`;
        `tab[src][action]` is an iterable of transitions with `probability`,
        `reward`, `progress` and `destination`.
    policy : indexable giving the action per state; negative entries are
        skipped (reward and progress of that state stay 0).
    theta : sweeps stop once the largest absolute change of the *reward*
        vector is below this (progress is not part of the stopping test).
    discount : factor applied to the destination's reward/progress estimate.
    verbose : print the per-sweep delta.
    around_state : if given, only the states returned by
        `self.reachable_states(policy, start_state=around_state)` are
        updated; if None, every state is updated.
    max_iter : optional cap on the number of sweeps.

    Returns
    -------
    dict with `pe_reward`, `pe_progress` (views into the internal buffers)
    and `pe_iter` (number of sweeps performed).
    """
    rew = numpy.zeros((2, self.n_states), dtype=float)
    prg = numpy.zeros((2, self.n_states), dtype=float)

    # Fix: this condition used to be inverted, so a supplied `around_state`
    # was ignored and `reachable_states` was only ever called with None.
    if around_state is not None:
        included_states = self.reachable_states(policy, start_state=around_state)
    else:
        included_states = range(self.n_states)

    i = 1
    while True:
        # Double buffering: the row parity alternates with every sweep.
        prev = i % 2
        cur = (prev + 1) % 2

        for src in included_states:
            a = policy[src]
            if a < 0:
                continue
            r = 0.0
            p = 0.0
            for t in self.tab[src][a]:
                r += t.probability * (t.reward + discount * rew[prev, t.destination])
                p += t.probability * (t.progress + discount * prg[prev, t.destination])
            rew[cur, src] = r
            prg[cur, src] = p

        delta = numpy.abs(rew[cur,] - rew[prev,]).max()

        if verbose:
            print(f"\riteration {i}: delta {delta:g}")

        if delta < theta:
            break

        # Checked after the convergence test so that `pe_iter` never exceeds
        # `max_iter`.
        if max_iter is not None and i >= max_iter:
            break

        i += 1

    return dict(pe_reward=rew[cur,], pe_progress=prg[cur,], pe_iter=i)


best_state = numpy.argmax(pirvf_value)
rvf_policy = pirvf["pi_policy"]

# Evaluate the policy in PTO space and note the number of iterations.
ppe = policy_evaluation_full(
    pmdp, rvf_policy, around_state=best_state, theta=0.001
)

# Evaluate the policy in the original (divergent) space for the same number of
# iterations; theta=0 disables the convergence test so only max_iter stops it.
pe = policy_evaluation_full(
    mdp, rvf_policy, around_state=best_state, theta=0, max_iter=ppe["pe_iter"]
)

# Weight per-state reward and progress with the steady-state distribution.
pe_prg = numpy.dot(pe["pe_progress"], pirvf_ss)
pe_rew = numpy.dot(pe["pe_reward"], pirvf_ss)
rpp_pipe = pe_rew / pe_prg

print("RPP (policy iteration) (this cell):", rpp_pipe)

RPP (policy iteration) (this cell): 0.40428299140506413


In [9]:
# run the value iteration based pipeline & compare
# Reference result from the value-iteration pipeline in `util`, using the same
# horizon and precision as the policy-iteration experiments.
res = util.optimize_and_evaluate(mdp, horizon=100, eps=0.001)

for report_row in (
    ("RPP (policy iteration):", rpp_pipe),
    ("RPP (value iteration):", res["rpp"]),
):
    print(*report_row)

RPP (policy iteration): 0.40428299140506413
RPP (value iteration): 0.40428299140459983
