In [1]:
import barzur20aft
import compiler
import numpy as np

In [2]:
def bitcoin_mdp(*args, **kwargs):
    model = barzur20aft.Bitcoin(*args, **kwargs)
    c = compiler.Compiler(model)
    while c.explore():
        pass
    return c.mdp()

In [3]:
alpha = [0.01, 0.05, 0.1, 0.2, 0.3, 0.35, 0.4, 0.45, 0.49]
ptmdp = []
mdp = []
for a in alpha:
    print(a)
    m = bitcoin_mdp(alpha=a, gamma=0.0, maximum_fork_length=25)
    mdp.append(m)
    ptmdp.append(barzur20aft.ptmdp(m, horizon=100))

0.01
0.05
0.1
0.2
0.3
0.35
0.4
0.45
0.49


In [4]:
def value_iteration(mdp, *args, n_iter=100, discount=0.99, verbose=False):
    value = np.zeros(mdp.n_states, dtype=float)
    policy = np.zeros(mdp.n_states, dtype=int)

    for iteration in range(n_iter):
        value_next = np.zeros(mdp.n_states, dtype=float)
        policy_next = np.zeros(mdp.n_states, dtype=int)

        for src, actions in enumerate(mdp.tab):
            best_v = 0.0
            best_a = -1  # no action possible
            for act, lst in actions.items():
                if act < 0:
                    continue
                this_v = 0.0
                for t in lst:
                    this_v += t.probability * (
                        t.reward + discount * value[t.destination]
                    )
                if this_v >= best_v:  # intentionally to not stick with action -1
                    best_v = this_v
                    best_a = act
            value_next[src] = best_v
            policy_next[src] = best_a
            assert best_a >= 0 or len(actions) == 0

        value_delta = np.abs(value_next - value).max()
        policy_delta = (policy_next != policy).sum()
        if verbose:
            print(iteration, value[:5], value_delta, policy_delta)
        value = value_next
        policy = policy_next
    return value, policy

In [5]:
policy = []
ptvalue = []
for i, m in enumerate(ptmdp):
    v, p = value_iteration(m, discount=1, n_iter=500)
    policy.append(p)
    ptvalue.append(v)
    print(
        f"alpha={alpha[i]} ptvalue[0]={ptvalue[i][0]} policy={policy[i][:10]} {sum(policy[i])} {hash(policy[i].tobytes()) % 10000}"
    )

alpha=0.01 ptvalue[0]=0.9189414838378175 policy=[0 3 1 0 1 0 0 3 1 1] 1794 1520
alpha=0.05 ptvalue[0]=4.594707419189073 policy=[0 3 1 0 1 0 0 3 1 1] 1786 1264
alpha=0.1 ptvalue[0]=9.189414838378188 policy=[0 3 1 0 1 0 0 3 1 1] 1784 8739
alpha=0.2 ptvalue[0]=18.378829676756467 policy=[0 3 1 0 1 0 0 3 1 1] 1754 4957
alpha=0.3 ptvalue[0]=27.569142083943404 policy=[0 3 1 0 1 0 0 3 0 1] 1700 6493
alpha=0.35 ptvalue[0]=34.61090738587485 policy=[0 0 1 0 1 0 0 3 0 1] 1678 2812
alpha=0.4 ptvalue[0]=45.488694825371496 policy=[0 0 1 0 1 0 0 3 0 1] 1656 8426
alpha=0.45 ptvalue[0]=61.125244060442164 policy=[0 0 1 0 1 0 0 3 0 1] 1626 2988
alpha=0.49 ptvalue[0]=76.07812204354227 policy=[0 0 1 0 1 0 0 3 0 1] 1588 4774


In [6]:
def reward_per_progress_backpropagation(mdp, policy, n_iter=500):
    reward = np.zeros(mdp.n_states, dtype=float)
    progress = np.zeros(mdp.n_states, dtype=float)

    for i in range(n_iter):
        reward_next = np.zeros(mdp.n_states, dtype=float)
        progress_next = np.zeros(mdp.n_states, dtype=float)
        for src in range(mdp.n_states):
            act = policy[src]
            if act == -1:
                assert len(mdp.tab[src]) == 0
                continue
            for t in mdp.tab[src][act]:
                reward_next[src] += t.probability * (t.reward + reward[t.destination])
                progress_next[src] += t.probability * (
                    t.progress + progress[t.destination]
                )
        reward = reward_next
        progress = progress_next
    return reward / progress


for i, a in enumerate(alpha):
    rpp = reward_per_progress_backpropagation(mdp[i], policy[i])
    print(f"alpha={a} rpp[0]={rpp[0]}")

alpha=0.01 rpp[0]=0.009999999999999992
alpha=0.05 rpp[0]=0.049999999999999954
alpha=0.1 rpp[0]=0.1
alpha=0.2 rpp[0]=0.19999999999999984
alpha=0.3 rpp[0]=0.2999999999999994
alpha=0.35 rpp[0]=0.3681152968623869
alpha=0.4 rpp[0]=0.48155899463631074
alpha=0.45 rpp[0]=0.6371454398566533
alpha=0.49 rpp[0]=0.7969679577222449
