In [1]:
import barzur20aft
import compiler
import numpy as np
import math

In [2]:
def bitcoin_mdp(*args, **kwargs):
    model = barzur20aft.Bitcoin(*args, **kwargs)
    c = compiler.Compiler(model)
    return c.mdp()

In [3]:
alpha = [0.01, 0.05, 0.1, 0.2, 0.3, 0.35, 0.4, 0.45, 0.49]
ptmdp = []
mdp = []
for a in alpha:
    print(a)
    m = bitcoin_mdp(alpha=a, gamma=0.0, maximum_fork_length=25)
    mdp.append(m)
    ptmdp.append(barzur20aft.ptmdp(m, horizon=100))

0.01
0.05
0.1
0.2
0.3
0.35
0.4
0.45
0.49


In [4]:
vi = []
for i, m in enumerate(ptmdp):
    res = m.value_iteration(value_eps=0.01)
    vi.append(res)
    v = res["value"]
    p = res["policy"]
    iter = res["iter"]
    print(
        f"alpha={alpha[i]:.2f} iter={iter:3d} v[0]={v[0]:.2f} p={p[:10]} {sum(p)} {hash(p.tobytes()) % 10000}"
    )

alpha=0.01 iter= 30 v[0]=0.14 p=[0 2 1 0 0 0 1 0 0 2] 1370 501
alpha=0.05 iter=315 v[0]=3.97 p=[0 2 1 0 0 0 1 0 0 2] 1386 1966
alpha=0.10 iter=439 v[0]=8.89 p=[0 2 1 0 0 0 1 0 0 2] 1380 582
alpha=0.20 iter=535 v[0]=18.64 p=[0 2 1 0 0 0 1 0 0 2] 1350 1185
alpha=0.30 iter=541 v[0]=28.02 p=[0 2 1 0 0 0 1 0 0 2] 1302 4164
alpha=0.35 iter=571 v[0]=35.46 p=[0 0 1 0 0 0 1 0 0 2] 1282 5217
alpha=0.40 iter=628 v[0]=47.30 p=[0 0 1 0 0 0 1 0 0 2] 1260 7227
alpha=0.45 iter=689 v[0]=64.32 p=[0 0 1 0 0 0 1 0 0 0] 1236 5698
alpha=0.49 iter=747 v[0]=80.97 p=[0 0 1 0 0 0 1 0 0 0] 1200 6423


In [5]:
def reward_per_progress_backpropagation(mdp, policy, n_iter=500):
    reward = np.zeros(mdp.n_states, dtype=float)
    progress = np.zeros(mdp.n_states, dtype=float)

    for i in range(n_iter):
        reward_next = np.zeros(mdp.n_states, dtype=float)
        progress_next = np.zeros(mdp.n_states, dtype=float)
        for src in range(mdp.n_states):
            act = policy[src]
            if act == -1:
                assert len(mdp.tab[src]) == 0
                continue
            for t in mdp.tab[src][act]:
                reward_next[src] += t.probability * (t.reward + reward[t.destination])
                progress_next[src] += t.probability * (
                    t.progress + progress[t.destination]
                )
        reward = reward_next
        progress = progress_next
    return reward / progress


for i, a in enumerate(alpha):
    rpp = reward_per_progress_backpropagation(mdp[i], vi[i]["policy"])
    print(f"alpha={a} rpp[0]={rpp[0]}")

alpha=0.01 rpp[0]=0.009999999999999992
alpha=0.05 rpp[0]=0.049999999999999954
alpha=0.1 rpp[0]=0.1
alpha=0.2 rpp[0]=0.19999999999999984
alpha=0.3 rpp[0]=0.2999999999999994
alpha=0.35 rpp[0]=0.3681152968623869
alpha=0.4 rpp[0]=0.48155899463631074
alpha=0.45 rpp[0]=0.6372053002323459
alpha=0.49 rpp[0]=0.7969679577222449


In [6]:
def steady_state_rpp(mdp, policy):
    n = mdp.n_states
    prb = np.zeros((n, n), dtype=float)
    rew = np.zeros(n, dtype=float)
    prg = np.zeros(n, dtype=float)

    for src, actions in enumerate(mdp.tab):
        for t in actions[policy[src]]:
            dst = t.destination
            prb[src, dst] = t.probability
            rew[src] += t.probability * t.reward
            prg[src] += t.probability * t.progress

    # by squaring the matrix we can do 2^10 state transitions quickly
    for _ in range(10):
        prb = np.dot(prb, prb)

    vec = np.zeros(n, dtype=float)
    for s, p in mdp.start.items():
        vec[s] = p

    assert sum(vec) == 1, f"{sum(vec)}"
    # print('start states', sum(vec > 0))

    vec = np.dot(vec, prb)
    assert math.isclose(sum(vec), 1), f"{sum(vec)}"
    # print('steady states', sum(vec > 0))

    # print(vec.shape, rew.shape, prg.shape)
    # print(np.sum(np.multiply(vec, rew)))
    # print(np.sum(np.multiply(vec, prg)))

    return np.sum(np.multiply(vec, rew)) / np.sum(np.multiply(vec, prg))


for i, a in enumerate(alpha):
    rpp = steady_state_rpp(mdp[i], vi[i]["policy"])
    print(f"alpha={a} rpp={rpp}")

  return np.sum(np.multiply(vec, rew)) / np.sum(np.multiply(vec, prg))


alpha=0.01 rpp=nan
alpha=0.05 rpp=nan
alpha=0.1 rpp=nan
alpha=0.2 rpp=nan
alpha=0.3 rpp=nan
alpha=0.35 rpp=0.37061541132797327
alpha=0.4 rpp=0.48705470407321894
alpha=0.45 rpp=0.6527894974063071
alpha=0.49 rpp=0.8283342996887935


**Not sure why this does not calculate the steady state rewards for the honest policy?!**

In [7]:
def steady_state(mdp, policy):
    n = mdp.n_states
    prb = np.zeros((n, n), dtype=float)
    rew = np.zeros(n, dtype=float)
    prg = np.zeros(n, dtype=float)

    for src, actions in enumerate(mdp.tab):
        for t in actions[policy[src]]:
            dst = t.destination
            prb[src, dst] = t.probability
            rew[src] += t.probability * t.reward
            prg[src] += t.probability * t.progress

    # by squaring the matrix we can do 2^10 state transitions quickly
    for _ in range(10):
        prb = np.dot(prb, prb)

    vec = np.zeros(n, dtype=float)
    for s, p in mdp.start.items():
        vec[s] = p

    assert sum(vec) == 1, f"{sum(vec)}"

    vec = np.dot(vec, prb)
    assert math.isclose(sum(vec), 1), f"{sum(vec)}"

    return vec


for i, a in enumerate(alpha):
    ss = steady_state(mdp[i], vi[i]["policy"])
    rpp = reward_per_progress_backpropagation(mdp[i], vi[i]["policy"])
    revenue = sum(np.multiply(ss, rpp))
    print(f"alpha={a} revenue={revenue}")

alpha=0.01 revenue=0.009999999999999992
alpha=0.05 revenue=0.04999999999999995
alpha=0.1 revenue=0.1
alpha=0.2 revenue=0.20000000000000698
alpha=0.3 revenue=0.2999999999999994
alpha=0.35 revenue=0.370568830736789
alpha=0.4 revenue=0.48691846604104705
alpha=0.45 revenue=0.6524747897127535
alpha=0.49 revenue=0.8280881459620522
