In [1]:
from bitcoin import Bitcoin
from compiler import Compiler
from ethereum import EthereumWhitepaper, EthereumByzantium
from parallel import Parallel
from sm import SelfishMining, map_params, mappable_params
import barzur20aft
import seaborn as sns
import numpy as np

In [2]:
def compile(proto, **kwargs):
    model = SelfishMining(proto, **kwargs)
    c = Compiler(model)
    while c.explore():
        pass
    return c.mdp()

In [3]:
base_mdp = compile(Bitcoin(), maximum_size=9, **mappable_params)
base_mdp

MDP of size 2886 / 3 / 11249 / 3.9

In [4]:
alpha = [0.01, 0.05, 0.1, 0.2, 0.3, 0.35, 0.4, 0.5]
ptmdp = []
mdp = []
for a in alpha:
    print(a)
    m = map_params(base_mdp, alpha=a, gamma=0.5)
    mdp.append(m)
    ptmdp.append(barzur20aft.ptmdp(m, horizon=100))

0.01
0.05
0.1
0.2
0.3
0.35
0.4
0.5


In [5]:
def value_iteration(mdp, *args, n_iter=100, discount=0.99, verbose=False):
    value = np.zeros(mdp.n_states, dtype=float)
    policy = np.zeros(mdp.n_states, dtype=int)

    for iteration in range(n_iter):
        value_next = np.zeros(mdp.n_states, dtype=float)
        policy_next = np.zeros(mdp.n_states, dtype=int)

        for src, actions in enumerate(mdp.tab):
            best_v = 0.0
            best_a = -1  # no action possible
            for act, lst in actions.items():
                if act < 0:
                    continue
                this_v = 0.0
                for t in lst:
                    this_v += t.probability * (
                        t.reward + discount * value[t.destination]
                    )
                if this_v >= best_v:  # intentionally to not stick with action -1
                    best_v = this_v
                    best_a = act
            value_next[src] = best_v
            policy_next[src] = best_a
            assert best_a >= 0 or len(actions) == 0

        value_delta = np.abs(value_next - value).max()
        policy_delta = (policy_next != policy).sum()
        if verbose:
            print(iteration, value[:5], value_delta, policy_delta)
        value = value_next
        policy = policy_next
    return value, policy

In [6]:
# value_iteration(ptmdp[0], n_iter=500, discount=1, verbose=True)

In [7]:
policy = []
ptvalue = []
for i, m in enumerate(ptmdp):
    print(f"{i}: alpha={alpha[i]}")
    v, p = value_iteration(m, discount=1, n_iter=500)
    policy.append(p)
    ptvalue.append(v)

0: alpha=0.01
1: alpha=0.05
2: alpha=0.1
3: alpha=0.2
4: alpha=0.3
5: alpha=0.35
6: alpha=0.4
7: alpha=0.5


In [8]:
for i in range(len(alpha)):
    print(
        f"alpha={alpha[i]} ptvalue[0:2]={ptvalue[i][0:2]} policy={policy[i][:10]} {sum(policy[i])} {hash(policy[i].tobytes()) % 10000}"
    )

alpha=0.01 ptvalue[0:2]=[1.9081309  0.90813098] policy=[0 0 1 2 0 1 2 1 1 1] 3716 6714
alpha=0.05 ptvalue[0:2]=[5.54080204 4.54081196] policy=[0 0 1 2 0 1 2 1 1 1] 3630 4145
alpha=0.1 ptvalue[0:2]=[10.08190378  9.08197897] policy=[0 0 1 2 0 1 2 1 1 1] 3568 6630
alpha=0.2 ptvalue[0:2]=[19.16459284 18.16512749] policy=[0 0 1 2 0 1 2 1 1 1] 3584 1402
alpha=0.3 ptvalue[0:2]=[29.52245476 28.52727962] policy=[0 0 1 2 0 0 2 1 1 0] 3323 1547
alpha=0.35 ptvalue[0:2]=[36.78628988 35.79400224] policy=[0 0 1 2 0 0 2 1 1 0] 3319 4370
alpha=0.4 ptvalue[0:2]=[45.42159361 44.43318694] policy=[0 0 1 2 0 0 2 1 1 0] 3267 508
alpha=0.5 ptvalue[0:2]=[64.38718866 63.40907379] policy=[0 0 1 2 0 0 2 1 1 0] 3251 2746


In [9]:
print(mdp[0].n_states, ptmdp[0].n_states, len(policy[0]))

2886 2887 2887


In [10]:
def reward_per_progress_backpropagation(mdp, policy, n_iter=500):
    reward = np.zeros(mdp.n_states, dtype=float)
    progress = np.zeros(mdp.n_states, dtype=float)

    for i in range(n_iter):
        reward_next = np.zeros(mdp.n_states, dtype=float)
        progress_next = np.zeros(mdp.n_states, dtype=float)
        for src in range(mdp.n_states):
            act = policy[src]
            if act == -1:
                assert len(mdp.tab[src]) == 0
                continue
            for t in mdp.tab[src][act]:
                reward_next[src] += t.probability * (t.reward + reward[t.destination])
                progress_next[src] += t.probability * (
                    t.progress + progress[t.destination]
                )
        reward = reward_next
        progress = progress_next

    rpp = 0.0
    for state, prob in mdp.start.items():
        rpp += prob * reward[state] / progress[state]
    return rpp


for i in range(len(alpha)):
    rpp = reward_per_progress_backpropagation(mdp[i], policy[i])
    print(f"alpha={alpha[i]} rpp={rpp}")

alpha=0.01 rpp=0.010461847389558226
alpha=0.05 rpp=0.0503012048192771
alpha=0.1 rpp=0.10010040160642575
alpha=0.2 rpp=0.1996987951807229
alpha=0.3 rpp=0.3237880072252172
alpha=0.35 rpp=0.41345735371732767
alpha=0.4 rpp=0.5203834028446517
alpha=0.5 rpp=0.7532045016599548
