In [1]:
from bitcoin import Bitcoin
from compiler import Compiler
from parallel import Parallel
from sm_v1 import Config, SelfishMining, StateEditor
import seaborn as sns
import numpy as np

In [2]:
def cfg(protocol, *args, alpha=0.25, gamma=0.5, truncate=5, horizon=0, **kwargs):
    return Config(
        protocol=protocol(*args, **kwargs),
        alpha=alpha,
        gamma=gamma,
        truncate_on_pow=truncate,
        horizon=horizon,
    )


def compile(*args, verbose=False, **kwargs):
    config = cfg(*args, **kwargs)
    se = StateEditor()
    c = Compiler(SelfishMining(se, config))
    while c.explore():
        if verbose:
            process = psutil.Process()
            trace, _state = peek(c)
            info = dict(
                protocol=config.protocol.name,
                n_states_explored=len(c.explored),
                n_states_queued=c.queue.qsize(),
                n_states_seen=len(c.state_map),
                n_actions=len(c.action_map),
                n_transitions=len(c.transitions),
                trace_blocks_mined=trace.blocks_mined,
                trace_actions_taken=trace.actions_taken,
                ram_usage_gb=process.memory_info().rss / 1024**3,
            )
            info["queuing_factor"] = info["n_states_queued"] / info["n_states_explored"]
            pp.pprint(info)
    return c.mdp()

In [3]:
alpha = [0.01, 0.05, 0.1, 0.2, 0.3, 0.35, 0.4, 0.5]
ptmdp = []
mdp = []
for a in alpha:
    print(a)
    ptmdp.append(compile(Bitcoin, alpha=a, gamma=0.5, truncate=5, horizon=100))
    mdp.append(compile(Bitcoin, alpha=a, gamma=0.5, truncate=5, horizon=0))

0.01
0.05
0.1
0.2
0.3
0.35
0.4
0.5


In [9]:
def value_iteration(mdp, *args, n_iter=100, discount=0.99, verbose=False):
    value = np.zeros(mdp.n_states, dtype=float)
    policy = np.zeros(mdp.n_states, dtype=int)

    for iteration in range(n_iter):
        value_next = np.zeros(mdp.n_states, dtype=float)
        policy_next = np.zeros(mdp.n_states, dtype=int)

        for src, actions in enumerate(mdp.tab):
            best_v = 0.0
            best_a = -1  # no action possible
            for act, lst in actions.items():
                if act < 0:
                    continue
                this_v = 0.0
                for t in lst:
                    this_v += t.probability * (
                        t.reward + discount * value[t.destination]
                    )
                if this_v >= best_v:  # intentionally to not stick with action -1
                    best_v = this_v
                    best_a = act
            value_next[src] = best_v
            policy_next[src] = best_a
            assert best_a >= 0 or len(actions) == 0

        value_delta = np.abs(value_next - value).max()
        policy_delta = (policy_next != policy).sum()
        if verbose:
            print(iteration, value[:5], value_delta, policy_delta)
        value = value_next
        policy = policy_next
    return value, policy

In [10]:
mdp[0].n_states

8043

In [11]:
# value_iteration(ptmdp[0], n_iter=500, discount=1, verbose=True)

In [12]:
policy = []
ptvalue = []
for i, m in enumerate(ptmdp):
    print(f"{i}: alpha={alpha[i]}")
    v, p = value_iteration(m, discount=1, n_iter=500)
    policy.append(p)
    ptvalue.append(v)

0: alpha=0.01
1: alpha=0.05
2: alpha=0.1
3: alpha=0.2
4: alpha=0.3
5: alpha=0.35
6: alpha=0.4
7: alpha=0.5


In [13]:
for i in range(len(alpha)):
    print(
        f"alpha={alpha[i]} ptvalue[0:2]={ptvalue[i][0:2]} policy={policy[i][:10]} {sum(policy[i])} {hash(policy[i].tobytes()) % 10000}"
    )

alpha=0.01 ptvalue[0:2]=[49.84483445 48.84483445] policy=[0 0 1 0 1 0 2 1 0 1] 43823 616
alpha=0.05 ptvalue[0:2]=[84.19039387 83.19039387] policy=[0 0 1 0 1 0 2 1 0 1] 43957 2225
alpha=0.1 ptvalue[0:2]=[91.97750223 90.97750223] policy=[0 0 1 0 1 0 2 1 0 1] 43957 2225
alpha=0.2 ptvalue[0:2]=[96.37270019 95.37270019] policy=[0 0 1 0 1 0 2 1 0 1] 43957 2225
alpha=0.3 ptvalue[0:2]=[97.86789465 96.86789465] policy=[0 0 1 0 1 0 2 1 0 1] 43965 3632
alpha=0.35 ptvalue[0:2]=[98.27787726 97.27787726] policy=[0 0 1 0 1 0 2 1 0 1] 43965 3632
alpha=0.4 ptvalue[0:2]=[98.57069763 97.57069763] policy=[0 0 1 0 1 0 2 1 0 1] 43993 686
alpha=0.5 ptvalue[0:2]=[98.93970303 97.93970303] policy=[0 0 1 0 1 0 2 1 0 1] 43993 686


In [17]:
print(mdp[0].n_states, ptmdp[0].n_states, len(policy[0]))

8043 8043 8043


In [20]:
def reward_per_progress_backpropagation(mdp, policy, n_iter=500):
    reward = np.zeros(mdp.n_states, dtype=float)
    progress = np.zeros(mdp.n_states, dtype=float)

    for i in range(n_iter):
        reward_next = np.zeros(mdp.n_states, dtype=float)
        progress_next = np.zeros(mdp.n_states, dtype=float)
        for src in range(mdp.n_states):
            act = policy[src]
            if act == -1:
                assert len(mdp.tab[src]) == 0
                continue
            for t in mdp.tab[src][act]:
                reward_next[src] += t.probability * (t.reward + reward[t.destination])
                progress_next[src] += t.probability * (
                    t.progress + progress[t.destination]
                )
        reward = reward_next
        progress = progress_next
    return reward / progress


for i in range(len(alpha)):
    rpp = reward_per_progress_backpropagation(mdp[i], policy[i])
    print(f"alpha={alpha[i]} rpp[0:2]={rpp[0:2]}")

  return reward / progress


alpha=0.01 rpp[0:2]=[0.78853291 0.78630586]
alpha=0.05 rpp[0:2]=[0.96232473 0.96023222]
alpha=0.1 rpp[0:2]=[0.98356017 0.98151006]
alpha=0.2 rpp[0:2]=[0.99376254 0.99173212]
alpha=0.3 rpp[0:2]=[0.99699025 0.9949653 ]
alpha=0.35 rpp[0:2]=[0.99785623 0.99583241]
alpha=0.4 rpp[0:2]=[0.99847039 0.99644709]
alpha=0.5 rpp[0:2]=[0.99924115 0.99721754]
