In [1]:
import environmentv6 as e
import mdptoolbox
import matplotlib.pyplot as plt
import numpy as np
import progressbar as pb
import scipy.sparse as ss
import seaborn as sns
import warnings
warnings.filterwarnings('ignore', category=ss.SparseEfficiencyWarning)

In [46]:
# params
alpha = 0.45      # probability used for the "a + 1" outcome of a mining step
gamma = 0.5       # splits the (1 - alpha) mass between the two other outcomes on an active fork
T = 75            # largest value of a and h represented in the state space
epsilon = 10e-5   # note: 10e-5 == 1e-4; the solver cell passes epsilon / 8

# game
action_count = 4
adopt = 0; override = 1; mine = 2; match = 3

# fork params
fork_count = 3
irrelevant = 0; relevant = 1; active = 2

# one state per (a, h, fork) triple
state_count = (T+1) * (T+1) * fork_count

# mapping utils: states[i] is the (a, h, fork) triple with index i,
# state_mapping is the inverse lookup
state_mapping = {}
states = []
count = 0
for a in range(T+1):
    for h in range(T+1):
        for fork in range(fork_count):
            state_mapping[(a, h, fork)] = count
            states.append((a, h, fork))
            count += 1

# initialize matrices
# Build the empty sparse matrices directly from their shape. Going through
# np.zeros((state_count, state_count)) would first materialise a dense
# state_count x state_count float array for every matrix.
transitions = []; rewards = []
for _ in range(action_count):
    transitions.append(ss.csr_matrix((state_count, state_count), dtype=np.float64))
    rewards.append(ss.csr_matrix((state_count, state_count), dtype=np.float64))

In [47]:
mining_cost = 0.5

# Cost recorded on every ordinary outcome of a mine / match step.
step_cost = -1 * alpha * mining_cost

# populate matrices
for state_index in range(state_count):
    a, h, fork = states[state_index]
    in_bounds = (a < T) and (h < T)

    # adopt: every state moves to (0, 0, irrelevant); no reward entry is written
    transitions[adopt][state_index, state_mapping[0, 0, irrelevant]] = 1

    # override: only allowed when a > h, otherwise penalised
    if a > h:
        dest = state_mapping[a-h-1, 0, irrelevant]
        transitions[override][state_index, dest] = 1
        rewards[override][state_index, dest] = h + 1
    else:
        transitions[override][state_index, 0] = 1
        rewards[override][state_index, 0] = -10000

    # Three-way outcome shared by "mine on an active fork" and "match".
    # Entries are (target state index, probability, reward).
    race = None
    if in_bounds and (a >= h) and (h > 0):
        race = [
            (state_mapping[a+1, h, active], alpha, step_cost),
            (state_mapping[a-h, 1, relevant], (1 - alpha) * gamma, h - alpha * mining_cost),
            (state_mapping[a, h+1, relevant], (1 - alpha) * (1 - gamma), step_cost),
        ]

    # mine
    if in_bounds and (fork != active):
        mine_branches = [
            (state_mapping[a+1, h, irrelevant], alpha, step_cost),
            (state_mapping[a, h+1, relevant], (1 - alpha), step_cost),
        ]
    elif (fork == active) and (a > h) and (race is not None):
        mine_branches = race
    else:
        # not allowed here: send to state 0 with a large penalty
        mine_branches = [(0, 1, -10000)]

    # match
    if (fork == relevant) and (race is not None):
        match_branches = race
    else:
        # not allowed here: send to state 0 with a large penalty
        match_branches = [(0, 1, -10000)]

    for act, branches in ((mine, mine_branches), (match, match_branches)):
        for dest, prob, payoff in branches:
            transitions[act][state_index, dest] = prob
            rewards[act][state_index, dest] = payoff

In [48]:
# Run mdptoolbox's RelativeValueIteration on the per-action transition and
# reward matrices; epsilon/8 is passed as the third positional argument.
rvi = mdptoolbox.mdp.RelativeValueIteration(transitions, rewards, epsilon/8)
rvi.run()
# One action index per state, in state_mapping order.
policy = rvi.policy
# NOTE(review): processPolicy is defined in a later cell of this notebook, so
# that cell has to be executed before this one on a fresh kernel.
processPolicy(policy)

wwa & aaa & aaa & aaa & aaa & aaa & aaa & aaa & aaa & \\ 
wwo & wma & wwa & aaa & aaa & aaa & aaa & aaa & aaa & \\ 
wwo & wmw & wma & wwa & aaa & aaa & aaa & aaa & aaa & \\ 
wwo & wmw & wmw & wma & wwa & wwa & aaa & aaa & aaa & \\ 
wwo & wmw & wmw & omw & wma & wwa & wwa & aaa & aaa & \\ 
wwo & wmw & wmw & wmw & omw & wma & wwa & wwa & aaa & \\ 
wwo & wmw & wmw & wmw & wmw & omw & wma & wwa & wwa & \\ 
wwo & wmw & wmw & wmw & wmw & wmw & ooo & wma & wwa & \\ 
wwo & wmw & wmw & wmw & wmw & wmw & wmw & ooo & wma & \\ 



In [45]:
# View the policy as an (a, h, fork) grid. The shape comes from the model
# parameters rather than a literal (9, 9, 3), which only fits T == 8; the
# slice keeps the displayed output to the same 9 x 9 corner for larger T.
np.reshape(policy, (T + 1, T + 1, fork_count))[:9, :9]

array([[[2, 2, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[2, 2, 1],
        [2, 3, 0],
        [2, 2, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[2, 2, 1],
        [2, 3, 2],
        [2, 3, 0],
        [2, 2, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[2, 2, 1],
        [2, 3, 2],
        [1, 3, 2],
        [2, 3, 0],
        [2, 2, 0],
        [2, 2, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[2, 2, 1],
        [2, 3, 2],
        [2, 3, 2],
        [1, 1, 1],
        [2, 3, 0],
        [2, 2, 0],
        [2, 2, 0],
        [0, 0, 0],
        [0, 0, 0]],

       [[2, 2, 1],
        [2, 3, 2],
        [2, 3, 2],
        [2, 3, 2],
        [1, 1, 1],
        [2, 3, 0],
        [2, 2, 0],
  

In [13]:
def processPolicy(policy, size=9, forks=3, mapping=None):
    """Print a policy as rows of a LaTeX-style table.

    Each (a, h) cell is rendered as one letter per fork value
    ('a' for action 0, 'o' for 1, 'w' for 2, 'm' for 3), followed by ' & '.
    Each row of constant ``a`` ends with a double backslash, a space and a
    newline. The assembled text is printed; nothing is returned.

    Parameters
    ----------
    policy : sequence
        Action index per state, indexed by the values stored in ``mapping``.
    size : int, optional
        Number of ``a`` values and of ``h`` values to print (default 9).
    forks : int, optional
        Number of fork values printed per cell (default 3).
    mapping : dict, optional
        Maps ``(a, h, fork)`` to an index into ``policy``. Defaults to the
        notebook-level ``state_mapping``.
    """
    if mapping is None:
        mapping = state_mapping
    letters = {0: 'a', 1: 'o', 2: 'w', 3: 'm'}

    rows = []
    for a in range(size):
        cells = []
        for h in range(size):
            cell = []
            for fork in range(forks):
                action = policy[mapping[(a, h, fork)]]
                letter = letters.get(action)
                if letter is None:
                    # Unknown action code: flagged immediately, contributes no letter.
                    print('here')
                else:
                    cell.append(letter)
            cells.append(''.join(cell) + ' & ')
        rows.append(''.join(cells) + '\\\\ \n')
    print(''.join(rows))

In [7]:
# Hand-written 9 x 9 policy tables. Entries 0 / 1 / 2 match the action codes
# defined earlier in the notebook (adopt = 0, override = 1, mine = 2).
# NOTE(review): 9 is not a defined action code; presumably it marks states the
# policy is never expected to reach -- confirm.
# NOTE(review): presumably indexed [a, h] like opt_policy in the simulation cell -- confirm.
sm1_policy = np.asarray([
[2, 0, 9, 9, 9, 9, 9, 9, 9],
[2, 0, 9, 9, 9, 9, 9, 9, 9],
[2, 1, 0, 9, 9, 9, 9, 9, 9], 
[2, 2, 1, 0, 9, 9, 9, 9, 9],
[2, 2, 2, 1, 0, 9, 9, 9, 9],
[2, 2, 2, 2, 1, 0, 9, 9, 9],
[2, 2, 2, 2, 2, 1, 0, 9, 9],
[2, 2, 2, 2, 2, 2, 1, 0, 9],
[1, 1, 1, 1, 1, 1, 1, 1, 0]
])

# Only the entries at [0, 0], [0, 1] and [1, 0] hold defined action codes;
# everything else is the placeholder 9.
honest_policy = np.asarray([
[2, 0, 9, 9, 9, 9, 9, 9, 9],
[1, 9, 9, 9, 9, 9, 9, 9, 9],
[9, 9, 9, 9, 9, 9, 9, 9, 9], 
[9, 9, 9, 9, 9, 9, 9, 9, 9],
[9, 9, 9, 9, 9, 9, 9, 9, 9],
[9, 9, 9, 9, 9, 9, 9, 9, 9],
[9, 9, 9, 9, 9, 9, 9, 9, 9],
[9, 9, 9, 9, 9, 9, 9, 9, 9],
[9, 9, 9, 9, 9, 9, 9, 9, 9]
])

# NOTE(review): this reshape needs `policy` to hold exactly 81 entries, i.e. a
# policy over (a, h) states with T == 8. The policy produced by the
# three-fork model above has (T+1)*(T+1)*3 entries and will not fit -- confirm
# which solver run is meant to feed this line.
opt_policy = np.reshape(policy, (9,9))

In [22]:
def get_opt_policy(alpha, T, mining_cost):
    """Solve the fork-free (a, h) mining model and return its policy grid.

    The model is built from the arguments on every call, so the result does
    not depend on (or modify) the notebook-level ``transitions`` / ``rewards``
    / ``states`` objects. All states are populated before the solver runs once.

    Relies on the notebook-level action indices ``adopt``, ``override`` and
    ``mine`` (assumed to be 0, 1 and 2) and on ``epsilon``.

    Parameters
    ----------
    alpha : float
        Probability that a ``mine`` step moves to ``(a + 1, h)``; it moves to
        ``(a, h + 1)`` with probability ``1 - alpha``.
    T : int
        Largest value of ``a`` and ``h`` in the model. ``mine`` is penalised
        once either of them has reached ``T``.
    mining_cost : float
        Cost factor; each ``mine`` outcome carries reward
        ``-alpha * mining_cost``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(T + 1, T + 1)`` whose ``[a, h]`` entry is the action
        index chosen in state ``(a, h)``.
    """
    side = T + 1
    n_states = side * side
    illegal_reward = -10000
    step_cost = -1 * alpha * mining_cost

    def index_of(a, h):
        # Row-major index, so reshaping the policy to (side, side) gives [a, h].
        return a * side + h

    # LIL supports cheap incremental assignment; converted to CSR before solving.
    model_transitions = [ss.lil_matrix((n_states, n_states)) for _ in range(3)]
    model_rewards = [ss.lil_matrix((n_states, n_states)) for _ in range(3)]

    for a in range(side):
        for h in range(side):
            state_index = index_of(a, h)

            # adopt transitions
            model_transitions[adopt][state_index, index_of(0, 0)] = 1

            # override
            if a > h:
                target = index_of(a - h - 1, 0)
                model_transitions[override][state_index, target] = 1
                model_rewards[override][state_index, target] = h + 1
            else:
                model_transitions[override][state_index, 0] = 1
                model_rewards[override][state_index, 0] = illegal_reward

            # mine transitions
            if (a < T) and (h < T):
                ahead = index_of(a + 1, h)
                behind = index_of(a, h + 1)
                model_transitions[mine][state_index, ahead] = alpha
                model_transitions[mine][state_index, behind] = (1 - alpha)
                model_rewards[mine][state_index, ahead] = step_cost
                model_rewards[mine][state_index, behind] = step_cost
            else:
                model_transitions[mine][state_index, 0] = 1
                model_rewards[mine][state_index, 0] = illegal_reward

    model_transitions = [matrix.tocsr() for matrix in model_transitions]
    model_rewards = [matrix.tocsr() for matrix in model_rewards]

    # Solve once, after every state has been populated.
    rvi = mdptoolbox.mdp.RelativeValueIteration(model_transitions, model_rewards, epsilon/8)
    rvi.run()
    return np.reshape(rvi.policy, (side, side))

In [26]:
# Policy grid of the (a, h) model for alpha = 0.4, T = 8, mining_cost = 0.5.
get_opt_policy(alpha=0.4, T=8, mining_cost=0.5)

array([[2, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 2, 2, 0, 0, 0, 0, 0, 0],
       [2, 1, 2, 2, 2, 0, 0, 0, 0],
       [2, 2, 1, 2, 2, 2, 0, 0, 0],
       [2, 2, 2, 1, 2, 2, 2, 0, 0],
       [2, 2, 2, 2, 1, 2, 2, 2, 0],
       [2, 2, 2, 2, 2, 1, 2, 2, 0],
       [2, 2, 2, 2, 2, 2, 1, 2, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 0]])

In [15]:
# simulation settings
# NOTE: alpha, T and mining_cost overwrite the values set in earlier cells.
length = int(1e6)
alpha = 0.4
T = 8
mining_cost = 0.5
env = e.Environment(alpha, T, mining_cost)

# Step the environment `length` times, choosing each action from opt_policy
# and accumulating the returned reward.
# NOTE(review): Environment comes from the project module environmentv6; its
# reset / current_state / takeAction semantics are not visible here.
bar = pb.ProgressBar()
_ = env.reset()
current_reward = 0
for _ in bar(range(length)):
    a, h = env.current_state
    # opt_policy is indexed by the (a, h) pair reported by the environment
    action = opt_policy[(a,h)]
    # first element of the returned pair is discarded, second is the step reward
    _, reward = env.takeAction(action)
    current_reward += reward

100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:28 Time:  0:00:28


In [16]:
# opt: total accumulated reward and mean reward per step of the simulation run
print(current_reward, current_reward / length)

101211.00000130651 0.10121100000130652


In [119]:
# sm1: total accumulated reward and mean reward per step.
# NOTE(review): presumably printed after re-running the simulation cell with
# sm1_policy in place of opt_policy -- that variant of the cell is not shown.
print(current_reward, current_reward / length)

54266.60000058278 0.05426660000058278


In [12]:
# honest: total accumulated reward and mean reward per step.
# NOTE(review): presumably printed after re-running the simulation cell with
# honest_policy in place of opt_policy -- that variant of the cell is not shown.
print(current_reward, current_reward / length)

100698.00000089758 0.10069800000089758
