In [21]:
import mdptoolbox
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as ss

In [22]:
def getAdoptMatrices(rho):
    # creating the adopt transition & reward matrices
    adopt_transitions = np.zeros(shape = (num_states, num_states))
    adopt_rewards = np.zeros(shape = (num_states, num_states))

    # each adopt matrix only can map to (1,0,irrelevant) or (0,1,irrelevant)
    new_state_1 = (1, 0, 'irrelevant')
    new_state_2 = (0, 1, 'irrelevant')
    for state_index in range(num_states):
        a, h, fork = states[state_index]
        adopt_transitions[state_index, state_mapping[new_state_1]] = alpha
        adopt_transitions[state_index, state_mapping[new_state_2]] = 1 - alpha
        adopt_rewards[state_index, state_mapping[new_state_2]] = -1 * rho * h
        adopt_rewards[state_index, state_mapping[new_state_2]] = -1 * rho * h
        
    # making matrices sparse
    return adopt_transitions, adopt_rewards

In [23]:
def getOverrideMatrices(rho):
    # creating the override transition & reward matrices
    override_transitions = np.zeros(shape = (num_states, num_states))
    override_rewards = np.zeros(shape = (num_states, num_states))

    for state_index in range(num_states):
        a, h, fork = states[state_index]
        # a > h, which must be true for override to succeed
        if a > h:
            new_state_1 = (a - h, 0, 'irrelevant')
            new_state_2 = (a - h - 1, 1, 'relevant')
            override_transitions[state_index, state_mapping[new_state_1]] = alpha
            override_transitions[state_index, state_mapping[new_state_2]] = 1 - alpha
            override_rewards[state_index, state_mapping[new_state_1]] = (1 - rho) * (h + 1)
            override_rewards[state_index, state_mapping[new_state_2]] = (1 - rho) * (h + 1)
        else:
            # filling in remainder of array.
            override_transitions[state_index, 0] = 1
            override_rewards[state_index, 0] = -1 * rho * 1000

    # making matrices sparse
    return override_transitions, override_rewards

In [24]:
def getWaitMatrices(rho):
    # creating the wait transition & reward matrices
    wait_transitions = np.zeros(shape = (num_states, num_states))
    wait_rewards = np.zeros(shape = (num_states, num_states))

    for state_index in range(num_states):
        a, h, fork = states[state_index]
        # irrelevant or relevant
        if ((fork == 'irrelevant') or (fork == 'relevant')) and (a < T) and (h < T):
            new_state_1 = (a + 1, h, 'irrelevant')
            new_state_2 = (a, h + 1, 'relevant')
            wait_transitions[state_index, state_mapping[new_state_1]] = alpha
            wait_transitions[state_index, state_mapping[new_state_2]] = 1 - alpha
        # active
        elif (fork == 'active') and (a < T) and (h < T) and (h > 0):
            if a >= h: 
                new_state_1 = (a + 1, h, 'active')
                new_state_2 = (a - h, 1, 'relevant')
                new_state_3 = (a, h + 1, 'relevant')
                wait_transitions[state_index, state_mapping[new_state_1]] = alpha
                wait_transitions[state_index, state_mapping[new_state_2]] = gamma * (1 - alpha)
                wait_transitions[state_index, state_mapping[new_state_3]] = (1 - gamma) * (1 - alpha)
                wait_rewards[state_index, state_mapping[new_state_2]] = (1 - rho) * h
            else:
                wait_transitions[state_index, 0] = 1
                wait_rewards[state_index, 0] = -1 * rho * 1000
        else:
            wait_transitions[state_index, 0] = 1
            wait_rewards[state_index, 0] = -1 * rho * 1000

    # making matrices sparse
    return wait_transitions, wait_rewards

In [25]:
def getMatchMatrices(rho):
    # creating the match transition & rewards matrices
    match_transitions = np.zeros(shape = (num_states, num_states))
    match_rewards = np.zeros(shape = (num_states, num_states))

    for state_index in range(num_states):
        a, h, fork = states[state_index]
        if (a >= h) and (fork == 'relevant') and (a < T) and (h < T) and (h > 0):
            new_state_1 = (a + 1, h, 'active')
            new_state_2 = (a - h, 1, 'relevant')
            new_state_3 = (a, h + 1, 'relevant')
            match_transitions[state_index, state_mapping[new_state_1]] = alpha
            match_transitions[state_index, state_mapping[new_state_2]] = gamma * (1 - alpha)
            match_transitions[state_index, state_mapping[new_state_3]] = (1 - gamma) * (1 - alpha)
            match_rewards[state_index, state_mapping[new_state_2]] = (1 - rho) * h
        else:
            match_transitions[state_index, 0] = 1
            match_rewards[state_index, 0] = -1 * rho * 1000

    # making matrices sparse
    return match_transitions, match_rewards

In [26]:
def forceAdopt(transition_matrix, reward_matrix, rho, underpaying):
    new_state_1_index = state_mapping[(1, 0, 'irrelevant')]
    new_state_2_index = state_mapping[(0, 1, 'irrelevant')]
    for state_index in range(num_states):
        a, h, fork = states[state_index]
        if ((a == T) or (h == T)) and (a != h):
            # clear out old probabilities
            transition_matrix[state_index, :] = 0
            transition_matrix[state_index, new_state_1_index] = alpha
            transition_matrix[state_index, new_state_2_index] = 1 - alpha
            if underpaying:
                reward_matrix[state_index, new_state_1_index] = -1 * rho * h
                reward_matrix[state_index, new_state_2_index] = -1 * rho * h
            else:
                # attacker ahead
                if a > h:
                    reward_matrix[state_index, new_state_1_index] = overpayAttackerAhead(a, h, rho)
                    reward_matrix[state_index, new_state_2_index] = overpayAttackerAhead(a, h, rho)
                else:
                    reward_matrix[state_index, new_state_1_index] = overpayHonestAhead(a, h, rho)
                    reward_matrix[state_index, new_state_2_index] = overpayHonestAhead(a, h, rho)

In [27]:
# helpers
def overpayAttackerAhead(a, h, rho):
    assert(a > h)
    expr1 = (1 - rho) * (alpha * (1 - alpha)) / ((1 - 2 * alpha)**2)
    expr2 = (1/2) * ((a - h) / (1 - 2 * alpha) + a + h)
    return expr1 + expr2

def overpayHonestAhead(a, h, rho):
    assert(h > a)
    expr1 = (1 - np.power(alpha/(1-alpha), h - a)) * (-1*rho*h)
    expr2 = np.power(alpha/(1-alpha), h - a) * (1 - rho)
    expr3 = (alpha * (1-alpha)) / (np.power(1-2*alpha, 2)) + (h - a) / (1- 2 * alpha)
    return expr1 + expr2 * expr3

def getAllMatrices(rho, underpaying=True):
    adopt_t, adopt_r = getAdoptMatrices(rho)
    forceAdopt(adopt_t, adopt_r, rho, underpaying)
    adopt_t = ss.csr_matrix(adopt_t); adopt_r = ss.csr_matrix(adopt_r)
    override_t, override_r = getOverrideMatrices(rho)
    forceAdopt(override_t, override_r, rho, underpaying)
    override_t = ss.csr_matrix(override_t); override_r = ss.csr_matrix(override_r)
    wait_t, wait_r = getWaitMatrices(rho)
    forceAdopt(wait_t, wait_r, rho, underpaying)
    wait_t = ss.csr_matrix(wait_t); wait_r = ss.csr_matrix(wait_r)
    match_t, match_r = getMatchMatrices(rho)
    forceAdopt(match_t, match_r, rho, underpaying)
    match_t = ss.csr_matrix(match_t); match_r = ss.csr_matrix(match_r)
    return [adopt_t, override_t, wait_t, match_t], [adopt_r, override_r, wait_r, match_r]

In [33]:
# the numbers of states is (T+1)*(T+1)*3 because each chain can be up to T length and there are 3 fork states.
num_states = (T+1)*(T+1)*3

# generate a state to integer mapping and list of states
state_mapping = {}
states = []
count = 0
for a in range(T+1):
    for h in range(T+1):
        for fork in ['irrelevant', 'relevant', 'active']:
            state_mapping[(a, h, fork)] = count
            states.append((a, h, fork))
            count += 1

In [43]:
# initializing params
epsilon = 10e-5
T = 75
gamma = 0
alpha = 0.20

In [44]:
# main alg
low = 0; high = 1
while (high - low) >= epsilon / 8:
    rho = (low + high) / 2
    print(low, high, rho)
    transitions, rewards = getAllMatrices(rho, underpaying=True)
    rvi = mdptoolbox.mdp.RelativeValueIteration(transitions, rewards, epsilon/8)
    rvi.run()
    if rvi.average_reward > 0:
        low = rho
    else:
        high = rho
lower_bound = rho - epsilon
rho_prime = np.max(low - epsilon/4, 0)
transitions, rewards = getAllMatrices(rho_prime, underpaying=False)
rvi = mdptoolbox.mdp.RelativeValueIteration(transitions, rewards, epsilon)
rvi.run()
upper_bound = rho_prime + 2 * (rvi.average_reward + epsilon)

0 1 0.5
0 0.5 0.25
0 0.25 0.125
0.125 0.25 0.1875
0.1875 0.25 0.21875
0.21875 0.25 0.234375
0.234375 0.25 0.2421875
0.234375 0.2421875 0.23828125
0.234375 0.23828125 0.236328125
0.236328125 0.23828125 0.2373046875
0.2373046875 0.23828125 0.23779296875
0.23779296875 0.23828125 0.238037109375
0.238037109375 0.23828125 0.2381591796875
0.238037109375 0.2381591796875 0.23809814453125
0.238037109375 0.23809814453125 0.238067626953125
0.238067626953125 0.23809814453125 0.2380828857421875
0.2380828857421875 0.23809814453125 0.23809051513671875


In [45]:
lower_bound, upper_bound

(0.23799051513671876, 0.23831544970703208)

In [None]:
def getAdoptMatrices(rho, underpaying=True):
    # creating the adopt transition & reward matrices
    adopt_transitions = np.zeros(shape = (num_states, num_states))
    adopt_rewards = np.zeros(shape = (num_states, num_states))

    # each adopt matrix only can map to (1,0,irrelevant) or (0,1,irrelevant)
    new_state_1 = (1, 0, 'irrelevant')
    new_state_2 = (0, 1, 'irrelevant')
    for state_index in range(num_states):
        a, h, fork = states[state_index]
        adopt_transitions[state_index, state_mapping[new_state_1]] = alpha
        adopt_transitions[state_index, state_mapping[new_state_2]] = 1 - alpha
        adopt_rewards[state_index, state_mapping[new_state_2]] = rho * h
        adopt_rewards[state_index, state_mapping[new_state_2]] = rho * h
        if ((a == T) or (h == T)) and (a != h):
            # overpaying
            if not underpaying:
                # attacker ahead
                if a > h: 
                    adopt_rewards[state_index, state_mapping[new_state_1]] = overpayAttackerAhead(a, h, rho)
                    adopt_rewards[state_index, state_mapping[new_state_2]] = overpayAttackerAhead(a, h, rho)
                # honest ahead
                else: 
                    adopt_rewards[state_index, state_mapping[new_state_1]] = overpayHonestAhead(a, h, rho)
                    adopt_rewards[state_index, state_mapping[new_state_2]] = overpayHonestAhead(a, h, rho)
    
    # making matrices sparse
    return ss.csr_matrix(adopt_transitions), ss.csr_matrix(adopt_rewards)

In [9]:
0 1 0.5
0.5 1 0.75
0.5 0.75 0.625
0.5 0.625 0.5625
0.5625 0.625 0.59375
0.59375 0.625 0.609375
0.609375 0.625 0.6171875
0.609375 0.6171875 0.61328125
0.61328125 0.6171875 0.615234375
0.61328125 0.615234375 0.6142578125
0.6142578125 0.615234375 0.61474609375
0.6142578125 0.61474609375 0.614501953125
0.6142578125 0.614501953125 0.6143798828125
0.6142578125 0.6143798828125 0.61431884765625
0.6142578125 0.61431884765625 0.614288330078125
0.6142578125 0.614288330078125 0.6142730712890625
0.6142730712890625 0.614288330078125 0.6142807006835938
0.0017484939746807981

SyntaxError: invalid syntax (<ipython-input-9-dde0cfbc6a8c>, line 1)