In [1]:
import mdptoolbox
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as ss
import warnings
warnings.filterwarnings('ignore', category=ss.SparseEfficiencyWarning)

In [2]:
# init params
alpha = 0.45
T = 5
state_count = (T+1) * (T+1) * 3
gamma = 0

In [3]:
# MDP helpers
irrelevant = 0; relevant = 1; active = 2
choices = 4
adopt = 0; override = 1; match = 2; wait = 3
# generate a state to integer mapping and list of states
state_mapping = {}
states = []
count = 0
for a in range(T+1):
    for h in range(T+1):
        for fork in range(3):
            state_mapping[(a, h, fork)] = count
            states.append((a, h, fork))
            count += 1

In [23]:
# transition and reward matrices
transitions = []; reward_selfish = []; reward_honest = []
for _ in range(choices):
    transitions.append(ss.csr_matrix(np.zeros(shape=(state_count, state_count))))
    reward_selfish.append(ss.csr_matrix(np.zeros(shape=(state_count, state_count))))
    reward_honest.append(ss.csr_matrix(np.zeros(shape=(state_count, state_count))))

# writing transition and reward data 
for state_index in range(state_count):
    if state_index % 2000 == 0:
        print('processing state', state_index)
    
    a, h, fork = states[state_index]
    
    # adopt transitions
    transitions[adopt][state_index, state_mapping[1, 0, irrelevant]] = alpha
    transitions[adopt][state_index, state_mapping[0, 1, irrelevant]] = 1 - alpha
    # adopt rewards
    reward_honest[adopt][state_index, state_mapping[1, 0, irrelevant]] = h
    reward_honest[adopt][state_index, state_mapping[0, 1, irrelevant]] = h

    # override
    if a > h:
        transitions[override][state_index, state_mapping[a-h, 0, irrelevant]] = alpha
        reward_selfish[override][state_index, state_mapping[a-h, 0, irrelevant]] = h+1
        transitions[override][state_index, state_mapping[a-h-1, 1, relevant]] = 1 - alpha
        reward_selfish[override][state_index, state_mapping[a-h-1, 1, relevant]] = h+1
    else:
        transitions[override][state_index, 0] = 1
        reward_honest[override][state_index, 0] = 10000

    # wait
    if (fork != active) and (a < T) and (h < T):
        transitions[wait][state_index, state_mapping[a+1, h, irrelevant]] = alpha
        transitions[wait][state_index, state_mapping[a, h+1, relevant]] = 1 - alpha
    elif (fork == active) and (a > h) and (h > 0) and (a < T) and (h < T): 
        transitions[wait][state_index, state_mapping[a+1, h, active]] = alpha
        transitions[wait][state_index, state_mapping[a-h, 1, relevant]] = gamma*(1-alpha)
        reward_selfish[wait][state_index, state_mapping[a-h, 1, relevant]] = h
        transitions[wait][state_index, state_mapping[a, h+1, relevant]] = (1-gamma)*(1-alpha)
    else:
        transitions[wait][state_index, 0] = 1
        reward_honest[wait][state_index, 0] = 10000

    # match
    if (fork == relevant) and (a >= h) and (h > 0) and (a < T) and (h < T):
        transitions[match][state_index, state_mapping[a+1, h, active]] = alpha
        transitions[match][state_index, state_mapping[a-h, 1, relevant]] = gamma*(1-alpha)
        reward_selfish[match][state_index, state_mapping[a-h, 1, relevant]] = h
        transitions[match][state_index, state_mapping[a, h+1, relevant]] = (1-gamma)*(1-alpha)
    else:
        transitions[match][state_index, 0] = 1
        reward_honest[match][state_index, 0] = 10000

processing state 0


In [24]:
epsilon = 0.0001
lowRho = 0
highRho = 1
while(highRho - lowRho > epsilon/8):
    rho = (highRho + lowRho) / 2;
    Wrho = []
    for i in range(choices):
        Wrho.append((1-rho)*reward_selfish[i] - rho*reward_honest[i])
    rvi = mdptoolbox.mdp.RelativeValueIteration(transitions, Wrho, epsilon/8)
    rvi.run()
    lowerBoundPolicy = rvi.policy
    reward = rvi.average_reward
    if reward > 0:
        lowRho = rho
    else:
        highRho = rho
print('alpha: ', alpha, 'lower bound reward:', rho)
lowerBoundRho = rho
lowRho = rho
highRho = min(rho+0.1, 1)
while (highRho - lowRho) > (epsilon / 8):
    rho = (highRho + lowRho) / 2
    for state_index in range(state_count):
        a, h, fork = states[state_index]
        if a == T:
            expr = (1-rho)*alpha*(1-alpha)/(1-2*alpha)**2+0.5*((a-h)/(1-2*alpha)+a+h)
            reward_selfish[adopt][state_index, state_mapping[1, 0, irrelevant]] = expr
            reward_selfish[adopt][state_index, state_mapping[0, 1, irrelevant]] = expr
            reward_honest[adopt][state_index, state_mapping[1, 0, irrelevant]] = 0
            reward_honest[adopt][state_index, state_mapping[0, 1, irrelevant]] = 0
        elif h == T:
            expr1 = (1 - np.power(alpha/(1-alpha), h - a)) * (-1*rho*h)
            expr2 = np.power(alpha/(1-alpha), h - a) * (1 - rho)
            expr3 = (alpha * (1-alpha)) / (np.power(1-2*alpha, 2)) + (h - a) / (1- 2 * alpha)
            expr_total = expr1 + expr2 * expr3
            reward_selfish[adopt][state_index, state_mapping[1, 0, irrelevant]] = expr_total
            reward_selfish[adopt][state_index, state_mapping[0, 1, irrelevant]] = expr_total
            reward_honest[adopt][state_index, state_mapping[1, 0, irrelevant]] = 0
            reward_honest[adopt][state_index, state_mapping[0, 1, irrelevant]] = 0
    Wrho = []
    for i in range(choices):
        Wrho.append((1-rho)*reward_selfish[i] - rho*reward_honest[i])
    rhoPrime = max(lowRho - epsilon/4, 0)
    rvi = mdptoolbox.mdp.RelativeValueIteration(transitions, Wrho, epsilon/8)
    rvi.run()
    reward = rvi.average_reward
    policy = rvi.policy
    if reward > 0:
        lowRho = rho
    else:
        highRho = rho
print('alpha: ', alpha, 'upper bound reward', rho)

alpha:  0.45 lower bound reward: 0.5507888793945312
alpha:  0.45 upper bound reward 0.6507766723632813


In [None]:
alpha:  0.45
processing state 0
alpha:  0.45 lower bound reward: 0.5507888793945312
alpha:  0.45 upper bound reward 0.6507766723632813