In [7]:
import mdptoolbox
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as ss

In [8]:
maxForkLen = 75
numOfStates = (maxForkLen+1) * (maxForkLen+1) * 3
print('numOfStates: ', numOfStates)
alphaPower = 0.45
gammaRatio = 0
irrelevant = 0; relevant = 1; active = 2;
choices = 4
adopt = 0; override = 1; match = 2; wait = 3;
P = []; Rs = []; Rh = [];
for _ in range(choices):
    P.append(ss.csr_matrix(np.zeros(shape=(numOfStates, numOfStates))))
    Rs.append(ss.csr_matrix(np.zeros(shape=(numOfStates, numOfStates))))
    Rh.append(ss.csr_matrix(np.zeros(shape=(numOfStates, numOfStates))))

numOfStates:  17328


In [9]:
# generate a state to integer mapping and list of states
state_mapping = {}
states = []
count = 0
for a in range(maxForkLen+1):
    for h in range(maxForkLen+1):
        for fork in range(3):
            state_mapping[(a, h, fork)] = count
            states.append((a, h, fork))
            count += 1

In [10]:
# adopt
P[adopt][:, state_mapping[1, 0, irrelevant]] = alphaPower
P[adopt][:, state_mapping[0, 1, irrelevant]] = 1 - alphaPower
for state_index in range(numOfStates):
    if state_index % 2000 == 0:
        print('processing state', state_index)
    a, h, fork = states[state_index]
    
    # adopt rewards
    Rh[adopt][state_index, state_mapping[1, 0, irrelevant]] = h
    Rh[adopt][state_index, state_mapping[0, 1, irrelevant]] = h
    
    # override
    if a > h:
        P[override][state_index, state_mapping[a-h, 0, irrelevant]] = alphaPower
        Rs[override][state_index, state_mapping[a-h, 0, irrelevant]] = h+1
        P[override][state_index, state_mapping[a-h-1, 1, relevant]] = 1 - alphaPower
        Rs[override][state_index, state_mapping[a-h-1, 1, relevant]] = h+1
    else:
        P[override][state_index, 0] = 1
        Rh[override][state_index, 0] = 10000
        
    # wait
    if (fork != active) and (a < maxForkLen) and (h < maxForkLen):
        P[wait][state_index, state_mapping[a+1, h, irrelevant]] = alphaPower
        P[wait][state_index, state_mapping[a, h+1, relevant]] = 1 - alphaPower
    elif (fork == active) and (a > h) and (h > 0) and (a < maxForkLen) and (h < maxForkLen): 
        P[wait][state_index, state_mapping[a+1, h, active]] = alphaPower
        P[wait][state_index, state_mapping[a-h, 1, relevant]] = gammaRatio*(1-alphaPower)
        Rs[wait][state_index, state_mapping[a-h, 1, relevant]] = h
        P[wait][state_index, state_mapping[a, h+1, relevant]] = (1-gammaRatio)*(1-alphaPower)
    else:
        P[wait][state_index, 0] = 1
        Rh[wait][state_index, 0] = 10000
    
    # match
    if (fork == relevant) and (a >= h) and (h > 0) and (a < maxForkLen) and (h < maxForkLen):
        P[match][state_index, state_mapping[a+1, h, active]] = alphaPower
        P[match][state_index, state_mapping[a-h, 1, relevant]] = gammaRatio*(1-alphaPower)
        Rs[match][state_index, state_mapping[a-h, 1, relevant]] = h
        P[match][state_index, state_mapping[a, h+1, relevant]] = (1-gammaRatio)*(1-alphaPower)
    else:
        P[match][state_index, 0] = 1
        Rh[match][state_index, 0] = 10000

processing state 0
processing state 2000
processing state 4000
processing state 6000
processing state 8000
processing state 10000
processing state 12000
processing state 14000
processing state 16000


In [12]:
epsilon = 0.0001
lowRho = 0
highRho = 1
while(highRho - lowRho > epsilon/8):
    rho = (highRho + lowRho) / 2;
    print(rho, highRho, lowRho)
    Wrho = []
    for i in range(choices):
        Wrho.append((1-rho)*Rs[i] - rho*Rh[i])
    rvi = mdptoolbox.mdp.RelativeValueIteration(P, Wrho, epsilon/8)
    rvi.run()
    lowerBoundPolicy = rvi.policy
    reward = rvi.average_reward
    if reward > 0:
        lowRho = rho
    else:
        highRho = rho
print('lower bound reward:', rho)

0.5 1 0
0.75 1 0.5
0.625 0.75 0.5
0.6875 0.75 0.625
0.65625 0.6875 0.625
0.671875 0.6875 0.65625
0.6640625 0.671875 0.65625
0.66796875 0.671875 0.6640625
0.669921875 0.671875 0.66796875
0.6689453125 0.669921875 0.66796875
0.66845703125 0.6689453125 0.66796875
0.668212890625 0.66845703125 0.66796875
0.6680908203125 0.668212890625 0.66796875
0.66815185546875 0.668212890625 0.6680908203125
0.668121337890625 0.66815185546875 0.6680908203125
0.6681365966796875 0.66815185546875 0.668121337890625
0.6681442260742188 0.66815185546875 0.6681365966796875
lower bound reward: 0.6681442260742188


In [13]:
lowerBoundRho = rho
lowRho = rho
highRho = min(rho+0.1, 1)
while (highRho - lowRho) > (epsilon / 8):
    rho = (highRho + lowRho) / 2
    print(rho, highRho, lowRho)
    for state_index in range(numOfStates):
        a, h, fork = states[state_index]
        if a == maxForkLen:
            expr = (1-rho)*alphaPower*(1-alphaPower)/(1-2*alphaPower)**2+0.5*((a-h)/(1-2*alphaPower)+a+h)
            Rs[adopt][state_index, state_mapping[1, 0, irrelevant]] = expr
            Rs[adopt][state_index, state_mapping[0, 1, irrelevant]] = expr
            Rs[adopt][state_index, state_mapping[1, 0, irrelevant]] = 0
            Rs[adopt][state_index, state_mapping[0, 1, irrelevant]] = 0
        elif h == maxForkLen:
            expr1 = (1 - np.power(alphaPower/(1-alphaPower), h - a)) * (-1*rho*h)
            expr2 = np.power(alphaPower/(1-alphaPower), h - a) * (1 - rho)
            expr3 = (alphaPower * (1-alphaPower)) / (np.power(1-2*alphaPower, 2)) + (h - a) / (1- 2 * alphaPower)
            expr_total = expr1 + expr2 * expr3
            Rs[adopt][state_index, state_mapping[1, 0, irrelevant]] = expr_total
            Rs[adopt][state_index, state_mapping[0, 1, irrelevant]] = expr_total
            Rh[adopt][state_index, state_mapping[1, 0, irrelevant]] = 0
            Rh[adopt][state_index, state_mapping[0, 1, irrelevant]] = 0
    Wrho = []
    for i in range(choices):
        Wrho.append((1-rho)*Rs[i] - rho*Rh[i])
    rhoPrime = max(lowRho - epsilon/4, 0)
    rvi = mdptoolbox.mdp.RelativeValueIteration(P, Wrho, epsilon/8)
    rvi.run()
    reward = rvi.average_reward
    policy = rvi.policy
    if reward > 0:
        lowRho = rho
    else:
        highRho = rho
print('upper bound reward', rho)

0.7181442260742188 0.7681442260742187 0.6681442260742188
0.6931442260742188 0.7181442260742188 0.6681442260742188
0.6806442260742187 0.6931442260742188 0.6681442260742188
0.6743942260742187 0.6806442260742187 0.6681442260742188
0.6712692260742188 0.6743942260742187 0.6681442260742188
0.6697067260742188 0.6712692260742188 0.6681442260742188
0.6704879760742188 0.6712692260742188 0.6697067260742188
0.6708786010742188 0.6712692260742188 0.6704879760742188
0.6706832885742189 0.6708786010742188 0.6704879760742188
0.6705856323242189 0.6706832885742189 0.6704879760742188
0.6705368041992188 0.6705856323242189 0.6704879760742188
0.6705612182617189 0.6705856323242189 0.6705368041992188
0.6705490112304688 0.6705612182617189 0.6705368041992188
upper bound reward 0.6705490112304688


In [17]:
for alpha in np.linspace(0.05, 0.45, 9):
    maxForkLen = 75
    numOfStates = (maxForkLen+1) * (maxForkLen+1) * 3
    print('alpha: ', alpha)
    alphaPower = alpha
    gammaRatio = 0
    irrelevant = 0; relevant = 1; active = 2;
    choices = 4
    adopt = 0; override = 1; match = 2; wait = 3;
    P = []; Rs = []; Rh = [];
    for _ in range(choices):
        P.append(ss.csr_matrix(np.zeros(shape=(numOfStates, numOfStates))))
        Rs.append(ss.csr_matrix(np.zeros(shape=(numOfStates, numOfStates))))
        Rh.append(ss.csr_matrix(np.zeros(shape=(numOfStates, numOfStates))))
    # generate a state to integer mapping and list of states
    state_mapping = {}
    states = []
    count = 0
    for a in range(maxForkLen+1):
        for h in range(maxForkLen+1):
            for fork in range(3):
                state_mapping[(a, h, fork)] = count
                states.append((a, h, fork))
                count += 1
    # adopt
    P[adopt][:, state_mapping[1, 0, irrelevant]] = alphaPower
    P[adopt][:, state_mapping[0, 1, irrelevant]] = 1 - alphaPower
    for state_index in range(numOfStates):
        if state_index % 2000 == 0:
            print('processing state', state_index)
        a, h, fork = states[state_index]

        # adopt rewards
        Rh[adopt][state_index, state_mapping[1, 0, irrelevant]] = h
        Rh[adopt][state_index, state_mapping[0, 1, irrelevant]] = h

        # override
        if a > h:
            P[override][state_index, state_mapping[a-h, 0, irrelevant]] = alphaPower
            Rs[override][state_index, state_mapping[a-h, 0, irrelevant]] = h+1
            P[override][state_index, state_mapping[a-h-1, 1, relevant]] = 1 - alphaPower
            Rs[override][state_index, state_mapping[a-h-1, 1, relevant]] = h+1
        else:
            P[override][state_index, 0] = 1
            Rh[override][state_index, 0] = 10000

        # wait
        if (fork != active) and (a < maxForkLen) and (h < maxForkLen):
            P[wait][state_index, state_mapping[a+1, h, irrelevant]] = alphaPower
            P[wait][state_index, state_mapping[a, h+1, relevant]] = 1 - alphaPower
        elif (fork == active) and (a > h) and (h > 0) and (a < maxForkLen) and (h < maxForkLen): 
            P[wait][state_index, state_mapping[a+1, h, active]] = alphaPower
            P[wait][state_index, state_mapping[a-h, 1, relevant]] = gammaRatio*(1-alphaPower)
            Rs[wait][state_index, state_mapping[a-h, 1, relevant]] = h
            P[wait][state_index, state_mapping[a, h+1, relevant]] = (1-gammaRatio)*(1-alphaPower)
        else:
            P[wait][state_index, 0] = 1
            Rh[wait][state_index, 0] = 10000

        # match
        if (fork == relevant) and (a >= h) and (h > 0) and (a < maxForkLen) and (h < maxForkLen):
            P[match][state_index, state_mapping[a+1, h, active]] = alphaPower
            P[match][state_index, state_mapping[a-h, 1, relevant]] = gammaRatio*(1-alphaPower)
            Rs[match][state_index, state_mapping[a-h, 1, relevant]] = h
            P[match][state_index, state_mapping[a, h+1, relevant]] = (1-gammaRatio)*(1-alphaPower)
        else:
            P[match][state_index, 0] = 1
            Rh[match][state_index, 0] = 10000
    epsilon = 0.0001
    lowRho = 0
    highRho = 1
    while(highRho - lowRho > epsilon/8):
        rho = (highRho + lowRho) / 2;
        Wrho = []
        for i in range(choices):
            Wrho.append((1-rho)*Rs[i] - rho*Rh[i])
        rvi = mdptoolbox.mdp.RelativeValueIteration(P, Wrho, epsilon/8)
        rvi.run()
        lowerBoundPolicy = rvi.policy
        reward = rvi.average_reward
        if reward > 0:
            lowRho = rho
        else:
            highRho = rho
    print('lower bound reward:', rho)
    lowerBoundRho = rho
    lowRho = rho
    highRho = min(rho+0.1, 1)
    while (highRho - lowRho) > (epsilon / 8):
        rho = (highRho + lowRho) / 2
        for state_index in range(numOfStates):
            a, h, fork = states[state_index]
            if a == maxForkLen:
                expr = (1-rho)*alphaPower*(1-alphaPower)/(1-2*alphaPower)**2+0.5*((a-h)/(1-2*alphaPower)+a+h)
                Rs[adopt][state_index, state_mapping[1, 0, irrelevant]] = expr
                Rs[adopt][state_index, state_mapping[0, 1, irrelevant]] = expr
                Rs[adopt][state_index, state_mapping[1, 0, irrelevant]] = 0
                Rs[adopt][state_index, state_mapping[0, 1, irrelevant]] = 0
            elif h == maxForkLen:
                expr1 = (1 - np.power(alphaPower/(1-alphaPower), h - a)) * (-1*rho*h)
                expr2 = np.power(alphaPower/(1-alphaPower), h - a) * (1 - rho)
                expr3 = (alphaPower * (1-alphaPower)) / (np.power(1-2*alphaPower, 2)) + (h - a) / (1- 2 * alphaPower)
                expr_total = expr1 + expr2 * expr3
                Rs[adopt][state_index, state_mapping[1, 0, irrelevant]] = expr_total
                Rs[adopt][state_index, state_mapping[0, 1, irrelevant]] = expr_total
                Rh[adopt][state_index, state_mapping[1, 0, irrelevant]] = 0
                Rh[adopt][state_index, state_mapping[0, 1, irrelevant]] = 0
        Wrho = []
        for i in range(choices):
            Wrho.append((1-rho)*Rs[i] - rho*Rh[i])
        rhoPrime = max(lowRho - epsilon/4, 0)
        rvi = mdptoolbox.mdp.RelativeValueIteration(P, Wrho, epsilon/8)
        rvi.run()
        reward = rvi.average_reward
        policy = rvi.policy
        if reward > 0:
            lowRho = rho
        else:
            highRho = rho
    print('upper bound reward', rho)

alpha:  0.05
processing state 0
processing state 2000
processing state 4000
processing state 6000
processing state 8000
processing state 10000
processing state 12000
processing state 14000
processing state 16000
lower bound reward: 0.04999542236328125
upper bound reward 0.05000762939453125
alpha:  0.1
processing state 0
processing state 2000
processing state 4000
processing state 6000
processing state 8000
processing state 10000
processing state 12000
processing state 14000
processing state 16000
lower bound reward: 0.09999847412109375
upper bound reward 0.10001068115234375
alpha:  0.15000000000000002
processing state 0
processing state 2000
processing state 4000
processing state 6000
processing state 8000
processing state 10000
processing state 12000
processing state 14000
processing state 16000
lower bound reward: 0.15000152587890625
upper bound reward 0.15001373291015624
alpha:  0.2
processing state 0
processing state 2000
processing state 4000
processing state 6000
processing state

KeyboardInterrupt: 