In [67]:
import mdptoolbox
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as ss

In [68]:
epsilon = 10e-5
T = 70
gamma = 0
alpha = 0.4

In [69]:
# the numbers of states is (T+1)*(T+1)*3 because each chain can be up to T length and there are 3 fork states.
num_states = (T+1)*(T+1)*3

# generate a state to integer mapping and list of states
state_mapping = {}
states = []
count = 0
for a in range(T+1):
    for h in range(T+1):
        for fork in ['irrelevant', 'relevant', 'active']:
            state_mapping[(a, h, fork)] = count
            states.append((a, h, fork))
            count += 1

In [70]:
def getAdoptMatrices(rho):
    # creating the adopt transition & reward matrices
    adopt_transitions = np.zeros(shape = (num_states, num_states))
    adopt_rewards = np.zeros(shape = (num_states, num_states))

    # each adopt matrix only can map to (1,0,irrelevant) or (0,1,irrelevant)
    adopt_new_state_1_index = state_mapping[(1, 0, 'irrelevant')]
    adopt_new_state_2_index = state_mapping[(0, 1, 'irrelevant')]
    for state_index in range(num_states):
        state = states[state_index]
        adopt_transitions[state_index, adopt_new_state_1_index] = alpha
        adopt_transitions[state_index, adopt_new_state_2_index] = 1 - alpha
        adopt_rewards[state_index, adopt_new_state_1_index] = rho * state[1]
        adopt_rewards[state_index, adopt_new_state_2_index] = rho * state[1]

    # making matrices sparse
    return ss.csr_matrix(adopt_transitions), ss.csr_matrix(adopt_rewards)

In [74]:
def getOverrideMatrices(rho):
    # creating the override transition & reward matrices
    override_transitions = np.zeros(shape = (num_states, num_states))
    override_rewards = np.zeros(shape = (num_states, num_states))

    for state_index in range(num_states):
        state = states[state_index]
        # a > h, which must be true for override to succeed
        if state[0] > state[1]:
            # (a-h, 0, irrelevant)
            new_state_1 = (state[0]-state[1], 0, 'irrelevant')
            # (a-h-1, 1, relevant)
            new_state_2 = (state[0]-state[1]-1, 1, 'relevant')
            override_transitions[state_index, state_mapping[new_state_1]] = alpha
            override_transitions[state_index, state_mapping[new_state_2]] = 1 - alpha
            override_rewards[state_index, state_mapping[new_state_1]] = (1 - rho) * (state[1] + 1)
            override_rewards[state_index, state_mapping[new_state_2]] = (1 - rho) * (state[1] + 1)
        else:
            # filling in remainder of array.
            override_transitions[state_index, 0] = 1
            override_rewards[state_index, 0] = -1*rho*1000

    # making matrices sparse
    return ss.csr_matrix(override_transitions), ss.csr_matrix(override_rewards)

In [76]:
def getWaitMatrices(rho):
    # creating the wait transition & reward matrices
    wait_transitions = np.zeros(shape = (num_states, num_states))
    wait_rewards = np.zeros(shape = (num_states, num_states))

    for state_index in range(num_states):
        state = states[state_index]
        # ignore truncated states
        if (state[0] == T or state[1] == T):
            ### FIX THIS
            wait_transitions[state_index, 0] = 1
            continue

        # irrelevant or relevant
        if (state[2] == 'irrelevant') or (state[2] == 'relevant'):
            # (a+1, h, irrelevant)
            new_state_1 = (state[0] + 1, state[1], 'irrelevant')
            # (a, h+1, relevant)
            new_state_2 = (state[0], state[1] + 1, 'relevant')
            wait_transitions[state_index, state_mapping[new_state_1]] = alpha
            wait_transitions[state_index, state_mapping[new_state_2]] = 1 - alpha
        # active
        else:
            # a >= h
            if state[0] >= state[1]: 
                # (a+1, h, active)
                new_state_1 = (state[0] + 1, state[1], 'active')
                # (a-h, 1, relevant)
                new_state_2 = (state[0] - state[1], 1, 'relevant')
                # (a, h+1, relevant)
                new_state_3 = (state[0], state[1] + 1, 'relevant')
                wait_transitions[state_index, state_mapping[new_state_1]] = alpha
                wait_transitions[state_index, state_mapping[new_state_2]] = gamma * (1 - alpha)
                wait_transitions[state_index, state_mapping[new_state_3]] = (1 - gamma) * (1 - alpha)
                wait_rewards[state_index, state_mapping[new_state_2]] = (1 - rho) * state[1]
            else:
                wait_transitions[state_index, 0] = 1
                wait_rewards[state_index, 0] = -1*rho*1000

    # making matrices sparse
    return ss.csr_matrix(wait_transitions), ss.csr_matrix(wait_rewards)

In [79]:
def getMatchMatrices(rho):
    # creating the match transition & rewards matrices
    match_transitions = np.zeros(shape = (num_states, num_states))
    match_rewards = np.zeros(shape = (num_states, num_states))

    for state_index in range(num_states):
        state = states[state_index]
        # ignore truncated states
        if (state[0] == T or state[1] == T):
            #### FIX THIS
            match_transitions[state_index, 0] = 1
            continue


        # a >= h and relevant
        if (state[0] >= state[1]) and (state[2] == 'relevant'):
            # (a+1, h, active)
            new_state_1 = (state[0] + 1, state[1], 'active')
            # (a-h, 1, relevant)
            new_state_2 = (state[0] - state[1], 1, 'relevant')
            # (a, h+1, relevant)
            new_state_3 = (state[0], state[1] + 1, 'relevant')
            match_transitions[state_index, state_mapping[new_state_1]] = alpha
            match_transitions[state_index, state_mapping[new_state_2]] = gamma * (1 - alpha)
            match_transitions[state_index, state_mapping[new_state_3]] = (1 - gamma) * (1 - alpha)
            match_rewards[state_index, state_mapping[new_state_2]] = (1 - rho) * state[1]
        else:
            match_transitions[state_index, 0] = 1
            match_rewards[state_index, 0] = -1*rho*1000

    # making matrices sparse
    return ss.csr_matrix(match_transitions), ss.csr_matrix(match_rewards)

In [54]:
transition_matrix = np.asarray([adopt_transitions, override_transitions, wait_transitions, match_transitions])
reward_matrix = np.asarray([adopt_rewards, override_rewards, wait_rewards, match_rewards])

In [56]:
rvi = mdptoolbox.mdp.RelativeValueIteration(transition_matrix, reward_matrix)

In [57]:
rvi.run()

In [58]:
rvi.average_reward

0.5