In [1]:
import mdptoolbox
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as ss
import warnings
warnings.filterwarnings('ignore', category=ss.SparseEfficiencyWarning)

In [4]:
# params
alpha = 0.4                    # probability written on the (a+1, h) "mine" transition
T = 9                          # a and h each range over 0..T
state_count = (T+1) * (T+1)    # one state per (a, h) pair
epsilon = None                 # no tolerance is chosen at notebook level

# game
action_count = 3
adopt, override, mine = 0, 1, 2   # action indices into the per-action matrix lists

# mapping utils
# states[i] is the (a, h) pair with index i; state_mapping is the inverse lookup.
# Pairs are enumerated with `a` as the outer index, so index == a * (T+1) + h.
states = [(a, h) for a in range(T+1) for h in range(T+1)]
state_mapping = {state: index for index, state in enumerate(states)}
count = len(states)  # kept for cells that still read the running counter

# initialize matrices
# One empty (state_count x state_count) float64 CSR matrix per action.
# Passing the shape tuple builds the empty matrix directly, without first
# allocating a dense array of zeros.
matrix_shape = (state_count, state_count)
transitions = [ss.csr_matrix(matrix_shape, dtype=np.float64) for _ in range(action_count)]
reward_selfish = [ss.csr_matrix(matrix_shape, dtype=np.float64) for _ in range(action_count)]
reward_honest = [ss.csr_matrix(matrix_shape, dtype=np.float64) for _ in range(action_count)]

In [None]:
# Fill, for every state (a, h), the transition probabilities and the
# selfish/honest reward entries of each of the three actions.
origin = state_mapping[0, 0]

# Value written into reward_honest when an action is not permitted in a state
# (override without a > h, or mine once a or h has reached T). The state is
# sent back to index 0 in that case.
invalid_action_penalty = 10000

for state_index, (a, h) in enumerate(states):
    # adopt: always moves to (0, 0); h is recorded in the honest reward
    transitions[adopt][state_index, origin] = 1
    reward_honest[adopt][state_index, origin] = h

    # override: only permitted when a > h; moves to (a-h-1, 0) and records
    # h+1 in the selfish reward
    if a > h:
        next_index = state_mapping[a-h-1, 0]
        transitions[override][state_index, next_index] = 1
        reward_selfish[override][state_index, next_index] = h+1
    else:
        transitions[override][state_index, origin] = 1
        reward_honest[override][state_index, origin] = invalid_action_penalty

    # mine: (a+1, h) with probability alpha, (a, h+1) with probability
    # 1 - alpha, as long as neither coordinate has reached T
    if (a < T) and (h < T):
        transitions[mine][state_index, state_mapping[a+1, h]] = alpha
        transitions[mine][state_index, state_mapping[a, h+1]] = 1 - alpha
    else:
        transitions[mine][state_index, origin] = 1
        reward_honest[mine][state_index, origin] = invalid_action_penalty

    def getRhoBounds(self):
        """Bisect rho over [0, 1] and print the policy of the final solve.

        Each step takes the midpoint rho of the current bracket, builds the
        per-action reward (1 - rho) * reward_selfish - rho * reward_honest,
        and solves it with mdptoolbox's RelativeValueIteration. A positive
        average reward moves the lower end of the bracket up to rho;
        otherwise the upper end moves down to rho. The search stops once the
        bracket is no wider than epsilon / 8.

        Prints the bracket and midpoint at every step, then alpha together
        with the last rho tried, and finally passes the last solver's policy
        to processPolicy. Returns None.
        """
        tolerance = self.epsilon / 8
        lo, hi = 0, 1
        while (hi - lo) > tolerance:
            rho = (lo + hi) / 2
            print(lo, hi, rho)
            total_reward = [
                (1-rho)*self.reward_selfish[i] - rho*self.reward_honest[i]
                for i in range(self.action_count)
            ]
            # third positional argument is the solver's own stopping epsilon
            rvi = mdptoolbox.mdp.RelativeValueIteration(self.transitions, total_reward, tolerance)
            rvi.run()
            if rvi.average_reward > 0:
                lo = rho
            else:
                hi = rho
        opt_policy = rvi.policy
        print('alpha: ', self.alpha, 'lower bound reward:', rho)

        self.processPolicy(opt_policy)
        
    def processPolicy(self, policy):
        """Print the policy for a, h in 0..8 as '&'-separated table rows.

        Each row starts with the value of a, followed by one letter per h:
        'a' for action 0, 'o' for action 1 and 'w' for action 2. Cells are
        joined with ' & ' and every row ends with a double backslash, a
        space and a newline. The whole table is emitted with a single print.

        policy is indexed by self.state_mapping[(a, h)]. An AssertionError is
        raised if an entry is not one of 0, 1, 2.
        """
        def letter_for(action):
            return 'a' if action == 0 else ('o' if action == 1 else 'w')

        rows = []
        for a in range(9):
            cells = [str(a)]
            for h in range(9):
                action = policy[self.state_mapping[(a, h)]]
                assert(action in [0, 1, 2])
                cells.append(letter_for(action))
            # single space before the row terminator matches the layout
            # produced by trimming a trailing '& '
            rows.append(' & '.join(cells) + ' ' + '\\\\ \n')
        print(''.join(rows))

def main():
    """Run the rho search once for each configured alpha.

    For every alpha the value is printed, a SelfishMDP is built with T=9 and
    epsilon=10e-5, and its getRhoBounds() is called.

    NOTE(review): SelfishMDP is not defined in the visible part of this file;
    it has to be defined or imported before main() runs.
    """
    truncation = 9
    tolerance = 10e-5  # i.e. 1e-4
    for alpha in (0.4,):
        print(alpha)
        SelfishMDP(alpha=alpha, T=truncation, epsilon=tolerance).getRhoBounds()

if __name__ == "__main__":
    main()