In [1]:
import environmentv7 as e
from honestagentv7 import HonestAgent
import mdptoolbox
import matplotlib.pyplot as plt
import numpy as np
import progressbar as pb
import scipy.sparse as ss
import seaborn as sns
from selfishagentv7 import SelfishAgent
import warnings
warnings.filterwarnings('ignore', category=ss.SparseEfficiencyWarning)

In [2]:
# params
alpha = 0.45
gamma = 0.5
T = 8            # a and h each range over 0..T
epsilon = 10e-5  # note: 10e-5 == 1e-4 (not 1e-5)

# game: action indices (one transition/reward matrix per action)
action_count = 4
adopt, override, mine, match = range(action_count)

# fork params: values of the third state component
fork_count = 3
irrelevant, relevant, active = range(fork_count)

# every (a, h, fork) combination is one state
state_count = (T + 1) * (T + 1) * fork_count

# mapping utils: `states[i]` is the (a, h, fork) tuple of state i and
# `state_mapping` is the inverse lookup. Enumeration order is a outermost,
# fork innermost.
states = [(a, h, fork)
          for a in range(T + 1)
          for h in range(T + 1)
          for fork in range(fork_count)]
state_mapping = {state: index for index, state in enumerate(states)}
count = len(states)  # kept for backward compatibility: number of states enumerated
assert count == state_count

# initialize matrices: one empty (all-zero) float64 CSR matrix per action,
# created directly in sparse form instead of going through a dense array.
transitions = [ss.csr_matrix((state_count, state_count)) for _ in range(action_count)]
rewards = [ss.csr_matrix((state_count, state_count)) for _ in range(action_count)]

In [3]:
def get_opt_policy(alpha, gamma, T, mining_cost):
    """Build the mining MDP for the given parameters and solve it.

    The state space is every (a, h, fork) with 0 <= a, h <= T and
    0 <= fork < fork_count, enumerated with a outermost and fork innermost.
    The transition and reward matrices are built locally from the `T`
    argument on every call, so the function no longer reads or writes the
    module-level `transitions` / `rewards` / `states` / `state_mapping`
    objects and works for any T, not only the module-level one.

    It still reads these module-level names: the action indices (`adopt`,
    `override`, `mine`, `match`), the fork values (`irrelevant`, `relevant`,
    `active`), `action_count`, `fork_count` and `epsilon`.

    Parameters
    ----------
    alpha : float
        Probability given to the (a+1, h, .) successor of a `mine`/`match`
        step (presumably the agent's share of mining power -- confirm).
    gamma : float
        During an active fork, the share of the remaining (1 - alpha) mass
        that moves to (a-h, 1, relevant) and pays reward h.
    T : int
        Largest value of a and h. `mine` and `match` are only legal while
        both a < T and h < T.
    mining_cost : float
        Every legal `mine`/`match` transition reward is reduced by
        alpha * mining_cost.

    Returns
    -------
    numpy.ndarray
        Integer action indices of shape (T+1, T+1, fork_count), indexed as
        policy[a, h, fork].
    """
    side = T + 1
    local_states = [(a, h, fork)
                    for a in range(side)
                    for h in range(side)
                    for fork in range(fork_count)]
    index_of = {state: idx for idx, state in enumerate(local_states)}
    n_states = len(local_states)

    origin = index_of[(0, 0, irrelevant)]  # index 0: target of adopt and of illegal moves
    illegal_reward = -10000                # large penalty so the solver never picks an illegal action
    step_cost = -1 * alpha * mining_cost

    # LIL is the sparse format meant for element-wise construction; the
    # matrices are converted to CSR before they are handed to the solver.
    P = [ss.lil_matrix((n_states, n_states)) for _ in range(action_count)]
    R = [ss.lil_matrix((n_states, n_states)) for _ in range(action_count)]

    def _forbid(action, row):
        # Illegal action in this state: go to the origin state with a huge penalty.
        P[action][row, origin] = 1
        R[action][row, origin] = illegal_reward

    def _fork_race(action, row, a, h):
        # Three-way outcome shared by `match` and by `mine` during an active fork.
        agent_extends = index_of[(a + 1, h, active)]
        fork_resolved = index_of[(a - h, 1, relevant)]
        other_extends = index_of[(a, h + 1, relevant)]
        P[action][row, agent_extends] = alpha
        P[action][row, fork_resolved] = (1 - alpha) * gamma
        P[action][row, other_extends] = (1 - alpha) * (1 - gamma)
        R[action][row, agent_extends] = step_cost
        R[action][row, fork_resolved] = h - alpha * mining_cost
        R[action][row, other_extends] = step_cost

    for row, (a, h, fork) in enumerate(local_states):
        in_bounds = (a < T) and (h < T)

        # adopt: always legal, resets to the origin state with zero reward
        P[adopt][row, origin] = 1

        # override: needs a strict lead; pays h + 1 and keeps the surplus a-h-1
        if a > h:
            target = index_of[(a - h - 1, 0, irrelevant)]
            P[override][row, target] = 1
            R[override][row, target] = h + 1
        else:
            _forbid(override, row)

        # mine
        if (fork != active) and in_bounds:
            agent_extends = index_of[(a + 1, h, irrelevant)]
            other_extends = index_of[(a, h + 1, relevant)]
            P[mine][row, agent_extends] = alpha
            P[mine][row, other_extends] = (1 - alpha)
            R[mine][row, agent_extends] = step_cost
            R[mine][row, other_extends] = step_cost
        elif (fork == active) and (a > h) and (h > 0) and in_bounds:
            _fork_race(mine, row, a, h)
        else:
            _forbid(mine, row)

        # match
        if (fork == relevant) and (a >= h) and (h > 0) and in_bounds:
            _fork_race(match, row, a, h)
        else:
            _forbid(match, row)

    rvi = mdptoolbox.mdp.RelativeValueIteration(
        [matrix.tocsr() for matrix in P],
        [matrix.tocsr() for matrix in R],
        epsilon / 8,
    )
    rvi.run()
    return np.reshape(rvi.policy, (side, side, fork_count))

In [4]:
def processPolicy(policy):
    """Print a policy as the body rows of a LaTeX table.

    `policy` may be a flat sequence of action indices in (a, h, fork) order
    (a outermost, fork innermost) or an array of shape (side, side, 3) such
    as the one returned by `get_opt_policy`; it is flattened in C order
    either way. The grid size `side` is inferred from the number of entries
    instead of being fixed at 9.

    Each (a, h) cell is written as three letters, one per fork value,
    followed by ' & '; each value of a ends its row with a LaTeX line break.
    Action indices 0, 1, 2, 3 are written as 'a', 'o', 'w', 'm'.

    Raises
    ------
    ValueError
        If the number of entries is not 3 * side**2 for some side >= 1.
    """
    forks = 3
    symbols = {0: 'a', 1: 'o', 2: 'w', 3: 'm'}

    # .tolist() yields plain Python scalars, so the dict lookup below behaves
    # like the `==` comparisons it replaces.
    flat = np.asarray(policy).reshape(-1).tolist()
    side = int(round((len(flat) / forks) ** 0.5))
    if side < 1 or side * side * forks != len(flat):
        raise ValueError(
            'policy must contain 3 * side**2 entries, got {}'.format(len(flat)))

    pieces = []
    position = 0
    for a in range(side):
        for h in range(side):
            for fork in range(forks):
                symbol = symbols.get(flat[position])
                position += 1
                if symbol is None:
                    # Unknown action index: flagged on stdout and left out of
                    # the table (same as before).
                    print('here')
                else:
                    pieces.append(symbol)
            pieces.append(' & ')
        pieces.append('\\\\ \n')
    print(''.join(pieces))

In [5]:
# Parameters for the simulation cells below.
# alpha is lowered to 0.4 here (the setup cell started with 0.45); the other
# three values repeat the setup cell.
alpha, gamma = 0.4, 0.5
T, epsilon = 8, 10e-5  # 10e-5 == 1e-4

In [8]:
# simulation: for each mining cost, solve the MDP and replay its optimal
# policy in the environment, reporting the average reward per action.
length = int(1e6)
# linspace fixes the number of points (11 costs, 0.0 .. 1.0) explicitly; with
# a float step, np.arange's length depends on how (stop - start) / step rounds.
mining_costs = np.linspace(0.0, 1.0, 11)
# (mining_cost, reward per action) pairs, kept so the sweep can be plotted or
# compared later instead of living only in the printed output.
optimal_rewards = []

# NOTE(review): no random seed is set in this cell; if Environment draws from
# a global RNG the printed figures will differ between runs -- confirm and seed.
bar = pb.ProgressBar()
for mining_cost in bar(mining_costs):
    optimal_policy = get_opt_policy(alpha, gamma, T, mining_cost)
    env = e.Environment(alpha, gamma, T, mining_cost)
    # simulation
    inner_bar = pb.ProgressBar()
    env.reset()
    current_reward = 0
    for _ in inner_bar(range(length)):
        a, h, fork = env.current_state
        action = optimal_policy[(a, h, fork)]  # policy is indexed [a, h, fork]
        _, reward = env.takeAction(action)
        current_reward += reward
    reward_per_action = current_reward / length
    optimal_rewards.append((mining_cost, reward_per_action))
    print('mining cost {:.04f}, reward per action {:.06f}'.format(mining_cost, reward_per_action))

100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:36 Time:  0:00:36
  9% (1 of 11) |##                       | Elapsed Time: 0:00:36 ETA:   0:06:06

mining cost 0.0000, reward per action 0.220834


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:36 Time:  0:00:36
 18% (2 of 11) |####                     | Elapsed Time: 0:01:13 ETA:   0:05:32

mining cost 0.1000, reward per action 0.190290


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:33 Time:  0:00:33
 27% (3 of 11) |######                   | Elapsed Time: 0:01:47 ETA:   0:04:30

mining cost 0.2000, reward per action 0.156051


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:33 Time:  0:00:33
 36% (4 of 11) |#########                | Elapsed Time: 0:02:20 ETA:   0:03:53

mining cost 0.3000, reward per action 0.139121


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:33 Time:  0:00:33
 45% (5 of 11) |###########              | Elapsed Time: 0:02:54 ETA:   0:03:22

mining cost 0.4000, reward per action 0.118128


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:37 Time:  0:00:37
 54% (6 of 11) |#############            | Elapsed Time: 0:03:32 ETA:   0:03:08

mining cost 0.5000, reward per action 0.086993


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:34 Time:  0:00:34
 63% (7 of 11) |###############          | Elapsed Time: 0:04:08 ETA:   0:02:23

mining cost 0.6000, reward per action 0.073113


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:32 Time:  0:00:32
 72% (8 of 11) |##################       | Elapsed Time: 0:04:41 ETA:   0:01:39

mining cost 0.7000, reward per action 0.044188


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:32 Time:  0:00:32
 81% (9 of 11) |####################     | Elapsed Time: 0:05:14 ETA:   0:01:06

mining cost 0.8000, reward per action 0.039382


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:30 Time:  0:00:30
 90% (10 of 11) |#####################   | Elapsed Time: 0:05:45 ETA:   0:00:30

mining cost 0.9000, reward per action 0.019600


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:24 Time:  0:00:24
100% (11 of 11) |########################| Elapsed Time: 0:06:10 Time:  0:06:10


mining cost 1.0000, reward per action 0.000000


In [6]:
# simulation: run the HonestAgent baseline in the environment for each mining
# cost and report the average reward per action.
length = int(1e6)
# 11 evenly spaced costs from 0.0 to 1.0. linspace states the count directly,
# whereas a float-step np.arange can gain or lose the end point through rounding.
mining_costs = np.linspace(0.0, 1.0, 11)
honest_agent = HonestAgent()
# (mining_cost, reward per action) pairs collected for later comparison.
honest_rewards = []

# NOTE(review): no random seed is set in this cell; if Environment draws from
# a global RNG the printed figures will differ between runs -- confirm and seed.
bar = pb.ProgressBar()
for mining_cost in bar(mining_costs):
    env = e.Environment(alpha, gamma, T, mining_cost)
    # simulation
    inner_bar = pb.ProgressBar()
    env.reset()
    current_reward = 0
    for _ in inner_bar(range(length)):
        a, h, fork = env.current_state
        action = honest_agent.act((a, h, fork))
        _, reward = env.takeAction(action)
        current_reward += reward
    reward_per_action = current_reward / length
    honest_rewards.append((mining_cost, reward_per_action))
    print('mining cost {:.04f}, reward per action {:.06f}'.format(mining_cost, reward_per_action))

100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:26 Time:  0:00:26
  0% (5287 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:37

mining cost 0.0000, reward per action 0.199354


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:28 Time:  0:00:28
  0% (8182 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:24

mining cost 0.1000, reward per action 0.180698


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:26 Time:  0:00:26
  0% (7732 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:25

mining cost 0.2000, reward per action 0.159946


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:27 Time:  0:00:27
  0% (9167 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:21

mining cost 0.3000, reward per action 0.140660


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:29 Time:  0:00:29
  0% (6105 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:32

mining cost 0.4000, reward per action 0.120178


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:27 Time:  0:00:27
  0% (6342 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:31

mining cost 0.5000, reward per action 0.099535


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:28 Time:  0:00:28
  0% (7322 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:27

mining cost 0.6000, reward per action 0.079739


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:29 Time:  0:00:29
  0% (6709 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:29

mining cost 0.7000, reward per action 0.060014


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:26 Time:  0:00:26
  0% (6707 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:29

mining cost 0.8000, reward per action 0.039963


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:27 Time:  0:00:27
  0% (1493 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:01:07

mining cost 0.9000, reward per action 0.020398


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:27 Time:  0:00:27
100% (11 of 11) |########################| Elapsed Time: 0:05:04 Time:  0:05:04


mining cost 1.0000, reward per action -0.000011


In [6]:
# simulation: run the SelfishAgent baseline in the environment for each mining
# cost and report the average reward per action.
length = int(1e6)
# 11 evenly spaced costs from 0.0 to 1.0. linspace states the count directly,
# whereas a float-step np.arange can gain or lose the end point through rounding.
mining_costs = np.linspace(0.0, 1.0, 11)
selfish_agent = SelfishAgent(T)
# (mining_cost, reward per action) pairs collected for later comparison.
selfish_rewards = []

# NOTE(review): no random seed is set in this cell; if Environment draws from
# a global RNG the printed figures will differ between runs -- confirm and seed.
bar = pb.ProgressBar()
for mining_cost in bar(mining_costs):
    env = e.Environment(alpha, gamma, T, mining_cost)
    # simulation
    inner_bar = pb.ProgressBar()
    env.reset()
    current_reward = 0
    for _ in inner_bar(range(length)):
        a, h, fork = env.current_state
        action = selfish_agent.act((a, h, fork))
        _, reward = env.takeAction(action)
        current_reward += reward
    reward_per_action = current_reward / length
    selfish_rewards.append((mining_cost, reward_per_action))
    print('mining cost {:.04f}, reward per action {:.06f}'.format(mining_cost, reward_per_action))

100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:34 Time:  0:00:34
  0% (7884 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:25

mining cost 0.0000, reward per action 0.230558


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:31 Time:  0:00:31
  0% (6033 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:33

mining cost 0.1000, reward per action 0.205044


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:28 Time:  0:00:28
                                                                               

mining cost 0.2000, reward per action 0.176130


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:28 Time:  0:00:28
  0% (6298 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:31

mining cost 0.3000, reward per action 0.149347


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:29 Time:  0:00:29
  0% (5861 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:34

mining cost 0.4000, reward per action 0.121975


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:30 Time:  0:00:30
                                                                               

mining cost 0.5000, reward per action 0.093110


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:31 Time:  0:00:31
  0% (5771 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:34

mining cost 0.6000, reward per action 0.066472


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:29 Time:  0:00:29
  0% (3067 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:32

mining cost 0.7000, reward per action 0.039339


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:30 Time:  0:00:30
  0% (3634 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:27

mining cost 0.8000, reward per action 0.011908


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:29 Time:  0:00:29
  0% (6946 of 1000000) |                 | Elapsed Time: 0:00:00 ETA:   0:00:28

mining cost 0.9000, reward per action -0.016083


100% (1000000 of 1000000) |##############| Elapsed Time: 0:00:29 Time:  0:00:29
100% (11 of 11) |########################| Elapsed Time: 0:05:33 Time:  0:05:33


mining cost 1.0000, reward per action -0.044013
