In [7]:
import numpy as np
import tqdm

NUM_ACTIONS = 3  # number of actions available to each player


class Player:
    """Regret-matching learner for a cyclic zero-sum game with ``NUM_ACTIONS`` actions.

    Payoffs are cyclic (see ``update_regret``): action ``(k + 1) % NUM_ACTIONS``
    beats action ``k``, which is the rock-paper-scissors structure.

    Attributes:
        strategy: Current mixed strategy; overwritten in place by ``get_strategy``.
        regret_sum: Cumulative regret for each action.
        strategy_sum: Sum of every strategy returned by ``get_strategy``; used
            to compute the average strategy.
    """

    def __init__(self, rng=None):
        """Create a player with zero regret and no accumulated strategy.

        Args:
            rng: Optional ``numpy.random.Generator`` used to sample actions.
                Pass a seeded generator for reproducible runs. When ``None``
                (the default) the global ``np.random`` state is used.
        """
        self._rng = rng
        self.strategy = np.zeros(NUM_ACTIONS)
        self.regret_sum = np.zeros(NUM_ACTIONS)
        self.strategy_sum = np.zeros(NUM_ACTIONS)

    def get_strategy(self):
        """Recompute the current strategy from the positive part of the regrets.

        Each action gets probability proportional to its positive cumulative
        regret; if no action has positive regret the strategy is uniform.

        Side effect: the resulting strategy is added to ``strategy_sum``, so
        every call counts as one iteration towards the average strategy.

        Returns:
            ``self.strategy`` (the live array, updated in place), summing to 1.
        """
        positive_regret = np.where(self.regret_sum > 0, self.regret_sum, 0.0)
        normalizing_sum = positive_regret.sum()
        if normalizing_sum > 0:
            # Write in place so references to ``self.strategy`` stay valid.
            self.strategy[:] = positive_regret / normalizing_sum
        else:
            self.strategy[:] = 1.0 / NUM_ACTIONS
        self.strategy_sum += self.strategy
        return self.strategy

    def get_action(self):
        """Sample an action index from the current strategy.

        Calls ``get_strategy``, so this also advances ``strategy_sum``.

        Returns:
            An integer action index in ``[0, NUM_ACTIONS)``.
        """
        strategy = self.get_strategy()
        sampler = np.random if self._rng is None else self._rng
        return sampler.choice(NUM_ACTIONS, p=strategy)

    def update_regret(self, my_action, opp_action):
        """Accumulate regrets after one round.

        Args:
            my_action: Index of the action this player took.
            opp_action: Index of the action the opponent took.
        """
        # Utility of each of our possible actions against ``opp_action``:
        # same action ties (0), the next action wins (+1), the one after loses (-1).
        action_utility = np.zeros(NUM_ACTIONS)
        action_utility[opp_action] = 0
        action_utility[(opp_action + 1) % NUM_ACTIONS] = 1
        action_utility[(opp_action + 2) % NUM_ACTIONS] = -1

        # Regret = what each action would have earned minus what we earned.
        self.regret_sum += action_utility - action_utility[my_action]

    def get_avg_strategy(self):
        """Return the average of all strategies produced so far.

        Returns:
            A new array: ``strategy_sum`` normalised to sum to 1, or the
            uniform distribution if nothing has been accumulated yet.
        """
        normalizing_sum = self.strategy_sum.sum()
        if normalizing_sum > 0:
            return self.strategy_sum / normalizing_sum
        return np.full(NUM_ACTIONS, 1.0 / NUM_ACTIONS)

In [8]:
me = Player()
opp = Player()


def train(iterations, player=None, opponent=None):
    """Run self-play rounds, updating both players' regrets after each round.

    Args:
        iterations: Number of rounds to play.
        player: First player; defaults to the module-level ``me``.
        opponent: Second player; defaults to the module-level ``opp``.

    Both players are mutated in place; nothing is returned.
    """
    player = me if player is None else player
    opponent = opp if opponent is None else opponent
    for _ in tqdm.tqdm(range(iterations)):
        my_action = player.get_action()
        opp_action = opponent.get_action()

        # Each player scores the round from its own point of view.
        player.update_regret(my_action, opp_action)
        opponent.update_regret(opp_action, my_action)


train(10000)

100%|██████████| 10000/10000 [00:00<00:00, 19213.02it/s]


In [9]:
# Print both average strategies in fixed-point form (no scientific notation).
with np.printoptions(suppress=True):
    print(me.get_avg_strategy(), opp.get_avg_strategy(), sep="\n")

[0.32480263 0.3349484  0.34024896]
[0.3344306  0.33287496 0.33269445]
