In [1]:
import itertools

# Total number of soldiers every strategy distributes over the battlefields.
SOLDIERS = 5
# Number of battlefields; each strategy holds one soldier count per battlefield.
BATTLEFIELDS = 3

def build_all_strategies(soldiers=None, battlefields=None):
    """Enumerate every way to split `soldiers` across `battlefields`.

    Args:
        soldiers: Number of soldiers to distribute. Defaults to the
            module-level ``SOLDIERS`` when omitted.
        battlefields: Number of battlefields to distribute them over.
            Defaults to the module-level ``BATTLEFIELDS`` when omitted.

    Returns:
        A list of allocations. Each allocation is a list of length
        `battlefields` whose entries are non-negative integers summing to
        `soldiers`. Allocations appear in the order produced by
        ``itertools.combinations_with_replacement`` (for 5 soldiers and 3
        battlefields: ``[5, 0, 0]``, ``[4, 1, 0]``, ``[4, 0, 1]``, ...).
    """
    if soldiers is None:
        soldiers = SOLDIERS
    if battlefields is None:
        battlefields = BATTLEFIELDS

    strategies = []
    # Each combination is a sorted tuple of battlefield indices, one entry per
    # soldier; counting how often an index occurs gives that battlefield's
    # soldier count.
    for placement in itertools.combinations_with_replacement(range(battlefields), soldiers):
        allocation = [0] * battlefields
        for field in placement:
            allocation[field] += 1
        strategies.append(allocation)

    return strategies

In [2]:
import numpy as np
import tqdm

# Every pure strategy (one soldier count per battlefield), enumerated once up front.
ALL_STRATEGIES = build_all_strategies()
# Number of pure strategies (actions) available to each player.
NUM_ACTIONS = len(ALL_STRATEGIES)

class Player:
    """Learner that plays a fixed set of pure strategies via regret matching.

    The current mixed strategy is proportional to the positive part of the
    cumulative regret of each action; when no action has positive regret the
    strategy is uniform.

    Attributes:
        actions: The pure strategies, in index order.
        num_actions: ``len(actions)``.
        strategy: Current mixed strategy; overwritten in place by
            ``get_strategy``.
        regret_sum: Cumulative regret per action.
        strategy_sum: Sum of every mixed strategy returned by
            ``get_strategy`` so far (used for the average strategy).
    """

    def __init__(self, strategies=None):
        """Create a player with zero regret.

        Args:
            strategies: Optional sequence of equal-length allocations to
                choose from. Defaults to the module-level ``ALL_STRATEGIES``.

        Raises:
            ValueError: If `strategies` is empty.
        """
        if strategies is None:
            strategies = ALL_STRATEGIES
        self.actions = list(strategies)
        self.num_actions = len(self.actions)
        if self.num_actions == 0:
            raise ValueError("Player needs at least one strategy")

        # Matrix form (one row per action) lets update_regret score every
        # action against the opponent in a single vectorized expression.
        self._action_matrix = np.asarray(self.actions)
        # Constant-time action -> index lookup; setdefault keeps the first
        # index if an allocation is listed twice, matching list.index().
        self._action_index = {}
        for position, action in enumerate(self.actions):
            self._action_index.setdefault(tuple(action), position)

        self.strategy = np.zeros(self.num_actions)
        self.regret_sum = np.zeros(self.num_actions)
        self.strategy_sum = np.zeros(self.num_actions)

    def get_strategy(self):
        """Recompute the current mixed strategy from the cumulative regrets.

        Side effect: the returned strategy is also added to ``strategy_sum``,
        so every call contributes to the average strategy.

        Returns:
            ``self.strategy`` (the same array object on every call), holding
            one probability per action.
        """
        positive_regret = np.maximum(self.regret_sum, 0.0)
        normalizing_sum = positive_regret.sum()
        if normalizing_sum > 0:
            np.divide(positive_regret, normalizing_sum, out=self.strategy)
        else:
            # No action has positive regret yet: play uniformly at random.
            self.strategy.fill(1.0 / self.num_actions)
        self.strategy_sum += self.strategy
        return self.strategy

    def get_action(self):
        """Sample one pure strategy from the current mixed strategy.

        Calls ``get_strategy`` (and therefore updates ``strategy_sum``) and
        draws from NumPy's global random state.

        Returns:
            The sampled element of ``self.actions``.
        """
        strategy = self.get_strategy()
        return self.actions[np.random.choice(self.num_actions, p=strategy)]

    def update_regret(self, my_action, opp_action):
        """Accumulate the regret of not having played each alternative action.

        Args:
            my_action: The allocation this player actually played; must be one
                of ``self.actions``.
            opp_action: The allocation the opponent played.

        Raises:
            ValueError: If `my_action` is not one of ``self.actions``.
        """
        try:
            played = self._action_index[tuple(my_action)]
        except KeyError:
            raise ValueError(f"{my_action!r} is not a known strategy") from None

        # Per battlefield: +1 for more soldiers, -1 for fewer, 0 for a tie;
        # summed per row this equals calculate_utility(action, opp_action).
        action_utility = np.sign(self._action_matrix - np.asarray(opp_action)).sum(axis=1)
        self.regret_sum += action_utility - action_utility[played]

    def get_avg_strategy(self):
        """Return the average of all mixed strategies produced so far.

        Returns:
            A new array with ``strategy_sum`` normalized to sum to 1, or the
            uniform distribution if ``strategy_sum`` is still all zeros.
        """
        normalizing_sum = self.strategy_sum.sum()
        if normalizing_sum > 0:
            return self.strategy_sum / normalizing_sum
        return np.full(self.num_actions, 1.0 / self.num_actions)

    @classmethod
    def calculate_utility(cls, my_action, opp_action):
        """Score `my_action` against `opp_action`.

        Args:
            my_action: Soldier counts per battlefield for this player.
            opp_action: Soldier counts per battlefield for the opponent.

        Returns:
            Battlefields won minus battlefields lost (ties count as 0), as an
            int. Battlefields are paired positionally.
        """
        total_utility = 0
        for mine, theirs in zip(my_action, opp_action):
            if mine > theirs:
                total_utility += 1
            elif mine < theirs:
                total_utility -= 1
        return total_utility

In [3]:
# Two independent learners; train() plays them against each other.
me = Player()
opp = Player()
def train(iterations):
    """Play `iterations` rounds between the module-level `me` and `opp`.

    In each round both players sample an action, then each updates its
    cumulative regret given what the other played. The players are mutated
    in place; nothing is returned. Progress is shown with a tqdm bar.

    Args:
        iterations: Number of rounds to play.
    """
    # All learning state lives on the Player instances, so no `global`
    # declaration is needed here.
    for _ in tqdm.tqdm(range(iterations)):
        my_action = me.get_action()
        opp_action = opp.get_action()

        me.update_regret(my_action, opp_action)
        opp.update_regret(opp_action, my_action)


# 2.5 million self-play rounds.
train(2_500_000)

100%|██████████| 2500000/2500000 [05:04<00:00, 8219.31it/s]


In [4]:
def print_strategy(probs, strategies=None):
    """Print one ``<strategy>: <probability>`` line per strategy.

    Args:
        probs: Iterable of probabilities, in the same order as `strategies`.
        strategies: Optional iterable of strategies to label the
            probabilities with. Defaults to the module-level
            ``ALL_STRATEGIES``.

    Pairs are formed with ``zip``, so output stops at the shorter of the two
    inputs.
    """
    if strategies is None:
        strategies = ALL_STRATEGIES
    for prob, strat in zip(probs, strategies):
        # round prob to 3 decimal places
        print(f"{strat}: {prob:.3f}")

# show avg strategy as a float, not exp notation
# (the print options only affect the raw array printed for `opp`;
# print_strategy formats each probability itself with an f-string)
with np.printoptions(precision=3, floatmode='fixed', suppress=True):
    print_strategy(me.get_avg_strategy())
    print(opp.get_avg_strategy())

[5, 0, 0]: 0.000
[4, 1, 0]: 0.000
[4, 0, 1]: 0.000
[3, 2, 0]: 0.106
[3, 1, 1]: 0.114
[3, 0, 2]: 0.113
[2, 3, 0]: 0.113
[2, 2, 1]: 0.000
[2, 1, 2]: 0.000
[2, 0, 3]: 0.106
[1, 4, 0]: 0.000
[1, 3, 1]: 0.114
[1, 2, 2]: 0.000
[1, 1, 3]: 0.114
[1, 0, 4]: 0.000
[0, 5, 0]: 0.000
[0, 4, 1]: 0.000
[0, 3, 2]: 0.106
[0, 2, 3]: 0.113
[0, 1, 4]: 0.000
[0, 0, 5]: 0.000
[0.000 0.000 0.000 0.102 0.115 0.117 0.112 0.000 0.000 0.105 0.000 0.120
 0.000 0.112 0.000 0.000 0.000 0.097 0.120 0.000 0.000]
