In [45]:
NUM_ACTIONS = 2  # number of actions available at every decision point


class Node:
    """Accumulators for a single information set.

    Attributes:
        info_set: key identifying the information set this node belongs to.
        regret_sum: per-action running total of regrets (updated by callers).
        strategy: per-action probabilities from the latest ``get_strategy``
            call; the same list object is reused and overwritten each time.
        strategy_sum: per-action running total of ``strategy`` weighted by the
            ``realization_weight`` passed to ``get_strategy``.
    """

    def __init__(self, info_set):
        """Create a node for ``info_set`` with every accumulator set to zero."""
        self.info_set = info_set
        self.regret_sum = [0 for _ in range(NUM_ACTIONS)]
        self.strategy = [0 for _ in range(NUM_ACTIONS)]
        self.strategy_sum = [0 for _ in range(NUM_ACTIONS)]

    def get_strategy(self, realization_weight):
        """Recompute the current strategy and fold it into ``strategy_sum``.

        Each action's probability is its regret clipped at zero, divided by
        the total of the clipped regrets; if that total is not positive every
        action gets ``1 / NUM_ACTIONS``.

        Args:
            realization_weight: factor applied to the new strategy before it
                is added to ``strategy_sum``.

        Returns:
            ``self.strategy`` itself (not a copy), updated in place.
        """
        actions = range(NUM_ACTIONS)

        # Pass 1: clip negative regrets to zero and total what is left.
        positive_total = 0
        for a in actions:
            clipped = max(self.regret_sum[a], 0)
            self.strategy[a] = clipped
            positive_total += clipped

        # Pass 2: normalise, or fall back to uniform when nothing is positive.
        if positive_total > 0:
            for a in actions:
                self.strategy[a] /= positive_total
        else:
            for a in actions:
                self.strategy[a] = 1 / NUM_ACTIONS

        # Pass 3: accumulate the weighted strategy for later averaging.
        for a in actions:
            self.strategy_sum[a] += self.strategy[a] * realization_weight
        return self.strategy

    def get_avg_strategy(self):
        """Return ``strategy_sum`` normalised to sum to one, as a new list.

        Falls back to the uniform distribution when the accumulated total is
        not positive (for example before any ``get_strategy`` call).
        """
        total = 0
        for a in range(NUM_ACTIONS):
            total += self.strategy_sum[a]

        if total > 0:
            return [self.strategy_sum[a] / total for a in range(NUM_ACTIONS)]
        return [1 / NUM_ACTIONS for _ in range(NUM_ACTIONS)]

    def __str__(self):
        """Render as ``"<info_set> <average strategy list>"``."""
        average = self.get_avg_strategy()
        return f"{self.info_set} {average}"

In [46]:
import random
import tqdm

# Every information set visited so far, keyed by the string cfr() builds from
# the acting player's card and the action history. Module-level on purpose:
# cfr() reads and fills it, and its contents survive between train() calls.
node_map = {}

def train(iterations, seed=None, reset=False):
    """Run ``iterations`` rounds of cfr() on freshly shuffled deals and report.

    Each round shuffles the three cards ``[1, 2, 3]`` and calls
    ``cfr(cards, "", 1, 1)``; the returned values are averaged. Afterwards the
    average and the average strategy of every node in ``node_map`` are printed.

    Args:
        iterations: number of deals to play; must be at least 1.
        seed: if given, deals are drawn from a private ``random.Random(seed)``
            so the card sequence is repeatable. With ``None`` (the default)
            the global ``random`` module state is used, as before.
        reset: if true, empty ``node_map`` first so the run does not continue
            from regrets left behind by an earlier call or cell execution.
            Defaults to ``False``, which keeps the accumulate-across-calls
            behaviour.

    Returns:
        The mean of the values returned by cfr() over all iterations.

    Raises:
        ValueError: if ``iterations`` is less than 1 (there would be nothing
            to average).
    """
    if iterations < 1:
        raise ValueError(
            f"iterations must be a positive integer, got {iterations!r}"
        )

    if reset:
        node_map.clear()

    # A private generator keeps a seeded run from disturbing (or being
    # disturbed by) other users of the global random state.
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    util = 0.0
    for _ in tqdm.tqdm(range(iterations)):
        cards = [1, 2, 3]
        shuffle(cards)
        util += cfr(cards, "", 1, 1)

    average_value = util / iterations
    print(f"Average game value: {average_value}")
    for info_set in sorted(node_map):
        # Print the key once; Node.__str__ would repeat it ("1: 1 [...]").
        print(f"{info_set}: {node_map[info_set].get_avg_strategy()}")
    return average_value


def cfr(cards, history, p0, p1):
    """Recursively evaluate ``history`` and update regrets along the way.

    The player to act is ``len(history) % 2``. Terminal histories return a
    payoff from that player's point of view; otherwise both actions ("p" then
    "b") are explored, each child value is negated to switch perspective, and
    the node's ``regret_sum`` is updated.

    Args:
        cards: sequence where ``cards[0]`` / ``cards[1]`` are the cards held
            by player 0 / player 1.
        history: string of actions taken so far, made of "p" and "b".
        p0: product of player 0's own action probabilities along ``history``.
        p1: product of player 1's own action probabilities along ``history``.

    Returns:
        For a terminal history, the payoff (1/-1 or 2/-2) for the player to
        act; otherwise the strategy-weighted value of the node for that player.
    """
    plays = len(history)
    active_player = plays % 2
    opponent = 1 - active_player

    # Terminal histories need at least two actions. A trailing pass ends the
    # hand: "pp" is decided by card comparison for +/-1, any other trailing
    # pass returns 1. A trailing "bb" is decided by card comparison for +/-2.
    if plays > 1:
        holds_higher_card = cards[active_player] > cards[opponent]
        if history[-1] == "p":
            if history != "pp":
                return 1
            return 1 if holds_higher_card else -1
        if history[-2:] == "bb":
            return 2 if holds_higher_card else -2

    # Look up (or lazily create) the node for what the acting player can see:
    # their own card plus the public action history.
    info_set = str(cards[active_player]) + history
    node = node_map.get(info_set)
    if node is None:
        node = node_map[info_set] = Node(info_set)

    if active_player == 0:
        own_reach, opponent_reach = p0, p1
    else:
        own_reach, opponent_reach = p1, p0

    strategy = node.get_strategy(own_reach)
    util = []
    node_util = 0
    for a in range(NUM_ACTIONS):
        next_history = history + ("b" if a else "p")
        # Only the acting player's probability product is scaled by the
        # probability of the action being explored.
        if active_player == 0:
            child_value = cfr(cards, next_history, p0 * strategy[a], p1)
        else:
            child_value = cfr(cards, next_history, p0, p1 * strategy[a])
        util.append(-child_value)
        node_util += strategy[a] * util[a]

    # Regret of each action relative to the node value, weighted by the
    # opponent's probability product.
    for a, action_util in enumerate(util):
        node.regret_sum[a] += opponent_reach * (action_util - node_util)

    return node_util

In [52]:
import fractions
# Train for 500000 iterations, then display the returned average game value as
# the closest fraction whose denominator is at most 50. This must stay the last
# expression in the cell so the notebook shows the resulting Fraction.
fractions.Fraction(train(500000)).limit_denominator(50)

  0%|          | 0/500000 [00:00<?, ?it/s]

100%|██████████| 500000/500000 [00:05<00:00, 83706.91it/s]

Average game value: -0.05617425936737551
1: 1 [0.8115925307674753, 0.1884074692325247]
1b: 1b [0.999999142258194, 8.577418059925274e-07]
1p: 1p [0.6660647204159101, 0.33393527958408986]
1pb: 1pb [0.9999994725016587, 5.274983413750445e-07]
2: 2 [0.9999991413390302, 8.586609697373528e-07]
2b: 2b [0.6661981207247752, 0.3338018792752249]
2p: 2p [0.9999926311568398, 7.368843160171232e-06]
2pb: 2pb [0.4780271718631045, 0.5219728281368956]
3: 3 [0.4373820681306907, 0.5626179318693093]
3b: 3b [8.568451646513669e-07, 0.9999991431548354]
3p: 3p [8.568451646513669e-07, 0.9999991431548354]
3pb: 3pb [9.791716641846843e-07, 0.9999990208283358]





Fraction(-1, 18)