In [1]:
from RangeAgent import EvalAgentDeepRange
from PokerRL.game.games import StandardLeduc  # or any other game
from PokerRL.eval.rl_br.RLBRArgs import RLBRArgs

from DeepCFR.EvalAgentDeepCFR import EvalAgentDeepCFR
from DeepCFR.TrainingProfile import TrainingProfile
from DeepCFR.workers.driver.Driver import Driver

from PokerRL.game._.tree.PublicTree import PublicTree
from PokerRL.rl import rl_util

# RL-BR evaluation arguments; no custom bet set is supplied.
rlbr_args = RLBRArgs(rlbr_bet_set=None)

# Training profile shared by every agent and tree built in this notebook.
t_prof = TrainingProfile(
    # --- identity / environment -------------------------------------------
    name="DEEPCOPY_v0",
    game_cls=StandardLeduc,
    nn_type="feedforward",
    DISTRIBUTED=False,
    rl_br_args=rlbr_args,

    # --- iteration / export schedule --------------------------------------
    n_traversals_per_iter=1500,
    eval_agent_export_freq=20,  # export API to play against the agent

    # --- settings carrying the `_adv` suffix -------------------------------
    max_buffer_size_adv=3e6,
    n_batches_adv_training=750,
    mini_batch_size_adv=2048,
    n_merge_and_table_layer_units_adv=64,
    n_units_final_adv=64,
    use_pre_layers_adv=False,
    init_adv_model="last",

    # --- settings carrying the `_avrg` suffix ------------------------------
    n_batches_avrg_training=2000,
    mini_batch_size_avrg=2048,
    n_merge_and_table_layer_units_avrg=64,
    n_units_final_avrg=64,
    use_pre_layers_avrg=False,
    init_avrg_model="last",

    # You can specify one or both modes. Choosing both is useful to compare them.
    eval_modes_of_algo=(
        # EvalAgentDeepCFR.EVAL_MODE_SINGLE,  # SD-CFR
        EvalAgentDeepCFR.EVAL_MODE_AVRG_NET,  # Deep-CFR
    ),
)


 ************************** Initing args for:  DEEPCOPY_v0   **************************


In [2]:
import copy
import numpy as np
import torch
import pickle

import sys
sys.path.append("/home/leduc/Deep-CFR/")
sys.path.append("/home/leduc/PokerRL/")

from PokerRL.game import Poker
from PokerRL.game._.tree._.nodes import PlayerActionNode
from PokerRL.rl import rl_util
from PokerRL.rl.base_cls.EvalAgentBase import EvalAgentBase as _EvalAgentBase
from PokerRL.rl.errors import UnknownModeError

from DeepCFR.IterationStrategy import IterationStrategy
from DeepCFR.StrategyBuffer import StrategyBuffer
from DeepCFR.workers.la.AvrgWrapper import AvrgWrapper

from RangeWrapper import RangeWrapper

# Module-level float dtype constant. It is not referenced by any code visible in this notebook.
NP_FLOAT_TYPE = np.float64  # Use 64 for extra stability in big games

class EvalAgentTree(_EvalAgentBase):
    """
    Eval agent that plays directly from a fully expanded ``PublicTree``.

    On construction the tree is built, filled with ``br_agent``'s policy and its
    EVs are computed. ``get_action`` then locates the tree node that matches the
    internal env wrapper's action history and picks an action according to
    ``self.mode``:

        "EVAL": sample an action from the strategy stored at that node.
        "BR":   deterministically take the action whose child node has the
                highest ``ev`` entry for the acting player's hand.
    """

    def __init__(self, t_prof, br_agent, mode=None, device=None):
        """
        Args:
            t_prof:     training profile; used for the env builder, the stack size
                        (``t_prof.eval_stack_sizes[0]``) and the debugging flag.
            br_agent:   agent whose policy is written into the tree.
            mode:       forwarded unchanged to the base class constructor.
            device:     forwarded unchanged to the base class constructor.

        Note that ``self.mode`` always starts as "EVAL", independent of ``mode``;
        assign ``agent.mode = "BR"`` afterwards to switch.
        """
        super().__init__(t_prof=t_prof, mode=mode, device=device)

        self.tree = PublicTree(
            env_bldr=rl_util.get_env_builder(t_prof=t_prof),
            stack_size=t_prof.eval_stack_sizes[0],
            stop_at_street=None,
            put_out_new_round_after_limit=True,
            is_debugging=t_prof.DEBUGGING
        )
        self.tree.build_tree()
        self.br_agent = br_agent  # agent to play best response against
        self.solve_br()

        self.modes = ["EVAL", "BR"]
        self.mode = "EVAL"  # default is eval

    def can_compute_mode(self):
        """ All modes are always computable (i.e. not dependent on iteration etc.)"""
        return True

    def _descend_chance_node(self, node):
        """
        Step from ``node`` into the chance child that matches the board currently
        held by the internal env.

        Asserts that the children of ``node`` are chance outcomes and that the
        selected child's board maps to the same index as the env's board.
        """
        assert node.children[0].action == "CHANCE"
        card = self._internal_env_wrapper.env.board
        child = node.children[self._card_to_idx(card)]
        assert self._card_to_idx(child.env_state['board_2d']) == self._card_to_idx(card)
        return child

    def _find_node_by_env(self, action_history):
        """
        Walk the tree from the root along ``action_history`` and return the node
        reached.

        Args:
            action_history: sequence of entries where ``entry[0]`` is the action
                            taken and ``entry[2]`` is the id of the player who
                            took it (checked against the tree while walking).

        Returns:
            The tree node for the current decision point. If the walk ends on a
            node whose children are chance outcomes, the child for the env's
            current board is returned instead.
        """
        node = self.tree.root

        for entry in action_history:
            # Chance outcomes do not appear in the action history, so step
            # through them before consuming the next player action.
            while not isinstance(node.children[0], PlayerActionNode):
                node = self._descend_chance_node(node)

            action = entry[0]
            assert node.p_id_acting_next == entry[2]
            node = node.children[node.allowed_actions.index(action)]

        # The last recorded action may have ended a betting round; the board card
        # has then been dealt in the env but not yet walked in the tree.
        if not isinstance(node.children[0], PlayerActionNode):
            node = self._descend_chance_node(node)

        return node

    def _card_to_idx(self, card):
        """
        Index of the chance child belonging to board ``card``, computed from the
        first row of ``card`` as ``card[0][0] * 2 + card[0][1]``.
        """
        # NOTE(review): the factor 2 presumably is the number of suits of the Leduc
        # deck with rows laid out as (rank, suit) -- confirm before using another game.
        return card[0][0] * 2 + card[0][1]

    def solve_br(self):
        """ Write ``self.br_agent``'s policy into the tree and compute the tree's EVs. """
        self.tree.fill_with_agent_policy(agent=self.br_agent)
        self.tree.compute_ev()

    def get_action(self, step_env=True, need_probs=False):
        """
        !! BEFORE CALLING, NOTIFY EVALAGENT OF THE PAST ACTIONS / ACTIONSEQUENCE !!

        Args:
            step_env (bool):    if True, the chosen action is also applied to the
                                internal env wrapper.
            need_probs (bool):  accepted for interface compatibility; ignored.

        Returns:
            tuple: ``(action, None)`` -- per-hand action probabilities are never
            computed by this agent.

        Raises:
            UnknownModeError: if ``self.mode`` is neither "EVAL" nor "BR".
        """
        node = self._find_node_by_env(self._internal_env_wrapper._action_history_list)

        env = self._internal_env_wrapper.env
        p_id_acting = env.current_player.seat_id
        range_idx = env.get_range_idx(p_id=p_id_acting)

        if self.mode == "BR":
            # NOTE(review): this is a greedy choice over the children's `ev` values;
            # whether `ev` or a best-response value of the tree is the intended
            # quantity should be confirmed against PublicTree.compute_ev().
            action = None
            best_ev = float("-inf")
            for idx, potential_action in enumerate(node.allowed_actions):
                child_ev = node.children[idx].ev[p_id_acting, range_idx]
                if child_ev > best_ev:  # strict '>' keeps the first action on ties
                    action = potential_action
                    best_ev = child_ev

        elif self.mode == "EVAL":
            a_probs = node.strategy[range_idx, :]
            action = np.random.choice(node.allowed_actions, p=a_probs)

        else:
            # Previously an unexpected mode fell through with `action` unbound.
            raise UnknownModeError(self.mode)

        if step_env:
            self._internal_env_wrapper.step(action=action)

        a_probs_all_hands = None
        return action, a_probs_all_hands

    def get_mode(self):
        """ Return the fixed label "BESTRESPONSE", whatever ``self.mode`` is. """
        return "BESTRESPONSE"


In [9]:
from H2HEvaluator import H2HEval

import os

# Root of the exported poker_ai data. Override with the POKER_AI_DATA_DIR environment
# variable to run this notebook on another machine; the default keeps the original path.
eval_agent_dir = os.path.join(
    os.environ.get("POKER_AI_DATA_DIR", "/home/leduc/poker_ai_data"),
    "eval_agent",
)

agent_file1 = os.path.join(eval_agent_dir, "SD-CFR_LEDUC_EXAMPLE_200", "120", "eval_agentAVRG_NET.pkl")
agent_file2 = os.path.join(eval_agent_dir, "SD-CFR_LEDUC_EXAMPLE_2", "2", "eval_agentAVRG_NET.pkl")

student_agent = EvalAgentDeepRange(t_prof, mode=None, device=None)

# NOTE(review): the .pkl extension suggests these files are unpickled on load;
# only load agent files from a trusted source.
enemy_agent = EvalAgentDeepCFR.load_from_disk(path_to_eval_agent=agent_file2)
init_agent = EvalAgentDeepCFR.load_from_disk(path_to_eval_agent=agent_file1)

# Both tree agents are filled with enemy_agent's policy. `copycat` stays in the
# default "EVAL" mode (samples from that policy); `bestresponse` is switched to "BR".
copycat = EvalAgentTree(t_prof, br_agent=enemy_agent, mode=None, device=None)
bestresponse = EvalAgentTree(t_prof, br_agent=enemy_agent, mode=None, device=None)
bestresponse.mode = "BR"

In [10]:
# Head-to-head: enemy_agent vs. copycat, an EvalAgentTree in its default "EVAL" mode
# that samples from the tree strategy filled with enemy_agent's own policy.
H2HEval(enemy_agent, copycat).h2h_eval(n_games=10000)


Played 20000 hands of poker.
Player  AVRG_NET: 53.45000076293945 +/- 68.41536406978825
Player  BESTRESPONSE: -53.45000076293945 +/- 68.41536406978825


(53.45000076293945, 1218.4147336530805)

In [11]:
# Head-to-head: enemy_agent vs. bestresponse, an EvalAgentTree in "BR" mode that
# deterministically takes the action whose child node has the highest ev for its hand.
H2HEval(enemy_agent, bestresponse).h2h_eval(n_games=10000)


Played 20000 hands of poker.
Player  AVRG_NET: -2147.5 +/- 74.02282180066341
Player  BESTRESPONSE: 2147.5 +/- 74.02282180066341


(-2147.5, 1426.3270895805763)

In [None]:

# Stand-alone tree for init_agent, used by the inspection cells that follow.
stack_size = t_prof.eval_stack_sizes[0]

gt = PublicTree(
    env_bldr=rl_util.get_env_builder(t_prof=t_prof),
    stack_size=stack_size,
    stop_at_street=None,
    put_out_new_round_after_limit=True,
    is_debugging=t_prof.DEBUGGING,
)
gt.build_tree()
print(
    "Tree with stack size", gt.stack_size,
    "has", gt.n_nodes,
    "nodes out of which", gt.n_nonterm,
    "are non-terminal.",
)

# Write init_agent's policy into the tree, then compute the EVs on it.
gt.fill_with_agent_policy(agent=init_agent)
gt.compute_ev()

# Reset the agent's internal env wrapper after the tree has been filled.
init_agent._internal_env_wrapper.reset()

In [None]:
# Inspect the root: the node itself, its strategy, its first child's strategy, its ev and exploitability.
gt.root, gt.root.strategy, gt.root.children[0].strategy, gt.root.ev, gt.root.exploitability

In [None]:
# Compare the root's stored 'a_seq' with the 'a_seq' in init_agent's env wrapper state dict.
gt.root.env_state['a_seq'] == init_agent._internal_env_wrapper.state_dict()['base']['env']['a_seq']

In [None]:
# Show the complete state dict of init_agent's internal env wrapper.
init_agent._internal_env_wrapper.state_dict()

In [None]:
# Show the env state stored on the root's first child.
gt.root.children[0].env_state

In [None]:
# Show the board currently held by init_agent's internal env.
init_agent._internal_env_wrapper.env.board

In [None]:
# Show the action recorded on the node three levels below the root (first child at each level).
gt.root.children[0].children[0].children[0].action

In [None]:
# Show the actions recorded on the root's first two children.
gt.root.children[0].action, gt.root.children[1].action

In [None]:
# Show the ev array of the root's first child.
gt.root.children[0].ev

In [None]:
# Show the env state of the node three levels below the root (first child at each level).
gt.root.children[0].children[0].children[0].env_state

In [None]:
# Show which player acts next at root -> child 0 -> child 0 -> child 1.
gt.root.children[0].children[0].children[1].p_id_acting_next