In [1]:
from PokerRL.cfr.VanillaCFR import VanillaCFR
from PokerRL.game import bet_sets
from PokerRL.game.games import DiscretizedNLLeduc
from PokerRL.rl.base_cls.workers.ChiefBase import ChiefBase


In [4]:
# Number of CFR iterations to run and the name handed to the solver.
n_iterations = 150
name = "CFR_EXAMPLE"

# Passing None for t_prof is enough for ChiefBase. We only use it to log; this CFR impl is not distributed.
chief = ChiefBase(t_prof=None)
cfr = VanillaCFR(name=name,
                 game_cls=DiscretizedNLLeduc,
                 agent_bet_set=bet_sets.POT_ONLY,
                 chief_handle=chief)

# Run the solver one iteration at a time, printing the iteration index before each step.
# (A stray bare `s` expression that used to sit here raised NameError on a fresh kernel and was removed.)
for iter_id in range(n_iterations):
    print("Iteration: ", iter_id)
    cfr.iteration()


Tree with stack size [20000, 20000] has 1095 nodes out of which 428 are non-terminal.
Iteration:  0
Iteration:  1
Iteration:  2
Iteration:  3
Iteration:  4
Iteration:  5
Iteration:  6
Iteration:  7
Iteration:  8
Iteration:  9
Iteration:  10
Iteration:  11
Iteration:  12
Iteration:  13
Iteration:  14
Iteration:  15
Iteration:  16
Iteration:  17
Iteration:  18
Iteration:  19
Iteration:  20
Iteration:  21
Iteration:  22
Iteration:  23
Iteration:  24
Iteration:  25
Iteration:  26
Iteration:  27
Iteration:  28
Iteration:  29
Iteration:  30
Iteration:  31
Iteration:  32
Iteration:  33
Iteration:  34
Iteration:  35
Iteration:  36
Iteration:  37
Iteration:  38
Iteration:  39
Iteration:  40
Iteration:  41
Iteration:  42
Iteration:  43
Iteration:  44
Iteration:  45
Iteration:  46
Iteration:  47
Iteration:  48
Iteration:  49
Iteration:  50
Iteration:  51
Iteration:  52
Iteration:  53
Iteration:  54
Iteration:  55
Iteration:  56
Iteration:  57
Iteration:  58
Iteration:  59
Iteration:  60
Iteration

In [5]:
# Echo the solver object so the notebook displays its repr.
cfr

<PokerRL.cfr.VanillaCFR.VanillaCFR at 0x7f6125dda748>

## Example agent class

In [4]:
import torch
import torch.nn.functional as nnf

from PokerRL.rl import rl_util
from PokerRL.rl.neural.AvrgStrategyNet import AvrgStrategyNet
from PokerRL.rl.neural.NetWrapperBase import NetWrapperArgsBase as _NetWrapperArgsBase
from PokerRL.rl.neural.NetWrapperBase import NetWrapperBase as _NetWrapperBase


class AvgWrapper(_NetWrapperBase):
    """
    Net wrapper around an ``AvrgStrategyNet``.

    Offers three inference entry points that return softmax-normalised network outputs as numpy arrays, plus the
    mini-batch step used during training.
    """

    def __init__(self, owner, env_bldr, avg_training_args):
        """
        Args:
            owner:                  forwarded unchanged to the base wrapper as ``owner``
            env_bldr:               environment builder; also handed to the network constructor
            avg_training_args:      args object; ``avg_net_args`` and ``device_training`` are read from it
        """
        avg_net = AvrgStrategyNet(avrg_net_args=avg_training_args.avg_net_args,
                                  env_bldr=env_bldr,
                                  device=avg_training_args.device_training)
        super().__init__(net=avg_net,
                         env_bldr=env_bldr,
                         args=avg_training_args,
                         owner=owner,
                         device=avg_training_args.device_training)

        # One index per entry of the range; reused whenever every hand is queried at once.
        n_hands = self._env_bldr.rules.RANGE_SIZE
        self._all_range_idxs = torch.arange(n_hands, device=self.device, dtype=torch.long)

    def get_a_probs(self, pub_obses, range_idxs, legal_actions_lists):
        """
        Builds the legal-action masks from lists of legal actions and delegates to ``get_a_probs2``.

        Args:
            pub_obses (list):               list of np arrays of shape [np.arr([history_len, n_features]), ...]
            range_idxs (np.ndarray):        array of range_idxs (one for each pub_obs), e.g. [2, 421, 58, 912, ...]
            legal_actions_lists (list):     list of lists. Each 2nd level list contains ints representing legal
                                            actions

        Returns:
            Whatever ``get_a_probs2`` returns for the built masks.
        """
        with torch.no_grad():
            legal_masks = rl_util.batch_get_legal_action_mask_torch(n_actions=self._env_bldr.N_ACTIONS,
                                                                    legal_actions_lists=legal_actions_lists,
                                                                    device=self.device)
            a_probs = self.get_a_probs2(pub_obses=pub_obses,
                                        range_idxs=range_idxs,
                                        legal_action_masks=legal_masks)
        return a_probs

    def get_a_probs2(self, pub_obses, range_idxs, legal_action_masks):
        """
        Same as ``get_a_probs`` but takes ready-made legal-action masks.

        ``range_idxs`` must be a numpy array; it is converted to a long tensor on this wrapper's device.

        Returns:
            np.ndarray: softmax over the last dimension of the network output.
        """
        with torch.no_grad():
            range_idxs_t = torch.from_numpy(range_idxs).to(dtype=torch.long, device=self.device)
            net_out = self._net(pub_obses=pub_obses,
                                range_idxs=range_idxs_t,
                                legal_action_masks=legal_action_masks)
            normalised = nnf.softmax(net_out, dim=-1)
            return normalised.cpu().numpy()

    def get_a_probs_for_each_hand(self, pub_obs, legal_actions_list):
        """
        Queries the network for every range index under one shared public observation.

        Args:
            pub_obs:                        a single public observation, repeated once per range index
            legal_actions_list (list):      flat list of ints; the same legal actions are used for all hands

        Returns:
            np.ndarray: softmax over dim 1 of the network output (one row per range index).
        """
        assert isinstance(legal_actions_list[0], int), "all hands can do the same actions. no need to batch"

        n_hands = self._env_bldr.rules.RANGE_SIZE
        with torch.no_grad():
            single_mask = rl_util.get_legal_action_mask_torch(n_actions=self._env_bldr.N_ACTIONS,
                                                              legal_actions_list=legal_actions_list,
                                                              device=self.device, dtype=torch.uint8)
            # Expand the one mask across a new leading dimension so there is a row per hand.
            shared_mask = single_mask.unsqueeze(0).expand(n_hands, -1)

            net_out = self._net(pub_obses=[pub_obs] * n_hands,
                                range_idxs=self._all_range_idxs,
                                legal_action_masks=shared_mask)

            return nnf.softmax(net_out, dim=1).cpu().numpy()

    def _mini_batch_loop(self, buffer, grad_mngr):
        """Samples one mini-batch from ``buffer``, runs the net on it and backprops against the sampled actions."""
        sampled = buffer.sample(device=self.device, batch_size=self._args.batch_size)
        batch_pub_obs_t, batch_a_t, batch_range_idx, batch_legal_action_mask_t = sampled

        # [batch_size, n_actions]
        net_out = self._net(pub_obses=batch_pub_obs_t,
                            range_idxs=batch_range_idx,
                            legal_action_masks=batch_legal_action_mask_t)

        grad_mngr.backprop(pred=net_out, target=batch_a_t)


class AvgWrapperArgs(_NetWrapperArgsBase):
    """
    Argument container for ``AvgWrapper``.

    The generic training settings are forwarded to the base args class; the average-net args and the two
    reservoir-buffer settings are stored on this object.
    """

    def __init__(self,
                 avg_net_args,
                 res_buf_size=1e6,
                 min_prob_add_res_buf=0.0,
                 batch_size=512,
                 n_mini_batches_per_update=1,
                 loss_str="ce",
                 optim_str="rms",
                 lr=0.0002,
                 device_training="cpu",
                 grad_norm_clipping=10.0,
                 ):
        base_kwargs = {
            "batch_size": batch_size,
            "n_mini_batches_per_update": n_mini_batches_per_update,
            "optim_str": optim_str,
            "loss_str": loss_str,
            "lr": lr,
            "grad_norm_clipping": grad_norm_clipping,
            "device_training": device_training,
        }
        super().__init__(**base_kwargs)

        self.avg_net_args = avg_net_args
        # Accepts float notation such as 1e6 and stores it as an int.
        self.res_buf_size = int(res_buf_size)
        # Note: the attribute name drops the "add" that the parameter name carries.
        self.min_prob_res_buf = min_prob_add_res_buf


In [5]:
# Copyright (c) 2019 Eric Steinberger


import numpy as np

from PokerRL.rl.base_cls.EvalAgentBase import EvalAgentBase as _EvalAgentBase
from PokerRL.rl.errors import UnknownModeError


class EvalAgentNFSP(_EvalAgentBase):
    """
    Evaluation agent that acts according to one ``AvgWrapper`` policy per seat.

    Only one evaluation mode exists (``EVAL_MODE_AVG``). ``get_a_probs_for_each_hand`` and ``get_action`` raise
    ``UnknownModeError`` for any other mode; ``get_a_probs`` does not check the mode.
    """
    EVAL_MODE_AVG = "NFSP_Avg"
    ALL_MODES = [EVAL_MODE_AVG]

    def __init__(self, t_prof, mode=None, device=None):
        super().__init__(t_prof=t_prof, mode=mode, device=device)
        self.avg_args = t_prof.module_args["avg"]

        # One average-strategy wrapper per seat, each owned by its seat index.
        per_seat_policies = []
        for seat in range(t_prof.n_seats):
            per_seat_policies.append(
                AvgWrapper(owner=seat, env_bldr=self.env_bldr, avg_training_args=self.avg_args)
            )
        self.policies = per_seat_policies

        for policy in self.policies:
            policy.eval()

    def can_compute_mode(self):
        """Always True for this agent."""
        return True

    def get_a_probs_for_each_hand(self):
        """
        Action probabilities of the acting player for every hand in the range.

        BEFORE CALLING, NOTIFY THE EVAL AGENT OF THE PAST ACTIONS / ACTION SEQUENCE!

        Raises:
            UnknownModeError: if the current mode is not ``EVAL_MODE_AVG``
        """
        p_id_acting = self._internal_env_wrapper.env.current_player.seat_id

        if self._mode != self.EVAL_MODE_AVG:
            raise UnknownModeError(self._mode)

        acting_policy = self.policies[p_id_acting]
        return acting_policy.get_a_probs_for_each_hand(
            pub_obs=self._internal_env_wrapper.get_current_obs(),
            legal_actions_list=self._internal_env_wrapper.env.get_legal_actions())

    def get_a_probs(self):
        """Action probabilities of the acting player for the range index the env reports for that player."""
        p_id_acting = self._internal_env_wrapper.env.current_player.seat_id
        range_idx = self._internal_env_wrapper.env.get_range_idx(p_id=p_id_acting)

        # The policy API is batched; wrap the single query in length-1 containers and unwrap the result.
        batched_probs = self.policies[p_id_acting].get_a_probs(
            pub_obses=[self._internal_env_wrapper.get_current_obs()],
            range_idxs=np.array([range_idx], dtype=np.int32),
            legal_actions_lists=[self._internal_env_wrapper.env.get_legal_actions()]
        )
        return batched_probs[0]

    def get_action(self, step_env=True, need_probs=False):
        """
        Samples an action for the acting player from the policy's action probabilities.

        BEFORE CALLING, NOTIFY THE EVAL AGENT OF THE PAST ACTIONS / ACTION SEQUENCE!

        Args:
            step_env (bool):    if True, the sampled action is also applied to the internal env wrapper
            need_probs (bool):  if True, probabilities for all hands are computed and returned as well

        Returns:
            tuple: (action, a_probs_all_hands), where the second element is None unless ``need_probs`` is True

        Raises:
            UnknownModeError: if the current mode is not ``EVAL_MODE_AVG``
        """
        p_id_acting = self._internal_env_wrapper.env.current_player.seat_id
        range_idx = self._internal_env_wrapper.env.get_range_idx(p_id=p_id_acting)

        if self._mode != self.EVAL_MODE_AVG:
            raise UnknownModeError(self._mode)

        if need_probs:  # only do if really necessary
            a_probs_all_hands = self.get_a_probs_for_each_hand()
            a_probs = a_probs_all_hands[range_idx]
        else:
            a_probs_all_hands = None  # not needed
            batched_probs = self.policies[p_id_acting].get_a_probs(
                pub_obses=[self._internal_env_wrapper.get_current_obs()],
                range_idxs=np.array([range_idx], dtype=np.int32),
                legal_actions_lists=[self._internal_env_wrapper.env.get_legal_actions()]
            )
            a_probs = batched_probs[0]

        candidate_actions = np.arange(self.env_bldr.N_ACTIONS)
        action = np.random.choice(candidate_actions, p=a_probs)

        if step_env:
            self._internal_env_wrapper.step(action=action)

        return action, a_probs_all_hands

    def update_weights(self, weights_for_eval_agent):
        """Loads per-seat weights (converted via ``self.ray.state_dict_to_torch``) and sets each policy to eval."""
        for seat in range(self.t_prof.n_seats):
            policy = self.policies[seat]
            torch_state = self.ray.state_dict_to_torch(weights_for_eval_agent[seat], device=self.device)
            policy.load_net_state_dict(torch_state)
            policy.eval()

    def _state_dict(self):
        """Returns a dict holding one net state dict per policy under the key "net_state_dicts"."""
        net_states = []
        for policy in self.policies:
            net_states.append(policy.net_state_dict())
        return {
            "net_state_dicts": net_states,
        }

    def _load_state_dict(self, state_dict):
        """Restores every seat's net from ``state_dict["net_state_dicts"]``."""
        for seat in range(self.t_prof.n_seats):
            self.policies[seat].load_net_state_dict(state_dict["net_state_dicts"][seat])


In [9]:
import copy

import torch
from PokerRL.game import bet_sets
from PokerRL.game.games import StandardLeduc
from PokerRL.game.wrappers import HistoryEnvBuilder, FlatLimitPokerEnvBuilder
from PokerRL.rl.agent_modules.DDQN import DDQNArgs
from PokerRL.rl.base_cls.TrainingProfileBase import TrainingProfileBase
from PokerRL.rl.neural.AvrgStrategyNet import AvrgNetArgs
from PokerRL.rl.neural.DuelingQNet import DuelingQArgs
from PokerRL.rl.neural.MainPokerModuleFLAT import MPMArgsFLAT
from PokerRL.rl.neural.MainPokerModuleRNN import MPMArgsRNN

class TrainingProfile(TrainingProfileBase):
    """
    Training profile for the NFSP example in this notebook.

    Collects all hyperparameters in one constructor, turns them into the per-module argument objects
    ("ddqn", "avg", "env", "lbr", "rlbr") that are handed to ``TrainingProfileBase`` as ``module_args``, and stores
    the remaining NFSP-specific settings as attributes on the profile.

    ``nn_type`` selects both the env builder and the network-argument classes:
        "recurrent"   -> ``HistoryEnvBuilder`` with ``MPMArgsRNN``
        "feedforward" -> ``FlatLimitPokerEnvBuilder`` with ``MPMArgsFLAT``
    In the feedforward branch the ``rnn_*`` and ``dropout_*`` parameters are not used.

    Raises:
        ValueError:         if ``nn_type`` is neither "recurrent" nor "feedforward"
        AssertionError:     if ``device_parameter_server`` is not a string
    """

    def __init__(self,

                 # --- general
                 name,
                 log_export_freq=200,
                 checkpoint_freq=99999999,
                 eval_agent_export_freq=99999999,

                 # --- Computing
                 path_data=None,
                 local_crayon_server_docker_address="localhost",
                 device_inference="cpu",
                 device_parameter_server="cpu",
                 n_learner_actor_workers=8,
                 max_n_las_sync_simultaneously=100,
                 DISTRIBUTED=False,
                 CLUSTER=False,
                 DEBUGGING=False,
                 VERBOSE=True,

                 # --- env
                 game_cls=StandardLeduc,
                 n_seats=2,
                 use_simplified_headsup_obs=True,
                 start_chips=None,

                 agent_bet_set=bet_sets.B_2,
                 stack_randomization_range=(0, 0),
                 uniform_action_interpolation=False,

                 # --- Evaluation
                 eval_modes_of_algo=(EvalAgentNFSP.EVAL_MODE_AVG,),
                 eval_stack_sizes=None,

                 # --- NFSP
                 nn_type="feedforward",
                 anticipatory_parameter=0.1,

                 # Original NFSP also adds epsilon-exploration actions to the averaging buffer.
                 add_random_actions_to_avg_buffer=True,

                 n_br_updates_per_iter=2,
                 n_avg_updates_per_iter=2,
                 target_net_update_freq=300,  # every N neural net updates. Not every N global iters, episodes, or steps
                 cir_buf_size_each_la=2e5,
                 res_buf_size_each_la=2e6,  # the more the better to infinity
                 min_prob_add_res_buf=0.0,  # 0.0 =  vanilla reservoir; >0 exponential averaging.

                 eps_start=0.06,
                 eps_const=0.01,
                 eps_exponent=0.5,
                 eps_min=0.0,

                 # --- Training.
                 n_steps_per_iter_per_la=128,
                 n_steps_pretrain_per_la=0,
                 n_envs=128,

                 mini_batch_size_br_per_la=128,
                 n_mini_batches_per_la_per_update_br=1,  # total num of samples per iter is that * batch_size above.
                 mini_batch_size_avg_per_la=128,
                 n_mini_batches_per_la_per_update_avg=1,  # total num of samples per iter is that * batch_size above.
                 training_multiplier_iter_0=1,  # In iter 0 the BR net is clueless, but adds to res_buf. -> "pretrain"

                 # --- Q-Learning Hyperparameters
                 n_cards_state_units_br=192,
                 n_merge_and_table_layer_units_br=64,
                 n_units_final_br=64,
                 normalize_last_layer_flat=False,
                 rnn_cls_str_br="lstm",
                 rnn_units_br=128,
                 rnn_stack_br=1,
                 lr_br=0.1,
                 dropout_br=0.0,
                 use_pre_layers_br=True,  # True -> Use deep multi-branch network; False -> Use shallow net
                 grad_norm_clipping_br=10.0,
                 optimizer_br="sgd",
                 loss_br="mse",

                 # --- Avg Network Hyperparameters
                 n_cards_state_units_avg=192,
                 n_merge_and_table_layer_units_avg=64,
                 n_units_final_avg=64,
                 rnn_cls_str_avg="lstm",
                 rnn_units_avg=128,
                 rnn_stack_avg=1,
                 lr_avg=0.005,
                 dropout_avg=0.0,
                 use_pre_layers_avg=True,  # True -> Use deep multi-branch network; False -> Use shallow net
                 grad_norm_clipping_avg=10.0,
                 optimizer_avg="sgd",
                 loss_avg="ce",

                 # Option
                 lbr_args=None,
                 rlbr_args=None,
                 ):
        print(" ************************** Initing args for: ", name, "  **************************")

        # Pick the env builder and build the network-body args for the best-response (br) and average (avg) nets.
        if nn_type == "recurrent":
            env_bldr_cls = HistoryEnvBuilder

            mpm_args_br = MPMArgsRNN(rnn_cls_str=rnn_cls_str_br,
                                     rnn_units=rnn_units_br,
                                     rnn_stack=rnn_stack_br,
                                     rnn_dropout=dropout_br,
                                     use_pre_layers=use_pre_layers_br,
                                     n_cards_state_units=n_cards_state_units_br,
                                     n_merge_and_table_layer_units=n_merge_and_table_layer_units_br)
            mpm_args_avg = MPMArgsRNN(rnn_cls_str=rnn_cls_str_avg,
                                      rnn_units=rnn_units_avg,
                                      rnn_stack=rnn_stack_avg,
                                      rnn_dropout=dropout_avg,
                                      use_pre_layers=use_pre_layers_avg,
                                      n_cards_state_units=n_cards_state_units_avg,
                                      n_merge_and_table_layer_units=n_merge_and_table_layer_units_avg)

        elif nn_type == "feedforward":
            env_bldr_cls = FlatLimitPokerEnvBuilder

            mpm_args_br = MPMArgsFLAT(use_pre_layers=use_pre_layers_br,
                                      card_block_units=n_cards_state_units_br,
                                      other_units=n_merge_and_table_layer_units_br,
                                      normalize=normalize_last_layer_flat,
                                      )
            # NOTE(review): normalize_last_layer_flat is forwarded to the BR args only, not to the avg args below;
            # confirm this asymmetry is intended.
            mpm_args_avg = MPMArgsFLAT(use_pre_layers=use_pre_layers_avg,
                                       card_block_units=n_cards_state_units_avg,
                                       other_units=n_merge_and_table_layer_units_avg)

        else:
            raise ValueError(nn_type)

        super().__init__(

            name=name,
            log_verbose=VERBOSE,
            log_export_freq=log_export_freq,
            checkpoint_freq=checkpoint_freq,
            eval_agent_export_freq=eval_agent_export_freq,
            path_data=path_data,
            game_cls=game_cls,
            env_bldr_cls=env_bldr_cls,
            start_chips=start_chips,
            eval_modes_of_algo=eval_modes_of_algo,
            eval_stack_sizes=eval_stack_sizes,

            DEBUGGING=DEBUGGING,
            DISTRIBUTED=DISTRIBUTED,
            CLUSTER=CLUSTER,
            device_inference=device_inference,
            local_crayon_server_docker_address=local_crayon_server_docker_address,

            module_args={
                # Best-response learner settings.
                "ddqn": DDQNArgs(
                    q_args=DuelingQArgs(
                        mpm_args=mpm_args_br,
                        n_units_final=n_units_final_br,
                    ),
                    cir_buf_size=int(cir_buf_size_each_la),
                    batch_size=mini_batch_size_br_per_la,
                    n_mini_batches_per_update=n_mini_batches_per_la_per_update_br,
                    target_net_update_freq=target_net_update_freq,
                    optim_str=optimizer_br,
                    loss_str=loss_br,
                    lr=lr_br,
                    eps_start=eps_start,
                    eps_const=eps_const,
                    eps_exponent=eps_exponent,
                    eps_min=eps_min,
                    grad_norm_clipping=grad_norm_clipping_br,
                ),
                # Average-strategy net settings; this is the entry EvalAgentNFSP reads via module_args["avg"].
                "avg": AvgWrapperArgs(
                    avg_net_args=AvrgNetArgs(
                        mpm_args=mpm_args_avg,
                        n_units_final=n_units_final_avg,
                    ),
                    batch_size=mini_batch_size_avg_per_la,
                    n_mini_batches_per_update=n_mini_batches_per_la_per_update_avg,
                    res_buf_size=int(res_buf_size_each_la),
                    min_prob_add_res_buf=min_prob_add_res_buf,
                    loss_str=loss_avg,
                    optim_str=optimizer_avg,
                    lr=lr_avg,
                    grad_norm_clipping=grad_norm_clipping_avg,
                ),
                "env": game_cls.ARGS_CLS(
                    n_seats=n_seats,
                    # With the default start_chips=None this is a list of None, one per seat.
                    # NOTE(review): presumably the env args class substitutes the game's default stack - verify.
                    starting_stack_sizes_list=[start_chips for _ in range(n_seats)],
                    stack_randomization_range=stack_randomization_range,
                    use_simplified_headsup_obs=use_simplified_headsup_obs,
                    uniform_action_interpolation=uniform_action_interpolation,

                    # Set up in a way that just ignores this if not Discretized
                    bet_sizes_list_as_frac_of_pot=copy.deepcopy(agent_bet_set),
                ),
                "lbr": lbr_args,
                "rlbr": rlbr_args,
            }
        )

        # ____________________________________________________ NFSP ____________________________________________________
        self.nn_type = nn_type
        self.n_br_updates_per_iter = int(n_br_updates_per_iter)
        self.n_avg_updates_per_iter = int(n_avg_updates_per_iter)
        self.anticipatory_parameter = anticipatory_parameter
        # Stored under a shorter attribute name than the constructor parameter.
        self.add_random_actions_to_buffer = add_random_actions_to_avg_buffer
        self.training_multiplier_iter_0 = int(training_multiplier_iter_0)
        self.n_envs = int(n_envs)
        self.n_steps_pretrain_per_la = int(n_steps_pretrain_per_la)
        self.n_steps_per_iter_per_la = int(n_steps_per_iter_per_la)

        # n_learner_actor_workers only takes effect when DISTRIBUTED or CLUSTER is set; otherwise one is used.
        if DISTRIBUTED or CLUSTER:
            self.n_learner_actors = int(n_learner_actor_workers)
        else:
            self.n_learner_actors = 1

        self.max_n_las_sync_simultaneously = int(max_n_las_sync_simultaneously)

        assert isinstance(device_parameter_server, str), "Please pass a string (either 'cpu' or 'cuda')!"
        self.device_parameter_server = torch.device(device_parameter_server)


In [10]:
## Play an agent

In [13]:
# Build a profile with every hyperparameter left at its default (only the name is given),
# then create an eval agent from it.
# NOTE(review): mode is left at its default of None here, while EvalAgentNFSP.get_action() and
# get_a_probs_for_each_hand() raise UnknownModeError unless the agent's mode equals
# EvalAgentNFSP.EVAL_MODE_AVG - confirm whether mode should be passed explicitly.
t_prof = TrainingProfile(name='test')
eval_agent = EvalAgentNFSP(t_prof=t_prof)

 ************************** Initing args for:  test   **************************


In [15]:
from PokerRL.game import InteractiveGame

# Set up an interactive game on the eval agent's own env class/args; seat 0 is played by a human
# and the eval agent is supplied for the game to use.
game = InteractiveGame(env_cls=eval_agent.env_bldr.env_cls,
                       env_args=eval_agent.env_bldr.env_args,
                       seats_human_plays_list=[0],
                       eval_agent=eval_agent,
                       )

# NOTE(review): the saved output of this cell ends in a NotImplementedError after the tutorial text;
# the cause is not visible from this notebook - investigate before relying on this cell.
game.start_to_play()


                       
                                                _____
                    _____                _____ |6    |
                   |2    | _____        |5    || & & | 
                   |  &  ||3    | _____ | & & || & & | _____
                   |     || & & ||4    ||  &  || & & ||7    |
                   |  &  ||     || & & || & & ||____9|| & & | _____
                   |____Z||  &  ||     ||____S|       |& & &||8    | _____
                          |____E|| & & |              | & & ||& & &||9    |
                                 |____h|              |____L|| & & ||& & &|
                                                             |& & &||& & &|
                                                             |____8||& & &|
                                                                    |____6|
               
____________________________________________ TUTORIAL ____________________________________________
Actions:
0 	Fold
1 	Call
2 	Raise according to cur

NotImplementedError: 