In [1]:
"""
Bayesian PsuedoCount updating
"""

from RangeAgent import EvalAgentDeepRange
from TreeAgent import EvalAgentTree

from PokerRL.game.games import StandardLeduc  # or any other game
from PokerRL.eval.rl_br.RLBRArgs import RLBRArgs

from DeepCFR.EvalAgentDeepCFR import EvalAgentDeepCFR
from DeepCFR.TrainingProfile import TrainingProfile
from DeepCFR.workers.driver.Driver import Driver

rlbr_args = RLBRArgs(
    rlbr_bet_set = None
)

t_prof = TrainingProfile(
    name="DEEPCOPY_v0",
    nn_type="feedforward",
    
    max_buffer_size_adv=3e6,
    eval_agent_export_freq=20,  # export API to play against the agent
    n_traversals_per_iter=1500,
    n_batches_adv_training=750,
    n_batches_avrg_training=2000,
    n_merge_and_table_layer_units_adv=64,
    n_merge_and_table_layer_units_avrg=64,
    n_units_final_adv=64,
    n_units_final_avrg=64,
    mini_batch_size_adv=2048,
    mini_batch_size_avrg=2048,
    init_adv_model="last",
    init_avrg_model="last",
    use_pre_layers_adv=False,
    use_pre_layers_avrg=False,

    game_cls=StandardLeduc,

    # You can specify one or both modes. Choosing both is useful to compare them.
    eval_modes_of_algo=(
     # EvalAgentDeepCFR.EVAL_MODE_SINGLE,  # SD-CFR
     EvalAgentDeepCFR.EVAL_MODE_AVRG_NET,  # Deep-CFR
    ),

    DISTRIBUTED=False,
    rl_br_args=rlbr_args
)


 ************************** Initing args for:  DEEPCOPY_v0   **************************


In [8]:
import numpy as np
import torch
import torch.nn as nn

import time
from copy import deepcopy

action_loss = nn.CrossEntropyLoss()

def best_response(agent):
    """
    Returns strategy that is best response to agent strategy
    """
    br = EvalAgentTree(t_prof, br_agent=agent, mode=None, device=None)
    br.mode = "BR"
    return br

def bayesian_while_play(bayesian_agent, enemy_agent, args={'lr':1e-2, 'iters':10000}):
    """
    Train bayesian_agent to mimic stationery enemy_agent.
    """
            
    env_bldr = bayesian_agent.env_bldr
    env_cls = env_bldr.env_cls
    env_args = env_bldr.env_args
    lut_holder = env_cls.get_lut_holder()
    
    assert(bayesian_agent.env_bldr.env_cls == enemy_agent.env_bldr.env_cls)
    assert(env_args.n_seats == 2)

    start_time = time.time()
    
    REFERENCE_AGENT = 0
    
    _env = env_cls(env_args=env_args, lut_holder=lut_holder, is_evaluating=True)
    _eval_agents = [enemy_agent, deepcopy(bayesian_agent)] # play against an new frozen copy of the BR to agent while training
    
    results = {
        "winnings": []
    }
    iters = 0 # number of hands played
    evals = 0 # number of teaching moments

    while iters < args['iters']:
        iters += 1
        
        if iters % 200 == 0:
            print("Iters {} | Evals {} | Winnings mBB/Hand {} | ".format(
                iters, evals, sum(results["winnings"]) / iters
            ))
            
            # play against an new frozen copy of the BR to agent while training
            _eval_agents[1] = best_response(deepcopy(bayesian_agent)) 
        
        for seat_p0 in range(_env.N_SEATS):
            seat_p1 = 1 - seat_p0
            
            # """""""""""""""""
            # Reset Episode
            # """""""""""""""""
            _, r_for_all, done, info = _env.reset()
            for e in _eval_agents:
                e.reset(deck_state_dict=_env.cards_state_dict())

            # """""""""""""""""
            # Play Episode
            # """""""""""""""""

            while not done:
                p_id_acting = _env.current_player.seat_id

                if p_id_acting == seat_p0:
                    evals += 1 #increment counter
                    
                    # set bayesian agent to position of agent 1, update psuedocount
                    bayesian_agent.set_env_wrapper(_eval_agents[REFERENCE_AGENT]._internal_env_wrapper)
                    node = bayesian_agent._find_node_by_env(bayesian_agent._internal_env_wrapper._action_history_list)
                    range_idx = bayesian_agent._internal_env_wrapper.env.get_range_idx(p_id=p_id_acting) #get opponent hole card
                    
                    # get true values 
                    action_int, _ = _eval_agents[REFERENCE_AGENT].get_action(step_env=True, need_probs=False)
                    
                    # update pseudocount
                    node.data[range_idx, action_int] += 1 #node.allowed_actions.index(action_int)
                    
                    # notify opponent
                    _eval_agents[1 - REFERENCE_AGENT].notify_of_action(p_id_acted=p_id_acting,
                                                                       action_he_did=action_int)
                    
                elif p_id_acting == seat_p1:
                    action_int, _ = _eval_agents[1 - REFERENCE_AGENT].get_action(step_env=True,
                                                                                 need_probs=False)
                    _eval_agents[REFERENCE_AGENT].notify_of_action(p_id_acted=p_id_acting,
                                                                   action_he_did=action_int)
                else:
                    raise ValueError("Only HU supported!")

                _, r_for_all, done, info = _env.step(action_int)
                
            # """""""""""""""""
            # Add Rews
            # """""""""""""""""
            results["winnings"].append(r_for_all[seat_p0] * _env.REWARD_SCALAR * _env.EV_NORMALIZER)
    
    end_time = time.time()
    print("Time taken", end_time - start_time)
    
    return results

In [9]:
import numpy as np

np.ones((3,6))

array([[1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1.]])

In [11]:
agent_file1 = "/home/leduc/poker_ai_data/eval_agent/SD-CFR_LEDUC_EXAMPLE_200/120/eval_agentAVRG_NET.pkl"
agent_file2 = "/home/leduc/poker_ai_data/eval_agent/SD-CFR_LEDUC_EXAMPLE_200/20/eval_agentAVRG_NET.pkl"

enemy_agent = EvalAgentDeepCFR.load_from_disk(path_to_eval_agent=agent_file2)
init_agent = EvalAgentDeepCFR.load_from_disk(path_to_eval_agent=agent_file1)
bayesian_agent = EvalAgentTree(t_prof, br_agent=init_agent, mode="BAYESIAN", device=None)

results = bayesian_while_play(bayesian_agent, enemy_agent, args={'lr':1e-2, 'iters':10000})


Iters 200 | Evals 843 | Winnings mBB/Hand 1655.0 | 
Iters 400 | Evals 1772 | Winnings mBB/Hand 1217.5 | 
Iters 600 | Evals 2644 | Winnings mBB/Hand 1020.0 | 
Iters 800 | Evals 3479 | Winnings mBB/Hand 895.0 | 
Iters 1000 | Evals 4325 | Winnings mBB/Hand 692.0 | 
Iters 1200 | Evals 5179 | Winnings mBB/Hand 640.8333333333334 | 
Iters 1400 | Evals 6062 | Winnings mBB/Hand 623.5714285714286 | 
Iters 1600 | Evals 6970 | Winnings mBB/Hand 423.75 | 
Iters 1800 | Evals 7840 | Winnings mBB/Hand 452.77777777777777 | 
Iters 2000 | Evals 8622 | Winnings mBB/Hand 431.0 | 
Iters 2200 | Evals 9438 | Winnings mBB/Hand 432.27272727272725 | 
Iters 2400 | Evals 10265 | Winnings mBB/Hand 308.3333333333333 | 
Iters 2600 | Evals 11116 | Winnings mBB/Hand 262.6923076923077 | 
Iters 2800 | Evals 12010 | Winnings mBB/Hand 167.5 | 
Iters 3000 | Evals 12888 | Winnings mBB/Hand 134.0 | 
Iters 3200 | Evals 13768 | Winnings mBB/Hand 72.8125 | 
Iters 3400 | Evals 14667 | Winnings mBB/Hand 34.411764705882355 | 
Iters

In [5]:
bayesian_agent

<TreeAgent.EvalAgentTree at 0x7fd2be28e390>