In [1]:
from RangeAgent import EvalAgentDeepRange
from PokerRL.game.games import StandardLeduc  # or any other game

from DeepCFR.EvalAgentDeepCFR import EvalAgentDeepCFR
from DeepCFR.TrainingProfile import TrainingProfile
from DeepCFR.workers.driver.Driver import Driver


In [2]:
# Training profile shared by every agent built in this notebook.
# Keyword arguments are grouped by concern; the values are unchanged.
t_prof = TrainingProfile(
    # --- identity / environment ---
    name="DEEP_RANGE_v0",
    game_cls=StandardLeduc,
    DISTRIBUTED=False,

    # --- network shape ---
    nn_type="feedforward",
    use_pre_layers_adv=False,
    use_pre_layers_avrg=False,
    n_merge_and_table_layer_units_adv=64,
    n_merge_and_table_layer_units_avrg=64,
    n_units_final_adv=64,
    n_units_final_avrg=64,

    # --- `*_adv` training settings ---
    max_buffer_size_adv=3e6,
    n_batches_adv_training=750,
    mini_batch_size_adv=2048,
    init_adv_model="last",

    # --- `*_avrg` training settings ---
    n_batches_avrg_training=2000,
    mini_batch_size_avrg=2048,
    init_avrg_model="last",

    # --- iteration / export schedule ---
    n_traversals_per_iter=1500,
    eval_agent_export_freq=20,  # export API to play against the agent

    # One or both modes may be listed; listing both makes it easy to compare them.
    eval_modes_of_algo=(
        EvalAgentDeepCFR.EVAL_MODE_SINGLE,    # SD-CFR
        EvalAgentDeepCFR.EVAL_MODE_AVRG_NET,  # Deep CFR
    ),
)


 ************************** Initing args for:  DEEP_RANGE_v0   **************************


In [3]:
# Build a throwaway student agent purely to display the layout of its network
# (the cell's last expression is echoed as the module repr below).
EvalAgentDeepRange(t_prof, mode=None, device=None).policy._net

RangeActionNet(
  (range_net): RangeNet(
    (_relu): ReLU()
    (_mpm): MainPokerModuleFLAT(
      (_relu): ReLU()
      (final_fc_1): Linear(in_features=64, out_features=64, bias=True)
      (final_fc_2): Linear(in_features=64, out_features=64, bias=True)
    )
    (_final_layer): Linear(in_features=64, out_features=64, bias=True)
    (_out_layer): Linear(in_features=64, out_features=6, bias=True)
    (_softmax): Softmax()
  )
  (_relu): ReLU()
  (_final_layer): Linear(in_features=27, out_features=64, bias=True)
  (_out_layer): Linear(in_features=64, out_features=3, bias=True)
)

In [4]:
import numpy as np
import torch
import torch.nn as nn

import time
from copy import deepcopy

# Criterion comparing the student's action probabilities with the teacher's.
# (Original author's note: cross-entropy would be the ideal choice here.)
action_loss = nn.MSELoss()

# Criterion for the opponent-hole-card prediction; its target is a class index.
# NOTE(review): CrossEntropyLoss applies log-softmax to its input, i.e. it
# expects raw logits. The network printed above ends in a Softmax layer, so if
# the student's range output is already a probability vector this is a double
# softmax -- confirm against the student's range-probability accessor.
range_loss = nn.CrossEntropyLoss()

def hole_card_onehot(hole_card, n_ranks=3):
    """Convert a single hole card into an integer class label.

    Despite the name, no one-hot vector is produced: the result is a class
    *index*, which is the target format ``nn.CrossEntropyLoss`` consumes.

    Args:
        hole_card: indexable whose first element is a ``(rank, suit)`` pair;
            only ``hole_card[0][0]`` and ``hole_card[0][1]`` are read.
        n_ranks: number of distinct ranks in the deck, used as the stride of
            the suit component. The default of 3 reproduces the original
            hard-coded encoding ``rank + suit * 3``.

    Returns:
        ``torch.LongTensor`` of shape ``(1,)`` holding ``rank + suit * n_ranks``.
    """
    # Coerce to plain Python ints so narrow numpy integer scalars cannot wrap
    # around when a larger n_ranks is used.
    rank = int(hole_card[0][0])
    suit = int(hole_card[0][1])
    # The exact numbering is arbitrary; it only has to be a consistent
    # one-to-one mapping from (rank, suit) to a label.
    label = rank + suit * n_ranks
    return torch.LongTensor([label])

def distill(student_agent, teacher_agent, args={'lr':1e-2, 'iters':10000, 'lambda':10}):
    """
    Distill student_agent to play like teacher_agent.

    The teacher plays heads-up against a deep copy of itself. Every time the
    reference teacher is to act, the student is pointed at the teacher's
    internal env wrapper and asked for (a) action probabilities, compared
    against the teacher's with ``action_loss``, and (b) a prediction of the
    opponent's hole card, compared with the true card via ``range_loss``.
    Gradients of ``range_loss + lambda * action_loss`` are accumulated and
    applied with Adam every 200 outer iterations.

    Args:
        student_agent: agent whose ``policy._net`` is trained in place. Must
            expose ``env_bldr``, ``reset``, ``set_env_wrapper``,
            ``get_a_probs_tensor`` and ``get_range_probs``.
        teacher_agent: agent being imitated; it is deep-copied to provide the
            opponent. Must be built for the same env class as the student.
        args: dict of overrides for ``'lr'`` (Adam learning rate), ``'iters'``
            (number of outer iterations; each plays one hand per seat) and
            ``'lambda'`` (weight of the action loss). Missing keys fall back
            to the defaults shown in the signature.

    Returns:
        dict with keys ``'range_loss'``, ``'action_loss'`` and ``'total_loss'``,
        each a list with one Python float per teaching moment. (Floats rather
        than tensors, so the history does not keep autograd nodes alive.)

    Raises:
        AssertionError: if the two agents use different env classes or the
            game is not two-seated.
        ValueError: if the env reports an acting seat other than the two
            heads-up seats.
    """
    # Merge overrides over the defaults: a partial dict such as {'iters': 100}
    # works, and the shared default dict in the signature is never mutated.
    cfg = {'lr': 1e-2, 'iters': 10000, 'lambda': 10}
    cfg.update(args or {})

    # Outer iterations between optimizer steps (and progress prints).
    UPDATE_EVERY = 200

    env_bldr = student_agent.env_bldr
    env_cls = env_bldr.env_cls
    env_args = env_bldr.env_args
    lut_holder = env_cls.get_lut_holder()

    assert student_agent.env_bldr.env_cls == teacher_agent.env_bldr.env_cls, \
        "student and teacher must be built for the same env class"
    assert env_args.n_seats == 2, "only heads-up (2 seats) is supported"

    student_net = student_agent.policy._net
    optimizer = torch.optim.Adam(list(student_net.parameters()), lr=cfg['lr'])
    start_time = time.time()

    REFERENCE_AGENT = 0

    _env = env_cls(env_args=env_args, lut_holder=lut_holder, is_evaluating=True)
    _eval_agents = [teacher_agent, deepcopy(teacher_agent)]
    reference = _eval_agents[REFERENCE_AGENT]     # the teacher being imitated
    opponent = _eval_agents[1 - REFERENCE_AGENT]  # its self-play copy

    results = {
        "range_loss": [],
        "action_loss": [],
        "total_loss": [],
    }
    iters = 0  # number of outer iterations (one hand per seat each)
    evals = 0  # number of teaching moments
    grads_pending = False  # True while accumulated gradients await a step

    # zero grads, set net to train mode
    student_net.train()
    optimizer.zero_grad()

    while iters < cfg['iters']:
        iters += 1

        if iters % UPDATE_EVERY == 0:
            print("Iters {} | Evals {} | RangeLoss {} | ActionLoss {} | TotalLoss {}".format(
                iters, evals, sum(results['range_loss']) / evals, sum(results['action_loss']) / evals, sum(results['total_loss']) / evals
            ))

            # Apply everything accumulated since the previous step.
            optimizer.step()
            optimizer.zero_grad()
            grads_pending = False

        for seat_p0 in range(_env.N_SEATS):
            seat_p1 = 1 - seat_p0

            # """""""""""""""""
            # Reset Episode
            # """""""""""""""""
            _, r_for_all, done, info = _env.reset()
            for e in _eval_agents + [student_agent]:
                e.reset(deck_state_dict=_env.cards_state_dict())

            # """""""""""""""""
            # Play Episode
            # """""""""""""""""
            while not done:
                p_id_acting = _env.current_player.seat_id

                if p_id_acting == seat_p0:
                    evals += 1

                    # Put the student in the reference teacher's position and
                    # query its action and range estimates.
                    student_agent.set_env_wrapper(reference._internal_env_wrapper)
                    student_a_probs = student_agent.get_a_probs_tensor()
                    student_range_probs = student_agent.get_range_probs()

                    # Targets: the teacher's action distribution and the
                    # opponent's actual hole card (as a class index).
                    a_probs = torch.Tensor(reference.get_a_probs())
                    range_label = hole_card_onehot(_env.get_hole_cards_of_player(seat_p1))
                    action_int, _ = reference.get_action(step_env=True, need_probs=False)

                    # NOTE(review): range_loss is nn.CrossEntropyLoss, which
                    # expects logits; if get_range_probs() returns softmaxed
                    # probabilities this is a double softmax -- confirm.
                    rloss = range_loss(student_range_probs.view(1, -1), range_label)
                    aloss = action_loss(student_a_probs, a_probs)
                    loss = rloss + cfg['lambda'] * aloss

                    # Record plain floats: keeping the tensors would keep their
                    # autograd nodes referenced for every teaching moment.
                    results['total_loss'].append(loss.item())
                    results['range_loss'].append(rloss.item())
                    results['action_loss'].append(aloss.item())

                    # Accumulate gradients; the step happens every UPDATE_EVERY iterations.
                    loss.backward()
                    grads_pending = True

                    # notify opponent
                    opponent.notify_of_action(p_id_acted=p_id_acting,
                                              action_he_did=action_int)
                elif p_id_acting == seat_p1:
                    # NOTE(review): the result of this call was never used in the
                    # original; the call is kept in case it has side effects --
                    # likely removable once that is confirmed.
                    reference.get_a_probs()
                    action_int, _ = opponent.get_action(step_env=True, need_probs=False)
                    reference.notify_of_action(p_id_acted=p_id_acting,
                                               action_he_did=action_int)
                else:
                    raise ValueError("Only HU supported!")

                _, r_for_all, done, info = _env.step(action_int)

    # Flush the tail: without this, gradients accumulated after the last
    # UPDATE_EVERY boundary are dropped, and a run with fewer than
    # UPDATE_EVERY iterations never updates the student at all.
    if grads_pending:
        optimizer.step()
        optimizer.zero_grad()

    end_time = time.time()
    print("Time taken", end_time - start_time)

    print(optimizer)

    return results

In [16]:
## Distillation

# Pickled teacher agent that the student is trained to imitate.
agent_file1 = "/home/leduc/poker_ai_data/eval_agent/SD-CFR_LEDUC_EXAMPLE_200/120/eval_agentAVRG_NET.pkl"

# Fresh student plus the teacher restored from disk.
student_agent = EvalAgentDeepRange(t_prof, mode=None, device=None)
teacher_agent = EvalAgentDeepCFR.load_from_disk(path_to_eval_agent=agent_file1)

# Run distillation; `results` receives the per-teaching-moment loss history.
results = distill(
    student_agent,
    teacher_agent,
    {'lr': 1e-2, 'iters': 10000, 'lambda': 5},
)

# Optional persistence of the trained student:
# student_agent.save_to_file("deep_range_example.pt")
# student_agent.load_from_file("deep_range_example.pt")

Iters 200 | Evals 814 | RangeLoss 1.7955774068832397 | ActionLoss 0.07010836154222488 | TotalLoss 2.146121025085449
Iters 400 | Evals 1647 | RangeLoss 1.7900111675262451 | ActionLoss 0.06836526840925217 | TotalLoss 2.1318373680114746
Iters 600 | Evals 2462 | RangeLoss 1.7845299243927002 | ActionLoss 0.06678110361099243 | TotalLoss 2.1184351444244385
Iters 800 | Evals 3259 | RangeLoss 1.7796403169631958 | ActionLoss 0.06599391251802444 | TotalLoss 2.1096062660217285
Iters 1000 | Evals 4098 | RangeLoss 1.7755833864212036 | ActionLoss 0.06611665338277817 | TotalLoss 2.106161594390869
Iters 1200 | Evals 4931 | RangeLoss 1.769439935684204 | ActionLoss 0.06510770320892334 | TotalLoss 2.094968557357788
Iters 1400 | Evals 5760 | RangeLoss 1.7679038047790527 | ActionLoss 0.06519202888011932 | TotalLoss 2.0938565731048584
Iters 1600 | Evals 6592 | RangeLoss 1.7644996643066406 | ActionLoss 0.06483491510152817 | TotalLoss 2.0886728763580322
Iters 1800 | Evals 7448 | RangeLoss 1.7620487213134766 | 

In [18]:
## Evaluation

import time, sys
from os.path import dirname, abspath

sys.path.append("/home/leduc/Deep-CFR/")

import numpy as np

from DeepCFR.EvalAgentDeepCFR import EvalAgentDeepCFR
from PokerRL.game.AgentTournament import AgentTournament

# Pickled agents used in the head-to-head evaluation: the distillation teacher
# and a second agent used as a weak baseline.
agent_file1 = "/home/leduc/poker_ai_data/eval_agent/SD-CFR_LEDUC_EXAMPLE_200/120/eval_agentAVRG_NET.pkl"
agent_file2 = "/home/leduc/poker_ai_data/eval_agent/SD-CFR_LEDUC_EXAMPLE_2/2/eval_agentAVRG_NET.pkl"

# Restore both agents, teacher first, in the same order as the paths above.
teacher_agent, crappy_agent = (
    EvalAgentDeepCFR.load_from_disk(path_to_eval_agent=pkl_path)
    for pkl_path in (agent_file1, agent_file2)
)

# To evaluate a previously saved student instead of the one trained above:
# student_file = "deep_range_100000_0.01.pt"
# student_agent.load_from_file(student_file)

def h2heval(eval_agent_1, eval_agent_2, names=('agent1', 'agent2'), n_games_per_seat=1000):
    """Play two agents head-to-head and report the outcome.

    Builds an ``AgentTournament`` from the env class and env args of
    ``eval_agent_1``'s env builder and runs it; the tournament prints its own
    summary.

    Args:
        eval_agent_1: first agent; its ``env_bldr`` supplies the env class and
            env args used for the match.
        eval_agent_2: second agent.
        names: two display labels printed before the match (any indexable of
            length >= 2).
        n_games_per_seat: forwarded to ``AgentTournament.run``; the default of
            1000 matches the previously hard-coded value.

    Returns:
        The three values returned by ``AgentTournament.run``, unpacked here as
        ``(mean, upper_conf95, lower_conf95)``. Previously these were computed
        and discarded. In a notebook, end the call with ``;`` to suppress the
        echo when it is the last expression of a cell.
    """
    env_bldr = eval_agent_1.env_bldr

    print("Agent 1:", names[0])
    print("Agent 2:", names[1])

    # The original also built a standalone env here that was never used; the
    # tournament only needs the env class and env args.
    matchup = AgentTournament(env_bldr.env_cls, env_bldr.env_args, eval_agent_1, eval_agent_2)
    mean, upper_conf95, lower_conf95 = matchup.run(n_games_per_seat=n_games_per_seat)
    print("\n")

    return mean, upper_conf95, lower_conf95

# Head-to-head match-ups, played in this order:
# student vs. teacher, student vs. weak baseline, weak baseline vs. teacher.
for first_agent, second_agent, labels in (
    (student_agent, teacher_agent, ['student', 'teacher']),
    (student_agent, crappy_agent, ['student', 'crappy']),
    (crappy_agent, teacher_agent, ['crappy', 'teacher']),
):
    h2heval(first_agent, second_agent, names=labels)


Agent 1: student
Agent 2: teacher

Played 2000 hands of poker.
Player  DEEPRANGE: -136.0 +/- 157.56683117557216
Player  AVRG_NET: 136.0 +/- 157.56683117557216


Agent 1: student
Agent 2: crappy

Played 2000 hands of poker.
Player  DEEPRANGE: 302.0 +/- 190.76949881451117
Player  AVRG_NET: -302.0 +/- 190.76949881451117


Agent 1: crappy
Agent 2: teacher

Played 2000 hands of poker.
Player  AVRG_NET: -836.5 +/- 200.8964947712741
Player  AVRG_NET: 836.5 +/- 200.8964947712741


