In [None]:
"""
This code is largely a modified version of the original RLCard GitHub repository: https://github.com/datamllab/rlcard.
Our NFSP-ARM implementation also uses and modifies the ARM implementation from https://github.com/kxinhe/LRM_FP,
the code accompanying the following paper:
K. He, H. Wu, Z. Wang, and H. Li, “Finding Nash equilibrium
for imperfect information games via fictitious play based on local regret minimization,”
International Journal of Intelligent Systems, 2022.
"""

In [1]:
import os
import argparse

import torch

import rlcard
from rlcard.agents import RandomAgent
from rlcard.models.leducholdem_rule_models import LeducHoldemRuleAgentV2

from rlcard.utils import get_device, set_seed, tournament, tournament_winnings, reorganize, Logger, plot_winrate_curve

In [3]:
def _make_agent(algorithm, num_actions, state_shape, device):
    """Build a single learning agent for the requested algorithm.

    Agent classes are imported lazily so that only the class actually
    requested has to be importable from ``rlcard.agents``.

    Args:
        algorithm: "NFSP", "NFSP-ARM" or "DQN". Any other value falls back
            to a DQN agent, matching the original fallback branch.
        num_actions: Number of actions, forwarded to the agent constructor.
        state_shape: State shape, forwarded to the agent constructor.
        device: Torch device, forwarded to the agent constructor.

    Returns:
        A newly constructed agent instance.
    """
    if algorithm == "NFSP":
        from rlcard.agents import NFSPAgent
        return NFSPAgent(num_actions=num_actions,
                         state_shape=state_shape,
                         hidden_layers_sizes=[64, 64],
                         q_mlp_layers=[64, 64],
                         device=device)

    if algorithm == "NFSP-ARM":
        from rlcard.agents import NFSPARMAgent
        return NFSPARMAgent(num_actions=num_actions,
                            state_shape=state_shape,
                            hidden_layers_sizes=[64, 64],
                            q_mlp_layers=[64, 64],
                            device=device)

    # "DQN" and every unrecognised algorithm name use a DQN agent. The import
    # lives here so the fallback no longer fails with a NameError.
    from rlcard.agents import DQNAgent
    return DQNAgent(num_actions=num_actions,
                    state_shape=state_shape,
                    mlp_layers=[64, 64],
                    device=device)


def train(env, algorithm, device, seed, num_episodes, num_eval_games, evaluate_every, save_every, log_dir, opposition_agent):
    """Train one agent per seat by self-play and periodically evaluate them.

    Every seat of the training environment gets its own learning agent. At
    evaluation time the agents from seats 0 and 2 are placed in a separate
    evaluation environment against two opposition agents (seats 1 and 3).

    Args:
        env: Name of the environment passed to ``rlcard.make`` for training.
        algorithm: "NFSP", "NFSP-ARM" or "DQN"; anything else falls back to DQN.
        device: Torch device handed to every agent.
        seed: Seed for ``set_seed`` and for both environments.
        num_episodes: Number of training episodes to run.
        num_eval_games: Number of games passed to ``tournament_winnings``.
        evaluate_every: Evaluate whenever ``episode % evaluate_every == 0``.
        save_every: Save agent 0 whenever ``episode % save_every == 0``
            (episode 0 excluded).
        log_dir: Directory used by the logger and for the saved model.
        opposition_agent: "rule-based" selects ``LeducHoldemRuleAgentV2``
            opponents; any other value selects ``RandomAgent`` opponents. The
            string is also used as a prefix for the saved model file name.

    Returns:
        None. Side effects: writes logs through ``Logger``, saves the model
        file and plots the win-rate curve.
    """
    # Seed numpy, torch, random
    set_seed(seed)

    # Keep the environment *name* in `env` and the object in `train_env`
    # instead of rebinding the parameter.
    train_env = rlcard.make(env, config={'seed': seed})

    # NOTE(review): the evaluation environment is hard-coded and the seat
    # layout below needs agents[2], so the training environment must provide
    # at least three players with compatible state/action spaces - confirm
    # before passing a different `env`.
    eval_env = rlcard.make("teamleducholdemv3", config={'seed': seed})

    # One learning agent per seat of the training environment
    agents = [
        _make_agent(algorithm, train_env.num_actions, train_env.state_shape[0], device)
        for _ in range(train_env.num_players)
    ]
    train_env.set_agents(agents)

    if opposition_agent == "rule-based":
        # create rule-based agents for evaluation
        eval_agent_1 = LeducHoldemRuleAgentV2()
        eval_agent_2 = LeducHoldemRuleAgentV2()
    else:
        # create random agents for evaluation
        eval_agent_1 = RandomAgent(num_actions=train_env.num_actions)
        eval_agent_2 = RandomAgent(num_actions=train_env.num_actions)

    # Learning agents sit in seats 0 and 2, opposition agents in seats 1 and 3
    eval_env.set_agents([agents[0], eval_agent_1, agents[2], eval_agent_2])

    # Start training
    with Logger(log_dir, opposition_agent) as logger:
        for episode in range(num_episodes):

            # Sample the episode policy where the agent supports it; agents
            # without `sample_episode_policy` are skipped rather than raising
            # AttributeError.
            for agent in agents:
                sample_policy = getattr(agent, 'sample_episode_policy', None)
                if callable(sample_policy):
                    sample_policy()

            # Generate data from the environment
            trajectories, payoffs, _ = train_env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed each seat's transitions into that seat's agent; every seat
            # is a learning agent here.
            for seat in range(train_env.num_players):
                for ts in trajectories[seat]:
                    agents[seat].feed(ts)

            # Evaluate performance against the opposition evaluation agents
            if episode % evaluate_every == 0:
                eval_payoffs, winnings = tournament_winnings(eval_env, num_eval_games)
                logger.log_winrate(
                    train_env.timestep, eval_payoffs[0] + eval_payoffs[2], winnings[0])

            # Periodically save the agent from seat 0
            if episode % save_every == 0 and episode > 0:
                save_path = os.path.join(log_dir, opposition_agent + '_model.pth')
                torch.save(agents[0], save_path)

        # Read the output paths once, after the loop, so they are bound even
        # when num_episodes == 0 (assumes the logger sets them on creation or
        # entry - TODO confirm against the Logger implementation).
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # plot win rate curve
    plot_winrate_curve(csv_path, fig_path, algorithm)

In [None]:
import time

# Experiment selection: environment, learning algorithm, opponents, output dir
env = "teamleducholdemv3"
algorithm = "NFSP"
opposition_agent = 'random'
log_dir = './'

# hyperparameters
seed = 42
num_episodes = 15000
evaluate_every = 525
num_eval_games = 26500
save_every = 50
device = get_device()

# Run training and report the elapsed wall-clock time
start = time.time()
train(
    env=env,
    algorithm=algorithm,
    device=device,
    seed=seed,
    num_episodes=num_episodes,
    num_eval_games=num_eval_games,
    evaluate_every=evaluate_every,
    save_every=save_every,
    log_dir=log_dir,
    opposition_agent=opposition_agent,
)
end = time.time()
print(f"{end - start}  seconds")

--> Running on the CPU

----------------------------------------
  timestep     |  11
  reward       |  0.1791547169811729
  winrate      |  0.5407169811320754
----------------------------------------
INFO - Step 100, rl-loss: 1.4461621046066284
INFO - Copied model beep parameters to target network.


  state_batch, action_batch, reward_batch, next_state_batch, legal_actions_batch, done_batch = self.memory.sample()


INFO - Step 100, rl-loss: 0.8985109329223633
INFO - Copied model beep parameters to target network.
INFO - Step 100, rl-loss: 0.7837046980857849
INFO - Copied model beep parameters to target network.
INFO - Step 100, rl-loss: 0.9936385154724121
INFO - Copied model beep parameters to target network.
INFO - Step 1100, rl-loss: 3.7275242805480957
INFO - Copied model beep parameters to target network.
INFO - Step 1100, rl-loss: 2.6592292785644535
INFO - Copied model beep parameters to target network.
INFO - Step 1096, sl-loss: None60113430023193
----------------------------------------
  timestep     |  4409
  reward       |  0.16255094339626444
  winrate      |  0.5369433962264151
----------------------------------------
INFO - Step 1100, rl-loss: 3.5663781166076665
INFO - Copied model beep parameters to target network.
INFO - Step 1100, rl-loss: 6.23044061660766647
INFO - Copied model beep parameters to target network.
INFO - Step 2100, rl-loss: 0.96490144729614265
INFO - Copied model be

INFO - Step 26263, sl-loss: 0.9162411689758301
----------------------------------------
  timestep     |  115706
  reward       |  0.8056150943395479
  winrate      |  0.6830943396226415
----------------------------------------
INFO - Step 30100, rl-loss: 0.6461005210876465
INFO - Copied model beep parameters to target network.
INFO - Step 31100, rl-loss: 4.42616653442382815
INFO - Copied model beep parameters to target network.
INFO - Step 29100, rl-loss: 1.0930424928665161
INFO - Copied model beep parameters to target network.
INFO - Step 27100, rl-loss: 1.86103367805480963
INFO - Copied model beep parameters to target network.
INFO - Step 31100, rl-loss: 1.76033687591552734
INFO - Copied model beep parameters to target network.
INFO - Step 32100, rl-loss: 5.31660366058349617
INFO - Copied model beep parameters to target network.
INFO - Step 30100, rl-loss: 1.7237988710403442
INFO - Copied model beep parameters to target network.
INFO - Step 27530, sl-loss: 0.8573567867279053
-------