In [None]:
"""
This code is largely a modified version of the original RLCard GitHub repo: https://github.com/datamllab/rlcard.
Our NFSP-ARM implementation also uses and modifies the ARM implementation provided at
https://github.com/kxinhe/LRM_FP, the code accompanying the following paper:
"He, H. Wu, Z. Wang, and H. Li, “Finding Nash equilibrium
for imperfect information games via fictitious play based on local regret minimization,”
International Journal of Intelligent Systems, 2022."
"""

In [1]:
import os
import argparse

import torch

import rlcard
from rlcard.agents import RandomAgent
from rlcard.models.uno_rule_models import UNORuleAgentV1
from rlcard.utils import get_device, set_seed, tournament, tournament_winnings, reorganize, Logger, plot_winrate_curve
import time

In [3]:
def _build_agents(env, algorithm, device):
    """Create one learning agent per player of ``env``.

    Args:
        env: Environment object exposing ``num_players``, ``num_actions`` and
            ``state_shape``.
        algorithm (str): ``"NFSP"``, ``"NFSP-ARM"`` or ``"DQN"``. Any other
            value falls back to DQN.
        device: Device handed unchanged to every agent constructor.

    Returns:
        list: ``env.num_players`` newly constructed agents of the same class.
    """
    # Agent classes are imported lazily so that only the selected algorithm's
    # module has to be importable.
    if algorithm == "NFSP":
        from rlcard.agents import NFSPAgent as agent_cls
        network_kwargs = {'hidden_layers_sizes': [64, 64],
                          'q_mlp_layers': [64, 64]}
    elif algorithm == "NFSP-ARM":
        from rlcard.agents import NFSPARMAgent as agent_cls
        network_kwargs = {'hidden_layers_sizes': [64, 64],
                          'q_mlp_layers': [64, 64]}
    else:
        # "DQN" and every unrecognised name use DQN. The import lives in this
        # branch so the fallback no longer fails with a NameError.
        from rlcard.agents import DQNAgent as agent_cls
        network_kwargs = {'mlp_layers': [64, 64]}

    return [
        agent_cls(num_actions=env.num_actions,
                  state_shape=env.state_shape[0],
                  device=device,
                  **network_kwargs)
        for _ in range(env.num_players)
    ]


def train(env, algorithm, device, seed, num_episodes, num_eval_games, evaluate_every, save_every, log_dir, opposition_agent):
    """Train self-play agents and periodically evaluate them as a team.

    Every seat of the training environment is filled with a learning agent of
    the chosen algorithm. For evaluation, the agents from seats 0 and 2 are
    placed in a separate "teamuno" environment against two opposition agents
    in seats 1 and 3.

    Args:
        env (str): Name of the training environment passed to ``rlcard.make``.
            NOTE(review): the evaluation environment is always "teamuno" and
            uses seats 0 and 2, so other environments are presumably
            unsupported - confirm before passing a different name.
        algorithm (str): ``"NFSP"``, ``"NFSP-ARM"`` or ``"DQN"``; any other
            value falls back to DQN. Also forwarded to ``plot_winrate_curve``.
        device: Device handed to the agent constructors.
        seed (int): Seed given to ``set_seed`` and to both environments.
        num_episodes (int): Number of training episodes to run.
        num_eval_games (int): Games played per evaluation tournament.
        evaluate_every (int): Evaluate when ``episode % evaluate_every == 0``.
        save_every (int): Save ``agents[0]`` when
            ``episode % save_every == 0`` (episode 0 excluded).
        log_dir (str): Directory passed to ``Logger`` and used for the
            checkpoint file.
        opposition_agent (str): ``"rule-based"`` selects ``UNORuleAgentV1``
            opponents; any other value selects ``RandomAgent`` opponents. The
            string also prefixes the checkpoint filename.

    Returns:
        None. Side effects: logging through ``Logger``, a checkpoint written
        with ``torch.save``, and a call to ``plot_winrate_curve``.
    """
    # Seed numpy, torch, random
    set_seed(seed)

    # Training and evaluation environments share the same seed.
    train_env = rlcard.make(env, config={'seed': seed})
    eval_env = rlcard.make("teamuno", config={'seed': seed})

    # All seats of the training environment are learning agents (self-play).
    agents = _build_agents(train_env, algorithm, device)
    train_env.set_agents(agents)

    # Opposition used only during evaluation.
    if opposition_agent == "rule-based":
        eval_agent_1 = UNORuleAgentV1()
        eval_agent_2 = UNORuleAgentV1()
    else:
        eval_agent_1 = RandomAgent(num_actions=train_env.num_actions)
        eval_agent_2 = RandomAgent(num_actions=train_env.num_actions)

    # Learners sit in seats 0 and 2, opposition in seats 1 and 3.
    eval_env.set_agents([agents[0], eval_agent_1, agents[2], eval_agent_2])

    save_path = os.path.join(log_dir, opposition_agent + '_model.pth')

    with Logger(log_dir, opposition_agent) as logger:
        for episode in range(num_episodes):

            # Generate data from the environment
            trajectories, payoffs, _ = train_env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Every seat is a learner: feed each agent its own transitions.
            for player_id in range(train_env.num_players):
                for transition in trajectories[player_id]:
                    agents[player_id].feed(transition)

            # Evaluate the team (seats 0 and 2) against the opposition.
            if episode % evaluate_every == 0:
                eval_payoffs, winnings = tournament_winnings(eval_env, num_eval_games)
                logger.log_winrate(
                    train_env.timestep,
                    eval_payoffs[0] + eval_payoffs[2],
                    winnings[0])

            # Checkpoint the first agent (episode 0 is skipped).
            if episode % save_every == 0 and episode > 0:
                torch.save(agents[0], save_path)

        # Read the output paths once, after the loop but still inside the
        # context manager, so they are bound even when num_episodes == 0.
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # plot win rate curve
    plot_winrate_curve(csv_path, fig_path, algorithm)

In [None]:
# --- Experiment selection ---------------------------------------------------
env = "teamuno"                   # environment name passed to rlcard.make
algorithm = "NFSP"                # one of "NFSP", "NFSP-ARM", "DQN"
opposition_agent = 'rule-based'   # evaluation opponents

# --- Hyperparameters --------------------------------------------------------
device = get_device()
seed = 42
num_episodes = 12000
num_eval_games = 1850
evaluate_every = 525
save_every = 50
log_dir = './'

# --- Run training and report the elapsed wall-clock time in seconds ---------
start = time.time()
train(
    env=env,
    algorithm=algorithm,
    device=device,
    seed=seed,
    num_episodes=num_episodes,
    num_eval_games=num_eval_games,
    evaluate_every=evaluate_every,
    save_every=save_every,
    log_dir=log_dir,
    opposition_agent=opposition_agent,
)
end = time.time()
print(end - start)

--> Running on the CPU

----------------------------------------
  timestep     |  241
  reward       |  0.08646486486486407
  winrate      |  0.48702702702702705
----------------------------------------
INFO - Step 100, rl-loss: 0.24585957825183868
INFO - Copied model beep parameters to target network.
INFO - Step 118, rl-loss: 12.872159957885742

  state_batch, action_batch, reward_batch, next_state_batch, legal_actions_batch, done_batch = self.memory.sample()


INFO - Step 100, rl-loss: 0.48751404881477356
INFO - Copied model beep parameters to target network.
INFO - Step 100, rl-loss: 0.27943098545074463
INFO - Copied model beep parameters to target network.
INFO - Step 100, rl-loss: 0.7298408746719368
INFO - Copied model beep parameters to target network.
INFO - Step 1100, rl-loss: 239.46066284179688
INFO - Copied model beep parameters to target network.
INFO - Step 1100, rl-loss: 6.1839032173156745
INFO - Copied model beep parameters to target network.
INFO - Step 1100, rl-loss: 10.251250267028809
INFO - Copied model beep parameters to target network.
INFO - Step 1100, rl-loss: 5.4802670478820814
INFO - Copied model beep parameters to target network.
INFO - Step 2100, rl-loss: 20.969322204589844
INFO - Copied model beep parameters to target network.
INFO - Step 2100, rl-loss: 98.918487548828127
INFO - Copied model beep parameters to target network.
INFO - Step 2100, rl-loss: 16.184757232666016
INFO - Copied model beep parameters to target 

INFO - Step 20100, rl-loss: 33.745544433593754
INFO - Copied model beep parameters to target network.
INFO - Step 20100, rl-loss: 66.069869995117194
INFO - Copied model beep parameters to target network.
INFO - Step 20100, rl-loss: 37.248325347900395
INFO - Copied model beep parameters to target network.
INFO - Step 20100, rl-loss: 79.881835937598446
INFO - Copied model beep parameters to target network.
INFO - Step 21100, rl-loss: 20.433076858520508
INFO - Copied model beep parameters to target network.
INFO - Step 21100, rl-loss: 158.38308715820312
INFO - Copied model beep parameters to target network.
INFO - Step 21100, rl-loss: 33.687751770019534
INFO - Copied model beep parameters to target network.
INFO - Step 21100, rl-loss: 81.414352416992196
INFO - Copied model beep parameters to target network.
INFO - Step 22100, rl-loss: 58.068496704101564
INFO - Copied model beep parameters to target network.
INFO - Step 22100, rl-loss: 36.233322143554694
INFO - Copied model beep parameters

INFO - Step 60100, rl-loss: 84.093154907226565
INFO - Copied model beep parameters to target network.
INFO - Step 61100, rl-loss: 64.890342712402346
INFO - Copied model beep parameters to target network.
INFO - Step 60100, rl-loss: 21.099393844604492
INFO - Copied model beep parameters to target network.
INFO - Step 61100, rl-loss: 77.407264709472665
INFO - Copied model beep parameters to target network.
INFO - Step 61100, rl-loss: 20.939889907836914
INFO - Copied model beep parameters to target network.
INFO - Step 62100, rl-loss: 66.179504394531256
INFO - Copied model beep parameters to target network.
INFO - Step 61100, rl-loss: 74.729385375976568
INFO - Copied model beep parameters to target network.
INFO - Step 62100, rl-loss: 244.72927856445312
INFO - Copied model beep parameters to target network.
INFO - Step 62100, rl-loss: 55.125305175781254
INFO - Copied model beep parameters to target network.
INFO - Step 63100, rl-loss: 73.002365112304695
INFO - Copied model beep parameters

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




----------------------------------------
  timestep     |  314482
  reward       |  0.21521081081081025
  winrate      |  0.5021621621621621
----------------------------------------
INFO - Step 78100, rl-loss: 66.009498596191495
INFO - Copied model beep parameters to target network.
INFO - Step 79100, rl-loss: 22.932466506958008
INFO - Copied model beep parameters to target network.
INFO - Step 80100, rl-loss: 64.476402282714844
INFO - Copied model beep parameters to target network.
INFO - Step 79100, rl-loss: 43.947799682617196
INFO - Copied model beep parameters to target network.
INFO - Step 79100, rl-loss: 17.507650375366214
INFO - Copied model beep parameters to target network.
INFO - Step 80100, rl-loss: 30.961410522460938
INFO - Copied model beep parameters to target network.
INFO - Step 81100, rl-loss: 57.495655059814456
INFO - Copied model beep parameters to target network.
INFO - Step 80100, rl-loss: 37.252567291259766
INFO - Copied model beep parameters to target network.
I

INFO - Step 116100, rl-loss: 36.246517181396484
INFO - Copied model beep parameters to target network.
INFO - Step 117100, rl-loss: 72.946990966796882
INFO - Copied model beep parameters to target network.
INFO - Step 117100, rl-loss: 61.528881072998054
INFO - Copied model beep parameters to target network.
INFO - Step 119100, rl-loss: 19.792015075683594
INFO - Copied model beep parameters to target network.
INFO - Step 117100, rl-loss: 30.754093170166016
INFO - Copied model beep parameters to target network.
INFO - Step 118100, rl-loss: 38.116512298583984
INFO - Copied model beep parameters to target network.
INFO - Step 118100, rl-loss: 51.366889953613282
INFO - Copied model beep parameters to target network.
INFO - Step 120100, rl-loss: 44.288185119628906
INFO - Copied model beep parameters to target network.
INFO - Step 119100, rl-loss: 30.824024200439453
INFO - Copied model beep parameters to target network.
INFO - Step 118100, rl-loss: 175.07925415039062
INFO - Copied model beep 

INFO - Step 155100, rl-loss: 36.494754791259766
INFO - Copied model beep parameters to target network.
INFO - Step 157100, rl-loss: 51.818054199218756
INFO - Copied model beep parameters to target network.
INFO - Step 156100, rl-loss: 26.480936050415046
INFO - Copied model beep parameters to target network.
INFO - Step 154727, rl-loss: 37.279125213623054
----------------------------------------
  timestep     |  624232
  reward       |  -0.006772972972973634
  winrate      |  0.4805405405405405
----------------------------------------
INFO - Step 155100, rl-loss: 64.570602416992196
INFO - Copied model beep parameters to target network.
INFO - Step 156100, rl-loss: 23.362482070922852
INFO - Copied model beep parameters to target network.
INFO - Step 158100, rl-loss: 51.069747924804695
INFO - Copied model beep parameters to target network.
INFO - Step 157100, rl-loss: 27.464498519897464
INFO - Copied model beep parameters to target network.
INFO - Step 156100, rl-loss: 62.387264251708984

INFO - Step 193100, rl-loss: 66.296707153320315
INFO - Copied model beep parameters to target network.
INFO - Step 192100, rl-loss: 31.930221557617188
INFO - Copied model beep parameters to target network.
INFO - Step 194100, rl-loss: 67.158653259277346
INFO - Copied model beep parameters to target network.
INFO - Step 196100, rl-loss: 45.367485046386727
INFO - Copied model beep parameters to target network.
INFO - Step 194100, rl-loss: 85.323432922363285
INFO - Copied model beep parameters to target network.
INFO - Step 193100, rl-loss: 37.354873657226567
INFO - Copied model beep parameters to target network.
INFO - Step 195100, rl-loss: 31.135654449462892
INFO - Copied model beep parameters to target network.
INFO - Step 197100, rl-loss: 54.982494354248057
INFO - Copied model beep parameters to target network.
INFO - Step 195100, rl-loss: 96.445144653320318
INFO - Copied model beep parameters to target network.
INFO - Step 194100, rl-loss: 14.868543624877939
INFO - Copied model beep 