In [2]:
import rlcard
from rlcard.envs.registration import register
from rlcard.utils import get_device, Logger, reorganize, plot_curve
from rlcard.agents import RandomAgent

from rlcard.agents import DQNAgent
import torch
import numpy as np
import argparse
import os
from copy import deepcopy
from tqdm import tqdm

In [3]:
# Expose the custom Yaniv environment to rlcard under the id 'yaniv'.
register(env_id='yaniv', entry_point='env:YanivEnv')

# Build a four-player instance once; the returned environment is discarded.
# NOTE(review): presumably a smoke test of the registration -- confirm.
rlcard.make('yaniv', config={'game_num_players': 4})

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [20]:
# Two independent environments: one for self-play training, one for evaluation.
env = rlcard.make('yaniv')
eval_env = rlcard.make('yaniv')

# One DQN learner per seat of the training environment.
agents = []
for _ in range(env.num_players):
    learner = DQNAgent(
        num_actions=env.num_actions,
        state_shape=env.state_shape[0],
        mlp_layers=[64, 64],
        device=device,
    )
    agents.append(learner)

env.set_agents(agents)

# Evaluation roster: the first learner in seat 0, random players in every other seat.
eval_agents = [agents[0]]
eval_agents.extend(
    RandomAgent(num_actions=eval_env.num_actions)
    for _ in range(eval_env.num_players - 1)
)
eval_env.set_agents(eval_agents)

In [12]:
# Training schedule.
NUM_EPISODES = 1_000  # total number of training episodes
EVAL_EVERY = 100  # run an evaluation round every this many episodes

# Number of games played per evaluation round.
TOP_MODEL_TOURNAMENT_GAMES = 3  # learners against each other
BASELINE_EVAL_GAMES = 3  # seat-0 learner against random players

In [22]:
def tournament(env, num):
    """Play evaluation games on ``env`` and average the results.

    Games are played with ``env.run(is_training=False)`` until at least
    ``num`` games have been counted. A single ``run`` may return either one
    payoff vector or a list of payoff vectors; in the latter case every
    vector counts as one game, so the final count can exceed ``num``.

    Args:
        env: Environment exposing ``num_players``, ``run`` and
            ``action_recorder``.
        num: Minimum number of games to count.

    Returns:
        A ``(payoffs, turns)`` tuple: ``payoffs`` is a list with the mean
        payoff of each seat, ``turns`` is the accumulated
        ``len(env.action_recorder) / env.num_players`` divided by the number
        of counted games.

    Raises:
        ZeroDivisionError: If no game was counted (e.g. ``num <= 0``).
    """
    seat_count = env.num_players
    totals = [0] * seat_count
    turns_total = 0
    games_played = 0

    while games_played < num:
        _, outcome = env.run(is_training=False)
        # Turn count is added once per run, even if the run reports several games.
        turns_total += len(env.action_recorder) / seat_count

        # Treat a single payoff vector as a batch of one.
        batch = outcome if isinstance(outcome, list) else [outcome]
        for game_payoffs in batch:
            for seat in range(seat_count):
                totals[seat] += game_payoffs[seat]
            games_played += 1

    averages = [total / games_played for total in totals]
    return averages, turns_total / games_played

In [30]:
# Metric histories; each list receives one entry per evaluation round.
baseline_scores = []
baseline_turns = []
top_tournament_scores = []
top_tournament_turns = []


for episode in range(NUM_EPISODES):
    # Play one self-play game and attach the final payoffs to the transitions.
    trajectories, payoffs = env.run(is_training=True)
    trajectories = reorganize(trajectories, payoffs)
    # Every learner is fed the transitions recorded for seat 0.
    # NOTE(review): the other seats' trajectories are unused -- confirm this is intended.
    for ts in trajectories[0]:
        for agent in agents:
            agent.feed(ts)

    if episode != 0 and episode % EVAL_EVERY == 0:
        # Let the current learners play each other and pick the best seat.
        tournament_payoffs, mean_turns = tournament(env, TOP_MODEL_TOURNAMENT_GAMES)
        top_tournament_scores.append(sum(tournament_payoffs) / env.num_players)
        top_tournament_turns.append(mean_turns)
        best = int(np.argmax(tournament_payoffs))
        champion = agents[best]

        # Reseed every seat from the champion. `agents` is rebound so that the
        # feed loop above trains exactly the agents seated in `env`, and so that
        # seat indices from the next tournament still index into `agents`.
        agents = [champion] + [deepcopy(champion) for _ in range(env.num_players - 1)]
        env.set_agents(agents)

        # Seat the champion (not the initial agents[0]) against the random
        # players so the baseline score really measures the top model.
        eval_agents = [champion] + eval_agents[1:]
        eval_env.set_agents(eval_agents)

        baseline_payoffs, mean_turns = tournament(eval_env, BASELINE_EVAL_GAMES)
        baseline_scores.append(baseline_payoffs[0])
        baseline_turns.append(mean_turns)

        print(f'episode {episode} Top Model vs. Random Mean Score: {baseline_scores[-1]}')
        print(f'episode {episode} Top Model vs. Random Mean Turns: {baseline_turns[-1]}')
        print(f'episode {episode} Models vs. Models Mean Score: {top_tournament_scores[-1]}')
        print(f'episode {episode} Models vs. Models Mean Turns: {top_tournament_turns[-1]}')
        


  state_batch, action_batch, reward_batch, next_state_batch, legal_actions_batch, done_batch = self.memory.sample()


INFO - Step 44101, rl-loss: 0.65396791696548466
INFO - Copied model parameters to target network.
INFO - Step 44100, rl-loss: 2.0162034034729004
INFO - Copied model parameters to target network.
INFO - Step 44101, rl-loss: 4.7664752006530766
INFO - Copied model parameters to target network.
INFO - Step 44101, rl-loss: 1.3373063802719116
INFO - Copied model parameters to target network.
INFO - Step 45101, rl-loss: 3.70091176033023273
INFO - Copied model parameters to target network.
INFO - Step 45100, rl-loss: 15.256819725036621
INFO - Copied model parameters to target network.
INFO - Step 45101, rl-loss: 0.6305344104766846
INFO - Copied model parameters to target network.
INFO - Step 45101, rl-loss: 1.3145532608032227
INFO - Copied model parameters to target network.
INFO - Step 46101, rl-loss: 1.36679351329803477
INFO - Copied model parameters to target network.
INFO - Step 46100, rl-loss: 3.428567409515381
INFO - Copied model parameters to target network.
INFO - Step 46101, rl-loss: 

In [None]:
# Persist the seat-0 learner, i.e. the agent placed in eval_env for the
# baseline evaluation. It is referenced explicitly as agents[0] rather than
# through the `agent` loop variable left over from the training cell, which
# would depend on hidden kernel state.
# NOTE: torch.save pickles the whole agent object; only load such files from
# trusted sources.
save_dir = './'
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, 'model_gen.pth')
torch.save(agents[0], save_path)
print('Model saved in', save_path)