In [92]:
import os
import argparse

import torch

import rlcard

from rlcard.agents import DQNAgent, RandomAgent
from rlcard.utils import (
    get_device,
    set_seed,
    tournament,
    reorganize,
    Logger,
    plot_curve,
)

import numpy as np

In [93]:
# Fix the random seed before anything stochastic happens so that
# Restart Kernel -> Run All reproduces the same training run.
# `set_seed` is the rlcard.utils helper imported in the first cell.
SEED = 42
set_seed(SEED)

# Initialize the Uno environment; the same seed is passed through the
# environment config so the environment's own randomness is repeatable too.
env = rlcard.make('uno', config={'seed': SEED})

In [94]:
# Output directory for this experiment. It is passed to the DQN agent as its
# save_path, to the Logger used by the training loop, and it is where the
# final model.pth is written.
log_dir = './experiments/uno_dqn_result/'

In [95]:
# Build the DQN agent; its input/output sizes are taken from the environment.
agent = DQNAgent(
    num_actions=env.num_actions,
    state_shape=env.state_shape[0],
    mlp_layers=[64, 64],
    replay_memory_size=5000,
    update_target_estimator_every=100,
    epsilon_decay_steps=10000,
    learning_rate=0.0005,
    batch_size=32,
    device=get_device(),
    save_path=log_dir,
)

--> Running on the CPU


In [96]:
# Total number of seats at the table: the learning agent plus its opponents
num_players = 2

# Seat the DQN agent first and fill every remaining seat with a random player
opponents = [RandomAgent(env.num_actions) for _ in range(num_players - 1)]
env.set_agents([agent, *opponents])

# Reset the environment; whatever reset() returns is kept in `state`
state = env.reset()

## Training the Agent
To train the agent, you will need to create a training loop. During each iteration of the loop, the agent makes a decision, the environment is updated, and the agent learns from the result.

In [97]:
def adjust_rewards(trajectories, payoffs):
    """Return new trajectories with per-action reward shaping applied.

    Every transition is unpacked as
    ``(state, action, reward, next_state, done)`` and rebuilt as a tuple
    whose reward depends on the action id:

    * ``60`` (draw a card): reward is lowered by 1.
    * ``0-9``, ``15-24``, ``30-39``, ``45-54`` (playing red, green, blue
      and yellow cards respectively): reward is raised by 1.
    * any other action: reward is passed through unchanged.

    Args:
        trajectories: iterable of trajectories, each an iterable of
            5-element transitions.
        payoffs: kept in the signature for the existing call site; it is
            not read by this function.

    Returns:
        A list with one list per input trajectory, each holding the
        re-built 5-tuples. State objects are reused, not copied.
    """
    draw_action = 60
    # Inclusive action-id ranges that earn the +1 bonus
    # (red, green, blue, yellow).
    bonus_ranges = ((0, 9), (15, 24), (30, 39), (45, 54))

    def shaped(action, reward):
        # The draw penalty is checked first, then the card-play bonus.
        if action == draw_action:
            return reward - 1
        if any(low <= action <= high for low, high in bonus_ranges):
            return reward + 1
        return reward

    return [
        [
            (state, action, shaped(action, reward), next_state, done)
            for state, action, reward, next_state, done in trajectory
        ]
        for trajectory in trajectories
    ]


# Building out the simulation

Previously, the code would run each simulation to completion. However, this restricts the agent's ability to learn from individual moves within the simulation.

In [98]:
# Training schedule
episode_num = 50000    # number of training episodes
evaluate_every = 1000  # evaluate the agent every this many episodes
evaluate_num = 100     # number of games played in each evaluation

with Logger(log_dir) as logger:
    for episode in range(episode_num):
        # Run the environment in training mode and collect the results
        trajectories, payoffs = env.run(is_training=True)

        # Overwrite each raw payoff with the training reward: a positive
        # payoff is treated as a win (100), anything else as a loss (-25)
        for seat, outcome in enumerate(payoffs):
            payoffs[seat] = 100 if outcome > 0 else -25

        # Reorganize the trajectories with the remapped payoffs, then apply
        # the per-action reward shaping on top
        trajectories = adjust_rewards(reorganize(trajectories, payoffs), payoffs)

        # Only the transitions of seat 0 (the DQN agent) are fed for learning
        for transition in trajectories[0]:
            agent.feed(transition)

        # Periodically log the first entry of the tournament result
        if episode % evaluate_every == 0:
            tournament_result = tournament(env, evaluate_num)
            logger.log_performance(episode, tournament_result[0])

    # Keep the logger's output paths for the plotting cell
    csv_path, fig_path = logger.csv_path, logger.fig_path


----------------------------------------
  episode      |  0
  reward       |  -0.08
----------------------------------------
INFO - Step 100, rl-loss: 1.6680594682693481
INFO - Copied model parameters to target network.
INFO - Step 200, rl-loss: 17.9861392974853556
INFO - Copied model parameters to target network.
INFO - Step 300, rl-loss: 298.746276855468755
INFO - Copied model parameters to target network.
INFO - Step 400, rl-loss: 655.47613525390622
INFO - Copied model parameters to target network.
INFO - Step 500, rl-loss: 585.06903076171885
INFO - Copied model parameters to target network.
INFO - Step 600, rl-loss: 543.44512939453123
INFO - Copied model parameters to target network.
INFO - Step 700, rl-loss: 281.84103393554691
INFO - Copied model parameters to target network.
INFO - Step 800, rl-loss: 269.70611572265625
INFO - Copied model parameters to target network.
INFO - Step 900, rl-loss: 280.49554443359375
INFO - Copied model parameters to target network.
INFO - Step 1000

## Evaluating the Agent
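After training, evaluate your agent's performance. You can use RLCard's `tournament` function to play the game multiple times and see how well your agent performs; the training loop above already does this every `evaluate_every` episodes and logs the result. The cell below plots the logged learning curve and saves the trained model: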

In [1]:
# Plot the learning curve
# NOTE: this cell relies on names created by earlier cells (plot_curve, os,
# torch, agent, log_dir, csv_path, fig_path). After a kernel restart those
# cells must be re-run first, otherwise this cell fails with a NameError.
plot_curve(csv_path, fig_path, "dqn")

# Save model
# torch.save is given the agent object itself rather than only its weights.
# NOTE(review): presumably loading this file back needs the same agent class
# to be importable -- confirm before sharing the checkpoint.
save_path = os.path.join(log_dir, 'model.pth')
torch.save(agent, save_path)
print('Model saved in', save_path)

NameError: name 'plot_curve' is not defined