In [1]:
%reload_ext dotenv
%dotenv

# ML libraries
import torch
import torch.nn as nn

# Local imports
from env import Env
from trainer import Trainer
import utils
# Training Agents
from agents.cql_agent import CQLAgent
from agents.deep_q_agent import DeepQAgent
from agents.deep_q_agent_double_q import DDQAgent
# Non-Training Agents
from agents.minimax_agent import MinimaxAgent
from agents.random_agent import RandomAgent

In [3]:
##################
# INITIALIZATION #
##################
# Builds everything the training cell needs: the environment, the learning
# agent, its opponent, the (optional) trainer options and the trainer itself.

# Tweak these parameters for training and evaluation
num_episodes = {'TRAIN': 2000, 'EVAL': 100}
with_options = False                      # If False, the trainer receives options=None
# Agent
agent_type = 'DDQAgent'                   # DDQAgent, DeepQAgent, CQLAgent
agent_network_type = 'FCNN'               # CNN, FCNN
# Opponent
opponent_type = 'DDQAgent'                # DDQAgent, DeepQAgent, CQLAgent, MinimaxAgent, RandomAgent
opponent_network_type = 'FCNN'            # CNN, FCNN
minimax_depth = 2                         # Only read when opponent_type == 'MinimaxAgent'
minimax_epsilon = 0.3                     # Only read when opponent_type == 'MinimaxAgent'
# Same for both agent and opponent
epsilon_max = 1
epsilon_min = 0.01
epsilon_decay = 0.9997
# Device selection. This cell previously computed a CUDA-or-CPU device and then
# unconditionally overwrote it with the CPU; the flag keeps that default while
# making the override explicit. Set to False to use the GPU when available.
force_cpu = True

# Fix random seed
utils.seed_everything(43, deterministic=False)

# Use GPU if available (and not overridden by force_cpu)
if force_cpu or not torch.cuda.is_available():
    device = torch.device("cpu")
else:
    device = torch.device("cuda:0")

# Define player and opponent IDs
AGENT = 1
OPPONENT = 2

# Define game environment, this should be passed to the agents and the trainer
env = Env()

# Agents that share the epsilon-greedy / network constructor signature
learning_agent_classes = {
    'DDQAgent': DDQAgent,
    'DeepQAgent': DeepQAgent,
    'CQLAgent': CQLAgent,
}
# Constructor arguments that are identical for agent and opponent
shared_agent_kwargs = dict(
    env=env,
    epsilon_max=epsilon_max,
    epsilon_min=epsilon_min,
    epsilon_decay=epsilon_decay,
    device=device,
)

# Define agent (only learning agents are accepted here)
if agent_type not in learning_agent_classes:
    raise ValueError(f"Unknown agent_type: {agent_type}")
agent = learning_agent_classes[agent_type](network_type=agent_network_type, **shared_agent_kwargs)

# Define opponent (learning agents plus the non-training agents)
if opponent_type in learning_agent_classes:
    opponent = learning_agent_classes[opponent_type](network_type=opponent_network_type, **shared_agent_kwargs)
elif opponent_type == 'MinimaxAgent':
    opponent = MinimaxAgent(env=env, depth=minimax_depth, epsilon=minimax_epsilon, player=OPPONENT)
elif opponent_type == 'RandomAgent':
    opponent = RandomAgent(env=env)
else:
    raise ValueError(f"Unknown opponent_type: {opponent_type}")

# Define options for training
replacement_agent = None
training_options = {
           'UPDATE_OPPONENT': False,                         # Whether to enable self-play or not
           'OPPONENT_UPDATE_FREQUENCY': 100,                # After how many episodes the opponent will be replaced by the current agent
           # NOTE(review): 7500 is larger than num_episodes['TRAIN'] (2000) — confirm this is intended
           'BOOTSTRAP_EPISODES': 7500,                      # During this time, the agent will not be replaced by itself
           'DECAY_RANDOMNESS_OPPONENT': False,               # Decay randomness of the opponent. Use only if the opponent acts with some randomness
           'DECAY_RANDOMNESS_FREQUENCY': 1000,              # Frequency of randomness decay
           'REPLACE_FOR_EVALUATION': False,                  # Whether to replace the training model at the end with another evaluation model
        #    'REPLACE_FOR_EVALUATION_BY': replacement_agent,  # Evaluation model to replace training model by
           'AUTOSAVE': True,                                # Whether to save the model at certain intervals
           'AUTOSAVE_TYPE': 'NUM_EPISODES',                 # One of ["NUM_OPTIMIZATIONS", "NUM_EPISODES"]
           'AUTOSAVE_PERIOD': 1000,                         # After how many _ to save the model
           }
# The options are only handed to the trainer when explicitly enabled above
options = training_options if with_options else None

# Initialize trainer
trainer = Trainer(env=env, agent=agent, opponent=opponent, agent_id=AGENT, opponent_id=OPPONENT, num_episodes=num_episodes, device=device, verbose=True, options=options)

In [27]:
############
# TRAINING #
############

# Train agent
# Requires `trainer` from the initialization cell, so that cell must be run first.
# NOTE(review): the output saved with this cell mentions CQLAgent_FCNN while the
# initialization cell is currently configured with agent_type = 'DDQAgent' — the
# output appears to stem from an earlier run; re-run to refresh it.
trainer.train()

TRAIN: Running episode 2000 of 2000. Agent won 224 times. Current win ratio of AGENT is 11.20%. Agent Parameters: Epsilon = 0.037009, Memory Size = 10000
TRAIN: Average turns per episode 5.5955
TRAIN: Average invalid moves per episode 0.0


Model was saved in ./saved_models/ as CQLAgent_FCNN_10992.pt
EVAL: Running episode 100 of 100. Agent won 14 times. Current win ratio of AGENT is 14.00%. Agent Parameters: Epsilon = 0.036954, Memory Size = 10000  
EVAL: Average turns per episode 5.37
EVAL: Average invalid moves per episode 0.0




In [2]:
####################
# AGENT EVALUATION #
####################
# Pits a (pretrained) agent against a (pretrained) opponent once per random
# seed and prints the trainer's evaluation statistics for each run.

# To reproduce our results, you only need to set agent_type/opponent_type and agent_network_type/opponent_network_type

# Tweak these parameters for evaluation
episodes = 1000
# Agent
agent_type = 'CQLAgent'                 # DDQAgent, DeepQAgent, CQLAgent, MinimaxAgent, RandomAgent
agent_network_type = 'CNN'              # CNN, FCNN
# Opponent
opponent_type = 'CQLAgent'              # DDQAgent, DeepQAgent, CQLAgent, MinimaxAgent, RandomAgent
opponent_network_type = 'CNN'           # CNN, FCNN
minimax_depth = 2
minimax_epsilon = 0.3
epsilon_max = 0.1 # Allow some randomness during evaluation
epsilon_min = 0.01
# NOTE(review): this was labelled "no decay during evaluation", but the value is
# below 1 — whether epsilon actually decays depends on the agent/trainer; confirm.
epsilon_decay = 0.9997

# Set random seed values (we chose 41, 42 and 43)
random_seeds = [41, 42, 43]

# You don't need to change anything below this for evaluation
#############################################################

# Seed-independent settings, defined once instead of on every iteration
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # Use GPU if available
AGENT = 1       # Player ID of the agent
OPPONENT = 2    # Player ID of the opponent

# Agents backed by a network; these are the ones with a saved model to load
network_agent_classes = {'DDQAgent': DDQAgent, 'DeepQAgent': DeepQAgent, 'CQLAgent': CQLAgent}


def make_player(kind, network_type, player_id, game_env):
    """Instantiate a player of type `kind` on `game_env`.

    `network_type` is only used for network-backed agents and `player_id`
    only for the MinimaxAgent. Returns None when `kind` is not a known type,
    so the caller can raise an error naming the offending setting.
    """
    if kind in network_agent_classes:
        return network_agent_classes[kind](env=game_env, epsilon_max=epsilon_max, epsilon_min=epsilon_min, epsilon_decay=epsilon_decay, network_type=network_type, device=device)
    if kind == 'MinimaxAgent':
        return MinimaxAgent(env=game_env, depth=minimax_depth, epsilon=minimax_epsilon, player=player_id)
    if kind == 'RandomAgent':
        return RandomAgent(env=game_env)
    return None


for random_seed_nr in random_seeds:
    # Fix random seed
    utils.seed_everything(random_seed_nr, deterministic=False)

    # Fresh environment for every seed
    eval_env = Env()

    # Build the agent (weights of network-backed agents are loaded further below)
    agent = make_player(agent_type, agent_network_type, AGENT, eval_env)
    if agent is None:
        raise ValueError(f"Unknown agent_type: {agent_type}")

    # Build the opponent
    opponent = make_player(opponent_type, opponent_network_type, OPPONENT, eval_env)
    if opponent is None:
        raise ValueError(f"Unknown opponent_type: {opponent_type}")

    # File names of the saved models; None for players without a network
    agent_has_model = agent_type in network_agent_classes
    opponent_has_model = opponent_type in network_agent_classes
    agent_string = None
    if agent_has_model:
        agent_string = f"{agent_type}_{agent_network_type}_RS{random_seed_nr}.pt"
    opponent_string = None
    if opponent_has_model:
        opponent_string = f"{opponent_type}_{opponent_network_type}_RS{random_seed_nr}.pt"

    # Load pretrained models
    if agent_has_model:
        agent.load_model(f'./dqn_cql_model/{agent_string}')
    if opponent_has_model:
        opponent.load_model(f'./dqn_cql_model/{opponent_string}')

    # Initialize trainer
    trainer = Trainer(env=eval_env, agent=agent, opponent=opponent, agent_id=AGENT, opponent_id=OPPONENT, device=device, verbose=True)

    # Run evaluation; players without a saved model are labelled by type + seed
    agent_label = agent_string or agent_type + str(random_seed_nr)
    opponent_label = opponent_string or opponent_type + str(random_seed_nr)
    print(f"Evaluating: {agent_label} vs. {opponent_label}")
    trainer.eval(agent=agent, opponent=opponent, episodes=episodes, agent_start=True, print_last_n_games=0)

Evaluating: CQLAgent_CNN_RS41.pt vs. CQLAgent_CNN_RS41.pt
EVAL: Running episode 1000 of 1000. Ratios are [WINS: 55.20% | LOSSES: 44.20% | TIES: 0.60%]
EVAL: Average turns per episode 12.743
EVAL: Average invalid moves per episode 0.0


Evaluating: CQLAgent_CNN_RS42.pt vs. CQLAgent_CNN_RS42.pt
EVAL: Running episode 1000 of 1000. Ratios are [WINS: 51.70% | LOSSES: 47.40% | TIES: 0.90%]
EVAL: Average turns per episode 13.364
EVAL: Average invalid moves per episode 0.0


Evaluating: CQLAgent_CNN_RS43.pt vs. CQLAgent_CNN_RS43.pt
EVAL: Running episode 1000 of 1000. Ratios are [WINS: 50.90% | LOSSES: 47.10% | TIES: 2.00%]
EVAL: Average turns per episode 12.481
EVAL: Average invalid moves per episode 0.0




In [3]:
# Mean and STD calculation
# Summarises the AGENT win ratios of the evaluation runs as "mean\pmstd",
# ready to paste into a LaTeX table.
import numpy as np

# Win ratios in percent, one entry per evaluated random seed
data = [55.2,
        51.7,
        50.9]

# Guard against forgetting to paste the result of one of the three seeds
assert len(data) == 3, "Check data!"
mean = np.mean(data)
std = np.std(data, ddof = 1) # Using Sample Standard Deviation

# Raw f-string: "\p" is not a valid escape sequence, so a plain f-string emits an
# invalid-escape warning (SyntaxWarning on Python >= 3.12). The printed text is unchanged.
summary = rf"{mean:.2f}\pm{std:.2f}"
print(summary)

52.60\pm2.29
