In [10]:
%reload_ext dotenv
%dotenv

# Base libraries
from collections import namedtuple
from dataclasses import dataclass
from typing import List, Tuple, Dict, Callable, Any
import numpy as np
import pandas as pd
import random
from typing import Union

# ML libraries
import torch

# Local imports
from board import ConnectFourField
from env import Env
from agents.random_agent import RandomAgent
from agents.deep_q_agent import DeepQAgent
from agents.cql_agent import CQLAgent
import utils

In [16]:
# Seed all random number generators before anything else is constructed
utils.seed_everything(42, deterministic=True)

# Run on the first CUDA device when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Environment and the agent that is trained in it
env = Env()
agent = DeepQAgent(env, epsilon_max=1, epsilon_min=0.01, device=device)

# Player ids handed to env.step
AGENT = 1
OPPONENT = 2

# Number of episodes to run per mode
NUM_EPISODES = {
    'TRAIN': 1000,
    'EVAL': 100,
}
# Print every action and render the board after each move
VERBOSE = False
# Modes the main loop runs through, in this order
MODES = ['TRAIN', 'EVAL']
# Whether to replace the opponent with a copy of the current agent during training
UPDATE_OPPONENT = True
# Replace the opponent every n training episodes
OPPONENT_UPDATE_FREQUENCY = 50
# Episodes at the start of training in which the opponent stays a random agent
BOOTSTRAP_EPISODES = 100

if VERBOSE:
    env.render_console()

# Opponent agent; it is never trained and starts out playing randomly
opponent = RandomAgent(env)

for mode in MODES:
    # Win counters for this mode: P1 is the agent (AGENT), P2 the opponent (OPPONENT)
    p1_score = 0
    p2_score = 0

    # Moves and invalid moves of both players, summed over all episodes of this mode
    turns = 0
    invalid = 0

    if mode == 'EVAL' and type(opponent) is not RandomAgent:
        print('Changing agent to random agent for evaluation.')
        # For evaluation, make opponent a random agent again
        opponent = RandomAgent(env)

    for i in range(1, NUM_EPISODES[mode] + 1):
        is_last_episode = i == NUM_EPISODES[mode]
        # The last evaluation game is always shown move by move. A per-episode flag is
        # used so that the VERBOSE setting itself is never modified by the loop.
        verbose = VERBOSE or (is_last_episode and mode == 'EVAL')

        # Clean up terminal line
        if i % 100 == 0: print('\r', '                                                                                                                       ', end='')
        # Print current episode
        print('\r', f'{mode}: Running episode {i} of {NUM_EPISODES[mode]}.', end='')

        # -1 while the game is running; afterwards whatever env.step reported
        finished = -1

        # Pick at random who moves first. When this is False the agent's first turn is
        # skipped once and the flag is then set, so the players alternate from there on.
        agent_start = random.choice([True, False])

        # Run one episode of the game
        while finished == -1:
            # Agent makes a turn
            if agent_start:
                state = env.get_state()
                action = agent.act(state)
                if verbose: print(f"Agents Action: {action}")
                valid, reward, finished = agent.env.step(action, AGENT)
                turns += 1
                if not valid: invalid += 1
                if verbose: env.render_console()

                # The transition is stored, and the model optimized, right after the
                # agent's own move and before the opponent replies.
                # NOTE(review): the outcome of the opponent's reply (including a move that
                # ends the game) is never passed to agent.remember here - confirm intended.
                if type(agent) is not RandomAgent and mode == 'TRAIN':
                    agent.remember(state, action, reward, env.get_state(), finished)
                    agent.optimize_model()
                if finished != -1: break
            else:
                agent_start = True

            # Invalid moves are NOT repeated: a player whose move was invalid simply
            # loses that turn, and the move is only counted in `invalid`.
            # TODO: accumulate a negative reward for invalid moves.
            # TODO: decide how to handle invalid moves in general:
            #   1) restrict the action space to the valid actions only (hard), or
            #   2) punish the agent for an invalid move - but how is that represented
            #      in the stored sequence? The easiest way is probably a negative reward
            #      plus the skipped move, which also makes losing more likely.
            # TODO: with the skip-the-move approach the `valid` value is only needed
            #       for the statistics.

            # Opponent makes their turn
            action = opponent.act(env.get_state_inverted())
            if verbose: print(f"Opponents Action: {action}")
            valid, _opponent_reward, finished = opponent.env.step(action, OPPONENT)
            turns += 1
            if not valid: invalid += 1
            if verbose: env.render_console()

        # Update opponent with old versions of the agent, i.e. make the agent play against itself
        if UPDATE_OPPONENT and mode == 'TRAIN' and i % OPPONENT_UPDATE_FREQUENCY == 0 and i > BOOTSTRAP_EPISODES:
            agent_class = type(agent)
            opponent = agent_class(env=env, epsilon_max=0.5, device=device, options={'weights_init': agent})

        # Any final value other than the two player ids is counted as a draw
        if finished == AGENT:
            p1_score += 1
            episode_str = f'\n Winner of episode {i} was player {finished}.'
        elif finished == OPPONENT:
            p2_score += 1
            episode_str = f'\n Winner of episode {i} was player {finished}.'
        else:
            # Leading newline keeps the message off the '\r' progress line
            episode_str = f'\n Episode {i} ended in a draw.'

        if VERBOSE or is_last_episode:
            if mode == 'EVAL': episode_str = '\n Results of evaluation were:'
            print(episode_str + f' P1 has {p1_score} wins, P2 has {p2_score} wins, and there were {i - p1_score - p2_score} draws.')

        env.reset()

    print('Average turns per episode ', turns / NUM_EPISODES[mode])
    print('Average invalid moves per episode ', invalid / NUM_EPISODES[mode])

 TRAIN: Running episode 1000 of 1000.                                                                                   TRAIN: Running episode 646 of 1000.
 Winner of episode 1000 was player 1. P1 has 574 wins, P2 has 426 wins, and there were 0 draws.
Average turns per episode  17.449
Average invalid moves per episode  16.112
Changing agent to random agent for evaluation.
 EVAL: Running episode 100 of 100.Agents Action: 2                                                                      
_______________
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|1|0|0|0|0|
Opponents Action: 1
_______________
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|2|1|0|0|0|0|
Agents Action: 0
_______________
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|1|2|1|0|0|0|0|
Opponents Action: 3
_______________
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|0|0|0|0|0|0|0|
|1|2|1|2|0|0|0|