In [None]:
# import statements
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
from tqdm.notebook import tqdm
import random
import matplotlib.pyplot as plt
import importlib
import wandb

import player
import rl_player
importlib.reload(player)
importlib.reload(rl_player)
from player import *
from rl_player import *

In [None]:
NUM_GAMES = 100

def run_simulation(players, shuffle_turn_order=False, num_games=NUM_GAMES, max_turns=100):
    """Play a batch of games and report who was recorded as the winner of each.

    Args:
        players: Sequence of player objects handed to ``Game``. This function
            never reorders the caller's sequence.
        shuffle_turn_order: When True, every game receives a freshly shuffled
            copy of ``players``; otherwise ``players`` itself is passed through.
        num_games: Number of games to simulate.
        max_turns: Safety cap for games that do not finish. A game is abandoned
            as soon as its turn count exceeds this value (the default matches
            the limit that used to be hard-coded).

    Returns:
        A list with one entry per game: the ``name`` of the first player left
        in ``game.game_state['players']`` when that game stopped.
    """
    winners = []

    for _ in range(num_games):
        if shuffle_turn_order:
            # Shuffle a copy: shuffling in place would permanently reorder the
            # list owned by the caller (e.g. an environment's player list).
            turn_order = list(players)
            random.shuffle(turn_order)
        else:
            turn_order = players
        game = Game(turn_order, debug=False)

        turn_count = 0
        while len(game.game_state['players']) > 1:
            game.simulate_turn()
            turn_count += 1
            if turn_count > max_turns:
                break

        # NOTE(review): when the turn cap stops a game, several players may
        # still be alive and the first remaining one is recorded as the winner
        # anyway -- confirm this is the intended tie-break.
        winners.append(game.game_state['players'][0].name)
    return winners

In [None]:
def _sync_opponents_with_agent(env, agent):
    """Copy ``agent``'s ``model`` and ``target_model`` weights into every other agent-backed player of ``env``."""
    for opponent in env.players:
        if opponent.agent is not None and opponent.name != agent.name:
            # TO-DO: Maybe empty the replay buffer?
            opponent.agent.model.load_state_dict(agent.model.state_dict())
            opponent.agent.target_model.load_state_dict(agent.target_model.state_dict())
            print(f"Updating {opponent.name} to {agent.name}'s weights")


def _plot_win_rates(win_rates, sample_freq, env):
    """Plot the sampled win rates against a dashed ``1 / number_of_players`` baseline."""
    episode_axis = [index * sample_freq + 1 for index in range(len(win_rates))]
    baseline = [1 / len(env.players) for _ in win_rates]
    plt.plot(episode_axis, win_rates, label="agent win rate")
    plt.plot(episode_axis, baseline, label="expected win rate", linestyle='dashed')
    plt.legend()
    plt.title('Q-Learning win rate over time')
    plt.ylabel('win rate')
    plt.xlabel('number of episodes')
    plt.show()


def train_agent(agent, env, num_episodes, max_steps, batch_size, reward_dict, update_freq, sample_freq, sp_update_freq=10,
                win_threshold=0.8, display_progress=False, save_model=False, player_types=()):
    """Train ``agent`` inside ``env`` and optionally track its win rate.

    Args:
        agent: Learner exposing ``add_experience``, ``replay_experience``,
            ``replay_buffer``, ``epsilon``/``epsilon_decay``/``epsilon_min``,
            ``name``, ``model``, ``target_model`` and ``save_model``.
        env: Environment exposing ``reset()``, ``step(reward_dict)`` (returning
            ``(action, next_state, reward, done)``) and ``players``.
        num_episodes: Number of episodes to run.
        max_steps: Maximum number of environment steps per episode.
        batch_size: Minimum replay-buffer length before replaying, and the
            batch size handed to ``agent.replay_experience``.
        reward_dict: Reward weights forwarded to ``env.step``.
        update_freq: Replay experience every ``update_freq`` environment steps,
            counted across episodes.
        sample_freq: Evaluate the win rate every ``sample_freq`` episodes
            (only when ``display_progress`` is True).
        sp_update_freq: Number of consecutive evaluations that must all exceed
            ``win_threshold`` before opponents receive the agent's weights.
        win_threshold: Win-rate threshold used for the self-play update.
        display_progress: When True, evaluate, print and plot win rates.
        save_model: When True, save the model under ``models/`` and hand the
            same path to ``wandb.save``.
        player_types: Iterable of strings joined into the saved model's name.

    Returns:
        ``(total_rewards, win_rates)``: the summed reward of every episode and
        the win rates measured at each evaluation (empty when
        ``display_progress`` is False).
    """
    total_rewards = []
    win_rates = []
    steps_seen = 0  # environment steps taken across all episodes

    for episode in tqdm(range(num_episodes), desc='Episode Loop'):
        state = env.reset()
        done = False
        total_reward = 0
        step = 0

        # Strict '<' so an episode never runs more than max_steps steps.
        while not done and step < max_steps:
            # Environment stepping
            action, next_state, reward, done = env.step(reward_dict)

            # For adding experience
            agent.add_experience(state, action, reward, next_state, done)

            # Update the game state for the next iteration
            state = next_state
            total_reward += reward
            step += 1
            steps_seen += 1

            # Replaying experience. The gate uses the cumulative step count:
            # testing the episode's final step count after the episode ended
            # meant training only ran when an episode's length happened to be
            # a multiple of update_freq (and never for update_freq > max_steps).
            if len(agent.replay_buffer) >= batch_size and steps_seen % update_freq == 0:
                agent.replay_experience(batch_size, agent.name)

        # Decay epsilon
        agent.epsilon = max(agent.epsilon * agent.epsilon_decay, agent.epsilon_min)

        # Save and log the total reward for this episode
        total_rewards.append(total_reward)
        wandb.log({"total_reward": total_reward})

        if display_progress and episode % sample_freq == 0:
            winners = run_simulation(env.players)
            win_pct = sum(1 for winner in winners if winner == agent.name) / len(winners)

            # Log the win rate for this episode
            wandb.log({"win_rate": win_pct})
            win_rates.append(win_pct)

            # Print win rates
            print(f"Episode {episode + 1}/{num_episodes} - Win Rate: {win_pct} - Epsilon: {agent.epsilon}")

            # If last sp_update_freq win rates are all over threshold, update the other players to main agent's weights
            recent_win_rates = win_rates[-sp_update_freq:]
            if len(win_rates) >= sp_update_freq and all(rate > win_threshold for rate in recent_win_rates):
                _sync_opponents_with_agent(env, agent)

    if display_progress:
        # Plot the win rate over time
        _plot_win_rates(win_rates, sample_freq, env)

    if save_model:
        model_path = 'models/{0}-{1}-{2}'.format(len(env.players), num_episodes, ','.join(player_types))
        agent.save_model(model_path)
        wandb.save(model_path)

    return total_rewards, win_rates

In [None]:
def self_play_training(n, reward_dict, learning_rate, gamma, epsilon_decay, h_dim, h_layers, tau,
                        buffer_size, num_episodes, batch_size, max_steps_per_episode,
                        update_freq, sample_freq, sp_update_freq, win_threshold, history_length=5, display_progress=True):
    """Build an ``n``-player self-play setup and train 'Player 1' in it.

    One ``QLearningAgent`` is created per player, all with the same
    hyperparameters. Every player is wrapped in a ``Player`` that shares one
    strategy table, the players are placed in an ``Environment`` centred on
    'Player 1', and that first agent is trained with ``train_agent``.

    Args:
        n: Number of players in the game.
        reward_dict: Reward weights forwarded to ``train_agent``.
        learning_rate, gamma, epsilon_decay, h_dim, h_layers, tau, buffer_size:
            Hyperparameters forwarded to every ``QLearningAgent``.
        num_episodes, batch_size, max_steps_per_episode, update_freq,
        sample_freq, sp_update_freq, win_threshold, display_progress:
            Forwarded to ``train_agent``.
        history_length: Number of past turns included in the state vector.

    Returns:
        ``(total_rewards, win_rates)`` exactly as produced by ``train_agent``.
    """
    # Sizes of the network input/output.
    # NOTE(review): the constants below presumably mirror the state/turn
    # encoding implemented in rl_player -- confirm before changing them.
    turn_action_dim = 5
    block_size = 2
    turn_dim = turn_action_dim + n * 3 + block_size * 2
    state_dim = (10 + 11 * n) + (history_length * turn_dim)
    action_dim = 1 + 3 * n

    shared_hyperparams = dict(
        history_length=history_length,
        epsilon_decay=epsilon_decay,
        h_dim=h_dim,
        h_layers=h_layers,
        tau=tau,
        buffer_size=buffer_size,
    )

    def build_agent(name, main_flag):
        # main_flag is True only for 'Player 1' (sixth positional argument of QLearningAgent).
        return QLearningAgent(state_dim, action_dim, learning_rate, gamma, name, main_flag, **shared_hyperparams)

    # The main agent is created first, then the opponents in seat order.
    main_agent = build_agent('Player 1', True)
    opponent_agents = [build_agent(f'Player {seat}', False) for seat in range(2, n + 1)]

    # Strategy callbacks shared by every player.
    strategy_funcs = {
        'decision_fn': rltraining_decision,
        'block_fn': income_block,
        'dispose_fn': random_dispose,
        'keep_fn': random_keep,
    }

    players = [Player('Player 1', strategy_funcs, main_agent)]
    for seat, opponent_agent in enumerate(opponent_agents, start=2):
        players.append(Player(f'Player {seat}', strategy_funcs, opponent_agent))

    env = Environment('Player 1', players)

    total_rewards, win_rates = train_agent(
        main_agent, env, num_episodes, max_steps_per_episode, batch_size, reward_dict,
        update_freq=update_freq,
        sample_freq=sample_freq,
        sp_update_freq=sp_update_freq,
        win_threshold=win_threshold,
        display_progress=display_progress,
    )
    return total_rewards, win_rates


In [None]:
# Start a Weights & Biases run in the "rl-coup-agent-tuning" project for this notebook session.
# NOTE(review): train() further down opens its own run with wandb.init() for every
# sweep trial -- confirm that the extra run started here is intended.
import wandb
wandb.init(project="rl-coup-agent-tuning")

# Search space for the W&B hyperparameter sweep. Parameters given as
# {'min', 'max'} are sampled from a range; parameters given as {'values'} are
# picked from the listed candidates.
sweep_config = {
    'method': 'random',  # Can be "grid", "random", "bayes"
    'metric': {
      'name': 'win_rate',
      'goal': 'maximize'   
    },
    'parameters': {
        # --- agent / optimisation hyperparameters ---
        'learning_rate': {
            'min': 1e-4,
            'max': 1e-2
        },
        'update_freq': {
            'values': [1, 16, 32, 64, 128]
        },
        'gamma': {
            'values': [0.95, 0.99, 0.999]
        },
        'epsilon_decay': {
            'min': 0.99,
            'max': 0.9999
        },
        'h_dim': {
            'values': [128, 256, 512, 1024]
        },
        'h_layers': {
            'values': [2, 3, 4, 5, 6]
        },
        'tau': {
            'min': 1e-4,
            'max': 1e-2
        },
        'batch_size': {
            'values': [32, 64, 128, 1024, 4096]
        },
        'history_length': {
            'values': [10, 20, 30, 40, 50]
        },
        # Integer literals on purpose: the sampled value is forwarded unchanged
        # as the agent's buffer_size, and float capacities such as 1e4 are
        # rejected by integer-only consumers like collections.deque(maxlen=...).
        # NOTE(review): confirm how QLearningAgent consumes buffer_size.
        'buffer_size': {
            'values': [10_000, 100_000, 1_000_000, 10_000_000, 100_000_000]
        },

        # --- reward shaping weights (keys of reward_dict) ---
        'COIN_VALUE': {
            'min': 0.05,
            'max': 0.4
        },
        'CARD_VALUE': {
            'min': 0.3,
            'max': 2.0
        },
        # Lower bound written as a float so both bounds have the same type and
        # the range is unambiguously continuous.
        'CARD_DIVERSITY_VALUE': {
            'min': 0.0,
            'max': 0.5
        },
        'WIN_VALUE': {
            'min': 0.5,
            'max': 5.0
        },
    }
}

In [None]:
def train():
    """Run a single sweep trial using the hyperparameters in ``wandb.config``.

    Opens a wandb run, builds the reward weights and agent hyperparameters
    from the run's config, trains a 3-player self-play setup for 10000
    episodes, and finally logs the maximum and average of both the episode
    rewards and the sampled win rates.
    """
    # Initialize a new wandb run
    with wandb.init():
        cfg = wandb.config

        reward_names = ('COIN_VALUE', 'CARD_VALUE', 'CARD_DIVERSITY_VALUE', 'WIN_VALUE')
        reward_dict = {name: getattr(cfg, name) for name in reward_names}

        # Hyperparameters sampled by the sweep
        sampled = dict(
            learning_rate=cfg.learning_rate,
            gamma=cfg.gamma,
            epsilon_decay=cfg.epsilon_decay,
            h_dim=cfg.h_dim,
            h_layers=cfg.h_layers,
            tau=cfg.tau,
            buffer_size=cfg.buffer_size,
            batch_size=cfg.batch_size,
            update_freq=cfg.update_freq,
            history_length=cfg.history_length,
        )
        # Settings that stay fixed for every trial
        fixed = dict(
            n=3,
            num_episodes=10000,
            max_steps_per_episode=100,
            sp_update_freq=5,
            sample_freq=1000,
            win_threshold=0.9,
            display_progress=True,
        )

        # Call the training function with wandb's config
        total_rewards, win_rates = self_play_training(reward_dict=reward_dict, **sampled, **fixed)

        # Log maximum and average values of total_rewards, then of win_rates
        for max_key, avg_key, series in (
            ("max_total_reward", "avg_total_reward", total_rewards),
            ("max_win_rate", "avg_win_rate", win_rates),
        ):
            wandb.log({max_key: max(series)})
            wandb.log({avg_key: sum(series) / len(series)})



In [None]:
# Register the search space in sweep_config as a sweep of the
# "rl-coup-agent-tuning" project and keep the returned sweep id.
sweep_id = wandb.sweep(sweep_config, project="rl-coup-agent-tuning")
# Start a sweep agent in this kernel that calls train() for the trials it is given.
# NOTE(review): no `count` is passed, so with the 'random' method the agent
# presumably keeps launching trials until it is interrupted -- confirm and
# consider bounding the number of runs.
wandb.agent(sweep_id, train)