In [14]:
import argparse
import os
from copy import deepcopy
from typing import Optional, Tuple

import gymnasium as gym
import numpy as np
import torch
from pettingzoo.classic import hanabi_v4
from torch.utils.tensorboard import SummaryWriter

from tianshou.data import Collector, PrioritizedVectorReplayBuffer
from tianshou.env import DummyVectorEnv
from tianshou.env.pettingzoo_env import PettingZooEnv
from tianshou.policy import (
    BasePolicy,
    RainbowPolicy,
    MultiAgentPolicyManager,
    RandomPolicy,
)
from tianshou.utils.net.discrete import NoisyLinear
from tianshou.trainer import offpolicy_trainer
from tianshou.utils import TensorboardLogger
from tianshou.utils.net.common import Net
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
# Hyper-parameters for the Rainbow agent. Per the original author's note these
# are a copy of the Rainbow settings used in the DeepMind paper.
p = {
    # --- network / policy ---
    'hidden_layers': [128, 128],       # passed to Net as hidden_sizes
    'gamma': 0.99,                     # third positional argument of RainbowPolicy
    'lr': 1e-4,                        # initial Adam learning rate
    'target_update_freq': 500,         # RainbowPolicy target_update_freq
    'estimation_steps': 1,             # RainbowPolicy estimation_step
    # --- environments / replay ---
    'num_train': 32,                   # number of environments in train_envs
    'num_test': 16,                    # number of environments in test_envs
    'buffer_size': 50000,              # size given to PrioritizedVectorReplayBuffer
    # --- distributional / noisy-net settings ---
    'vmax': 25,                        # RainbowPolicy v_max
    'vmin': -25,                       # RainbowPolicy v_min
    'noisy_std': 0.1,                  # third argument of every NoisyLinear layer
    'atom_size': 51,                   # num_atoms for both Net and RainbowPolicy
    # --- training loop ---
    'minimum_replay_history': 500,     # steps collected by initialize_buffer
    'batch_size': 32,                  # batch size for policy.update
    'steps_per_collect': 4,            # n_step collected per training iteration
    'updates_per_train': 1,            # policy.update calls per training iteration
    'test_steps': 10000,               # n_step collected per evaluation
    'epochs': 10_000_000,              # iterations of the loop in train()
    'eps_decay_period': 20000,         # iterations over which get_eps decays epsilon
    'test_frequency': 1000,            # evaluate every this many iterations
    'test_eps': 0,                     # epsilon set on the policies while evaluating
    'save_frequency': 10000,           # checkpoint every this many iterations
    'eps_final': 0.01,                 # epsilon returned once the decay is over
    'adam_eps': 3.125e-5,              # eps argument of torch.optim.Adam
    # --- output / lr schedule ---
    'path': 'results/hanabi_small_128_size2/',   # prefix for every saved artefact
    'lr_scheduler_factor': 0.1,        # ReduceLROnPlateau factor
    'lr_scheduler_patience': 20,       # ReduceLROnPlateau patience
}


In [20]:
def get_env(render_mode=None):
    """Create one small two-player Hanabi game wrapped for tianshou.

    The game uses 2 colors, 5 ranks, hands of 2 cards, 3 information tokens
    and 1 life token, with ``observation_type=1``.

    Args:
        render_mode: Accepted for call compatibility; it is not forwarded to
            the underlying environment.

    Returns:
        A ``PettingZooEnv`` wrapping ``hanabi_v4.env(...)``.
    """
    game_settings = dict(
        colors=2,
        ranks=5,
        players=2,
        hand_size=2,
        max_information_tokens=3,
        max_life_tokens=1,
        observation_type=1,
    )
    return PettingZooEnv(hanabi_v4.env(**game_settings))

In [21]:
def get_agents(p):
    """Build the Rainbow policy, the agent ids and the vectorised environments.

    A single ``RainbowPolicy`` instance is created and registered for both
    seats of the game, so the two players share one network and optimizer.

    Args:
        p: Hyper-parameter dict. Reads ``noisy_std``, ``hidden_layers``,
            ``atom_size``, ``lr``, ``adam_eps``, ``lr_scheduler_factor``,
            ``lr_scheduler_patience``, ``gamma``, ``vmin``, ``vmax``,
            ``estimation_steps``, ``target_update_freq``, ``num_train`` and
            ``num_test``.

    Returns:
        Tuple ``(policy, agents, train_envs, test_envs, lr_scheduler)`` where
        ``policy`` is the ``MultiAgentPolicyManager``, ``agents`` is
        ``env.agents`` of a probe environment, the two env objects are
        ``DummyVectorEnv`` instances and ``lr_scheduler`` is a
        ``ReduceLROnPlateau`` (mode ``'max'``) bound to the Adam optimizer.
    """

    def _make_noisy(in_features, out_features):
        # Layer factory handed to Net so both dueling heads use NoisyLinear.
        return NoisyLinear(in_features, out_features, p['noisy_std'])

    # A throw-away environment, used only to read the spaces and agent ids.
    probe_env = get_env()
    if isinstance(probe_env.observation_space, gym.spaces.Dict):
        obs_space = probe_env.observation_space['observation']
    else:
        obs_space = probe_env.observation_space

    state_shape = obs_space.shape or obs_space.n
    action_shape = probe_env.action_space.shape or probe_env.action_space.n
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    network = Net(
        state_shape,
        action_shape,
        hidden_sizes=p['hidden_layers'],
        device=device,
        softmax=True,
        num_atoms=p['atom_size'],
        dueling_param=(
            {'linear_layer': _make_noisy},
            {'linear_layer': _make_noisy},
        ),
    )

    optimizer = torch.optim.Adam(network.parameters(), lr=p['lr'], eps=p['adam_eps'])
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=p['lr_scheduler_factor'],
        patience=p['lr_scheduler_patience'],
    )

    shared_agent = RainbowPolicy(
        network,
        optimizer,
        p['gamma'],
        num_atoms=p['atom_size'],
        v_min=p['vmin'],
        v_max=p['vmax'],
        estimation_step=p['estimation_steps'],
        target_update_freq=p['target_update_freq'],
    ).to(device)

    # The same policy object plays both seats.
    policy = MultiAgentPolicyManager([shared_agent, shared_agent], probe_env)
    agent_ids = probe_env.agents

    train_envs = DummyVectorEnv([get_env] * p['num_train'])
    test_envs = DummyVectorEnv([get_env] * p['num_test'])

    return policy, agent_ids, train_envs, test_envs, lr_scheduler

In [22]:
def get_collectors(policy, train_envs, test_envs, p):
    """Create the training and evaluation collectors.

    Args:
        policy: Policy used by both collectors.
        train_envs: Vectorised training environments; ``len(train_envs)`` is
            used as the number of sub-buffers of the replay buffer.
        test_envs: Vectorised evaluation environments.
        p: Hyper-parameter dict; reads ``buffer_size``.

    Returns:
        Tuple ``(train_collector, test_collector)``. The training collector
        stores into a ``PrioritizedVectorReplayBuffer`` (alpha 0.6, beta 0.4,
        weight normalisation on); the test collector is given no buffer.
        Both are created with ``exploration_noise=True``.
    """
    replay_buffer = PrioritizedVectorReplayBuffer(
        p['buffer_size'],
        len(train_envs),
        alpha=0.6,
        beta=0.4,
        weight_norm=True,
    )
    train_collector = Collector(policy, train_envs, replay_buffer, exploration_noise=True)
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    return train_collector, test_collector

In [23]:
def initialize_buffer(train_collector, agents, policy, p):
    """Pre-fill the replay buffer before training starts.

    Epsilon is set to 1 on every agent's policy, then
    ``p['minimum_replay_history']`` steps are collected.

    Args:
        train_collector: Collector whose ``collect`` is called.
        agents: Iterable of agent ids used as keys into ``policy.policies``.
        policy: Object exposing a ``policies`` mapping of per-agent policies.
        p: Hyper-parameter dict; reads ``minimum_replay_history``.
    """
    for agent_id in agents:
        per_agent_policy = policy.policies[agent_id]
        per_agent_policy.set_eps(1)
    warmup_steps = p['minimum_replay_history']
    train_collector.collect(n_step=warmup_steps)

In [24]:
def save_policy(policy, agents, p):
    """Write each agent's policy ``state_dict`` to ``<p['path']><agent>_params.pth``.

    The parent directory of each target file is created if it does not exist,
    so the first checkpoint of a fresh run does not fail.

    Args:
        policy: Object exposing a ``policies`` mapping of per-agent policies.
        agents: Iterable of agent ids used as keys into ``policy.policies``.
        p: Hyper-parameter dict; ``p['path']`` is used as a plain string
            prefix for the file name.
    """
    for a in agents:
        target = f'{p["path"]}{a}_params.pth'
        # dirname is '' when the prefix has no directory part; fall back to cwd.
        os.makedirs(os.path.dirname(target) or '.', exist_ok=True)
        torch.save(policy.policies[a].state_dict(), target)

def save_history(history, p):
    """Save the evaluation history to ``<p['path']>training_rewards.npy``.

    The parent directory is created if it does not exist.

    Args:
        history: Sequence of evaluation scores; converted with ``np.array``.
        p: Hyper-parameter dict; ``p['path']`` is used as a plain string
            prefix for the file name.
    """
    target = f'{p["path"]}training_rewards.npy'
    # dirname is '' when the prefix has no directory part; fall back to cwd.
    os.makedirs(os.path.dirname(target) or '.', exist_ok=True)
    np.save(target, np.array(history))
    
def change_lr(optimizer, new_lr):
    """Set the learning rate of every parameter group of ``optimizer``.

    Args:
        optimizer: Object exposing ``param_groups``, a list of dicts.
        new_lr: Value written to the ``'lr'`` entry of each group.
    """
    for group in optimizer.param_groups:
        group.update(lr=new_lr)

In [25]:
def get_eps(iteration, p):
    """Return the exploration epsilon for a training iteration.

    Epsilon decays linearly from 1 at iteration 0 to ``p['eps_final']`` at
    iteration ``p['eps_decay_period']`` and stays at ``p['eps_final']``
    afterwards.

    Args:
        iteration: Current training iteration.
        p: Hyper-parameter dict; reads ``eps_final`` and ``eps_decay_period``.

    Returns:
        The epsilon value for ``iteration``.
    """
    eps_final = p['eps_final']
    decay_period = p['eps_decay_period']
    # A non-positive period means "no decay phase"; returning early also
    # avoids dividing by zero below. Using >= makes the boundary iteration
    # return eps_final exactly instead of a float approximation of it.
    if decay_period <= 0 or iteration >= decay_period:
        return eps_final
    slope = (1 - eps_final) / decay_period
    return 1 - slope * iteration
        
def set_eps(policy, agents, new_eps):
    """Apply ``new_eps`` to the policy of every agent.

    Args:
        policy: Object exposing a ``policies`` mapping of per-agent policies.
        agents: Iterable of agent ids used as keys into ``policy.policies``.
        new_eps: Value passed to each per-agent policy's ``set_eps``.
    """
    per_agent = policy.policies
    for agent_id in agents:
        per_agent[agent_id].set_eps(new_eps)
        
def train(
    policy,
    train_collector,
    test_collector,
    agents,
    p,
    lr_scheduler,
    training_history = None
):
    """Run the collect / evaluate / checkpoint / update loop.

    Each of the ``p['epochs']`` iterations:

    1. sets the training epsilon from ``get_eps`` and collects
       ``p['steps_per_collect']`` steps into the training buffer;
    2. every ``p['test_frequency']`` iterations, evaluates with epsilon
       ``p['test_eps']`` for ``p['test_steps']`` steps, prints and records the
       mean of ``result['rews']``, restores the training epsilon and steps the
       learning-rate scheduler with that mean;
    3. every ``p['save_frequency']`` iterations, saves the policies, the
       history and the training curve;
    4. performs ``p['updates_per_train']`` policy updates of size
       ``p['batch_size']`` from the training buffer.

    After the loop the final curve is plotted and saved.

    Args:
        policy: Multi-agent policy exposing ``policies`` and ``update``.
        train_collector: Collector used for training data; its ``buffer`` is
            the source for updates.
        test_collector: Collector used for evaluation.
        agents: Iterable of agent ids.
        p: Hyper-parameter dict.
        lr_scheduler: Scheduler whose ``step`` receives the evaluation mean.
        training_history: Optional list of earlier evaluation means; new
            values are appended to it in place. A new list is created when
            omitted (a mutable default would be shared between calls).

    Returns:
        The list of evaluation means (the same object as ``training_history``
        when one was passed in).
    """
    if training_history is None:
        training_history = []

    for i in tqdm(range(p['epochs'])):

        eps = get_eps(i, p)
        set_eps(policy, agents, eps)

        # Collection step
        train_collector.collect(n_step = p['steps_per_collect'])

        # Test step
        if i % p['test_frequency'] == 0:
            set_eps(policy, agents, p['test_eps'])
            result = test_collector.collect(n_step = p['test_steps'])
            mean_reward = result['rews'].mean()
            tqdm.write(str(mean_reward))
            training_history.append(mean_reward)
            # Restore the exploration rate used for training.
            set_eps(policy, agents, eps)
            lr_scheduler.step(mean_reward)

        # Checkpoint step
        if i % p['save_frequency'] == 0:
            save_policy(policy, agents, p)
            save_history(training_history, p)
            plot_and_save(training_history, p['test_frequency'], p, show = False)

        # Update step
        for _ in range(p['updates_per_train']):
            policy.update(p['batch_size'], train_collector.buffer)

    # The original call referenced an undefined `test_frequency` and omitted
    # the required `p` argument, so it raised once training finished.
    plot_and_save(training_history, p['test_frequency'], p)
    return training_history
        

In [26]:
def plot_and_save(training_history, test_frequency, p, save = True, show = True):
    """Plot the evaluation curve and optionally save and/or show it.

    The x axis is the index of each history entry multiplied by
    ``test_frequency``.

    Args:
        training_history: Sequence of evaluation scores.
        test_frequency: Number of training iterations between evaluations.
        p: Hyper-parameter dict; ``p['path']`` is the string prefix of the
            output file ``training_curve.png``.
        save: When true, write the figure to disk (the parent directory is
            created if missing).
        show: When true, call ``plt.show()``; otherwise close the figure.
    """
    # Multiply out of place so a non-integer test_frequency does not raise.
    x = np.arange(len(training_history)) * test_frequency

    # A dedicated figure keeps curves from earlier calls from being overlaid.
    fig, ax = plt.subplots()
    ax.plot(x, training_history)
    ax.set_title('Combined Average Score (Rainbow, 2 Color game)')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Average Score (max 10)')

    if save:
        target = f'{p["path"]}training_curve.png'
        # dirname is '' when the prefix has no directory part; fall back to cwd.
        os.makedirs(os.path.dirname(target) or '.', exist_ok=True)
        fig.savefig(target)

    if show:
        plt.show()
    else:
        plt.close(fig)
        
def load(policy, agents, p):
    """Restore each agent's policy weights and return the saved history.

    Args:
        policy: Object exposing a ``policies`` mapping of per-agent policies.
        agents: Iterable of agent ids used as keys into ``policy.policies``.
        p: Hyper-parameter dict; ``p['path']`` is the string prefix of the
            ``<agent>_params.pth`` and ``training_rewards.npy`` files.

    Returns:
        The contents of ``training_rewards.npy`` as a list.
    """
    for a in agents:
        # map_location='cpu' lets a checkpoint written on a GPU be restored on
        # a CPU-only machine; load_state_dict then copies the values into the
        # policy's existing parameters.
        # NOTE: torch.load unpickles the file - only load checkpoints you trust.
        state = torch.load(f'{p["path"]}{a}_params.pth', map_location='cpu')
        policy.policies[a].load_state_dict(state)
    his = list(np.load(f'{p["path"]}training_rewards.npy'))
    return his

In [27]:
# Build the policy, agent ids, environments and lr scheduler, then the
# collectors, and finally pre-fill the replay buffer.
(policy, agents, train_envs, test_envs, lr_scheduler) = get_agents(p)
(train_collector, test_collector) = get_collectors(
    policy=policy,
    train_envs=train_envs,
    test_envs=test_envs,
    p=p,
)
initialize_buffer(
    train_collector=train_collector,
    agents=agents,
    policy=policy,
    p=p,
)



In [28]:
# Start a fresh run with an empty evaluation history.
# To resume from the files saved under p['path'] instead, use:
#   training_history = load(policy, agents, p)
training_history = list()

In [29]:
# Run the main loop; evaluation means are appended to training_history.
train(
    policy,
    train_collector,
    test_collector,
    agents,
    p,
    lr_scheduler,
    training_history=training_history,
)

  0%|          | 0/10000000 [00:00<?, ?it/s]



0.0




0.2967171717171717
0.921875
1.0133779264214047
1.0501672240802675
0.9965753424657534
1.0203389830508474
0.9900662251655629
1.04
0.9965635738831615
1.0
1.0
1.0433333333333332
0.9895104895104895
1.0209059233449478
1.246875
1.1580756013745706
1.0172413793103448
1.4205298013245033
1.3395638629283488
1.4746666666666666
1.4152744630071599
1.70316301703163
1.677115987460815
1.5325581395348837
1.7335329341317365
1.6155555555555556
1.8329297820823245
1.4025974025974026
1.7112299465240641
1.523961661341853
1.0171821305841924
1.7681940700808625
1.9390862944162437
1.6936708860759493
1.8058035714285714
1.6377708978328174
1.8386243386243386
1.818421052631579
1.3559322033898304
1.7135678391959799
1.391304347826087
1.7557603686635945
1.6645367412140575
1.8115942028985508
1.8640226628895185
1.8918918918918919
2.0028248587570623
1.9835164835164836
2.1810089020771515
1.9586374695863746
2.0631868131868134
2.1440922190201728
1.7917888563049853
2.3382789317507418
2.1715817694369974
2.1910994764397906
2.0784

5.340490797546012
5.226993865030675
5.219512195121951
5.382978723404255
5.275757575757575
5.215151515151515
5.324159021406728
5.288753799392097
5.126888217522659
5.222560975609756
5.324159021406728
5.234939759036145
5.245454545454545
5.306990881458966
5.3689024390243905
5.273006134969325
5.335365853658536
5.3003003003003
5.300613496932515
5.25
5.290909090909091
5.27355623100304
5.296636085626911
5.166163141993958
5.357575757575757
5.2926829268292686
5.313069908814589
5.335365853658536
5.371951219512195
5.265861027190332
5.211656441717792
5.323262839879154
5.294478527607362
5.327327327327327
5.247706422018348
5.325227963525836
5.285276073619632
5.334346504559271
5.234939759036145
5.296636085626911
5.182926829268292
5.287878787878788
5.245454545454545
5.134969325153374
5.254545454545455
5.317073170731708
5.27355623100304
5.218461538461538
5.234756097560975
5.278787878787878
5.2782874617737
5.128048780487805
5.274390243902439
5.295731707317073
5.348484848484849
5.355623100303951
5.2909090

KeyboardInterrupt: 