<a href="https://colab.research.google.com/github/m-wallner/deep-rl-dqn-pommerman/blob/main/deep-rl-dqn-pommerman.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites

### Download and install env

In [None]:
# Python dependencies for the notebook. torch/torchvision/torchaudio are pinned
# to matching releases (the "+cu111" wheels come from the extra index given by -f).
!pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 torchaudio==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html
!pip install onnx==1.8.1
!pip install onnx2pytorch==0.2.0
!pip install onnxruntime==1.2.0
!pip install tqdm
# torchvision is already installed (pinned) by the first command, so it is not
# installed a second time here.
!pip install seaborn
# "sklearn" on PyPI is a deprecated placeholder package; the real distribution
# is called "scikit-learn".
!pip install scikit-learn
!pip install numpy
!pip install opencv-python

In [None]:
 # Install the Pommerman playground from source: clone it into a scratch
 # directory, pip-install it, then delete the clone again.
!git clone https://github.com/MultiAgentLearning/playground ./pommer_setup
!pip install -U ./pommer_setup
!rm -rf ./pommer_setup

# Same clone / install / clean-up sequence for the graphic_pomme_env package
# (imported further down as `graphic_pomme_env`).
!git clone https://github.com/RLCommunity/graphic_pomme_env ./graphic_pomme_env
!pip install -U ./graphic_pomme_env
!rm -rf ./graphic_pomme_env

### Imports and Constants

In [None]:
import os
import random
import shutil
from collections import deque, namedtuple
from functools import partial
from itertools import chain
from time import strftime, time

import numpy as np
from tqdm import trange

import torch
import torch.nn as nn
import torch.optim as optim

from PIL import Image
import cv2

import gym
from gym import Env, Wrapper
from gym import logger as gymlogger

from pommerman import make
from pommerman.agents import BaseAgent, RandomAgent, SimpleAgent
from graphic_pomme_env import graphic_pomme_env
from graphic_pomme_env.wrappers import PommerEnvWrapperFrameSkip2

import onnx
from onnx2pytorch import ConvertModel

import IPython.display
from IPython.display import clear_output

# Collect and print the ids of all registered gym environments whose id starts
# with 'Pomme' (a quick check that the environment packages were installed).
pomenvs = [es.id for es in gym.envs.registry.all() if es.id.startswith('Pomme')]
print("\n".join(pomenvs))
# NOTE(review): presumably loads the assets the graphic environment renders
# with; `res` is not referenced again in the visible notebook — confirm.
res = graphic_pomme_env.load_resources()

In [None]:
# Colab-only: mount Google Drive so checkpoints can be read from / written to it.
from google.colab import drive
drive.mount('/content/gdrive')

# Directory (on the mounted Drive) that the training and evaluation cells use
# for model checkpoints and ONNX exports.
data_path = '/content/gdrive/My Drive/Colab Notebooks/data/DRL/C4'

In [None]:
# One replay-buffer entry: (state, action, reward, next_state, done).
Transition = namedtuple("Transition", "state action reward next_state done")

# Pick the compute device once; "cuda" when a GPU is visible, "cpu" otherwise.
USE_CUDA = torch.cuda.is_available()
device = "cuda" if USE_CUDA else "cpu"
device

In [None]:
# Print the GPU status reported by nvidia-smi for this runtime.
!nvidia-smi

### Helper Code

In [None]:
class Logger():
  """Collects named metric series in memory and appends them to CSV-like log files.

  Each Logger owns a timestamped run directory ``<logdir>/<YYYY-mm-ddTHH-MM-SS>``
  containing a ``logs`` sub-directory with one ``<name>.log`` file per metric.

  Requires ``os``, ``shutil`` and ``time.strftime`` / ``time.time`` to be
  imported at module level.
  """

  def __init__(self, logdir, params=None):
      """Create the run directory and optionally copy a parameter file into it.

      Args:
          logdir: Parent directory under which the timestamped run directory is created.
          params: Optional path to an existing file; when it exists it is copied
              to ``param_file`` (``params.pkl`` inside the run directory).
      """
      self.basepath = os.path.join(logdir, strftime("%Y-%m-%dT%H-%M-%S"))
      # log_dir lives inside basepath, so this single call creates both.
      os.makedirs(self.log_dir, exist_ok=True)
      if params is not None and os.path.exists(params):
          shutil.copyfile(params, self.param_file)
      self.log_dict = {}   # metric name -> list of (index, timestamp, value) rows
      self.dump_idx = {}   # metric name -> index of the last row written to disk

  @property
  def param_file(self):
      """Path of the parameter file inside the run directory."""
      return os.path.join(self.basepath, "params.pkl")

  @property
  def onnx_file(self):
      """Path of the ONNX model file inside the run directory."""
      return os.path.join(self.basepath, "model.onnx")

  @property
  def log_dir(self):
      """Directory that holds the per-metric ``.log`` files."""
      return os.path.join(self.basepath, "logs")

  def log(self, name, value):
      """Append ``value`` to the series ``name`` together with its index and a timestamp."""
      if name not in self.log_dict:
          self.log_dict[name] = []
          self.dump_idx[name] = -1
      self.log_dict[name].append((len(self.log_dict[name]), time(), value))

  def get_values(self, name):
      """Return the logged values of ``name`` in insertion order, or None if never logged."""
      if name in self.log_dict:
          return [x[2] for x in self.log_dict[name]]
      return None

  def dump(self):
      """Append every not-yet-written row of each series to ``<log_dir>/<name>.log``.

      Rows are written as comma-separated ``index,timestamp,value`` lines. Only
      the rows added since the previous dump are visited, so repeated calls do
      not rescan (or rewrite) the whole history.
      """
      for name, rows in self.log_dict.items():
          first_new = self.dump_idx[name] + 1
          if first_new >= len(rows):
              continue  # nothing new for this series
          with open(os.path.join(self.log_dir, name + ".log"), "a") as f:
              f.writelines(",".join(str(x) for x in row) + "\n" for row in rows[first_new:])
          self.dump_idx[name] = len(rows) - 1


def plot_metrics(logger):
  """Plot the training/validation curves stored in ``logger``.

  Reads the series "training_loss", "training_entropy", "validation_loss" and
  "validation_accuracy" through ``logger.get_values`` and draws three panels:
  training and validation loss overlaid (separate x axes), training entropy,
  and validation accuracy.

  Args:
      logger: Object exposing ``get_values(name)`` that returns a list of
          values, or None when the series was never logged.
  """
  # Imported locally because the notebook's import cell does not import pyplot;
  # without this the function fails with a NameError on `plt`.
  import matplotlib.pyplot as plt

  def series(name):
      # A series that was never logged comes back as None; plot it as empty
      # instead of handing None to matplotlib.
      values = logger.get_values(name)
      return [] if values is None else values

  train_loss = series("training_loss")
  train_entropy = series("training_entropy")
  val_loss = series("validation_loss")
  val_acc = series("validation_accuracy")

  fig = plt.figure(figsize=(15,5))
  # ax1 and ax2 share the first subplot slot: ax2 is a frameless overlay so the
  # validation loss can use its own (top) x axis.
  ax1 = fig.add_subplot(131, label="train")
  ax2 = fig.add_subplot(131, label="val",frame_on=False)
  ax4 = fig.add_subplot(132, label="entropy")
  ax3 = fig.add_subplot(133, label="acc")

  ax1.plot(train_loss, color="C0")
  ax1.set_ylabel("Loss")
  ax1.set_xlabel("Update (Training)", color="C0")
  ax1.xaxis.grid(False)
  ax1.set_ylim((0,4))

  ax2.plot(val_loss, color="C1")
  ax2.xaxis.tick_top()
  ax2.yaxis.tick_right()
  ax2.set_xlabel('Epoch (Validation)', color="C1")
  ax2.xaxis.set_label_position('top')
  ax2.xaxis.grid(False)
  ax2.get_yaxis().set_visible(False)
  ax2.set_ylim((0,4))  # same y range as ax1 so both loss curves are comparable

  ax4.plot(train_entropy, color="C3")
  ax4.set_xlabel('Update (Training)', color="black")
  ax4.set_ylabel("Entropy", color="C3")
  ax4.tick_params(axis='x', colors="black")
  ax4.tick_params(axis='y', colors="black")
  ax4.xaxis.grid(False)

  ax3.plot(val_acc, color="C2")
  ax3.set_xlabel("Epoch (Validation)", color="black")
  ax3.set_ylabel("Accuracy", color="C2")
  ax3.tick_params(axis='x', colors="black")
  ax3.tick_params(axis='y', colors="black")
  ax3.xaxis.grid(False)
  ax3.set_ylim((0,1))

  fig.tight_layout(pad=2.0)
  plt.show()

def save_as_onnx(torch_model, sample_input, model_path):
  """Export ``torch_model`` to an ONNX file at ``model_path``.

  Args:
      torch_model: The model to export.
      sample_input: Example model input (or a tuple for multiple inputs).
      model_path: Destination; a file path or a file-like object.
  """
  export_options = dict(
      export_params=True,        # store the trained weights inside the model file
      opset_version=10,          # ONNX opset version to export with
      do_constant_folding=True,  # run constant folding as an optimisation
  )
  torch.onnx.export(torch_model, sample_input, f=model_path, **export_options)
def print_num_trainable_params(net):
  """Print how many scalar parameters of ``net`` have ``requires_grad`` set."""
  total = 0
  for param in net.parameters():
      if param.requires_grad:
          total += param.numel()
  print("Trainable Parameters: {}".format(total))

def make_video(list_of_observations_of_a_player, prefix):
    """Write a sequence of frames to ``<prefix>-video.avi`` with OpenCV.

    Args:
        list_of_observations_of_a_player: Non-empty sequence of image arrays
            whose ``shape`` unpacks as (height, width, channels). The frame
            size of the video is taken from the first image.
        prefix: Prefix of the output file name.

    The writer is opened with fourcc code 0 at 3 frames per second.
    """
    images = list_of_observations_of_a_player
    height, width, _ = images[0].shape
    video_name = f'{prefix}-video.avi'
    video = cv2.VideoWriter(video_name, 0, 3, (width, height))
    try:
        for image in images:
            video.write(image)
    finally:
        # Release first so the file is finalised even if a frame could not be
        # written or the window clean-up below fails.
        video.release()
        cv2.destroyAllWindows()

### Some useful opponent actor functions


In [None]:
def idle_actor(frame_stack):
    """Opponent policy that ignores its observation and always returns action 0."""
    stop_action = 0
    del frame_stack  # the observation is deliberately unused
    return stop_action
  
def random_actor(frame_stack):
    """Opponent policy that ignores its observation and draws a uniform action.

    The action is an integer in ``[0, NUM_ACTIONS)``; ``NUM_ACTIONS`` is a
    notebook-level constant that must be defined before this is called.
    """
    del frame_stack  # the observation is deliberately unused
    sampled_action = np.random.randint(NUM_ACTIONS)
    return sampled_action

def no_bomb_random_actor(frame_stack):
    """Opponent policy that draws a uniform action from all but the last action id.

    The result lies in ``[0, NUM_ACTIONS - 1)``, so the highest action id is
    never returned; ``NUM_ACTIONS`` is a notebook-level constant.
    """
    del frame_stack  # the observation is deliberately unused
    upper_bound = NUM_ACTIONS - 1
    return np.random.randint(upper_bound)
  
def model_actor(frame_stack, model):
    """Greedy policy: return the index of the largest output of ``model``.

    Args:
        frame_stack: Object whose ``get_observation()`` result can be turned
            into a numpy array.
        model: Callable that maps the observation tensor to a tensor of scores.

    Returns:
        The argmax over the flattened model output.
    """
    observation = np.array(frame_stack.get_observation())
    scores = model(torch.from_numpy(observation))
    return np.argmax(scores.detach().cpu().numpy())


### Replay buffer (uniform sampling)

In [None]:
class ReplayBuffer():
  """Fixed-capacity FIFO store of ``Transition`` tuples with uniform sampling.

  Args:
      num_actions: Size of the action space; used to one-hot encode sampled actions.
      memory_len: Maximum number of transitions kept; the oldest one is
          dropped when a new transition would exceed this.
      device: Device the sampled tensors are moved to. Defaults to CUDA when
          available and CPU otherwise, so the buffer also works on CPU-only
          runtimes.
  """

  def __init__(self, num_actions, memory_len = 10000, device = None):
      self.memory_len = memory_len
      self.transition = []
      self.num_actions = num_actions
      if device is None:
          device = "cuda" if torch.cuda.is_available() else "cpu"
      self.device = torch.device(device)

  def add(self, state, action, reward, next_state, done):
      """Store one transition, evicting the oldest one when the buffer is full."""
      # `>=` (not `>`) keeps the buffer at most `memory_len` long; with `>` it
      # would grow to memory_len + 1 entries.
      if self.transition and self.length() >= self.memory_len:
        self.remove()
      self.transition.append(Transition(state, action, reward, next_state, done))

  def sample_batch(self, batch_size = 32):
      """Draw ``batch_size`` distinct transitions uniformly at random.

      Returns:
          Tuple ``(states, actions, rewards, next_states, dones)`` where
          ``states`` / ``next_states`` are numpy arrays, ``actions`` is a
          one-hot tensor of shape (batch_size, num_actions), ``rewards`` is a
          tensor built from the stored rewards and ``dones`` is an integer
          0/1 tensor. The tensors live on ``self.device``.

      Raises:
          ValueError: If fewer than ``batch_size`` transitions are stored
              (raised by ``random.sample``).
      """
      minibatch = random.sample(self.transition, batch_size)

      states_mb, a_, reward_mb, next_states_mb, done_mb = map(np.array, zip(*minibatch))

      mb_reward = torch.from_numpy(reward_mb).to(self.device)
      mb_done = torch.from_numpy(done_mb.astype(int)).to(self.device)

      # One-hot encode the action indices: row i has a 1 in column a_[i].
      a_mb = np.zeros((a_.size, self.num_actions))
      a_mb[np.arange(a_.size), a_] = 1
      mb_a = torch.from_numpy(a_mb).to(self.device)

      return states_mb, mb_a, mb_reward, next_states_mb, mb_done

  def length(self):
      """Number of transitions currently stored."""
      return len(self.transition)

  def remove(self):
      """Drop the oldest stored transition."""
      self.transition.pop(0)

### DQN network

In [None]:
class DQNNetwork(nn.Module):
    """Convolutional Q-network: a stack of frames in, one Q-value per action out.

    Args:
        num_stack: Number of input channels (stacked frames).
        num_actions: Number of outputs (one Q-value per action).
    """

    def __init__(self, num_stack, num_actions):
        super().__init__()

        # CNN feature extractor; the global average pool reduces the final
        # feature map to a single 64-dimensional vector per sample.
        self.features = nn.Sequential(
            nn.Conv2d(num_stack, 32, 8, 4),
            nn.ReLU(),
            nn.Conv2d(32, 64, 4, 2),
            nn.ReLU(),
            nn.Conv2d(64, 64, 3, 1),
            nn.AdaptiveAvgPool2d(1)
        )

        # FC head mapping the pooled features to the Q-values.
        self.fc = nn.Sequential(
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, num_actions)
        )

    def forward(self, x):
        """Return Q-values for all actions, shape (batch, num_actions).

        Args:
            x: Tensor, numpy array or nested sequence of frames, either 3-D
                (a single observation, which gets a batch axis added) or
                4-D (a batch). It is converted to float32 on the device the
                network's parameters live on.
        """
        param_device = next(self.parameters()).device
        if isinstance(x, torch.Tensor):
            # `.to` keeps the tensor part of the traced graph; wrapping it in
            # `torch.tensor(...)` would copy it and make the ONNX tracer record
            # the example input as a constant.
            x = x.to(device=param_device, dtype=torch.float)
        else:
            # Stack via numpy first: building a tensor directly from a list of
            # arrays is much slower.
            x = torch.as_tensor(np.asarray(x), dtype=torch.float, device=param_device)
        if x.dim() == 3:
            x = x.unsqueeze(dim=0)
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

### Update target network via soft update

In [None]:
@torch.no_grad()  # the blend must not be recorded by autograd
def soft_update(local_model, target_model, tau):
    """Blend the local model's parameters into the target model in place.

    θ_target = τ*θ_local + (1 - τ)*θ_target

    Parameters are paired in the order ``parameters()`` yields them, so both
    models are expected to have the same architecture.
    """
    keep_fraction = 1 - tau
    for source_param, target_param in zip(local_model.parameters(),
                                          target_model.parameters()):
        target_param.mul_(keep_fraction)       # (1 - τ) * θ_target
        target_param.add_(tau * source_param)  # + τ * θ_local

### Hyperparameters

In [None]:
# --- Environment ---------------------------------------------------------
N_PLAYERS = 2
NUM_STACK = 5
NUM_ACTIONS = 6
# Action ids:
#   0 Stop
#   1 Up
#   2 Down
#   3 Left
#   4 Right
#   5 Bomb
board = 'GraphicOVOCompact-v0'

# --- Training schedule ---------------------------------------------------
num_episodes = 40000        # number of episodes to run the algorithm
buffer_size = 3 * 10 ** 5   # size of the replay buffer
minibatch_size = 128        # size of the sampled minibatch
update_after = 2000         # start updating once the buffer holds more than this many steps
timesteps = 0               # counter for the number of environment steps
eval_episode = 100
num_eval = 10

# --- Optimisation --------------------------------------------------------
gamma = 0.99                # discount factor
tau = 1e-3                  # soft-update rate of the target network
learning_rate = 1e-5

# --- Exploration (epsilon-greedy, annealed linearly over time) -----------
epsilon = 1.0               # initial probability of selecting a random action
epsilon_ub = 1.0            # upper bound (start value) of epsilon
epsilon_lb = 0.02           # lower bound epsilon is clipped to
epsilon_decay = 10 ** 5     # epsilon drops by timesteps / epsilon_decay

### Training

In [None]:
# Train the agent using DQN for Pommerman.
returns = []                      # return of every finished episode
returns_500 = deque(maxlen=500)   # sliding window used for the win-rate printout
losses = []
buffer = ReplayBuffer(num_actions=NUM_ACTIONS, memory_len=buffer_size)

dqn = DQNNetwork(num_stack=NUM_STACK, num_actions=NUM_ACTIONS).to(device)
dqn_target = DQNNetwork(num_stack=NUM_STACK, num_actions=NUM_ACTIONS).to(device)
optimizer = optim.Adam(dqn.parameters(), lr=learning_rate)
mse = torch.nn.MSELoss()

# Resume from a checkpoint when it exists, otherwise start from scratch.
model_file = os.path.join(data_path, 'model_episode_19000.tar')
if os.path.exists(model_file):
  checkpoint = torch.load(model_file, map_location=device)
  dqn.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  episode = checkpoint['episode']
  loss = checkpoint['loss']
  # The checkpoint was written after `episode` finished, so continue with the
  # next one instead of repeating it (and overwriting the file just loaded).
  first_episode = episode + 1
else:
  print(f'No checkpoint found at {model_file}; training from scratch.')
  episode = 0
  loss = None
  first_episode = 0

# The target network must start as an exact copy of the online network;
# otherwise the first TD targets come from unrelated random weights.
dqn_target.load_state_dict(dqn.state_dict())

dqn.train()

# NOTE(review): the step counter restarts at 0 even when resuming, so epsilon is
# annealed from epsilon_ub again — confirm this is intended.
timesteps = 0
for i in trange(first_episode, num_episodes):

  # Fresh Pommerman environment per episode, with start_pos 0 or 1 at random.
  start_pos = int(np.random.choice([0, 1], p=[0.5, 0.5]))
  env = PommerEnvWrapperFrameSkip2(num_stack=NUM_STACK, start_pos=start_pos, board=board)
  state = env.reset()[0]

  ret = 0
  done = False
  while not done:
    # Linearly decay epsilon with the number of steps taken, clipped at epsilon_lb.
    epsilon = max(epsilon_lb, epsilon_ub - timesteps / epsilon_decay)

    # Epsilon-greedy action selection.
    if np.random.rand() < epsilon:
      a = int(np.random.randint(NUM_ACTIONS))
    else:
      with torch.no_grad():  # acting does not need gradients
        a = int(dqn(state).argmax())
    next_state, r, done, info = env.step(a)[0]
    ret = ret + r

    # Store the transition in the replay buffer.
    buffer.add(state, a, r, next_state, done)

    state = next_state
    timesteps = timesteps + 1

    # Update the policy with a temporal-difference step once enough data is stored.
    if buffer.length() > max(minibatch_size, update_after):
      # Minibatch: states, one-hot actions, rewards, next states, done flags.
      states_mb, mb_a, mb_reward, next_states_mb, mb_done = buffer.sample_batch(minibatch_size)

      # Q-values of the online network for the sampled states.
      q_values = dqn(states_mb)

      # TD targets: r for terminal transitions, r + gamma * max_a' Q_target(s', a')
      # otherwise. Built without autograd so no gradients flow into (or pile up
      # on) the target network.
      with torch.no_grad():
        q_next_max = dqn_target(next_states_mb).max(dim=-1).values
        targets = torch.where(
            mb_done.bool(),
            mb_reward.float(),
            mb_reward.float() + gamma * q_next_max
        )

      # The one-hot mask picks, per row, the Q-value of the action actually taken.
      predictions = q_values[mb_a != 0]

      optimizer.zero_grad()
      loss = mse(predictions, targets)
      loss.backward()
      optimizer.step()
      losses.append(loss.item())

      # Move the target network a small step towards the online network.
      soft_update(dqn, dqn_target, tau)

  returns.append(ret)
  returns_500.append(ret)
  if i % 500 == 0 and i != 0:
    wins_perc = sum(np.array(returns_500)==1) / len(returns_500) * 100
    print(f'\n\nEpisode {i} \t Wins: {wins_perc:.2f}%\n')

    # Best-effort ONNX export; a failure must not prevent the checkpoint below.
    dummy_input = torch.tensor(np.array(state), dtype=torch.float)
    file_path = os.path.join(data_path, f'model_episode_{i}.onnx')
    try:
      save_as_onnx(dqn, dummy_input, file_path)
    except Exception as err:
      print(f'\nError saving in ONNX format ({err!r}); saving only the PyTorch checkpoint.\n')

    # Always write the resumable PyTorch checkpoint.
    torch.save({
        'episode': i,
        'model_state_dict': dqn.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
        }, os.path.join(data_path, f'model_episode_{i}.tar'))

### Evaluate model

In [None]:
gymlogger.set_level(40)  # error only
np.random.seed(147)
torch.manual_seed(147)
N_EPISODES = 100

# Load the pretrained model from its ONNX export and convert it back to PyTorch.
model_file = os.path.join(data_path, 'model_episode_17500.onnx')
net = ConvertModel(onnx.load(model_file), experimental=True)
net.eval()

win_count = 0.0

for i in trange(N_EPISODES):

    # New environment per episode; the agent starts at position 0 or 1 at random.
    start_pos = 0 if np.random.choice([0, 1], p=[0.5, 0.5]) == 0 else 1
    env = PommerEnvWrapperFrameSkip2(num_stack=NUM_STACK, start_pos=start_pos, board=board)

    obs, opponent_obs = env.reset()
    done = False
    while not done:
        # Act greedily on the network output for the current observation.
        obs = torch.from_numpy(np.array(obs)).float()
        action = np.argmax(net(obs).detach().cpu().numpy())

        (obs, r, done, info), opponent_step = env.step(action)

    # A positive final reward counts as a win.
    win_count += 1 if r > 0 else 0

# Fraction of episodes won.
print(win_count / N_EPISODES)