In [1]:
!pip install git+https://github.com/Farama-Foundation/MAgent2

Collecting git+https://github.com/Farama-Foundation/MAgent2
  Cloning https://github.com/Farama-Foundation/MAgent2 to /tmp/pip-req-build-lbygbq3l
  Running command git clone --filter=blob:none --quiet https://github.com/Farama-Foundation/MAgent2 /tmp/pip-req-build-lbygbq3l
  Resolved https://github.com/Farama-Foundation/MAgent2 to commit b2ddd49445368cf85d4d4e1edcddae2e28aa1406
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting pygame>=2.1.0 (from magent2==0.3.3)
  Downloading pygame-2.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading pygame-2.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m87.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[?25hBuilding wheels for collected packages: magent2
  Building wheel for

## Import

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import numpy as np
import os
from tqdm import tqdm

from magent2.environments import battle_v4
import cv2
from collections import deque
import time
import random
import warnings
warnings.filterwarnings('ignore')

## DQN

In [3]:
class QNetwork(nn.Module):
    """Convolutional Q-network mapping an observation to one Q-value per action.

    Two 3x3 convolutions (channel count preserved) feed a three-layer MLP head.

    Args:
        observation_shape: Shape of a single observation with channels on the
            last axis; it is permuted to channels-first to size the MLP head.
        action_shape: Number of discrete actions (width of the output layer).
        device: Stored on the instance only; the module is not moved here.
    """

    def __init__(self, observation_shape, action_shape, device='cpu'):
        super().__init__()
        self.observation_shape = observation_shape
        self.action_shape = action_shape
        self.device = device

        channels = observation_shape[-1]
        self.cnn = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size=3),
            nn.ReLU(),
            nn.Conv2d(channels, channels, kernel_size=3),
            nn.ReLU(),
        )

        # Push one random channels-first observation through the conv stack to
        # discover how many features the first linear layer has to accept.
        probe = torch.randn(observation_shape).permute(2, 0, 1)
        flatten_dim = self.cnn(probe).numel()

        self.network = nn.Sequential(
            nn.Linear(flatten_dim, 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, action_shape),
        )

    def forward(self, x):
        """Return Q-values of shape (batch, action_shape).

        A 3-D (unbatched, channels-first) input is treated as a batch of one.
        """
        assert len(x.shape) >= 3, "only support magent input observation"
        features = self.cnn(x)
        batchsize = 1 if len(features.shape) == 3 else features.shape[0]
        return self.network(features.reshape(batchsize, -1))

## Replay Buffer

In [4]:
class ReplayMemory(Dataset):
    """Replay buffer that groups transitions by their step index in the episode.

    ``step_memory`` is a list of bounded deques ("buckets"); bucket ``k`` holds
    the transitions pushed with ``step_idx == k``. Each bucket keeps at most
    ``maxlen`` transitions and drops its oldest entry when full. As a
    ``Dataset`` the buffer is indexed as the concatenation of all buckets.

    Args:
        maxlen: Maximum number of transitions retained per bucket.
    """

    def __init__(self, maxlen):
        super().__init__()
        self.maxlen = maxlen
        self.step_memory = [deque([], maxlen=self.maxlen)]

    def push(self, step_idx, observation, action, reward, next_observation, done):
        """Append one transition to the bucket for ``step_idx``.

        Missing buckets up to ``step_idx`` are created on demand, so a step
        index that skips ahead no longer raises ``IndexError``.
        """
        while step_idx >= len(self.step_memory):
            self.step_memory.append(deque([], maxlen=self.maxlen))
        self.step_memory[step_idx].append((observation, action, reward, next_observation, done))

    def __len__(self):
        """Total number of stored transitions across all buckets."""
        return sum(len(memory) for memory in self.step_memory)

    def __getitem__(self, idx):
        """Return transition ``idx`` as tensors.

        Returns:
            ``(observation, action, reward, next_observation, done)`` where both
            observations are float32 and permuted from channels-last to
            channels-first, ``action`` is int64, and ``reward`` / ``done`` are
            float32 scalars.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        total = len(self)
        if idx < 0:
            # Resolve negative indices against the whole buffer, not just bucket 0.
            idx += total
        if not 0 <= idx < total:
            raise IndexError("ReplayMemory index out of range")

        # Walk the buckets until the flat index falls inside one of them.
        for memory in self.step_memory:
            if idx < len(memory):
                break
            idx -= len(memory)

        observation, action, reward, next_observation, done = memory[idx]
        return (
            torch.tensor(observation, dtype=torch.float32).permute(2, 0, 1),
            # int64 so the action can be used directly as a gather index.
            torch.tensor(action, dtype=torch.long),
            torch.tensor(reward, dtype=torch.float32),
            torch.tensor(next_observation, dtype=torch.float32).permute(2, 0, 1),
            torch.tensor(done, dtype=torch.float32),
        )

## Trainer

In [5]:
class Trainer:
    """Epsilon-greedy DQN trainer that uses one policy network for every agent.

    Args:
        policy_dqn: Network being optimised; also used to select actions.
        target_dqn: Network used to compute bootstrap targets; kept in eval mode.
        n_action: Number of discrete actions to sample from when exploring.
        loss_fn: Callable ``loss_fn(current_q, target_q)`` returning a scalar loss.
        optimizer: Optimizer over ``policy_dqn``'s parameters.
        scheduler: Learning-rate scheduler; stepped once per ``optimize_model``
            call that actually trains.
        epsilon_start: Initial exploration probability.
        epsilon_end: Lower bound for the exploration probability.
        epsilon_decay: Multiplicative decay applied to epsilon after each episode.
        device: Device both networks are moved to.
    """

    def __init__(
        self,
        policy_dqn, target_dqn,
        n_action,
        loss_fn, optimizer, scheduler,
        epsilon_start, epsilon_end, epsilon_decay,
        device='cpu'
    ):

        self.policy_dqn = policy_dqn.to(device)
        self.target_dqn = target_dqn.to(device)
        self.target_dqn.eval()

        self.n_action = n_action

        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler

        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.epsilon = self.epsilon_start

        self.device = device

        self.policy_dqn.apply(self.weights_init)
        # The re-initialisation above changes the policy's conv layers, so copy
        # the weights again; otherwise the target network would start training
        # with different parameters than the policy it is supposed to track.
        self.target_dqn.load_state_dict(self.policy_dqn.state_dict())

    def weights_init(self, m):
        """Xavier-initialise (transposed) conv weights and set their biases to 0.01."""
        if isinstance(m, (nn.Conv2d, nn.ConvTranspose2d)):
            nn.init.xavier_uniform_(m.weight)
            if torch.is_tensor(m.bias):
                m.bias.data.fill_(0.01)

    def policy(self, observation):
        """Pick an action epsilon-greedily for one channels-last observation.

        With probability ``self.epsilon`` a uniformly random action index is
        returned; otherwise the argmax of the policy network's Q-values.
        """
        if np.random.rand() < self.epsilon:
            return np.random.randint(0, self.n_action)

        with torch.no_grad():
            q_values = self.policy_dqn(
                torch.Tensor(observation).float().permute([2, 0, 1]).unsqueeze(0).to(self.device)
            )
        return torch.argmax(q_values, dim=1).cpu().numpy()[0]

    def optimize_model(self, replay_memory, batch_size, gamma):
        """Run one shuffled pass over ``replay_memory``, one update per mini-batch.

        Does nothing when the buffer holds fewer than ``batch_size`` transitions.
        The learning-rate scheduler is stepped once after the pass, not once per
        mini-batch, so its horizon is measured in calls to this method.
        """
        if len(replay_memory) < batch_size:
            return
        train_loader = DataLoader(replay_memory, batch_size=batch_size, shuffle=True)
        self.policy_dqn.train()

        for observations, actions, rewards, next_observations, dones in train_loader:
            observations = observations.to(self.device)
            # gather() needs an int64 column of action indices.
            actions = actions.long().unsqueeze(1).to(self.device)
            rewards = rewards.unsqueeze(1).to(self.device)
            next_observations = next_observations.to(self.device)
            dones = dones.unsqueeze(1).to(self.device)

            current_q_values = self.policy_dqn(observations).gather(1, actions)

            with torch.no_grad():
                next_q_max = self.target_dqn(next_observations).max(1, keepdim=True)[0]
                # (1 - dones) removes the bootstrap term for terminal transitions.
                target_q_values = rewards + gamma * (1 - dones) * next_q_max

            loss = self.loss_fn(current_q_values, target_q_values)

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        self.scheduler.step()

    def _soft_update_target(self, TAU):
        """Blend policy weights into the target: target = TAU*policy + (1-TAU)*target."""
        target_dqn_state_dict = self.target_dqn.state_dict()
        policy_dqn_state_dict = self.policy_dqn.state_dict()
        for key in policy_dqn_state_dict:
            target_dqn_state_dict[key] = (
                policy_dqn_state_dict[key] * TAU + target_dqn_state_dict[key] * (1 - TAU)
            )
        self.target_dqn.load_state_dict(target_dqn_state_dict)

    def train(self,
              env, episodes,
              target_agent, batch_size, gamma, replay_memory,
              update_tg_freq, TAU
             ):
        """Collect experience for ``episodes`` episodes, training after each one.

        Args:
            env: Environment exposing ``reset``, ``agent_iter``, ``last``,
                ``step`` and ``num_agents`` (used here as an agent-by-agent loop).
            episodes: Number of episodes to run.
            target_agent: Substring of agent names whose rewards count positively
                in the logged episode reward; all other agents count negatively.
            batch_size: Mini-batch size passed to ``optimize_model``.
            gamma: Discount factor passed to ``optimize_model``.
            replay_memory: Buffer with a ``push(step_idx, obs, action, reward,
                next_obs, done)`` method that is also usable as a dataset.
            update_tg_freq: The target network is soft-updated on episodes whose
                zero-based index is a multiple of this value.
            TAU: Interpolation factor of the soft update.

        Returns:
            ``(train_rewards, train_durations)``: per-episode signed reward totals
            and per-episode counts of individual agent steps.
        """
        train_rewards = []
        train_durations = []

        for episode in tqdm(range(episodes)):
            ep_reward = 0
            ep_steps = 0

            # Per-agent bookkeeping: last observation, last action, and how many
            # transitions have been recorded for that agent so far.
            observations = {}
            actions = {}
            step_idx = {}

            env.reset()

            # First pass: every agent acts once so it has a previous
            # (observation, action) pair; nothing is stored yet.
            for idx, agent in enumerate(env.agent_iter()):
                ep_steps += 1
                observation, reward, termination, truncation, info = env.last()

                ep_reward += reward if target_agent in agent else -reward

                action = self.policy(observation)

                step_idx[agent] = 0
                observations[agent] = observation
                actions[agent] = action
                env.step(action)

                if (idx + 1) % env.num_agents == 0:
                    break

            # Remaining steps: what the agent sees now completes the transition
            # started by its previous action.
            for agent in env.agent_iter():
                ep_steps += 1

                next_observation, reward, termination, truncation, info = env.last()

                ep_reward += reward if target_agent in agent else -reward

                # A terminated or truncated agent must be stepped with None.
                if termination or truncation:
                    action = None
                else:
                    action = self.policy(next_observation)

                replay_memory.push(
                    step_idx[agent],
                    observations[agent],
                    actions[agent],
                    reward,
                    next_observation,
                    termination
                )

                step_idx[agent] += 1
                observations[agent] = next_observation
                actions[agent] = action
                env.step(action)

            # Training
            self.optimize_model(replay_memory, batch_size, gamma)

            if episode % update_tg_freq == 0:
                self._soft_update_target(TAU)

            self.epsilon = max(self.epsilon_end, self.epsilon * self.epsilon_decay)

            print(f"\nEpisode {episode + 1}, Episode Reward: {ep_reward}, Steps: {ep_steps}, Epsilon: {self.epsilon}")

            train_rewards.append(ep_reward)
            train_durations.append(ep_steps)

        return train_rewards, train_durations

## Config

In [6]:

# --- Compute device ---
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Environment ---
env = battle_v4.env(map_size=45, render_mode="rgb_array")
target_agent = 'blue'  # substring matched against agent names when signing the episode reward

# --- Training schedule ---
episodes = 40
batch_size = 1024
# Capacity of each per-step replay bucket.
# NOTE(review): 81 is presumably the number of agents per team at this map size — confirm.
maxlen = 81 * episodes

# --- Q-learning ---
gamma = 0.9          # discount factor
update_tg_freq = 1   # soft-update the target network every N episodes
TAU = 0.3            # interpolation factor of the soft update

# --- Optimiser ---
learning_rate = 1e-3
theta = 1e-6         # passed as eta_min (the LR floor) to the cosine scheduler

# --- Exploration ---
epsilon_start = 1
epsilon_end = 0.01
epsilon_decay = 0.9  # multiplicative decay applied once per episode


In [7]:
def seed_everything(seed_value):
    """Seed every random number generator this notebook uses.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU, plus all CUDA devices when CUDA is available), and configures cuDNN
    for deterministic behaviour.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to
    # run, which defeats deterministic=True; it must be off for reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

## Training Loop

In [8]:
# Build the policy network first and the target network second, both sized from
# the observation/action spaces of the "red_0" agent.
policy_dqn, target_dqn = (
    QNetwork(env.observation_space("red_0").shape, env.action_space("red_0").n)
    for _ in range(2)
)

# Start the target network as an exact copy of the policy network.
target_dqn.load_state_dict(policy_dqn.state_dict())

<All keys matched successfully>

In [9]:
# Mean-squared error between predicted and target Q-values.
loss_function = nn.MSELoss()

In [10]:
# Only the policy network is optimised; weight decay is disabled.
optimizer = torch.optim.AdamW(policy_dqn.parameters(), weight_decay=0, lr=learning_rate)
# Cosine decay from learning_rate down to theta, with a horizon of `episodes` scheduler steps.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=episodes, eta_min=theta)

In [11]:
# Replay buffer; maxlen bounds each per-step bucket, not the buffer as a whole.
replay_memory = ReplayMemory(maxlen)

In [12]:

# Wire networks, optimisation objects and the exploration schedule into the trainer.
# The action count is read from the "red_0" action space and used for every agent.
trainer = Trainer(
    policy_dqn, target_dqn,
    env.action_space("red_0").n,
    loss_function, optimizer, lr_scheduler,
    epsilon_start, epsilon_end, epsilon_decay,
    device=device
)

In [None]:

# Run the full training loop (long-running: the log below shows minutes per episode).
# Returns one signed reward total and one agent-step count per episode.
train_rewards, train_durations = trainer.train(
    env, episodes,
    target_agent, batch_size, gamma, replay_memory,
    update_tg_freq, TAU
)

  2%|▎         | 1/40 [00:36<23:53, 36.75s/it]


Episode 1, Episode Reward: 65.33500276319683, Steps: 158611, Epsilon: 0.9


  5%|▌         | 2/40 [01:58<39:58, 63.13s/it]


Episode 2, Episode Reward: 1.4350003516301513, Steps: 160308, Epsilon: 0.81


  8%|▊         | 3/40 [03:41<50:12, 81.42s/it]


Episode 3, Episode Reward: -122.04000551160425, Steps: 103534, Epsilon: 0.7290000000000001


 10%|█         | 4/40 [05:28<54:55, 91.56s/it]


Episode 4, Episode Reward: 42.000003438442945, Steps: 53125, Epsilon: 0.6561000000000001


 12%|█▎        | 5/40 [07:39<1:01:40, 105.74s/it]


Episode 5, Episode Reward: 157.2750077350065, Steps: 88321, Epsilon: 0.5904900000000002


 15%|█▌        | 6/40 [10:23<1:11:04, 125.42s/it]


Episode 6, Episode Reward: -149.68000826239586, Steps: 123226, Epsilon: 0.5314410000000002


 18%|█▊        | 7/40 [12:59<1:14:31, 135.51s/it]


Episode 7, Episode Reward: 32.19000013452023, Steps: 36241, Epsilon: 0.47829690000000014


 20%|██        | 8/40 [15:42<1:16:52, 144.15s/it]


Episode 8, Episode Reward: 5.854999823495746, Steps: 36305, Epsilon: 0.43046721000000016


 22%|██▎       | 9/40 [18:37<1:19:33, 153.97s/it]


Episode 9, Episode Reward: 300.07001288980246, Steps: 51952, Epsilon: 0.38742048900000015


 25%|██▌       | 10/40 [21:29<1:19:44, 159.48s/it]


Episode 10, Episode Reward: -40.7999964999035, Steps: 25440, Epsilon: 0.34867844010000015


 28%|██▊       | 11/40 [24:22<1:19:01, 163.51s/it]


Episode 11, Episode Reward: -28.305002064444125, Steps: 14793, Epsilon: 0.31381059609000017


 30%|███       | 12/40 [27:18<1:18:08, 167.43s/it]


Episode 12, Episode Reward: -50.61500302888453, Steps: 19536, Epsilon: 0.28242953648100017


 32%|███▎      | 13/40 [30:18<1:17:02, 171.21s/it]


Episode 13, Episode Reward: -16.59500029589981, Steps: 15925, Epsilon: 0.25418658283290013


 35%|███▌      | 14/40 [33:17<1:15:11, 173.53s/it]


Episode 14, Episode Reward: -57.514999899081886, Steps: 9011, Epsilon: 0.22876792454961012


 38%|███▊      | 15/40 [36:18<1:13:14, 175.79s/it]


Episode 15, Episode Reward: 61.029998858459294, Steps: 13221, Epsilon: 0.2058911320946491


 40%|████      | 16/40 [39:18<1:10:50, 177.12s/it]


Episode 16, Episode Reward: 174.23999799694866, Steps: 7138, Epsilon: 0.1853020188851842


 42%|████▎     | 17/40 [42:20<1:08:24, 178.46s/it]


Episode 17, Episode Reward: 78.01999875437468, Steps: 11148, Epsilon: 0.16677181699666577


 45%|████▌     | 18/40 [45:25<1:06:12, 180.58s/it]


Episode 18, Episode Reward: 128.16499777231365, Steps: 16232, Epsilon: 0.1500946352969992


 48%|████▊     | 19/40 [48:35<1:04:07, 183.24s/it]


Episode 19, Episode Reward: -77.41499787848443, Steps: 20579, Epsilon: 0.13508517176729928


 50%|█████     | 20/40 [51:46<1:01:52, 185.63s/it]


Episode 20, Episode Reward: -50.55999910645187, Steps: 14124, Epsilon: 0.12157665459056936


 52%|█████▎    | 21/40 [54:59<59:28, 187.80s/it]  


Episode 21, Episode Reward: -72.33999922219664, Steps: 15292, Epsilon: 0.10941898913151243


In [None]:
# Display the trained policy network's layer structure.
trainer.policy_dqn

In [None]:
# Persist only the policy network's weights (state_dict), not the whole module.
torch.save(trainer.policy_dqn.state_dict(), 'blue.pt')

In [None]:
import matplotlib.pyplot as plt

# Episode rewards are in the hundreds while step counts reach the hundreds of
# thousands, so a shared y-axis would flatten the reward curve. Give each series
# its own panel and share only the episode axis.
fig, (reward_ax, duration_ax) = plt.subplots(2, 1, sharex=True, figsize=(10, 8))

reward_ax.plot(train_rewards, label='Train Rewards', marker='o')
reward_ax.set_title('Train Rewards and Train Durations')
reward_ax.set_ylabel('Episode reward')
reward_ax.legend()  # show the legend
reward_ax.grid(True)  # show the grid

duration_ax.plot(train_durations, label='Train Durations', marker='x', color='tab:orange')
duration_ax.set_xlabel('Episode')
duration_ax.set_ylabel('Steps')
duration_ax.legend()
duration_ax.grid(True)

fig.tight_layout()
plt.show()