# Google Colab用セットアップ

In [None]:
# Clone the cat-brain repository into /content and switch to the code on origin/cnn.
%cd /content/
!git clone https://github.com/nekoneko02/cat-brain.git
%cd cat-brain
# Checking out the remote-tracking ref directly leaves the clone in detached-HEAD state.
!git checkout origin/cnn


In [None]:
# Rename the hyphenated directories to underscore names
# (presumably so they can be imported as Python packages - confirm).
%cd /content
!mv /content/cat-brain /content/cat_brain
!mv /content/cat_brain/cat-dqn /content/cat_brain/cat_dqn
# Rewrite the relative config path inside cat_toy_env.py to an absolute path under /content.
!sed -i 's|\.\./cat-game/config/common\.json|/content/cat_brain/cat-game/config/common.json|g' /content/cat_brain/cat_dqn/cat_toy_env.py

# 強化学習モデルの学習 (main.py)

このセルでは、DQNアルゴリズムを用いて、`CatToyEnv`環境でモデルを学習させます。

In [None]:
!apt install cmake swig zlib1g-dev
%pip install torch torchvision
%pip install numpy onnx
%pip install pettingzoo[all]
%pip install torchrl


In [None]:
from pettingzoo.test import api_test
from cat_toy_env import CatToyEnv
# Settings for the environment under test: no rendering, episodes capped at 1000 steps.
env_kwargs = {"render_mode": None, "max_steps": 1000}

# Build one single (non-parallel) environment and run PettingZoo's API test against it.
env = CatToyEnv(**env_kwargs)
api_test(
    env,
    num_cycles=1000,
    verbose_progress=False,
)

In [None]:
import gymnasium as gym
import torch

from cat_toy_env import CatToyEnv
#from cat_brain.cat_dqn.cat_toy_env import CatToyEnv # Google Colab用

import torch
import torch.nn as nn
import torch.optim as optim
from torchrl.data import PrioritizedReplayBuffer, LazyTensorStorage
from torchrl.data.replay_buffers.samplers import PrioritizedSampler
from torchrl.modules import NoisyLinear

import numpy as np
import random
from collections import deque
import os

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(device)

In [None]:
import importlib
import cat_toy_env

# Reload the module so that edits to cat_toy_env.py take effect without restarting the kernel
importlib.reload(cat_toy_env)

# Re-import the class so the name CatToyEnv refers to the reloaded definition
from cat_toy_env import CatToyEnv


In [None]:
# Number of outer training iterations run by train_dqn.
num_iterations = 60
# Number of episodes played inside each iteration.
num_episodes_per_iteration = 1
# Passed to CatToyEnv as max_steps for the preview and learning environments.
num_steps_per_episode = 100000
# num_epoches = 1
# num_replays_per_episode = num_epoches * num_episodes_per_iteration * num_steps_per_episode
# Interval between target-network syncs and log lines in train_dqn.
# Despite the name, train_dqn counts this in iterations, not environment steps.
update_target_steps = 10
# train_dqn calls replay() whenever env.step_count is a multiple of this value.
replay_interval = 6
# Maximum number of transitions kept in each agent's replay memory.
buffer_size = 10000
# Number of transitions sampled by each replay() call.
batch_size = 64
# Number of consecutive observations stacked into each stored state.
sequence_length = 1

In [None]:
# Settings shared by the preview and the learning environment.
env_kwargs=dict(render_mode=None, max_steps = num_steps_per_episode)
# Build a single environment just to inspect what reset() returns.
env_preview = CatToyEnv(**env_kwargs)

obs = env_preview.reset()

# Print the shape of the observation first, then its contents.
# np.shape also copes with values that have no .shape attribute (it returns ()).
print("観測の形:", np.shape(obs))
print("観測の中身:", obs)
# Environment used for training
env_learning = CatToyEnv(**env_kwargs)

In [None]:
class DQN(nn.Module):
    """Recurrent dueling Q-network.

    The input sequence is passed through a GRU, the GRU output of the last
    time step goes through a linear feature layer, and two NoisyLinear heads
    produce the state value V(s) and the advantages A(s, a), which are
    combined into Q-values.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        output_dim: Number of Q-values produced per sample.
        rnn_hidden_dim: Hidden size of the GRU. The feature layer takes its
            input width from this value, so any hidden size works.
    """

    def __init__(self, input_dim, output_dim, rnn_hidden_dim=64):
        super(DQN, self).__init__()

        # Recurrent layer; batch_first=True means inputs are (batch, sequence, input_dim).
        self.rnn = nn.GRU(input_dim, rnn_hidden_dim, batch_first=True)

        # Feature extraction layer. Its input width must match the GRU hidden size.
        self.feature = nn.Sequential(
            nn.Linear(rnn_hidden_dim, 256),
            nn.ReLU()
        )

        # State-value function V(s)
        self.value_stream = nn.Sequential(
            NoisyLinear(256, 128),
            nn.ReLU(),
            NoisyLinear(128, 1)
        )

        # Advantage function A(s, a)
        self.advantage_stream = nn.Sequential(
            NoisyLinear(256, 128),
            nn.ReLU(),
            NoisyLinear(128, output_dim)
        )

    def forward(self, x, hidden_state=None):
        """Compute Q-values for a batch of observation sequences.

        Args:
            x: Tensor of shape (batch, sequence, input_dim).
            hidden_state: Optional initial GRU hidden state; None lets the GRU
                start from its default initial state.

        Returns:
            A tuple ``(q, hidden_state)``: the Q-values of shape
            (batch, output_dim) and the GRU hidden state after the sequence.
        """
        # Run the sequence through the GRU and keep only the last time step.
        x, hidden_state = self.rnn(x, hidden_state)
        x = x[:, -1, :]

        # Shared features, then the two heads.
        x = self.feature(x)
        value = self.value_stream(x)
        advantage = self.advantage_stream(x)

        # Q = V + (A - mean(A)): subtracting the mean advantage keeps V and A identifiable.
        q = value + advantage - advantage.mean(dim=1, keepdim=True)
        return q, hidden_state


class DQNAgent:
    """DQN learner for one named agent of a multi-agent environment.

    Holds an online network, a target network, a uniformly sampled replay
    memory and an epsilon-greedy exploration rate that decays on every
    training step.
    """

    def __init__(self, agent_name, env, learning_rate=1e-4, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        """Build the networks, optimizer and replay memory for ``agent_name``.

        Args:
            agent_name: Key of this agent in ``env.action_spaces`` and
                ``env.observation_spaces``.
            env: Environment providing the per-agent action and observation spaces.
            learning_rate: Learning rate of the Adam optimizer.
            gamma: Discount factor applied to the next-state value.
            epsilon: Initial probability of taking a random action.
            epsilon_min: Epsilon stops decaying once it is no longer above this value.
            epsilon_decay: Factor epsilon is multiplied by after each training step.
        """
        self.agent_name = agent_name
        self.action_space = env.action_spaces[self.agent_name]
        self.state_shape = env.observation_spaces[self.agent_name].shape[0]

        # Remember the device so every method creates tensors where the networks live.
        self.device = device

        self.model = DQN(self.state_shape, self.action_space.n).to(self.device)
        self.target_model = DQN(self.state_shape, self.action_space.n).to(self.device)

        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.loss_fn = nn.MSELoss()

        # Replay memory size and batch size come from the notebook-level settings.
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.update_target_model()

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.load_state_dict(self.model.state_dict())

    def store_experience(self, state, action, reward, next_state, done):
        """Append one transition ``(state, action, reward, next_state, done)`` to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, hidden_state=None):
        """Choose an action epsilon-greedily.

        Args:
            state: A single observation; it is wrapped into a
                (1, 1, state_dim) tensor before being fed to the network.
            hidden_state: Optional GRU hidden state passed to the network.

        Returns:
            A tuple ``(action, hidden_state)``. On a random action the given
            hidden state is returned unchanged; otherwise the hidden state
            returned by the network.
        """
        if random.random() <= self.epsilon:
            return self.action_space.sample(), hidden_state
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device)
        state = state.unsqueeze(0).unsqueeze(0)  # [1, 1, state_dim]
        # Action selection needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            q_values, hidden_state = self.model(state, hidden_state)
        return torch.argmax(q_values).item(), hidden_state

    def replay(self):
        """Run one training step on a random minibatch from the replay memory.

        Does nothing while the memory holds fewer than ``batch_size``
        transitions. After the optimizer step, epsilon is multiplied by
        ``epsilon_decay`` as long as it is above ``epsilon_min``.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = torch.as_tensor(np.stack(states), dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(actions, dtype=torch.int64, device=self.device)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        next_states = torch.as_tensor(np.stack(next_states), dtype=torch.float32, device=self.device)
        dones = torch.as_tensor(dones, dtype=torch.float32, device=self.device)

        # Q-values of the actions that were actually taken (GRU starts from its default state).
        q_values, _ = self.model(states, None)
        q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

        # The target is a fixed regression label: compute it without autograd so
        # no graph is built through, and no gradients accumulate on, the target network.
        with torch.no_grad():
            next_q_values, _ = self.target_model(next_states, None)
            max_next_q_values = next_q_values.max(1)[0]
            target_q_values = rewards + self.gamma * max_next_q_values * (1 - dones)

        # Loss and optimization step
        loss = self.loss_fn(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Decay epsilon
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def save_model(self, filepath):
        """Save the online network's state dict to ``filepath``."""
        torch.save(self.model.state_dict(), filepath)

    def load_model(self, filepath):
        """Load weights from ``filepath`` into the online network and sync the target network."""
        # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
        self.model.load_state_dict(torch.load(filepath, map_location=self.device))
        self.target_model.load_state_dict(self.model.state_dict())


In [None]:
def train_dqn(agent_dict, env, num_iterations, num_episodes_per_iteration):
    """Train the agents in ``agent_dict`` by playing ``env`` turn by turn.

    Each agent's transition is stored on that agent's *next* turn, once
    ``env.last()`` reports the outcome of its previous action. The function
    also reads the notebook-level settings ``sequence_length``,
    ``replay_interval`` and ``update_target_steps``.

    Args:
        agent_dict: Mapping from agent name to its learner; must contain
            "cat" and "toy", which are the agents that get replayed.
        env: Environment exposing ``reset``, ``agents``, ``agent_iter``,
            ``last``, ``step`` and ``step_count``.
        num_iterations: Number of outer iterations.
        num_episodes_per_iteration: Episodes played in each iteration.
    """
    # NOTE(review): the accumulation into total_rewards is commented out below,
    # so the per-agent values in the periodic log line always print as 0.00.
    total_rewards = {agent: 0.0 for agent in env.agents}
    steps = 0
    for iteration in range(num_iterations):
        for _episode in range(num_episodes_per_iteration):
            obs = env.reset()
            # One sliding window per agent holding sequence_length + 1 observations:
            # all but the last form s, all but the first form s'.
            seq_obs = {agent: deque(maxlen=sequence_length+1) for agent in env.agents}
            for agent in env.agents:
                # Pre-fill the window with the observation returned by reset().
                for _ in range(sequence_length+1):
                    seq_obs[agent].append(obs)
            prev_action = {agent: None for agent in env.agents}
            prev_total_reward = {agent: 0.0 for agent in env.agents}

            for agent in env.agent_iter():
                if agent == "dummy":
                    # The dummy agent never acts.
                    env.step(None)
                    continue

                obs, total_reward, terminated, truncated, _ = env.last()
                done = terminated or truncated

                if prev_action[agent] is not None:
                    seq_obs[agent].append(obs)  # slide the observation window
                    list_obs = list(seq_obs[agent])
                    # The result of the previous action is only known now, so store it here.
                    agent_dict[agent].store_experience(
                        list_obs[0:-1],         # s
                        prev_action[agent],      # a
                        total_reward - prev_total_reward[agent],      # r (reward gained since the previous turn)
                        list_obs[1:],                     # s' (next state)
                        float(terminated)              # done (truncation is not treated as terminal)
                    )
                    # Train whenever the environment step count hits the replay interval.
                    if env.step_count % replay_interval == 0:
                        for replay_agent in ["cat", "toy"]:
                            agent_dict[replay_agent].replay()

                if done or env.step_count % 1000 == 0:
                    print(f"{agent} with  steps {env.step_count}, reward {total_reward - prev_total_reward[agent]:.2f}, action: {prev_action}, state is {obs}")


                if done:
                    action = None  # No action needed if agent is done
                    #total_rewards[agent] += total_reward
                    steps += env.step_count
                else:
                    action, _ = agent_dict[agent].act(obs)

                env.step(action)

                prev_action[agent] = action  # remember the action just taken
                prev_total_reward[agent] = total_reward  # remember the reward seen so far

        if iteration % update_target_steps == 0:
            # Periodic log output
            print(f"Iteration {iteration}: " + ", ".join([f"{a}: {r / update_target_steps:.2f}" for a, r in total_rewards.items()]), steps / update_target_steps)
            #total_rewards = {agent: 0.0 for agent in total_rewards.keys()}
            steps = 0

            # Sync every target network with its online network.
            for learner in agent_dict.values():
                learner.update_target_model()

def evaluate_model(agent_dict, eval_env, n_eval_episodes=10):
    """Play evaluation episodes and return per-agent reward statistics.

    Exploration is switched off for the duration of the evaluation: every
    agent that has an ``epsilon`` attribute gets it set to 0.0 and restored
    afterwards. Without this, agents would act randomly with probability
    epsilon, and a freshly constructed agent (epsilon defaults to 1.0, as in
    load_dqn) would be evaluated as an almost purely random policy.

    Args:
        agent_dict: Mapping from agent name to an object with ``act(obs)``
            returning ``(action, hidden_state)``.
        eval_env: Environment exposing ``reset``, ``agent_iter``, ``last``
            and ``step``. It is reset at the start of every episode.
        n_eval_episodes: Number of episodes to play.

    Returns:
        Dict mapping each agent name to ``(mean, std)`` of its summed
        per-episode rewards.
    """
    reward_sums = {agent_name: [] for agent_name in agent_dict.keys()}

    # Remember each agent's exploration rate, then disable exploration.
    saved_epsilons = {
        agent_name: agent.epsilon
        for agent_name, agent in agent_dict.items()
        if hasattr(agent, "epsilon")
    }
    for agent_name in saved_epsilons:
        agent_dict[agent_name].epsilon = 0.0

    try:
        for _ in range(n_eval_episodes):
            # Assumes the environment can be reset and reused between episodes.
            eval_env.reset()
            episode_rewards = {agent_name: 0.0 for agent_name in agent_dict.keys()}

            for agent in eval_env.agent_iter():
                if agent == "dummy":
                    # The dummy agent never acts.
                    eval_env.step(None)
                    continue
                obs, reward, termination, truncation, info = eval_env.last()
                done = termination or truncation

                if done:
                    action = None  # a finished agent must step with None
                else:
                    action, _ = agent_dict[agent].act(obs)

                eval_env.step(action)
                episode_rewards[agent] += reward  # accumulate this agent's reward

            for agent_name in reward_sums:
                reward_sums[agent_name].append(episode_rewards[agent_name])
    finally:
        # Restore the exploration rates even if an episode raised.
        for agent_name, saved_epsilon in saved_epsilons.items():
            agent_dict[agent_name].epsilon = saved_epsilon

    # Return the statistics (mean, standard deviation) per agent.
    mean_std_rewards = {
        agent: (np.mean(rewards), np.std(rewards))
        for agent, rewards in reward_sums.items()
    }

    return mean_std_rewards

def save_dqn(agent_dict, base_path = "models"):
    """Save every agent's model as ``<base_path>/<agent_name>_model.pth``.

    The directory ``base_path`` is created first if it does not exist yet.
    """
    os.makedirs(base_path, exist_ok=True)
    for name, learner in agent_dict.items():
        target = os.path.join(base_path, f"{name}_model.pth")
        learner.save_model(target)

def load_dqn(env, agents = ("cat", "toy"), base_path = "models"):
    """Create a DQNAgent per name in ``agents`` and load its saved weights.

    Args:
        env: Environment passed to each DQNAgent constructor.
        agents: Iterable of agent names to load. The default is an immutable
            tuple so it can never be modified between calls.
        base_path: Directory containing ``<agent_name>_model.pth`` files.

    Returns:
        Dict mapping each agent name to its loaded DQNAgent.
    """
    agent_dict = {}
    for agent_name in agents:
        filepath = os.path.join(base_path, f"{agent_name}_model.pth")
        agent = DQNAgent(agent_name, env)
        agent.load_model(filepath)
        agent_dict[agent_name] = agent
    return agent_dict

In [None]:
# Create one DQNAgent for every agent name listed by the learning environment
agent_dict = {
    agent_name: DQNAgent(agent_name, env_learning)
    for agent_name in env_learning.agents
}


In [None]:
# Training: run train_dqn on the learning environment with the settings defined above
train_dqn(agent_dict, env_learning, num_iterations, num_episodes_per_iteration)


In [None]:
# Evaluation environment (rendered, episodes capped at 1000 steps)
env_kwargs=dict(render_mode="human", max_steps=1000)
env_eval = CatToyEnv(**env_kwargs)

# Evaluate the agents that were just trained
mean_std_rewards = evaluate_model(agent_dict, env_eval, n_eval_episodes=1)
# evaluate_model returns {agent_name: (mean, std)}; report each agent's own mean and std
# instead of printing one agent's tuple as the "+/-" of another.
for agent_name, (mean_reward, std_reward) in mean_std_rewards.items():
    print(f"{agent_name} mean_reward: {mean_reward} +/- {std_reward}")

In [None]:
# Save every agent's model under the "models" directory
save_dqn(agent_dict, "models")

In [None]:
# For Google Colab: save the models inside the cloned repository and push them as an artifact
%cd /content/cat_brain/cat_dqn
save_dqn(agent_dict, "models")
!git config --global user.email "taka.flemish.giant@gmail.com"
!git config --global user.name "nekoneko02"
# NOTE(review): the setup cell checks out origin/cnn directly (detached HEAD); a bare
# `git pull` has no branch to merge into in that state - confirm this step works as intended.
!git pull
!git add models/*
!git commit -m "Model保存 from Google Colab"
# Push the current commit to the remote branch google-colab-artifact.
!git push origin HEAD:google-colab-artifact


In [None]:
# Evaluation environment (rendered, episodes capped at 1000 steps)
env_kwargs=dict(render_mode="human", max_steps=1000)
env_eval = CatToyEnv(**env_kwargs)

# Load the saved models
loaded_model = load_dqn(env_eval, ["cat", "toy"], "models")

# Evaluate the loaded models
mean_std_rewards = evaluate_model(loaded_model, env_eval, n_eval_episodes=10)
# evaluate_model returns {agent_name: (mean, std)}; report each agent's own mean and std
# instead of printing one agent's tuple as the "+/-" of another.
for agent_name, (mean_reward, std_reward) in mean_std_rewards.items():
    print(f"{agent_name} mean_reward: {mean_reward} +/- {std_reward}")

In [None]:
# Dummy environment, used only so load_dqn can build agents with the right network sizes
env_kwargs=dict(render_mode="human", max_steps=1000)
env_dummy = CatToyEnv(**env_kwargs)

# Load the saved models
loaded_model = load_dqn(env_dummy, ["cat", "toy", "dummy"], "models")
# Export the cat network that was just loaded from disk, not the in-memory training agent.
policy_net = loaded_model["cat"].model

# Example input for tracing. DQN.forward indexes x[:, -1, :], so the input must be
# (batch, sequence, state_dim); a 2-D tensor fails inside forward.
# The feature size is taken from the agent so it always matches the network.
# (The previous example concatenated three (1, 2) tensors - toy, cat, dummy - i.e. 6
# features; presumably that is the observation layout - TODO confirm.)
state_dim = loaded_model["cat"].state_shape
# Create the example on the same device as the network's parameters.
export_device = next(policy_net.parameters()).device
example_obs = torch.randn(1, 1, state_dim, device=export_device)

# ONNX export. forward also returns the GRU hidden state as a second output;
# only the first output is given an explicit name here.
torch.onnx.export(
    policy_net,
    example_obs,  # a single tensor, not a dict
    "cat_dqn_policy.onnx",
    export_params=True,
    opset_version=11,
    input_names=["obs"],
    output_names=["q_values"],
    dynamic_axes={
        "obs": {0: "batch_size"},
        "q_values": {0: "batch_size"}
    }
)


In [None]:
# Close the environments created by this notebook, including the preview and
# ONNX-export ones. Cells may have been skipped, so look each name up instead of
# assuming it exists.
for _env_name in ("env_preview", "env_learning", "env_eval", "env_dummy"):
    _env = globals().get(_env_name)
    if _env is not None:
        _env.close()