# Google Colab用セットアップ

In [None]:
# Clone the cat-brain repository under /content and switch to the cnn branch.
%cd /content/
!git clone https://github.com/nekoneko02/cat-brain.git
%cd cat-brain
# origin/cnn is a remote-tracking ref, so this checkout leaves HEAD detached.
!git checkout origin/cnn


In [None]:
# Rename the hyphenated directories to underscore names
# (cat-brain -> cat_brain, cat-dqn -> cat_dqn).
%cd /content
!mv /content/cat-brain /content/cat_brain
!mv /content/cat_brain/cat-dqn /content/cat_brain/cat_dqn
# Replace the relative "../cat-game/config/common.json" path inside
# cat_toy_env.py with its absolute location in the Colab checkout.
!sed -i 's|\.\./cat-game/config/common\.json|/content/cat_brain/cat-game/config/common.json|g' /content/cat_brain/cat_dqn/cat_toy_env.py

# 強化学習モデルの学習 (main.py)

以降のセルでは、DQNアルゴリズムを用いて、`CatToyEnv`環境（`cat_toy_env.py`）でモデルを学習させます。

In [None]:
!apt install cmake swig zlib1g-dev
%pip install torch torchvision
%pip install numpy onnx
%pip install pettingzoo[all]
%pip install torchrl
%pip install tensordict


In [None]:
from pettingzoo.test import api_test
from cat_toy_env import CatToyEnv

# Build a single (non-parallel) environment and run PettingZoo's API test on it.
env_kwargs = {"render_mode": None, "max_steps": 1000}
env = CatToyEnv(**env_kwargs)
api_test(
    env,
    num_cycles=1000,
    verbose_progress=False,
)

In [None]:
import torch
import os
import json

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
import importlib
import cat_toy_env
import train
import dqn_agent
# Reload the project modules so their code is re-executed in this session.
# NOTE(review): if train imports names from dqn_agent or cat_toy_env, reloading
# train before dqn_agent would leave train bound to the old objects — confirm
# the intended reload order.
importlib.reload(cat_toy_env)
importlib.reload(train)
importlib.reload(dqn_agent)

# Re-import the names so they are bound to the reloaded modules' definitions.
from cat_toy_env import CatToyEnv
from train import train_dqn, evaluate_model
from dqn_agent import DQNAgent

In [None]:
# Preview environments: built only so the observation/action spaces of each
# agent can be read while assembling the configuration below.
env_kwargs = dict(render_mode=None, chaser="cat", runner="toy", dummy="dummy")
env_preview = CatToyEnv(**env_kwargs)
env_kwargs = dict(render_mode=None, chaser="pre-cat", runner="toy", dummy="dummy")
env_pre_preview = CatToyEnv(**env_kwargs)

# Show what reset() returns for the preview environment.
print("観測の中身:", env_preview.reset())

# num_epoches = 1
# num_replays_per_episode = num_epoches * num_episodes_per_iteration * num_steps_per_episode

# Model settings shared with the game side. The path is relative to the
# current working directory. JSON is read as UTF-8 explicitly so the result
# does not depend on the platform's default encoding.
with open('../cat-game/config/common.json', 'r', encoding='utf-8') as f:
    config_file = json.load(f)

# Categorical (distributional) settings shared by the "cat" and "pre-cat"
# networks. Each network receives its own copy so that mutating one agent's
# settings can never leak into the other's.
common_categorical_config = {
    "v_max": config_file["model"]["v_max"],
    "v_min": config_file["model"]["v_min"],
    "num_atoms": config_file["model"]["num_atoms"],
}

config = {
    # Schedule for the pre-training stage ("pre-cat" vs "toy").
    "pre-train": {
        "num_iterations": 50,
        "num_episodes_per_iteration": 1,
        "num_steps_per_episode": 10000,
        "update_target_steps": 5,
        "replay_interval": 7,
        "batch_size": 64,
    },
    # Schedule for the main training stage ("cat").
    "train": {
        "num_iterations": 100,
        "num_episodes_per_iteration": 1,
        "num_steps_per_episode": 10000,
        "update_target_steps": 10,
        "replay_interval": 7,
        "batch_size": 64,
    },
    "cat": {
        "dqn": {
            "input_dim": env_preview.observation_spaces["cat"].shape[0],
            "rnn": {
                "hidden_dim": config_file["model"]["hidden_size"],
                "sequence_length": config_file["model"]["sequence_length"],
            },
            "feature": [256, 256],
            "value_stream": [128, 1],
            "speed_advantage_stream": [128, 3],
            "direction_advantage_stream": [128, 4],
            "categorical": dict(common_categorical_config),
        },
        "agent": {
            "state_shape": env_preview.observation_spaces["cat"].shape[0],
            "action_space": env_preview.action_spaces["cat"],
            "buffer": {
                "size": 10000,
                "alpha": 0.6,
                "beta": 0.4,
            },
            "learning_rate": 1e-4,
            "discount_rate": 0.995,
        },
    },
    "pre-cat": {
        "dqn": {
            "input_dim": env_pre_preview.observation_spaces["pre-cat"].shape[0],
            "rnn": {
                "hidden_dim": config_file["model"]["hidden_size"],
                "sequence_length": 5,
            },
            "feature": [256, 256],
            "value_stream": [128, 1],
            "advantage_stream": [128, env_pre_preview.action_spaces["pre-cat"].n],
            "categorical": dict(common_categorical_config),
        },
        "agent": {
            "state_shape": env_pre_preview.observation_spaces["pre-cat"].shape[0],
            "action_space": env_pre_preview.action_spaces["pre-cat"],
            "buffer": {
                "size": 10000,
                "alpha": 0.6,
                "beta": 0.4,
            },
            "learning_rate": 1e-4,
            "discount_rate": 0.995,
        },
    },
    "toy": {
        "dqn": {
            "input_dim": env_preview.observation_spaces["toy"].shape[0],
            "feature": [64, 64],
            "value_stream": [64, 1],
            "advantage_stream": [64, env_preview.action_spaces["toy"].n],
            # The toy uses its own value range rather than the shared one.
            "categorical": {
                "v_max": 200,
                "v_min": 0,
                "num_atoms": 51,
            },
        },
        "agent": {
            "state_shape": env_preview.observation_spaces["toy"].shape[0],
            "action_space": env_preview.action_spaces["toy"],
            "buffer": {
                "size": 10000,
                "alpha": 0.6,
                "beta": 0.4,
            },
            "learning_rate": 1e-4,
            "discount_rate": 0.995,
        },
    },
}
print(env_preview.action_spaces["cat"])

In [None]:
# Rough bounds on the discounted return, printed to help judge the value range.
gamma = 0.995  # config["cat"]["agent"]["discount_rate"]
worst_reward = -1 + 0  # worst reward for a single step
best_reward = -0.1 + -0.3  # good reward for a single step
finish_reward = 20

# Quantities reused by several of the prints below.
_one_minus_gamma = 1 - gamma
_gamma_1000 = gamma ** 1000

print("累積報酬の最小値", worst_reward / _one_minus_gamma)
print("良い行動を続けるがクリアしない場合の累積報酬", best_reward / _one_minus_gamma)
print("累積報酬の最大値", finish_reward)
print(
    "1000ステップ後にクリアする場合の累積報酬の最小値",
    worst_reward * (1 - _gamma_1000) / _one_minus_gamma + _gamma_1000 * finish_reward,
)
print(
    "1000ステップ後にクリアする場合の累積報酬の最大値",
    best_reward * (1 - _gamma_1000) / _one_minus_gamma + _gamma_1000 * finish_reward,
)
print()

# How strongly each candidate discount factor shrinks a reward after N steps.
print(f"step: {list(range(0, 1000, 100))}")
for _gamma in (0.9, 0.99, 0.995, 0.999):
    print(f"{_gamma}: {[_gamma ** i for i in range(0, 1000, 100)]}")

In [None]:
# Environment for training (no rendering, capped at 100000 steps).
env_kwargs = {"render_mode": None, "max_steps": 100000}
env_learning = CatToyEnv(**env_kwargs)

In [None]:
def save_dqn(agent_dict, base_path = "models"):
    """Save every agent's model into ``base_path``.

    The directory is created if it does not exist. Each agent is asked to
    save itself to ``<base_path>/<agent_name>_model.pth`` through its
    ``save_model`` method.

    Args:
        agent_dict: Mapping of agent name to an object with ``save_model(path)``.
        base_path: Target directory for the model files.
    """
    os.makedirs(base_path, exist_ok=True)
    for name in agent_dict:
        target = os.path.join(base_path, f"{name}_model.pth")
        agent_dict[name].save_model(target)

def load_dqn(env, agents=("cat", "toy"), base_path="models"):
    """Create a DQNAgent for each requested agent and load its saved model.

    Every requested agent acts once in ``env`` before its weights are loaded;
    this is done so that lazily-initialised layers (LazyLinear) exist by the
    time ``load_model`` runs.

    Args:
        env: Environment providing ``reset()``, ``agent_iter()``, ``last()``
            and ``step()``. It is stepped during the warm-up, so pass a
            throwaway instance.
        agents: Names of the agents to build. Each must be a key of the
            module-level ``config``. The default is an immutable tuple so it
            cannot be modified between calls.
        base_path: Directory holding the ``<agent>_model.pth`` files.

    Returns:
        dict mapping each agent name to its DQNAgent with the model loaded.
    """
    # Build one agent per requested name.
    agent_dict = {
        name: DQNAgent(
            config[name]["dqn"],
            config[name]["agent"],
            device=device
        ) for name in agents
    }

    # Warm-up: let each requested agent act once (initialises LazyLinear).
    env.reset()
    pending = set(agents)
    for agent in env.agent_iter():
        if agent not in agent_dict:
            # Agents that are not being loaded just take action 0.
            env.step(0)
            continue
        # Only the observation is needed; the other values returned by
        # last() are irrelevant for the warm-up.
        obs = env.last()[0]
        env.step(agent_dict[agent].act(obs))
        pending.discard(agent)
        if not pending:
            break

    # Load the saved weights now that every network has been initialised.
    for name in agents:
        filepath = os.path.join(base_path, f"{name}_model.pth")
        agent_dict[name].load_model(filepath)
    return agent_dict

In [None]:
# エージェントの作成
# Agents that take part in the pre-training stage.
pre_agents = ["pre-cat", "toy"]

# One DQNAgent per pre-training agent, built from that agent's config entry.
pre_agent_dict = {
    name: DQNAgent(config[name]["dqn"], config[name]["agent"], device=device)
    for name in pre_agents
}
"""# 学習済みモデルを適用する場合
env_kwargs=dict(render_mode=None, max_steps = config["pre-train"]["num_steps_per_episode"], chaser= "pre-cat", runner = "toy", dummy = None)
env_dummy = CatToyEnv(**env_kwargs)
pre_agent_dict = load_dqn(env_dummy, agents=["pre-cat", "toy"], base_path="pre-models")
"""

In [None]:
# Environment for training stage 1 (pre-training: "pre-cat" chases "toy").
# The step cap is read from the same "pre-train" section that is handed to
# train_dqn below, so the environment and the training schedule cannot drift
# apart if the "train" and "pre-train" settings ever differ.
env_kwargs = dict(
    render_mode=None,
    max_steps=config["pre-train"]["num_steps_per_episode"],
    chaser="pre-cat",
    runner="toy",
    dummy=None,
)
env_learning = CatToyEnv(**env_kwargs)

# Run pre-training.
train_dqn(pre_agent_dict, pre_agents, env_learning, config["pre-train"])


In [None]:
# Evaluation environment for the pre-trained agents (rendered, 3000-step cap).
env_kwargs = {
    "render_mode": "human",
    "max_steps": 3000,
    "chaser": "pre-cat",
    "runner": "toy",
    "dummy": None,
}
env_eval = CatToyEnv(**env_kwargs)

# Evaluate the models over a single episode and print the per-agent results.
mean_std_rewards = evaluate_model(
    pre_agent_dict,
    env_eval,
    n_eval_episodes=1,
)
print(f"mean_reward: {mean_std_rewards['pre-cat']} +/- {mean_std_rewards['toy']}")

In [None]:
# Save the pre-trained models into the "pre-models" directory.
save_dqn(pre_agent_dict, base_path="pre-models")

In [None]:
# Agent names for the main training stage.
agents = ["cat", "toy", "dummy"]

# "cat" starts from a fresh DQNAgent; "toy" reuses the agent trained in the
# pre-training stage.
agent_dict = {
    "cat": DQNAgent(config["cat"]["dqn"], config["cat"]["agent"], device=device),
    "toy": pre_agent_dict["toy"],
}

# To start "cat" from a saved model instead, run the code in the string below.
"""env_kwargs=dict(render_mode=None, max_steps = config["train"]["num_steps_per_episode"], chaser= "cat", runner = "toy", dummy = None)
env_dummy = CatToyEnv(**env_kwargs)
agent_dict["cat"]=load_dqn(env_dummy, ["cat"], "models")["cat"]
"""


In [None]:
# Environment for training stage 2 ("cat" chases "toy", with the dummy enabled).
env_kwargs = {
    "render_mode": None,
    "max_steps": config["train"]["num_steps_per_episode"],
    "chaser": "cat",
    "runner": "toy",
    "dummy": "dummy",
}
env_learning = CatToyEnv(**env_kwargs)

# Train only the "cat" agent with the "train" schedule.
train_dqn(
    agent_dict,
    ["cat"],
    env_learning,
    config["train"],
)

In [None]:
# Evaluation environment (rendered, 3000-step cap, default agent roles).
env_kwargs = {"render_mode": "human", "max_steps": 3000}
env_eval = CatToyEnv(**env_kwargs)

# Evaluate the models over a single episode and print the per-agent results.
mean_std_rewards = evaluate_model(
    agent_dict,
    env_eval,
    n_eval_episodes=1,
)
print(f"mean_reward: {mean_std_rewards['cat']} +/- {mean_std_rewards['toy']}")

In [None]:
# Save the trained models into the "models" directory.
save_dqn(agent_dict, base_path="models")

In [None]:
# The string below is disabled code (a bare string, never executed): Colab
# steps that save the models and push them to the google-colab-artifact branch.
"""
# Google Colab用 Artifact保存
%cd /content/cat_brain/cat_dqn
save_dqn(agent_dict, "models")
!git config --global user.email "taka.flemish.giant@gmail.com"
!git config --global user.name "nekoneko02"
!git pull
!git add models/*
!git commit -m "Model保存 from Google Colab"
!git push origin HEAD:google-colab-artifact
"""

In [None]:
import dqn_onnx
importlib.reload(dqn_onnx)

# Random stand-ins for the toy, cat and dummy parts of one observation.
toy = torch.randn(1, 2)
cat = torch.randn(1, 2)
dum = torch.randn(1, 2)
# NOTE: this hidden state is NOT passed to the export below; only the
# observation tensor is used as the example input.
hidden_state = torch.randn(1, 1, 64)

# Concatenate the parts into one observation row.
single_input = torch.cat([toy, cat, dum], dim=1)  # shape: (1, 6)

# Repeat the row sequence_length times and add a batch dimension.
concat_input = single_input.repeat(config["cat"]["dqn"]["rnn"]["sequence_length"], 1).unsqueeze(0)  # shape: (1, sequence_length, 6)

# Throwaway environment that load_dqn steps once per agent before loading
# weights. It never needs to draw anything, so rendering is disabled, as for
# the other dummy environments in this notebook.
env_kwargs = dict(render_mode=None, max_steps=1000)
env_dummy = CatToyEnv(**env_kwargs)

# Load the saved models and wrap the cat agent's network for export.
loaded_model = load_dqn(env_dummy, ["cat", "toy"], "models")
policy_net = loaded_model["cat"].model
policy_net = dqn_onnx.DQNOnnx(policy_net)

# ONNX export. The example inputs are given as an explicit one-element tuple:
# the wrapped model takes the observation sequence as its only input.
torch.onnx.export(
    policy_net,
    (concat_input,),
    "cat_dqn_policy.onnx",
    export_params=True,
    opset_version=12,
    input_names=["obs"],
    output_names=["action_speed", "action_direction", "q_values_speed", "q_values_direction"],
    dynamic_axes={
        # Keep the batch dimension of the input and of every output variable.
        "obs": {0: "batch_size"},
        "action_speed": {0: "batch_size"},
        "action_direction": {0: "batch_size"},
        "q_values_speed": {0: "batch_size"},
        "q_values_direction": {0: "batch_size"}
    }
)