<a href="https://colab.research.google.com/github/mitosagi/puzzdra-nnsolver/blob/master/puzz_move.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 初期化

ローカルランタイム（Docker）で実行する場合は、次のコマンドで Colab ランタイムを起動します。

    docker run -p 9000:8080 -p 6006:6006 asia-docker.pkg.dev/colab-images/public/runtime

In [1]:
# Fetch the project (including its pybind11 submodule) and work from the checkout.
!git clone --recursive https://github.com/mitosagi/puzzdra-nnsolver
%cd /content/puzzdra-nnsolver
# Editable install of the project; pip's log is written to ./pip_log.
!pip install --log=pip_log -e .
# Stable-Baselines3 is installed straight from its GitHub repository (unpinned).
!pip install git+https://github.com/DLR-RM/stable-baselines3
# !pip install sbx-rl

Cloning into 'puzzdra-nnsolver'...
remote: Enumerating objects: 275, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 275 (delta 17), reused 0 (delta 0), pack-reused 245[K
Receiving objects: 100% (275/275), 8.27 MiB | 22.35 MiB/s, done.
Resolving deltas: 100% (159/159), done.
Submodule 'extern/pybind11' (https://github.com/pybind/pybind11) registered for path 'extern/pybind11'
Cloning into '/content/puzzdra-nnsolver/extern/pybind11'...
remote: Enumerating objects: 27212, done.        
remote: Counting objects: 100% (1131/1131), done.        
remote: Compressing objects: 100% (380/380), done.        
remote: Total 27212 (delta 749), reused 969 (delta 674), pack-reused 26081        
Receiving objects: 100% (27212/27212), 10.71 MiB | 24.54 MiB/s, done.
Resolving deltas: 100% (19076/19076), done.
Submodule path 'extern/pybind11': checked out '8de7772cc72daca8e947b79b83fea46214931604'
/content/puzzdra-nnsolver
Obtai

## 実際の処理

In [36]:
import numpy as np
import gymnasium
from gymnasium import spaces
from puzzpy import PuzzTable
from stable_baselines3.common.env_checker import check_env
import random

# Board / observation configuration shared by PuzzEnv below.
drop_color = 3  # number of distinct drop colours (also the number of one-hot channels)
board_width = 6  # board columns
board_height = 5  # board rows
# Each observation frame is zero-padded up to at least min_height x min_width.
min_width = 6 # 36 if CNN, 6 if manual MLP or CNN
min_height = 5
buffer = 6 # num of previous observation

class bcolors:
    """ANSI escape sequences used to colour and style console output.

    PuzzEnv.render() picks drop colours from these constants and appends
    ENDC after every drop to reset the terminal style.
    """
    # Bright / semantic styles.
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    # Reset and text attributes.
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Standard foreground colours.
    BLACK = '\033[30m'
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    MAGENTA = '\033[35m'
    CYAN = '\033[36m'

class PuzzEnv(gymnasium.Env):
  """
  Puzzle & Dragons board environment.

  Actions 0-3 select one of the four successor boards returned by
  ``PuzzTable.next_tables()``. The observation is a uint8 array laid out as
  (channels, stacked height, width):

  * channel 0: the remaining turn count, broadcast over the board,
  * channel 1: the finger-position mask from ``get_XY_as_table()``,
  * channels 2..: one one-hot plane per drop colour.

  The last ``buffer`` frames are stacked along the height axis, newest first.
  The reward of a step is the change in ``eval_otoshi()`` since the previous
  observation.
  """
  # No GUI can be implemented on Colab, so only console rendering is offered.
  # Gymnasium looks the supported modes up under the 'render_modes' key.
  metadata = {'render_modes': ['console']}

  def __init__(self):
    super(PuzzEnv, self).__init__()

    self.action_space = spaces.Discrete(4) # would be 5 if lifting the finger were included
    self.observation_space = spaces.Box(low=0, high=255, shape=(1+1+drop_color, max(min_height, board_height) * buffer, max(min_width, board_width)), dtype=np.uint8) # 1 for turn num, +1 for finger position
    self.buffer_tmp_array = None  # stacked frame history; None until the first observation
    self.rew = 0  # eval_otoshi() value seen at the previous observation

  def retobs(self, table):
    """
    Build the stacked observation for ``table`` and the reward since the last call.

    :param table: PuzzTable to observe
    :return: (observation array, eval_otoshi() delta since the previous call)
    """
    turn_array = [np.full((board_height, board_width), table.get_turn(), np.uint8)]
    finger_array = np.array([np.array(table.get_XY_as_table()).astype(np.uint8)]).astype(np.uint8)
    table_array = np.array(table.get_table()).astype(np.uint8)
    # One-hot encode the drops with a fixed number of planes (one per colour).
    # Sizing the encoding from table_array.max() would drop planes whenever the
    # highest colour is absent from the board and break the declared shape.
    # NOTE(review): assumes drop values are 1..drop_color (render() indexes with value-1) - confirm.
    colors = np.arange(1, drop_color + 1)
    table_array = (colors[:, np.newaxis, np.newaxis] == table_array).astype(np.uint8)
    frame = np.pad(np.concatenate([turn_array, finger_array, table_array]), [(0,0), (0, max(min_height - board_height, 0)), (0, max(min_width - board_width, 0))], mode='constant', constant_values=0)
    if self.buffer_tmp_array is None:
      # First observation of an episode: fill the whole history with this frame.
      self.buffer_tmp_array = np.concatenate([frame for _ in range(buffer)], 1)
    # Put the newest frame first and drop the oldest frame from the end.
    stacked = np.concatenate([frame, self.buffer_tmp_array[:, :- frame.shape[1]]], 1)
    self.buffer_tmp_array = stacked
    now_rew = table.eval_otoshi()
    step_rew = now_rew - self.rew
    self.rew = now_rew
    return stacked, step_rew

  def reset(self, seed = None, test_min = 30, options = None):
    """
    Start a new episode on a random board.

    Boards are re-drawn until ``eval_otoshi()`` is 0 for the initial layout.
    Random numbers come from ``self.np_random`` so that ``seed`` makes the
    episode reproducible.

    :param seed: optional seed forwarded to ``gymnasium.Env.reset``
    :param test_min: last argument passed to ``PuzzTable`` (the original comment describes it as the operation time)
    :param options: accepted for Gymnasium API compatibility; unused
    :return: (observation as np.array, empty info dict)
    """
    super().reset(seed=seed)
    self.buffer_tmp_array = None
    self.rew = 0

    n_cells = board_width * board_height
    while True:
      # Plain Python ints/str are passed on to the PuzzTable extension.
      drops = "".join(str(int(c)) for c in self.np_random.integers(drop_color, size=n_cells))
      start_x = int(self.np_random.integers(board_width))
      start_y = int(self.np_random.integers(board_height))
      self.table = PuzzTable(drops, start_x, start_y, test_min)
      if self.table.eval_otoshi() == 0:
        break

    return self.retobs(self.table)[0], {}

  def step(self, action):
    """
    Apply ``action`` and return (obs, reward, terminated, truncated, info).

    Action 4 (lifting the finger) ends the episode immediately; it lies
    outside the declared ``Discrete(4)`` action space and is kept only for
    callers that pass it explicitly.
    """
    if action == 4: # lift the finger
      obs, reward = self.retobs(self.table)
      return obs, reward, True, False, {}

    next_table = self.table.next_tables()[action]

    if next_table.get_table()[0][0] == 127: # moved into a wall
      # The board stays as it is but the attempt still costs one turn.
      self.table.set_turn(self.table.get_turn() - 1)
      next_table = self.table

    self.table = next_table

    terminated = bool(self.table.get_turn() <= 0) # out of time
    obs, reward = self.retobs(self.table)
    return obs, reward, terminated, False, {}

  def render(self, mode='console', close=False):
    """Print the board to the console; the drop under the finger is underlined."""
    if mode != 'console':
      raise NotImplementedError()

    tcolor = [bcolors.RED, bcolors.BLUE, bcolors.GREEN, bcolors.MAGENTA, bcolors.YELLOW, bcolors.BLACK]
    start = self.table.get_XY_as_table()
    table = self.table.get_table()
    for i in range(board_height):
      for j in range(board_width):
        color = tcolor[table[i][j]-1]
        if start[i][j] == 1:
          print(color  +  bcolors.UNDERLINE + "●" + bcolors.ENDC, end='')
        else:
          print(color  + "●" + bcolors.ENDC, end='')

      print('')

# Run Stable-Baselines3's environment checker on a fresh PuzzEnv instance.
check_env(PuzzEnv())

In [37]:
# Smoke test: drive the environment with random actions and show what it returns.
env = PuzzEnv()

# reset() returns an (observation, info) pair; unpack it so `obs` is the array.
obs, _info = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, _, info = env.step(env.action_space.sample())
  print('obs=', obs.shape, 'reward=', reward, 'done=', done)
  # Print the full observation without numpy's "..." truncation.
  with np.printoptions(threshold=np.inf):
    print(obs)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  1  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 1  0  1  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 1  0  1  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  1  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 0  0  1  0  1  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
    0  0  0  0  0  0  0  0  0  0  0  0  0]
  [ 1  0  1  0  0  1  0  0  0  0  

In [15]:
# Start TensorBoard inside the notebook, reading the logs under ./puzzdra_tensorboard.
%load_ext tensorboard
# %reload_ext tensorboard
%tensorboard --logdir puzzdra_tensorboard --load_fast=false
# %tensorboard --logdir puzzdra_tensorboard --host=127.0.0.1 --port=6006 --load_fast=false
# from google.colab import output
# output.serve_kernel_port_as_iframe(6006, path="")

/bin/bash: line 1: kill: (533) - No such process


<IPython.core.display.Javascript object>

In [47]:
# Set the log name before running!!! It is passed to model.learn() as tb_log_name.
log_name='PPO gamma'

from stable_baselines3 import PPO, SAC
# from sbx import TQC, DroQ, SAC, PPO, DQN, TD3, DDPG
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.common.envs.multi_input_envs import SimpleMultiObsEnv
from stable_baselines3.common.callbacks import BaseCallback
from statistics import mean


# Create the environment

# Wrap the environment: five PuzzEnv instances in a DummyVecEnv, monitored by VecMonitor.
# monienv = Monitor(env, filename=None, allow_early_resets=True)
monienv = VecMonitor(DummyVecEnv([lambda: PuzzEnv() for i in range(5)]), filename=None)

import torch as th
import torch.nn as nn
from gymnasium import spaces
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor

def conv3x3(in_channels, out_channels, stride=1):
    """Return a bias-free 3x3 ``nn.Conv2d`` with padding 1 and the given stride."""
    conv_options = dict(kernel_size=3, stride=stride, padding=1, bias=False)
    return nn.Conv2d(in_channels, out_channels, **conv_options)

class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm stages with an identity skip connection.

    The input is added back to the output of the second batch norm before the
    final ReLU, so the block keeps the channel count unchanged.
    """

    def __init__(self, num_channels, stride=1):
        super().__init__()
        # Only the first convolution receives ``stride``.
        self.conv1 = conv3x3(num_channels, num_channels, stride)
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.conv2 = conv3x3(num_channels, num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, x):
        hidden = nn.functional.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(hidden))
        residual += x  # skip connection
        return nn.functional.relu(residual)

class CustomCNN(BaseFeaturesExtractor):
    """
    Feature extractor: 3x3 conv stem, one residual block, then a linear head.

    :param observation_space: (gym.Space) channels-first (CxHxW) Box space
    :param features_dim: (int) Number of features extracted.
        This is the output size of the final linear layer.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 256):
        super().__init__(observation_space, features_dim)
        # Observations are treated as channels-first images, so axis 0 is the channel count.
        in_channels = observation_space.shape[0]
        stem = [
            nn.Conv2d(in_channels, 128, kernel_size=3, stride=1, padding='same'),
            nn.BatchNorm2d(128),
            nn.ReLU(),
        ]
        # A second ResidualBlock(128) can be inserted before the Flatten if desired.
        self.cnn = nn.Sequential(*stem, ResidualBlock(128), nn.Flatten())

        # Push one sampled observation through the CNN to learn the flattened size.
        probe = th.as_tensor(observation_space.sample()[None]).float()
        with th.no_grad():
            n_flatten = self.cnn(probe).shape[1]

        self.linear = nn.Sequential(nn.Linear(n_flatten, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        flat_features = self.cnn(observations)
        return self.linear(flat_features)

# Use CustomCNN as the policy's feature extractor, producing 32 features.
policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=32),
    # share_features_extractor=False
)

# Train the agent
model = PPO('CnnPolicy', monienv, verbose=1, policy_kwargs=policy_kwargs, tensorboard_log="./puzzdra_tensorboard/")
# model = PPO('CnnPolicy', monienv, verbose=1, tensorboard_log="./puzzdra_tensorboard/")
print(model.policy)

class TensorboardCallback(BaseCallback):
  """
  Custom callback for plotting additional values in tensorboard.

  Every ``check_freq`` calls of ``_on_step`` the model being trained is run
  deterministically on a fresh ``PuzzEnv`` and the mean episode return is
  logged under the key ``'combo'``. PuzzEnv rewards are per-step changes of
  ``eval_otoshi()``, so the episode return (not the last step's reward) is
  the value reached at the end of the episode.

  :param verbose: verbosity level forwarded to ``BaseCallback``
  :param check_freq: number of ``_on_step`` calls between evaluations
  :param n_eval_episodes: evaluation episodes per check
  :param max_episode_steps: step cap per evaluation episode
  """

  def __init__(self, verbose=0, check_freq=100_000//5, n_eval_episodes=100, max_episode_steps=100):
    super(TensorboardCallback, self).__init__(verbose)
    # NOTE(review): the default presumably targets ~100k timesteps with 5 parallel envs - confirm.
    self.check_freq = check_freq
    self.n_eval_episodes = n_eval_episodes
    self.max_episode_steps = max_episode_steps

  def _on_step(self) -> bool:
    if self.n_calls % self.check_freq != 0:
      return True

    env = PuzzEnv()
    episode_returns = []

    for _ in range(self.n_eval_episodes):
      obs, _info = env.reset()
      episode_return = 0
      for _step in range(self.max_episode_steps):
        # Use the model attached to this callback rather than a notebook global.
        action, _ = self.model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _info = env.step(int(action))
        episode_return += reward
        if terminated or truncated:
          # Only episodes that actually finished contribute to the mean.
          episode_returns.append(episode_return)
          break

    self.logger.record('combo', mean(episode_returns) if len(episode_returns) > 0 else 0)
    return True

# Train for 200_000 timesteps, logging to TensorBoard under log_name and running the callback above.
model = model.learn(200_000, tb_log_name=log_name, callback=TensorboardCallback()) # 1_500_000 = 1 hour

Using cpu device
ActorCriticCnnPolicy(
  (features_extractor): CustomCNN(
    (cnn): Sequential(
      (0): Conv2d(5, 128, kernel_size=(3, 3), stride=(1, 1), padding=same)
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): ResidualBlock(
        (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (4): Flatten(start_dim=1, end_dim=-1)
    )
    (linear): Sequential(
      (0): Linear(in_features=165888, out_features=32, bias=True)
      (1): ReLU()
    )
  )
  (pi_features_extractor): CustomCNN(
    (cnn): Sequential(
      (0): Conv2d(5, 128, kernel_size=(3, 3), stride=(1, 1), paddin

KeyboardInterrupt: ignored

In [40]:
# Roll out the trained model on a fresh board and render every step.
env = PuzzEnv()

obs, _info = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

n_steps = 30
for step in range(n_steps):
  print("Step {}".format(step + 1))
  # predict() is called without deterministic=True here, unlike in TensorboardCallback.
  action, _states = model.predict(obs)
  obs, reward, done, _, info = env.step(action)
  # print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

[34m●[0m[32m●[0m[34m●[0m[31m●[0m[31m●[0m[34m●[0m
[31m●[0m[34m●[0m[31m●[0m[31m●[0m[32m●[0m[32m●[0m
[31m●[0m[32m●[0m[31m●[0m[32m●[0m[31m●[0m[32m●[0m
[34m●[0m[31m●[0m[34m●[0m[34m●[0m[31m●[0m[34m●[0m
[32m●[0m[31m[4m●[0m[34m●[0m[31m●[0m[34m●[0m[31m●[0m
Box(0, 255, (5, 36, 36), uint8)
Discrete(4)
2
Step 1
[34m●[0m[32m●[0m[34m●[0m[31m●[0m[31m●[0m[34m●[0m
[31m●[0m[34m●[0m[31m●[0m[31m●[0m[32m●[0m[32m●[0m
[31m●[0m[32m●[0m[31m●[0m[32m●[0m[31m●[0m[32m●[0m
[34m●[0m[31m●[0m[34m●[0m[34m●[0m[31m●[0m[34m●[0m
[32m●[0m[34m●[0m[31m[4m●[0m[31m●[0m[34m●[0m[31m●[0m
Step 2
[34m●[0m[32m●[0m[34m●[0m[31m●[0m[31m●[0m[34m●[0m
[31m●[0m[34m●[0m[31m●[0m[31m●[0m[32m●[0m[32m●[0m
[31m●[0m[32m●[0m[31m●[0m[32m●[0m[31m●[0m[32m●[0m
[34m●[0m[31m●[0m[31m[4m●[0m[34m●[0m[31m●[0m[34m●[0m
[32m●[0m[34m●[0m[34m●[0m[31m●[0m[34m●[0m[31m●[0m
Step 3
[34m●

In [41]:
# Save the trained model under the name 'puzzdra_nn'.
model.save('puzzdra_nn')

In [None]:
# Show the policy network's module structure.
print(model.policy)