<a href="https://colab.research.google.com/github/mitosagi/puzzdra-nnsolver/blob/master/puzz_move.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 初期化

In [1]:
# Mount Google Drive at /content/drive; the install cell below copies the project sources from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!cp -r /content/drive/MyDrive/User/python/puzzdra-nnsolver /content/puzzdra-nnsolver
%cd /content/puzzdra-nnsolver
!pip install --log=pip_log -e .
!pip install git+https://github.com/DLR-RM/stable-baselines3
!pip install sbx-rl

/content/puzzdra-nnsolver
Obtaining file:///content/puzzdra-nnsolver
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: Puzzpy
  Attempting uninstall: Puzzpy
    Found existing installation: Puzzpy 1.0
    Uninstalling Puzzpy-1.0:
      Successfully uninstalled Puzzpy-1.0
  Running setup.py develop for Puzzpy
Successfully installed Puzzpy-1.0
Collecting git+https://github.com/DLR-RM/stable-baselines3
  Cloning https://github.com/DLR-RM/stable-baselines3 to /tmp/pip-req-build-l3hfy0dm
  Running command git clone --filter=blob:none --quiet https://github.com/DLR-RM/stable-baselines3 /tmp/pip-req-build-l3hfy0dm
  Resolved https://github.com/DLR-RM/stable-baselines3 to commit 69afefc91d408d352b4224ae5244ad2c32bb7634
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting sbx-rl
  Downloading sbx_rl-0.8.0-py3-none-any.whl (44 kB)
[2K

## 実際の処理

In [96]:
import numpy as np
import gymnasium
from gymnasium import spaces
from puzzpy import PuzzTable
from stable_baselines3.common.env_checker import check_env
import random

drop_color = 3 # number of drop colours; also the number of one-hot colour planes in the observation
board_width = 6 # number of board columns
board_height = 5 # number of board rows
min_size = 6 # observation height/width are zero-padded up to at least this size (original note: 36 if CNN, 6 if manual MLP or CNN)

class bcolors:
    """ANSI SGR escape sequences used to colour and style console output."""

    # --- reset and text attributes ---
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # --- standard foreground colours (codes 30-36) ---
    BLACK = '\033[30m'
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    BLUE = '\033[34m'
    MAGENTA = '\033[35m'
    CYAN = '\033[36m'

    # --- bright foreground colours (codes 91-96), named by their intended use ---
    FAIL = '\033[91m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    OKBLUE = '\033[94m'
    HEADER = '\033[95m'
    OKCYAN = '\033[96m'

class PuzzEnv(gymnasium.Env):
  """
  Gymnasium environment for a Puzzle & Dragons style board backed by ``puzzpy.PuzzTable``.

  Actions: ``Discrete(4)`` finger moves. The index is used to pick an entry of
  ``PuzzTable.next_tables()``; actions whose indices differ by 2 (0/2 and 1/3) are
  treated as opposite directions.

  Observations: uint8 array of shape ``(3 + drop_color, H, W)`` where H and W are the
  board size zero-padded up to ``min_size``. Channels, in order:
    0. turn counter (``table.get_turn()``) broadcast over the board
    1. last action (255 before the first move) broadcast over the board
    2. finger position (32 at the finger cell, 0 elsewhere)
    3+. one one-hot plane per drop colour

  Reward: change in ``table.eval_otoshi()`` since the previous observation, or -1
  when the episode is ended by a backtracking move or a move into the wall.
  """
  # No GUI can be implemented on Colab, so only console rendering is offered.
  # Gymnasium reads the supported modes from the 'render_modes' key.
  metadata = {'render_modes': ['console']}

  def __init__(self):
    super().__init__()

    self.action_space = spaces.Discrete(4) # would be 5 if the "release finger" action were included
    padded_height = max(min_size, board_height)
    padded_width = max(min_size, board_width)
    # 1 turn plane + 1 last-action plane + 1 finger-position plane + one plane per drop colour
    self.observation_space = spaces.Box(low=0, high=255, shape=(1+1+1+drop_color, padded_height, padded_width), dtype=np.uint8)
    self.action = 255
    self.prev_action = 255
    self.prev_finger_array = None
    self.rew = 0

  def retobs(self, table):
    """
    Build the observation for ``table`` and the reward since the previous call.

    Side effect: ``self.rew`` is updated to ``table.eval_otoshi()``.

    :param table: board state exposing get_turn / get_XY_as_table / get_table / eval_otoshi
    :return: (np.ndarray, reward) - padded uint8 observation and the change in
        ``eval_otoshi()`` since the last call
    """
    turn_array = [np.full((board_height, board_width), table.get_turn(), np.uint8)]
    action_array = [np.full((board_height, board_width), self.action, np.uint8)]
    finger_array = np.array([np.array(table.get_XY_as_table()).astype(np.uint8) * (2 ** 5)]).astype(np.uint8)
    # Disabled experiment: blend in the previous finger plane as a decaying trail.
    # if self.prev_finger_array is not None:
    #   finger_array = finger_array + self.prev_finger_array // 2
    # self.prev_finger_array = finger_array
    table_array = np.array(table.get_table()).astype(np.uint8)
    # One-hot encode the drops over a FIXED range 0..drop_color and discard plane 0.
    # Sizing the range from table_array.max() would emit fewer planes whenever the
    # highest colour is absent from the board, breaking the declared observation shape.
    # https://stackoverflow.com/questions/67249470/convert-a-2d-numpy-array-into-a-hot-encoded-3d-numpy-array-with-same-values-in
    u = np.arange(drop_color + 1)
    table_array = (u[:,np.newaxis,np.newaxis]==table_array).astype(np.uint8)[1:]
    tmp_array = np.concatenate([turn_array, action_array, finger_array, table_array])
    now_rew = table.eval_otoshi()
    step_rew = now_rew - self.rew
    self.rew = now_rew
    pad_width = [(0,0), (0, max(min_size - board_height, 0)), (0, max(min_size - board_width, 0))]
    return np.pad(tmp_array, pad_width, mode='constant', constant_values=0), step_rew

  def reset(self, seed = None, test_min = 30, options = None):
    """
    Start a new episode on a random board.

    [Important] The observation must be a numpy array.

    :param seed: optional seed forwarded to ``gymnasium.Env.reset``; it seeds
        ``self.np_random``, from which the board and finger position are drawn
    :param test_min: passed to ``PuzzTable`` as its fourth argument (original note:
        operation time for the board)
    :param options: accepted for Gymnasium API compatibility; unused
    :return: (np.array, dict) initial observation and an empty info dict
    """
    # test_min = random.randrange(10,30,1)
    super().reset(seed=seed)

    # Clear per-episode state. Without this the last action of the previous episode
    # leaks into the first observation and can make the first move of the new
    # episode count as a "backtrack".
    self.action = 255
    self.prev_action = 255
    self.prev_finger_array = None
    self.rew = 0

    rng = self.np_random
    # Re-draw until the starting board evaluates to 0 (n-colour board, fixed operation time).
    while True:
      drops = "".join(str(int(d)) for d in rng.integers(drop_color, size=board_width*board_height))
      finger_x = int(rng.integers(board_width))
      finger_y = int(rng.integers(board_height))
      self.table = PuzzTable(drops, finger_x, finger_y, test_min)
      if self.table.eval_otoshi() == 0:
        break

    return self.retobs(self.table)[0], {}

  def step(self, action):
    """
    Apply one finger move.

    :param action: index into ``self.table.next_tables()``; 4 means "release finger"
        (not reachable through the current ``Discrete(4)`` action space)
    :return: (obs, reward, terminated, truncated, info); truncated is always False
    """
    action = int(action) # accept numpy scalars / 0-d arrays such as those returned by model.predict
    self.action = action
    if action == 4: # release finger
      obs, step_rew = self.retobs(self.table)
      return obs, step_rew, True, False, {}

    next_table = self.table.next_tables()[action]

    if abs(action - self.prev_action) == 2: # going straight back: 0 vs 2 and 1 vs 3
      # next_table = self.table
      return self.retobs(self.table)[0], -1, True, False, {}
    self.prev_action = action

    if next_table.get_table()[0][0] == 127: # moved into the wall
      # next_table = self.table
      return self.retobs(self.table)[0], -1, True, False, {}

    self.table = next_table

    obs, step_rew = self.retobs(self.table)
    terminated = self.table.get_turn() <= 0 # out of time
    return obs, step_rew, terminated, False, {}

  def render(self, mode='console', close=False):
    """
    Print the board to stdout, one coloured dot per drop; the finger cell is underlined.

    :param mode: only 'console' is supported
    :param close: unused
    :raises NotImplementedError: for any mode other than 'console'
    """
    if mode != 'console':
      raise NotImplementedError()

    tcolor = [bcolors.RED, bcolors.BLUE, bcolors.GREEN, bcolors.MAGENTA, bcolors.YELLOW, bcolors.BLACK]
    start = self.table.get_XY_as_table()
    table = self.table.get_table()
    for i in range(board_height):
      cells = []
      for j in range(board_width):
        style = bcolors.UNDERLINE if start[i][j] == 1 else ''
        cells.append(tcolor[table[i][j]-1] + style + "●" + bcolors.ENDC)
      print(''.join(cells))

# Run Stable-Baselines3's environment checker on a fresh PuzzEnv instance.
check_env(PuzzEnv())



In [88]:
# Smoke test: play one episode with uniformly random actions, printing every observation.
env = PuzzEnv()

# Gymnasium's reset() returns (observation, info); unpack it so `obs` is the array itself.
obs, _info = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  # Named `truncated` rather than `_` so IPython's last-output variable is not clobbered.
  obs, reward, done, truncated, info = env.step(env.action_space.sample())
  print('obs=', obs.shape, 'reward=', reward, 'done=', done)
  with np.printoptions(threshold=np.inf):
    print(obs)
  env.render()
  if done or truncated:
    # The episode also ends on an illegal move, in which case reward is -1.
    print("Goal reached!", "reward=", reward)
    break

[31m●[0m[34m[4m●[0m[31m●[0m[32m●[0m[32m●[0m[34m●[0m
[31m●[0m[32m●[0m[34m●[0m[32m●[0m[34m●[0m[31m●[0m
[32m●[0m[31m●[0m[32m●[0m[34m●[0m[31m●[0m[31m●[0m
[31m●[0m[32m●[0m[31m●[0m[31m●[0m[32m●[0m[34m●[0m
[32m●[0m[34m●[0m[31m●[0m[32m●[0m[34m●[0m[31m●[0m
Box(0, 255, (6, 5, 6), uint8)
Discrete(4)
1
Step 1
obs= (6, 5, 6) reward= 0 done= False
[[[29 29 29 29 29 29]
  [29 29 29 29 29 29]
  [29 29 29 29 29 29]
  [29 29 29 29 29 29]
  [29 29 29 29 29 29]]

 [[ 3  3  3  3  3  3]
  [ 3  3  3  3  3  3]
  [ 3  3  3  3  3  3]
  [ 3  3  3  3  3  3]
  [ 3  3  3  3  3  3]]

 [[ 0  0 32  0  0  0]
  [ 0  0  0  0  0  0]
  [ 0  0  0  0  0  0]
  [ 0  0  0  0  0  0]
  [ 0  0  0  0  0  0]]

 [[ 1  1  0  0  0  0]
  [ 1  0  0  0  0  1]
  [ 0  1  0  0  1  1]
  [ 1  0  1  1  0  0]
  [ 0  0  1  0  0  1]]

 [[ 0  0  1  0  0  1]
  [ 0  0  1  0  1  0]
  [ 0  0  0  1  0  0]
  [ 0  0  0  0  0  1]
  [ 0  1  0  0  1  0]]

 [[ 0  0  0  1  1  0]
  [ 0  1  0  1  0  0]

In [98]:
# Start TensorBoard on the puzzdra_tensorboard log directory (port 6006) and open it in a separate window.
%load_ext tensorboard
%reload_ext tensorboard
# %tensorboard --logdir puzzdra_tensorboard
%tensorboard --logdir puzzdra_tensorboard --host=127.0.0.1 --port=6006 --load_fast=false
from google.colab import output
output.serve_kernel_port_as_window(6006, path="")

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


<IPython.core.display.Javascript object>



<IPython.core.display.Javascript object>

In [99]:
# Set the log name before running!!!
# NOTE(review): this value is overwritten by a second `log_name = ...` assignment further down in this cell.
log_name='PPO gamma no turn no prev'

from stable_baselines3 import PPO, SAC
# from sbx import TQC, DroQ, SAC, PPO, DQN, TD3, DDPG
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from stable_baselines3.common.envs.multi_input_envs import SimpleMultiObsEnv
from stable_baselines3.common.callbacks import BaseCallback
from statistics import mean

# NOTE(review): overrides the log_name assigned at the top of this cell - keep only one of the two.
log_name='PPO gamma'

# Create the environment
# NOTE(review): this instance does not appear to be used by the training code in this cell.
env = PuzzEnv()

# Wrap the environment: four PuzzEnv instances behind a DummyVecEnv, monitored by VecMonitor
# monienv = Monitor(env, filename=None, allow_early_resets=True)
monienv = VecMonitor(DummyVecEnv([lambda: PuzzEnv() for i in range(4)]), filename=None)

import torch as th
import torch.nn as nn
from gymnasium import spaces
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor


class CustomCNN(BaseFeaturesExtractor):
    """
    Small two-convolution feature extractor for channels-first (CxHxW) observations.

    :param observation_space: (gym.Space) Box space whose first axis is the channel axis.
    :param features_dim: (int) Length of the feature vector produced by the
        final linear layer.
    """

    def __init__(self, observation_space: spaces.Box, features_dim: int = 256):
        super().__init__(observation_space, features_dim)

        # Channels-first input is assumed; any re-ordering happens upstream
        # (pre-processing or a wrapper).
        in_channels = observation_space.shape[0]
        conv_stack = [
            nn.Conv2d(in_channels, 32, kernel_size=3, stride=1, padding='same'),
            nn.ReLU(),
            nn.Conv2d(32, 8, kernel_size=3, stride=1, padding='valid'),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.cnn = nn.Sequential(*conv_stack)

        # Push one sampled observation (with a batch axis added) through the
        # conv stack to discover the flattened width.
        probe = th.as_tensor(observation_space.sample()[None]).float()
        with th.no_grad():
            flat_width = self.cnn(probe).shape[1]

        self.linear = nn.Sequential(nn.Linear(flat_width, features_dim), nn.ReLU())

    def forward(self, observations: th.Tensor) -> th.Tensor:
        """Map a batch of observations to feature vectors of length ``features_dim``."""
        conv_features = self.cnn(observations)
        return self.linear(conv_features)

# Use CustomCNN (with a 32-dimensional output) as the policy's features extractor.
policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=32),
    # share_features_extractor=False
)

# Train the agent: build a PPO model on the vectorised env, logging to ./puzzdra_tensorboard/
model = PPO('MlpPolicy', monienv, verbose=1, policy_kwargs=policy_kwargs, tensorboard_log="./puzzdra_tensorboard/")
# model = PPO('MlpPolicy', monienv, verbose=1, policy_kwargs=dict(net_arch=[64,64]), tensorboard_log="./puzzdra_tensorboard/")
print(model.policy)

class TensorboardCallback(BaseCallback):
  """
  Custom callback that periodically evaluates the current policy and logs the result
  to TensorBoard under the key 'combo'.

  Every ``check_freq`` callback steps, up to ``n_eval_episodes`` episodes are played
  on a fresh ``PuzzEnv`` with deterministic actions. The value logged is the mean of
  the reward returned by the *final* step of each episode that finished within
  ``max_eval_steps`` steps (0 if none finished).
  NOTE(review): this is the last step's reward, not the episode return - confirm that
  is the intended 'combo' metric.

  :param verbose: verbosity level forwarded to ``BaseCallback``
  :param check_freq: evaluate every this many calls of ``_on_step``
      (default 100_000//4; presumably 100k timesteps split across 4 parallel envs)
  :param n_eval_episodes: number of evaluation episodes per check
  :param max_eval_steps: step cap per evaluation episode
  """

  def __init__(self, verbose=0, check_freq=100_000//4, n_eval_episodes=100, max_eval_steps=100):
    super().__init__(verbose)
    self.check_freq = check_freq
    self.n_eval_episodes = n_eval_episodes
    self.max_eval_steps = max_eval_steps

  def _on_step(self) -> bool:
    """Run the periodic evaluation; always returns True so training continues."""
    if self.n_calls % self.check_freq != 0:
      return True

    env = PuzzEnv()
    rew_array = []

    for _episode in range(self.n_eval_episodes):
      obs, _info = env.reset()
      for _step in range(self.max_eval_steps):
        # self.model is the algorithm this callback is attached to; using it avoids
        # depending on a notebook-global variable that happens to be named `model`.
        action, _ = self.model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, _info = env.step(action)
        if terminated or truncated:
          rew_array.append(reward)
          break

    self.logger.record('combo', mean(rew_array) if rew_array else 0)
    return True

# Train for 500k timesteps, logging to TensorBoard under log_name with the periodic 'combo' evaluation callback.
model = model.learn(500_000, tb_log_name=log_name, callback=TensorboardCallback()) # 1_500_000 = 1 hour

Using cpu device
ActorCriticPolicy(
  (features_extractor): CustomCNN(
    (cnn): Sequential(
      (0): Conv2d(6, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
      (1): ReLU()
      (2): Conv2d(32, 8, kernel_size=(3, 3), stride=(1, 1), padding=valid)
      (3): ReLU()
      (4): Flatten(start_dim=1, end_dim=-1)
    )
    (linear): Sequential(
      (0): Linear(in_features=128, out_features=32, bias=True)
      (1): ReLU()
    )
  )
  (pi_features_extractor): CustomCNN(
    (cnn): Sequential(
      (0): Conv2d(6, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
      (1): ReLU()
      (2): Conv2d(32, 8, kernel_size=(3, 3), stride=(1, 1), padding=valid)
      (3): ReLU()
      (4): Flatten(start_dim=1, end_dim=-1)
    )
    (linear): Sequential(
      (0): Linear(in_features=128, out_features=32, bias=True)
      (1): ReLU()
    )
  )
  (vf_features_extractor): CustomCNN(
    (cnn): Sequential(
      (0): Conv2d(6, 32, kernel_size=(3, 3), stride=(1, 1), padding=same)
      

KeyboardInterrupt: ignored

In [55]:
# Roll out one episode with the trained model (predict() is called without
# deterministic=True) and render the board after every move.
env = PuzzEnv()
obs, _info = env.reset()
env.render()

print(env.observation_space, env.action_space, env.action_space.sample(), sep='\n')

n_steps = 30
for step in range(n_steps):
  print(f"Step {step + 1}")
  action, _states = model.predict(obs)
  obs, reward, done, _, info = env.step(action)
  # print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if not done:
    continue
  print("Goal reached!", "reward=", reward)
  break

[34m●[0m[31m●[0m[34m●[0m[31m●[0m[31m●[0m[32m●[0m
[31m●[0m[32m●[0m[31m●[0m[31m●[0m[32m●[0m[31m●[0m
[32m●[0m[31m●[0m[34m●[0m[32m●[0m[34m●[0m[32m●[0m
[31m●[0m[32m●[0m[34m●[0m[31m●[0m[34m●[0m[34m[4m●[0m
[31m●[0m[32m●[0m[31m●[0m[32m●[0m[31m●[0m[34m●[0m
Box(0, 255, (6, 6, 6), uint8)
Discrete(4)
0
Step 1
[34m●[0m[31m●[0m[34m●[0m[31m●[0m[31m●[0m[32m●[0m
[31m●[0m[32m●[0m[31m●[0m[31m●[0m[32m●[0m[31m●[0m
[32m●[0m[31m●[0m[34m●[0m[32m●[0m[34m●[0m[32m●[0m
[31m●[0m[32m●[0m[34m●[0m[31m●[0m[34m●[0m[34m●[0m
[31m●[0m[32m●[0m[31m●[0m[32m●[0m[31m●[0m[34m[4m●[0m
Step 2
[34m●[0m[31m●[0m[34m●[0m[31m●[0m[31m●[0m[32m●[0m
[31m●[0m[32m●[0m[31m●[0m[31m●[0m[32m●[0m[31m●[0m
[32m●[0m[31m●[0m[34m●[0m[32m●[0m[34m●[0m[32m●[0m
[31m●[0m[32m●[0m[34m●[0m[31m●[0m[34m●[0m[34m[4m●[0m
[31m●[0m[32m●[0m[31m●[0m[32m●[0m[31m●[0m[34m●[0m
Step 3
[34m●[

In [None]:
# Persist the trained model under the name 'puzzdra_nn' (relative to the current working directory).
model.save('puzzdra_nn')

In [77]:
# Show the architecture of the current model's policy network.
print(model.policy)

ActorCriticPolicy(
  (features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (pi_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (vf_features_extractor): FlattenExtractor(
    (flatten): Flatten(start_dim=1, end_dim=-1)
  )
  (mlp_extractor): MlpExtractor(
    (policy_net): Sequential(
      (0): Linear(in_features=216, out_features=32, bias=True)
      (1): Tanh()
      (2): Linear(in_features=32, out_features=32, bias=True)
      (3): Tanh()
      (4): Linear(in_features=32, out_features=32, bias=True)
      (5): Tanh()
    )
    (value_net): Sequential(
      (0): Linear(in_features=216, out_features=32, bias=True)
      (1): Tanh()
      (2): Linear(in_features=32, out_features=32, bias=True)
      (3): Tanh()
      (4): Linear(in_features=32, out_features=32, bias=True)
      (5): Tanh()
    )
  )
  (action_net): Linear(in_features=32, out_features=4, bias=True)
  (value_net): Linear(in_features=32,

  and should_run_async(code)
