<a href="https://colab.research.google.com/github/mitosagi/puzzdra-nnsolver/blob/master/puzz_move.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 初期化

In [23]:
# Mount the user's Google Drive into the Colab VM at /content/drive so the
# project sources stored there can be copied into the runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
!cp -r /content/drive/MyDrive/User/python/puzzdra-nnsolver /content/puzzdra-nnsolver
%cd /content/puzzdra-nnsolver
!pip install --log=pip_log -e .
!pip install git+https://github.com/DLR-RM/stable-baselines3

/content/puzzdra-nnsolver
Obtaining file:///content/puzzdra-nnsolver
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: Puzzpy
  Attempting uninstall: Puzzpy
    Found existing installation: Puzzpy 1.0
    Uninstalling Puzzpy-1.0:
      Successfully uninstalled Puzzpy-1.0
  Running setup.py develop for Puzzpy
Successfully installed Puzzpy-1.0
Collecting git+https://github.com/DLR-RM/stable-baselines3
  Cloning https://github.com/DLR-RM/stable-baselines3 to /tmp/pip-req-build-2eddgafv
  Running command git clone --filter=blob:none --quiet https://github.com/DLR-RM/stable-baselines3 /tmp/pip-req-build-2eddgafv
  Resolved https://github.com/DLR-RM/stable-baselines3 to commit 69afefc91d408d352b4224ae5244ad2c32bb7634
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: stable-baselines3
  Building wheel

## 実際の処理

In [29]:
import numpy as np
import gymnasium
from gymnasium import spaces
from puzzpy import PuzzTable
from stable_baselines3.common.env_checker import check_env
import random

class bcolors:
    """ANSI SGR escape sequences used to colourise console output."""
    # Bright foreground colours.
    HEADER = '\x1b[95m'     # bright magenta
    OKBLUE = '\x1b[94m'     # bright blue
    OKCYAN = '\x1b[96m'     # bright cyan
    OKGREEN = '\x1b[92m'    # bright green
    WARNING = '\x1b[93m'    # bright yellow
    FAIL = '\x1b[91m'       # bright red
    # Attribute controls.
    ENDC = '\x1b[0m'        # reset all attributes
    BOLD = '\x1b[1m'
    UNDERLINE = '\x1b[4m'

class PuzzEnv(gymnasium.Env):
  """
  Gymnasium environment for a Puzzle & Dragons style orb-moving puzzle.

  Each action moves the held orb one cell in one of four directions. An
  episode ends when the move runs into a wall or when the table's turn counter
  reaches zero; only the terminal step can carry a non-zero reward.

  :param gamma: base of the shaped terminal reward ``gamma ** (10 - combos)``
      used when ``out_combo`` is false.
  :param out_combo: when true, the terminal reward is the raw value of
      ``PuzzTable.eval_otoshi()`` instead of the shaped reward.
  """
  # No GUI can be implemented on Colab, so only console rendering is offered.
  # Gymnasium reads the "render_modes" key ("render.modes" was the old Gym key).
  metadata = {'render_modes': ['console']}

  ROWS = 5             # board height used by render() and the observation
  COLS = 6             # board width used by render() and the observation
  UPSCALE = 6          # every cell is repeated UPSCALE x UPSCALE in the observation
  WALL_SENTINEL = 127  # value of get_table()[0][0] on a successor that hit a wall

  def __init__(self, gamma = 0.9, out_combo = False):
    super().__init__()

    self.action_space = spaces.Discrete(4) # would be 5 if "release the orb" were included
    # One extra row (+1) on top of the board carries metadata (the turn counter).
    self.observation_space = spaces.Box(
      low=0, high=255,
      shape=(1, (self.ROWS + 1) * self.UPSCALE, self.COLS * self.UPSCALE),
      dtype=np.uint8)

    self.out_combo = out_combo
    self.gamma = gamma

  def retobs(self, table, prev = 10):
    """
    Build a *terminal* step tuple for ``table``.

    :param table: the PuzzTable to encode and evaluate.
    :param prev: unused; kept so existing callers passing ``prev=`` still work.
    :return: ``(observation, reward, True, False, {})``. Callers that only
        need the observation take element ``[0]``.
    """
    metadata_array = np.zeros((1, self.COLS), np.uint8)
    metadata_array[0][0] = table.get_turn()
    # Board values plus 10 on the cell(s) flagged by get_XY_as_table()
    # (render() treats a value of 1 there as the held-orb position).
    table_array = np.array(table.get_table()).astype(np.uint8) + 10 * np.array(table.get_XY_as_table()).astype(np.uint8)
    stacked = np.stack([np.vstack([metadata_array, table_array])])
    observation = np.repeat(np.repeat(stacked, self.UPSCALE, axis=2), self.UPSCALE, axis=1)

    # Evaluate the board once and reuse the result.
    # NOTE(review): assumes eval_otoshi() has no side effects - confirm.
    combos = table.eval_otoshi()
    if self.out_combo:
      rew = combos
    else:
      rew = (self.gamma ** (10 - combos)) if combos > 0 else 0
    return observation, rew, True, False, {}

  def reset(self, seed = None, test_min = 10, options = None):
    """
    Start a new episode on a random board that scores zero as dealt.

    IMPORTANT: the observation must be a numpy array.

    :param seed: forwarded to ``gymnasium.Env.reset``; it seeds
        ``self.np_random``, which drives board generation, so a seeded reset
        is reproducible.
    :param test_min: fourth argument passed to ``PuzzTable``
        (presumably the move/time budget - TODO confirm against puzzpy).
    :param options: accepted for compatibility with the Gymnasium ``reset``
        signature; currently unused.
    :return: ``(observation, info)`` with an empty info dict.
    """
    super().reset(seed=seed)

    while True:
      # Three-colour board encoded as a digit string, plus a random start cell.
      orbs = "".join(str(int(v)) for v in self.np_random.integers(3, size=self.ROWS * self.COLS))
      start_a = int(self.np_random.integers(self.COLS))
      start_b = int(self.np_random.integers(self.ROWS))
      self.table = PuzzTable(orbs, start_a, start_b, test_min)
      # Reject boards that already evaluate to a non-zero score.
      if self.table.eval_otoshi() == 0:
        break

    return self.retobs(self.table)[0], {}

  def step(self, action):
    """
    Apply one move.

    :param action: index into ``self.table.next_tables()``; integer-like
        values such as 0-d numpy arrays are accepted.
    :return: ``(observation, reward, terminated, truncated, info)``.
    """
    action = int(action)

    # "Release the orb" action; not reachable while action_space is Discrete(4).
    if action == 4:
      return self.retobs(self.table)

    next_table = self.table.next_tables()[action]

    # Moved into a wall: end the episode with zero reward, board unchanged.
    if next_table.get_table()[0][0] == self.WALL_SENTINEL:
      return self.retobs(self.table)[0], 0, True, False, {}

    self.table = next_table

    # Out of turns: terminal step carrying the evaluated reward.
    if self.table.get_turn() <= 0:
      return self.retobs(self.table, prev = action)

    return self.retobs(self.table, prev = action)[0], 0, False, False, {}

  def render(self, mode='console', close=False):
    """
    Print the board to stdout, highlighting the held orb in red.

    :param mode: only ``'console'`` is supported.
    :param close: unused.
    :raises NotImplementedError: for any other ``mode``.
    """
    if mode != 'console':
      raise NotImplementedError()

    cursor = self.table.get_XY_as_table()
    board = self.table.get_table()
    for i in range(self.ROWS):
      cells = []
      for j in range(self.COLS):
        text = str(board[i][j])
        if cursor[i][j] == 1:
          text = bcolors.FAIL + text + bcolors.ENDC
        cells.append(text)
      print(''.join(cells))

# Run stable-baselines3's environment checker on a default-constructed PuzzEnv.
check_env(PuzzEnv())

In [31]:
# Smoke-test the environment by playing one episode with random actions.
env = PuzzEnv(out_combo=True)

# reset() returns (observation, info); unpack so `obs` is the array itself.
obs, _info = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, _truncated, info = env.step(env.action_space.sample())
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

231312
132232
223[91m2[0m21
322313
131311
Box(0, 255, (1, 36, 36), uint8)
Discrete(4)
1
Step 1
obs= [[[9 9 9 ... 0 0 0]
  [9 9 9 ... 0 0 0]
  [9 9 9 ... 0 0 0]
  ...
  [1 1 1 ... 1 1 1]
  [1 1 1 ... 1 1 1]
  [1 1 1 ... 1 1 1]]] reward= 0 done= False
231312
132232
223321
322[91m2[0m13
131311
Step 2
obs= [[[8 8 8 ... 0 0 0]
  [8 8 8 ... 0 0 0]
  [8 8 8 ... 0 0 0]
  ...
  [1 1 1 ... 1 1 1]
  [1 1 1 ... 1 1 1]
  [1 1 1 ... 1 1 1]]] reward= 0 done= False
231312
132232
223321
3221[91m2[0m3
131311
Step 3
obs= [[[7 7 7 ... 0 0 0]
  [7 7 7 ... 0 0 0]
  [7 7 7 ... 0 0 0]
  ...
  [1 1 1 ... 1 1 1]
  [1 1 1 ... 1 1 1]
  [1 1 1 ... 1 1 1]]] reward= 0 done= False
231312
132232
223321
322113
1313[91m2[0m1
Step 4
obs= [[[7 7 7 ... 0 0 0]
  [7 7 7 ... 0 0 0]
  [7 7 7 ... 0 0 0]
  ...
  [1 1 1 ... 1 1 1]
  [1 1 1 ... 1 1 1]
  [1 1 1 ... 1 1 1]]] reward= 0 done= True
231312
132232
223321
322113
1313[91m2[0m1
Goal reached! reward= 0


  and should_run_async(code)


In [32]:
# Load the TensorBoard notebook extension and open it on the directory that
# the training cell writes its logs to.
%load_ext tensorboard
%tensorboard --logdir puzzdra_tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 7732), started 0:06:13 ago. (Use '!kill 7732' to kill it.)

<IPython.core.display.Javascript object>

In [34]:
# Set the log name before running!!!
# NOTE(review): the training loop further down in this cell reassigns
# log_name before it is used, so this value is currently overridden.
log_name='PPO gamma no turn no prev'

from stable_baselines3 import PPO, SAC
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.envs.multi_input_envs import SimpleMultiObsEnv
from stable_baselines3.common.callbacks import BaseCallback
from statistics import mean

class TensorboardCallback(BaseCallback):
  """
  Custom callback for plotting additional values in tensorboard.

  Every ``check_freq`` calls it plays evaluation episodes on a fresh
  ``PuzzEnv(out_combo=True)`` with the model being trained (deterministic
  actions) and records the mean terminal reward under the key ``combo``.

  :param verbose: forwarded to ``BaseCallback``.
  :param check_freq: evaluate once every this many callback calls.
  :param n_eval_episodes: number of evaluation episodes per evaluation.
  :param max_eval_steps: per-episode step cap; an episode that has not ended
      by then is dropped from the average.
  """

  def __init__(self, verbose=0, check_freq=1000, n_eval_episodes=100, max_eval_steps=100):
    super().__init__(verbose)
    self.check_freq = check_freq
    self.n_eval_episodes = n_eval_episodes
    self.max_eval_steps = max_eval_steps

  def _on_step(self) -> bool:
    if self.n_calls % self.check_freq != 0:
      return True

    env = PuzzEnv(out_combo=True)
    rew_array = []

    for _episode in range(self.n_eval_episodes):
      # reset() returns (observation, info).
      obs, _info = env.reset()
      for _step in range(self.max_eval_steps):
        # Use the model attached to this callback: a global `model` is not
        # bound yet (or is stale) while .learn() is still running.
        action, _state = self.model.predict(obs, deterministic=True)
        # step() returns the five-element Gymnasium tuple.
        obs, reward, terminated, truncated, _info = env.step(action)
        if terminated or truncated:
          rew_array.append(reward)
          break

    # mean() raises on an empty sequence; skip logging if nothing finished.
    if rew_array:
      self.logger.record('combo', mean(rew_array))
    return True

# Values for a full sweep: [0.5, 0.6, 0.7, 0.8, 0.9]
for i in [1.0]:
  log_name='PPO gamma ' + str(i)

  # Create the environment.
  # NOTE: with out_combo=True the terminal reward is the raw combo count and
  # `gamma` is not used, so `i` only affects the log name here. To train with
  # the shaped reward instead, use: PuzzEnv(gamma = i)
  env = PuzzEnv(out_combo=True)

  # Wrap the environment so episode statistics are recorded.
  monienv = Monitor(env, filename=None, allow_early_resets=True)

  # Train the agent. `model` is bound *before* learn() is called so that it is
  # still available if training is interrupted, and so that nothing reading
  # `model` during training sees a stale or undefined name.
  model = PPO('CnnPolicy', monienv, verbose=1, tensorboard_log="./puzzdra_tensorboard/")
  model.learn(70000, tb_log_name=log_name, callback=TensorboardCallback()) # 1500000 = 1 hour

Using cuda device
Wrapping the env in a DummyVecEnv.
Logging to ./puzzdra_tensorboard/PPO gamma 1.0_3


  and should_run_async(code)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5.18     |
|    ep_rew_mean     | 0.29     |
| time/              |          |
|    fps             | 248      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5.86        |
|    ep_rew_mean          | 0.34        |
| time/                   |             |
|    fps                  | 251         |
|    iterations           | 2           |
|    time_elapsed         | 16          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.004805024 |
|    clip_fraction        | 0.0175      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.38       |
|    explained_variance   | 8.16e-05    |
|    learning_rate        | 0.

In [40]:
# Play one episode with the trained policy and render every board state.
env = PuzzEnv(out_combo=True)
obs, _info = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

n_steps = 20
step = 0
while step < n_steps:
  print(f"Step {step + 1}")
  action, _states = model.predict(obs)
  obs, reward, done, _, info = env.step(action)
  # print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break
  step += 1

323233
312312
131232
113213
[91m2[0m13311
Box(0, 255, (1, 36, 36), uint8)
Discrete(4)
2
Step 1
323233
312312
131232
113213
1[91m2[0m3311
Step 2
323233
312312
131232
1[91m2[0m3213
113311
Step 3
323233
312312
131232
13[91m2[0m213
113311
Step 4
323233
312312
131232
132[91m2[0m13
113311
Step 5
323233
312312
131232
132313
113[91m2[0m11
Step 6
323233
312312
131232
132[91m2[0m13
113311
Step 7
323233
312312
131232
13[91m2[0m213
113311
Step 8
323233
312312
131232
133213
11[91m2[0m311
Step 9
323233
312312
131232
133213
113[91m2[0m11
Step 10
323233
312312
131232
133213
1131[91m2[0m1
Goal reached! reward= 2


In [None]:
# Persist the trained model under the name 'puzzdra_nn' in the current
# working directory.
model.save('puzzdra_nn')