<a href="https://colab.research.google.com/github/mitosagi/puzzdra-nnsolver/blob/master/puzz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 初期化

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
!cp -r /content/drive/MyDrive/User/python/puzzdra-nnsolver /content/puzzdra-nnsolver
%cd /content/puzzdra-nnsolver
!pip install --log=pip_log -e .
!python puzz_test.py

/content/puzzdra-nnsolver
Obtaining file:///content/puzzdra-nnsolver
Installing collected packages: Puzzpy
  Running setup.py develop for Puzzpy
Successfully installed Puzzpy
512621
215133
432116
125133
523161
[[5, 1, 2, 6, 2, 1], [2, 5, 1, 1, 3, 3], [4, 3, 2, 1, 1, 6], [1, 2, 5, 1, 3, 3], [5, 2, 3, 1, 6, 1]]
[[0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
[[5, 1, 5, 6, 2, 1], [2, 1, 2, 1, 3, 3], [4, 3, 2, 1, 1, 6], [1, 2, 5, 1, 3, 3], [5, 2, 3, 1, 6, 1]]
[[0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
[[5, 1, 2, 6, 2, 1], [2, 1, 2, 1, 3, 3], [4, 3, 5, 1, 1, 6], [1, 2, 5, 1, 3, 3], [5, 2, 3, 1, 6, 1]]
[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]]
[[5, 1, 2, 6, 2, 1], [2, 1, 1, 5, 3, 3], [4, 3, 2, 1, 1, 6], [1, 2, 5, 1, 3, 3], [5, 2, 3, 1, 6, 1]]
[[0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0

In [13]:
!pip install git+https://github.com/DLR-RM/stable-baselines3

Collecting git+https://github.com/DLR-RM/stable-baselines3
  Cloning https://github.com/DLR-RM/stable-baselines3 to /tmp/pip-req-build-rsnzlrws
  Running command git clone -q https://github.com/DLR-RM/stable-baselines3 /tmp/pip-req-build-rsnzlrws
Building wheels for collected packages: stable-baselines3
  Building wheel for stable-baselines3 (setup.py) ... [?25l[?25hdone
  Created wheel for stable-baselines3: filename=stable_baselines3-1.1.0a11-cp37-none-any.whl size=160811 sha256=b01c29df8a774074d8634a8c4a06460807e64e954e8c5b87a511248378052551
  Stored in directory: /tmp/pip-ephem-wheel-cache-75kq94fm/wheels/cf/89/6b/cd4b89427eb5ff0858bcba73911088d606c59eb3a97290b1bb
Successfully built stable-baselines3
Installing collected packages: stable-baselines3
Successfully installed stable-baselines3-1.1.0a11


## サンプルの実行

In [14]:
import numpy as np
import gym
from gym import spaces


class GoLeftEnv(gym.Env):
  """
  Gymのインターフェースに従うカスタム環境
  エージェントが常に左に行くことを学ぶ環境
  """
  # ColabのためGUIを実装できない
  metadata = {'render.modes': ['console']}

  # 定数を定義
  LEFT = 0
  RIGHT = 1

  def __init__(self, grid_size=10):
    super(GoLeftEnv, self).__init__()

    # 1Dグリッドのサイズ
    self.grid_size = grid_size

    # グリッドの右側でエージェントを初期化
    self.agent_pos = grid_size - 1

    # 行動空間と状態空間を定義
    # gym.spacesオブジェクトでなければならない
    # 離散行動を使用する場合の例には、左と右の2つがある
    n_actions = 2
    self.action_space = spaces.Discrete(n_actions)

    # 状態はエージェントの座標になる
    # Discrete空間とBox空間の両方で表現できる
    self.observation_space = spaces.Box(low=0, high=self.grid_size,
                                       shape=(1,), dtype=np.float32)

  def reset(self):
    """
    【重要】観測はnumpy配列でなければならない
    :return: (np.array)
    """
    # グリッドの右側でエージェントを初期化
    self.agent_pos = self.grid_size - 1

    # float32に変換してより一般的なものにします（連続行動を使用する場合）
    return np.array(self.agent_pos).astype(np.float32)

  def step(self, action):
    if action == self.LEFT:
      self.agent_pos -= 1
    elif action == self.RIGHT:
      self.agent_pos += 1
    else:
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))

    # グリッドの境界を表現
    self.agent_pos = np.clip(self.agent_pos, 0, self.grid_size)

    # グリッドの左側にいるか
    done = self.agent_pos == 0

    # ゴールを除くすべての場所で0の報酬
    reward = 1 if self.agent_pos == 0 else 0

    # 必要に応じて情報を渡すことができるが、現在は未使用
    info = {}

    return np.array(self.agent_pos).astype(np.float32), reward, done, info

  def render(self, mode='console', close=False):
    if mode != 'console':
      raise NotImplementedError()

    # エージェントは「x」、残りは「.」として表現
    print("." * self.agent_pos, end="")
    print("x", end="")
    print("." * (self.grid_size - self.agent_pos))

In [15]:
env = GoLeftEnv(grid_size=10)

obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

GO_LEFT = 0

# ハードコードされた最高のエージェント：常に左に行く
n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, info = env.step(GO_LEFT)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

.........x.
Box(0.0, 10.0, (1,), float32)
Discrete(2)
0
Step 1
obs= 8.0 reward= 0 done= False
........x..
Step 2
obs= 7.0 reward= 0 done= False
.......x...
Step 3
obs= 6.0 reward= 0 done= False
......x....
Step 4
obs= 5.0 reward= 0 done= False
.....x.....
Step 5
obs= 4.0 reward= 0 done= False
....x......
Step 6
obs= 3.0 reward= 0 done= False
...x.......
Step 7
obs= 2.0 reward= 0 done= False
..x........
Step 8
obs= 1.0 reward= 0 done= False
.x.........
Step 9
obs= 0.0 reward= 1 done= True
x..........
Goal reached! reward= 1


In [16]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

# 環境の生成
env = GoLeftEnv(grid_size=10)

# 環境のラップ
env = Monitor(env, filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

In [17]:
# エージェントの訓練
model = PPO('MlpPolicy', env, verbose=1).learn(5000)

Using cuda device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 99.2     |
|    ep_rew_mean     | 1        |
| time/              |          |
|    fps             | 901      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 63.3        |
|    ep_rew_mean          | 1           |
| time/                   |             |
|    fps                  | 688         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.018118575 |
|    clip_fraction        | 0.214       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.677      |
|    explained_variance   | -1.2        |
|    learnin

In [18]:
# 訓練済みエージェントのテスト
obs = env.reset()
n_steps = 20
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render(mode='console')
  if done:
    # VecEnvは、エピソード完了に遭遇すると自動的にリセットされることに注意
    print("Goal reached!", "reward=", reward)
    break

Step 1
Action:  [0]
obs= [[8.]] reward= [0.] done= [False]
........x..
Step 2
Action:  [0]
obs= [[7.]] reward= [0.] done= [False]
.......x...
Step 3
Action:  [0]
obs= [[6.]] reward= [0.] done= [False]
......x....
Step 4
Action:  [0]
obs= [[5.]] reward= [0.] done= [False]
.....x.....
Step 5
Action:  [0]
obs= [[4.]] reward= [0.] done= [False]
....x......
Step 6
Action:  [0]
obs= [[3.]] reward= [0.] done= [False]
...x.......
Step 7
Action:  [0]
obs= [[2.]] reward= [0.] done= [False]
..x........
Step 8
Action:  [0]
obs= [[1.]] reward= [0.] done= [False]
.x.........
Step 9
Action:  [0]
obs= [[9.]] reward= [1.] done= [ True]
.........x.
Goal reached! reward= [1.]


## 実際の処理

In [101]:
import numpy as np
import gym
from gym import spaces
from puzzpy import PuzzTable

class PuzzEnv(gym.Env):
  """
  パズドラの環境
  """
  # ColabのためGUIを実装できない
  metadata = {'render.modes': ['console']}

  def __init__(self):
    super(PuzzEnv, self).__init__()

    self.action_space = spaces.Discrete(5)

    # 状態はエージェントの座標になる
    # Discrete空間とBox空間の両方で表現できる
    self.observation_space = spaces.Dict({'table': spaces.Box(low=0, high=5,
                                       shape=(5,6), dtype=np.float32),
                                       'start': spaces.Discrete(30),
                                       'turn': spaces.Box(low=1, high=155,
                                       shape=(1,), dtype=np.float32),
                                       'rew': spaces.Box(low=1, high=100,
                                       shape=(1,), dtype=np.float32)})
    
    #                                    'start': spaces.Box(low=0, high=1,
    #                                    shape=(5,6), dtype=np.float32),

  def retobs(self, table):
    # return {'table': np.array(table.get_table()).astype(np.float32),
    #           'start': np.array(table.get_XY_as_table()).astype(np.float32),
    #           'turn': table.get_turn(),'rew': table.eval_otoshi()}, table.eval_otoshi(), True, {}
    return {'table': np.array(table.get_table()).astype(np.float32),
              'start': np.flatnonzero(table.get_XY_as_table())[0],
              'turn': table.get_turn(),'rew': table.eval_otoshi()}, table.eval_otoshi(), True, {}

  def reset(self):
    """
    【重要】観測はnumpy配列でなければならない
    :return: (np.array)
    """
    self.table = PuzzTable(155)
    return self.retobs(self.table)

  def step(self, action):
    if action == 4:
      return self.retobs(self.table)

    next_table = self.table.next_tables()[action]

    if next_table.get_table()[0][0] == 127:
      return self.retobs(self.table)

    self.table = next_table

    if self.table.get_turn() <= 0:
      return self.retobs(self.table)

    # return {'table': np.array(self.table.get_table()).astype(np.float32),
    #           'start': np.array(self.table.get_XY_as_table()).astype(np.float32),
    #           'turn': self.table.get_turn(),'rew': self.table.eval_otoshi()}, 0, False, {}
    return {'table': np.array(self.table.get_table()).astype(np.float32),
              'start': np.flatnonzero(table.get_XY_as_table())[0],
              'turn': self.table.get_turn(),'rew': self.table.eval_otoshi()}, 0, False, {}

  def render(self, mode='console', close=False):
    if mode != 'console':
      raise NotImplementedError()

    
    for i in self.table.get_table():
      print(*i)

In [102]:
env = PuzzEnv()

obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, info = env.step(env.action_space.sample())
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

1 1 1 1 4 1
1 3 6 5 6 2
6 6 4 2 1 4
6 3 5 3 1 1
5 2 5 6 1 4
Dict(rew:Box(1.0, 100.0, (1,), float32), start:Discrete(30), table:Box(0.0, 5.0, (5, 6), float32), turn:Box(1.0, 155.0, (1,), float32))
Discrete(5)
2
Step 1
obs= {'table': array([[1., 1., 1., 1., 4., 1.],
       [1., 3., 6., 5., 6., 2.],
       [6., 6., 4., 2., 1., 4.],
       [6., 3., 5., 3., 1., 1.],
       [5., 2., 5., 6., 1., 4.]], dtype=float32), 'start': 24, 'turn': 154, 'rew': 2} reward= 0 done= False
1 1 1 1 4 1
1 3 6 5 6 2
6 6 4 2 1 4
6 3 5 3 1 1
5 2 5 6 1 4
Step 2
obs= {'table': array([[1., 1., 1., 1., 4., 1.],
       [1., 3., 6., 5., 6., 2.],
       [6., 6., 4., 2., 1., 4.],
       [6., 3., 5., 3., 1., 1.],
       [5., 2., 5., 6., 1., 4.]], dtype=float32), 'start': 23, 'turn': 154, 'rew': 2} reward= 2 done= True
1 1 1 1 4 1
1 3 6 5 6 2
6 6 4 2 1 4
6 3 5 3 1 1
5 2 5 6 1 4
Goal reached! reward= 2


In [103]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.envs.multi_input_envs import SimpleMultiObsEnv

# 環境の生成
env = PuzzEnv()

# 環境のラップ
env = Monitor(env, filename=None, allow_early_resets=True)
#env = DummyVecEnv([lambda: env])
env = SimpleMultiObsEnv()

In [89]:
# エージェントの訓練
model = PPO('MultiInputPolicy', env, verbose=1).learn(300000)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 63.1     |
|    ep_rew_mean     | -5.76    |
| time/              |          |
|    fps             | 661      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 59.2      |
|    ep_rew_mean          | -5.22     |
| time/                   |           |
|    fps                  | 482       |
|    iterations           | 2         |
|    time_elapsed         | 8         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0177618 |
|    clip_fraction        | 0.159     |
|    clip_range           | 0.2       |
|   

In [104]:
# 訓練済みエージェントのテスト
obs = env.reset()
n_steps = 155
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  # print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    # VecEnvは、エピソード完了に遭遇すると自動的にリセットされることに注意
    print("Goal reached!", "reward=", reward)
    break

Step 1
Action:  2
Went right in state 6, got to state 6
Step 2
Action:  2
Went right in state 6, got to state 6
Step 3
Action:  2
Went right in state 6, got to state 6
Step 4
Action:  2
Went right in state 6, got to state 6
Step 5
Action:  2
Went right in state 6, got to state 6
Step 6
Action:  2
Went right in state 6, got to state 6
Step 7
Action:  2
Went right in state 6, got to state 6
Step 8
Action:  2
Went right in state 6, got to state 6
Step 9
Action:  2
Went right in state 6, got to state 6
Step 10
Action:  2
Went right in state 6, got to state 6
Step 11
Action:  2
Went right in state 6, got to state 6
Step 12
Action:  2
Went right in state 6, got to state 6
Step 13
Action:  2
Went right in state 6, got to state 6
Step 14
Action:  2
Went right in state 6, got to state 6
Step 15
Action:  2
Went right in state 6, got to state 6
Step 16
Action:  2
Went right in state 6, got to state 6
Step 17
Action:  2
Went right in state 6, got to state 6
Step 18
Action:  2
Went right in state 6