<a href="https://colab.research.google.com/github/mitosagi/puzzdra-nnsolver/blob/master/puzz.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 初期化

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp -r /content/drive/MyDrive/User/python/puzzdra-nnsolver /content/puzzdra-nnsolver
%cd /content/puzzdra-nnsolver
!pip install --log=pip_log -e .
!python puzz_test.py

/content/puzzdra-nnsolver
Obtaining file:///content/puzzdra-nnsolver
Installing collected packages: Puzzpy
  Running setup.py develop for Puzzpy
Successfully installed Puzzpy
365326
241332
653461
225214
523426
[['\x03', '\x06', '\x05', '\x02', '\x03', '\x06'], ['\x02', '\x04', '\x01', '\x03', '\x03', '\x02'], ['\x06', '\x05', '\x03', '\x04', '\x06', '\x01'], ['\x02', '\x02', '\x05', '\x02', '\x01', '\x04'], ['\x05', '\x02', '\x03', '\x04', '\x02', '\x06']]
[['\x00', '\x00', '\x00', '\x01', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00']]
[['\x7f', '\x06', '\x05', '\x03', '\x02', '\x06'], ['\x02', '\x04', '\x01', '\x03', '\x03', '\x02'], ['\x06', '\x05', '\x03', '\x04', '\x06', '\x01'], ['\x02', '\x02', '\x05', '\x02', '\x01', '\x04'], ['\x05', '\x02', '\x03', '\x04', '\x02', '\x06']]
[['\x00', '\x00', '\x00', '\x00', '\x

In [3]:
!pip install stable-baselines3

Collecting stable-baselines3
[?25l  Downloading https://files.pythonhosted.org/packages/18/d3/6ae6e774ac6cf8f5eeca1c30b9125231db901b75f72da7d81e939f293f69/stable_baselines3-1.0-py3-none-any.whl (152kB)
[K     |██▏                             | 10kB 14.1MB/s eta 0:00:01[K     |████▎                           | 20kB 19.5MB/s eta 0:00:01[K     |██████▍                         | 30kB 22.6MB/s eta 0:00:01[K     |████████▋                       | 40kB 18.2MB/s eta 0:00:01[K     |██████████▊                     | 51kB 19.7MB/s eta 0:00:01[K     |████████████▉                   | 61kB 21.9MB/s eta 0:00:01[K     |███████████████                 | 71kB 20.8MB/s eta 0:00:01[K     |█████████████████▏              | 81kB 21.5MB/s eta 0:00:01[K     |███████████████████▎            | 92kB 22.2MB/s eta 0:00:01[K     |█████████████████████▌          | 102kB 23.5MB/s eta 0:00:01[K     |███████████████████████▋        | 112kB 23.5MB/s eta 0:00:01[K     |█████████████████████████▊

## サンプルの実行

In [4]:
import numpy as np
import gym
from gym import spaces


class GoLeftEnv(gym.Env):
  """
  Gymのインターフェースに従うカスタム環境
  エージェントが常に左に行くことを学ぶ環境
  """
  # ColabのためGUIを実装できない
  metadata = {'render.modes': ['console']}

  # 定数を定義
  LEFT = 0
  RIGHT = 1

  def __init__(self, grid_size=10):
    super(GoLeftEnv, self).__init__()

    # 1Dグリッドのサイズ
    self.grid_size = grid_size

    # グリッドの右側でエージェントを初期化
    self.agent_pos = grid_size - 1

    # 行動空間と状態空間を定義
    # gym.spacesオブジェクトでなければならない
    # 離散行動を使用する場合の例には、左と右の2つがある
    n_actions = 2
    self.action_space = spaces.Discrete(n_actions)

    # 状態はエージェントの座標になる
    # Discrete空間とBox空間の両方で表現できる
    self.observation_space = spaces.Box(low=0, high=self.grid_size,
                                       shape=(1,), dtype=np.float32)

  def reset(self):
    """
    【重要】観測はnumpy配列でなければならない
    :return: (np.array)
    """
    # グリッドの右側でエージェントを初期化
    self.agent_pos = self.grid_size - 1

    # float32に変換してより一般的なものにします（連続行動を使用する場合）
    return np.array(self.agent_pos).astype(np.float32)

  def step(self, action):
    if action == self.LEFT:
      self.agent_pos -= 1
    elif action == self.RIGHT:
      self.agent_pos += 1
    else:
      raise ValueError("Received invalid action={} which is not part of the action space".format(action))

    # グリッドの境界を表現
    self.agent_pos = np.clip(self.agent_pos, 0, self.grid_size)

    # グリッドの左側にいるか
    done = self.agent_pos == 0

    # ゴールを除くすべての場所で0の報酬
    reward = 1 if self.agent_pos == 0 else 0

    # 必要に応じて情報を渡すことができるが、現在は未使用
    info = {}

    return np.array(self.agent_pos).astype(np.float32), reward, done, info

  def render(self, mode='console', close=False):
    if mode != 'console':
      raise NotImplementedError()

    # エージェントは「x」、残りは「.」として表現
    print("." * self.agent_pos, end="")
    print("x", end="")
    print("." * (self.grid_size - self.agent_pos))

In [5]:
env = GoLeftEnv(grid_size=10)

obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

GO_LEFT = 0

# ハードコードされた最高のエージェント：常に左に行く
n_steps = 20
for step in range(n_steps):
  print("Step {}".format(step + 1))
  obs, reward, done, info = env.step(GO_LEFT)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render()
  if done:
    print("Goal reached!", "reward=", reward)
    break

.........x.
Box(0.0, 10.0, (1,), float32)
Discrete(2)
1
Step 1
obs= 8.0 reward= 0 done= False
........x..
Step 2
obs= 7.0 reward= 0 done= False
.......x...
Step 3
obs= 6.0 reward= 0 done= False
......x....
Step 4
obs= 5.0 reward= 0 done= False
.....x.....
Step 5
obs= 4.0 reward= 0 done= False
....x......
Step 6
obs= 3.0 reward= 0 done= False
...x.......
Step 7
obs= 2.0 reward= 0 done= False
..x........
Step 8
obs= 1.0 reward= 0 done= False
.x.........
Step 9
obs= 0.0 reward= 1 done= True
x..........
Goal reached! reward= 1


In [6]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

# 環境の生成
env = GoLeftEnv(grid_size=10)

# 環境のラップ
env = Monitor(env, filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

In [7]:
# エージェントの訓練
model = PPO('MlpPolicy', env, verbose=1).learn(5000)

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 115      |
|    ep_rew_mean     | 1        |
| time/              |          |
|    fps             | 1281     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 73          |
|    ep_rew_mean          | 1           |
| time/                   |             |
|    fps                  | 983         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009428214 |
|    clip_fraction        | 0.0639      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -1.53       |
|    learning

In [8]:
# 訓練済みエージェントのテスト
obs = env.reset()
n_steps = 20
for step in range(n_steps):
  action, _ = model.predict(obs, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action: ", action)
  obs, reward, done, info = env.step(action)
  print('obs=', obs, 'reward=', reward, 'done=', done)
  env.render(mode='console')
  if done:
    # VecEnvは、エピソード完了に遭遇すると自動的にリセットされることに注意
    print("Goal reached!", "reward=", reward)
    break

Step 1
Action:  [0]
obs= [[8.]] reward= [0.] done= [False]
........x..
Step 2
Action:  [0]
obs= [[7.]] reward= [0.] done= [False]
.......x...
Step 3
Action:  [0]
obs= [[6.]] reward= [0.] done= [False]
......x....
Step 4
Action:  [0]
obs= [[5.]] reward= [0.] done= [False]
.....x.....
Step 5
Action:  [0]
obs= [[4.]] reward= [0.] done= [False]
....x......
Step 6
Action:  [0]
obs= [[3.]] reward= [0.] done= [False]
...x.......
Step 7
Action:  [0]
obs= [[2.]] reward= [0.] done= [False]
..x........
Step 8
Action:  [0]
obs= [[1.]] reward= [0.] done= [False]
.x.........
Step 9
Action:  [0]
obs= [[9.]] reward= [1.] done= [ True]
.........x.
Goal reached! reward= [1.]


## 実際の処理

In [15]:
import numpy as np
import gym
from gym import spaces


from puzzpy import PuzzTable
table = PuzzTable(10)
table.show_table()
for a in table.next_tables():
    print(a.get_table())
    print(a.get_XY_as_table())

class PuzzEnv(gym.Env):
  """
  Gymのインターフェースに従うカスタム環境
  エージェントが常に左に行くことを学ぶ環境
  """
  # ColabのためGUIを実装できない
  metadata = {'render.modes': ['console']}

  # 定数を定義
  LEFT = 0
  RIGHT = 1

  def __init__(self):
    super(PuzzEnv, self).__init__()

    self.action_space = spaces.Discrete(5)

    # 状態はエージェントの座標になる
    # Discrete空間とBox空間の両方で表現できる
    self.observation_space = spaces.Tuple([spaces.Box(low=0, high=5,
                                       shape=(5,6), dtype=np.float32),
                                       spaces.Box(low=0, high=1,
                                       shape=(5,6), dtype=np.float32),
                                       spaces.Box(low=1, high=155,
                                       shape=(1,), dtype=np.float32)])

  def reset(self):
    """
    【重要】観測はnumpy配列でなければならない
    :return: (np.array)
    """
    self.table = PuzzTable(155)

    return [np.array(self.table.get_table()).astype(np.float32),
              np.array(self.table.get_XY_as_table()).astype(np.float32),
              self.table.get_turn()]

  def step(self, action):
    if action == 4:
      return [np.array(self.table.get_table()).astype(np.float32),
              np.array(self.table.get_XY_as_table()).astype(np.float32),
              self.table.get_turn()], self.table.eval_otoshi(), True, {}

    next_table = self.table.next_tables()[action]

    if next_table.get_table()[0,0] == 255:
      return [np.array(self.table.get_table()).astype(np.float32),
              np.array(self.table.get_XY_as_table()).astype(np.float32),
              self.table.get_turn()], self.table.eval_otoshi(), True, {}

    self.table = next_table

    if self.table.get_turn() <= 0:
      return [np.array(self.table.get_table()).astype(np.float32),
              np.array(self.table.get_XY_as_table()).astype(np.float32),
              self.table.get_turn()], self.table.eval_otoshi(), True, {}

    return [np.array(self.table.get_table()).astype(np.float32),
              np.array(self.table.get_XY_as_table()).astype(np.float32),
              self.table.get_turn()], 0, False, {}

  def render(self, mode='console', close=False):
    if mode != 'console':
      raise NotImplementedError()

    self.table.show_table()

[['\x04', '\x04', '\x06', '\x01', '\x04', '\x05'], ['\x02', '\x02', '\x01', '\x05', '\x03', '\x05'], ['\x03', '\x06', '\x02', '\x03', '\x01', '\x03'], ['\x02', '\x02', '\x05', '\x04', '\x06', '\x03'], ['\x04', '\x06', '\x06', '\x03', '\x01', '\x01']]
[['\x00', '\x00', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x01', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00']]
[['\x04', '\x04', '\x06', '\x01', '\x04', '\x05'], ['\x02', '\x02', '\x06', '\x05', '\x03', '\x05'], ['\x03', '\x02', '\x01', '\x03', '\x01', '\x03'], ['\x02', '\x02', '\x05', '\x04', '\x06', '\x03'], ['\x04', '\x06', '\x06', '\x03', '\x01', '\x01']]
[['\x00', '\x00', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x01', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00'], ['\x00', '\x00', '\x00', '\x00', '\x00', '\x00

In [16]:
env = PuzzEnv()

obs = env.reset()
env.render()

print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

# GO_LEFT = 0

# # ハードコードされた最高のエージェント：常に左に行く
# n_steps = 20
# for step in range(n_steps):
#   print("Step {}".format(step + 1))
#   obs, reward, done, info = env.step(GO_LEFT)
#   print('obs=', obs, 'reward=', reward, 'done=', done)
#   env.render()
#   if done:
#     print("Goal reached!", "reward=", reward)
#     break

ValueError: ignored

In [24]:
table = PuzzTable(10)
table.show_table()
str1 = np.array(table.get_table())[0,0]
print(ord(str1))

4
