In [1]:
import gym
import pandas as pd

In [2]:
# Create the FrozenLake environment and draw its current grid.
env = gym.make("FrozenLake-v0")
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


どんな変数があるか確認する

In [3]:
# Holds the tile type of every cell in the grid.
env.desc

array([[b'S', b'F', b'F', b'F'],
       [b'F', b'H', b'F', b'H'],
       [b'F', b'F', b'F', b'H'],
       [b'H', b'F', b'F', b'G']], dtype='|S1')

In [4]:
# Look up a single tile, indexed as [row][col].
env.desc[1][2]

b'F'

In [5]:
# Number of columns in the grid.
env.ncol

4

In [6]:
# Number of rows in the grid.
env.nrow

4

In [7]:
# Number of states (one per grid cell: nrow * ncol).
env.nS

16

状態s:どのマスにいるか

In [8]:
# Number of available actions.
env.nA

4

行動a:どの方向に、「進みたい」と思うか。0:left,1:down,2:right,3:up

In [21]:
# Start a fresh episode, request action 0 (left), and draw the result.
# NOTE(review): the rendered position need not match the requested direction;
# env.P lists several possible next states per action.
env.reset()
env.step(0)
env.render()

  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG


In [22]:
# Reset the episode; the displayed return value is the starting state.
env.reset()

0

In [23]:
# Request action 1 (down) and draw the result.
env.step(1)
env.render()

  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG


In [24]:
# Request action 2 (right) and draw the result.
env.step(2)
env.render()

  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG


In [31]:
# Request action 3 (up) and draw the result.
env.step(3)
env.render()

  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG


In [33]:
# Transition entries listed for state 1 under action 1.
env.P[1][1]

[(0.3333333333333333, 0, 0.0, False),
 (0.3333333333333333, 5, 0.0, True),
 (0.3333333333333333, 2, 0.0, False)]

P = {s: {a: [(prob, next_s, reward, done), ...]}}

実際にゲームを進めていくコードを書く

In [35]:
import pandas as pd

class Planner:
    """Reads per-tile rewards off a grid-world environment."""

    def __init__(self, env):
        """Keep a reference to the environment being inspected."""
        self.env = env

    def s_to_loc(self, s):
        """Convert a flat state index into a ``(row, col)`` pair."""
        width = self.env.ncol
        return s // width, s % width

    def reward(self, s):
        """Return -1 for a ``b'H'`` tile, 1 for a ``b'G'`` tile, otherwise 0."""
        r, c = self.s_to_loc(s)
        tile = self.env.desc[r][c]
        if tile == b'H':
            return -1
        if tile == b'G':
            return 1
        return 0

In [38]:
# Wrap the environment in a Planner and draw the environment's current grid.
pl = Planner(env)
pl.env.render()

  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG


In [44]:
# Reward of state 15 (row 3, col 3 of the 4x4 grid).
pl.reward(15)

1

## 2.1. 動的計画法
* 状態sに対して、価値V(s)を計算する
* 価値が最大になる行動を取る

 - 状態の価値:その状態のマスにおける報酬 + そのマスから最善の行動をしたときに得られる(割引された)期待価値

R(s): 状態sでの報酬
$$V(s) = R(s) + \gamma \max_a \sum_{s^{\prime}} T(s^{\prime} |s, a) V(s^{\prime})$$

T(s'|s,a):状態sで行動aをとったときに状態s'へ遷移する確率
V(s'):s'における価値
γ:割引率(将来の価値をどれだけ割り引くか。コードでは gamma=0.9)

P = {s: {a: [(prob, next_s, reward, done), ...]}}
* 価値反復法

最初、V(s)を適当に決める。V0=0とすると
$$V_1(s) = R(s)$$
$$V_2(s) = R(s) + \gamma \max_a \sum_{s^\prime} T(s^{\prime} | s, a)R(s^{\prime})$$
$$V_3(s) = R(s) + \gamma \max_a \sum_{s^\prime} T(s^{\prime} | s, a)V_2(s^{\prime})$$

一般化すると、

$$V_{i+1}(s) = R(s) + \gamma \max_a \sum_{s^\prime} T(s^{\prime} | s, a)V_i(s^{\prime})$$

反復は、すべての状態sについての変化量の最大値がしきい値を下回ったとき、すなわち
$$\max_s |V_{i+1}(s) - V_i(s)| < 0.0001$$
となったら終了する

In [52]:
import gym
import pandas as pd

import pandas as pd

class Planner:
    """Value-iteration planner for a grid-world environment.

    The wrapped ``env`` must expose ``ncol``, ``nrow``, ``nS``, ``nA``,
    ``desc`` (tile markers indexed as ``desc[row][col]``), ``P`` (transition
    table indexed as ``P[s][a]``) and ``reset()``.
    """

    def __init__(self, env):
        """Keep a reference to the environment to plan over."""
        self.env = env

    def s_to_loc(self, s):
        """Convert a flat state index into a ``(row, col)`` pair."""
        row = s // self.env.ncol
        col = s % self.env.ncol
        return row, col

    def reward(self, s):
        """Return -1 for a ``b'H'`` tile, 1 for a ``b'G'`` tile, otherwise 0."""
        row, col = self.s_to_loc(s)
        tile = self.env.desc[row][col]
        if tile == b'H':
            return -1
        elif tile == b'G':
            return 1
        else:
            return 0

    def plan(self, gamma=0.9, threshold=0.0001):
        """Estimate the value of every state by value iteration.

        Each sweep applies ``V(s) = R(s) + max_a sum(gamma * prob * V(s'))``
        to every state whose tile is neither ``b'H'`` nor ``b'G'``; those two
        tile kinds keep their initial value ``R(s)``. Values are written back
        immediately, so states visited later in a sweep already see the
        updated values of earlier ones.

        Args:
            gamma: Discount factor applied to the value of next states.
            threshold: Sweeps stop once the largest absolute change of any
                state value within a sweep is below this number. Must be
                strictly positive.

        Returns:
            pandas.DataFrame with ``nrow`` rows and ``ncol`` columns holding
            the value of each grid cell.

        Raises:
            ValueError: If ``threshold`` is not strictly positive (including
                NaN); the stopping test ``delta < threshold`` could then
                never succeed and the loop would run forever.
        """
        # `not (x > 0)` also rejects NaN, which compares False to everything.
        if not threshold > 0:
            raise ValueError(
                "threshold must be a positive number, got {!r}".format(threshold)
            )

        self.env.reset()
        # Initialise every state's value with the reward of its own tile.
        V = {}
        for s in range(self.env.nS):
            V[s] = self.reward(s)

        while True:
            delta = 0
            for s in V:
                row, col = self.s_to_loc(s)
                if self.env.desc[row][col] in [b'H', b'G']:
                    # Hole / goal tiles are never updated.
                    continue
                expected_rewards = []
                for a in range(self.env.nA):
                    r = 0
                    # Each entry is a 4-tuple; only the probability and the
                    # next state are used here.
                    for prob, next_s, _, _ in self.env.P[s][a]:
                        r += gamma * prob * V[next_s]
                    expected_rewards.append(r)
                max_reward = max(expected_rewards)
                new_V = self.reward(s) + max_reward

                # Track the largest |V_{i+1}(s) - V_i(s)| seen in this sweep.
                delta = max(delta, abs(new_V - V[s]))
                V[s] = new_V

            if delta < threshold:
                break

        return self.dict_to_grid(V)

    def dict_to_grid(self, V):
        """Lay out a ``{state: value}`` mapping as an ``nrow`` x ``ncol`` DataFrame.

        Cells whose state is missing from ``V`` are left at 0.
        """
        grid = [[0] * self.env.ncol for _ in range(self.env.nrow)]
        for s in V:
            row, col = self.s_to_loc(s)
            grid[row][col] = V[s]
        return pd.DataFrame(grid)

In [53]:
# Run value iteration with the default settings and show the value grid.
pl = Planner(env)
pl.plan()

Unnamed: 0,0,1,2,3
0,0.045684,0.026631,0.016628,0.01234
1,0.061139,-1.0,-0.252228,-1.0
2,0.097082,0.165458,0.142632,-1.0
3,-1.0,0.31185,0.562218,1.0
