In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# 16 states indexed 0-15 (laid out as a 4x4 grid, row by row); states 0 and 15 are terminal
states = list(range(16))
terminal_states = [0, 15]
states

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [3]:
# 4 actions: 0 (left), 1 (right), 2 (up), 3 (down)
actions = list(range(4))

# initialize a deterministic policy: one action index per state, listed row by row
initial_policy = [
    1, 1, 1, 3,  # row 1: move right; the last column moves down
    1, 1, 1, 3,  # row 2: move right; the last column moves down
    1, 1, 1, 3,  # row 3: move right; the last column moves down
    1, 1, 1, 1   # row 4: move right (the last state, 15, is terminal)
]

# build the one-hot matrix pi (one row per state, one column per action) for this deterministic policy
pi = np.eye(4)[initial_policy]
pi

array([[0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.]])

In [4]:
# state transition is calculated as a function rather than using 4 different matrices

def state_transition(s, a):
    """Return the state reached by taking action ``a`` in state ``s``.

    States 0-15 are laid out row by row on a 4x4 grid. A move that would
    leave the grid keeps the agent where it is. States 0 and 15 are
    terminal and absorbing: every action returns the state itself.

    Args:
        s: current state index (0-15).
        a: action index: 0 (left), 1 (right), 2 (up), 3 (down).

    Returns:
        The index of the successor state.

    Raises:
        ValueError: if ``s`` is non-terminal and ``a`` is not one of 0-3.
            (Previously this case fell through and returned ``None``.)
    """

    # terminal states are absorbing, whatever the action
    if (s==0) or (s==15):
        return s

    # left
    if a == 0:
        if s % 4 == 0: # left edge
            return s
        else:
            return s - 1

    # right
    if a == 1:
        if s % 4 == 3: # right edge
            return s
        else:
            return s + 1

    # up
    if a == 2:
        if s <= 3: # upper edge
            return s
        else:
            return s - 4

    # down
    if a == 3:
        if s >= 12: # lower edge
            return s
        else:
            return s + 4

    # an unknown action must not silently yield None, which would later be
    # used as an array index by the callers
    raise ValueError(f"unknown action {a!r}; expected 0 (left), 1 (right), 2 (up) or 3 (down)")

In [5]:
# value_update adapted for a deterministic policy
def value_update(values, pi, gamma):
    """Run one synchronous policy-evaluation sweep and return the new values.

    Args:
        values: current state-value estimates, indexed by state.
        pi: policy matrix; the action used in each state is the argmax of
            that state's row.
        gamma: discount factor applied to the successor state's value.

    Returns:
        A new length-16 array of state values; ``values`` is not modified.
    """
    updated = np.zeros(16)

    for state in states:
        # terminal states keep the value 0 they were initialised with
        if state in terminal_states:
            continue

        # follow the single action the deterministic policy selects
        action = np.argmax(pi[state])
        successor = state_transition(state, action)
        # every step taken from a non-terminal state earns a reward of -1
        updated[state] = -1 + gamma * values[successor]

    return updated


In [6]:
# show the state values as a 4x4 grid, one row per line
def print_values(values):
    """Print ``values`` as four consecutive slices of four entries each."""
    for row_start in range(0, 16, 4):
        print(values[row_start:row_start + 4])


In [7]:
# discount factor passed to value_update / greedy_policy
gamma = 1.0

# initialize state values to be zeros
values = np.zeros(16)
print_values(values)

[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]


In [8]:
# a single policy-evaluation sweep under the current policy pi
values = value_update(values, pi, gamma)
print_values(values)

[ 0. -1. -1. -1.]
[-1. -1. -1. -1.]
[-1. -1. -1. -1.]
[-1. -1. -1.  0.]


In [9]:
epsilon = 1e-5
values = np.zeros(16)

# Iterative policy evaluation: keep sweeping until two successive value
# vectors differ by less than epsilon (Euclidean norm).
diff = np.inf
while not (diff < epsilon):
    # plain aliasing is safe here: value_update returns a fresh array
    old_values = values
    values = value_update(values, pi, gamma)
    diff = np.linalg.norm(values - old_values)

print_values(values)

[ 0. -5. -4. -3.]
[-5. -4. -3. -2.]
[-4. -3. -2. -1.]
[-3. -2. -1.  0.]


In [10]:
# derive the policy that is greedy with respect to a value function
def greedy_policy(values, gamma):
    """Return the one-hot policy matrix that is greedy with respect to ``values``.

    For each non-terminal state (1-14) the action value is
    ``q(s, a) = -1 + gamma * values[next_state]``, and the first action
    with the strictly largest q is selected. Rows 0 and 15 (the terminal
    states) are left as all zeros.

    Args:
        values: state-value estimates, indexed by state.
        gamma: discount factor.

    Returns:
        A (16, 4) array with a single 1.0 in each non-terminal row.
    """
    greedy = np.zeros((16, 4))

    for state in range(1, 15):  # non-terminal states only
        # sentinel below any q-value this notebook produces; action 0 is the fallback
        best_q, best_action = -1e9, 0
        for action in actions:
            successor = state_transition(state, action)
            q_value = -1 + gamma * values[successor]
            # strict comparison: on ties the earliest action wins
            if q_value > best_q:
                best_q, best_action = q_value, action

        greedy[state, best_action] = 1.0

    return greedy


In [11]:
# policy improvement: act greedily with respect to the values just evaluated
pi = greedy_policy(values, gamma)
pi

array([[0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.]])

In [12]:
# history of greedy action indices per state, one array per improvement step
# (terminal rows of pi are all zeros, so argmax reports 0 for them)
pi_actions = [np.argmax(pi, axis=1)]
pi_actions

[array([0, 0, 1, 3, 2, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 0])]

In [13]:
epsilon = 1e-5
values = np.zeros(16)

# Evaluate the improved policy from scratch: sweep until two successive
# value vectors differ by less than epsilon (Euclidean norm).
diff = np.inf
while not (diff < epsilon):
    # plain aliasing is safe here: value_update returns a fresh array
    old_values = values
    values = value_update(values, pi, gamma)
    diff = np.linalg.norm(values - old_values)

print_values(values)

[ 0. -1. -4. -3.]
[-1. -4. -3. -2.]
[-4. -3. -2. -1.]
[-3. -2. -1.  0.]


In [14]:
# policy improvement: act greedily with respect to the values just evaluated
pi = greedy_policy(values, gamma)
pi

array([[0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.]])

In [15]:
# record this iteration's greedy action index for every state
pi_actions.append(np.argmax(pi, axis=1))
pi_actions

[array([0, 0, 1, 3, 2, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 0]),
 array([0, 0, 0, 3, 2, 0, 1, 3, 2, 1, 1, 3, 1, 1, 1, 0])]

In [16]:
epsilon = 1e-5
values = np.zeros(16)

# Evaluate the latest policy from scratch: sweep until two successive
# value vectors differ by less than epsilon (Euclidean norm).
diff = np.inf
while not (diff < epsilon):
    # plain aliasing is safe here: value_update returns a fresh array
    old_values = values
    values = value_update(values, pi, gamma)
    diff = np.linalg.norm(values - old_values)

print_values(values)

[ 0. -1. -2. -3.]
[-1. -2. -3. -2.]
[-2. -3. -2. -1.]
[-3. -2. -1.  0.]


In [17]:
# policy improvement: act greedily with respect to the values just evaluated
pi = greedy_policy(values, gamma)
pi

array([[0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.]])

In [18]:
# record this iteration's greedy action index for every state
pi_actions.append(np.argmax(pi, axis=1))
pi_actions

[array([0, 0, 1, 3, 2, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 0]),
 array([0, 0, 0, 3, 2, 0, 1, 3, 2, 1, 1, 3, 1, 1, 1, 0]),
 array([0, 0, 0, 0, 2, 0, 0, 3, 2, 0, 1, 3, 1, 1, 1, 0])]

In [19]:
# evaluate the current policy with a fixed budget of 2000 sweeps
# instead of an epsilon-based convergence test
values = np.zeros(16)
for i in range(2000):
    values = value_update(values, pi, gamma)

print_values(values)

[ 0. -1. -2. -3.]
[-1. -2. -3. -2.]
[-2. -3. -2. -1.]
[-3. -2. -1.  0.]


In [20]:
# policy improvement: act greedily with respect to the values just evaluated
pi = greedy_policy(values, gamma)
pi

array([[0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.]])

In [21]:
# record this iteration's greedy action index for every state;
# an entry identical to the previous one means the policy no longer changes
pi_actions.append(np.argmax(pi, axis=1))
pi_actions

[array([0, 0, 1, 3, 2, 1, 1, 3, 1, 1, 1, 3, 1, 1, 1, 0]),
 array([0, 0, 0, 3, 2, 0, 1, 3, 2, 1, 1, 3, 1, 1, 1, 0]),
 array([0, 0, 0, 0, 2, 0, 0, 3, 2, 0, 1, 3, 1, 1, 1, 0]),
 array([0, 0, 0, 0, 2, 0, 0, 3, 2, 0, 1, 3, 1, 1, 1, 0])]

## 과제수행

In [22]:
# the policy iteration algorithm wrapped up as a function
def policy_iteration(initial_policy, gamma, epsilon):
    """Alternate policy evaluation and greedy improvement until the policy is stable.

    Args:
        initial_policy: sequence of 16 action indices (0-3), one per state.
        gamma: discount factor.
        epsilon: evaluation stops once two successive value vectors differ
            by less than this (Euclidean norm).

    Returns:
        ``(values, pi)``: the final state values and the final policy matrix.

    NOTE(review): with gamma = 1.0 the evaluation loop does not appear to
    terminate for a policy that never reaches a terminal state (the values
    of the looping states drop by 1 on every sweep) - confirm that callers
    only pass policies that reach state 0 or 15.
    """
    values = np.zeros(16)
    pi = np.eye(4)[initial_policy]  # one-hot row per state

    policy_stable = False
    while not policy_stable:
        # policy evaluation: sweep until the values stop changing
        delta = np.inf
        while not (delta < epsilon):
            previous = values.copy()
            values = value_update(values, pi, gamma)
            delta = np.linalg.norm(values - previous)

        # policy improvement: switch to the greedy policy if it differs
        candidate = greedy_policy(values, gamma)
        policy_stable = np.array_equal(candidate, pi)
        if not policy_stable:
            pi = candidate

    return values, pi

In [23]:
# discount factor and convergence threshold passed to policy_iteration below
gamma=1.0
epsilon=1e-5

In [24]:
# choose an arbitrary deterministic starting policy (one action index per state)
initial_policy_1 = [
    1, 1, 1, 3,
    1, 1, 1, 3,
    1, 1, 1, 3,
    1, 1, 1, 1
]

# run policy iteration
values1, pi1 = policy_iteration(initial_policy_1, gamma, epsilon)

# show the results
print("\noptimal value 1:")
print_values(values1)
print("\noptimal policy 1:")
print(np.argmax(pi1, axis=1).reshape(4, 4))


optimal value 1:
[ 0. -1. -2. -3.]
[-1. -2. -3. -2.]
[-2. -3. -2. -1.]
[-3. -2. -1.  0.]

optimal policy 1:
[[0 0 0 0]
 [2 0 0 3]
 [2 0 1 3]
 [1 1 1 0]]


In [25]:
# choose a different arbitrary deterministic starting policy
initial_policy_2 = [
    3, 1, 3, 3,
    1, 1, 3, 0,
    1, 1, 1, 3,
    2, 1, 2, 1
]

# run policy iteration; store the results under their own names so the
# first run's values1 / pi1 are not overwritten and stay comparable
values2, pi2 = policy_iteration(initial_policy_2, gamma, epsilon)

# show the results
print("\noptimal value 2:")
print_values(values2)
print("\noptimal policy 2:")
print(np.argmax(pi2, axis=1).reshape(4, 4))

# both starting policies should end at the same value function
assert np.allclose(values1, values2), "the two runs produced different value functions"


optimal value 2:
[ 0. -1. -2. -3.]
[-1. -2. -3. -2.]
[-2. -3. -2. -1.]
[-3. -2. -1.  0.]

optimal policy 2:
[[0 0 0 0]
 [2 0 0 3]
 [2 0 1 3]
 [1 1 1 0]]


서로 다른 두 초기 정책에서 시작해도 동일한 최적 가치 함수와 최적 정책으로 수렴하는 것을 확인할 수 있다.