Import packages

In [1]:
import numpy as np
from snake import *

## Policy Iteration

Step 1: Define policy iteration class

In [2]:
class PolicyIteration(object):
    @staticmethod
    def _policy_evaluation(agent, max_iter=-1):
        iteration = 0
        while True:
            new_value_s = agent.value_s.copy() # (state_size)
            for s in range(1, agent.state_size): # 0 is invalid, because SnakeEnv defines observation_space to be 101
                a = agent.pi[s]
                new_value_s[s] = np.matmul(agent.p[a, s, :], agent.r + agent.gamma * agent.value_s) # (state_size)
            diff = np.sqrt(np.sum((new_value_s - agent.value_s) ** 2))

            if diff < 1e-6:
                break
            else:
                agent.value_s = new_value_s
                
            iteration += 1
            if iteration == max_iter:
                break

    @staticmethod
    def _policy_improvement(agent):
        new_policy = np.zeros_like(agent.pi) # (action_size, state_size, state_size)
        for s in range(1, agent.state_size):
            for a in range(agent.action_size): # iterate a !!! not a = agent.pi[s], but iteration!!!!
                # update action-value function
                agent.value_sa[s, a] = np.dot(agent.p[a, s, :], agent.r + agent.gamma * agent.value_s)
            new_policy[s] = np.argmax(agent.value_sa[s, :]) # select the max action !!! not [s, a], but [s, :]
        if np.all(np.equal(new_policy, agent.pi)):
            return True # converge
        else:
            agent.pi = new_policy
            return False # not converge

    @staticmethod
    def policy_iteration(agent, max_iter=-1): # max_iter is corresponding to policy evaluation
        while True:
            PolicyIteration._policy_evaluation(agent, max_iter)
            ret = PolicyIteration._policy_improvement(agent)
            if ret:
                break


    @staticmethod
    def policy_iteration_time(agent, max_iter=-1):
        while True:
            with timer('Timer PolicyEval'):
                PolicyIteration._policy_evaluation(agent, max_iter)
            with timer('Timer PolicyImprove'):
                ret = PolicyIteration._policy_improvement(agent)
            if ret:
                break



Step 2: Test

No ladders

In [3]:
print('No ladders')
env = SnakeEnv(0, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration(agent)
print('return:', eval_game(env, agent))
print(agent.pi)

No ladders
return: 66
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


Random ladders

In [4]:
print('Random ladders')
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)

# 1. all 1
agent.pi[:] = 1
print('return:', eval_game(env, agent))
print(agent.pi)

# 2. all 0
agent.pi[:] = 0
print('return:', eval_game(env, agent))
print(agent.pi)


# 3. [1] * 97 + [] * 3
agent.pi[:-3] = 1
print('return:', eval_game(env, agent))
print(agent.pi)


# 4. Policy Iteration
PolicyIteration.policy_iteration(agent)
print('return:', eval_game(env, agent))
print(agent.pi)

Random ladders
return: 87
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
return: -47
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
return: 93
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
return: 75
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 0 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


## Value Iteration

### Introduction

Calculate the time policy evaluation and policy improvement runs

In [5]:
np.random.seed(2)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration_time(agent)
print('return:', eval_game(env, agent))
print(agent.pi)

Timer PolicyEval COST:0.10278606414794922
Timer PolicyImprove COST:0.0025739669799804688
Timer PolicyEval COST:0.07129621505737305
Timer PolicyImprove COST:0.0029616355895996094
Timer PolicyEval COST:0.05581331253051758
Timer PolicyImprove COST:0.002955913543701172
Timer PolicyEval COST:0.03937983512878418
Timer PolicyImprove COST:0.0026044845581054688
return: 88
[0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1
 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


As can be seen from the above the results, policy evaluation takes more time than policy improvement. What if we reduce the time spent on policy evaluation?

Case 1: max_iter=50

In [6]:
np.random.seed(2)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration_time(agent, max_iter=50)
print('return:', eval_game(env, agent))
print(agent.pi)

Timer PolicyEval COST:0.05925488471984863
Timer PolicyImprove COST:0.0026290416717529297
Timer PolicyEval COST:0.05691027641296387
Timer PolicyImprove COST:0.002716064453125
Timer PolicyEval COST:0.054637908935546875
Timer PolicyImprove COST:0.0029664039611816406
Timer PolicyEval COST:0.046237945556640625
Timer PolicyImprove COST:0.003295421600341797
return: 88
[0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1
 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


Case 2: max_iter=10

In [7]:
np.random.seed(2)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration_time(agent, max_iter=10)
print('return:', eval_game(env, agent))
print(agent.pi)

Timer PolicyEval COST:0.012920618057250977
Timer PolicyImprove COST:0.003330230712890625
Timer PolicyEval COST:0.012677669525146484
Timer PolicyImprove COST:0.0027811527252197266
Timer PolicyEval COST:0.01230621337890625
Timer PolicyImprove COST:0.002826690673828125
Timer PolicyEval COST:0.013787984848022461
Timer PolicyImprove COST:0.0028400421142578125
return: 88
[0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1
 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


Case 3: max_iter=1

In [8]:
np.random.seed(2)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration_time(agent, max_iter=1)
print('return:', eval_game(env, agent))
print(agent.pi)

Timer PolicyEval COST:0.0012767314910888672
Timer PolicyImprove COST:0.002900838851928711
Timer PolicyEval COST:0.0013184547424316406
Timer PolicyImprove COST:0.0024771690368652344
Timer PolicyEval COST:0.0016360282897949219
Timer PolicyImprove COST:0.003149271011352539
Timer PolicyEval COST:0.0012812614440917969
Timer PolicyImprove COST:0.0030074119567871094
Timer PolicyEval COST:0.0014488697052001953
Timer PolicyImprove COST:0.002941608428955078
Timer PolicyEval COST:0.0012707710266113281
Timer PolicyImprove COST:0.002722501754760742
Timer PolicyEval COST:0.0011792182922363281
Timer PolicyImprove COST:0.0026738643646240234
return: 88
[0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1
 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


The above results are the same, indicating the times of policy evaluation does not affect the final result. In fact, The program spends more time in iterating both policy evaluation and policy improvement. Since the time spent on policy improvement is less than policy evaluation, it saves time to reduce the iterations of policy evaluation. It is called **Value Iteration**.

### Implementation for Value Iteration

1. Value Iteration:<br\>
$v(s) \leftarrow max_a{q(s, a)}$<br\>
$v(s) \leftarrow max_a{\sum_{s^{\prime}}{p(s^{\prime}|s, a)[r(s, a, s^{\prime}) + \gamma \tilde{v}(s^{\prime})]}}$

2. Policy Improvement:<br/>
$q(s_t, a_t) = \sum_{s_{t+1}}{      p(s_{t+1}|s_t, a_t)[ r_{t+1} + \gamma v_{\pi}(s_{t+1})  ]}$

DIFFERENCES:
1. Number of iterations for policy evalution;
2. Ways of updating state-value function.

In [9]:
class ValueIteration(object):
    @staticmethod
    def value_iteration(agent, max_iter=-1):
        # update state-value function
        iteration = 0
        while True:
            iteration += 1
            new_value_s = np.zeros_like(agent.value_s)
            for s in range(1, agent.state_size): # 1 is important !!!
                value_sas = list()
                for a in range(agent.action_size):
                    value_sa = np.dot(agent.p[a, s, :], agent.r + agent.gamma * agent.value_s)
                    value_sas.append(value_sa)
                new_value_s[s] = max(value_sas) # int or float???
            diff = np.sqrt(np.sum((agent.value_s - new_value_s) ** 2))
            if diff < 1e-6 or iteration == max_iter:
                break
            else:
                agent.value_s = new_value_s # update agent's state-value function !!!
        # update action-value function
        for s in range(1, agent.state_size): # 1 is important !!!
            for a in range(agent.action_size):
                agent.value_sa[s, a] = np.dot(agent.p[a, s, :], agent.r + agent.gamma * agent.value_s)
            agent.pi[s] = np.argmax(agent.value_sa[s, :])

In [10]:
np.random.seed(0)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)

print('Value Iteration')
with timer('ValueIteration'):
    ValueIteration.value_iteration(agent)
    print('return:', eval_game(env, agent))
    print(agent.pi)

print()
np.random.seed(0)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
print('Policy Iteration 10 iter')
with timer('PolicyIteration 10 iter'):
    PolicyIteration.policy_iteration(agent, max_iter=10)
    print('return:', eval_game(env, agent))
    print(agent.pi)
    
print()
np.random.seed(0)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
print('Policy Iteration 1 iter')
with timer('PolicyIteration 1 iter'):
    PolicyIteration.policy_iteration(agent, max_iter=1)
    print('return:', eval_game(env, agent))
    print(agent.pi)

Value Iteration
return: 92
[0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
ValueIteration COST:0.24405908584594727

Policy Iteration 10 iter
return: 92
[0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
PolicyIteration 10 iter COST:0.0500035285949707

Policy Iteration 1 iter
return: 92
[0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
PolicyIteration 1 iter COST:0.024219989776611328


## Generalized Policy Iteration

In [11]:
class GeneralizedPolicyIteration(object):
    @staticmethod
    def generalized_policy_iteration(agent):
        ValueIteration.value_iteration(agent, 10)
        PolicyIteration.policy_iteration(agent, 1)

In [12]:
np.random.seed(0)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)

print('Value Iteration')
with timer('ValueIteration'):
    GeneralizedPolicyIteration.generalized_policy_iteration(agent)
    print('return:', eval_game(env, agent))
    print(agent.pi)

Value Iteration
return: 92
[0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
ValueIteration COST:0.029242277145385742
