Import packages

In [1]:
import numpy as np
import gym
from gym.spaces import Discrete

Create snake game.

In [2]:
class SnakeEnv(gym.Env):
    SIZE=100
  
    def __init__(self, ladder_num, dices):
        self.ladder_num = ladder_num
        self.dices = dices
        self.ladders = dict(np.random.randint(1, self.SIZE, size=(self.ladder_num, 2)))
        self.observation_space = Discrete(self.SIZE + 1) # 101 !!!
        self.action_space = Discrete(len(dices))

        for k,v in list(self.ladders.items()):
            self.ladders[v] = k
        self.pos = 1

    def reset(self):
        self.pos = 1
        return self.pos

    def step(self, a):
        step = np.random.randint(1, self.dices[a] + 1)
        self.pos += step
        if self.pos == 100:
            return 100, 100, 1, {}
        elif self.pos > 100:
            self.pos = 200 - self.pos

        if self.pos in self.ladders:
            self.pos = self.ladders[self.pos]
        return self.pos, -1, 0, {}

    def reward(self, s):
        if s == 100:
            return 100
        else:
            return -1

    def render(self):
        pass

def eval_game(env, policy):
    '''
        policy: state -> action
    '''
    state = env.reset()
    total_reward = 0
    while True:
        if isinstance(policy, TableAgent) or isinstance(policy, ModelFreeAgent):
            action = policy.play(state)
        elif isinstance(policy, list):
            action = policy[state]
        else:
            raise Error('Illegal policy')
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            return total_reward


Build agent for snake game

In [3]:
class TableAgent(object):
    def __init__(self, num_ladders, env):
        self.action_size = env.action_space.n
        self.state_size = env.observation_space.n # 101

        self.r = [env.reward(s) for s in range(self.state_size)] # (101,)

        # stochastic policy: optimizing target
        #self.pi = np.zeros((self.state_size, self.action_size)) # \pi(a_t | s_t)
        self.pi = np.zeros(self.state_size, dtype=np.int) # 1D is because we assume a state corresponds to only one action

        ladder_move = np.vectorize(lambda x: env.ladders[x] if x in env.ladders else x)
        
        # transition probability
        self.p = np.zeros((self.action_size, self.state_size, self.state_size), dtype=np.float) # p(s_{t+1} | s_t, a_t)
        for index, dice in enumerate(env.dices): # eg: env.dices: [3, 6]
            prob = 1.0 / dice # eg: 1/3 or 1/6
            for src in range(1, self.state_size - 1): # 0 and 100 should not be initialized
                #step = np.arange(1, dice + 1) # eg: [1, 2, 3] or [1, 2, 3, 4, 5, 6]
                step = np.arange(dice)
                dsts = src + step # a vector of dst
                for dst in dsts:
                    if dst > 100:
                        dst = 200 - dst # move back
                    dst = ladder_move(dst)
                    self.p[index, src, dst] += prob

        self.p[:, 100, 100] = 1 # If the src is 100, player can go nowhere but 100

        # state-value function
        self.value_s = np.zeros(self.state_size) # v_{\pi}(s)

        # action-value function
        self.value_sa = np.zeros((self.state_size, self.action_size)) # q_{\pi}(s, a)

        # discount factor
        self.gamma = 0.8

    def play(self, state):
        return self.pi[state]

## Policy Iteration

Step 1: Define policy iteration class

In [4]:
from contextlib import contextmanager
import time

@contextmanager
def timer(name):
    start = time.time()
    yield
    end = time.time()
    print('{} COST:{}'.format(name, end - start))

class PolicyIteration(object):
    @staticmethod
    def _policy_evaluation(agent, max_iter=-1):
        iteration = 0
        while True:
            new_value_s = agent.value_s.copy() # (state_size)
            for s in range(1, agent.state_size): # 0 is invalid, because SnakeEnv defines observation_space to be 101
                a = agent.pi[s]
                new_value_s[s] = np.matmul(agent.p[a, s, :], agent.r + agent.gamma * agent.value_s) # (state_size)
            diff = np.sqrt(np.sum((new_value_s - agent.value_s) ** 2))

            if diff < 1e-6:
                break
            else:
                agent.value_s = new_value_s
                
            iteration += 1
            if iteration == max_iter:
                break

    @staticmethod
    def _policy_improvement(agent):
        new_policy = np.zeros_like(agent.pi) # (action_size, state_size, state_size)
        for s in range(1, agent.state_size):
            for a in range(agent.action_size): # iterate a !!! not a = agent.pi[s], but iteration!!!!
                # update action-value function
                agent.value_sa[s, a] = np.dot(agent.p[a, s, :], agent.r + agent.gamma * agent.value_s)
            new_policy[s] = np.argmax(agent.value_sa[s, :]) # select the max action !!! not [s, a], but [s, :]
        if np.all(np.equal(new_policy, agent.pi)):
            return True # converge
        else:
            agent.pi = new_policy
            return False # not converge

    @staticmethod
    def policy_iteration(agent, max_iter=-1): # max_iter is corresponding to policy evaluation
        while True:
            PolicyIteration._policy_evaluation(agent, max_iter)
            ret = PolicyIteration._policy_improvement(agent)
            if ret:
                break


    @staticmethod
    def policy_iteration_time(agent, max_iter=-1):
        while True:
            with timer('Timer PolicyEval'):
                PolicyIteration._policy_evaluation(agent, max_iter)
            with timer('Timer PolicyImprove'):
                ret = PolicyIteration._policy_improvement(agent)
            if ret:
                break



Step 2: Test

No ladders

In [5]:
print('No ladders')
env = SnakeEnv(0, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration(agent)
print('return:', eval_game(env, agent))
print(agent.pi)

No ladders
return: 71
[0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


Random ladders

In [6]:
print('Random ladders')
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)

# 1. all 1
agent.pi[:] = 1
print('return:', eval_game(env, agent))
print(agent.pi)

# 2. all 0
agent.pi[:] = 0
print('return:', eval_game(env, agent))
print(agent.pi)


# 3. [1] * 97 + [] * 3
agent.pi[:-3] = 1
print('return:', eval_game(env, agent))
print(agent.pi)


# 4. Policy Iteration
PolicyIteration.policy_iteration(agent)
print('return:', eval_game(env, agent))
print(agent.pi)

Random ladders
return: 67
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
return: 49
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
return: 93
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
return: 91
[0 1 1 1 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


## Value Iteration

### Introduction

Calculate the time policy evaluation and policy improvement runs

In [7]:
np.random.seed(2)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration_time(agent)
print('return:', eval_game(env, agent))
print(agent.pi)

Timer PolicyEval COST:0.09339094161987305
Timer PolicyImprove COST:0.002607583999633789
Timer PolicyEval COST:0.0632481575012207
Timer PolicyImprove COST:0.0024428367614746094
Timer PolicyEval COST:0.05125856399536133
Timer PolicyImprove COST:0.0024712085723876953
Timer PolicyEval COST:0.03414297103881836
Timer PolicyImprove COST:0.002272367477416992
return: 88
[0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1
 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


As can be seen from the above the results, policy evaluation takes more time than policy improvement. What if we reduce the time spent on policy evaluation?

Case 1: max_iter=50

In [8]:
np.random.seed(2)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration_time(agent, max_iter=50)
print('return:', eval_game(env, agent))
print(agent.pi)

Timer PolicyEval COST:0.05200505256652832
Timer PolicyImprove COST:0.0023920536041259766
Timer PolicyEval COST:0.04838132858276367
Timer PolicyImprove COST:0.0023963451385498047
Timer PolicyEval COST:0.05085492134094238
Timer PolicyImprove COST:0.0030188560485839844
Timer PolicyEval COST:0.03535056114196777
Timer PolicyImprove COST:0.002475261688232422
return: 88
[0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1
 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


Case 2: max_iter=10

In [9]:
np.random.seed(2)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration_time(agent, max_iter=10)
print('return:', eval_game(env, agent))
print(agent.pi)

Timer PolicyEval COST:0.010145425796508789
Timer PolicyImprove COST:0.003003358840942383
Timer PolicyEval COST:0.009757280349731445
Timer PolicyImprove COST:0.003203868865966797
Timer PolicyEval COST:0.011197090148925781
Timer PolicyImprove COST:0.0031321048736572266
Timer PolicyEval COST:0.01108860969543457
Timer PolicyImprove COST:0.0023908615112304688
return: 88
[0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1
 1 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


Case 3: max_iter=1

In [10]:
np.random.seed(2)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
PolicyIteration.policy_iteration_time(agent, max_iter=1)
print('return:', eval_game(env, agent))
print(agent.pi)

Timer PolicyEval COST:0.0011734962463378906
Timer PolicyImprove COST:0.002821207046508789
Timer PolicyEval COST:0.0011456012725830078
Timer PolicyImprove COST:0.0023832321166992188
Timer PolicyEval COST:0.001275777816772461
Timer PolicyImprove COST:0.0023865699768066406
Timer PolicyEval COST:0.0011174678802490234
Timer PolicyImprove COST:0.0023479461669921875
Timer PolicyEval COST:0.0011093616485595703
Timer PolicyImprove COST:0.002362966537475586
Timer PolicyEval COST:0.0011703968048095703
Timer PolicyImprove COST:0.002497434616088867
Timer PolicyEval COST:0.0010874271392822266
Timer PolicyImprove COST:0.0026977062225341797
return: 88
[0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1
 1 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]


The above results are the same, indicating the times of policy evaluation does not affect the final result. In fact, The program spends more time in iterating both policy evaluation and policy improvement. Since the time spent on policy improvement is less than policy evaluation, it saves time to reduce the iterations of policy evaluation. It is called **Value Iteration**.

### Implementation for Value Iteration

1. Value Iteration:<br\>
$v(s) \leftarrow max_a{q(s, a)}$<br\>
$v(s) \leftarrow max_a{\sum_{s^{\prime}}{p(s^{\prime}|s, a)[r(s, a, s^{\prime}) + \gamma \tilde{v}(s^{\prime})]}}$

2. Policy Improvement:<br/>
$q(s_t, a_t) = \sum_{s_{t+1}}{      p(s_{t+1}|s_t, a_t)[ r_{t+1} + \gamma v_{\pi}(s_{t+1})  ]}$

DIFFERENCES:
1. Number of iterations for policy evalution;
2. Ways of updating state-value function.

In [11]:
class ValueIteration(object):
    @staticmethod
    def value_iteration(agent, max_iter=-1):
        # update state-value function
        iteration = 0
        while True:
            iteration += 1
            new_value_s = np.zeros_like(agent.value_s)
            for s in range(1, agent.state_size): # 1 is important !!!
                value_sas = list()
                for a in range(agent.action_size):
                    value_sa = np.dot(agent.p[a, s, :], agent.r + agent.gamma * agent.value_s)
                    value_sas.append(value_sa)
                new_value_s[s] = max(value_sas) # int or float???
            diff = np.sqrt(np.sum((agent.value_s - new_value_s) ** 2))
            if diff < 1e-6 or iteration == max_iter:
                break
            else:
                agent.value_s = new_value_s # update agent's state-value function !!!
        # update action-value function
        for s in range(1, agent.state_size): # 1 is important !!!
            for a in range(agent.action_size):
                agent.value_sa[s, a] = np.dot(agent.p[a, s, :], agent.r + agent.gamma * agent.value_s)
            agent.pi[s] = np.argmax(agent.value_sa[s, :])

In [12]:
np.random.seed(0)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)

print('Value Iteration')
with timer('ValueIteration'):
    ValueIteration.value_iteration(agent)
    print('return:', eval_game(env, agent))
    print(agent.pi)

print()
np.random.seed(0)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
print('Policy Iteration 10 iter')
with timer('PolicyIteration 10 iter'):
    PolicyIteration.policy_iteration(agent, max_iter=10)
    print('return:', eval_game(env, agent))
    print(agent.pi)
    
print()
np.random.seed(0)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)
print('Policy Iteration 1 iter')
with timer('PolicyIteration 1 iter'):
    PolicyIteration.policy_iteration(agent, max_iter=1)
    print('return:', eval_game(env, agent))
    print(agent.pi)

Value Iteration
return: 92
[0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
ValueIteration COST:0.1954517364501953

Policy Iteration 10 iter
return: 92
[0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
PolicyIteration 10 iter COST:0.05167269706726074

Policy Iteration 1 iter
return: 92
[0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
PolicyIteration 1 iter COST:0.022751808166503906


## Generalized Policy Iteration

In [13]:
class GeneralizedPolicyIteration(object):
    @staticmethod
    def generalized_policy_iteration(agent):
        ValueIteration.value_iteration(agent, 10)
        PolicyIteration.policy_iteration(agent, 1)

In [14]:
np.random.seed(0)
env = SnakeEnv(10, [3, 6])
agent = TableAgent(0, env)

print('Value Iteration')
with timer('ValueIteration'):
    GeneralizedPolicyIteration.generalized_policy_iteration(agent)
    print('return:', eval_game(env, agent))
    print(agent.pi)

Value Iteration
return: 92
[0 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0
 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0]
ValueIteration COST:0.030349254608154297
