##  Policy Iteration

Steps are:
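1. Start with a random policy $\pi_0$
2. Evaluate the policy $\pi_0$ to obtain its value function $V^{\pi_0}$
3. Act greedily with respect to $V^{\pi_0}$: in every state pick the action with the best one-step lookahead value, which gives the improved policy $\pi_1$
4. Repeat steps 2 and 3 with the new policy until the policy no longer changes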

In [5]:
import numpy as np
import gym

# Build the Taxi environment and print a text rendering of its current state.
# NOTE(review): the "Taxi-v2" id is only registered in older gym releases
# (newer ones ship "Taxi-v3") — confirm against the installed gym version.
env = gym.make("Taxi-v2")
env.render()

+---------+
|[35mR[0m: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [7]:
# Cardinalities of the discrete action and observation spaces; the later
# cells use them as loop bounds over all actions / all states.
action_size, state_size = env.action_space.n, env.observation_space.n

for label, size in (("Action size: {}", action_size), ("State size: {}", state_size)):
    print(label.format(size))

Action size: 6
State size: 500


In [18]:
# Policy iteration on the tabular model exposed by the environment
# (env.env.P[state][action] is a list of (prob, next_state, reward, done)).
V = dict()

# initially the value function for all states
# is a random value drawn uniformly from [0, 1)
for i in range(state_size):
    V[i] = np.random.random()

# NOTE(review): not read anywhere in this cell; kept only in case another
# cell refers to it.
first_time = True
# Evaluation stops once a full sweep changes no state value by this much.
# The threshold is far below float64 resolution for these values, so in
# practice it means "stop at a bit-exact fixed point".
small_change = 1e-20
gamma = 0.9
# Counts policy-evaluation sweeps (not environment episodes).
episodes = 0
max_episodes = 50000

# generate random policy
policy = dict()
for s in range(state_size):
    policy[s] = env.action_space.sample()


def expected_return(state, action):
    """Return the one-step lookahead value of taking `action` in `state`.

    Sums over every transition listed in the model, weighting each by its
    probability. A transition flagged `done` ends the episode, so nothing
    is bootstrapped from its successor state.
    """
    return sum(
        prob * (reward + gamma * (0.0 if done else V[new_state]))
        for prob, new_state, reward, done in env.env.P[state][action]
    )


while episodes < max_episodes:
    # policy evaluation
    # (also bounded by max_episodes so the sweep budget is a real cap)
    while episodes < max_episodes:
        episodes += 1
        if episodes % 100 == 0:
            print("Current episode: {}".format(episodes))
        biggest_change = 0
        # loop through every state present
        for state in range(state_size):
            old_V = V[state]
            # back up the value of the action the current policy prescribes
            V[state] = expected_return(state, policy[state])
            biggest_change = max(biggest_change, abs(V[state] - old_V))
        if biggest_change < small_change:
            break

    # policy improvement
    policy_changed = False
    for state in range(state_size):
        best_val = -np.inf
        best_action = -1
        for action in range(action_size):
            future_reward = expected_return(state, action)
            # strict '>' keeps the lowest-numbered action on ties
            if future_reward > best_val:
                best_val = future_reward
                best_action = action
        assert best_action != -1
        if policy[state] != best_action:
            policy_changed = True
        policy[state] = best_action

    # a policy that is greedy w.r.t. its own value function is stable
    if not policy_changed:
        break
print("Total episodes trained: {}".format(episodes))

Current episode: 100
Current episode: 200
Current episode: 300
Current episode: 400
Current episode: 500
Current episode: 600
Current episode: 700
Current episode: 800
Current episode: 900
Current episode: 1000
Total episodes trained: 1064


In [16]:
# play the game
# Roll out the learned policy and report the mean undiscounted return.
rewards = []

test_episodes = 100
max_steps = 25  # per-episode step cap
for episode in range(test_episodes):
    state = env.reset()
    total_rewards = 0
    print("*" * 100)
    print("Episode {}".format(episode))
    for step in range(max_steps):
        env.render()
        # Follow the learned policy: one fixed action per state
        action = policy[state]
        new_state, reward, done, info = env.step(action)
        total_rewards += reward
        if done:
            break
        state = new_state
    # Record every episode, including ones cut off by the step cap, so the
    # average below is taken over exactly the episodes that were played.
    rewards.append(total_rewards)
    print("Score", total_rewards)
env.close()
print("Average Score", sum(rewards) / len(rewards))

****************************************************************************************************
Episode 0
+---------+
|R: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+

+---------+
|R: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[42mY[0m| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
|

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : : :[42m_[0m: |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | :[42m_[0m:[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
Score 8
**************************************************************************

In [43]:
# Value iteration: repeatedly apply the Bellman optimality backup to every
# state, in place, until a full sweep changes no value by more than `theta`.
# Uses `gamma` and `env` from the cells above.
n_states = env.observation_space.n
theta = 1e-10      # convergence tolerance; well above float64 round-off
max_sweeps = 100000  # safety cap so the cell always terminates

V = {x: 0 for x in range(n_states)}
for _ in range(max_sweeps):
    delta = 0
    for states in reversed(range(n_states)):
        v = V[states]
        # Best expected one-step return over all actions. Transitions
        # flagged `done` end the episode, so their successor contributes
        # no future value.
        V[states] = max(
            sum(
                p * (r + gamma * (0.0 if done else V[s_]))
                for p, s_, r, done in env.env.P[states][a]
            )
            for a in range(env.env.nA)
        )
        delta = max(delta, abs(v - V[states]))
    if delta < theta:
        break