## Policy Improvement

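Policy improvement takes the state-value function of the policy we want to improve, the MDP and, optionally, the discount factor gamma. It returns a new policy that acts greedily with respect to the action-value function computed from them.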

In [1]:
import numpy as np

def policy_improvement(V, P, gamma=1.0):
    """Return the policy that is greedy with respect to ``V``.

    Args:
        V: state values, indexable by state; ``V[s]`` is the value of state
            ``s`` under the policy being improved.
        P: the MDP dynamics. ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` transitions, with states and
            actions addressed by the integers ``0 .. len(P) - 1`` and
            ``0 .. len(P[s]) - 1``.
        gamma: discount factor applied to the value of the next state.

    Returns:
        A function ``pi(s)`` giving the action with the highest one-step
        look-ahead value in state ``s``. Ties go to the lowest action index
        (``np.argmax`` returns the first maximum). ``pi`` raises ``KeyError``
        for a state outside ``0 .. len(P) - 1``.
    """
    # Start the Q-function at zero. It is sized from state 0, so every state
    # is assumed to offer the same number of actions.
    Q = np.zeros((len(P), len(P[0])))

    # Expected one-step return of taking action a in state s and then
    # following the evaluated policy.
    for s in range(len(P)):
        for a in range(len(P[s])):
            for prob, next_state, reward, done in P[s][a]:
                # (not done) drops the bootstrap term on terminal transitions.
                Q[s][a] += prob * (reward + gamma * V[next_state] * (not done))

    # Build the state -> greedy action table once, so that querying the
    # policy is a plain dictionary lookup instead of a fresh argmax over Q.
    greedy_action = dict(enumerate(np.argmax(Q, axis=1)))

    def new_pi(s):
        return greedy_action[s]

    return new_pi

As a starting point, we consider a random policy on the Taxi environment and compute its state-value function:

In [2]:
import gym

env = gym.make('Taxi-v3')
# Read the transition table from the base environment. `unwrapped` reaches it
# no matter how many wrappers gym.make() puts around the environment, whereas
# `env.env` only strips a single wrapper layer.
P = env.unwrapped.P
init_state, _ = env.reset()

In [3]:
def rand_dict(n, n_actions=6):
    """Draw one random action for each of ``n`` states.

    Args:
        n: number of states; the keys of the result are ``0 .. n - 1``.
        n_actions: number of available actions; each value is drawn with
            ``random.randrange(n_actions)``. Defaults to 6, the value that
            was previously hard-coded.

    Returns:
        A dict mapping each state index to a randomly chosen action index.
    """
    from random import randrange
    return {state: randrange(n_actions) for state in range(n)}

# One random action per state of the loaded MDP. The size comes from P itself
# instead of a hard-coded 500, so the policy always covers exactly its states.
rnd_policy = rand_dict(len(P))


def rnd_pi(s):
    """Random policy: return the action that was drawn for state ``s``."""
    return rnd_policy[s]

In [4]:
import numpy as np

def policy_evaluation(pi, P, gamma=1.0, theta=1e-10):
    """Iteratively compute the state-value function of policy ``pi``.

    Args:
        pi: callable mapping a state index to the action taken there.
        P: MDP dynamics; ``P[s][a]`` is an iterable of
            ``(prob, next_state, reward, done)`` transitions.
        gamma: discount factor applied to the value of the next state.
        theta: stop once the largest change of any state value between two
            consecutive sweeps is below this threshold.

    Returns:
        A numpy array of length ``len(P)`` holding the estimated value of
        every state.
    """
    n_states = len(P)
    previous = np.zeros(n_states)
    while True:
        # Each sweep rebuilds the estimate from the previous sweep's values.
        current = np.zeros(n_states)
        for state in range(n_states):
            for p, s_next, r, terminal in P[state][pi(state)]:
                # No bootstrapping from the next state on terminal transitions.
                current[state] += p * (r + gamma * previous[s_next] * (not terminal))
        largest_change = np.max(np.abs(previous - current))
        if largest_change < theta:
            return current
        previous = current.copy()

In [5]:
# State-value function of the random policy, evaluated with discount factor 0.99.
V = policy_evaluation(rnd_pi, P, gamma=0.99)

Now we can try to improve the policy:

In [6]:
# Greedy policy with respect to V, using the same discount factor as the evaluation.
rnd_plus_pi = policy_improvement(V, P, gamma=0.99)

We can show the improved policy and estimate, by simulation, its probability of success and its mean return:

In [7]:
def print_policy(pi, P, action_symbols=('<', 'v', '>', '^', 'P', 'D'), n_cols=5, title='Policy:'):
    """Print the action chosen by ``pi`` in every state as a text grid.

    Args:
        pi: callable mapping a state index to an action index.
        P: MDP dynamics; ``P[s]`` is a dict from action to an iterable of
            ``(prob, next_state, reward, done)`` transitions.
        action_symbols: symbol printed for each action index.
        n_cols: number of states printed per row.
        title: line printed before the grid.

    A state is printed as its zero-padded index followed by the symbol of
    the chosen action. States whose every transition has ``done`` set are
    printed as blanks.
    """
    print(title)
    symbol_of = dict(enumerate(action_symbols))
    for state in range(len(P)):
        action = pi(state)
        print("| ", end="")
        done_flags = [
            done
            for transitions in P[state].values()
            for _, _, _, done in transitions
        ]
        if not np.all(done_flags):
            cell = "{} {}".format(str(state).zfill(3), symbol_of[action].rjust(6))
        else:
            cell = "".rjust(9)
        print(cell, end=" ")
        # Close the row after every n_cols states.
        if (state + 1) % n_cols == 0:
            print("|")

In [8]:
import random

def probability_success(env, pi, n_episodes=100, max_steps=200):
    """Estimate by simulation how often ``pi`` brings an episode to termination.

    Args:
        env: environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        pi: callable mapping a state to the action to take.
        n_episodes: number of episodes to simulate.
        max_steps: maximum number of steps per episode.

    Returns:
        The fraction of episodes that ended with ``terminated`` set within
        ``max_steps`` steps. Episodes cut short by ``truncated`` or by the
        step limit count as failures.
    """
    random.seed(123); np.random.seed(123)
    outcomes = []
    for episode in range(n_episodes):
        # Seed the environment on the first reset only. That fixes the whole
        # sequence of episodes and makes the estimate reproducible.
        state, _ = env.reset(seed=123) if episode == 0 else env.reset()
        terminated = truncated = False
        steps = 0
        # Stop on truncation as well: a truncated environment must be reset
        # before it is stepped again.
        while not (terminated or truncated) and steps < max_steps:
            state, _, terminated, truncated, _ = env.step(pi(state))
            steps += 1
        outcomes.append(terminated)
    return np.sum(outcomes)/len(outcomes)

In [9]:
def mean_return(env, pi, n_episodes=100, max_steps=200):
    """Estimate by simulation the mean undiscounted return obtained by ``pi``.

    Args:
        env: environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(state, reward, terminated, truncated, info)``.
        pi: callable mapping a state to the action to take.
        n_episodes: number of episodes to simulate.
        max_steps: maximum number of steps per episode.

    Returns:
        The mean over episodes of the plain sum of rewards collected until
        the episode terminates, is truncated, or reaches ``max_steps`` steps.
    """
    random.seed(123); np.random.seed(123)
    returns = []
    for episode in range(n_episodes):
        # Seed the environment on the first reset only. That fixes the whole
        # sequence of episodes and makes the estimate reproducible.
        state, _ = env.reset(seed=123) if episode == 0 else env.reset()
        terminated = truncated = False
        steps = 0
        episode_return = 0.0
        # Stop on truncation as well: a truncated environment must be reset
        # before it is stepped again.
        while not (terminated or truncated) and steps < max_steps:
            state, reward, terminated, truncated, _ = env.step(pi(state))
            episode_return += reward
            steps += 1
        returns.append(episode_return)
    return np.mean(returns)

In [None]:
# Print the action the improved policy picks in each state.
print_policy(rnd_plus_pi, P)

# Simulation-based estimates with the default episode settings.
# probability_success returns a fraction, so scale it to a percentage.
ps = probability_success(env, rnd_plus_pi)*100
mr = mean_return(env, rnd_plus_pi)

print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(ps,mr))

Is the new policy better than the original policy?

Is there a better policy than this one? We can try to improve the rnd-plus policy in turn. Since we started from a random policy, several rounds of evaluation and improvement will probably be necessary...

In [11]:
# Alternate policy evaluation and greedy improvement for a fixed number of
# rounds, starting from the already improved policy rnd_plus_pi.
N_IMPROVEMENT_ROUNDS = 5

rnd_plus_plus_pi = rnd_plus_pi
for improvement_round in range(N_IMPROVEMENT_ROUNDS):
    # V always holds the value function of the policy being improved.
    V = policy_evaluation(rnd_plus_plus_pi, P, gamma=0.99)
    rnd_plus_plus_pi = policy_improvement(V, P, gamma=0.99)

In [13]:
# Print the action the repeatedly improved policy picks in each state.
print_policy(rnd_plus_plus_pi, P)

# Simulation-based estimates with the default episode settings.
# probability_success returns a fraction, so scale it to a percentage.
ps = probability_success(env, rnd_plus_plus_pi)*100
mr = mean_return(env, rnd_plus_plus_pi)

print('Reaches goal {:.2f}%. Obtains an average undiscounted return of {:.4f}.'.format(ps,mr))

Reaches goal 5.00%. Obtains an average undiscounted return of -178.7800.


Any improvement? Have we found an optimal policy?