In [22]:
import numpy as np
import gym
import random

In [23]:
class MDP:
    """Plain container for the tabular arrays used by the solver.

    All arrays are zero-filled on construction and sized from ``shape``:

    * ``R`` and ``T`` take the full ``shape``.
    * ``V`` and ``policy`` are 1-D with ``shape[0]`` entries.
    * ``Q`` takes ``shape`` with its last axis dropped.
    """

    def __init__(self, shape):
        # Two independent zero arrays with the full shape.
        self.R, self.T = np.zeros(shape), np.zeros(shape)

        # One entry per index along the first axis.
        n_first = shape[0]
        self.V = np.zeros(n_first)
        self.policy = np.zeros(n_first)

        # Everything except the trailing axis.
        self.Q = np.zeros(shape[:-1])


def eval_reward_prob(env):
    """Build dense reward and transition arrays from the environment's table.

    Reads ``env.env.nS``, ``env.env.nA`` and ``env.env.P``, where
    ``P[state][action]`` is iterated as ``(prob, next_state, reward, _)``
    tuples.

    Several outcomes of the same ``(state, action)`` may lead to the same
    ``next_state``. Their probabilities are summed rather than overwritten,
    and the stored reward is the probability-weighted mean reward of those
    outcomes, so ``(T * R).sum(axis=2)`` is the expected immediate reward.

    Returns:
        tuple: ``(R, T)``, both of shape ``(nS, nA, nS)`` indexed as
        ``[state, action, next_state]``. ``T`` holds transition
        probabilities; ``R`` holds the reward for each transition and is 0
        where the transition has zero probability.
    """
    n_states, n_actions = env.env.nS, env.env.nA
    dims = (n_states, n_actions, n_states)  # state, action, next state

    T = np.zeros(dims)
    weighted_reward = np.zeros(dims)  # running sum of prob * reward
    for state in range(n_states):
        for action in range(n_actions):
            for prob, next_state, reward, _ in env.env.P[state][action]:
                # Accumulate: duplicate next_state entries must not clobber
                # each other, otherwise probability mass is lost.
                T[state, action, next_state] += prob
                weighted_reward[state, action, next_state] += prob * reward

    # Conditional mean reward per transition; untouched (zero) where T == 0.
    R = np.zeros(dims)
    np.divide(weighted_reward, T, out=R, where=T > 0)

    return R,T

def policy_iteration(env,gamma=0.95,max_iterations=100000,delta=0.001):
    """Solve a tabular environment and return its greedy policy and values.

    Despite the name, the loop performs value-iteration sweeps: each pass
    recomputes ``Q`` from the current ``V`` and sets ``V`` to the per-state
    maximum of ``Q``.

    Args:
        env: Environment exposing ``env.env.nS``, ``env.env.nA``,
            ``env.env.P`` and ``env.action_space.sample()``.
        gamma: Discount factor applied to next-state values.
        max_iterations: Upper bound on the number of sweeps.
        delta: Stop once the largest absolute change in ``V`` between two
            consecutive sweeps falls below this threshold.

    Returns:
        tuple: ``(policy, V)``. ``policy`` holds one action index per state
        and is greedy with respect to the same ``Q`` that produced ``V``.
        If ``max_iterations`` is 0 the randomly sampled initial policy and
        an all-zero ``V`` are returned.
    """
    n_states, n_actions = env.env.nS, env.env.nA

    # Placeholder arrays use the (state, action, next state) layout; R and T
    # are replaced below by the tables read from the environment.
    mdp = MDP((n_states, n_actions, n_states))
    mdp.policy = np.array([env.action_space.sample() for _ in range(n_states)])
    mdp.V = np.zeros(n_states)
    mdp.R,mdp.T = eval_reward_prob(env)

    for _ in range(max_iterations):
        prev_V = mdp.V.copy()
        # V broadcasts along the last axis (next state); sum over next state.
        mdp.Q = (mdp.T * (mdp.R + gamma * mdp.V)).sum(axis=2)
        mdp.V = np.max(mdp.Q,axis=1)
        # Refresh the policy before the convergence check so the returned
        # policy always matches the final Q instead of lagging one sweep.
        mdp.policy = np.argmax(mdp.Q,axis=1)
        if np.max(np.abs(prev_V - mdp.V)) < delta:
            break

    return mdp.policy,mdp.V


def testerA(env,policy,MAX_STEPS = 20):
    total_reward = 0
    state = env.reset()

    for steps in range(MAX_STEPS):
        env.render()
        action = policy[state]
        state,reward,done,_ = env.step(action)
        total_reward+=reward
        print("Reward: ",reward)

        if done:
            print("Done!!!")
            print("Score:{} Steps:{}".format(total_reward,steps))
            break


def testerB(check_state,V):
    print([V[n] for n in check_state])


In [24]:
# Create the Taxi-v3 environment and solve it; `env`, `policy` and `V` are
# module-level names reused by the evaluation cell.
env = gym.make('Taxi-v3')
policy,V = policy_iteration(env)

In [26]:
# Inspect the solution: roll the policy out with rendering (testerA), then
# print the computed values V for state indices 0 through 9 (testerB).
check_state = np.arange(10)
testerA(env,policy)
testerB(check_state,V)

+---------+
|R: | : :[35mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

Reward:  -1
+---------+
|R: | : :[35mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
Reward:  -1
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
Reward:  -1
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
Reward:  -1
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
Reward:  -1
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
Reward:  -1
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
Reward:  -1
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : |[42m_[0m: 