In [2]:
import gym
import numpy as np
import sys
import os

In [37]:
def value_iteration(env, number_of_states, number_of_actions):
    policy = np.zeros((1, number_of_states))
    value_list = np.zeros((1, number_of_states))
    old_value_list = value_list.copy()
    episode = 0
    max_change = 1
    sigma = 0.9
    while max_change > 0.01:
        episode += 1
        for s in range(number_of_states):
            assigned_value = -np.inf
            for a in range(number_of_actions):
                # get new state and its reward
                prob, new_state, reward, done = env.P[s][a][0]
                # get new states value
                value_new_state = old_value_list[0][new_state]
                cand_value = 0
                if done:
                    cand_value = reward 
                else:
                    cand_value = reward + sigma*value_new_state
                if cand_value > assigned_value:
                    assigned_value = cand_value
                    policy[0][s] = a
                    value_list[0][s] = assigned_value
        changes = np.abs(value_list - old_value_list)
        max_change = np.max(changes)
        old_value_list = value_list.copy()
    print("Solved in: ", episode, " episodes")
    return value_list, policy

In [360]:
def policy_iteration(env, number_of_states, number_of_actions):
    
    ## 1
#     policy = np.random.randint(6, size=(1,number_of_states))
    policy = np.zeros((1,number_of_states))
    value_list = np.zeros((1, number_of_states))
    episode = 0
    sigma = 0.9
    
    ## 2
    policy_stable = False
    while not policy_stable:
        episode += 1
        eval_acc = True
        while eval_acc:
            eps = 0
            for s in range(number_of_states):
                # first row
                v = value_list[0][s]

                # get the new value 
                a = policy[0][s]
                prob, new_state, reward, done = env.P[s][a][0]
                value_new_state = value_list[0][new_state]

                # second row
                if done:
                    value_list[0][s] = reward
                else:
                    value_list[0][s] = reward + sigma*value_new_state

                # third row
                eps = max(eps, np.abs(v-value_list[0][s]))
            if eps < 0.001:
                eval_acc = False


        ## 3
        policy_stable = True
        for s in range(number_of_states):

            # assign 
            old_action = policy[0][s]

            # get the argmax a here
            max_value = -np.inf
            for a in range(number_of_actions):
                # get the new value 
                prob, new_state, reward, done = env.P[s][a][0]
                value_new_state = value_list[0][new_state]
                cand_value = 0
                if done:
                    cand_value = reward
                else:
                    cand_value = reward + sigma*value_new_state

                if cand_value > max_value:
                    max_value = cand_value
                    policy[0][s] = a

            # if old-action != policy[s]
            if old_action != policy[0][s]:
                policy_stable = False
    print("Solved in: ", episode, " episodes")

    return value_list, policy       
        

In [361]:
env = gym.make('Taxi-v3')
current_state = env.reset()
value_list, policy = policy_iteration(env, env.observation_space.n, env.action_space.n)
rewards = []
env.render()

Solved in:  17  episodes
+---------+
|R: | : :[35mG[0m|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [344]:
env = gym.make('Taxi-v3')
current_state = env.reset()
value_list, policy = value_iteration(env, env.observation_space.n, env.action_space.n)
rewards = []
env.render()

Solved in:  19  episodes
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | :[43m [0m| : |
|Y| : |[35mB[0m: |
+---------+



In [371]:

act = int(policy[0][current_state])

print(act)
new_state, reward, finished, _ = env.step(act)
rewards.append(reward)
current_state = new_state
print(finished, reward)
env.render()

5
True 20
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
