In [1]:
! pip install gym==0.7.0



In [2]:
import gym
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [3]:
from gym.envs.registration import register
# Register a 4x4 FrozenLake variant with is_slippery=False under its own environment id.
register(
    id="FrozenLakeNotSlippery-v0",
    entry_point="gym.envs.toy_text:FrozenLakeEnv",
    kwargs=dict(map_name="4x4", is_slippery=False),
)

In [4]:
# Create the stock FrozenLake environment (its transitions are stochastic: see env.P[0] below).
env = gym.make('FrozenLake-v0')
# Alternative: swap in the non-slippery variant registered in the previous cell.
#env = gym.make('FrozenLakeNotSlippery-v0')

[2019-02-09 10:53:56,354] Making new env: FrozenLake-v0
  result = entry_point.load(False)


In [5]:
# Show the concrete environment class that gym.make returned.
type(env)

gym.envs.toy_text.frozen_lake.FrozenLakeEnv

In [6]:
# Draw the current grid; the trailing semicolon keeps the returned stream object's repr out of the cell output.
env.render();

[41mS[0mFFF
FHFH
FFFH
HFFG



<ipykernel.iostream.OutStream at 0x108956a58>

In [7]:
# Size of the discrete state space (shown as the cell's value, like the action-space cell below).
env.observation_space.n

16


In [8]:
# Size of the discrete action space.
env.action_space.n

4

In [9]:
# Transition model for state 0: maps each action to a list of
# (probability, next_state, reward, flag) tuples. The trailing boolean is
# presumably the episode-termination flag -- confirm against the gym source.
env.P[0]

{0: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 4, 0.0, False)],
 1: [(0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 1, 0.0, False)],
 2: [(0.3333333333333333, 4, 0.0, False),
  (0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False)],
 3: [(0.3333333333333333, 1, 0.0, False),
  (0.3333333333333333, 0, 0.0, False),
  (0.3333333333333333, 0, 0.0, False)]}

In [10]:
def value_iteration(env, gamma=1.0, num_iterations=100000, threshold=1e-20):
    """Compute state values by synchronous value iteration.

    Parameters
    ----------
    env : object
        Must expose ``observation_space.n``, ``action_space.n`` and ``P``,
        where ``P[state][action]`` is an iterable of
        ``(probability, next_state, reward, _)`` tuples (the fourth field is
        ignored here).
    gamma : float, optional
        Discount factor applied to the value of the successor state.
    num_iterations : int, optional
        Maximum number of sweeps over the state space. It also fixes the
        number of rows of the returned history.
    threshold : float, optional
        A sweep whose summed absolute value change is ``<= threshold`` ends
        the iteration.

    Returns
    -------
    value_table : numpy.ndarray, shape ``(n_states,)``
        State values after the last executed sweep.
    value_history : numpy.ndarray, shape ``(num_iterations, n_states)``
        Row ``i`` holds the value table after sweep ``i``; rows for sweeps
        that were never executed (after convergence) stay at zero.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    value_table = np.zeros(n_states)
    value_history = np.zeros((num_iterations, n_states))

    for i in range(num_iterations):
        # Snapshot of the previous sweep: every backup in this sweep reads
        # from it, so updates made during the sweep do not affect each other.
        previous_values = np.copy(value_table)

        for state in range(n_states):
            q_values = []
            for action in range(n_actions):
                backups = [
                    trans_prob * (reward + gamma * previous_values[next_state])
                    for trans_prob, next_state, reward, _ in env.P[state][action]
                ]
                q_values.append(np.sum(backups))

            value_table[state] = max(q_values)

        value_history[i] = value_table

        # Converged once a whole sweep changes the table by no more than the threshold.
        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            print(f'Value iteration converging at iteration: {i + 1}')
            break

    return value_table, value_history

In [11]:
# Run value iteration with discount 0.9 and print the resulting values laid out as a 4x4 grid.
optimal_value_function, value_history = value_iteration(env, gamma=0.9)
print(optimal_value_function.reshape(4,4))

Value iteration converging at iteration: 267
[[0.0688909  0.06141457 0.07440976 0.05580732]
 [0.09185454 0.         0.11220821 0.        ]
 [0.14543635 0.24749695 0.29961759 0.        ]
 [0.         0.3799359  0.63902015 0.        ]]


In [31]:
from ipywidgets import interact
import matplotlib.pyplot as plt
from itertools import product

@interact
def view_image(i=(0, 250)):
    """Draw the value table recorded at sweep ``i`` as an annotated 4x4 heat map.

    ``interact`` turns the ``(0, 250)`` default into an integer slider and
    calls this function again whenever the slider moves.
    """
    plt.clf()
    grid = value_history[i].reshape(4, 4)
    # plt.text takes (x, y), i.e. (column, row), so the label comes from grid[row][col].
    for col, row in product(range(4), range(4)):
        plt.text(col, row, f'{grid[row][col]:.2f}')
    plt.imshow(grid + 1e-10, cmap='winter')
    plt.show()

interactive(children=(IntSlider(value=125, description='i', max=250), Output()), _dom_classes=('widget-interac…

In [14]:
def extract_policy(env, value_table, gamma=1.0):
    """Return, for every state, the action with the highest one-step look-ahead value.

    ``env.P[state][action]`` is read as an iterable of
    ``(probability, next_state, reward, _)`` tuples. The result is a float
    array of length ``env.observation_space.n`` holding action indices; ties
    go to the lowest index (``np.argmax``).
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    def expected_return(state, action):
        # Probability-weighted sum of reward plus discounted successor value.
        total = np.float64(0.0)
        for prob, successor, reward, _ in env.P[state][action]:
            total += prob * (reward + gamma * value_table[successor])
        return total

    greedy_actions = np.zeros(n_states)
    for state in range(n_states):
        action_values = [expected_return(state, action) for action in range(n_actions)]
        greedy_actions[state] = np.argmax(action_values)

    return greedy_actions

In [15]:
# Greedy policy for the computed values, using the same discount (0.9) as the value_iteration call.
policy = extract_policy(env, optimal_value_function, gamma=0.9)
policy

array([0., 3., 0., 3., 0., 0., 0., 0., 3., 1., 0., 0., 0., 2., 1., 0.])

In [16]:
# Arrow glyph for each action index.
d = {0: "←", 1: "↓", 2: "→", 3: "↑"}

# One glyph per state; policy holds floats, which hash and compare equal to the int keys.
arrows = [d[action] for action in policy]

np.array(arrows).reshape(4, 4)

array([['←', '↑', '←', '↑'],
       ['←', '←', '←', '←'],
       ['↑', '↓', '←', '←'],
       ['←', '→', '↓', '←']], dtype='<U1')

In [17]:
import pandas as pd
# Same 4x4 arrow grid wrapped in a DataFrame for a tabular rendering.
pd.DataFrame(np.array(arrows).reshape(4,4))

Unnamed: 0,0,1,2,3
0,←,↑,←,↑
1,←,←,←,←
2,↑,↓,←,←
3,←,→,↓,←
