In [95]:
from Frozen_Lake import FrozenLakeEnv
import numpy as np
import time
env = FrozenLakeEnv()

In [2]:
env.get_all_states()

((0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (1, 0),
 (1, 1),
 (1, 2),
 (1, 3),
 (2, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 3))

Видим поле 4 на 4

In [8]:
state = (0, 1)
env.get_possible_actions(state)

('left', 'down', 'right', 'up')

In [9]:
env.render() #поле наше 4 х 4

*FFF
FHFH
FFFH
HFFG



In [10]:
state = (0, 1)
action = 'right'
env.get_next_states(state, action) 

{(1, 1): 0.1, (0, 2): 0.8, (0, 1): 0.1}

В какое состояние перейти можем и с какой вероятностью

In [11]:
next_state = (0, 2)
env.get_transition_prob(state, action, next_state) #returns probability to enter s' from s acting with a

0.8

In [12]:
env.get_reward(state, action, next_state) # зависит только от next_state

0.0

In [20]:
state = (1, 1)
env.is_terminal(state) # true if no possible  actions

True

In [64]:
def get_q_values(v_values, gamma):
    q_values = {}
    for state in env.get_all_states():
        q_values[state] = {}
        for action in env.get_possible_actions(state):
            q_values[state][action] = 0
            for next_state in env.get_next_states(state, action):
                q_values[state][action] += env.get_transition_prob(state, action, next_state) * env.get_reward(state, action, next_state)
                q_values[state][action] += gamma * env.get_transition_prob(state, action, next_state) * v_values[next_state]
    return q_values

In [65]:
def init_policy():
    policy = {}
    for state in env.get_all_states():
        policy[state] = {}
        for action in env.get_possible_actions(state):
            policy[state][action] = 1 / len(env.get_possible_actions(state))
    return policy

In [66]:
def init_v_values():
    v_values = {}
    for state in env.get_all_states():
        v_values[state] = 0
    return v_values

In [67]:
def policy_eval_step(v_values, gamma, policy): # inner cycle in policy eval
    q_values = get_q_values(v_values, gamma)
    new_values = init_v_values()
    for state in env.get_all_states():
        new_values[state] = 0
        for action in env.get_possible_actions(state):
            new_values[state] += policy[state][action] * q_values[state][action]
    return new_values

In [68]:
def policy_eval(policy, gamma, eval_iter_n):
    v_values = init_v_values()
    for _ in range(eval_iter_n):
        v_values = policy_eval_step(v_values, gamma, policy)
        q_values = get_q_values(v_values, gamma)
    return q_values

In [76]:
def policy_improvement(q_values):
    policy = {}
    for state in env.get_all_states():
        policy[state] = {}
        max_q_value = float('-inf')
        argmax_action = None
        for action in env.get_possible_actions(state):
            policy[state][action] = 0
            if q_values[state][action] > max_q_value:
                max_q_value = q_values[state][action]
                argmax_action = action
        policy[state][argmax_action] = 1
    return policy

In [98]:
iter_n = 20 #iterations of alg
eval_iter_n = 20 #iterations in policy evaluation step
gamma = .9

policy = init_policy()
for _ in range(iter_n):
    q_values = policy_eval(policy, gamma, eval_iter_n)
    policy = policy_improvement(q_values)

In [99]:
policy

{(0, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (0, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (0, 3): {'left': 1, 'down': 0, 'right': 0, 'up': 0},
 (1, 0): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 1): {None: 1},
 (1, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (1, 3): {None: 1},
 (2, 0): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (2, 1): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 2): {'left': 0, 'down': 1, 'right': 0, 'up': 0},
 (2, 3): {None: 1},
 (3, 0): {None: 1},
 (3, 1): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (3, 2): {'left': 0, 'down': 0, 'right': 1, 'up': 0},
 (3, 3): {None: 1}}

In [100]:
total_reward = 0
state = env.reset()
for _ in range(1000):
    action = np.random.choice(env.get_possible_actions(state), p=list(policy[state].values()))
    state, reward, done, _ = env.step(action)
    total_reward += reward
    
    env.render()
    time.sleep(.5)
    if done:
        break

SFFF
*HFH
FFFH
HFFG

SFFF
FHFH
*FFH
HFFG

SFFF
FHFH
F*FH
HFFG

SFFF
FHFH
FFFH
H*FG

SFFF
FHFH
FFFH
HF*G

SFFF
FHFH
FFFH
HFF*



In [101]:
total_reward

1.0