In [None]:
import gym
import numpy as np
# Gym environment id passed to every gym.make() call in this notebook.
ENV_NAME = "FrozenLake-v0"
# Discount factor the agent applies to next-state values when computing action values.
GAMMA = 0.95

In [None]:
class Agent:
    """Tabular value-iteration agent driven by an environment's transition table.

    The agent keeps one value per discrete state and derives a greedy policy
    from one-step look-ahead over the table ``env.unwrapped.P``.
    """

    def __init__(self, env=None, gamma=None):
        """Set up the environment, the discount factor and a zeroed value table.

        Args:
            env: Optional ready-made environment. It must expose
                ``observation_space.n``, ``action_space.n`` and
                ``unwrapped.P[state][action]`` as an iterable of
                ``(prob, next_state, reward, done)`` tuples. When omitted,
                ``gym.make(ENV_NAME)`` is used.
            gamma: Optional discount factor. When omitted, the module-level
                ``GAMMA`` is used.
        """
        self.env = gym.make(ENV_NAME) if env is None else env
        self.gamma = GAMMA if gamma is None else gamma
        # One value estimate per state, all starting at zero.
        self.V = np.zeros(self.env.observation_space.n)

    def calc_action_value(self, state, action):
        """Return the expected one-step return of taking ``action`` in ``state``.

        Each transition contributes ``prob * (reward + gamma * V[next_state])``.
        The fourth tuple element (the done flag) is ignored here, so the
        next-state value is always bootstrapped.
        NOTE(review): that is only harmless if terminal states keep a value of
        zero -- confirm for the environment in use.
        """
        # Read the table from the unwrapped environment: wrappers returned by
        # gym.make are not guaranteed to forward arbitrary attributes like P.
        transitions = self.env.unwrapped.P[state][action]
        return sum(prob * (reward + self.gamma * self.V[next_state])
                   for prob, next_state, reward, _ in transitions)

    def select_action(self, state):
        """Return the action with the highest action value in ``state``.

        Ties go to the lowest action index; ``None`` is returned when the
        action space is empty.
        """
        return max(range(self.env.action_space.n),
                   key=lambda action: self.calc_action_value(state, action),
                   default=None)

    def value_iteration(self):
        """Run one in-place sweep of value iteration and return the value table.

        States are updated in index order, so later states in the same sweep
        already see the refreshed values of earlier ones.
        """
        n_actions = self.env.action_space.n
        for state in range(self.env.observation_space.n):
            self.V[state] = max(self.calc_action_value(state, action)
                                for action in range(n_actions))
        return self.V

In [None]:
# Number of evaluation episodes averaged by check_improvements().
TEST_EPISODES = 40
def check_improvements(eval_agent=None):
    """Return the mean episode reward of a greedy policy over TEST_EPISODES episodes.

    Args:
        eval_agent: Object whose ``select_action(state)`` picks each action.
            When omitted, the module-level ``agent`` is used, which keeps
            existing no-argument calls working.

    Returns:
        The summed reward of all episodes divided by ``TEST_EPISODES``.
    """
    # Only fall back to the notebook-level global when no agent is passed in,
    # so callers can evaluate a specific agent without relying on hidden state.
    policy = agent if eval_agent is None else eval_agent
    test_env = gym.make(ENV_NAME)
    try:
        reward_test = 0.0
        for _ in range(TEST_EPISODES):
            state = test_env.reset()
            is_done = False
            while not is_done:
                action = policy.select_action(state)
                state, new_reward, is_done, _ = test_env.step(action)
                reward_test += new_reward
    finally:
        # A fresh environment is created on every call; release it even if an
        # episode raises.
        test_env.close()
    return reward_test / TEST_EPISODES

In [None]:
# Mean evaluation reward at which train() stops.
REWARD_THRESHOLD = 0.90
def train(agent, max_iterations=None):
    """Alternate value-iteration sweeps and evaluation until the threshold is met.

    After every ``agent.value_iteration()`` sweep the policy is scored with
    ``check_improvements()``; each new best score is printed. The loop ends
    once the best score reaches ``REWARD_THRESHOLD``.

    Args:
        agent: Object providing ``value_iteration()``.
        max_iterations: Optional cap on the number of sweeps. ``None`` (the
            default) keeps the loop unbounded.

    Note:
        ``check_improvements()`` is called without arguments, so it scores the
        module-level ``agent``; pass that same object here.
    """
    t = 0
    best_reward = 0.0
    while best_reward < REWARD_THRESHOLD:
        # The evaluation score is presumably stochastic, so nothing guarantees
        # the threshold is ever hit; the optional cap lets callers bail out.
        if max_iterations is not None and t >= max_iterations:
            print("Stopped after %d iterations; best reward %.2f is below %.2f"
                  % (t, best_reward, REWARD_THRESHOLD))
            break
        agent.value_iteration()
        t += 1
        reward_test = check_improvements()
        if reward_test > best_reward:
            print("Best reward updated %.2f at iteration %d " %
                (reward_test ,t) )
            best_reward = reward_test

In [None]:
# Build the agent and train it until the evaluation threshold is reached.
# The variable must be called `agent`: check_improvements(), when called
# without arguments, reads the module-level name `agent`.
agent = Agent()
train(agent)

In [None]:
def test(agent):
    """Play one rendered episode, choosing every action with ``agent.select_action``.

    A fresh environment is created and its start state rendered; the state is
    rendered again after each step. When the episode ends, the final state,
    the reward of the last step and the number of steps taken are printed.
    """
    env = gym.make(ENV_NAME)
    state = env.reset()
    env.render()
    steps = 0
    while True:
        # The returned observation becomes the state for the next decision.
        state, reward, finished, _ = env.step(agent.select_action(state))
        env.render()
        steps += 1
        if finished:
            break
    print("\nlast state =", state)
    print("reward = ", reward)
    print("time steps =", steps)

# Render a single episode played by the trained agent.
test(agent)