<a href="https://colab.research.google.com/github/patricio-tech/proyectos/blob/master/RL1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**V-function in Practice for Frozen-Lake Environment**

In [1]:
import gym
import collections
from torch.utils.tensorboard import SummaryWriter
import numpy as np

# Gym environment id handed to gym.make() by the agent and the training loop.
ENV_NAME = "FrozenLake-v0"
# Alternative environment id; uncomment to switch.
#ENV_NAME = "FrozenLake8x8-v0"  
# Discount factor the agent applies to next-state values.
GAMMA = 0.95

**The Agent**

In [2]:
class Agent:
    """Tabular agent that learns state values from the environment's model.

    The agent reads the transition table ``env.P[state][action]``, whose
    entries are unpacked as ``(prob, next_state, reward, done)`` tuples, and
    keeps one value per state in ``self.V``.

    Attributes:
        env: environment created with ``gym.make``.
        gamma: discount factor applied to next-state values.
        V: numpy array of state values, indexed by state id.
    """

    def __init__(self, env_name=None, gamma=None):
        """Create the environment and a zero-initialised value table.

        Args:
            env_name: gym environment id; defaults to the module-level
                ``ENV_NAME`` when omitted.
            gamma: discount factor; defaults to the module-level ``GAMMA``
                when omitted.
        """
        # Fall back to the notebook-level settings so `Agent()` keeps working.
        self.env = gym.make(ENV_NAME if env_name is None else env_name)
        self.gamma = GAMMA if gamma is None else gamma
        self.V = np.zeros(self.env.observation_space.n)

    def calc_action_value(self, state, action):
        """Return the expected one-step return of ``action`` taken in ``state``.

        Sums ``prob * (reward + gamma * V[next_state])`` over every transition
        listed for the pair. Transitions flagged as done contribute only their
        reward: nothing follows the end of an episode, so the value stored for
        the terminal state must not be bootstrapped from.
        """
        return sum(
            prob * (reward if done else reward + self.gamma * self.V[next_state])
            for prob, next_state, reward, done in self.env.P[state][action]
        )

    def select_action(self, state):
        """Return the action with the highest action value in ``state``.

        Ties go to the lowest action index; ``None`` is returned when the
        action space is empty.
        """
        return max(
            range(self.env.action_space.n),
            key=lambda action: self.calc_action_value(state, action),
            default=None,
        )

    def value_iteration(self):
        """Perform one in-place sweep of value iteration over all states.

        Each state's value is replaced by the best action value available
        from it. Updates are applied immediately, so states later in the
        sweep already see the new values of earlier ones.

        Returns:
            The value table ``self.V`` (the same array object, not a copy).
        """
        for state in range(self.env.observation_space.n):
            self.V[state] = max(
                self.calc_action_value(state, action)
                for action in range(self.env.action_space.n)
            )
        return self.V

**Training Loop**

In [3]:
# Number of evaluation episodes averaged after each value-iteration sweep.
TEST_EPISODES = 40
# Training stops once the best average test reward reaches this value.
REWARD_GOAL = 0.90

def _play_episode(agent, env):
    """Play one episode on ``env`` with the agent's chosen actions; return its total reward."""
    total_reward = 0.0
    state = env.reset()
    while True:
        action = agent.select_action(state)
        # Classic gym step API: (observation, reward, done, info).
        state, reward, is_done, _ = env.step(action)
        total_reward += reward
        if is_done:
            return total_reward


def train(agent, max_iterations=None, env_name=None):
    """Alternate value-iteration sweeps and evaluation until REWARD_GOAL is met.

    Each iteration calls ``agent.value_iteration()`` once, then plays
    TEST_EPISODES episodes on a separate evaluation environment using
    ``agent.select_action``. The mean episode reward is written to TensorBoard
    under the tag "reward", and a message is printed whenever it beats the
    best mean seen so far.

    Args:
        agent: object providing ``value_iteration()`` and
            ``select_action(state)``.
        max_iterations: optional cap on the number of iterations. With the
            default ``None`` the loop runs until REWARD_GOAL is reached,
            however long that takes (the evaluation average is noisy, so
            this can be a long time).
        env_name: gym id of the evaluation environment; defaults to the
            module-level ``ENV_NAME``.

    Returns:
        None.
    """
    test_env = gym.make(ENV_NAME if env_name is None else env_name)
    try:
        writer = SummaryWriter()
        try:
            iter_no = 0
            best_reward = 0.0

            while best_reward < REWARD_GOAL:
                if max_iterations is not None and iter_no >= max_iterations:
                    break

                # Step 1: improve the value table with one sweep.
                agent.value_iteration()

                # Step 2: measure the policy's average reward.
                iter_no += 1
                reward_test = 0.0
                for _ in range(TEST_EPISODES):
                    reward_test += _play_episode(agent, test_env)
                reward_test /= TEST_EPISODES

                # Step 3: track progress with TensorBoard.
                writer.add_scalar("reward", reward_test, iter_no)
                if reward_test > best_reward:
                    print("Best reward updated %.2f at iteration %d " % (reward_test, iter_no))
                    best_reward = reward_test
        finally:
            # Flush and release the event file even if training is interrupted.
            writer.close()
    finally:
        test_env.close()

**Training the Agent**

In [4]:
# Build the agent and run training until the reward goal is met.
agent = Agent()
train(agent)

Best reward updated 0.20 at iteration 3 
Best reward updated 0.28 at iteration 4 
Best reward updated 0.45 at iteration 5 
Best reward updated 0.53 at iteration 7 
Best reward updated 0.70 at iteration 13 
Best reward updated 0.80 at iteration 17 
Best reward updated 0.82 at iteration 34 
Best reward updated 0.85 at iteration 55 
Best reward updated 0.88 at iteration 95 
Best reward updated 0.93 at iteration 550 
