<a href="https://colab.research.google.com/github/patricio-tech/proyectos/blob/master/RL_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np

import torch
import torch.nn as nn

In [3]:
import gym
import gym.spaces

# Base environment for training; is_slippery=False requests the
# non-slippery variant of FrozenLake.
env = gym.make(
    'FrozenLake-v0',
    is_slippery=False,
)

In [4]:
class OneHotWrapper(gym.ObservationWrapper):
    """Observation wrapper that re-encodes a discrete state index as a one-hot vector.

    The wrapped environment must expose ``observation_space.n``; the wrapper
    advertises a float32 ``Box`` space in [0, 1] with one entry per state.
    """

    def __init__(self, env):
        super().__init__(env)
        n_states = env.observation_space.n
        self.observation_space = gym.spaces.Box(
            0.0, 1.0, (n_states,), dtype=np.float32
        )

    def observation(self, observation):
        """Return a fresh vector with 1.0 at index ``observation``.

        The vector starts as a copy of the Box lower bound (0.0 everywhere).
        """
        one_hot = self.observation_space.low.copy()
        one_hot[observation] = 1.0
        return one_hot

# Wrap only while the observation space is still discrete. Re-running this
# cell would otherwise pass an already wrapped env (Box observation space)
# to OneHotWrapper, whose constructor reads `observation_space.n` and fails.
# An isinstance(env, OneHotWrapper) check is not used because the class
# object is recreated each time the cell runs.
if isinstance(env.observation_space, gym.spaces.Discrete):
    env = OneHotWrapper(env)

In [5]:
# Layer sizes are read from the (one-hot wrapped) environment.
obs_size = env.observation_space.shape[0]  # length of the observation vector (16 here)
n_actions = env.action_space.n  # number of discrete actions (4 here)
HIDDEN_SIZE = 32

# Policy network: one sigmoid hidden layer, raw action scores (logits) out.
# No softmax here; it is applied separately when sampling actions.
net = nn.Sequential(
    nn.Linear(obs_size, HIDDEN_SIZE),
    nn.Sigmoid(),
    nn.Linear(HIDDEN_SIZE, n_actions),
)

In [6]:
sm = nn.Softmax(dim=1)


def select_action(state, model=None):
    """Sample an action index from the policy's softmax distribution.

    Args:
        state: 1-D observation vector (e.g. the one-hot state produced by
            the environment wrapper).
        model: optional module mapping a (1, obs_size) float tensor to a
            (1, n_actions) tensor of action scores. Defaults to the
            notebook-level ``net``.

    Returns:
        The sampled action index, as returned by ``np.random.choice``.
    """
    policy = net if model is None else model
    # Convert through a single float32 ndarray and add the batch dimension;
    # building a tensor from a Python list of ndarrays is much slower.
    state_t = torch.as_tensor(np.asarray(state, dtype=np.float32)).unsqueeze(0)
    # Action selection is inference only: skip autograd bookkeeping.
    with torch.no_grad():
        act_probs = sm(policy(state_t)).numpy()[0]
    return np.random.choice(len(act_probs), p=act_probs)

In [7]:
import torch.optim as optim

# Cross-entropy between the network's action scores and the actions taken
# in the selected episodes is the training loss.
objective = nn.CrossEntropyLoss()

# Adam over all policy-network parameters with a learning rate of 1e-3.
optimizer = optim.Adam(net.parameters(), lr=1e-3)

In [8]:
from collections import namedtuple

# --- Training hyperparameters ---
BATCH_SIZE = 100    # number of finished episodes collected before each update
GAMMA = 0.9         # discount base applied as GAMMA ** episode_length
PERCENTILE = 30     # percentile of discounted returns used as the elite cut-off
REWARD_GOAL = 0.8   # training stops once the batch mean reward reaches this

# One finished episode: its total reward and the list of steps taken.
Episode = namedtuple('Episode', 'reward steps')
# One step inside an episode: the observation seen and the action chosen.
EpisodeStep = namedtuple('EpisodeStep', 'observation action')

In [9]:
def _select_elite(episodes, percentile, gamma):
    """Pick the episodes whose discounted return beats the percentile bound.

    Each episode is scored as ``reward * gamma ** len(steps)``. The bound is
    the ``percentile``-th percentile of those scores, and an episode is kept
    only when its score is strictly greater than the bound (so when the bound
    is 0, zero-return episodes are dropped).

    Args:
        episodes: list of ``Episode`` tuples.
        percentile: percentile (0-100) used to compute the bound.
        gamma: discount base.

    Returns:
        ``(elite, observations, actions)``: the kept episodes, plus the
        flattened per-step observations and actions of those episodes.
    """
    scores = [ep.reward * (gamma ** len(ep.steps)) for ep in episodes]
    bound = np.percentile(scores, percentile)

    elite, observations, actions = [], [], []
    for ep, score in zip(episodes, scores):
        if score > bound:
            elite.append(ep)
            observations.extend(step.observation for step in ep.steps)
            actions.extend(step.action for step in ep.steps)
    return elite, observations, actions


iter_no = 0           # number of optimizer updates performed so far
reward_mean = 0       # mean total reward of the most recent full batch
full_batch = []       # elite episodes selected from the most recent batch
batch = []            # finished episodes waiting for the next update
episode_steps = []    # steps of the episode currently being played
episode_reward = 0.0
state = env.reset()

# NOTE(review): the loop has no iteration cap; it only ends once a batch's
# mean reward reaches REWARD_GOAL.
while reward_mean < REWARD_GOAL:
    action = select_action(state)
    next_state, reward, episode_is_done, _ = env.step(action)

    episode_steps.append(EpisodeStep(observation=state, action=action))
    episode_reward += reward

    if episode_is_done:
        # Episode finished: store it and start a new one.
        batch.append(Episode(reward=episode_reward, steps=episode_steps))
        next_state = env.reset()
        episode_steps = []
        episode_reward = 0.0

        if len(batch) == BATCH_SIZE:
            # A full batch is ready: measure it, then train on its elite part.
            reward_mean = float(np.mean([episode.reward for episode in batch]))
            full_batch, train_obs, train_act = _select_elite(batch, PERCENTILE, GAMMA)

            if full_batch:  # nothing to train on if no episode beat the bound
                # Stack into single ndarrays first; building tensors from
                # Python lists of ndarrays is slow. Training data uses its
                # own names so the rollout variable `state` is not clobbered.
                obs_t = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
                acts_t = torch.as_tensor(np.asarray(train_act, dtype=np.int64))
                optimizer.zero_grad()
                action_scores_t = net(obs_t)
                loss_t = objective(action_scores_t, acts_t)
                loss_t.backward()
                optimizer.step()
                print("%d: loss=%.3f, reward_mean=%.3f" % (iter_no, loss_t.item(), reward_mean))
                iter_no += 1
            batch = []
    state = next_state

0: loss=1.450, reward_mean=0.010
1: loss=1.410, reward_mean=0.010
2: loss=1.383, reward_mean=0.010
3: loss=1.453, reward_mean=0.010
4: loss=1.383, reward_mean=0.010
5: loss=1.376, reward_mean=0.030
6: loss=1.327, reward_mean=0.010
7: loss=1.374, reward_mean=0.020
8: loss=1.362, reward_mean=0.040
9: loss=1.371, reward_mean=0.030
10: loss=1.345, reward_mean=0.030
11: loss=1.377, reward_mean=0.030
12: loss=1.348, reward_mean=0.020
13: loss=1.357, reward_mean=0.030
14: loss=1.364, reward_mean=0.020
15: loss=1.321, reward_mean=0.010
16: loss=1.295, reward_mean=0.040
17: loss=1.356, reward_mean=0.010
18: loss=1.287, reward_mean=0.020
19: loss=1.366, reward_mean=0.040
20: loss=1.267, reward_mean=0.040
21: loss=1.239, reward_mean=0.020
22: loss=1.269, reward_mean=0.060
23: loss=1.314, reward_mean=0.020
24: loss=1.314, reward_mean=0.020
25: loss=1.206, reward_mean=0.020
26: loss=1.248, reward_mean=0.090
27: loss=1.326, reward_mean=0.010
28: loss=1.250, reward_mean=0.020
29: loss=1.274, reward_m

In [14]:
# Play a single episode in a fresh environment, rendering every step.
test_env = OneHotWrapper(
    gym.make('FrozenLake-v0', is_slippery=False)
)
state = test_env.reset()
test_env.render()  # show the starting position

is_done = False
while True:
    # Actions are sampled from the policy, exactly as during training.
    action = select_action(state)
    new_state, reward, is_done, _ = test_env.step(action)
    test_env.render()
    state = new_state
    if is_done:
        break

# Reward returned by the final step of the episode.
print("reward = ", reward)


[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Right)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SFF[41mF[0m
FHFH
FFFH
HFFG
  (Down)
SFFF
FHF[41mH[0m
FFFH
HFFG
reward =  0.0
