<a href="https://colab.research.google.com/github/mooithub/pyemotion_rl/blob/master/examples/ch07_higer_level_rl_lib/07_higer_level_rl_lib_sol.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 07. Higher-Level RL Libraries (Solution)

* 파이모션 / Deep RL 핸즈온 [1]
* 김무성


# 차례 
* Why RL libraries?
* The PTAN library
* Action selectors
* The agent
* DQNAgent
* PolicyAgent
* Experience source
* Toy environment
* The ExperienceSource class
* ExperienceSourceFirstLast
* Experience replay buffers
* The TargetNet class
* The PTAN CartPole solver
* Other RL libraries

In [None]:
# 설치

In [2]:
# Install the pinned PTAN release with the %pip magic (rather than !pip) so the
# package lands in the environment of the kernel that is running this notebook.
%pip install ptan==0.6

Collecting ptan==0.6
  Downloading https://files.pythonhosted.org/packages/91/cb/57f6d86625f2b24c008b0524ca29559683aa75d00afa38b6b44d7fcad25b/ptan-0.6.tar.gz
Collecting torch==1.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/b4/0b/9d33aef363b6728ad937643d98be713c6c25d50ce338678ad57cee6e6fd5/torch-1.3.0-cp37-cp37m-manylinux1_x86_64.whl (773.1MB)
[K     |████████████████████████████████| 773.1MB 19kB/s 
Building wheels for collected packages: ptan
  Building wheel for ptan (setup.py) ... [?25l[?25hdone
  Created wheel for ptan: filename=ptan-0.6-cp37-none-any.whl size=23502 sha256=bf1880d34a010149d511f7096b61a84f1153730342c497f5839b7e3101b786a9
  Stored in directory: /root/.cache/pip/wheels/f0/4b/2f/9a45fd39b0a614a2716bc6128a7f1adb4647f323a2d90783f2
Successfully built ptan
[31mERROR: torchvision 0.9.1+cu101 has requirement torch==1.8.1, but you'll have torch 1.3.0 which is incompatible.[0m
[31mERROR: torchtext 0.9.1 has requirement torch==1.8.1, but you'll have tor

# 임포트

In [3]:
import ptan
import torch
import torch.nn as nn

-------------

## Why RL libraries?


--------------

## The PTAN library


-------------

# Action selectors

In [None]:
# All the classes assume that NumPy arrays will be passed to them. The complete example from this section can be found in Chapter07/01_actions.py.

In [4]:
import numpy as np

In [5]:
import ptan

In [6]:
q_vals = np.array([[1, 2, 3], [1, -1, 0]])

In [7]:
q_vals

array([[ 1,  2,  3],
       [ 1, -1,  0]])

In [8]:
selector = ptan.actions.ArgmaxActionSelector()

In [9]:
selector(q_vals)

array([2, 0])

In [None]:
# As you can see, the selector returns indices of actions with the largest values.

In [10]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0)

In [11]:
selector(q_vals)

array([2, 0])

In [None]:
# The result of the EpsilonGreedyActionSelector application is the same, as epsilon is 0.0, which means no random actions are taken. If we change epsilon to 1, actions will be random:

In [12]:
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)

In [13]:
selector(q_vals)

array([1, 2])

In [None]:
# Working with ProbabilityActionSelector is the same, but the input needs to be a normalized probability distribution (each row must sum to 1):

In [14]:
selector = ptan.actions.ProbabilityActionSelector()

In [15]:
# The probability matrix is the same on every iteration, so build it once.
# The second row puts all of its mass on action 2 and the third row gives
# action 2 zero probability, which makes the sampled output easy to check.
probs = np.array([
    [0.1, 0.8, 0.1],
    [0.0, 0.0, 1.0],
    [0.5, 0.5, 0.0],
])

# Sample ten batches of actions and print each batch.
for _ in range(10):
    acts = selector(probs)
    print(acts)

[1 2 0]
[1 2 0]
[1 2 1]
[1 2 0]
[0 2 1]
[1 2 0]
[1 2 1]
[1 2 1]
[1 2 1]
[1 2 1]


-------------

# The agent

-------------

## DQNAgent

In [16]:
class DQNNet(nn.Module):
    """Mock Q-network that ignores the values of its input.

    For a batch of N observations it returns the N x ``actions`` identity-like
    matrix produced by ``torch.eye``, so row ``i`` has its single 1 in column
    ``i`` (when ``i < actions``).

    Note: a later cell of this notebook defines another class named
    ``DQNNet``; once that cell runs, it shadows this one.
    """

    def __init__(self, actions: int):
        """Store the number of actions (the width of the output)."""
        super().__init__()
        self.actions = actions

    def forward(self, x):
        """Return ``torch.eye(batch_size, actions)``; only ``x``'s first dimension is read."""
        batch_size = x.shape[0]
        return torch.eye(batch_size, self.actions)

In [None]:
# Once we have defined the above class, we can use it as a DQN model:

In [17]:
net = DQNNet(actions=3)

In [18]:
net(torch.zeros(2, 10))

tensor([[1., 0., 0.],
        [0., 1., 0.]])

In [None]:
# We start with the simple argmax policy, so the agent will always return actions corresponding to 1s in the network output.

In [19]:
selector = ptan.actions.ArgmaxActionSelector()

In [20]:
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector)

In [21]:
agent(torch.zeros(2, 5))

(array([0, 1]), [None, None])

--------------

## PolicyAgent


In [22]:
class PolicyNet(nn.Module):
    """Mock policy network that ignores the values of its input.

    Every row of the output has the score 1 for actions 0 and 1 and the
    score 0 for all remaining actions.
    """

    def __init__(self, actions: int):
        """Store the number of actions (the width of the output)."""
        super().__init__()
        self.actions = actions

    def forward(self, x):
        """Return a float32 tensor of shape (batch_size, actions).

        Only the first dimension of ``x`` is read.
        """
        batch_size = x.shape[0]
        logits = torch.zeros((batch_size, self.actions), dtype=torch.float32)
        # Give the first two actions the same, highest score.
        for column in (0, 1):
            logits[:, column] = 1
        return logits

In [23]:
net = PolicyNet(actions=5)
net_out = net(torch.zeros(6, 10))
print("policy_net:")
print(net_out)

policy_net:
tensor([[1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.]])


In [24]:
# Sample actions from the policy network through a PTAN PolicyAgent.
# PolicyNet emits raw scores (rows of 1s and 0s), not a normalized
# distribution, hence apply_softmax=True -- presumably the agent turns the
# scores into probabilities before the selector samples; confirm in PTAN docs.
selector = ptan.actions.ProbabilityActionSelector()
agent = ptan.agent.PolicyAgent(model=net, action_selector=selector, apply_softmax=True)
# The feature size of the input (5 here) does not matter: PolicyNet only reads
# the batch size. [0] keeps the first element of the agent's return value,
# which is the array of sampled actions shown in the printed output.
ag_out = agent(torch.zeros(6, 5))[0]
print(ag_out)

[0 0 0 4 1 0]


-----------------

## Experience source


--------------

## Toy environment


In [25]:
import gym
import ptan
from typing import List, Optional, Tuple, Any

In [26]:
class ToyEnv(gym.Env):
    """
    Toy environment with observations 0..4 and actions 0..2.

    The observation is the step counter modulo 5, so it cycles sequentially.
    The reward of a step equals the action taken. Episodes have a fixed
    length of 10 steps.
    """

    def __init__(self):
        super().__init__()
        self.observation_space = gym.spaces.Discrete(n=5)
        self.action_space = gym.spaces.Discrete(n=3)
        self.step_index = 0

    def reset(self):
        """Restart the episode and return the initial observation (0)."""
        self.step_index = 0
        return self.step_index

    def step(self, action):
        """Advance one step.

        Returns the tuple (observation, reward, done, info). Once the episode
        is over, further calls do not advance the counter and return a reward
        of 0.0 with done=True.
        """
        n_observations = self.observation_space.n

        # Episode already finished: report the terminal state again.
        if self.step_index == 10:
            return self.step_index % n_observations, 0.0, True, {}

        self.step_index += 1
        finished = self.step_index == 10
        return self.step_index % n_observations, float(action), finished, {}

In [27]:
# Smoke-test the environment: reset it, then take two steps with different actions.
env = ToyEnv()
s = env.reset()
print("env.reset() -> %s" % s)
# Each step returns (observation, reward, done, info); the reward echoes the action.
s = env.step(1)
print("env.step(1) -> %s" % str(s))
s = env.step(2)
print("env.step(2) -> %s" % str(s))

env.reset() -> 0
env.step(1) -> (1, 1.0, False, {})
env.step(2) -> (2, 2.0, False, {})


In [28]:
# Keep stepping the same environment with action 0. It has already taken two
# steps in the previous cell, so the 10-step limit is reached on the 8th call
# here; the remaining calls return reward 0.0 with done=True and no longer
# advance the step counter.
for r in (env.step(0) for _ in range(10)):
    print(r)

(3, 0.0, False, {})
(4, 0.0, False, {})
(0, 0.0, False, {})
(1, 0.0, False, {})
(2, 0.0, False, {})
(3, 0.0, False, {})
(4, 0.0, False, {})
(0, 0.0, True, {})
(0, 0.0, True, {})
(0, 0.0, True, {})


In [29]:
class DullAgent(ptan.agent.BaseAgent):
    """
    Agent that ignores its observations and always picks the same action.
    """

    def __init__(self, action: int):
        # The action returned for every observation.
        self.action = action

    def __call__(self, observations: List[Any],
                 state: Optional[List] = None) \
            -> Tuple[List[int], Optional[List]]:
        """Return one copy of the fixed action per observation.

        ``state`` is handed back unchanged.
        """
        actions = []
        for _ in observations:
            actions.append(self.action)
        return actions, state

In [30]:
agent = DullAgent(action=1)
print("agent:", agent([1, 2])[0])

agent: [1, 1]


--------------

In [31]:
env = ToyEnv()
agent = DullAgent(action=1)
exp_source = ptan.experience.ExperienceSource(
    env=env,
    agent=agent,
    steps_count=2,
)

# The source can be iterated indefinitely, so stop after the first 16 items.
# Each item is a tuple of `steps_count` consecutive Experience records.
for idx, exp in enumerate(exp_source):
    if idx >= 16:
        break
    print(exp)

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False))
(Experience(state=4, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False))
(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=True))
(E

## The ExperienceSource class


In [32]:
exp_source = ptan.experience.ExperienceSource(env=env, agent=agent, steps_count=4)
print(next(iter(exp_source)))

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))


In [33]:
# Passing a list of environments: in the printed output the same trajectory
# shows up twice in a row, i.e. the items appear to alternate between the
# two environments.
exp_source = ptan.experience.ExperienceSource(
    env=[ToyEnv(), ToyEnv()],
    agent=agent,
    steps_count=2,
)

# Print only the first five items.
for idx, exp in enumerate(exp_source):
    if idx >= 5:
        break
    print(exp)

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))


---------------

## ExperienceSourceFirstLast


In [34]:
print("ExperienceSourceFirstLast")
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, gamma=1.0, steps_count=1)

# The check runs after the print, so twelve transitions (idx 0..11) are shown.
# In the printed output, last_state=None appears on the final step of an episode.
for idx, exp in enumerate(exp_source):
    print(exp)
    if idx >= 11:
        break

ExperienceSourceFirstLast
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=None)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)


-----------------

## Experience replay buffers


In [35]:
env = ToyEnv()
agent = DullAgent(action=1)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=1.0, steps_count=1)

In [36]:
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=100)

In [37]:
# Mimic a training loop: add one transition per iteration and, once the
# buffer holds at least 5 entries, draw a batch of 4 and print it.
for step in range(6):
    buffer.populate(1)
    # While the buffer is still too small, skip the "training" part.
    if len(buffer) >= 5:
        batch = buffer.sample(4)
        print("Train time, %d batch samples:" % len(batch))
        for s in batch:
            print(s)

Train time, 4 batch samples:
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
Train time, 4 batch samples:
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)


--------------

## The TargetNet class


In [38]:
import ptan
import torch.nn as nn

In [39]:
class DQNNet(nn.Module):
    """Minimal network with a single linear layer mapping 5 inputs to 3 outputs.

    Note: this reuses the name of the mock ``DQNNet`` defined earlier in the
    notebook and shadows it from this cell onwards.
    """

    def __init__(self):
        super().__init__()
        # The only layer; its weights are inspected by the cells that follow.
        self.ff = nn.Linear(5, 3)

    def forward(self, x):
        """Apply the linear layer to ``x``."""
        return self.ff(x)

In [40]:
net = DQNNet()
print(net)

DQNNet(
  (ff): Linear(in_features=5, out_features=3, bias=True)
)


In [41]:
tgt_net = ptan.agent.TargetNet(net)
print("Main net:", net.ff.weight)
print("Target net:", tgt_net.target_model.ff.weight)

Main net: Parameter containing:
tensor([[ 0.1032, -0.0820, -0.1103,  0.1248,  0.2950],
        [-0.3504, -0.1226, -0.2608, -0.4029,  0.1985],
        [ 0.1090, -0.3073,  0.1705, -0.2395,  0.4092]], requires_grad=True)
Target net: Parameter containing:
tensor([[ 0.1032, -0.0820, -0.1103,  0.1248,  0.2950],
        [-0.3504, -0.1226, -0.2608, -0.4029,  0.1985],
        [ 0.1090, -0.3073,  0.1705, -0.2395,  0.4092]], requires_grad=True)


In [42]:
# Shift the main network's weights in place. The printout shows that the
# target network keeps its old values, i.e. it does not share weights with
# the main network.
# Note: this cell is not idempotent -- every re-run adds another 1.0.
net.ff.weight.data += 1.0
print("After update")
print("Main net:", net.ff.weight)
print("Target net:", tgt_net.target_model.ff.weight)

After update
Main net: Parameter containing:
tensor([[1.1032, 0.9180, 0.8897, 1.1248, 1.2950],
        [0.6496, 0.8774, 0.7392, 0.5971, 1.1985],
        [1.1090, 0.6927, 1.1705, 0.7605, 1.4092]], requires_grad=True)
Target net: Parameter containing:
tensor([[ 0.1032, -0.0820, -0.1103,  0.1248,  0.2950],
        [-0.3504, -0.1226, -0.2608, -0.4029,  0.1985],
        [ 0.1090, -0.3073,  0.1705, -0.2395,  0.4092]], requires_grad=True)


In [43]:
# sync() brings the target network up to date: in the printout below its
# weights match the main network's again.
tgt_net.sync()
print("After sync")
print("Main net:", net.ff.weight)
print("Target net:", tgt_net.target_model.ff.weight)


After sync
Main net: Parameter containing:
tensor([[1.1032, 0.9180, 0.8897, 1.1248, 1.2950],
        [0.6496, 0.8774, 0.7392, 0.5971, 1.1985],
        [1.1090, 0.6927, 1.1705, 0.7605, 1.4092]], requires_grad=True)
Target net: Parameter containing:
tensor([[1.1032, 0.9180, 0.8897, 1.1248, 1.2950],
        [0.6496, 0.8774, 0.7392, 0.5971, 1.1985],
        [1.1090, 0.6927, 1.1705, 0.7605, 1.4092]], requires_grad=True)


-------------

## The PTAN CartPole solver


* https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On-Second-Edition/blob/3ebbd9cab1e936a05a1e8c5b384d552e6819e7a9/Chapter07/06_cartpole.py

--------------

## Other RL libraries

---------------------
참고자료
* [1] Deep Reinforcement Learning Hands-On
  - 책 - https://www.amazon.com/dp/B076H9VQH6/
  - github - https://github.com/PacktPublishing/Deep-Reinforcement-Learning-Hands-On