<a href="https://colab.research.google.com/github/mohantyk/deep-rl/blob/master/Chap7_PTAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install ptan

Collecting ptan
  Using cached https://files.pythonhosted.org/packages/91/cb/57f6d86625f2b24c008b0524ca29559683aa75d00afa38b6b44d7fcad25b/ptan-0.6.tar.gz
Collecting torch==1.3.0
[?25l  Downloading https://files.pythonhosted.org/packages/ae/05/50a05de5337f7a924bb8bd70c6936230642233e424d6a9747ef1cfbde353/torch-1.3.0-cp36-cp36m-manylinux1_x86_64.whl (773.1MB)
[K     |████████████████████████████████| 773.1MB 21kB/s 
Building wheels for collected packages: ptan
  Building wheel for ptan (setup.py) ... [?25l[?25hdone
  Created wheel for ptan: filename=ptan-0.6-cp36-none-any.whl size=23502 sha256=9cbdf8b4a5a0f77c08953a4e4e50b61667f558c45ab4f56523ce76c068bfd789
  Stored in directory: /root/.cache/pip/wheels/f0/4b/2f/9a45fd39b0a614a2716bc6128a7f1adb4647f323a2d90783f2
Successfully built ptan
[31mERROR: torchvision 0.7.0+cu101 has requirement torch==1.6.0, but you'll have torch 1.3.0 which is incompatible.[0m
Installing collected packages: torch, ptan
  Found existing installation: torch 1

In [4]:
import gym
import ptan
import torch
import torch.optim as optim
import numpy as np

## Action Selectors

In [5]:
# Two batch rows (dim 0 is the batch), three action values per row.
q_vals = np.array([
    [1, 2, 3],
    [1, -1, 0],
])
selector = ptan.actions.ArgmaxActionSelector()
# Yields, for every row, the index of the action with the largest value.
selector(q_vals)

array([2, 0])

In [6]:
# Epsilon is set to its maximum here; the recorded output no longer matches
# the argmax result for the same q_vals.
epsgreedy = ptan.actions.EpsilonGreedyActionSelector(epsilon=1)
epsgreedy(q_vals)

array([0, 1])

In [7]:
prob_selector = ptan.actions.ProbabilityActionSelector()
# One row per batch item; every row is a probability distribution over three
# actions. The array never changes, so it is built once, outside the loop.
q_val_distrib = np.array([
    [0.1, 0.8, 0.1],
    [0.0, 0.0, 1.0],
    [0.5, 0.5, 0.0],
])
# Sample repeatedly to show that the chosen actions vary from draw to draw.
for _ in range(10):
  acts = prob_selector(q_val_distrib)
  print(acts)

[1 2 1]
[1 2 0]
[2 2 0]
[1 2 0]
[2 2 1]
[1 2 1]
[1 2 0]
[1 2 1]
[2 2 0]
[1 2 1]


# Agent

## DQNAgent

In [8]:
class DQNNet(torch.nn.Module):
  """Stub Q-network used to exercise agents without any training.

  The content of the input is ignored; only its first dimension (the batch
  size) matters. The output has shape (batch, actions) with ones on the main
  diagonal and zeros elsewhere.
  """

  def __init__(self, actions: int):
    """Remember how many action values to emit per batch row."""
    super().__init__()
    self.actions = actions

  def forward(self, x):
    """Return a (batch, actions) identity-like matrix for input `x`."""
    n_rows = x.shape[0]
    n_cols = self.actions
    return torch.eye(n_rows, n_cols)

In [13]:
# Batch of two observations with eight features each; the stub network only
# looks at the batch size.
net = DQNNet(actions=3)
obsv = torch.zeros((2, 8))
q_vals = net(obsv)
print(q_vals)

tensor([[1., 0., 0.],
        [0., 1., 0.]])


In [14]:
selector = ptan.actions.ArgmaxActionSelector()
agent = ptan.agent.DQNAgent(
    dqn_model=net,
    action_selector=selector,
)
# The call returns a tuple: (best actions, list of the agent's internal states).
agent(obsv)

(array([0, 1]), [None, None])

In [15]:
# Same network, but actions are now chosen by an epsilon-greedy selector with
# epsilon at 1.0.
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=1.0)
agent = ptan.agent.DQNAgent(
    dqn_model=net,
    action_selector=selector,
)
agent(obsv)

(array([2, 2]), [None, None])

In [16]:
# The agent keeps using the same selector object, so changing epsilon here
# affects its very next call (the recorded output is the greedy choice again).
selector.epsilon = 0.0
agent(obsv)

(array([0, 1]), [None, None])

## PolicyAgent

In [19]:
class PolicyNet(torch.nn.Module):
  """Stub policy network that emits the same fixed logits for every input.

  The content of the input is ignored; only its first dimension (the batch
  size) is used. Each output row holds a logit of 1 for the first two actions
  and 0 for the rest.
  """

  def __init__(self, actions: int):
    """Remember how many logits to emit per batch row."""
    super().__init__()
    self.actions = actions

  def forward(self, x):
    """Return a (batch, actions) tensor of logits for input `x`."""
    batches = x.shape[0]
    res = torch.zeros(batches, self.actions)
    # Make logits of the first two actions equal. Slice assignment is used
    # instead of indexing columns 0 and 1 separately so that a network with
    # fewer than two actions does not raise IndexError.
    res[:, :2] = 1
    return res

In [20]:
# Six observations, ten features each; five possible actions.
net = PolicyNet(actions=5)
obsv = torch.zeros((6, 10))
# Despite the variable name, a PolicyNet produces logits rather than Q-values.
q_vals = net(obsv)
print(q_vals)

tensor([[1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.]])


In [22]:
selector = ptan.actions.ProbabilityActionSelector()
agent = ptan.agent.PolicyAgent(
    model=net,
    action_selector=selector,
    apply_softmax=True,
)
# Softmax produces non-zero probabilities for zero logits, so actions 2-4 can
# still be sampled.
agent(obsv)

(array([1, 0, 1, 4, 2, 4]), [None, None, None, None, None, None])

# Experience

## Experience Source

In [27]:
from typing import List, Optional, Any, Tuple

In [23]:
class ToyEnv(gym.Env):
  """Deterministic toy environment for inspecting experience sources.

  Observations are the step counter modulo 5, actions are one of 3 discrete
  values, the reward equals the action taken (as a float), and the episode
  ends once the counter reaches 10.
  """

  def __init__(self):
    super().__init__()
    self.observation_space = gym.spaces.Discrete(5)
    self.action_space = gym.spaces.Discrete(3)
    self.step_index = 0

  def reset(self):
    """Rewind the step counter and return it as the initial observation."""
    self.step_index = 0
    return self.step_index

  def step(self, action):
    """Advance one step; returns (observation, reward, done, info)."""
    n_obs = self.observation_space.n
    if self.step_index == 10:
      # Episode is already over: do not advance, give no reward.
      return self.step_index % n_obs, 0.0, True, {}
    self.step_index += 1
    observation = self.step_index % n_obs
    reward = float(action)
    finished = self.step_index == 10
    return observation, reward, finished, {}

In [29]:
class DullAgent(ptan.agent.BaseAgent):
  """Agent that answers every observation with the same fixed action."""

  def __init__(self, action):
    self.action = action

  def __call__(
      self,
      observation: List[Any],
      state: Optional[List] = None,
  ) -> Tuple[List[int], Optional[List]]:
    """Return one copy of the fixed action per observation, plus `state` untouched."""
    actions = []
    for _ in observation:
      actions.append(self.action)
    return actions, state

In [32]:
env = ToyEnv()
agent = DullAgent(action=1)
exp_source = ptan.experience.ExperienceSource(env, agent, steps_count=3)

# Print the first 16 items; each one is a tuple of `steps_count` consecutive
# Experience entries.
for idx, exp in enumerate(exp_source):
  if idx >= 16:
    break
  print(exp)

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False))
(Experience(state=4, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Exp

In [35]:
agent = DullAgent(action=1)
# The source is given its own pair of fresh environments; no separate `env`
# variable is needed for this cell.
exp_source = ptan.experience.ExperienceSource([ToyEnv(), ToyEnv()], agent, steps_count=2)

# Print the first 11 items. In the recorded output every trajectory shows up
# twice in a row, once per environment.
for idx, exp in enumerate(exp_source):
  if idx > 10:
    break
  print(exp)

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False))
(Experience(state=4, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False))
(

## ExperienceSourceFirstLast

In [38]:
env = ToyEnv()
agent = DullAgent(action=1)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env,
    agent,
    gamma=1.0,
    steps_count=2,
)

# Print the first 11 items. Each one condenses a 2-step window into a single
# record: first state, first action, summed reward and the state after the
# window (shown as None near the end of an episode in the recorded output).
for idx, exp in enumerate(exp_source):
  if idx >= 11:
    break
  print(exp)

ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)
ExperienceFirstLast(state=1, action=1, reward=2.0, last_state=3)
ExperienceFirstLast(state=2, action=1, reward=2.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=2.0, last_state=0)
ExperienceFirstLast(state=4, action=1, reward=2.0, last_state=1)
ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)
ExperienceFirstLast(state=1, action=1, reward=2.0, last_state=3)
ExperienceFirstLast(state=2, action=1, reward=2.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=2.0, last_state=None)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=None)
ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)


## Replay Buffers

In [43]:
env = ToyEnv()
agent = DullAgent(action=1)
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env,
    agent,
    gamma=1.0,
    steps_count=1,
)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=100)
# Nothing has been pulled from the source yet, so the buffer starts out empty.
len(buffer)

0

In [44]:
for step in range(6):
  # Add one more experience to the buffer on every iteration.
  buffer.populate(1)
  # Only sample once the buffer holds at least five entries.
  if len(buffer) >= 5:
    batch = buffer.sample(4)
    print(f'{len(batch)} samples')
    for exp in batch:
      print(exp)

4 samples
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
4 samples
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)


# TargetNet

In [45]:
class DQNNet(torch.nn.Module):
  """Minimal network with real parameters: one linear layer, 5 inputs -> 3 outputs.

  Note: this definition replaces the stub class of the same name defined
  earlier in the notebook.
  """

  def __init__(self):
    super().__init__()
    self.ff = torch.nn.Linear(in_features=5, out_features=3)

  def forward(self, x):
    """Apply the linear layer to `x` and return the result."""
    out = self.ff(x)
    return out


In [48]:
net = DQNNet()
# Wrap the online network; the wrapper exposes its own `target_model`.
tgt_net = ptan.agent.TargetNet(net)
print(net)

DQNNet(
  (ff): Linear(in_features=5, out_features=3, bias=True)
)


In [49]:
# Weights of the online network before any modification.
net.ff.weight

Parameter containing:
tensor([[ 0.0735, -0.1188, -0.4290, -0.1933, -0.4243],
        [ 0.0902, -0.1623,  0.0039, -0.3567, -0.0139],
        [-0.0377,  0.2571, -0.4437, -0.0462,  0.3738]], requires_grad=True)

In [50]:
# Weights held by the target model; the recorded output matches the online network.
tgt_net.target_model.ff.weight

Parameter containing:
tensor([[ 0.0735, -0.1188, -0.4290, -0.1933, -0.4243],
        [ 0.0902, -0.1623,  0.0039, -0.3567, -0.0139],
        [-0.0377,  0.2571, -0.4437, -0.0462,  0.3738]], requires_grad=True)

In [52]:
# Shift every weight of the online network by one, in place.
# Re-running this cell shifts the weights again.
net.ff.weight.data.add_(1.0)
net.ff.weight

Parameter containing:
tensor([[1.0735, 0.8812, 0.5710, 0.8067, 0.5757],
        [1.0902, 0.8377, 1.0039, 0.6433, 0.9861],
        [0.9623, 1.2571, 0.5563, 0.9538, 1.3738]], requires_grad=True)

In [53]:
# The target model still shows the old values: changing `net` did not touch it.
tgt_net.target_model.ff.weight

Parameter containing:
tensor([[ 0.0735, -0.1188, -0.4290, -0.1933, -0.4243],
        [ 0.0902, -0.1623,  0.0039, -0.3567, -0.0139],
        [-0.0377,  0.2571, -0.4437, -0.0462,  0.3738]], requires_grad=True)

In [56]:
# After sync() the target model's weights match the online network's current values.
tgt_net.sync()
tgt_net.target_model.ff.weight

Parameter containing:
tensor([[1.0735, 0.8812, 0.5710, 0.8067, 0.5757],
        [1.0902, 0.8377, 1.0039, 0.6433, 0.9861],
        [0.9623, 1.2571, 0.5563, 0.9538, 1.3738]], requires_grad=True)