In [1]:
import ptan
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import gym

# Action selectors

## ArgMax

In [2]:
# Two rows of Q-values: one row per batch item, one column per action.
q_vals = np.array([
    [1, 2, 3],
    [1, -1, 0],
])
q_vals

array([[ 1,  2,  3],
       [ 1, -1,  0]])

In [3]:
# Greedy selector: for every row, returns the index of the largest value.
selector = ptan.actions.ArgmaxActionSelector()
selector(q_vals)

array([2, 0])

In [4]:
# Look up the Q-value that belongs to the action selected in each row.
np.take_along_axis(
    q_vals,
    selector(q_vals)[:, np.newaxis],
    axis=1,
).flatten()

array([3, 1])

## EpsilonGreedy

In [5]:
# With epsilon=0.0 no random action is taken, so the result equals the plain
# argmax selection shown above.
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.0)
selector(q_vals)

array([2, 0])

In [6]:
# With epsilon=0.5 an action may be replaced by a random one, so the output
# can vary from run to run.
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=0.5)
selector(q_vals)

array([2, 0])

## Probability selection

In [7]:
# Every row of `acts` is a probability distribution over three actions; the
# selector samples one action index per row on each call.
selector = ptan.actions.ProbabilityActionSelector()
acts = np.array([
    [.125, .75, .125],
    [0., 0., 1.],
    [.5, .5, 0.],
])

for _ in range(10):
    print(selector(acts))

[1 2 1]
[1 2 0]
[1 2 0]
[0 2 0]
[1 2 1]
[1 2 0]
[1 2 1]
[1 2 1]
[0 2 1]
[1 2 1]


# Agent

## DQN Agent

### Simple Model

In [8]:
class DQNNET(nn.Module):
    """Stand-in "Q-network" that ignores the values of its input.

    For a batch of ``B`` rows it returns a ``B x n_actions`` matrix with ones
    on the main diagonal and zeros elsewhere.
    """

    def __init__(self, n_actions):
        super().__init__()
        self.n_actions = n_actions

    def forward(self, x):
        # Only the batch dimension of the input is used.
        batch_size = x.shape[0]
        return torch.eye(batch_size, self.n_actions)

In [9]:
# Two input rows -> the first two rows of a 3-column identity matrix.
net = DQNNET(n_actions=3)
net(torch.zeros(2,10))

tensor([[1., 0., 0.],
        [0., 1., 0.]])

### Argmax selector

In [10]:
# Combine the network with an argmax selector: the agent feeds observations
# through `net` and lets the selector turn the outputs into actions.
selector = ptan.actions.ArgmaxActionSelector()
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector)

In [11]:
# The agent returns a tuple: (one action per batch item, one internal state
# per batch item); the internal states are None here.

agent(torch.zeros(2,5))

(array([0, 1]), [None, None])

### Epsilon-Greedy

In [12]:
# Same network, but actions are now chosen epsilon-greedily (epsilon = 0.5),
# so the returned actions can differ between runs.
selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=.5)
agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector)

agent(torch.zeros(2,5))

(array([0, 2]), [None, None])

## Policy Agent

In [13]:
class PolicyNet(nn.Module):
    """Stand-in policy network with fixed outputs.

    Every output row holds 1.0 in its first two columns and 0.0 in all
    remaining columns, regardless of the input values.
    """

    def __init__(self, n_actions):
        super().__init__()
        self.n_actions = n_actions

    def forward(self, x):
        # Only the batch dimension of the input is used.
        batch_size = x.shape[0]
        logits = torch.zeros(batch_size, self.n_actions, dtype=torch.float32)
        logits[:, :2] = 1.
        return logits

In [14]:
# Six input rows, five actions: each row scores the first two actions with 1.
net = PolicyNet(5)
net(torch.zeros(6,10))

tensor([[1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.],
        [1., 1., 0., 0., 0.]])

In [15]:
# apply_softmax=True asks the agent to turn the raw network outputs into
# probabilities before sampling -- presumably why an action scored 0 by the
# network can still be drawn occasionally.
selector = ptan.actions.ProbabilityActionSelector()
agent = ptan.agent.PolicyAgent(model=net, action_selector=selector, apply_softmax=True)

# [0] keeps only the sampled actions and drops the agent states.
agent(torch.zeros(6,5))[0]

array([1, 1, 0, 1, 1, 4])

# Experience Source

In [16]:
class ToyEnv(gym.Env):
    """Deterministic toy environment with a fixed episode length.

    * Observation: the step counter modulo the observation-space size (5).
    * Reward: the chosen action converted to ``float``.
    * Episode end: after ``max_steps`` calls to :meth:`step`.
    """

    def __init__(self, max_steps=10):
        """Create the environment.

        Args:
            max_steps: number of steps after which an episode is done
                (defaults to 10).
        """
        super().__init__()
        self.observation_space = gym.spaces.Discrete(n=5)
        self.action_space = gym.spaces.Discrete(n=3)
        self.max_steps = max_steps
        self.step_index = 0

    def reset(self):
        """Restart the episode and return the initial observation (0)."""
        self.step_index = 0
        return self.step_index

    def step(self, action):
        """Advance the environment by one step.

        Returns:
            A tuple ``(observation, reward, done, info)``. Once the episode is
            over, further calls leave the step counter unchanged and return
            reward 0.0 with ``done=True``.
        """
        n_obs = self.observation_space.n

        # Episode already finished: report "done" again without advancing.
        if self.step_index >= self.max_steps:
            return self.step_index % n_obs, 0.0, True, {}

        self.step_index += 1
        done = self.step_index >= self.max_steps
        return self.step_index % n_obs, float(action), done, {}

In [17]:
# A fresh environment starts at step 0: one step with action 2 yields
# observation 1, reward 2.0 and done=False.
env = ToyEnv()
env.step(2)

(1, 2.0, False, {})

In [18]:
class DullAgent(ptan.agent.BaseAgent):
    """Agent that answers every observation with the same fixed action."""

    def __init__(self, action):
        super().__init__()
        self.action = action

    def __call__(self, observations, state):
        # One copy of the fixed action per observation; the agent state is
        # handed back unchanged.
        actions = []
        for _ in observations:
            actions.append(self.action)
        return actions, state

## ExperienceSource

In [19]:
# Each item yielded by the source is a tuple of `steps_count` consecutive
# transitions produced by running `agent` in `env`.
env = ToyEnv()
agent = DullAgent(action=1)
exp_source = ptan.experience.ExperienceSource(env=env, agent=agent, steps_count=2)

# The source is iterated without an explicit end, so stop manually after
# printing the first 13 items.
for idx, exp in enumerate(exp_source):
    if idx>12: break
    print(exp)

(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=False))
(Experience(state=4, action=1, reward=1.0, done=False), Experience(state=0, action=1, reward=1.0, done=False))
(Experience(state=0, action=1, reward=1.0, done=False), Experience(state=1, action=1, reward=1.0, done=False))
(Experience(state=1, action=1, reward=1.0, done=False), Experience(state=2, action=1, reward=1.0, done=False))
(Experience(state=2, action=1, reward=1.0, done=False), Experience(state=3, action=1, reward=1.0, done=False))
(Experience(state=3, action=1, reward=1.0, done=False), Experience(state=4, action=1, reward=1.0, done=True))
(E

## ExperienceSourceFirstLast

Take n steps and record only the first state and action and the last state, together with the (discounted) reward accumulated over those steps.

In [20]:
# Reuses `env` and `agent` from the cells above. With gamma=1. the rewards of
# the two steps are added up without discounting.
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=1., steps_count=2)

In [21]:
# The item is printed before the counter is checked, so 12 items are shown
# (e = 0..11). In the output, last_state is None for the entries at the end
# of an episode.
for e,exp in enumerate(exp_source):
    print(exp)
    if e>10: break

ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)
ExperienceFirstLast(state=1, action=1, reward=2.0, last_state=3)
ExperienceFirstLast(state=2, action=1, reward=2.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=2.0, last_state=0)
ExperienceFirstLast(state=4, action=1, reward=2.0, last_state=1)
ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)
ExperienceFirstLast(state=1, action=1, reward=2.0, last_state=3)
ExperienceFirstLast(state=2, action=1, reward=2.0, last_state=4)
ExperienceFirstLast(state=3, action=1, reward=2.0, last_state=None)
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=None)
ExperienceFirstLast(state=0, action=1, reward=2.0, last_state=2)
ExperienceFirstLast(state=1, action=1, reward=2.0, last_state=3)


In [22]:
# IPython-only introspection syntax: the trailing double question mark shows
# the source of the method. Interactive aid only; not valid plain Python.
exp_source.pop_rewards_steps??

## Experience replay buffers

In [23]:
# Fresh environment and agent; steps_count=1 makes every experience item
# cover a single transition.
env = ToyEnv()
agent = DullAgent(action=1)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=1., steps_count=1)

In [24]:
# The buffer starts out empty; entries are added by calling buffer.populate().
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=100)
len(buffer)

0

Normal training loop now becomes:

1. buffer.populate(1)
2. batch = buffer.sample(BATCH_SIZE)
3. loss + backprop

In [25]:
for step in range(6):
    # Pull one new experience item from the source into the buffer.
    buffer.populate(1)

    # Sample only once the buffer holds at least five entries.
    if len(buffer) >= 5:
        batch = buffer.sample(4)
        print('\n'.join(map(str, batch)))
        print('--------------------------')

ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=2, action=1, reward=1.0, last_state=3)
--------------------------
ExperienceFirstLast(state=4, action=1, reward=1.0, last_state=0)
ExperienceFirstLast(state=0, action=1, reward=1.0, last_state=1)
ExperienceFirstLast(state=3, action=1, reward=1.0, last_state=4)
ExperienceFirstLast(state=1, action=1, reward=1.0, last_state=2)
--------------------------


# TargetNet

Syncing of two neural nets, either by copying or by linear interpolation of the weights.

In [26]:
class DQNNet(nn.Module):
    """Single linear layer mapping 5 input features to 3 outputs."""

    def __init__(self):
        super().__init__()
        self.ff = nn.Linear(in_features=5, out_features=3)

    def forward(self, x):
        out = self.ff(x)
        return out

In [27]:
# Instantiate the network; the last expression displays its layer structure.
net = DQNNet()
net

DQNNet(
  (ff): Linear(in_features=5, out_features=3, bias=True)
)

In [28]:
# Wrap `net`; the wrapper exposes its own copy of the network as
# `tgt_net.target_model` (used in the cells below).
tgt_net = ptan.agent.TargetNet(net)

In [29]:
# Right after creation both networks hold the same weight value.
net.ff.weight[0][0], tgt_net.target_model.ff.weight[0][0]

(tensor(-0.0637, grad_fn=<SelectBackward>),
 tensor(-0.0637, grad_fn=<SelectBackward>))

In [30]:
# Change one weight of the online network only, to show that the target
# network keeps its old value. The update runs under torch.no_grad() because
# autograd rejects in-place modification of (a view of) a leaf parameter that
# requires grad.
with torch.no_grad():
    net.ff.weight[0][0] += 1
net.ff.weight[0][0], tgt_net.target_model.ff.weight[0][0]

(tensor(0.9363, grad_fn=<SelectBackward>),
 tensor(-0.0637, grad_fn=<SelectBackward>))

In [31]:
# After sync() the target network shows the same weight as `net` again.
tgt_net.sync()
net.ff.weight[0][0], tgt_net.target_model.ff.weight[0][0]

(tensor(0.9363, grad_fn=<SelectBackward>),
 tensor(0.9363, grad_fn=<SelectBackward>))

# CartPole Example

In [32]:
# Create the environment and read the sizes needed to build the network:
# length of the observation vector and number of discrete actions.
env = gym.make("CartPole-v0")
n_observations = env.observation_space.shape[0]
n_actions = env.action_space.n

In [33]:
class Net(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear."""

    def __init__(self, n_input, n_output, n_hidden):
        super().__init__()
        hidden = nn.Linear(n_input, n_hidden)
        head = nn.Linear(n_hidden, n_output)
        self.layers = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        # Cast to float32 first so inputs of other dtypes are accepted.
        features = x.float()
        return self.layers(features)

In [34]:
# Online network (128 hidden units) plus a target-network wrapper around it.
net = Net(n_observations, n_actions, 128)
target_net = ptan.agent.TargetNet(net)

# Optional smoke test (disabled): feed one sampled observation through the net.
#net(torch.tensor(env.observation_space.sample()).unsqueeze(0))

In [35]:
# Epsilon-greedy exploration on top of argmax selection; epsilon starts at 1.
# and is lowered by the training loop.
selector = ptan.actions.EpsilonGreedyActionSelector(
    epsilon=1.,
    selector=ptan.actions.ArgmaxActionSelector(),
)

agent = ptan.agent.DQNAgent(dqn_model=net, action_selector=selector)
exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=.9)
buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=1000)
opt = optim.Adam(net.parameters(), lr=1e-3)

In [36]:
@torch.no_grad()
def unpack_batch(batch, net, gamma):
    """Turn a batch of first/last experience items into training tensors.

    Args:
        batch: non-empty sequence of 4-tuples
            ``(state, action, reward, last_state)``; ``last_state`` is ``None``
            when the episode ended within the item.
        net: callable mapping a batch of states to per-action Q-values; it is
            used to bootstrap the value of the last states.
        gamma: discount factor applied to the bootstrapped value.

    Returns:
        Tuple ``(states, actions, target_qs)`` of tensors, where
        ``target_qs = reward + gamma * max_a Q(last_state, a)`` and the
        bootstrapped term is zero for finished episodes.
    """
    states, actions, rewards, last_states = zip(*batch)

    # Stack through NumPy first: building a tensor directly from a sequence
    # of ndarrays is very slow.
    states_np = np.asarray(states)
    states_t = torch.as_tensor(states_np)
    # Explicit dtypes: int64 indices for gather(), and float32 rewards so the
    # targets match the float32 network output even for float64 rewards.
    actions_t = torch.tensor(actions, dtype=torch.int64)
    rewards_t = torch.tensor(rewards, dtype=torch.float32)

    done_mask = torch.tensor([s is None for s in last_states], dtype=torch.bool)
    # Finished episodes have no last state; substitute a placeholder of the
    # right shape. Its Q-value is zeroed out below, so the choice is arbitrary.
    placeholder = states_np[0]
    last_states_np = np.asarray(
        [placeholder if s is None else s for s in last_states]
    )
    last_states_t = torch.as_tensor(last_states_np)

    next_state_qs = net(last_states_t).max(dim=1)[0]
    next_state_qs[done_mask] = 0.0

    return states_t, actions_t, rewards_t + gamma * next_state_qs

In [37]:
# Hyper-parameters of the training loop.
BATCH_SIZE = 16
GAMMA = .9              # discount for the targets; keep equal to the gamma
                        # passed to ExperienceSourceFirstLast
EPSILON_DECAY = .99     # multiplicative epsilon decay after every update
TARGET_SYNC_EVERY = 10  # sync the target network every N loop iterations
SOLVED_REWARD = 150     # an episode reward above this value ends training

solved = False
episode = 0
step = 0

while True:
    step += 1
    buffer.populate(1)
    # Wait until the buffer holds two batches' worth of transitions.
    if len(buffer) < 2 * BATCH_SIZE:
        continue

    # Report the episodes that finished since the previous call.
    for reward, episode_steps in exp_source.pop_rewards_steps():
        episode += 1
        print(f'Episode: {episode} | Reward: {reward} | Epsilon: {selector.epsilon}')
        if reward > SOLVED_REWARD:
            print('Solved.')
            solved = True

    if solved:
        break

    # One optimisation step on a sampled batch; targets come from the
    # target network, predictions from the online network.
    batch = buffer.sample(BATCH_SIZE)
    states, actions, target_qs = unpack_batch(batch, target_net.target_model, GAMMA)

    opt.zero_grad()

    # Q-value of the action that was actually taken in each state.
    actual_qs = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
    loss = F.mse_loss(actual_qs, target_qs)
    loss.backward()

    opt.step()
    selector.epsilon *= EPSILON_DECAY

    if step % TARGET_SYNC_EVERY == 0:
        target_net.sync()
    

Episode: 1 | Reward: 37.0 | Epsilon: 0.9414801494009999
Episode: 2 | Reward: 19.0 | Epsilon: 0.7778213593991465
Episode: 3 | Reward: 20.0 | Epsilon: 0.6361854860638709
Episode: 4 | Reward: 15.0 | Epsilon: 0.5471566423907612
Episode: 5 | Reward: 15.0 | Epsilon: 0.47058664158564995
Episode: 6 | Reward: 10.0 | Epsilon: 0.42559012338865465
Episode: 7 | Reward: 11.0 | Epsilon: 0.38104711810454983
Episode: 8 | Reward: 11.0 | Epsilon: 0.34116606151404244
Episode: 9 | Reward: 13.0 | Epsilon: 0.29938039131233124
Episode: 10 | Reward: 10.0 | Epsilon: 0.270754259511994
Episode: 11 | Reward: 12.0 | Epsilon: 0.23999247958413436
Episode: 12 | Reward: 11.0 | Epsilon: 0.21487444770607952
Episode: 13 | Reward: 10.0 | Epsilon: 0.19432859888279505
Episode: 14 | Reward: 10.0 | Epsilon: 0.17574730149117582
Episode: 15 | Reward: 9.0 | Epsilon: 0.1605481911108965
Episode: 16 | Reward: 9.0 | Epsilon: 0.1466635416321037
Episode: 17 | Reward: 10.0 | Epsilon: 0.13263987810938213
Episode: 18 | Reward: 10.0 | Epsi