In [1]:
##################################################
# deque example
#################################################

# https://docs.python.org/3/library/collections.html#collections.deque

from collections import deque
import random

iterable_object = [0, 1, 2, 3, 4]
deq = deque(iterable_object, maxlen=5)

deq

deque([0, 1, 2, 3, 4], maxlen=5)

In [2]:
# add 5 to the right side of the deque
deq.append(5)
deq # 0 is removed

deque([1, 2, 3, 4, 5], maxlen=5)

In [3]:
for i in range(6, 10):
    deq.append(i)
deq

deque([5, 6, 7, 8, 9], maxlen=5)

In [4]:
# add elements to the deque using extend method
deq.extend([10, 11, 11, 12])
deq

deque([9, 10, 11, 11, 12], maxlen=5)

In [5]:
deq.appendleft(0)
deq

deque([0, 9, 10, 11, 11], maxlen=5)

In [6]:
deq.extendleft([0, 1, 2, 3])
deq

deque([3, 2, 1, 0, 0], maxlen=5)

In [7]:
deq.reverse()
deq

deque([0, 0, 1, 2, 3], maxlen=5)

In [8]:
deq.count(0)

2

In [9]:
deq.pop() # 
deq

deque([0, 0, 1, 2], maxlen=5)

In [10]:
deq. popleft()
deq

deque([0, 1, 2], maxlen=5)

In [11]:
deq.remove(1) # the first 1 is removed
deq

deque([0, 2], maxlen=5)

In [12]:
deq.extend([0, 1, 2, 3, 4])
deq

deque([0, 1, 2, 3, 4], maxlen=5)

In [13]:
deq.rotate()
deq

deque([4, 0, 1, 2, 3], maxlen=5)

In [14]:
deq.rotate(2)
deq

deque([2, 3, 4, 0, 1], maxlen=5)

In [15]:
deq.rotate(2)
deq

deque([0, 1, 2, 3, 4], maxlen=5)

In [16]:
deq.rotate(-1)
deq

deque([1, 2, 3, 4, 0], maxlen=5)

In [17]:
repr(deq)

'deque([1, 2, 3, 4, 0], maxlen=5)'

In [18]:
deq.clear()
deq

deque([], maxlen=5)

In [19]:
############################################
# buffer example
#============================================

from collections import deque
import random

class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        self.buffer.append(experience)
    
    def extend(self, experiences):
        self.buffer.extend(experiences)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        return batch

    def __len__(self):
        return len(self.buffer)
    
    def __repr__(self):
        return repr(self.buffer)

buffer = ReplayBuffer(max_size=10)
for i in range(10):
    buffer.add(i)
    
buffer

deque([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], maxlen=10)

In [20]:
buffer.sample(1)

[1]

In [21]:
buffer.sample(5)

[6, 4, 9, 8, 1]

In [22]:
buffer.extend([10, 11, 12])
buffer

deque([3, 4, 5, 6, 7, 8, 9, 10, 11, 12], maxlen=10)

In [23]:
import gym
from collections import namedtuple
import numpy as np

buffer = ReplayBuffer(max_size=50)
Episode = namedtuple('Episode', field_names=['states', 'actions', 'rewards', 'next_states', 'terminateds'])

num_episodes = 2
env = gym.make('CartPole-v1')

for i in range(num_episodes):
    state, info = env.reset()
    terminated = False
    total_reward = 0.0

    states = []
    rewards = []
    actions = []
    next_states = []
    terminateds = []

    while not terminated:
        action = env.action_space.sample()
        next_state, reward, terminated, truncated, info = env.step(action)
        
        # record (s, a, r, s', terminated)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        next_states.append(next_state)
        terminateds.append(terminated)

        # prepare next step
        state = next_state
        total_reward += reward

    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards)
    next_states = np.array(next_states)
    terminateds = np.array(terminateds)

    episode = Episode(states, actions, rewards, next_states, terminateds)
    buffer.add(episode)

buffer

deque([Episode(states=array([[ 9.8505523e-03,  1.5033756e-02,  4.1495539e-02,  4.8688967e-02],
       [ 1.0151227e-02, -1.8065788e-01,  4.2469319e-02,  3.5416993e-01],
       [ 6.5380693e-03, -3.7635717e-01,  4.9552720e-02,  6.5993637e-01],
       [-9.8907389e-04, -1.8195856e-01,  6.2751450e-02,  3.8325897e-01],
       [-4.6282453e-03,  1.2218914e-02,  7.0416629e-02,  1.1100284e-01],
       [-4.3838667e-03,  2.0626488e-01,  7.2636686e-02, -1.5865938e-01],
       [-2.5856926e-04,  4.0027580e-01,  6.9463499e-02, -4.2757198e-01],
       [ 7.7469465e-03,  2.0424232e-01,  6.0912054e-02, -1.1382504e-01],
       [ 1.1831793e-02,  8.3028525e-03,  5.8635555e-02,  1.9743615e-01],
       [ 1.1997850e-02, -1.8760663e-01,  6.2584281e-02,  5.0802433e-01],
       [ 8.2457168e-03,  6.5802755e-03,  7.2744764e-02,  2.3570199e-01],
       [ 8.3773229e-03,  2.0059158e-01,  7.7458806e-02, -3.3177208e-02],
       [ 1.2389154e-02,  4.4492208e-03,  7.6795258e-02,  2.8290427e-01],
       [ 1.2478139e-02,  1.98

In [24]:
reward_to_goes_episodes = []
gamma = 0.95
episodes = buffer.sample(2)
for episode in episodes:
    reward_to_goes = []
    reward_to_go = 0.0
    T = len(episode.states)
    for i in range(T-1, 0, -1):
        reward = episode.rewards[i]
        reward_to_go = reward + reward_to_go * gamma
        reward_to_goes.append(reward_to_go)
    reward_to_goes_episodes.append(np.array(reward_to_goes[::-1]))
reward_to_goes_episodes

[array([13.85286265, 13.5293291 , 13.18876747, 12.83028155, 12.45292795,
        12.05571363, 11.6375933 , 11.19746663, 10.7341754 , 10.24650042,
         9.73315833,  9.19279825,  8.62399815,  8.02526122,  7.39501181,
         6.73159137,  6.03325408,  5.29816219,  4.52438125,  3.709875  ,
         2.8525    ,  1.95      ,  1.        ]),
 array([15.24346229, 14.99311821, 14.72959811, 14.45220854, 14.16021951,
        13.85286265, 13.5293291 , 13.18876747, 12.83028155, 12.45292795,
        12.05571363, 11.6375933 , 11.19746663, 10.7341754 , 10.24650042,
         9.73315833,  9.19279825,  8.62399815,  8.02526122,  7.39501181,
         6.73159137,  6.03325408,  5.29816219,  4.52438125,  3.709875  ,
         2.8525    ,  1.95      ,  1.        ])]