In [1]:
##################################################
# deque example
##################################################

# https://docs.python.org/3/library/collections.html#collections.deque

from collections import deque
import random

# Initial contents for the demo deque.
iterable_object = [0, 1, 2, 3, 4]
# maxlen=5 makes the deque bounded: once it holds 5 items, adding at one end
# discards an item from the opposite end.
deq = deque(iterable_object, maxlen=5)

# Bare expression so the notebook displays the deque.
deq

deque([0, 1, 2, 3, 4], maxlen=5)

In [2]:
# add 5 to the right side of the deque
deq.append(5)
deq  # the deque was already full (maxlen=5), so 0 is dropped from the left end

deque([1, 2, 3, 4, 5], maxlen=5)

In [3]:
# Append 6, 7, 8, 9 one at a time. Each append to the full deque evicts the
# current leftmost element, so only the five most recent values remain.
for i in range(6, 10):
    deq.append(i)
deq

deque([5, 6, 7, 8, 9], maxlen=5)

In [4]:
# add elements to the deque using extend method
# (items are appended on the right in order; of the old contents only 9 survives)
deq.extend([10, 11, 11, 12])
deq

deque([9, 10, 11, 11, 12], maxlen=5)

In [5]:
# Push 0 onto the left end; the deque is full, so the rightmost element (12)
# is discarded.
deq.appendleft(0)
deq

deque([0, 9, 10, 11, 11], maxlen=5)

In [6]:
# extendleft pushes the items onto the left one by one, so they end up in
# reverse order (3, 2, 1, 0). The trailing 0 is the previous leftmost element;
# everything else was pushed out on the right.
deq.extendleft([0, 1, 2, 3])
deq

deque([3, 2, 1, 0, 0], maxlen=5)

In [7]:
# Reverse the deque in place. reverse() returns None, so the deque itself is
# evaluated on the next line to display the result.
deq.reverse()
deq

deque([0, 0, 1, 2, 3], maxlen=5)

In [8]:
# Number of elements equal to 0.
deq.count(0)

2

In [9]:
deq.pop()  # removes and returns the rightmost element (3); the return value is not shown because it is not the cell's last expression
deq

deque([0, 0, 1, 2], maxlen=5)

In [10]:
# Remove the leftmost element (one of the two leading 0s); the returned value
# is discarded because the cell ends by displaying the deque.
deq.popleft()
deq

deque([0, 1, 2], maxlen=5)

In [11]:
# remove(value) deletes the first occurrence of value, searching from the left;
# it raises ValueError when the value is not present.
deq.remove(1) # the first 1 is removed
deq

deque([0, 2], maxlen=5)

In [12]:
# Extending with five items fills the bounded deque completely, so the two
# elements it held before (0 and 2) are pushed out on the left.
deq.extend([0, 1, 2, 3, 4])
deq

deque([0, 1, 2, 3, 4], maxlen=5)

In [13]:
# rotate() with no argument rotates one step to the right: the last element
# moves to the front.
deq.rotate()
deq

deque([4, 0, 1, 2, 3], maxlen=5)

In [14]:
# Rotate two steps to the right: the last two elements wrap around to the front.
deq.rotate(2)
deq

deque([2, 3, 4, 0, 1], maxlen=5)

In [15]:
# Two more steps to the right. The total rotation is now 1 + 2 + 2 = 5, which
# equals the deque's length, so the original order is restored.
deq.rotate(2)
deq

deque([0, 1, 2, 3, 4], maxlen=5)

In [16]:
# A negative argument rotates to the left: the first element moves to the end.
deq.rotate(-1)
deq

deque([1, 2, 3, 4, 0], maxlen=5)

In [17]:
# The repr string lists the contents and the maxlen bound.
repr(deq)

'deque([1, 2, 3, 4, 0], maxlen=5)'

In [18]:
# Remove every element; the maxlen bound is kept.
deq.clear()
deq

deque([], maxlen=5)

In [19]:
##################################################
# replay buffer example (built on a bounded deque)
##################################################

from collections import deque
import random

class ReplayBuffer:
    """Fixed-capacity FIFO store of experiences backed by a bounded ``deque``.

    Once ``max_size`` items are held, every newly added item pushes the
    oldest one out on the left.
    """

    def __init__(self, max_size):
        """Create an empty buffer that keeps at most ``max_size`` items."""
        self.buffer = deque((), maxlen=max_size)

    def add(self, experience):
        """Store a single experience at the newest (right) end."""
        self.buffer.append(experience)

    def extend(self, experiences):
        """Store every item of the iterable ``experiences``, in order."""
        self.buffer.extend(experiences)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` distinct stored items, chosen at random.

        Sampling is without replacement, so ``random.sample`` raises
        ``ValueError`` when ``batch_size`` exceeds the number of stored items.
        """
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        """Number of items currently stored."""
        return len(self.buffer)

    def __repr__(self):
        """Same text as the repr of the underlying deque."""
        return f"{self.buffer!r}"

# Fill a 10-slot buffer exactly to capacity with the integers 0..9.
buffer = ReplayBuffer(max_size=10)
for i in range(10):
    buffer.add(i)
    
# Displayed through ReplayBuffer.__repr__, i.e. the repr of the inner deque.
buffer

deque([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], maxlen=10)

In [20]:
# Draw one stored item at random; the result is a one-element list.
buffer.sample(1)

[7]

In [21]:
# Draw five distinct items. Sampling is without replacement, and no random
# seed is set, so the result differs from run to run.
buffer.sample(5)

[4, 0, 9, 8, 7]

In [22]:
# The buffer is already full (10 items), so adding three more evicts the three
# oldest entries (0, 1, 2).
buffer.extend([10, 11, 12])
buffer

deque([3, 4, 5, 6, 7, 8, 9, 10, 11, 12], maxlen=10)

In [23]:
import gym
from collections import namedtuple
import numpy as np

# Collect complete CartPole episodes under a uniformly random policy and store
# each one as a single Episode record in the replay buffer.
buffer = ReplayBuffer(max_size=50)  # holds up to 50 whole episodes
Episode = namedtuple('Episode', field_names=['states', 'actions', 'rewards', 'next_states', 'terminateds'])

num_episodes = 2
env = gym.make('CartPole-v1')

for i in range(num_episodes):
    state, info = env.reset()
    terminated = False
    truncated = False
    total_reward = 0.0

    # Per-step records for this episode; index t holds transition t.
    states = []
    rewards = []
    actions = []
    next_states = []
    terminateds = []

    # env.step reports two separate end-of-episode flags: `terminated` and
    # `truncated`. The episode is over when EITHER is set; looping on
    # `terminated` alone would keep stepping an environment whose episode was
    # already cut off by truncation.
    while not (terminated or truncated):
        action = env.action_space.sample()
        next_state, reward, terminated, truncated, info = env.step(action)

        # record (s, a, r, s', terminated)
        # Only `terminated` is stored, so a truncated final step is recorded
        # with terminated=False.
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        next_states.append(next_state)
        terminateds.append(terminated)

        # prepare next step
        state = next_state
        total_reward += reward

    # Stack the per-step lists into arrays (one row / entry per time step).
    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards)
    next_states = np.array(next_states)
    terminateds = np.array(terminateds)

    episode = Episode(states, actions, rewards, next_states, terminateds)
    buffer.add(episode)

buffer

deque([Episode(states=array([[-0.00703318, -0.02508628,  0.02987902, -0.02511803],
       [-0.00753491, -0.2206237 ,  0.02937666,  0.2768403 ],
       [-0.01194738, -0.41615218,  0.03491347,  0.57864195],
       [-0.02027043, -0.22153647,  0.04648631,  0.29715878],
       [-0.02470116, -0.4172892 ,  0.05242948,  0.6041328 ],
       [-0.03304694, -0.6131037 ,  0.06451213,  0.91285807],
       [-0.04530901, -0.418911  ,  0.0827693 ,  0.6411284 ],
       [-0.05368723, -0.22503449,  0.09559187,  0.3756156 ],
       [-0.05818792, -0.421375  ,  0.10310417,  0.696843  ],
       [-0.06661543, -0.6177642 ,  0.11704104,  1.0201213 ],
       [-0.07897071, -0.8142348 ,  0.13744347,  1.3471426 ],
       [-0.0952554 , -0.621082  ,  0.16438632,  1.100427  ],
       [-0.10767704, -0.8179402 ,  0.18639486,  1.4398457 ]],
      dtype=float32), actions=array([0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0]), rewards=array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]), next_states=array([[-0.00753491, -0.2

In [24]:
# Discounted reward-to-go for every time step of two sampled episodes:
#   G_t = r_t + gamma * G_{t+1},  with G_T = 0 past the last step.
gamma = 0.95
episodes = buffer.sample(2)
reward_to_goes_episodes = []
for episode in episodes:
    T = len(episode.states)
    reward_to_go = 0.0
    reward_to_goes = []
    # Walk the episode backwards so each step reuses the value of its successor.
    for i in reversed(range(T)):
        reward = episode.rewards[i]
        reward_to_go = reward + gamma * reward_to_go
        reward_to_goes.append(reward_to_go)
    # Values were accumulated last-step-first; flip them into time order.
    reward_to_goes_episodes.append(np.array(list(reversed(reward_to_goes))))
reward_to_goes_episodes

[array([9.73315833, 9.19279825, 8.62399815, 8.02526122, 7.39501181,
        6.73159137, 6.03325408, 5.29816219, 4.52438125, 3.709875  ,
        2.8525    , 1.95      , 1.        ]),
 array([10.24650042,  9.73315833,  9.19279825,  8.62399815,  8.02526122,
         7.39501181,  6.73159137,  6.03325408,  5.29816219,  4.52438125,
         3.709875  ,  2.8525    ,  1.95      ,  1.        ])]