In [1]:
##################################################
# deque example: a double-ended queue with an optional maximum length
##################################################

# https://docs.python.org/3/library/collections.html#collections.deque

from collections import deque
import random

# a deque can be seeded from any iterable; maxlen caps it at five elements
iterable_object = list(range(5))  # [0, 1, 2, 3, 4]
deq = deque(iterable_object, maxlen=5)

deq

deque([0, 1, 2, 3, 4], maxlen=5)

In [2]:
# add 5 to the right side of the deque
# the deque already holds maxlen=5 elements, so the leftmost one is discarded to make room
deq.append(5)
deq # 0 is removed

deque([1, 2, 3, 4, 5], maxlen=5)

In [3]:
# append 6, 7, 8, 9 one at a time; the deque is full, so every append discards the current leftmost element
for i in range(6, 10):
    deq.append(i)
deq

deque([5, 6, 7, 8, 9], maxlen=5)

In [4]:
# add elements to the deque using extend method
# every item of the list is appended on the right; only the last maxlen=5 elements remain
deq.extend([10, 11, 11, 12])
deq

deque([9, 10, 11, 11, 12], maxlen=5)

In [5]:
# add 0 to the left side of the deque; the deque is full, so the rightmost element (12) is discarded
deq.appendleft(0)
deq

deque([0, 9, 10, 11, 11], maxlen=5)

In [6]:
# extendleft pushes the items onto the left side one at a time,
# so they end up in reverse order (3, 2, 1, 0) at the front of the deque
deq.extendleft([0, 1, 2, 3])
deq

deque([3, 2, 1, 0, 0], maxlen=5)

In [7]:
# reverse the order of the elements in place (no reassignment needed)
deq.reverse()
deq

deque([0, 0, 1, 2, 3], maxlen=5)

In [8]:
# count how many elements are equal to 0
deq.count(0)

2

In [9]:
deq.pop() # remove the rightmost element (3)
deq

deque([0, 0, 1, 2], maxlen=5)

In [10]:
# remove the leftmost element (the first 0)
deq.popleft()
deq

deque([0, 1, 2], maxlen=5)

In [11]:
deq.remove(1) # the first occurrence of the value 1 is removed (1 is a value here, not an index)
deq

deque([0, 2], maxlen=5)

In [12]:
# append five items on the right; together with the two existing elements this exceeds maxlen=5,
# so only the last five elements remain
deq.extend([0, 1, 2, 3, 4])
deq

deque([0, 1, 2, 3, 4], maxlen=5)

In [13]:
# rotate() without an argument shifts every element one step to the right;
# the rightmost element wraps around to the left end
deq.rotate()
deq

deque([4, 0, 1, 2, 3], maxlen=5)

In [14]:
# rotate two steps to the right
deq.rotate(2)
deq

deque([2, 3, 4, 0, 1], maxlen=5)

In [15]:
# two more steps to the right: 1 + 2 + 2 = 5 steps in total on a five-element deque,
# so the elements are back in their original order
deq.rotate(2)
deq

deque([0, 1, 2, 3, 4], maxlen=5)

In [16]:
# a negative argument rotates to the left: the leftmost element wraps around to the right end
deq.rotate(-1)
deq

deque([1, 2, 3, 4, 0], maxlen=5)

In [17]:
# repr() returns the string form of the deque, including its maxlen
repr(deq)

'deque([1, 2, 3, 4, 0], maxlen=5)'

In [18]:
# remove all elements; the deque keeps its maxlen
deq.clear()
deq

deque([], maxlen=5)

In [19]:
############################################
# buffer example: a fixed-size replay buffer built on deque
############################################

from collections import deque
import random

class ReplayBuffer:
    """Fixed-capacity FIFO store of experiences, backed by a ``collections.deque``.

    Once ``max_size`` items are stored, every further insertion on the right
    discards the oldest item on the left.
    """

    def __init__(self, max_size):
        """Create an empty buffer that keeps at most ``max_size`` items."""
        self.buffer = deque([], maxlen=max_size)

    def __len__(self):
        """Return the number of items currently stored."""
        return len(self.buffer)

    def __repr__(self):
        """Return the repr of the underlying deque (contents and maxlen)."""
        return f"{self.buffer!r}"

    def add(self, experience):
        """Store a single experience at the right end of the buffer."""
        self.buffer.append(experience)

    def extend(self, experiences):
        """Store every item of the iterable ``experiences``, in order."""
        self.buffer.extend(experiences)

    def sample(self, batch_size):
        """Return a list of ``batch_size`` items drawn with ``random.sample``.

        Items are drawn without replacement; ``random.sample`` raises
        ``ValueError`` when ``batch_size`` exceeds the number of stored items.
        """
        return random.sample(self.buffer, batch_size)

# fill a buffer of capacity 10 with the integers 0..9, one add() call per item
buffer = ReplayBuffer(max_size=10)
for i in range(10):
    buffer.add(i)
    
# shown through ReplayBuffer.__repr__, i.e. as the repr of the underlying deque
buffer

deque([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], maxlen=10)

In [20]:
# draw one item at random; no random seed is set in the cells shown, so the result differs between runs
buffer.sample(1)

[1]

In [21]:
# draw five distinct items at random (random.sample draws without replacement)
buffer.sample(5)

[6, 5, 9, 3, 7]

In [22]:
# add three more items; the buffer already holds max_size=10 items, so the three oldest (0, 1, 2) are dropped
buffer.extend([10, 11, 12])
buffer

deque([3, 4, 5, 6, 7, 8, 9, 10, 11, 12], maxlen=10)

In [23]:
import gym
from collections import namedtuple
import numpy as np

buffer = ReplayBuffer(max_size=50)
# one record per complete episode; every field holds that episode's values stacked over time
Episode = namedtuple('Episode', field_names=['states', 'actions', 'rewards', 'next_states', 'terminateds'])

num_episodes = 2
env = gym.make('CartPole-v1')

# roll out `num_episodes` episodes with randomly sampled actions and
# store each complete trajectory in the buffer as a single Episode
for i in range(num_episodes):
    state, info = env.reset()
    terminated = False
    truncated = False
    total_reward = 0.0

    states = []
    rewards = []
    actions = []
    next_states = []
    terminateds = []

    # env.step reports the end of an episode through two flags. The loop must stop on
    # either of them: checking only `terminated` keeps stepping the environment after
    # it has signalled `truncated` (e.g. a step limit), which the step API does not allow.
    while not (terminated or truncated):
        action = env.action_space.sample()
        next_state, reward, terminated, truncated, info = env.step(action)

        # record (s, a, r, s', terminated)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        next_states.append(next_state)
        terminateds.append(terminated)

        # prepare next step
        state = next_state
        total_reward += reward

    # convert the per-step lists to arrays so the episode can be indexed by time step
    states = np.array(states)
    actions = np.array(actions)
    rewards = np.array(rewards)
    next_states = np.array(next_states)
    terminateds = np.array(terminateds)

    episode = Episode(states, actions, rewards, next_states, terminateds)
    buffer.add(episode)

buffer

deque([Episode(states=array([[-0.03078078,  0.01060495, -0.03049067,  0.03484377],
       [-0.03056869,  0.20615058, -0.0297938 , -0.2673012 ],
       [-0.02644567,  0.01146623, -0.03513982,  0.01583765],
       [-0.02621635,  0.20707405, -0.03482307, -0.28772193],
       [-0.02207487,  0.01246555, -0.0405775 , -0.00622208],
       [-0.02182556, -0.18205167, -0.04070195,  0.27338707],
       [-0.02546659, -0.37656996, -0.03523421,  0.55295974],
       [-0.03299799, -0.57117987, -0.02417501,  0.8343367 ],
       [-0.04442159, -0.3757361 , -0.00748828,  0.53415   ],
       [-0.05193631, -0.57075197,  0.00319472,  0.824464  ],
       [-0.06335135, -0.37567383,  0.019684  ,  0.5327876 ],
       [-0.07086483, -0.18083419,  0.03033976,  0.24637146],
       [-0.07448151, -0.37637603,  0.03526719,  0.54846776],
       [-0.08200902, -0.1817668 ,  0.04623654,  0.26710182],
       [-0.08564436, -0.37751707,  0.05157858,  0.57400215],
       [-0.09319471, -0.5733228 ,  0.06305862,  0.8824775 ],
  

In [24]:
# discounted reward-to-go for every time step of an episode:
#   R_t = r_t + gamma * r_(t+1) + gamma^2 * r_(t+2) + ...
gamma = 0.95
episodes = buffer.sample(2)
for episode in episodes:
    T = len(episode.states)
    reward_to_goes = [None] * T
    reward_to_go = 0.0
    # walk backwards through the episode using the recursion R_t = r_t + gamma * R_(t+1)
    for i in reversed(range(T)):
        reward = episode.rewards[i]
        reward_to_go = reward + gamma * reward_to_go
        reward_to_goes[i] = reward_to_go
    reward_to_goes = np.array(reward_to_goes)
    print(reward_to_goes)

[13.18876747 12.83028155 12.45292795 12.05571363 11.6375933  11.19746663
 10.7341754  10.24650042  9.73315833  9.19279825  8.62399815  8.02526122
  7.39501181  6.73159137  6.03325408  5.29816219  4.52438125  3.709875
  2.8525      1.95        1.        ]
[14.16021951 13.85286265 13.5293291  13.18876747 12.83028155 12.45292795
 12.05571363 11.6375933  11.19746663 10.7341754  10.24650042  9.73315833
  9.19279825  8.62399815  8.02526122  7.39501181  6.73159137  6.03325408
  5.29816219  4.52438125  3.709875    2.8525      1.95        1.        ]


In [25]:
# the same result as discount_cumsum() in
# https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/ppo/core.py
# import the subpackage explicitly: a bare `import scipy` does not guarantee that
# `scipy.signal` is loaded, in which case `scipy.signal.lfilter` raises AttributeError
import scipy.signal

for episode in episodes:
    # lfilter with b=[1], a=[1, -gamma] computes y[n] = x[n] + gamma * y[n-1];
    # it is run over the time-reversed rewards and the result is flipped back
    reward_to_goes = scipy.signal.lfilter([1], [1, float(-gamma)], episode.rewards[::-1], axis=0)[::-1]
    print(reward_to_goes)

[13.18876747 12.83028155 12.45292795 12.05571363 11.6375933  11.19746663
 10.7341754  10.24650042  9.73315833  9.19279825  8.62399815  8.02526122
  7.39501181  6.73159137  6.03325408  5.29816219  4.52438125  3.709875
  2.8525      1.95        1.        ]
[14.16021951 13.85286265 13.5293291  13.18876747 12.83028155 12.45292795
 12.05571363 11.6375933  11.19746663 10.7341754  10.24650042  9.73315833
  9.19279825  8.62399815  8.02526122  7.39501181  6.73159137  6.03325408
  5.29816219  4.52438125  3.709875    2.8525      1.95        1.        ]
