In [6]:
import os

# Choose the OpenGL backend used for MuJoCo rendering. 'glfw' opens an
# on-screen window. The variable is set before gymnasium is imported so it is
# already in place when the rendering code gets loaded. Other backends (e.g.
# 'egl' or 'osmesa' for headless Linux machines) exist -- check the MuJoCo
# documentation for the values valid on your platform.
os.environ['MUJOCO_GL'] = 'glfw'

# Import and use MuJoCo environments
import gymnasium as gym

# Number of random actions to play in the demo.
N_DEMO_STEPS = 1000

# With render_mode='human' the environment draws itself during reset()/step(),
# so no explicit env.render() calls are needed.
env = gym.make('Humanoid-v5', render_mode='human')
try:
    env.reset()

    # Play random actions. Once an episode has terminated or been truncated the
    # environment must be reset before it is stepped again.
    for _ in range(N_DEMO_STEPS):
        _obs, _reward, episode_terminated, episode_truncated, _info = env.step(
            env.action_space.sample()
        )
        if episode_terminated or episode_truncated:
            env.reset()
finally:
    # Always release the viewer window, even if the demo is interrupted.
    env.close()

In [None]:
import time

import gymnasium as gym
import ale_py

# Make the ALE (Atari) environments known to gymnasium.
gym.register_envs(ale_py)

# Smoke test: create the environment without a window, take a single random
# step, and shut it down again.
env = gym.make('ALE/Breakout-v5')
obs, info = env.reset()
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
env.close()

# Render the environment in a window while playing random actions.
N_TICKS = 100          # number of paced iterations
STEPS_PER_TICK = 2     # environment steps taken between two pauses
TICK_SECONDS = 0.1     # pause after each tick so the game is watchable

# With render_mode='human' the environment is displayed as it is stepped, so
# no explicit env.render() calls are needed.
env = gym.make('ALE/Breakout-v5', render_mode='human')
try:
    env.reset()

    for _ in range(N_TICKS):
        for _ in range(STEPS_PER_TICK):
            _obs, _reward, episode_terminated, episode_truncated, _info = env.step(
                env.action_space.sample()
            )
            # A finished episode must be reset before it is stepped again.
            if episode_terminated or episode_truncated:
                env.reset()
        time.sleep(TICK_SECONDS)
finally:
    # Always close the game window, even if the demo is interrupted.
    env.close()

\begin{align*}
q_t^{\lambda} = (1 - \lambda) \sum_{n = 1}^{\infty} \lambda^{n - 1} q_t^{(n)}
\end{align*}

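Consider the weights $w(n) = (1 - \lambda)\lambda^{n-1}$ that the $\lambda$-return assigns to the $n$-step returns $q_t^{(n)}$. Their partial sum over the first $N$ terms can be evaluated by subtracting $\lambda$ times the sum from itself, which makes the series telescope (the last step divides by $1 - \lambda$, so it assumes $\lambda \neq 1$):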

\begin{align*}
\sum_{n = 1}^N w(n) &= \sum_{n = 1}^N (1 - \lambda)\lambda^{n-1} \\
\sum_{n = 1}^N w(n) - \lambda \sum_{n = 1}^N w(n) &= (1 - \lambda) \sum_{n = 1}^N \lambda^{n-1} - (1 - \lambda) \lambda \sum_{n = 1}^N \lambda^{n-1} \\
(1 - \lambda) \sum_{n = 1}^N w(n) &= (1 - \lambda) \sum_{n = 1}^N \lambda^{n-1} - (1 - \lambda) \sum_{n = 2}^{N + 1} \lambda^{n-1} \\
(1 - \lambda) \sum_{n = 1}^N w(n) &= (1 - \lambda) \left[ \left( \lambda^0 + \sum_{n = 2}^N \lambda^{n-1} \right) - \left( \sum_{n = 2}^N \lambda^{n-1} + \lambda^N \right) \right] \\
(1 - \lambda) \sum_{n = 1}^N w(n) &= (1 - \lambda) \left[ 1 - \lambda^N \right] \\
\sum_{n = 1}^N w(n) &= 1 - \lambda^N
\end{align*}

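Consequently, for $0 < \lambda < 1$ we have $\lambda^k \to 0$ as $k \to \infty$, so the weights sum to one in the limit and $q_t^{\lambda}$ is a proper weighted average of the $n$-step returns: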

\begin{align*}
\operatorname*{lim}_{k \to \infty} \sum_{n = 1}^k w(n) = \operatorname*{lim}_{k \to \infty} [1 - \lambda^k] = 1
\end{align*}

In [None]:
import gymnasium as gym
from gymnasium.wrappers import TransformObservation
import numpy as np

# Seed NumPy's global random state so that subsequent np.random draws are
# repeatable.
np.random.seed(0)

env = gym.make("CartPole-v1")

# reset() hands back an (observation, info) pair; show both for the seeded
# starting state.
first_observation, first_info = env.reset(seed=42)
print((first_observation, first_info))


In [None]:
# Wrap the environment so that every observation has Gaussian noise with
# standard deviation 0.1 added to it.
#
# Gymnasium 1.x requires the wrapper to be told the observation space of the
# transformed observations; the noisy observation keeps the original shape, so
# the wrapped environment's own space is passed through.
# np.random.randn produces float64 values, so the result is cast back to the
# dtype of the incoming observation.
#
# NOTE: this rebinds `env`; re-running the cell wraps the already-noisy
# environment again and stacks another layer of noise.
env = TransformObservation(
    env,
    lambda obs: (obs + 0.1 * np.random.randn(*obs.shape)).astype(obs.dtype),
    env.observation_space,
)
env.reset(seed=42)

In [None]:
# Scratch check: draw four standard-normal values scaled by 0.1 to see what one
# noise sample looks like (presumably 4 matches the CartPole observation
# length -- confirm against the environment).
0.1 * np.random.randn(4)

In [None]:
def calculate_ema(data, gamma, initial=0):
    """Compute the exponential moving average (EMA) of a sequence of numbers.

    For every element ``x`` of ``data`` the running average is updated as
    ``ema = gamma * ema + (1 - gamma) * x``.

    Args:
        data: Iterable of numeric values; it is consumed once, in order.
        gamma: Weight kept from the previous running average; the new sample
            receives weight ``1 - gamma``.
        initial: Value the running average starts from. The default of 0
            reproduces the original behaviour, which pulls the first few
            averages toward zero. Pass e.g. the first sample, or the final EMA
            of an earlier call, to start from a different value.

    Returns:
        A tuple ``(ema, emas)`` where ``ema`` is the final running average
        (``initial`` when ``data`` is empty) and ``emas`` is the list of
        running averages after each element of ``data``.
    """
    ema = initial
    emas = []
    for x in data:
        ema = gamma * ema + (1 - gamma) * x
        emas.append(ema)
    return ema, emas

import matplotlib.pyplot as plt

rewards = [1, 3, -20, 10, 100, -50, 12, 13]  # list of rewards
gamma = 0.9  # decay factor

# Calculate EMA of rewards (only the per-step history is needed here)
_, ema_rewards = calculate_ema(rewards, gamma)
print('ema_rewards', ema_rewards)

# Calculate EMA of squared rewards
squared_rewards = [r**2 for r in rewards]
_, ema_squared_rewards = calculate_ema(squared_rewards, gamma)
print('ema_squared_rewards', ema_squared_rewards)

# Plot raw and squared rewards on separate axes: the squared values are about
# two orders of magnitude larger and would flatten the reward curves if they
# shared one y-axis.
fig, (ax_rewards, ax_squared) = plt.subplots(1, 2, figsize=(12, 4))

ax_rewards.plot(rewards, label="rewards")
ax_rewards.plot(ema_rewards, label="EMA rewards")
ax_rewards.set(title="Rewards", xlabel="step", ylabel="reward")
ax_rewards.legend()

ax_squared.plot(squared_rewards, label="squared rewards")
ax_squared.plot(ema_squared_rewards, label="EMA squared rewards")
ax_squared.set(title="Squared rewards", xlabel="step", ylabel="reward squared")
ax_squared.legend()

fig.tight_layout()
plt.show()