In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
示例：使用Stable Baselines3的SAC，并结合随机返回分解(RRD)来处理稀疏/延迟奖励环境。
代码仅用于演示核心思路，可能无法完全复现论文结果。
"""

import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

from stable_baselines3 import SAC
from stable_baselines3.sac.policies import SACPolicy
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.logger import configure


# ========== 1. 环境包装：将每步奖励改为0，只在episode结束时返回总reward ==========
class EpisodicRewardWrapper(gym.RewardWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.cumulative_reward = 0.0

    def step(self, action):
        result = self.env.step(action)
        # 支持返回4个或5个值
        if len(result) == 5:
            obs, reward, terminated, truncated, info = result
            done = terminated or truncated
        else:
            obs, reward, done, info = result
        self.cumulative_reward += reward
        if done:
            reward = self.cumulative_reward
            self.cumulative_reward = 0.0
        else:
            reward = 0.0
        return obs, reward, done, info

    def reset(self, **kwargs):
        self.cumulative_reward = 0.0
        result = self.env.reset(**kwargs)
        # 支持返回tuple (obs, info) 或单个obs
        if isinstance(result, tuple):
            obs, _ = result
        else:
            obs = result
        return obs


# ========== 2. 轨迹存储（为了做 RRD） ==========
class TrajectoryReplay:
    """
    存储一条完整轨迹 (s, a, r, s_next, done)，
    其中 r 是“环境最终给出的总回报”（只有最后一步非0），其余为0。
    """
    def __init__(self):
        self.trajectories = []

    def add_trajectory(self, traj):
        """
        traj: list of (s, a, r, s_next, done)
        """
        self.trajectories.append(traj)

    def sample(self, batch_size):
        """
        随机采样batch_size条轨迹
        """
        indices = np.random.randint(0, len(self.trajectories), size=batch_size)
        return [self.trajectories[i] for i in indices]

    def __len__(self):
        return len(self.trajectories)


# ========== 3. 代理奖励模型： R(s,a) ==========
class RewardModel(nn.Module):
    """
    一个简单的两层网络，用于近似代理奖励 R(s,a)。
    """
    def __init__(self, state_dim, action_dim, hidden_size=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

    def forward(self, s, a):
        x = torch.cat([s, a], dim=-1)
        return self.net(x)


# ========== 4. RRD训练函数：随机返回分解 ==========
def sample_subsequence_indices(traj_length, K):
    """
    从 [0, traj_length-1] 中随机选择 K 个不重复的索引
    """
    if K > traj_length:
        K = traj_length
    return np.random.choice(traj_length, size=K, replace=False)

def train_reward_model_rrd(reward_model, optimizer, trajectories, K=64, device='cpu'):
    """
    使用随机返回分解(RRD)来训练代理奖励模型。
    其中 trajectories 是一个batch的完整轨迹列表。
    """
    reward_model.train()
    M = len(trajectories)

    loss_sum = 0.0
    for traj in trajectories:
        # 轨迹的总回报 = 轨迹最后一步的reward(因为中间为0，最后一步为总和)
        R_ep = sum([t[2] for t in traj])  # 或者直接 traj[-1][2]

        # 收集这条轨迹的(s, a)
        s_list = []
        a_list = []
        for (s, a, r, s_next, done) in traj:
            s_list.append(s)
            a_list.append(a)

        states = torch.tensor(s_list, dtype=torch.float, device=device)
        actions = torch.tensor(a_list, dtype=torch.float, device=device)

        # 随机采样子序列
        Tj = len(traj)
        idx_subseq = sample_subsequence_indices(Tj, K)

        # 计算子序列上 R(s,a) 之和
        R_sa_sub = reward_model(states[idx_subseq], actions[idx_subseq])
        sum_R_sa_sub = R_sa_sub.sum()

        # (R_ep - sum_R_sa_sub)^2
        diff = (R_ep - sum_R_sa_sub) ** 2
        loss_sum += diff

    loss = loss_sum / M
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()


# ========== 5. 收集轨迹的函数：使用当前策略进行若干回合采样 ==========
def collect_episodes(env, model, n_episodes, device='cpu'):
    trajectories = []
    for _ in range(n_episodes):
        result = env.reset()
        if isinstance(result, tuple):
            obs, _ = result
        else:
            obs = result
        done = False
        traj = []
        while not done:
            if obs is None or (hasattr(obs, '__len__') and len(obs) == 0):
                raise ValueError("Observation is empty, check env.reset() output")
            obs_tensor = torch.tensor(obs, dtype=torch.float, device=device).unsqueeze(0)
            with torch.no_grad():
                action, _states = model.predict(obs_tensor.cpu().numpy(), deterministic=False)
            result = env.step(action[0])
            if len(result) == 5:
                next_obs, reward, terminated, truncated, info = result
                done = terminated or truncated
            else:
                next_obs, reward, done, info = result
            traj.append((obs, action[0], reward, next_obs, done))
            obs = next_obs
        trajectories.append(traj)
    return trajectories


# ========== 6. 将RRD后的代理奖励存入SAC的ReplayBuffer ==========
def add_shaped_transitions_to_replay(model, reward_model, trajectories, gamma=0.99, device='cpu'):
    """
    对收集到的轨迹，用reward_model计算每步的代理奖励，然后存入SAC的replay buffer。
    """
    for traj in trajectories:
        states = []
        actions = []
        next_states = []
        dones = []
        for (s, a, r, s_next, d) in traj:
            states.append(s)
            actions.append(a)
            next_states.append(s_next)
            dones.append(d)

        # 转成张量
        s_tensor = torch.tensor(states, dtype=torch.float, device=device)
        a_tensor = torch.tensor(actions, dtype=torch.float, device=device)

        with torch.no_grad():
            shaped_rewards = reward_model(s_tensor, a_tensor).cpu().numpy().squeeze(-1)

        for i in range(len(traj)):
            s, a, _, s_next, d = traj[i]
            r_shaped = shaped_rewards[i]
            model.replay_buffer.add(
                s, s_next, a, r_shaped, d, infos=[{}],
            )




# ========== 7. 主函数：整合以上模块 ==========
def main():
    class Args:
        env = "Walker2d-v4"
        episodes = 2000
        steps_per_update = 1000
        rrd_k = 64
        device = "cuda"
    
    args = Args()

    device = torch.device(args.device)

    # 1. 创建环境 (只在最后一步返回总reward)
    base_env = gym.make(args.env)
    env = EpisodicRewardWrapper(base_env)
    # stable-baselines3 要求环境是VecEnv，故用 DummyVecEnv 包一下
    vec_env = DummyVecEnv([lambda: env])

    # 2. 创建SAC模型
    #    注意：此时环境返回的奖励是稀疏且延迟的，但我们并不直接用它来训练SAC。
    #    我们会手动往 replay_buffer 里塞“代理奖励”。
    model = SAC(
        policy="MlpPolicy",
        env=vec_env,  # 这里仍然要传env，但不会直接用它的奖励来学习
        verbose=1,
        seed=42,
        buffer_size=100000,
        learning_starts=0,       # 让它不必等待就能开始学习
        train_freq=1,            # 每次加一个transition就可以触发一次训练(后面我们自定义逻辑)
        gradient_steps=0,        # 我们会手动调用 train
        batch_size=256,
        gamma=0.99,
        device=device
    )

    model._logger = configure(folder=None, format_strings=["stdout"])

    # 3. 创建奖励模型
    #    需要知道 state_dim, action_dim
    state_dim = base_env.observation_space.shape[0]
    action_dim = base_env.action_space.shape[0]
    reward_model = RewardModel(state_dim, action_dim, hidden_size=256).to(device)
    reward_optimizer = optim.Adam(reward_model.parameters(), lr=3e-4)

    # 4. 收集轨迹 & RRD & SAC循环
    #    在每个回合：
    #      (a) 用当前策略收集一些episode
    #      (b) 用R(·)更新 reward_model (RRD)
    #      (c) 用更新后的 R(·) 生成 shaped reward，存入SAC的 replay buffer
    #      (d) 调用 model.train(gradient_steps=...) 进行SAC更新
    traj_replay = TrajectoryReplay()

    ep_record = []
    for ep in tqdm(range(args.episodes)):
        # (a) 收集1条或多条轨迹
        new_trajectories = collect_episodes(env, model, n_episodes=1, device=device)
        # 放入自定义的trajectory存储中
        for traj in new_trajectories:
            traj_replay.add_trajectory(traj)

        # (b) 如果轨迹数足够，就训练reward_model
        if len(traj_replay) >= 8:  # 大于一定数量再开始
            batch_size_trajectories = 8
            sampled_trajs = traj_replay.sample(batch_size_trajectories)
            rrd_loss = train_reward_model_rrd(
                reward_model, reward_optimizer, sampled_trajs,
                K=args.rrd_k, device=device
            )

        # (c) 将这些新轨迹用新的 reward_model 计算 shaped reward，添加到 SAC replay buffer
        add_shaped_transitions_to_replay(model, reward_model, new_trajectories, device=device)

        # (d) 调用 model.train(gradient_steps=...) 更新SAC
        #     这里的 steps-per-update 可视情况而定
        if len(traj_replay) >= 8:
            model.train(args.steps_per_update)

        # (e) 简单的评估：看一下最后一条trajectory的真实环境回报
        ep_return = sum([t[2] for t in new_trajectories[-1]])
        ep_record.append(ep_return)
        
        if ep % 10 == 0:
            print(f"Episode {ep+1}/{args.episodes}, real env return = {ep_return:.2f}")

    # 训练结束
    print("Training finished.")
    return ep_record

In [2]:
train1 = main()

Using cuda device


  0%|                                          | 5/2000 [00:00<00:40, 49.87it/s]

Episode 1/2000, real env return = 27.64


  0%|▏                                        | 10/2000 [00:08<34:55,  1.05s/it]

Episode 11/2000, real env return = -6.98


  1%|▍                                      | 21/2000 [00:41<1:30:01,  2.73s/it]

Episode 21/2000, real env return = 9.90


  2%|▌                                      | 31/2000 [01:33<3:55:17,  7.17s/it]

Episode 31/2000, real env return = -19.82


  2%|▊                                      | 41/2000 [03:04<4:59:52,  9.18s/it]

Episode 41/2000, real env return = 251.40


  3%|▉                                      | 51/2000 [04:33<4:47:17,  8.84s/it]

Episode 51/2000, real env return = 136.12


  3%|█▏                                     | 61/2000 [06:05<4:57:00,  9.19s/it]

Episode 61/2000, real env return = 209.63


  4%|█▍                                     | 71/2000 [07:38<4:56:43,  9.23s/it]

Episode 71/2000, real env return = 3.56


  4%|█▌                                     | 81/2000 [09:09<4:52:08,  9.13s/it]

Episode 81/2000, real env return = 172.28


  5%|█▊                                     | 91/2000 [10:40<4:53:46,  9.23s/it]

Episode 91/2000, real env return = 178.68


  5%|█▉                                    | 101/2000 [12:12<4:46:28,  9.05s/it]

Episode 101/2000, real env return = 249.84


  6%|██                                    | 111/2000 [13:41<4:38:45,  8.85s/it]

Episode 111/2000, real env return = -1.66


  6%|██▎                                   | 121/2000 [15:11<4:44:17,  9.08s/it]

Episode 121/2000, real env return = 20.21


  7%|██▍                                   | 131/2000 [16:42<4:50:13,  9.32s/it]

Episode 131/2000, real env return = 49.92


  7%|██▋                                   | 141/2000 [18:10<4:26:39,  8.61s/it]

Episode 141/2000, real env return = 205.73


  8%|██▊                                   | 151/2000 [19:42<4:42:09,  9.16s/it]

Episode 151/2000, real env return = 231.28


  8%|███                                   | 161/2000 [21:13<4:34:17,  8.95s/it]

Episode 161/2000, real env return = 368.38


  9%|███▏                                  | 171/2000 [22:43<4:39:35,  9.17s/it]

Episode 171/2000, real env return = 414.02


  9%|███▍                                  | 181/2000 [24:15<4:41:56,  9.30s/it]

Episode 181/2000, real env return = 163.83


 10%|███▋                                  | 191/2000 [25:46<4:32:02,  9.02s/it]

Episode 191/2000, real env return = 209.30


 10%|███▊                                  | 201/2000 [27:17<4:28:13,  8.95s/it]

Episode 201/2000, real env return = 198.97


 11%|████                                  | 211/2000 [28:47<4:24:21,  8.87s/it]

Episode 211/2000, real env return = 204.50


 11%|████▏                                 | 221/2000 [30:18<4:29:00,  9.07s/it]

Episode 221/2000, real env return = 250.75


 12%|████▍                                 | 231/2000 [31:47<4:23:25,  8.93s/it]

Episode 231/2000, real env return = 272.25


 12%|████▌                                 | 241/2000 [33:18<4:22:18,  8.95s/it]

Episode 241/2000, real env return = 341.63


 13%|████▊                                 | 251/2000 [34:49<4:31:02,  9.30s/it]

Episode 251/2000, real env return = 154.96


 13%|████▉                                 | 261/2000 [36:19<4:15:06,  8.80s/it]

Episode 261/2000, real env return = 257.03


 14%|█████▏                                | 271/2000 [37:49<4:23:32,  9.15s/it]

Episode 271/2000, real env return = 307.40


 14%|█████▎                                | 281/2000 [39:20<4:18:42,  9.03s/it]

Episode 281/2000, real env return = 279.17


 15%|█████▌                                | 291/2000 [40:49<4:12:30,  8.87s/it]

Episode 291/2000, real env return = 385.81


 15%|█████▋                                | 301/2000 [42:20<4:16:55,  9.07s/it]

Episode 301/2000, real env return = 221.18


 16%|█████▉                                | 311/2000 [43:51<4:14:17,  9.03s/it]

Episode 311/2000, real env return = 139.60


 16%|██████                                | 321/2000 [45:23<4:24:41,  9.46s/it]

Episode 321/2000, real env return = 597.00


 17%|██████▎                               | 331/2000 [46:54<4:15:52,  9.20s/it]

Episode 331/2000, real env return = 119.08


 17%|██████▍                               | 341/2000 [48:25<4:11:03,  9.08s/it]

Episode 341/2000, real env return = 234.36


 18%|██████▋                               | 351/2000 [49:55<4:04:53,  8.91s/it]

Episode 351/2000, real env return = 227.14


 18%|██████▊                               | 361/2000 [51:25<4:02:19,  8.87s/it]

Episode 361/2000, real env return = 195.99


 19%|███████                               | 371/2000 [52:56<4:13:11,  9.33s/it]

Episode 371/2000, real env return = 462.01


 19%|███████▏                              | 381/2000 [54:30<4:07:13,  9.16s/it]

Episode 381/2000, real env return = 286.13


 20%|███████▍                              | 391/2000 [56:02<4:05:28,  9.15s/it]

Episode 391/2000, real env return = 617.75


 20%|███████▌                              | 401/2000 [57:31<3:52:45,  8.73s/it]

Episode 401/2000, real env return = 381.09


 21%|███████▊                              | 411/2000 [59:03<4:01:13,  9.11s/it]

Episode 411/2000, real env return = 241.33


 21%|███████▌                            | 421/2000 [1:00:34<3:59:42,  9.11s/it]

Episode 421/2000, real env return = 386.13


 22%|███████▊                            | 431/2000 [1:02:06<3:59:26,  9.16s/it]

Episode 431/2000, real env return = 201.83


 22%|███████▉                            | 441/2000 [1:03:36<3:52:00,  8.93s/it]

Episode 441/2000, real env return = 283.59


 23%|████████                            | 451/2000 [1:05:04<3:44:15,  8.69s/it]

Episode 451/2000, real env return = 290.13


 23%|████████▎                           | 461/2000 [1:06:35<3:51:55,  9.04s/it]

Episode 461/2000, real env return = 216.58


 24%|████████▍                           | 471/2000 [1:08:03<3:44:39,  8.82s/it]

Episode 471/2000, real env return = 232.32


 24%|████████▋                           | 481/2000 [1:09:33<3:48:49,  9.04s/it]

Episode 481/2000, real env return = 527.15


 25%|████████▊                           | 491/2000 [1:11:03<3:44:23,  8.92s/it]

Episode 491/2000, real env return = 357.19


 25%|█████████                           | 501/2000 [1:12:32<3:44:26,  8.98s/it]

Episode 501/2000, real env return = 514.94


 26%|█████████▏                          | 511/2000 [1:14:03<3:46:55,  9.14s/it]

Episode 511/2000, real env return = 298.70


 26%|█████████▍                          | 521/2000 [1:15:34<3:47:50,  9.24s/it]

Episode 521/2000, real env return = 500.19


 27%|█████████▌                          | 531/2000 [1:17:05<3:42:16,  9.08s/it]

Episode 531/2000, real env return = 245.27


 27%|█████████▋                          | 541/2000 [1:18:36<3:38:29,  8.98s/it]

Episode 541/2000, real env return = 358.56


 28%|█████████▉                          | 551/2000 [1:20:06<3:38:13,  9.04s/it]

Episode 551/2000, real env return = 144.20


 28%|██████████                          | 561/2000 [1:21:36<3:33:36,  8.91s/it]

Episode 561/2000, real env return = 247.22


 29%|██████████▎                         | 571/2000 [1:23:08<3:38:05,  9.16s/it]

Episode 571/2000, real env return = 322.02


 29%|██████████▍                         | 581/2000 [1:24:40<3:35:55,  9.13s/it]

Episode 581/2000, real env return = 433.40


 30%|██████████▋                         | 591/2000 [1:26:14<3:40:36,  9.39s/it]

Episode 591/2000, real env return = 305.77


 30%|██████████▊                         | 601/2000 [1:27:45<3:38:29,  9.37s/it]

Episode 601/2000, real env return = 374.13


 31%|██████████▉                         | 611/2000 [1:29:18<3:35:32,  9.31s/it]

Episode 611/2000, real env return = 669.28


 31%|███████████▏                        | 621/2000 [1:30:51<3:35:34,  9.38s/it]

Episode 621/2000, real env return = 271.02


 32%|███████████▎                        | 631/2000 [1:32:26<3:48:09, 10.00s/it]

Episode 631/2000, real env return = 1025.26


 32%|███████████▌                        | 641/2000 [1:33:57<3:28:01,  9.18s/it]

Episode 641/2000, real env return = 408.97


 33%|███████████▋                        | 651/2000 [1:35:28<3:24:56,  9.12s/it]

Episode 651/2000, real env return = 370.93


 33%|███████████▉                        | 661/2000 [1:37:00<3:24:38,  9.17s/it]

Episode 661/2000, real env return = 1074.89


 34%|████████████                        | 671/2000 [1:38:32<3:26:38,  9.33s/it]

Episode 671/2000, real env return = 493.36


 34%|████████████▎                       | 681/2000 [1:40:08<3:28:19,  9.48s/it]

Episode 681/2000, real env return = 904.61


 35%|████████████▍                       | 691/2000 [1:41:41<3:17:35,  9.06s/it]

Episode 691/2000, real env return = 446.80


 35%|████████████▌                       | 701/2000 [1:43:18<3:35:55,  9.97s/it]

Episode 701/2000, real env return = 1689.80


 36%|████████████▊                       | 711/2000 [1:44:54<3:23:52,  9.49s/it]

Episode 711/2000, real env return = 260.48


 36%|████████████▉                       | 721/2000 [1:46:31<3:35:50, 10.13s/it]

Episode 721/2000, real env return = 2544.73


 37%|█████████████▏                      | 731/2000 [1:48:10<3:27:21,  9.80s/it]

Episode 731/2000, real env return = 1001.26


 37%|█████████████▎                      | 741/2000 [1:49:47<3:25:18,  9.78s/it]

Episode 741/2000, real env return = 1797.22


 38%|█████████████▌                      | 751/2000 [1:51:28<3:32:50, 10.22s/it]

Episode 751/2000, real env return = 1713.71


 38%|█████████████▋                      | 761/2000 [1:53:09<3:33:20, 10.33s/it]

Episode 761/2000, real env return = 2176.43


 39%|█████████████▉                      | 771/2000 [1:54:49<3:23:14,  9.92s/it]

Episode 771/2000, real env return = 2075.34


 39%|██████████████                      | 781/2000 [1:56:31<3:35:18, 10.60s/it]

Episode 781/2000, real env return = 1599.89


 40%|██████████████▏                     | 791/2000 [1:58:09<3:20:54,  9.97s/it]

Episode 791/2000, real env return = 1776.55


 40%|██████████████▍                     | 801/2000 [1:59:41<3:06:31,  9.33s/it]

Episode 801/2000, real env return = 913.76


 41%|██████████████▌                     | 811/2000 [2:01:12<3:03:59,  9.29s/it]

Episode 811/2000, real env return = 470.39


 41%|██████████████▊                     | 821/2000 [2:02:46<3:09:42,  9.65s/it]

Episode 821/2000, real env return = 794.53


 42%|██████████████▉                     | 831/2000 [2:04:21<3:11:02,  9.81s/it]

Episode 831/2000, real env return = 1271.43


 42%|███████████████▏                    | 841/2000 [2:05:58<3:05:54,  9.62s/it]

Episode 841/2000, real env return = 851.70


 43%|███████████████▎                    | 851/2000 [2:07:36<3:12:11, 10.04s/it]

Episode 851/2000, real env return = 2227.23


 43%|███████████████▍                    | 861/2000 [2:09:11<2:58:16,  9.39s/it]

Episode 861/2000, real env return = 1043.41


 44%|███████████████▋                    | 871/2000 [2:10:52<3:07:08,  9.95s/it]

Episode 871/2000, real env return = 1664.81


 44%|███████████████▊                    | 881/2000 [2:12:32<3:08:33, 10.11s/it]

Episode 881/2000, real env return = 2125.88


 45%|████████████████                    | 891/2000 [2:14:18<3:19:17, 10.78s/it]

Episode 891/2000, real env return = 1642.50


 45%|████████████████▏                   | 901/2000 [2:16:04<3:15:39, 10.68s/it]

Episode 901/2000, real env return = 1324.47


 46%|████████████████▍                   | 911/2000 [2:17:48<3:04:00, 10.14s/it]

Episode 911/2000, real env return = 1442.33


 46%|████████████████▌                   | 921/2000 [2:19:30<3:07:58, 10.45s/it]

Episode 921/2000, real env return = 1646.84


 47%|████████████████▊                   | 931/2000 [2:21:09<2:57:41,  9.97s/it]

Episode 931/2000, real env return = 860.45


 47%|████████████████▉                   | 941/2000 [2:22:50<2:59:52, 10.19s/it]

Episode 941/2000, real env return = 1229.01


 48%|█████████████████                   | 951/2000 [2:24:36<3:03:55, 10.52s/it]

Episode 951/2000, real env return = 1132.23


 48%|█████████████████▎                  | 961/2000 [2:26:13<2:52:53,  9.98s/it]

Episode 961/2000, real env return = 1634.95


 49%|█████████████████▍                  | 971/2000 [2:27:51<2:55:12, 10.22s/it]

Episode 971/2000, real env return = 1684.00


 49%|█████████████████▋                  | 981/2000 [2:29:36<2:57:52, 10.47s/it]

Episode 981/2000, real env return = 2458.62


 50%|█████████████████▊                  | 991/2000 [2:31:20<2:58:42, 10.63s/it]

Episode 991/2000, real env return = 2243.77


 50%|█████████████████▌                 | 1001/2000 [2:33:01<2:51:09, 10.28s/it]

Episode 1001/2000, real env return = 1666.23


 51%|█████████████████▋                 | 1011/2000 [2:34:47<2:55:10, 10.63s/it]

Episode 1011/2000, real env return = 1812.20


 51%|█████████████████▊                 | 1021/2000 [2:36:33<2:50:23, 10.44s/it]

Episode 1021/2000, real env return = 1400.35


 52%|██████████████████                 | 1031/2000 [2:38:18<2:47:09, 10.35s/it]

Episode 1031/2000, real env return = 2679.06


 52%|██████████████████▏                | 1041/2000 [2:40:02<2:49:11, 10.59s/it]

Episode 1041/2000, real env return = 1939.28


 53%|██████████████████▍                | 1051/2000 [2:41:46<2:42:56, 10.30s/it]

Episode 1051/2000, real env return = 54.89


 53%|██████████████████▌                | 1061/2000 [2:43:29<2:43:45, 10.46s/it]

Episode 1061/2000, real env return = 3048.59


 54%|██████████████████▋                | 1071/2000 [2:45:11<2:39:29, 10.30s/it]

Episode 1071/2000, real env return = 2081.80


 54%|██████████████████▉                | 1081/2000 [2:46:55<2:37:04, 10.26s/it]

Episode 1081/2000, real env return = 2190.48


 55%|███████████████████                | 1091/2000 [2:48:39<2:39:30, 10.53s/it]

Episode 1091/2000, real env return = 2844.66


 55%|███████████████████▎               | 1101/2000 [2:50:26<2:41:52, 10.80s/it]

Episode 1101/2000, real env return = 2881.33


 56%|███████████████████▍               | 1111/2000 [2:52:13<2:38:24, 10.69s/it]

Episode 1111/2000, real env return = 2932.99


 56%|███████████████████▌               | 1121/2000 [2:53:59<2:35:26, 10.61s/it]

Episode 1121/2000, real env return = 2880.54


 57%|███████████████████▊               | 1131/2000 [2:55:45<2:35:45, 10.75s/it]

Episode 1131/2000, real env return = 2847.49


 57%|███████████████████▉               | 1141/2000 [2:57:30<2:33:37, 10.73s/it]

Episode 1141/2000, real env return = 2987.85


 58%|████████████████████▏              | 1151/2000 [2:59:18<2:32:46, 10.80s/it]

Episode 1151/2000, real env return = 2930.96


 58%|████████████████████▎              | 1161/2000 [3:01:04<2:29:08, 10.67s/it]

Episode 1161/2000, real env return = 3173.61


 59%|████████████████████▍              | 1171/2000 [3:02:50<2:26:24, 10.60s/it]

Episode 1171/2000, real env return = 3073.93


 59%|████████████████████▋              | 1181/2000 [3:04:33<2:22:41, 10.45s/it]

Episode 1181/2000, real env return = 3075.42


 60%|████████████████████▊              | 1191/2000 [3:06:16<2:21:59, 10.53s/it]

Episode 1191/2000, real env return = 2914.80


 60%|█████████████████████              | 1201/2000 [3:08:02<2:23:15, 10.76s/it]

Episode 1201/2000, real env return = 3140.25


 61%|█████████████████████▏             | 1211/2000 [3:09:49<2:19:26, 10.60s/it]

Episode 1211/2000, real env return = 2229.46


 61%|█████████████████████▎             | 1221/2000 [3:11:34<2:15:30, 10.44s/it]

Episode 1221/2000, real env return = 3493.70


 62%|█████████████████████▌             | 1231/2000 [3:13:16<2:02:59,  9.60s/it]

Episode 1231/2000, real env return = 331.26


 62%|█████████████████████▋             | 1241/2000 [3:14:58<2:08:20, 10.15s/it]

Episode 1241/2000, real env return = 3332.05


 63%|█████████████████████▉             | 1251/2000 [3:16:41<2:12:43, 10.63s/it]

Episode 1251/2000, real env return = 3540.97


 63%|██████████████████████             | 1261/2000 [3:18:25<2:09:19, 10.50s/it]

Episode 1261/2000, real env return = 2864.74


 64%|██████████████████████▏            | 1271/2000 [3:20:11<2:08:13, 10.55s/it]

Episode 1271/2000, real env return = 3415.85


 64%|██████████████████████▍            | 1281/2000 [3:21:54<2:00:36, 10.07s/it]

Episode 1281/2000, real env return = 3500.91


 65%|██████████████████████▌            | 1291/2000 [3:23:41<2:06:23, 10.70s/it]

Episode 1291/2000, real env return = 3430.65


 65%|██████████████████████▊            | 1301/2000 [3:25:26<2:03:07, 10.57s/it]

Episode 1301/2000, real env return = 3002.51


 66%|██████████████████████▉            | 1311/2000 [3:27:09<1:56:08, 10.11s/it]

Episode 1311/2000, real env return = 1607.35


 66%|███████████████████████            | 1321/2000 [3:28:46<1:47:18,  9.48s/it]

Episode 1321/2000, real env return = 3710.93


 67%|███████████████████████▎           | 1331/2000 [3:30:31<1:56:08, 10.42s/it]

Episode 1331/2000, real env return = 3426.39


 67%|███████████████████████▍           | 1341/2000 [3:32:11<1:48:30,  9.88s/it]

Episode 1341/2000, real env return = 359.13


 68%|███████████████████████▋           | 1351/2000 [3:33:52<1:47:24,  9.93s/it]

Episode 1351/2000, real env return = 666.85


 68%|███████████████████████▊           | 1361/2000 [3:35:38<1:52:04, 10.52s/it]

Episode 1361/2000, real env return = 2385.84


 69%|███████████████████████▉           | 1371/2000 [3:37:16<1:41:59,  9.73s/it]

Episode 1371/2000, real env return = 3166.89


 69%|████████████████████████▏          | 1381/2000 [3:38:56<1:40:26,  9.74s/it]

Episode 1381/2000, real env return = 492.18


 70%|████████████████████████▎          | 1391/2000 [3:40:37<1:44:24, 10.29s/it]

Episode 1391/2000, real env return = 2939.22


 70%|████████████████████████▌          | 1401/2000 [3:42:17<1:40:06, 10.03s/it]

Episode 1401/2000, real env return = 1204.44


 71%|████████████████████████▋          | 1411/2000 [3:43:58<1:41:41, 10.36s/it]

Episode 1411/2000, real env return = 3814.65


 71%|████████████████████████▊          | 1421/2000 [3:45:38<1:37:27, 10.10s/it]

Episode 1421/2000, real env return = 3610.86


 72%|█████████████████████████          | 1431/2000 [3:47:18<1:33:38,  9.87s/it]

Episode 1431/2000, real env return = 3696.58


 72%|█████████████████████████▏         | 1441/2000 [3:48:58<1:29:59,  9.66s/it]

Episode 1441/2000, real env return = 116.48


 73%|█████████████████████████▍         | 1451/2000 [3:50:42<1:39:06, 10.83s/it]

Episode 1451/2000, real env return = 3234.65


 73%|█████████████████████████▌         | 1461/2000 [3:52:29<1:33:47, 10.44s/it]

Episode 1461/2000, real env return = 3565.52


 74%|█████████████████████████▋         | 1471/2000 [3:54:11<1:25:06,  9.65s/it]

Episode 1471/2000, real env return = 106.47


 74%|█████████████████████████▉         | 1481/2000 [3:55:54<1:30:25, 10.45s/it]

Episode 1481/2000, real env return = 3790.24


 75%|██████████████████████████         | 1491/2000 [3:57:42<1:33:16, 11.00s/it]

Episode 1491/2000, real env return = 3828.22


 75%|██████████████████████████▎        | 1501/2000 [3:59:30<1:29:19, 10.74s/it]

Episode 1501/2000, real env return = 3577.18


 76%|██████████████████████████▍        | 1511/2000 [4:01:14<1:21:36, 10.01s/it]

Episode 1511/2000, real env return = 115.14


 76%|██████████████████████████▌        | 1521/2000 [4:02:52<1:15:09,  9.41s/it]

Episode 1521/2000, real env return = 112.57


 77%|██████████████████████████▊        | 1531/2000 [4:04:32<1:13:53,  9.45s/it]

Episode 1531/2000, real env return = 671.26


 77%|██████████████████████████▉        | 1541/2000 [4:06:08<1:14:11,  9.70s/it]

Episode 1541/2000, real env return = 117.59


 78%|███████████████████████████▏       | 1551/2000 [4:07:49<1:14:19,  9.93s/it]

Episode 1551/2000, real env return = 160.13


 78%|███████████████████████████▎       | 1561/2000 [4:09:35<1:17:58, 10.66s/it]

Episode 1561/2000, real env return = 3266.82


 79%|███████████████████████████▍       | 1571/2000 [4:11:21<1:15:38, 10.58s/it]

Episode 1571/2000, real env return = 3369.31


 79%|███████████████████████████▋       | 1581/2000 [4:13:03<1:12:16, 10.35s/it]

Episode 1581/2000, real env return = 3968.30


 80%|███████████████████████████▊       | 1591/2000 [4:14:45<1:09:59, 10.27s/it]

Episode 1591/2000, real env return = 4032.98


 80%|████████████████████████████       | 1601/2000 [4:16:33<1:11:15, 10.71s/it]

Episode 1601/2000, real env return = 3917.59


 81%|████████████████████████████▏      | 1611/2000 [4:18:14<1:02:27,  9.63s/it]

Episode 1611/2000, real env return = 305.07


 81%|████████████████████████████▎      | 1621/2000 [4:19:57<1:07:17, 10.65s/it]

Episode 1621/2000, real env return = 3540.06


 82%|████████████████████████████▌      | 1631/2000 [4:21:41<1:05:07, 10.59s/it]

Episode 1631/2000, real env return = 3791.26


 82%|██████████████████████████████▎      | 1641/2000 [4:23:17<57:32,  9.62s/it]

Episode 1641/2000, real env return = 3708.17


 83%|██████████████████████████████▌      | 1651/2000 [4:24:52<58:34, 10.07s/it]

Episode 1651/2000, real env return = 3895.18


 83%|██████████████████████████████▋      | 1661/2000 [4:26:34<59:46, 10.58s/it]

Episode 1661/2000, real env return = 3830.66


 84%|██████████████████████████████▉      | 1671/2000 [4:28:19<57:01, 10.40s/it]

Episode 1671/2000, real env return = 3845.12


 84%|███████████████████████████████      | 1681/2000 [4:29:59<53:01,  9.97s/it]

Episode 1681/2000, real env return = 4082.99


 85%|███████████████████████████████▎     | 1691/2000 [4:31:44<54:36, 10.60s/it]

Episode 1691/2000, real env return = 3332.47


 85%|███████████████████████████████▍     | 1701/2000 [4:33:28<51:08, 10.26s/it]

Episode 1701/2000, real env return = 3939.87


 86%|███████████████████████████████▋     | 1711/2000 [4:35:10<48:59, 10.17s/it]

Episode 1711/2000, real env return = 4092.43


 86%|███████████████████████████████▊     | 1721/2000 [4:36:49<45:13,  9.73s/it]

Episode 1721/2000, real env return = 3647.51


 87%|████████████████████████████████     | 1731/2000 [4:38:31<45:28, 10.14s/it]

Episode 1731/2000, real env return = 433.86


 87%|████████████████████████████████▏    | 1741/2000 [4:40:16<42:57,  9.95s/it]

Episode 1741/2000, real env return = 545.50


 88%|████████████████████████████████▍    | 1751/2000 [4:41:57<42:35, 10.26s/it]

Episode 1751/2000, real env return = 3682.77


 88%|████████████████████████████████▌    | 1761/2000 [4:43:38<41:40, 10.46s/it]

Episode 1761/2000, real env return = 3787.06


 89%|████████████████████████████████▊    | 1771/2000 [4:45:22<39:56, 10.47s/it]

Episode 1771/2000, real env return = 3974.57


 89%|████████████████████████████████▉    | 1781/2000 [4:47:06<38:13, 10.47s/it]

Episode 1781/2000, real env return = 3928.33


 90%|█████████████████████████████████▏   | 1791/2000 [4:48:51<35:36, 10.22s/it]

Episode 1791/2000, real env return = 1214.17


 90%|█████████████████████████████████▎   | 1801/2000 [4:50:29<34:13, 10.32s/it]

Episode 1801/2000, real env return = 3857.18


 91%|█████████████████████████████████▌   | 1811/2000 [4:52:11<32:11, 10.22s/it]

Episode 1811/2000, real env return = 4151.55


 91%|█████████████████████████████████▋   | 1821/2000 [4:53:57<31:38, 10.61s/it]

Episode 1821/2000, real env return = 4146.07


 92%|█████████████████████████████████▊   | 1831/2000 [4:55:36<28:04,  9.97s/it]

Episode 1831/2000, real env return = 475.29


 92%|██████████████████████████████████   | 1841/2000 [4:57:18<26:35, 10.03s/it]

Episode 1841/2000, real env return = 3670.38


 93%|██████████████████████████████████▏  | 1851/2000 [4:59:01<26:15, 10.57s/it]

Episode 1851/2000, real env return = 4210.77


 93%|██████████████████████████████████▍  | 1861/2000 [5:00:45<24:28, 10.56s/it]

Episode 1861/2000, real env return = 4077.56


 94%|██████████████████████████████████▌  | 1871/2000 [5:02:19<21:42, 10.10s/it]

Episode 1871/2000, real env return = 3992.26


 94%|██████████████████████████████████▊  | 1881/2000 [5:04:05<21:05, 10.64s/it]

Episode 1881/2000, real env return = 4223.33


 95%|██████████████████████████████████▉  | 1891/2000 [5:05:50<18:55, 10.42s/it]

Episode 1891/2000, real env return = 1044.96


 95%|███████████████████████████████████▏ | 1901/2000 [5:07:32<16:47, 10.18s/it]

Episode 1901/2000, real env return = 4106.04


 96%|███████████████████████████████████▎ | 1911/2000 [5:09:16<15:30, 10.45s/it]

Episode 1911/2000, real env return = 3660.08


 96%|███████████████████████████████████▌ | 1921/2000 [5:11:01<13:22, 10.16s/it]

Episode 1921/2000, real env return = 3772.74


 97%|███████████████████████████████████▋ | 1931/2000 [5:12:44<11:50, 10.30s/it]

Episode 1931/2000, real env return = 3745.81


 97%|███████████████████████████████████▉ | 1941/2000 [5:14:30<10:24, 10.58s/it]

Episode 1941/2000, real env return = 3529.58


 98%|████████████████████████████████████ | 1951/2000 [5:16:16<08:44, 10.71s/it]

Episode 1951/2000, real env return = 4094.44


 98%|████████████████████████████████████▎| 1961/2000 [5:18:02<06:53, 10.59s/it]

Episode 1961/2000, real env return = 4184.31


 99%|████████████████████████████████████▍| 1971/2000 [5:19:47<05:09, 10.69s/it]

Episode 1971/2000, real env return = 3740.39


 99%|████████████████████████████████████▋| 1981/2000 [5:21:32<03:21, 10.58s/it]

Episode 1981/2000, real env return = 3911.75


100%|████████████████████████████████████▊| 1991/2000 [5:23:17<01:33, 10.35s/it]

Episode 1991/2000, real env return = 4014.16


100%|█████████████████████████████████████| 2000/2000 [5:24:53<00:00,  9.75s/it]


Training finished.


In [3]:
np.save('rand_walker_train1.npy', train1)