In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

from stable_baselines3 import SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.logger import configure

# ========= 1. Environment Wrapper =========
class EpisodicRewardWrapper(gym.RewardWrapper):
    def __init__(self, env):
        super().__init__(env)
        self.cumulative_reward = 0.0

    def step(self, action):
        result = self.env.step(action)
        if len(result) == 5:
            obs, reward, terminated, truncated, info = result
            done = terminated or truncated
        else:
            obs, reward, done, info = result

        # 记录累计的reward
        self.cumulative_reward += reward
        
        if done:
            reward = self.cumulative_reward
            self.cumulative_reward = 0.0
        else:
            reward = 0.0
        
        return obs, reward, done, info

    def reset(self, **kwargs):
        self.cumulative_reward = 0.0
        result = self.env.reset(**kwargs)
        if isinstance(result, tuple):
            obs, _ = result
        else:
            obs = result
        return obs




# ========= 2. 轨迹存储 =========
class TrajectoryReplay:
    def __init__(self):
        self.trajectories = []

    def add_trajectory(self, traj):
        self.trajectories.append(traj)

    def sample(self, batch_size):
        indices = np.random.randint(0, len(self.trajectories), size=batch_size)
        return [self.trajectories[i] for i in indices]

    def __len__(self):
        return len(self.trajectories)



# ========= 3. Reward Proxy ==========
class RewardModel(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_size=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
        )
        self.mu_layer = nn.Linear(hidden_size, 1)
        self.log_sigma_layer = nn.Linear(hidden_size, 1)

    def forward(self, s, a):
        x = torch.cat([s, a], dim=-1)
        x = self.net(x)
        mu = self.mu_layer(x)
        log_sigma = self.log_sigma_layer(x)
        sigma = torch.clamp(torch.exp(log_sigma), min=1e-3, max=3.0)
        return mu, sigma


# ========= 4. Train a reward model with batch trajectories ==========
def train_reward_model_gaussian_loo(reward_model, optimizer, trajectories, device='cpu'):
    """
    使用留一法训练奖励模型：
    对于每条轨迹 τ = { (s₀, a₀), ..., (s_{T-1}, a_{T-1}) }，
      - 真实轨迹奖励 R_ep = ∑ₜ rₜ（假设中间步奖励为0，只有最后一步累计）
      - 对每个时间步 i：
            * 得到每个 (s_t, a_t) 的预测参数 μ_t 和 σ_t；
            * 对于 t ≠ i，单独采样噪声 ε_{i,t} ~ N(0,1)，计算采样奖励：
                  r(s_t,a_t) = μ_t + σ_t * ε_{i,t}
            * 计算目标值 target_i = R_ep - ∑_{t ≠ i} r(s_t,a_t)
            * 计算该时间步的负对数似然：
                  nll_i = log(σ_i) + ((target_i - μ_i)²) / (2σ_i²)
    最后对每条轨迹所有时间步的 nll 取平均，再对所有轨迹取平均作为最终损失。
    """
    reward_model.train()
    total_loss = 0.0
    total_episodes = 0

    for traj in trajectories:
        # 真实轨迹奖励（假设中间步奖励为0，只有最后一步累计）
        R_ep = sum([t[2] for t in traj])
        Tj = len(traj)
        if Tj == 0:
            continue

        # 收集状态和动作
        s_list, a_list = [], []
        for (s, a, r, s_next, done) in traj:
            s_list.append(s)
            a_list.append(a)
        states = torch.tensor(s_list, dtype=torch.float, device=device)
        actions = torch.tensor(a_list, dtype=torch.float, device=device)
        
        # 预测整个轨迹每个 (s,a) 的参数，输出 shape 为 [Tj, 1]
        mu, sigma = reward_model(states, actions)
        
        nll_list = []
        for i in range(Tj):
            # 构造一个 mask，排除第 i 个时间步
            mask = torch.ones(Tj, dtype=torch.bool, device=device)
            mask[i] = False
            
            # 对于每个 t ≠ i，单独采样噪声 ε_{i,t} ~ N(0,1)
            epsilons = torch.randn((mask.sum(), 1), device=device)
            
            # 计算 t ≠ i 的采样奖励： r = μ + σ * ε_{i,t}
            r_samples = mu[mask] + epsilons * sigma[mask]
            leave_one_sum = r_samples.sum()  # 除去第 i 步的总奖励预测
            
            # 第 i 步的目标：R_ep - (其它时间步的采样奖励之和)
            target_i = R_ep - leave_one_sum
            
            # 计算第 i 步的负对数似然损失
            nll_i = torch.log(sigma[i]) + ((target_i - mu[i])**2) / (2 * sigma[i]**2)
            nll_list.append(nll_i)
        
        loss_traj = torch.stack(nll_list).mean()  # 对单条轨迹所有时刻取平均
        total_loss += loss_traj
        total_episodes += 1

    if total_episodes > 0:
        loss_mean = total_loss / total_episodes
    else:
        loss_mean = torch.tensor(0.0, device=device)

    optimizer.zero_grad()
    loss_mean.backward()
    optimizer.step()
    return loss_mean.item()








# ========= 5. How to collect a trajectory ==========
def collect_episodes(env, model, n_episodes, device='cpu'):
    trajectories = []
    for _ in range(n_episodes):
        result = env.reset()
        if isinstance(result, tuple):
            obs, _ = result
        else:
            obs = result
        done = False
        traj = []
        while not done:
            if obs is None or (hasattr(obs, '__len__') and len(obs) == 0):
                raise ValueError("Observation is empty, check env.reset() output")
            obs_tensor = torch.tensor(obs, dtype=torch.float, device=device).unsqueeze(0)
            with torch.no_grad():
                action, _states = model.predict(obs_tensor.cpu().numpy(), deterministic=False)
            result = env.step(action[0])
            
            if len(result) == 5:
                next_obs, reward, terminated, truncated, info = result
                done = terminated or truncated
            
            else:
                next_obs, reward, done, info = result
            traj.append((obs, action[0], reward, next_obs, done))
            obs = next_obs
        trajectories.append(traj)
    return trajectories


# ========= 6. 将新的reward模型生成的代理奖励存入SAC的ReplayBuffer ==========
def add_shaped_transitions_to_replay(model, reward_model, trajectories, device='cpu'):
    for traj in trajectories:
        states = []
        actions = []
        next_states = []
        dones = []
        for (s, a, r, s_next, d) in traj:
            states.append(s)
            actions.append(a)
            next_states.append(s_next)
            dones.append(d)
        
        s_tensor = torch.tensor(states, dtype=torch.float, device=device)
        a_tensor = torch.tensor(actions, dtype=torch.float, device=device)
        
        with torch.no_grad():
            mu, _ = reward_model(s_tensor, a_tensor)  # 这里我们用 μ 作为代理奖励
        shaped_rewards = mu.cpu().numpy().squeeze(-1)
        
        for i in range(len(traj)):
            s, a, _, s_next, d = traj[i]
            r_shaped = shaped_rewards[i]
            model.replay_buffer.add(
                s, s_next, a, r_shaped, d, infos=[{}],
            )

# ========= 7. 主函数 ==========
def main():
    class Args:
        env = "Walker2d-v4"
        episodes = 2000
        steps_per_update = 1000
        # 使用GPU
        device = "cuda"
    
    args = Args()
    device = torch.device(args.device if torch.cuda.is_available() else "cpu")

    # 创建环境（注意环境包装仍然不改变总reward的逻辑）
    base_env = gym.make(args.env)
    env = EpisodicRewardWrapper(base_env)
    vec_env = DummyVecEnv([lambda: env])

    # 创建SAC模型（注意这里SAC的训练依赖于ReplayBuffer，我们将手动往buffer中填入代理奖励）
    model = SAC(
        policy="MlpPolicy",
        env=vec_env,
        verbose=1,
        seed=42,
        buffer_size=100000,
        learning_starts=0,
        train_freq=1,
        gradient_steps=0,
        batch_size=256,
        gamma=0.99,
        device=device
    )
    # 配置logger
    from stable_baselines3.common.logger import configure
    model._logger = configure(folder=None, format_strings=["stdout"])

    # 创建奖励模型（现在输出μ和σ）
    state_dim = base_env.observation_space.shape[0]
    action_dim = base_env.action_space.shape[0]
    reward_model = RewardModel(state_dim, action_dim, hidden_size=256).to(device)
    reward_optimizer = optim.Adam(reward_model.parameters(), lr=3e-4)

    traj_replay = TrajectoryReplay()

    ep_record = []
    for ep in tqdm(range(args.episodes)):
        # (a) 收集轨迹
        new_trajectories = collect_episodes(env, model, n_episodes=1, device=device)
        for traj in new_trajectories:
            traj_replay.add_trajectory(traj)
        
        # (b) 当有足够轨迹后，用Gaussian对数似然训练奖励模型
        if len(traj_replay) >= 8:
            batch_size_trajectories = 8
            sampled_trajs = traj_replay.sample(batch_size_trajectories)
            loss_val = train_reward_model_gaussian_loo(
                reward_model, reward_optimizer, sampled_trajs, device=device
            )
        
        # (c) 将新轨迹用更新后的奖励模型生成的代理奖励存入SAC的ReplayBuffer
        add_shaped_transitions_to_replay(model, reward_model, new_trajectories, device=device)
        
        # (d) 用SAC更新策略
        if len(traj_replay) >= 8:
            model.train(args.steps_per_update)
        
        # (e) 简单评估：输出最新轨迹的真实环境回报
        ep_return = sum([t[2] for t in new_trajectories[-1]])
        ep_record.append(ep_return)
        if ep % 10 == 0:
            print(f"Episode {ep+1}/{args.episodes}, real env return = {ep_return:.2f}")

    print("Training finished.")
    return ep_record

In [2]:
train1 = main()

Using cuda device


  0%|▏                                         | 6/2000 [00:00<00:35, 55.98it/s]

Episode 1/2000, real env return = 21.26
Episode 11/2000, real env return = -26.03


  1%|▍                                      | 21/2000 [00:56<2:23:28,  4.35s/it]

Episode 21/2000, real env return = 231.28


  2%|▌                                      | 31/2000 [02:34<5:30:54, 10.08s/it]

Episode 31/2000, real env return = -9.95


  2%|▊                                      | 41/2000 [04:22<5:41:54, 10.47s/it]

Episode 41/2000, real env return = 5.98


  3%|▉                                      | 51/2000 [06:14<6:14:42, 11.54s/it]

Episode 51/2000, real env return = 152.84


  3%|█▏                                     | 61/2000 [08:00<5:44:40, 10.67s/it]

Episode 61/2000, real env return = 5.30


  4%|█▍                                     | 71/2000 [09:55<6:03:52, 11.32s/it]

Episode 71/2000, real env return = 249.73


  4%|█▌                                     | 81/2000 [11:48<5:58:52, 11.22s/it]

Episode 81/2000, real env return = 39.84


  5%|█▊                                     | 91/2000 [13:51<6:35:02, 12.42s/it]

Episode 91/2000, real env return = 275.16


  5%|█▉                                    | 101/2000 [15:52<6:21:25, 12.05s/it]

Episode 101/2000, real env return = 595.75


  6%|██                                    | 111/2000 [17:49<6:14:43, 11.90s/it]

Episode 111/2000, real env return = 275.33


  6%|██▎                                   | 121/2000 [19:46<6:09:15, 11.79s/it]

Episode 121/2000, real env return = 226.56


  7%|██▍                                   | 131/2000 [21:51<6:44:38, 12.99s/it]

Episode 131/2000, real env return = 2.30


  7%|██▋                                   | 141/2000 [23:56<6:19:52, 12.26s/it]

Episode 141/2000, real env return = 64.91


  8%|██▊                                   | 151/2000 [25:58<6:15:29, 12.18s/it]

Episode 151/2000, real env return = 228.61


  8%|███                                   | 161/2000 [28:09<6:28:45, 12.68s/it]

Episode 161/2000, real env return = 68.52


  9%|███▏                                  | 171/2000 [30:21<6:33:34, 12.91s/it]

Episode 171/2000, real env return = 183.14


  9%|███▍                                  | 181/2000 [32:26<6:24:54, 12.70s/it]

Episode 181/2000, real env return = 44.52


 10%|███▋                                  | 191/2000 [34:36<6:24:56, 12.77s/it]

Episode 191/2000, real env return = 281.48


 10%|███▊                                  | 201/2000 [36:51<6:50:40, 13.70s/it]

Episode 201/2000, real env return = -2.35


 11%|████                                  | 211/2000 [39:07<6:38:06, 13.35s/it]

Episode 211/2000, real env return = 268.04


 11%|████▏                                 | 221/2000 [41:21<6:30:29, 13.17s/it]

Episode 221/2000, real env return = 950.35


 12%|████▍                                 | 231/2000 [43:45<6:48:03, 13.84s/it]

Episode 231/2000, real env return = 377.64


 12%|████▌                                 | 241/2000 [46:13<7:17:35, 14.93s/it]

Episode 241/2000, real env return = 210.28


 13%|████▊                                 | 251/2000 [48:38<6:57:56, 14.34s/it]

Episode 251/2000, real env return = 246.09


 13%|████▉                                 | 261/2000 [50:56<6:50:18, 14.16s/it]

Episode 261/2000, real env return = 193.19


 14%|█████▏                                | 271/2000 [53:31<7:10:40, 14.95s/it]

Episode 271/2000, real env return = 418.94


 14%|█████▎                                | 281/2000 [55:55<7:00:57, 14.69s/it]

Episode 281/2000, real env return = 277.70


 15%|█████▌                                | 291/2000 [58:23<7:18:24, 15.39s/it]

Episode 291/2000, real env return = 438.10


 15%|█████▍                              | 301/2000 [1:00:49<7:06:34, 15.06s/it]

Episode 301/2000, real env return = 39.76


 16%|█████▌                              | 311/2000 [1:03:17<6:41:53, 14.28s/it]

Episode 311/2000, real env return = 858.50


 16%|█████▊                              | 321/2000 [1:05:49<6:34:54, 14.11s/it]

Episode 321/2000, real env return = 52.00


 17%|█████▉                              | 331/2000 [1:08:30<7:45:14, 16.73s/it]

Episode 331/2000, real env return = 637.17


 17%|██████▏                             | 341/2000 [1:11:08<7:06:20, 15.42s/it]

Episode 341/2000, real env return = 245.70


 18%|██████▎                             | 351/2000 [1:13:26<6:18:12, 13.76s/it]

Episode 351/2000, real env return = 121.98


 18%|██████▍                             | 361/2000 [1:15:50<6:34:44, 14.45s/it]

Episode 361/2000, real env return = 292.04


 19%|██████▋                             | 371/2000 [1:18:22<6:38:01, 14.66s/it]

Episode 371/2000, real env return = 455.17


 19%|██████▊                             | 381/2000 [1:20:53<6:56:19, 15.43s/it]

Episode 381/2000, real env return = 438.39


 20%|███████                             | 391/2000 [1:23:20<6:30:44, 14.57s/it]

Episode 391/2000, real env return = 405.81


 20%|███████▏                            | 401/2000 [1:25:49<6:38:33, 14.96s/it]

Episode 401/2000, real env return = 1123.20


 21%|███████▍                            | 411/2000 [1:28:33<7:23:57, 16.76s/it]

Episode 411/2000, real env return = 271.59


 21%|███████▌                            | 421/2000 [1:31:05<6:07:14, 13.95s/it]

Episode 421/2000, real env return = 425.73


 22%|███████▊                            | 431/2000 [1:33:40<7:05:36, 16.28s/it]

Episode 431/2000, real env return = 520.83


 22%|███████▉                            | 441/2000 [1:36:20<7:15:33, 16.76s/it]

Episode 441/2000, real env return = 840.66


 23%|████████                            | 451/2000 [1:38:55<6:01:31, 14.00s/it]

Episode 451/2000, real env return = 240.99


 23%|████████▎                           | 461/2000 [1:41:19<6:06:47, 14.30s/it]

Episode 461/2000, real env return = 643.07


 24%|████████▍                           | 471/2000 [1:43:47<6:17:11, 14.80s/it]

Episode 471/2000, real env return = 428.15


 24%|████████▋                           | 481/2000 [1:46:20<6:41:42, 15.87s/it]

Episode 481/2000, real env return = 542.37


 25%|████████▊                           | 491/2000 [1:48:46<6:01:33, 14.38s/it]

Episode 491/2000, real env return = 415.10


 25%|█████████                           | 501/2000 [1:51:19<6:28:06, 15.53s/it]

Episode 501/2000, real env return = 658.22


 26%|█████████▏                          | 511/2000 [1:53:47<5:52:26, 14.20s/it]

Episode 511/2000, real env return = 418.32


 26%|█████████▍                          | 521/2000 [1:56:07<5:36:42, 13.66s/it]

Episode 521/2000, real env return = 209.16


 27%|█████████▌                          | 531/2000 [1:58:38<5:45:02, 14.09s/it]

Episode 531/2000, real env return = 307.69


 27%|█████████▋                          | 541/2000 [2:01:06<6:05:55, 15.05s/it]

Episode 541/2000, real env return = 508.76


 28%|█████████▉                          | 551/2000 [2:03:30<5:58:40, 14.85s/it]

Episode 551/2000, real env return = 493.34


 28%|██████████                          | 561/2000 [2:05:51<5:39:55, 14.17s/it]

Episode 561/2000, real env return = 398.08


 29%|██████████▎                         | 571/2000 [2:08:15<5:42:36, 14.39s/it]

Episode 571/2000, real env return = 390.06


 29%|██████████▍                         | 581/2000 [2:10:44<5:38:55, 14.33s/it]

Episode 581/2000, real env return = 411.68


 30%|██████████▋                         | 591/2000 [2:13:13<6:00:30, 15.35s/it]

Episode 591/2000, real env return = 436.10


 30%|██████████▊                         | 601/2000 [2:15:37<5:25:41, 13.97s/it]

Episode 601/2000, real env return = 481.12


 31%|██████████▉                         | 611/2000 [2:17:54<5:22:19, 13.92s/it]

Episode 611/2000, real env return = 222.62


 31%|███████████▏                        | 621/2000 [2:20:23<5:28:31, 14.29s/it]

Episode 621/2000, real env return = 396.82


 32%|███████████▎                        | 631/2000 [2:22:42<5:19:12, 13.99s/it]

Episode 631/2000, real env return = 462.75


 32%|███████████▌                        | 641/2000 [2:25:09<5:56:34, 15.74s/it]

Episode 641/2000, real env return = 461.44


 33%|███████████▋                        | 651/2000 [2:27:37<5:29:51, 14.67s/it]

Episode 651/2000, real env return = 180.19


 33%|███████████▉                        | 661/2000 [2:30:02<5:20:10, 14.35s/it]

Episode 661/2000, real env return = 380.33


 34%|████████████                        | 671/2000 [2:32:19<4:53:27, 13.25s/it]

Episode 671/2000, real env return = 548.82


 34%|████████████▎                       | 681/2000 [2:34:51<5:35:13, 15.25s/it]

Episode 681/2000, real env return = 325.55


 35%|████████████▍                       | 691/2000 [2:37:16<5:05:44, 14.01s/it]

Episode 691/2000, real env return = 407.04


 35%|████████████▌                       | 701/2000 [2:39:41<5:08:13, 14.24s/it]

Episode 701/2000, real env return = 642.14


 36%|████████████▊                       | 711/2000 [2:42:07<5:07:18, 14.30s/it]

Episode 711/2000, real env return = 1113.04


 36%|████████████▉                       | 721/2000 [2:44:28<5:09:40, 14.53s/it]

Episode 721/2000, real env return = 2299.25


 37%|█████████████▏                      | 731/2000 [2:46:55<4:57:53, 14.08s/it]

Episode 731/2000, real env return = 914.88


 37%|█████████████▎                      | 741/2000 [2:49:16<4:51:41, 13.90s/it]

Episode 741/2000, real env return = 737.50


 38%|█████████████▌                      | 751/2000 [2:51:44<5:10:15, 14.90s/it]

Episode 751/2000, real env return = 1626.29


 38%|█████████████▋                      | 761/2000 [2:54:17<5:04:47, 14.76s/it]

Episode 761/2000, real env return = 171.04


 39%|█████████████▉                      | 771/2000 [2:56:50<5:02:09, 14.75s/it]

Episode 771/2000, real env return = 2659.51


 39%|██████████████                      | 781/2000 [2:59:20<4:55:45, 14.56s/it]

Episode 781/2000, real env return = 4042.51


 40%|██████████████▏                     | 791/2000 [3:02:05<5:27:48, 16.27s/it]

Episode 791/2000, real env return = 3719.74


 40%|██████████████▍                     | 801/2000 [3:04:47<5:12:23, 15.63s/it]

Episode 801/2000, real env return = 4275.25


 41%|██████████████▌                     | 811/2000 [3:07:37<6:09:01, 18.62s/it]

Episode 811/2000, real env return = 3884.04


 41%|██████████████▊                     | 821/2000 [3:10:08<5:11:18, 15.84s/it]

Episode 821/2000, real env return = 112.95


 42%|██████████████▉                     | 831/2000 [3:12:42<5:13:37, 16.10s/it]

Episode 831/2000, real env return = 4206.36


 42%|███████████████▏                    | 841/2000 [3:15:24<5:26:30, 16.90s/it]

Episode 841/2000, real env return = 4206.96


 43%|███████████████▎                    | 851/2000 [3:18:12<5:21:50, 16.81s/it]

Episode 851/2000, real env return = 4550.23


 43%|███████████████▍                    | 861/2000 [3:20:46<4:43:57, 14.96s/it]

Episode 861/2000, real env return = 51.82


 44%|███████████████▋                    | 871/2000 [3:23:22<4:37:49, 14.76s/it]

Episode 871/2000, real env return = 1961.44


 44%|███████████████▊                    | 881/2000 [3:26:16<5:24:22, 17.39s/it]

Episode 881/2000, real env return = 4224.90


 45%|████████████████                    | 891/2000 [3:29:04<5:12:59, 16.93s/it]

Episode 891/2000, real env return = 1669.88


 45%|████████████████▏                   | 901/2000 [3:31:47<5:21:36, 17.56s/it]

Episode 901/2000, real env return = 3450.36


 46%|████████████████▍                   | 911/2000 [3:34:32<4:48:05, 15.87s/it]

Episode 911/2000, real env return = 15.53


 46%|████████████████▌                   | 921/2000 [3:37:33<5:21:42, 17.89s/it]

Episode 921/2000, real env return = 321.94


 47%|████████████████▊                   | 931/2000 [3:40:27<5:26:02, 18.30s/it]

Episode 931/2000, real env return = 4014.62


 47%|████████████████▉                   | 941/2000 [3:43:15<4:56:05, 16.78s/it]

Episode 941/2000, real env return = 4109.66


 48%|█████████████████                   | 951/2000 [3:45:45<4:03:48, 13.95s/it]

Episode 951/2000, real env return = 199.45


 48%|█████████████████▎                  | 961/2000 [3:48:23<4:28:12, 15.49s/it]

Episode 961/2000, real env return = 4425.28


 49%|█████████████████▍                  | 971/2000 [3:51:12<5:03:25, 17.69s/it]

Episode 971/2000, real env return = 4640.69


 49%|█████████████████▋                  | 981/2000 [3:54:10<5:06:14, 18.03s/it]

Episode 981/2000, real env return = 4655.11


 50%|█████████████████▊                  | 991/2000 [3:56:59<4:33:12, 16.25s/it]

Episode 991/2000, real env return = 2303.59


 50%|█████████████████▌                 | 1001/2000 [3:59:48<5:03:06, 18.20s/it]

Episode 1001/2000, real env return = 153.61


 51%|█████████████████▋                 | 1011/2000 [4:02:38<4:46:25, 17.38s/it]

Episode 1011/2000, real env return = 4495.63


 51%|█████████████████▊                 | 1021/2000 [4:05:30<4:56:28, 18.17s/it]

Episode 1021/2000, real env return = 125.43


 52%|██████████████████                 | 1031/2000 [4:08:18<4:29:06, 16.66s/it]

Episode 1031/2000, real env return = 1272.85


 52%|██████████████████▏                | 1041/2000 [4:11:20<5:01:14, 18.85s/it]

Episode 1041/2000, real env return = 90.84


 53%|██████████████████▍                | 1051/2000 [4:14:11<4:41:15, 17.78s/it]

Episode 1051/2000, real env return = 4529.56


 53%|██████████████████▌                | 1061/2000 [4:17:08<4:21:42, 16.72s/it]

Episode 1061/2000, real env return = 4664.62


 54%|██████████████████▋                | 1071/2000 [4:20:05<4:30:24, 17.46s/it]

Episode 1071/2000, real env return = 4740.13


 54%|██████████████████▉                | 1081/2000 [4:23:14<4:49:04, 18.87s/it]

Episode 1081/2000, real env return = 313.69


 55%|███████████████████                | 1091/2000 [4:26:12<4:33:36, 18.06s/it]

Episode 1091/2000, real env return = 175.11


 55%|███████████████████▎               | 1101/2000 [4:29:00<4:27:50, 17.88s/it]

Episode 1101/2000, real env return = 4806.30


 56%|███████████████████▍               | 1111/2000 [4:31:47<3:52:29, 15.69s/it]

Episode 1111/2000, real env return = 188.77


 56%|███████████████████▌               | 1121/2000 [4:34:48<4:25:31, 18.12s/it]

Episode 1121/2000, real env return = 4654.86


 57%|███████████████████▊               | 1131/2000 [4:37:50<4:05:37, 16.96s/it]

Episode 1131/2000, real env return = 3393.55


 57%|███████████████████▉               | 1141/2000 [4:40:40<4:09:45, 17.45s/it]

Episode 1141/2000, real env return = 4540.22


 58%|████████████████████▏              | 1151/2000 [4:43:40<4:16:57, 18.16s/it]

Episode 1151/2000, real env return = 4762.11


 58%|████████████████████▎              | 1161/2000 [4:46:35<3:43:59, 16.02s/it]

Episode 1161/2000, real env return = 53.95


 59%|████████████████████▍              | 1171/2000 [4:49:36<4:21:24, 18.92s/it]

Episode 1171/2000, real env return = 2760.01


 59%|████████████████████▋              | 1181/2000 [4:52:39<4:20:39, 19.10s/it]

Episode 1181/2000, real env return = 4797.83


 60%|████████████████████▊              | 1191/2000 [4:55:48<4:24:24, 19.61s/it]

Episode 1191/2000, real env return = 260.85


 60%|█████████████████████              | 1201/2000 [4:59:00<4:29:39, 20.25s/it]

Episode 1201/2000, real env return = 4768.25


 61%|█████████████████████▏             | 1211/2000 [5:02:06<4:29:34, 20.50s/it]

Episode 1211/2000, real env return = 4812.63


 61%|█████████████████████▎             | 1221/2000 [5:05:20<4:27:51, 20.63s/it]

Episode 1221/2000, real env return = 4510.54


 62%|█████████████████████▌             | 1231/2000 [5:08:30<4:05:33, 19.16s/it]

Episode 1231/2000, real env return = 4761.25


 62%|█████████████████████▋             | 1241/2000 [5:11:41<3:47:07, 17.95s/it]

Episode 1241/2000, real env return = 33.16


 63%|█████████████████████▉             | 1251/2000 [5:14:42<3:34:49, 17.21s/it]

Episode 1251/2000, real env return = 3986.78


 63%|██████████████████████             | 1261/2000 [5:17:30<3:24:58, 16.64s/it]

Episode 1261/2000, real env return = 4608.18


 64%|██████████████████████▏            | 1271/2000 [5:20:25<3:31:09, 17.38s/it]

Episode 1271/2000, real env return = 98.44


 64%|██████████████████████▍            | 1281/2000 [5:23:47<3:53:05, 19.45s/it]

Episode 1281/2000, real env return = 95.14


 65%|██████████████████████▌            | 1291/2000 [5:27:08<4:15:42, 21.64s/it]

Episode 1291/2000, real env return = 4783.67


 65%|██████████████████████▊            | 1301/2000 [5:30:28<3:32:52, 18.27s/it]

Episode 1301/2000, real env return = 4823.58


 66%|██████████████████████▉            | 1311/2000 [5:33:30<3:24:10, 17.78s/it]

Episode 1311/2000, real env return = 4916.81


 66%|███████████████████████            | 1321/2000 [5:36:44<3:49:21, 20.27s/it]

Episode 1321/2000, real env return = 4875.40


 67%|███████████████████████▎           | 1331/2000 [5:39:44<3:23:08, 18.22s/it]

Episode 1331/2000, real env return = 4828.16


 67%|███████████████████████▍           | 1341/2000 [5:43:00<3:46:17, 20.60s/it]

Episode 1341/2000, real env return = 4710.78


 68%|███████████████████████▋           | 1351/2000 [5:45:55<3:14:05, 17.94s/it]

Episode 1351/2000, real env return = 4569.19


 68%|███████████████████████▊           | 1361/2000 [5:49:05<3:21:19, 18.90s/it]

Episode 1361/2000, real env return = 4938.16


 69%|███████████████████████▉           | 1371/2000 [5:52:02<3:10:19, 18.15s/it]

Episode 1371/2000, real env return = 4560.26


 69%|████████████████████████▏          | 1381/2000 [5:55:07<3:10:45, 18.49s/it]

Episode 1381/2000, real env return = 4737.28


 70%|████████████████████████▎          | 1391/2000 [5:58:14<3:20:41, 19.77s/it]

Episode 1391/2000, real env return = 4427.09


 70%|████████████████████████▌          | 1401/2000 [6:01:36<3:29:57, 21.03s/it]

Episode 1401/2000, real env return = 4691.90


 71%|████████████████████████▋          | 1411/2000 [6:04:22<2:48:43, 17.19s/it]

Episode 1411/2000, real env return = 129.57


 71%|████████████████████████▊          | 1421/2000 [6:07:36<3:13:40, 20.07s/it]

Episode 1421/2000, real env return = 4931.59


 72%|█████████████████████████          | 1431/2000 [6:10:50<2:58:19, 18.80s/it]

Episode 1431/2000, real env return = 4604.27


 72%|█████████████████████████▏         | 1441/2000 [6:13:44<2:40:50, 17.26s/it]

Episode 1441/2000, real env return = 4313.84


 73%|█████████████████████████▍         | 1451/2000 [6:16:50<2:41:20, 17.63s/it]

Episode 1451/2000, real env return = 188.56


 73%|█████████████████████████▌         | 1461/2000 [6:20:12<2:58:40, 19.89s/it]

Episode 1461/2000, real env return = 839.51


 74%|█████████████████████████▋         | 1471/2000 [6:23:30<2:59:31, 20.36s/it]

Episode 1471/2000, real env return = 114.67


 74%|█████████████████████████▉         | 1481/2000 [6:26:52<3:00:21, 20.85s/it]

Episode 1481/2000, real env return = 1758.41


 75%|██████████████████████████         | 1491/2000 [6:29:49<2:42:27, 19.15s/it]

Episode 1491/2000, real env return = 14.21


 75%|██████████████████████████▎        | 1501/2000 [6:32:52<2:26:40, 17.64s/it]

Episode 1501/2000, real env return = 147.19


 76%|██████████████████████████▍        | 1511/2000 [6:36:05<2:31:10, 18.55s/it]

Episode 1511/2000, real env return = 4567.01


 76%|██████████████████████████▌        | 1521/2000 [6:39:15<2:37:56, 19.78s/it]

Episode 1521/2000, real env return = 4760.04


 77%|██████████████████████████▊        | 1531/2000 [6:42:25<2:29:08, 19.08s/it]

Episode 1531/2000, real env return = 142.22


 77%|██████████████████████████▉        | 1541/2000 [6:45:30<2:20:03, 18.31s/it]

Episode 1541/2000, real env return = 4668.47


 78%|███████████████████████████▏       | 1551/2000 [6:48:37<2:13:17, 17.81s/it]

Episode 1551/2000, real env return = 3783.22


 78%|███████████████████████████▎       | 1561/2000 [6:51:59<2:20:10, 19.16s/it]

Episode 1561/2000, real env return = 2346.90


 79%|███████████████████████████▍       | 1571/2000 [6:55:08<2:12:19, 18.51s/it]

Episode 1571/2000, real env return = 4184.90


 79%|███████████████████████████▋       | 1581/2000 [6:58:18<2:14:31, 19.26s/it]

Episode 1581/2000, real env return = 4385.61


 80%|███████████████████████████▊       | 1591/2000 [7:01:31<2:19:08, 20.41s/it]

Episode 1591/2000, real env return = 187.12


 80%|████████████████████████████       | 1601/2000 [7:04:22<2:00:05, 18.06s/it]

Episode 1601/2000, real env return = 132.03


 81%|████████████████████████████▏      | 1611/2000 [7:07:17<1:54:23, 17.64s/it]

Episode 1611/2000, real env return = 4813.90


 81%|████████████████████████████▎      | 1621/2000 [7:10:30<1:51:34, 17.66s/it]

Episode 1621/2000, real env return = 207.39


 82%|████████████████████████████▌      | 1631/2000 [7:13:26<1:48:35, 17.66s/it]

Episode 1631/2000, real env return = 3167.40


 82%|████████████████████████████▋      | 1641/2000 [7:16:48<2:02:33, 20.48s/it]

Episode 1641/2000, real env return = 1804.69


 83%|████████████████████████████▉      | 1651/2000 [7:20:02<1:57:53, 20.27s/it]

Episode 1651/2000, real env return = 219.48


 83%|█████████████████████████████      | 1661/2000 [7:23:06<1:44:53, 18.57s/it]

Episode 1661/2000, real env return = 2295.29


 84%|█████████████████████████████▏     | 1671/2000 [7:26:20<1:46:58, 19.51s/it]

Episode 1671/2000, real env return = 2002.34


 84%|█████████████████████████████▍     | 1681/2000 [7:29:34<1:42:05, 19.20s/it]

Episode 1681/2000, real env return = 3938.78


 85%|█████████████████████████████▌     | 1691/2000 [7:33:03<1:50:36, 21.48s/it]

Episode 1691/2000, real env return = 4781.80


 85%|█████████████████████████████▊     | 1701/2000 [7:36:01<1:33:41, 18.80s/it]

Episode 1701/2000, real env return = 478.25


 86%|█████████████████████████████▉     | 1711/2000 [7:39:27<1:37:38, 20.27s/it]

Episode 1711/2000, real env return = 4496.56


 86%|██████████████████████████████     | 1721/2000 [7:42:38<1:28:15, 18.98s/it]

Episode 1721/2000, real env return = 2179.76


 87%|██████████████████████████████▎    | 1731/2000 [7:45:37<1:21:39, 18.22s/it]

Episode 1731/2000, real env return = 4636.23


 87%|██████████████████████████████▍    | 1741/2000 [7:48:54<1:23:31, 19.35s/it]

Episode 1741/2000, real env return = 2670.98


 88%|██████████████████████████████▋    | 1751/2000 [7:52:17<1:17:39, 18.71s/it]

Episode 1751/2000, real env return = 409.57


 88%|██████████████████████████████▊    | 1761/2000 [7:55:29<1:15:48, 19.03s/it]

Episode 1761/2000, real env return = 4804.76


 89%|██████████████████████████████▉    | 1771/2000 [7:58:53<1:15:47, 19.86s/it]

Episode 1771/2000, real env return = 4675.05


 89%|████████████████████████████████▉    | 1781/2000 [8:01:49<57:59, 15.89s/it]

Episode 1781/2000, real env return = 4733.58


 90%|█████████████████████████████████▏   | 1791/2000 [8:04:34<59:36, 17.11s/it]

Episode 1791/2000, real env return = 4667.41


 90%|█████████████████████████████████▎   | 1801/2000 [8:07:17<56:39, 17.08s/it]

Episode 1801/2000, real env return = 4902.96


 91%|█████████████████████████████████▌   | 1811/2000 [8:09:57<51:09, 16.24s/it]

Episode 1811/2000, real env return = -4.25


 91%|█████████████████████████████████▋   | 1821/2000 [8:12:50<51:37, 17.30s/it]

Episode 1821/2000, real env return = 4767.92


 92%|█████████████████████████████████▊   | 1831/2000 [8:15:30<41:34, 14.76s/it]

Episode 1831/2000, real env return = 4729.13


 92%|██████████████████████████████████   | 1841/2000 [8:17:46<34:45, 13.12s/it]

Episode 1841/2000, real env return = 4207.74


 93%|██████████████████████████████████▏  | 1851/2000 [8:20:05<36:14, 14.59s/it]

Episode 1851/2000, real env return = 148.51


 93%|██████████████████████████████████▍  | 1861/2000 [8:22:15<29:54, 12.91s/it]

Episode 1861/2000, real env return = 4572.19


 94%|██████████████████████████████████▌  | 1871/2000 [8:24:17<28:06, 13.07s/it]

Episode 1871/2000, real env return = 126.47


 94%|██████████████████████████████████▊  | 1881/2000 [8:26:18<21:32, 10.87s/it]

Episode 1881/2000, real env return = 147.69


 95%|██████████████████████████████████▉  | 1891/2000 [8:28:24<20:45, 11.43s/it]

Episode 1891/2000, real env return = 4979.92


 95%|███████████████████████████████████▏ | 1901/2000 [8:30:36<23:05, 14.00s/it]

Episode 1901/2000, real env return = 191.19


 96%|███████████████████████████████████▎ | 1911/2000 [8:32:32<17:29, 11.79s/it]

Episode 1911/2000, real env return = 3343.71


 96%|███████████████████████████████████▌ | 1921/2000 [8:34:39<16:47, 12.76s/it]

Episode 1921/2000, real env return = 2757.25


 97%|███████████████████████████████████▋ | 1931/2000 [8:36:45<15:34, 13.55s/it]

Episode 1931/2000, real env return = 4912.81


 97%|███████████████████████████████████▉ | 1941/2000 [8:38:47<11:32, 11.74s/it]

Episode 1941/2000, real env return = 5074.12


 98%|████████████████████████████████████ | 1951/2000 [8:40:26<08:21, 10.24s/it]

Episode 1951/2000, real env return = 4900.71


 98%|████████████████████████████████████▎| 1961/2000 [8:41:54<06:11,  9.51s/it]

Episode 1961/2000, real env return = 4832.18


 99%|████████████████████████████████████▍| 1971/2000 [8:43:28<04:37,  9.57s/it]

Episode 1971/2000, real env return = 4548.49


 99%|████████████████████████████████████▋| 1981/2000 [8:44:57<02:50,  8.95s/it]

Episode 1981/2000, real env return = 127.38


100%|████████████████████████████████████▊| 1991/2000 [8:46:24<01:18,  8.72s/it]

Episode 1991/2000, real env return = 284.78


100%|█████████████████████████████████████| 2000/2000 [8:47:36<00:00, 15.83s/it]


Training finished.


In [4]:
np.save('gaussian_walker_train1.npy', train1)