In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter

import tqdm

import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import copy
# import multiprocessing as mp
from torch.multiprocessing import Pipe

import gym

# set up matplotlib
# The backend name tells us whether figures are rendered inline.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    # The IPython display helpers are imported only when the inline backend is active.
    from IPython import display

# Switch pyplot to interactive mode.
plt.ion()

In [2]:
# Torch device handle for the CPU (the agent below is also built with use_cuda=False).
device = torch.device("cpu")

In [3]:
import caviar_tools
from beamselect_env import BeamSelectionEnv

In [4]:
# Reward mode handed to every per-episode environment: 'test' or 'train'.
reward_type = 'test'
# Inclusive [start, end] episode ranges for the training and validation environments.
epi = [0, 0]
epi_val = [500, 500]

# One single-episode environment for each episode index in the training range.
gym_env_ind = [
    BeamSelectionEnv([episode, episode], reward_type)
    for episode in range(epi[0], epi[1] + 1)
]

# The validation environment covers the whole epi_val range and is built
# without an explicit reward_type, so the constructor's own default applies.
gym_env_val = BeamSelectionEnv(epi_val)



In [5]:
# caviar_tools.linecount is queried once per episode with a single-episode
# [i, i] range (presumably the number of steps recorded for that episode -
# confirm against caviar_tools). Training episodes are queried first, then
# the validation episodes.
n_steps_epi = [
    caviar_tools.linecount([episode, episode])
    for episode in range(epi[0], epi[1] + 1)
]
n_steps_epi_val = [
    caviar_tools.linecount([episode, episode])
    for episode in range(epi_val[0], epi_val[1] + 1)
]

# Totals over each episode range.
n_steps = sum(n_steps_epi)
n_steps_val = sum(n_steps_epi_val)

In [6]:
# Run labels. train_method, env_id and env_type are not read again in the cells visible here.
train_method = 'ICM'
env_id = None #BreakoutNoFrameskip-v4
env_type = 'beamselect'
# Environment stepped by run(): the first per-episode training environment.
env = gym_env_ind[0]

# Hyper Params

In [7]:
# Rollout layout: num_worker actions are taken per step, num_step steps per rollout.
num_worker = 1
num_step = 128

# Values handed to the agent constructor (gamma also seeds RewardForwardFilter).
lam = 0.95
gamma = 0.99
ppo_eps = 0.1
epoch = 3
learning_rate = 1e-4
entropy_coef = 0.001
eta = 1.0
clip_grad_norm = 0.5

# A rollout of num_step * num_worker samples is split into mini_batch parts.
mini_batch = 8
BATCH_SIZE = (num_step * num_worker) // mini_batch  # 16 with the values above

# Number of random-action steps used to seed the observation statistics
# (a value of 10000 was used previously).
pre_obs_norm_step = 10

# Length of the observation history window and number of values per observation.
HISTORY_SIZE = 16
STATES_USED = 13

In [8]:
# Observation layout handed to the agent: HISTORY_SIZE entries of STATES_USED values each.
input_size = [HISTORY_SIZE,STATES_USED]  
# Flat action count; run() decodes an action as [action//64, action%64], i.e. 3 x 64 combinations.
output_size = 192 #64*3

In [9]:
from utils_cur import *
from agents_cur import *

In [10]:
# Running statistics of the intrinsic reward; the training loop divides the
# intrinsic rewards by sqrt(reward_rms.var).
reward_rms = RunningMeanStd()
# Running statistics of the observations, shaped to broadcast against
# (batch, HISTORY_SIZE, 1, STATES_USED) state arrays.
obs_rms = RunningMeanStd(shape=(1, HISTORY_SIZE, 1, STATES_USED))


# Stateful filter that the training loop feeds one step of intrinsic rewards at a time.
discounted_reward = RewardForwardFilter(gamma)

# NOTE: this binds the class itself; the following cell rebinds `agent` to an instance.
agent = ICMAgent

In [11]:
# Build the agent. The class is referenced by name instead of through the
# `agent` variable so that this cell can be executed more than once: with
# `agent = agent(...)` a second execution would call the already-built
# instance rather than the ICMAgent class.
agent = ICMAgent(
    input_size,
    output_size,
    num_worker,
    num_step,
    gamma,
    lam=lam,
    learning_rate=learning_rate,
    ent_coef=entropy_coef,
    clip_grad_norm=clip_grad_norm,
    epoch=epoch,
    batch_size=BATCH_SIZE,
    ppo_eps=ppo_eps,
    eta=eta,
    use_cuda=False,
    use_gae=False,
    use_noisy_net=False,
)

In [12]:
# Current observation batch fed to the agent; starts as all zeros and is
# replaced by next_states after every rollout step.
states = np.zeros([1, HISTORY_SIZE, 1,STATES_USED])

# Bookkeeping for the training loop.
sample_episode = 0   # episodes finished by the environment at index sample_env_idx
sample_rall = 0      # reward accumulated in the current episode (reset when it ends)
sample_step = 0      # steps taken in the current episode (reset when it ends)
sample_env_idx = 0   # index of the environment these sample_* counters track
sample_i_rall = 0    # intrinsic reward accumulated in the current episode
global_update = 0    # number of rollouts collected so far
global_step = 0      # grows by num_worker * num_step per rollout


In [13]:
# Record type for one history entry; it has exactly one field, `state`.
Transition = namedtuple('Transition', ['state'])


class ReplayMemory(object):
    """Bounded FIFO buffer of ``Transition`` records.

    Entries are kept in a ``deque`` with ``maxlen=capacity``, so pushing onto a
    full buffer silently discards the oldest entry.
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` entries."""
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size:int=BATCH_SIZE):
        """Draw ``batch_size`` distinct entries at random.

        Args:
            batch_size: number of entries to draw; defaults to the module-level
                ``BATCH_SIZE`` as it was when the class was defined.

        Returns:
            A list of ``batch_size`` ``Transition`` records in random order.

        Raises:
            ValueError: if ``batch_size`` is larger than the number of stored
                entries (raised by ``random.sample``).
        """
        # Honour the caller's batch_size; the previous body always drew the
        # global BATCH_SIZE and ignored this argument.
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of entries currently stored."""
        return len(self.memory)

In [14]:
# Observation window of HISTORY_SIZE entries, pre-filled with zero vectors so
# a full window can be drawn before the environment has produced any data.
history = ReplayMemory(HISTORY_SIZE)

for _ in range(HISTORY_SIZE):
    history.push(np.zeros(STATES_USED))

In [15]:
# Sanity check: draw one window from the zero-filled history and look at its array shape.
a = history.sample()
# Every entry is a one-field Transition, which is where the singleton middle axis comes from.
np.array(a).shape

(16, 1, 13)

In [16]:
def run(action):
    """Step the environment with one flat action and return the new history window.

    Args:
        action: flat action index; it is decoded as
            ``[action // 64, action % 64]`` before being passed to ``env.step``.

    Returns:
        A five-element list ``[window, reward, done, done, reward]``. ``window``
        is ``np.array`` over ``HISTORY_SIZE`` entries drawn from ``history``
        after the new observation has been pushed; ``reward`` and ``done`` are
        the values returned by ``env.step`` and each appears twice.
    """
    s, reward, done, _info = env.step([action // 64, action % 64])
    # np.float was only an alias for the builtin float and no longer exists in
    # NumPy >= 1.24; np.float64 is the same dtype.
    history.push(s.astype(np.float64))

    # The window length is the history length (it must match the HISTORY_SIZE
    # axis of `states` and `obs_rms`), not the PPO minibatch size, which only
    # happened to have the same value.
    # NOTE(review): sample() draws entries in random order, so the window is not
    # chronological - confirm whether that is intended.
    window = np.array(history.sample(HISTORY_SIZE))

    return [window, reward, done, done, reward]

In [17]:
# Seed the observation statistics: take random actions until at least
# pre_obs_norm_step steps have been counted, then update obs_rms once with
# every window that was collected.
print('Start to initailize observation normalization parameter.....')
next_obs = []
steps = 0
while steps < pre_obs_norm_step:
    # One random action per worker for this warm-up round.
    actions = np.random.randint(0, output_size, size=(num_worker,))
    for action in actions:
        window, *_ = run(action)
        next_obs.append(window[:])
    steps += num_worker

next_obs = np.stack(next_obs)
obs_rms.update(next_obs)
print('End to initalize...')

Start to initailize observation normalization parameter.....
End to initalize...


In [18]:
# `tqdm` is already bound to the package by the imports cell, so the notebook
# progress-bar class is imported here under a separate name. It replaces the
# deprecated `tqdm.tqdm_notebook` wrapper.
from tqdm.notebook import tqdm as notebook_tqdm

# Training loop: each pass collects one rollout of num_step steps, turns the
# agent's intrinsic rewards into targets/advantages and runs one training
# update. The loop has no exit condition; interrupt the kernel to stop it.
while True:
    total_state, total_reward, total_done, total_next_state = [], [], [], []
    total_action, total_int_reward, total_values, total_policy = [], [], [], []
    global_step += (num_worker * num_step)
    global_update += 1

    # Step 1. n-step rollout
    for _ in notebook_tqdm(range(num_step)):
        # Observations are normalized with the running statistics before every agent call.
        actions, value, policy = agent.get_action((states - obs_rms.mean) / np.sqrt(obs_rms.var))

        # Step the environment once for every action the agent returned.
        next_states, rewards, dones, real_dones, log_rewards = [], [], [], [], []
        for action in actions:
            s, r, d, rd, lr = run(action)
            next_states.append(s)
            rewards.append(r)
            dones.append(d)
            real_dones.append(rd)
            log_rewards.append(lr)

        next_states = np.stack(next_states)
        rewards = np.hstack(rewards)
        dones = np.hstack(dones)
        real_dones = np.hstack(real_dones)

        # total reward = int reward
        intrinsic_reward = agent.compute_intrinsic_reward(
            (states - obs_rms.mean) / np.sqrt(obs_rms.var),
            (next_states - obs_rms.mean) / np.sqrt(obs_rms.var),
            actions)
        sample_i_rall += intrinsic_reward[sample_env_idx]

        total_int_reward.append(intrinsic_reward)
        total_state.append(states)
        total_next_state.append(next_states)
        total_reward.append(rewards)
        total_done.append(dones)
        total_action.append(actions)
        total_values.append(value)
        total_policy.append(policy)

        states = next_states[:, :, :, :]

        sample_rall += log_rewards[sample_env_idx]

        sample_step += 1
        # Per-episode counters are reset when the tracked environment reports done.
        if real_dones[sample_env_idx]:
            sample_episode += 1
            # writer.add_scalar('data/reward_per_epi', sample_rall, sample_episode)
            # writer.add_scalar('data/reward_per_rollout', sample_rall, global_update)
            # writer.add_scalar('data/step', sample_step, sample_episode)
            sample_rall = 0
            sample_step = 0
            sample_i_rall = 0

    # calculate last next value
    _, value, _ = agent.get_action((states - obs_rms.mean) / np.sqrt(obs_rms.var))
    total_values.append(value)
    # --------------------------------------------------

    # Move the worker axis in front of the step axis, then flatten both into
    # one leading batch axis.
    total_state = np.stack(total_state).transpose([1, 0, 2, 3, 4]).reshape([-1, HISTORY_SIZE, 1, STATES_USED])
    total_next_state = np.stack(total_next_state).transpose([1, 0, 2, 3, 4]).reshape([-1, HISTORY_SIZE, 1, STATES_USED])
    total_action = np.stack(total_action).transpose().reshape([-1])
    total_done = np.stack(total_done).transpose()
    total_values = np.stack(total_values).transpose()
    # Only referenced by the commented-out max_prob logging below.
    total_logging_policy = torch.stack(total_policy).view(-1, output_size).cpu().numpy()

    # Step 2. calculate intrinsic reward
    # running mean intrinsic reward
    total_int_reward = np.stack(total_int_reward).transpose()
    total_reward_per_env = np.array([discounted_reward.update(reward_per_step) for reward_per_step in
                                        total_int_reward.T])
    # Named batch_count so that the `count` imported from itertools is not overwritten.
    mean, std, batch_count = np.mean(total_reward_per_env), np.std(total_reward_per_env), len(total_reward_per_env)
    reward_rms.update_from_moments(mean, std ** 2, batch_count)

    # normalize intrinsic reward
    total_int_reward /= np.sqrt(reward_rms.var)
    # writer.add_scalar('data/int_reward_per_epi', np.sum(total_int_reward) / num_worker, sample_episode)
    # writer.add_scalar('data/int_reward_per_rollout', np.sum(total_int_reward) / num_worker, global_update)
    # -------------------------------------------------------------------------------------------

    # logging Max action probability
    # writer.add_scalar('data/max_prob', softmax(total_logging_policy).max(1).mean(), sample_episode)

    # Step 3. make target and advantage
    # Only the intrinsic reward is used here, and an all-zero array is passed
    # in the position where the done flags would go.
    target, adv = make_train_data(total_int_reward,
                                    np.zeros_like(total_int_reward),
                                    total_values,
                                    gamma,
                                    num_step,
                                    num_worker)

    # Standardize the advantages; the epsilon guards against a zero standard deviation.
    adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8)
    # -----------------------------------------------

    # Step 5. Training!
    agent.train_model((total_state - obs_rms.mean) / np.sqrt(obs_rms.var),
                        (total_next_state - obs_rms.mean) / np.sqrt(obs_rms.var),
                        target, total_action,
                        adv,
                        total_policy)

    # global_step grows by exactly num_worker * num_step per pass, so this
    # condition holds after every rollout.
    if global_step % (num_worker * num_step) == 0:
        print('Now Global Step :{}'.format(global_step))
        print(f'Total reward : {np.mean(total_reward)}')
        # torch.save(agent.model.state_dict(), model_path)
        # torch.save(agent.icm.state_dict(), icm_path)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  import sys


  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.66it/s]

Now Global Step :128
Total reward : -0.17319209033230426





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.38it/s]

Now Global Step :256
Total reward : -0.17596126327067607





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.11it/s]

Now Global Step :384
Total reward : -0.17505421007428248





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.54it/s]

Now Global Step :512
Total reward : -0.1733857909448477





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.74it/s]

Now Global Step :640
Total reward : -0.17407003550112654





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.50it/s]

Now Global Step :768
Total reward : -0.1745622566661958





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.95it/s]

Now Global Step :896
Total reward : -0.17465943823674113





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.11it/s]

Now Global Step :1024
Total reward : -0.17767695083303858





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.23it/s]

Now Global Step :1152
Total reward : -0.17091132697307418





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.24it/s]

Now Global Step :1280
Total reward : -0.16995962048661178





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.56it/s]

Now Global Step :1408
Total reward : -0.16573793549472826





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.87it/s]


Now Global Step :1536
Total reward : -0.17071464896563943


  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.95it/s]


Now Global Step :1664
Total reward : -0.16983805675585067


  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  5.05it/s]


Now Global Step :1792
Total reward : -0.17194243608772455


  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.09it/s]

Now Global Step :1920
Total reward : -0.16664650324221417





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.96it/s]


Now Global Step :2048
Total reward : -0.16602479077312482


  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.56it/s]


Now Global Step :2176
Total reward : -0.17216235979011815


  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.51it/s]

Now Global Step :2304
Total reward : -0.17347288265676536





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.90it/s]

Now Global Step :2432
Total reward : -0.17511263254331655





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.80it/s]

Now Global Step :2560
Total reward : -0.17532234365479002





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.27it/s]


Now Global Step :2688
Total reward : -0.17391899158780133


  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.98it/s]

Now Global Step :2816
Total reward : -0.17442608517206376





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.42it/s]

Now Global Step :2944
Total reward : -0.17024638684365576





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.22it/s]

Now Global Step :3072
Total reward : -0.16646136660204128





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.30it/s]

Now Global Step :3200
Total reward : -0.15506349611251125





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.63it/s]

Now Global Step :3328
Total reward : -0.12698653961927453





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.65it/s]

Now Global Step :3456
Total reward : -0.12192728187454432





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.29it/s]

Now Global Step :3584
Total reward : -0.10879798454845098





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.76it/s]

Now Global Step :3712
Total reward : -0.10729691096651096





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.34it/s]

Now Global Step :3840
Total reward : -0.09277804424633179





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.47it/s]

Now Global Step :3968
Total reward : -0.07851464571965298





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.78it/s]

Now Global Step :4096
Total reward : -0.1025666813548466





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.62it/s]

Now Global Step :4224
Total reward : -0.10496141583110644





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.46it/s]

Now Global Step :4352
Total reward : -0.10224870714886627





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.45it/s]

Now Global Step :4480
Total reward : -0.10623578745794182





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.11it/s]

Now Global Step :4608
Total reward : -0.07893569378566845





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.20it/s]

Now Global Step :4736
Total reward : -0.08578294123463462





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.16it/s]

Now Global Step :4864
Total reward : -0.0996821964475185





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.91it/s]

Now Global Step :4992
Total reward : -0.09182950060873178





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.06it/s]

Now Global Step :5120
Total reward : -0.029003584027325197





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.08it/s]

Now Global Step :5248
Total reward : -0.11125417660022371





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.86it/s]

Now Global Step :5376
Total reward : -0.07608449391298051





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.99it/s]

Now Global Step :5504
Total reward : -0.043666211076551054





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.31it/s]

Now Global Step :5632
Total reward : -0.037827356424393846





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.32it/s]

Now Global Step :5760
Total reward : -0.07072168725977211





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.15it/s]

Now Global Step :5888
Total reward : -0.08400066127541227





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.16it/s]

Now Global Step :6016
Total reward : -0.09281117975774322





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.22it/s]

Now Global Step :6144
Total reward : -0.11950434863725101





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.20it/s]

Now Global Step :6272
Total reward : -0.11726641417169069





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.21it/s]

Now Global Step :6400
Total reward : -0.1272159204380171





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.50it/s]

Now Global Step :6528
Total reward : -0.12540934898722475





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.89it/s]

Now Global Step :6656
Total reward : -0.14094080223946506





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.88it/s]

Now Global Step :6784
Total reward : -0.14046295625533345





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.78it/s]

Now Global Step :6912
Total reward : -0.1397647798049084





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.64it/s]

Now Global Step :7040
Total reward : -0.13535610965144473





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.63it/s]

Now Global Step :7168
Total reward : -0.1188172936501796





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.55it/s]

Now Global Step :7296
Total reward : -0.13850935460051447





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.43it/s]

Now Global Step :7424
Total reward : -0.14365972047385456





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.54it/s]

Now Global Step :7552
Total reward : -0.14895356087419362





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.34it/s]

Now Global Step :7680
Total reward : -0.14988524423474459





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:01<00:00,  2.86it/s]

Now Global Step :7808
Total reward : -0.15016953468775163





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.36it/s]

Now Global Step :7936
Total reward : -0.14602658626312337





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.49it/s]

Now Global Step :8064
Total reward : -0.14737865439845513





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:01<00:00,  2.92it/s]

Now Global Step :8192
Total reward : -0.16955114574085467





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.05it/s]

Now Global Step :8320
Total reward : -0.17034489523936616





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.05it/s]

Now Global Step :8448
Total reward : -0.17171662878444058





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.74it/s]

Now Global Step :8576
Total reward : -0.16714040427674676





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:01<00:00,  2.99it/s]

Now Global Step :8704
Total reward : -0.16836067220680356





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.82it/s]

Now Global Step :8832
Total reward : -0.1622009917042129





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.14it/s]

Now Global Step :8960
Total reward : -0.1681252278934062





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.57it/s]

Now Global Step :9088
Total reward : -0.15798805001018093





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.15it/s]

Now Global Step :9216
Total reward : -0.16112020250989034





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.41it/s]

Now Global Step :9344
Total reward : -0.15794881070543437





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.43it/s]

Now Global Step :9472
Total reward : -0.17067587454137273





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.54it/s]

Now Global Step :9600
Total reward : -0.1640022533749248





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.46it/s]

Now Global Step :9728
Total reward : -0.1682833235637304





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.99it/s]

Now Global Step :9856
Total reward : -0.1668047274596796





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.01it/s]

Now Global Step :9984
Total reward : -0.17636267977637085





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.85it/s]

Now Global Step :10112
Total reward : -0.17481761423320738





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.80it/s]

Now Global Step :10240
Total reward : -0.17895671990624423





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.07it/s]

Now Global Step :10368
Total reward : -0.17859535466631715





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.49it/s]

Now Global Step :10496
Total reward : -0.17779345450029022





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.84it/s]

Now Global Step :10624
Total reward : -0.17687621621227634





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.64it/s]

Now Global Step :10752
Total reward : -0.17842221964133864





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.38it/s]

Now Global Step :10880
Total reward : -0.17880429191208808





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.54it/s]

Now Global Step :11008
Total reward : -0.18259965335763706





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.04it/s]

Now Global Step :11136
Total reward : -0.17650323606051724





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.90it/s]

Now Global Step :11264
Total reward : -0.1787317785946048





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.06it/s]

Now Global Step :11392
Total reward : -0.17905698173922652





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.49it/s]

Now Global Step :11520
Total reward : -0.17860972654016366





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  4.11it/s]

Now Global Step :11648
Total reward : -0.17421426523852013





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.71it/s]

Now Global Step :11776
Total reward : -0.17639613154912592





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.60it/s]

Now Global Step :11904
Total reward : -0.17905949753487116





  0%|          | 0/128 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00,  3.76it/s]

Now Global Step :12032
Total reward : -0.1759455897087763





  0%|          | 0/128 [00:00<?, ?it/s]

KeyboardInterrupt: 