In [1]:
# Configuring environment parameters
import os
import json 
import logging

# logging.basicConfig does not create missing parent directories, so make sure
# the log directory exists before the FileHandler tries to open the file.
log_file = 'log/app.log'
os.makedirs(os.path.dirname(log_file), exist_ok=True)

logging.basicConfig(
    filename=log_file,             # Specify the log file name
    level=logging.DEBUG,           # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Set the log format
)

# Load the environment configuration JSON data.
# The file is expected to provide at least the keys 'HF_HOME' and 'access_token'.
json_path = 'env_config.json'
with open(json_path, 'r', encoding='utf-8') as file:
    env_config = json.load(file)

hf_home = env_config['HF_HOME']
# Set the HF_HOME environment variable (done before transformers/datasets are
# imported in the next cell, so the libraries see it when they initialize)
os.environ['HF_HOME'] = hf_home

# Set the access token to huggingface hub
access_token = env_config['access_token']
# Original variable name, kept for backward compatibility with anything that reads it.
os.environ['HUGGINGFACE_HUB_TOKEN'] = access_token
# NOTE(review): huggingface_hub reads the token from HF_TOKEN; the name above does
# not appear to be one it recognizes, so export HF_TOKEN as well.
os.environ['HF_TOKEN'] = access_token

In [2]:
# Loading necessary packages
import transformers 
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, LlamaForTokenClassification #, LlamaRotaryEmbedding
# from transformers import LlamaTokenizerFast
import torch.nn.functional as F

from llmexp.helper import DataHelper
from datasets import load_dataset
from torch.utils.data import DataLoader

# TODO: make sure the correct model class is loaded
from llmexp.squad_model_kldiv import MaskGeneratingModelForSQUAD
from tqdm import tqdm

In [3]:
# Load the tokenizer, the SQuAD dataset and build the dataloaders
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Left padding keeps every prompt flush against the right edge of the batch,
# which is what batched generation with a decoder-only model needs.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token, padding_side='left')
# The tokenizer is given an explicit pad token here: reuse EOS.
tokenizer.pad_token = tokenizer.eos_token

# ds = load_dataset("imdb")
ds = load_dataset("rajpurkar/squad")
# ds = load_dataset("stanfordnlp/sst2")
train_ds = ds['train']
# test_ds = ds['test']
# The held-out data is taken from the 'validation' split.
test_ds = ds['validation']

llm_exp_helper = DataHelper(tokenizer)
# collate_fn = llm_exp_helper.get_collate_fun('imdb')
# collate_fn = llm_exp_helper.get_collate_fun('sst2')
collate_fn = llm_exp_helper.get_collate_fun('squad')

# Define batch size here!
batch_size = 16
train_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
# Fix: the test dataloader must iterate over the held-out split (test_ds), not
# train_ds, otherwise any evaluation silently runs on the training data.
test_dataloader = DataLoader(test_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [4]:
# Set up the accelerator and load the base causal language model onto its device
accelerator = Accelerator()
device = accelerator.device
# Alternative: device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# Instruct-tuned checkpoint; the non-instruct variant is "meta-llama/Meta-Llama-3-8B"
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Keyword arguments for from_pretrained: bfloat16 weights, whole model placed on
# the accelerator device (device_map="auto" is the alternative), authenticated download.
model_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    device_map=device,
    token=access_token,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_kwargs)

# Keep a handle on the model configuration for later use
config = model.config

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Build the mask-generating model, the generation terminators and the optimizer
mask_gen_model = MaskGeneratingModelForSQUAD()
mask_gen_model.to(device)

# Fall back to the EOS id for padding when the tokenizer has no pad token id
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Generation stops on either the tokenizer's EOS token or the "<|eot_id|>" token
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

# Only the mask generator's parameters are handed to the optimizer
optimizer = torch.optim.Adam(params=mask_gen_model.parameters(), lr=1e-5)

`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.45


In [6]:
# Sanity check: pull one collated batch from the training dataloader and display it
# (a dict with 'input_ids', 'attention_mask' and 'context_mask' tensors).
next(iter(train_dataloader))

{'input_ids': tensor([[128000, 128006,   9125,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        ...,
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'context_mask': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}

In [12]:
# Train the mask-generating model: for every batch, let the base LLM generate a
# response, build the masks over prompt + response, and run one training step.
checkpoint_dir = 'saved_model'
# torch.save does not create missing directories; without this the run would
# crash at the first checkpoint after many minutes of training.
os.makedirs(checkpoint_dir, exist_ok=True)

mask_gen_model.train()
for epoch in range(1):
    pbar = tqdm(train_dataloader)
    for idx, data in enumerate(pbar):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        context_mask = data['context_mask'].to(device)
        # get generated texts
        gen_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
        # sequences holds prompt + generated tokens, so the first prompt_len
        # columns are the (left-padded) prompt and the rest is the response.
        gen_tokens = gen_outputs.sequences
        prompt_len = input_ids.size(1)
        pad_length = gen_tokens.size(1) - prompt_len
        # get the attention mask for the generated tokens, and also mask the padding tokens
        gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
        # (gen_tokens != pad_token_id).long() is the tokens mask, 1 for real tokens and 0 for padding tokens
        unpadded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
        # Leave the prompt part to the original attention mask. Slicing by
        # prompt_len instead of -pad_length is identical for pad_length > 0 and
        # stays correct if nothing was generated ([:, :-0] would be an empty slice).
        unpadded_token_mask[:, :prompt_len] = 1
        gen_attention_mask = gen_attention_mask * unpadded_token_mask

        # get the response mask, which is the mask for the generated tokens (the user inputs are masked with 0)
        response_mask = gen_attention_mask.clone()
        # TODO(review): the original author flagged this line as possibly problematic; verify.
        response_mask[:, :prompt_len] = 0

        # Extend the context mask over the generated part with zeros
        context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

        # One optimization round on this batch; presumably a PPO-style update given
        # the ppo_epochs argument -- see MaskGeneratingModelForSQUAD.train_one_batch.
        loss_dict = mask_gen_model.train_one_batch(model, gen_tokens, gen_attention_mask, context_mask, response_mask, optimizer,
                                                   num_steps=5, mini_batch_size=16, ppo_epochs=2)

        # Parenthesized concatenation instead of backslash continuations; also adds
        # the ", " separator that was missing between mask_loss and std_loss.
        log = (
            f"Epoch {epoch+1}, Step {idx+1}: Loss = {loss_dict['loss']:.4f}, "
            f"Actor Loss = {loss_dict['actor_loss']:.4f}, "
            f"Critic Loss = {loss_dict['critic_loss']:.4f}, "
            f"Entropy = {loss_dict['entropy']:.4f}, "
            f"Returns = {loss_dict['returns']:.4f}, "
            f"Value = {loss_dict['value']:.4f}, "
            f"mask_loss = {loss_dict['mask_loss']:.4f}, "
            f"std_loss = {loss_dict['std_loss']:.4f}"
            # f"Cont_loss = {loss_dict['contrast_loss']:.4f}, "
        )
        pbar.set_description(log)

        # Emit a newline every 10 steps so the progress-bar line is kept in the output
        if idx % 10 == 0:
            print()
        # Checkpoint the mask generator every 20 steps (skipping step 0)
        if idx % 20 == 0 and idx != 0:
            torch.save(mask_gen_model.state_dict(), f'{checkpoint_dir}/squad_kldv_{epoch}_{idx}.pth')

  0%|          | 0/5475 [00:00<?, ?it/s]

reward_raw: tensor(0.6461, device='cuda:0') reward: tensor(1.1935, device='cuda:0') factor: tensor(0.4526, device='cuda:0')
reward_raw: tensor(0.6290, device='cuda:0') reward: tensor(1.1810, device='cuda:0') factor: tensor(0.4480, device='cuda:0')
reward_raw: tensor(0.6890, device='cuda:0') reward: tensor(1.2329, device='cuda:0') factor: tensor(0.4561, device='cuda:0')
reward_raw: tensor(0.6839, device='cuda:0') reward: tensor(1.2277, device='cuda:0') factor: tensor(0.4562, device='cuda:0')
reward_raw: tensor(0.6438, device='cuda:0') reward: tensor(1.1887, device='cuda:0') factor: tensor(0.4551, device='cuda:0')


Epoch 1, Step 1: Loss = -0.1069, Actor Loss = -0.1797, Critic Loss = 0.1468, Entropy = 0.5244, Returns = 1.2459, Value = 1.2635, mask_loss = 0.4616std_loss = 0.2658:   0%|          | 1/5475 [00:11<17:46:05, 11.69s/it]

ratio tensor(1.6880, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5194, device='cuda:0') reward: tensor(1.1086, device='cuda:0') factor: tensor(0.4108, device='cuda:0')
reward_raw: tensor(0.5747, device='cuda:0') reward: tensor(1.1643, device='cuda:0') factor: tensor(0.4104, device='cuda:0')
reward_raw: tensor(0.4841, device='cuda:0') reward: tensor(1.0734, device='cuda:0') factor: tensor(0.4108, device='cuda:0')
reward_raw: tensor(0.5703, device='cuda:0') reward: tensor(1.1553, device='cuda:0') factor: tensor(0.4150, device='cuda:0')
reward_raw: tensor(0.5157, device='cuda:0') reward: tensor(1.1101, device='cuda:0') factor: tensor(0.4056, device='cuda:0')


Epoch 1, Step 2: Loss = 0.2160, Actor Loss = 0.1592, Critic Loss = 0.1147, Entropy = 0.5095, Returns = 1.1318, Value = 1.2042, mask_loss = 0.4322std_loss = 0.2679:   0%|          | 2/5475 [00:26<20:14:42, 13.32s/it]  

ratio tensor(1.2132, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6266, device='cuda:0') reward: tensor(1.1890, device='cuda:0') factor: tensor(0.4376, device='cuda:0')
reward_raw: tensor(0.6561, device='cuda:0') reward: tensor(1.2099, device='cuda:0') factor: tensor(0.4462, device='cuda:0')
reward_raw: tensor(0.6467, device='cuda:0') reward: tensor(1.2160, device='cuda:0') factor: tensor(0.4307, device='cuda:0')
reward_raw: tensor(0.6469, device='cuda:0') reward: tensor(1.2007, device='cuda:0') factor: tensor(0.4463, device='cuda:0')
reward_raw: tensor(0.6334, device='cuda:0') reward: tensor(1.1882, device='cuda:0') factor: tensor(0.4452, device='cuda:0')


Epoch 1, Step 3: Loss = 0.0349, Actor Loss = -0.0263, Critic Loss = 0.1236, Entropy = 0.5379, Returns = 1.2255, Value = 1.2087, mask_loss = 0.4539std_loss = 0.2547:   0%|          | 3/5475 [00:42<22:03:45, 14.51s/it]

ratio tensor(1.7985, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5308, device='cuda:0') reward: tensor(1.0882, device='cuda:0') factor: tensor(0.4426, device='cuda:0')
reward_raw: tensor(0.5820, device='cuda:0') reward: tensor(1.1169, device='cuda:0') factor: tensor(0.4651, device='cuda:0')
reward_raw: tensor(0.5246, device='cuda:0') reward: tensor(1.0683, device='cuda:0') factor: tensor(0.4563, device='cuda:0')
reward_raw: tensor(0.5813, device='cuda:0') reward: tensor(1.1304, device='cuda:0') factor: tensor(0.4510, device='cuda:0')
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.0998, device='cuda:0') factor: tensor(0.4509, device='cuda:0')


Epoch 1, Step 4: Loss = 0.2029, Actor Loss = 0.0449, Critic Loss = 0.3171, Entropy = 0.5136, Returns = 1.0588, Value = 0.9207, mask_loss = 0.4272std_loss = 0.2623:   0%|          | 4/5475 [00:55<21:29:10, 14.14s/it] 

ratio tensor(1.9825, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5387, device='cuda:0') reward: tensor(1.1884, device='cuda:0') factor: tensor(0.3503, device='cuda:0')
reward_raw: tensor(0.5978, device='cuda:0') reward: tensor(1.2254, device='cuda:0') factor: tensor(0.3724, device='cuda:0')
reward_raw: tensor(0.5600, device='cuda:0') reward: tensor(1.1945, device='cuda:0') factor: tensor(0.3655, device='cuda:0')
reward_raw: tensor(0.5941, device='cuda:0') reward: tensor(1.2142, device='cuda:0') factor: tensor(0.3799, device='cuda:0')
reward_raw: tensor(0.5651, device='cuda:0') reward: tensor(1.1996, device='cuda:0') factor: tensor(0.3655, device='cuda:0')


Epoch 1, Step 5: Loss = -0.1318, Actor Loss = -0.2049, Critic Loss = 0.1472, Entropy = 0.5064, Returns = 1.1724, Value = 1.0144, mask_loss = 0.3632std_loss = 0.2501:   0%|          | 5/5475 [01:17<25:58:52, 17.10s/it]

ratio tensor(1.1783, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5313, device='cuda:0') reward: tensor(1.1131, device='cuda:0') factor: tensor(0.4182, device='cuda:0')
reward_raw: tensor(0.5325, device='cuda:0') reward: tensor(1.1168, device='cuda:0') factor: tensor(0.4157, device='cuda:0')
reward_raw: tensor(0.5058, device='cuda:0') reward: tensor(1.0849, device='cuda:0') factor: tensor(0.4209, device='cuda:0')
reward_raw: tensor(0.5009, device='cuda:0') reward: tensor(1.0824, device='cuda:0') factor: tensor(0.4185, device='cuda:0')
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.0982, device='cuda:0') factor: tensor(0.4051, device='cuda:0')


Epoch 1, Step 6: Loss = -0.2459, Actor Loss = -0.3167, Critic Loss = 0.1425, Entropy = 0.5060, Returns = 1.1379, Value = 1.0261, mask_loss = 0.4086std_loss = 0.2681:   0%|          | 6/5475 [01:33<25:18:27, 16.66s/it]

ratio tensor(1.5694, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5545, device='cuda:0') reward: tensor(1.0896, device='cuda:0') factor: tensor(0.4649, device='cuda:0')
reward_raw: tensor(0.5416, device='cuda:0') reward: tensor(1.0902, device='cuda:0') factor: tensor(0.4515, device='cuda:0')
reward_raw: tensor(0.5256, device='cuda:0') reward: tensor(1.0852, device='cuda:0') factor: tensor(0.4405, device='cuda:0')
reward_raw: tensor(0.5985, device='cuda:0') reward: tensor(1.1409, device='cuda:0') factor: tensor(0.4576, device='cuda:0')
reward_raw: tensor(0.5680, device='cuda:0') reward: tensor(1.1146, device='cuda:0') factor: tensor(0.4534, device='cuda:0')


Epoch 1, Step 7: Loss = 0.1357, Actor Loss = 0.0455, Critic Loss = 0.1815, Entropy = 0.5315, Returns = 1.0529, Value = 1.1897, mask_loss = 0.4782std_loss = 0.2592:   0%|          | 7/5475 [01:48<24:31:15, 16.14s/it]  

ratio tensor(1.3058, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.0486, device='cuda:0') factor: tensor(0.4729, device='cuda:0')
reward_raw: tensor(0.4600, device='cuda:0') reward: tensor(1.0010, device='cuda:0') factor: tensor(0.4591, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.0756, device='cuda:0') factor: tensor(0.4660, device='cuda:0')
reward_raw: tensor(0.5294, device='cuda:0') reward: tensor(1.0690, device='cuda:0') factor: tensor(0.4604, device='cuda:0')
reward_raw: tensor(0.5346, device='cuda:0') reward: tensor(1.0858, device='cuda:0') factor: tensor(0.4487, device='cuda:0')


Epoch 1, Step 8: Loss = 0.2471, Actor Loss = 0.1841, Critic Loss = 0.1269, Entropy = 0.5040, Returns = 1.0622, Value = 0.8790, mask_loss = 0.5505std_loss = 0.2681:   0%|          | 8/5475 [02:03<23:58:27, 15.79s/it]

ratio tensor(0.0833, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6144, device='cuda:0') reward: tensor(1.1009, device='cuda:0') factor: tensor(0.5135, device='cuda:0')
reward_raw: tensor(0.6192, device='cuda:0') reward: tensor(1.0963, device='cuda:0') factor: tensor(0.5229, device='cuda:0')
reward_raw: tensor(0.6146, device='cuda:0') reward: tensor(1.1024, device='cuda:0') factor: tensor(0.5122, device='cuda:0')
reward_raw: tensor(0.5986, device='cuda:0') reward: tensor(1.0895, device='cuda:0') factor: tensor(0.5091, device='cuda:0')
reward_raw: tensor(0.6164, device='cuda:0') reward: tensor(1.0974, device='cuda:0') factor: tensor(0.5191, device='cuda:0')


Epoch 1, Step 9: Loss = -0.0594, Actor Loss = -0.1005, Critic Loss = 0.0832, Entropy = 0.5132, Returns = 1.1124, Value = 1.0310, mask_loss = 0.5090std_loss = 0.2616:   0%|          | 9/5475 [02:21<24:51:47, 16.38s/it]

ratio tensor(1.4721, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6226, device='cuda:0') reward: tensor(1.1298, device='cuda:0') factor: tensor(0.4929, device='cuda:0')
reward_raw: tensor(0.5807, device='cuda:0') reward: tensor(1.0994, device='cuda:0') factor: tensor(0.4813, device='cuda:0')
reward_raw: tensor(0.6169, device='cuda:0') reward: tensor(1.1319, device='cuda:0') factor: tensor(0.4850, device='cuda:0')
reward_raw: tensor(0.5928, device='cuda:0') reward: tensor(1.1138, device='cuda:0') factor: tensor(0.4790, device='cuda:0')
reward_raw: tensor(0.6072, device='cuda:0') reward: tensor(1.1233, device='cuda:0') factor: tensor(0.4839, device='cuda:0')


Epoch 1, Step 10: Loss = -0.2058, Actor Loss = -0.2684, Critic Loss = 0.1263, Entropy = 0.5181, Returns = 1.1111, Value = 1.0181, mask_loss = 0.4925std_loss = 0.2699:   0%|          | 10/5475 [02:35<23:54:52, 15.75s/it]

ratio tensor(2.2398, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6537, device='cuda:0') reward: tensor(1.1123, device='cuda:0') factor: tensor(0.5414, device='cuda:0')
reward_raw: tensor(0.6732, device='cuda:0') reward: tensor(1.1484, device='cuda:0') factor: tensor(0.5248, device='cuda:0')
reward_raw: tensor(0.6398, device='cuda:0') reward: tensor(1.1111, device='cuda:0') factor: tensor(0.5287, device='cuda:0')
reward_raw: tensor(0.6458, device='cuda:0') reward: tensor(1.1203, device='cuda:0') factor: tensor(0.5254, device='cuda:0')
reward_raw: tensor(0.6628, device='cuda:0') reward: tensor(1.1252, device='cuda:0') factor: tensor(0.5376, device='cuda:0')


Epoch 1, Step 11: Loss = 0.1175, Actor Loss = 0.0631, Critic Loss = 0.1098, Entropy = 0.5091, Returns = 1.1308, Value = 1.1559, mask_loss = 0.5113std_loss = 0.2743:   0%|          | 11/5475 [02:50<23:34:34, 15.53s/it]  

ratio tensor(4.0541, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.6727, device='cuda:0') reward: tensor(1.1556, device='cuda:0') factor: tensor(0.5171, device='cuda:0')
reward_raw: tensor(0.6494, device='cuda:0') reward: tensor(1.1371, device='cuda:0') factor: tensor(0.5123, device='cuda:0')
reward_raw: tensor(0.6574, device='cuda:0') reward: tensor(1.1461, device='cuda:0') factor: tensor(0.5113, device='cuda:0')
reward_raw: tensor(0.6531, device='cuda:0') reward: tensor(1.1385, device='cuda:0') factor: tensor(0.5145, device='cuda:0')
reward_raw: tensor(0.6511, device='cuda:0') reward: tensor(1.1306, device='cuda:0') factor: tensor(0.5205, device='cuda:0')


Epoch 1, Step 12: Loss = 0.1362, Actor Loss = 0.0934, Critic Loss = 0.0865, Entropy = 0.5099, Returns = 1.1647, Value = 1.1601, mask_loss = 0.4945std_loss = 0.2636:   0%|          | 12/5475 [03:04<22:41:33, 14.95s/it]

ratio tensor(0.9420, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5693, device='cuda:0') reward: tensor(1.0747, device='cuda:0') factor: tensor(0.4946, device='cuda:0')
reward_raw: tensor(0.5689, device='cuda:0') reward: tensor(1.0819, device='cuda:0') factor: tensor(0.4870, device='cuda:0')
reward_raw: tensor(0.5920, device='cuda:0') reward: tensor(1.0909, device='cuda:0') factor: tensor(0.5011, device='cuda:0')
reward_raw: tensor(0.5534, device='cuda:0') reward: tensor(1.0616, device='cuda:0') factor: tensor(0.4918, device='cuda:0')
reward_raw: tensor(0.6166, device='cuda:0') reward: tensor(1.1224, device='cuda:0') factor: tensor(0.4942, device='cuda:0')


Epoch 1, Step 13: Loss = -0.2169, Actor Loss = -0.2745, Critic Loss = 0.1162, Entropy = 0.5129, Returns = 1.0635, Value = 0.9721, mask_loss = 0.4880std_loss = 0.2618:   0%|          | 13/5475 [03:19<22:31:59, 14.85s/it]

ratio tensor(1.6812, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6503, device='cuda:0') reward: tensor(1.1219, device='cuda:0') factor: tensor(0.5285, device='cuda:0')
reward_raw: tensor(0.6451, device='cuda:0') reward: tensor(1.1130, device='cuda:0') factor: tensor(0.5321, device='cuda:0')
reward_raw: tensor(0.6574, device='cuda:0') reward: tensor(1.1234, device='cuda:0') factor: tensor(0.5340, device='cuda:0')
reward_raw: tensor(0.6551, device='cuda:0') reward: tensor(1.1104, device='cuda:0') factor: tensor(0.5447, device='cuda:0')
reward_raw: tensor(0.6573, device='cuda:0') reward: tensor(1.1438, device='cuda:0') factor: tensor(0.5135, device='cuda:0')


Epoch 1, Step 14: Loss = 0.1013, Actor Loss = 0.0343, Critic Loss = 0.1352, Entropy = 0.5170, Returns = 1.0915, Value = 1.1380, mask_loss = 0.5152std_loss = 0.2659:   0%|          | 14/5475 [03:39<24:47:59, 16.35s/it]  

ratio tensor(1.1713, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6108, device='cuda:0') reward: tensor(1.1102, device='cuda:0') factor: tensor(0.5006, device='cuda:0')
reward_raw: tensor(0.6509, device='cuda:0') reward: tensor(1.1536, device='cuda:0') factor: tensor(0.4973, device='cuda:0')
reward_raw: tensor(0.6488, device='cuda:0') reward: tensor(1.1623, device='cuda:0') factor: tensor(0.4865, device='cuda:0')
reward_raw: tensor(0.5691, device='cuda:0') reward: tensor(1.0742, device='cuda:0') factor: tensor(0.4949, device='cuda:0')
reward_raw: tensor(0.6044, device='cuda:0') reward: tensor(1.1097, device='cuda:0') factor: tensor(0.4948, device='cuda:0')


Epoch 1, Step 15: Loss = -0.1313, Actor Loss = -0.2094, Critic Loss = 0.1574, Entropy = 0.5321, Returns = 1.1624, Value = 0.9949, mask_loss = 0.4951std_loss = 0.2509:   0%|          | 15/5475 [04:00<26:54:53, 17.75s/it]

ratio tensor(1.1747, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6146, device='cuda:0') reward: tensor(1.0876, device='cuda:0') factor: tensor(0.5270, device='cuda:0')
reward_raw: tensor(0.6093, device='cuda:0') reward: tensor(1.0763, device='cuda:0') factor: tensor(0.5330, device='cuda:0')
reward_raw: tensor(0.6168, device='cuda:0') reward: tensor(1.0768, device='cuda:0') factor: tensor(0.5401, device='cuda:0')
reward_raw: tensor(0.5968, device='cuda:0') reward: tensor(1.0698, device='cuda:0') factor: tensor(0.5270, device='cuda:0')
reward_raw: tensor(0.6045, device='cuda:0') reward: tensor(1.0685, device='cuda:0') factor: tensor(0.5359, device='cuda:0')


Epoch 1, Step 16: Loss = -0.1088, Actor Loss = -0.1971, Critic Loss = 0.1776, Entropy = 0.5145, Returns = 1.0506, Value = 0.9735, mask_loss = 0.5385std_loss = 0.2597:   0%|          | 16/5475 [04:12<24:39:08, 16.26s/it]

ratio tensor(2.3085, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6508, device='cuda:0') reward: tensor(1.1308, device='cuda:0') factor: tensor(0.5200, device='cuda:0')
reward_raw: tensor(0.6711, device='cuda:0') reward: tensor(1.1359, device='cuda:0') factor: tensor(0.5352, device='cuda:0')
reward_raw: tensor(0.6548, device='cuda:0') reward: tensor(1.1322, device='cuda:0') factor: tensor(0.5226, device='cuda:0')
reward_raw: tensor(0.6795, device='cuda:0') reward: tensor(1.1498, device='cuda:0') factor: tensor(0.5297, device='cuda:0')
reward_raw: tensor(0.6732, device='cuda:0') reward: tensor(1.1524, device='cuda:0') factor: tensor(0.5208, device='cuda:0')


Epoch 1, Step 17: Loss = 0.0339, Actor Loss = -0.0038, Critic Loss = 0.0766, Entropy = 0.5409, Returns = 1.1498, Value = 1.2575, mask_loss = 0.5327std_loss = 0.2355:   0%|          | 17/5475 [04:28<24:27:52, 16.14s/it] 

ratio tensor(1.0630, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6616, device='cuda:0') reward: tensor(1.1082, device='cuda:0') factor: tensor(0.5534, device='cuda:0')
reward_raw: tensor(0.6412, device='cuda:0') reward: tensor(1.1088, device='cuda:0') factor: tensor(0.5324, device='cuda:0')
reward_raw: tensor(0.6456, device='cuda:0') reward: tensor(1.1048, device='cuda:0') factor: tensor(0.5408, device='cuda:0')
reward_raw: tensor(0.6392, device='cuda:0') reward: tensor(1.0985, device='cuda:0') factor: tensor(0.5407, device='cuda:0')
reward_raw: tensor(0.5899, device='cuda:0') reward: tensor(1.0630, device='cuda:0') factor: tensor(0.5268, device='cuda:0')


Epoch 1, Step 18: Loss = -0.0116, Actor Loss = -0.1037, Critic Loss = 0.1853, Entropy = 0.5308, Returns = 1.1584, Value = 1.0477, mask_loss = 0.5100std_loss = 0.2565:   0%|          | 18/5475 [04:43<23:59:44, 15.83s/it]

ratio tensor(1.9303, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5525, device='cuda:0') reward: tensor(1.0439, device='cuda:0') factor: tensor(0.5086, device='cuda:0')
reward_raw: tensor(0.5670, device='cuda:0') reward: tensor(1.0699, device='cuda:0') factor: tensor(0.4971, device='cuda:0')
reward_raw: tensor(0.5524, device='cuda:0') reward: tensor(1.0586, device='cuda:0') factor: tensor(0.4938, device='cuda:0')
reward_raw: tensor(0.5634, device='cuda:0') reward: tensor(1.0651, device='cuda:0') factor: tensor(0.4983, device='cuda:0')
reward_raw: tensor(0.5708, device='cuda:0') reward: tensor(1.0621, device='cuda:0') factor: tensor(0.5087, device='cuda:0')


Epoch 1, Step 19: Loss = 0.1073, Actor Loss = 0.0516, Critic Loss = 0.1124, Entropy = 0.5125, Returns = 1.0960, Value = 1.1871, mask_loss = 0.5164std_loss = 0.2664:   0%|          | 19/5475 [04:58<23:27:59, 15.48s/it]  

ratio tensor(0.7797, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5880, device='cuda:0') reward: tensor(1.1298, device='cuda:0') factor: tensor(0.4582, device='cuda:0')
reward_raw: tensor(0.5955, device='cuda:0') reward: tensor(1.1421, device='cuda:0') factor: tensor(0.4534, device='cuda:0')
reward_raw: tensor(0.5763, device='cuda:0') reward: tensor(1.1234, device='cuda:0') factor: tensor(0.4529, device='cuda:0')
reward_raw: tensor(0.5708, device='cuda:0') reward: tensor(1.1107, device='cuda:0') factor: tensor(0.4601, device='cuda:0')
reward_raw: tensor(0.5479, device='cuda:0') reward: tensor(1.0918, device='cuda:0') factor: tensor(0.4561, device='cuda:0')


Epoch 1, Step 20: Loss = 0.1107, Actor Loss = 0.0022, Critic Loss = 0.2181, Entropy = 0.5289, Returns = 1.0903, Value = 1.2813, mask_loss = 0.4304std_loss = 0.2571:   0%|          | 20/5475 [05:13<23:21:26, 15.41s/it]

ratio tensor(1.6891, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6436, device='cuda:0') reward: tensor(1.1920, device='cuda:0') factor: tensor(0.4516, device='cuda:0')
reward_raw: tensor(0.5733, device='cuda:0') reward: tensor(1.1075, device='cuda:0') factor: tensor(0.4658, device='cuda:0')
reward_raw: tensor(0.5483, device='cuda:0') reward: tensor(1.1058, device='cuda:0') factor: tensor(0.4425, device='cuda:0')
reward_raw: tensor(0.6225, device='cuda:0') reward: tensor(1.1670, device='cuda:0') factor: tensor(0.4555, device='cuda:0')
reward_raw: tensor(0.5702, device='cuda:0') reward: tensor(1.1126, device='cuda:0') factor: tensor(0.4575, device='cuda:0')


Epoch 1, Step 21: Loss = 0.0657, Actor Loss = -0.0269, Critic Loss = 0.1863, Entropy = 0.5035, Returns = 1.0758, Value = 1.0746, mask_loss = 0.4790std_loss = 0.2423:   0%|          | 20/5475 [05:32<23:21:26, 15.41s/it]

ratio tensor(0.8431, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 21: Loss = 0.0657, Actor Loss = -0.0269, Critic Loss = 0.1863, Entropy = 0.5035, Returns = 1.0758, Value = 1.0746, mask_loss = 0.4790std_loss = 0.2423:   0%|          | 21/5475 [05:32<24:54:10, 16.44s/it]

reward_raw: tensor(0.6459, device='cuda:0') reward: tensor(1.1700, device='cuda:0') factor: tensor(0.4759, device='cuda:0')
reward_raw: tensor(0.6201, device='cuda:0') reward: tensor(1.1282, device='cuda:0') factor: tensor(0.4920, device='cuda:0')
reward_raw: tensor(0.6653, device='cuda:0') reward: tensor(1.1641, device='cuda:0') factor: tensor(0.5012, device='cuda:0')
reward_raw: tensor(0.6807, device='cuda:0') reward: tensor(1.1929, device='cuda:0') factor: tensor(0.4878, device='cuda:0')
reward_raw: tensor(0.6845, device='cuda:0') reward: tensor(1.1992, device='cuda:0') factor: tensor(0.4853, device='cuda:0')


Epoch 1, Step 22: Loss = 0.2066, Actor Loss = 0.1415, Critic Loss = 0.1312, Entropy = 0.5059, Returns = 1.1530, Value = 1.1597, mask_loss = 0.4727std_loss = 0.2629:   0%|          | 22/5475 [05:45<23:21:50, 15.42s/it] 

ratio tensor(0.8495, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5678, device='cuda:0') reward: tensor(1.0546, device='cuda:0') factor: tensor(0.5132, device='cuda:0')
reward_raw: tensor(0.5826, device='cuda:0') reward: tensor(1.0666, device='cuda:0') factor: tensor(0.5160, device='cuda:0')
reward_raw: tensor(0.5439, device='cuda:0') reward: tensor(1.0356, device='cuda:0') factor: tensor(0.5084, device='cuda:0')
reward_raw: tensor(0.5921, device='cuda:0') reward: tensor(1.0762, device='cuda:0') factor: tensor(0.5159, device='cuda:0')
reward_raw: tensor(0.5649, device='cuda:0') reward: tensor(1.0445, device='cuda:0') factor: tensor(0.5204, device='cuda:0')


Epoch 1, Step 23: Loss = 0.2501, Actor Loss = 0.1523, Critic Loss = 0.1967, Entropy = 0.4985, Returns = 1.0348, Value = 1.0813, mask_loss = 0.4687std_loss = 0.2719:   0%|          | 23/5475 [05:57<21:49:02, 14.41s/it]

ratio tensor(0.9669, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6080, device='cuda:0') reward: tensor(1.1427, device='cuda:0') factor: tensor(0.4653, device='cuda:0')
reward_raw: tensor(0.6394, device='cuda:0') reward: tensor(1.1636, device='cuda:0') factor: tensor(0.4758, device='cuda:0')
reward_raw: tensor(0.6855, device='cuda:0') reward: tensor(1.2108, device='cuda:0') factor: tensor(0.4748, device='cuda:0')
reward_raw: tensor(0.6239, device='cuda:0') reward: tensor(1.1598, device='cuda:0') factor: tensor(0.4641, device='cuda:0')
reward_raw: tensor(0.6173, device='cuda:0') reward: tensor(1.1523, device='cuda:0') factor: tensor(0.4650, device='cuda:0')


Epoch 1, Step 24: Loss = -0.2163, Actor Loss = -0.2596, Critic Loss = 0.0875, Entropy = 0.5040, Returns = 1.1628, Value = 1.1521, mask_loss = 0.4964std_loss = 0.2736:   0%|          | 24/5475 [06:12<22:09:38, 14.64s/it]

ratio tensor(1.1885, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5945, device='cuda:0') reward: tensor(1.0844, device='cuda:0') factor: tensor(0.5102, device='cuda:0')
reward_raw: tensor(0.5870, device='cuda:0') reward: tensor(1.0768, device='cuda:0') factor: tensor(0.5102, device='cuda:0')
reward_raw: tensor(0.6281, device='cuda:0') reward: tensor(1.1050, device='cuda:0') factor: tensor(0.5231, device='cuda:0')
reward_raw: tensor(0.6071, device='cuda:0') reward: tensor(1.1031, device='cuda:0') factor: tensor(0.5040, device='cuda:0')
reward_raw: tensor(0.6343, device='cuda:0') reward: tensor(1.1154, device='cuda:0') factor: tensor(0.5189, device='cuda:0')


Epoch 1, Step 25: Loss = 0.3656, Actor Loss = 0.3058, Critic Loss = 0.1205, Entropy = 0.5165, Returns = 1.0857, Value = 1.0848, mask_loss = 0.4343std_loss = 0.2577:   0%|          | 25/5475 [06:28<22:39:43, 14.97s/it]  

ratio tensor(1.8608, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.0973, device='cuda:0') factor: tensor(0.4307, device='cuda:0')
reward_raw: tensor(0.5404, device='cuda:0') reward: tensor(1.1043, device='cuda:0') factor: tensor(0.4361, device='cuda:0')
reward_raw: tensor(0.5552, device='cuda:0') reward: tensor(1.1315, device='cuda:0') factor: tensor(0.4237, device='cuda:0')
reward_raw: tensor(0.5573, device='cuda:0') reward: tensor(1.1284, device='cuda:0') factor: tensor(0.4289, device='cuda:0')
reward_raw: tensor(0.6094, device='cuda:0') reward: tensor(1.1722, device='cuda:0') factor: tensor(0.4372, device='cuda:0')


Epoch 1, Step 26: Loss = -0.0674, Actor Loss = -0.1362, Critic Loss = 0.1386, Entropy = 0.5267, Returns = 1.1349, Value = 0.9912, mask_loss = 0.4156std_loss = 0.2471:   0%|          | 26/5475 [06:46<23:56:23, 15.82s/it]

ratio tensor(1.3342, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5584, device='cuda:0') reward: tensor(1.1310, device='cuda:0') factor: tensor(0.4274, device='cuda:0')
reward_raw: tensor(0.6199, device='cuda:0') reward: tensor(1.1736, device='cuda:0') factor: tensor(0.4463, device='cuda:0')
reward_raw: tensor(0.5519, device='cuda:0') reward: tensor(1.1258, device='cuda:0') factor: tensor(0.4261, device='cuda:0')
reward_raw: tensor(0.5926, device='cuda:0') reward: tensor(1.1644, device='cuda:0') factor: tensor(0.4281, device='cuda:0')
reward_raw: tensor(0.5297, device='cuda:0') reward: tensor(1.0977, device='cuda:0') factor: tensor(0.4319, device='cuda:0')


Epoch 1, Step 27: Loss = 0.1035, Actor Loss = -0.0490, Critic Loss = 0.3059, Entropy = 0.5293, Returns = 1.0918, Value = 1.3153, mask_loss = 0.4181std_loss = 0.2458:   0%|          | 27/5475 [07:02<23:55:27, 15.81s/it] 

ratio tensor(1.2067, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6002, device='cuda:0') reward: tensor(1.1424, device='cuda:0') factor: tensor(0.4577, device='cuda:0')
reward_raw: tensor(0.6313, device='cuda:0') reward: tensor(1.1864, device='cuda:0') factor: tensor(0.4449, device='cuda:0')
reward_raw: tensor(0.6221, device='cuda:0') reward: tensor(1.1807, device='cuda:0') factor: tensor(0.4413, device='cuda:0')
reward_raw: tensor(0.5833, device='cuda:0') reward: tensor(1.1434, device='cuda:0') factor: tensor(0.4399, device='cuda:0')
reward_raw: tensor(0.5841, device='cuda:0') reward: tensor(1.1474, device='cuda:0') factor: tensor(0.4367, device='cuda:0')


Epoch 1, Step 28: Loss = 0.0681, Actor Loss = -0.0129, Critic Loss = 0.1630, Entropy = 0.5292, Returns = 1.1840, Value = 1.2230, mask_loss = 0.3997std_loss = 0.2432:   1%|          | 28/5475 [07:17<23:38:21, 15.62s/it]

ratio tensor(1.2577, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6214, device='cuda:0') reward: tensor(1.1826, device='cuda:0') factor: tensor(0.4388, device='cuda:0')
reward_raw: tensor(0.6248, device='cuda:0') reward: tensor(1.1745, device='cuda:0') factor: tensor(0.4502, device='cuda:0')
reward_raw: tensor(0.5759, device='cuda:0') reward: tensor(1.1401, device='cuda:0') factor: tensor(0.4357, device='cuda:0')
reward_raw: tensor(0.5900, device='cuda:0') reward: tensor(1.1450, device='cuda:0') factor: tensor(0.4449, device='cuda:0')
reward_raw: tensor(0.5809, device='cuda:0') reward: tensor(1.1311, device='cuda:0') factor: tensor(0.4498, device='cuda:0')


Epoch 1, Step 29: Loss = 0.2781, Actor Loss = 0.2199, Critic Loss = 0.1175, Entropy = 0.5334, Returns = 1.1200, Value = 1.2812, mask_loss = 0.4363std_loss = 0.2496:   1%|          | 29/5475 [07:33<23:57:34, 15.84s/it] 

ratio tensor(0.8665, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.1374, device='cuda:0') factor: tensor(0.3742, device='cuda:0')
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.1587, device='cuda:0') factor: tensor(0.3927, device='cuda:0')
reward_raw: tensor(0.5536, device='cuda:0') reward: tensor(1.1694, device='cuda:0') factor: tensor(0.3842, device='cuda:0')
reward_raw: tensor(0.4741, device='cuda:0') reward: tensor(1.0903, device='cuda:0') factor: tensor(0.3838, device='cuda:0')
reward_raw: tensor(0.5401, device='cuda:0') reward: tensor(1.1666, device='cuda:0') factor: tensor(0.3736, device='cuda:0')


Epoch 1, Step 30: Loss = 0.0132, Actor Loss = -0.0354, Critic Loss = 0.0981, Entropy = 0.5237, Returns = 1.1104, Value = 1.0840, mask_loss = 0.3829std_loss = 0.2346:   1%|          | 30/5475 [07:55<26:30:25, 17.53s/it]

ratio tensor(1.1583, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5936, device='cuda:0') reward: tensor(1.2347, device='cuda:0') factor: tensor(0.3589, device='cuda:0')
reward_raw: tensor(0.5445, device='cuda:0') reward: tensor(1.1858, device='cuda:0') factor: tensor(0.3587, device='cuda:0')
reward_raw: tensor(0.5646, device='cuda:0') reward: tensor(1.1966, device='cuda:0') factor: tensor(0.3681, device='cuda:0')
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.1789, device='cuda:0') factor: tensor(0.3724, device='cuda:0')
reward_raw: tensor(0.5640, device='cuda:0') reward: tensor(1.1925, device='cuda:0') factor: tensor(0.3715, device='cuda:0')


Epoch 1, Step 31: Loss = 0.2026, Actor Loss = 0.1452, Critic Loss = 0.1159, Entropy = 0.4989, Returns = 1.1822, Value = 1.2926, mask_loss = 0.3466std_loss = 0.2298:   1%|          | 31/5475 [08:16<28:28:11, 18.83s/it] 

ratio tensor(0.9790, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5713, device='cuda:0') reward: tensor(1.1636, device='cuda:0') factor: tensor(0.4077, device='cuda:0')
reward_raw: tensor(0.6020, device='cuda:0') reward: tensor(1.1906, device='cuda:0') factor: tensor(0.4114, device='cuda:0')
reward_raw: tensor(0.5708, device='cuda:0') reward: tensor(1.1531, device='cuda:0') factor: tensor(0.4177, device='cuda:0')
reward_raw: tensor(0.6064, device='cuda:0') reward: tensor(1.1862, device='cuda:0') factor: tensor(0.4202, device='cuda:0')
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.1418, device='cuda:0') factor: tensor(0.4175, device='cuda:0')


Epoch 1, Step 32: Loss = -0.1143, Actor Loss = -0.1475, Critic Loss = 0.0674, Entropy = 0.5129, Returns = 1.1500, Value = 1.1547, mask_loss = 0.3965std_loss = 0.2569:   1%|          | 32/5475 [08:30<26:02:55, 17.23s/it]

ratio tensor(1.3946, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5013, device='cuda:0') reward: tensor(1.0635, device='cuda:0') factor: tensor(0.4378, device='cuda:0')
reward_raw: tensor(0.5255, device='cuda:0') reward: tensor(1.0992, device='cuda:0') factor: tensor(0.4262, device='cuda:0')
reward_raw: tensor(0.5523, device='cuda:0') reward: tensor(1.1156, device='cuda:0') factor: tensor(0.4367, device='cuda:0')
reward_raw: tensor(0.5632, device='cuda:0') reward: tensor(1.1183, device='cuda:0') factor: tensor(0.4449, device='cuda:0')
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.0587, device='cuda:0') factor: tensor(0.4331, device='cuda:0')


Epoch 1, Step 33: Loss = 0.0758, Actor Loss = 0.0364, Critic Loss = 0.0798, Entropy = 0.5074, Returns = 1.1056, Value = 1.2033, mask_loss = 0.4323std_loss = 0.2537:   1%|          | 33/5475 [08:44<24:22:19, 16.12s/it]  

ratio tensor(1.2216, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5679, device='cuda:0') reward: tensor(1.1675, device='cuda:0') factor: tensor(0.4004, device='cuda:0')
reward_raw: tensor(0.5677, device='cuda:0') reward: tensor(1.1537, device='cuda:0') factor: tensor(0.4140, device='cuda:0')
reward_raw: tensor(0.5266, device='cuda:0') reward: tensor(1.1038, device='cuda:0') factor: tensor(0.4228, device='cuda:0')
reward_raw: tensor(0.5516, device='cuda:0') reward: tensor(1.1528, device='cuda:0') factor: tensor(0.3988, device='cuda:0')
reward_raw: tensor(0.5414, device='cuda:0') reward: tensor(1.1161, device='cuda:0') factor: tensor(0.4253, device='cuda:0')


Epoch 1, Step 34: Loss = 0.4563, Actor Loss = 0.3714, Critic Loss = 0.1707, Entropy = 0.4916, Returns = 1.0872, Value = 1.2193, mask_loss = 0.3385std_loss = 0.2344:   1%|          | 34/5475 [09:00<24:22:58, 16.13s/it]

ratio tensor(0.1952, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4691, device='cuda:0') reward: tensor(1.1301, device='cuda:0') factor: tensor(0.3390, device='cuda:0')
reward_raw: tensor(0.4647, device='cuda:0') reward: tensor(1.1372, device='cuda:0') factor: tensor(0.3275, device='cuda:0')
reward_raw: tensor(0.4531, device='cuda:0') reward: tensor(1.1271, device='cuda:0') factor: tensor(0.3260, device='cuda:0')
reward_raw: tensor(0.4732, device='cuda:0') reward: tensor(1.1455, device='cuda:0') factor: tensor(0.3277, device='cuda:0')
reward_raw: tensor(0.5048, device='cuda:0') reward: tensor(1.1651, device='cuda:0') factor: tensor(0.3397, device='cuda:0')


Epoch 1, Step 35: Loss = -0.0438, Actor Loss = -0.1030, Critic Loss = 0.1194, Entropy = 0.4886, Returns = 1.0721, Value = 1.2159, mask_loss = 0.3524std_loss = 0.2225:   1%|          | 35/5475 [09:21<26:42:03, 17.67s/it]

ratio tensor(1.0175, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5208, device='cuda:0') reward: tensor(1.2062, device='cuda:0') factor: tensor(0.3146, device='cuda:0')
reward_raw: tensor(0.5019, device='cuda:0') reward: tensor(1.2061, device='cuda:0') factor: tensor(0.2958, device='cuda:0')
reward_raw: tensor(0.5464, device='cuda:0') reward: tensor(1.2437, device='cuda:0') factor: tensor(0.3027, device='cuda:0')
reward_raw: tensor(0.5228, device='cuda:0') reward: tensor(1.2132, device='cuda:0') factor: tensor(0.3096, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.2234, device='cuda:0') factor: tensor(0.2973, device='cuda:0')


Epoch 1, Step 36: Loss = 0.2383, Actor Loss = 0.1148, Critic Loss = 0.2480, Entropy = 0.4863, Returns = 1.1907, Value = 1.3342, mask_loss = 0.3327std_loss = 0.2273:   1%|          | 36/5475 [09:41<27:57:22, 18.50s/it]  

ratio tensor(0.8443, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5969, device='cuda:0') reward: tensor(1.2559, device='cuda:0') factor: tensor(0.3410, device='cuda:0')
reward_raw: tensor(0.6347, device='cuda:0') reward: tensor(1.2865, device='cuda:0') factor: tensor(0.3482, device='cuda:0')
reward_raw: tensor(0.5589, device='cuda:0') reward: tensor(1.2143, device='cuda:0') factor: tensor(0.3447, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.2049, device='cuda:0') factor: tensor(0.3471, device='cuda:0')
reward_raw: tensor(0.5598, device='cuda:0') reward: tensor(1.2221, device='cuda:0') factor: tensor(0.3378, device='cuda:0')


Epoch 1, Step 37: Loss = 0.1609, Actor Loss = 0.1057, Critic Loss = 0.1113, Entropy = 0.5010, Returns = 1.1984, Value = 1.3013, mask_loss = 0.3324std_loss = 0.2368:   1%|          | 37/5475 [09:54<25:25:09, 16.83s/it]

ratio tensor(1.1754, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5933, device='cuda:0') reward: tensor(1.2260, device='cuda:0') factor: tensor(0.3673, device='cuda:0')
reward_raw: tensor(0.5851, device='cuda:0') reward: tensor(1.2286, device='cuda:0') factor: tensor(0.3564, device='cuda:0')
reward_raw: tensor(0.5650, device='cuda:0') reward: tensor(1.2035, device='cuda:0') factor: tensor(0.3615, device='cuda:0')
reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.1967, device='cuda:0') factor: tensor(0.3519, device='cuda:0')
reward_raw: tensor(0.5564, device='cuda:0') reward: tensor(1.1794, device='cuda:0') factor: tensor(0.3769, device='cuda:0')


Epoch 1, Step 38: Loss = -0.0034, Actor Loss = -0.0482, Critic Loss = 0.0906, Entropy = 0.4903, Returns = 1.1674, Value = 1.0899, mask_loss = 0.3499std_loss = 0.2436:   1%|          | 38/5475 [10:07<23:39:26, 15.66s/it]

ratio tensor(1.0269, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5515, device='cuda:0') reward: tensor(1.1723, device='cuda:0') factor: tensor(0.3793, device='cuda:0')
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.1411, device='cuda:0') factor: tensor(0.3744, device='cuda:0')
reward_raw: tensor(0.5145, device='cuda:0') reward: tensor(1.1289, device='cuda:0') factor: tensor(0.3856, device='cuda:0')
reward_raw: tensor(0.4934, device='cuda:0') reward: tensor(1.1054, device='cuda:0') factor: tensor(0.3880, device='cuda:0')
reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.1575, device='cuda:0') factor: tensor(0.3911, device='cuda:0')


Epoch 1, Step 39: Loss = -0.1688, Actor Loss = -0.2700, Critic Loss = 0.2035, Entropy = 0.5126, Returns = 1.1478, Value = 1.0639, mask_loss = 0.3859std_loss = 0.2511:   1%|          | 39/5475 [10:19<21:50:54, 14.47s/it]

ratio tensor(1.3531, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6220, device='cuda:0') reward: tensor(1.2757, device='cuda:0') factor: tensor(0.3463, device='cuda:0')
reward_raw: tensor(0.5455, device='cuda:0') reward: tensor(1.1885, device='cuda:0') factor: tensor(0.3569, device='cuda:0')
reward_raw: tensor(0.5730, device='cuda:0') reward: tensor(1.2217, device='cuda:0') factor: tensor(0.3512, device='cuda:0')
reward_raw: tensor(0.4942, device='cuda:0') reward: tensor(1.1539, device='cuda:0') factor: tensor(0.3403, device='cuda:0')
reward_raw: tensor(0.5565, device='cuda:0') reward: tensor(1.2197, device='cuda:0') factor: tensor(0.3368, device='cuda:0')


Epoch 1, Step 40: Loss = 0.1470, Actor Loss = 0.1022, Critic Loss = 0.0907, Entropy = 0.5102, Returns = 1.1188, Value = 1.2407, mask_loss = 0.3623std_loss = 0.2335:   1%|          | 40/5475 [10:34<22:16:12, 14.75s/it]  

ratio tensor(0.7041, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5488, device='cuda:0') reward: tensor(1.1891, device='cuda:0') factor: tensor(0.3597, device='cuda:0')
reward_raw: tensor(0.5294, device='cuda:0') reward: tensor(1.1967, device='cuda:0') factor: tensor(0.3327, device='cuda:0')
reward_raw: tensor(0.5039, device='cuda:0') reward: tensor(1.1521, device='cuda:0') factor: tensor(0.3518, device='cuda:0')
reward_raw: tensor(0.5160, device='cuda:0') reward: tensor(1.1712, device='cuda:0') factor: tensor(0.3447, device='cuda:0')
reward_raw: tensor(0.4900, device='cuda:0') reward: tensor(1.1497, device='cuda:0') factor: tensor(0.3402, device='cuda:0')


Epoch 1, Step 41: Loss = -0.0405, Actor Loss = -0.0664, Critic Loss = 0.0527, Entropy = 0.4996, Returns = 1.2174, Value = 1.1058, mask_loss = 0.3556std_loss = 0.2484:   1%|          | 40/5475 [10:48<22:16:12, 14.75s/it]

ratio tensor(0.9684, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 41: Loss = -0.0405, Actor Loss = -0.0664, Critic Loss = 0.0527, Entropy = 0.4996, Returns = 1.2174, Value = 1.1058, mask_loss = 0.3556std_loss = 0.2484:   1%|          | 41/5475 [10:48<21:54:40, 14.52s/it]

reward_raw: tensor(0.5341, device='cuda:0') reward: tensor(1.1984, device='cuda:0') factor: tensor(0.3357, device='cuda:0')
reward_raw: tensor(0.5182, device='cuda:0') reward: tensor(1.1742, device='cuda:0') factor: tensor(0.3440, device='cuda:0')
reward_raw: tensor(0.4751, device='cuda:0') reward: tensor(1.1421, device='cuda:0') factor: tensor(0.3329, device='cuda:0')
reward_raw: tensor(0.4920, device='cuda:0') reward: tensor(1.1423, device='cuda:0') factor: tensor(0.3497, device='cuda:0')
reward_raw: tensor(0.5493, device='cuda:0') reward: tensor(1.1952, device='cuda:0') factor: tensor(0.3541, device='cuda:0')


Epoch 1, Step 42: Loss = -0.0448, Actor Loss = -0.0709, Critic Loss = 0.0533, Entropy = 0.5133, Returns = 1.1962, Value = 1.1250, mask_loss = 0.3342std_loss = 0.2238:   1%|          | 42/5475 [11:04<22:31:16, 14.92s/it]

ratio tensor(1.2359, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5551, device='cuda:0') reward: tensor(1.2133, device='cuda:0') factor: tensor(0.3418, device='cuda:0')
reward_raw: tensor(0.5277, device='cuda:0') reward: tensor(1.1990, device='cuda:0') factor: tensor(0.3286, device='cuda:0')
reward_raw: tensor(0.5830, device='cuda:0') reward: tensor(1.2551, device='cuda:0') factor: tensor(0.3279, device='cuda:0')
reward_raw: tensor(0.5752, device='cuda:0') reward: tensor(1.2281, device='cuda:0') factor: tensor(0.3471, device='cuda:0')
reward_raw: tensor(0.5539, device='cuda:0') reward: tensor(1.2119, device='cuda:0') factor: tensor(0.3420, device='cuda:0')


Epoch 1, Step 43: Loss = -0.1115, Actor Loss = -0.2477, Critic Loss = 0.2736, Entropy = 0.4980, Returns = 1.2548, Value = 1.1826, mask_loss = 0.3528std_loss = 0.2391:   1%|          | 43/5475 [11:18<21:52:37, 14.50s/it]

ratio tensor(3.8822, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5897, device='cuda:0') reward: tensor(1.2626, device='cuda:0') factor: tensor(0.3271, device='cuda:0')
reward_raw: tensor(0.5691, device='cuda:0') reward: tensor(1.2453, device='cuda:0') factor: tensor(0.3238, device='cuda:0')
reward_raw: tensor(0.5890, device='cuda:0') reward: tensor(1.2556, device='cuda:0') factor: tensor(0.3334, device='cuda:0')
reward_raw: tensor(0.5569, device='cuda:0') reward: tensor(1.2354, device='cuda:0') factor: tensor(0.3216, device='cuda:0')
reward_raw: tensor(0.5777, device='cuda:0') reward: tensor(1.2519, device='cuda:0') factor: tensor(0.3258, device='cuda:0')


Epoch 1, Step 44: Loss = -0.1354, Actor Loss = -0.2202, Critic Loss = 0.1705, Entropy = 0.4696, Returns = 1.2785, Value = 1.1232, mask_loss = 0.3283std_loss = 0.2483:   1%|          | 44/5475 [11:35<23:03:01, 15.28s/it]

ratio tensor(3.0613, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5894, device='cuda:0') reward: tensor(1.2253, device='cuda:0') factor: tensor(0.3641, device='cuda:0')
reward_raw: tensor(0.5884, device='cuda:0') reward: tensor(1.2342, device='cuda:0') factor: tensor(0.3543, device='cuda:0')
reward_raw: tensor(0.5827, device='cuda:0') reward: tensor(1.2141, device='cuda:0') factor: tensor(0.3686, device='cuda:0')
reward_raw: tensor(0.6109, device='cuda:0') reward: tensor(1.2374, device='cuda:0') factor: tensor(0.3736, device='cuda:0')
reward_raw: tensor(0.5119, device='cuda:0') reward: tensor(1.1714, device='cuda:0') factor: tensor(0.3405, device='cuda:0')


Epoch 1, Step 45: Loss = 0.1862, Actor Loss = 0.1169, Critic Loss = 0.1395, Entropy = 0.4897, Returns = 1.2172, Value = 1.2308, mask_loss = 0.3682std_loss = 0.2529:   1%|          | 45/5475 [11:48<22:02:36, 14.61s/it]  

ratio tensor(0.9295, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4891, device='cuda:0') reward: tensor(1.1520, device='cuda:0') factor: tensor(0.3370, device='cuda:0')
reward_raw: tensor(0.4869, device='cuda:0') reward: tensor(1.1498, device='cuda:0') factor: tensor(0.3371, device='cuda:0')
reward_raw: tensor(0.4977, device='cuda:0') reward: tensor(1.1629, device='cuda:0') factor: tensor(0.3348, device='cuda:0')
reward_raw: tensor(0.5166, device='cuda:0') reward: tensor(1.1675, device='cuda:0') factor: tensor(0.3491, device='cuda:0')
reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.1903, device='cuda:0') factor: tensor(0.3404, device='cuda:0')


Epoch 1, Step 46: Loss = 0.0260, Actor Loss = -0.0220, Critic Loss = 0.0969, Entropy = 0.4969, Returns = 1.1493, Value = 1.1102, mask_loss = 0.3394std_loss = 0.2399:   1%|          | 46/5475 [12:04<22:42:51, 15.06s/it]

ratio tensor(0.9754, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5046, device='cuda:0') reward: tensor(1.1447, device='cuda:0') factor: tensor(0.3599, device='cuda:0')
reward_raw: tensor(0.4307, device='cuda:0') reward: tensor(1.0829, device='cuda:0') factor: tensor(0.3478, device='cuda:0')
reward_raw: tensor(0.4456, device='cuda:0') reward: tensor(1.0977, device='cuda:0') factor: tensor(0.3478, device='cuda:0')
reward_raw: tensor(0.4641, device='cuda:0') reward: tensor(1.1113, device='cuda:0') factor: tensor(0.3528, device='cuda:0')
reward_raw: tensor(0.3984, device='cuda:0') reward: tensor(1.0493, device='cuda:0') factor: tensor(0.3491, device='cuda:0')


Epoch 1, Step 47: Loss = 0.0564, Actor Loss = 0.0114, Critic Loss = 0.0910, Entropy = 0.5013, Returns = 1.1117, Value = 1.1731, mask_loss = 0.3272std_loss = 0.2270:   1%|          | 47/5475 [12:19<22:45:51, 15.10s/it] 

ratio tensor(1.1646, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5199, device='cuda:0') reward: tensor(1.1740, device='cuda:0') factor: tensor(0.3459, device='cuda:0')
reward_raw: tensor(0.5398, device='cuda:0') reward: tensor(1.1938, device='cuda:0') factor: tensor(0.3460, device='cuda:0')
reward_raw: tensor(0.5044, device='cuda:0') reward: tensor(1.1658, device='cuda:0') factor: tensor(0.3386, device='cuda:0')
reward_raw: tensor(0.5358, device='cuda:0') reward: tensor(1.1855, device='cuda:0') factor: tensor(0.3503, device='cuda:0')
reward_raw: tensor(0.5130, device='cuda:0') reward: tensor(1.1618, device='cuda:0') factor: tensor(0.3512, device='cuda:0')


Epoch 1, Step 48: Loss = 0.0308, Actor Loss = -0.0323, Critic Loss = 0.1272, Entropy = 0.4831, Returns = 1.3017, Value = 1.3608, mask_loss = 0.3118std_loss = 0.2282:   1%|          | 48/5475 [12:43<26:40:12, 17.69s/it]

ratio tensor(1.2605, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4909, device='cuda:0') reward: tensor(1.1928, device='cuda:0') factor: tensor(0.2981, device='cuda:0')
reward_raw: tensor(0.4787, device='cuda:0') reward: tensor(1.1746, device='cuda:0') factor: tensor(0.3041, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.1995, device='cuda:0') factor: tensor(0.3206, device='cuda:0')
reward_raw: tensor(0.4840, device='cuda:0') reward: tensor(1.1636, device='cuda:0') factor: tensor(0.3203, device='cuda:0')
reward_raw: tensor(0.4648, device='cuda:0') reward: tensor(1.1439, device='cuda:0') factor: tensor(0.3208, device='cuda:0')


Epoch 1, Step 49: Loss = 0.0810, Actor Loss = 0.0412, Critic Loss = 0.0806, Entropy = 0.5004, Returns = 1.2419, Value = 1.2327, mask_loss = 0.3278std_loss = 0.2288:   1%|          | 49/5475 [12:58<25:31:07, 16.93s/it] 

ratio tensor(0.6163, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5413, device='cuda:0') reward: tensor(1.1638, device='cuda:0') factor: tensor(0.3775, device='cuda:0')
reward_raw: tensor(0.5250, device='cuda:0') reward: tensor(1.1541, device='cuda:0') factor: tensor(0.3709, device='cuda:0')
reward_raw: tensor(0.5330, device='cuda:0') reward: tensor(1.1575, device='cuda:0') factor: tensor(0.3755, device='cuda:0')
reward_raw: tensor(0.5638, device='cuda:0') reward: tensor(1.1861, device='cuda:0') factor: tensor(0.3777, device='cuda:0')
reward_raw: tensor(0.5553, device='cuda:0') reward: tensor(1.1733, device='cuda:0') factor: tensor(0.3820, device='cuda:0')


Epoch 1, Step 50: Loss = -0.1478, Actor Loss = -0.1969, Critic Loss = 0.0992, Entropy = 0.4955, Returns = 1.1424, Value = 1.0681, mask_loss = 0.3684std_loss = 0.2499:   1%|          | 50/5475 [13:14<25:09:26, 16.69s/it]

ratio tensor(1.1671, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5204, device='cuda:0') reward: tensor(1.2015, device='cuda:0') factor: tensor(0.3189, device='cuda:0')
reward_raw: tensor(0.5368, device='cuda:0') reward: tensor(1.2140, device='cuda:0') factor: tensor(0.3228, device='cuda:0')
reward_raw: tensor(0.5280, device='cuda:0') reward: tensor(1.2092, device='cuda:0') factor: tensor(0.3188, device='cuda:0')
reward_raw: tensor(0.5202, device='cuda:0') reward: tensor(1.1987, device='cuda:0') factor: tensor(0.3215, device='cuda:0')
reward_raw: tensor(0.5030, device='cuda:0') reward: tensor(1.1908, device='cuda:0') factor: tensor(0.3122, device='cuda:0')


Epoch 1, Step 51: Loss = 0.1383, Actor Loss = 0.0951, Critic Loss = 0.0874, Entropy = 0.4860, Returns = 1.2273, Value = 1.3533, mask_loss = 0.3142std_loss = 0.2319:   1%|          | 51/5475 [13:33<26:19:00, 17.47s/it]  

ratio tensor(1.0319, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.1841, device='cuda:0') factor: tensor(0.3402, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.1852, device='cuda:0') factor: tensor(0.3275, device='cuda:0')
reward_raw: tensor(0.5321, device='cuda:0') reward: tensor(1.2001, device='cuda:0') factor: tensor(0.3320, device='cuda:0')
reward_raw: tensor(0.4662, device='cuda:0') reward: tensor(1.1311, device='cuda:0') factor: tensor(0.3351, device='cuda:0')
reward_raw: tensor(0.5396, device='cuda:0') reward: tensor(1.1893, device='cuda:0') factor: tensor(0.3503, device='cuda:0')


Epoch 1, Step 52: Loss = 0.0056, Actor Loss = -0.0565, Critic Loss = 0.1252, Entropy = 0.4866, Returns = 1.2136, Value = 1.1525, mask_loss = 0.3411std_loss = 0.2483:   1%|          | 52/5475 [13:47<24:30:38, 16.27s/it]

ratio tensor(1.0813, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5398, device='cuda:0') reward: tensor(1.1999, device='cuda:0') factor: tensor(0.3400, device='cuda:0')
reward_raw: tensor(0.5702, device='cuda:0') reward: tensor(1.2410, device='cuda:0') factor: tensor(0.3291, device='cuda:0')
reward_raw: tensor(0.5738, device='cuda:0') reward: tensor(1.2317, device='cuda:0') factor: tensor(0.3421, device='cuda:0')
reward_raw: tensor(0.5577, device='cuda:0') reward: tensor(1.2284, device='cuda:0') factor: tensor(0.3293, device='cuda:0')
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.1864, device='cuda:0') factor: tensor(0.3238, device='cuda:0')


Epoch 1, Step 53: Loss = 0.0652, Actor Loss = 0.0171, Critic Loss = 0.0971, Entropy = 0.4704, Returns = 1.2378, Value = 1.2241, mask_loss = 0.3355std_loss = 0.2551:   1%|          | 53/5475 [14:03<24:20:20, 16.16s/it] 

ratio tensor(0.9858, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5485, device='cuda:0') reward: tensor(1.2076, device='cuda:0') factor: tensor(0.3409, device='cuda:0')
reward_raw: tensor(0.5705, device='cuda:0') reward: tensor(1.2244, device='cuda:0') factor: tensor(0.3461, device='cuda:0')
reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.1790, device='cuda:0') factor: tensor(0.3361, device='cuda:0')
reward_raw: tensor(0.5148, device='cuda:0') reward: tensor(1.1649, device='cuda:0') factor: tensor(0.3499, device='cuda:0')
reward_raw: tensor(0.5997, device='cuda:0') reward: tensor(1.2376, device='cuda:0') factor: tensor(0.3621, device='cuda:0')


Epoch 1, Step 54: Loss = -0.0754, Actor Loss = -0.1079, Critic Loss = 0.0659, Entropy = 0.4690, Returns = 1.1738, Value = 1.2335, mask_loss = 0.3312std_loss = 0.2586:   1%|          | 54/5475 [14:16<22:58:01, 15.25s/it]

ratio tensor(1.2889, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5362, device='cuda:0') reward: tensor(1.2079, device='cuda:0') factor: tensor(0.3282, device='cuda:0')
reward_raw: tensor(0.4877, device='cuda:0') reward: tensor(1.1624, device='cuda:0') factor: tensor(0.3253, device='cuda:0')
reward_raw: tensor(0.5045, device='cuda:0') reward: tensor(1.1842, device='cuda:0') factor: tensor(0.3203, device='cuda:0')
reward_raw: tensor(0.5062, device='cuda:0') reward: tensor(1.1807, device='cuda:0') factor: tensor(0.3255, device='cuda:0')
reward_raw: tensor(0.5264, device='cuda:0') reward: tensor(1.1913, device='cuda:0') factor: tensor(0.3351, device='cuda:0')


Epoch 1, Step 55: Loss = 0.2117, Actor Loss = 0.1492, Critic Loss = 0.1259, Entropy = 0.4674, Returns = 1.1458, Value = 1.1570, mask_loss = 0.3256std_loss = 0.2523:   1%|          | 55/5475 [14:31<22:57:47, 15.25s/it]  

ratio tensor(0.7181, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4850, device='cuda:0') reward: tensor(1.1860, device='cuda:0') factor: tensor(0.2991, device='cuda:0')
reward_raw: tensor(0.4685, device='cuda:0') reward: tensor(1.1893, device='cuda:0') factor: tensor(0.2792, device='cuda:0')
reward_raw: tensor(0.4743, device='cuda:0') reward: tensor(1.1866, device='cuda:0') factor: tensor(0.2877, device='cuda:0')
reward_raw: tensor(0.5164, device='cuda:0') reward: tensor(1.2232, device='cuda:0') factor: tensor(0.2932, device='cuda:0')
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.2255, device='cuda:0') factor: tensor(0.2940, device='cuda:0')


Epoch 1, Step 56: Loss = 0.1309, Actor Loss = 0.0623, Critic Loss = 0.1383, Entropy = 0.4782, Returns = 1.2283, Value = 1.0952, mask_loss = 0.3206std_loss = 0.2390:   1%|          | 56/5475 [14:45<22:21:51, 14.86s/it]

ratio tensor(0.9802, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4719, device='cuda:0') reward: tensor(1.1413, device='cuda:0') factor: tensor(0.3306, device='cuda:0')
reward_raw: tensor(0.4744, device='cuda:0') reward: tensor(1.1505, device='cuda:0') factor: tensor(0.3239, device='cuda:0')
reward_raw: tensor(0.5040, device='cuda:0') reward: tensor(1.1808, device='cuda:0') factor: tensor(0.3232, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.1755, device='cuda:0') factor: tensor(0.3457, device='cuda:0')
reward_raw: tensor(0.4839, device='cuda:0') reward: tensor(1.1560, device='cuda:0') factor: tensor(0.3278, device='cuda:0')


Epoch 1, Step 57: Loss = -0.5172, Actor Loss = -0.6494, Critic Loss = 0.2652, Entropy = 0.4598, Returns = 1.1538, Value = 1.0342, mask_loss = 0.3005std_loss = 0.2438:   1%|          | 57/5475 [15:08<25:58:55, 17.26s/it]

ratio tensor(6.7739, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.1652, device='cuda:0') factor: tensor(0.3369, device='cuda:0')
reward_raw: tensor(0.5526, device='cuda:0') reward: tensor(1.2096, device='cuda:0') factor: tensor(0.3429, device='cuda:0')
reward_raw: tensor(0.4780, device='cuda:0') reward: tensor(1.1256, device='cuda:0') factor: tensor(0.3524, device='cuda:0')
reward_raw: tensor(0.5244, device='cuda:0') reward: tensor(1.1799, device='cuda:0') factor: tensor(0.3446, device='cuda:0')
reward_raw: tensor(0.5250, device='cuda:0') reward: tensor(1.1857, device='cuda:0') factor: tensor(0.3394, device='cuda:0')


Epoch 1, Step 58: Loss = 0.3573, Actor Loss = 0.2966, Critic Loss = 0.1224, Entropy = 0.4744, Returns = 1.1470, Value = 1.3040, mask_loss = 0.3423std_loss = 0.2627:   1%|          | 58/5475 [15:21<23:57:29, 15.92s/it]  

ratio tensor(1.0930, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5612, device='cuda:0') reward: tensor(1.2312, device='cuda:0') factor: tensor(0.3299, device='cuda:0')
reward_raw: tensor(0.5705, device='cuda:0') reward: tensor(1.2329, device='cuda:0') factor: tensor(0.3376, device='cuda:0')
reward_raw: tensor(0.5662, device='cuda:0') reward: tensor(1.2361, device='cuda:0') factor: tensor(0.3301, device='cuda:0')
reward_raw: tensor(0.5367, device='cuda:0') reward: tensor(1.2065, device='cuda:0') factor: tensor(0.3302, device='cuda:0')
reward_raw: tensor(0.5488, device='cuda:0') reward: tensor(1.2216, device='cuda:0') factor: tensor(0.3273, device='cuda:0')


Epoch 1, Step 59: Loss = -0.2081, Actor Loss = -0.2748, Critic Loss = 0.1343, Entropy = 0.4787, Returns = 1.2224, Value = 0.9493, mask_loss = 0.3563std_loss = 0.2531:   1%|          | 59/5475 [15:38<24:27:08, 16.25s/it]

ratio tensor(1.3975, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5359, device='cuda:0') reward: tensor(1.2013, device='cuda:0') factor: tensor(0.3346, device='cuda:0')
reward_raw: tensor(0.4822, device='cuda:0') reward: tensor(1.1549, device='cuda:0') factor: tensor(0.3273, device='cuda:0')
reward_raw: tensor(0.5126, device='cuda:0') reward: tensor(1.1888, device='cuda:0') factor: tensor(0.3238, device='cuda:0')
reward_raw: tensor(0.4914, device='cuda:0') reward: tensor(1.1660, device='cuda:0') factor: tensor(0.3254, device='cuda:0')
reward_raw: tensor(0.4864, device='cuda:0') reward: tensor(1.1739, device='cuda:0') factor: tensor(0.3125, device='cuda:0')


Epoch 1, Step 60: Loss = -0.0935, Actor Loss = -0.1515, Critic Loss = 0.1169, Entropy = 0.4695, Returns = 1.1955, Value = 1.2183, mask_loss = 0.3481std_loss = 0.2667:   1%|          | 60/5475 [15:54<24:31:46, 16.31s/it]

ratio tensor(1.1451, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5841, device='cuda:0') reward: tensor(1.2051, device='cuda:0') factor: tensor(0.3789, device='cuda:0')
reward_raw: tensor(0.5900, device='cuda:0') reward: tensor(1.2231, device='cuda:0') factor: tensor(0.3668, device='cuda:0')
reward_raw: tensor(0.5951, device='cuda:0') reward: tensor(1.2219, device='cuda:0') factor: tensor(0.3732, device='cuda:0')
reward_raw: tensor(0.5620, device='cuda:0') reward: tensor(1.1888, device='cuda:0') factor: tensor(0.3732, device='cuda:0')
reward_raw: tensor(0.6007, device='cuda:0') reward: tensor(1.2202, device='cuda:0') factor: tensor(0.3804, device='cuda:0')


Epoch 1, Step 61: Loss = 0.0055, Actor Loss = -0.0187, Critic Loss = 0.0492, Entropy = 0.4925, Returns = 1.2224, Value = 1.2023, mask_loss = 0.3872std_loss = 0.2627:   1%|          | 60/5475 [16:09<24:31:46, 16.31s/it] 

ratio tensor(0.9019, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 61: Loss = 0.0055, Actor Loss = -0.0187, Critic Loss = 0.0492, Entropy = 0.4925, Returns = 1.2224, Value = 1.2023, mask_loss = 0.3872std_loss = 0.2627:   1%|          | 61/5475 [16:09<23:52:43, 15.88s/it]

reward_raw: tensor(0.4888, device='cuda:0') reward: tensor(1.1610, device='cuda:0') factor: tensor(0.3278, device='cuda:0')
reward_raw: tensor(0.4548, device='cuda:0') reward: tensor(1.1253, device='cuda:0') factor: tensor(0.3296, device='cuda:0')
reward_raw: tensor(0.4650, device='cuda:0') reward: tensor(1.1373, device='cuda:0') factor: tensor(0.3277, device='cuda:0')
reward_raw: tensor(0.4938, device='cuda:0') reward: tensor(1.1568, device='cuda:0') factor: tensor(0.3370, device='cuda:0')
reward_raw: tensor(0.4741, device='cuda:0') reward: tensor(1.1509, device='cuda:0') factor: tensor(0.3232, device='cuda:0')


Epoch 1, Step 62: Loss = 0.1971, Actor Loss = 0.1047, Critic Loss = 0.1856, Entropy = 0.4651, Returns = 1.1739, Value = 1.1355, mask_loss = 0.3761std_loss = 0.2444:   1%|          | 62/5475 [16:29<25:52:35, 17.21s/it] 

ratio tensor(0.7883, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5673, device='cuda:0') reward: tensor(1.2045, device='cuda:0') factor: tensor(0.3628, device='cuda:0')
reward_raw: tensor(0.5453, device='cuda:0') reward: tensor(1.1878, device='cuda:0') factor: tensor(0.3575, device='cuda:0')
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.1940, device='cuda:0') factor: tensor(0.3651, device='cuda:0')
reward_raw: tensor(0.5546, device='cuda:0') reward: tensor(1.1897, device='cuda:0') factor: tensor(0.3649, device='cuda:0')
reward_raw: tensor(0.5364, device='cuda:0') reward: tensor(1.1602, device='cuda:0') factor: tensor(0.3762, device='cuda:0')


Epoch 1, Step 63: Loss = -0.1195, Actor Loss = -0.1637, Critic Loss = 0.0895, Entropy = 0.4954, Returns = 1.2184, Value = 1.1529, mask_loss = 0.3837std_loss = 0.2650:   1%|          | 63/5475 [16:43<24:23:07, 16.22s/it]

ratio tensor(2.1016, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5477, device='cuda:0') reward: tensor(1.1814, device='cuda:0') factor: tensor(0.3664, device='cuda:0')
reward_raw: tensor(0.5626, device='cuda:0') reward: tensor(1.1870, device='cuda:0') factor: tensor(0.3757, device='cuda:0')
reward_raw: tensor(0.5973, device='cuda:0') reward: tensor(1.2296, device='cuda:0') factor: tensor(0.3677, device='cuda:0')
reward_raw: tensor(0.5872, device='cuda:0') reward: tensor(1.2151, device='cuda:0') factor: tensor(0.3721, device='cuda:0')
reward_raw: tensor(0.5537, device='cuda:0') reward: tensor(1.1715, device='cuda:0') factor: tensor(0.3822, device='cuda:0')


Epoch 1, Step 64: Loss = -0.1675, Actor Loss = -0.2104, Critic Loss = 0.0867, Entropy = 0.4787, Returns = 1.1971, Value = 1.2878, mask_loss = 0.3725std_loss = 0.2681:   1%|          | 64/5475 [16:57<22:59:54, 15.30s/it]

ratio tensor(3.0895, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5689, device='cuda:0') reward: tensor(1.2576, device='cuda:0') factor: tensor(0.3113, device='cuda:0')
reward_raw: tensor(0.5344, device='cuda:0') reward: tensor(1.2232, device='cuda:0') factor: tensor(0.3113, device='cuda:0')
reward_raw: tensor(0.5612, device='cuda:0') reward: tensor(1.2367, device='cuda:0') factor: tensor(0.3245, device='cuda:0')
reward_raw: tensor(0.5858, device='cuda:0') reward: tensor(1.2657, device='cuda:0') factor: tensor(0.3200, device='cuda:0')
reward_raw: tensor(0.5868, device='cuda:0') reward: tensor(1.2645, device='cuda:0') factor: tensor(0.3224, device='cuda:0')


Epoch 1, Step 65: Loss = 0.1278, Actor Loss = 0.1006, Critic Loss = 0.0552, Entropy = 0.4302, Returns = 1.2770, Value = 1.2378, mask_loss = 0.3055std_loss = 0.2537:   1%|          | 65/5475 [17:13<23:28:30, 15.62s/it]  

ratio tensor(1.0434, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5658, device='cuda:0') reward: tensor(1.2532, device='cuda:0') factor: tensor(0.3127, device='cuda:0')
reward_raw: tensor(0.5429, device='cuda:0') reward: tensor(1.2321, device='cuda:0') factor: tensor(0.3109, device='cuda:0')
reward_raw: tensor(0.5533, device='cuda:0') reward: tensor(1.2413, device='cuda:0') factor: tensor(0.3120, device='cuda:0')
reward_raw: tensor(0.5645, device='cuda:0') reward: tensor(1.2483, device='cuda:0') factor: tensor(0.3162, device='cuda:0')
reward_raw: tensor(0.5427, device='cuda:0') reward: tensor(1.2351, device='cuda:0') factor: tensor(0.3076, device='cuda:0')


Epoch 1, Step 66: Loss = 0.4516, Actor Loss = 0.3939, Critic Loss = 0.1163, Entropy = 0.4461, Returns = 1.1572, Value = 1.3242, mask_loss = 0.3306std_loss = 0.2720:   1%|          | 66/5475 [17:28<23:16:44, 15.49s/it]

ratio tensor(0.5383, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5833, device='cuda:0') reward: tensor(1.2200, device='cuda:0') factor: tensor(0.3634, device='cuda:0')
reward_raw: tensor(0.5640, device='cuda:0') reward: tensor(1.1988, device='cuda:0') factor: tensor(0.3652, device='cuda:0')
reward_raw: tensor(0.5657, device='cuda:0') reward: tensor(1.2025, device='cuda:0') factor: tensor(0.3632, device='cuda:0')
reward_raw: tensor(0.5729, device='cuda:0') reward: tensor(1.2009, device='cuda:0') factor: tensor(0.3720, device='cuda:0')
reward_raw: tensor(0.5859, device='cuda:0') reward: tensor(1.2222, device='cuda:0') factor: tensor(0.3636, device='cuda:0')


Epoch 1, Step 67: Loss = 0.0489, Actor Loss = 0.0076, Critic Loss = 0.0836, Entropy = 0.4761, Returns = 1.1525, Value = 1.1805, mask_loss = 0.3592std_loss = 0.2597:   1%|          | 67/5475 [17:49<25:36:33, 17.05s/it]

ratio tensor(0.9444, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5895, device='cuda:0') reward: tensor(1.1919, device='cuda:0') factor: tensor(0.3976, device='cuda:0')
reward_raw: tensor(0.6095, device='cuda:0') reward: tensor(1.2130, device='cuda:0') factor: tensor(0.3966, device='cuda:0')
reward_raw: tensor(0.5992, device='cuda:0') reward: tensor(1.1977, device='cuda:0') factor: tensor(0.4015, device='cuda:0')
reward_raw: tensor(0.6068, device='cuda:0') reward: tensor(1.2098, device='cuda:0') factor: tensor(0.3971, device='cuda:0')
reward_raw: tensor(0.6142, device='cuda:0') reward: tensor(1.2087, device='cuda:0') factor: tensor(0.4055, device='cuda:0')


Epoch 1, Step 68: Loss = -0.0919, Actor Loss = -0.1488, Critic Loss = 0.1148, Entropy = 0.4818, Returns = 1.1681, Value = 1.3151, mask_loss = 0.3739std_loss = 0.2717:   1%|          | 68/5475 [18:05<25:11:01, 16.77s/it]

ratio tensor(2.0745, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6084, device='cuda:0') reward: tensor(1.1917, device='cuda:0') factor: tensor(0.4167, device='cuda:0')
reward_raw: tensor(0.5803, device='cuda:0') reward: tensor(1.1702, device='cuda:0') factor: tensor(0.4101, device='cuda:0')
reward_raw: tensor(0.5858, device='cuda:0') reward: tensor(1.1634, device='cuda:0') factor: tensor(0.4224, device='cuda:0')
reward_raw: tensor(0.6068, device='cuda:0') reward: tensor(1.1960, device='cuda:0') factor: tensor(0.4109, device='cuda:0')
reward_raw: tensor(0.5829, device='cuda:0') reward: tensor(1.1701, device='cuda:0') factor: tensor(0.4128, device='cuda:0')


Epoch 1, Step 69: Loss = 0.0785, Actor Loss = 0.0392, Critic Loss = 0.0796, Entropy = 0.4573, Returns = 1.2290, Value = 1.2328, mask_loss = 0.3392std_loss = 0.2667:   1%|▏         | 69/5475 [18:20<24:35:46, 16.38s/it]  

ratio tensor(0.6274, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5746, device='cuda:0') reward: tensor(1.1792, device='cuda:0') factor: tensor(0.3954, device='cuda:0')
reward_raw: tensor(0.5727, device='cuda:0') reward: tensor(1.1892, device='cuda:0') factor: tensor(0.3835, device='cuda:0')
reward_raw: tensor(0.5834, device='cuda:0') reward: tensor(1.1986, device='cuda:0') factor: tensor(0.3848, device='cuda:0')
reward_raw: tensor(0.5998, device='cuda:0') reward: tensor(1.2180, device='cuda:0') factor: tensor(0.3818, device='cuda:0')
reward_raw: tensor(0.5822, device='cuda:0') reward: tensor(1.2017, device='cuda:0') factor: tensor(0.3805, device='cuda:0')


Epoch 1, Step 70: Loss = 0.3356, Actor Loss = 0.2837, Critic Loss = 0.1048, Entropy = 0.4580, Returns = 1.1350, Value = 1.2833, mask_loss = 0.4061std_loss = 0.2761:   1%|▏         | 70/5475 [18:37<24:55:32, 16.60s/it]

ratio tensor(0.7112, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5456, device='cuda:0') reward: tensor(1.1965, device='cuda:0') factor: tensor(0.3491, device='cuda:0')
reward_raw: tensor(0.5435, device='cuda:0') reward: tensor(1.2020, device='cuda:0') factor: tensor(0.3415, device='cuda:0')
reward_raw: tensor(0.5499, device='cuda:0') reward: tensor(1.1973, device='cuda:0') factor: tensor(0.3526, device='cuda:0')
reward_raw: tensor(0.5314, device='cuda:0') reward: tensor(1.1845, device='cuda:0') factor: tensor(0.3469, device='cuda:0')
reward_raw: tensor(0.5330, device='cuda:0') reward: tensor(1.1830, device='cuda:0') factor: tensor(0.3500, device='cuda:0')


Epoch 1, Step 71: Loss = -0.1539, Actor Loss = -0.2051, Critic Loss = 0.1033, Entropy = 0.4565, Returns = 1.2459, Value = 1.0695, mask_loss = 0.3260std_loss = 0.2600:   1%|▏         | 71/5475 [19:00<27:41:27, 18.45s/it]

ratio tensor(1.1332, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5793, device='cuda:0') reward: tensor(1.2027, device='cuda:0') factor: tensor(0.3766, device='cuda:0')
reward_raw: tensor(0.5715, device='cuda:0') reward: tensor(1.2111, device='cuda:0') factor: tensor(0.3604, device='cuda:0')
reward_raw: tensor(0.5686, device='cuda:0') reward: tensor(1.2108, device='cuda:0') factor: tensor(0.3577, device='cuda:0')
reward_raw: tensor(0.5912, device='cuda:0') reward: tensor(1.2086, device='cuda:0') factor: tensor(0.3826, device='cuda:0')
reward_raw: tensor(0.5635, device='cuda:0') reward: tensor(1.1879, device='cuda:0') factor: tensor(0.3756, device='cuda:0')


Epoch 1, Step 72: Loss = 0.0236, Actor Loss = -0.0511, Critic Loss = 0.1502, Entropy = 0.4511, Returns = 1.1664, Value = 1.3072, mask_loss = 0.3800std_loss = 0.2833:   1%|▏         | 72/5475 [19:15<25:51:00, 17.22s/it] 

ratio tensor(2.0306, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5649, device='cuda:0') reward: tensor(1.1954, device='cuda:0') factor: tensor(0.3695, device='cuda:0')
reward_raw: tensor(0.5436, device='cuda:0') reward: tensor(1.1893, device='cuda:0') factor: tensor(0.3543, device='cuda:0')
reward_raw: tensor(0.5883, device='cuda:0') reward: tensor(1.2213, device='cuda:0') factor: tensor(0.3670, device='cuda:0')
reward_raw: tensor(0.5625, device='cuda:0') reward: tensor(1.1904, device='cuda:0') factor: tensor(0.3721, device='cuda:0')
reward_raw: tensor(0.5541, device='cuda:0') reward: tensor(1.1829, device='cuda:0') factor: tensor(0.3712, device='cuda:0')


Epoch 1, Step 73: Loss = 0.2240, Actor Loss = 0.1226, Critic Loss = 0.2038, Entropy = 0.4468, Returns = 1.2719, Value = 1.0942, mask_loss = 0.3353std_loss = 0.2786:   1%|▏         | 73/5475 [19:27<23:26:58, 15.63s/it] 

ratio tensor(0.1833, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4708, device='cuda:0') reward: tensor(1.1488, device='cuda:0') factor: tensor(0.3220, device='cuda:0')
reward_raw: tensor(0.4744, device='cuda:0') reward: tensor(1.1490, device='cuda:0') factor: tensor(0.3254, device='cuda:0')
reward_raw: tensor(0.4712, device='cuda:0') reward: tensor(1.1310, device='cuda:0') factor: tensor(0.3402, device='cuda:0')
reward_raw: tensor(0.4536, device='cuda:0') reward: tensor(1.1242, device='cuda:0') factor: tensor(0.3294, device='cuda:0')
reward_raw: tensor(0.4543, device='cuda:0') reward: tensor(1.1199, device='cuda:0') factor: tensor(0.3344, device='cuda:0')


Epoch 1, Step 74: Loss = 0.2310, Actor Loss = 0.1988, Critic Loss = 0.0654, Entropy = 0.4874, Returns = 1.1056, Value = 1.1133, mask_loss = 0.3666std_loss = 0.2579:   1%|▏         | 74/5475 [19:43<24:03:11, 16.03s/it]

ratio tensor(0.6004, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5661, device='cuda:0') reward: tensor(1.2530, device='cuda:0') factor: tensor(0.3131, device='cuda:0')
reward_raw: tensor(0.5780, device='cuda:0') reward: tensor(1.2627, device='cuda:0') factor: tensor(0.3153, device='cuda:0')
reward_raw: tensor(0.5712, device='cuda:0') reward: tensor(1.2478, device='cuda:0') factor: tensor(0.3234, device='cuda:0')
reward_raw: tensor(0.5522, device='cuda:0') reward: tensor(1.2357, device='cuda:0') factor: tensor(0.3164, device='cuda:0')
reward_raw: tensor(0.5632, device='cuda:0') reward: tensor(1.2400, device='cuda:0') factor: tensor(0.3232, device='cuda:0')


Epoch 1, Step 75: Loss = -0.4997, Actor Loss = -0.5351, Critic Loss = 0.0718, Entropy = 0.4482, Returns = 1.2829, Value = 1.2227, mask_loss = 0.3044std_loss = 0.2515:   1%|▏         | 75/5475 [19:58<23:15:45, 15.51s/it]

ratio tensor(1.9377, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4632, device='cuda:0') reward: tensor(1.1259, device='cuda:0') factor: tensor(0.3373, device='cuda:0')
reward_raw: tensor(0.4737, device='cuda:0') reward: tensor(1.1370, device='cuda:0') factor: tensor(0.3366, device='cuda:0')
reward_raw: tensor(0.4971, device='cuda:0') reward: tensor(1.1580, device='cuda:0') factor: tensor(0.3391, device='cuda:0')
reward_raw: tensor(0.4913, device='cuda:0') reward: tensor(1.1479, device='cuda:0') factor: tensor(0.3433, device='cuda:0')
reward_raw: tensor(0.4719, device='cuda:0') reward: tensor(1.1424, device='cuda:0') factor: tensor(0.3295, device='cuda:0')


Epoch 1, Step 76: Loss = 0.4424, Actor Loss = 0.3093, Critic Loss = 0.2671, Entropy = 0.4825, Returns = 1.1218, Value = 1.3996, mask_loss = 0.3500std_loss = 0.2578:   1%|▏         | 76/5475 [20:11<22:24:14, 14.94s/it]  

ratio tensor(0.9225, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5925, device='cuda:0') reward: tensor(1.2199, device='cuda:0') factor: tensor(0.3726, device='cuda:0')
reward_raw: tensor(0.5413, device='cuda:0') reward: tensor(1.1883, device='cuda:0') factor: tensor(0.3530, device='cuda:0')
reward_raw: tensor(0.5528, device='cuda:0') reward: tensor(1.1829, device='cuda:0') factor: tensor(0.3699, device='cuda:0')
reward_raw: tensor(0.5972, device='cuda:0') reward: tensor(1.2260, device='cuda:0') factor: tensor(0.3712, device='cuda:0')
reward_raw: tensor(0.5438, device='cuda:0') reward: tensor(1.1997, device='cuda:0') factor: tensor(0.3441, device='cuda:0')


Epoch 1, Step 77: Loss = -0.1432, Actor Loss = -0.2503, Critic Loss = 0.2153, Entropy = 0.4867, Returns = 1.1500, Value = 0.9004, mask_loss = 0.3769std_loss = 0.2638:   1%|▏         | 77/5475 [20:27<22:46:37, 15.19s/it]

ratio tensor(1.2250, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5542, device='cuda:0') reward: tensor(1.2451, device='cuda:0') factor: tensor(0.3092, device='cuda:0')
reward_raw: tensor(0.5308, device='cuda:0') reward: tensor(1.2242, device='cuda:0') factor: tensor(0.3066, device='cuda:0')
reward_raw: tensor(0.5400, device='cuda:0') reward: tensor(1.2258, device='cuda:0') factor: tensor(0.3142, device='cuda:0')
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.2271, device='cuda:0') factor: tensor(0.3236, device='cuda:0')
reward_raw: tensor(0.5403, device='cuda:0') reward: tensor(1.2308, device='cuda:0') factor: tensor(0.3095, device='cuda:0')


Epoch 1, Step 78: Loss = 0.1312, Actor Loss = 0.0772, Critic Loss = 0.1088, Entropy = 0.4713, Returns = 1.2409, Value = 1.3627, mask_loss = 0.3187std_loss = 0.2400:   1%|▏         | 78/5475 [20:48<25:28:11, 16.99s/it]  

ratio tensor(0.7834, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5058, device='cuda:0') reward: tensor(1.1533, device='cuda:0') factor: tensor(0.3525, device='cuda:0')
reward_raw: tensor(0.5440, device='cuda:0') reward: tensor(1.1967, device='cuda:0') factor: tensor(0.3473, device='cuda:0')
reward_raw: tensor(0.5567, device='cuda:0') reward: tensor(1.2076, device='cuda:0') factor: tensor(0.3491, device='cuda:0')
reward_raw: tensor(0.5225, device='cuda:0') reward: tensor(1.1748, device='cuda:0') factor: tensor(0.3477, device='cuda:0')
reward_raw: tensor(0.5646, device='cuda:0') reward: tensor(1.2155, device='cuda:0') factor: tensor(0.3490, device='cuda:0')


Epoch 1, Step 79: Loss = 0.1918, Actor Loss = 0.0753, Critic Loss = 0.2340, Entropy = 0.4872, Returns = 1.2465, Value = 1.3527, mask_loss = 0.3610std_loss = 0.2566:   1%|▏         | 79/5475 [21:04<24:44:43, 16.51s/it]

ratio tensor(1.3306, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5652, device='cuda:0') reward: tensor(1.1955, device='cuda:0') factor: tensor(0.3697, device='cuda:0')
reward_raw: tensor(0.5635, device='cuda:0') reward: tensor(1.2133, device='cuda:0') factor: tensor(0.3502, device='cuda:0')
reward_raw: tensor(0.5450, device='cuda:0') reward: tensor(1.1824, device='cuda:0') factor: tensor(0.3626, device='cuda:0')
reward_raw: tensor(0.5687, device='cuda:0') reward: tensor(1.2130, device='cuda:0') factor: tensor(0.3557, device='cuda:0')
reward_raw: tensor(0.5435, device='cuda:0') reward: tensor(1.1957, device='cuda:0') factor: tensor(0.3478, device='cuda:0')


Epoch 1, Step 80: Loss = -0.0177, Actor Loss = -0.0572, Critic Loss = 0.0800, Entropy = 0.5035, Returns = 1.1921, Value = 1.2283, mask_loss = 0.3897std_loss = 0.2512:   1%|▏         | 80/5475 [21:21<25:13:42, 16.83s/it]

ratio tensor(1.9982, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5290, device='cuda:0') reward: tensor(1.2264, device='cuda:0') factor: tensor(0.3026, device='cuda:0')
reward_raw: tensor(0.5451, device='cuda:0') reward: tensor(1.2219, device='cuda:0') factor: tensor(0.3232, device='cuda:0')
reward_raw: tensor(0.5445, device='cuda:0') reward: tensor(1.2106, device='cuda:0') factor: tensor(0.3338, device='cuda:0')
reward_raw: tensor(0.5434, device='cuda:0') reward: tensor(1.2106, device='cuda:0') factor: tensor(0.3328, device='cuda:0')
reward_raw: tensor(0.5372, device='cuda:0') reward: tensor(1.2091, device='cuda:0') factor: tensor(0.3281, device='cuda:0')


Epoch 1, Step 81: Loss = -0.1262, Actor Loss = -0.1615, Critic Loss = 0.0715, Entropy = 0.4584, Returns = 1.2088, Value = 1.1900, mask_loss = 0.3041std_loss = 0.2443:   1%|▏         | 80/5475 [21:41<25:13:42, 16.83s/it]

ratio tensor(1.2760, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 81: Loss = -0.1262, Actor Loss = -0.1615, Critic Loss = 0.0715, Entropy = 0.4584, Returns = 1.2088, Value = 1.1900, mask_loss = 0.3041std_loss = 0.2443:   1%|▏         | 81/5475 [21:41<26:30:34, 17.69s/it]

reward_raw: tensor(0.5636, device='cuda:0') reward: tensor(1.2533, device='cuda:0') factor: tensor(0.3103, device='cuda:0')
reward_raw: tensor(0.5602, device='cuda:0') reward: tensor(1.2270, device='cuda:0') factor: tensor(0.3333, device='cuda:0')
reward_raw: tensor(0.5923, device='cuda:0') reward: tensor(1.2466, device='cuda:0') factor: tensor(0.3456, device='cuda:0')
reward_raw: tensor(0.5437, device='cuda:0') reward: tensor(1.2221, device='cuda:0') factor: tensor(0.3216, device='cuda:0')
reward_raw: tensor(0.5560, device='cuda:0') reward: tensor(1.2273, device='cuda:0') factor: tensor(0.3287, device='cuda:0')


Epoch 1, Step 82: Loss = 0.0218, Actor Loss = -0.0659, Critic Loss = 0.1763, Entropy = 0.4681, Returns = 1.2862, Value = 1.2466, mask_loss = 0.3373std_loss = 0.2620:   1%|▏         | 82/5475 [21:54<24:29:39, 16.35s/it] 

ratio tensor(0.9880, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5603, device='cuda:0') reward: tensor(1.2356, device='cuda:0') factor: tensor(0.3247, device='cuda:0')
reward_raw: tensor(0.5370, device='cuda:0') reward: tensor(1.2279, device='cuda:0') factor: tensor(0.3090, device='cuda:0')
reward_raw: tensor(0.4852, device='cuda:0') reward: tensor(1.1707, device='cuda:0') factor: tensor(0.3145, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.1736, device='cuda:0') factor: tensor(0.3106, device='cuda:0')
reward_raw: tensor(0.5035, device='cuda:0') reward: tensor(1.1840, device='cuda:0') factor: tensor(0.3195, device='cuda:0')


Epoch 1, Step 83: Loss = 0.2751, Actor Loss = 0.2254, Critic Loss = 0.1003, Entropy = 0.4914, Returns = 1.2311, Value = 1.2909, mask_loss = 0.3534std_loss = 0.2508:   2%|▏         | 83/5475 [22:13<25:33:11, 17.06s/it] 

ratio tensor(0.9432, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5157, device='cuda:0') reward: tensor(1.2251, device='cuda:0') factor: tensor(0.2907, device='cuda:0')
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.2066, device='cuda:0') factor: tensor(0.3067, device='cuda:0')
reward_raw: tensor(0.5522, device='cuda:0') reward: tensor(1.2538, device='cuda:0') factor: tensor(0.2983, device='cuda:0')
reward_raw: tensor(0.5455, device='cuda:0') reward: tensor(1.2486, device='cuda:0') factor: tensor(0.2969, device='cuda:0')
reward_raw: tensor(0.5708, device='cuda:0') reward: tensor(1.2606, device='cuda:0') factor: tensor(0.3102, device='cuda:0')


Epoch 1, Step 84: Loss = -0.0890, Actor Loss = -0.1328, Critic Loss = 0.0884, Entropy = 0.4682, Returns = 1.2058, Value = 1.2904, mask_loss = 0.2936std_loss = 0.2285:   2%|▏         | 84/5475 [22:29<25:04:12, 16.74s/it]

ratio tensor(1.3599, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5052, device='cuda:0') reward: tensor(1.1672, device='cuda:0') factor: tensor(0.3380, device='cuda:0')
reward_raw: tensor(0.4571, device='cuda:0') reward: tensor(1.1487, device='cuda:0') factor: tensor(0.3085, device='cuda:0')
reward_raw: tensor(0.4980, device='cuda:0') reward: tensor(1.1699, device='cuda:0') factor: tensor(0.3281, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.1793, device='cuda:0') factor: tensor(0.3289, device='cuda:0')
reward_raw: tensor(0.4906, device='cuda:0') reward: tensor(1.1599, device='cuda:0') factor: tensor(0.3306, device='cuda:0')


Epoch 1, Step 85: Loss = 0.0106, Actor Loss = -0.0373, Critic Loss = 0.0966, Entropy = 0.4794, Returns = 1.1265, Value = 1.2061, mask_loss = 0.3263std_loss = 0.2469:   2%|▏         | 85/5475 [22:43<24:00:02, 16.03s/it] 

ratio tensor(1.2160, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5287, device='cuda:0') reward: tensor(1.2150, device='cuda:0') factor: tensor(0.3137, device='cuda:0')
reward_raw: tensor(0.5405, device='cuda:0') reward: tensor(1.2254, device='cuda:0') factor: tensor(0.3151, device='cuda:0')
reward_raw: tensor(0.5748, device='cuda:0') reward: tensor(1.2551, device='cuda:0') factor: tensor(0.3197, device='cuda:0')
reward_raw: tensor(0.5407, device='cuda:0') reward: tensor(1.2082, device='cuda:0') factor: tensor(0.3325, device='cuda:0')
reward_raw: tensor(0.5533, device='cuda:0') reward: tensor(1.2435, device='cuda:0') factor: tensor(0.3097, device='cuda:0')


Epoch 1, Step 86: Loss = 0.1024, Actor Loss = 0.0702, Critic Loss = 0.0654, Entropy = 0.4757, Returns = 1.2169, Value = 1.1733, mask_loss = 0.3135std_loss = 0.2362:   2%|▏         | 86/5475 [23:04<26:05:53, 17.43s/it] 

ratio tensor(1.1229, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5425, device='cuda:0') reward: tensor(1.1640, device='cuda:0') factor: tensor(0.3786, device='cuda:0')
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(1.1208, device='cuda:0') factor: tensor(0.3781, device='cuda:0')
reward_raw: tensor(0.5423, device='cuda:0') reward: tensor(1.1605, device='cuda:0') factor: tensor(0.3818, device='cuda:0')
reward_raw: tensor(0.5304, device='cuda:0') reward: tensor(1.1625, device='cuda:0') factor: tensor(0.3679, device='cuda:0')
reward_raw: tensor(0.5262, device='cuda:0') reward: tensor(1.1474, device='cuda:0') factor: tensor(0.3789, device='cuda:0')


Epoch 1, Step 87: Loss = 0.0901, Actor Loss = 0.0416, Critic Loss = 0.0980, Entropy = 0.5014, Returns = 1.1759, Value = 1.0827, mask_loss = 0.3976std_loss = 0.2594:   2%|▏         | 87/5475 [23:18<24:36:23, 16.44s/it]

ratio tensor(0.8544, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.2025, device='cuda:0') factor: tensor(0.3568, device='cuda:0')
reward_raw: tensor(0.5532, device='cuda:0') reward: tensor(1.2034, device='cuda:0') factor: tensor(0.3498, device='cuda:0')
reward_raw: tensor(0.5755, device='cuda:0') reward: tensor(1.2128, device='cuda:0') factor: tensor(0.3627, device='cuda:0')
reward_raw: tensor(0.5600, device='cuda:0') reward: tensor(1.2115, device='cuda:0') factor: tensor(0.3484, device='cuda:0')
reward_raw: tensor(0.5619, device='cuda:0') reward: tensor(1.2031, device='cuda:0') factor: tensor(0.3588, device='cuda:0')


Epoch 1, Step 88: Loss = -0.2045, Actor Loss = -0.2439, Critic Loss = 0.0797, Entropy = 0.5017, Returns = 1.2175, Value = 1.1401, mask_loss = 0.3623std_loss = 0.2387:   2%|▏         | 88/5475 [23:36<25:04:08, 16.75s/it]

ratio tensor(1.3640, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5536, device='cuda:0') reward: tensor(1.1963, device='cuda:0') factor: tensor(0.3573, device='cuda:0')
reward_raw: tensor(0.5573, device='cuda:0') reward: tensor(1.2073, device='cuda:0') factor: tensor(0.3500, device='cuda:0')
reward_raw: tensor(0.5699, device='cuda:0') reward: tensor(1.2047, device='cuda:0') factor: tensor(0.3652, device='cuda:0')
reward_raw: tensor(0.5293, device='cuda:0') reward: tensor(1.1748, device='cuda:0') factor: tensor(0.3545, device='cuda:0')
reward_raw: tensor(0.6098, device='cuda:0') reward: tensor(1.2567, device='cuda:0') factor: tensor(0.3531, device='cuda:0')


Epoch 1, Step 89: Loss = 0.1367, Actor Loss = 0.0873, Critic Loss = 0.0998, Entropy = 0.4914, Returns = 1.1752, Value = 1.2725, mask_loss = 0.3349std_loss = 0.2430:   2%|▏         | 89/5475 [23:47<22:50:02, 15.26s/it]  

ratio tensor(0.7537, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5402, device='cuda:0') reward: tensor(1.2434, device='cuda:0') factor: tensor(0.2968, device='cuda:0')
reward_raw: tensor(0.5458, device='cuda:0') reward: tensor(1.2336, device='cuda:0') factor: tensor(0.3122, device='cuda:0')
reward_raw: tensor(0.5745, device='cuda:0') reward: tensor(1.2700, device='cuda:0') factor: tensor(0.3044, device='cuda:0')
reward_raw: tensor(0.5518, device='cuda:0') reward: tensor(1.2462, device='cuda:0') factor: tensor(0.3055, device='cuda:0')
reward_raw: tensor(0.5654, device='cuda:0') reward: tensor(1.2616, device='cuda:0') factor: tensor(0.3038, device='cuda:0')


Epoch 1, Step 90: Loss = -0.0321, Actor Loss = -0.1040, Critic Loss = 0.1448, Entropy = 0.4774, Returns = 1.2589, Value = 1.1173, mask_loss = 0.3041std_loss = 0.2316:   2%|▏         | 90/5475 [24:05<24:04:04, 16.09s/it]

ratio tensor(1.1664, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4948, device='cuda:0') reward: tensor(1.1730, device='cuda:0') factor: tensor(0.3218, device='cuda:0')
reward_raw: tensor(0.5261, device='cuda:0') reward: tensor(1.2050, device='cuda:0') factor: tensor(0.3211, device='cuda:0')
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.1969, device='cuda:0') factor: tensor(0.3142, device='cuda:0')
reward_raw: tensor(0.5671, device='cuda:0') reward: tensor(1.2447, device='cuda:0') factor: tensor(0.3225, device='cuda:0')
reward_raw: tensor(0.5649, device='cuda:0') reward: tensor(1.2287, device='cuda:0') factor: tensor(0.3362, device='cuda:0')


Epoch 1, Step 91: Loss = -0.1006, Actor Loss = -0.1457, Critic Loss = 0.0910, Entropy = 0.4651, Returns = 1.2661, Value = 1.1290, mask_loss = 0.3158std_loss = 0.2479:   2%|▏         | 91/5475 [24:22<24:09:37, 16.15s/it]

ratio tensor(1.1734, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4973, device='cuda:0') reward: tensor(1.1834, device='cuda:0') factor: tensor(0.3138, device='cuda:0')
reward_raw: tensor(0.5178, device='cuda:0') reward: tensor(1.2013, device='cuda:0') factor: tensor(0.3165, device='cuda:0')
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.1702, device='cuda:0') factor: tensor(0.3131, device='cuda:0')
reward_raw: tensor(0.5430, device='cuda:0') reward: tensor(1.2211, device='cuda:0') factor: tensor(0.3220, device='cuda:0')
reward_raw: tensor(0.5219, device='cuda:0') reward: tensor(1.2086, device='cuda:0') factor: tensor(0.3133, device='cuda:0')


Epoch 1, Step 92: Loss = -0.1640, Actor Loss = -0.1895, Critic Loss = 0.0519, Entropy = 0.4801, Returns = 1.1977, Value = 1.1738, mask_loss = 0.3240std_loss = 0.2428:   2%|▏         | 92/5475 [24:38<24:24:03, 16.32s/it]

ratio tensor(1.1955, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4719, device='cuda:0') reward: tensor(1.1442, device='cuda:0') factor: tensor(0.3277, device='cuda:0')
reward_raw: tensor(0.4890, device='cuda:0') reward: tensor(1.1665, device='cuda:0') factor: tensor(0.3225, device='cuda:0')
reward_raw: tensor(0.5349, device='cuda:0') reward: tensor(1.2062, device='cuda:0') factor: tensor(0.3287, device='cuda:0')
reward_raw: tensor(0.4593, device='cuda:0') reward: tensor(1.1386, device='cuda:0') factor: tensor(0.3206, device='cuda:0')
reward_raw: tensor(0.5018, device='cuda:0') reward: tensor(1.1771, device='cuda:0') factor: tensor(0.3247, device='cuda:0')


Epoch 1, Step 93: Loss = -0.0127, Actor Loss = -0.0544, Critic Loss = 0.0845, Entropy = 0.4552, Returns = 1.1904, Value = 1.2100, mask_loss = 0.3381std_loss = 0.2581:   2%|▏         | 93/5475 [24:54<23:50:32, 15.95s/it]

ratio tensor(1.2072, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5457, device='cuda:0') reward: tensor(1.2450, device='cuda:0') factor: tensor(0.3007, device='cuda:0')
reward_raw: tensor(0.5702, device='cuda:0') reward: tensor(1.2613, device='cuda:0') factor: tensor(0.3089, device='cuda:0')
reward_raw: tensor(0.6042, device='cuda:0') reward: tensor(1.2915, device='cuda:0') factor: tensor(0.3127, device='cuda:0')
reward_raw: tensor(0.5414, device='cuda:0') reward: tensor(1.2420, device='cuda:0') factor: tensor(0.2994, device='cuda:0')
reward_raw: tensor(0.6045, device='cuda:0') reward: tensor(1.2898, device='cuda:0') factor: tensor(0.3146, device='cuda:0')


Epoch 1, Step 94: Loss = -0.0696, Actor Loss = -0.1394, Critic Loss = 0.1405, Entropy = 0.4543, Returns = 1.2494, Value = 1.2483, mask_loss = 0.3069std_loss = 0.2523:   2%|▏         | 94/5475 [25:07<22:47:57, 15.25s/it]

ratio tensor(1.1879, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4804, device='cuda:0') reward: tensor(1.1872, device='cuda:0') factor: tensor(0.2932, device='cuda:0')
reward_raw: tensor(0.4804, device='cuda:0') reward: tensor(1.1810, device='cuda:0') factor: tensor(0.2994, device='cuda:0')
reward_raw: tensor(0.4635, device='cuda:0') reward: tensor(1.1730, device='cuda:0') factor: tensor(0.2905, device='cuda:0')
reward_raw: tensor(0.5269, device='cuda:0') reward: tensor(1.2325, device='cuda:0') factor: tensor(0.2944, device='cuda:0')
reward_raw: tensor(0.4551, device='cuda:0') reward: tensor(1.1606, device='cuda:0') factor: tensor(0.2945, device='cuda:0')


Epoch 1, Step 95: Loss = 0.2694, Actor Loss = 0.2145, Critic Loss = 0.1105, Entropy = 0.4284, Returns = 1.1735, Value = 1.2440, mask_loss = 0.2572std_loss = 0.2310:   2%|▏         | 95/5475 [25:22<22:35:43, 15.12s/it]  

ratio tensor(0.8446, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.2267, device='cuda:0') factor: tensor(0.2888, device='cuda:0')
reward_raw: tensor(0.5401, device='cuda:0') reward: tensor(1.2488, device='cuda:0') factor: tensor(0.2912, device='cuda:0')
reward_raw: tensor(0.5111, device='cuda:0') reward: tensor(1.2074, device='cuda:0') factor: tensor(0.3037, device='cuda:0')
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.2683, device='cuda:0') factor: tensor(0.2893, device='cuda:0')
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.2052, device='cuda:0') factor: tensor(0.2908, device='cuda:0')


Epoch 1, Step 96: Loss = 0.1478, Actor Loss = 0.0835, Critic Loss = 0.1295, Entropy = 0.4239, Returns = 1.1991, Value = 1.0640, mask_loss = 0.2672std_loss = 0.2455:   2%|▏         | 96/5475 [25:42<24:34:26, 16.45s/it]

ratio tensor(0.5904, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5737, device='cuda:0') reward: tensor(1.2841, device='cuda:0') factor: tensor(0.2896, device='cuda:0')
reward_raw: tensor(0.5682, device='cuda:0') reward: tensor(1.2740, device='cuda:0') factor: tensor(0.2942, device='cuda:0')
reward_raw: tensor(0.5663, device='cuda:0') reward: tensor(1.2697, device='cuda:0') factor: tensor(0.2966, device='cuda:0')
reward_raw: tensor(0.5621, device='cuda:0') reward: tensor(1.2685, device='cuda:0') factor: tensor(0.2935, device='cuda:0')
reward_raw: tensor(0.5402, device='cuda:0') reward: tensor(1.2403, device='cuda:0') factor: tensor(0.2999, device='cuda:0')


Epoch 1, Step 97: Loss = -0.3009, Actor Loss = -0.3719, Critic Loss = 0.1431, Entropy = 0.4491, Returns = 1.2292, Value = 1.2980, mask_loss = 0.2933std_loss = 0.2441:   2%|▏         | 97/5475 [25:55<23:22:58, 15.65s/it]

ratio tensor(1.1898, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4744, device='cuda:0') reward: tensor(1.2070, device='cuda:0') factor: tensor(0.2674, device='cuda:0')
reward_raw: tensor(0.4511, device='cuda:0') reward: tensor(1.1717, device='cuda:0') factor: tensor(0.2795, device='cuda:0')
reward_raw: tensor(0.4507, device='cuda:0') reward: tensor(1.1790, device='cuda:0') factor: tensor(0.2717, device='cuda:0')
reward_raw: tensor(0.4823, device='cuda:0') reward: tensor(1.1935, device='cuda:0') factor: tensor(0.2888, device='cuda:0')
reward_raw: tensor(0.4796, device='cuda:0') reward: tensor(1.2051, device='cuda:0') factor: tensor(0.2745, device='cuda:0')


Epoch 1, Step 98: Loss = -0.0657, Actor Loss = -0.0943, Critic Loss = 0.0582, Entropy = 0.4456, Returns = 1.2222, Value = 1.1963, mask_loss = 0.2807std_loss = 0.2392:   2%|▏         | 98/5475 [26:12<24:02:02, 16.09s/it]

ratio tensor(1.6436, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4896, device='cuda:0') reward: tensor(1.1841, device='cuda:0') factor: tensor(0.3055, device='cuda:0')
reward_raw: tensor(0.4629, device='cuda:0') reward: tensor(1.1712, device='cuda:0') factor: tensor(0.2917, device='cuda:0')
reward_raw: tensor(0.4878, device='cuda:0') reward: tensor(1.2004, device='cuda:0') factor: tensor(0.2874, device='cuda:0')
reward_raw: tensor(0.4694, device='cuda:0') reward: tensor(1.1643, device='cuda:0') factor: tensor(0.3051, device='cuda:0')
reward_raw: tensor(0.4671, device='cuda:0') reward: tensor(1.1752, device='cuda:0') factor: tensor(0.2919, device='cuda:0')


Epoch 1, Step 99: Loss = 0.2221, Actor Loss = 0.2111, Critic Loss = 0.0229, Entropy = 0.4439, Returns = 1.1542, Value = 1.1185, mask_loss = 0.3008std_loss = 0.2569:   2%|▏         | 99/5475 [26:30<24:45:33, 16.58s/it]  

ratio tensor(0.6028, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4659, device='cuda:0') reward: tensor(1.2179, device='cuda:0') factor: tensor(0.2480, device='cuda:0')
reward_raw: tensor(0.5194, device='cuda:0') reward: tensor(1.2528, device='cuda:0') factor: tensor(0.2666, device='cuda:0')
reward_raw: tensor(0.4560, device='cuda:0') reward: tensor(1.1969, device='cuda:0') factor: tensor(0.2591, device='cuda:0')
reward_raw: tensor(0.5157, device='cuda:0') reward: tensor(1.2590, device='cuda:0') factor: tensor(0.2566, device='cuda:0')
reward_raw: tensor(0.4760, device='cuda:0') reward: tensor(1.2291, device='cuda:0') factor: tensor(0.2468, device='cuda:0')


Epoch 1, Step 100: Loss = -0.1966, Actor Loss = -0.2917, Critic Loss = 0.1909, Entropy = 0.3934, Returns = 1.2052, Value = 0.9932, mask_loss = 0.2413std_loss = 0.2464:   2%|▏         | 100/5475 [26:46<24:22:51, 16.33s/it]

ratio tensor(10.4002, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4965, device='cuda:0') reward: tensor(1.2304, device='cuda:0') factor: tensor(0.2661, device='cuda:0')
reward_raw: tensor(0.4890, device='cuda:0') reward: tensor(1.2273, device='cuda:0') factor: tensor(0.2617, device='cuda:0')
reward_raw: tensor(0.4611, device='cuda:0') reward: tensor(1.2096, device='cuda:0') factor: tensor(0.2514, device='cuda:0')
reward_raw: tensor(0.4844, device='cuda:0') reward: tensor(1.2345, device='cuda:0') factor: tensor(0.2499, device='cuda:0')
reward_raw: tensor(0.4734, device='cuda:0') reward: tensor(1.2226, device='cuda:0') factor: tensor(0.2508, device='cuda:0')


Epoch 1, Step 101: Loss = 0.1467, Actor Loss = 0.1029, Critic Loss = 0.0884, Entropy = 0.4314, Returns = 1.1221, Value = 1.2978, mask_loss = 0.2725std_loss = 0.2476:   2%|▏         | 100/5475 [27:08<24:22:51, 16.33s/it]  

ratio tensor(0.8172, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 101: Loss = 0.1467, Actor Loss = 0.1029, Critic Loss = 0.0884, Entropy = 0.4314, Returns = 1.1221, Value = 1.2978, mask_loss = 0.2725std_loss = 0.2476:   2%|▏         | 101/5475 [27:08<27:07:11, 18.17s/it]

reward_raw: tensor(0.4920, device='cuda:0') reward: tensor(1.1884, device='cuda:0') factor: tensor(0.3036, device='cuda:0')
reward_raw: tensor(0.5057, device='cuda:0') reward: tensor(1.2035, device='cuda:0') factor: tensor(0.3022, device='cuda:0')
reward_raw: tensor(0.5140, device='cuda:0') reward: tensor(1.2144, device='cuda:0') factor: tensor(0.2996, device='cuda:0')
reward_raw: tensor(0.4962, device='cuda:0') reward: tensor(1.2090, device='cuda:0') factor: tensor(0.2872, device='cuda:0')
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.1974, device='cuda:0') factor: tensor(0.2884, device='cuda:0')


Epoch 1, Step 102: Loss = -0.1706, Actor Loss = -0.3064, Critic Loss = 0.2723, Entropy = 0.3715, Returns = 1.2222, Value = 0.9951, mask_loss = 0.2650std_loss = 0.2778:   2%|▏         | 102/5475 [27:22<25:02:33, 16.78s/it]

ratio tensor(3.8271, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5100, device='cuda:0') reward: tensor(1.2695, device='cuda:0') factor: tensor(0.2405, device='cuda:0')
reward_raw: tensor(0.4735, device='cuda:0') reward: tensor(1.2292, device='cuda:0') factor: tensor(0.2443, device='cuda:0')
reward_raw: tensor(0.4849, device='cuda:0') reward: tensor(1.2414, device='cuda:0') factor: tensor(0.2435, device='cuda:0')
reward_raw: tensor(0.5024, device='cuda:0') reward: tensor(1.2519, device='cuda:0') factor: tensor(0.2505, device='cuda:0')
reward_raw: tensor(0.4698, device='cuda:0') reward: tensor(1.2339, device='cuda:0') factor: tensor(0.2359, device='cuda:0')


Epoch 1, Step 103: Loss = 0.1923, Actor Loss = 0.1515, Critic Loss = 0.0824, Entropy = 0.3604, Returns = 1.2270, Value = 1.3189, mask_loss = 0.2473std_loss = 0.2752:   2%|▏         | 103/5475 [27:39<25:00:22, 16.76s/it]  

ratio tensor(1.1026, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4801, device='cuda:0') reward: tensor(1.2339, device='cuda:0') factor: tensor(0.2463, device='cuda:0')
reward_raw: tensor(0.4400, device='cuda:0') reward: tensor(1.2142, device='cuda:0') factor: tensor(0.2258, device='cuda:0')
reward_raw: tensor(0.4453, device='cuda:0') reward: tensor(1.1825, device='cuda:0') factor: tensor(0.2628, device='cuda:0')
reward_raw: tensor(0.4926, device='cuda:0') reward: tensor(1.2343, device='cuda:0') factor: tensor(0.2583, device='cuda:0')
reward_raw: tensor(0.4574, device='cuda:0') reward: tensor(1.2167, device='cuda:0') factor: tensor(0.2407, device='cuda:0')


Epoch 1, Step 104: Loss = -0.1801, Actor Loss = -0.2288, Critic Loss = 0.0982, Entropy = 0.3455, Returns = 1.2426, Value = 1.2173, mask_loss = 0.2301std_loss = 0.2640:   2%|▏         | 104/5475 [27:53<23:55:54, 16.04s/it]

ratio tensor(1.8185, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5091, device='cuda:0') reward: tensor(1.3041, device='cuda:0') factor: tensor(0.2050, device='cuda:0')
reward_raw: tensor(0.5108, device='cuda:0') reward: tensor(1.2888, device='cuda:0') factor: tensor(0.2220, device='cuda:0')
reward_raw: tensor(0.5228, device='cuda:0') reward: tensor(1.3115, device='cuda:0') factor: tensor(0.2113, device='cuda:0')
reward_raw: tensor(0.5162, device='cuda:0') reward: tensor(1.3015, device='cuda:0') factor: tensor(0.2148, device='cuda:0')
reward_raw: tensor(0.5388, device='cuda:0') reward: tensor(1.3170, device='cuda:0') factor: tensor(0.2218, device='cuda:0')


Epoch 1, Step 105: Loss = 0.3536, Actor Loss = 0.2662, Critic Loss = 0.1755, Entropy = 0.3612, Returns = 1.3562, Value = 1.5295, mask_loss = 0.2200std_loss = 0.2447:   2%|▏         | 105/5475 [28:15<26:32:45, 17.80s/it]  

ratio tensor(0.6020, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4925, device='cuda:0') reward: tensor(1.2696, device='cuda:0') factor: tensor(0.2229, device='cuda:0')
reward_raw: tensor(0.5397, device='cuda:0') reward: tensor(1.3233, device='cuda:0') factor: tensor(0.2163, device='cuda:0')
reward_raw: tensor(0.5069, device='cuda:0') reward: tensor(1.2842, device='cuda:0') factor: tensor(0.2228, device='cuda:0')
reward_raw: tensor(0.4914, device='cuda:0') reward: tensor(1.2641, device='cuda:0') factor: tensor(0.2273, device='cuda:0')
reward_raw: tensor(0.4935, device='cuda:0') reward: tensor(1.2742, device='cuda:0') factor: tensor(0.2193, device='cuda:0')


Epoch 1, Step 106: Loss = -0.0709, Actor Loss = -0.1305, Critic Loss = 0.1199, Entropy = 0.3481, Returns = 1.2653, Value = 1.2182, mask_loss = 0.2429std_loss = 0.2795:   2%|▏         | 106/5475 [28:35<27:43:10, 18.59s/it]

ratio tensor(1.4578, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5193, device='cuda:0') reward: tensor(1.2728, device='cuda:0') factor: tensor(0.2465, device='cuda:0')
reward_raw: tensor(0.5543, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.2262, device='cuda:0')
reward_raw: tensor(0.5369, device='cuda:0') reward: tensor(1.2931, device='cuda:0') factor: tensor(0.2437, device='cuda:0')
reward_raw: tensor(0.5045, device='cuda:0') reward: tensor(1.2597, device='cuda:0') factor: tensor(0.2448, device='cuda:0')
reward_raw: tensor(0.5049, device='cuda:0') reward: tensor(1.2686, device='cuda:0') factor: tensor(0.2364, device='cuda:0')


Epoch 1, Step 107: Loss = 0.0126, Actor Loss = -0.0365, Critic Loss = 0.0991, Entropy = 0.3630, Returns = 1.2852, Value = 1.4015, mask_loss = 0.2553std_loss = 0.2775:   2%|▏         | 107/5475 [28:48<25:04:49, 16.82s/it] 

ratio tensor(0.8530, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4403, device='cuda:0') reward: tensor(1.2079, device='cuda:0') factor: tensor(0.2324, device='cuda:0')
reward_raw: tensor(0.4252, device='cuda:0') reward: tensor(1.1928, device='cuda:0') factor: tensor(0.2324, device='cuda:0')
reward_raw: tensor(0.4437, device='cuda:0') reward: tensor(1.2166, device='cuda:0') factor: tensor(0.2271, device='cuda:0')
reward_raw: tensor(0.4281, device='cuda:0') reward: tensor(1.1998, device='cuda:0') factor: tensor(0.2283, device='cuda:0')
reward_raw: tensor(0.4451, device='cuda:0') reward: tensor(1.2243, device='cuda:0') factor: tensor(0.2208, device='cuda:0')


Epoch 1, Step 108: Loss = 0.3273, Actor Loss = 0.2810, Critic Loss = 0.0932, Entropy = 0.3555, Returns = 1.1234, Value = 1.2072, mask_loss = 0.2302std_loss = 0.2635:   2%|▏         | 108/5475 [29:06<25:26:34, 17.07s/it] 

ratio tensor(0.7917, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4950, device='cuda:0') reward: tensor(1.2894, device='cuda:0') factor: tensor(0.2057, device='cuda:0')
reward_raw: tensor(0.5025, device='cuda:0') reward: tensor(1.3114, device='cuda:0') factor: tensor(0.1911, device='cuda:0')
reward_raw: tensor(0.4727, device='cuda:0') reward: tensor(1.2658, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.4655, device='cuda:0') reward: tensor(1.2616, device='cuda:0') factor: tensor(0.2039, device='cuda:0')
reward_raw: tensor(0.4567, device='cuda:0') reward: tensor(1.2600, device='cuda:0') factor: tensor(0.1967, device='cuda:0')


Epoch 1, Step 109: Loss = 0.1787, Actor Loss = 0.0678, Critic Loss = 0.2225, Entropy = 0.3573, Returns = 1.2506, Value = 1.2618, mask_loss = 0.2200std_loss = 0.2480:   2%|▏         | 109/5475 [29:25<26:17:39, 17.64s/it]

ratio tensor(1.1165, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5651, device='cuda:0') reward: tensor(1.2798, device='cuda:0') factor: tensor(0.2854, device='cuda:0')
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.2725, device='cuda:0') factor: tensor(0.2620, device='cuda:0')
reward_raw: tensor(0.5160, device='cuda:0') reward: tensor(1.2575, device='cuda:0') factor: tensor(0.2585, device='cuda:0')
reward_raw: tensor(0.5565, device='cuda:0') reward: tensor(1.3038, device='cuda:0') factor: tensor(0.2527, device='cuda:0')
reward_raw: tensor(0.5444, device='cuda:0') reward: tensor(1.2883, device='cuda:0') factor: tensor(0.2560, device='cuda:0')


Epoch 1, Step 110: Loss = -0.0986, Actor Loss = -0.1625, Critic Loss = 0.1285, Entropy = 0.3679, Returns = 1.2513, Value = 1.3344, mask_loss = 0.2565std_loss = 0.2726:   2%|▏         | 110/5475 [29:37<24:00:16, 16.11s/it]

ratio tensor(1.3493, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5187, device='cuda:0') reward: tensor(1.2711, device='cuda:0') factor: tensor(0.2476, device='cuda:0')
reward_raw: tensor(0.5355, device='cuda:0') reward: tensor(1.2770, device='cuda:0') factor: tensor(0.2585, device='cuda:0')
reward_raw: tensor(0.5391, device='cuda:0') reward: tensor(1.2856, device='cuda:0') factor: tensor(0.2535, device='cuda:0')
reward_raw: tensor(0.5351, device='cuda:0') reward: tensor(1.2971, device='cuda:0') factor: tensor(0.2380, device='cuda:0')
reward_raw: tensor(0.5225, device='cuda:0') reward: tensor(1.2748, device='cuda:0') factor: tensor(0.2477, device='cuda:0')


Epoch 1, Step 111: Loss = 0.0880, Actor Loss = 0.0377, Critic Loss = 0.1014, Entropy = 0.3726, Returns = 1.3019, Value = 1.3446, mask_loss = 0.2354std_loss = 0.2556:   2%|▏         | 111/5475 [29:49<22:16:52, 14.95s/it]  

ratio tensor(1.1287, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5343, device='cuda:0') reward: tensor(1.3218, device='cuda:0') factor: tensor(0.2126, device='cuda:0')
reward_raw: tensor(0.5187, device='cuda:0') reward: tensor(1.2905, device='cuda:0') factor: tensor(0.2282, device='cuda:0')
reward_raw: tensor(0.5332, device='cuda:0') reward: tensor(1.3098, device='cuda:0') factor: tensor(0.2234, device='cuda:0')
reward_raw: tensor(0.5445, device='cuda:0') reward: tensor(1.3203, device='cuda:0') factor: tensor(0.2242, device='cuda:0')
reward_raw: tensor(0.5070, device='cuda:0') reward: tensor(1.2701, device='cuda:0') factor: tensor(0.2369, device='cuda:0')


Epoch 1, Step 112: Loss = 0.1962, Actor Loss = 0.1714, Critic Loss = 0.0501, Entropy = 0.3482, Returns = 1.2720, Value = 1.2016, mask_loss = 0.2244std_loss = 0.2612:   2%|▏         | 112/5475 [30:05<22:38:26, 15.20s/it]

ratio tensor(0.9568, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4496, device='cuda:0') reward: tensor(1.2287, device='cuda:0') factor: tensor(0.2209, device='cuda:0')
reward_raw: tensor(0.4095, device='cuda:0') reward: tensor(1.1929, device='cuda:0') factor: tensor(0.2166, device='cuda:0')
reward_raw: tensor(0.4488, device='cuda:0') reward: tensor(1.2265, device='cuda:0') factor: tensor(0.2224, device='cuda:0')
reward_raw: tensor(0.4717, device='cuda:0') reward: tensor(1.2354, device='cuda:0') factor: tensor(0.2363, device='cuda:0')
reward_raw: tensor(0.4804, device='cuda:0') reward: tensor(1.2607, device='cuda:0') factor: tensor(0.2197, device='cuda:0')


Epoch 1, Step 113: Loss = -0.0100, Actor Loss = -0.0976, Critic Loss = 0.1759, Entropy = 0.3568, Returns = 1.2309, Value = 1.3195, mask_loss = 0.2442std_loss = 0.2686:   2%|▏         | 113/5475 [30:18<21:43:18, 14.58s/it]

ratio tensor(1.1307, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4437, device='cuda:0') reward: tensor(1.2347, device='cuda:0') factor: tensor(0.2090, device='cuda:0')
reward_raw: tensor(0.4159, device='cuda:0') reward: tensor(1.2036, device='cuda:0') factor: tensor(0.2123, device='cuda:0')
reward_raw: tensor(0.4202, device='cuda:0') reward: tensor(1.2137, device='cuda:0') factor: tensor(0.2065, device='cuda:0')
reward_raw: tensor(0.3983, device='cuda:0') reward: tensor(1.1908, device='cuda:0') factor: tensor(0.2075, device='cuda:0')
reward_raw: tensor(0.4188, device='cuda:0') reward: tensor(1.2098, device='cuda:0') factor: tensor(0.2090, device='cuda:0')


Epoch 1, Step 114: Loss = 0.0948, Actor Loss = 0.0371, Critic Loss = 0.1162, Entropy = 0.3526, Returns = 1.1734, Value = 1.1486, mask_loss = 0.2115std_loss = 0.2401:   2%|▏         | 114/5475 [30:35<22:51:13, 15.35s/it]  

ratio tensor(0.8636, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4907, device='cuda:0') reward: tensor(1.2593, device='cuda:0') factor: tensor(0.2315, device='cuda:0')
reward_raw: tensor(0.5010, device='cuda:0') reward: tensor(1.2714, device='cuda:0') factor: tensor(0.2296, device='cuda:0')
reward_raw: tensor(0.4665, device='cuda:0') reward: tensor(1.2449, device='cuda:0') factor: tensor(0.2217, device='cuda:0')
reward_raw: tensor(0.4858, device='cuda:0') reward: tensor(1.2482, device='cuda:0') factor: tensor(0.2376, device='cuda:0')
reward_raw: tensor(0.5143, device='cuda:0') reward: tensor(1.2839, device='cuda:0') factor: tensor(0.2305, device='cuda:0')


Epoch 1, Step 115: Loss = 0.2231, Actor Loss = 0.1617, Critic Loss = 0.1237, Entropy = 0.3699, Returns = 1.1865, Value = 1.3659, mask_loss = 0.2448std_loss = 0.2637:   2%|▏         | 115/5475 [30:53<23:47:42, 15.98s/it]

ratio tensor(0.9542, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4880, device='cuda:0') reward: tensor(1.2487, device='cuda:0') factor: tensor(0.2393, device='cuda:0')
reward_raw: tensor(0.4944, device='cuda:0') reward: tensor(1.2594, device='cuda:0') factor: tensor(0.2350, device='cuda:0')
reward_raw: tensor(0.4755, device='cuda:0') reward: tensor(1.2389, device='cuda:0') factor: tensor(0.2366, device='cuda:0')
reward_raw: tensor(0.5024, device='cuda:0') reward: tensor(1.2642, device='cuda:0') factor: tensor(0.2381, device='cuda:0')
reward_raw: tensor(0.4722, device='cuda:0') reward: tensor(1.2509, device='cuda:0') factor: tensor(0.2213, device='cuda:0')


Epoch 1, Step 116: Loss = -0.0400, Actor Loss = -0.0928, Critic Loss = 0.1064, Entropy = 0.3926, Returns = 1.2901, Value = 1.2572, mask_loss = 0.2676std_loss = 0.2715:   2%|▏         | 116/5475 [31:08<23:11:12, 15.58s/it]

ratio tensor(0.7665, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4505, device='cuda:0') reward: tensor(1.2032, device='cuda:0') factor: tensor(0.2474, device='cuda:0')
reward_raw: tensor(0.4990, device='cuda:0') reward: tensor(1.2471, device='cuda:0') factor: tensor(0.2519, device='cuda:0')
reward_raw: tensor(0.4775, device='cuda:0') reward: tensor(1.2220, device='cuda:0') factor: tensor(0.2556, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.2468, device='cuda:0') factor: tensor(0.2464, device='cuda:0')
reward_raw: tensor(0.4723, device='cuda:0') reward: tensor(1.2204, device='cuda:0') factor: tensor(0.2518, device='cuda:0')


Epoch 1, Step 117: Loss = -0.1415, Actor Loss = -0.2150, Critic Loss = 0.1477, Entropy = 0.3909, Returns = 1.2014, Value = 1.1290, mask_loss = 0.2432std_loss = 0.2493:   2%|▏         | 117/5475 [31:25<23:50:36, 16.02s/it]

ratio tensor(1.5075, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.2388, device='cuda:0') factor: tensor(0.2614, device='cuda:0')
reward_raw: tensor(0.4725, device='cuda:0') reward: tensor(1.2053, device='cuda:0') factor: tensor(0.2673, device='cuda:0')
reward_raw: tensor(0.4684, device='cuda:0') reward: tensor(1.2048, device='cuda:0') factor: tensor(0.2636, device='cuda:0')
reward_raw: tensor(0.5083, device='cuda:0') reward: tensor(1.2469, device='cuda:0') factor: tensor(0.2614, device='cuda:0')
reward_raw: tensor(0.4708, device='cuda:0') reward: tensor(1.2165, device='cuda:0') factor: tensor(0.2542, device='cuda:0')


Epoch 1, Step 118: Loss = 0.1267, Actor Loss = 0.0724, Critic Loss = 0.1093, Entropy = 0.3741, Returns = 1.2556, Value = 1.2682, mask_loss = 0.2410std_loss = 0.2591:   2%|▏         | 118/5475 [31:40<23:27:56, 15.77s/it]  

ratio tensor(0.9393, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5722, device='cuda:0') reward: tensor(1.3484, device='cuda:0') factor: tensor(0.2238, device='cuda:0')
reward_raw: tensor(0.5368, device='cuda:0') reward: tensor(1.3019, device='cuda:0') factor: tensor(0.2349, device='cuda:0')
reward_raw: tensor(0.5086, device='cuda:0') reward: tensor(1.2895, device='cuda:0') factor: tensor(0.2191, device='cuda:0')
reward_raw: tensor(0.5164, device='cuda:0') reward: tensor(1.2878, device='cuda:0') factor: tensor(0.2286, device='cuda:0')
reward_raw: tensor(0.5272, device='cuda:0') reward: tensor(1.2898, device='cuda:0') factor: tensor(0.2375, device='cuda:0')


Epoch 1, Step 119: Loss = 0.2048, Actor Loss = 0.1747, Critic Loss = 0.0609, Entropy = 0.3655, Returns = 1.3186, Value = 1.2939, mask_loss = 0.2304std_loss = 0.2544:   2%|▏         | 119/5475 [31:53<22:25:52, 15.08s/it]

ratio tensor(0.9172, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5713, device='cuda:0') reward: tensor(1.3161, device='cuda:0') factor: tensor(0.2552, device='cuda:0')
reward_raw: tensor(0.5428, device='cuda:0') reward: tensor(1.2963, device='cuda:0') factor: tensor(0.2465, device='cuda:0')
reward_raw: tensor(0.5316, device='cuda:0') reward: tensor(1.2963, device='cuda:0') factor: tensor(0.2353, device='cuda:0')
reward_raw: tensor(0.5612, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.2556, device='cuda:0')
reward_raw: tensor(0.5610, device='cuda:0') reward: tensor(1.3110, device='cuda:0') factor: tensor(0.2500, device='cuda:0')


Epoch 1, Step 120: Loss = -0.0920, Actor Loss = -0.1429, Critic Loss = 0.1027, Entropy = 0.3824, Returns = 1.2991, Value = 1.3107, mask_loss = 0.2576std_loss = 0.2615:   2%|▏         | 120/5475 [32:12<23:56:51, 16.10s/it]

ratio tensor(1.2257, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5412, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.2255, device='cuda:0')
reward_raw: tensor(0.5612, device='cuda:0') reward: tensor(1.3243, device='cuda:0') factor: tensor(0.2369, device='cuda:0')
reward_raw: tensor(0.5542, device='cuda:0') reward: tensor(1.3338, device='cuda:0') factor: tensor(0.2205, device='cuda:0')
reward_raw: tensor(0.5755, device='cuda:0') reward: tensor(1.3415, device='cuda:0') factor: tensor(0.2340, device='cuda:0')
reward_raw: tensor(0.5455, device='cuda:0') reward: tensor(1.3094, device='cuda:0') factor: tensor(0.2361, device='cuda:0')


Epoch 1, Step 121: Loss = 0.0575, Actor Loss = 0.0063, Critic Loss = 0.1032, Entropy = 0.3693, Returns = 1.3092, Value = 1.4620, mask_loss = 0.2452std_loss = 0.2620:   2%|▏         | 120/5475 [32:33<23:56:51, 16.10s/it]  

ratio tensor(1.3490, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 121: Loss = 0.0575, Actor Loss = 0.0063, Critic Loss = 0.1032, Entropy = 0.3693, Returns = 1.3092, Value = 1.4620, mask_loss = 0.2452std_loss = 0.2620:   2%|▏         | 121/5475 [32:34<26:33:46, 17.86s/it]

reward_raw: tensor(0.5162, device='cuda:0') reward: tensor(1.2611, device='cuda:0') factor: tensor(0.2551, device='cuda:0')
reward_raw: tensor(0.5457, device='cuda:0') reward: tensor(1.2922, device='cuda:0') factor: tensor(0.2535, device='cuda:0')
reward_raw: tensor(0.5096, device='cuda:0') reward: tensor(1.2524, device='cuda:0') factor: tensor(0.2572, device='cuda:0')
reward_raw: tensor(0.5332, device='cuda:0') reward: tensor(1.2959, device='cuda:0') factor: tensor(0.2373, device='cuda:0')
reward_raw: tensor(0.5059, device='cuda:0') reward: tensor(1.2551, device='cuda:0') factor: tensor(0.2508, device='cuda:0')


Epoch 1, Step 122: Loss = 0.3382, Actor Loss = 0.2412, Critic Loss = 0.1948, Entropy = 0.3866, Returns = 1.2276, Value = 1.3371, mask_loss = 0.2721std_loss = 0.2709:   2%|▏         | 122/5475 [32:49<25:35:32, 17.21s/it]

ratio tensor(0.8732, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5315, device='cuda:0') reward: tensor(1.2845, device='cuda:0') factor: tensor(0.2470, device='cuda:0')
reward_raw: tensor(0.5224, device='cuda:0') reward: tensor(1.2857, device='cuda:0') factor: tensor(0.2366, device='cuda:0')
reward_raw: tensor(0.5349, device='cuda:0') reward: tensor(1.2922, device='cuda:0') factor: tensor(0.2427, device='cuda:0')
reward_raw: tensor(0.5921, device='cuda:0') reward: tensor(1.3507, device='cuda:0') factor: tensor(0.2413, device='cuda:0')
reward_raw: tensor(0.5787, device='cuda:0') reward: tensor(1.3399, device='cuda:0') factor: tensor(0.2388, device='cuda:0')


Epoch 1, Step 123: Loss = -0.1314, Actor Loss = -0.2065, Critic Loss = 0.1508, Entropy = 0.3328, Returns = 1.3198, Value = 1.2236, mask_loss = 0.2160std_loss = 0.2588:   2%|▏         | 123/5475 [33:05<24:55:44, 16.77s/it]

ratio tensor(1.2913, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5248, device='cuda:0') reward: tensor(1.2526, device='cuda:0') factor: tensor(0.2722, device='cuda:0')
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.2794, device='cuda:0') factor: tensor(0.2498, device='cuda:0')
reward_raw: tensor(0.5135, device='cuda:0') reward: tensor(1.2658, device='cuda:0') factor: tensor(0.2477, device='cuda:0')
reward_raw: tensor(0.5301, device='cuda:0') reward: tensor(1.2772, device='cuda:0') factor: tensor(0.2529, device='cuda:0')
reward_raw: tensor(0.5494, device='cuda:0') reward: tensor(1.2863, device='cuda:0') factor: tensor(0.2631, device='cuda:0')


Epoch 1, Step 124: Loss = -0.0648, Actor Loss = -0.1181, Critic Loss = 0.1074, Entropy = 0.3492, Returns = 1.2796, Value = 1.2979, mask_loss = 0.2602std_loss = 0.2858:   2%|▏         | 124/5475 [33:18<23:17:53, 15.67s/it]

ratio tensor(1.0293, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5472, device='cuda:0') reward: tensor(1.2855, device='cuda:0') factor: tensor(0.2617, device='cuda:0')
reward_raw: tensor(0.5450, device='cuda:0') reward: tensor(1.2869, device='cuda:0') factor: tensor(0.2581, device='cuda:0')
reward_raw: tensor(0.5317, device='cuda:0') reward: tensor(1.2713, device='cuda:0') factor: tensor(0.2604, device='cuda:0')
reward_raw: tensor(0.5521, device='cuda:0') reward: tensor(1.3077, device='cuda:0') factor: tensor(0.2443, device='cuda:0')
reward_raw: tensor(0.5648, device='cuda:0') reward: tensor(1.3124, device='cuda:0') factor: tensor(0.2524, device='cuda:0')


Epoch 1, Step 125: Loss = 0.2477, Actor Loss = 0.1884, Critic Loss = 0.1193, Entropy = 0.3501, Returns = 1.2435, Value = 1.1357, mask_loss = 0.2589std_loss = 0.2825:   2%|▏         | 125/5475 [33:33<22:55:00, 15.42s/it]  

ratio tensor(0.7610, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4971, device='cuda:0') reward: tensor(1.2726, device='cuda:0') factor: tensor(0.2246, device='cuda:0')
reward_raw: tensor(0.4696, device='cuda:0') reward: tensor(1.2534, device='cuda:0') factor: tensor(0.2162, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.2750, device='cuda:0') factor: tensor(0.2331, device='cuda:0')
reward_raw: tensor(0.5142, device='cuda:0') reward: tensor(1.2958, device='cuda:0') factor: tensor(0.2184, device='cuda:0')
reward_raw: tensor(0.4822, device='cuda:0') reward: tensor(1.2639, device='cuda:0') factor: tensor(0.2184, device='cuda:0')


Epoch 1, Step 126: Loss = -0.1093, Actor Loss = -0.1553, Critic Loss = 0.0926, Entropy = 0.3201, Returns = 1.3322, Value = 1.2468, mask_loss = 0.1989std_loss = 0.2536:   2%|▏         | 126/5475 [33:48<22:46:18, 15.33s/it]

ratio tensor(1.3279, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5940, device='cuda:0') reward: tensor(1.3687, device='cuda:0') factor: tensor(0.2253, device='cuda:0')
reward_raw: tensor(0.5707, device='cuda:0') reward: tensor(1.3447, device='cuda:0') factor: tensor(0.2260, device='cuda:0')
reward_raw: tensor(0.6152, device='cuda:0') reward: tensor(1.3878, device='cuda:0') factor: tensor(0.2274, device='cuda:0')
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.3453, device='cuda:0') factor: tensor(0.2206, device='cuda:0')
reward_raw: tensor(0.5556, device='cuda:0') reward: tensor(1.3296, device='cuda:0') factor: tensor(0.2259, device='cuda:0')


Epoch 1, Step 127: Loss = -0.1356, Actor Loss = -0.2188, Critic Loss = 0.1670, Entropy = 0.3216, Returns = 1.3170, Value = 1.2823, mask_loss = 0.2561std_loss = 0.2948:   2%|▏         | 127/5475 [34:06<23:43:00, 15.96s/it]

ratio tensor(1.1569, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4262, device='cuda:0') reward: tensor(1.1876, device='cuda:0') factor: tensor(0.2386, device='cuda:0')
reward_raw: tensor(0.4167, device='cuda:0') reward: tensor(1.1804, device='cuda:0') factor: tensor(0.2362, device='cuda:0')
reward_raw: tensor(0.4714, device='cuda:0') reward: tensor(1.2360, device='cuda:0') factor: tensor(0.2354, device='cuda:0')
reward_raw: tensor(0.4725, device='cuda:0') reward: tensor(1.2305, device='cuda:0') factor: tensor(0.2420, device='cuda:0')
reward_raw: tensor(0.5066, device='cuda:0') reward: tensor(1.2506, device='cuda:0') factor: tensor(0.2560, device='cuda:0')


Epoch 1, Step 128: Loss = 0.3373, Actor Loss = 0.2363, Critic Loss = 0.2027, Entropy = 0.3456, Returns = 1.1695, Value = 1.3232, mask_loss = 0.2278std_loss = 0.2632:   2%|▏         | 128/5475 [34:27<26:06:22, 17.58s/it]  

ratio tensor(0.6423, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5198, device='cuda:0') reward: tensor(1.3204, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5120, device='cuda:0') reward: tensor(1.3204, device='cuda:0') factor: tensor(0.1915, device='cuda:0')
reward_raw: tensor(0.5159, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.1891, device='cuda:0')
reward_raw: tensor(0.5422, device='cuda:0') reward: tensor(1.3472, device='cuda:0') factor: tensor(0.1950, device='cuda:0')
reward_raw: tensor(0.5031, device='cuda:0') reward: tensor(1.3086, device='cuda:0') factor: tensor(0.1945, device='cuda:0')


Epoch 1, Step 129: Loss = 0.0876, Actor Loss = 0.0006, Critic Loss = 0.1746, Entropy = 0.3006, Returns = 1.2948, Value = 1.3650, mask_loss = 0.1888std_loss = 0.2514:   2%|▏         | 129/5475 [34:45<26:29:39, 17.84s/it]

ratio tensor(1.0537, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4924, device='cuda:0') reward: tensor(1.2559, device='cuda:0') factor: tensor(0.2366, device='cuda:0')
reward_raw: tensor(0.5134, device='cuda:0') reward: tensor(1.2669, device='cuda:0') factor: tensor(0.2464, device='cuda:0')
reward_raw: tensor(0.5004, device='cuda:0') reward: tensor(1.2512, device='cuda:0') factor: tensor(0.2492, device='cuda:0')
reward_raw: tensor(0.5119, device='cuda:0') reward: tensor(1.2546, device='cuda:0') factor: tensor(0.2573, device='cuda:0')
reward_raw: tensor(0.4766, device='cuda:0') reward: tensor(1.2434, device='cuda:0') factor: tensor(0.2332, device='cuda:0')


Epoch 1, Step 130: Loss = -0.0476, Actor Loss = -0.0927, Critic Loss = 0.0907, Entropy = 0.3295, Returns = 1.2498, Value = 1.2829, mask_loss = 0.2355std_loss = 0.2805:   2%|▏         | 130/5475 [35:02<25:47:45, 17.37s/it]

ratio tensor(1.0565, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4805, device='cuda:0') reward: tensor(1.2848, device='cuda:0') factor: tensor(0.1957, device='cuda:0')
reward_raw: tensor(0.4314, device='cuda:0') reward: tensor(1.2394, device='cuda:0') factor: tensor(0.1920, device='cuda:0')
reward_raw: tensor(0.4463, device='cuda:0') reward: tensor(1.2497, device='cuda:0') factor: tensor(0.1966, device='cuda:0')
reward_raw: tensor(0.4266, device='cuda:0') reward: tensor(1.2295, device='cuda:0') factor: tensor(0.1971, device='cuda:0')
reward_raw: tensor(0.4119, device='cuda:0') reward: tensor(1.2116, device='cuda:0') factor: tensor(0.2004, device='cuda:0')


Epoch 1, Step 131: Loss = 0.1940, Actor Loss = 0.1138, Critic Loss = 0.1610, Entropy = 0.3040, Returns = 1.1881, Value = 1.4436, mask_loss = 0.1923std_loss = 0.2574:   2%|▏         | 131/5475 [35:15<23:50:47, 16.06s/it]  

ratio tensor(1.2230, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4691, device='cuda:0') reward: tensor(1.2620, device='cuda:0') factor: tensor(0.2071, device='cuda:0')
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.2468, device='cuda:0') factor: tensor(0.2187, device='cuda:0')
reward_raw: tensor(0.4387, device='cuda:0') reward: tensor(1.2325, device='cuda:0') factor: tensor(0.2062, device='cuda:0')
reward_raw: tensor(0.4727, device='cuda:0') reward: tensor(1.2616, device='cuda:0') factor: tensor(0.2111, device='cuda:0')
reward_raw: tensor(0.4523, device='cuda:0') reward: tensor(1.2347, device='cuda:0') factor: tensor(0.2176, device='cuda:0')


Epoch 1, Step 132: Loss = 0.0441, Actor Loss = -0.0099, Critic Loss = 0.1085, Entropy = 0.2906, Returns = 1.2554, Value = 1.2293, mask_loss = 0.1957std_loss = 0.2693:   2%|▏         | 132/5475 [35:37<26:23:52, 17.79s/it]

ratio tensor(1.0255, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5000, device='cuda:0') reward: tensor(1.2768, device='cuda:0') factor: tensor(0.2232, device='cuda:0')
reward_raw: tensor(0.4790, device='cuda:0') reward: tensor(1.2628, device='cuda:0') factor: tensor(0.2162, device='cuda:0')
reward_raw: tensor(0.4904, device='cuda:0') reward: tensor(1.2654, device='cuda:0') factor: tensor(0.2249, device='cuda:0')
reward_raw: tensor(0.4702, device='cuda:0') reward: tensor(1.2470, device='cuda:0') factor: tensor(0.2232, device='cuda:0')
reward_raw: tensor(0.4831, device='cuda:0') reward: tensor(1.2689, device='cuda:0') factor: tensor(0.2141, device='cuda:0')


Epoch 1, Step 133: Loss = 0.0882, Actor Loss = 0.0643, Critic Loss = 0.0485, Entropy = 0.3168, Returns = 1.2849, Value = 1.1827, mask_loss = 0.2292std_loss = 0.2817:   2%|▏         | 133/5475 [35:55<26:53:21, 18.12s/it] 

ratio tensor(0.8869, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4628, device='cuda:0') reward: tensor(1.2539, device='cuda:0') factor: tensor(0.2089, device='cuda:0')
reward_raw: tensor(0.4530, device='cuda:0') reward: tensor(1.2399, device='cuda:0') factor: tensor(0.2131, device='cuda:0')
reward_raw: tensor(0.4566, device='cuda:0') reward: tensor(1.2474, device='cuda:0') factor: tensor(0.2092, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.2821, device='cuda:0') factor: tensor(0.2122, device='cuda:0')
reward_raw: tensor(0.4650, device='cuda:0') reward: tensor(1.2491, device='cuda:0') factor: tensor(0.2159, device='cuda:0')


Epoch 1, Step 134: Loss = -0.3924, Actor Loss = -0.4547, Critic Loss = 0.1253, Entropy = 0.3290, Returns = 1.2328, Value = 1.1763, mask_loss = 0.2055std_loss = 0.2480:   2%|▏         | 134/5475 [36:20<29:42:34, 20.03s/it]

ratio tensor(3.2359, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5270, device='cuda:0') reward: tensor(1.3236, device='cuda:0') factor: tensor(0.2034, device='cuda:0')
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.3099, device='cuda:0') factor: tensor(0.2193, device='cuda:0')
reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(1.3183, device='cuda:0') factor: tensor(0.2033, device='cuda:0')
reward_raw: tensor(0.5214, device='cuda:0') reward: tensor(1.3220, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5297, device='cuda:0') reward: tensor(1.3329, device='cuda:0') factor: tensor(0.1968, device='cuda:0')


Epoch 1, Step 135: Loss = 0.4706, Actor Loss = 0.4034, Critic Loss = 0.1351, Entropy = 0.2864, Returns = 1.2733, Value = 1.5017, mask_loss = 0.1994std_loss = 0.2720:   2%|▏         | 135/5475 [36:33<26:45:42, 18.04s/it]  

ratio tensor(0.7169, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4099, device='cuda:0') reward: tensor(1.1878, device='cuda:0') factor: tensor(0.2221, device='cuda:0')
reward_raw: tensor(0.4313, device='cuda:0') reward: tensor(1.2197, device='cuda:0') factor: tensor(0.2116, device='cuda:0')
reward_raw: tensor(0.4552, device='cuda:0') reward: tensor(1.2385, device='cuda:0') factor: tensor(0.2167, device='cuda:0')
reward_raw: tensor(0.4191, device='cuda:0') reward: tensor(1.2055, device='cuda:0') factor: tensor(0.2136, device='cuda:0')
reward_raw: tensor(0.4290, device='cuda:0') reward: tensor(1.2147, device='cuda:0') factor: tensor(0.2143, device='cuda:0')


Epoch 1, Step 136: Loss = 0.2202, Actor Loss = 0.1306, Critic Loss = 0.1798, Entropy = 0.3028, Returns = 1.1941, Value = 0.9631, mask_loss = 0.2065std_loss = 0.2711:   2%|▏         | 136/5475 [36:48<25:17:24, 17.05s/it]

ratio tensor(0.7393, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5124, device='cuda:0') reward: tensor(1.2820, device='cuda:0') factor: tensor(0.2304, device='cuda:0')
reward_raw: tensor(0.4753, device='cuda:0') reward: tensor(1.2392, device='cuda:0') factor: tensor(0.2361, device='cuda:0')
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.2676, device='cuda:0') factor: tensor(0.2308, device='cuda:0')
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.2656, device='cuda:0') factor: tensor(0.2189, device='cuda:0')
reward_raw: tensor(0.4622, device='cuda:0') reward: tensor(1.2303, device='cuda:0') factor: tensor(0.2319, device='cuda:0')


Epoch 1, Step 137: Loss = -0.3446, Actor Loss = -0.3989, Critic Loss = 0.1093, Entropy = 0.3022, Returns = 1.3445, Value = 1.3546, mask_loss = 0.2218std_loss = 0.2857:   3%|▎         | 137/5475 [37:03<24:11:47, 16.32s/it]

ratio tensor(1.8272, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5275, device='cuda:0') reward: tensor(1.3069, device='cuda:0') factor: tensor(0.2206, device='cuda:0')
reward_raw: tensor(0.5137, device='cuda:0') reward: tensor(1.2878, device='cuda:0') factor: tensor(0.2259, device='cuda:0')
reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.2965, device='cuda:0') factor: tensor(0.2186, device='cuda:0')
reward_raw: tensor(0.5220, device='cuda:0') reward: tensor(1.3012, device='cuda:0') factor: tensor(0.2208, device='cuda:0')
reward_raw: tensor(0.5152, device='cuda:0') reward: tensor(1.2822, device='cuda:0') factor: tensor(0.2330, device='cuda:0')


Epoch 1, Step 138: Loss = 0.1709, Actor Loss = 0.0979, Critic Loss = 0.1466, Entropy = 0.2995, Returns = 1.2060, Value = 1.3850, mask_loss = 0.2198std_loss = 0.2816:   3%|▎         | 138/5475 [37:18<23:55:10, 16.13s/it]  

ratio tensor(0.9104, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4379, device='cuda:0') reward: tensor(1.2294, device='cuda:0') factor: tensor(0.2086, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.2848, device='cuda:0') factor: tensor(0.2084, device='cuda:0')
reward_raw: tensor(0.4897, device='cuda:0') reward: tensor(1.2884, device='cuda:0') factor: tensor(0.2013, device='cuda:0')
reward_raw: tensor(0.4559, device='cuda:0') reward: tensor(1.2511, device='cuda:0') factor: tensor(0.2047, device='cuda:0')
reward_raw: tensor(0.4551, device='cuda:0') reward: tensor(1.2566, device='cuda:0') factor: tensor(0.1985, device='cuda:0')


Epoch 1, Step 139: Loss = 0.1925, Actor Loss = 0.1520, Critic Loss = 0.0817, Entropy = 0.3141, Returns = 1.2441, Value = 1.2098, mask_loss = 0.2203std_loss = 0.2766:   3%|▎         | 139/5475 [37:42<27:17:18, 18.41s/it]

ratio tensor(0.8133, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5460, device='cuda:0') reward: tensor(1.2923, device='cuda:0') factor: tensor(0.2537, device='cuda:0')
reward_raw: tensor(0.5369, device='cuda:0') reward: tensor(1.2804, device='cuda:0') factor: tensor(0.2564, device='cuda:0')
reward_raw: tensor(0.5459, device='cuda:0') reward: tensor(1.2957, device='cuda:0') factor: tensor(0.2502, device='cuda:0')
reward_raw: tensor(0.5599, device='cuda:0') reward: tensor(1.3052, device='cuda:0') factor: tensor(0.2546, device='cuda:0')
reward_raw: tensor(0.5805, device='cuda:0') reward: tensor(1.3296, device='cuda:0') factor: tensor(0.2509, device='cuda:0')


Epoch 1, Step 140: Loss = -0.4106, Actor Loss = -0.4771, Critic Loss = 0.1336, Entropy = 0.3034, Returns = 1.3333, Value = 1.2882, mask_loss = 0.2635std_loss = 0.3160:   3%|▎         | 140/5475 [37:55<24:42:56, 16.68s/it]

ratio tensor(1.4997, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4818, device='cuda:0') reward: tensor(1.2623, device='cuda:0') factor: tensor(0.2195, device='cuda:0')
reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.2896, device='cuda:0') factor: tensor(0.2255, device='cuda:0')
reward_raw: tensor(0.5042, device='cuda:0') reward: tensor(1.2865, device='cuda:0') factor: tensor(0.2176, device='cuda:0')
reward_raw: tensor(0.4931, device='cuda:0') reward: tensor(1.2702, device='cuda:0') factor: tensor(0.2229, device='cuda:0')
reward_raw: tensor(0.4841, device='cuda:0') reward: tensor(1.2647, device='cuda:0') factor: tensor(0.2194, device='cuda:0')


Epoch 1, Step 141: Loss = 0.1546, Actor Loss = 0.1079, Critic Loss = 0.0938, Entropy = 0.2868, Returns = 1.3689, Value = 1.3395, mask_loss = 0.2179std_loss = 0.2908:   3%|▎         | 140/5475 [38:11<24:42:56, 16.68s/it]  

ratio tensor(0.9812, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 141: Loss = 0.1546, Actor Loss = 0.1079, Critic Loss = 0.0938, Entropy = 0.2868, Returns = 1.3689, Value = 1.3395, mask_loss = 0.2179std_loss = 0.2908:   3%|▎         | 141/5475 [38:11<24:30:03, 16.54s/it]

reward_raw: tensor(0.6320, device='cuda:0') reward: tensor(1.3995, device='cuda:0') factor: tensor(0.2325, device='cuda:0')
reward_raw: tensor(0.6126, device='cuda:0') reward: tensor(1.3863, device='cuda:0') factor: tensor(0.2263, device='cuda:0')
reward_raw: tensor(0.6119, device='cuda:0') reward: tensor(1.3807, device='cuda:0') factor: tensor(0.2312, device='cuda:0')
reward_raw: tensor(0.6055, device='cuda:0') reward: tensor(1.3715, device='cuda:0') factor: tensor(0.2340, device='cuda:0')
reward_raw: tensor(0.5908, device='cuda:0') reward: tensor(1.3752, device='cuda:0') factor: tensor(0.2156, device='cuda:0')


Epoch 1, Step 142: Loss = -0.0896, Actor Loss = -0.1437, Critic Loss = 0.1087, Entropy = 0.2798, Returns = 1.3269, Value = 1.1201, mask_loss = 0.2066std_loss = 0.2804:   3%|▎         | 142/5475 [38:26<23:42:50, 16.01s/it]

ratio tensor(0.9565, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5377, device='cuda:0') reward: tensor(1.3366, device='cuda:0') factor: tensor(0.2011, device='cuda:0')
reward_raw: tensor(0.5008, device='cuda:0') reward: tensor(1.2940, device='cuda:0') factor: tensor(0.2068, device='cuda:0')
reward_raw: tensor(0.5239, device='cuda:0') reward: tensor(1.3248, device='cuda:0') factor: tensor(0.1991, device='cuda:0')
reward_raw: tensor(0.5099, device='cuda:0') reward: tensor(1.3101, device='cuda:0') factor: tensor(0.1998, device='cuda:0')
reward_raw: tensor(0.5161, device='cuda:0') reward: tensor(1.3139, device='cuda:0') factor: tensor(0.2022, device='cuda:0')


Epoch 1, Step 143: Loss = 0.0301, Actor Loss = -0.0257, Critic Loss = 0.1119, Entropy = 0.2500, Returns = 1.2786, Value = 1.3884, mask_loss = 0.1981std_loss = 0.2928:   3%|▎         | 143/5475 [38:39<22:37:17, 15.27s/it] 

ratio tensor(2.6716, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4716, device='cuda:0') reward: tensor(1.2530, device='cuda:0') factor: tensor(0.2186, device='cuda:0')
reward_raw: tensor(0.4536, device='cuda:0') reward: tensor(1.2462, device='cuda:0') factor: tensor(0.2074, device='cuda:0')
reward_raw: tensor(0.4945, device='cuda:0') reward: tensor(1.2727, device='cuda:0') factor: tensor(0.2218, device='cuda:0')
reward_raw: tensor(0.4803, device='cuda:0') reward: tensor(1.2679, device='cuda:0') factor: tensor(0.2124, device='cuda:0')
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.2926, device='cuda:0') factor: tensor(0.2034, device='cuda:0')


Epoch 1, Step 144: Loss = 0.2778, Actor Loss = 0.2073, Critic Loss = 0.1415, Entropy = 0.2508, Returns = 1.2174, Value = 1.1424, mask_loss = 0.2006std_loss = 0.2865:   3%|▎         | 144/5475 [38:53<21:53:46, 14.79s/it] 

ratio tensor(0.7724, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4993, device='cuda:0') reward: tensor(1.2835, device='cuda:0') factor: tensor(0.2157, device='cuda:0')
reward_raw: tensor(0.4873, device='cuda:0') reward: tensor(1.2729, device='cuda:0') factor: tensor(0.2144, device='cuda:0')
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.2972, device='cuda:0') factor: tensor(0.2242, device='cuda:0')
reward_raw: tensor(0.5160, device='cuda:0') reward: tensor(1.3033, device='cuda:0') factor: tensor(0.2127, device='cuda:0')
reward_raw: tensor(0.5116, device='cuda:0') reward: tensor(1.2885, device='cuda:0') factor: tensor(0.2232, device='cuda:0')


Epoch 1, Step 145: Loss = -0.0856, Actor Loss = -0.1150, Critic Loss = 0.0593, Entropy = 0.2712, Returns = 1.3064, Value = 1.2069, mask_loss = 0.2320std_loss = 0.3078:   3%|▎         | 145/5475 [39:10<23:03:57, 15.58s/it]

ratio tensor(1.3404, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5462, device='cuda:0') reward: tensor(1.3548, device='cuda:0') factor: tensor(0.1914, device='cuda:0')
reward_raw: tensor(0.5393, device='cuda:0') reward: tensor(1.3368, device='cuda:0') factor: tensor(0.2025, device='cuda:0')
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.3314, device='cuda:0') factor: tensor(0.2198, device='cuda:0')
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.3494, device='cuda:0') factor: tensor(0.2099, device='cuda:0')
reward_raw: tensor(0.5385, device='cuda:0') reward: tensor(1.3420, device='cuda:0') factor: tensor(0.1965, device='cuda:0')


Epoch 1, Step 146: Loss = -0.1218, Actor Loss = -0.1802, Critic Loss = 0.1175, Entropy = 0.2676, Returns = 1.3188, Value = 1.4789, mask_loss = 0.2307std_loss = 0.3045:   3%|▎         | 146/5475 [39:25<22:47:20, 15.40s/it]

ratio tensor(1.4356, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5437, device='cuda:0') reward: tensor(1.3007, device='cuda:0') factor: tensor(0.2430, device='cuda:0')
reward_raw: tensor(0.5500, device='cuda:0') reward: tensor(1.3049, device='cuda:0') factor: tensor(0.2451, device='cuda:0')
reward_raw: tensor(0.5466, device='cuda:0') reward: tensor(1.2986, device='cuda:0') factor: tensor(0.2480, device='cuda:0')
reward_raw: tensor(0.5468, device='cuda:0') reward: tensor(1.3088, device='cuda:0') factor: tensor(0.2380, device='cuda:0')
reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(1.3307, device='cuda:0') factor: tensor(0.2415, device='cuda:0')


Epoch 1, Step 147: Loss = -0.0808, Actor Loss = -0.0933, Critic Loss = 0.0254, Entropy = 0.2686, Returns = 1.4251, Value = 1.3383, mask_loss = 0.2385std_loss = 0.3170:   3%|▎         | 147/5475 [39:38<21:38:25, 14.62s/it]

ratio tensor(1.6538, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4923, device='cuda:0') reward: tensor(1.2192, device='cuda:0') factor: tensor(0.2731, device='cuda:0')
reward_raw: tensor(0.5310, device='cuda:0') reward: tensor(1.2626, device='cuda:0') factor: tensor(0.2684, device='cuda:0')
reward_raw: tensor(0.4947, device='cuda:0') reward: tensor(1.2340, device='cuda:0') factor: tensor(0.2607, device='cuda:0')
reward_raw: tensor(0.5193, device='cuda:0') reward: tensor(1.2539, device='cuda:0') factor: tensor(0.2654, device='cuda:0')
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.2222, device='cuda:0') factor: tensor(0.2741, device='cuda:0')


Epoch 1, Step 148: Loss = 0.4061, Actor Loss = 0.3725, Critic Loss = 0.0677, Entropy = 0.2648, Returns = 1.2168, Value = 1.2354, mask_loss = 0.2481std_loss = 0.3215:   3%|▎         | 148/5475 [39:53<21:50:03, 14.76s/it]  

ratio tensor(0.6021, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5264, device='cuda:0') reward: tensor(1.3052, device='cuda:0') factor: tensor(0.2213, device='cuda:0')
reward_raw: tensor(0.5316, device='cuda:0') reward: tensor(1.3095, device='cuda:0') factor: tensor(0.2221, device='cuda:0')
reward_raw: tensor(0.5196, device='cuda:0') reward: tensor(1.3047, device='cuda:0') factor: tensor(0.2150, device='cuda:0')
reward_raw: tensor(0.5278, device='cuda:0') reward: tensor(1.3092, device='cuda:0') factor: tensor(0.2187, device='cuda:0')
reward_raw: tensor(0.5079, device='cuda:0') reward: tensor(1.2904, device='cuda:0') factor: tensor(0.2175, device='cuda:0')


Epoch 1, Step 149: Loss = -0.1149, Actor Loss = -0.2036, Critic Loss = 0.1779, Entropy = 0.2522, Returns = 1.2597, Value = 1.2416, mask_loss = 0.2155std_loss = 0.3046:   3%|▎         | 149/5475 [40:09<22:25:31, 15.16s/it]

ratio tensor(1.2314, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.2573, device='cuda:0') factor: tensor(0.2285, device='cuda:0')
reward_raw: tensor(0.4827, device='cuda:0') reward: tensor(1.2533, device='cuda:0') factor: tensor(0.2295, device='cuda:0')
reward_raw: tensor(0.4883, device='cuda:0') reward: tensor(1.2644, device='cuda:0') factor: tensor(0.2239, device='cuda:0')
reward_raw: tensor(0.5103, device='cuda:0') reward: tensor(1.2807, device='cuda:0') factor: tensor(0.2295, device='cuda:0')
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.2747, device='cuda:0') factor: tensor(0.2249, device='cuda:0')


Epoch 1, Step 150: Loss = 0.0449, Actor Loss = -0.0304, Critic Loss = 0.1511, Entropy = 0.2755, Returns = 1.2730, Value = 1.3949, mask_loss = 0.2344std_loss = 0.3077:   3%|▎         | 150/5475 [40:27<23:34:00, 15.93s/it] 

ratio tensor(0.9241, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5179, device='cuda:0') reward: tensor(1.2796, device='cuda:0') factor: tensor(0.2384, device='cuda:0')
reward_raw: tensor(0.5023, device='cuda:0') reward: tensor(1.2695, device='cuda:0') factor: tensor(0.2328, device='cuda:0')
reward_raw: tensor(0.5095, device='cuda:0') reward: tensor(1.2816, device='cuda:0') factor: tensor(0.2279, device='cuda:0')
reward_raw: tensor(0.5083, device='cuda:0') reward: tensor(1.2699, device='cuda:0') factor: tensor(0.2384, device='cuda:0')
reward_raw: tensor(0.4898, device='cuda:0') reward: tensor(1.2488, device='cuda:0') factor: tensor(0.2410, device='cuda:0')


Epoch 1, Step 151: Loss = 0.2527, Actor Loss = 0.2025, Critic Loss = 0.1010, Entropy = 0.2615, Returns = 1.2837, Value = 1.3146, mask_loss = 0.2360std_loss = 0.3180:   3%|▎         | 151/5475 [40:40<22:21:56, 15.12s/it] 

ratio tensor(0.3026, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5118, device='cuda:0') reward: tensor(1.3045, device='cuda:0') factor: tensor(0.2072, device='cuda:0')
reward_raw: tensor(0.5168, device='cuda:0') reward: tensor(1.2883, device='cuda:0') factor: tensor(0.2285, device='cuda:0')
reward_raw: tensor(0.5123, device='cuda:0') reward: tensor(1.2900, device='cuda:0') factor: tensor(0.2223, device='cuda:0')
reward_raw: tensor(0.5249, device='cuda:0') reward: tensor(1.2966, device='cuda:0') factor: tensor(0.2283, device='cuda:0')
reward_raw: tensor(0.4945, device='cuda:0') reward: tensor(1.2746, device='cuda:0') factor: tensor(0.2199, device='cuda:0')


Epoch 1, Step 152: Loss = 0.0867, Actor Loss = 0.0587, Critic Loss = 0.0565, Entropy = 0.2242, Returns = 1.2736, Value = 1.2311, mask_loss = 0.2027std_loss = 0.3114:   3%|▎         | 152/5475 [40:56<22:45:06, 15.39s/it]

ratio tensor(0.3737, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6087, device='cuda:0') reward: tensor(1.3882, device='cuda:0') factor: tensor(0.2205, device='cuda:0')
reward_raw: tensor(0.6183, device='cuda:0') reward: tensor(1.3953, device='cuda:0') factor: tensor(0.2230, device='cuda:0')
reward_raw: tensor(0.6157, device='cuda:0') reward: tensor(1.4004, device='cuda:0') factor: tensor(0.2153, device='cuda:0')
reward_raw: tensor(0.6130, device='cuda:0') reward: tensor(1.3894, device='cuda:0') factor: tensor(0.2236, device='cuda:0')
reward_raw: tensor(0.5905, device='cuda:0') reward: tensor(1.3730, device='cuda:0') factor: tensor(0.2175, device='cuda:0')


Epoch 1, Step 153: Loss = -0.0949, Actor Loss = -0.1591, Critic Loss = 0.1290, Entropy = 0.2195, Returns = 1.4261, Value = 1.5807, mask_loss = 0.1946std_loss = 0.3001:   3%|▎         | 153/5475 [41:15<24:10:37, 16.35s/it]

ratio tensor(0.9533, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4688, device='cuda:0') reward: tensor(1.2586, device='cuda:0') factor: tensor(0.2101, device='cuda:0')
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.2737, device='cuda:0') factor: tensor(0.2108, device='cuda:0')
reward_raw: tensor(0.4750, device='cuda:0') reward: tensor(1.2635, device='cuda:0') factor: tensor(0.2115, device='cuda:0')
reward_raw: tensor(0.4639, device='cuda:0') reward: tensor(1.2536, device='cuda:0') factor: tensor(0.2103, device='cuda:0')
reward_raw: tensor(0.4719, device='cuda:0') reward: tensor(1.2596, device='cuda:0') factor: tensor(0.2123, device='cuda:0')


Epoch 1, Step 154: Loss = 0.4603, Actor Loss = 0.3270, Critic Loss = 0.2672, Entropy = 0.2056, Returns = 1.2742, Value = 1.4335, mask_loss = 0.1875std_loss = 0.3018:   3%|▎         | 154/5475 [41:27<22:28:33, 15.21s/it]  

ratio tensor(0.1906, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5501, device='cuda:0') reward: tensor(1.3775, device='cuda:0') factor: tensor(0.1726, device='cuda:0')
reward_raw: tensor(0.5609, device='cuda:0') reward: tensor(1.3890, device='cuda:0') factor: tensor(0.1719, device='cuda:0')
reward_raw: tensor(0.5430, device='cuda:0') reward: tensor(1.3729, device='cuda:0') factor: tensor(0.1701, device='cuda:0')
reward_raw: tensor(0.5534, device='cuda:0') reward: tensor(1.3753, device='cuda:0') factor: tensor(0.1781, device='cuda:0')
reward_raw: tensor(0.5648, device='cuda:0') reward: tensor(1.3917, device='cuda:0') factor: tensor(0.1731, device='cuda:0')


Epoch 1, Step 155: Loss = 0.1840, Actor Loss = 0.0970, Critic Loss = 0.1743, Entropy = 0.1774, Returns = 1.4273, Value = 1.3208, mask_loss = 0.1523std_loss = 0.2858:   3%|▎         | 155/5475 [41:42<22:14:31, 15.05s/it]

ratio tensor(0.8622, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.3231, device='cuda:0') factor: tensor(0.1767, device='cuda:0')
reward_raw: tensor(0.5243, device='cuda:0') reward: tensor(1.3414, device='cuda:0') factor: tensor(0.1828, device='cuda:0')
reward_raw: tensor(0.4988, device='cuda:0') reward: tensor(1.3176, device='cuda:0') factor: tensor(0.1812, device='cuda:0')
reward_raw: tensor(0.4785, device='cuda:0') reward: tensor(1.2963, device='cuda:0') factor: tensor(0.1822, device='cuda:0')
reward_raw: tensor(0.5220, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.1796, device='cuda:0')


Epoch 1, Step 156: Loss = -0.2545, Actor Loss = -0.2917, Critic Loss = 0.0747, Entropy = 0.1873, Returns = 1.3670, Value = 1.2936, mask_loss = 0.1681std_loss = 0.2949:   3%|▎         | 156/5475 [41:55<21:10:40, 14.33s/it]

ratio tensor(1.5181, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5253, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.1558, device='cuda:0')
reward_raw: tensor(0.4940, device='cuda:0') reward: tensor(1.3425, device='cuda:0') factor: tensor(0.1515, device='cuda:0')
reward_raw: tensor(0.4684, device='cuda:0') reward: tensor(1.3104, device='cuda:0') factor: tensor(0.1580, device='cuda:0')
reward_raw: tensor(0.4865, device='cuda:0') reward: tensor(1.3333, device='cuda:0') factor: tensor(0.1532, device='cuda:0')
reward_raw: tensor(0.4721, device='cuda:0') reward: tensor(1.3188, device='cuda:0') factor: tensor(0.1533, device='cuda:0')


Epoch 1, Step 157: Loss = -0.1448, Actor Loss = -0.1805, Critic Loss = 0.0717, Entropy = 0.1809, Returns = 1.3903, Value = 1.4828, mask_loss = 0.1534std_loss = 0.2855:   3%|▎         | 157/5475 [42:11<22:04:27, 14.94s/it]

ratio tensor(2.2116, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5854, device='cuda:0') reward: tensor(1.4065, device='cuda:0') factor: tensor(0.1789, device='cuda:0')
reward_raw: tensor(0.5517, device='cuda:0') reward: tensor(1.3725, device='cuda:0') factor: tensor(0.1792, device='cuda:0')
reward_raw: tensor(0.5171, device='cuda:0') reward: tensor(1.3433, device='cuda:0') factor: tensor(0.1739, device='cuda:0')
reward_raw: tensor(0.5480, device='cuda:0') reward: tensor(1.3750, device='cuda:0') factor: tensor(0.1730, device='cuda:0')
reward_raw: tensor(0.5393, device='cuda:0') reward: tensor(1.3616, device='cuda:0') factor: tensor(0.1777, device='cuda:0')


Epoch 1, Step 158: Loss = 0.1739, Actor Loss = 0.1290, Critic Loss = 0.0902, Entropy = 0.2185, Returns = 1.3474, Value = 1.3731, mask_loss = 0.1658std_loss = 0.2731:   3%|▎         | 158/5475 [42:29<23:27:25, 15.88s/it]  

ratio tensor(0.5776, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5422, device='cuda:0') reward: tensor(1.3735, device='cuda:0') factor: tensor(0.1686, device='cuda:0')
reward_raw: tensor(0.5284, device='cuda:0') reward: tensor(1.3516, device='cuda:0') factor: tensor(0.1768, device='cuda:0')
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.3289, device='cuda:0') factor: tensor(0.1674, device='cuda:0')
reward_raw: tensor(0.5287, device='cuda:0') reward: tensor(1.3523, device='cuda:0') factor: tensor(0.1764, device='cuda:0')
reward_raw: tensor(0.5614, device='cuda:0') reward: tensor(1.3866, device='cuda:0') factor: tensor(0.1747, device='cuda:0')


Epoch 1, Step 159: Loss = 0.0009, Actor Loss = -0.1086, Critic Loss = 0.2193, Entropy = 0.1605, Returns = 1.3555, Value = 1.0896, mask_loss = 0.1515std_loss = 0.2829:   3%|▎         | 159/5475 [42:44<23:04:44, 15.63s/it]

ratio tensor(0.6962, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4588, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.4503, device='cuda:0') reward: tensor(1.3326, device='cuda:0') factor: tensor(0.1177, device='cuda:0')
reward_raw: tensor(0.4641, device='cuda:0') reward: tensor(1.3328, device='cuda:0') factor: tensor(0.1313, device='cuda:0')
reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3438, device='cuda:0') factor: tensor(0.1237, device='cuda:0')
reward_raw: tensor(0.4506, device='cuda:0') reward: tensor(1.3251, device='cuda:0') factor: tensor(0.1255, device='cuda:0')


Epoch 1, Step 160: Loss = 0.0443, Actor Loss = -0.0079, Critic Loss = 0.1046, Entropy = 0.1477, Returns = 1.3003, Value = 1.3307, mask_loss = 0.1201std_loss = 0.2564:   3%|▎         | 160/5475 [43:04<24:53:02, 16.85s/it]

ratio tensor(0.6819, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5468, device='cuda:0') reward: tensor(1.4287, device='cuda:0') factor: tensor(0.1180, device='cuda:0')
reward_raw: tensor(0.5315, device='cuda:0') reward: tensor(1.4190, device='cuda:0') factor: tensor(0.1124, device='cuda:0')
reward_raw: tensor(0.5628, device='cuda:0') reward: tensor(1.4440, device='cuda:0') factor: tensor(0.1188, device='cuda:0')
reward_raw: tensor(0.5397, device='cuda:0') reward: tensor(1.4272, device='cuda:0') factor: tensor(0.1125, device='cuda:0')
reward_raw: tensor(0.5548, device='cuda:0') reward: tensor(1.4332, device='cuda:0') factor: tensor(0.1216, device='cuda:0')


Epoch 1, Step 161: Loss = -0.2696, Actor Loss = -0.3281, Critic Loss = 0.1173, Entropy = 0.1356, Returns = 1.4649, Value = 1.4562, mask_loss = 0.1107std_loss = 0.2545:   3%|▎         | 160/5475 [43:21<24:53:02, 16.85s/it]

ratio tensor(1.8592, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 161: Loss = -0.2696, Actor Loss = -0.3281, Critic Loss = 0.1173, Entropy = 0.1356, Returns = 1.4649, Value = 1.4562, mask_loss = 0.1107std_loss = 0.2545:   3%|▎         | 161/5475 [43:21<24:57:41, 16.91s/it]

reward_raw: tensor(0.3986, device='cuda:0') reward: tensor(1.2807, device='cuda:0') factor: tensor(0.1178, device='cuda:0')
reward_raw: tensor(0.3943, device='cuda:0') reward: tensor(1.2844, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.3982, device='cuda:0') reward: tensor(1.2882, device='cuda:0') factor: tensor(0.1101, device='cuda:0')
reward_raw: tensor(0.4015, device='cuda:0') reward: tensor(1.2853, device='cuda:0') factor: tensor(0.1162, device='cuda:0')
reward_raw: tensor(0.3773, device='cuda:0') reward: tensor(1.2558, device='cuda:0') factor: tensor(0.1214, device='cuda:0')


Epoch 1, Step 162: Loss = 0.5085, Actor Loss = 0.3974, Critic Loss = 0.2226, Entropy = 0.1852, Returns = 1.2981, Value = 1.5021, mask_loss = 0.1316std_loss = 0.2521:   3%|▎         | 162/5475 [43:35<23:51:07, 16.16s/it]  

ratio tensor(0.3414, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5818, device='cuda:0') reward: tensor(1.4350, device='cuda:0') factor: tensor(0.1468, device='cuda:0')
reward_raw: tensor(0.5567, device='cuda:0') reward: tensor(1.4088, device='cuda:0') factor: tensor(0.1479, device='cuda:0')
reward_raw: tensor(0.5823, device='cuda:0') reward: tensor(1.4371, device='cuda:0') factor: tensor(0.1452, device='cuda:0')
reward_raw: tensor(0.5553, device='cuda:0') reward: tensor(1.4119, device='cuda:0') factor: tensor(0.1434, device='cuda:0')
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.3760, device='cuda:0') factor: tensor(0.1519, device='cuda:0')


Epoch 1, Step 163: Loss = -0.0356, Actor Loss = -0.0695, Critic Loss = 0.0682, Entropy = 0.2109, Returns = 1.4077, Value = 1.4357, mask_loss = 0.1452std_loss = 0.2584:   3%|▎         | 163/5475 [43:52<23:48:56, 16.14s/it]

ratio tensor(1.2328, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4574, device='cuda:0') reward: tensor(1.2963, device='cuda:0') factor: tensor(0.1611, device='cuda:0')
reward_raw: tensor(0.4134, device='cuda:0') reward: tensor(1.2527, device='cuda:0') factor: tensor(0.1607, device='cuda:0')
reward_raw: tensor(0.4414, device='cuda:0') reward: tensor(1.2777, device='cuda:0') factor: tensor(0.1637, device='cuda:0')
reward_raw: tensor(0.4129, device='cuda:0') reward: tensor(1.2552, device='cuda:0') factor: tensor(0.1577, device='cuda:0')
reward_raw: tensor(0.4211, device='cuda:0') reward: tensor(1.2654, device='cuda:0') factor: tensor(0.1557, device='cuda:0')


Epoch 1, Step 164: Loss = 0.1603, Actor Loss = 0.0967, Critic Loss = 0.1277, Entropy = 0.2266, Returns = 1.2702, Value = 1.3232, mask_loss = 0.1852std_loss = 0.2846:   3%|▎         | 164/5475 [44:06<23:17:58, 15.79s/it]  

ratio tensor(0.8173, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4088, device='cuda:0') reward: tensor(1.2476, device='cuda:0') factor: tensor(0.1612, device='cuda:0')
reward_raw: tensor(0.4408, device='cuda:0') reward: tensor(1.2780, device='cuda:0') factor: tensor(0.1628, device='cuda:0')
reward_raw: tensor(0.3969, device='cuda:0') reward: tensor(1.2441, device='cuda:0') factor: tensor(0.1528, device='cuda:0')
reward_raw: tensor(0.3754, device='cuda:0') reward: tensor(1.2205, device='cuda:0') factor: tensor(0.1549, device='cuda:0')
reward_raw: tensor(0.4216, device='cuda:0') reward: tensor(1.2648, device='cuda:0') factor: tensor(0.1568, device='cuda:0')


Epoch 1, Step 165: Loss = 0.1143, Actor Loss = 0.0624, Critic Loss = 0.1042, Entropy = 0.2163, Returns = 1.1714, Value = 1.1061, mask_loss = 0.1670std_loss = 0.2812:   3%|▎         | 165/5475 [44:20<22:07:01, 14.99s/it]

ratio tensor(0.8597, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4985, device='cuda:0') reward: tensor(1.3399, device='cuda:0') factor: tensor(0.1586, device='cuda:0')
reward_raw: tensor(0.5160, device='cuda:0') reward: tensor(1.3443, device='cuda:0') factor: tensor(0.1717, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.3661, device='cuda:0') factor: tensor(0.1715, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.3799, device='cuda:0') factor: tensor(0.1616, device='cuda:0')
reward_raw: tensor(0.5339, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.1644, device='cuda:0')


Epoch 1, Step 166: Loss = -0.2770, Actor Loss = -0.3188, Critic Loss = 0.0842, Entropy = 0.2207, Returns = 1.3222, Value = 1.2572, mask_loss = 0.1546std_loss = 0.2642:   3%|▎         | 166/5475 [44:37<22:58:33, 15.58s/it]

ratio tensor(1.2098, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4718, device='cuda:0') reward: tensor(1.3302, device='cuda:0') factor: tensor(0.1415, device='cuda:0')
reward_raw: tensor(0.4699, device='cuda:0') reward: tensor(1.3205, device='cuda:0') factor: tensor(0.1493, device='cuda:0')
reward_raw: tensor(0.4755, device='cuda:0') reward: tensor(1.3245, device='cuda:0') factor: tensor(0.1510, device='cuda:0')
reward_raw: tensor(0.4833, device='cuda:0') reward: tensor(1.3292, device='cuda:0') factor: tensor(0.1540, device='cuda:0')
reward_raw: tensor(0.4945, device='cuda:0') reward: tensor(1.3478, device='cuda:0') factor: tensor(0.1467, device='cuda:0')


Epoch 1, Step 167: Loss = 0.2276, Actor Loss = 0.1338, Critic Loss = 0.1881, Entropy = 0.2084, Returns = 1.3816, Value = 1.6248, mask_loss = 0.1373std_loss = 0.2496:   3%|▎         | 167/5475 [44:53<23:30:24, 15.94s/it]  

ratio tensor(0.8133, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4417, device='cuda:0') reward: tensor(1.3064, device='cuda:0') factor: tensor(0.1353, device='cuda:0')
reward_raw: tensor(0.4471, device='cuda:0') reward: tensor(1.3057, device='cuda:0') factor: tensor(0.1414, device='cuda:0')
reward_raw: tensor(0.4419, device='cuda:0') reward: tensor(1.3057, device='cuda:0') factor: tensor(0.1362, device='cuda:0')
reward_raw: tensor(0.4189, device='cuda:0') reward: tensor(1.2807, device='cuda:0') factor: tensor(0.1381, device='cuda:0')
reward_raw: tensor(0.4729, device='cuda:0') reward: tensor(1.3348, device='cuda:0') factor: tensor(0.1381, device='cuda:0')


Epoch 1, Step 168: Loss = 0.2366, Actor Loss = 0.1936, Critic Loss = 0.0864, Entropy = 0.2084, Returns = 1.2026, Value = 1.1138, mask_loss = 0.1363std_loss = 0.2492:   3%|▎         | 168/5475 [45:15<25:55:48, 17.59s/it]

ratio tensor(0.8402, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4538, device='cuda:0') reward: tensor(1.3183, device='cuda:0') factor: tensor(0.1354, device='cuda:0')
reward_raw: tensor(0.4719, device='cuda:0') reward: tensor(1.3284, device='cuda:0') factor: tensor(0.1435, device='cuda:0')
reward_raw: tensor(0.4704, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.1355, device='cuda:0')
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(1.3578, device='cuda:0') factor: tensor(0.1411, device='cuda:0')
reward_raw: tensor(0.4780, device='cuda:0') reward: tensor(1.3407, device='cuda:0') factor: tensor(0.1373, device='cuda:0')


Epoch 1, Step 169: Loss = 0.0335, Actor Loss = -0.0220, Critic Loss = 0.1115, Entropy = 0.2119, Returns = 1.2369, Value = 1.3842, mask_loss = 0.1413std_loss = 0.2494:   3%|▎         | 169/5475 [45:33<26:13:00, 17.79s/it]

ratio tensor(0.8139, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4208, device='cuda:0') reward: tensor(1.2880, device='cuda:0') factor: tensor(0.1328, device='cuda:0')
reward_raw: tensor(0.4475, device='cuda:0') reward: tensor(1.3009, device='cuda:0') factor: tensor(0.1466, device='cuda:0')
reward_raw: tensor(0.4519, device='cuda:0') reward: tensor(1.3165, device='cuda:0') factor: tensor(0.1354, device='cuda:0')
reward_raw: tensor(0.4400, device='cuda:0') reward: tensor(1.3079, device='cuda:0') factor: tensor(0.1321, device='cuda:0')
reward_raw: tensor(0.4238, device='cuda:0') reward: tensor(1.2925, device='cuda:0') factor: tensor(0.1313, device='cuda:0')


Epoch 1, Step 170: Loss = 0.0838, Actor Loss = 0.0173, Critic Loss = 0.1333, Entropy = 0.1892, Returns = 1.3607, Value = 1.3135, mask_loss = 0.1298std_loss = 0.2519:   3%|▎         | 170/5475 [45:56<28:42:16, 19.48s/it] 

ratio tensor(0.8844, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4239, device='cuda:0') reward: tensor(1.2985, device='cuda:0') factor: tensor(0.1253, device='cuda:0')
reward_raw: tensor(0.4144, device='cuda:0') reward: tensor(1.2896, device='cuda:0') factor: tensor(0.1247, device='cuda:0')
reward_raw: tensor(0.4126, device='cuda:0') reward: tensor(1.2912, device='cuda:0') factor: tensor(0.1214, device='cuda:0')
reward_raw: tensor(0.4153, device='cuda:0') reward: tensor(1.2824, device='cuda:0') factor: tensor(0.1329, device='cuda:0')
reward_raw: tensor(0.4119, device='cuda:0') reward: tensor(1.2845, device='cuda:0') factor: tensor(0.1274, device='cuda:0')


Epoch 1, Step 171: Loss = -0.0465, Actor Loss = -0.0888, Critic Loss = 0.0849, Entropy = 0.1724, Returns = 1.2078, Value = 1.1100, mask_loss = 0.1230std_loss = 0.2528:   3%|▎         | 171/5475 [46:11<26:29:47, 17.98s/it]

ratio tensor(1.0145, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4062, device='cuda:0') reward: tensor(1.2833, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.4242, device='cuda:0') reward: tensor(1.2932, device='cuda:0') factor: tensor(0.1309, device='cuda:0')
reward_raw: tensor(0.4236, device='cuda:0') reward: tensor(1.3004, device='cuda:0') factor: tensor(0.1233, device='cuda:0')
reward_raw: tensor(0.4036, device='cuda:0') reward: tensor(1.2725, device='cuda:0') factor: tensor(0.1312, device='cuda:0')
reward_raw: tensor(0.4309, device='cuda:0') reward: tensor(1.3061, device='cuda:0') factor: tensor(0.1248, device='cuda:0')


Epoch 1, Step 172: Loss = -0.0578, Actor Loss = -0.1392, Critic Loss = 0.1633, Entropy = 0.1951, Returns = 1.3303, Value = 1.3102, mask_loss = 0.1299std_loss = 0.2453:   3%|▎         | 172/5475 [46:27<25:30:34, 17.32s/it]

ratio tensor(1.0572, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5071, device='cuda:0') reward: tensor(1.3435, device='cuda:0') factor: tensor(0.1637, device='cuda:0')
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.3583, device='cuda:0') factor: tensor(0.1657, device='cuda:0')
reward_raw: tensor(0.5211, device='cuda:0') reward: tensor(1.3555, device='cuda:0') factor: tensor(0.1657, device='cuda:0')
reward_raw: tensor(0.4987, device='cuda:0') reward: tensor(1.3426, device='cuda:0') factor: tensor(0.1561, device='cuda:0')
reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(1.3696, device='cuda:0') factor: tensor(0.1520, device='cuda:0')


Epoch 1, Step 173: Loss = -0.1602, Actor Loss = -0.2278, Critic Loss = 0.1357, Entropy = 0.2073, Returns = 1.4213, Value = 1.3641, mask_loss = 0.1427std_loss = 0.2537:   3%|▎         | 173/5475 [46:42<24:34:10, 16.68s/it]

ratio tensor(1.2166, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4641, device='cuda:0') reward: tensor(1.3233, device='cuda:0') factor: tensor(0.1408, device='cuda:0')
reward_raw: tensor(0.4613, device='cuda:0') reward: tensor(1.3209, device='cuda:0') factor: tensor(0.1404, device='cuda:0')
reward_raw: tensor(0.4297, device='cuda:0') reward: tensor(1.3003, device='cuda:0') factor: tensor(0.1294, device='cuda:0')
reward_raw: tensor(0.4431, device='cuda:0') reward: tensor(1.3142, device='cuda:0') factor: tensor(0.1289, device='cuda:0')
reward_raw: tensor(0.4336, device='cuda:0') reward: tensor(1.2956, device='cuda:0') factor: tensor(0.1380, device='cuda:0')


Epoch 1, Step 174: Loss = 0.2189, Actor Loss = 0.1828, Critic Loss = 0.0727, Entropy = 0.1911, Returns = 1.2801, Value = 1.3279, mask_loss = 0.1323std_loss = 0.2490:   3%|▎         | 174/5475 [47:02<26:08:14, 17.75s/it]  

ratio tensor(0.4944, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.1813, device='cuda:0')
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.3293, device='cuda:0') factor: tensor(0.1822, device='cuda:0')
reward_raw: tensor(0.5360, device='cuda:0') reward: tensor(1.3632, device='cuda:0') factor: tensor(0.1728, device='cuda:0')
reward_raw: tensor(0.5637, device='cuda:0') reward: tensor(1.3785, device='cuda:0') factor: tensor(0.1852, device='cuda:0')
reward_raw: tensor(0.5256, device='cuda:0') reward: tensor(1.3537, device='cuda:0') factor: tensor(0.1719, device='cuda:0')


Epoch 1, Step 175: Loss = 0.0329, Actor Loss = -0.0486, Critic Loss = 0.1633, Entropy = 0.1901, Returns = 1.2851, Value = 1.2879, mask_loss = 0.2005std_loss = 0.3184:   3%|▎         | 175/5475 [47:15<23:46:39, 16.15s/it]

ratio tensor(1.1783, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4462, device='cuda:0') reward: tensor(1.3058, device='cuda:0') factor: tensor(0.1404, device='cuda:0')
reward_raw: tensor(0.4762, device='cuda:0') reward: tensor(1.3277, device='cuda:0') factor: tensor(0.1484, device='cuda:0')
reward_raw: tensor(0.4685, device='cuda:0') reward: tensor(1.3221, device='cuda:0') factor: tensor(0.1464, device='cuda:0')
reward_raw: tensor(0.4498, device='cuda:0') reward: tensor(1.3180, device='cuda:0') factor: tensor(0.1318, device='cuda:0')
reward_raw: tensor(0.4725, device='cuda:0') reward: tensor(1.3253, device='cuda:0') factor: tensor(0.1472, device='cuda:0')


Epoch 1, Step 176: Loss = -0.1391, Actor Loss = -0.1667, Critic Loss = 0.0556, Entropy = 0.1783, Returns = 1.3139, Value = 1.3319, mask_loss = 0.1446std_loss = 0.2746:   3%|▎         | 176/5475 [47:31<24:03:02, 16.34s/it]

ratio tensor(1.0342, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4344, device='cuda:0') reward: tensor(1.2894, device='cuda:0') factor: tensor(0.1450, device='cuda:0')
reward_raw: tensor(0.4585, device='cuda:0') reward: tensor(1.3135, device='cuda:0') factor: tensor(0.1451, device='cuda:0')
reward_raw: tensor(0.4333, device='cuda:0') reward: tensor(1.2874, device='cuda:0') factor: tensor(0.1459, device='cuda:0')
reward_raw: tensor(0.4325, device='cuda:0') reward: tensor(1.2888, device='cuda:0') factor: tensor(0.1438, device='cuda:0')
reward_raw: tensor(0.4317, device='cuda:0') reward: tensor(1.2833, device='cuda:0') factor: tensor(0.1485, device='cuda:0')


Epoch 1, Step 177: Loss = 0.4272, Actor Loss = 0.3681, Critic Loss = 0.1186, Entropy = 0.1553, Returns = 1.3298, Value = 1.3945, mask_loss = 0.1300std_loss = 0.2667:   3%|▎         | 177/5475 [47:53<26:18:24, 17.88s/it]  

ratio tensor(0.5380, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4398, device='cuda:0') reward: tensor(1.3223, device='cuda:0') factor: tensor(0.1175, device='cuda:0')
reward_raw: tensor(0.4397, device='cuda:0') reward: tensor(1.3200, device='cuda:0') factor: tensor(0.1197, device='cuda:0')
reward_raw: tensor(0.4542, device='cuda:0') reward: tensor(1.3378, device='cuda:0') factor: tensor(0.1165, device='cuda:0')
reward_raw: tensor(0.4554, device='cuda:0') reward: tensor(1.3428, device='cuda:0') factor: tensor(0.1126, device='cuda:0')
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3445, device='cuda:0') factor: tensor(0.1169, device='cuda:0')


Epoch 1, Step 178: Loss = -0.0073, Actor Loss = -0.0349, Critic Loss = 0.0555, Entropy = 0.1465, Returns = 1.2927, Value = 1.2091, mask_loss = 0.1111std_loss = 0.2483:   3%|▎         | 178/5475 [48:09<25:30:13, 17.33s/it]

ratio tensor(1.0915, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4274, device='cuda:0') reward: tensor(1.3154, device='cuda:0') factor: tensor(0.1120, device='cuda:0')
reward_raw: tensor(0.4415, device='cuda:0') reward: tensor(1.3212, device='cuda:0') factor: tensor(0.1203, device='cuda:0')
reward_raw: tensor(0.4322, device='cuda:0') reward: tensor(1.3164, device='cuda:0') factor: tensor(0.1159, device='cuda:0')
reward_raw: tensor(0.4176, device='cuda:0') reward: tensor(1.3079, device='cuda:0') factor: tensor(0.1097, device='cuda:0')
reward_raw: tensor(0.4165, device='cuda:0') reward: tensor(1.2955, device='cuda:0') factor: tensor(0.1210, device='cuda:0')


Epoch 1, Step 179: Loss = -0.0844, Actor Loss = -0.1255, Critic Loss = 0.0825, Entropy = 0.1433, Returns = 1.2894, Value = 1.3847, mask_loss = 0.1139std_loss = 0.2553:   3%|▎         | 179/5475 [48:23<24:13:00, 16.46s/it]

ratio tensor(1.0640, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4274, device='cuda:0') reward: tensor(1.3100, device='cuda:0') factor: tensor(0.1174, device='cuda:0')
reward_raw: tensor(0.4502, device='cuda:0') reward: tensor(1.3312, device='cuda:0') factor: tensor(0.1190, device='cuda:0')
reward_raw: tensor(0.4270, device='cuda:0') reward: tensor(1.3070, device='cuda:0') factor: tensor(0.1199, device='cuda:0')
reward_raw: tensor(0.4558, device='cuda:0') reward: tensor(1.3343, device='cuda:0') factor: tensor(0.1215, device='cuda:0')
reward_raw: tensor(0.4476, device='cuda:0') reward: tensor(1.3280, device='cuda:0') factor: tensor(0.1196, device='cuda:0')


Epoch 1, Step 180: Loss = 0.2587, Actor Loss = 0.1697, Critic Loss = 0.1782, Entropy = 0.1315, Returns = 1.2583, Value = 1.4440, mask_loss = 0.1125std_loss = 0.2567:   3%|▎         | 180/5475 [48:41<24:53:29, 16.92s/it]  

ratio tensor(0.8973, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4677, device='cuda:0') reward: tensor(1.3192, device='cuda:0') factor: tensor(0.1485, device='cuda:0')
reward_raw: tensor(0.5067, device='cuda:0') reward: tensor(1.3612, device='cuda:0') factor: tensor(0.1455, device='cuda:0')
reward_raw: tensor(0.4984, device='cuda:0') reward: tensor(1.3563, device='cuda:0') factor: tensor(0.1422, device='cuda:0')
reward_raw: tensor(0.4866, device='cuda:0') reward: tensor(1.3416, device='cuda:0') factor: tensor(0.1449, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.3656, device='cuda:0') factor: tensor(0.1436, device='cuda:0')


Epoch 1, Step 181: Loss = 0.0335, Actor Loss = -0.0455, Critic Loss = 0.1584, Entropy = 0.1749, Returns = 1.4297, Value = 1.4007, mask_loss = 0.1560std_loss = 0.2837:   3%|▎         | 180/5475 [48:57<24:53:29, 16.92s/it]

ratio tensor(1.7513, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 181: Loss = 0.0335, Actor Loss = -0.0455, Critic Loss = 0.1584, Entropy = 0.1749, Returns = 1.4297, Value = 1.4007, mask_loss = 0.1560std_loss = 0.2837:   3%|▎         | 181/5475 [48:58<24:36:54, 16.74s/it]

reward_raw: tensor(0.4955, device='cuda:0') reward: tensor(1.3667, device='cuda:0') factor: tensor(0.1287, device='cuda:0')
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.3697, device='cuda:0') factor: tensor(0.1271, device='cuda:0')
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(1.3771, device='cuda:0') factor: tensor(0.1218, device='cuda:0')
reward_raw: tensor(0.5103, device='cuda:0') reward: tensor(1.3809, device='cuda:0') factor: tensor(0.1293, device='cuda:0')
reward_raw: tensor(0.5095, device='cuda:0') reward: tensor(1.3908, device='cuda:0') factor: tensor(0.1187, device='cuda:0')


Epoch 1, Step 182: Loss = 0.1013, Actor Loss = -0.0237, Critic Loss = 0.2504, Entropy = 0.1760, Returns = 1.4472, Value = 1.4821, mask_loss = 0.1325std_loss = 0.2613:   3%|▎         | 182/5475 [49:13<23:55:44, 16.28s/it]

ratio tensor(1.2160, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5288, device='cuda:0') reward: tensor(1.3815, device='cuda:0') factor: tensor(0.1472, device='cuda:0')
reward_raw: tensor(0.5121, device='cuda:0') reward: tensor(1.3670, device='cuda:0') factor: tensor(0.1451, device='cuda:0')
reward_raw: tensor(0.4829, device='cuda:0') reward: tensor(1.3440, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.4950, device='cuda:0') reward: tensor(1.3520, device='cuda:0') factor: tensor(0.1430, device='cuda:0')
reward_raw: tensor(0.5009, device='cuda:0') reward: tensor(1.3667, device='cuda:0') factor: tensor(0.1342, device='cuda:0')


Epoch 1, Step 183: Loss = 0.0945, Actor Loss = 0.0513, Critic Loss = 0.0869, Entropy = 0.1735, Returns = 1.3764, Value = 1.4242, mask_loss = 0.1359std_loss = 0.2691:   3%|▎         | 183/5475 [49:33<25:50:14, 17.58s/it] 

ratio tensor(0.9660, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4161, device='cuda:0') reward: tensor(1.2818, device='cuda:0') factor: tensor(0.1344, device='cuda:0')
reward_raw: tensor(0.3575, device='cuda:0') reward: tensor(1.2196, device='cuda:0') factor: tensor(0.1379, device='cuda:0')
reward_raw: tensor(0.3938, device='cuda:0') reward: tensor(1.2571, device='cuda:0') factor: tensor(0.1367, device='cuda:0')
reward_raw: tensor(0.4087, device='cuda:0') reward: tensor(1.2730, device='cuda:0') factor: tensor(0.1357, device='cuda:0')
reward_raw: tensor(0.4122, device='cuda:0') reward: tensor(1.2738, device='cuda:0') factor: tensor(0.1384, device='cuda:0')


Epoch 1, Step 184: Loss = 0.1904, Actor Loss = 0.1337, Critic Loss = 0.1138, Entropy = 0.1958, Returns = 1.2798, Value = 1.2398, mask_loss = 0.1372std_loss = 0.2548:   3%|▎         | 184/5475 [49:48<24:40:14, 16.79s/it]

ratio tensor(0.9146, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4564, device='cuda:0') reward: tensor(1.2915, device='cuda:0') factor: tensor(0.1649, device='cuda:0')
reward_raw: tensor(0.4539, device='cuda:0') reward: tensor(1.2927, device='cuda:0') factor: tensor(0.1613, device='cuda:0')
reward_raw: tensor(0.4569, device='cuda:0') reward: tensor(1.2982, device='cuda:0') factor: tensor(0.1587, device='cuda:0')
reward_raw: tensor(0.4465, device='cuda:0') reward: tensor(1.2791, device='cuda:0') factor: tensor(0.1674, device='cuda:0')
reward_raw: tensor(0.4475, device='cuda:0') reward: tensor(1.2880, device='cuda:0') factor: tensor(0.1595, device='cuda:0')


Epoch 1, Step 185: Loss = -0.3589, Actor Loss = -0.3982, Critic Loss = 0.0790, Entropy = 0.1905, Returns = 1.3772, Value = 1.2641, mask_loss = 0.1391std_loss = 0.2557:   3%|▎         | 185/5475 [50:11<27:23:56, 18.65s/it]

ratio tensor(87.7719, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4089, device='cuda:0') reward: tensor(1.2595, device='cuda:0') factor: tensor(0.1494, device='cuda:0')
reward_raw: tensor(0.3808, device='cuda:0') reward: tensor(1.2201, device='cuda:0') factor: tensor(0.1607, device='cuda:0')
reward_raw: tensor(0.3985, device='cuda:0') reward: tensor(1.2480, device='cuda:0') factor: tensor(0.1505, device='cuda:0')
reward_raw: tensor(0.3720, device='cuda:0') reward: tensor(1.2259, device='cuda:0') factor: tensor(0.1461, device='cuda:0')
reward_raw: tensor(0.4246, device='cuda:0') reward: tensor(1.2715, device='cuda:0') factor: tensor(0.1532, device='cuda:0')


Epoch 1, Step 186: Loss = 0.2973, Actor Loss = 0.2574, Critic Loss = 0.0801, Entropy = 0.1839, Returns = 1.2474, Value = 1.3579, mask_loss = 0.1628std_loss = 0.2845:   3%|▎         | 186/5475 [50:23<24:07:15, 16.42s/it]  

ratio tensor(0.6981, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4070, device='cuda:0') reward: tensor(1.2773, device='cuda:0') factor: tensor(0.1297, device='cuda:0')
reward_raw: tensor(0.4357, device='cuda:0') reward: tensor(1.3025, device='cuda:0') factor: tensor(0.1331, device='cuda:0')
reward_raw: tensor(0.4579, device='cuda:0') reward: tensor(1.3244, device='cuda:0') factor: tensor(0.1335, device='cuda:0')
reward_raw: tensor(0.4150, device='cuda:0') reward: tensor(1.2913, device='cuda:0') factor: tensor(0.1237, device='cuda:0')
reward_raw: tensor(0.4371, device='cuda:0') reward: tensor(1.3024, device='cuda:0') factor: tensor(0.1346, device='cuda:0')


Epoch 1, Step 187: Loss = 0.0775, Actor Loss = 0.0078, Critic Loss = 0.1397, Entropy = 0.1790, Returns = 1.2949, Value = 1.2033, mask_loss = 0.1419std_loss = 0.2703:   3%|▎         | 187/5475 [50:39<23:59:21, 16.33s/it]

ratio tensor(1.1759, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4458, device='cuda:0') reward: tensor(1.3088, device='cuda:0') factor: tensor(0.1369, device='cuda:0')
reward_raw: tensor(0.4450, device='cuda:0') reward: tensor(1.3086, device='cuda:0') factor: tensor(0.1364, device='cuda:0')
reward_raw: tensor(0.4627, device='cuda:0') reward: tensor(1.3246, device='cuda:0') factor: tensor(0.1380, device='cuda:0')
reward_raw: tensor(0.4488, device='cuda:0') reward: tensor(1.3089, device='cuda:0') factor: tensor(0.1399, device='cuda:0')
reward_raw: tensor(0.4358, device='cuda:0') reward: tensor(1.3118, device='cuda:0') factor: tensor(0.1240, device='cuda:0')


Epoch 1, Step 188: Loss = -0.1773, Actor Loss = -0.2423, Critic Loss = 0.1302, Entropy = 0.1505, Returns = 1.2735, Value = 1.2773, mask_loss = 0.1311std_loss = 0.2681:   3%|▎         | 188/5475 [50:56<24:14:19, 16.50s/it]

ratio tensor(1.2333, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4224, device='cuda:0') reward: tensor(1.3013, device='cuda:0') factor: tensor(0.1211, device='cuda:0')
reward_raw: tensor(0.4043, device='cuda:0') reward: tensor(1.2787, device='cuda:0') factor: tensor(0.1256, device='cuda:0')
reward_raw: tensor(0.4244, device='cuda:0') reward: tensor(1.2930, device='cuda:0') factor: tensor(0.1314, device='cuda:0')
reward_raw: tensor(0.4278, device='cuda:0') reward: tensor(1.2996, device='cuda:0') factor: tensor(0.1281, device='cuda:0')
reward_raw: tensor(0.4205, device='cuda:0') reward: tensor(1.2962, device='cuda:0') factor: tensor(0.1243, device='cuda:0')


Epoch 1, Step 189: Loss = 0.1599, Actor Loss = 0.0907, Critic Loss = 0.1389, Entropy = 0.1684, Returns = 1.2614, Value = 1.4334, mask_loss = 0.1318std_loss = 0.2625:   3%|▎         | 189/5475 [51:09<22:44:19, 15.49s/it]  

ratio tensor(0.8784, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.3274, device='cuda:0') factor: tensor(0.1743, device='cuda:0')
reward_raw: tensor(0.5169, device='cuda:0') reward: tensor(1.3461, device='cuda:0') factor: tensor(0.1709, device='cuda:0')
reward_raw: tensor(0.5117, device='cuda:0') reward: tensor(1.3426, device='cuda:0') factor: tensor(0.1691, device='cuda:0')
reward_raw: tensor(0.4976, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.1716, device='cuda:0')
reward_raw: tensor(0.5074, device='cuda:0') reward: tensor(1.3443, device='cuda:0') factor: tensor(0.1631, device='cuda:0')


Epoch 1, Step 190: Loss = 0.1457, Actor Loss = 0.1173, Critic Loss = 0.0573, Entropy = 0.1717, Returns = 1.2616, Value = 1.2595, mask_loss = 0.1690std_loss = 0.2994:   3%|▎         | 190/5475 [51:22<21:56:42, 14.95s/it]

ratio tensor(0.7010, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4443, device='cuda:0') reward: tensor(1.3073, device='cuda:0') factor: tensor(0.1370, device='cuda:0')
reward_raw: tensor(0.4806, device='cuda:0') reward: tensor(1.3464, device='cuda:0') factor: tensor(0.1342, device='cuda:0')
reward_raw: tensor(0.4386, device='cuda:0') reward: tensor(1.3153, device='cuda:0') factor: tensor(0.1232, device='cuda:0')
reward_raw: tensor(0.4720, device='cuda:0') reward: tensor(1.3400, device='cuda:0') factor: tensor(0.1320, device='cuda:0')
reward_raw: tensor(0.4584, device='cuda:0') reward: tensor(1.3251, device='cuda:0') factor: tensor(0.1333, device='cuda:0')


Epoch 1, Step 191: Loss = -0.0852, Actor Loss = -0.1614, Critic Loss = 0.1527, Entropy = 0.1665, Returns = 1.3807, Value = 1.2282, mask_loss = 0.1218std_loss = 0.2496:   3%|▎         | 191/5475 [51:39<22:31:26, 15.35s/it]

ratio tensor(1.2573, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4211, device='cuda:0') reward: tensor(1.2863, device='cuda:0') factor: tensor(0.1348, device='cuda:0')
reward_raw: tensor(0.4225, device='cuda:0') reward: tensor(1.2869, device='cuda:0') factor: tensor(0.1356, device='cuda:0')
reward_raw: tensor(0.4500, device='cuda:0') reward: tensor(1.3084, device='cuda:0') factor: tensor(0.1416, device='cuda:0')
reward_raw: tensor(0.4495, device='cuda:0') reward: tensor(1.3097, device='cuda:0') factor: tensor(0.1398, device='cuda:0')
reward_raw: tensor(0.4250, device='cuda:0') reward: tensor(1.2875, device='cuda:0') factor: tensor(0.1374, device='cuda:0')


Epoch 1, Step 192: Loss = -0.0757, Actor Loss = -0.1225, Critic Loss = 0.0940, Entropy = 0.1661, Returns = 1.2487, Value = 1.2870, mask_loss = 0.1294std_loss = 0.2605:   4%|▎         | 192/5475 [51:53<21:55:28, 14.94s/it]

ratio tensor(1.0249, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4383, device='cuda:0') reward: tensor(1.3178, device='cuda:0') factor: tensor(0.1205, device='cuda:0')
reward_raw: tensor(0.4182, device='cuda:0') reward: tensor(1.2937, device='cuda:0') factor: tensor(0.1246, device='cuda:0')
reward_raw: tensor(0.4250, device='cuda:0') reward: tensor(1.2968, device='cuda:0') factor: tensor(0.1281, device='cuda:0')
reward_raw: tensor(0.4287, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.1231, device='cuda:0')
reward_raw: tensor(0.4255, device='cuda:0') reward: tensor(1.3050, device='cuda:0') factor: tensor(0.1205, device='cuda:0')


Epoch 1, Step 193: Loss = 0.0860, Actor Loss = 0.0565, Critic Loss = 0.0594, Entropy = 0.1605, Returns = 1.2491, Value = 1.2807, mask_loss = 0.1360std_loss = 0.2746:   4%|▎         | 193/5475 [52:09<22:29:04, 15.32s/it]  

ratio tensor(0.8643, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5105, device='cuda:0') reward: tensor(1.3663, device='cuda:0') factor: tensor(0.1442, device='cuda:0')
reward_raw: tensor(0.4825, device='cuda:0') reward: tensor(1.3373, device='cuda:0') factor: tensor(0.1453, device='cuda:0')
reward_raw: tensor(0.5062, device='cuda:0') reward: tensor(1.3589, device='cuda:0') factor: tensor(0.1473, device='cuda:0')
reward_raw: tensor(0.5357, device='cuda:0') reward: tensor(1.3854, device='cuda:0') factor: tensor(0.1503, device='cuda:0')
reward_raw: tensor(0.4851, device='cuda:0') reward: tensor(1.3425, device='cuda:0') factor: tensor(0.1426, device='cuda:0')


Epoch 1, Step 194: Loss = -0.0572, Actor Loss = -0.0914, Critic Loss = 0.0689, Entropy = 0.1882, Returns = 1.3317, Value = 1.3660, mask_loss = 0.1600std_loss = 0.2866:   4%|▎         | 194/5475 [52:24<22:29:08, 15.33s/it]

ratio tensor(1.3258, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4882, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.1301, device='cuda:0')
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.3562, device='cuda:0') factor: tensor(0.1356, device='cuda:0')
reward_raw: tensor(0.4767, device='cuda:0') reward: tensor(1.3433, device='cuda:0') factor: tensor(0.1334, device='cuda:0')
reward_raw: tensor(0.5084, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1351, device='cuda:0')
reward_raw: tensor(0.4914, device='cuda:0') reward: tensor(1.3575, device='cuda:0') factor: tensor(0.1339, device='cuda:0')


Epoch 1, Step 195: Loss = 0.4089, Actor Loss = 0.3487, Critic Loss = 0.1207, Entropy = 0.1603, Returns = 1.3364, Value = 1.4932, mask_loss = 0.1188std_loss = 0.2473:   4%|▎         | 195/5475 [52:38<21:34:02, 14.70s/it]  

ratio tensor(1.3428, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5121, device='cuda:0') reward: tensor(1.3487, device='cuda:0') factor: tensor(0.1634, device='cuda:0')
reward_raw: tensor(0.5091, device='cuda:0') reward: tensor(1.3412, device='cuda:0') factor: tensor(0.1679, device='cuda:0')
reward_raw: tensor(0.4855, device='cuda:0') reward: tensor(1.3244, device='cuda:0') factor: tensor(0.1611, device='cuda:0')
reward_raw: tensor(0.5027, device='cuda:0') reward: tensor(1.3336, device='cuda:0') factor: tensor(0.1691, device='cuda:0')
reward_raw: tensor(0.5055, device='cuda:0') reward: tensor(1.3307, device='cuda:0') factor: tensor(0.1749, device='cuda:0')


Epoch 1, Step 196: Loss = 0.1889, Actor Loss = 0.1359, Critic Loss = 0.1064, Entropy = 0.1747, Returns = 1.3832, Value = 1.3826, mask_loss = 0.1637std_loss = 0.2944:   4%|▎         | 196/5475 [52:48<19:55:24, 13.59s/it]

ratio tensor(1.0415, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4392, device='cuda:0') reward: tensor(1.3096, device='cuda:0') factor: tensor(0.1296, device='cuda:0')
reward_raw: tensor(0.4396, device='cuda:0') reward: tensor(1.2963, device='cuda:0') factor: tensor(0.1432, device='cuda:0')
reward_raw: tensor(0.4432, device='cuda:0') reward: tensor(1.3008, device='cuda:0') factor: tensor(0.1425, device='cuda:0')
reward_raw: tensor(0.4330, device='cuda:0') reward: tensor(1.2998, device='cuda:0') factor: tensor(0.1331, device='cuda:0')
reward_raw: tensor(0.4394, device='cuda:0') reward: tensor(1.3078, device='cuda:0') factor: tensor(0.1316, device='cuda:0')


Epoch 1, Step 197: Loss = 0.1217, Actor Loss = 0.0656, Critic Loss = 0.1126, Entropy = 0.1691, Returns = 1.2415, Value = 1.2951, mask_loss = 0.1517std_loss = 0.2819:   4%|▎         | 197/5475 [53:03<20:31:30, 14.00s/it]

ratio tensor(1.2602, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5075, device='cuda:0') reward: tensor(1.3794, device='cuda:0') factor: tensor(0.1281, device='cuda:0')
reward_raw: tensor(0.4648, device='cuda:0') reward: tensor(1.3505, device='cuda:0') factor: tensor(0.1144, device='cuda:0')
reward_raw: tensor(0.4828, device='cuda:0') reward: tensor(1.3615, device='cuda:0') factor: tensor(0.1212, device='cuda:0')
reward_raw: tensor(0.5035, device='cuda:0') reward: tensor(1.3822, device='cuda:0') factor: tensor(0.1213, device='cuda:0')
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.3641, device='cuda:0') factor: tensor(0.1233, device='cuda:0')


Epoch 1, Step 198: Loss = -0.0099, Actor Loss = -0.0296, Critic Loss = 0.0397, Entropy = 0.1782, Returns = 1.3900, Value = 1.4120, mask_loss = 0.1349std_loss = 0.2594:   4%|▎         | 198/5475 [53:21<22:03:56, 15.05s/it]

ratio tensor(1.1952, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5363, device='cuda:0') reward: tensor(1.3779, device='cuda:0') factor: tensor(0.1584, device='cuda:0')
reward_raw: tensor(0.5185, device='cuda:0') reward: tensor(1.3579, device='cuda:0') factor: tensor(0.1606, device='cuda:0')
reward_raw: tensor(0.4766, device='cuda:0') reward: tensor(1.3199, device='cuda:0') factor: tensor(0.1568, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.3585, device='cuda:0') factor: tensor(0.1508, device='cuda:0')
reward_raw: tensor(0.4624, device='cuda:0') reward: tensor(1.3049, device='cuda:0') factor: tensor(0.1575, device='cuda:0')


Epoch 1, Step 199: Loss = 0.1116, Actor Loss = 0.0541, Critic Loss = 0.1153, Entropy = 0.2025, Returns = 1.4033, Value = 1.4324, mask_loss = 0.1486std_loss = 0.2604:   4%|▎         | 199/5475 [53:38<23:01:29, 15.71s/it]  

ratio tensor(1.5486, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4225, device='cuda:0') reward: tensor(1.2683, device='cuda:0') factor: tensor(0.1542, device='cuda:0')
reward_raw: tensor(0.4364, device='cuda:0') reward: tensor(1.2801, device='cuda:0') factor: tensor(0.1563, device='cuda:0')
reward_raw: tensor(0.4296, device='cuda:0') reward: tensor(1.2649, device='cuda:0') factor: tensor(0.1647, device='cuda:0')
reward_raw: tensor(0.4601, device='cuda:0') reward: tensor(1.2998, device='cuda:0') factor: tensor(0.1603, device='cuda:0')
reward_raw: tensor(0.4040, device='cuda:0') reward: tensor(1.2480, device='cuda:0') factor: tensor(0.1560, device='cuda:0')


Epoch 1, Step 200: Loss = 0.1103, Actor Loss = 0.0771, Critic Loss = 0.0668, Entropy = 0.1816, Returns = 1.2748, Value = 1.2696, mask_loss = 0.1585std_loss = 0.2890:   4%|▎         | 200/5475 [53:58<24:59:44, 17.06s/it]

ratio tensor(0.9195, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4716, device='cuda:0') reward: tensor(1.3105, device='cuda:0') factor: tensor(0.1611, device='cuda:0')
reward_raw: tensor(0.4897, device='cuda:0') reward: tensor(1.3246, device='cuda:0') factor: tensor(0.1651, device='cuda:0')
reward_raw: tensor(0.4763, device='cuda:0') reward: tensor(1.3144, device='cuda:0') factor: tensor(0.1619, device='cuda:0')
reward_raw: tensor(0.4756, device='cuda:0') reward: tensor(1.3078, device='cuda:0') factor: tensor(0.1678, device='cuda:0')
reward_raw: tensor(0.4720, device='cuda:0') reward: tensor(1.3063, device='cuda:0') factor: tensor(0.1656, device='cuda:0')


Epoch 1, Step 201: Loss = -0.2020, Actor Loss = -0.2464, Critic Loss = 0.0893, Entropy = 0.1868, Returns = 1.2614, Value = 1.1900, mask_loss = 0.1717std_loss = 0.2932:   4%|▎         | 200/5475 [54:16<24:59:44, 17.06s/it]

ratio tensor(1.2726, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 201: Loss = -0.2020, Actor Loss = -0.2464, Critic Loss = 0.0893, Entropy = 0.1868, Returns = 1.2614, Value = 1.1900, mask_loss = 0.1717std_loss = 0.2932:   4%|▎         | 201/5475 [54:16<25:24:52, 17.35s/it]

reward_raw: tensor(0.4677, device='cuda:0') reward: tensor(1.3293, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.4528, device='cuda:0') reward: tensor(1.3157, device='cuda:0') factor: tensor(0.1371, device='cuda:0')
reward_raw: tensor(0.4502, device='cuda:0') reward: tensor(1.3104, device='cuda:0') factor: tensor(0.1398, device='cuda:0')
reward_raw: tensor(0.4808, device='cuda:0') reward: tensor(1.3385, device='cuda:0') factor: tensor(0.1423, device='cuda:0')
reward_raw: tensor(0.4577, device='cuda:0') reward: tensor(1.3203, device='cuda:0') factor: tensor(0.1374, device='cuda:0')


Epoch 1, Step 202: Loss = 0.2422, Actor Loss = 0.1852, Critic Loss = 0.1145, Entropy = 0.1864, Returns = 1.3205, Value = 1.4346, mask_loss = 0.1471std_loss = 0.2732:   4%|▎         | 202/5475 [54:30<23:32:59, 16.08s/it]  

ratio tensor(0.8643, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5762, device='cuda:0') reward: tensor(1.4141, device='cuda:0') factor: tensor(0.1621, device='cuda:0')
reward_raw: tensor(0.5584, device='cuda:0') reward: tensor(1.4066, device='cuda:0') factor: tensor(0.1518, device='cuda:0')
reward_raw: tensor(0.5288, device='cuda:0') reward: tensor(1.3743, device='cuda:0') factor: tensor(0.1545, device='cuda:0')
reward_raw: tensor(0.5570, device='cuda:0') reward: tensor(1.3993, device='cuda:0') factor: tensor(0.1577, device='cuda:0')
reward_raw: tensor(0.5502, device='cuda:0') reward: tensor(1.3956, device='cuda:0') factor: tensor(0.1546, device='cuda:0')


Epoch 1, Step 203: Loss = -0.1247, Actor Loss = -0.2036, Critic Loss = 0.1581, Entropy = 0.1898, Returns = 1.4517, Value = 1.3970, mask_loss = 0.1467std_loss = 0.2611:   4%|▎         | 203/5475 [54:53<26:37:24, 18.18s/it]

ratio tensor(1.2361, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3915, device='cuda:0') reward: tensor(1.2575, device='cuda:0') factor: tensor(0.1340, device='cuda:0')
reward_raw: tensor(0.3969, device='cuda:0') reward: tensor(1.2675, device='cuda:0') factor: tensor(0.1294, device='cuda:0')
reward_raw: tensor(0.3837, device='cuda:0') reward: tensor(1.2515, device='cuda:0') factor: tensor(0.1322, device='cuda:0')
reward_raw: tensor(0.3861, device='cuda:0') reward: tensor(1.2455, device='cuda:0') factor: tensor(0.1406, device='cuda:0')
reward_raw: tensor(0.3632, device='cuda:0') reward: tensor(1.2351, device='cuda:0') factor: tensor(0.1281, device='cuda:0')


Epoch 1, Step 204: Loss = 0.0721, Actor Loss = 0.0456, Critic Loss = 0.0534, Entropy = 0.1759, Returns = 1.3103, Value = 1.2537, mask_loss = 0.1294std_loss = 0.2542:   4%|▎         | 204/5475 [55:07<24:47:31, 16.93s/it]  

ratio tensor(0.7980, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3859, device='cuda:0') reward: tensor(1.2224, device='cuda:0') factor: tensor(0.1634, device='cuda:0')
reward_raw: tensor(0.3970, device='cuda:0') reward: tensor(1.2356, device='cuda:0') factor: tensor(0.1614, device='cuda:0')
reward_raw: tensor(0.4126, device='cuda:0') reward: tensor(1.2411, device='cuda:0') factor: tensor(0.1715, device='cuda:0')
reward_raw: tensor(0.3928, device='cuda:0') reward: tensor(1.2250, device='cuda:0') factor: tensor(0.1678, device='cuda:0')
reward_raw: tensor(0.3859, device='cuda:0') reward: tensor(1.2174, device='cuda:0') factor: tensor(0.1684, device='cuda:0')


Epoch 1, Step 205: Loss = 0.0820, Actor Loss = 0.0331, Critic Loss = 0.0983, Entropy = 0.2110, Returns = 1.2287, Value = 1.0881, mask_loss = 0.1873std_loss = 0.3031:   4%|▎         | 205/5475 [55:22<24:15:20, 16.57s/it]

ratio tensor(0.8394, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4226, device='cuda:0') reward: tensor(1.2391, device='cuda:0') factor: tensor(0.1835, device='cuda:0')
reward_raw: tensor(0.4360, device='cuda:0') reward: tensor(1.2567, device='cuda:0') factor: tensor(0.1793, device='cuda:0')
reward_raw: tensor(0.4691, device='cuda:0') reward: tensor(1.2848, device='cuda:0') factor: tensor(0.1843, device='cuda:0')
reward_raw: tensor(0.4623, device='cuda:0') reward: tensor(1.2833, device='cuda:0') factor: tensor(0.1790, device='cuda:0')
reward_raw: tensor(0.4662, device='cuda:0') reward: tensor(1.2840, device='cuda:0') factor: tensor(0.1822, device='cuda:0')


Epoch 1, Step 206: Loss = 0.0461, Actor Loss = 0.0210, Critic Loss = 0.0506, Entropy = 0.1939, Returns = 1.2671, Value = 1.3803, mask_loss = 0.1658std_loss = 0.2832:   4%|▍         | 206/5475 [55:37<23:19:40, 15.94s/it]

ratio tensor(1.3590, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5158, device='cuda:0') reward: tensor(1.3379, device='cuda:0') factor: tensor(0.1779, device='cuda:0')
reward_raw: tensor(0.5198, device='cuda:0') reward: tensor(1.3434, device='cuda:0') factor: tensor(0.1764, device='cuda:0')
reward_raw: tensor(0.5149, device='cuda:0') reward: tensor(1.3439, device='cuda:0') factor: tensor(0.1710, device='cuda:0')
reward_raw: tensor(0.5168, device='cuda:0') reward: tensor(1.3452, device='cuda:0') factor: tensor(0.1716, device='cuda:0')
reward_raw: tensor(0.5095, device='cuda:0') reward: tensor(1.3347, device='cuda:0') factor: tensor(0.1748, device='cuda:0')


Epoch 1, Step 207: Loss = -0.2128, Actor Loss = -0.2961, Critic Loss = 0.1668, Entropy = 0.1984, Returns = 1.3396, Value = 1.3854, mask_loss = 0.1861std_loss = 0.3070:   4%|▍         | 207/5475 [55:51<22:26:26, 15.34s/it]

ratio tensor(2.2783, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5448, device='cuda:0') reward: tensor(1.4129, device='cuda:0') factor: tensor(0.1319, device='cuda:0')
reward_raw: tensor(0.5076, device='cuda:0') reward: tensor(1.3855, device='cuda:0') factor: tensor(0.1221, device='cuda:0')
reward_raw: tensor(0.5274, device='cuda:0') reward: tensor(1.3992, device='cuda:0') factor: tensor(0.1282, device='cuda:0')
reward_raw: tensor(0.5103, device='cuda:0') reward: tensor(1.3859, device='cuda:0') factor: tensor(0.1243, device='cuda:0')
reward_raw: tensor(0.5153, device='cuda:0') reward: tensor(1.3901, device='cuda:0') factor: tensor(0.1252, device='cuda:0')


Epoch 1, Step 208: Loss = 0.2608, Actor Loss = 0.2340, Critic Loss = 0.0538, Entropy = 0.1870, Returns = 1.3964, Value = 1.4614, mask_loss = 0.1426std_loss = 0.2606:   4%|▍         | 208/5475 [56:06<22:33:55, 15.42s/it]  

ratio tensor(0.4864, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4995, device='cuda:0') reward: tensor(1.3435, device='cuda:0') factor: tensor(0.1559, device='cuda:0')
reward_raw: tensor(0.4643, device='cuda:0') reward: tensor(1.2993, device='cuda:0') factor: tensor(0.1649, device='cuda:0')
reward_raw: tensor(0.5004, device='cuda:0') reward: tensor(1.3405, device='cuda:0') factor: tensor(0.1600, device='cuda:0')
reward_raw: tensor(0.4818, device='cuda:0') reward: tensor(1.3256, device='cuda:0') factor: tensor(0.1561, device='cuda:0')
reward_raw: tensor(0.4993, device='cuda:0') reward: tensor(1.3401, device='cuda:0') factor: tensor(0.1592, device='cuda:0')


Epoch 1, Step 209: Loss = -0.0856, Actor Loss = -0.1078, Critic Loss = 0.0448, Entropy = 0.2059, Returns = 1.3266, Value = 1.2658, mask_loss = 0.1776std_loss = 0.2917:   4%|▍         | 209/5475 [56:22<22:40:24, 15.50s/it]

ratio tensor(1.0464, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4335, device='cuda:0') reward: tensor(1.2801, device='cuda:0') factor: tensor(0.1534, device='cuda:0')
reward_raw: tensor(0.4128, device='cuda:0') reward: tensor(1.2596, device='cuda:0') factor: tensor(0.1532, device='cuda:0')
reward_raw: tensor(0.4192, device='cuda:0') reward: tensor(1.2646, device='cuda:0') factor: tensor(0.1547, device='cuda:0')
reward_raw: tensor(0.4020, device='cuda:0') reward: tensor(1.2490, device='cuda:0') factor: tensor(0.1530, device='cuda:0')
reward_raw: tensor(0.4030, device='cuda:0') reward: tensor(1.2440, device='cuda:0') factor: tensor(0.1591, device='cuda:0')


Epoch 1, Step 210: Loss = 0.1870, Actor Loss = 0.1184, Critic Loss = 0.1376, Entropy = 0.2265, Returns = 1.2065, Value = 1.3342, mask_loss = 0.1882std_loss = 0.2970:   4%|▍         | 210/5475 [56:35<21:22:46, 14.62s/it]  

ratio tensor(0.3891, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.3134, device='cuda:0') factor: tensor(0.1834, device='cuda:0')
reward_raw: tensor(0.4907, device='cuda:0') reward: tensor(1.3125, device='cuda:0') factor: tensor(0.1782, device='cuda:0')
reward_raw: tensor(0.4944, device='cuda:0') reward: tensor(1.3206, device='cuda:0') factor: tensor(0.1738, device='cuda:0')
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.3104, device='cuda:0') factor: tensor(0.1856, device='cuda:0')
reward_raw: tensor(0.5032, device='cuda:0') reward: tensor(1.3155, device='cuda:0') factor: tensor(0.1876, device='cuda:0')


Epoch 1, Step 211: Loss = -0.0560, Actor Loss = -0.1275, Critic Loss = 0.1435, Entropy = 0.2235, Returns = 1.3048, Value = 1.1079, mask_loss = 0.1862std_loss = 0.2917:   4%|▍         | 211/5475 [56:53<23:01:12, 15.74s/it]

ratio tensor(1.1336, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4645, device='cuda:0') reward: tensor(1.3157, device='cuda:0') factor: tensor(0.1487, device='cuda:0')
reward_raw: tensor(0.4824, device='cuda:0') reward: tensor(1.3401, device='cuda:0') factor: tensor(0.1423, device='cuda:0')
reward_raw: tensor(0.4813, device='cuda:0') reward: tensor(1.3371, device='cuda:0') factor: tensor(0.1443, device='cuda:0')
reward_raw: tensor(0.4980, device='cuda:0') reward: tensor(1.3464, device='cuda:0') factor: tensor(0.1516, device='cuda:0')
reward_raw: tensor(0.4806, device='cuda:0') reward: tensor(1.3372, device='cuda:0') factor: tensor(0.1435, device='cuda:0')


Epoch 1, Step 212: Loss = -0.0539, Actor Loss = -0.0763, Critic Loss = 0.0453, Entropy = 0.2025, Returns = 1.3868, Value = 1.4386, mask_loss = 0.1308std_loss = 0.2423:   4%|▍         | 212/5475 [57:14<25:14:30, 17.27s/it]

ratio tensor(0.8990, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5160, device='cuda:0') reward: tensor(1.3530, device='cuda:0') factor: tensor(0.1631, device='cuda:0')
reward_raw: tensor(0.5319, device='cuda:0') reward: tensor(1.3659, device='cuda:0') factor: tensor(0.1659, device='cuda:0')
reward_raw: tensor(0.5435, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.1731, device='cuda:0')
reward_raw: tensor(0.5206, device='cuda:0') reward: tensor(1.3548, device='cuda:0') factor: tensor(0.1657, device='cuda:0')
reward_raw: tensor(0.5251, device='cuda:0') reward: tensor(1.3453, device='cuda:0') factor: tensor(0.1797, device='cuda:0')


Epoch 1, Step 213: Loss = -0.0284, Actor Loss = -0.1005, Critic Loss = 0.1447, Entropy = 0.2067, Returns = 1.3341, Value = 1.2475, mask_loss = 0.1659std_loss = 0.2822:   4%|▍         | 213/5475 [57:26<22:58:23, 15.72s/it]

ratio tensor(1.9045, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4955, device='cuda:0') reward: tensor(1.3229, device='cuda:0') factor: tensor(0.1726, device='cuda:0')
reward_raw: tensor(0.5338, device='cuda:0') reward: tensor(1.3546, device='cuda:0') factor: tensor(0.1791, device='cuda:0')
reward_raw: tensor(0.4883, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.1725, device='cuda:0')
reward_raw: tensor(0.4837, device='cuda:0') reward: tensor(1.3149, device='cuda:0') factor: tensor(0.1687, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.3313, device='cuda:0') factor: tensor(0.1619, device='cuda:0')


Epoch 1, Step 214: Loss = -0.0955, Actor Loss = -0.1623, Critic Loss = 0.1342, Entropy = 0.2052, Returns = 1.3199, Value = 1.1084, mask_loss = 0.1646std_loss = 0.2743:   4%|▍         | 214/5475 [57:43<23:23:51, 16.01s/it]

ratio tensor(0.9085, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5036, device='cuda:0') reward: tensor(1.3308, device='cuda:0') factor: tensor(0.1728, device='cuda:0')
reward_raw: tensor(0.4613, device='cuda:0') reward: tensor(1.2867, device='cuda:0') factor: tensor(0.1746, device='cuda:0')
reward_raw: tensor(0.4794, device='cuda:0') reward: tensor(1.3073, device='cuda:0') factor: tensor(0.1721, device='cuda:0')
reward_raw: tensor(0.4815, device='cuda:0') reward: tensor(1.2981, device='cuda:0') factor: tensor(0.1834, device='cuda:0')
reward_raw: tensor(0.4643, device='cuda:0') reward: tensor(1.2912, device='cuda:0') factor: tensor(0.1732, device='cuda:0')


Epoch 1, Step 215: Loss = 0.1538, Actor Loss = 0.1055, Critic Loss = 0.0970, Entropy = 0.2129, Returns = 1.3489, Value = 1.3561, mask_loss = 0.1637std_loss = 0.2692:   4%|▍         | 215/5475 [58:00<24:11:23, 16.56s/it]  

ratio tensor(0.7879, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4615, device='cuda:0') reward: tensor(1.2746, device='cuda:0') factor: tensor(0.1869, device='cuda:0')
reward_raw: tensor(0.4564, device='cuda:0') reward: tensor(1.2537, device='cuda:0') factor: tensor(0.2027, device='cuda:0')
reward_raw: tensor(0.4302, device='cuda:0') reward: tensor(1.2377, device='cuda:0') factor: tensor(0.1925, device='cuda:0')
reward_raw: tensor(0.4479, device='cuda:0') reward: tensor(1.2594, device='cuda:0') factor: tensor(0.1886, device='cuda:0')
reward_raw: tensor(0.4439, device='cuda:0') reward: tensor(1.2507, device='cuda:0') factor: tensor(0.1932, device='cuda:0')


Epoch 1, Step 216: Loss = 0.0278, Actor Loss = -0.0373, Critic Loss = 0.1308, Entropy = 0.2258, Returns = 1.2546, Value = 1.1681, mask_loss = 0.1798std_loss = 0.2888:   4%|▍         | 216/5475 [58:15<23:22:05, 16.00s/it]

ratio tensor(1.0380, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4702, device='cuda:0') reward: tensor(1.2938, device='cuda:0') factor: tensor(0.1764, device='cuda:0')
reward_raw: tensor(0.5068, device='cuda:0') reward: tensor(1.3189, device='cuda:0') factor: tensor(0.1879, device='cuda:0')
reward_raw: tensor(0.4916, device='cuda:0') reward: tensor(1.2935, device='cuda:0') factor: tensor(0.1981, device='cuda:0')
reward_raw: tensor(0.4930, device='cuda:0') reward: tensor(1.3118, device='cuda:0') factor: tensor(0.1812, device='cuda:0')
reward_raw: tensor(0.4912, device='cuda:0') reward: tensor(1.3161, device='cuda:0') factor: tensor(0.1752, device='cuda:0')


Epoch 1, Step 217: Loss = 0.0442, Actor Loss = -0.0106, Critic Loss = 0.1100, Entropy = 0.2192, Returns = 1.2935, Value = 1.3749, mask_loss = 0.1688std_loss = 0.2807:   4%|▍         | 217/5475 [58:30<22:51:44, 15.65s/it]

ratio tensor(0.2952, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4971, device='cuda:0') reward: tensor(1.3603, device='cuda:0') factor: tensor(0.1368, device='cuda:0')
reward_raw: tensor(0.4780, device='cuda:0') reward: tensor(1.3476, device='cuda:0') factor: tensor(0.1304, device='cuda:0')
reward_raw: tensor(0.4927, device='cuda:0') reward: tensor(1.3524, device='cuda:0') factor: tensor(0.1403, device='cuda:0')
reward_raw: tensor(0.4950, device='cuda:0') reward: tensor(1.3481, device='cuda:0') factor: tensor(0.1469, device='cuda:0')
reward_raw: tensor(0.4516, device='cuda:0') reward: tensor(1.3180, device='cuda:0') factor: tensor(0.1336, device='cuda:0')


Epoch 1, Step 218: Loss = 0.1301, Actor Loss = 0.0858, Critic Loss = 0.0890, Entropy = 0.1577, Returns = 1.3264, Value = 1.3082, mask_loss = 0.1131std_loss = 0.2431:   4%|▍         | 218/5475 [58:44<22:07:24, 15.15s/it] 

ratio tensor(0.7529, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4927, device='cuda:0') reward: tensor(1.3470, device='cuda:0') factor: tensor(0.1458, device='cuda:0')
reward_raw: tensor(0.4650, device='cuda:0') reward: tensor(1.3218, device='cuda:0') factor: tensor(0.1433, device='cuda:0')
reward_raw: tensor(0.5119, device='cuda:0') reward: tensor(1.3652, device='cuda:0') factor: tensor(0.1467, device='cuda:0')
reward_raw: tensor(0.5251, device='cuda:0') reward: tensor(1.3756, device='cuda:0') factor: tensor(0.1495, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.3697, device='cuda:0') factor: tensor(0.1473, device='cuda:0')


Epoch 1, Step 219: Loss = -0.1130, Actor Loss = -0.1754, Critic Loss = 0.1251, Entropy = 0.1474, Returns = 1.4346, Value = 1.2935, mask_loss = 0.1373std_loss = 0.2795:   4%|▍         | 219/5475 [58:59<21:53:01, 14.99s/it]

ratio tensor(1.3523, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4916, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.1224, device='cuda:0')
reward_raw: tensor(0.4660, device='cuda:0') reward: tensor(1.3434, device='cuda:0') factor: tensor(0.1226, device='cuda:0')
reward_raw: tensor(0.4787, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.1268, device='cuda:0')
reward_raw: tensor(0.4143, device='cuda:0') reward: tensor(1.2906, device='cuda:0') factor: tensor(0.1236, device='cuda:0')
reward_raw: tensor(0.4742, device='cuda:0') reward: tensor(1.3426, device='cuda:0') factor: tensor(0.1315, device='cuda:0')


Epoch 1, Step 220: Loss = 0.1430, Actor Loss = 0.0949, Critic Loss = 0.0964, Entropy = 0.1519, Returns = 1.3506, Value = 1.4865, mask_loss = 0.1327std_loss = 0.2685:   4%|▍         | 220/5475 [59:19<24:11:09, 16.57s/it]  

ratio tensor(0.7006, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4961, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1136, device='cuda:0')
reward_raw: tensor(0.4435, device='cuda:0') reward: tensor(1.3285, device='cuda:0') factor: tensor(0.1150, device='cuda:0')
reward_raw: tensor(0.4964, device='cuda:0') reward: tensor(1.3851, device='cuda:0') factor: tensor(0.1113, device='cuda:0')
reward_raw: tensor(0.4806, device='cuda:0') reward: tensor(1.3660, device='cuda:0') factor: tensor(0.1146, device='cuda:0')
reward_raw: tensor(0.4685, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.1091, device='cuda:0')


Epoch 1, Step 221: Loss = 0.0236, Actor Loss = -0.0027, Critic Loss = 0.0528, Entropy = 0.1552, Returns = 1.4033, Value = 1.3782, mask_loss = 0.1294std_loss = 0.2639:   4%|▍         | 220/5475 [59:35<24:11:09, 16.57s/it]

ratio tensor(0.9990, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 221: Loss = 0.0236, Actor Loss = -0.0027, Critic Loss = 0.0528, Entropy = 0.1552, Returns = 1.4033, Value = 1.3782, mask_loss = 0.1294std_loss = 0.2639:   4%|▍         | 221/5475 [59:35<24:09:08, 16.55s/it]

reward_raw: tensor(0.6149, device='cuda:0') reward: tensor(1.4715, device='cuda:0') factor: tensor(0.1434, device='cuda:0')
reward_raw: tensor(0.6131, device='cuda:0') reward: tensor(1.4672, device='cuda:0') factor: tensor(0.1458, device='cuda:0')
reward_raw: tensor(0.6170, device='cuda:0') reward: tensor(1.4684, device='cuda:0') factor: tensor(0.1486, device='cuda:0')
reward_raw: tensor(0.6182, device='cuda:0') reward: tensor(1.4818, device='cuda:0') factor: tensor(0.1364, device='cuda:0')
reward_raw: tensor(0.6017, device='cuda:0') reward: tensor(1.4653, device='cuda:0') factor: tensor(0.1364, device='cuda:0')


Epoch 1, Step 222: Loss = -0.2101, Actor Loss = -0.2529, Critic Loss = 0.0859, Entropy = 0.1604, Returns = 1.4553, Value = 1.3915, mask_loss = 0.1409std_loss = 0.2746:   4%|▍         | 222/5475 [59:48<22:25:07, 15.36s/it]

ratio tensor(1.7796, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5589, device='cuda:0') reward: tensor(1.4263, device='cuda:0') factor: tensor(0.1326, device='cuda:0')
reward_raw: tensor(0.5669, device='cuda:0') reward: tensor(1.4264, device='cuda:0') factor: tensor(0.1405, device='cuda:0')
reward_raw: tensor(0.5479, device='cuda:0') reward: tensor(1.4048, device='cuda:0') factor: tensor(0.1430, device='cuda:0')
reward_raw: tensor(0.5445, device='cuda:0') reward: tensor(1.4037, device='cuda:0') factor: tensor(0.1409, device='cuda:0')
reward_raw: tensor(0.5571, device='cuda:0') reward: tensor(1.4209, device='cuda:0') factor: tensor(0.1362, device='cuda:0')


Epoch 1, Step 223: Loss = 0.2649, Actor Loss = 0.2269, Critic Loss = 0.0764, Entropy = 0.1719, Returns = 1.4691, Value = 1.5139, mask_loss = 0.1500std_loss = 0.2848:   4%|▍         | 223/5475 [1:00:00<20:47:37, 14.25s/it]

ratio tensor(0.6701, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4951, device='cuda:0') reward: tensor(1.3673, device='cuda:0') factor: tensor(0.1278, device='cuda:0')
reward_raw: tensor(0.4965, device='cuda:0') reward: tensor(1.3706, device='cuda:0') factor: tensor(0.1259, device='cuda:0')
reward_raw: tensor(0.5030, device='cuda:0') reward: tensor(1.3842, device='cuda:0') factor: tensor(0.1188, device='cuda:0')
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.3606, device='cuda:0') factor: tensor(0.1239, device='cuda:0')
reward_raw: tensor(0.4869, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.1251, device='cuda:0')


Epoch 1, Step 224: Loss = -0.0367, Actor Loss = -0.1118, Critic Loss = 0.1507, Entropy = 0.1556, Returns = 1.4258, Value = 1.2999, mask_loss = 0.1335std_loss = 0.2730:   4%|▍         | 224/5475 [1:00:15<21:26:29, 14.70s/it]

ratio tensor(1.1621, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4527, device='cuda:0') reward: tensor(1.3010, device='cuda:0') factor: tensor(0.1517, device='cuda:0')
reward_raw: tensor(0.4651, device='cuda:0') reward: tensor(1.3204, device='cuda:0') factor: tensor(0.1446, device='cuda:0')
reward_raw: tensor(0.4515, device='cuda:0') reward: tensor(1.3099, device='cuda:0') factor: tensor(0.1417, device='cuda:0')
reward_raw: tensor(0.4523, device='cuda:0') reward: tensor(1.3099, device='cuda:0') factor: tensor(0.1424, device='cuda:0')
reward_raw: tensor(0.4492, device='cuda:0') reward: tensor(1.3079, device='cuda:0') factor: tensor(0.1413, device='cuda:0')


Epoch 1, Step 225: Loss = 0.0864, Actor Loss = 0.0391, Critic Loss = 0.0950, Entropy = 0.1501, Returns = 1.3281, Value = 1.4655, mask_loss = 0.1449std_loss = 0.2894:   4%|▍         | 225/5475 [1:00:29<21:00:28, 14.41s/it]  

ratio tensor(0.8550, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4954, device='cuda:0') reward: tensor(1.3617, device='cuda:0') factor: tensor(0.1337, device='cuda:0')
reward_raw: tensor(0.5042, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.1349, device='cuda:0')
reward_raw: tensor(0.5162, device='cuda:0') reward: tensor(1.3820, device='cuda:0') factor: tensor(0.1342, device='cuda:0')
reward_raw: tensor(0.5079, device='cuda:0') reward: tensor(1.3688, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.4985, device='cuda:0') reward: tensor(1.3559, device='cuda:0') factor: tensor(0.1426, device='cuda:0')


Epoch 1, Step 226: Loss = 0.0663, Actor Loss = -0.0214, Critic Loss = 0.1757, Entropy = 0.1602, Returns = 1.4589, Value = 1.1978, mask_loss = 0.1463std_loss = 0.2789:   4%|▍         | 226/5475 [1:00:42<20:31:14, 14.07s/it]

ratio tensor(0.9046, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5081, device='cuda:0') reward: tensor(1.3474, device='cuda:0') factor: tensor(0.1607, device='cuda:0')
reward_raw: tensor(0.4511, device='cuda:0') reward: tensor(1.3004, device='cuda:0') factor: tensor(0.1508, device='cuda:0')
reward_raw: tensor(0.4492, device='cuda:0') reward: tensor(1.2958, device='cuda:0') factor: tensor(0.1534, device='cuda:0')
reward_raw: tensor(0.5067, device='cuda:0') reward: tensor(1.3575, device='cuda:0') factor: tensor(0.1491, device='cuda:0')
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.1533, device='cuda:0')


Epoch 1, Step 227: Loss = -0.1094, Actor Loss = -0.1860, Critic Loss = 0.1536, Entropy = 0.1749, Returns = 1.3439, Value = 1.3304, mask_loss = 0.1630std_loss = 0.2919:   4%|▍         | 227/5475 [1:00:58<21:06:10, 14.48s/it]

ratio tensor(1.2789, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5062, device='cuda:0') reward: tensor(1.3634, device='cuda:0') factor: tensor(0.1428, device='cuda:0')
reward_raw: tensor(0.4984, device='cuda:0') reward: tensor(1.3565, device='cuda:0') factor: tensor(0.1419, device='cuda:0')
reward_raw: tensor(0.5054, device='cuda:0') reward: tensor(1.3654, device='cuda:0') factor: tensor(0.1400, device='cuda:0')
reward_raw: tensor(0.5478, device='cuda:0') reward: tensor(1.4080, device='cuda:0') factor: tensor(0.1398, device='cuda:0')
reward_raw: tensor(0.5367, device='cuda:0') reward: tensor(1.3901, device='cuda:0') factor: tensor(0.1466, device='cuda:0')


Epoch 1, Step 228: Loss = 0.0490, Actor Loss = -0.0026, Critic Loss = 0.1036, Entropy = 0.1740, Returns = 1.3789, Value = 1.4562, mask_loss = 0.1448std_loss = 0.2757:   4%|▍         | 228/5475 [1:01:11<20:38:17, 14.16s/it] 

ratio tensor(0.9997, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5923, device='cuda:0') reward: tensor(1.4729, device='cuda:0') factor: tensor(0.1194, device='cuda:0')
reward_raw: tensor(0.5638, device='cuda:0') reward: tensor(1.4410, device='cuda:0') factor: tensor(0.1228, device='cuda:0')
reward_raw: tensor(0.5946, device='cuda:0') reward: tensor(1.4648, device='cuda:0') factor: tensor(0.1298, device='cuda:0')
reward_raw: tensor(0.5541, device='cuda:0') reward: tensor(1.4333, device='cuda:0') factor: tensor(0.1209, device='cuda:0')
reward_raw: tensor(0.5599, device='cuda:0') reward: tensor(1.4374, device='cuda:0') factor: tensor(0.1225, device='cuda:0')


Epoch 1, Step 229: Loss = 0.0480, Actor Loss = 0.0048, Critic Loss = 0.0866, Entropy = 0.1589, Returns = 1.4221, Value = 1.4451, mask_loss = 0.1186std_loss = 0.2486:   4%|▍         | 229/5475 [1:01:25<20:35:02, 14.13s/it] 

ratio tensor(1.1137, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4544, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.1222, device='cuda:0')
reward_raw: tensor(0.4625, device='cuda:0') reward: tensor(1.3380, device='cuda:0') factor: tensor(0.1245, device='cuda:0')
reward_raw: tensor(0.4405, device='cuda:0') reward: tensor(1.3277, device='cuda:0') factor: tensor(0.1128, device='cuda:0')
reward_raw: tensor(0.4433, device='cuda:0') reward: tensor(1.3226, device='cuda:0') factor: tensor(0.1207, device='cuda:0')
reward_raw: tensor(0.4724, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.1294, device='cuda:0')


Epoch 1, Step 230: Loss = 0.1854, Actor Loss = 0.1608, Critic Loss = 0.0494, Entropy = 0.1477, Returns = 1.3164, Value = 1.3948, mask_loss = 0.1095std_loss = 0.2441:   4%|▍         | 230/5475 [1:01:39<20:30:20, 14.07s/it]

ratio tensor(0.8761, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5085, device='cuda:0') reward: tensor(1.3736, device='cuda:0') factor: tensor(0.1349, device='cuda:0')
reward_raw: tensor(0.5139, device='cuda:0') reward: tensor(1.3849, device='cuda:0') factor: tensor(0.1290, device='cuda:0')
reward_raw: tensor(0.5038, device='cuda:0') reward: tensor(1.3663, device='cuda:0') factor: tensor(0.1375, device='cuda:0')
reward_raw: tensor(0.5022, device='cuda:0') reward: tensor(1.3677, device='cuda:0') factor: tensor(0.1344, device='cuda:0')
reward_raw: tensor(0.5168, device='cuda:0') reward: tensor(1.3766, device='cuda:0') factor: tensor(0.1403, device='cuda:0')


Epoch 1, Step 231: Loss = 0.0314, Actor Loss = -0.0157, Critic Loss = 0.0944, Entropy = 0.1567, Returns = 1.3762, Value = 1.2713, mask_loss = 0.1383std_loss = 0.2763:   4%|▍         | 231/5475 [1:01:52<20:09:34, 13.84s/it]

ratio tensor(0.9604, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3224, device='cuda:0') factor: tensor(0.1451, device='cuda:0')
reward_raw: tensor(0.4809, device='cuda:0') reward: tensor(1.3382, device='cuda:0') factor: tensor(0.1426, device='cuda:0')
reward_raw: tensor(0.4328, device='cuda:0') reward: tensor(1.2908, device='cuda:0') factor: tensor(0.1420, device='cuda:0')
reward_raw: tensor(0.4714, device='cuda:0') reward: tensor(1.3226, device='cuda:0') factor: tensor(0.1488, device='cuda:0')
reward_raw: tensor(0.4705, device='cuda:0') reward: tensor(1.3181, device='cuda:0') factor: tensor(0.1524, device='cuda:0')


Epoch 1, Step 232: Loss = -0.0745, Actor Loss = -0.1197, Critic Loss = 0.0909, Entropy = 0.1828, Returns = 1.3253, Value = 1.3935, mask_loss = 0.1272std_loss = 0.2431:   4%|▍         | 232/5475 [1:02:07<20:24:47, 14.02s/it]

ratio tensor(2.3846, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4776, device='cuda:0') reward: tensor(1.3368, device='cuda:0') factor: tensor(0.1409, device='cuda:0')
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3153, device='cuda:0') factor: tensor(0.1461, device='cuda:0')
reward_raw: tensor(0.4574, device='cuda:0') reward: tensor(1.3172, device='cuda:0') factor: tensor(0.1402, device='cuda:0')
reward_raw: tensor(0.4558, device='cuda:0') reward: tensor(1.3208, device='cuda:0') factor: tensor(0.1350, device='cuda:0')
reward_raw: tensor(0.4628, device='cuda:0') reward: tensor(1.3160, device='cuda:0') factor: tensor(0.1468, device='cuda:0')


Epoch 1, Step 233: Loss = 0.2043, Actor Loss = 0.1504, Critic Loss = 0.1081, Entropy = 0.1754, Returns = 1.3214, Value = 1.4557, mask_loss = 0.1673std_loss = 0.2852:   4%|▍         | 233/5475 [1:02:29<24:03:13, 16.52s/it]  

ratio tensor(0.9867, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4587, device='cuda:0') reward: tensor(1.3043, device='cuda:0') factor: tensor(0.1544, device='cuda:0')
reward_raw: tensor(0.4302, device='cuda:0') reward: tensor(1.2827, device='cuda:0') factor: tensor(0.1476, device='cuda:0')
reward_raw: tensor(0.4496, device='cuda:0') reward: tensor(1.2954, device='cuda:0') factor: tensor(0.1542, device='cuda:0')
reward_raw: tensor(0.4372, device='cuda:0') reward: tensor(1.2802, device='cuda:0') factor: tensor(0.1570, device='cuda:0')
reward_raw: tensor(0.4562, device='cuda:0') reward: tensor(1.3012, device='cuda:0') factor: tensor(0.1550, device='cuda:0')


Epoch 1, Step 234: Loss = 0.1284, Actor Loss = 0.1013, Critic Loss = 0.0547, Entropy = 0.1548, Returns = 1.3126, Value = 1.2805, mask_loss = 0.1288std_loss = 0.2649:   4%|▍         | 234/5475 [1:02:47<24:28:53, 16.82s/it]

ratio tensor(0.7860, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4702, device='cuda:0') reward: tensor(1.3136, device='cuda:0') factor: tensor(0.1566, device='cuda:0')
reward_raw: tensor(0.4718, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.1556, device='cuda:0')
reward_raw: tensor(0.4907, device='cuda:0') reward: tensor(1.3352, device='cuda:0') factor: tensor(0.1555, device='cuda:0')
reward_raw: tensor(0.4820, device='cuda:0') reward: tensor(1.3243, device='cuda:0') factor: tensor(0.1578, device='cuda:0')
reward_raw: tensor(0.4548, device='cuda:0') reward: tensor(1.2976, device='cuda:0') factor: tensor(0.1572, device='cuda:0')


Epoch 1, Step 235: Loss = -0.2058, Actor Loss = -0.2531, Critic Loss = 0.0948, Entropy = 0.1612, Returns = 1.3440, Value = 1.3015, mask_loss = 0.1527std_loss = 0.2932:   4%|▍         | 235/5475 [1:02:58<22:10:51, 15.24s/it]

ratio tensor(1.2105, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5278, device='cuda:0') reward: tensor(1.3755, device='cuda:0') factor: tensor(0.1523, device='cuda:0')
reward_raw: tensor(0.5255, device='cuda:0') reward: tensor(1.3795, device='cuda:0') factor: tensor(0.1460, device='cuda:0')
reward_raw: tensor(0.5192, device='cuda:0') reward: tensor(1.3682, device='cuda:0') factor: tensor(0.1510, device='cuda:0')
reward_raw: tensor(0.5187, device='cuda:0') reward: tensor(1.3760, device='cuda:0') factor: tensor(0.1426, device='cuda:0')
reward_raw: tensor(0.5055, device='cuda:0') reward: tensor(1.3616, device='cuda:0') factor: tensor(0.1439, device='cuda:0')


Epoch 1, Step 236: Loss = -0.0430, Actor Loss = -0.1029, Critic Loss = 0.1200, Entropy = 0.1577, Returns = 1.4883, Value = 1.4405, mask_loss = 0.1490std_loss = 0.2889:   4%|▍         | 236/5475 [1:03:13<21:47:08, 14.97s/it]

ratio tensor(1.1158, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.3776, device='cuda:0') factor: tensor(0.1253, device='cuda:0')
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.3877, device='cuda:0') factor: tensor(0.1233, device='cuda:0')
reward_raw: tensor(0.5078, device='cuda:0') reward: tensor(1.3748, device='cuda:0') factor: tensor(0.1330, device='cuda:0')
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.3873, device='cuda:0') factor: tensor(0.1282, device='cuda:0')
reward_raw: tensor(0.4947, device='cuda:0') reward: tensor(1.3607, device='cuda:0') factor: tensor(0.1340, device='cuda:0')


Epoch 1, Step 237: Loss = 0.0659, Actor Loss = -0.0115, Critic Loss = 0.1551, Entropy = 0.1297, Returns = 1.3977, Value = 1.3594, mask_loss = 0.1119std_loss = 0.2552:   4%|▍         | 237/5475 [1:03:31<23:27:10, 16.12s/it] 

ratio tensor(1.0530, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4831, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1314, device='cuda:0')
reward_raw: tensor(0.5183, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.1405, device='cuda:0')
reward_raw: tensor(0.5059, device='cuda:0') reward: tensor(1.3751, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.3710, device='cuda:0') factor: tensor(0.1379, device='cuda:0')
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.3524, device='cuda:0') factor: tensor(0.1439, device='cuda:0')


Epoch 1, Step 238: Loss = -0.1190, Actor Loss = -0.1653, Critic Loss = 0.0929, Entropy = 0.1328, Returns = 1.4510, Value = 1.3321, mask_loss = 0.1248std_loss = 0.2647:   4%|▍         | 238/5475 [1:03:47<23:01:44, 15.83s/it]

ratio tensor(0.9728, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3835, device='cuda:0') reward: tensor(1.2704, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.3721, device='cuda:0') reward: tensor(1.2628, device='cuda:0') factor: tensor(0.1093, device='cuda:0')
reward_raw: tensor(0.3449, device='cuda:0') reward: tensor(1.2338, device='cuda:0') factor: tensor(0.1111, device='cuda:0')
reward_raw: tensor(0.3457, device='cuda:0') reward: tensor(1.2387, device='cuda:0') factor: tensor(0.1070, device='cuda:0')
reward_raw: tensor(0.3781, device='cuda:0') reward: tensor(1.2661, device='cuda:0') factor: tensor(0.1120, device='cuda:0')


Epoch 1, Step 239: Loss = 0.3111, Actor Loss = 0.2156, Critic Loss = 0.1913, Entropy = 0.1436, Returns = 1.2269, Value = 1.4512, mask_loss = 0.1190std_loss = 0.2590:   4%|▍         | 239/5475 [1:04:06<24:26:27, 16.80s/it]  

ratio tensor(1.2436, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4639, device='cuda:0') reward: tensor(1.3268, device='cuda:0') factor: tensor(0.1371, device='cuda:0')
reward_raw: tensor(0.4391, device='cuda:0') reward: tensor(1.3054, device='cuda:0') factor: tensor(0.1336, device='cuda:0')
reward_raw: tensor(0.4584, device='cuda:0') reward: tensor(1.3341, device='cuda:0') factor: tensor(0.1243, device='cuda:0')
reward_raw: tensor(0.4345, device='cuda:0') reward: tensor(1.3033, device='cuda:0') factor: tensor(0.1312, device='cuda:0')
reward_raw: tensor(0.4535, device='cuda:0') reward: tensor(1.3186, device='cuda:0') factor: tensor(0.1349, device='cuda:0')


Epoch 1, Step 240: Loss = -0.0591, Actor Loss = -0.1471, Critic Loss = 0.1763, Entropy = 0.1785, Returns = 1.3054, Value = 1.0205, mask_loss = 0.1532std_loss = 0.2805:   4%|▍         | 240/5475 [1:04:22<24:01:52, 16.53s/it]

ratio tensor(1.0377, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4404, device='cuda:0') reward: tensor(1.3028, device='cuda:0') factor: tensor(0.1377, device='cuda:0')
reward_raw: tensor(0.4505, device='cuda:0') reward: tensor(1.3203, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.4584, device='cuda:0') reward: tensor(1.3247, device='cuda:0') factor: tensor(0.1337, device='cuda:0')
reward_raw: tensor(0.4469, device='cuda:0') reward: tensor(1.3050, device='cuda:0') factor: tensor(0.1419, device='cuda:0')
reward_raw: tensor(0.4507, device='cuda:0') reward: tensor(1.3129, device='cuda:0') factor: tensor(0.1378, device='cuda:0')


Epoch 1, Step 241: Loss = -0.1498, Actor Loss = -0.2205, Critic Loss = 0.1416, Entropy = 0.1229, Returns = 1.3748, Value = 1.3315, mask_loss = 0.1206std_loss = 0.2712:   4%|▍         | 240/5475 [1:04:37<24:01:52, 16.53s/it]

ratio tensor(2.2862, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 241: Loss = -0.1498, Actor Loss = -0.2205, Critic Loss = 0.1416, Entropy = 0.1229, Returns = 1.3748, Value = 1.3315, mask_loss = 0.1206std_loss = 0.2712:   4%|▍         | 241/5475 [1:04:37<23:33:53, 16.21s/it]

reward_raw: tensor(0.4263, device='cuda:0') reward: tensor(1.2879, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.4054, device='cuda:0') reward: tensor(1.2694, device='cuda:0') factor: tensor(0.1360, device='cuda:0')
reward_raw: tensor(0.3924, device='cuda:0') reward: tensor(1.2587, device='cuda:0') factor: tensor(0.1336, device='cuda:0')
reward_raw: tensor(0.3906, device='cuda:0') reward: tensor(1.2539, device='cuda:0') factor: tensor(0.1367, device='cuda:0')
reward_raw: tensor(0.3676, device='cuda:0') reward: tensor(1.2333, device='cuda:0') factor: tensor(0.1342, device='cuda:0')


Epoch 1, Step 242: Loss = 0.0951, Actor Loss = 0.0487, Critic Loss = 0.0931, Entropy = 0.1320, Returns = 1.2565, Value = 1.2896, mask_loss = 0.1257std_loss = 0.2720:   4%|▍         | 242/5475 [1:04:53<23:36:00, 16.24s/it]  

ratio tensor(0.9738, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3698, device='cuda:0') reward: tensor(1.2551, device='cuda:0') factor: tensor(0.1147, device='cuda:0')
reward_raw: tensor(0.3515, device='cuda:0') reward: tensor(1.2427, device='cuda:0') factor: tensor(0.1088, device='cuda:0')
reward_raw: tensor(0.3706, device='cuda:0') reward: tensor(1.2521, device='cuda:0') factor: tensor(0.1185, device='cuda:0')
reward_raw: tensor(0.3563, device='cuda:0') reward: tensor(1.2445, device='cuda:0') factor: tensor(0.1119, device='cuda:0')
reward_raw: tensor(0.3597, device='cuda:0') reward: tensor(1.2461, device='cuda:0') factor: tensor(0.1136, device='cuda:0')


Epoch 1, Step 243: Loss = -0.0609, Actor Loss = -0.1458, Critic Loss = 0.1701, Entropy = 0.1359, Returns = 1.2371, Value = 1.1063, mask_loss = 0.1272std_loss = 0.2707:   4%|▍         | 243/5475 [1:05:14<25:31:59, 17.57s/it]

ratio tensor(0.9792, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4178, device='cuda:0') reward: tensor(1.3124, device='cuda:0') factor: tensor(0.1054, device='cuda:0')
reward_raw: tensor(0.3995, device='cuda:0') reward: tensor(1.2822, device='cuda:0') factor: tensor(0.1174, device='cuda:0')
reward_raw: tensor(0.3924, device='cuda:0') reward: tensor(1.2810, device='cuda:0') factor: tensor(0.1115, device='cuda:0')
reward_raw: tensor(0.4493, device='cuda:0') reward: tensor(1.3344, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.4226, device='cuda:0') reward: tensor(1.3091, device='cuda:0') factor: tensor(0.1135, device='cuda:0')


Epoch 1, Step 244: Loss = 0.0912, Actor Loss = 0.0222, Critic Loss = 0.1381, Entropy = 0.1248, Returns = 1.3435, Value = 1.3377, mask_loss = 0.1104std_loss = 0.2587:   4%|▍         | 244/5475 [1:05:37<27:46:53, 19.12s/it]  

ratio tensor(1.0928, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4710, device='cuda:0') reward: tensor(1.3367, device='cuda:0') factor: tensor(0.1343, device='cuda:0')
reward_raw: tensor(0.4823, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1288, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.3506, device='cuda:0') factor: tensor(0.1350, device='cuda:0')
reward_raw: tensor(0.4760, device='cuda:0') reward: tensor(1.3390, device='cuda:0') factor: tensor(0.1370, device='cuda:0')
reward_raw: tensor(0.4808, device='cuda:0') reward: tensor(1.3548, device='cuda:0') factor: tensor(0.1261, device='cuda:0')


Epoch 1, Step 245: Loss = -0.1158, Actor Loss = -0.1676, Critic Loss = 0.1039, Entropy = 0.1316, Returns = 1.3526, Value = 1.4276, mask_loss = 0.1428std_loss = 0.2808:   4%|▍         | 245/5475 [1:05:53<26:20:54, 18.14s/it]

ratio tensor(1.2242, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5096, device='cuda:0') reward: tensor(1.4009, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.5418, device='cuda:0') reward: tensor(1.4346, device='cuda:0') factor: tensor(0.1072, device='cuda:0')
reward_raw: tensor(0.5162, device='cuda:0') reward: tensor(1.3994, device='cuda:0') factor: tensor(0.1168, device='cuda:0')
reward_raw: tensor(0.5408, device='cuda:0') reward: tensor(1.4279, device='cuda:0') factor: tensor(0.1129, device='cuda:0')
reward_raw: tensor(0.5443, device='cuda:0') reward: tensor(1.4252, device='cuda:0') factor: tensor(0.1191, device='cuda:0')


Epoch 1, Step 246: Loss = 0.2176, Actor Loss = 0.1682, Critic Loss = 0.0990, Entropy = 0.1346, Returns = 1.3871, Value = 1.4582, mask_loss = 0.1187std_loss = 0.2586:   4%|▍         | 246/5475 [1:06:14<27:55:00, 19.22s/it]  

ratio tensor(1.2003, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4121, device='cuda:0') reward: tensor(1.2935, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.4350, device='cuda:0') reward: tensor(1.3149, device='cuda:0') factor: tensor(0.1201, device='cuda:0')
reward_raw: tensor(0.4448, device='cuda:0') reward: tensor(1.3263, device='cuda:0') factor: tensor(0.1185, device='cuda:0')
reward_raw: tensor(0.4380, device='cuda:0') reward: tensor(1.3234, device='cuda:0') factor: tensor(0.1146, device='cuda:0')
reward_raw: tensor(0.4389, device='cuda:0') reward: tensor(1.3163, device='cuda:0') factor: tensor(0.1225, device='cuda:0')


Epoch 1, Step 247: Loss = 0.1163, Actor Loss = 0.0862, Critic Loss = 0.0605, Entropy = 0.1385, Returns = 1.2775, Value = 1.2155, mask_loss = 0.1159std_loss = 0.2587:   5%|▍         | 247/5475 [1:06:36<28:55:28, 19.92s/it]

ratio tensor(0.9932, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4193, device='cuda:0') reward: tensor(1.3114, device='cuda:0') factor: tensor(0.1079, device='cuda:0')
reward_raw: tensor(0.4301, device='cuda:0') reward: tensor(1.3219, device='cuda:0') factor: tensor(0.1082, device='cuda:0')
reward_raw: tensor(0.4415, device='cuda:0') reward: tensor(1.3297, device='cuda:0') factor: tensor(0.1117, device='cuda:0')
reward_raw: tensor(0.4231, device='cuda:0') reward: tensor(1.3160, device='cuda:0') factor: tensor(0.1071, device='cuda:0')
reward_raw: tensor(0.4348, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.1053, device='cuda:0')


Epoch 1, Step 248: Loss = 0.0948, Actor Loss = 0.0198, Critic Loss = 0.1503, Entropy = 0.1177, Returns = 1.2807, Value = 1.2567, mask_loss = 0.1051std_loss = 0.2529:   5%|▍         | 248/5475 [1:06:55<28:36:54, 19.71s/it]

ratio tensor(0.9663, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3896, device='cuda:0') reward: tensor(1.2680, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.4176, device='cuda:0') reward: tensor(1.2942, device='cuda:0') factor: tensor(0.1234, device='cuda:0')
reward_raw: tensor(0.4074, device='cuda:0') reward: tensor(1.2862, device='cuda:0') factor: tensor(0.1212, device='cuda:0')
reward_raw: tensor(0.4115, device='cuda:0') reward: tensor(1.2835, device='cuda:0') factor: tensor(0.1280, device='cuda:0')
reward_raw: tensor(0.4129, device='cuda:0') reward: tensor(1.2892, device='cuda:0') factor: tensor(0.1236, device='cuda:0')


Epoch 1, Step 249: Loss = 0.0169, Actor Loss = -0.0508, Critic Loss = 0.1358, Entropy = 0.1219, Returns = 1.2402, Value = 1.4006, mask_loss = 0.1424std_loss = 0.2936:   5%|▍         | 249/5475 [1:07:10<26:24:53, 18.20s/it]

ratio tensor(8.6171, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4727, device='cuda:0') reward: tensor(1.3617, device='cuda:0') factor: tensor(0.1109, device='cuda:0')
reward_raw: tensor(0.4919, device='cuda:0') reward: tensor(1.3833, device='cuda:0') factor: tensor(0.1085, device='cuda:0')
reward_raw: tensor(0.4632, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.4698, device='cuda:0') reward: tensor(1.3635, device='cuda:0') factor: tensor(0.1063, device='cuda:0')
reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3629, device='cuda:0') factor: tensor(0.1045, device='cuda:0')


Epoch 1, Step 250: Loss = -0.0464, Actor Loss = -0.0873, Critic Loss = 0.0819, Entropy = 0.0933, Returns = 1.3567, Value = 1.3986, mask_loss = 0.1013std_loss = 0.2592:   5%|▍         | 250/5475 [1:07:32<27:59:44, 19.29s/it]

ratio tensor(1.0454, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4778, device='cuda:0') reward: tensor(1.3471, device='cuda:0') factor: tensor(0.1307, device='cuda:0')
reward_raw: tensor(0.4793, device='cuda:0') reward: tensor(1.3511, device='cuda:0') factor: tensor(0.1282, device='cuda:0')
reward_raw: tensor(0.4931, device='cuda:0') reward: tensor(1.3629, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.4908, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.1213, device='cuda:0')
reward_raw: tensor(0.4837, device='cuda:0') reward: tensor(1.3564, device='cuda:0') factor: tensor(0.1273, device='cuda:0')


Epoch 1, Step 251: Loss = 0.1412, Actor Loss = 0.0937, Critic Loss = 0.0952, Entropy = 0.1257, Returns = 1.3476, Value = 1.3837, mask_loss = 0.1369std_loss = 0.2870:   5%|▍         | 251/5475 [1:07:47<26:16:29, 18.11s/it]  

ratio tensor(0.7555, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5328, device='cuda:0') reward: tensor(1.4265, device='cuda:0') factor: tensor(0.1063, device='cuda:0')
reward_raw: tensor(0.5221, device='cuda:0') reward: tensor(1.4167, device='cuda:0') factor: tensor(0.1054, device='cuda:0')
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.4150, device='cuda:0') factor: tensor(0.1092, device='cuda:0')
reward_raw: tensor(0.5012, device='cuda:0') reward: tensor(1.3936, device='cuda:0') factor: tensor(0.1076, device='cuda:0')
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(1.3879, device='cuda:0') factor: tensor(0.1110, device='cuda:0')


Epoch 1, Step 252: Loss = 0.1690, Actor Loss = 0.1068, Critic Loss = 0.1247, Entropy = 0.1159, Returns = 1.4375, Value = 1.4727, mask_loss = 0.1039std_loss = 0.2530:   5%|▍         | 252/5475 [1:08:00<24:15:48, 16.72s/it]

ratio tensor(1.5243, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4669, device='cuda:0') reward: tensor(1.3612, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.4573, device='cuda:0') reward: tensor(1.3526, device='cuda:0') factor: tensor(0.1047, device='cuda:0')
reward_raw: tensor(0.4485, device='cuda:0') reward: tensor(1.3483, device='cuda:0') factor: tensor(0.1003, device='cuda:0')
reward_raw: tensor(0.4608, device='cuda:0') reward: tensor(1.3562, device='cuda:0') factor: tensor(0.1047, device='cuda:0')
reward_raw: tensor(0.4637, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.1019, device='cuda:0')


Epoch 1, Step 253: Loss = -0.0422, Actor Loss = -0.1261, Critic Loss = 0.1682, Entropy = 0.1124, Returns = 1.3951, Value = 1.4342, mask_loss = 0.1140std_loss = 0.2688:   5%|▍         | 253/5475 [1:08:21<26:00:52, 17.93s/it]

ratio tensor(1.1583, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4792, device='cuda:0') reward: tensor(1.3572, device='cuda:0') factor: tensor(0.1220, device='cuda:0')
reward_raw: tensor(0.4548, device='cuda:0') reward: tensor(1.3397, device='cuda:0') factor: tensor(0.1150, device='cuda:0')
reward_raw: tensor(0.4805, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.1159, device='cuda:0')
reward_raw: tensor(0.4763, device='cuda:0') reward: tensor(1.3577, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.4802, device='cuda:0') reward: tensor(1.3647, device='cuda:0') factor: tensor(0.1156, device='cuda:0')


Epoch 1, Step 254: Loss = 1.0280, Actor Loss = 0.9730, Critic Loss = 0.1102, Entropy = 0.1907, Returns = 1.3550, Value = 1.4017, mask_loss = 0.1639std_loss = 0.2852:   5%|▍         | 254/5475 [1:08:36<24:47:07, 17.09s/it]  

ratio tensor(0.8025, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4756, device='cuda:0') reward: tensor(1.3238, device='cuda:0') factor: tensor(0.1518, device='cuda:0')
reward_raw: tensor(0.4757, device='cuda:0') reward: tensor(1.3199, device='cuda:0') factor: tensor(0.1558, device='cuda:0')
reward_raw: tensor(0.4478, device='cuda:0') reward: tensor(1.2955, device='cuda:0') factor: tensor(0.1524, device='cuda:0')
reward_raw: tensor(0.4477, device='cuda:0') reward: tensor(1.2859, device='cuda:0') factor: tensor(0.1618, device='cuda:0')
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.3351, device='cuda:0') factor: tensor(0.1609, device='cuda:0')


Epoch 1, Step 255: Loss = 0.1373, Actor Loss = 0.0528, Critic Loss = 0.1696, Entropy = 0.2885, Returns = 1.3444, Value = 1.3539, mask_loss = 0.2107std_loss = 0.2771:   5%|▍         | 255/5475 [1:08:52<24:09:57, 16.67s/it]

ratio tensor(0.3037, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4606, device='cuda:0') reward: tensor(1.2263, device='cuda:0') factor: tensor(0.2343, device='cuda:0')
reward_raw: tensor(0.4876, device='cuda:0') reward: tensor(1.2625, device='cuda:0') factor: tensor(0.2251, device='cuda:0')
reward_raw: tensor(0.4997, device='cuda:0') reward: tensor(1.2750, device='cuda:0') factor: tensor(0.2248, device='cuda:0')
reward_raw: tensor(0.4905, device='cuda:0') reward: tensor(1.2605, device='cuda:0') factor: tensor(0.2299, device='cuda:0')
reward_raw: tensor(0.5058, device='cuda:0') reward: tensor(1.2755, device='cuda:0') factor: tensor(0.2303, device='cuda:0')


Epoch 1, Step 256: Loss = -0.1621, Actor Loss = -0.1949, Critic Loss = 0.0662, Entropy = 0.3080, Returns = 1.2508, Value = 1.2671, mask_loss = 0.2263std_loss = 0.2832:   5%|▍         | 256/5475 [1:09:09<24:20:17, 16.79s/it]

ratio tensor(2.6397, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5100, device='cuda:0') reward: tensor(1.2842, device='cuda:0') factor: tensor(0.2258, device='cuda:0')
reward_raw: tensor(0.5106, device='cuda:0') reward: tensor(1.2814, device='cuda:0') factor: tensor(0.2292, device='cuda:0')
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.2740, device='cuda:0') factor: tensor(0.2301, device='cuda:0')
reward_raw: tensor(0.5208, device='cuda:0') reward: tensor(1.2846, device='cuda:0') factor: tensor(0.2361, device='cuda:0')
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.3109, device='cuda:0') factor: tensor(0.2243, device='cuda:0')


Epoch 1, Step 257: Loss = 0.3236, Actor Loss = 0.2349, Critic Loss = 0.1781, Entropy = 0.3255, Returns = 1.3272, Value = 1.3858, mask_loss = 0.2574std_loss = 0.2955:   5%|▍         | 257/5475 [1:09:22<22:36:08, 15.59s/it]  

ratio tensor(0.6728, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6047, device='cuda:0') reward: tensor(1.3448, device='cuda:0') factor: tensor(0.2599, device='cuda:0')
reward_raw: tensor(0.5976, device='cuda:0') reward: tensor(1.3266, device='cuda:0') factor: tensor(0.2710, device='cuda:0')
reward_raw: tensor(0.6281, device='cuda:0') reward: tensor(1.3672, device='cuda:0') factor: tensor(0.2609, device='cuda:0')
reward_raw: tensor(0.6244, device='cuda:0') reward: tensor(1.3547, device='cuda:0') factor: tensor(0.2697, device='cuda:0')
reward_raw: tensor(0.6243, device='cuda:0') reward: tensor(1.3590, device='cuda:0') factor: tensor(0.2652, device='cuda:0')


Epoch 1, Step 258: Loss = -0.1436, Actor Loss = -0.2153, Critic Loss = 0.1440, Entropy = 0.3285, Returns = 1.3712, Value = 1.1438, mask_loss = 0.3093std_loss = 0.3077:   5%|▍         | 258/5475 [1:09:38<22:52:59, 15.79s/it]

ratio tensor(1.2418, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.2892, device='cuda:0') factor: tensor(0.2201, device='cuda:0')
reward_raw: tensor(0.5124, device='cuda:0') reward: tensor(1.2815, device='cuda:0') factor: tensor(0.2309, device='cuda:0')
reward_raw: tensor(0.5194, device='cuda:0') reward: tensor(1.2967, device='cuda:0') factor: tensor(0.2228, device='cuda:0')
reward_raw: tensor(0.5087, device='cuda:0') reward: tensor(1.2737, device='cuda:0') factor: tensor(0.2350, device='cuda:0')
reward_raw: tensor(0.5191, device='cuda:0') reward: tensor(1.2971, device='cuda:0') factor: tensor(0.2219, device='cuda:0')


Epoch 1, Step 259: Loss = -0.2250, Actor Loss = -0.2692, Critic Loss = 0.0891, Entropy = 0.3098, Returns = 1.2072, Value = 1.1982, mask_loss = 0.2262std_loss = 0.2831:   5%|▍         | 259/5475 [1:09:58<24:42:07, 17.05s/it]

ratio tensor(1.2498, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5317, device='cuda:0') reward: tensor(1.3097, device='cuda:0') factor: tensor(0.2220, device='cuda:0')
reward_raw: tensor(0.5160, device='cuda:0') reward: tensor(1.3091, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.5085, device='cuda:0') reward: tensor(1.2865, device='cuda:0') factor: tensor(0.2219, device='cuda:0')
reward_raw: tensor(0.5026, device='cuda:0') reward: tensor(1.2862, device='cuda:0') factor: tensor(0.2164, device='cuda:0')
reward_raw: tensor(0.5282, device='cuda:0') reward: tensor(1.3151, device='cuda:0') factor: tensor(0.2130, device='cuda:0')


Epoch 1, Step 260: Loss = 0.2164, Actor Loss = 0.1433, Critic Loss = 0.1467, Entropy = 0.3116, Returns = 1.3023, Value = 1.4225, mask_loss = 0.2335std_loss = 0.2913:   5%|▍         | 260/5475 [1:10:16<25:00:19, 17.26s/it]  

ratio tensor(1.0909, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5351, device='cuda:0') reward: tensor(1.3197, device='cuda:0') factor: tensor(0.2154, device='cuda:0')
reward_raw: tensor(0.5804, device='cuda:0') reward: tensor(1.3700, device='cuda:0') factor: tensor(0.2104, device='cuda:0')
reward_raw: tensor(0.5664, device='cuda:0') reward: tensor(1.3470, device='cuda:0') factor: tensor(0.2194, device='cuda:0')
reward_raw: tensor(0.5757, device='cuda:0') reward: tensor(1.3685, device='cuda:0') factor: tensor(0.2072, device='cuda:0')
reward_raw: tensor(0.5836, device='cuda:0') reward: tensor(1.3680, device='cuda:0') factor: tensor(0.2156, device='cuda:0')


Epoch 1, Step 261: Loss = 0.1106, Actor Loss = 0.0574, Critic Loss = 0.1070, Entropy = 0.2841, Returns = 1.3690, Value = 1.4636, mask_loss = 0.2109std_loss = 0.2836:   5%|▍         | 260/5475 [1:10:33<25:00:19, 17.26s/it]

ratio tensor(1.0210, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 261: Loss = 0.1106, Actor Loss = 0.0574, Critic Loss = 0.1070, Entropy = 0.2841, Returns = 1.3690, Value = 1.4636, mask_loss = 0.2109std_loss = 0.2836:   5%|▍         | 261/5475 [1:10:33<24:56:39, 17.22s/it]

reward_raw: tensor(0.5038, device='cuda:0') reward: tensor(1.2634, device='cuda:0') factor: tensor(0.2404, device='cuda:0')
reward_raw: tensor(0.5241, device='cuda:0') reward: tensor(1.2929, device='cuda:0') factor: tensor(0.2312, device='cuda:0')
reward_raw: tensor(0.4939, device='cuda:0') reward: tensor(1.2565, device='cuda:0') factor: tensor(0.2375, device='cuda:0')
reward_raw: tensor(0.4997, device='cuda:0') reward: tensor(1.2618, device='cuda:0') factor: tensor(0.2379, device='cuda:0')
reward_raw: tensor(0.4987, device='cuda:0') reward: tensor(1.2585, device='cuda:0') factor: tensor(0.2403, device='cuda:0')


Epoch 1, Step 262: Loss = 0.2989, Actor Loss = 0.2302, Critic Loss = 0.1380, Entropy = 0.2942, Returns = 1.1879, Value = 1.1767, mask_loss = 0.2324std_loss = 0.2968:   5%|▍         | 262/5475 [1:10:49<24:21:37, 16.82s/it]

ratio tensor(0.8406, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5257, device='cuda:0') reward: tensor(1.2814, device='cuda:0') factor: tensor(0.2443, device='cuda:0')
reward_raw: tensor(0.5655, device='cuda:0') reward: tensor(1.3232, device='cuda:0') factor: tensor(0.2423, device='cuda:0')
reward_raw: tensor(0.5523, device='cuda:0') reward: tensor(1.3236, device='cuda:0') factor: tensor(0.2287, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.3312, device='cuda:0') factor: tensor(0.2292, device='cuda:0')
reward_raw: tensor(0.5716, device='cuda:0') reward: tensor(1.3382, device='cuda:0') factor: tensor(0.2333, device='cuda:0')


Epoch 1, Step 263: Loss = -0.0225, Actor Loss = -0.0579, Critic Loss = 0.0712, Entropy = 0.2702, Returns = 1.3290, Value = 1.3417, mask_loss = 0.2106std_loss = 0.2896:   5%|▍         | 263/5475 [1:11:04<23:38:35, 16.33s/it]

ratio tensor(1.3878, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.2686, device='cuda:0') factor: tensor(0.2605, device='cuda:0')
reward_raw: tensor(0.5716, device='cuda:0') reward: tensor(1.3082, device='cuda:0') factor: tensor(0.2634, device='cuda:0')
reward_raw: tensor(0.6005, device='cuda:0') reward: tensor(1.3357, device='cuda:0') factor: tensor(0.2647, device='cuda:0')
reward_raw: tensor(0.5967, device='cuda:0') reward: tensor(1.3322, device='cuda:0') factor: tensor(0.2645, device='cuda:0')
reward_raw: tensor(0.5725, device='cuda:0') reward: tensor(1.3154, device='cuda:0') factor: tensor(0.2571, device='cuda:0')


Epoch 1, Step 264: Loss = -0.1348, Actor Loss = -0.1775, Critic Loss = 0.0861, Entropy = 0.2737, Returns = 1.4049, Value = 1.4338, mask_loss = 0.2245std_loss = 0.2977:   5%|▍         | 264/5475 [1:11:19<23:12:26, 16.03s/it]

ratio tensor(1.3061, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4778, device='cuda:0') reward: tensor(1.2558, device='cuda:0') factor: tensor(0.2220, device='cuda:0')
reward_raw: tensor(0.4757, device='cuda:0') reward: tensor(1.2546, device='cuda:0') factor: tensor(0.2211, device='cuda:0')
reward_raw: tensor(0.4990, device='cuda:0') reward: tensor(1.2896, device='cuda:0') factor: tensor(0.2094, device='cuda:0')
reward_raw: tensor(0.4825, device='cuda:0') reward: tensor(1.2699, device='cuda:0') factor: tensor(0.2126, device='cuda:0')
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.2746, device='cuda:0') factor: tensor(0.2236, device='cuda:0')


Epoch 1, Step 265: Loss = 0.3513, Actor Loss = 0.2898, Critic Loss = 0.1234, Entropy = 0.2603, Returns = 1.2191, Value = 1.2566, mask_loss = 0.2250std_loss = 0.3101:   5%|▍         | 265/5475 [1:11:40<25:15:29, 17.45s/it]  

ratio tensor(0.8795, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5641, device='cuda:0') reward: tensor(1.3190, device='cuda:0') factor: tensor(0.2451, device='cuda:0')
reward_raw: tensor(0.5792, device='cuda:0') reward: tensor(1.3334, device='cuda:0') factor: tensor(0.2458, device='cuda:0')
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.3063, device='cuda:0') factor: tensor(0.2529, device='cuda:0')
reward_raw: tensor(0.5599, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.2471, device='cuda:0')
reward_raw: tensor(0.5668, device='cuda:0') reward: tensor(1.3178, device='cuda:0') factor: tensor(0.2490, device='cuda:0')


Epoch 1, Step 266: Loss = -0.1229, Actor Loss = -0.1604, Critic Loss = 0.0755, Entropy = 0.2827, Returns = 1.3565, Value = 1.2373, mask_loss = 0.2426std_loss = 0.3031:   5%|▍         | 266/5475 [1:11:56<24:26:51, 16.90s/it]

ratio tensor(1.0423, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5457, device='cuda:0') reward: tensor(1.3164, device='cuda:0') factor: tensor(0.2294, device='cuda:0')
reward_raw: tensor(0.5317, device='cuda:0') reward: tensor(1.3134, device='cuda:0') factor: tensor(0.2182, device='cuda:0')
reward_raw: tensor(0.5246, device='cuda:0') reward: tensor(1.2967, device='cuda:0') factor: tensor(0.2279, device='cuda:0')
reward_raw: tensor(0.5499, device='cuda:0') reward: tensor(1.3207, device='cuda:0') factor: tensor(0.2292, device='cuda:0')
reward_raw: tensor(0.5478, device='cuda:0') reward: tensor(1.3271, device='cuda:0') factor: tensor(0.2207, device='cuda:0')


Epoch 1, Step 267: Loss = -0.1644, Actor Loss = -0.2252, Critic Loss = 0.1222, Entropy = 0.2839, Returns = 1.2707, Value = 1.3607, mask_loss = 0.2199std_loss = 0.2907:   5%|▍         | 267/5475 [1:12:11<23:28:59, 16.23s/it]

ratio tensor(1.0818, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5374, device='cuda:0') reward: tensor(1.2764, device='cuda:0') factor: tensor(0.2609, device='cuda:0')
reward_raw: tensor(0.5424, device='cuda:0') reward: tensor(1.2962, device='cuda:0') factor: tensor(0.2462, device='cuda:0')
reward_raw: tensor(0.5490, device='cuda:0') reward: tensor(1.2827, device='cuda:0') factor: tensor(0.2663, device='cuda:0')
reward_raw: tensor(0.5508, device='cuda:0') reward: tensor(1.2961, device='cuda:0') factor: tensor(0.2547, device='cuda:0')
reward_raw: tensor(0.5437, device='cuda:0') reward: tensor(1.2961, device='cuda:0') factor: tensor(0.2476, device='cuda:0')


Epoch 1, Step 268: Loss = 0.1464, Actor Loss = 0.0866, Critic Loss = 0.1202, Entropy = 0.2938, Returns = 1.2586, Value = 1.2763, mask_loss = 0.2476std_loss = 0.3056:   5%|▍         | 268/5475 [1:12:25<22:34:15, 15.61s/it]  

ratio tensor(1.1495, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5398, device='cuda:0') reward: tensor(1.2885, device='cuda:0') factor: tensor(0.2513, device='cuda:0')
reward_raw: tensor(0.5117, device='cuda:0') reward: tensor(1.2760, device='cuda:0') factor: tensor(0.2357, device='cuda:0')
reward_raw: tensor(0.5137, device='cuda:0') reward: tensor(1.2654, device='cuda:0') factor: tensor(0.2483, device='cuda:0')
reward_raw: tensor(0.5364, device='cuda:0') reward: tensor(1.2885, device='cuda:0') factor: tensor(0.2479, device='cuda:0')
reward_raw: tensor(0.5145, device='cuda:0') reward: tensor(1.2847, device='cuda:0') factor: tensor(0.2298, device='cuda:0')


Epoch 1, Step 269: Loss = -0.0412, Actor Loss = -0.0776, Critic Loss = 0.0733, Entropy = 0.2877, Returns = 1.1966, Value = 1.1039, mask_loss = 0.2534std_loss = 0.3105:   5%|▍         | 269/5475 [1:12:43<23:42:34, 16.40s/it]

ratio tensor(1.4525, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5377, device='cuda:0') reward: tensor(1.3177, device='cuda:0') factor: tensor(0.2200, device='cuda:0')
reward_raw: tensor(0.5337, device='cuda:0') reward: tensor(1.2978, device='cuda:0') factor: tensor(0.2359, device='cuda:0')
reward_raw: tensor(0.5269, device='cuda:0') reward: tensor(1.3003, device='cuda:0') factor: tensor(0.2266, device='cuda:0')
reward_raw: tensor(0.5438, device='cuda:0') reward: tensor(1.3239, device='cuda:0') factor: tensor(0.2199, device='cuda:0')
reward_raw: tensor(0.5280, device='cuda:0') reward: tensor(1.3005, device='cuda:0') factor: tensor(0.2276, device='cuda:0')


Epoch 1, Step 270: Loss = -0.1103, Actor Loss = -0.1488, Critic Loss = 0.0776, Entropy = 0.2753, Returns = 1.3534, Value = 1.3543, mask_loss = 0.2141std_loss = 0.2914:   5%|▍         | 270/5475 [1:12:56<22:15:32, 15.40s/it]

ratio tensor(1.7015, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4721, device='cuda:0') reward: tensor(1.2408, device='cuda:0') factor: tensor(0.2313, device='cuda:0')
reward_raw: tensor(0.4707, device='cuda:0') reward: tensor(1.2468, device='cuda:0') factor: tensor(0.2239, device='cuda:0')
reward_raw: tensor(0.4958, device='cuda:0') reward: tensor(1.2604, device='cuda:0') factor: tensor(0.2354, device='cuda:0')
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.2598, device='cuda:0') factor: tensor(0.2247, device='cuda:0')
reward_raw: tensor(0.4871, device='cuda:0') reward: tensor(1.2518, device='cuda:0') factor: tensor(0.2353, device='cuda:0')


Epoch 1, Step 271: Loss = 0.0362, Actor Loss = -0.0152, Critic Loss = 0.1032, Entropy = 0.2675, Returns = 1.2159, Value = 1.2397, mask_loss = 0.2241std_loss = 0.3007:   5%|▍         | 271/5475 [1:13:09<21:23:39, 14.80s/it] 

ratio tensor(1.3200, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5992, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.2108, device='cuda:0')
reward_raw: tensor(0.5761, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.2180, device='cuda:0')
reward_raw: tensor(0.5670, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.2112, device='cuda:0')
reward_raw: tensor(0.5706, device='cuda:0') reward: tensor(1.3499, device='cuda:0') factor: tensor(0.2207, device='cuda:0')
reward_raw: tensor(0.5899, device='cuda:0') reward: tensor(1.3766, device='cuda:0') factor: tensor(0.2134, device='cuda:0')


Epoch 1, Step 272: Loss = 0.2120, Actor Loss = 0.1735, Critic Loss = 0.0775, Entropy = 0.2472, Returns = 1.3253, Value = 1.3000, mask_loss = 0.1910std_loss = 0.2811:   5%|▍         | 272/5475 [1:13:24<21:31:44, 14.90s/it] 

ratio tensor(0.7931, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5369, device='cuda:0') reward: tensor(1.3346, device='cuda:0') factor: tensor(0.2024, device='cuda:0')
reward_raw: tensor(0.5214, device='cuda:0') reward: tensor(1.3167, device='cuda:0') factor: tensor(0.2048, device='cuda:0')
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.3407, device='cuda:0') factor: tensor(0.2096, device='cuda:0')
reward_raw: tensor(0.5282, device='cuda:0') reward: tensor(1.3146, device='cuda:0') factor: tensor(0.2136, device='cuda:0')
reward_raw: tensor(0.5255, device='cuda:0') reward: tensor(1.3197, device='cuda:0') factor: tensor(0.2059, device='cuda:0')


Epoch 1, Step 273: Loss = -0.1572, Actor Loss = -0.2057, Critic Loss = 0.0976, Entropy = 0.2823, Returns = 1.3697, Value = 1.3076, mask_loss = 0.2266std_loss = 0.2971:   5%|▍         | 273/5475 [1:13:37<20:30:58, 14.20s/it]

ratio tensor(2.1035, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4867, device='cuda:0') reward: tensor(1.2758, device='cuda:0') factor: tensor(0.2109, device='cuda:0')
reward_raw: tensor(0.4995, device='cuda:0') reward: tensor(1.2838, device='cuda:0') factor: tensor(0.2157, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3117, device='cuda:0') factor: tensor(0.2058, device='cuda:0')
reward_raw: tensor(0.4945, device='cuda:0') reward: tensor(1.2799, device='cuda:0') factor: tensor(0.2146, device='cuda:0')
reward_raw: tensor(0.5013, device='cuda:0') reward: tensor(1.2931, device='cuda:0') factor: tensor(0.2083, device='cuda:0')


Epoch 1, Step 274: Loss = 0.0059, Actor Loss = -0.0405, Critic Loss = 0.0933, Entropy = 0.2425, Returns = 1.3510, Value = 1.3748, mask_loss = 0.1899std_loss = 0.2829:   5%|▌         | 274/5475 [1:13:55<21:59:38, 15.22s/it] 

ratio tensor(1.9180, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5655, device='cuda:0') reward: tensor(1.3150, device='cuda:0') factor: tensor(0.2505, device='cuda:0')
reward_raw: tensor(0.5760, device='cuda:0') reward: tensor(1.3340, device='cuda:0') factor: tensor(0.2420, device='cuda:0')
reward_raw: tensor(0.5636, device='cuda:0') reward: tensor(1.3034, device='cuda:0') factor: tensor(0.2602, device='cuda:0')
reward_raw: tensor(0.5566, device='cuda:0') reward: tensor(1.3074, device='cuda:0') factor: tensor(0.2493, device='cuda:0')
reward_raw: tensor(0.6026, device='cuda:0') reward: tensor(1.3606, device='cuda:0') factor: tensor(0.2420, device='cuda:0')


Epoch 1, Step 275: Loss = 0.2810, Actor Loss = 0.2446, Critic Loss = 0.0733, Entropy = 0.2618, Returns = 1.2993, Value = 1.3646, mask_loss = 0.2268std_loss = 0.3015:   5%|▌         | 275/5475 [1:14:10<21:51:28, 15.13s/it] 

ratio tensor(0.7531, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4598, device='cuda:0') reward: tensor(1.2665, device='cuda:0') factor: tensor(0.1934, device='cuda:0')
reward_raw: tensor(0.4508, device='cuda:0') reward: tensor(1.2593, device='cuda:0') factor: tensor(0.1915, device='cuda:0')
reward_raw: tensor(0.4851, device='cuda:0') reward: tensor(1.2930, device='cuda:0') factor: tensor(0.1922, device='cuda:0')
reward_raw: tensor(0.4544, device='cuda:0') reward: tensor(1.2544, device='cuda:0') factor: tensor(0.2000, device='cuda:0')
reward_raw: tensor(0.4671, device='cuda:0') reward: tensor(1.2764, device='cuda:0') factor: tensor(0.1908, device='cuda:0')


Epoch 1, Step 276: Loss = -0.0263, Actor Loss = -0.0663, Critic Loss = 0.0805, Entropy = 0.2111, Returns = 1.1966, Value = 1.2356, mask_loss = 0.1785std_loss = 0.2876:   5%|▌         | 276/5475 [1:14:26<22:13:58, 15.39s/it]

ratio tensor(1.3150, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4705, device='cuda:0') reward: tensor(1.2666, device='cuda:0') factor: tensor(0.2039, device='cuda:0')
reward_raw: tensor(0.4706, device='cuda:0') reward: tensor(1.2663, device='cuda:0') factor: tensor(0.2043, device='cuda:0')
reward_raw: tensor(0.4642, device='cuda:0') reward: tensor(1.2664, device='cuda:0') factor: tensor(0.1978, device='cuda:0')
reward_raw: tensor(0.4619, device='cuda:0') reward: tensor(1.2589, device='cuda:0') factor: tensor(0.2031, device='cuda:0')
reward_raw: tensor(0.4661, device='cuda:0') reward: tensor(1.2727, device='cuda:0') factor: tensor(0.1934, device='cuda:0')


Epoch 1, Step 277: Loss = 0.0272, Actor Loss = 0.0021, Critic Loss = 0.0504, Entropy = 0.1937, Returns = 1.3170, Value = 1.3095, mask_loss = 0.1712std_loss = 0.2907:   5%|▌         | 277/5475 [1:14:41<22:19:07, 15.46s/it]  

ratio tensor(0.8192, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5359, device='cuda:0') reward: tensor(1.3358, device='cuda:0') factor: tensor(0.2000, device='cuda:0')
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.3188, device='cuda:0') factor: tensor(0.2064, device='cuda:0')
reward_raw: tensor(0.5801, device='cuda:0') reward: tensor(1.3745, device='cuda:0') factor: tensor(0.2056, device='cuda:0')
reward_raw: tensor(0.5759, device='cuda:0') reward: tensor(1.3698, device='cuda:0') factor: tensor(0.2061, device='cuda:0')
reward_raw: tensor(0.5625, device='cuda:0') reward: tensor(1.3599, device='cuda:0') factor: tensor(0.2026, device='cuda:0')


Epoch 1, Step 278: Loss = 0.0261, Actor Loss = -0.0297, Critic Loss = 0.1120, Entropy = 0.2088, Returns = 1.3655, Value = 1.3938, mask_loss = 0.1883std_loss = 0.2964:   5%|▌         | 278/5475 [1:14:55<21:33:15, 14.93s/it]

ratio tensor(0.9389, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.2930, device='cuda:0') factor: tensor(0.2053, device='cuda:0')
reward_raw: tensor(0.5202, device='cuda:0') reward: tensor(1.3252, device='cuda:0') factor: tensor(0.1950, device='cuda:0')
reward_raw: tensor(0.4951, device='cuda:0') reward: tensor(1.2937, device='cuda:0') factor: tensor(0.2015, device='cuda:0')
reward_raw: tensor(0.5059, device='cuda:0') reward: tensor(1.3110, device='cuda:0') factor: tensor(0.1949, device='cuda:0')
reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.2035, device='cuda:0')


Epoch 1, Step 279: Loss = 0.0201, Actor Loss = -0.0362, Critic Loss = 0.1130, Entropy = 0.2277, Returns = 1.3514, Value = 1.3477, mask_loss = 0.2043std_loss = 0.2996:   5%|▌         | 279/5475 [1:15:14<23:25:26, 16.23s/it]

ratio tensor(1.2436, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5394, device='cuda:0') reward: tensor(1.3664, device='cuda:0') factor: tensor(0.1731, device='cuda:0')
reward_raw: tensor(0.5707, device='cuda:0') reward: tensor(1.3881, device='cuda:0') factor: tensor(0.1825, device='cuda:0')
reward_raw: tensor(0.5609, device='cuda:0') reward: tensor(1.3828, device='cuda:0') factor: tensor(0.1781, device='cuda:0')
reward_raw: tensor(0.5390, device='cuda:0') reward: tensor(1.3609, device='cuda:0') factor: tensor(0.1781, device='cuda:0')
reward_raw: tensor(0.5722, device='cuda:0') reward: tensor(1.3924, device='cuda:0') factor: tensor(0.1798, device='cuda:0')


Epoch 1, Step 280: Loss = -0.0706, Actor Loss = -0.1084, Critic Loss = 0.0760, Entropy = 0.2118, Returns = 1.3479, Value = 1.3411, mask_loss = 0.1767std_loss = 0.2881:   5%|▌         | 280/5475 [1:15:33<24:34:07, 17.03s/it]

ratio tensor(1.1798, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4677, device='cuda:0') reward: tensor(1.2691, device='cuda:0') factor: tensor(0.1987, device='cuda:0')
reward_raw: tensor(0.4802, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.1948, device='cuda:0')
reward_raw: tensor(0.4541, device='cuda:0') reward: tensor(1.2628, device='cuda:0') factor: tensor(0.1913, device='cuda:0')
reward_raw: tensor(0.4815, device='cuda:0') reward: tensor(1.2777, device='cuda:0') factor: tensor(0.2038, device='cuda:0')
reward_raw: tensor(0.4755, device='cuda:0') reward: tensor(1.2748, device='cuda:0') factor: tensor(0.2007, device='cuda:0')


Epoch 1, Step 281: Loss = 0.2452, Actor Loss = 0.2006, Critic Loss = 0.0897, Entropy = 0.2471, Returns = 1.2338, Value = 1.3307, mask_loss = 0.2013std_loss = 0.2969:   5%|▌         | 280/5475 [1:15:55<24:34:07, 17.03s/it]  

ratio tensor(0.8416, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 281: Loss = 0.2452, Actor Loss = 0.2006, Critic Loss = 0.0897, Entropy = 0.2471, Returns = 1.2338, Value = 1.3307, mask_loss = 0.2013std_loss = 0.2969:   5%|▌         | 281/5475 [1:15:55<26:53:48, 18.64s/it]

reward_raw: tensor(0.4987, device='cuda:0') reward: tensor(1.2714, device='cuda:0') factor: tensor(0.2273, device='cuda:0')
reward_raw: tensor(0.4978, device='cuda:0') reward: tensor(1.2808, device='cuda:0') factor: tensor(0.2170, device='cuda:0')
reward_raw: tensor(0.4809, device='cuda:0') reward: tensor(1.2598, device='cuda:0') factor: tensor(0.2211, device='cuda:0')
reward_raw: tensor(0.5014, device='cuda:0') reward: tensor(1.2748, device='cuda:0') factor: tensor(0.2266, device='cuda:0')
reward_raw: tensor(0.4822, device='cuda:0') reward: tensor(1.2662, device='cuda:0') factor: tensor(0.2160, device='cuda:0')


Epoch 1, Step 282: Loss = 0.0256, Actor Loss = 0.0007, Critic Loss = 0.0503, Entropy = 0.2111, Returns = 1.2527, Value = 1.2296, mask_loss = 0.1950std_loss = 0.3089:   5%|▌         | 282/5475 [1:16:07<23:58:17, 16.62s/it]

ratio tensor(1.0923, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4690, device='cuda:0') reward: tensor(1.2927, device='cuda:0') factor: tensor(0.1763, device='cuda:0')
reward_raw: tensor(0.4709, device='cuda:0') reward: tensor(1.2944, device='cuda:0') factor: tensor(0.1765, device='cuda:0')
reward_raw: tensor(0.4511, device='cuda:0') reward: tensor(1.2762, device='cuda:0') factor: tensor(0.1749, device='cuda:0')
reward_raw: tensor(0.4418, device='cuda:0') reward: tensor(1.2568, device='cuda:0') factor: tensor(0.1851, device='cuda:0')
reward_raw: tensor(0.4654, device='cuda:0') reward: tensor(1.2815, device='cuda:0') factor: tensor(0.1839, device='cuda:0')


Epoch 1, Step 283: Loss = 0.0632, Actor Loss = 0.0254, Critic Loss = 0.0762, Entropy = 0.2244, Returns = 1.2726, Value = 1.2775, mask_loss = 0.1952std_loss = 0.2962:   5%|▌         | 283/5475 [1:16:24<23:49:12, 16.52s/it]

ratio tensor(0.8297, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5596, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.1938, device='cuda:0')
reward_raw: tensor(0.5589, device='cuda:0') reward: tensor(1.3611, device='cuda:0') factor: tensor(0.1979, device='cuda:0')
reward_raw: tensor(0.5666, device='cuda:0') reward: tensor(1.3686, device='cuda:0') factor: tensor(0.1981, device='cuda:0')
reward_raw: tensor(0.5722, device='cuda:0') reward: tensor(1.3783, device='cuda:0') factor: tensor(0.1940, device='cuda:0')
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.3663, device='cuda:0') factor: tensor(0.1997, device='cuda:0')


Epoch 1, Step 284: Loss = 0.0538, Actor Loss = 0.0130, Critic Loss = 0.0820, Entropy = 0.2258, Returns = 1.4522, Value = 1.4335, mask_loss = 0.1864std_loss = 0.2864:   5%|▌         | 284/5475 [1:16:40<23:49:32, 16.52s/it]

ratio tensor(0.8054, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5092, device='cuda:0') reward: tensor(1.3196, device='cuda:0') factor: tensor(0.1896, device='cuda:0')
reward_raw: tensor(0.5150, device='cuda:0') reward: tensor(1.3252, device='cuda:0') factor: tensor(0.1898, device='cuda:0')
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.3218, device='cuda:0') factor: tensor(0.1955, device='cuda:0')
reward_raw: tensor(0.5270, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.1920, device='cuda:0')
reward_raw: tensor(0.5190, device='cuda:0') reward: tensor(1.3335, device='cuda:0') factor: tensor(0.1855, device='cuda:0')


Epoch 1, Step 285: Loss = 0.0475, Actor Loss = -0.0096, Critic Loss = 0.1145, Entropy = 0.1930, Returns = 1.2729, Value = 1.3456, mask_loss = 0.1777std_loss = 0.2935:   5%|▌         | 285/5475 [1:16:57<23:51:36, 16.55s/it]

ratio tensor(0.9294, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4885, device='cuda:0') reward: tensor(1.2903, device='cuda:0') factor: tensor(0.1982, device='cuda:0')
reward_raw: tensor(0.5113, device='cuda:0') reward: tensor(1.3214, device='cuda:0') factor: tensor(0.1899, device='cuda:0')
reward_raw: tensor(0.5073, device='cuda:0') reward: tensor(1.3071, device='cuda:0') factor: tensor(0.2002, device='cuda:0')
reward_raw: tensor(0.4840, device='cuda:0') reward: tensor(1.2889, device='cuda:0') factor: tensor(0.1951, device='cuda:0')
reward_raw: tensor(0.4699, device='cuda:0') reward: tensor(1.2710, device='cuda:0') factor: tensor(0.1988, device='cuda:0')


Epoch 1, Step 286: Loss = 0.1962, Actor Loss = 0.0578, Critic Loss = 0.2773, Entropy = 0.2062, Returns = 1.3165, Value = 1.3290, mask_loss = 0.1947std_loss = 0.3033:   5%|▌         | 286/5475 [1:17:11<22:57:10, 15.92s/it] 

ratio tensor(1.1692, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5090, device='cuda:0') reward: tensor(1.3111, device='cuda:0') factor: tensor(0.1980, device='cuda:0')
reward_raw: tensor(0.5423, device='cuda:0') reward: tensor(1.3328, device='cuda:0') factor: tensor(0.2096, device='cuda:0')
reward_raw: tensor(0.5152, device='cuda:0') reward: tensor(1.3166, device='cuda:0') factor: tensor(0.1986, device='cuda:0')
reward_raw: tensor(0.5185, device='cuda:0') reward: tensor(1.3212, device='cuda:0') factor: tensor(0.1973, device='cuda:0')
reward_raw: tensor(0.5249, device='cuda:0') reward: tensor(1.3331, device='cuda:0') factor: tensor(0.1917, device='cuda:0')


Epoch 1, Step 287: Loss = 0.0347, Actor Loss = -0.0128, Critic Loss = 0.0954, Entropy = 0.2223, Returns = 1.3009, Value = 1.3947, mask_loss = 0.2067std_loss = 0.3057:   5%|▌         | 287/5475 [1:17:36<26:50:31, 18.63s/it]

ratio tensor(0.9999, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5548, device='cuda:0') reward: tensor(1.3959, device='cuda:0') factor: tensor(0.1589, device='cuda:0')
reward_raw: tensor(0.5264, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1617, device='cuda:0')
reward_raw: tensor(0.5199, device='cuda:0') reward: tensor(1.3599, device='cuda:0') factor: tensor(0.1600, device='cuda:0')
reward_raw: tensor(0.5426, device='cuda:0') reward: tensor(1.3787, device='cuda:0') factor: tensor(0.1639, device='cuda:0')
reward_raw: tensor(0.5697, device='cuda:0') reward: tensor(1.4020, device='cuda:0') factor: tensor(0.1677, device='cuda:0')


Epoch 1, Step 288: Loss = -0.2707, Actor Loss = -0.3085, Critic Loss = 0.0759, Entropy = 0.1658, Returns = 1.4098, Value = 1.3501, mask_loss = 0.1380std_loss = 0.2686:   5%|▌         | 288/5475 [1:17:52<25:29:58, 17.70s/it]

ratio tensor(1.6666, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5355, device='cuda:0') reward: tensor(1.3280, device='cuda:0') factor: tensor(0.2075, device='cuda:0')
reward_raw: tensor(0.4934, device='cuda:0') reward: tensor(1.2912, device='cuda:0') factor: tensor(0.2022, device='cuda:0')
reward_raw: tensor(0.5265, device='cuda:0') reward: tensor(1.3173, device='cuda:0') factor: tensor(0.2092, device='cuda:0')
reward_raw: tensor(0.5129, device='cuda:0') reward: tensor(1.3131, device='cuda:0') factor: tensor(0.1998, device='cuda:0')
reward_raw: tensor(0.5241, device='cuda:0') reward: tensor(1.3167, device='cuda:0') factor: tensor(0.2074, device='cuda:0')


Epoch 1, Step 289: Loss = 0.1600, Actor Loss = 0.0897, Critic Loss = 0.1409, Entropy = 0.2139, Returns = 1.2630, Value = 1.3411, mask_loss = 0.2402std_loss = 0.3383:   5%|▌         | 289/5475 [1:18:13<27:14:30, 18.91s/it]  

ratio tensor(0.6954, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5273, device='cuda:0') reward: tensor(1.3563, device='cuda:0') factor: tensor(0.1710, device='cuda:0')
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.3373, device='cuda:0') factor: tensor(0.1644, device='cuda:0')
reward_raw: tensor(0.5364, device='cuda:0') reward: tensor(1.3705, device='cuda:0') factor: tensor(0.1659, device='cuda:0')
reward_raw: tensor(0.5331, device='cuda:0') reward: tensor(1.3701, device='cuda:0') factor: tensor(0.1631, device='cuda:0')
reward_raw: tensor(0.5028, device='cuda:0') reward: tensor(1.3417, device='cuda:0') factor: tensor(0.1610, device='cuda:0')


Epoch 1, Step 290: Loss = -0.0081, Actor Loss = -0.0305, Critic Loss = 0.0453, Entropy = 0.1836, Returns = 1.3644, Value = 1.2924, mask_loss = 0.1742std_loss = 0.2976:   5%|▌         | 290/5475 [1:18:29<25:43:02, 17.86s/it]

ratio tensor(1.2766, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4514, device='cuda:0') reward: tensor(1.2858, device='cuda:0') factor: tensor(0.1656, device='cuda:0')
reward_raw: tensor(0.5111, device='cuda:0') reward: tensor(1.3440, device='cuda:0') factor: tensor(0.1671, device='cuda:0')
reward_raw: tensor(0.4823, device='cuda:0') reward: tensor(1.3047, device='cuda:0') factor: tensor(0.1776, device='cuda:0')
reward_raw: tensor(0.4752, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.1629, device='cuda:0')
reward_raw: tensor(0.4584, device='cuda:0') reward: tensor(1.2895, device='cuda:0') factor: tensor(0.1689, device='cuda:0')


Epoch 1, Step 291: Loss = 0.0337, Actor Loss = -0.0159, Critic Loss = 0.0996, Entropy = 0.1753, Returns = 1.2748, Value = 1.2853, mask_loss = 0.1584std_loss = 0.2883:   5%|▌         | 291/5475 [1:18:45<25:03:01, 17.40s/it] 

ratio tensor(1.1864, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4931, device='cuda:0') reward: tensor(1.2928, device='cuda:0') factor: tensor(0.2003, device='cuda:0')
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.2853, device='cuda:0') factor: tensor(0.2004, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.3087, device='cuda:0') factor: tensor(0.1995, device='cuda:0')
reward_raw: tensor(0.4910, device='cuda:0') reward: tensor(1.2973, device='cuda:0') factor: tensor(0.1937, device='cuda:0')
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.2955, device='cuda:0') factor: tensor(0.2060, device='cuda:0')


Epoch 1, Step 292: Loss = 0.1587, Actor Loss = 0.1102, Critic Loss = 0.0974, Entropy = 0.1877, Returns = 1.2524, Value = 1.1849, mask_loss = 0.1972std_loss = 0.3048:   5%|▌         | 292/5475 [1:19:06<26:19:04, 18.28s/it] 

ratio tensor(0.9560, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4920, device='cuda:0') reward: tensor(1.3102, device='cuda:0') factor: tensor(0.1818, device='cuda:0')
reward_raw: tensor(0.4936, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.1879, device='cuda:0')
reward_raw: tensor(0.5020, device='cuda:0') reward: tensor(1.3261, device='cuda:0') factor: tensor(0.1759, device='cuda:0')
reward_raw: tensor(0.5132, device='cuda:0') reward: tensor(1.3276, device='cuda:0') factor: tensor(0.1856, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.2966, device='cuda:0') factor: tensor(0.1832, device='cuda:0')


Epoch 1, Step 293: Loss = 0.0985, Actor Loss = 0.0311, Critic Loss = 0.1352, Entropy = 0.1997, Returns = 1.2839, Value = 1.3401, mask_loss = 0.2184std_loss = 0.3125:   5%|▌         | 293/5475 [1:19:19<24:20:28, 16.91s/it]

ratio tensor(1.0245, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5558, device='cuda:0') reward: tensor(1.3617, device='cuda:0') factor: tensor(0.1941, device='cuda:0')
reward_raw: tensor(0.5441, device='cuda:0') reward: tensor(1.3415, device='cuda:0') factor: tensor(0.2027, device='cuda:0')
reward_raw: tensor(0.5516, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.2026, device='cuda:0')
reward_raw: tensor(0.5616, device='cuda:0') reward: tensor(1.3699, device='cuda:0') factor: tensor(0.1917, device='cuda:0')
reward_raw: tensor(0.5749, device='cuda:0') reward: tensor(1.3788, device='cuda:0') factor: tensor(0.1961, device='cuda:0')


Epoch 1, Step 294: Loss = -0.3600, Actor Loss = -0.4165, Critic Loss = 0.1134, Entropy = 0.1944, Returns = 1.3909, Value = 1.3290, mask_loss = 0.1950std_loss = 0.3030:   5%|▌         | 294/5475 [1:19:33<23:00:59, 15.99s/it]

ratio tensor(3.1498, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.3214, device='cuda:0') factor: tensor(0.1866, device='cuda:0')
reward_raw: tensor(0.5314, device='cuda:0') reward: tensor(1.3363, device='cuda:0') factor: tensor(0.1951, device='cuda:0')
reward_raw: tensor(0.5051, device='cuda:0') reward: tensor(1.3196, device='cuda:0') factor: tensor(0.1855, device='cuda:0')
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.3142, device='cuda:0') factor: tensor(0.1891, device='cuda:0')
reward_raw: tensor(0.5284, device='cuda:0') reward: tensor(1.3347, device='cuda:0') factor: tensor(0.1937, device='cuda:0')


Epoch 1, Step 295: Loss = 0.2744, Actor Loss = 0.2286, Critic Loss = 0.0919, Entropy = 0.1625, Returns = 1.3773, Value = 1.3881, mask_loss = 0.1887std_loss = 0.3244:   5%|▌         | 295/5475 [1:19:46<21:37:19, 15.03s/it]  

ratio tensor(0.8563, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4815, device='cuda:0') reward: tensor(1.2944, device='cuda:0') factor: tensor(0.1872, device='cuda:0')
reward_raw: tensor(0.5081, device='cuda:0') reward: tensor(1.3290, device='cuda:0') factor: tensor(0.1792, device='cuda:0')
reward_raw: tensor(0.4933, device='cuda:0') reward: tensor(1.3104, device='cuda:0') factor: tensor(0.1830, device='cuda:0')
reward_raw: tensor(0.4726, device='cuda:0') reward: tensor(1.2877, device='cuda:0') factor: tensor(0.1849, device='cuda:0')
reward_raw: tensor(0.4913, device='cuda:0') reward: tensor(1.3087, device='cuda:0') factor: tensor(0.1827, device='cuda:0')


Epoch 1, Step 296: Loss = 0.0714, Actor Loss = 0.0484, Critic Loss = 0.0462, Entropy = 0.1233, Returns = 1.3311, Value = 1.2876, mask_loss = 0.1638std_loss = 0.3137:   5%|▌         | 296/5475 [1:19:59<20:56:48, 14.56s/it]

ratio tensor(0.9075, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4081, device='cuda:0') reward: tensor(1.2472, device='cuda:0') factor: tensor(0.1608, device='cuda:0')
reward_raw: tensor(0.3957, device='cuda:0') reward: tensor(1.2463, device='cuda:0') factor: tensor(0.1494, device='cuda:0')
reward_raw: tensor(0.4034, device='cuda:0') reward: tensor(1.2446, device='cuda:0') factor: tensor(0.1588, device='cuda:0')
reward_raw: tensor(0.4006, device='cuda:0') reward: tensor(1.2461, device='cuda:0') factor: tensor(0.1545, device='cuda:0')
reward_raw: tensor(0.4090, device='cuda:0') reward: tensor(1.2570, device='cuda:0') factor: tensor(0.1520, device='cuda:0')


Epoch 1, Step 297: Loss = -0.1204, Actor Loss = -0.1795, Critic Loss = 0.1185, Entropy = 0.1580, Returns = 1.2884, Value = 1.2529, mask_loss = 0.1466std_loss = 0.2787:   5%|▌         | 297/5475 [1:20:14<20:58:49, 14.59s/it]

ratio tensor(2.0600, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5273, device='cuda:0') reward: tensor(1.3661, device='cuda:0') factor: tensor(0.1612, device='cuda:0')
reward_raw: tensor(0.5317, device='cuda:0') reward: tensor(1.3730, device='cuda:0') factor: tensor(0.1587, device='cuda:0')
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.3696, device='cuda:0') factor: tensor(0.1582, device='cuda:0')
reward_raw: tensor(0.5241, device='cuda:0') reward: tensor(1.3660, device='cuda:0') factor: tensor(0.1580, device='cuda:0')
reward_raw: tensor(0.5281, device='cuda:0') reward: tensor(1.3660, device='cuda:0') factor: tensor(0.1621, device='cuda:0')


Epoch 1, Step 298: Loss = -0.0486, Actor Loss = -0.0814, Critic Loss = 0.0659, Entropy = 0.1800, Returns = 1.3736, Value = 1.4668, mask_loss = 0.1746std_loss = 0.2905:   5%|▌         | 298/5475 [1:20:30<21:32:14, 14.98s/it]

ratio tensor(1.0772, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5232, device='cuda:0') reward: tensor(1.3921, device='cuda:0') factor: tensor(0.1312, device='cuda:0')
reward_raw: tensor(0.5190, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1456, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.3798, device='cuda:0') factor: tensor(0.1409, device='cuda:0')
reward_raw: tensor(0.5105, device='cuda:0') reward: tensor(1.3700, device='cuda:0') factor: tensor(0.1405, device='cuda:0')
reward_raw: tensor(0.4900, device='cuda:0') reward: tensor(1.3541, device='cuda:0') factor: tensor(0.1359, device='cuda:0')


Epoch 1, Step 299: Loss = 0.0042, Actor Loss = -0.0386, Critic Loss = 0.0859, Entropy = 0.1444, Returns = 1.4173, Value = 1.3504, mask_loss = 0.1426std_loss = 0.2881:   5%|▌         | 299/5475 [1:20:41<19:58:24, 13.89s/it] 

ratio tensor(1.2875, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4973, device='cuda:0') reward: tensor(1.3336, device='cuda:0') factor: tensor(0.1637, device='cuda:0')
reward_raw: tensor(0.4848, device='cuda:0') reward: tensor(1.3148, device='cuda:0') factor: tensor(0.1701, device='cuda:0')
reward_raw: tensor(0.4639, device='cuda:0') reward: tensor(1.2931, device='cuda:0') factor: tensor(0.1709, device='cuda:0')
reward_raw: tensor(0.4772, device='cuda:0') reward: tensor(1.3121, device='cuda:0') factor: tensor(0.1652, device='cuda:0')
reward_raw: tensor(0.4861, device='cuda:0') reward: tensor(1.3175, device='cuda:0') factor: tensor(0.1686, device='cuda:0')


Epoch 1, Step 300: Loss = 0.0832, Actor Loss = 0.0229, Critic Loss = 0.1210, Entropy = 0.1708, Returns = 1.3185, Value = 1.3156, mask_loss = 0.1724std_loss = 0.2973:   5%|▌         | 300/5475 [1:20:57<20:42:29, 14.41s/it] 

ratio tensor(12.1749, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.4071, device='cuda:0') factor: tensor(0.1349, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.1391, device='cuda:0')
reward_raw: tensor(0.5298, device='cuda:0') reward: tensor(1.3954, device='cuda:0') factor: tensor(0.1344, device='cuda:0')
reward_raw: tensor(0.5348, device='cuda:0') reward: tensor(1.3995, device='cuda:0') factor: tensor(0.1353, device='cuda:0')
reward_raw: tensor(0.5326, device='cuda:0') reward: tensor(1.3976, device='cuda:0') factor: tensor(0.1350, device='cuda:0')


Epoch 1, Step 301: Loss = -0.0117, Actor Loss = -0.0478, Critic Loss = 0.0725, Entropy = 0.1439, Returns = 1.3496, Value = 1.3582, mask_loss = 0.1363std_loss = 0.2765:   5%|▌         | 300/5475 [1:21:12<20:42:29, 14.41s/it]

ratio tensor(0.9927, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 301: Loss = -0.0117, Actor Loss = -0.0478, Critic Loss = 0.0725, Entropy = 0.1439, Returns = 1.3496, Value = 1.3582, mask_loss = 0.1363std_loss = 0.2765:   5%|▌         | 301/5475 [1:21:12<21:03:31, 14.65s/it]

reward_raw: tensor(0.4947, device='cuda:0') reward: tensor(1.3475, device='cuda:0') factor: tensor(0.1472, device='cuda:0')
reward_raw: tensor(0.4903, device='cuda:0') reward: tensor(1.3428, device='cuda:0') factor: tensor(0.1475, device='cuda:0')
reward_raw: tensor(0.5055, device='cuda:0') reward: tensor(1.3546, device='cuda:0') factor: tensor(0.1509, device='cuda:0')
reward_raw: tensor(0.4846, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.1496, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.3325, device='cuda:0') factor: tensor(0.1473, device='cuda:0')


Epoch 1, Step 302: Loss = 0.3212, Actor Loss = 0.2413, Critic Loss = 0.1599, Entropy = 0.1378, Returns = 1.3326, Value = 1.4666, mask_loss = 0.1508std_loss = 0.2898:   6%|▌         | 302/5475 [1:21:33<23:57:42, 16.68s/it]  

ratio tensor(1.0806, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4776, device='cuda:0') reward: tensor(1.3496, device='cuda:0') factor: tensor(0.1280, device='cuda:0')
reward_raw: tensor(0.4665, device='cuda:0') reward: tensor(1.3264, device='cuda:0') factor: tensor(0.1402, device='cuda:0')
reward_raw: tensor(0.4648, device='cuda:0') reward: tensor(1.3370, device='cuda:0') factor: tensor(0.1278, device='cuda:0')
reward_raw: tensor(0.4691, device='cuda:0') reward: tensor(1.3418, device='cuda:0') factor: tensor(0.1273, device='cuda:0')
reward_raw: tensor(0.4562, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.1295, device='cuda:0')


Epoch 1, Step 303: Loss = 0.2794, Actor Loss = 0.2309, Critic Loss = 0.0971, Entropy = 0.1249, Returns = 1.2934, Value = 1.3375, mask_loss = 0.1147std_loss = 0.2621:   6%|▌         | 303/5475 [1:21:48<22:51:14, 15.91s/it]

ratio tensor(1.6601, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.3533, device='cuda:0') factor: tensor(0.1299, device='cuda:0')
reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.3770, device='cuda:0') factor: tensor(0.1381, device='cuda:0')
reward_raw: tensor(0.4609, device='cuda:0') reward: tensor(1.3215, device='cuda:0') factor: tensor(0.1393, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.3819, device='cuda:0') factor: tensor(0.1351, device='cuda:0')
reward_raw: tensor(0.4674, device='cuda:0') reward: tensor(1.3388, device='cuda:0') factor: tensor(0.1286, device='cuda:0')


Epoch 1, Step 304: Loss = -0.1098, Actor Loss = -0.1540, Critic Loss = 0.0886, Entropy = 0.1365, Returns = 1.3570, Value = 1.3405, mask_loss = 0.1263std_loss = 0.2703:   6%|▌         | 304/5475 [1:22:02<22:13:39, 15.47s/it]

ratio tensor(1.0206, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5205, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1437, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.3683, device='cuda:0') factor: tensor(0.1444, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3662, device='cuda:0') factor: tensor(0.1514, device='cuda:0')
reward_raw: tensor(0.5410, device='cuda:0') reward: tensor(1.3931, device='cuda:0') factor: tensor(0.1480, device='cuda:0')
reward_raw: tensor(0.5200, device='cuda:0') reward: tensor(1.3718, device='cuda:0') factor: tensor(0.1482, device='cuda:0')


Epoch 1, Step 305: Loss = 0.1287, Actor Loss = 0.0583, Critic Loss = 0.1411, Entropy = 0.1448, Returns = 1.3270, Value = 1.4221, mask_loss = 0.1365std_loss = 0.2785:   6%|▌         | 305/5475 [1:22:17<22:12:12, 15.46s/it]  

ratio tensor(1.0405, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5462, device='cuda:0') reward: tensor(1.4051, device='cuda:0') factor: tensor(0.1410, device='cuda:0')
reward_raw: tensor(0.5575, device='cuda:0') reward: tensor(1.4185, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.5388, device='cuda:0') reward: tensor(1.4042, device='cuda:0') factor: tensor(0.1346, device='cuda:0')
reward_raw: tensor(0.5200, device='cuda:0') reward: tensor(1.3753, device='cuda:0') factor: tensor(0.1446, device='cuda:0')
reward_raw: tensor(0.5694, device='cuda:0') reward: tensor(1.4350, device='cuda:0') factor: tensor(0.1344, device='cuda:0')


Epoch 1, Step 306: Loss = 0.0239, Actor Loss = -0.0500, Critic Loss = 0.1480, Entropy = 0.1630, Returns = 1.3766, Value = 1.2649, mask_loss = 0.1485std_loss = 0.2801:   6%|▌         | 306/5475 [1:22:34<22:40:23, 15.79s/it]

ratio tensor(0.9261, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.3402, device='cuda:0') factor: tensor(0.1631, device='cuda:0')
reward_raw: tensor(0.4878, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.1557, device='cuda:0')
reward_raw: tensor(0.4660, device='cuda:0') reward: tensor(1.3081, device='cuda:0') factor: tensor(0.1579, device='cuda:0')
reward_raw: tensor(0.4941, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.1674, device='cuda:0')
reward_raw: tensor(0.4997, device='cuda:0') reward: tensor(1.3381, device='cuda:0') factor: tensor(0.1616, device='cuda:0')


Epoch 1, Step 307: Loss = 0.0750, Actor Loss = 0.0463, Critic Loss = 0.0576, Entropy = 0.1414, Returns = 1.2958, Value = 1.3561, mask_loss = 0.1651std_loss = 0.3012:   6%|▌         | 307/5475 [1:22:50<22:39:23, 15.78s/it] 

ratio tensor(0.8835, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4301, device='cuda:0') reward: tensor(1.2696, device='cuda:0') factor: tensor(0.1604, device='cuda:0')
reward_raw: tensor(0.4429, device='cuda:0') reward: tensor(1.2831, device='cuda:0') factor: tensor(0.1598, device='cuda:0')
reward_raw: tensor(0.4378, device='cuda:0') reward: tensor(1.2729, device='cuda:0') factor: tensor(0.1649, device='cuda:0')
reward_raw: tensor(0.4369, device='cuda:0') reward: tensor(1.2791, device='cuda:0') factor: tensor(0.1578, device='cuda:0')
reward_raw: tensor(0.4305, device='cuda:0') reward: tensor(1.2739, device='cuda:0') factor: tensor(0.1566, device='cuda:0')


Epoch 1, Step 308: Loss = -0.2074, Actor Loss = -0.4729, Critic Loss = 0.5312, Entropy = 0.1418, Returns = 1.2465, Value = 1.0465, mask_loss = 0.1656std_loss = 0.2838:   6%|▌         | 308/5475 [1:23:08<23:42:27, 16.52s/it]

ratio tensor(2.5193, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4873, device='cuda:0') reward: tensor(1.3463, device='cuda:0') factor: tensor(0.1410, device='cuda:0')
reward_raw: tensor(0.4235, device='cuda:0') reward: tensor(1.2855, device='cuda:0') factor: tensor(0.1381, device='cuda:0')
reward_raw: tensor(0.4809, device='cuda:0') reward: tensor(1.3387, device='cuda:0') factor: tensor(0.1422, device='cuda:0')
reward_raw: tensor(0.4535, device='cuda:0') reward: tensor(1.3132, device='cuda:0') factor: tensor(0.1403, device='cuda:0')
reward_raw: tensor(0.4209, device='cuda:0') reward: tensor(1.2749, device='cuda:0') factor: tensor(0.1460, device='cuda:0')


Epoch 1, Step 309: Loss = 0.1903, Actor Loss = 0.1191, Critic Loss = 0.1427, Entropy = 0.1314, Returns = 1.3492, Value = 1.3531, mask_loss = 0.1375std_loss = 0.2821:   6%|▌         | 309/5475 [1:23:25<23:45:30, 16.56s/it]  

ratio tensor(0.8847, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4997, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1480, device='cuda:0')
reward_raw: tensor(0.4954, device='cuda:0') reward: tensor(1.3454, device='cuda:0') factor: tensor(0.1500, device='cuda:0')
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.3637, device='cuda:0') factor: tensor(0.1478, device='cuda:0')
reward_raw: tensor(0.5052, device='cuda:0') reward: tensor(1.3609, device='cuda:0') factor: tensor(0.1443, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.3640, device='cuda:0') factor: tensor(0.1487, device='cuda:0')


Epoch 1, Step 310: Loss = 0.1154, Actor Loss = 0.0574, Critic Loss = 0.1162, Entropy = 0.1245, Returns = 1.3319, Value = 1.3543, mask_loss = 0.1586std_loss = 0.3098:   6%|▌         | 310/5475 [1:23:41<23:44:24, 16.55s/it]

ratio tensor(0.9197, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5009, device='cuda:0') reward: tensor(1.3328, device='cuda:0') factor: tensor(0.1680, device='cuda:0')
reward_raw: tensor(0.4794, device='cuda:0') reward: tensor(1.3130, device='cuda:0') factor: tensor(0.1665, device='cuda:0')
reward_raw: tensor(0.5054, device='cuda:0') reward: tensor(1.3413, device='cuda:0') factor: tensor(0.1642, device='cuda:0')
reward_raw: tensor(0.4900, device='cuda:0') reward: tensor(1.3298, device='cuda:0') factor: tensor(0.1602, device='cuda:0')
reward_raw: tensor(0.4678, device='cuda:0') reward: tensor(1.3106, device='cuda:0') factor: tensor(0.1571, device='cuda:0')


Epoch 1, Step 311: Loss = -0.0082, Actor Loss = -0.0751, Critic Loss = 0.1341, Entropy = 0.1534, Returns = 1.3344, Value = 1.4836, mask_loss = 0.1533std_loss = 0.2938:   6%|▌         | 311/5475 [1:23:53<21:44:26, 15.16s/it]

ratio tensor(0.9348, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4346, device='cuda:0') reward: tensor(1.2916, device='cuda:0') factor: tensor(0.1430, device='cuda:0')
reward_raw: tensor(0.4361, device='cuda:0') reward: tensor(1.2996, device='cuda:0') factor: tensor(0.1365, device='cuda:0')
reward_raw: tensor(0.5070, device='cuda:0') reward: tensor(1.3494, device='cuda:0') factor: tensor(0.1577, device='cuda:0')
reward_raw: tensor(0.4399, device='cuda:0') reward: tensor(1.3008, device='cuda:0') factor: tensor(0.1392, device='cuda:0')
reward_raw: tensor(0.4539, device='cuda:0') reward: tensor(1.3078, device='cuda:0') factor: tensor(0.1461, device='cuda:0')


Epoch 1, Step 312: Loss = 0.1233, Actor Loss = 0.0836, Critic Loss = 0.0796, Entropy = 0.1395, Returns = 1.2936, Value = 1.3002, mask_loss = 0.1498std_loss = 0.2922:   6%|▌         | 312/5475 [1:24:09<21:56:44, 15.30s/it]  

ratio tensor(1.0994, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5377, device='cuda:0') reward: tensor(1.3938, device='cuda:0') factor: tensor(0.1440, device='cuda:0')
reward_raw: tensor(0.5444, device='cuda:0') reward: tensor(1.4019, device='cuda:0') factor: tensor(0.1425, device='cuda:0')
reward_raw: tensor(0.5573, device='cuda:0') reward: tensor(1.4088, device='cuda:0') factor: tensor(0.1485, device='cuda:0')
reward_raw: tensor(0.5487, device='cuda:0') reward: tensor(1.4117, device='cuda:0') factor: tensor(0.1370, device='cuda:0')
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.3793, device='cuda:0') factor: tensor(0.1449, device='cuda:0')


Epoch 1, Step 313: Loss = -0.0868, Actor Loss = -0.1399, Critic Loss = 0.1063, Entropy = 0.1259, Returns = 1.5125, Value = 1.2796, mask_loss = 0.1437std_loss = 0.2939:   6%|▌         | 313/5475 [1:24:27<23:09:10, 16.15s/it]

ratio tensor(1.0668, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.3020, device='cuda:0') factor: tensor(0.1976, device='cuda:0')
reward_raw: tensor(0.4774, device='cuda:0') reward: tensor(1.2755, device='cuda:0') factor: tensor(0.2019, device='cuda:0')
reward_raw: tensor(0.4805, device='cuda:0') reward: tensor(1.2803, device='cuda:0') factor: tensor(0.2002, device='cuda:0')
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.2996, device='cuda:0') factor: tensor(0.1987, device='cuda:0')
reward_raw: tensor(0.5073, device='cuda:0') reward: tensor(1.3237, device='cuda:0') factor: tensor(0.1835, device='cuda:0')


Epoch 1, Step 314: Loss = 0.0412, Actor Loss = -0.0017, Critic Loss = 0.0860, Entropy = 0.1666, Returns = 1.2061, Value = 1.3721, mask_loss = 0.1897std_loss = 0.3203:   6%|▌         | 314/5475 [1:24:43<23:04:00, 16.09s/it] 

ratio tensor(0.9328, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5457, device='cuda:0') reward: tensor(1.3979, device='cuda:0') factor: tensor(0.1478, device='cuda:0')
reward_raw: tensor(0.5609, device='cuda:0') reward: tensor(1.4184, device='cuda:0') factor: tensor(0.1425, device='cuda:0')
reward_raw: tensor(0.5742, device='cuda:0') reward: tensor(1.4258, device='cuda:0') factor: tensor(0.1484, device='cuda:0')
reward_raw: tensor(0.5221, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.1439, device='cuda:0')
reward_raw: tensor(0.5434, device='cuda:0') reward: tensor(1.4033, device='cuda:0') factor: tensor(0.1401, device='cuda:0')


Epoch 1, Step 315: Loss = 0.1271, Actor Loss = -0.0309, Critic Loss = 0.3163, Entropy = 0.1300, Returns = 1.4289, Value = 1.2803, mask_loss = 0.1296std_loss = 0.2764:   6%|▌         | 315/5475 [1:24:55<21:14:25, 14.82s/it]

ratio tensor(1.0867, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3970, device='cuda:0') reward: tensor(1.2685, device='cuda:0') factor: tensor(0.1285, device='cuda:0')
reward_raw: tensor(0.3609, device='cuda:0') reward: tensor(1.2392, device='cuda:0') factor: tensor(0.1217, device='cuda:0')
reward_raw: tensor(0.3946, device='cuda:0') reward: tensor(1.2663, device='cuda:0') factor: tensor(0.1282, device='cuda:0')
reward_raw: tensor(0.3939, device='cuda:0') reward: tensor(1.2680, device='cuda:0') factor: tensor(0.1259, device='cuda:0')
reward_raw: tensor(0.3858, device='cuda:0') reward: tensor(1.2630, device='cuda:0') factor: tensor(0.1229, device='cuda:0')


Epoch 1, Step 316: Loss = -0.0119, Actor Loss = -0.0429, Critic Loss = 0.0623, Entropy = 0.1321, Returns = 1.2388, Value = 1.2410, mask_loss = 0.1227std_loss = 0.2633:   6%|▌         | 316/5475 [1:25:14<23:02:00, 16.07s/it]

ratio tensor(0.9453, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4653, device='cuda:0') reward: tensor(1.3424, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.4341, device='cuda:0') reward: tensor(1.3102, device='cuda:0') factor: tensor(0.1238, device='cuda:0')
reward_raw: tensor(0.4319, device='cuda:0') reward: tensor(1.3112, device='cuda:0') factor: tensor(0.1207, device='cuda:0')
reward_raw: tensor(0.4161, device='cuda:0') reward: tensor(1.2901, device='cuda:0') factor: tensor(0.1260, device='cuda:0')
reward_raw: tensor(0.4515, device='cuda:0') reward: tensor(1.3344, device='cuda:0') factor: tensor(0.1171, device='cuda:0')


Epoch 1, Step 317: Loss = -0.0824, Actor Loss = -0.1162, Critic Loss = 0.0678, Entropy = 0.1129, Returns = 1.3538, Value = 1.3744, mask_loss = 0.1009std_loss = 0.2396:   6%|▌         | 317/5475 [1:25:37<26:04:09, 18.19s/it]

ratio tensor(1.0057, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4536, device='cuda:0') reward: tensor(1.2762, device='cuda:0') factor: tensor(0.1774, device='cuda:0')
reward_raw: tensor(0.4831, device='cuda:0') reward: tensor(1.3068, device='cuda:0') factor: tensor(0.1763, device='cuda:0')
reward_raw: tensor(0.4714, device='cuda:0') reward: tensor(1.3053, device='cuda:0') factor: tensor(0.1661, device='cuda:0')
reward_raw: tensor(0.5023, device='cuda:0') reward: tensor(1.3342, device='cuda:0') factor: tensor(0.1681, device='cuda:0')
reward_raw: tensor(0.4679, device='cuda:0') reward: tensor(1.2864, device='cuda:0') factor: tensor(0.1815, device='cuda:0')


Epoch 1, Step 318: Loss = 0.1754, Actor Loss = 0.1245, Critic Loss = 0.1021, Entropy = 0.1336, Returns = 1.2937, Value = 1.3424, mask_loss = 0.1677std_loss = 0.3020:   6%|▌         | 318/5475 [1:25:52<24:53:38, 17.38s/it]  

ratio tensor(0.7204, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4758, device='cuda:0') reward: tensor(1.3339, device='cuda:0') factor: tensor(0.1419, device='cuda:0')
reward_raw: tensor(0.4805, device='cuda:0') reward: tensor(1.3368, device='cuda:0') factor: tensor(0.1437, device='cuda:0')
reward_raw: tensor(0.4866, device='cuda:0') reward: tensor(1.3376, device='cuda:0') factor: tensor(0.1490, device='cuda:0')
reward_raw: tensor(0.5123, device='cuda:0') reward: tensor(1.3698, device='cuda:0') factor: tensor(0.1425, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.3649, device='cuda:0') factor: tensor(0.1439, device='cuda:0')


Epoch 1, Step 319: Loss = -0.0461, Actor Loss = -0.0768, Critic Loss = 0.0615, Entropy = 0.1288, Returns = 1.3404, Value = 1.3658, mask_loss = 0.1424std_loss = 0.2880:   6%|▌         | 319/5475 [1:26:05<22:50:37, 15.95s/it]

ratio tensor(1.1911, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4321, device='cuda:0') reward: tensor(1.2953, device='cuda:0') factor: tensor(0.1368, device='cuda:0')
reward_raw: tensor(0.4315, device='cuda:0') reward: tensor(1.2887, device='cuda:0') factor: tensor(0.1428, device='cuda:0')
reward_raw: tensor(0.4769, device='cuda:0') reward: tensor(1.3403, device='cuda:0') factor: tensor(0.1367, device='cuda:0')
reward_raw: tensor(0.4333, device='cuda:0') reward: tensor(1.2985, device='cuda:0') factor: tensor(0.1348, device='cuda:0')
reward_raw: tensor(0.4592, device='cuda:0') reward: tensor(1.3224, device='cuda:0') factor: tensor(0.1369, device='cuda:0')


Epoch 1, Step 320: Loss = 0.1319, Actor Loss = 0.1041, Critic Loss = 0.0557, Entropy = 0.1351, Returns = 1.3727, Value = 1.3830, mask_loss = 0.1433std_loss = 0.2897:   6%|▌         | 320/5475 [1:26:22<23:28:20, 16.39s/it]  

ratio tensor(0.7160, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5137, device='cuda:0') reward: tensor(1.3409, device='cuda:0') factor: tensor(0.1728, device='cuda:0')
reward_raw: tensor(0.5217, device='cuda:0') reward: tensor(1.3546, device='cuda:0') factor: tensor(0.1672, device='cuda:0')
reward_raw: tensor(0.5414, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.1651, device='cuda:0')
reward_raw: tensor(0.5348, device='cuda:0') reward: tensor(1.3579, device='cuda:0') factor: tensor(0.1769, device='cuda:0')
reward_raw: tensor(0.5435, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.1760, device='cuda:0')


Epoch 1, Step 321: Loss = -0.1645, Actor Loss = -0.2207, Critic Loss = 0.1128, Entropy = 0.1441, Returns = 1.4209, Value = 1.2701, mask_loss = 0.1822std_loss = 0.3195:   6%|▌         | 320/5475 [1:26:37<23:28:20, 16.39s/it]

ratio tensor(1.2479, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 321: Loss = -0.1645, Actor Loss = -0.2207, Critic Loss = 0.1128, Entropy = 0.1441, Returns = 1.4209, Value = 1.2701, mask_loss = 0.1822std_loss = 0.3195:   6%|▌         | 321/5475 [1:26:38<22:57:37, 16.04s/it]

reward_raw: tensor(0.4690, device='cuda:0') reward: tensor(1.3474, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.4913, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.1235, device='cuda:0')
reward_raw: tensor(0.5084, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.1309, device='cuda:0')
reward_raw: tensor(0.4758, device='cuda:0') reward: tensor(1.3520, device='cuda:0') factor: tensor(0.1238, device='cuda:0')
reward_raw: tensor(0.5006, device='cuda:0') reward: tensor(1.3696, device='cuda:0') factor: tensor(0.1310, device='cuda:0')


Epoch 1, Step 322: Loss = 0.0455, Actor Loss = -0.0129, Critic Loss = 0.1170, Entropy = 0.1347, Returns = 1.3948, Value = 1.5082, mask_loss = 0.1235std_loss = 0.2682:   6%|▌         | 322/5475 [1:27:00<25:36:35, 17.89s/it] 

ratio tensor(1.0276, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4678, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.1515, device='cuda:0')
reward_raw: tensor(0.4855, device='cuda:0') reward: tensor(1.3347, device='cuda:0') factor: tensor(0.1509, device='cuda:0')
reward_raw: tensor(0.4651, device='cuda:0') reward: tensor(1.3133, device='cuda:0') factor: tensor(0.1518, device='cuda:0')
reward_raw: tensor(0.4780, device='cuda:0') reward: tensor(1.3274, device='cuda:0') factor: tensor(0.1506, device='cuda:0')
reward_raw: tensor(0.4873, device='cuda:0') reward: tensor(1.3398, device='cuda:0') factor: tensor(0.1475, device='cuda:0')


Epoch 1, Step 323: Loss = 0.6595, Actor Loss = 0.6008, Critic Loss = 0.1177, Entropy = 0.1165, Returns = 1.3023, Value = 1.4171, mask_loss = 0.1494std_loss = 0.3032:   6%|▌         | 323/5475 [1:27:17<25:28:11, 17.80s/it] 

ratio tensor(1.5810, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4848, device='cuda:0') reward: tensor(1.3487, device='cuda:0') factor: tensor(0.1361, device='cuda:0')
reward_raw: tensor(0.4913, device='cuda:0') reward: tensor(1.3551, device='cuda:0') factor: tensor(0.1362, device='cuda:0')
reward_raw: tensor(0.4723, device='cuda:0') reward: tensor(1.3379, device='cuda:0') factor: tensor(0.1344, device='cuda:0')
reward_raw: tensor(0.4716, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.1361, device='cuda:0')
reward_raw: tensor(0.4731, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.1348, device='cuda:0')


Epoch 1, Step 324: Loss = -0.0042, Actor Loss = -0.0387, Critic Loss = 0.0692, Entropy = 0.1290, Returns = 1.3954, Value = 1.3901, mask_loss = 0.1326std_loss = 0.2764:   6%|▌         | 324/5475 [1:27:32<24:03:15, 16.81s/it]

ratio tensor(0.6973, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5456, device='cuda:0') reward: tensor(1.4100, device='cuda:0') factor: tensor(0.1357, device='cuda:0')
reward_raw: tensor(0.5385, device='cuda:0') reward: tensor(1.4073, device='cuda:0') factor: tensor(0.1312, device='cuda:0')
reward_raw: tensor(0.5371, device='cuda:0') reward: tensor(1.4017, device='cuda:0') factor: tensor(0.1354, device='cuda:0')
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.3984, device='cuda:0') factor: tensor(0.1368, device='cuda:0')
reward_raw: tensor(0.5243, device='cuda:0') reward: tensor(1.3868, device='cuda:0') factor: tensor(0.1374, device='cuda:0')


Epoch 1, Step 325: Loss = 0.0311, Actor Loss = -0.0064, Critic Loss = 0.0752, Entropy = 0.1234, Returns = 1.4315, Value = 1.4421, mask_loss = 0.1326std_loss = 0.2832:   6%|▌         | 325/5475 [1:27:50<24:25:36, 17.08s/it] 

ratio tensor(0.9831, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5022, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.1429, device='cuda:0')
reward_raw: tensor(0.5190, device='cuda:0') reward: tensor(1.3740, device='cuda:0') factor: tensor(0.1449, device='cuda:0')
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.1379, device='cuda:0')
reward_raw: tensor(0.5309, device='cuda:0') reward: tensor(1.3919, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.5364, device='cuda:0') reward: tensor(1.3953, device='cuda:0') factor: tensor(0.1411, device='cuda:0')


Epoch 1, Step 326: Loss = 0.0432, Actor Loss = 0.0098, Critic Loss = 0.0670, Entropy = 0.1326, Returns = 1.3038, Value = 1.4120, mask_loss = 0.1479std_loss = 0.2936:   6%|▌         | 326/5475 [1:28:04<23:22:07, 16.34s/it] 

ratio tensor(1.0771, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5109, device='cuda:0') reward: tensor(1.3541, device='cuda:0') factor: tensor(0.1568, device='cuda:0')
reward_raw: tensor(0.5260, device='cuda:0') reward: tensor(1.3733, device='cuda:0') factor: tensor(0.1527, device='cuda:0')
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.3375, device='cuda:0') factor: tensor(0.1616, device='cuda:0')
reward_raw: tensor(0.5139, device='cuda:0') reward: tensor(1.3580, device='cuda:0') factor: tensor(0.1559, device='cuda:0')
reward_raw: tensor(0.5243, device='cuda:0') reward: tensor(1.3683, device='cuda:0') factor: tensor(0.1560, device='cuda:0')


Epoch 1, Step 327: Loss = 0.1999, Actor Loss = 0.1750, Critic Loss = 0.0502, Entropy = 0.1372, Returns = 1.3338, Value = 1.3486, mask_loss = 0.1518std_loss = 0.2963:   6%|▌         | 327/5475 [1:28:16<21:29:16, 15.03s/it]

ratio tensor(0.6295, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4074, device='cuda:0') reward: tensor(1.2591, device='cuda:0') factor: tensor(0.1483, device='cuda:0')
reward_raw: tensor(0.3942, device='cuda:0') reward: tensor(1.2359, device='cuda:0') factor: tensor(0.1583, device='cuda:0')
reward_raw: tensor(0.4083, device='cuda:0') reward: tensor(1.2579, device='cuda:0') factor: tensor(0.1504, device='cuda:0')
reward_raw: tensor(0.3900, device='cuda:0') reward: tensor(1.2376, device='cuda:0') factor: tensor(0.1524, device='cuda:0')
reward_raw: tensor(0.4049, device='cuda:0') reward: tensor(1.2457, device='cuda:0') factor: tensor(0.1592, device='cuda:0')


Epoch 1, Step 328: Loss = 0.2988, Actor Loss = 0.2059, Critic Loss = 0.1859, Entropy = 0.1427, Returns = 1.1633, Value = 1.2758, mask_loss = 0.1863std_loss = 0.3209:   6%|▌         | 328/5475 [1:28:37<24:04:40, 16.84s/it]

ratio tensor(1.0216, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5609, device='cuda:0') reward: tensor(1.3953, device='cuda:0') factor: tensor(0.1656, device='cuda:0')
reward_raw: tensor(0.5603, device='cuda:0') reward: tensor(1.3813, device='cuda:0') factor: tensor(0.1790, device='cuda:0')
reward_raw: tensor(0.5527, device='cuda:0') reward: tensor(1.3757, device='cuda:0') factor: tensor(0.1770, device='cuda:0')
reward_raw: tensor(0.5643, device='cuda:0') reward: tensor(1.3913, device='cuda:0') factor: tensor(0.1730, device='cuda:0')
reward_raw: tensor(0.5364, device='cuda:0') reward: tensor(1.3641, device='cuda:0') factor: tensor(0.1723, device='cuda:0')


Epoch 1, Step 329: Loss = -0.1083, Actor Loss = -0.1359, Critic Loss = 0.0554, Entropy = 0.1183, Returns = 1.3673, Value = 1.4473, mask_loss = 0.1454std_loss = 0.2983:   6%|▌         | 329/5475 [1:28:52<23:07:41, 16.18s/it]

ratio tensor(0.9977, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5287, device='cuda:0') reward: tensor(1.3715, device='cuda:0') factor: tensor(0.1572, device='cuda:0')
reward_raw: tensor(0.5296, device='cuda:0') reward: tensor(1.3643, device='cuda:0') factor: tensor(0.1653, device='cuda:0')
reward_raw: tensor(0.5413, device='cuda:0') reward: tensor(1.3786, device='cuda:0') factor: tensor(0.1627, device='cuda:0')
reward_raw: tensor(0.5423, device='cuda:0') reward: tensor(1.3780, device='cuda:0') factor: tensor(0.1643, device='cuda:0')
reward_raw: tensor(0.5103, device='cuda:0') reward: tensor(1.3560, device='cuda:0') factor: tensor(0.1543, device='cuda:0')


Epoch 1, Step 330: Loss = 0.1232, Actor Loss = 0.0815, Critic Loss = 0.0836, Entropy = 0.1300, Returns = 1.4350, Value = 1.4254, mask_loss = 0.1745std_loss = 0.3248:   6%|▌         | 330/5475 [1:29:08<23:18:25, 16.31s/it]  

ratio tensor(1.0874, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4578, device='cuda:0') reward: tensor(1.3094, device='cuda:0') factor: tensor(0.1484, device='cuda:0')
reward_raw: tensor(0.4527, device='cuda:0') reward: tensor(1.3037, device='cuda:0') factor: tensor(0.1490, device='cuda:0')
reward_raw: tensor(0.4379, device='cuda:0') reward: tensor(1.2937, device='cuda:0') factor: tensor(0.1442, device='cuda:0')
reward_raw: tensor(0.4609, device='cuda:0') reward: tensor(1.3143, device='cuda:0') factor: tensor(0.1465, device='cuda:0')
reward_raw: tensor(0.4771, device='cuda:0') reward: tensor(1.3279, device='cuda:0') factor: tensor(0.1492, device='cuda:0')


Epoch 1, Step 331: Loss = 0.1764, Actor Loss = 0.0686, Critic Loss = 0.2159, Entropy = 0.0989, Returns = 1.3403, Value = 1.1017, mask_loss = 0.1398std_loss = 0.3016:   6%|▌         | 331/5475 [1:29:24<22:47:18, 15.95s/it]

ratio tensor(0.9438, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5436, device='cuda:0') reward: tensor(1.4050, device='cuda:0') factor: tensor(0.1386, device='cuda:0')
reward_raw: tensor(0.5450, device='cuda:0') reward: tensor(1.4077, device='cuda:0') factor: tensor(0.1373, device='cuda:0')
reward_raw: tensor(0.5493, device='cuda:0') reward: tensor(1.4084, device='cuda:0') factor: tensor(0.1408, device='cuda:0')
reward_raw: tensor(0.5257, device='cuda:0') reward: tensor(1.3865, device='cuda:0') factor: tensor(0.1392, device='cuda:0')
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.3961, device='cuda:0') factor: tensor(0.1291, device='cuda:0')


Epoch 1, Step 332: Loss = -0.0076, Actor Loss = -0.0975, Critic Loss = 0.1799, Entropy = 0.0976, Returns = 1.4173, Value = 1.5877, mask_loss = 0.1202std_loss = 0.2796:   6%|▌         | 332/5475 [1:29:41<23:37:42, 16.54s/it]

ratio tensor(1.1090, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4962, device='cuda:0') reward: tensor(1.3530, device='cuda:0') factor: tensor(0.1432, device='cuda:0')
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.3526, device='cuda:0') factor: tensor(0.1393, device='cuda:0')
reward_raw: tensor(0.4707, device='cuda:0') reward: tensor(1.3334, device='cuda:0') factor: tensor(0.1373, device='cuda:0')
reward_raw: tensor(0.4909, device='cuda:0') reward: tensor(1.3561, device='cuda:0') factor: tensor(0.1348, device='cuda:0')
reward_raw: tensor(0.4924, device='cuda:0') reward: tensor(1.3553, device='cuda:0') factor: tensor(0.1371, device='cuda:0')


Epoch 1, Step 333: Loss = 0.1222, Actor Loss = 0.0774, Critic Loss = 0.0897, Entropy = 0.0962, Returns = 1.4028, Value = 1.3260, mask_loss = 0.1197std_loss = 0.2794:   6%|▌         | 333/5475 [1:29:57<23:16:27, 16.29s/it]  

ratio tensor(1.0175, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4883, device='cuda:0') reward: tensor(1.3654, device='cuda:0') factor: tensor(0.1230, device='cuda:0')
reward_raw: tensor(0.5031, device='cuda:0') reward: tensor(1.3786, device='cuda:0') factor: tensor(0.1245, device='cuda:0')
reward_raw: tensor(0.5139, device='cuda:0') reward: tensor(1.3901, device='cuda:0') factor: tensor(0.1238, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3696, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.5121, device='cuda:0') reward: tensor(1.3828, device='cuda:0') factor: tensor(0.1292, device='cuda:0')


Epoch 1, Step 334: Loss = -0.0756, Actor Loss = -0.1034, Critic Loss = 0.0558, Entropy = 0.0890, Returns = 1.3266, Value = 1.3272, mask_loss = 0.1211std_loss = 0.2840:   6%|▌         | 334/5475 [1:30:14<23:25:04, 16.40s/it]

ratio tensor(1.4061, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4341, device='cuda:0') reward: tensor(1.2820, device='cuda:0') factor: tensor(0.1521, device='cuda:0')
reward_raw: tensor(0.4455, device='cuda:0') reward: tensor(1.2997, device='cuda:0') factor: tensor(0.1458, device='cuda:0')
reward_raw: tensor(0.4348, device='cuda:0') reward: tensor(1.2845, device='cuda:0') factor: tensor(0.1503, device='cuda:0')
reward_raw: tensor(0.4254, device='cuda:0') reward: tensor(1.2741, device='cuda:0') factor: tensor(0.1513, device='cuda:0')
reward_raw: tensor(0.4382, device='cuda:0') reward: tensor(1.2879, device='cuda:0') factor: tensor(0.1502, device='cuda:0')


Epoch 1, Step 335: Loss = 0.1769, Actor Loss = 0.1414, Critic Loss = 0.0712, Entropy = 0.1010, Returns = 1.2346, Value = 1.2626, mask_loss = 0.1550std_loss = 0.3141:   6%|▌         | 335/5475 [1:30:25<21:18:30, 14.92s/it]  

ratio tensor(0.9176, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4292, device='cuda:0') reward: tensor(1.2925, device='cuda:0') factor: tensor(0.1367, device='cuda:0')
reward_raw: tensor(0.4192, device='cuda:0') reward: tensor(1.2772, device='cuda:0') factor: tensor(0.1420, device='cuda:0')
reward_raw: tensor(0.4213, device='cuda:0') reward: tensor(1.2819, device='cuda:0') factor: tensor(0.1393, device='cuda:0')
reward_raw: tensor(0.4204, device='cuda:0') reward: tensor(1.2837, device='cuda:0') factor: tensor(0.1367, device='cuda:0')
reward_raw: tensor(0.4382, device='cuda:0') reward: tensor(1.2982, device='cuda:0') factor: tensor(0.1400, device='cuda:0')


Epoch 1, Step 336: Loss = -0.1343, Actor Loss = -0.2121, Critic Loss = 0.1558, Entropy = 0.1133, Returns = 1.2528, Value = 1.1159, mask_loss = 0.1580std_loss = 0.3175:   6%|▌         | 336/5475 [1:30:40<21:24:41, 15.00s/it]

ratio tensor(0.8815, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4682, device='cuda:0') reward: tensor(1.3249, device='cuda:0') factor: tensor(0.1433, device='cuda:0')
reward_raw: tensor(0.4342, device='cuda:0') reward: tensor(1.2879, device='cuda:0') factor: tensor(0.1463, device='cuda:0')
reward_raw: tensor(0.4280, device='cuda:0') reward: tensor(1.2796, device='cuda:0') factor: tensor(0.1484, device='cuda:0')
reward_raw: tensor(0.4242, device='cuda:0') reward: tensor(1.2793, device='cuda:0') factor: tensor(0.1449, device='cuda:0')
reward_raw: tensor(0.4572, device='cuda:0') reward: tensor(1.3127, device='cuda:0') factor: tensor(0.1445, device='cuda:0')


Epoch 1, Step 337: Loss = 0.0167, Actor Loss = -0.0261, Critic Loss = 0.0856, Entropy = 0.1006, Returns = 1.2621, Value = 1.4049, mask_loss = 0.1253std_loss = 0.2844:   6%|▌         | 337/5475 [1:30:54<20:55:52, 14.67s/it] 

ratio tensor(0.9730, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4380, device='cuda:0') reward: tensor(1.3067, device='cuda:0') factor: tensor(0.1313, device='cuda:0')
reward_raw: tensor(0.4420, device='cuda:0') reward: tensor(1.3092, device='cuda:0') factor: tensor(0.1328, device='cuda:0')
reward_raw: tensor(0.4343, device='cuda:0') reward: tensor(1.2932, device='cuda:0') factor: tensor(0.1411, device='cuda:0')
reward_raw: tensor(0.4646, device='cuda:0') reward: tensor(1.3239, device='cuda:0') factor: tensor(0.1407, device='cuda:0')
reward_raw: tensor(0.4560, device='cuda:0') reward: tensor(1.3230, device='cuda:0') factor: tensor(0.1330, device='cuda:0')


Epoch 1, Step 338: Loss = 0.2142, Actor Loss = 0.1908, Critic Loss = 0.0470, Entropy = 0.1200, Returns = 1.2926, Value = 1.2657, mask_loss = 0.1346std_loss = 0.2805:   6%|▌         | 338/5475 [1:31:10<21:08:30, 14.82s/it] 

ratio tensor(0.5941, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4683, device='cuda:0') reward: tensor(1.3441, device='cuda:0') factor: tensor(0.1242, device='cuda:0')
reward_raw: tensor(0.4655, device='cuda:0') reward: tensor(1.3498, device='cuda:0') factor: tensor(0.1157, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3673, device='cuda:0') factor: tensor(0.1199, device='cuda:0')
reward_raw: tensor(0.4633, device='cuda:0') reward: tensor(1.3404, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.4459, device='cuda:0') reward: tensor(1.3255, device='cuda:0') factor: tensor(0.1204, device='cuda:0')


Epoch 1, Step 339: Loss = -0.0779, Actor Loss = -0.1089, Critic Loss = 0.0621, Entropy = 0.0912, Returns = 1.3226, Value = 1.3514, mask_loss = 0.1087std_loss = 0.2700:   6%|▌         | 339/5475 [1:31:25<21:36:10, 15.14s/it]

ratio tensor(5.6108, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5217, device='cuda:0') reward: tensor(1.3572, device='cuda:0') factor: tensor(0.1645, device='cuda:0')
reward_raw: tensor(0.5370, device='cuda:0') reward: tensor(1.3719, device='cuda:0') factor: tensor(0.1651, device='cuda:0')
reward_raw: tensor(0.5698, device='cuda:0') reward: tensor(1.4027, device='cuda:0') factor: tensor(0.1671, device='cuda:0')
reward_raw: tensor(0.5392, device='cuda:0') reward: tensor(1.3805, device='cuda:0') factor: tensor(0.1588, device='cuda:0')
reward_raw: tensor(0.5636, device='cuda:0') reward: tensor(1.3960, device='cuda:0') factor: tensor(0.1676, device='cuda:0')


Epoch 1, Step 340: Loss = 0.0374, Actor Loss = -0.0437, Critic Loss = 0.1623, Entropy = 0.1131, Returns = 1.4099, Value = 1.4522, mask_loss = 0.1567std_loss = 0.3140:   6%|▌         | 340/5475 [1:31:38<20:29:16, 14.36s/it] 

ratio tensor(1.0301, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4819, device='cuda:0') reward: tensor(1.3607, device='cuda:0') factor: tensor(0.1212, device='cuda:0')
reward_raw: tensor(0.5004, device='cuda:0') reward: tensor(1.3724, device='cuda:0') factor: tensor(0.1279, device='cuda:0')
reward_raw: tensor(0.4917, device='cuda:0') reward: tensor(1.3689, device='cuda:0') factor: tensor(0.1228, device='cuda:0')
reward_raw: tensor(0.4713, device='cuda:0') reward: tensor(1.3527, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.4787, device='cuda:0') reward: tensor(1.3508, device='cuda:0') factor: tensor(0.1279, device='cuda:0')


Epoch 1, Step 341: Loss = 0.3746, Actor Loss = 0.2882, Critic Loss = 0.1730, Entropy = 0.0966, Returns = 1.2858, Value = 1.4962, mask_loss = 0.1239std_loss = 0.2851:   6%|▌         | 340/5475 [1:31:53<20:29:16, 14.36s/it] 

ratio tensor(1.0933, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 341: Loss = 0.3746, Actor Loss = 0.2882, Critic Loss = 0.1730, Entropy = 0.0966, Returns = 1.2858, Value = 1.4962, mask_loss = 0.1239std_loss = 0.2851:   6%|▌         | 341/5475 [1:31:54<21:03:54, 14.77s/it]

reward_raw: tensor(0.4488, device='cuda:0') reward: tensor(1.3151, device='cuda:0') factor: tensor(0.1337, device='cuda:0')
reward_raw: tensor(0.4412, device='cuda:0') reward: tensor(1.3132, device='cuda:0') factor: tensor(0.1280, device='cuda:0')
reward_raw: tensor(0.4394, device='cuda:0') reward: tensor(1.3107, device='cuda:0') factor: tensor(0.1287, device='cuda:0')
reward_raw: tensor(0.4371, device='cuda:0') reward: tensor(1.3099, device='cuda:0') factor: tensor(0.1272, device='cuda:0')
reward_raw: tensor(0.4356, device='cuda:0') reward: tensor(1.3035, device='cuda:0') factor: tensor(0.1321, device='cuda:0')


Epoch 1, Step 342: Loss = -0.0370, Actor Loss = -0.0980, Critic Loss = 0.1221, Entropy = 0.1086, Returns = 1.3163, Value = 1.2239, mask_loss = 0.1180std_loss = 0.2733:   6%|▌         | 342/5475 [1:32:13<23:06:54, 16.21s/it]

ratio tensor(1.1519, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.3442, device='cuda:0') factor: tensor(0.1415, device='cuda:0')
reward_raw: tensor(0.4986, device='cuda:0') reward: tensor(1.3580, device='cuda:0') factor: tensor(0.1405, device='cuda:0')
reward_raw: tensor(0.4974, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.1442, device='cuda:0')
reward_raw: tensor(0.4958, device='cuda:0') reward: tensor(1.3555, device='cuda:0') factor: tensor(0.1403, device='cuda:0')
reward_raw: tensor(0.4833, device='cuda:0') reward: tensor(1.3455, device='cuda:0') factor: tensor(0.1378, device='cuda:0')


Epoch 1, Step 343: Loss = 0.2154, Actor Loss = 0.1623, Critic Loss = 0.1065, Entropy = 0.1204, Returns = 1.2948, Value = 1.4086, mask_loss = 0.1657std_loss = 0.3060:   6%|▋         | 343/5475 [1:32:26<21:41:25, 15.22s/it]  

ratio tensor(0.7004, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5055, device='cuda:0') reward: tensor(1.3500, device='cuda:0') factor: tensor(0.1555, device='cuda:0')
reward_raw: tensor(0.5070, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.1545, device='cuda:0')
reward_raw: tensor(0.5054, device='cuda:0') reward: tensor(1.3540, device='cuda:0') factor: tensor(0.1514, device='cuda:0')
reward_raw: tensor(0.5096, device='cuda:0') reward: tensor(1.3582, device='cuda:0') factor: tensor(0.1514, device='cuda:0')
reward_raw: tensor(0.5162, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.1542, device='cuda:0')


Epoch 1, Step 344: Loss = 0.0000, Actor Loss = -0.0305, Critic Loss = 0.0612, Entropy = 0.0901, Returns = 1.2852, Value = 1.3548, mask_loss = 0.1653std_loss = 0.3251:   6%|▋         | 344/5475 [1:32:41<21:41:19, 15.22s/it]

ratio tensor(1.0771, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.3357, device='cuda:0') factor: tensor(0.1488, device='cuda:0')
reward_raw: tensor(0.4552, device='cuda:0') reward: tensor(1.3038, device='cuda:0') factor: tensor(0.1514, device='cuda:0')
reward_raw: tensor(0.4467, device='cuda:0') reward: tensor(1.2983, device='cuda:0') factor: tensor(0.1485, device='cuda:0')
reward_raw: tensor(0.4433, device='cuda:0') reward: tensor(1.2938, device='cuda:0') factor: tensor(0.1495, device='cuda:0')
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.3467, device='cuda:0') factor: tensor(0.1562, device='cuda:0')


Epoch 1, Step 345: Loss = -0.0984, Actor Loss = -0.1226, Critic Loss = 0.0485, Entropy = 0.0986, Returns = 1.4132, Value = 1.4279, mask_loss = 0.1484std_loss = 0.3147:   6%|▋         | 345/5475 [1:33:03<24:16:40, 17.04s/it]

ratio tensor(1.3064, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.3489, device='cuda:0') factor: tensor(0.1683, device='cuda:0')
reward_raw: tensor(0.5161, device='cuda:0') reward: tensor(1.3454, device='cuda:0') factor: tensor(0.1707, device='cuda:0')
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.3449, device='cuda:0') factor: tensor(0.1748, device='cuda:0')
reward_raw: tensor(0.5126, device='cuda:0') reward: tensor(1.3437, device='cuda:0') factor: tensor(0.1689, device='cuda:0')
reward_raw: tensor(0.5167, device='cuda:0') reward: tensor(1.3521, device='cuda:0') factor: tensor(0.1646, device='cuda:0')


Epoch 1, Step 346: Loss = 0.2222, Actor Loss = 0.1818, Critic Loss = 0.0809, Entropy = 0.1241, Returns = 1.3415, Value = 1.2901, mask_loss = 0.1810std_loss = 0.3321:   6%|▋         | 346/5475 [1:33:23<25:49:58, 18.13s/it]  

ratio tensor(1.0322, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5424, device='cuda:0') reward: tensor(1.3989, device='cuda:0') factor: tensor(0.1435, device='cuda:0')
reward_raw: tensor(0.5119, device='cuda:0') reward: tensor(1.3715, device='cuda:0') factor: tensor(0.1403, device='cuda:0')
reward_raw: tensor(0.5135, device='cuda:0') reward: tensor(1.3731, device='cuda:0') factor: tensor(0.1404, device='cuda:0')
reward_raw: tensor(0.5263, device='cuda:0') reward: tensor(1.3881, device='cuda:0') factor: tensor(0.1383, device='cuda:0')
reward_raw: tensor(0.5302, device='cuda:0') reward: tensor(1.3911, device='cuda:0') factor: tensor(0.1391, device='cuda:0')


Epoch 1, Step 347: Loss = -0.0718, Actor Loss = -0.1057, Critic Loss = 0.0680, Entropy = 0.1145, Returns = 1.3081, Value = 1.2963, mask_loss = 0.1388std_loss = 0.2969:   6%|▋         | 347/5475 [1:33:41<25:46:23, 18.09s/it]

ratio tensor(1.9415, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4600, device='cuda:0') reward: tensor(1.3212, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.4598, device='cuda:0') reward: tensor(1.3215, device='cuda:0') factor: tensor(0.1383, device='cuda:0')
reward_raw: tensor(0.4613, device='cuda:0') reward: tensor(1.3249, device='cuda:0') factor: tensor(0.1365, device='cuda:0')
reward_raw: tensor(0.4298, device='cuda:0') reward: tensor(1.2911, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.4683, device='cuda:0') reward: tensor(1.3265, device='cuda:0') factor: tensor(0.1417, device='cuda:0')


Epoch 1, Step 348: Loss = -0.0971, Actor Loss = -0.1372, Critic Loss = 0.0805, Entropy = 0.1334, Returns = 1.3586, Value = 1.3855, mask_loss = 0.1533std_loss = 0.3011:   6%|▋         | 348/5475 [1:33:57<24:51:35, 17.46s/it]

ratio tensor(1.4302, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5613, device='cuda:0') reward: tensor(1.4133, device='cuda:0') factor: tensor(0.1480, device='cuda:0')
reward_raw: tensor(0.5752, device='cuda:0') reward: tensor(1.4369, device='cuda:0') factor: tensor(0.1383, device='cuda:0')
reward_raw: tensor(0.5730, device='cuda:0') reward: tensor(1.4251, device='cuda:0') factor: tensor(0.1480, device='cuda:0')
reward_raw: tensor(0.5517, device='cuda:0') reward: tensor(1.4044, device='cuda:0') factor: tensor(0.1473, device='cuda:0')
reward_raw: tensor(0.5489, device='cuda:0') reward: tensor(1.4017, device='cuda:0') factor: tensor(0.1472, device='cuda:0')


Epoch 1, Step 349: Loss = 0.1307, Actor Loss = 0.0326, Critic Loss = 0.1965, Entropy = 0.1154, Returns = 1.4018, Value = 1.3083, mask_loss = 0.1438std_loss = 0.3042:   6%|▋         | 349/5475 [1:34:12<23:33:52, 16.55s/it]  

ratio tensor(0.9160, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5120, device='cuda:0') reward: tensor(1.3498, device='cuda:0') factor: tensor(0.1622, device='cuda:0')
reward_raw: tensor(0.5320, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.1627, device='cuda:0')
reward_raw: tensor(0.5206, device='cuda:0') reward: tensor(1.3650, device='cuda:0') factor: tensor(0.1556, device='cuda:0')
reward_raw: tensor(0.5173, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.1579, device='cuda:0')
reward_raw: tensor(0.4975, device='cuda:0') reward: tensor(1.3323, device='cuda:0') factor: tensor(0.1652, device='cuda:0')


Epoch 1, Step 350: Loss = -0.0593, Actor Loss = -0.0945, Critic Loss = 0.0708, Entropy = 0.1267, Returns = 1.4520, Value = 1.3871, mask_loss = 0.1536std_loss = 0.3089:   6%|▋         | 350/5475 [1:34:30<24:09:02, 16.96s/it]

ratio tensor(1.2921, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5118, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.1364, device='cuda:0')
reward_raw: tensor(0.4990, device='cuda:0') reward: tensor(1.3582, device='cuda:0') factor: tensor(0.1408, device='cuda:0')
reward_raw: tensor(0.5153, device='cuda:0') reward: tensor(1.3792, device='cuda:0') factor: tensor(0.1361, device='cuda:0')
reward_raw: tensor(0.5132, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.1350, device='cuda:0')
reward_raw: tensor(0.5038, device='cuda:0') reward: tensor(1.3668, device='cuda:0') factor: tensor(0.1371, device='cuda:0')


Epoch 1, Step 351: Loss = -0.0676, Actor Loss = -0.0860, Critic Loss = 0.0371, Entropy = 0.1141, Returns = 1.3594, Value = 1.3718, mask_loss = 0.1407std_loss = 0.2967:   6%|▋         | 351/5475 [1:34:42<22:19:23, 15.68s/it]

ratio tensor(1.0894, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5657, device='cuda:0') reward: tensor(1.4357, device='cuda:0') factor: tensor(0.1300, device='cuda:0')
reward_raw: tensor(0.5970, device='cuda:0') reward: tensor(1.4576, device='cuda:0') factor: tensor(0.1394, device='cuda:0')
reward_raw: tensor(0.5753, device='cuda:0') reward: tensor(1.4442, device='cuda:0') factor: tensor(0.1311, device='cuda:0')
reward_raw: tensor(0.5864, device='cuda:0') reward: tensor(1.4479, device='cuda:0') factor: tensor(0.1385, device='cuda:0')
reward_raw: tensor(0.5785, device='cuda:0') reward: tensor(1.4413, device='cuda:0') factor: tensor(0.1372, device='cuda:0')


Epoch 1, Step 352: Loss = 0.0238, Actor Loss = -0.0565, Critic Loss = 0.1607, Entropy = 0.1043, Returns = 1.4500, Value = 1.5959, mask_loss = 0.1209std_loss = 0.2802:   6%|▋         | 352/5475 [1:34:56<21:25:12, 15.05s/it] 

ratio tensor(1.2619, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4751, device='cuda:0') reward: tensor(1.2890, device='cuda:0') factor: tensor(0.1861, device='cuda:0')
reward_raw: tensor(0.4658, device='cuda:0') reward: tensor(1.2695, device='cuda:0') factor: tensor(0.1963, device='cuda:0')
reward_raw: tensor(0.4746, device='cuda:0') reward: tensor(1.2915, device='cuda:0') factor: tensor(0.1832, device='cuda:0')
reward_raw: tensor(0.4879, device='cuda:0') reward: tensor(1.3027, device='cuda:0') factor: tensor(0.1852, device='cuda:0')
reward_raw: tensor(0.4400, device='cuda:0') reward: tensor(1.2541, device='cuda:0') factor: tensor(0.1859, device='cuda:0')


Epoch 1, Step 353: Loss = 0.2386, Actor Loss = 0.1979, Critic Loss = 0.0817, Entropy = 0.1498, Returns = 1.3072, Value = 1.2651, mask_loss = 0.1893std_loss = 0.3153:   6%|▋         | 353/5475 [1:35:07<19:40:37, 13.83s/it] 

ratio tensor(0.7208, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.3277, device='cuda:0') factor: tensor(0.1957, device='cuda:0')
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.3114, device='cuda:0') factor: tensor(0.2022, device='cuda:0')
reward_raw: tensor(0.4865, device='cuda:0') reward: tensor(1.2960, device='cuda:0') factor: tensor(0.1905, device='cuda:0')
reward_raw: tensor(0.5182, device='cuda:0') reward: tensor(1.3146, device='cuda:0') factor: tensor(0.2036, device='cuda:0')
reward_raw: tensor(0.5123, device='cuda:0') reward: tensor(1.3132, device='cuda:0') factor: tensor(0.1992, device='cuda:0')


Epoch 1, Step 354: Loss = -0.0123, Actor Loss = -0.0729, Critic Loss = 0.1214, Entropy = 0.1590, Returns = 1.3192, Value = 1.4324, mask_loss = 0.2157std_loss = 0.3396:   6%|▋         | 354/5475 [1:35:18<18:36:12, 13.08s/it]

ratio tensor(1.1976, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5051, device='cuda:0') reward: tensor(1.3823, device='cuda:0') factor: tensor(0.1228, device='cuda:0')
reward_raw: tensor(0.5123, device='cuda:0') reward: tensor(1.3871, device='cuda:0') factor: tensor(0.1252, device='cuda:0')
reward_raw: tensor(0.5052, device='cuda:0') reward: tensor(1.3760, device='cuda:0') factor: tensor(0.1293, device='cuda:0')
reward_raw: tensor(0.5184, device='cuda:0') reward: tensor(1.3898, device='cuda:0') factor: tensor(0.1286, device='cuda:0')
reward_raw: tensor(0.5167, device='cuda:0') reward: tensor(1.3907, device='cuda:0') factor: tensor(0.1260, device='cuda:0')


Epoch 1, Step 355: Loss = 0.2860, Actor Loss = 0.2433, Critic Loss = 0.0858, Entropy = 0.1199, Returns = 1.3665, Value = 1.3894, mask_loss = 0.1259std_loss = 0.2782:   6%|▋         | 355/5475 [1:35:35<20:21:26, 14.31s/it]  

ratio tensor(0.9851, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6168, device='cuda:0') reward: tensor(1.4558, device='cuda:0') factor: tensor(0.1610, device='cuda:0')
reward_raw: tensor(0.6005, device='cuda:0') reward: tensor(1.4277, device='cuda:0') factor: tensor(0.1728, device='cuda:0')
reward_raw: tensor(0.6027, device='cuda:0') reward: tensor(1.4319, device='cuda:0') factor: tensor(0.1708, device='cuda:0')
reward_raw: tensor(0.5994, device='cuda:0') reward: tensor(1.4298, device='cuda:0') factor: tensor(0.1696, device='cuda:0')
reward_raw: tensor(0.6132, device='cuda:0') reward: tensor(1.4440, device='cuda:0') factor: tensor(0.1691, device='cuda:0')


Epoch 1, Step 356: Loss = -0.2273, Actor Loss = -0.2672, Critic Loss = 0.0801, Entropy = 0.1301, Returns = 1.4927, Value = 1.4733, mask_loss = 0.1363std_loss = 0.2847:   7%|▋         | 356/5475 [1:35:51<20:46:08, 14.61s/it]

ratio tensor(1.5042, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5223, device='cuda:0') reward: tensor(1.3912, device='cuda:0') factor: tensor(0.1310, device='cuda:0')
reward_raw: tensor(0.5217, device='cuda:0') reward: tensor(1.3892, device='cuda:0') factor: tensor(0.1325, device='cuda:0')
reward_raw: tensor(0.5387, device='cuda:0') reward: tensor(1.4099, device='cuda:0') factor: tensor(0.1288, device='cuda:0')
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.3923, device='cuda:0') factor: tensor(0.1329, device='cuda:0')
reward_raw: tensor(0.5451, device='cuda:0') reward: tensor(1.4046, device='cuda:0') factor: tensor(0.1405, device='cuda:0')


Epoch 1, Step 357: Loss = -0.0604, Actor Loss = -0.1009, Critic Loss = 0.0813, Entropy = 0.1012, Returns = 1.5078, Value = 1.5775, mask_loss = 0.1173std_loss = 0.2764:   7%|▋         | 357/5475 [1:36:06<20:55:50, 14.72s/it]

ratio tensor(1.3861, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4947, device='cuda:0') reward: tensor(1.3487, device='cuda:0') factor: tensor(0.1459, device='cuda:0')
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.1444, device='cuda:0')
reward_raw: tensor(0.5099, device='cuda:0') reward: tensor(1.3644, device='cuda:0') factor: tensor(0.1455, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.3637, device='cuda:0') factor: tensor(0.1491, device='cuda:0')
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.3514, device='cuda:0') factor: tensor(0.1481, device='cuda:0')


Epoch 1, Step 358: Loss = 0.2637, Actor Loss = 0.2229, Critic Loss = 0.0818, Entropy = 0.1036, Returns = 1.3498, Value = 1.2516, mask_loss = 0.1297std_loss = 0.2796:   7%|▋         | 358/5475 [1:36:23<21:49:51, 15.36s/it]  

ratio tensor(0.6298, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.3488, device='cuda:0') factor: tensor(0.1622, device='cuda:0')
reward_raw: tensor(0.4849, device='cuda:0') reward: tensor(1.3217, device='cuda:0') factor: tensor(0.1632, device='cuda:0')
reward_raw: tensor(0.4792, device='cuda:0') reward: tensor(1.3069, device='cuda:0') factor: tensor(0.1723, device='cuda:0')
reward_raw: tensor(0.4962, device='cuda:0') reward: tensor(1.3258, device='cuda:0') factor: tensor(0.1704, device='cuda:0')
reward_raw: tensor(0.4775, device='cuda:0') reward: tensor(1.3103, device='cuda:0') factor: tensor(0.1672, device='cuda:0')


Epoch 1, Step 359: Loss = -0.0792, Actor Loss = -0.1170, Critic Loss = 0.0759, Entropy = 0.1147, Returns = 1.3203, Value = 1.2488, mask_loss = 0.1421std_loss = 0.2955:   7%|▋         | 359/5475 [1:36:38<21:43:49, 15.29s/it]

ratio tensor(1.6671, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5561, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.1610, device='cuda:0')
reward_raw: tensor(0.5348, device='cuda:0') reward: tensor(1.3686, device='cuda:0') factor: tensor(0.1663, device='cuda:0')
reward_raw: tensor(0.5333, device='cuda:0') reward: tensor(1.3726, device='cuda:0') factor: tensor(0.1607, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.3760, device='cuda:0') factor: tensor(0.1657, device='cuda:0')
reward_raw: tensor(0.5317, device='cuda:0') reward: tensor(1.3732, device='cuda:0') factor: tensor(0.1586, device='cuda:0')


Epoch 1, Step 360: Loss = 0.0546, Actor Loss = -0.0168, Critic Loss = 0.1429, Entropy = 0.1026, Returns = 1.3761, Value = 1.5915, mask_loss = 0.1622std_loss = 0.3233:   7%|▋         | 360/5475 [1:36:54<22:01:01, 15.50s/it] 

ratio tensor(0.9524, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.3579, device='cuda:0') factor: tensor(0.1616, device='cuda:0')
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.1573, device='cuda:0')
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.3600, device='cuda:0') factor: tensor(0.1555, device='cuda:0')
reward_raw: tensor(0.4897, device='cuda:0') reward: tensor(1.3298, device='cuda:0') factor: tensor(0.1599, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3556, device='cuda:0') factor: tensor(0.1538, device='cuda:0')


Epoch 1, Step 361: Loss = 0.3605, Actor Loss = 0.2702, Critic Loss = 0.1809, Entropy = 0.1328, Returns = 1.3267, Value = 1.5643, mask_loss = 0.1775std_loss = 0.3259:   7%|▋         | 360/5475 [1:37:15<22:01:01, 15.50s/it] 

ratio tensor(0.5864, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 361: Loss = 0.3605, Actor Loss = 0.2702, Critic Loss = 0.1809, Entropy = 0.1328, Returns = 1.3267, Value = 1.5643, mask_loss = 0.1775std_loss = 0.3259:   7%|▋         | 361/5475 [1:37:15<24:37:32, 17.34s/it]

reward_raw: tensor(0.4747, device='cuda:0') reward: tensor(1.3117, device='cuda:0') factor: tensor(0.1630, device='cuda:0')
reward_raw: tensor(0.4959, device='cuda:0') reward: tensor(1.3335, device='cuda:0') factor: tensor(0.1624, device='cuda:0')
reward_raw: tensor(0.4838, device='cuda:0') reward: tensor(1.3222, device='cuda:0') factor: tensor(0.1615, device='cuda:0')
reward_raw: tensor(0.4835, device='cuda:0') reward: tensor(1.3176, device='cuda:0') factor: tensor(0.1660, device='cuda:0')
reward_raw: tensor(0.4873, device='cuda:0') reward: tensor(1.3282, device='cuda:0') factor: tensor(0.1591, device='cuda:0')


Epoch 1, Step 362: Loss = 0.0385, Actor Loss = -0.1081, Critic Loss = 0.2934, Entropy = 0.1111, Returns = 1.3605, Value = 1.0680, mask_loss = 0.1671std_loss = 0.3203:   7%|▋         | 362/5475 [1:37:33<24:46:43, 17.45s/it]

ratio tensor(0.8972, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4643, device='cuda:0') reward: tensor(1.3199, device='cuda:0') factor: tensor(0.1443, device='cuda:0')
reward_raw: tensor(0.4940, device='cuda:0') reward: tensor(1.3477, device='cuda:0') factor: tensor(0.1464, device='cuda:0')
reward_raw: tensor(0.4961, device='cuda:0') reward: tensor(1.3497, device='cuda:0') factor: tensor(0.1464, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3473, device='cuda:0') factor: tensor(0.1470, device='cuda:0')
reward_raw: tensor(0.4774, device='cuda:0') reward: tensor(1.3252, device='cuda:0') factor: tensor(0.1522, device='cuda:0')


Epoch 1, Step 363: Loss = -0.0596, Actor Loss = -0.1001, Critic Loss = 0.0813, Entropy = 0.1149, Returns = 1.4005, Value = 1.4420, mask_loss = 0.1237std_loss = 0.2736:   7%|▋         | 363/5475 [1:37:50<24:35:23, 17.32s/it]

ratio tensor(1.5078, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5806, device='cuda:0') reward: tensor(1.4124, device='cuda:0') factor: tensor(0.1682, device='cuda:0')
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.3929, device='cuda:0') factor: tensor(0.1583, device='cuda:0')
reward_raw: tensor(0.5909, device='cuda:0') reward: tensor(1.4328, device='cuda:0') factor: tensor(0.1580, device='cuda:0')
reward_raw: tensor(0.5950, device='cuda:0') reward: tensor(1.4356, device='cuda:0') factor: tensor(0.1594, device='cuda:0')
reward_raw: tensor(0.5875, device='cuda:0') reward: tensor(1.4306, device='cuda:0') factor: tensor(0.1570, device='cuda:0')


Epoch 1, Step 364: Loss = -0.0193, Actor Loss = -0.0506, Critic Loss = 0.0628, Entropy = 0.1293, Returns = 1.4388, Value = 1.4823, mask_loss = 0.1550std_loss = 0.3030:   7%|▋         | 364/5475 [1:38:06<24:05:59, 16.98s/it]

ratio tensor(0.8657, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4834, device='cuda:0') reward: tensor(1.3261, device='cuda:0') factor: tensor(0.1573, device='cuda:0')
reward_raw: tensor(0.4870, device='cuda:0') reward: tensor(1.3271, device='cuda:0') factor: tensor(0.1599, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3243, device='cuda:0') factor: tensor(0.1536, device='cuda:0')
reward_raw: tensor(0.4878, device='cuda:0') reward: tensor(1.3216, device='cuda:0') factor: tensor(0.1662, device='cuda:0')
reward_raw: tensor(0.4866, device='cuda:0') reward: tensor(1.3309, device='cuda:0') factor: tensor(0.1557, device='cuda:0')


Epoch 1, Step 365: Loss = 0.0917, Actor Loss = 0.0467, Critic Loss = 0.0904, Entropy = 0.1624, Returns = 1.3561, Value = 1.2412, mask_loss = 0.1764std_loss = 0.3118:   7%|▋         | 365/5475 [1:38:24<24:25:20, 17.21s/it]  

ratio tensor(3.3390, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5747, device='cuda:0') reward: tensor(1.3987, device='cuda:0') factor: tensor(0.1760, device='cuda:0')
reward_raw: tensor(0.5611, device='cuda:0') reward: tensor(1.3860, device='cuda:0') factor: tensor(0.1751, device='cuda:0')
reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.3773, device='cuda:0') factor: tensor(0.1713, device='cuda:0')
reward_raw: tensor(0.5527, device='cuda:0') reward: tensor(1.3764, device='cuda:0') factor: tensor(0.1763, device='cuda:0')
reward_raw: tensor(0.5668, device='cuda:0') reward: tensor(1.3851, device='cuda:0') factor: tensor(0.1817, device='cuda:0')


Epoch 1, Step 366: Loss = -0.0273, Actor Loss = -0.0712, Critic Loss = 0.0880, Entropy = 0.1430, Returns = 1.5091, Value = 1.4506, mask_loss = 0.1651std_loss = 0.3095:   7%|▋         | 366/5475 [1:38:40<23:45:59, 16.75s/it]

ratio tensor(0.9330, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5641, device='cuda:0') reward: tensor(1.3254, device='cuda:0') factor: tensor(0.2388, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.3223, device='cuda:0') factor: tensor(0.2382, device='cuda:0')
reward_raw: tensor(0.5545, device='cuda:0') reward: tensor(1.3146, device='cuda:0') factor: tensor(0.2399, device='cuda:0')
reward_raw: tensor(0.5580, device='cuda:0') reward: tensor(1.3272, device='cuda:0') factor: tensor(0.2309, device='cuda:0')
reward_raw: tensor(0.5702, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.2407, device='cuda:0')


Epoch 1, Step 367: Loss = 0.1718, Actor Loss = 0.1244, Critic Loss = 0.0952, Entropy = 0.2050, Returns = 1.3683, Value = 1.3765, mask_loss = 0.2168std_loss = 0.3245:   7%|▋         | 367/5475 [1:38:56<23:31:31, 16.58s/it]  

ratio tensor(0.7826, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5361, device='cuda:0') reward: tensor(1.3213, device='cuda:0') factor: tensor(0.2148, device='cuda:0')
reward_raw: tensor(0.5319, device='cuda:0') reward: tensor(1.3198, device='cuda:0') factor: tensor(0.2121, device='cuda:0')
reward_raw: tensor(0.5623, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.2098, device='cuda:0')
reward_raw: tensor(0.5483, device='cuda:0') reward: tensor(1.3317, device='cuda:0') factor: tensor(0.2166, device='cuda:0')
reward_raw: tensor(0.5552, device='cuda:0') reward: tensor(1.3344, device='cuda:0') factor: tensor(0.2208, device='cuda:0')


Epoch 1, Step 368: Loss = -0.1204, Actor Loss = -0.1679, Critic Loss = 0.0953, Entropy = 0.2033, Returns = 1.3508, Value = 1.2383, mask_loss = 0.2431std_loss = 0.3457:   7%|▋         | 368/5475 [1:39:13<23:56:41, 16.88s/it]

ratio tensor(1.5248, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5116, device='cuda:0') reward: tensor(1.3413, device='cuda:0') factor: tensor(0.1704, device='cuda:0')
reward_raw: tensor(0.4815, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.1659, device='cuda:0')
reward_raw: tensor(0.5228, device='cuda:0') reward: tensor(1.3506, device='cuda:0') factor: tensor(0.1723, device='cuda:0')
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.3251, device='cuda:0') factor: tensor(0.1713, device='cuda:0')
reward_raw: tensor(0.4950, device='cuda:0') reward: tensor(1.3293, device='cuda:0') factor: tensor(0.1657, device='cuda:0')


Epoch 1, Step 369: Loss = -0.0436, Actor Loss = -0.1077, Critic Loss = 0.1285, Entropy = 0.1670, Returns = 1.3532, Value = 1.3242, mask_loss = 0.1566std_loss = 0.2918:   7%|▋         | 369/5475 [1:39:33<24:59:55, 17.63s/it]

ratio tensor(1.2631, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4312, device='cuda:0') reward: tensor(1.2823, device='cuda:0') factor: tensor(0.1490, device='cuda:0')
reward_raw: tensor(0.4750, device='cuda:0') reward: tensor(1.3275, device='cuda:0') factor: tensor(0.1475, device='cuda:0')
reward_raw: tensor(0.4323, device='cuda:0') reward: tensor(1.2831, device='cuda:0') factor: tensor(0.1492, device='cuda:0')
reward_raw: tensor(0.4405, device='cuda:0') reward: tensor(1.2910, device='cuda:0') factor: tensor(0.1495, device='cuda:0')
reward_raw: tensor(0.4470, device='cuda:0') reward: tensor(1.3044, device='cuda:0') factor: tensor(0.1426, device='cuda:0')


Epoch 1, Step 370: Loss = 0.0178, Actor Loss = -0.0444, Critic Loss = 0.1246, Entropy = 0.1230, Returns = 1.3006, Value = 1.2664, mask_loss = 0.1287std_loss = 0.2806:   7%|▋         | 370/5475 [1:39:52<25:31:19, 18.00s/it] 

ratio tensor(1.0600, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4653, device='cuda:0') reward: tensor(1.2645, device='cuda:0') factor: tensor(0.2009, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.2780, device='cuda:0') factor: tensor(0.2062, device='cuda:0')
reward_raw: tensor(0.4834, device='cuda:0') reward: tensor(1.2851, device='cuda:0') factor: tensor(0.1983, device='cuda:0')
reward_raw: tensor(0.4925, device='cuda:0') reward: tensor(1.2898, device='cuda:0') factor: tensor(0.2027, device='cuda:0')
reward_raw: tensor(0.5035, device='cuda:0') reward: tensor(1.2982, device='cuda:0') factor: tensor(0.2053, device='cuda:0')


Epoch 1, Step 371: Loss = 0.0720, Actor Loss = 0.0297, Critic Loss = 0.0850, Entropy = 0.1822, Returns = 1.3027, Value = 1.3014, mask_loss = 0.2126std_loss = 0.3140:   7%|▋         | 371/5475 [1:40:11<25:55:52, 18.29s/it] 

ratio tensor(1.0767, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.6005, device='cuda:0') reward: tensor(1.4316, device='cuda:0') factor: tensor(0.1689, device='cuda:0')
reward_raw: tensor(0.6004, device='cuda:0') reward: tensor(1.4394, device='cuda:0') factor: tensor(0.1610, device='cuda:0')
reward_raw: tensor(0.6100, device='cuda:0') reward: tensor(1.4458, device='cuda:0') factor: tensor(0.1642, device='cuda:0')
reward_raw: tensor(0.6002, device='cuda:0') reward: tensor(1.4325, device='cuda:0') factor: tensor(0.1677, device='cuda:0')
reward_raw: tensor(0.5968, device='cuda:0') reward: tensor(1.4316, device='cuda:0') factor: tensor(0.1651, device='cuda:0')


Epoch 1, Step 372: Loss = 0.0400, Actor Loss = -0.0092, Critic Loss = 0.0986, Entropy = 0.1691, Returns = 1.3374, Value = 1.3632, mask_loss = 0.1941std_loss = 0.3223:   7%|▋         | 372/5475 [1:40:26<24:45:17, 17.46s/it]

ratio tensor(1.0003, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4915, device='cuda:0') reward: tensor(1.2948, device='cuda:0') factor: tensor(0.1967, device='cuda:0')
reward_raw: tensor(0.4937, device='cuda:0') reward: tensor(1.2968, device='cuda:0') factor: tensor(0.1969, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.2884, device='cuda:0') factor: tensor(0.2027, device='cuda:0')
reward_raw: tensor(0.4877, device='cuda:0') reward: tensor(1.2848, device='cuda:0') factor: tensor(0.2030, device='cuda:0')
reward_raw: tensor(0.4986, device='cuda:0') reward: tensor(1.3066, device='cuda:0') factor: tensor(0.1920, device='cuda:0')


Epoch 1, Step 373: Loss = 0.0476, Actor Loss = 0.0241, Critic Loss = 0.0474, Entropy = 0.1842, Returns = 1.2582, Value = 1.2395, mask_loss = 0.2138std_loss = 0.3331:   7%|▋         | 373/5475 [1:40:44<25:03:56, 17.69s/it] 

ratio tensor(0.8563, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4730, device='cuda:0') reward: tensor(1.2795, device='cuda:0') factor: tensor(0.1935, device='cuda:0')
reward_raw: tensor(0.4295, device='cuda:0') reward: tensor(1.2385, device='cuda:0') factor: tensor(0.1909, device='cuda:0')
reward_raw: tensor(0.4251, device='cuda:0') reward: tensor(1.2401, device='cuda:0') factor: tensor(0.1850, device='cuda:0')
reward_raw: tensor(0.4089, device='cuda:0') reward: tensor(1.2168, device='cuda:0') factor: tensor(0.1921, device='cuda:0')
reward_raw: tensor(0.4746, device='cuda:0') reward: tensor(1.2883, device='cuda:0') factor: tensor(0.1863, device='cuda:0')


Epoch 1, Step 374: Loss = 0.0238, Actor Loss = 0.0039, Critic Loss = 0.0401, Entropy = 0.1659, Returns = 1.2047, Value = 1.2268, mask_loss = 0.1812std_loss = 0.3086:   7%|▋         | 374/5475 [1:41:06<26:47:18, 18.91s/it]

ratio tensor(1.0782, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4935, device='cuda:0') reward: tensor(1.3294, device='cuda:0') factor: tensor(0.1641, device='cuda:0')
reward_raw: tensor(0.4965, device='cuda:0') reward: tensor(1.3243, device='cuda:0') factor: tensor(0.1722, device='cuda:0')
reward_raw: tensor(0.4948, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.1627, device='cuda:0')
reward_raw: tensor(0.4828, device='cuda:0') reward: tensor(1.3184, device='cuda:0') factor: tensor(0.1644, device='cuda:0')
reward_raw: tensor(0.5190, device='cuda:0') reward: tensor(1.3442, device='cuda:0') factor: tensor(0.1748, device='cuda:0')


Epoch 1, Step 375: Loss = -0.1770, Actor Loss = -0.2184, Critic Loss = 0.0833, Entropy = 0.1809, Returns = 1.3333, Value = 1.3846, mask_loss = 0.1667std_loss = 0.2921:   7%|▋         | 375/5475 [1:41:25<26:36:00, 18.78s/it]

ratio tensor(7.9598, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.3177, device='cuda:0') factor: tensor(0.2130, device='cuda:0')
reward_raw: tensor(0.5371, device='cuda:0') reward: tensor(1.3244, device='cuda:0') factor: tensor(0.2128, device='cuda:0')
reward_raw: tensor(0.5614, device='cuda:0') reward: tensor(1.3405, device='cuda:0') factor: tensor(0.2208, device='cuda:0')
reward_raw: tensor(0.5358, device='cuda:0') reward: tensor(1.3245, device='cuda:0') factor: tensor(0.2114, device='cuda:0')
reward_raw: tensor(0.5497, device='cuda:0') reward: tensor(1.3313, device='cuda:0') factor: tensor(0.2184, device='cuda:0')


Epoch 1, Step 376: Loss = 0.3689, Actor Loss = 0.3151, Critic Loss = 0.1080, Entropy = 0.1815, Returns = 1.2529, Value = 1.3580, mask_loss = 0.2208std_loss = 0.3225:   7%|▋         | 376/5475 [1:41:38<24:21:32, 17.20s/it]  

ratio tensor(0.4742, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4686, device='cuda:0') reward: tensor(1.2659, device='cuda:0') factor: tensor(0.2027, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.2761, device='cuda:0') factor: tensor(0.2036, device='cuda:0')
reward_raw: tensor(0.4604, device='cuda:0') reward: tensor(1.2498, device='cuda:0') factor: tensor(0.2106, device='cuda:0')
reward_raw: tensor(0.4865, device='cuda:0') reward: tensor(1.2785, device='cuda:0') factor: tensor(0.2080, device='cuda:0')
reward_raw: tensor(0.4693, device='cuda:0') reward: tensor(1.2666, device='cuda:0') factor: tensor(0.2027, device='cuda:0')


Epoch 1, Step 377: Loss = -0.1068, Actor Loss = -0.1401, Critic Loss = 0.0671, Entropy = 0.2018, Returns = 1.2765, Value = 1.1477, mask_loss = 0.2527std_loss = 0.3414:   7%|▋         | 377/5475 [1:41:52<22:59:12, 16.23s/it]

ratio tensor(1.1120, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5335, device='cuda:0') reward: tensor(1.3468, device='cuda:0') factor: tensor(0.1867, device='cuda:0')
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.3488, device='cuda:0') factor: tensor(0.1727, device='cuda:0')
reward_raw: tensor(0.5063, device='cuda:0') reward: tensor(1.3259, device='cuda:0') factor: tensor(0.1804, device='cuda:0')
reward_raw: tensor(0.4946, device='cuda:0') reward: tensor(1.3142, device='cuda:0') factor: tensor(0.1804, device='cuda:0')
reward_raw: tensor(0.4800, device='cuda:0') reward: tensor(1.3054, device='cuda:0') factor: tensor(0.1746, device='cuda:0')


Epoch 1, Step 378: Loss = -0.3229, Actor Loss = -0.3473, Critic Loss = 0.0492, Entropy = 0.1516, Returns = 1.4026, Value = 1.4052, mask_loss = 0.1619std_loss = 0.3052:   7%|▋         | 378/5475 [1:42:11<24:00:40, 16.96s/it]

ratio tensor(9.4362, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5679, device='cuda:0') reward: tensor(1.3803, device='cuda:0') factor: tensor(0.1877, device='cuda:0')
reward_raw: tensor(0.5689, device='cuda:0') reward: tensor(1.3902, device='cuda:0') factor: tensor(0.1787, device='cuda:0')
reward_raw: tensor(0.5639, device='cuda:0') reward: tensor(1.3890, device='cuda:0') factor: tensor(0.1749, device='cuda:0')
reward_raw: tensor(0.5768, device='cuda:0') reward: tensor(1.3925, device='cuda:0') factor: tensor(0.1843, device='cuda:0')
reward_raw: tensor(0.5667, device='cuda:0') reward: tensor(1.3846, device='cuda:0') factor: tensor(0.1821, device='cuda:0')


Epoch 1, Step 379: Loss = 0.1875, Actor Loss = 0.1481, Critic Loss = 0.0791, Entropy = 0.1620, Returns = 1.3624, Value = 1.3801, mask_loss = 0.1594std_loss = 0.2972:   7%|▋         | 379/5475 [1:42:28<24:19:05, 17.18s/it]  

ratio tensor(0.7292, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.3682, device='cuda:0') factor: tensor(0.1559, device='cuda:0')
reward_raw: tensor(0.5343, device='cuda:0') reward: tensor(1.3830, device='cuda:0') factor: tensor(0.1513, device='cuda:0')
reward_raw: tensor(0.5017, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.1586, device='cuda:0')
reward_raw: tensor(0.5322, device='cuda:0') reward: tensor(1.3715, device='cuda:0') factor: tensor(0.1607, device='cuda:0')
reward_raw: tensor(0.4957, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.1574, device='cuda:0')


Epoch 1, Step 380: Loss = -0.1108, Actor Loss = -0.1475, Critic Loss = 0.0739, Entropy = 0.1801, Returns = 1.3721, Value = 1.4105, mask_loss = 0.1715std_loss = 0.2918:   7%|▋         | 380/5475 [1:42:46<24:40:32, 17.44s/it]

ratio tensor(1.1343, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5730, device='cuda:0') reward: tensor(1.4052, device='cuda:0') factor: tensor(0.1679, device='cuda:0')
reward_raw: tensor(0.5532, device='cuda:0') reward: tensor(1.3776, device='cuda:0') factor: tensor(0.1756, device='cuda:0')
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.3834, device='cuda:0') factor: tensor(0.1729, device='cuda:0')
reward_raw: tensor(0.5733, device='cuda:0') reward: tensor(1.3949, device='cuda:0') factor: tensor(0.1784, device='cuda:0')
reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.3780, device='cuda:0') factor: tensor(0.1716, device='cuda:0')


Epoch 1, Step 381: Loss = 0.3160, Actor Loss = 0.2705, Critic Loss = 0.0912, Entropy = 0.1504, Returns = 1.3730, Value = 1.5097, mask_loss = 0.1666std_loss = 0.3092:   7%|▋         | 380/5475 [1:43:00<24:40:32, 17.44s/it]  

ratio tensor(1.0378, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 381: Loss = 0.3160, Actor Loss = 0.2705, Critic Loss = 0.0912, Entropy = 0.1504, Returns = 1.3730, Value = 1.5097, mask_loss = 0.1666std_loss = 0.3092:   7%|▋         | 381/5475 [1:43:00<23:04:25, 16.31s/it]

reward_raw: tensor(0.6099, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.2505, device='cuda:0')
reward_raw: tensor(0.5473, device='cuda:0') reward: tensor(1.3019, device='cuda:0') factor: tensor(0.2454, device='cuda:0')
reward_raw: tensor(0.5862, device='cuda:0') reward: tensor(1.3387, device='cuda:0') factor: tensor(0.2475, device='cuda:0')
reward_raw: tensor(0.5691, device='cuda:0') reward: tensor(1.3199, device='cuda:0') factor: tensor(0.2492, device='cuda:0')
reward_raw: tensor(0.5707, device='cuda:0') reward: tensor(1.3206, device='cuda:0') factor: tensor(0.2501, device='cuda:0')


Epoch 1, Step 382: Loss = 0.0179, Actor Loss = 0.0028, Critic Loss = 0.0306, Entropy = 0.2209, Returns = 1.3190, Value = 1.3374, mask_loss = 0.2400std_loss = 0.3419:   7%|▋         | 382/5475 [1:43:12<20:58:09, 14.82s/it]

ratio tensor(0.8985, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5877, device='cuda:0') reward: tensor(1.3737, device='cuda:0') factor: tensor(0.2140, device='cuda:0')
reward_raw: tensor(0.5863, device='cuda:0') reward: tensor(1.3687, device='cuda:0') factor: tensor(0.2176, device='cuda:0')
reward_raw: tensor(0.5959, device='cuda:0') reward: tensor(1.3752, device='cuda:0') factor: tensor(0.2208, device='cuda:0')
reward_raw: tensor(0.5852, device='cuda:0') reward: tensor(1.3750, device='cuda:0') factor: tensor(0.2102, device='cuda:0')
reward_raw: tensor(0.5638, device='cuda:0') reward: tensor(1.3563, device='cuda:0') factor: tensor(0.2076, device='cuda:0')


Epoch 1, Step 383: Loss = -0.2794, Actor Loss = -0.3564, Critic Loss = 0.1544, Entropy = 0.1977, Returns = 1.4245, Value = 1.3074, mask_loss = 0.2104std_loss = 0.3256:   7%|▋         | 383/5475 [1:43:28<21:49:12, 15.43s/it]

ratio tensor(2.4717, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5853, device='cuda:0') reward: tensor(1.3797, device='cuda:0') factor: tensor(0.2055, device='cuda:0')
reward_raw: tensor(0.5446, device='cuda:0') reward: tensor(1.3315, device='cuda:0') factor: tensor(0.2132, device='cuda:0')
reward_raw: tensor(0.5759, device='cuda:0') reward: tensor(1.3775, device='cuda:0') factor: tensor(0.1984, device='cuda:0')
reward_raw: tensor(0.5812, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.2133, device='cuda:0')
reward_raw: tensor(0.5841, device='cuda:0') reward: tensor(1.3729, device='cuda:0') factor: tensor(0.2112, device='cuda:0')


Epoch 1, Step 384: Loss = 0.3328, Actor Loss = 0.2559, Critic Loss = 0.1543, Entropy = 0.1917, Returns = 1.3302, Value = 1.3768, mask_loss = 0.2184std_loss = 0.3296:   7%|▋         | 384/5475 [1:43:42<20:51:31, 14.75s/it]  

ratio tensor(0.7669, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4987, device='cuda:0') reward: tensor(1.2971, device='cuda:0') factor: tensor(0.2016, device='cuda:0')
reward_raw: tensor(0.5804, device='cuda:0') reward: tensor(1.3856, device='cuda:0') factor: tensor(0.1948, device='cuda:0')
reward_raw: tensor(0.5810, device='cuda:0') reward: tensor(1.3860, device='cuda:0') factor: tensor(0.1950, device='cuda:0')
reward_raw: tensor(0.5484, device='cuda:0') reward: tensor(1.3591, device='cuda:0') factor: tensor(0.1893, device='cuda:0')
reward_raw: tensor(0.5723, device='cuda:0') reward: tensor(1.3755, device='cuda:0') factor: tensor(0.1969, device='cuda:0')


Epoch 1, Step 385: Loss = 0.0371, Actor Loss = 0.0020, Critic Loss = 0.0705, Entropy = 0.1907, Returns = 1.3526, Value = 1.2889, mask_loss = 0.2060std_loss = 0.3199:   7%|▋         | 385/5475 [1:43:58<21:41:30, 15.34s/it]

ratio tensor(0.8863, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5952, device='cuda:0') reward: tensor(1.3820, device='cuda:0') factor: tensor(0.2132, device='cuda:0')
reward_raw: tensor(0.6153, device='cuda:0') reward: tensor(1.4045, device='cuda:0') factor: tensor(0.2108, device='cuda:0')
reward_raw: tensor(0.6169, device='cuda:0') reward: tensor(1.4014, device='cuda:0') factor: tensor(0.2155, device='cuda:0')
reward_raw: tensor(0.6261, device='cuda:0') reward: tensor(1.4109, device='cuda:0') factor: tensor(0.2153, device='cuda:0')
reward_raw: tensor(0.6164, device='cuda:0') reward: tensor(1.4088, device='cuda:0') factor: tensor(0.2077, device='cuda:0')


Epoch 1, Step 386: Loss = -0.2296, Actor Loss = -0.2551, Critic Loss = 0.0515, Entropy = 0.1761, Returns = 1.4512, Value = 1.4711, mask_loss = 0.2002std_loss = 0.3291:   7%|▋         | 386/5475 [1:44:16<22:31:40, 15.94s/it]

ratio tensor(2.7570, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4986, device='cuda:0') reward: tensor(1.2961, device='cuda:0') factor: tensor(0.2025, device='cuda:0')
reward_raw: tensor(0.4865, device='cuda:0') reward: tensor(1.2865, device='cuda:0') factor: tensor(0.2000, device='cuda:0')
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.3166, device='cuda:0') factor: tensor(0.1989, device='cuda:0')
reward_raw: tensor(0.4944, device='cuda:0') reward: tensor(1.2947, device='cuda:0') factor: tensor(0.1997, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.3049, device='cuda:0') factor: tensor(0.1972, device='cuda:0')


Epoch 1, Step 387: Loss = 0.1560, Actor Loss = 0.1311, Critic Loss = 0.0503, Entropy = 0.1852, Returns = 1.2656, Value = 1.2695, mask_loss = 0.1922std_loss = 0.3161:   7%|▋         | 387/5475 [1:44:30<22:00:59, 15.58s/it]  

ratio tensor(0.5727, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6076, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.2321, device='cuda:0')
reward_raw: tensor(0.6216, device='cuda:0') reward: tensor(1.3939, device='cuda:0') factor: tensor(0.2278, device='cuda:0')
reward_raw: tensor(0.5891, device='cuda:0') reward: tensor(1.3640, device='cuda:0') factor: tensor(0.2251, device='cuda:0')
reward_raw: tensor(0.5924, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.2220, device='cuda:0')
reward_raw: tensor(0.6070, device='cuda:0') reward: tensor(1.3676, device='cuda:0') factor: tensor(0.2394, device='cuda:0')


Epoch 1, Step 388: Loss = -0.0344, Actor Loss = -0.0626, Critic Loss = 0.0567, Entropy = 0.1673, Returns = 1.4050, Value = 1.3846, mask_loss = 0.1777std_loss = 0.3022:   7%|▋         | 388/5475 [1:44:45<21:28:32, 15.20s/it]

ratio tensor(1.4992, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5399, device='cuda:0') reward: tensor(1.3147, device='cuda:0') factor: tensor(0.2252, device='cuda:0')
reward_raw: tensor(0.5330, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.2274, device='cuda:0')
reward_raw: tensor(0.5275, device='cuda:0') reward: tensor(1.2975, device='cuda:0') factor: tensor(0.2301, device='cuda:0')
reward_raw: tensor(0.5075, device='cuda:0') reward: tensor(1.2779, device='cuda:0') factor: tensor(0.2297, device='cuda:0')
reward_raw: tensor(0.5379, device='cuda:0') reward: tensor(1.3054, device='cuda:0') factor: tensor(0.2325, device='cuda:0')


Epoch 1, Step 389: Loss = 0.1453, Actor Loss = 0.0901, Critic Loss = 0.1108, Entropy = 0.1770, Returns = 1.2877, Value = 1.3425, mask_loss = 0.2172std_loss = 0.3219:   7%|▋         | 389/5475 [1:44:56<20:03:41, 14.20s/it]  

ratio tensor(1.3109, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.3361, device='cuda:0') factor: tensor(0.1846, device='cuda:0')
reward_raw: tensor(0.5656, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.1891, device='cuda:0')
reward_raw: tensor(0.5628, device='cuda:0') reward: tensor(1.3743, device='cuda:0') factor: tensor(0.1885, device='cuda:0')
reward_raw: tensor(0.5509, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.1915, device='cuda:0')
reward_raw: tensor(0.5564, device='cuda:0') reward: tensor(1.3624, device='cuda:0') factor: tensor(0.1940, device='cuda:0')


Epoch 1, Step 390: Loss = -0.1574, Actor Loss = -0.2413, Critic Loss = 0.1681, Entropy = 0.1960, Returns = 1.3441, Value = 1.2169, mask_loss = 0.1977std_loss = 0.3164:   7%|▋         | 390/5475 [1:45:14<21:27:07, 15.19s/it]

ratio tensor(1.7167, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5693, device='cuda:0') reward: tensor(1.3738, device='cuda:0') factor: tensor(0.1956, device='cuda:0')
reward_raw: tensor(0.6013, device='cuda:0') reward: tensor(1.4002, device='cuda:0') factor: tensor(0.2010, device='cuda:0')
reward_raw: tensor(0.5960, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.2008, device='cuda:0')
reward_raw: tensor(0.5646, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.2026, device='cuda:0')
reward_raw: tensor(0.5723, device='cuda:0') reward: tensor(1.3721, device='cuda:0') factor: tensor(0.2001, device='cuda:0')


Epoch 1, Step 391: Loss = 0.2221, Actor Loss = 0.1935, Critic Loss = 0.0574, Entropy = 0.1720, Returns = 1.3088, Value = 1.4392, mask_loss = 0.1827std_loss = 0.3000:   7%|▋         | 391/5475 [1:45:27<20:31:27, 14.53s/it]  

ratio tensor(0.6725, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5550, device='cuda:0') reward: tensor(1.3931, device='cuda:0') factor: tensor(0.1619, device='cuda:0')
reward_raw: tensor(0.5720, device='cuda:0') reward: tensor(1.4043, device='cuda:0') factor: tensor(0.1677, device='cuda:0')
reward_raw: tensor(0.5736, device='cuda:0') reward: tensor(1.4077, device='cuda:0') factor: tensor(0.1660, device='cuda:0')
reward_raw: tensor(0.5706, device='cuda:0') reward: tensor(1.4019, device='cuda:0') factor: tensor(0.1687, device='cuda:0')
reward_raw: tensor(0.5730, device='cuda:0') reward: tensor(1.4073, device='cuda:0') factor: tensor(0.1657, device='cuda:0')


Epoch 1, Step 392: Loss = -0.1089, Actor Loss = -0.1568, Critic Loss = 0.0960, Entropy = 0.1580, Returns = 1.4012, Value = 1.3048, mask_loss = 0.1672std_loss = 0.2990:   7%|▋         | 392/5475 [1:45:43<21:12:27, 15.02s/it]

ratio tensor(1.4903, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5381, device='cuda:0') reward: tensor(1.3783, device='cuda:0') factor: tensor(0.1598, device='cuda:0')
reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(1.4127, device='cuda:0') factor: tensor(0.1595, device='cuda:0')
reward_raw: tensor(0.5423, device='cuda:0') reward: tensor(1.3804, device='cuda:0') factor: tensor(0.1618, device='cuda:0')
reward_raw: tensor(0.5165, device='cuda:0') reward: tensor(1.3601, device='cuda:0') factor: tensor(0.1563, device='cuda:0')
reward_raw: tensor(0.5210, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.1536, device='cuda:0')


Epoch 1, Step 393: Loss = -0.0991, Actor Loss = -0.1497, Critic Loss = 0.1014, Entropy = 0.1663, Returns = 1.4088, Value = 1.5015, mask_loss = 0.1604std_loss = 0.2896:   7%|▋         | 393/5475 [1:46:01<22:24:39, 15.88s/it]

ratio tensor(1.2199, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4676, device='cuda:0') reward: tensor(1.3227, device='cuda:0') factor: tensor(0.1448, device='cuda:0')
reward_raw: tensor(0.4688, device='cuda:0') reward: tensor(1.3251, device='cuda:0') factor: tensor(0.1437, device='cuda:0')
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.3329, device='cuda:0') factor: tensor(0.1420, device='cuda:0')
reward_raw: tensor(0.4925, device='cuda:0') reward: tensor(1.3414, device='cuda:0') factor: tensor(0.1512, device='cuda:0')
reward_raw: tensor(0.4768, device='cuda:0') reward: tensor(1.3303, device='cuda:0') factor: tensor(0.1466, device='cuda:0')


Epoch 1, Step 394: Loss = 0.2590, Actor Loss = 0.2113, Critic Loss = 0.0956, Entropy = 0.1852, Returns = 1.3370, Value = 1.2939, mask_loss = 0.1616std_loss = 0.2809:   7%|▋         | 394/5475 [1:46:23<24:47:47, 17.57s/it]  

ratio tensor(0.6144, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6367, device='cuda:0') reward: tensor(1.4292, device='cuda:0') factor: tensor(0.2075, device='cuda:0')
reward_raw: tensor(0.6294, device='cuda:0') reward: tensor(1.4144, device='cuda:0') factor: tensor(0.2150, device='cuda:0')
reward_raw: tensor(0.6041, device='cuda:0') reward: tensor(1.4058, device='cuda:0') factor: tensor(0.1983, device='cuda:0')
reward_raw: tensor(0.6027, device='cuda:0') reward: tensor(1.4002, device='cuda:0') factor: tensor(0.2025, device='cuda:0')
reward_raw: tensor(0.6462, device='cuda:0') reward: tensor(1.4377, device='cuda:0') factor: tensor(0.2085, device='cuda:0')


Epoch 1, Step 395: Loss = -0.2751, Actor Loss = -0.3127, Critic Loss = 0.0756, Entropy = 0.1529, Returns = 1.4503, Value = 1.5008, mask_loss = 0.2062std_loss = 0.3382:   7%|▋         | 395/5475 [1:46:39<24:08:07, 17.10s/it]

ratio tensor(1.2764, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5045, device='cuda:0') reward: tensor(1.3095, device='cuda:0') factor: tensor(0.1950, device='cuda:0')
reward_raw: tensor(0.5132, device='cuda:0') reward: tensor(1.3188, device='cuda:0') factor: tensor(0.1944, device='cuda:0')
reward_raw: tensor(0.5220, device='cuda:0') reward: tensor(1.3257, device='cuda:0') factor: tensor(0.1963, device='cuda:0')
reward_raw: tensor(0.5428, device='cuda:0') reward: tensor(1.3514, device='cuda:0') factor: tensor(0.1914, device='cuda:0')
reward_raw: tensor(0.5290, device='cuda:0') reward: tensor(1.3341, device='cuda:0') factor: tensor(0.1949, device='cuda:0')


Epoch 1, Step 396: Loss = 0.3032, Actor Loss = 0.2586, Critic Loss = 0.0895, Entropy = 0.1470, Returns = 1.3314, Value = 1.3710, mask_loss = 0.1758std_loss = 0.3076:   7%|▋         | 396/5475 [1:46:52<22:36:28, 16.02s/it]  

ratio tensor(0.6007, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.3570, device='cuda:0') factor: tensor(0.1950, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.2010, device='cuda:0')
reward_raw: tensor(0.5599, device='cuda:0') reward: tensor(1.3553, device='cuda:0') factor: tensor(0.2046, device='cuda:0')
reward_raw: tensor(0.5460, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.1959, device='cuda:0')
reward_raw: tensor(0.5623, device='cuda:0') reward: tensor(1.3622, device='cuda:0') factor: tensor(0.2001, device='cuda:0')


Epoch 1, Step 397: Loss = -0.0265, Actor Loss = -0.0734, Critic Loss = 0.0941, Entropy = 0.1584, Returns = 1.3564, Value = 1.1749, mask_loss = 0.1885std_loss = 0.3254:   7%|▋         | 397/5475 [1:47:06<21:36:22, 15.32s/it]

ratio tensor(1.0477, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5317, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.1554, device='cuda:0')
reward_raw: tensor(0.5411, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.1542, device='cuda:0')
reward_raw: tensor(0.5273, device='cuda:0') reward: tensor(1.3740, device='cuda:0') factor: tensor(0.1533, device='cuda:0')
reward_raw: tensor(0.5263, device='cuda:0') reward: tensor(1.3743, device='cuda:0') factor: tensor(0.1521, device='cuda:0')
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.3706, device='cuda:0') factor: tensor(0.1534, device='cuda:0')


Epoch 1, Step 398: Loss = -0.1191, Actor Loss = -0.1991, Critic Loss = 0.1601, Entropy = 0.1575, Returns = 1.3751, Value = 1.5230, mask_loss = 0.1534std_loss = 0.2912:   7%|▋         | 398/5475 [1:47:26<23:52:30, 16.93s/it]

ratio tensor(2.2615, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5594, device='cuda:0') reward: tensor(1.3722, device='cuda:0') factor: tensor(0.1873, device='cuda:0')
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.3608, device='cuda:0') factor: tensor(0.1968, device='cuda:0')
reward_raw: tensor(0.5723, device='cuda:0') reward: tensor(1.3810, device='cuda:0') factor: tensor(0.1913, device='cuda:0')
reward_raw: tensor(0.5407, device='cuda:0') reward: tensor(1.3562, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.3606, device='cuda:0') factor: tensor(0.1970, device='cuda:0')


Epoch 1, Step 399: Loss = 0.2306, Actor Loss = 0.1420, Critic Loss = 0.1775, Entropy = 0.1600, Returns = 1.3914, Value = 1.5218, mask_loss = 0.2164std_loss = 0.3287:   7%|▋         | 399/5475 [1:47:42<23:20:30, 16.55s/it]  

ratio tensor(1.2203, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4908, device='cuda:0') reward: tensor(1.3256, device='cuda:0') factor: tensor(0.1651, device='cuda:0')
reward_raw: tensor(0.4922, device='cuda:0') reward: tensor(1.3145, device='cuda:0') factor: tensor(0.1777, device='cuda:0')
reward_raw: tensor(0.4813, device='cuda:0') reward: tensor(1.3117, device='cuda:0') factor: tensor(0.1696, device='cuda:0')
reward_raw: tensor(0.5096, device='cuda:0') reward: tensor(1.3397, device='cuda:0') factor: tensor(0.1699, device='cuda:0')
reward_raw: tensor(0.4949, device='cuda:0') reward: tensor(1.3251, device='cuda:0') factor: tensor(0.1698, device='cuda:0')


Epoch 1, Step 400: Loss = 0.1626, Actor Loss = 0.1178, Critic Loss = 0.0897, Entropy = 0.1561, Returns = 1.3411, Value = 1.2574, mask_loss = 0.1762std_loss = 0.3150:   7%|▋         | 400/5475 [1:47:58<23:13:40, 16.48s/it]

ratio tensor(0.7806, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.2681, device='cuda:0') factor: tensor(0.2237, device='cuda:0')
reward_raw: tensor(0.4972, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.2118, device='cuda:0')
reward_raw: tensor(0.4907, device='cuda:0') reward: tensor(1.2792, device='cuda:0') factor: tensor(0.2115, device='cuda:0')
reward_raw: tensor(0.5086, device='cuda:0') reward: tensor(1.2922, device='cuda:0') factor: tensor(0.2164, device='cuda:0')
reward_raw: tensor(0.4953, device='cuda:0') reward: tensor(1.2774, device='cuda:0') factor: tensor(0.2179, device='cuda:0')


Epoch 1, Step 401: Loss = 0.0200, Actor Loss = -0.0131, Critic Loss = 0.0665, Entropy = 0.1489, Returns = 1.2363, Value = 1.3106, mask_loss = 0.1996std_loss = 0.3352:   7%|▋         | 400/5475 [1:48:10<23:13:40, 16.48s/it]

ratio tensor(0.8369, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 401: Loss = 0.0200, Actor Loss = -0.0131, Critic Loss = 0.0665, Entropy = 0.1489, Returns = 1.2363, Value = 1.3106, mask_loss = 0.1996std_loss = 0.3352:   7%|▋         | 401/5475 [1:48:11<21:25:44, 15.20s/it]

reward_raw: tensor(0.5705, device='cuda:0') reward: tensor(1.3937, device='cuda:0') factor: tensor(0.1768, device='cuda:0')
reward_raw: tensor(0.5675, device='cuda:0') reward: tensor(1.3909, device='cuda:0') factor: tensor(0.1765, device='cuda:0')
reward_raw: tensor(0.5224, device='cuda:0') reward: tensor(1.3433, device='cuda:0') factor: tensor(0.1791, device='cuda:0')
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.1709, device='cuda:0')
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.3748, device='cuda:0') factor: tensor(0.1755, device='cuda:0')


Epoch 1, Step 402: Loss = -0.0768, Actor Loss = -0.1354, Critic Loss = 0.1174, Entropy = 0.1201, Returns = 1.3519, Value = 1.3839, mask_loss = 0.1472std_loss = 0.3041:   7%|▋         | 402/5475 [1:48:29<22:57:37, 16.29s/it]

ratio tensor(1.0055, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6036, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.2152, device='cuda:0')
reward_raw: tensor(0.6092, device='cuda:0') reward: tensor(1.3917, device='cuda:0') factor: tensor(0.2175, device='cuda:0')
reward_raw: tensor(0.6084, device='cuda:0') reward: tensor(1.3930, device='cuda:0') factor: tensor(0.2155, device='cuda:0')
reward_raw: tensor(0.6115, device='cuda:0') reward: tensor(1.4027, device='cuda:0') factor: tensor(0.2088, device='cuda:0')
reward_raw: tensor(0.6236, device='cuda:0') reward: tensor(1.3990, device='cuda:0') factor: tensor(0.2246, device='cuda:0')


Epoch 1, Step 403: Loss = 0.0417, Actor Loss = -0.0077, Critic Loss = 0.0991, Entropy = 0.1457, Returns = 1.3473, Value = 1.3783, mask_loss = 0.1755std_loss = 0.3190:   7%|▋         | 403/5475 [1:48:41<20:50:20, 14.79s/it] 

ratio tensor(0.9830, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4903, device='cuda:0') reward: tensor(1.3028, device='cuda:0') factor: tensor(0.1876, device='cuda:0')
reward_raw: tensor(0.5064, device='cuda:0') reward: tensor(1.3149, device='cuda:0') factor: tensor(0.1915, device='cuda:0')
reward_raw: tensor(0.5047, device='cuda:0') reward: tensor(1.3142, device='cuda:0') factor: tensor(0.1905, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.3174, device='cuda:0') factor: tensor(0.1918, device='cuda:0')
reward_raw: tensor(0.4819, device='cuda:0') reward: tensor(1.2893, device='cuda:0') factor: tensor(0.1926, device='cuda:0')


Epoch 1, Step 404: Loss = 0.0936, Actor Loss = 0.0082, Critic Loss = 0.1711, Entropy = 0.1596, Returns = 1.3853, Value = 1.2733, mask_loss = 0.2019std_loss = 0.3311:   7%|▋         | 404/5475 [1:48:56<20:53:13, 14.83s/it] 

ratio tensor(1.2114, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5493, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.1693, device='cuda:0')
reward_raw: tensor(0.5181, device='cuda:0') reward: tensor(1.3553, device='cuda:0') factor: tensor(0.1628, device='cuda:0')
reward_raw: tensor(0.5370, device='cuda:0') reward: tensor(1.3676, device='cuda:0') factor: tensor(0.1694, device='cuda:0')
reward_raw: tensor(0.5512, device='cuda:0') reward: tensor(1.3833, device='cuda:0') factor: tensor(0.1679, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.3732, device='cuda:0') factor: tensor(0.1683, device='cuda:0')


Epoch 1, Step 405: Loss = -0.2256, Actor Loss = -0.2577, Critic Loss = 0.0644, Entropy = 0.1031, Returns = 1.4109, Value = 1.3401, mask_loss = 0.1479std_loss = 0.3148:   7%|▋         | 405/5475 [1:49:11<21:00:08, 14.91s/it]

ratio tensor(1.5842, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.3378, device='cuda:0') factor: tensor(0.1621, device='cuda:0')
reward_raw: tensor(0.4900, device='cuda:0') reward: tensor(1.3307, device='cuda:0') factor: tensor(0.1593, device='cuda:0')
reward_raw: tensor(0.5063, device='cuda:0') reward: tensor(1.3485, device='cuda:0') factor: tensor(0.1578, device='cuda:0')
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.3636, device='cuda:0') factor: tensor(0.1605, device='cuda:0')
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.1644, device='cuda:0')


Epoch 1, Step 406: Loss = 0.1659, Actor Loss = 0.1124, Critic Loss = 0.1074, Entropy = 0.1369, Returns = 1.3422, Value = 1.3794, mask_loss = 0.1617std_loss = 0.3114:   7%|▋         | 406/5475 [1:49:28<21:55:16, 15.57s/it]  

ratio tensor(0.8952, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4460, device='cuda:0') reward: tensor(1.2615, device='cuda:0') factor: tensor(0.1845, device='cuda:0')
reward_raw: tensor(0.4290, device='cuda:0') reward: tensor(1.2399, device='cuda:0') factor: tensor(0.1892, device='cuda:0')
reward_raw: tensor(0.4298, device='cuda:0') reward: tensor(1.2381, device='cuda:0') factor: tensor(0.1917, device='cuda:0')
reward_raw: tensor(0.4323, device='cuda:0') reward: tensor(1.2469, device='cuda:0') factor: tensor(0.1854, device='cuda:0')
reward_raw: tensor(0.4472, device='cuda:0') reward: tensor(1.2542, device='cuda:0') factor: tensor(0.1930, device='cuda:0')


Epoch 1, Step 407: Loss = 0.0010, Actor Loss = -0.0372, Critic Loss = 0.0767, Entropy = 0.1280, Returns = 1.2500, Value = 1.1281, mask_loss = 0.1791std_loss = 0.3224:   7%|▋         | 407/5475 [1:49:50<24:41:32, 17.54s/it]

ratio tensor(0.9508, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5395, device='cuda:0') reward: tensor(1.3709, device='cuda:0') factor: tensor(0.1686, device='cuda:0')
reward_raw: tensor(0.4927, device='cuda:0') reward: tensor(1.3308, device='cuda:0') factor: tensor(0.1619, device='cuda:0')
reward_raw: tensor(0.5169, device='cuda:0') reward: tensor(1.3524, device='cuda:0') factor: tensor(0.1645, device='cuda:0')
reward_raw: tensor(0.5254, device='cuda:0') reward: tensor(1.3588, device='cuda:0') factor: tensor(0.1666, device='cuda:0')
reward_raw: tensor(0.5186, device='cuda:0') reward: tensor(1.3515, device='cuda:0') factor: tensor(0.1671, device='cuda:0')


Epoch 1, Step 408: Loss = -0.2591, Actor Loss = -0.3158, Critic Loss = 0.1137, Entropy = 0.1555, Returns = 1.3443, Value = 1.3192, mask_loss = 0.1605std_loss = 0.2988:   7%|▋         | 408/5475 [1:50:08<24:48:53, 17.63s/it]

ratio tensor(1.6098, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4698, device='cuda:0') reward: tensor(1.3182, device='cuda:0') factor: tensor(0.1515, device='cuda:0')
reward_raw: tensor(0.4620, device='cuda:0') reward: tensor(1.3108, device='cuda:0') factor: tensor(0.1512, device='cuda:0')
reward_raw: tensor(0.4546, device='cuda:0') reward: tensor(1.3052, device='cuda:0') factor: tensor(0.1494, device='cuda:0')
reward_raw: tensor(0.4661, device='cuda:0') reward: tensor(1.3107, device='cuda:0') factor: tensor(0.1554, device='cuda:0')
reward_raw: tensor(0.4589, device='cuda:0') reward: tensor(1.3054, device='cuda:0') factor: tensor(0.1535, device='cuda:0')


Epoch 1, Step 409: Loss = -0.0104, Actor Loss = -0.0537, Critic Loss = 0.0868, Entropy = 0.1199, Returns = 1.4092, Value = 1.2962, mask_loss = 0.1583std_loss = 0.3157:   7%|▋         | 409/5475 [1:50:24<24:10:30, 17.18s/it]

ratio tensor(1.0164, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5502, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.1829, device='cuda:0')
reward_raw: tensor(0.5382, device='cuda:0') reward: tensor(1.3548, device='cuda:0') factor: tensor(0.1834, device='cuda:0')
reward_raw: tensor(0.5375, device='cuda:0') reward: tensor(1.3576, device='cuda:0') factor: tensor(0.1799, device='cuda:0')
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.3578, device='cuda:0') factor: tensor(0.1843, device='cuda:0')
reward_raw: tensor(0.5582, device='cuda:0') reward: tensor(1.3738, device='cuda:0') factor: tensor(0.1845, device='cuda:0')


Epoch 1, Step 410: Loss = -0.0595, Actor Loss = -0.0866, Critic Loss = 0.0546, Entropy = 0.1283, Returns = 1.3456, Value = 1.3015, mask_loss = 0.1682std_loss = 0.3214:   7%|▋         | 410/5475 [1:50:37<22:28:14, 15.97s/it]

ratio tensor(1.1353, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4974, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.1714, device='cuda:0')
reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.3407, device='cuda:0') factor: tensor(0.1756, device='cuda:0')
reward_raw: tensor(0.4750, device='cuda:0') reward: tensor(1.3035, device='cuda:0') factor: tensor(0.1715, device='cuda:0')
reward_raw: tensor(0.5154, device='cuda:0') reward: tensor(1.3324, device='cuda:0') factor: tensor(0.1830, device='cuda:0')
reward_raw: tensor(0.4816, device='cuda:0') reward: tensor(1.3051, device='cuda:0') factor: tensor(0.1765, device='cuda:0')


Epoch 1, Step 411: Loss = 0.0307, Actor Loss = -0.0142, Critic Loss = 0.0901, Entropy = 0.1432, Returns = 1.4141, Value = 1.3007, mask_loss = 0.1732std_loss = 0.3187:   8%|▊         | 411/5475 [1:50:51<21:40:30, 15.41s/it] 

ratio tensor(1.0607, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5648, device='cuda:0') reward: tensor(1.4189, device='cuda:0') factor: tensor(0.1459, device='cuda:0')
reward_raw: tensor(0.5396, device='cuda:0') reward: tensor(1.3936, device='cuda:0') factor: tensor(0.1460, device='cuda:0')
reward_raw: tensor(0.5710, device='cuda:0') reward: tensor(1.4269, device='cuda:0') factor: tensor(0.1441, device='cuda:0')
reward_raw: tensor(0.5605, device='cuda:0') reward: tensor(1.4116, device='cuda:0') factor: tensor(0.1489, device='cuda:0')
reward_raw: tensor(0.5770, device='cuda:0') reward: tensor(1.4270, device='cuda:0') factor: tensor(0.1500, device='cuda:0')


Epoch 1, Step 412: Loss = 0.0012, Actor Loss = -0.0325, Critic Loss = 0.0676, Entropy = 0.1253, Returns = 1.4463, Value = 1.3860, mask_loss = 0.1367std_loss = 0.2800:   8%|▊         | 412/5475 [1:51:09<22:52:16, 16.26s/it]

ratio tensor(1.3175, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5858, device='cuda:0') reward: tensor(1.4144, device='cuda:0') factor: tensor(0.1714, device='cuda:0')
reward_raw: tensor(0.5640, device='cuda:0') reward: tensor(1.3955, device='cuda:0') factor: tensor(0.1685, device='cuda:0')
reward_raw: tensor(0.5781, device='cuda:0') reward: tensor(1.4079, device='cuda:0') factor: tensor(0.1702, device='cuda:0')
reward_raw: tensor(0.5766, device='cuda:0') reward: tensor(1.4059, device='cuda:0') factor: tensor(0.1707, device='cuda:0')
reward_raw: tensor(0.5921, device='cuda:0') reward: tensor(1.4206, device='cuda:0') factor: tensor(0.1716, device='cuda:0')


Epoch 1, Step 413: Loss = -0.1076, Actor Loss = -0.1670, Critic Loss = 0.1190, Entropy = 0.1160, Returns = 1.4727, Value = 1.5295, mask_loss = 0.1465std_loss = 0.3015:   8%|▊         | 413/5475 [1:51:22<21:11:59, 15.08s/it]

ratio tensor(1.4325, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5083, device='cuda:0') reward: tensor(1.2879, device='cuda:0') factor: tensor(0.2204, device='cuda:0')
reward_raw: tensor(0.4995, device='cuda:0') reward: tensor(1.2855, device='cuda:0') factor: tensor(0.2140, device='cuda:0')
reward_raw: tensor(0.5045, device='cuda:0') reward: tensor(1.2899, device='cuda:0') factor: tensor(0.2147, device='cuda:0')
reward_raw: tensor(0.5238, device='cuda:0') reward: tensor(1.3065, device='cuda:0') factor: tensor(0.2173, device='cuda:0')
reward_raw: tensor(0.5075, device='cuda:0') reward: tensor(1.2976, device='cuda:0') factor: tensor(0.2099, device='cuda:0')


Epoch 1, Step 414: Loss = 0.1808, Actor Loss = 0.1608, Critic Loss = 0.0403, Entropy = 0.1477, Returns = 1.3230, Value = 1.2653, mask_loss = 0.2381std_loss = 0.3659:   8%|▊         | 414/5475 [1:51:35<20:31:52, 14.60s/it]  

ratio tensor(0.6762, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5849, device='cuda:0') reward: tensor(1.4169, device='cuda:0') factor: tensor(0.1680, device='cuda:0')
reward_raw: tensor(0.6005, device='cuda:0') reward: tensor(1.4339, device='cuda:0') factor: tensor(0.1666, device='cuda:0')
reward_raw: tensor(0.5981, device='cuda:0') reward: tensor(1.4243, device='cuda:0') factor: tensor(0.1738, device='cuda:0')
reward_raw: tensor(0.6120, device='cuda:0') reward: tensor(1.4398, device='cuda:0') factor: tensor(0.1722, device='cuda:0')
reward_raw: tensor(0.5936, device='cuda:0') reward: tensor(1.4266, device='cuda:0') factor: tensor(0.1670, device='cuda:0')


Epoch 1, Step 415: Loss = -0.3055, Actor Loss = -0.3608, Critic Loss = 0.1109, Entropy = 0.1150, Returns = 1.4119, Value = 1.3800, mask_loss = 0.1834std_loss = 0.3340:   8%|▊         | 415/5475 [1:51:51<21:01:59, 14.96s/it]

ratio tensor(2.0736, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5275, device='cuda:0') reward: tensor(1.3478, device='cuda:0') factor: tensor(0.1796, device='cuda:0')
reward_raw: tensor(0.5231, device='cuda:0') reward: tensor(1.3459, device='cuda:0') factor: tensor(0.1772, device='cuda:0')
reward_raw: tensor(0.5120, device='cuda:0') reward: tensor(1.3433, device='cuda:0') factor: tensor(0.1687, device='cuda:0')
reward_raw: tensor(0.5023, device='cuda:0') reward: tensor(1.3269, device='cuda:0') factor: tensor(0.1754, device='cuda:0')
reward_raw: tensor(0.5222, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.1703, device='cuda:0')


Epoch 1, Step 416: Loss = 0.0032, Actor Loss = -0.0999, Critic Loss = 0.2066, Entropy = 0.1392, Returns = 1.3471, Value = 1.2355, mask_loss = 0.1665std_loss = 0.3107:   8%|▊         | 416/5475 [1:52:14<24:26:35, 17.39s/it] 

ratio tensor(0.9750, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5822, device='cuda:0') reward: tensor(1.3787, device='cuda:0') factor: tensor(0.2035, device='cuda:0')
reward_raw: tensor(0.5726, device='cuda:0') reward: tensor(1.3737, device='cuda:0') factor: tensor(0.1989, device='cuda:0')
reward_raw: tensor(0.5674, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.2093, device='cuda:0')
reward_raw: tensor(0.6001, device='cuda:0') reward: tensor(1.3875, device='cuda:0') factor: tensor(0.2126, device='cuda:0')
reward_raw: tensor(0.5696, device='cuda:0') reward: tensor(1.3616, device='cuda:0') factor: tensor(0.2080, device='cuda:0')


Epoch 1, Step 417: Loss = 0.2213, Actor Loss = 0.1165, Critic Loss = 0.2100, Entropy = 0.1608, Returns = 1.3544, Value = 1.1631, mask_loss = 0.2376std_loss = 0.3552:   8%|▊         | 417/5475 [1:52:29<23:26:16, 16.68s/it] 

ratio tensor(0.8568, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5401, device='cuda:0') reward: tensor(1.3446, device='cuda:0') factor: tensor(0.1955, device='cuda:0')
reward_raw: tensor(0.5505, device='cuda:0') reward: tensor(1.3546, device='cuda:0') factor: tensor(0.1960, device='cuda:0')
reward_raw: tensor(0.5524, device='cuda:0') reward: tensor(1.3580, device='cuda:0') factor: tensor(0.1944, device='cuda:0')
reward_raw: tensor(0.5467, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.2084, device='cuda:0')
reward_raw: tensor(0.5553, device='cuda:0') reward: tensor(1.3561, device='cuda:0') factor: tensor(0.1992, device='cuda:0')


Epoch 1, Step 418: Loss = -0.0967, Actor Loss = -0.1583, Critic Loss = 0.1235, Entropy = 0.1461, Returns = 1.3287, Value = 1.3638, mask_loss = 0.1978std_loss = 0.3352:   8%|▊         | 418/5475 [1:52:46<23:21:03, 16.62s/it]

ratio tensor(1.4105, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5774, device='cuda:0') reward: tensor(1.3943, device='cuda:0') factor: tensor(0.1831, device='cuda:0')
reward_raw: tensor(0.5369, device='cuda:0') reward: tensor(1.3550, device='cuda:0') factor: tensor(0.1819, device='cuda:0')
reward_raw: tensor(0.5297, device='cuda:0') reward: tensor(1.3471, device='cuda:0') factor: tensor(0.1826, device='cuda:0')
reward_raw: tensor(0.5404, device='cuda:0') reward: tensor(1.3592, device='cuda:0') factor: tensor(0.1812, device='cuda:0')
reward_raw: tensor(0.5444, device='cuda:0') reward: tensor(1.3700, device='cuda:0') factor: tensor(0.1744, device='cuda:0')


Epoch 1, Step 419: Loss = 0.6181, Actor Loss = 0.5720, Critic Loss = 0.0925, Entropy = 0.1329, Returns = 1.3271, Value = 1.3939, mask_loss = 0.1748std_loss = 0.3153:   8%|▊         | 419/5475 [1:53:01<22:37:48, 16.11s/it]  

ratio tensor(1.8699, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5304, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.1802, device='cuda:0')
reward_raw: tensor(0.5451, device='cuda:0') reward: tensor(1.3627, device='cuda:0') factor: tensor(0.1823, device='cuda:0')
reward_raw: tensor(0.5383, device='cuda:0') reward: tensor(1.3467, device='cuda:0') factor: tensor(0.1916, device='cuda:0')
reward_raw: tensor(0.5632, device='cuda:0') reward: tensor(1.3772, device='cuda:0') factor: tensor(0.1860, device='cuda:0')
reward_raw: tensor(0.5140, device='cuda:0') reward: tensor(1.3258, device='cuda:0') factor: tensor(0.1882, device='cuda:0')


Epoch 1, Step 420: Loss = 0.1277, Actor Loss = 0.0816, Critic Loss = 0.0925, Entropy = 0.1754, Returns = 1.3584, Value = 1.2502, mask_loss = 0.2678std_loss = 0.3669:   8%|▊         | 420/5475 [1:53:16<22:28:11, 16.00s/it]

ratio tensor(2.1755, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5006, device='cuda:0') reward: tensor(1.2576, device='cuda:0') factor: tensor(0.2430, device='cuda:0')
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.2715, device='cuda:0') factor: tensor(0.2281, device='cuda:0')
reward_raw: tensor(0.5071, device='cuda:0') reward: tensor(1.2793, device='cuda:0') factor: tensor(0.2278, device='cuda:0')
reward_raw: tensor(0.4813, device='cuda:0') reward: tensor(1.2491, device='cuda:0') factor: tensor(0.2321, device='cuda:0')
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.2746, device='cuda:0') factor: tensor(0.2369, device='cuda:0')


Epoch 1, Step 421: Loss = 0.0798, Actor Loss = 0.0012, Critic Loss = 0.1576, Entropy = 0.1933, Returns = 1.2220, Value = 1.2531, mask_loss = 0.2569std_loss = 0.3502:   8%|▊         | 420/5475 [1:53:40<22:28:11, 16.00s/it]

ratio tensor(1.3705, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 421: Loss = 0.0798, Actor Loss = 0.0012, Critic Loss = 0.1576, Entropy = 0.1933, Returns = 1.2220, Value = 1.2531, mask_loss = 0.2569std_loss = 0.3502:   8%|▊         | 421/5475 [1:53:40<25:48:13, 18.38s/it]

reward_raw: tensor(0.6110, device='cuda:0') reward: tensor(1.3154, device='cuda:0') factor: tensor(0.2956, device='cuda:0')
reward_raw: tensor(0.5867, device='cuda:0') reward: tensor(1.2965, device='cuda:0') factor: tensor(0.2902, device='cuda:0')
reward_raw: tensor(0.5939, device='cuda:0') reward: tensor(1.2954, device='cuda:0') factor: tensor(0.2985, device='cuda:0')
reward_raw: tensor(0.6028, device='cuda:0') reward: tensor(1.3115, device='cuda:0') factor: tensor(0.2913, device='cuda:0')
reward_raw: tensor(0.6113, device='cuda:0') reward: tensor(1.3144, device='cuda:0') factor: tensor(0.2969, device='cuda:0')


Epoch 1, Step 422: Loss = -0.1513, Actor Loss = -0.1777, Critic Loss = 0.0531, Entropy = 0.1736, Returns = 1.3087, Value = 1.3065, mask_loss = 0.2886std_loss = 0.3753:   8%|▊         | 422/5475 [1:53:54<23:39:13, 16.85s/it]

ratio tensor(1.9909, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5885, device='cuda:0') reward: tensor(1.3268, device='cuda:0') factor: tensor(0.2617, device='cuda:0')
reward_raw: tensor(0.5710, device='cuda:0') reward: tensor(1.3139, device='cuda:0') factor: tensor(0.2572, device='cuda:0')
reward_raw: tensor(0.5864, device='cuda:0') reward: tensor(1.3292, device='cuda:0') factor: tensor(0.2572, device='cuda:0')
reward_raw: tensor(0.5960, device='cuda:0') reward: tensor(1.3398, device='cuda:0') factor: tensor(0.2562, device='cuda:0')
reward_raw: tensor(0.5916, device='cuda:0') reward: tensor(1.3382, device='cuda:0') factor: tensor(0.2534, device='cuda:0')


Epoch 1, Step 423: Loss = 0.0646, Actor Loss = 0.0362, Critic Loss = 0.0571, Entropy = 0.1733, Returns = 1.3769, Value = 1.3308, mask_loss = 0.2455std_loss = 0.3570:   8%|▊         | 423/5475 [1:54:08<22:33:20, 16.07s/it]  

ratio tensor(0.8690, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5530, device='cuda:0') reward: tensor(1.3527, device='cuda:0') factor: tensor(0.2003, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.3178, device='cuda:0') factor: tensor(0.2030, device='cuda:0')
reward_raw: tensor(0.5384, device='cuda:0') reward: tensor(1.3299, device='cuda:0') factor: tensor(0.2085, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.3212, device='cuda:0') factor: tensor(0.1989, device='cuda:0')
reward_raw: tensor(0.5524, device='cuda:0') reward: tensor(1.3429, device='cuda:0') factor: tensor(0.2095, device='cuda:0')


Epoch 1, Step 424: Loss = -0.0715, Actor Loss = -0.1966, Critic Loss = 0.2505, Entropy = 0.1230, Returns = 1.3511, Value = 1.3989, mask_loss = 0.1748std_loss = 0.3270:   8%|▊         | 424/5475 [1:54:24<22:36:40, 16.12s/it]

ratio tensor(2.7916, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5691, device='cuda:0') reward: tensor(1.3125, device='cuda:0') factor: tensor(0.2565, device='cuda:0')
reward_raw: tensor(0.5676, device='cuda:0') reward: tensor(1.3075, device='cuda:0') factor: tensor(0.2601, device='cuda:0')
reward_raw: tensor(0.5622, device='cuda:0') reward: tensor(1.2976, device='cuda:0') factor: tensor(0.2646, device='cuda:0')
reward_raw: tensor(0.5931, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.2576, device='cuda:0')
reward_raw: tensor(0.5769, device='cuda:0') reward: tensor(1.3104, device='cuda:0') factor: tensor(0.2665, device='cuda:0')


Epoch 1, Step 425: Loss = 0.2203, Actor Loss = 0.1757, Critic Loss = 0.0895, Entropy = 0.1836, Returns = 1.3644, Value = 1.3378, mask_loss = 0.2495std_loss = 0.3517:   8%|▊         | 425/5475 [1:54:37<21:25:13, 15.27s/it]  

ratio tensor(1.0647, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6329, device='cuda:0') reward: tensor(1.3542, device='cuda:0') factor: tensor(0.2787, device='cuda:0')
reward_raw: tensor(0.6383, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.2763, device='cuda:0')
reward_raw: tensor(0.6349, device='cuda:0') reward: tensor(1.3438, device='cuda:0') factor: tensor(0.2911, device='cuda:0')
reward_raw: tensor(0.6250, device='cuda:0') reward: tensor(1.3434, device='cuda:0') factor: tensor(0.2816, device='cuda:0')
reward_raw: tensor(0.6633, device='cuda:0') reward: tensor(1.3882, device='cuda:0') factor: tensor(0.2751, device='cuda:0')


Epoch 1, Step 426: Loss = 0.0141, Actor Loss = -0.0182, Critic Loss = 0.0650, Entropy = 0.1945, Returns = 1.3563, Value = 1.2991, mask_loss = 0.2878std_loss = 0.3605:   8%|▊         | 426/5475 [1:54:57<23:21:11, 16.65s/it]

ratio tensor(1.5263, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6242, device='cuda:0') reward: tensor(1.3270, device='cuda:0') factor: tensor(0.2972, device='cuda:0')
reward_raw: tensor(0.6412, device='cuda:0') reward: tensor(1.3462, device='cuda:0') factor: tensor(0.2950, device='cuda:0')
reward_raw: tensor(0.6342, device='cuda:0') reward: tensor(1.3353, device='cuda:0') factor: tensor(0.2989, device='cuda:0')
reward_raw: tensor(0.6304, device='cuda:0') reward: tensor(1.3390, device='cuda:0') factor: tensor(0.2913, device='cuda:0')
reward_raw: tensor(0.6249, device='cuda:0') reward: tensor(1.3299, device='cuda:0') factor: tensor(0.2950, device='cuda:0')


Epoch 1, Step 427: Loss = -0.1865, Actor Loss = -0.2207, Critic Loss = 0.0689, Entropy = 0.1837, Returns = 1.3349, Value = 1.2840, mask_loss = 0.2721std_loss = 0.3578:   8%|▊         | 427/5475 [1:55:16<24:23:57, 17.40s/it]

ratio tensor(2.7014, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5600, device='cuda:0') reward: tensor(1.2512, device='cuda:0') factor: tensor(0.3088, device='cuda:0')
reward_raw: tensor(0.5585, device='cuda:0') reward: tensor(1.2462, device='cuda:0') factor: tensor(0.3124, device='cuda:0')
reward_raw: tensor(0.5424, device='cuda:0') reward: tensor(1.2401, device='cuda:0') factor: tensor(0.3023, device='cuda:0')
reward_raw: tensor(0.5435, device='cuda:0') reward: tensor(1.2346, device='cuda:0') factor: tensor(0.3089, device='cuda:0')
reward_raw: tensor(0.5488, device='cuda:0') reward: tensor(1.2399, device='cuda:0') factor: tensor(0.3089, device='cuda:0')


Epoch 1, Step 428: Loss = 0.0933, Actor Loss = 0.0574, Critic Loss = 0.0723, Entropy = 0.1910, Returns = 1.2932, Value = 1.2516, mask_loss = 0.2594std_loss = 0.3484:   8%|▊         | 428/5475 [1:55:32<23:42:11, 16.91s/it]  

ratio tensor(0.7190, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6274, device='cuda:0') reward: tensor(1.3513, device='cuda:0') factor: tensor(0.2760, device='cuda:0')
reward_raw: tensor(0.6114, device='cuda:0') reward: tensor(1.3386, device='cuda:0') factor: tensor(0.2728, device='cuda:0')
reward_raw: tensor(0.6405, device='cuda:0') reward: tensor(1.3654, device='cuda:0') factor: tensor(0.2751, device='cuda:0')
reward_raw: tensor(0.6138, device='cuda:0') reward: tensor(1.3377, device='cuda:0') factor: tensor(0.2761, device='cuda:0')
reward_raw: tensor(0.6400, device='cuda:0') reward: tensor(1.3627, device='cuda:0') factor: tensor(0.2773, device='cuda:0')


Epoch 1, Step 429: Loss = -0.0328, Actor Loss = -0.0541, Critic Loss = 0.0429, Entropy = 0.1933, Returns = 1.3925, Value = 1.3692, mask_loss = 0.2904std_loss = 0.3781:   8%|▊         | 429/5475 [1:55:51<24:43:24, 17.64s/it]

ratio tensor(0.9992, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5626, device='cuda:0') reward: tensor(1.3422, device='cuda:0') factor: tensor(0.2205, device='cuda:0')
reward_raw: tensor(0.5372, device='cuda:0') reward: tensor(1.3229, device='cuda:0') factor: tensor(0.2143, device='cuda:0')
reward_raw: tensor(0.5438, device='cuda:0') reward: tensor(1.3336, device='cuda:0') factor: tensor(0.2102, device='cuda:0')
reward_raw: tensor(0.5528, device='cuda:0') reward: tensor(1.3308, device='cuda:0') factor: tensor(0.2220, device='cuda:0')
reward_raw: tensor(0.5409, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.2167, device='cuda:0')


Epoch 1, Step 430: Loss = 0.0777, Actor Loss = 0.0392, Critic Loss = 0.0773, Entropy = 0.1748, Returns = 1.2598, Value = 1.3542, mask_loss = 0.2253std_loss = 0.3496:   8%|▊         | 430/5475 [1:56:08<24:16:46, 17.33s/it]  

ratio tensor(0.5741, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5385, device='cuda:0') reward: tensor(1.3091, device='cuda:0') factor: tensor(0.2293, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.3124, device='cuda:0') factor: tensor(0.2291, device='cuda:0')
reward_raw: tensor(0.5393, device='cuda:0') reward: tensor(1.3118, device='cuda:0') factor: tensor(0.2275, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.3136, device='cuda:0') factor: tensor(0.2295, device='cuda:0')
reward_raw: tensor(0.5808, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.2378, device='cuda:0')


Epoch 1, Step 431: Loss = 0.0190, Actor Loss = -0.0217, Critic Loss = 0.0817, Entropy = 0.1502, Returns = 1.3245, Value = 1.2939, mask_loss = 0.2108std_loss = 0.3269:   8%|▊         | 431/5475 [1:56:25<24:06:53, 17.21s/it]

ratio tensor(3.2720, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5852, device='cuda:0') reward: tensor(1.3254, device='cuda:0') factor: tensor(0.2598, device='cuda:0')
reward_raw: tensor(0.5984, device='cuda:0') reward: tensor(1.3328, device='cuda:0') factor: tensor(0.2657, device='cuda:0')
reward_raw: tensor(0.5787, device='cuda:0') reward: tensor(1.3133, device='cuda:0') factor: tensor(0.2654, device='cuda:0')
reward_raw: tensor(0.6005, device='cuda:0') reward: tensor(1.3485, device='cuda:0') factor: tensor(0.2520, device='cuda:0')
reward_raw: tensor(0.5820, device='cuda:0') reward: tensor(1.3234, device='cuda:0') factor: tensor(0.2587, device='cuda:0')


Epoch 1, Step 432: Loss = 0.0670, Actor Loss = 0.0421, Critic Loss = 0.0501, Entropy = 0.1736, Returns = 1.3495, Value = 1.3770, mask_loss = 0.2423std_loss = 0.3632:   8%|▊         | 432/5475 [1:56:39<22:52:45, 16.33s/it] 

ratio tensor(0.6361, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6228, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.3071, device='cuda:0')
reward_raw: tensor(0.6295, device='cuda:0') reward: tensor(1.3239, device='cuda:0') factor: tensor(0.3056, device='cuda:0')
reward_raw: tensor(0.6190, device='cuda:0') reward: tensor(1.3168, device='cuda:0') factor: tensor(0.3022, device='cuda:0')
reward_raw: tensor(0.6124, device='cuda:0') reward: tensor(1.3046, device='cuda:0') factor: tensor(0.3079, device='cuda:0')
reward_raw: tensor(0.6449, device='cuda:0') reward: tensor(1.3331, device='cuda:0') factor: tensor(0.3118, device='cuda:0')


Epoch 1, Step 433: Loss = 0.1858, Actor Loss = 0.1135, Critic Loss = 0.1449, Entropy = 0.2028, Returns = 1.3671, Value = 1.4603, mask_loss = 0.2886std_loss = 0.3658:   8%|▊         | 433/5475 [1:56:58<23:51:08, 17.03s/it]

ratio tensor(2.5031, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5556, device='cuda:0') reward: tensor(1.2973, device='cuda:0') factor: tensor(0.2583, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.2926, device='cuda:0') factor: tensor(0.2594, device='cuda:0')
reward_raw: tensor(0.5631, device='cuda:0') reward: tensor(1.3057, device='cuda:0') factor: tensor(0.2575, device='cuda:0')
reward_raw: tensor(0.5447, device='cuda:0') reward: tensor(1.2930, device='cuda:0') factor: tensor(0.2517, device='cuda:0')
reward_raw: tensor(0.5510, device='cuda:0') reward: tensor(1.2949, device='cuda:0') factor: tensor(0.2561, device='cuda:0')


Epoch 1, Step 434: Loss = 0.1119, Actor Loss = 0.0834, Critic Loss = 0.0572, Entropy = 0.1411, Returns = 1.3520, Value = 1.3404, mask_loss = 0.1941std_loss = 0.3245:   8%|▊         | 434/5475 [1:57:13<22:54:05, 16.36s/it]

ratio tensor(0.0088, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5318, device='cuda:0') reward: tensor(1.3699, device='cuda:0') factor: tensor(0.1619, device='cuda:0')
reward_raw: tensor(0.5053, device='cuda:0') reward: tensor(1.3384, device='cuda:0') factor: tensor(0.1669, device='cuda:0')
reward_raw: tensor(0.5366, device='cuda:0') reward: tensor(1.3746, device='cuda:0') factor: tensor(0.1620, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.3443, device='cuda:0') factor: tensor(0.1640, device='cuda:0')
reward_raw: tensor(0.5108, device='cuda:0') reward: tensor(1.3449, device='cuda:0') factor: tensor(0.1659, device='cuda:0')


Epoch 1, Step 435: Loss = 0.0574, Actor Loss = 0.0270, Critic Loss = 0.0611, Entropy = 0.1231, Returns = 1.4037, Value = 1.4022, mask_loss = 0.1684std_loss = 0.3081:   8%|▊         | 435/5475 [1:57:30<23:13:39, 16.59s/it]

ratio tensor(0.4480, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5453, device='cuda:0') reward: tensor(1.3626, device='cuda:0') factor: tensor(0.1827, device='cuda:0')
reward_raw: tensor(0.5509, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.1892, device='cuda:0')
reward_raw: tensor(0.5526, device='cuda:0') reward: tensor(1.3665, device='cuda:0') factor: tensor(0.1862, device='cuda:0')
reward_raw: tensor(0.5529, device='cuda:0') reward: tensor(1.3643, device='cuda:0') factor: tensor(0.1886, device='cuda:0')
reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.3672, device='cuda:0') factor: tensor(0.1814, device='cuda:0')


Epoch 1, Step 436: Loss = 0.0165, Actor Loss = -0.0374, Critic Loss = 0.1081, Entropy = 0.1275, Returns = 1.3077, Value = 1.4296, mask_loss = 0.1817std_loss = 0.3329:   8%|▊         | 436/5475 [1:57:46<23:12:52, 16.59s/it]

ratio tensor(1.6481, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4787, device='cuda:0') reward: tensor(1.2859, device='cuda:0') factor: tensor(0.1927, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.2909, device='cuda:0') factor: tensor(0.1870, device='cuda:0')
reward_raw: tensor(0.4630, device='cuda:0') reward: tensor(1.2766, device='cuda:0') factor: tensor(0.1864, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.2987, device='cuda:0') factor: tensor(0.1869, device='cuda:0')
reward_raw: tensor(0.5010, device='cuda:0') reward: tensor(1.3097, device='cuda:0') factor: tensor(0.1913, device='cuda:0')


Epoch 1, Step 437: Loss = 0.0602, Actor Loss = -0.0085, Critic Loss = 0.1377, Entropy = 0.1142, Returns = 1.2186, Value = 1.1888, mask_loss = 0.1894std_loss = 0.3419:   8%|▊         | 437/5475 [1:58:06<24:17:43, 17.36s/it]

ratio tensor(1.1760, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5923, device='cuda:0') reward: tensor(1.4244, device='cuda:0') factor: tensor(0.1679, device='cuda:0')
reward_raw: tensor(0.5915, device='cuda:0') reward: tensor(1.4269, device='cuda:0') factor: tensor(0.1645, device='cuda:0')
reward_raw: tensor(0.5981, device='cuda:0') reward: tensor(1.4336, device='cuda:0') factor: tensor(0.1645, device='cuda:0')
reward_raw: tensor(0.5869, device='cuda:0') reward: tensor(1.4173, device='cuda:0') factor: tensor(0.1696, device='cuda:0')
reward_raw: tensor(0.5819, device='cuda:0') reward: tensor(1.4184, device='cuda:0') factor: tensor(0.1635, device='cuda:0')


Epoch 1, Step 438: Loss = 0.0870, Actor Loss = 0.0227, Critic Loss = 0.1288, Entropy = 0.1260, Returns = 1.3973, Value = 1.4697, mask_loss = 0.1836std_loss = 0.3324:   8%|▊         | 438/5475 [1:58:22<23:53:09, 17.07s/it] 

ratio tensor(0.9081, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5663, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.5471, device='cuda:0') reward: tensor(1.3414, device='cuda:0') factor: tensor(0.2057, device='cuda:0')
reward_raw: tensor(0.5452, device='cuda:0') reward: tensor(1.3412, device='cuda:0') factor: tensor(0.2040, device='cuda:0')
reward_raw: tensor(0.5371, device='cuda:0') reward: tensor(1.3330, device='cuda:0') factor: tensor(0.2041, device='cuda:0')
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.3358, device='cuda:0') factor: tensor(0.2145, device='cuda:0')


Epoch 1, Step 439: Loss = 0.0815, Actor Loss = 0.0403, Critic Loss = 0.0826, Entropy = 0.1125, Returns = 1.3216, Value = 1.3579, mask_loss = 0.1823std_loss = 0.3375:   8%|▊         | 439/5475 [1:58:38<23:37:24, 16.89s/it]

ratio tensor(1.2085, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5268, device='cuda:0') reward: tensor(1.3174, device='cuda:0') factor: tensor(0.2093, device='cuda:0')
reward_raw: tensor(0.5267, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.2105, device='cuda:0')
reward_raw: tensor(0.5182, device='cuda:0') reward: tensor(1.3082, device='cuda:0') factor: tensor(0.2099, device='cuda:0')
reward_raw: tensor(0.5198, device='cuda:0') reward: tensor(1.3092, device='cuda:0') factor: tensor(0.2106, device='cuda:0')
reward_raw: tensor(0.4920, device='cuda:0') reward: tensor(1.2862, device='cuda:0') factor: tensor(0.2058, device='cuda:0')


Epoch 1, Step 440: Loss = 0.0794, Actor Loss = 0.0158, Critic Loss = 0.1274, Entropy = 0.1128, Returns = 1.3251, Value = 1.3317, mask_loss = 0.1814std_loss = 0.3347:   8%|▊         | 440/5475 [1:58:55<23:18:10, 16.66s/it]

ratio tensor(0.9657, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.2327, device='cuda:0') factor: tensor(0.2664, device='cuda:0')
reward_raw: tensor(0.5266, device='cuda:0') reward: tensor(1.2577, device='cuda:0') factor: tensor(0.2689, device='cuda:0')
reward_raw: tensor(0.5257, device='cuda:0') reward: tensor(1.2655, device='cuda:0') factor: tensor(0.2602, device='cuda:0')
reward_raw: tensor(0.5167, device='cuda:0') reward: tensor(1.2549, device='cuda:0') factor: tensor(0.2617, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.2573, device='cuda:0') factor: tensor(0.2634, device='cuda:0')


Epoch 1, Step 441: Loss = -0.1438, Actor Loss = -0.2406, Critic Loss = 0.1939, Entropy = 0.1810, Returns = 1.3351, Value = 1.0615, mask_loss = 0.2459std_loss = 0.3523:   8%|▊         | 440/5475 [1:59:07<23:18:10, 16.66s/it]

ratio tensor(1.6201, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 441: Loss = -0.1438, Actor Loss = -0.2406, Critic Loss = 0.1939, Entropy = 0.1810, Returns = 1.3351, Value = 1.0615, mask_loss = 0.2459std_loss = 0.3523:   8%|▊         | 441/5475 [1:59:07<21:28:40, 15.36s/it]

reward_raw: tensor(0.4873, device='cuda:0') reward: tensor(1.2973, device='cuda:0') factor: tensor(0.1900, device='cuda:0')
reward_raw: tensor(0.4877, device='cuda:0') reward: tensor(1.2862, device='cuda:0') factor: tensor(0.2015, device='cuda:0')
reward_raw: tensor(0.4994, device='cuda:0') reward: tensor(1.2982, device='cuda:0') factor: tensor(0.2012, device='cuda:0')
reward_raw: tensor(0.4936, device='cuda:0') reward: tensor(1.3038, device='cuda:0') factor: tensor(0.1898, device='cuda:0')
reward_raw: tensor(0.4702, device='cuda:0') reward: tensor(1.2800, device='cuda:0') factor: tensor(0.1903, device='cuda:0')


Epoch 1, Step 442: Loss = 0.0084, Actor Loss = -0.0279, Critic Loss = 0.0730, Entropy = 0.1263, Returns = 1.3011, Value = 1.2640, mask_loss = 0.1957std_loss = 0.3397:   8%|▊         | 442/5475 [1:59:22<21:15:54, 15.21s/it] 

ratio tensor(0.8181, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5749, device='cuda:0') reward: tensor(1.3523, device='cuda:0') factor: tensor(0.2226, device='cuda:0')
reward_raw: tensor(0.5697, device='cuda:0') reward: tensor(1.3382, device='cuda:0') factor: tensor(0.2315, device='cuda:0')
reward_raw: tensor(0.5361, device='cuda:0') reward: tensor(1.3130, device='cuda:0') factor: tensor(0.2230, device='cuda:0')
reward_raw: tensor(0.5629, device='cuda:0') reward: tensor(1.3393, device='cuda:0') factor: tensor(0.2236, device='cuda:0')
reward_raw: tensor(0.5552, device='cuda:0') reward: tensor(1.3290, device='cuda:0') factor: tensor(0.2262, device='cuda:0')


Epoch 1, Step 443: Loss = -0.0782, Actor Loss = -0.1310, Critic Loss = 0.1059, Entropy = 0.1535, Returns = 1.3944, Value = 1.3607, mask_loss = 0.2460std_loss = 0.3581:   8%|▊         | 443/5475 [1:59:36<20:44:31, 14.84s/it]

ratio tensor(1.0429, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5471, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.2348, device='cuda:0')
reward_raw: tensor(0.5306, device='cuda:0') reward: tensor(1.2891, device='cuda:0') factor: tensor(0.2415, device='cuda:0')
reward_raw: tensor(0.5724, device='cuda:0') reward: tensor(1.3391, device='cuda:0') factor: tensor(0.2333, device='cuda:0')
reward_raw: tensor(0.5489, device='cuda:0') reward: tensor(1.3106, device='cuda:0') factor: tensor(0.2384, device='cuda:0')
reward_raw: tensor(0.5377, device='cuda:0') reward: tensor(1.2976, device='cuda:0') factor: tensor(0.2400, device='cuda:0')


Epoch 1, Step 444: Loss = -0.0662, Actor Loss = -0.0910, Critic Loss = 0.0499, Entropy = 0.1139, Returns = 1.3928, Value = 1.3227, mask_loss = 0.1925std_loss = 0.3438:   8%|▊         | 444/5475 [1:59:52<21:16:49, 15.23s/it]

ratio tensor(1.6300, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5664, device='cuda:0') reward: tensor(1.3419, device='cuda:0') factor: tensor(0.2245, device='cuda:0')
reward_raw: tensor(0.5545, device='cuda:0') reward: tensor(1.3171, device='cuda:0') factor: tensor(0.2373, device='cuda:0')
reward_raw: tensor(0.5640, device='cuda:0') reward: tensor(1.3346, device='cuda:0') factor: tensor(0.2294, device='cuda:0')
reward_raw: tensor(0.5750, device='cuda:0') reward: tensor(1.3387, device='cuda:0') factor: tensor(0.2363, device='cuda:0')
reward_raw: tensor(0.5656, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.2302, device='cuda:0')


Epoch 1, Step 445: Loss = -0.0357, Actor Loss = -0.0640, Critic Loss = 0.0568, Entropy = 0.1206, Returns = 1.3865, Value = 1.2545, mask_loss = 0.1910std_loss = 0.3349:   8%|▊         | 445/5475 [2:00:07<21:16:42, 15.23s/it]

ratio tensor(1.0394, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.3073, device='cuda:0') factor: tensor(0.2280, device='cuda:0')
reward_raw: tensor(0.5682, device='cuda:0') reward: tensor(1.3341, device='cuda:0') factor: tensor(0.2341, device='cuda:0')
reward_raw: tensor(0.5389, device='cuda:0') reward: tensor(1.3070, device='cuda:0') factor: tensor(0.2319, device='cuda:0')
reward_raw: tensor(0.5542, device='cuda:0') reward: tensor(1.3278, device='cuda:0') factor: tensor(0.2264, device='cuda:0')
reward_raw: tensor(0.5428, device='cuda:0') reward: tensor(1.3175, device='cuda:0') factor: tensor(0.2253, device='cuda:0')


Epoch 1, Step 446: Loss = -0.1787, Actor Loss = -0.2514, Critic Loss = 0.1456, Entropy = 0.1102, Returns = 1.3830, Value = 1.3455, mask_loss = 0.1867std_loss = 0.3452:   8%|▊         | 446/5475 [2:00:21<20:35:42, 14.74s/it]

ratio tensor(1.2530, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5766, device='cuda:0') reward: tensor(1.3784, device='cuda:0') factor: tensor(0.1981, device='cuda:0')
reward_raw: tensor(0.5742, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.1968, device='cuda:0')
reward_raw: tensor(0.5656, device='cuda:0') reward: tensor(1.3667, device='cuda:0') factor: tensor(0.1989, device='cuda:0')
reward_raw: tensor(0.5806, device='cuda:0') reward: tensor(1.3837, device='cuda:0') factor: tensor(0.1970, device='cuda:0')
reward_raw: tensor(0.5792, device='cuda:0') reward: tensor(1.3745, device='cuda:0') factor: tensor(0.2047, device='cuda:0')


Epoch 1, Step 447: Loss = -0.0382, Actor Loss = -0.0955, Critic Loss = 0.1148, Entropy = 0.1035, Returns = 1.4040, Value = 1.3878, mask_loss = 0.1768std_loss = 0.3361:   8%|▊         | 447/5475 [2:00:37<21:05:31, 15.10s/it]

ratio tensor(0.8969, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5465, device='cuda:0') reward: tensor(1.3084, device='cuda:0') factor: tensor(0.2381, device='cuda:0')
reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(1.3311, device='cuda:0') factor: tensor(0.2410, device='cuda:0')
reward_raw: tensor(0.5522, device='cuda:0') reward: tensor(1.3118, device='cuda:0') factor: tensor(0.2403, device='cuda:0')
reward_raw: tensor(0.5468, device='cuda:0') reward: tensor(1.3042, device='cuda:0') factor: tensor(0.2426, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.3031, device='cuda:0') factor: tensor(0.2386, device='cuda:0')


Epoch 1, Step 448: Loss = 0.0277, Actor Loss = -0.0553, Critic Loss = 0.1665, Entropy = 0.1878, Returns = 1.3219, Value = 1.2392, mask_loss = 0.2556std_loss = 0.3581:   8%|▊         | 448/5475 [2:00:52<21:21:43, 15.30s/it] 

ratio tensor(1.2354, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5526, device='cuda:0') reward: tensor(1.3341, device='cuda:0') factor: tensor(0.2185, device='cuda:0')
reward_raw: tensor(0.5462, device='cuda:0') reward: tensor(1.3223, device='cuda:0') factor: tensor(0.2239, device='cuda:0')
reward_raw: tensor(0.5618, device='cuda:0') reward: tensor(1.3446, device='cuda:0') factor: tensor(0.2173, device='cuda:0')
reward_raw: tensor(0.5360, device='cuda:0') reward: tensor(1.3164, device='cuda:0') factor: tensor(0.2196, device='cuda:0')
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.3301, device='cuda:0') factor: tensor(0.2119, device='cuda:0')


Epoch 1, Step 449: Loss = 0.0792, Actor Loss = 0.0221, Critic Loss = 0.1146, Entropy = 0.1686, Returns = 1.4160, Value = 1.2659, mask_loss = 0.2418std_loss = 0.3497:   8%|▊         | 449/5475 [2:01:06<20:44:19, 14.85s/it] 

ratio tensor(0.7651, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5468, device='cuda:0') reward: tensor(1.3226, device='cuda:0') factor: tensor(0.2242, device='cuda:0')
reward_raw: tensor(0.5440, device='cuda:0') reward: tensor(1.3190, device='cuda:0') factor: tensor(0.2250, device='cuda:0')
reward_raw: tensor(0.5484, device='cuda:0') reward: tensor(1.3243, device='cuda:0') factor: tensor(0.2241, device='cuda:0')
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.3036, device='cuda:0') factor: tensor(0.2317, device='cuda:0')
reward_raw: tensor(0.5393, device='cuda:0') reward: tensor(1.3074, device='cuda:0') factor: tensor(0.2319, device='cuda:0')


Epoch 1, Step 450: Loss = -0.0832, Actor Loss = -0.1676, Critic Loss = 0.1691, Entropy = 0.1355, Returns = 1.4004, Value = 1.3212, mask_loss = 0.2114std_loss = 0.3489:   8%|▊         | 450/5475 [2:01:19<19:48:51, 14.20s/it]

ratio tensor(1.5449, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5940, device='cuda:0') reward: tensor(1.3655, device='cuda:0') factor: tensor(0.2285, device='cuda:0')
reward_raw: tensor(0.6128, device='cuda:0') reward: tensor(1.3817, device='cuda:0') factor: tensor(0.2311, device='cuda:0')
reward_raw: tensor(0.6328, device='cuda:0') reward: tensor(1.3996, device='cuda:0') factor: tensor(0.2331, device='cuda:0')
reward_raw: tensor(0.6157, device='cuda:0') reward: tensor(1.3854, device='cuda:0') factor: tensor(0.2303, device='cuda:0')
reward_raw: tensor(0.6068, device='cuda:0') reward: tensor(1.3757, device='cuda:0') factor: tensor(0.2311, device='cuda:0')


Epoch 1, Step 451: Loss = -0.0414, Actor Loss = -0.0915, Critic Loss = 0.1005, Entropy = 0.1636, Returns = 1.4062, Value = 1.5610, mask_loss = 0.2454std_loss = 0.3399:   8%|▊         | 451/5475 [2:01:33<19:38:47, 14.08s/it]

ratio tensor(1.6092, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4957, device='cuda:0') reward: tensor(1.2768, device='cuda:0') factor: tensor(0.2189, device='cuda:0')
reward_raw: tensor(0.5180, device='cuda:0') reward: tensor(1.2955, device='cuda:0') factor: tensor(0.2225, device='cuda:0')
reward_raw: tensor(0.5075, device='cuda:0') reward: tensor(1.2752, device='cuda:0') factor: tensor(0.2323, device='cuda:0')
reward_raw: tensor(0.4836, device='cuda:0') reward: tensor(1.2576, device='cuda:0') factor: tensor(0.2260, device='cuda:0')
reward_raw: tensor(0.5013, device='cuda:0') reward: tensor(1.2766, device='cuda:0') factor: tensor(0.2247, device='cuda:0')


Epoch 1, Step 452: Loss = 0.1717, Actor Loss = 0.0798, Critic Loss = 0.1841, Entropy = 0.1318, Returns = 1.3204, Value = 1.2547, mask_loss = 0.2316std_loss = 0.3383:   8%|▊         | 452/5475 [2:01:50<21:08:52, 15.16s/it]  

ratio tensor(0.9763, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5378, device='cuda:0') reward: tensor(1.2852, device='cuda:0') factor: tensor(0.2526, device='cuda:0')
reward_raw: tensor(0.5584, device='cuda:0') reward: tensor(1.2992, device='cuda:0') factor: tensor(0.2592, device='cuda:0')
reward_raw: tensor(0.5645, device='cuda:0') reward: tensor(1.3098, device='cuda:0') factor: tensor(0.2547, device='cuda:0')
reward_raw: tensor(0.5450, device='cuda:0') reward: tensor(1.2910, device='cuda:0') factor: tensor(0.2540, device='cuda:0')
reward_raw: tensor(0.5490, device='cuda:0') reward: tensor(1.3013, device='cuda:0') factor: tensor(0.2477, device='cuda:0')


Epoch 1, Step 453: Loss = 0.1618, Actor Loss = 0.1070, Critic Loss = 0.1099, Entropy = 0.1770, Returns = 1.2818, Value = 1.2494, mask_loss = 0.3022std_loss = 0.3482:   8%|▊         | 453/5475 [2:02:05<20:45:37, 14.88s/it]

ratio tensor(1.0086, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4666, device='cuda:0') reward: tensor(1.2418, device='cuda:0') factor: tensor(0.2248, device='cuda:0')
reward_raw: tensor(0.5257, device='cuda:0') reward: tensor(1.3026, device='cuda:0') factor: tensor(0.2231, device='cuda:0')
reward_raw: tensor(0.4950, device='cuda:0') reward: tensor(1.2752, device='cuda:0') factor: tensor(0.2198, device='cuda:0')
reward_raw: tensor(0.5071, device='cuda:0') reward: tensor(1.2886, device='cuda:0') factor: tensor(0.2185, device='cuda:0')
reward_raw: tensor(0.5183, device='cuda:0') reward: tensor(1.2963, device='cuda:0') factor: tensor(0.2219, device='cuda:0')


Epoch 1, Step 454: Loss = -0.2418, Actor Loss = -0.2843, Critic Loss = 0.0851, Entropy = 0.1211, Returns = 1.2893, Value = 1.2740, mask_loss = 0.2000std_loss = 0.3372:   8%|▊         | 454/5475 [2:02:20<20:55:28, 15.00s/it]

ratio tensor(1.6297, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5654, device='cuda:0') reward: tensor(1.3508, device='cuda:0') factor: tensor(0.2146, device='cuda:0')
reward_raw: tensor(0.5647, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.2116, device='cuda:0')
reward_raw: tensor(0.5484, device='cuda:0') reward: tensor(1.3289, device='cuda:0') factor: tensor(0.2195, device='cuda:0')
reward_raw: tensor(0.5421, device='cuda:0') reward: tensor(1.3287, device='cuda:0') factor: tensor(0.2135, device='cuda:0')
reward_raw: tensor(0.5599, device='cuda:0') reward: tensor(1.3425, device='cuda:0') factor: tensor(0.2174, device='cuda:0')


Epoch 1, Step 455: Loss = -0.1918, Actor Loss = -0.2413, Critic Loss = 0.0994, Entropy = 0.1480, Returns = 1.3185, Value = 1.2569, mask_loss = 0.2333std_loss = 0.3528:   8%|▊         | 455/5475 [2:02:33<20:10:15, 14.47s/it]

ratio tensor(1.7950, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5538, device='cuda:0') reward: tensor(1.3014, device='cuda:0') factor: tensor(0.2524, device='cuda:0')
reward_raw: tensor(0.5602, device='cuda:0') reward: tensor(1.3052, device='cuda:0') factor: tensor(0.2550, device='cuda:0')
reward_raw: tensor(0.5499, device='cuda:0') reward: tensor(1.2920, device='cuda:0') factor: tensor(0.2579, device='cuda:0')
reward_raw: tensor(0.5482, device='cuda:0') reward: tensor(1.2929, device='cuda:0') factor: tensor(0.2553, device='cuda:0')
reward_raw: tensor(0.5537, device='cuda:0') reward: tensor(1.3021, device='cuda:0') factor: tensor(0.2516, device='cuda:0')


Epoch 1, Step 456: Loss = 0.3718, Actor Loss = 0.3267, Critic Loss = 0.0907, Entropy = 0.1661, Returns = 1.2321, Value = 1.3668, mask_loss = 0.2566std_loss = 0.3579:   8%|▊         | 456/5475 [2:02:53<22:28:16, 16.12s/it]  

ratio tensor(0.5760, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5839, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.2299, device='cuda:0')
reward_raw: tensor(0.5854, device='cuda:0') reward: tensor(1.3466, device='cuda:0') factor: tensor(0.2387, device='cuda:0')
reward_raw: tensor(0.5845, device='cuda:0') reward: tensor(1.3445, device='cuda:0') factor: tensor(0.2400, device='cuda:0')
reward_raw: tensor(0.5645, device='cuda:0') reward: tensor(1.3280, device='cuda:0') factor: tensor(0.2365, device='cuda:0')
reward_raw: tensor(0.5880, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.2450, device='cuda:0')


Epoch 1, Step 457: Loss = 0.0273, Actor Loss = -0.0815, Critic Loss = 0.2177, Entropy = 0.1433, Returns = 1.3780, Value = 1.1112, mask_loss = 0.2123std_loss = 0.3527:   8%|▊         | 457/5475 [2:03:07<21:21:19, 15.32s/it]

ratio tensor(1.1199, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.2363, device='cuda:0') factor: tensor(0.2597, device='cuda:0')
reward_raw: tensor(0.4950, device='cuda:0') reward: tensor(1.2403, device='cuda:0') factor: tensor(0.2547, device='cuda:0')
reward_raw: tensor(0.5043, device='cuda:0') reward: tensor(1.2416, device='cuda:0') factor: tensor(0.2627, device='cuda:0')
reward_raw: tensor(0.4867, device='cuda:0') reward: tensor(1.2335, device='cuda:0') factor: tensor(0.2532, device='cuda:0')
reward_raw: tensor(0.4700, device='cuda:0') reward: tensor(1.2129, device='cuda:0') factor: tensor(0.2571, device='cuda:0')


Epoch 1, Step 458: Loss = -0.0835, Actor Loss = -0.1779, Critic Loss = 0.1890, Entropy = 0.1652, Returns = 1.2940, Value = 1.1166, mask_loss = 0.2517std_loss = 0.3534:   8%|▊         | 458/5475 [2:03:21<21:02:04, 15.09s/it]

ratio tensor(0.7736, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5795, device='cuda:0') reward: tensor(1.3005, device='cuda:0') factor: tensor(0.2790, device='cuda:0')
reward_raw: tensor(0.5631, device='cuda:0') reward: tensor(1.2793, device='cuda:0') factor: tensor(0.2839, device='cuda:0')
reward_raw: tensor(0.5765, device='cuda:0') reward: tensor(1.2991, device='cuda:0') factor: tensor(0.2774, device='cuda:0')
reward_raw: tensor(0.5335, device='cuda:0') reward: tensor(1.2534, device='cuda:0') factor: tensor(0.2800, device='cuda:0')
reward_raw: tensor(0.5688, device='cuda:0') reward: tensor(1.2831, device='cuda:0') factor: tensor(0.2857, device='cuda:0')


Epoch 1, Step 459: Loss = 0.0825, Actor Loss = 0.0474, Critic Loss = 0.0707, Entropy = 0.2025, Returns = 1.2241, Value = 1.3638, mask_loss = 0.4147std_loss = 0.3605:   8%|▊         | 459/5475 [2:03:36<21:04:15, 15.12s/it]  

ratio tensor(1.3292, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5579, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.2337, device='cuda:0')
reward_raw: tensor(0.5185, device='cuda:0') reward: tensor(1.2855, device='cuda:0') factor: tensor(0.2329, device='cuda:0')
reward_raw: tensor(0.5546, device='cuda:0') reward: tensor(1.3235, device='cuda:0') factor: tensor(0.2310, device='cuda:0')
reward_raw: tensor(0.5489, device='cuda:0') reward: tensor(1.3081, device='cuda:0') factor: tensor(0.2407, device='cuda:0')
reward_raw: tensor(0.5495, device='cuda:0') reward: tensor(1.3135, device='cuda:0') factor: tensor(0.2359, device='cuda:0')


Epoch 1, Step 460: Loss = 0.0510, Actor Loss = 0.0204, Critic Loss = 0.0615, Entropy = 0.1805, Returns = 1.2571, Value = 1.2573, mask_loss = 0.2534std_loss = 0.3541:   8%|▊         | 460/5475 [2:03:55<22:30:18, 16.16s/it]

ratio tensor(0.8273, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5924, device='cuda:0') reward: tensor(1.3340, device='cuda:0') factor: tensor(0.2584, device='cuda:0')
reward_raw: tensor(0.5775, device='cuda:0') reward: tensor(1.3119, device='cuda:0') factor: tensor(0.2656, device='cuda:0')
reward_raw: tensor(0.5672, device='cuda:0') reward: tensor(1.3077, device='cuda:0') factor: tensor(0.2596, device='cuda:0')
reward_raw: tensor(0.5619, device='cuda:0') reward: tensor(1.3093, device='cuda:0') factor: tensor(0.2526, device='cuda:0')
reward_raw: tensor(0.5717, device='cuda:0') reward: tensor(1.3163, device='cuda:0') factor: tensor(0.2554, device='cuda:0')


Epoch 1, Step 461: Loss = 0.0301, Actor Loss = -0.0327, Critic Loss = 0.1259, Entropy = 0.1875, Returns = 1.3873, Value = 1.3088, mask_loss = 0.2641std_loss = 0.3640:   8%|▊         | 460/5475 [2:04:10<22:30:18, 16.16s/it]

ratio tensor(1.4783, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 461: Loss = 0.0301, Actor Loss = -0.0327, Critic Loss = 0.1259, Entropy = 0.1875, Returns = 1.3873, Value = 1.3088, mask_loss = 0.2641std_loss = 0.3640:   8%|▊         | 461/5475 [2:04:10<22:10:38, 15.92s/it]

reward_raw: tensor(0.5259, device='cuda:0') reward: tensor(1.2835, device='cuda:0') factor: tensor(0.2424, device='cuda:0')
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.2916, device='cuda:0') factor: tensor(0.2429, device='cuda:0')
reward_raw: tensor(0.5037, device='cuda:0') reward: tensor(1.2644, device='cuda:0') factor: tensor(0.2393, device='cuda:0')
reward_raw: tensor(0.4990, device='cuda:0') reward: tensor(1.2571, device='cuda:0') factor: tensor(0.2419, device='cuda:0')
reward_raw: tensor(0.5135, device='cuda:0') reward: tensor(1.2666, device='cuda:0') factor: tensor(0.2470, device='cuda:0')


Epoch 1, Step 462: Loss = -0.1169, Actor Loss = -0.1869, Critic Loss = 0.1402, Entropy = 0.1531, Returns = 1.3445, Value = 1.1939, mask_loss = 0.2209std_loss = 0.3405:   8%|▊         | 462/5475 [2:04:28<22:55:24, 16.46s/it]

ratio tensor(2.2925, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5961, device='cuda:0') reward: tensor(1.3810, device='cuda:0') factor: tensor(0.2151, device='cuda:0')
reward_raw: tensor(0.6243, device='cuda:0') reward: tensor(1.4005, device='cuda:0') factor: tensor(0.2239, device='cuda:0')
reward_raw: tensor(0.6056, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.2184, device='cuda:0')
reward_raw: tensor(0.6364, device='cuda:0') reward: tensor(1.4082, device='cuda:0') factor: tensor(0.2282, device='cuda:0')
reward_raw: tensor(0.6159, device='cuda:0') reward: tensor(1.3969, device='cuda:0') factor: tensor(0.2190, device='cuda:0')


Epoch 1, Step 463: Loss = 0.0254, Actor Loss = -0.0486, Critic Loss = 0.1483, Entropy = 0.1534, Returns = 1.3910, Value = 1.4705, mask_loss = 0.2282std_loss = 0.3568:   8%|▊         | 463/5475 [2:04:47<23:52:38, 17.15s/it] 

ratio tensor(1.7387, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5199, device='cuda:0') reward: tensor(1.3391, device='cuda:0') factor: tensor(0.1808, device='cuda:0')
reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.3504, device='cuda:0') factor: tensor(0.1803, device='cuda:0')
reward_raw: tensor(0.5399, device='cuda:0') reward: tensor(1.3572, device='cuda:0') factor: tensor(0.1827, device='cuda:0')
reward_raw: tensor(0.5280, device='cuda:0') reward: tensor(1.3495, device='cuda:0') factor: tensor(0.1785, device='cuda:0')
reward_raw: tensor(0.5464, device='cuda:0') reward: tensor(1.3686, device='cuda:0') factor: tensor(0.1778, device='cuda:0')


Epoch 1, Step 464: Loss = -0.0240, Actor Loss = -0.0664, Critic Loss = 0.0850, Entropy = 0.1207, Returns = 1.4038, Value = 1.3971, mask_loss = 0.1972std_loss = 0.3456:   8%|▊         | 464/5475 [2:05:00<22:06:15, 15.88s/it]

ratio tensor(1.4798, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5883, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.2351, device='cuda:0')
reward_raw: tensor(0.5737, device='cuda:0') reward: tensor(1.3394, device='cuda:0') factor: tensor(0.2343, device='cuda:0')
reward_raw: tensor(0.6013, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.2395, device='cuda:0')
reward_raw: tensor(0.6064, device='cuda:0') reward: tensor(1.3737, device='cuda:0') factor: tensor(0.2326, device='cuda:0')
reward_raw: tensor(0.5892, device='cuda:0') reward: tensor(1.3582, device='cuda:0') factor: tensor(0.2310, device='cuda:0')


Epoch 1, Step 465: Loss = 0.1310, Actor Loss = 0.0790, Critic Loss = 0.1043, Entropy = 0.1404, Returns = 1.4340, Value = 1.3379, mask_loss = 0.2280std_loss = 0.3572:   8%|▊         | 465/5475 [2:05:14<21:39:23, 15.56s/it]  

ratio tensor(2.5550, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.2607, device='cuda:0') factor: tensor(0.2355, device='cuda:0')
reward_raw: tensor(0.5169, device='cuda:0') reward: tensor(1.2752, device='cuda:0') factor: tensor(0.2417, device='cuda:0')
reward_raw: tensor(0.5208, device='cuda:0') reward: tensor(1.2800, device='cuda:0') factor: tensor(0.2408, device='cuda:0')
reward_raw: tensor(0.5245, device='cuda:0') reward: tensor(1.2887, device='cuda:0') factor: tensor(0.2358, device='cuda:0')
reward_raw: tensor(0.4941, device='cuda:0') reward: tensor(1.2539, device='cuda:0') factor: tensor(0.2401, device='cuda:0')


Epoch 1, Step 466: Loss = 0.2730, Actor Loss = 0.2148, Critic Loss = 0.1166, Entropy = 0.1684, Returns = 1.1609, Value = 1.3900, mask_loss = 0.3203std_loss = 0.3760:   9%|▊         | 466/5475 [2:05:27<20:27:10, 14.70s/it]

ratio tensor(0.8535, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5777, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.2354, device='cuda:0')
reward_raw: tensor(0.6010, device='cuda:0') reward: tensor(1.3677, device='cuda:0') factor: tensor(0.2333, device='cuda:0')
reward_raw: tensor(0.5658, device='cuda:0') reward: tensor(1.3351, device='cuda:0') factor: tensor(0.2307, device='cuda:0')
reward_raw: tensor(0.5583, device='cuda:0') reward: tensor(1.3252, device='cuda:0') factor: tensor(0.2331, device='cuda:0')
reward_raw: tensor(0.5667, device='cuda:0') reward: tensor(1.3288, device='cuda:0') factor: tensor(0.2379, device='cuda:0')


Epoch 1, Step 467: Loss = 0.0704, Actor Loss = 0.0483, Critic Loss = 0.0445, Entropy = 0.1303, Returns = 1.3147, Value = 1.3990, mask_loss = 0.2300std_loss = 0.3736:   9%|▊         | 467/5475 [2:05:46<22:18:05, 16.03s/it]

ratio tensor(0.9296, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6033, device='cuda:0') reward: tensor(1.3687, device='cuda:0') factor: tensor(0.2347, device='cuda:0')
reward_raw: tensor(0.6638, device='cuda:0') reward: tensor(1.4281, device='cuda:0') factor: tensor(0.2357, device='cuda:0')
reward_raw: tensor(0.6368, device='cuda:0') reward: tensor(1.4035, device='cuda:0') factor: tensor(0.2333, device='cuda:0')
reward_raw: tensor(0.6508, device='cuda:0') reward: tensor(1.4186, device='cuda:0') factor: tensor(0.2322, device='cuda:0')
reward_raw: tensor(0.6550, device='cuda:0') reward: tensor(1.4208, device='cuda:0') factor: tensor(0.2341, device='cuda:0')


Epoch 1, Step 468: Loss = -0.0483, Actor Loss = -0.1048, Critic Loss = 0.1133, Entropy = 0.1630, Returns = 1.4012, Value = 1.3613, mask_loss = 0.2536std_loss = 0.3663:   9%|▊         | 468/5475 [2:06:03<22:27:55, 16.15s/it]

ratio tensor(1.1773, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5679, device='cuda:0') reward: tensor(1.3262, device='cuda:0') factor: tensor(0.2417, device='cuda:0')
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.3225, device='cuda:0') factor: tensor(0.2366, device='cuda:0')
reward_raw: tensor(0.5671, device='cuda:0') reward: tensor(1.3344, device='cuda:0') factor: tensor(0.2328, device='cuda:0')
reward_raw: tensor(0.5616, device='cuda:0') reward: tensor(1.3252, device='cuda:0') factor: tensor(0.2364, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.3173, device='cuda:0') factor: tensor(0.2348, device='cuda:0')


Epoch 1, Step 469: Loss = 0.1163, Actor Loss = 0.0675, Critic Loss = 0.0979, Entropy = 0.1263, Returns = 1.2954, Value = 1.2570, mask_loss = 0.1995std_loss = 0.3258:   9%|▊         | 469/5475 [2:06:18<22:09:28, 15.93s/it]  

ratio tensor(0.8271, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.2971, device='cuda:0') factor: tensor(0.2262, device='cuda:0')
reward_raw: tensor(0.4870, device='cuda:0') reward: tensor(1.2679, device='cuda:0') factor: tensor(0.2191, device='cuda:0')
reward_raw: tensor(0.5097, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.2243, device='cuda:0')
reward_raw: tensor(0.5184, device='cuda:0') reward: tensor(1.2938, device='cuda:0') factor: tensor(0.2245, device='cuda:0')
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.2808, device='cuda:0') factor: tensor(0.2225, device='cuda:0')


Epoch 1, Step 470: Loss = 0.0026, Actor Loss = -0.0607, Critic Loss = 0.1270, Entropy = 0.1526, Returns = 1.3060, Value = 1.1360, mask_loss = 0.2303std_loss = 0.3605:   9%|▊         | 470/5475 [2:06:37<23:24:02, 16.83s/it]

ratio tensor(1.1963, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5995, device='cuda:0') reward: tensor(1.3736, device='cuda:0') factor: tensor(0.2259, device='cuda:0')
reward_raw: tensor(0.5920, device='cuda:0') reward: tensor(1.3602, device='cuda:0') factor: tensor(0.2319, device='cuda:0')
reward_raw: tensor(0.5681, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.2298, device='cuda:0')
reward_raw: tensor(0.5745, device='cuda:0') reward: tensor(1.3478, device='cuda:0') factor: tensor(0.2267, device='cuda:0')
reward_raw: tensor(0.6271, device='cuda:0') reward: tensor(1.4028, device='cuda:0') factor: tensor(0.2243, device='cuda:0')


Epoch 1, Step 471: Loss = -0.1230, Actor Loss = -0.1813, Critic Loss = 0.1170, Entropy = 0.1602, Returns = 1.4420, Value = 1.4238, mask_loss = 0.2453std_loss = 0.3648:   9%|▊         | 471/5475 [2:06:55<23:56:54, 17.23s/it]

ratio tensor(1.6221, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.6093, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.2561, device='cuda:0')
reward_raw: tensor(0.5806, device='cuda:0') reward: tensor(1.3283, device='cuda:0') factor: tensor(0.2523, device='cuda:0')
reward_raw: tensor(0.5785, device='cuda:0') reward: tensor(1.3230, device='cuda:0') factor: tensor(0.2554, device='cuda:0')
reward_raw: tensor(0.6071, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.2532, device='cuda:0')
reward_raw: tensor(0.6080, device='cuda:0') reward: tensor(1.3573, device='cuda:0') factor: tensor(0.2507, device='cuda:0')


Epoch 1, Step 472: Loss = -0.0256, Actor Loss = -0.0699, Critic Loss = 0.0888, Entropy = 0.1040, Returns = 1.4258, Value = 1.3368, mask_loss = 0.1804std_loss = 0.3295:   9%|▊         | 472/5475 [2:07:11<23:12:39, 16.70s/it]

ratio tensor(1.5521, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5840, device='cuda:0') reward: tensor(1.3538, device='cuda:0') factor: tensor(0.2302, device='cuda:0')
reward_raw: tensor(0.6150, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.2286, device='cuda:0')
reward_raw: tensor(0.6025, device='cuda:0') reward: tensor(1.3598, device='cuda:0') factor: tensor(0.2428, device='cuda:0')
reward_raw: tensor(0.5809, device='cuda:0') reward: tensor(1.3500, device='cuda:0') factor: tensor(0.2309, device='cuda:0')
reward_raw: tensor(0.6057, device='cuda:0') reward: tensor(1.3636, device='cuda:0') factor: tensor(0.2421, device='cuda:0')


Epoch 1, Step 473: Loss = 0.1108, Actor Loss = 0.0718, Critic Loss = 0.0783, Entropy = 0.1523, Returns = 1.3708, Value = 1.3275, mask_loss = 0.2595std_loss = 0.3690:   9%|▊         | 473/5475 [2:07:30<24:08:49, 17.38s/it]  

ratio tensor(1.1864, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5268, device='cuda:0') reward: tensor(1.3007, device='cuda:0') factor: tensor(0.2260, device='cuda:0')
reward_raw: tensor(0.5107, device='cuda:0') reward: tensor(1.2859, device='cuda:0') factor: tensor(0.2248, device='cuda:0')
reward_raw: tensor(0.5231, device='cuda:0') reward: tensor(1.3007, device='cuda:0') factor: tensor(0.2224, device='cuda:0')
reward_raw: tensor(0.4987, device='cuda:0') reward: tensor(1.2757, device='cuda:0') factor: tensor(0.2229, device='cuda:0')
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.2714, device='cuda:0') factor: tensor(0.2246, device='cuda:0')


Epoch 1, Step 474: Loss = 0.1504, Actor Loss = 0.0895, Critic Loss = 0.1221, Entropy = 0.1143, Returns = 1.2940, Value = 1.3565, mask_loss = 0.1949std_loss = 0.3509:   9%|▊         | 474/5475 [2:07:44<22:58:27, 16.54s/it]

ratio tensor(1.0908, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6402, device='cuda:0') reward: tensor(1.4019, device='cuda:0') factor: tensor(0.2383, device='cuda:0')
reward_raw: tensor(0.6412, device='cuda:0') reward: tensor(1.4031, device='cuda:0') factor: tensor(0.2381, device='cuda:0')
reward_raw: tensor(0.6550, device='cuda:0') reward: tensor(1.4136, device='cuda:0') factor: tensor(0.2414, device='cuda:0')
reward_raw: tensor(0.6358, device='cuda:0') reward: tensor(1.3965, device='cuda:0') factor: tensor(0.2393, device='cuda:0')
reward_raw: tensor(0.6480, device='cuda:0') reward: tensor(1.4156, device='cuda:0') factor: tensor(0.2324, device='cuda:0')


Epoch 1, Step 475: Loss = -0.2625, Actor Loss = -0.2921, Critic Loss = 0.0596, Entropy = 0.1663, Returns = 1.4553, Value = 1.4433, mask_loss = 0.2309std_loss = 0.3492:   9%|▊         | 475/5475 [2:08:03<24:04:25, 17.33s/it]

ratio tensor(2.3819, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5012, device='cuda:0') reward: tensor(1.3167, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.5226, device='cuda:0') reward: tensor(1.3300, device='cuda:0') factor: tensor(0.1926, device='cuda:0')
reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.1986, device='cuda:0')
reward_raw: tensor(0.5091, device='cuda:0') reward: tensor(1.3106, device='cuda:0') factor: tensor(0.1985, device='cuda:0')
reward_raw: tensor(0.5069, device='cuda:0') reward: tensor(1.3062, device='cuda:0') factor: tensor(0.2007, device='cuda:0')


Epoch 1, Step 476: Loss = 0.4686, Actor Loss = 0.3892, Critic Loss = 0.1589, Entropy = 0.1138, Returns = 1.2362, Value = 1.4354, mask_loss = 0.1909std_loss = 0.3407:   9%|▊         | 476/5475 [2:08:21<24:03:50, 17.33s/it]  

ratio tensor(0.3125, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5832, device='cuda:0') reward: tensor(1.3675, device='cuda:0') factor: tensor(0.2157, device='cuda:0')
reward_raw: tensor(0.6039, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.2169, device='cuda:0')
reward_raw: tensor(0.5859, device='cuda:0') reward: tensor(1.3764, device='cuda:0') factor: tensor(0.2095, device='cuda:0')
reward_raw: tensor(0.5944, device='cuda:0') reward: tensor(1.3746, device='cuda:0') factor: tensor(0.2198, device='cuda:0')
reward_raw: tensor(0.5814, device='cuda:0') reward: tensor(1.3595, device='cuda:0') factor: tensor(0.2219, device='cuda:0')


Epoch 1, Step 477: Loss = -0.0946, Actor Loss = -0.1380, Critic Loss = 0.0872, Entropy = 0.1231, Returns = 1.3678, Value = 1.3493, mask_loss = 0.1885std_loss = 0.3380:   9%|▊         | 477/5475 [2:08:34<22:23:26, 16.13s/it]

ratio tensor(1.2082, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5212, device='cuda:0') reward: tensor(1.3070, device='cuda:0') factor: tensor(0.2141, device='cuda:0')
reward_raw: tensor(0.5504, device='cuda:0') reward: tensor(1.3349, device='cuda:0') factor: tensor(0.2155, device='cuda:0')
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.3377, device='cuda:0') factor: tensor(0.2134, device='cuda:0')
reward_raw: tensor(0.5310, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.2183, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.2078, device='cuda:0')


Epoch 1, Step 478: Loss = 0.0766, Actor Loss = -0.0526, Critic Loss = 0.2588, Entropy = 0.1252, Returns = 1.3494, Value = 1.5842, mask_loss = 0.1949std_loss = 0.3382:   9%|▊         | 478/5475 [2:08:49<21:49:51, 15.73s/it] 

ratio tensor(1.4002, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5243, device='cuda:0') reward: tensor(1.3194, device='cuda:0') factor: tensor(0.2048, device='cuda:0')
reward_raw: tensor(0.5048, device='cuda:0') reward: tensor(1.3003, device='cuda:0') factor: tensor(0.2045, device='cuda:0')
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.3084, device='cuda:0') factor: tensor(0.2088, device='cuda:0')
reward_raw: tensor(0.5140, device='cuda:0') reward: tensor(1.3072, device='cuda:0') factor: tensor(0.2068, device='cuda:0')
reward_raw: tensor(0.5169, device='cuda:0') reward: tensor(1.3102, device='cuda:0') factor: tensor(0.2067, device='cuda:0')


Epoch 1, Step 479: Loss = 0.3195, Actor Loss = 0.2229, Critic Loss = 0.1934, Entropy = 0.1412, Returns = 1.2018, Value = 1.2589, mask_loss = 0.2020std_loss = 0.3385:   9%|▊         | 479/5475 [2:09:04<21:43:08, 15.65s/it] 

ratio tensor(0.9606, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6056, device='cuda:0') reward: tensor(1.3810, device='cuda:0') factor: tensor(0.2246, device='cuda:0')
reward_raw: tensor(0.6049, device='cuda:0') reward: tensor(1.3863, device='cuda:0') factor: tensor(0.2186, device='cuda:0')
reward_raw: tensor(0.5908, device='cuda:0') reward: tensor(1.3714, device='cuda:0') factor: tensor(0.2193, device='cuda:0')
reward_raw: tensor(0.6152, device='cuda:0') reward: tensor(1.3882, device='cuda:0') factor: tensor(0.2270, device='cuda:0')
reward_raw: tensor(0.6101, device='cuda:0') reward: tensor(1.3802, device='cuda:0') factor: tensor(0.2299, device='cuda:0')


Epoch 1, Step 480: Loss = 0.1411, Actor Loss = 0.0991, Critic Loss = 0.0844, Entropy = 0.1454, Returns = 1.3498, Value = 1.3996, mask_loss = 0.2279std_loss = 0.3354:   9%|▉         | 480/5475 [2:09:19<21:28:35, 15.48s/it]

ratio tensor(0.9719, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5843, device='cuda:0') reward: tensor(1.3360, device='cuda:0') factor: tensor(0.2483, device='cuda:0')
reward_raw: tensor(0.6311, device='cuda:0') reward: tensor(1.3817, device='cuda:0') factor: tensor(0.2494, device='cuda:0')
reward_raw: tensor(0.5958, device='cuda:0') reward: tensor(1.3555, device='cuda:0') factor: tensor(0.2404, device='cuda:0')
reward_raw: tensor(0.6120, device='cuda:0') reward: tensor(1.3665, device='cuda:0') factor: tensor(0.2455, device='cuda:0')
reward_raw: tensor(0.6145, device='cuda:0') reward: tensor(1.3690, device='cuda:0') factor: tensor(0.2454, device='cuda:0')


Epoch 1, Step 481: Loss = -0.2098, Actor Loss = -0.2284, Critic Loss = 0.0374, Entropy = 0.1454, Returns = 1.3776, Value = 1.3717, mask_loss = 0.2345std_loss = 0.3602:   9%|▉         | 480/5475 [2:09:34<21:28:35, 15.48s/it]

ratio tensor(1.6187, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 481: Loss = -0.2098, Actor Loss = -0.2284, Critic Loss = 0.0374, Entropy = 0.1454, Returns = 1.3776, Value = 1.3717, mask_loss = 0.2345std_loss = 0.3602:   9%|▉         | 481/5475 [2:09:35<21:23:03, 15.42s/it]

reward_raw: tensor(0.5248, device='cuda:0') reward: tensor(1.3120, device='cuda:0') factor: tensor(0.2127, device='cuda:0')
reward_raw: tensor(0.5272, device='cuda:0') reward: tensor(1.3204, device='cuda:0') factor: tensor(0.2068, device='cuda:0')
reward_raw: tensor(0.5171, device='cuda:0') reward: tensor(1.3137, device='cuda:0') factor: tensor(0.2034, device='cuda:0')
reward_raw: tensor(0.5035, device='cuda:0') reward: tensor(1.2956, device='cuda:0') factor: tensor(0.2079, device='cuda:0')
reward_raw: tensor(0.5001, device='cuda:0') reward: tensor(1.2938, device='cuda:0') factor: tensor(0.2063, device='cuda:0')


Epoch 1, Step 482: Loss = 0.2885, Actor Loss = 0.2418, Critic Loss = 0.0936, Entropy = 0.1239, Returns = 1.2641, Value = 1.3360, mask_loss = 0.2085std_loss = 0.3525:   9%|▉         | 482/5475 [2:09:51<21:39:36, 15.62s/it]  

ratio tensor(0.6301, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5898, device='cuda:0') reward: tensor(1.3819, device='cuda:0') factor: tensor(0.2079, device='cuda:0')
reward_raw: tensor(0.5828, device='cuda:0') reward: tensor(1.3808, device='cuda:0') factor: tensor(0.2021, device='cuda:0')
reward_raw: tensor(0.5940, device='cuda:0') reward: tensor(1.3889, device='cuda:0') factor: tensor(0.2052, device='cuda:0')
reward_raw: tensor(0.5986, device='cuda:0') reward: tensor(1.3918, device='cuda:0') factor: tensor(0.2068, device='cuda:0')
reward_raw: tensor(0.6159, device='cuda:0') reward: tensor(1.4114, device='cuda:0') factor: tensor(0.2045, device='cuda:0')


Epoch 1, Step 483: Loss = 0.1194, Actor Loss = 0.0962, Critic Loss = 0.0467, Entropy = 0.1343, Returns = 1.3378, Value = 1.4191, mask_loss = 0.2242std_loss = 0.3648:   9%|▉         | 483/5475 [2:10:04<20:32:18, 14.81s/it]

ratio tensor(0.5772, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5381, device='cuda:0') reward: tensor(1.3457, device='cuda:0') factor: tensor(0.1924, device='cuda:0')
reward_raw: tensor(0.5448, device='cuda:0') reward: tensor(1.3538, device='cuda:0') factor: tensor(0.1910, device='cuda:0')
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.3559, device='cuda:0') factor: tensor(0.1952, device='cuda:0')
reward_raw: tensor(0.5399, device='cuda:0') reward: tensor(1.3440, device='cuda:0') factor: tensor(0.1959, device='cuda:0')
reward_raw: tensor(0.5381, device='cuda:0') reward: tensor(1.3416, device='cuda:0') factor: tensor(0.1965, device='cuda:0')


Epoch 1, Step 484: Loss = 0.0532, Actor Loss = -0.0114, Critic Loss = 0.1293, Entropy = 0.1212, Returns = 1.3592, Value = 1.3927, mask_loss = 0.1977std_loss = 0.3426:   9%|▉         | 484/5475 [2:10:19<20:45:44, 14.98s/it]

ratio tensor(1.3758, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5461, device='cuda:0') reward: tensor(1.3573, device='cuda:0') factor: tensor(0.1888, device='cuda:0')
reward_raw: tensor(0.5577, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.1919, device='cuda:0')
reward_raw: tensor(0.6017, device='cuda:0') reward: tensor(1.4060, device='cuda:0') factor: tensor(0.1957, device='cuda:0')
reward_raw: tensor(0.5599, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1952, device='cuda:0')
reward_raw: tensor(0.5859, device='cuda:0') reward: tensor(1.3881, device='cuda:0') factor: tensor(0.1978, device='cuda:0')


Epoch 1, Step 485: Loss = 0.1292, Actor Loss = 0.0552, Critic Loss = 0.1484, Entropy = 0.1169, Returns = 1.3577, Value = 1.4179, mask_loss = 0.1777std_loss = 0.3268:   9%|▉         | 485/5475 [2:10:39<23:01:16, 16.61s/it] 

ratio tensor(855.7263, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5595, device='cuda:0') reward: tensor(1.3501, device='cuda:0') factor: tensor(0.2094, device='cuda:0')
reward_raw: tensor(0.5474, device='cuda:0') reward: tensor(1.3480, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5308, device='cuda:0') reward: tensor(1.3311, device='cuda:0') factor: tensor(0.1998, device='cuda:0')
reward_raw: tensor(0.5336, device='cuda:0') reward: tensor(1.3266, device='cuda:0') factor: tensor(0.2070, device='cuda:0')
reward_raw: tensor(0.5574, device='cuda:0') reward: tensor(1.3484, device='cuda:0') factor: tensor(0.2091, device='cuda:0')


Epoch 1, Step 486: Loss = 0.0779, Actor Loss = 0.0047, Critic Loss = 0.1466, Entropy = 0.1337, Returns = 1.3032, Value = 1.2299, mask_loss = 0.2140std_loss = 0.3489:   9%|▉         | 486/5475 [2:10:53<21:32:49, 15.55s/it]

ratio tensor(1.7894, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5411, device='cuda:0') reward: tensor(1.3209, device='cuda:0') factor: tensor(0.2202, device='cuda:0')
reward_raw: tensor(0.5796, device='cuda:0') reward: tensor(1.3623, device='cuda:0') factor: tensor(0.2173, device='cuda:0')
reward_raw: tensor(0.5590, device='cuda:0') reward: tensor(1.3380, device='cuda:0') factor: tensor(0.2210, device='cuda:0')
reward_raw: tensor(0.5582, device='cuda:0') reward: tensor(1.3298, device='cuda:0') factor: tensor(0.2284, device='cuda:0')
reward_raw: tensor(0.5625, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.2304, device='cuda:0')


Epoch 1, Step 487: Loss = -0.0685, Actor Loss = -0.1060, Critic Loss = 0.0752, Entropy = 0.1276, Returns = 1.3717, Value = 1.3002, mask_loss = 0.2203std_loss = 0.3623:   9%|▉         | 487/5475 [2:11:05<20:27:11, 14.76s/it]

ratio tensor(1.2081, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5997, device='cuda:0') reward: tensor(1.4109, device='cuda:0') factor: tensor(0.1888, device='cuda:0')
reward_raw: tensor(0.6166, device='cuda:0') reward: tensor(1.4242, device='cuda:0') factor: tensor(0.1924, device='cuda:0')
reward_raw: tensor(0.5965, device='cuda:0') reward: tensor(1.4023, device='cuda:0') factor: tensor(0.1942, device='cuda:0')
reward_raw: tensor(0.5943, device='cuda:0') reward: tensor(1.3949, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5922, device='cuda:0') reward: tensor(1.3995, device='cuda:0') factor: tensor(0.1927, device='cuda:0')


Epoch 1, Step 488: Loss = 0.0130, Actor Loss = -0.0251, Critic Loss = 0.0765, Entropy = 0.1231, Returns = 1.3613, Value = 1.3519, mask_loss = 0.1875std_loss = 0.3379:   9%|▉         | 488/5475 [2:11:26<23:03:22, 16.64s/it] 

ratio tensor(1.5238, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5173, device='cuda:0') reward: tensor(1.3200, device='cuda:0') factor: tensor(0.1973, device='cuda:0')
reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.2029, device='cuda:0')
reward_raw: tensor(0.5135, device='cuda:0') reward: tensor(1.3189, device='cuda:0') factor: tensor(0.1946, device='cuda:0')
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.2962, device='cuda:0') factor: tensor(0.2034, device='cuda:0')
reward_raw: tensor(0.5049, device='cuda:0') reward: tensor(1.3070, device='cuda:0') factor: tensor(0.1980, device='cuda:0')


Epoch 1, Step 489: Loss = 0.1121, Actor Loss = 0.0790, Critic Loss = 0.0664, Entropy = 0.1017, Returns = 1.2978, Value = 1.2646, mask_loss = 0.2017std_loss = 0.3545:   9%|▉         | 489/5475 [2:11:40<21:48:25, 15.75s/it] 

ratio tensor(1.2358, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4384, device='cuda:0') reward: tensor(1.2795, device='cuda:0') factor: tensor(0.1588, device='cuda:0')
reward_raw: tensor(0.4694, device='cuda:0') reward: tensor(1.3068, device='cuda:0') factor: tensor(0.1626, device='cuda:0')
reward_raw: tensor(0.4364, device='cuda:0') reward: tensor(1.2826, device='cuda:0') factor: tensor(0.1538, device='cuda:0')
reward_raw: tensor(0.4671, device='cuda:0') reward: tensor(1.3129, device='cuda:0') factor: tensor(0.1542, device='cuda:0')
reward_raw: tensor(0.4663, device='cuda:0') reward: tensor(1.3110, device='cuda:0') factor: tensor(0.1553, device='cuda:0')


Epoch 1, Step 490: Loss = 0.0639, Actor Loss = 0.0142, Critic Loss = 0.0996, Entropy = 0.0893, Returns = 1.2964, Value = 1.4489, mask_loss = 0.1423std_loss = 0.3044:   9%|▉         | 490/5475 [2:12:00<23:21:31, 16.87s/it]

ratio tensor(1.1370, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6681, device='cuda:0') reward: tensor(1.4522, device='cuda:0') factor: tensor(0.2159, device='cuda:0')
reward_raw: tensor(0.6677, device='cuda:0') reward: tensor(1.4475, device='cuda:0') factor: tensor(0.2202, device='cuda:0')
reward_raw: tensor(0.6783, device='cuda:0') reward: tensor(1.4576, device='cuda:0') factor: tensor(0.2207, device='cuda:0')
reward_raw: tensor(0.6876, device='cuda:0') reward: tensor(1.4725, device='cuda:0') factor: tensor(0.2151, device='cuda:0')
reward_raw: tensor(0.6836, device='cuda:0') reward: tensor(1.4613, device='cuda:0') factor: tensor(0.2223, device='cuda:0')


Epoch 1, Step 491: Loss = -0.1305, Actor Loss = -0.1588, Critic Loss = 0.0568, Entropy = 0.1155, Returns = 1.5336, Value = 1.5686, mask_loss = 0.2420std_loss = 0.3790:   9%|▉         | 491/5475 [2:12:17<23:33:52, 17.02s/it]

ratio tensor(1.2211, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5453, device='cuda:0') reward: tensor(1.3377, device='cuda:0') factor: tensor(0.2076, device='cuda:0')
reward_raw: tensor(0.5285, device='cuda:0') reward: tensor(1.3186, device='cuda:0') factor: tensor(0.2100, device='cuda:0')
reward_raw: tensor(0.5409, device='cuda:0') reward: tensor(1.3363, device='cuda:0') factor: tensor(0.2046, device='cuda:0')
reward_raw: tensor(0.5668, device='cuda:0') reward: tensor(1.3587, device='cuda:0') factor: tensor(0.2081, device='cuda:0')
reward_raw: tensor(0.5448, device='cuda:0') reward: tensor(1.3399, device='cuda:0') factor: tensor(0.2049, device='cuda:0')


Epoch 1, Step 492: Loss = 0.1559, Actor Loss = 0.1241, Critic Loss = 0.0638, Entropy = 0.1187, Returns = 1.3242, Value = 1.2730, mask_loss = 0.2139std_loss = 0.3548:   9%|▉         | 492/5475 [2:12:34<23:30:39, 16.99s/it]  

ratio tensor(1.6899, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5931, device='cuda:0') reward: tensor(1.3768, device='cuda:0') factor: tensor(0.2163, device='cuda:0')
reward_raw: tensor(0.5949, device='cuda:0') reward: tensor(1.3848, device='cuda:0') factor: tensor(0.2101, device='cuda:0')
reward_raw: tensor(0.5976, device='cuda:0') reward: tensor(1.3832, device='cuda:0') factor: tensor(0.2143, device='cuda:0')
reward_raw: tensor(0.5732, device='cuda:0') reward: tensor(1.3578, device='cuda:0') factor: tensor(0.2154, device='cuda:0')
reward_raw: tensor(0.5921, device='cuda:0') reward: tensor(1.3822, device='cuda:0') factor: tensor(0.2099, device='cuda:0')


Epoch 1, Step 493: Loss = 0.1135, Actor Loss = 0.0841, Critic Loss = 0.0590, Entropy = 0.1116, Returns = 1.3958, Value = 1.3302, mask_loss = 0.2448std_loss = 0.3715:   9%|▉         | 493/5475 [2:12:49<22:37:49, 16.35s/it]

ratio tensor(0.7698, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4977, device='cuda:0') reward: tensor(1.2915, device='cuda:0') factor: tensor(0.2062, device='cuda:0')
reward_raw: tensor(0.4743, device='cuda:0') reward: tensor(1.2678, device='cuda:0') factor: tensor(0.2065, device='cuda:0')
reward_raw: tensor(0.4828, device='cuda:0') reward: tensor(1.2666, device='cuda:0') factor: tensor(0.2162, device='cuda:0')
reward_raw: tensor(0.4853, device='cuda:0') reward: tensor(1.2699, device='cuda:0') factor: tensor(0.2155, device='cuda:0')
reward_raw: tensor(0.5006, device='cuda:0') reward: tensor(1.2934, device='cuda:0') factor: tensor(0.2072, device='cuda:0')


Epoch 1, Step 494: Loss = -0.1785, Actor Loss = -0.2092, Critic Loss = 0.0617, Entropy = 0.1204, Returns = 1.2403, Value = 1.3341, mask_loss = 0.2428std_loss = 0.3682:   9%|▉         | 494/5475 [2:13:07<23:22:25, 16.89s/it]

ratio tensor(1.6485, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4849, device='cuda:0') reward: tensor(1.3001, device='cuda:0') factor: tensor(0.1848, device='cuda:0')
reward_raw: tensor(0.4984, device='cuda:0') reward: tensor(1.3120, device='cuda:0') factor: tensor(0.1864, device='cuda:0')
reward_raw: tensor(0.4981, device='cuda:0') reward: tensor(1.3073, device='cuda:0') factor: tensor(0.1908, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3020, device='cuda:0') factor: tensor(0.1852, device='cuda:0')
reward_raw: tensor(0.4917, device='cuda:0') reward: tensor(1.3068, device='cuda:0') factor: tensor(0.1848, device='cuda:0')


Epoch 1, Step 495: Loss = 0.2034, Actor Loss = 0.1514, Critic Loss = 0.1042, Entropy = 0.1199, Returns = 1.3030, Value = 1.2217, mask_loss = 0.1954std_loss = 0.3471:   9%|▉         | 495/5475 [2:13:20<21:50:47, 15.79s/it]  

ratio tensor(0.3589, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5754, device='cuda:0') reward: tensor(1.4016, device='cuda:0') factor: tensor(0.1737, device='cuda:0')
reward_raw: tensor(0.5981, device='cuda:0') reward: tensor(1.4207, device='cuda:0') factor: tensor(0.1773, device='cuda:0')
reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(1.3976, device='cuda:0') factor: tensor(0.1744, device='cuda:0')
reward_raw: tensor(0.5617, device='cuda:0') reward: tensor(1.3854, device='cuda:0') factor: tensor(0.1763, device='cuda:0')
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.3856, device='cuda:0') factor: tensor(0.1737, device='cuda:0')


Epoch 1, Step 496: Loss = -0.1433, Actor Loss = -0.1721, Critic Loss = 0.0579, Entropy = 0.1253, Returns = 1.4268, Value = 1.4569, mask_loss = 0.1731std_loss = 0.3254:   9%|▉         | 496/5475 [2:13:36<21:48:10, 15.76s/it]

ratio tensor(1.7320, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5735, device='cuda:0') reward: tensor(1.3575, device='cuda:0') factor: tensor(0.2161, device='cuda:0')
reward_raw: tensor(0.5771, device='cuda:0') reward: tensor(1.3567, device='cuda:0') factor: tensor(0.2204, device='cuda:0')
reward_raw: tensor(0.5797, device='cuda:0') reward: tensor(1.3677, device='cuda:0') factor: tensor(0.2119, device='cuda:0')
reward_raw: tensor(0.5642, device='cuda:0') reward: tensor(1.3488, device='cuda:0') factor: tensor(0.2154, device='cuda:0')
reward_raw: tensor(0.5800, device='cuda:0') reward: tensor(1.3614, device='cuda:0') factor: tensor(0.2186, device='cuda:0')


Epoch 1, Step 497: Loss = 0.0027, Actor Loss = -0.0598, Critic Loss = 0.1253, Entropy = 0.1257, Returns = 1.4045, Value = 1.4833, mask_loss = 0.2057std_loss = 0.3526:   9%|▉         | 497/5475 [2:13:49<20:51:32, 15.08s/it] 

ratio tensor(0.9246, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4830, device='cuda:0') reward: tensor(1.2755, device='cuda:0') factor: tensor(0.2075, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.2816, device='cuda:0') factor: tensor(0.2026, device='cuda:0')
reward_raw: tensor(0.4966, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.2112, device='cuda:0')
reward_raw: tensor(0.4803, device='cuda:0') reward: tensor(1.2706, device='cuda:0') factor: tensor(0.2097, device='cuda:0')
reward_raw: tensor(0.5048, device='cuda:0') reward: tensor(1.2962, device='cuda:0') factor: tensor(0.2086, device='cuda:0')


Epoch 1, Step 498: Loss = 0.2772, Actor Loss = 0.2384, Critic Loss = 0.0779, Entropy = 0.1308, Returns = 1.1891, Value = 1.2578, mask_loss = 0.2061std_loss = 0.3469:   9%|▉         | 498/5475 [2:14:07<21:51:37, 15.81s/it] 

ratio tensor(0.6038, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5688, device='cuda:0') reward: tensor(1.3436, device='cuda:0') factor: tensor(0.2252, device='cuda:0')
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.3410, device='cuda:0') factor: tensor(0.2182, device='cuda:0')
reward_raw: tensor(0.5828, device='cuda:0') reward: tensor(1.3559, device='cuda:0') factor: tensor(0.2268, device='cuda:0')
reward_raw: tensor(0.5644, device='cuda:0') reward: tensor(1.3367, device='cuda:0') factor: tensor(0.2277, device='cuda:0')
reward_raw: tensor(0.5975, device='cuda:0') reward: tensor(1.3733, device='cuda:0') factor: tensor(0.2242, device='cuda:0')


Epoch 1, Step 499: Loss = -0.1558, Actor Loss = -0.1916, Critic Loss = 0.0719, Entropy = 0.1222, Returns = 1.3837, Value = 1.3539, mask_loss = 0.2476std_loss = 0.3776:   9%|▉         | 499/5475 [2:14:21<21:08:56, 15.30s/it]

ratio tensor(1.7792, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5534, device='cuda:0') reward: tensor(1.3627, device='cuda:0') factor: tensor(0.1907, device='cuda:0')
reward_raw: tensor(0.5724, device='cuda:0') reward: tensor(1.3805, device='cuda:0') factor: tensor(0.1919, device='cuda:0')
reward_raw: tensor(0.5668, device='cuda:0') reward: tensor(1.3717, device='cuda:0') factor: tensor(0.1951, device='cuda:0')
reward_raw: tensor(0.5665, device='cuda:0') reward: tensor(1.3650, device='cuda:0') factor: tensor(0.2015, device='cuda:0')
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.3707, device='cuda:0') factor: tensor(0.1953, device='cuda:0')


Epoch 1, Step 500: Loss = 0.0696, Actor Loss = 0.0231, Critic Loss = 0.0933, Entropy = 0.1487, Returns = 1.3309, Value = 1.4504, mask_loss = 0.2301std_loss = 0.3601:   9%|▉         | 500/5475 [2:14:36<21:10:41, 15.32s/it]  

ratio tensor(1.3435, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4676, device='cuda:0') reward: tensor(1.2794, device='cuda:0') factor: tensor(0.1882, device='cuda:0')
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.3201, device='cuda:0') factor: tensor(0.1909, device='cuda:0')
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.3086, device='cuda:0') factor: tensor(0.1913, device='cuda:0')
reward_raw: tensor(0.4785, device='cuda:0') reward: tensor(1.2899, device='cuda:0') factor: tensor(0.1886, device='cuda:0')
reward_raw: tensor(0.4896, device='cuda:0') reward: tensor(1.2980, device='cuda:0') factor: tensor(0.1916, device='cuda:0')


Epoch 1, Step 501: Loss = 0.0981, Actor Loss = 0.0515, Critic Loss = 0.0935, Entropy = 0.1306, Returns = 1.2806, Value = 1.1121, mask_loss = 0.2038std_loss = 0.3459:   9%|▉         | 500/5475 [2:14:52<21:10:41, 15.32s/it]

ratio tensor(1.0462, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 501: Loss = 0.0981, Actor Loss = 0.0515, Critic Loss = 0.0935, Entropy = 0.1306, Returns = 1.2806, Value = 1.1121, mask_loss = 0.2038std_loss = 0.3459:   9%|▉         | 501/5475 [2:14:52<21:27:37, 15.53s/it]

reward_raw: tensor(0.5508, device='cuda:0') reward: tensor(1.3136, device='cuda:0') factor: tensor(0.2372, device='cuda:0')
reward_raw: tensor(0.5321, device='cuda:0') reward: tensor(1.3061, device='cuda:0') factor: tensor(0.2261, device='cuda:0')
reward_raw: tensor(0.5483, device='cuda:0') reward: tensor(1.3151, device='cuda:0') factor: tensor(0.2332, device='cuda:0')
reward_raw: tensor(0.5673, device='cuda:0') reward: tensor(1.3399, device='cuda:0') factor: tensor(0.2275, device='cuda:0')
reward_raw: tensor(0.5859, device='cuda:0') reward: tensor(1.3517, device='cuda:0') factor: tensor(0.2343, device='cuda:0')


Epoch 1, Step 502: Loss = -0.0234, Actor Loss = -0.0528, Critic Loss = 0.0591, Entropy = 0.1473, Returns = 1.2840, Value = 1.3520, mask_loss = 0.2643std_loss = 0.3770:   9%|▉         | 502/5475 [2:15:16<24:46:17, 17.93s/it]

ratio tensor(1.1209, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5730, device='cuda:0') reward: tensor(1.3676, device='cuda:0') factor: tensor(0.2054, device='cuda:0')
reward_raw: tensor(0.5919, device='cuda:0') reward: tensor(1.3810, device='cuda:0') factor: tensor(0.2109, device='cuda:0')
reward_raw: tensor(0.5874, device='cuda:0') reward: tensor(1.3838, device='cuda:0') factor: tensor(0.2036, device='cuda:0')
reward_raw: tensor(0.5742, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.2063, device='cuda:0')
reward_raw: tensor(0.5742, device='cuda:0') reward: tensor(1.3681, device='cuda:0') factor: tensor(0.2061, device='cuda:0')


Epoch 1, Step 503: Loss = 0.1641, Actor Loss = 0.1063, Critic Loss = 0.1158, Entropy = 0.1283, Returns = 1.2707, Value = 1.4946, mask_loss = 0.1928std_loss = 0.3418:   9%|▉         | 503/5475 [2:15:31<23:43:00, 17.17s/it]  

ratio tensor(0.9946, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5156, device='cuda:0') reward: tensor(1.2873, device='cuda:0') factor: tensor(0.2283, device='cuda:0')
reward_raw: tensor(0.5306, device='cuda:0') reward: tensor(1.2875, device='cuda:0') factor: tensor(0.2431, device='cuda:0')
reward_raw: tensor(0.5123, device='cuda:0') reward: tensor(1.2772, device='cuda:0') factor: tensor(0.2351, device='cuda:0')
reward_raw: tensor(0.5025, device='cuda:0') reward: tensor(1.2700, device='cuda:0') factor: tensor(0.2325, device='cuda:0')
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.2618, device='cuda:0') factor: tensor(0.2310, device='cuda:0')


Epoch 1, Step 504: Loss = 0.2163, Actor Loss = 0.1692, Critic Loss = 0.0944, Entropy = 0.1262, Returns = 1.3096, Value = 1.1875, mask_loss = 0.2286std_loss = 0.3589:   9%|▉         | 504/5475 [2:15:49<23:53:32, 17.30s/it]

ratio tensor(0.6909, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5935, device='cuda:0') reward: tensor(1.3638, device='cuda:0') factor: tensor(0.2296, device='cuda:0')
reward_raw: tensor(0.6216, device='cuda:0') reward: tensor(1.3963, device='cuda:0') factor: tensor(0.2253, device='cuda:0')
reward_raw: tensor(0.6244, device='cuda:0') reward: tensor(1.3938, device='cuda:0') factor: tensor(0.2306, device='cuda:0')
reward_raw: tensor(0.6205, device='cuda:0') reward: tensor(1.3928, device='cuda:0') factor: tensor(0.2277, device='cuda:0')
reward_raw: tensor(0.6053, device='cuda:0') reward: tensor(1.3777, device='cuda:0') factor: tensor(0.2276, device='cuda:0')


Epoch 1, Step 505: Loss = -0.1641, Actor Loss = -0.2067, Critic Loss = 0.0855, Entropy = 0.1321, Returns = 1.3761, Value = 1.4207, mask_loss = 0.1990std_loss = 0.3424:   9%|▉         | 505/5475 [2:16:02<22:19:15, 16.17s/it]

ratio tensor(1.3879, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6233, device='cuda:0') reward: tensor(1.3859, device='cuda:0') factor: tensor(0.2374, device='cuda:0')
reward_raw: tensor(0.6181, device='cuda:0') reward: tensor(1.3840, device='cuda:0') factor: tensor(0.2341, device='cuda:0')
reward_raw: tensor(0.6276, device='cuda:0') reward: tensor(1.3910, device='cuda:0') factor: tensor(0.2366, device='cuda:0')
reward_raw: tensor(0.6122, device='cuda:0') reward: tensor(1.3737, device='cuda:0') factor: tensor(0.2384, device='cuda:0')
reward_raw: tensor(0.6141, device='cuda:0') reward: tensor(1.3801, device='cuda:0') factor: tensor(0.2340, device='cuda:0')


Epoch 1, Step 506: Loss = -0.0012, Actor Loss = -0.0447, Critic Loss = 0.0874, Entropy = 0.1380, Returns = 1.3597, Value = 1.4748, mask_loss = 0.2272std_loss = 0.3368:   9%|▉         | 506/5475 [2:16:16<21:03:46, 15.26s/it]

ratio tensor(2.4248, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.2981, device='cuda:0') factor: tensor(0.2035, device='cuda:0')
reward_raw: tensor(0.4986, device='cuda:0') reward: tensor(1.3047, device='cuda:0') factor: tensor(0.1938, device='cuda:0')
reward_raw: tensor(0.5109, device='cuda:0') reward: tensor(1.3173, device='cuda:0') factor: tensor(0.1936, device='cuda:0')
reward_raw: tensor(0.5182, device='cuda:0') reward: tensor(1.3188, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5171, device='cuda:0') reward: tensor(1.3208, device='cuda:0') factor: tensor(0.1963, device='cuda:0')


Epoch 1, Step 507: Loss = 0.2423, Actor Loss = 0.1974, Critic Loss = 0.0899, Entropy = 0.1394, Returns = 1.2731, Value = 1.1920, mask_loss = 0.2084std_loss = 0.3487:   9%|▉         | 507/5475 [2:16:31<21:18:14, 15.44s/it]  

ratio tensor(0.4290, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4805, device='cuda:0') reward: tensor(1.2753, device='cuda:0') factor: tensor(0.2052, device='cuda:0')
reward_raw: tensor(0.5028, device='cuda:0') reward: tensor(1.2945, device='cuda:0') factor: tensor(0.2083, device='cuda:0')
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.2761, device='cuda:0') factor: tensor(0.2072, device='cuda:0')
reward_raw: tensor(0.4814, device='cuda:0') reward: tensor(1.2659, device='cuda:0') factor: tensor(0.2156, device='cuda:0')
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.2841, device='cuda:0') factor: tensor(0.2088, device='cuda:0')


Epoch 1, Step 508: Loss = 0.1640, Actor Loss = 0.1169, Critic Loss = 0.0944, Entropy = 0.0963, Returns = 1.3156, Value = 1.3375, mask_loss = 0.1955std_loss = 0.3482:   9%|▉         | 508/5475 [2:16:46<21:05:07, 15.28s/it]

ratio tensor(0.4191, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5337, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.2214, device='cuda:0')
reward_raw: tensor(0.5262, device='cuda:0') reward: tensor(1.2981, device='cuda:0') factor: tensor(0.2281, device='cuda:0')
reward_raw: tensor(0.5391, device='cuda:0') reward: tensor(1.3196, device='cuda:0') factor: tensor(0.2195, device='cuda:0')
reward_raw: tensor(0.5275, device='cuda:0') reward: tensor(1.3089, device='cuda:0') factor: tensor(0.2186, device='cuda:0')
reward_raw: tensor(0.5423, device='cuda:0') reward: tensor(1.3172, device='cuda:0') factor: tensor(0.2251, device='cuda:0')


Epoch 1, Step 509: Loss = -0.1913, Actor Loss = -0.2143, Critic Loss = 0.0460, Entropy = 0.1018, Returns = 1.3543, Value = 1.3321, mask_loss = 0.1986std_loss = 0.3517:   9%|▉         | 509/5475 [2:17:08<23:53:58, 17.33s/it]

ratio tensor(1.6458, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5464, device='cuda:0') reward: tensor(1.3725, device='cuda:0') factor: tensor(0.1739, device='cuda:0')
reward_raw: tensor(0.5571, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1745, device='cuda:0')
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.3494, device='cuda:0') factor: tensor(0.1759, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.3639, device='cuda:0') factor: tensor(0.1793, device='cuda:0')
reward_raw: tensor(0.5428, device='cuda:0') reward: tensor(1.3665, device='cuda:0') factor: tensor(0.1763, device='cuda:0')


Epoch 1, Step 510: Loss = -0.0890, Actor Loss = -0.1261, Critic Loss = 0.0742, Entropy = 0.0965, Returns = 1.4041, Value = 1.4355, mask_loss = 0.1756std_loss = 0.3355:   9%|▉         | 510/5475 [2:17:22<22:20:04, 16.19s/it]

ratio tensor(1.5818, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5798, device='cuda:0') reward: tensor(1.3554, device='cuda:0') factor: tensor(0.2243, device='cuda:0')
reward_raw: tensor(0.5869, device='cuda:0') reward: tensor(1.3602, device='cuda:0') factor: tensor(0.2267, device='cuda:0')
reward_raw: tensor(0.5846, device='cuda:0') reward: tensor(1.3624, device='cuda:0') factor: tensor(0.2222, device='cuda:0')
reward_raw: tensor(0.5840, device='cuda:0') reward: tensor(1.3639, device='cuda:0') factor: tensor(0.2201, device='cuda:0')
reward_raw: tensor(0.5937, device='cuda:0') reward: tensor(1.3752, device='cuda:0') factor: tensor(0.2185, device='cuda:0')


Epoch 1, Step 511: Loss = 0.0311, Actor Loss = -0.0011, Critic Loss = 0.0647, Entropy = 0.1033, Returns = 1.3933, Value = 1.3249, mask_loss = 0.2225std_loss = 0.3717:   9%|▉         | 511/5475 [2:17:37<21:48:09, 15.81s/it] 

ratio tensor(2.1114, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5820, device='cuda:0') reward: tensor(1.3843, device='cuda:0') factor: tensor(0.1978, device='cuda:0')
reward_raw: tensor(0.5620, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.1945, device='cuda:0')
reward_raw: tensor(0.5695, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1928, device='cuda:0')
reward_raw: tensor(0.5706, device='cuda:0') reward: tensor(1.3740, device='cuda:0') factor: tensor(0.1966, device='cuda:0')
reward_raw: tensor(0.5820, device='cuda:0') reward: tensor(1.3904, device='cuda:0') factor: tensor(0.1916, device='cuda:0')


Epoch 1, Step 512: Loss = -0.0115, Actor Loss = -0.0422, Critic Loss = 0.0617, Entropy = 0.1132, Returns = 1.4345, Value = 1.3888, mask_loss = 0.2040std_loss = 0.3569:   9%|▉         | 512/5475 [2:17:52<21:39:07, 15.71s/it]

ratio tensor(1.3115, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5735, device='cuda:0') reward: tensor(1.3625, device='cuda:0') factor: tensor(0.2110, device='cuda:0')
reward_raw: tensor(0.5827, device='cuda:0') reward: tensor(1.3753, device='cuda:0') factor: tensor(0.2074, device='cuda:0')
reward_raw: tensor(0.5816, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.2034, device='cuda:0')
reward_raw: tensor(0.5662, device='cuda:0') reward: tensor(1.3572, device='cuda:0') factor: tensor(0.2091, device='cuda:0')
reward_raw: tensor(0.5894, device='cuda:0') reward: tensor(1.3809, device='cuda:0') factor: tensor(0.2085, device='cuda:0')


Epoch 1, Step 513: Loss = -0.0671, Actor Loss = -0.0887, Critic Loss = 0.0434, Entropy = 0.1079, Returns = 1.3170, Value = 1.3616, mask_loss = 0.2011std_loss = 0.3504:   9%|▉         | 513/5475 [2:18:05<20:34:14, 14.92s/it]

ratio tensor(1.5380, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5835, device='cuda:0') reward: tensor(1.3230, device='cuda:0') factor: tensor(0.2605, device='cuda:0')
reward_raw: tensor(0.5828, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.2547, device='cuda:0')
reward_raw: tensor(0.5934, device='cuda:0') reward: tensor(1.3420, device='cuda:0') factor: tensor(0.2514, device='cuda:0')
reward_raw: tensor(0.5849, device='cuda:0') reward: tensor(1.3329, device='cuda:0') factor: tensor(0.2521, device='cuda:0')
reward_raw: tensor(0.5813, device='cuda:0') reward: tensor(1.3252, device='cuda:0') factor: tensor(0.2561, device='cuda:0')


Epoch 1, Step 514: Loss = 0.1663, Actor Loss = 0.1369, Critic Loss = 0.0589, Entropy = 0.1130, Returns = 1.3596, Value = 1.2791, mask_loss = 0.2441std_loss = 0.3689:   9%|▉         | 514/5475 [2:18:20<20:23:53, 14.80s/it]  

ratio tensor(0.6900, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5068, device='cuda:0') reward: tensor(1.3031, device='cuda:0') factor: tensor(0.2037, device='cuda:0')
reward_raw: tensor(0.5168, device='cuda:0') reward: tensor(1.3095, device='cuda:0') factor: tensor(0.2072, device='cuda:0')
reward_raw: tensor(0.5241, device='cuda:0') reward: tensor(1.3117, device='cuda:0') factor: tensor(0.2123, device='cuda:0')
reward_raw: tensor(0.5208, device='cuda:0') reward: tensor(1.3146, device='cuda:0') factor: tensor(0.2062, device='cuda:0')
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.3135, device='cuda:0') factor: tensor(0.2080, device='cuda:0')


Epoch 1, Step 515: Loss = 0.0776, Actor Loss = 0.0432, Critic Loss = 0.0693, Entropy = 0.1528, Returns = 1.2843, Value = 1.3150, mask_loss = 0.2799std_loss = 0.3834:   9%|▉         | 515/5475 [2:18:38<21:39:44, 15.72s/it]

ratio tensor(0.0413, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6028, device='cuda:0') reward: tensor(1.2845, device='cuda:0') factor: tensor(0.3183, device='cuda:0')
reward_raw: tensor(0.5792, device='cuda:0') reward: tensor(1.2679, device='cuda:0') factor: tensor(0.3113, device='cuda:0')
reward_raw: tensor(0.5919, device='cuda:0') reward: tensor(1.2786, device='cuda:0') factor: tensor(0.3133, device='cuda:0')
reward_raw: tensor(0.5969, device='cuda:0') reward: tensor(1.2766, device='cuda:0') factor: tensor(0.3202, device='cuda:0')
reward_raw: tensor(0.5903, device='cuda:0') reward: tensor(1.2819, device='cuda:0') factor: tensor(0.3084, device='cuda:0')


Epoch 1, Step 516: Loss = 0.1148, Actor Loss = 0.0885, Critic Loss = 0.0530, Entropy = 0.2036, Returns = 1.3554, Value = 1.3490, mask_loss = 0.3048std_loss = 0.3792:   9%|▉         | 516/5475 [2:18:52<21:08:11, 15.34s/it]

ratio tensor(0.2881, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5405, device='cuda:0') reward: tensor(1.2707, device='cuda:0') factor: tensor(0.2698, device='cuda:0')
reward_raw: tensor(0.5309, device='cuda:0') reward: tensor(1.2662, device='cuda:0') factor: tensor(0.2647, device='cuda:0')
reward_raw: tensor(0.5282, device='cuda:0') reward: tensor(1.2558, device='cuda:0') factor: tensor(0.2725, device='cuda:0')
reward_raw: tensor(0.5566, device='cuda:0') reward: tensor(1.2882, device='cuda:0') factor: tensor(0.2684, device='cuda:0')
reward_raw: tensor(0.5379, device='cuda:0') reward: tensor(1.2698, device='cuda:0') factor: tensor(0.2682, device='cuda:0')


Epoch 1, Step 517: Loss = -0.2725, Actor Loss = -0.4075, Critic Loss = 0.2702, Entropy = 0.1823, Returns = 1.3754, Value = 1.0493, mask_loss = 0.2696std_loss = 0.3688:   9%|▉         | 517/5475 [2:19:13<23:18:01, 16.92s/it]

ratio tensor(4.3863, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4885, device='cuda:0') reward: tensor(1.1511, device='cuda:0') factor: tensor(0.3373, device='cuda:0')
reward_raw: tensor(0.4864, device='cuda:0') reward: tensor(1.1425, device='cuda:0') factor: tensor(0.3439, device='cuda:0')
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(1.1586, device='cuda:0') factor: tensor(0.3403, device='cuda:0')
reward_raw: tensor(0.5066, device='cuda:0') reward: tensor(1.1710, device='cuda:0') factor: tensor(0.3357, device='cuda:0')
reward_raw: tensor(0.4754, device='cuda:0') reward: tensor(1.1392, device='cuda:0') factor: tensor(0.3362, device='cuda:0')


Epoch 1, Step 518: Loss = 0.1935, Actor Loss = 0.1357, Critic Loss = 0.1159, Entropy = 0.1672, Returns = 1.1341, Value = 1.3202, mask_loss = 0.3427std_loss = 0.3785:   9%|▉         | 518/5475 [2:19:27<22:02:14, 16.00s/it]  

ratio tensor(0.7404, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5754, device='cuda:0') reward: tensor(1.2268, device='cuda:0') factor: tensor(0.3486, device='cuda:0')
reward_raw: tensor(0.5887, device='cuda:0') reward: tensor(1.2479, device='cuda:0') factor: tensor(0.3408, device='cuda:0')
reward_raw: tensor(0.5972, device='cuda:0') reward: tensor(1.2510, device='cuda:0') factor: tensor(0.3462, device='cuda:0')
reward_raw: tensor(0.6018, device='cuda:0') reward: tensor(1.2479, device='cuda:0') factor: tensor(0.3539, device='cuda:0')
reward_raw: tensor(0.5958, device='cuda:0') reward: tensor(1.2485, device='cuda:0') factor: tensor(0.3473, device='cuda:0')


Epoch 1, Step 519: Loss = 0.1809, Actor Loss = 0.1218, Critic Loss = 0.1187, Entropy = 0.1842, Returns = 1.2661, Value = 1.1649, mask_loss = 0.3248std_loss = 0.3909:   9%|▉         | 519/5475 [2:19:41<21:17:07, 15.46s/it]

ratio tensor(0.8624, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4791, device='cuda:0') reward: tensor(1.2069, device='cuda:0') factor: tensor(0.2722, device='cuda:0')
reward_raw: tensor(0.4660, device='cuda:0') reward: tensor(1.1986, device='cuda:0') factor: tensor(0.2673, device='cuda:0')
reward_raw: tensor(0.4711, device='cuda:0') reward: tensor(1.2016, device='cuda:0') factor: tensor(0.2695, device='cuda:0')
reward_raw: tensor(0.4662, device='cuda:0') reward: tensor(1.1946, device='cuda:0') factor: tensor(0.2716, device='cuda:0')
reward_raw: tensor(0.4629, device='cuda:0') reward: tensor(1.1846, device='cuda:0') factor: tensor(0.2783, device='cuda:0')


Epoch 1, Step 520: Loss = -0.1721, Actor Loss = -0.2178, Critic Loss = 0.0918, Entropy = 0.1899, Returns = 1.1739, Value = 1.2759, mask_loss = 0.3096std_loss = 0.3661:   9%|▉         | 520/5475 [2:19:59<22:15:05, 16.17s/it]

ratio tensor(2.2939, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5765, device='cuda:0') reward: tensor(1.2749, device='cuda:0') factor: tensor(0.3015, device='cuda:0')
reward_raw: tensor(0.5793, device='cuda:0') reward: tensor(1.2840, device='cuda:0') factor: tensor(0.2954, device='cuda:0')
reward_raw: tensor(0.5623, device='cuda:0') reward: tensor(1.2743, device='cuda:0') factor: tensor(0.2880, device='cuda:0')
reward_raw: tensor(0.5756, device='cuda:0') reward: tensor(1.2857, device='cuda:0') factor: tensor(0.2899, device='cuda:0')
reward_raw: tensor(0.5861, device='cuda:0') reward: tensor(1.2957, device='cuda:0') factor: tensor(0.2904, device='cuda:0')


Epoch 1, Step 521: Loss = 0.1306, Actor Loss = 0.1038, Critic Loss = 0.0538, Entropy = 0.1599, Returns = 1.2725, Value = 1.3227, mask_loss = 0.2648std_loss = 0.3788:   9%|▉         | 520/5475 [2:20:15<22:15:05, 16.17s/it]  

ratio tensor(1.9234, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 521: Loss = 0.1306, Actor Loss = 0.1038, Critic Loss = 0.0538, Entropy = 0.1599, Returns = 1.2725, Value = 1.3227, mask_loss = 0.2648std_loss = 0.3788:  10%|▉         | 521/5475 [2:20:15<22:14:06, 16.16s/it]

reward_raw: tensor(0.6066, device='cuda:0') reward: tensor(1.3034, device='cuda:0') factor: tensor(0.3033, device='cuda:0')
reward_raw: tensor(0.5904, device='cuda:0') reward: tensor(1.2980, device='cuda:0') factor: tensor(0.2924, device='cuda:0')
reward_raw: tensor(0.5934, device='cuda:0') reward: tensor(1.3082, device='cuda:0') factor: tensor(0.2852, device='cuda:0')
reward_raw: tensor(0.6060, device='cuda:0') reward: tensor(1.3148, device='cuda:0') factor: tensor(0.2912, device='cuda:0')
reward_raw: tensor(0.6008, device='cuda:0') reward: tensor(1.3186, device='cuda:0') factor: tensor(0.2822, device='cuda:0')


Epoch 1, Step 522: Loss = -0.1835, Actor Loss = -0.2065, Critic Loss = 0.0464, Entropy = 0.1811, Returns = 1.2345, Value = 1.2265, mask_loss = 0.2851std_loss = 0.3769:  10%|▉         | 522/5475 [2:20:36<24:08:58, 17.55s/it]

ratio tensor(1.2807, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6613, device='cuda:0') reward: tensor(1.3306, device='cuda:0') factor: tensor(0.3308, device='cuda:0')
reward_raw: tensor(0.6507, device='cuda:0') reward: tensor(1.3189, device='cuda:0') factor: tensor(0.3318, device='cuda:0')
reward_raw: tensor(0.6772, device='cuda:0') reward: tensor(1.3451, device='cuda:0') factor: tensor(0.3321, device='cuda:0')
reward_raw: tensor(0.6410, device='cuda:0') reward: tensor(1.3102, device='cuda:0') factor: tensor(0.3308, device='cuda:0')
reward_raw: tensor(0.6412, device='cuda:0') reward: tensor(1.3145, device='cuda:0') factor: tensor(0.3267, device='cuda:0')


Epoch 1, Step 523: Loss = 0.0563, Actor Loss = -0.0287, Critic Loss = 0.1702, Entropy = 0.1628, Returns = 1.2881, Value = 1.4239, mask_loss = 0.3351std_loss = 0.3939:  10%|▉         | 523/5475 [2:20:49<22:13:19, 16.15s/it] 

ratio tensor(1.5272, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6362, device='cuda:0') reward: tensor(1.3700, device='cuda:0') factor: tensor(0.2662, device='cuda:0')
reward_raw: tensor(0.6608, device='cuda:0') reward: tensor(1.3978, device='cuda:0') factor: tensor(0.2630, device='cuda:0')
reward_raw: tensor(0.6655, device='cuda:0') reward: tensor(1.3990, device='cuda:0') factor: tensor(0.2665, device='cuda:0')
reward_raw: tensor(0.6568, device='cuda:0') reward: tensor(1.3923, device='cuda:0') factor: tensor(0.2646, device='cuda:0')
reward_raw: tensor(0.6523, device='cuda:0') reward: tensor(1.3923, device='cuda:0') factor: tensor(0.2600, device='cuda:0')


Epoch 1, Step 524: Loss = 0.2181, Actor Loss = 0.1617, Critic Loss = 0.1131, Entropy = 0.1563, Returns = 1.3881, Value = 1.4262, mask_loss = 0.2959std_loss = 0.3773:  10%|▉         | 524/5475 [2:21:07<23:04:33, 16.78s/it] 

ratio tensor(0.9252, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5658, device='cuda:0') reward: tensor(1.2598, device='cuda:0') factor: tensor(0.3060, device='cuda:0')
reward_raw: tensor(0.5632, device='cuda:0') reward: tensor(1.2539, device='cuda:0') factor: tensor(0.3093, device='cuda:0')
reward_raw: tensor(0.5550, device='cuda:0') reward: tensor(1.2408, device='cuda:0') factor: tensor(0.3142, device='cuda:0')
reward_raw: tensor(0.5734, device='cuda:0') reward: tensor(1.2722, device='cuda:0') factor: tensor(0.3012, device='cuda:0')
reward_raw: tensor(0.5694, device='cuda:0') reward: tensor(1.2511, device='cuda:0') factor: tensor(0.3183, device='cuda:0')


Epoch 1, Step 525: Loss = 0.0948, Actor Loss = 0.0508, Critic Loss = 0.0885, Entropy = 0.1852, Returns = 1.2991, Value = 1.1267, mask_loss = 0.3372std_loss = 0.3958:  10%|▉         | 525/5475 [2:21:20<21:34:04, 15.69s/it]

ratio tensor(1.6220, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6598, device='cuda:0') reward: tensor(1.3659, device='cuda:0') factor: tensor(0.2939, device='cuda:0')
reward_raw: tensor(0.6699, device='cuda:0') reward: tensor(1.3819, device='cuda:0') factor: tensor(0.2880, device='cuda:0')
reward_raw: tensor(0.6646, device='cuda:0') reward: tensor(1.3728, device='cuda:0') factor: tensor(0.2918, device='cuda:0')
reward_raw: tensor(0.6636, device='cuda:0') reward: tensor(1.3683, device='cuda:0') factor: tensor(0.2954, device='cuda:0')
reward_raw: tensor(0.6577, device='cuda:0') reward: tensor(1.3682, device='cuda:0') factor: tensor(0.2895, device='cuda:0')


Epoch 1, Step 526: Loss = -0.1102, Actor Loss = -0.1559, Critic Loss = 0.0918, Entropy = 0.1651, Returns = 1.4093, Value = 1.4088, mask_loss = 0.2660std_loss = 0.3609:  10%|▉         | 526/5475 [2:21:38<22:41:42, 16.51s/it]

ratio tensor(4.1545, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6399, device='cuda:0') reward: tensor(1.3191, device='cuda:0') factor: tensor(0.3207, device='cuda:0')
reward_raw: tensor(0.6592, device='cuda:0') reward: tensor(1.3444, device='cuda:0') factor: tensor(0.3148, device='cuda:0')
reward_raw: tensor(0.6573, device='cuda:0') reward: tensor(1.3368, device='cuda:0') factor: tensor(0.3206, device='cuda:0')
reward_raw: tensor(0.6600, device='cuda:0') reward: tensor(1.3339, device='cuda:0') factor: tensor(0.3261, device='cuda:0')
reward_raw: tensor(0.6528, device='cuda:0') reward: tensor(1.3288, device='cuda:0') factor: tensor(0.3240, device='cuda:0')


Epoch 1, Step 527: Loss = -0.2057, Actor Loss = -0.2285, Critic Loss = 0.0460, Entropy = 0.1736, Returns = 1.3676, Value = 1.3463, mask_loss = 0.2892std_loss = 0.3826:  10%|▉         | 527/5475 [2:21:52<21:34:11, 15.69s/it]

ratio tensor(1.7598, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6503, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.3152, device='cuda:0')
reward_raw: tensor(0.6416, device='cuda:0') reward: tensor(1.3385, device='cuda:0') factor: tensor(0.3031, device='cuda:0')
reward_raw: tensor(0.6222, device='cuda:0') reward: tensor(1.3116, device='cuda:0') factor: tensor(0.3106, device='cuda:0')
reward_raw: tensor(0.6140, device='cuda:0') reward: tensor(1.3112, device='cuda:0') factor: tensor(0.3028, device='cuda:0')
reward_raw: tensor(0.6131, device='cuda:0') reward: tensor(1.3043, device='cuda:0') factor: tensor(0.3088, device='cuda:0')


Epoch 1, Step 528: Loss = 0.2996, Actor Loss = 0.2276, Critic Loss = 0.1442, Entropy = 0.1504, Returns = 1.2010, Value = 1.3018, mask_loss = 0.2891std_loss = 0.3876:  10%|▉         | 528/5475 [2:22:10<22:15:21, 16.20s/it]  

ratio tensor(0.3097, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5758, device='cuda:0') reward: tensor(1.2785, device='cuda:0') factor: tensor(0.2973, device='cuda:0')
reward_raw: tensor(0.6030, device='cuda:0') reward: tensor(1.3151, device='cuda:0') factor: tensor(0.2879, device='cuda:0')
reward_raw: tensor(0.5893, device='cuda:0') reward: tensor(1.2921, device='cuda:0') factor: tensor(0.2972, device='cuda:0')
reward_raw: tensor(0.5789, device='cuda:0') reward: tensor(1.2912, device='cuda:0') factor: tensor(0.2877, device='cuda:0')
reward_raw: tensor(0.5937, device='cuda:0') reward: tensor(1.2946, device='cuda:0') factor: tensor(0.2992, device='cuda:0')


Epoch 1, Step 529: Loss = 0.2157, Actor Loss = 0.1914, Critic Loss = 0.0489, Entropy = 0.1470, Returns = 1.2124, Value = 1.2797, mask_loss = 0.2523std_loss = 0.3656:  10%|▉         | 529/5475 [2:22:21<20:24:26, 14.85s/it]

ratio tensor(0.1679, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5342, device='cuda:0') reward: tensor(1.2682, device='cuda:0') factor: tensor(0.2660, device='cuda:0')
reward_raw: tensor(0.5456, device='cuda:0') reward: tensor(1.2834, device='cuda:0') factor: tensor(0.2622, device='cuda:0')
reward_raw: tensor(0.5290, device='cuda:0') reward: tensor(1.2704, device='cuda:0') factor: tensor(0.2585, device='cuda:0')
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.2606, device='cuda:0') factor: tensor(0.2589, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.2597, device='cuda:0') factor: tensor(0.2604, device='cuda:0')


Epoch 1, Step 530: Loss = 0.0942, Actor Loss = 0.0360, Critic Loss = 0.1168, Entropy = 0.1440, Returns = 1.2628, Value = 1.3607, mask_loss = 0.2751std_loss = 0.3702:  10%|▉         | 530/5475 [2:22:33<19:10:35, 13.96s/it]

ratio tensor(3.3722, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5006, device='cuda:0') reward: tensor(1.3287, device='cuda:0') factor: tensor(0.1718, device='cuda:0')
reward_raw: tensor(0.4923, device='cuda:0') reward: tensor(1.3139, device='cuda:0') factor: tensor(0.1784, device='cuda:0')
reward_raw: tensor(0.4812, device='cuda:0') reward: tensor(1.3037, device='cuda:0') factor: tensor(0.1775, device='cuda:0')
reward_raw: tensor(0.4922, device='cuda:0') reward: tensor(1.3126, device='cuda:0') factor: tensor(0.1795, device='cuda:0')
reward_raw: tensor(0.5107, device='cuda:0') reward: tensor(1.3248, device='cuda:0') factor: tensor(0.1860, device='cuda:0')


Epoch 1, Step 531: Loss = -0.2141, Actor Loss = -0.2523, Critic Loss = 0.0765, Entropy = 0.0931, Returns = 1.3652, Value = 1.3864, mask_loss = 0.1787std_loss = 0.3417:  10%|▉         | 531/5475 [2:22:49<19:58:41, 14.55s/it]

ratio tensor(1.9816, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5396, device='cuda:0') reward: tensor(1.2682, device='cuda:0') factor: tensor(0.2714, device='cuda:0')
reward_raw: tensor(0.5261, device='cuda:0') reward: tensor(1.2604, device='cuda:0') factor: tensor(0.2657, device='cuda:0')
reward_raw: tensor(0.5259, device='cuda:0') reward: tensor(1.2517, device='cuda:0') factor: tensor(0.2743, device='cuda:0')
reward_raw: tensor(0.5318, device='cuda:0') reward: tensor(1.2615, device='cuda:0') factor: tensor(0.2703, device='cuda:0')
reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.2632, device='cuda:0') factor: tensor(0.2675, device='cuda:0')


Epoch 1, Step 532: Loss = 0.1512, Actor Loss = 0.1105, Critic Loss = 0.0815, Entropy = 0.1171, Returns = 1.3083, Value = 1.2452, mask_loss = 0.2361std_loss = 0.3699:  10%|▉         | 532/5475 [2:23:06<20:51:25, 15.19s/it]  

ratio tensor(0.7008, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6208, device='cuda:0') reward: tensor(1.3934, device='cuda:0') factor: tensor(0.2274, device='cuda:0')
reward_raw: tensor(0.6336, device='cuda:0') reward: tensor(1.4045, device='cuda:0') factor: tensor(0.2291, device='cuda:0')
reward_raw: tensor(0.6410, device='cuda:0') reward: tensor(1.4151, device='cuda:0') factor: tensor(0.2259, device='cuda:0')
reward_raw: tensor(0.6125, device='cuda:0') reward: tensor(1.3920, device='cuda:0') factor: tensor(0.2205, device='cuda:0')
reward_raw: tensor(0.6324, device='cuda:0') reward: tensor(1.4045, device='cuda:0') factor: tensor(0.2279, device='cuda:0')


Epoch 1, Step 533: Loss = -0.0252, Actor Loss = -0.0774, Critic Loss = 0.1045, Entropy = 0.1364, Returns = 1.3767, Value = 1.4019, mask_loss = 0.2500std_loss = 0.3740:  10%|▉         | 533/5475 [2:23:18<19:43:55, 14.37s/it]

ratio tensor(0.4991, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6095, device='cuda:0') reward: tensor(1.3591, device='cuda:0') factor: tensor(0.2504, device='cuda:0')
reward_raw: tensor(0.5917, device='cuda:0') reward: tensor(1.3499, device='cuda:0') factor: tensor(0.2418, device='cuda:0')
reward_raw: tensor(0.6134, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.2514, device='cuda:0')
reward_raw: tensor(0.5828, device='cuda:0') reward: tensor(1.3323, device='cuda:0') factor: tensor(0.2505, device='cuda:0')
reward_raw: tensor(0.6089, device='cuda:0') reward: tensor(1.3601, device='cuda:0') factor: tensor(0.2487, device='cuda:0')


Epoch 1, Step 534: Loss = 0.1359, Actor Loss = 0.0880, Critic Loss = 0.0961, Entropy = 0.1209, Returns = 1.3016, Value = 1.4031, mask_loss = 0.2516std_loss = 0.3822:  10%|▉         | 534/5475 [2:23:35<20:43:30, 15.10s/it]  

ratio tensor(0.7895, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4716, device='cuda:0') reward: tensor(1.2322, device='cuda:0') factor: tensor(0.2395, device='cuda:0')
reward_raw: tensor(0.4936, device='cuda:0') reward: tensor(1.2495, device='cuda:0') factor: tensor(0.2441, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.2452, device='cuda:0') factor: tensor(0.2346, device='cuda:0')
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.2367, device='cuda:0') factor: tensor(0.2382, device='cuda:0')
reward_raw: tensor(0.4690, device='cuda:0') reward: tensor(1.2365, device='cuda:0') factor: tensor(0.2324, device='cuda:0')


Epoch 1, Step 535: Loss = 0.1615, Actor Loss = 0.1249, Critic Loss = 0.0733, Entropy = 0.1056, Returns = 1.2224, Value = 1.1475, mask_loss = 0.1957std_loss = 0.3417:  10%|▉         | 535/5475 [2:23:47<19:32:59, 14.25s/it]

ratio tensor(0.6527, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5543, device='cuda:0') reward: tensor(1.2835, device='cuda:0') factor: tensor(0.2708, device='cuda:0')
reward_raw: tensor(0.5614, device='cuda:0') reward: tensor(1.2860, device='cuda:0') factor: tensor(0.2753, device='cuda:0')
reward_raw: tensor(0.5691, device='cuda:0') reward: tensor(1.2952, device='cuda:0') factor: tensor(0.2739, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.2736, device='cuda:0') factor: tensor(0.2784, device='cuda:0')
reward_raw: tensor(0.5519, device='cuda:0') reward: tensor(1.2768, device='cuda:0') factor: tensor(0.2751, device='cuda:0')


Epoch 1, Step 536: Loss = 0.1768, Actor Loss = 0.0962, Critic Loss = 0.1614, Entropy = 0.1247, Returns = 1.2221, Value = 1.2928, mask_loss = 0.3123std_loss = 0.4043:  10%|▉         | 536/5475 [2:24:00<19:01:55, 13.87s/it]

ratio tensor(0.7572, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6079, device='cuda:0') reward: tensor(1.3815, device='cuda:0') factor: tensor(0.2264, device='cuda:0')
reward_raw: tensor(0.6145, device='cuda:0') reward: tensor(1.3783, device='cuda:0') factor: tensor(0.2362, device='cuda:0')
reward_raw: tensor(0.6099, device='cuda:0') reward: tensor(1.3833, device='cuda:0') factor: tensor(0.2266, device='cuda:0')
reward_raw: tensor(0.5947, device='cuda:0') reward: tensor(1.3672, device='cuda:0') factor: tensor(0.2275, device='cuda:0')
reward_raw: tensor(0.6114, device='cuda:0') reward: tensor(1.3821, device='cuda:0') factor: tensor(0.2292, device='cuda:0')


Epoch 1, Step 537: Loss = -0.0441, Actor Loss = -0.0979, Critic Loss = 0.1079, Entropy = 0.1155, Returns = 1.4034, Value = 1.5525, mask_loss = 0.2241std_loss = 0.3474:  10%|▉         | 537/5475 [2:24:13<18:29:49, 13.49s/it]

ratio tensor(2.6686, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.2790, device='cuda:0') factor: tensor(0.2312, device='cuda:0')
reward_raw: tensor(0.5203, device='cuda:0') reward: tensor(1.2885, device='cuda:0') factor: tensor(0.2319, device='cuda:0')
reward_raw: tensor(0.5250, device='cuda:0') reward: tensor(1.2959, device='cuda:0') factor: tensor(0.2291, device='cuda:0')
reward_raw: tensor(0.5141, device='cuda:0') reward: tensor(1.2844, device='cuda:0') factor: tensor(0.2297, device='cuda:0')
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.2819, device='cuda:0') factor: tensor(0.2292, device='cuda:0')


Epoch 1, Step 538: Loss = 0.4000, Actor Loss = 0.2821, Critic Loss = 0.2362, Entropy = 0.1331, Returns = 1.1724, Value = 1.4441, mask_loss = 0.2692std_loss = 0.3880:  10%|▉         | 538/5475 [2:24:30<19:58:35, 14.57s/it]  

ratio tensor(0.5035, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5326, device='cuda:0') reward: tensor(1.3021, device='cuda:0') factor: tensor(0.2305, device='cuda:0')
reward_raw: tensor(0.5582, device='cuda:0') reward: tensor(1.3266, device='cuda:0') factor: tensor(0.2316, device='cuda:0')
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.2240, device='cuda:0')
reward_raw: tensor(0.5596, device='cuda:0') reward: tensor(1.3365, device='cuda:0') factor: tensor(0.2231, device='cuda:0')
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.3143, device='cuda:0') factor: tensor(0.2278, device='cuda:0')


Epoch 1, Step 539: Loss = 0.0448, Actor Loss = -0.0239, Critic Loss = 0.1376, Entropy = 0.1034, Returns = 1.3570, Value = 1.1480, mask_loss = 0.2449std_loss = 0.3776:  10%|▉         | 539/5475 [2:24:48<21:19:51, 15.56s/it]

ratio tensor(0.9871, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5515, device='cuda:0') reward: tensor(1.3504, device='cuda:0') factor: tensor(0.2012, device='cuda:0')
reward_raw: tensor(0.5366, device='cuda:0') reward: tensor(1.3387, device='cuda:0') factor: tensor(0.1979, device='cuda:0')
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.2062, device='cuda:0')
reward_raw: tensor(0.5428, device='cuda:0') reward: tensor(1.3395, device='cuda:0') factor: tensor(0.2033, device='cuda:0')
reward_raw: tensor(0.5440, device='cuda:0') reward: tensor(1.3448, device='cuda:0') factor: tensor(0.1992, device='cuda:0')


Epoch 1, Step 540: Loss = -0.1304, Actor Loss = -0.1556, Critic Loss = 0.0507, Entropy = 0.1128, Returns = 1.3399, Value = 1.3792, mask_loss = 0.2330std_loss = 0.3787:  10%|▉         | 540/5475 [2:25:02<20:52:25, 15.23s/it]

ratio tensor(4.0669, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5475, device='cuda:0') reward: tensor(1.3328, device='cuda:0') factor: tensor(0.2147, device='cuda:0')
reward_raw: tensor(0.5720, device='cuda:0') reward: tensor(1.3478, device='cuda:0') factor: tensor(0.2242, device='cuda:0')
reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.3181, device='cuda:0') factor: tensor(0.2126, device='cuda:0')
reward_raw: tensor(0.5684, device='cuda:0') reward: tensor(1.3452, device='cuda:0') factor: tensor(0.2232, device='cuda:0')
reward_raw: tensor(0.5790, device='cuda:0') reward: tensor(1.3621, device='cuda:0') factor: tensor(0.2169, device='cuda:0')


Epoch 1, Step 541: Loss = 0.1646, Actor Loss = 0.0694, Critic Loss = 0.1908, Entropy = 0.1112, Returns = 1.3008, Value = 1.3836, mask_loss = 0.2388std_loss = 0.3715:  10%|▉         | 540/5475 [2:25:21<20:52:25, 15.23s/it]  

ratio tensor(1.0969, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 541: Loss = 0.1646, Actor Loss = 0.0694, Critic Loss = 0.1908, Entropy = 0.1112, Returns = 1.3008, Value = 1.3836, mask_loss = 0.2388std_loss = 0.3715:  10%|▉         | 541/5475 [2:25:21<22:12:24, 16.20s/it]

reward_raw: tensor(0.5373, device='cuda:0') reward: tensor(1.3278, device='cuda:0') factor: tensor(0.2095, device='cuda:0')
reward_raw: tensor(0.5635, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.2015, device='cuda:0')
reward_raw: tensor(0.5573, device='cuda:0') reward: tensor(1.3500, device='cuda:0') factor: tensor(0.2074, device='cuda:0')
reward_raw: tensor(0.5469, device='cuda:0') reward: tensor(1.3375, device='cuda:0') factor: tensor(0.2093, device='cuda:0')
reward_raw: tensor(0.5529, device='cuda:0') reward: tensor(1.3435, device='cuda:0') factor: tensor(0.2094, device='cuda:0')


Epoch 1, Step 542: Loss = 0.0445, Actor Loss = 0.0098, Critic Loss = 0.0696, Entropy = 0.1135, Returns = 1.4073, Value = 1.3658, mask_loss = 0.2102std_loss = 0.3600:  10%|▉         | 542/5475 [2:25:35<21:18:25, 15.55s/it]

ratio tensor(0.7943, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6323, device='cuda:0') reward: tensor(1.4089, device='cuda:0') factor: tensor(0.2234, device='cuda:0')
reward_raw: tensor(0.6229, device='cuda:0') reward: tensor(1.4050, device='cuda:0') factor: tensor(0.2179, device='cuda:0')
reward_raw: tensor(0.6305, device='cuda:0') reward: tensor(1.4027, device='cuda:0') factor: tensor(0.2278, device='cuda:0')
reward_raw: tensor(0.6309, device='cuda:0') reward: tensor(1.4090, device='cuda:0') factor: tensor(0.2220, device='cuda:0')
reward_raw: tensor(0.6162, device='cuda:0') reward: tensor(1.3881, device='cuda:0') factor: tensor(0.2281, device='cuda:0')


Epoch 1, Step 543: Loss = 0.0922, Actor Loss = 0.0545, Critic Loss = 0.0757, Entropy = 0.1135, Returns = 1.3599, Value = 1.4615, mask_loss = 0.2229std_loss = 0.3672:  10%|▉         | 543/5475 [2:25:49<20:39:39, 15.08s/it]

ratio tensor(1.2402, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5477, device='cuda:0') reward: tensor(1.3054, device='cuda:0') factor: tensor(0.2422, device='cuda:0')
reward_raw: tensor(0.5541, device='cuda:0') reward: tensor(1.3098, device='cuda:0') factor: tensor(0.2443, device='cuda:0')
reward_raw: tensor(0.5514, device='cuda:0') reward: tensor(1.3106, device='cuda:0') factor: tensor(0.2407, device='cuda:0')
reward_raw: tensor(0.5541, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.2379, device='cuda:0')
reward_raw: tensor(0.5685, device='cuda:0') reward: tensor(1.3279, device='cuda:0') factor: tensor(0.2405, device='cuda:0')


Epoch 1, Step 544: Loss = -0.0868, Actor Loss = -0.1457, Critic Loss = 0.1181, Entropy = 0.1325, Returns = 1.3192, Value = 1.2266, mask_loss = 0.2600std_loss = 0.3816:  10%|▉         | 544/5475 [2:26:02<19:57:05, 14.57s/it]

ratio tensor(1.0902, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5358, device='cuda:0') reward: tensor(1.2631, device='cuda:0') factor: tensor(0.2727, device='cuda:0')
reward_raw: tensor(0.5687, device='cuda:0') reward: tensor(1.2932, device='cuda:0') factor: tensor(0.2755, device='cuda:0')
reward_raw: tensor(0.5809, device='cuda:0') reward: tensor(1.3041, device='cuda:0') factor: tensor(0.2768, device='cuda:0')
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.2849, device='cuda:0') factor: tensor(0.2743, device='cuda:0')
reward_raw: tensor(0.5560, device='cuda:0') reward: tensor(1.2773, device='cuda:0') factor: tensor(0.2787, device='cuda:0')


Epoch 1, Step 545: Loss = 0.2007, Actor Loss = 0.1770, Critic Loss = 0.0475, Entropy = 0.0831, Returns = 1.3267, Value = 1.2181, mask_loss = 0.2071std_loss = 0.3598:  10%|▉         | 545/5475 [2:26:24<22:52:25, 16.70s/it]  

ratio tensor(0.8187, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5487, device='cuda:0') reward: tensor(1.3303, device='cuda:0') factor: tensor(0.2185, device='cuda:0')
reward_raw: tensor(0.5365, device='cuda:0') reward: tensor(1.3119, device='cuda:0') factor: tensor(0.2246, device='cuda:0')
reward_raw: tensor(0.5403, device='cuda:0') reward: tensor(1.3261, device='cuda:0') factor: tensor(0.2142, device='cuda:0')
reward_raw: tensor(0.5472, device='cuda:0') reward: tensor(1.3277, device='cuda:0') factor: tensor(0.2196, device='cuda:0')
reward_raw: tensor(0.5471, device='cuda:0') reward: tensor(1.3272, device='cuda:0') factor: tensor(0.2200, device='cuda:0')


Epoch 1, Step 546: Loss = -0.2547, Actor Loss = -0.2894, Critic Loss = 0.0697, Entropy = 0.0894, Returns = 1.4227, Value = 1.2960, mask_loss = 0.2219std_loss = 0.3800:  10%|▉         | 546/5475 [2:26:39<22:16:06, 16.26s/it]

ratio tensor(2.0201, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5845, device='cuda:0') reward: tensor(1.3724, device='cuda:0') factor: tensor(0.2121, device='cuda:0')
reward_raw: tensor(0.5880, device='cuda:0') reward: tensor(1.3732, device='cuda:0') factor: tensor(0.2148, device='cuda:0')
reward_raw: tensor(0.5724, device='cuda:0') reward: tensor(1.3614, device='cuda:0') factor: tensor(0.2111, device='cuda:0')
reward_raw: tensor(0.5811, device='cuda:0') reward: tensor(1.3688, device='cuda:0') factor: tensor(0.2123, device='cuda:0')
reward_raw: tensor(0.5913, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.2150, device='cuda:0')


Epoch 1, Step 547: Loss = -0.2260, Actor Loss = -0.2826, Critic Loss = 0.1133, Entropy = 0.0854, Returns = 1.4048, Value = 1.4502, mask_loss = 0.1875std_loss = 0.3481:  10%|▉         | 547/5475 [2:26:57<23:03:41, 16.85s/it]

ratio tensor(5.2944, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4781, device='cuda:0') reward: tensor(1.3083, device='cuda:0') factor: tensor(0.1697, device='cuda:0')
reward_raw: tensor(0.4819, device='cuda:0') reward: tensor(1.3108, device='cuda:0') factor: tensor(0.1711, device='cuda:0')
reward_raw: tensor(0.5089, device='cuda:0') reward: tensor(1.3400, device='cuda:0') factor: tensor(0.1689, device='cuda:0')
reward_raw: tensor(0.4833, device='cuda:0') reward: tensor(1.3144, device='cuda:0') factor: tensor(0.1689, device='cuda:0')
reward_raw: tensor(0.4791, device='cuda:0') reward: tensor(1.3130, device='cuda:0') factor: tensor(0.1660, device='cuda:0')


Epoch 1, Step 548: Loss = 0.3523, Actor Loss = 0.3168, Critic Loss = 0.0714, Entropy = 0.1105, Returns = 1.2831, Value = 1.2928, mask_loss = 0.1794std_loss = 0.3230:  10%|█         | 548/5475 [2:27:15<23:24:51, 17.11s/it]  

ratio tensor(0.8873, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5500, device='cuda:0') reward: tensor(1.3498, device='cuda:0') factor: tensor(0.2002, device='cuda:0')
reward_raw: tensor(0.5194, device='cuda:0') reward: tensor(1.3183, device='cuda:0') factor: tensor(0.2011, device='cuda:0')
reward_raw: tensor(0.4975, device='cuda:0') reward: tensor(1.2941, device='cuda:0') factor: tensor(0.2033, device='cuda:0')
reward_raw: tensor(0.5650, device='cuda:0') reward: tensor(1.3570, device='cuda:0') factor: tensor(0.2080, device='cuda:0')
reward_raw: tensor(0.5134, device='cuda:0') reward: tensor(1.3078, device='cuda:0') factor: tensor(0.2056, device='cuda:0')


Epoch 1, Step 549: Loss = -0.0999, Actor Loss = -0.1435, Critic Loss = 0.0873, Entropy = 0.0827, Returns = 1.3854, Value = 1.2410, mask_loss = 0.1824std_loss = 0.3454:  10%|█         | 549/5475 [2:27:29<21:58:40, 16.06s/it]

ratio tensor(10.3382, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5159, device='cuda:0') reward: tensor(1.3481, device='cuda:0') factor: tensor(0.1678, device='cuda:0')
reward_raw: tensor(0.5465, device='cuda:0') reward: tensor(1.3761, device='cuda:0') factor: tensor(0.1704, device='cuda:0')
reward_raw: tensor(0.5686, device='cuda:0') reward: tensor(1.3983, device='cuda:0') factor: tensor(0.1703, device='cuda:0')
reward_raw: tensor(0.5572, device='cuda:0') reward: tensor(1.3843, device='cuda:0') factor: tensor(0.1729, device='cuda:0')
reward_raw: tensor(0.5579, device='cuda:0') reward: tensor(1.3830, device='cuda:0') factor: tensor(0.1748, device='cuda:0')


Epoch 1, Step 550: Loss = 0.0486, Actor Loss = -0.0371, Critic Loss = 0.1716, Entropy = 0.0577, Returns = 1.3023, Value = 1.5344, mask_loss = 0.1602std_loss = 0.3354:  10%|█         | 550/5475 [2:27:47<22:55:22, 16.76s/it] 

ratio tensor(1.9493, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5084, device='cuda:0') reward: tensor(1.3086, device='cuda:0') factor: tensor(0.1998, device='cuda:0')
reward_raw: tensor(0.5005, device='cuda:0') reward: tensor(1.2979, device='cuda:0') factor: tensor(0.2025, device='cuda:0')
reward_raw: tensor(0.5092, device='cuda:0') reward: tensor(1.3104, device='cuda:0') factor: tensor(0.1988, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.3064, device='cuda:0') factor: tensor(0.2016, device='cuda:0')
reward_raw: tensor(0.5119, device='cuda:0') reward: tensor(1.3176, device='cuda:0') factor: tensor(0.1944, device='cuda:0')


Epoch 1, Step 551: Loss = 0.1696, Actor Loss = 0.1218, Critic Loss = 0.0959, Entropy = 0.1022, Returns = 1.2792, Value = 1.2894, mask_loss = 0.2089std_loss = 0.3619:  10%|█         | 551/5475 [2:28:02<22:18:16, 16.31s/it] 

ratio tensor(1.2898, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.3011, device='cuda:0') factor: tensor(0.1738, device='cuda:0')
reward_raw: tensor(0.4907, device='cuda:0') reward: tensor(1.3120, device='cuda:0') factor: tensor(0.1786, device='cuda:0')
reward_raw: tensor(0.4926, device='cuda:0') reward: tensor(1.3083, device='cuda:0') factor: tensor(0.1843, device='cuda:0')
reward_raw: tensor(0.4905, device='cuda:0') reward: tensor(1.3137, device='cuda:0') factor: tensor(0.1768, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.2989, device='cuda:0') factor: tensor(0.1809, device='cuda:0')


Epoch 1, Step 552: Loss = 0.1435, Actor Loss = 0.1153, Critic Loss = 0.0565, Entropy = 0.0941, Returns = 1.2764, Value = 1.2870, mask_loss = 0.1972std_loss = 0.3529:  10%|█         | 552/5475 [2:28:18<21:52:37, 16.00s/it]

ratio tensor(0.8809, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6598, device='cuda:0') reward: tensor(1.4643, device='cuda:0') factor: tensor(0.1956, device='cuda:0')
reward_raw: tensor(0.6456, device='cuda:0') reward: tensor(1.4496, device='cuda:0') factor: tensor(0.1960, device='cuda:0')
reward_raw: tensor(0.6515, device='cuda:0') reward: tensor(1.4542, device='cuda:0') factor: tensor(0.1972, device='cuda:0')
reward_raw: tensor(0.6832, device='cuda:0') reward: tensor(1.4898, device='cuda:0') factor: tensor(0.1934, device='cuda:0')
reward_raw: tensor(0.6407, device='cuda:0') reward: tensor(1.4503, device='cuda:0') factor: tensor(0.1904, device='cuda:0')


Epoch 1, Step 553: Loss = -0.1514, Actor Loss = -0.1902, Critic Loss = 0.0779, Entropy = 0.0884, Returns = 1.3843, Value = 1.5094, mask_loss = 0.1960std_loss = 0.3575:  10%|█         | 553/5475 [2:28:31<20:46:33, 15.20s/it]

ratio tensor(7.0770, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5504, device='cuda:0') reward: tensor(1.3441, device='cuda:0') factor: tensor(0.2063, device='cuda:0')
reward_raw: tensor(0.5522, device='cuda:0') reward: tensor(1.3489, device='cuda:0') factor: tensor(0.2033, device='cuda:0')
reward_raw: tensor(0.5391, device='cuda:0') reward: tensor(1.3348, device='cuda:0') factor: tensor(0.2043, device='cuda:0')
reward_raw: tensor(0.5459, device='cuda:0') reward: tensor(1.3386, device='cuda:0') factor: tensor(0.2073, device='cuda:0')
reward_raw: tensor(0.5382, device='cuda:0') reward: tensor(1.3360, device='cuda:0') factor: tensor(0.2023, device='cuda:0')


Epoch 1, Step 554: Loss = 0.2022, Actor Loss = 0.1364, Critic Loss = 0.1318, Entropy = 0.1031, Returns = 1.3328, Value = 1.3923, mask_loss = 0.2260std_loss = 0.3798:  10%|█         | 554/5475 [2:28:45<20:11:37, 14.77s/it]  

ratio tensor(1.6778, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5262, device='cuda:0') reward: tensor(1.3044, device='cuda:0') factor: tensor(0.2219, device='cuda:0')
reward_raw: tensor(0.5249, device='cuda:0') reward: tensor(1.3032, device='cuda:0') factor: tensor(0.2217, device='cuda:0')
reward_raw: tensor(0.5131, device='cuda:0') reward: tensor(1.2904, device='cuda:0') factor: tensor(0.2227, device='cuda:0')
reward_raw: tensor(0.5265, device='cuda:0') reward: tensor(1.3081, device='cuda:0') factor: tensor(0.2184, device='cuda:0')
reward_raw: tensor(0.5273, device='cuda:0') reward: tensor(1.3079, device='cuda:0') factor: tensor(0.2194, device='cuda:0')


Epoch 1, Step 555: Loss = 0.2441, Actor Loss = 0.1810, Critic Loss = 0.1266, Entropy = 0.1372, Returns = 1.3191, Value = 1.1580, mask_loss = 0.2673std_loss = 0.3840:  10%|█         | 555/5475 [2:29:02<21:11:11, 15.50s/it]

ratio tensor(0.5556, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5535, device='cuda:0') reward: tensor(1.3142, device='cuda:0') factor: tensor(0.2393, device='cuda:0')
reward_raw: tensor(0.5605, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.2337, device='cuda:0')
reward_raw: tensor(0.5805, device='cuda:0') reward: tensor(1.3404, device='cuda:0') factor: tensor(0.2401, device='cuda:0')
reward_raw: tensor(0.5737, device='cuda:0') reward: tensor(1.3378, device='cuda:0') factor: tensor(0.2359, device='cuda:0')
reward_raw: tensor(0.5797, device='cuda:0') reward: tensor(1.3468, device='cuda:0') factor: tensor(0.2329, device='cuda:0')


Epoch 1, Step 556: Loss = -0.1937, Actor Loss = -0.2184, Critic Loss = 0.0496, Entropy = 0.1111, Returns = 1.3187, Value = 1.3357, mask_loss = 0.2276std_loss = 0.3766:  10%|█         | 556/5475 [2:29:16<20:44:49, 15.18s/it]

ratio tensor(5.4491, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5331, device='cuda:0') reward: tensor(1.2657, device='cuda:0') factor: tensor(0.2674, device='cuda:0')
reward_raw: tensor(0.5238, device='cuda:0') reward: tensor(1.2432, device='cuda:0') factor: tensor(0.2807, device='cuda:0')
reward_raw: tensor(0.5449, device='cuda:0') reward: tensor(1.2766, device='cuda:0') factor: tensor(0.2683, device='cuda:0')
reward_raw: tensor(0.5243, device='cuda:0') reward: tensor(1.2617, device='cuda:0') factor: tensor(0.2626, device='cuda:0')
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.2238, device='cuda:0') factor: tensor(0.2680, device='cuda:0')


Epoch 1, Step 557: Loss = 0.1777, Actor Loss = 0.1547, Critic Loss = 0.0462, Entropy = 0.1508, Returns = 1.2802, Value = 1.2858, mask_loss = 0.2768std_loss = 0.3898:  10%|█         | 557/5475 [2:29:33<21:24:21, 15.67s/it]  

ratio tensor(0.4481, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6651, device='cuda:0') reward: tensor(1.4216, device='cuda:0') factor: tensor(0.2435, device='cuda:0')
reward_raw: tensor(0.6493, device='cuda:0') reward: tensor(1.4045, device='cuda:0') factor: tensor(0.2448, device='cuda:0')
reward_raw: tensor(0.6723, device='cuda:0') reward: tensor(1.4242, device='cuda:0') factor: tensor(0.2482, device='cuda:0')
reward_raw: tensor(0.6322, device='cuda:0') reward: tensor(1.3887, device='cuda:0') factor: tensor(0.2435, device='cuda:0')
reward_raw: tensor(0.6577, device='cuda:0') reward: tensor(1.4067, device='cuda:0') factor: tensor(0.2510, device='cuda:0')


Epoch 1, Step 558: Loss = -0.1303, Actor Loss = -0.1511, Critic Loss = 0.0418, Entropy = 0.1187, Returns = 1.3498, Value = 1.4041, mask_loss = 0.2307std_loss = 0.3679:  10%|█         | 558/5475 [2:29:50<22:04:37, 16.16s/it]

ratio tensor(1.3851, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6029, device='cuda:0') reward: tensor(1.3020, device='cuda:0') factor: tensor(0.3009, device='cuda:0')
reward_raw: tensor(0.5956, device='cuda:0') reward: tensor(1.2979, device='cuda:0') factor: tensor(0.2977, device='cuda:0')
reward_raw: tensor(0.6001, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.2945, device='cuda:0')
reward_raw: tensor(0.6168, device='cuda:0') reward: tensor(1.3130, device='cuda:0') factor: tensor(0.3037, device='cuda:0')
reward_raw: tensor(0.6114, device='cuda:0') reward: tensor(1.3107, device='cuda:0') factor: tensor(0.3006, device='cuda:0')


Epoch 1, Step 559: Loss = -0.0789, Actor Loss = -0.1151, Critic Loss = 0.0726, Entropy = 0.1660, Returns = 1.3728, Value = 1.2418, mask_loss = 0.3217std_loss = 0.3863:  10%|█         | 559/5475 [2:30:04<20:58:55, 15.37s/it]

ratio tensor(53.6750, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6331, device='cuda:0') reward: tensor(1.3970, device='cuda:0') factor: tensor(0.2362, device='cuda:0')
reward_raw: tensor(0.6574, device='cuda:0') reward: tensor(1.4190, device='cuda:0') factor: tensor(0.2384, device='cuda:0')
reward_raw: tensor(0.6586, device='cuda:0') reward: tensor(1.4227, device='cuda:0') factor: tensor(0.2359, device='cuda:0')
reward_raw: tensor(0.6564, device='cuda:0') reward: tensor(1.4243, device='cuda:0') factor: tensor(0.2321, device='cuda:0')
reward_raw: tensor(0.6450, device='cuda:0') reward: tensor(1.4072, device='cuda:0') factor: tensor(0.2377, device='cuda:0')


Epoch 1, Step 560: Loss = 0.1398, Actor Loss = 0.1122, Critic Loss = 0.0555, Entropy = 0.1299, Returns = 1.3529, Value = 1.4447, mask_loss = 0.2556std_loss = 0.3774:  10%|█         | 560/5475 [2:30:18<20:31:00, 15.03s/it]  

ratio tensor(1.2457, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5559, device='cuda:0') reward: tensor(1.2899, device='cuda:0') factor: tensor(0.2660, device='cuda:0')
reward_raw: tensor(0.5879, device='cuda:0') reward: tensor(1.3191, device='cuda:0') factor: tensor(0.2688, device='cuda:0')
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.2921, device='cuda:0') factor: tensor(0.2671, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.2836, device='cuda:0') factor: tensor(0.2596, device='cuda:0')
reward_raw: tensor(0.5715, device='cuda:0') reward: tensor(1.3022, device='cuda:0') factor: tensor(0.2693, device='cuda:0')


Epoch 1, Step 561: Loss = -0.0085, Actor Loss = -0.1379, Critic Loss = 0.2591, Entropy = 0.1360, Returns = 1.3667, Value = 1.1756, mask_loss = 0.2255std_loss = 0.3478:  10%|█         | 560/5475 [2:30:40<20:31:00, 15.03s/it]

ratio tensor(3.0835, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 561: Loss = -0.0085, Actor Loss = -0.1379, Critic Loss = 0.2591, Entropy = 0.1360, Returns = 1.3667, Value = 1.1756, mask_loss = 0.2255std_loss = 0.3478:  10%|█         | 561/5475 [2:30:40<23:19:28, 17.09s/it]

reward_raw: tensor(0.5723, device='cuda:0') reward: tensor(1.3514, device='cuda:0') factor: tensor(0.2209, device='cuda:0')
reward_raw: tensor(0.5822, device='cuda:0') reward: tensor(1.3615, device='cuda:0') factor: tensor(0.2207, device='cuda:0')
reward_raw: tensor(0.6071, device='cuda:0') reward: tensor(1.3874, device='cuda:0') factor: tensor(0.2197, device='cuda:0')
reward_raw: tensor(0.5749, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.2191, device='cuda:0')
reward_raw: tensor(0.5665, device='cuda:0') reward: tensor(1.3435, device='cuda:0') factor: tensor(0.2230, device='cuda:0')


Epoch 1, Step 562: Loss = -0.1961, Actor Loss = -0.2158, Critic Loss = 0.0397, Entropy = 0.1395, Returns = 1.4179, Value = 1.4214, mask_loss = 0.2357std_loss = 0.3669:  10%|█         | 562/5475 [2:30:53<21:48:59, 15.99s/it]

ratio tensor(2.1760, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6278, device='cuda:0') reward: tensor(1.3572, device='cuda:0') factor: tensor(0.2706, device='cuda:0')
reward_raw: tensor(0.6226, device='cuda:0') reward: tensor(1.3428, device='cuda:0') factor: tensor(0.2797, device='cuda:0')
reward_raw: tensor(0.6188, device='cuda:0') reward: tensor(1.3466, device='cuda:0') factor: tensor(0.2722, device='cuda:0')
reward_raw: tensor(0.6205, device='cuda:0') reward: tensor(1.3439, device='cuda:0') factor: tensor(0.2766, device='cuda:0')
reward_raw: tensor(0.6210, device='cuda:0') reward: tensor(1.3473, device='cuda:0') factor: tensor(0.2737, device='cuda:0')


Epoch 1, Step 563: Loss = 0.0577, Actor Loss = 0.0302, Critic Loss = 0.0552, Entropy = 0.1305, Returns = 1.3808, Value = 1.2787, mask_loss = 0.2439std_loss = 0.3784:  10%|█         | 563/5475 [2:31:09<21:44:39, 15.94s/it]  

ratio tensor(0.6661, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5205, device='cuda:0') reward: tensor(1.2771, device='cuda:0') factor: tensor(0.2434, device='cuda:0')
reward_raw: tensor(0.4927, device='cuda:0') reward: tensor(1.2471, device='cuda:0') factor: tensor(0.2455, device='cuda:0')
reward_raw: tensor(0.5050, device='cuda:0') reward: tensor(1.2625, device='cuda:0') factor: tensor(0.2425, device='cuda:0')
reward_raw: tensor(0.5221, device='cuda:0') reward: tensor(1.2739, device='cuda:0') factor: tensor(0.2483, device='cuda:0')
reward_raw: tensor(0.5266, device='cuda:0') reward: tensor(1.2845, device='cuda:0') factor: tensor(0.2422, device='cuda:0')


Epoch 1, Step 564: Loss = -0.1858, Actor Loss = -0.2219, Critic Loss = 0.0724, Entropy = 0.1080, Returns = 1.2870, Value = 1.2674, mask_loss = 0.2263std_loss = 0.3777:  10%|█         | 564/5475 [2:31:28<23:01:59, 16.88s/it]

ratio tensor(15.4265, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5754, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.2264, device='cuda:0')
reward_raw: tensor(0.5802, device='cuda:0') reward: tensor(1.3530, device='cuda:0') factor: tensor(0.2272, device='cuda:0')
reward_raw: tensor(0.6039, device='cuda:0') reward: tensor(1.3748, device='cuda:0') factor: tensor(0.2290, device='cuda:0')
reward_raw: tensor(0.5869, device='cuda:0') reward: tensor(1.3599, device='cuda:0') factor: tensor(0.2270, device='cuda:0')
reward_raw: tensor(0.6099, device='cuda:0') reward: tensor(1.3811, device='cuda:0') factor: tensor(0.2289, device='cuda:0')


Epoch 1, Step 565: Loss = 0.1376, Actor Loss = 0.0904, Critic Loss = 0.0945, Entropy = 0.1220, Returns = 1.3934, Value = 1.4294, mask_loss = 0.2042std_loss = 0.3514:  10%|█         | 565/5475 [2:31:45<22:43:45, 16.66s/it]  

ratio tensor(0.6088, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5799, device='cuda:0') reward: tensor(1.3102, device='cuda:0') factor: tensor(0.2697, device='cuda:0')
reward_raw: tensor(0.5853, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.2725, device='cuda:0')
reward_raw: tensor(0.5795, device='cuda:0') reward: tensor(1.3008, device='cuda:0') factor: tensor(0.2786, device='cuda:0')
reward_raw: tensor(0.6035, device='cuda:0') reward: tensor(1.3228, device='cuda:0') factor: tensor(0.2807, device='cuda:0')
reward_raw: tensor(0.5885, device='cuda:0') reward: tensor(1.3063, device='cuda:0') factor: tensor(0.2822, device='cuda:0')


Epoch 1, Step 566: Loss = 0.0988, Actor Loss = 0.0288, Critic Loss = 0.1403, Entropy = 0.1384, Returns = 1.3551, Value = 1.2280, mask_loss = 0.2313std_loss = 0.3609:  10%|█         | 566/5475 [2:31:56<20:43:21, 15.20s/it]

ratio tensor(1.0955, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5632, device='cuda:0') reward: tensor(1.2921, device='cuda:0') factor: tensor(0.2711, device='cuda:0')
reward_raw: tensor(0.5642, device='cuda:0') reward: tensor(1.2979, device='cuda:0') factor: tensor(0.2662, device='cuda:0')
reward_raw: tensor(0.5419, device='cuda:0') reward: tensor(1.2811, device='cuda:0') factor: tensor(0.2608, device='cuda:0')
reward_raw: tensor(0.5550, device='cuda:0') reward: tensor(1.2923, device='cuda:0') factor: tensor(0.2628, device='cuda:0')
reward_raw: tensor(0.5609, device='cuda:0') reward: tensor(1.2906, device='cuda:0') factor: tensor(0.2704, device='cuda:0')


Epoch 1, Step 567: Loss = -0.0436, Actor Loss = -0.0949, Critic Loss = 0.1029, Entropy = 0.1317, Returns = 1.3036, Value = 1.1707, mask_loss = 0.2772std_loss = 0.3950:  10%|█         | 567/5475 [2:32:09<19:47:38, 14.52s/it]

ratio tensor(0.9810, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5499, device='cuda:0') reward: tensor(1.2638, device='cuda:0') factor: tensor(0.2860, device='cuda:0')
reward_raw: tensor(0.5463, device='cuda:0') reward: tensor(1.2572, device='cuda:0') factor: tensor(0.2891, device='cuda:0')
reward_raw: tensor(0.5562, device='cuda:0') reward: tensor(1.2638, device='cuda:0') factor: tensor(0.2924, device='cuda:0')
reward_raw: tensor(0.5441, device='cuda:0') reward: tensor(1.2599, device='cuda:0') factor: tensor(0.2842, device='cuda:0')
reward_raw: tensor(0.5487, device='cuda:0') reward: tensor(1.2654, device='cuda:0') factor: tensor(0.2833, device='cuda:0')


Epoch 1, Step 568: Loss = 0.1604, Actor Loss = 0.0848, Critic Loss = 0.1515, Entropy = 0.1886, Returns = 1.1744, Value = 1.3728, mask_loss = 0.3402std_loss = 0.3927:  10%|█         | 568/5475 [2:32:23<19:18:17, 14.16s/it]  

ratio tensor(9.3036, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5614, device='cuda:0') reward: tensor(1.2933, device='cuda:0') factor: tensor(0.2681, device='cuda:0')
reward_raw: tensor(0.5783, device='cuda:0') reward: tensor(1.3140, device='cuda:0') factor: tensor(0.2643, device='cuda:0')
reward_raw: tensor(0.5691, device='cuda:0') reward: tensor(1.3058, device='cuda:0') factor: tensor(0.2633, device='cuda:0')
reward_raw: tensor(0.5726, device='cuda:0') reward: tensor(1.3110, device='cuda:0') factor: tensor(0.2616, device='cuda:0')
reward_raw: tensor(0.5630, device='cuda:0') reward: tensor(1.2944, device='cuda:0') factor: tensor(0.2686, device='cuda:0')


Epoch 1, Step 569: Loss = 0.1456, Actor Loss = 0.1091, Critic Loss = 0.0732, Entropy = 0.1460, Returns = 1.2688, Value = 1.2843, mask_loss = 0.3191std_loss = 0.3683:  10%|█         | 569/5475 [2:32:40<20:31:11, 15.06s/it]

ratio tensor(0.6113, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5489, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.2230, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.3322, device='cuda:0') factor: tensor(0.2111, device='cuda:0')
reward_raw: tensor(0.5484, device='cuda:0') reward: tensor(1.3276, device='cuda:0') factor: tensor(0.2208, device='cuda:0')
reward_raw: tensor(0.5566, device='cuda:0') reward: tensor(1.3385, device='cuda:0') factor: tensor(0.2181, device='cuda:0')
reward_raw: tensor(0.5624, device='cuda:0') reward: tensor(1.3415, device='cuda:0') factor: tensor(0.2209, device='cuda:0')


Epoch 1, Step 570: Loss = 0.1129, Actor Loss = 0.0775, Critic Loss = 0.0712, Entropy = 0.1356, Returns = 1.3885, Value = 1.4050, mask_loss = 0.1998std_loss = 0.3367:  10%|█         | 570/5475 [2:32:55<20:35:08, 15.11s/it]

ratio tensor(1.4905, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5709, device='cuda:0') reward: tensor(1.3378, device='cuda:0') factor: tensor(0.2330, device='cuda:0')
reward_raw: tensor(0.5751, device='cuda:0') reward: tensor(1.3406, device='cuda:0') factor: tensor(0.2345, device='cuda:0')
reward_raw: tensor(0.5744, device='cuda:0') reward: tensor(1.3363, device='cuda:0') factor: tensor(0.2381, device='cuda:0')
reward_raw: tensor(0.5640, device='cuda:0') reward: tensor(1.3258, device='cuda:0') factor: tensor(0.2383, device='cuda:0')
reward_raw: tensor(0.5736, device='cuda:0') reward: tensor(1.3400, device='cuda:0') factor: tensor(0.2336, device='cuda:0')


Epoch 1, Step 571: Loss = -0.0914, Actor Loss = -0.1477, Critic Loss = 0.1128, Entropy = 0.1364, Returns = 1.2592, Value = 1.2724, mask_loss = 0.2913std_loss = 0.3808:  10%|█         | 571/5475 [2:33:09<20:07:45, 14.78s/it]

ratio tensor(8.9610, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5501, device='cuda:0') reward: tensor(1.3440, device='cuda:0') factor: tensor(0.2060, device='cuda:0')
reward_raw: tensor(0.5430, device='cuda:0') reward: tensor(1.3361, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.5378, device='cuda:0') reward: tensor(1.3315, device='cuda:0') factor: tensor(0.2063, device='cuda:0')
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.3255, device='cuda:0') factor: tensor(0.2037, device='cuda:0')
reward_raw: tensor(0.5439, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.2056, device='cuda:0')


Epoch 1, Step 572: Loss = 0.1324, Actor Loss = 0.1070, Critic Loss = 0.0510, Entropy = 0.1134, Returns = 1.3100, Value = 1.3337, mask_loss = 0.1994std_loss = 0.3520:  10%|█         | 572/5475 [2:33:28<21:46:45, 15.99s/it]  

ratio tensor(0.9493, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5282, device='cuda:0') reward: tensor(1.2739, device='cuda:0') factor: tensor(0.2543, device='cuda:0')
reward_raw: tensor(0.5285, device='cuda:0') reward: tensor(1.2718, device='cuda:0') factor: tensor(0.2567, device='cuda:0')
reward_raw: tensor(0.5358, device='cuda:0') reward: tensor(1.2805, device='cuda:0') factor: tensor(0.2553, device='cuda:0')
reward_raw: tensor(0.5334, device='cuda:0') reward: tensor(1.2704, device='cuda:0') factor: tensor(0.2629, device='cuda:0')
reward_raw: tensor(0.5294, device='cuda:0') reward: tensor(1.2708, device='cuda:0') factor: tensor(0.2586, device='cuda:0')


Epoch 1, Step 573: Loss = -0.0779, Actor Loss = -0.1488, Critic Loss = 0.1419, Entropy = 0.1232, Returns = 1.2534, Value = 1.2286, mask_loss = 0.2361std_loss = 0.3669:  10%|█         | 573/5475 [2:33:42<20:58:01, 15.40s/it]

ratio tensor(1.2827, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6664, device='cuda:0') reward: tensor(1.4056, device='cuda:0') factor: tensor(0.2608, device='cuda:0')
reward_raw: tensor(0.6555, device='cuda:0') reward: tensor(1.3932, device='cuda:0') factor: tensor(0.2623, device='cuda:0')
reward_raw: tensor(0.6631, device='cuda:0') reward: tensor(1.3923, device='cuda:0') factor: tensor(0.2708, device='cuda:0')
reward_raw: tensor(0.6574, device='cuda:0') reward: tensor(1.3927, device='cuda:0') factor: tensor(0.2647, device='cuda:0')
reward_raw: tensor(0.6509, device='cuda:0') reward: tensor(1.3899, device='cuda:0') factor: tensor(0.2610, device='cuda:0')


Epoch 1, Step 574: Loss = 0.0060, Actor Loss = -0.0341, Critic Loss = 0.0805, Entropy = 0.1356, Returns = 1.3793, Value = 1.4261, mask_loss = 0.2602std_loss = 0.3830:  10%|█         | 574/5475 [2:33:57<20:59:06, 15.41s/it] 

ratio tensor(1.0305, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6114, device='cuda:0') reward: tensor(1.2864, device='cuda:0') factor: tensor(0.3250, device='cuda:0')
reward_raw: tensor(0.5987, device='cuda:0') reward: tensor(1.2780, device='cuda:0') factor: tensor(0.3207, device='cuda:0')
reward_raw: tensor(0.6022, device='cuda:0') reward: tensor(1.2850, device='cuda:0') factor: tensor(0.3172, device='cuda:0')
reward_raw: tensor(0.6053, device='cuda:0') reward: tensor(1.2855, device='cuda:0') factor: tensor(0.3198, device='cuda:0')
reward_raw: tensor(0.6000, device='cuda:0') reward: tensor(1.2684, device='cuda:0') factor: tensor(0.3316, device='cuda:0')


Epoch 1, Step 575: Loss = 0.0256, Actor Loss = -0.0054, Critic Loss = 0.0622, Entropy = 0.1465, Returns = 1.3473, Value = 1.2988, mask_loss = 0.2955std_loss = 0.3829:  11%|█         | 575/5475 [2:34:11<20:09:11, 14.81s/it]

ratio tensor(1.0189, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5030, device='cuda:0') reward: tensor(1.2412, device='cuda:0') factor: tensor(0.2618, device='cuda:0')
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.2323, device='cuda:0') factor: tensor(0.2640, device='cuda:0')
reward_raw: tensor(0.4921, device='cuda:0') reward: tensor(1.2226, device='cuda:0') factor: tensor(0.2695, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.2186, device='cuda:0') factor: tensor(0.2657, device='cuda:0')
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.2247, device='cuda:0') factor: tensor(0.2672, device='cuda:0')


Epoch 1, Step 576: Loss = 0.0688, Actor Loss = 0.0351, Critic Loss = 0.0677, Entropy = 0.1270, Returns = 1.2767, Value = 1.1577, mask_loss = 0.2303std_loss = 0.3734:  11%|█         | 576/5475 [2:34:24<19:33:32, 14.37s/it] 

ratio tensor(0.8960, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5646, device='cuda:0') reward: tensor(1.3367, device='cuda:0') factor: tensor(0.2279, device='cuda:0')
reward_raw: tensor(0.5694, device='cuda:0') reward: tensor(1.3493, device='cuda:0') factor: tensor(0.2201, device='cuda:0')
reward_raw: tensor(0.5555, device='cuda:0') reward: tensor(1.3333, device='cuda:0') factor: tensor(0.2222, device='cuda:0')
reward_raw: tensor(0.5797, device='cuda:0') reward: tensor(1.3574, device='cuda:0') factor: tensor(0.2222, device='cuda:0')
reward_raw: tensor(0.5724, device='cuda:0') reward: tensor(1.3391, device='cuda:0') factor: tensor(0.2333, device='cuda:0')


Epoch 1, Step 577: Loss = -0.0532, Actor Loss = -0.0965, Critic Loss = 0.0868, Entropy = 0.1445, Returns = 1.2466, Value = 1.3675, mask_loss = 0.2345std_loss = 0.3651:  11%|█         | 577/5475 [2:34:39<19:57:33, 14.67s/it]

ratio tensor(2.7919, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.3246, device='cuda:0') factor: tensor(0.2358, device='cuda:0')
reward_raw: tensor(0.5705, device='cuda:0') reward: tensor(1.3307, device='cuda:0') factor: tensor(0.2398, device='cuda:0')
reward_raw: tensor(0.5492, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.2336, device='cuda:0')
reward_raw: tensor(0.5578, device='cuda:0') reward: tensor(1.3223, device='cuda:0') factor: tensor(0.2355, device='cuda:0')
reward_raw: tensor(0.5644, device='cuda:0') reward: tensor(1.3228, device='cuda:0') factor: tensor(0.2417, device='cuda:0')


Epoch 1, Step 578: Loss = 0.1561, Actor Loss = 0.1282, Critic Loss = 0.0560, Entropy = 0.1051, Returns = 1.3136, Value = 1.3223, mask_loss = 0.2005std_loss = 0.3498:  11%|█         | 578/5475 [2:34:55<20:27:38, 15.04s/it]  

ratio tensor(1.0038, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6059, device='cuda:0') reward: tensor(1.4167, device='cuda:0') factor: tensor(0.1892, device='cuda:0')
reward_raw: tensor(0.5843, device='cuda:0') reward: tensor(1.3956, device='cuda:0') factor: tensor(0.1888, device='cuda:0')
reward_raw: tensor(0.5896, device='cuda:0') reward: tensor(1.4005, device='cuda:0') factor: tensor(0.1891, device='cuda:0')
reward_raw: tensor(0.5937, device='cuda:0') reward: tensor(1.3989, device='cuda:0') factor: tensor(0.1948, device='cuda:0')
reward_raw: tensor(0.5955, device='cuda:0') reward: tensor(1.4006, device='cuda:0') factor: tensor(0.1950, device='cuda:0')


Epoch 1, Step 579: Loss = -0.1479, Actor Loss = -0.1953, Critic Loss = 0.0949, Entropy = 0.0986, Returns = 1.3548, Value = 1.2360, mask_loss = 0.1693std_loss = 0.3323:  11%|█         | 579/5475 [2:35:17<23:18:06, 17.13s/it]

ratio tensor(1.0006, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4882, device='cuda:0') reward: tensor(1.3032, device='cuda:0') factor: tensor(0.1850, device='cuda:0')
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.3224, device='cuda:0') factor: tensor(0.1892, device='cuda:0')
reward_raw: tensor(0.4948, device='cuda:0') reward: tensor(1.3057, device='cuda:0') factor: tensor(0.1891, device='cuda:0')
reward_raw: tensor(0.5098, device='cuda:0') reward: tensor(1.3217, device='cuda:0') factor: tensor(0.1881, device='cuda:0')
reward_raw: tensor(0.4934, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.1811, device='cuda:0')


Epoch 1, Step 580: Loss = 0.0992, Actor Loss = 0.0626, Critic Loss = 0.0733, Entropy = 0.1026, Returns = 1.3433, Value = 1.3881, mask_loss = 0.1902std_loss = 0.3465:  11%|█         | 580/5475 [2:35:40<25:22:32, 18.66s/it]  

ratio tensor(1.0434, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.2924, device='cuda:0') factor: tensor(0.2562, device='cuda:0')
reward_raw: tensor(0.5855, device='cuda:0') reward: tensor(1.3311, device='cuda:0') factor: tensor(0.2545, device='cuda:0')
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.3085, device='cuda:0') factor: tensor(0.2491, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.2904, device='cuda:0') factor: tensor(0.2511, device='cuda:0')
reward_raw: tensor(0.5642, device='cuda:0') reward: tensor(1.3089, device='cuda:0') factor: tensor(0.2553, device='cuda:0')


Epoch 1, Step 581: Loss = 0.0682, Actor Loss = 0.0097, Critic Loss = 0.1171, Entropy = 0.1290, Returns = 1.2684, Value = 1.1635, mask_loss = 0.2562std_loss = 0.3805:  11%|█         | 580/5475 [2:35:53<25:22:32, 18.66s/it]

ratio tensor(1.1231, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 581: Loss = 0.0682, Actor Loss = 0.0097, Critic Loss = 0.1171, Entropy = 0.1290, Returns = 1.2684, Value = 1.1635, mask_loss = 0.2562std_loss = 0.3805:  11%|█         | 581/5475 [2:35:53<23:27:14, 17.25s/it]

reward_raw: tensor(0.5756, device='cuda:0') reward: tensor(1.3102, device='cuda:0') factor: tensor(0.2654, device='cuda:0')
reward_raw: tensor(0.5827, device='cuda:0') reward: tensor(1.3245, device='cuda:0') factor: tensor(0.2582, device='cuda:0')
reward_raw: tensor(0.5557, device='cuda:0') reward: tensor(1.3021, device='cuda:0') factor: tensor(0.2536, device='cuda:0')
reward_raw: tensor(0.5531, device='cuda:0') reward: tensor(1.2942, device='cuda:0') factor: tensor(0.2588, device='cuda:0')
reward_raw: tensor(0.5553, device='cuda:0') reward: tensor(1.2883, device='cuda:0') factor: tensor(0.2669, device='cuda:0')


Epoch 1, Step 582: Loss = 0.0152, Actor Loss = -0.0571, Critic Loss = 0.1449, Entropy = 0.1475, Returns = 1.1653, Value = 1.2344, mask_loss = 0.3068std_loss = 0.3997:  11%|█         | 582/5475 [2:36:06<21:19:51, 15.69s/it]

ratio tensor(0.9990, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6087, device='cuda:0') reward: tensor(1.3949, device='cuda:0') factor: tensor(0.2139, device='cuda:0')
reward_raw: tensor(0.6117, device='cuda:0') reward: tensor(1.3972, device='cuda:0') factor: tensor(0.2145, device='cuda:0')
reward_raw: tensor(0.6171, device='cuda:0') reward: tensor(1.3942, device='cuda:0') factor: tensor(0.2229, device='cuda:0')
reward_raw: tensor(0.6146, device='cuda:0') reward: tensor(1.3822, device='cuda:0') factor: tensor(0.2323, device='cuda:0')
reward_raw: tensor(0.6116, device='cuda:0') reward: tensor(1.3940, device='cuda:0') factor: tensor(0.2175, device='cuda:0')


Epoch 1, Step 583: Loss = -0.2208, Actor Loss = -0.2583, Critic Loss = 0.0751, Entropy = 0.1123, Returns = 1.4521, Value = 1.3504, mask_loss = 0.1938std_loss = 0.3456:  11%|█         | 583/5475 [2:36:21<21:20:19, 15.70s/it]

ratio tensor(1.9713, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5425, device='cuda:0') reward: tensor(1.2949, device='cuda:0') factor: tensor(0.2476, device='cuda:0')
reward_raw: tensor(0.5391, device='cuda:0') reward: tensor(1.2984, device='cuda:0') factor: tensor(0.2408, device='cuda:0')
reward_raw: tensor(0.5514, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.2457, device='cuda:0')
reward_raw: tensor(0.5647, device='cuda:0') reward: tensor(1.3139, device='cuda:0') factor: tensor(0.2508, device='cuda:0')
reward_raw: tensor(0.5274, device='cuda:0') reward: tensor(1.2817, device='cuda:0') factor: tensor(0.2457, device='cuda:0')


Epoch 1, Step 584: Loss = 0.1842, Actor Loss = 0.1424, Critic Loss = 0.0838, Entropy = 0.1398, Returns = 1.2434, Value = 1.3185, mask_loss = 0.2474std_loss = 0.3767:  11%|█         | 584/5475 [2:36:34<20:07:44, 14.82s/it]  

ratio tensor(0.5678, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5141, device='cuda:0') reward: tensor(1.2763, device='cuda:0') factor: tensor(0.2378, device='cuda:0')
reward_raw: tensor(0.5048, device='cuda:0') reward: tensor(1.2713, device='cuda:0') factor: tensor(0.2336, device='cuda:0')
reward_raw: tensor(0.5401, device='cuda:0') reward: tensor(1.2856, device='cuda:0') factor: tensor(0.2546, device='cuda:0')
reward_raw: tensor(0.5667, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.2509, device='cuda:0')
reward_raw: tensor(0.5296, device='cuda:0') reward: tensor(1.2831, device='cuda:0') factor: tensor(0.2465, device='cuda:0')


Epoch 1, Step 585: Loss = 0.0349, Actor Loss = -0.0037, Critic Loss = 0.0774, Entropy = 0.1335, Returns = 1.3447, Value = 1.1865, mask_loss = 0.2241std_loss = 0.3539:  11%|█         | 585/5475 [2:36:49<20:18:48, 14.95s/it]

ratio tensor(0.9549, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6356, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.2662, device='cuda:0')
reward_raw: tensor(0.6440, device='cuda:0') reward: tensor(1.3684, device='cuda:0') factor: tensor(0.2756, device='cuda:0')
reward_raw: tensor(0.6290, device='cuda:0') reward: tensor(1.3503, device='cuda:0') factor: tensor(0.2787, device='cuda:0')
reward_raw: tensor(0.6372, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.2726, device='cuda:0')
reward_raw: tensor(0.6399, device='cuda:0') reward: tensor(1.3712, device='cuda:0') factor: tensor(0.2687, device='cuda:0')


Epoch 1, Step 586: Loss = 0.0984, Actor Loss = 0.0177, Critic Loss = 0.1617, Entropy = 0.1583, Returns = 1.3553, Value = 1.5570, mask_loss = 0.2624std_loss = 0.3712:  11%|█         | 586/5475 [2:37:04<20:24:36, 15.03s/it] 

ratio tensor(1.3423, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6162, device='cuda:0') reward: tensor(1.3427, device='cuda:0') factor: tensor(0.2735, device='cuda:0')
reward_raw: tensor(0.6460, device='cuda:0') reward: tensor(1.3770, device='cuda:0') factor: tensor(0.2690, device='cuda:0')
reward_raw: tensor(0.6633, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.2763, device='cuda:0')
reward_raw: tensor(0.6114, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.2683, device='cuda:0')
reward_raw: tensor(0.6488, device='cuda:0') reward: tensor(1.3610, device='cuda:0') factor: tensor(0.2878, device='cuda:0')


Epoch 1, Step 587: Loss = 0.2345, Actor Loss = 0.1615, Critic Loss = 0.1464, Entropy = 0.1382, Returns = 1.3247, Value = 1.5579, mask_loss = 0.2474std_loss = 0.3629:  11%|█         | 587/5475 [2:37:17<19:25:39, 14.31s/it]

ratio tensor(34.4708, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4653, device='cuda:0') reward: tensor(1.2457, device='cuda:0') factor: tensor(0.2196, device='cuda:0')
reward_raw: tensor(0.4521, device='cuda:0') reward: tensor(1.2327, device='cuda:0') factor: tensor(0.2194, device='cuda:0')
reward_raw: tensor(0.4428, device='cuda:0') reward: tensor(1.2252, device='cuda:0') factor: tensor(0.2176, device='cuda:0')
reward_raw: tensor(0.4510, device='cuda:0') reward: tensor(1.2349, device='cuda:0') factor: tensor(0.2161, device='cuda:0')
reward_raw: tensor(0.4398, device='cuda:0') reward: tensor(1.2269, device='cuda:0') factor: tensor(0.2129, device='cuda:0')


Epoch 1, Step 588: Loss = 0.3122, Actor Loss = 0.2415, Critic Loss = 0.1416, Entropy = 0.0809, Returns = 1.2976, Value = 1.1599, mask_loss = 0.1689std_loss = 0.3335:  11%|█         | 588/5475 [2:37:36<21:05:37, 15.54s/it]

ratio tensor(0.0528, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5035, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.1603, device='cuda:0')
reward_raw: tensor(0.4804, device='cuda:0') reward: tensor(1.3149, device='cuda:0') factor: tensor(0.1654, device='cuda:0')
reward_raw: tensor(0.5063, device='cuda:0') reward: tensor(1.3371, device='cuda:0') factor: tensor(0.1692, device='cuda:0')
reward_raw: tensor(0.5006, device='cuda:0') reward: tensor(1.3356, device='cuda:0') factor: tensor(0.1650, device='cuda:0')
reward_raw: tensor(0.5071, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.1716, device='cuda:0')


Epoch 1, Step 589: Loss = -0.3353, Actor Loss = -0.3723, Critic Loss = 0.0741, Entropy = 0.0721, Returns = 1.4242, Value = 1.4043, mask_loss = 0.1581std_loss = 0.3292:  11%|█         | 589/5475 [2:37:57<23:28:55, 17.30s/it]

ratio tensor(1.3978, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5385, device='cuda:0') reward: tensor(1.3590, device='cuda:0') factor: tensor(0.1795, device='cuda:0')
reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.1758, device='cuda:0')
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.3205, device='cuda:0') factor: tensor(0.1778, device='cuda:0')
reward_raw: tensor(0.5397, device='cuda:0') reward: tensor(1.3654, device='cuda:0') factor: tensor(0.1743, device='cuda:0')
reward_raw: tensor(0.5386, device='cuda:0') reward: tensor(1.3588, device='cuda:0') factor: tensor(0.1799, device='cuda:0')


Epoch 1, Step 590: Loss = -0.1068, Actor Loss = -0.1334, Critic Loss = 0.0533, Entropy = 0.0873, Returns = 1.3826, Value = 1.3402, mask_loss = 0.2086std_loss = 0.3647:  11%|█         | 590/5475 [2:38:11<22:11:41, 16.36s/it]

ratio tensor(1.1151, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5405, device='cuda:0') reward: tensor(1.3654, device='cuda:0') factor: tensor(0.1751, device='cuda:0')
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.3296, device='cuda:0') factor: tensor(0.1815, device='cuda:0')
reward_raw: tensor(0.5065, device='cuda:0') reward: tensor(1.3297, device='cuda:0') factor: tensor(0.1769, device='cuda:0')
reward_raw: tensor(0.5398, device='cuda:0') reward: tensor(1.3574, device='cuda:0') factor: tensor(0.1824, device='cuda:0')
reward_raw: tensor(0.5293, device='cuda:0') reward: tensor(1.3515, device='cuda:0') factor: tensor(0.1778, device='cuda:0')


Epoch 1, Step 591: Loss = -0.0155, Actor Loss = -0.0402, Critic Loss = 0.0496, Entropy = 0.0715, Returns = 1.3733, Value = 1.3297, mask_loss = 0.1798std_loss = 0.3533:  11%|█         | 591/5475 [2:38:24<20:47:35, 15.33s/it]

ratio tensor(1.1055, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3314, device='cuda:0') factor: tensor(0.1863, device='cuda:0')
reward_raw: tensor(0.5119, device='cuda:0') reward: tensor(1.3218, device='cuda:0') factor: tensor(0.1901, device='cuda:0')
reward_raw: tensor(0.5178, device='cuda:0') reward: tensor(1.3328, device='cuda:0') factor: tensor(0.1850, device='cuda:0')
reward_raw: tensor(0.5034, device='cuda:0') reward: tensor(1.3119, device='cuda:0') factor: tensor(0.1915, device='cuda:0')
reward_raw: tensor(0.5418, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1884, device='cuda:0')


Epoch 1, Step 592: Loss = 0.0652, Actor Loss = 0.0553, Critic Loss = 0.0199, Entropy = 0.1006, Returns = 1.2822, Value = 1.2912, mask_loss = 0.2018std_loss = 0.3521:  11%|█         | 592/5475 [2:38:37<19:57:40, 14.72s/it]  

ratio tensor(2.1601, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5368, device='cuda:0') reward: tensor(1.3824, device='cuda:0') factor: tensor(0.1544, device='cuda:0')
reward_raw: tensor(0.5386, device='cuda:0') reward: tensor(1.3822, device='cuda:0') factor: tensor(0.1564, device='cuda:0')
reward_raw: tensor(0.5285, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.1532, device='cuda:0')
reward_raw: tensor(0.5341, device='cuda:0') reward: tensor(1.3762, device='cuda:0') factor: tensor(0.1579, device='cuda:0')
reward_raw: tensor(0.5362, device='cuda:0') reward: tensor(1.3789, device='cuda:0') factor: tensor(0.1573, device='cuda:0')


Epoch 1, Step 593: Loss = -0.0417, Actor Loss = -0.1072, Critic Loss = 0.1311, Entropy = 0.0856, Returns = 1.4669, Value = 1.2726, mask_loss = 0.1633std_loss = 0.3211:  11%|█         | 593/5475 [2:38:53<20:30:09, 15.12s/it]

ratio tensor(1.0468, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5696, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.1791, device='cuda:0')
reward_raw: tensor(0.5669, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.1807, device='cuda:0')
reward_raw: tensor(0.5836, device='cuda:0') reward: tensor(1.4033, device='cuda:0') factor: tensor(0.1802, device='cuda:0')
reward_raw: tensor(0.5854, device='cuda:0') reward: tensor(1.4065, device='cuda:0') factor: tensor(0.1789, device='cuda:0')
reward_raw: tensor(0.5457, device='cuda:0') reward: tensor(1.3690, device='cuda:0') factor: tensor(0.1767, device='cuda:0')


Epoch 1, Step 594: Loss = 0.0086, Actor Loss = -0.0444, Critic Loss = 0.1060, Entropy = 0.0784, Returns = 1.3790, Value = 1.3536, mask_loss = 0.1562std_loss = 0.3258:  11%|█         | 594/5475 [2:39:07<20:01:37, 14.77s/it] 

ratio tensor(1.5760, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4507, device='cuda:0') reward: tensor(1.3036, device='cuda:0') factor: tensor(0.1471, device='cuda:0')
reward_raw: tensor(0.4600, device='cuda:0') reward: tensor(1.3153, device='cuda:0') factor: tensor(0.1447, device='cuda:0')
reward_raw: tensor(0.4316, device='cuda:0') reward: tensor(1.2877, device='cuda:0') factor: tensor(0.1439, device='cuda:0')
reward_raw: tensor(0.4318, device='cuda:0') reward: tensor(1.2916, device='cuda:0') factor: tensor(0.1402, device='cuda:0')
reward_raw: tensor(0.4477, device='cuda:0') reward: tensor(1.3030, device='cuda:0') factor: tensor(0.1446, device='cuda:0')


Epoch 1, Step 595: Loss = 0.1686, Actor Loss = 0.1100, Critic Loss = 0.1174, Entropy = 0.0744, Returns = 1.3092, Value = 1.3996, mask_loss = 0.1469std_loss = 0.3191:  11%|█         | 595/5475 [2:39:27<21:51:23, 16.12s/it] 

ratio tensor(0.8323, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5274, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.1851, device='cuda:0')
reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.3586, device='cuda:0') factor: tensor(0.1901, device='cuda:0')
reward_raw: tensor(0.5311, device='cuda:0') reward: tensor(1.3395, device='cuda:0') factor: tensor(0.1917, device='cuda:0')
reward_raw: tensor(0.5157, device='cuda:0') reward: tensor(1.3249, device='cuda:0') factor: tensor(0.1908, device='cuda:0')
reward_raw: tensor(0.5177, device='cuda:0') reward: tensor(1.3268, device='cuda:0') factor: tensor(0.1909, device='cuda:0')


Epoch 1, Step 596: Loss = 0.1126, Actor Loss = 0.0303, Critic Loss = 0.1648, Entropy = 0.0915, Returns = 1.3468, Value = 1.5022, mask_loss = 0.1929std_loss = 0.3547:  11%|█         | 596/5475 [2:39:42<21:26:35, 15.82s/it]

ratio tensor(2.6024, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.3148, device='cuda:0') factor: tensor(0.1685, device='cuda:0')
reward_raw: tensor(0.4795, device='cuda:0') reward: tensor(1.3175, device='cuda:0') factor: tensor(0.1620, device='cuda:0')
reward_raw: tensor(0.4861, device='cuda:0') reward: tensor(1.3202, device='cuda:0') factor: tensor(0.1659, device='cuda:0')
reward_raw: tensor(0.4811, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.1650, device='cuda:0')
reward_raw: tensor(0.4794, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.1638, device='cuda:0')


Epoch 1, Step 597: Loss = -0.0669, Actor Loss = -0.1365, Critic Loss = 0.1394, Entropy = 0.1009, Returns = 1.3265, Value = 1.0852, mask_loss = 0.1865std_loss = 0.3372:  11%|█         | 597/5475 [2:40:04<23:54:13, 17.64s/it]

ratio tensor(1.8209, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5477, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1594, device='cuda:0')
reward_raw: tensor(0.5529, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.1578, device='cuda:0')
reward_raw: tensor(0.5535, device='cuda:0') reward: tensor(1.3861, device='cuda:0') factor: tensor(0.1674, device='cuda:0')
reward_raw: tensor(0.5472, device='cuda:0') reward: tensor(1.3834, device='cuda:0') factor: tensor(0.1638, device='cuda:0')
reward_raw: tensor(0.5403, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1636, device='cuda:0')


Epoch 1, Step 598: Loss = -0.2208, Actor Loss = -0.2588, Critic Loss = 0.0760, Entropy = 0.0712, Returns = 1.4706, Value = 1.3512, mask_loss = 0.1528std_loss = 0.3250:  11%|█         | 598/5475 [2:40:19<22:56:26, 16.93s/it]

ratio tensor(2.7851, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4759, device='cuda:0') reward: tensor(1.3137, device='cuda:0') factor: tensor(0.1622, device='cuda:0')
reward_raw: tensor(0.4862, device='cuda:0') reward: tensor(1.3230, device='cuda:0') factor: tensor(0.1632, device='cuda:0')
reward_raw: tensor(0.5038, device='cuda:0') reward: tensor(1.3369, device='cuda:0') factor: tensor(0.1669, device='cuda:0')
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.3392, device='cuda:0') factor: tensor(0.1600, device='cuda:0')
reward_raw: tensor(0.5007, device='cuda:0') reward: tensor(1.3399, device='cuda:0') factor: tensor(0.1608, device='cuda:0')


Epoch 1, Step 599: Loss = 0.0793, Actor Loss = 0.0489, Critic Loss = 0.0608, Entropy = 0.0690, Returns = 1.2788, Value = 1.3745, mask_loss = 0.1360std_loss = 0.3068:  11%|█         | 599/5475 [2:40:36<23:05:54, 17.05s/it]  

ratio tensor(1.2204, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.3298, device='cuda:0') factor: tensor(0.2054, device='cuda:0')
reward_raw: tensor(0.5338, device='cuda:0') reward: tensor(1.3303, device='cuda:0') factor: tensor(0.2035, device='cuda:0')
reward_raw: tensor(0.5491, device='cuda:0') reward: tensor(1.3438, device='cuda:0') factor: tensor(0.2053, device='cuda:0')
reward_raw: tensor(0.5408, device='cuda:0') reward: tensor(1.3331, device='cuda:0') factor: tensor(0.2077, device='cuda:0')
reward_raw: tensor(0.5334, device='cuda:0') reward: tensor(1.3302, device='cuda:0') factor: tensor(0.2032, device='cuda:0')


Epoch 1, Step 600: Loss = -0.0974, Actor Loss = -0.1590, Critic Loss = 0.1233, Entropy = 0.0805, Returns = 1.4584, Value = 1.2473, mask_loss = 0.1679std_loss = 0.3149:  11%|█         | 600/5475 [2:40:58<25:02:56, 18.50s/it]

ratio tensor(1.6789, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5598, device='cuda:0') reward: tensor(1.3869, device='cuda:0') factor: tensor(0.1729, device='cuda:0')
reward_raw: tensor(0.5663, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1779, device='cuda:0')
reward_raw: tensor(0.5673, device='cuda:0') reward: tensor(1.3910, device='cuda:0') factor: tensor(0.1762, device='cuda:0')
reward_raw: tensor(0.5433, device='cuda:0') reward: tensor(1.3643, device='cuda:0') factor: tensor(0.1790, device='cuda:0')
reward_raw: tensor(0.5610, device='cuda:0') reward: tensor(1.3818, device='cuda:0') factor: tensor(0.1792, device='cuda:0')


Epoch 1, Step 601: Loss = -0.0705, Actor Loss = -0.1121, Critic Loss = 0.0833, Entropy = 0.1138, Returns = 1.4143, Value = 1.4816, mask_loss = 0.1879std_loss = 0.3407:  11%|█         | 600/5475 [2:41:10<25:02:56, 18.50s/it]

ratio tensor(3.3317, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 601: Loss = -0.0705, Actor Loss = -0.1121, Critic Loss = 0.0833, Entropy = 0.1138, Returns = 1.4143, Value = 1.4816, mask_loss = 0.1879std_loss = 0.3407:  11%|█         | 601/5475 [2:41:10<22:30:32, 16.63s/it]

reward_raw: tensor(0.5641, device='cuda:0') reward: tensor(1.4093, device='cuda:0') factor: tensor(0.1549, device='cuda:0')
reward_raw: tensor(0.5633, device='cuda:0') reward: tensor(1.4004, device='cuda:0') factor: tensor(0.1629, device='cuda:0')
reward_raw: tensor(0.5780, device='cuda:0') reward: tensor(1.4231, device='cuda:0') factor: tensor(0.1550, device='cuda:0')
reward_raw: tensor(0.5743, device='cuda:0') reward: tensor(1.4217, device='cuda:0') factor: tensor(0.1527, device='cuda:0')
reward_raw: tensor(0.5542, device='cuda:0') reward: tensor(1.4004, device='cuda:0') factor: tensor(0.1538, device='cuda:0')


Epoch 1, Step 602: Loss = 0.0523, Actor Loss = 0.0134, Critic Loss = 0.0779, Entropy = 0.0721, Returns = 1.4086, Value = 1.4630, mask_loss = 0.1364std_loss = 0.3101:  11%|█         | 602/5475 [2:41:28<22:59:28, 16.99s/it]  

ratio tensor(1.0163, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5250, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.1827, device='cuda:0')
reward_raw: tensor(0.5035, device='cuda:0') reward: tensor(1.3207, device='cuda:0') factor: tensor(0.1828, device='cuda:0')
reward_raw: tensor(0.5129, device='cuda:0') reward: tensor(1.3330, device='cuda:0') factor: tensor(0.1800, device='cuda:0')
reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(1.3428, device='cuda:0') factor: tensor(0.1787, device='cuda:0')
reward_raw: tensor(0.4908, device='cuda:0') reward: tensor(1.3129, device='cuda:0') factor: tensor(0.1779, device='cuda:0')


Epoch 1, Step 603: Loss = 0.1567, Actor Loss = 0.1188, Critic Loss = 0.0760, Entropy = 0.0845, Returns = 1.3650, Value = 1.4116, mask_loss = 0.1841std_loss = 0.3451:  11%|█         | 603/5475 [2:41:45<22:57:56, 16.97s/it]

ratio tensor(0.5130, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5079, device='cuda:0') reward: tensor(1.3476, device='cuda:0') factor: tensor(0.1602, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3298, device='cuda:0') factor: tensor(0.1574, device='cuda:0')
reward_raw: tensor(0.5032, device='cuda:0') reward: tensor(1.3444, device='cuda:0') factor: tensor(0.1587, device='cuda:0')
reward_raw: tensor(0.4995, device='cuda:0') reward: tensor(1.3345, device='cuda:0') factor: tensor(0.1649, device='cuda:0')
reward_raw: tensor(0.4870, device='cuda:0') reward: tensor(1.3296, device='cuda:0') factor: tensor(0.1574, device='cuda:0')


Epoch 1, Step 604: Loss = 0.1998, Actor Loss = 0.1446, Critic Loss = 0.1105, Entropy = 0.0779, Returns = 1.3386, Value = 1.3241, mask_loss = 0.1843std_loss = 0.3529:  11%|█         | 604/5475 [2:42:06<24:32:36, 18.14s/it]

ratio tensor(0.6856, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5198, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.1673, device='cuda:0')
reward_raw: tensor(0.4888, device='cuda:0') reward: tensor(1.3214, device='cuda:0') factor: tensor(0.1674, device='cuda:0')
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.3216, device='cuda:0') factor: tensor(0.1744, device='cuda:0')
reward_raw: tensor(0.5023, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.1668, device='cuda:0')
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.3322, device='cuda:0') factor: tensor(0.1669, device='cuda:0')


Epoch 1, Step 605: Loss = -0.1976, Actor Loss = -0.2403, Critic Loss = 0.0856, Entropy = 0.0917, Returns = 1.3197, Value = 1.3630, mask_loss = 0.1592std_loss = 0.3209:  11%|█         | 605/5475 [2:42:21<23:05:43, 17.07s/it]

ratio tensor(1.1946, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5557, device='cuda:0') reward: tensor(1.3764, device='cuda:0') factor: tensor(0.1794, device='cuda:0')
reward_raw: tensor(0.5410, device='cuda:0') reward: tensor(1.3615, device='cuda:0') factor: tensor(0.1795, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1768, device='cuda:0')
reward_raw: tensor(0.5610, device='cuda:0') reward: tensor(1.3781, device='cuda:0') factor: tensor(0.1829, device='cuda:0')
reward_raw: tensor(0.5408, device='cuda:0') reward: tensor(1.3628, device='cuda:0') factor: tensor(0.1781, device='cuda:0')


Epoch 1, Step 606: Loss = 0.1626, Actor Loss = 0.1145, Critic Loss = 0.0962, Entropy = 0.0928, Returns = 1.3511, Value = 1.5051, mask_loss = 0.1843std_loss = 0.3413:  11%|█         | 606/5475 [2:42:32<20:43:39, 15.33s/it]  

ratio tensor(2.2556, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5886, device='cuda:0') reward: tensor(1.4030, device='cuda:0') factor: tensor(0.1856, device='cuda:0')
reward_raw: tensor(0.6047, device='cuda:0') reward: tensor(1.4176, device='cuda:0') factor: tensor(0.1872, device='cuda:0')
reward_raw: tensor(0.5908, device='cuda:0') reward: tensor(1.4086, device='cuda:0') factor: tensor(0.1822, device='cuda:0')
reward_raw: tensor(0.6032, device='cuda:0') reward: tensor(1.4169, device='cuda:0') factor: tensor(0.1863, device='cuda:0')
reward_raw: tensor(0.5826, device='cuda:0') reward: tensor(1.3974, device='cuda:0') factor: tensor(0.1852, device='cuda:0')


Epoch 1, Step 607: Loss = -0.0355, Actor Loss = -0.0781, Critic Loss = 0.0855, Entropy = 0.0913, Returns = 1.4318, Value = 1.2518, mask_loss = 0.1866std_loss = 0.3511:  11%|█         | 607/5475 [2:42:46<20:06:42, 14.87s/it]

ratio tensor(2.1463, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5059, device='cuda:0') reward: tensor(1.3653, device='cuda:0') factor: tensor(0.1406, device='cuda:0')
reward_raw: tensor(0.5355, device='cuda:0') reward: tensor(1.3933, device='cuda:0') factor: tensor(0.1422, device='cuda:0')
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.3612, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.5261, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.1399, device='cuda:0')
reward_raw: tensor(0.5132, device='cuda:0') reward: tensor(1.3709, device='cuda:0') factor: tensor(0.1423, device='cuda:0')


Epoch 1, Step 608: Loss = 0.0855, Actor Loss = 0.0408, Critic Loss = 0.0895, Entropy = 0.0695, Returns = 1.3674, Value = 1.4575, mask_loss = 0.1429std_loss = 0.3147:  11%|█         | 608/5475 [2:43:01<20:24:37, 15.10s/it]  

ratio tensor(0.9211, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5101, device='cuda:0') reward: tensor(1.3258, device='cuda:0') factor: tensor(0.1843, device='cuda:0')
reward_raw: tensor(0.5099, device='cuda:0') reward: tensor(1.3262, device='cuda:0') factor: tensor(0.1837, device='cuda:0')
reward_raw: tensor(0.5101, device='cuda:0') reward: tensor(1.3219, device='cuda:0') factor: tensor(0.1883, device='cuda:0')
reward_raw: tensor(0.5211, device='cuda:0') reward: tensor(1.3358, device='cuda:0') factor: tensor(0.1853, device='cuda:0')
reward_raw: tensor(0.4948, device='cuda:0') reward: tensor(1.3079, device='cuda:0') factor: tensor(0.1868, device='cuda:0')


Epoch 1, Step 609: Loss = 0.1393, Actor Loss = 0.0671, Critic Loss = 0.1447, Entropy = 0.1048, Returns = 1.3258, Value = 1.3698, mask_loss = 0.2058std_loss = 0.3336:  11%|█         | 609/5475 [2:43:15<19:53:08, 14.71s/it]

ratio tensor(0.8437, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4354, device='cuda:0') reward: tensor(1.2736, device='cuda:0') factor: tensor(0.1618, device='cuda:0')
reward_raw: tensor(0.4577, device='cuda:0') reward: tensor(1.2961, device='cuda:0') factor: tensor(0.1616, device='cuda:0')
reward_raw: tensor(0.4493, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.1639, device='cuda:0')
reward_raw: tensor(0.4653, device='cuda:0') reward: tensor(1.2978, device='cuda:0') factor: tensor(0.1675, device='cuda:0')
reward_raw: tensor(0.4540, device='cuda:0') reward: tensor(1.2904, device='cuda:0') factor: tensor(0.1636, device='cuda:0')


Epoch 1, Step 610: Loss = 0.1960, Actor Loss = 0.1056, Critic Loss = 0.1810, Entropy = 0.0814, Returns = 1.2744, Value = 1.3023, mask_loss = 0.1788std_loss = 0.3387:  11%|█         | 610/5475 [2:43:33<21:05:17, 15.60s/it]

ratio tensor(0.5046, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5055, device='cuda:0') reward: tensor(1.3503, device='cuda:0') factor: tensor(0.1552, device='cuda:0')
reward_raw: tensor(0.5056, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.1564, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.1522, device='cuda:0')
reward_raw: tensor(0.4955, device='cuda:0') reward: tensor(1.3443, device='cuda:0') factor: tensor(0.1513, device='cuda:0')
reward_raw: tensor(0.4959, device='cuda:0') reward: tensor(1.3357, device='cuda:0') factor: tensor(0.1601, device='cuda:0')


Epoch 1, Step 611: Loss = -0.1433, Actor Loss = -0.1735, Critic Loss = 0.0605, Entropy = 0.0733, Returns = 1.3026, Value = 1.3033, mask_loss = 0.1378std_loss = 0.3105:  11%|█         | 611/5475 [2:43:50<21:33:06, 15.95s/it]

ratio tensor(2.0673, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5409, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.1627, device='cuda:0')
reward_raw: tensor(0.5342, device='cuda:0') reward: tensor(1.3717, device='cuda:0') factor: tensor(0.1626, device='cuda:0')
reward_raw: tensor(0.5547, device='cuda:0') reward: tensor(1.3930, device='cuda:0') factor: tensor(0.1617, device='cuda:0')
reward_raw: tensor(0.5715, device='cuda:0') reward: tensor(1.4039, device='cuda:0') factor: tensor(0.1676, device='cuda:0')
reward_raw: tensor(0.5623, device='cuda:0') reward: tensor(1.3925, device='cuda:0') factor: tensor(0.1698, device='cuda:0')


Epoch 1, Step 612: Loss = 0.1177, Actor Loss = 0.0786, Critic Loss = 0.0784, Entropy = 0.0693, Returns = 1.4322, Value = 1.4329, mask_loss = 0.1707std_loss = 0.3453:  11%|█         | 612/5475 [2:44:03<20:39:16, 15.29s/it]  

ratio tensor(2.2264, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5626, device='cuda:0') reward: tensor(1.3783, device='cuda:0') factor: tensor(0.1843, device='cuda:0')
reward_raw: tensor(0.5741, device='cuda:0') reward: tensor(1.3947, device='cuda:0') factor: tensor(0.1794, device='cuda:0')
reward_raw: tensor(0.5923, device='cuda:0') reward: tensor(1.4106, device='cuda:0') factor: tensor(0.1817, device='cuda:0')
reward_raw: tensor(0.5854, device='cuda:0') reward: tensor(1.4001, device='cuda:0') factor: tensor(0.1853, device='cuda:0')
reward_raw: tensor(0.5769, device='cuda:0') reward: tensor(1.3929, device='cuda:0') factor: tensor(0.1840, device='cuda:0')


Epoch 1, Step 613: Loss = -0.0575, Actor Loss = -0.0927, Critic Loss = 0.0705, Entropy = 0.0674, Returns = 1.3872, Value = 1.4351, mask_loss = 0.1739std_loss = 0.3426:  11%|█         | 613/5475 [2:44:15<19:21:02, 14.33s/it]

ratio tensor(1.4512, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5957, device='cuda:0') reward: tensor(1.4081, device='cuda:0') factor: tensor(0.1876, device='cuda:0')
reward_raw: tensor(0.5758, device='cuda:0') reward: tensor(1.3950, device='cuda:0') factor: tensor(0.1809, device='cuda:0')
reward_raw: tensor(0.5843, device='cuda:0') reward: tensor(1.3974, device='cuda:0') factor: tensor(0.1870, device='cuda:0')
reward_raw: tensor(0.5812, device='cuda:0') reward: tensor(1.4040, device='cuda:0') factor: tensor(0.1772, device='cuda:0')
reward_raw: tensor(0.5864, device='cuda:0') reward: tensor(1.4003, device='cuda:0') factor: tensor(0.1861, device='cuda:0')


Epoch 1, Step 614: Loss = -0.0660, Actor Loss = -0.0893, Critic Loss = 0.0466, Entropy = 0.0785, Returns = 1.4643, Value = 1.4865, mask_loss = 0.1679std_loss = 0.3369:  11%|█         | 614/5475 [2:44:33<20:44:22, 15.36s/it]

ratio tensor(3.7327, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5285, device='cuda:0') reward: tensor(1.3290, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5449, device='cuda:0') reward: tensor(1.3415, device='cuda:0') factor: tensor(0.2034, device='cuda:0')
reward_raw: tensor(0.5220, device='cuda:0') reward: tensor(1.3255, device='cuda:0') factor: tensor(0.1965, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3188, device='cuda:0') factor: tensor(0.1988, device='cuda:0')
reward_raw: tensor(0.5422, device='cuda:0') reward: tensor(1.3454, device='cuda:0') factor: tensor(0.1968, device='cuda:0')


Epoch 1, Step 615: Loss = 0.1919, Actor Loss = 0.1625, Critic Loss = 0.0591, Entropy = 0.0778, Returns = 1.3047, Value = 1.3407, mask_loss = 0.2085std_loss = 0.3679:  11%|█         | 615/5475 [2:44:49<20:56:19, 15.51s/it]  

ratio tensor(0.8126, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4518, device='cuda:0') reward: tensor(1.3109, device='cuda:0') factor: tensor(0.1409, device='cuda:0')
reward_raw: tensor(0.4417, device='cuda:0') reward: tensor(1.2965, device='cuda:0') factor: tensor(0.1452, device='cuda:0')
reward_raw: tensor(0.4466, device='cuda:0') reward: tensor(1.3017, device='cuda:0') factor: tensor(0.1449, device='cuda:0')
reward_raw: tensor(0.4432, device='cuda:0') reward: tensor(1.2985, device='cuda:0') factor: tensor(0.1447, device='cuda:0')
reward_raw: tensor(0.4441, device='cuda:0') reward: tensor(1.2998, device='cuda:0') factor: tensor(0.1444, device='cuda:0')


Epoch 1, Step 616: Loss = 0.2055, Actor Loss = 0.1680, Critic Loss = 0.0751, Entropy = 0.0602, Returns = 1.2561, Value = 1.3091, mask_loss = 0.1320std_loss = 0.3033:  11%|█▏        | 616/5475 [2:45:12<24:03:54, 17.83s/it]

ratio tensor(0.6187, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5208, device='cuda:0') reward: tensor(1.3214, device='cuda:0') factor: tensor(0.1995, device='cuda:0')
reward_raw: tensor(0.5233, device='cuda:0') reward: tensor(1.3258, device='cuda:0') factor: tensor(0.1975, device='cuda:0')
reward_raw: tensor(0.5224, device='cuda:0') reward: tensor(1.3249, device='cuda:0') factor: tensor(0.1975, device='cuda:0')
reward_raw: tensor(0.5261, device='cuda:0') reward: tensor(1.3269, device='cuda:0') factor: tensor(0.1992, device='cuda:0')
reward_raw: tensor(0.5385, device='cuda:0') reward: tensor(1.3393, device='cuda:0') factor: tensor(0.1992, device='cuda:0')


Epoch 1, Step 617: Loss = 0.0938, Actor Loss = 0.0451, Critic Loss = 0.0977, Entropy = 0.1015, Returns = 1.2526, Value = 1.4004, mask_loss = 0.2223std_loss = 0.3668:  11%|█▏        | 617/5475 [2:45:27<22:55:43, 16.99s/it]

ratio tensor(1.2381, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5148, device='cuda:0') reward: tensor(1.3228, device='cuda:0') factor: tensor(0.1919, device='cuda:0')
reward_raw: tensor(0.5510, device='cuda:0') reward: tensor(1.3546, device='cuda:0') factor: tensor(0.1964, device='cuda:0')
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.3422, device='cuda:0') factor: tensor(0.1923, device='cuda:0')
reward_raw: tensor(0.5332, device='cuda:0') reward: tensor(1.3358, device='cuda:0') factor: tensor(0.1974, device='cuda:0')
reward_raw: tensor(0.5421, device='cuda:0') reward: tensor(1.3481, device='cuda:0') factor: tensor(0.1940, device='cuda:0')


Epoch 1, Step 618: Loss = 0.0964, Actor Loss = 0.0686, Critic Loss = 0.0559, Entropy = 0.1053, Returns = 1.3386, Value = 1.4119, mask_loss = 0.2023std_loss = 0.3466:  11%|█▏        | 618/5475 [2:45:43<22:27:50, 16.65s/it]

ratio tensor(0.9004, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4893, device='cuda:0') reward: tensor(1.2620, device='cuda:0') factor: tensor(0.2273, device='cuda:0')
reward_raw: tensor(0.4934, device='cuda:0') reward: tensor(1.2640, device='cuda:0') factor: tensor(0.2294, device='cuda:0')
reward_raw: tensor(0.4836, device='cuda:0') reward: tensor(1.2565, device='cuda:0') factor: tensor(0.2271, device='cuda:0')
reward_raw: tensor(0.4819, device='cuda:0') reward: tensor(1.2504, device='cuda:0') factor: tensor(0.2316, device='cuda:0')
reward_raw: tensor(0.4827, device='cuda:0') reward: tensor(1.2557, device='cuda:0') factor: tensor(0.2270, device='cuda:0')


Epoch 1, Step 619: Loss = -0.0068, Actor Loss = -0.0601, Critic Loss = 0.1068, Entropy = 0.1255, Returns = 1.2625, Value = 1.2319, mask_loss = 0.2421std_loss = 0.3790:  11%|█▏        | 619/5475 [2:45:57<21:18:31, 15.80s/it]

ratio tensor(2.5602, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5182, device='cuda:0') reward: tensor(1.3342, device='cuda:0') factor: tensor(0.1840, device='cuda:0')
reward_raw: tensor(0.5036, device='cuda:0') reward: tensor(1.3220, device='cuda:0') factor: tensor(0.1816, device='cuda:0')
reward_raw: tensor(0.5218, device='cuda:0') reward: tensor(1.3361, device='cuda:0') factor: tensor(0.1857, device='cuda:0')
reward_raw: tensor(0.4909, device='cuda:0') reward: tensor(1.3050, device='cuda:0') factor: tensor(0.1859, device='cuda:0')
reward_raw: tensor(0.4920, device='cuda:0') reward: tensor(1.3037, device='cuda:0') factor: tensor(0.1883, device='cuda:0')


Epoch 1, Step 620: Loss = 0.0572, Actor Loss = 0.0249, Critic Loss = 0.0647, Entropy = 0.0895, Returns = 1.3073, Value = 1.3734, mask_loss = 0.2045std_loss = 0.3475:  11%|█▏        | 620/5475 [2:46:18<23:34:32, 17.48s/it]  

ratio tensor(1.2803, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4709, device='cuda:0') reward: tensor(1.2930, device='cuda:0') factor: tensor(0.1779, device='cuda:0')
reward_raw: tensor(0.4933, device='cuda:0') reward: tensor(1.3255, device='cuda:0') factor: tensor(0.1677, device='cuda:0')
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.1747, device='cuda:0')
reward_raw: tensor(0.4888, device='cuda:0') reward: tensor(1.3155, device='cuda:0') factor: tensor(0.1734, device='cuda:0')
reward_raw: tensor(0.5019, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.1751, device='cuda:0')


Epoch 1, Step 621: Loss = -0.0012, Actor Loss = -0.0254, Critic Loss = 0.0485, Entropy = 0.0825, Returns = 1.1843, Value = 1.2344, mask_loss = 0.1539std_loss = 0.3200:  11%|█▏        | 620/5475 [2:46:40<23:34:32, 17.48s/it]

ratio tensor(1.2342, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 621: Loss = -0.0012, Actor Loss = -0.0254, Critic Loss = 0.0485, Entropy = 0.0825, Returns = 1.1843, Value = 1.2344, mask_loss = 0.1539std_loss = 0.3200:  11%|█▏        | 621/5475 [2:46:42<25:56:34, 19.24s/it]

reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3164, device='cuda:0') factor: tensor(0.1779, device='cuda:0')
reward_raw: tensor(0.5121, device='cuda:0') reward: tensor(1.3325, device='cuda:0') factor: tensor(0.1796, device='cuda:0')
reward_raw: tensor(0.4925, device='cuda:0') reward: tensor(1.3160, device='cuda:0') factor: tensor(0.1765, device='cuda:0')
reward_raw: tensor(0.4864, device='cuda:0') reward: tensor(1.3103, device='cuda:0') factor: tensor(0.1761, device='cuda:0')
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.3130, device='cuda:0') factor: tensor(0.1798, device='cuda:0')


Epoch 1, Step 622: Loss = 0.0367, Actor Loss = -0.0034, Critic Loss = 0.0804, Entropy = 0.0809, Returns = 1.3333, Value = 1.3317, mask_loss = 0.1687std_loss = 0.3393:  11%|█▏        | 622/5475 [2:47:00<25:43:36, 19.08s/it] 

ratio tensor(1.4218, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5665, device='cuda:0') reward: tensor(1.3708, device='cuda:0') factor: tensor(0.1957, device='cuda:0')
reward_raw: tensor(0.5411, device='cuda:0') reward: tensor(1.3437, device='cuda:0') factor: tensor(0.1973, device='cuda:0')
reward_raw: tensor(0.5625, device='cuda:0') reward: tensor(1.3617, device='cuda:0') factor: tensor(0.2009, device='cuda:0')
reward_raw: tensor(0.5557, device='cuda:0') reward: tensor(1.3567, device='cuda:0') factor: tensor(0.1990, device='cuda:0')
reward_raw: tensor(0.5582, device='cuda:0') reward: tensor(1.3592, device='cuda:0') factor: tensor(0.1990, device='cuda:0')


Epoch 1, Step 623: Loss = -0.0099, Actor Loss = -0.1061, Critic Loss = 0.1927, Entropy = 0.1130, Returns = 1.3907, Value = 1.1557, mask_loss = 0.2060std_loss = 0.3557:  11%|█▏        | 623/5475 [2:47:16<24:18:32, 18.04s/it]

ratio tensor(1.4305, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5368, device='cuda:0') reward: tensor(1.3486, device='cuda:0') factor: tensor(0.1882, device='cuda:0')
reward_raw: tensor(0.5491, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.1812, device='cuda:0')
reward_raw: tensor(0.5567, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.1823, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.3678, device='cuda:0') factor: tensor(0.1842, device='cuda:0')
reward_raw: tensor(0.5434, device='cuda:0') reward: tensor(1.3602, device='cuda:0') factor: tensor(0.1832, device='cuda:0')


Epoch 1, Step 624: Loss = -0.0142, Actor Loss = -0.0458, Critic Loss = 0.0634, Entropy = 0.0817, Returns = 1.3833, Value = 1.3345, mask_loss = 0.1772std_loss = 0.3422:  11%|█▏        | 624/5475 [2:47:33<24:01:13, 17.83s/it]

ratio tensor(1.3176, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5026, device='cuda:0') reward: tensor(1.3185, device='cuda:0') factor: tensor(0.1842, device='cuda:0')
reward_raw: tensor(0.4925, device='cuda:0') reward: tensor(1.3028, device='cuda:0') factor: tensor(0.1897, device='cuda:0')
reward_raw: tensor(0.5091, device='cuda:0') reward: tensor(1.3198, device='cuda:0') factor: tensor(0.1893, device='cuda:0')
reward_raw: tensor(0.5144, device='cuda:0') reward: tensor(1.3266, device='cuda:0') factor: tensor(0.1878, device='cuda:0')
reward_raw: tensor(0.5130, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.1835, device='cuda:0')


Epoch 1, Step 625: Loss = 0.1233, Actor Loss = 0.0510, Critic Loss = 0.1448, Entropy = 0.0738, Returns = 1.2929, Value = 1.3736, mask_loss = 0.1731std_loss = 0.3398:  11%|█▏        | 625/5475 [2:47:49<22:59:46, 17.07s/it]  

ratio tensor(2.2356, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5460, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1634, device='cuda:0')
reward_raw: tensor(0.5375, device='cuda:0') reward: tensor(1.3757, device='cuda:0') factor: tensor(0.1618, device='cuda:0')
reward_raw: tensor(0.5166, device='cuda:0') reward: tensor(1.3606, device='cuda:0') factor: tensor(0.1560, device='cuda:0')
reward_raw: tensor(0.5333, device='cuda:0') reward: tensor(1.3711, device='cuda:0') factor: tensor(0.1623, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.3609, device='cuda:0') factor: tensor(0.1604, device='cuda:0')


Epoch 1, Step 626: Loss = -0.0161, Actor Loss = -0.0561, Critic Loss = 0.0801, Entropy = 0.0690, Returns = 1.3908, Value = 1.3929, mask_loss = 0.1589std_loss = 0.3315:  11%|█▏        | 626/5475 [2:48:07<23:24:03, 17.37s/it]

ratio tensor(0.9059, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5841, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.1855, device='cuda:0')
reward_raw: tensor(0.5723, device='cuda:0') reward: tensor(1.3922, device='cuda:0') factor: tensor(0.1801, device='cuda:0')
reward_raw: tensor(0.5680, device='cuda:0') reward: tensor(1.3793, device='cuda:0') factor: tensor(0.1887, device='cuda:0')
reward_raw: tensor(0.5631, device='cuda:0') reward: tensor(1.3805, device='cuda:0') factor: tensor(0.1826, device='cuda:0')
reward_raw: tensor(0.5731, device='cuda:0') reward: tensor(1.3867, device='cuda:0') factor: tensor(0.1864, device='cuda:0')


Epoch 1, Step 627: Loss = -0.0558, Actor Loss = -0.1762, Critic Loss = 0.2412, Entropy = 0.0974, Returns = 1.3898, Value = 1.4070, mask_loss = 0.1774std_loss = 0.3224:  11%|█▏        | 627/5475 [2:48:30<25:38:58, 19.05s/it]

ratio tensor(49.6414, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5131, device='cuda:0') reward: tensor(1.3015, device='cuda:0') factor: tensor(0.2116, device='cuda:0')
reward_raw: tensor(0.5211, device='cuda:0') reward: tensor(1.3110, device='cuda:0') factor: tensor(0.2101, device='cuda:0')
reward_raw: tensor(0.5288, device='cuda:0') reward: tensor(1.3168, device='cuda:0') factor: tensor(0.2120, device='cuda:0')
reward_raw: tensor(0.5061, device='cuda:0') reward: tensor(1.2962, device='cuda:0') factor: tensor(0.2099, device='cuda:0')
reward_raw: tensor(0.5330, device='cuda:0') reward: tensor(1.3172, device='cuda:0') factor: tensor(0.2159, device='cuda:0')


Epoch 1, Step 628: Loss = 0.2648, Actor Loss = 0.2184, Critic Loss = 0.0931, Entropy = 0.0819, Returns = 1.3188, Value = 1.2660, mask_loss = 0.1942std_loss = 0.3578:  11%|█▏        | 628/5475 [2:48:44<23:41:15, 17.59s/it]  

ratio tensor(0.6455, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4699, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.1439, device='cuda:0')
reward_raw: tensor(0.4489, device='cuda:0') reward: tensor(1.3098, device='cuda:0') factor: tensor(0.1391, device='cuda:0')
reward_raw: tensor(0.4511, device='cuda:0') reward: tensor(1.3084, device='cuda:0') factor: tensor(0.1428, device='cuda:0')
reward_raw: tensor(0.4714, device='cuda:0') reward: tensor(1.3323, device='cuda:0') factor: tensor(0.1391, device='cuda:0')
reward_raw: tensor(0.4375, device='cuda:0') reward: tensor(1.2960, device='cuda:0') factor: tensor(0.1415, device='cuda:0')


Epoch 1, Step 629: Loss = -0.2662, Actor Loss = -0.3167, Critic Loss = 0.1011, Entropy = 0.0871, Returns = 1.3201, Value = 1.2453, mask_loss = 0.1355std_loss = 0.2987:  11%|█▏        | 629/5475 [2:49:03<24:26:24, 18.16s/it]

ratio tensor(3.0280, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5030, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.1735, device='cuda:0')
reward_raw: tensor(0.5317, device='cuda:0') reward: tensor(1.3606, device='cuda:0') factor: tensor(0.1711, device='cuda:0')
reward_raw: tensor(0.5255, device='cuda:0') reward: tensor(1.3482, device='cuda:0') factor: tensor(0.1773, device='cuda:0')
reward_raw: tensor(0.5404, device='cuda:0') reward: tensor(1.3685, device='cuda:0') factor: tensor(0.1719, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.3482, device='cuda:0') factor: tensor(0.1725, device='cuda:0')


Epoch 1, Step 630: Loss = 0.2403, Actor Loss = 0.1859, Critic Loss = 0.1088, Entropy = 0.0760, Returns = 1.2606, Value = 1.4609, mask_loss = 0.1456std_loss = 0.3121:  12%|█▏        | 630/5475 [2:49:17<22:38:38, 16.83s/it]  

ratio tensor(0.7963, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4364, device='cuda:0') reward: tensor(1.2936, device='cuda:0') factor: tensor(0.1428, device='cuda:0')
reward_raw: tensor(0.4052, device='cuda:0') reward: tensor(1.2676, device='cuda:0') factor: tensor(0.1376, device='cuda:0')
reward_raw: tensor(0.4326, device='cuda:0') reward: tensor(1.2930, device='cuda:0') factor: tensor(0.1395, device='cuda:0')
reward_raw: tensor(0.4584, device='cuda:0') reward: tensor(1.3157, device='cuda:0') factor: tensor(0.1427, device='cuda:0')
reward_raw: tensor(0.3995, device='cuda:0') reward: tensor(1.2579, device='cuda:0') factor: tensor(0.1416, device='cuda:0')


Epoch 1, Step 631: Loss = 0.1927, Actor Loss = 0.1264, Critic Loss = 0.1327, Entropy = 0.0703, Returns = 1.3564, Value = 1.1728, mask_loss = 0.1338std_loss = 0.3038:  12%|█▏        | 631/5475 [2:49:30<21:12:09, 15.76s/it]

ratio tensor(0.7705, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4688, device='cuda:0') reward: tensor(1.3335, device='cuda:0') factor: tensor(0.1353, device='cuda:0')
reward_raw: tensor(0.4508, device='cuda:0') reward: tensor(1.3184, device='cuda:0') factor: tensor(0.1324, device='cuda:0')
reward_raw: tensor(0.4518, device='cuda:0') reward: tensor(1.3226, device='cuda:0') factor: tensor(0.1292, device='cuda:0')
reward_raw: tensor(0.4587, device='cuda:0') reward: tensor(1.3236, device='cuda:0') factor: tensor(0.1351, device='cuda:0')
reward_raw: tensor(0.4625, device='cuda:0') reward: tensor(1.3253, device='cuda:0') factor: tensor(0.1372, device='cuda:0')


Epoch 1, Step 632: Loss = -0.1559, Actor Loss = -0.2101, Critic Loss = 0.1086, Entropy = 0.0692, Returns = 1.3652, Value = 1.4194, mask_loss = 0.1423std_loss = 0.3169:  12%|█▏        | 632/5475 [2:49:52<23:46:57, 17.68s/it]

ratio tensor(2.6239, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5489, device='cuda:0') reward: tensor(1.3738, device='cuda:0') factor: tensor(0.1751, device='cuda:0')
reward_raw: tensor(0.5708, device='cuda:0') reward: tensor(1.3921, device='cuda:0') factor: tensor(0.1787, device='cuda:0')
reward_raw: tensor(0.5528, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.1728, device='cuda:0')
reward_raw: tensor(0.5375, device='cuda:0') reward: tensor(1.3660, device='cuda:0') factor: tensor(0.1715, device='cuda:0')
reward_raw: tensor(0.5614, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.1750, device='cuda:0')


Epoch 1, Step 633: Loss = -0.0786, Actor Loss = -0.1195, Critic Loss = 0.0821, Entropy = 0.0796, Returns = 1.3310, Value = 1.4243, mask_loss = 0.1853std_loss = 0.3468:  12%|█▏        | 633/5475 [2:50:10<23:31:18, 17.49s/it]

ratio tensor(3.1204, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5315, device='cuda:0') reward: tensor(1.3921, device='cuda:0') factor: tensor(0.1394, device='cuda:0')
reward_raw: tensor(0.4997, device='cuda:0') reward: tensor(1.3628, device='cuda:0') factor: tensor(0.1369, device='cuda:0')
reward_raw: tensor(0.5109, device='cuda:0') reward: tensor(1.3713, device='cuda:0') factor: tensor(0.1397, device='cuda:0')
reward_raw: tensor(0.5730, device='cuda:0') reward: tensor(1.4278, device='cuda:0') factor: tensor(0.1451, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.3780, device='cuda:0') factor: tensor(0.1421, device='cuda:0')


Epoch 1, Step 634: Loss = 0.2519, Actor Loss = 0.1825, Critic Loss = 0.1389, Entropy = 0.0863, Returns = 1.3148, Value = 1.3538, mask_loss = 0.1845std_loss = 0.3499:  12%|█▏        | 634/5475 [2:50:29<24:09:02, 17.96s/it]  

ratio tensor(0.4372, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5398, device='cuda:0') reward: tensor(1.3835, device='cuda:0') factor: tensor(0.1563, device='cuda:0')
reward_raw: tensor(0.5302, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1535, device='cuda:0')
reward_raw: tensor(0.5564, device='cuda:0') reward: tensor(1.4038, device='cuda:0') factor: tensor(0.1527, device='cuda:0')
reward_raw: tensor(0.5470, device='cuda:0') reward: tensor(1.3919, device='cuda:0') factor: tensor(0.1551, device='cuda:0')
reward_raw: tensor(0.5500, device='cuda:0') reward: tensor(1.4013, device='cuda:0') factor: tensor(0.1487, device='cuda:0')


Epoch 1, Step 635: Loss = -0.0490, Actor Loss = -0.0873, Critic Loss = 0.0767, Entropy = 0.0709, Returns = 1.3523, Value = 1.4016, mask_loss = 0.1439std_loss = 0.3185:  12%|█▏        | 635/5475 [2:50:46<24:00:48, 17.86s/it]

ratio tensor(1.0871, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4544, device='cuda:0') reward: tensor(1.3022, device='cuda:0') factor: tensor(0.1521, device='cuda:0')
reward_raw: tensor(0.4452, device='cuda:0') reward: tensor(1.2881, device='cuda:0') factor: tensor(0.1572, device='cuda:0')
reward_raw: tensor(0.4693, device='cuda:0') reward: tensor(1.3113, device='cuda:0') factor: tensor(0.1580, device='cuda:0')
reward_raw: tensor(0.4620, device='cuda:0') reward: tensor(1.3022, device='cuda:0') factor: tensor(0.1598, device='cuda:0')
reward_raw: tensor(0.4450, device='cuda:0') reward: tensor(1.2944, device='cuda:0') factor: tensor(0.1506, device='cuda:0')


Epoch 1, Step 636: Loss = 0.2076, Actor Loss = 0.1588, Critic Loss = 0.0976, Entropy = 0.0832, Returns = 1.2844, Value = 1.2786, mask_loss = 0.1634std_loss = 0.3256:  12%|█▏        | 636/5475 [2:51:05<24:33:22, 18.27s/it]  

ratio tensor(0.5694, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4931, device='cuda:0') reward: tensor(1.3398, device='cuda:0') factor: tensor(0.1533, device='cuda:0')
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.3370, device='cuda:0') factor: tensor(0.1475, device='cuda:0')
reward_raw: tensor(0.5018, device='cuda:0') reward: tensor(1.3495, device='cuda:0') factor: tensor(0.1523, device='cuda:0')
reward_raw: tensor(0.5066, device='cuda:0') reward: tensor(1.3538, device='cuda:0') factor: tensor(0.1527, device='cuda:0')
reward_raw: tensor(0.5318, device='cuda:0') reward: tensor(1.3756, device='cuda:0') factor: tensor(0.1562, device='cuda:0')


Epoch 1, Step 637: Loss = -0.1226, Actor Loss = -0.1560, Critic Loss = 0.0669, Entropy = 0.0824, Returns = 1.3084, Value = 1.3160, mask_loss = 0.1545std_loss = 0.3140:  12%|█▏        | 637/5475 [2:51:20<23:11:48, 17.26s/it]

ratio tensor(3.2399, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4201, device='cuda:0') reward: tensor(1.2573, device='cuda:0') factor: tensor(0.1627, device='cuda:0')
reward_raw: tensor(0.4053, device='cuda:0') reward: tensor(1.2450, device='cuda:0') factor: tensor(0.1603, device='cuda:0')
reward_raw: tensor(0.4259, device='cuda:0') reward: tensor(1.2662, device='cuda:0') factor: tensor(0.1597, device='cuda:0')
reward_raw: tensor(0.4037, device='cuda:0') reward: tensor(1.2462, device='cuda:0') factor: tensor(0.1575, device='cuda:0')
reward_raw: tensor(0.4197, device='cuda:0') reward: tensor(1.2578, device='cuda:0') factor: tensor(0.1619, device='cuda:0')


Epoch 1, Step 638: Loss = 0.2153, Actor Loss = 0.1859, Critic Loss = 0.0589, Entropy = 0.0614, Returns = 1.1801, Value = 1.3043, mask_loss = 0.1216std_loss = 0.2900:  12%|█▏        | 638/5475 [2:51:35<22:17:44, 16.59s/it]  

ratio tensor(0.6419, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4530, device='cuda:0') reward: tensor(1.2953, device='cuda:0') factor: tensor(0.1578, device='cuda:0')
reward_raw: tensor(0.4666, device='cuda:0') reward: tensor(1.3097, device='cuda:0') factor: tensor(0.1569, device='cuda:0')
reward_raw: tensor(0.4471, device='cuda:0') reward: tensor(1.2937, device='cuda:0') factor: tensor(0.1534, device='cuda:0')
reward_raw: tensor(0.4504, device='cuda:0') reward: tensor(1.2959, device='cuda:0') factor: tensor(0.1545, device='cuda:0')
reward_raw: tensor(0.4402, device='cuda:0') reward: tensor(1.2828, device='cuda:0') factor: tensor(0.1574, device='cuda:0')


Epoch 1, Step 639: Loss = 0.0996, Actor Loss = 0.0440, Critic Loss = 0.1113, Entropy = 0.0784, Returns = 1.2770, Value = 1.1913, mask_loss = 0.1578std_loss = 0.3267:  12%|█▏        | 639/5475 [2:51:52<22:21:18, 16.64s/it]

ratio tensor(2.7136, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5018, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.1146, device='cuda:0')
reward_raw: tensor(0.4501, device='cuda:0') reward: tensor(1.3432, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.4712, device='cuda:0') reward: tensor(1.3608, device='cuda:0') factor: tensor(0.1104, device='cuda:0')
reward_raw: tensor(0.4635, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.4657, device='cuda:0') reward: tensor(1.3562, device='cuda:0') factor: tensor(0.1094, device='cuda:0')


Epoch 1, Step 640: Loss = 0.0112, Actor Loss = -0.0174, Critic Loss = 0.0574, Entropy = 0.0516, Returns = 1.3972, Value = 1.3754, mask_loss = 0.1097std_loss = 0.2836:  12%|█▏        | 640/5475 [2:52:08<22:03:03, 16.42s/it]

ratio tensor(1.0046, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4698, device='cuda:0') reward: tensor(1.3082, device='cuda:0') factor: tensor(0.1616, device='cuda:0')
reward_raw: tensor(0.4366, device='cuda:0') reward: tensor(1.2799, device='cuda:0') factor: tensor(0.1567, device='cuda:0')
reward_raw: tensor(0.4521, device='cuda:0') reward: tensor(1.2903, device='cuda:0') factor: tensor(0.1617, device='cuda:0')
reward_raw: tensor(0.4767, device='cuda:0') reward: tensor(1.3190, device='cuda:0') factor: tensor(0.1577, device='cuda:0')
reward_raw: tensor(0.4700, device='cuda:0') reward: tensor(1.3131, device='cuda:0') factor: tensor(0.1569, device='cuda:0')


Epoch 1, Step 641: Loss = -0.2567, Actor Loss = -0.3266, Critic Loss = 0.1399, Entropy = 0.0548, Returns = 1.2702, Value = 1.3034, mask_loss = 0.1342std_loss = 0.3127:  12%|█▏        | 640/5475 [2:52:20<22:03:03, 16.42s/it]

ratio tensor(1.3812, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 641: Loss = -0.2567, Actor Loss = -0.3266, Critic Loss = 0.1399, Entropy = 0.0548, Returns = 1.2702, Value = 1.3034, mask_loss = 0.1342std_loss = 0.3127:  12%|█▏        | 641/5475 [2:52:20<20:17:07, 15.11s/it]

reward_raw: tensor(0.4863, device='cuda:0') reward: tensor(1.3591, device='cuda:0') factor: tensor(0.1271, device='cuda:0')
reward_raw: tensor(0.4829, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1311, device='cuda:0')
reward_raw: tensor(0.4704, device='cuda:0') reward: tensor(1.3360, device='cuda:0') factor: tensor(0.1344, device='cuda:0')
reward_raw: tensor(0.4864, device='cuda:0') reward: tensor(1.3494, device='cuda:0') factor: tensor(0.1370, device='cuda:0')
reward_raw: tensor(0.5010, device='cuda:0') reward: tensor(1.3654, device='cuda:0') factor: tensor(0.1356, device='cuda:0')


Epoch 1, Step 642: Loss = 0.0215, Actor Loss = -0.0025, Critic Loss = 0.0481, Entropy = 0.0586, Returns = 1.3296, Value = 1.3464, mask_loss = 0.1402std_loss = 0.3107:  12%|█▏        | 642/5475 [2:52:43<23:29:48, 17.50s/it] 

ratio tensor(1.8447, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5099, device='cuda:0') reward: tensor(1.3626, device='cuda:0') factor: tensor(0.1473, device='cuda:0')
reward_raw: tensor(0.4966, device='cuda:0') reward: tensor(1.3498, device='cuda:0') factor: tensor(0.1468, device='cuda:0')
reward_raw: tensor(0.5165, device='cuda:0') reward: tensor(1.3664, device='cuda:0') factor: tensor(0.1501, device='cuda:0')
reward_raw: tensor(0.5056, device='cuda:0') reward: tensor(1.3543, device='cuda:0') factor: tensor(0.1513, device='cuda:0')
reward_raw: tensor(0.5139, device='cuda:0') reward: tensor(1.3624, device='cuda:0') factor: tensor(0.1515, device='cuda:0')


Epoch 1, Step 643: Loss = 0.2481, Actor Loss = 0.1894, Critic Loss = 0.1174, Entropy = 0.0630, Returns = 1.3504, Value = 1.2222, mask_loss = 0.1445std_loss = 0.3203:  12%|█▏        | 643/5475 [2:52:56<21:46:58, 16.23s/it] 

ratio tensor(0.5956, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5386, device='cuda:0') reward: tensor(1.4135, device='cuda:0') factor: tensor(0.1251, device='cuda:0')
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.4344, device='cuda:0') factor: tensor(0.1249, device='cuda:0')
reward_raw: tensor(0.5399, device='cuda:0') reward: tensor(1.4149, device='cuda:0') factor: tensor(0.1250, device='cuda:0')
reward_raw: tensor(0.5445, device='cuda:0') reward: tensor(1.4210, device='cuda:0') factor: tensor(0.1235, device='cuda:0')
reward_raw: tensor(0.5403, device='cuda:0') reward: tensor(1.4163, device='cuda:0') factor: tensor(0.1240, device='cuda:0')


Epoch 1, Step 644: Loss = -0.1974, Actor Loss = -0.2806, Critic Loss = 0.1666, Entropy = 0.0525, Returns = 1.4329, Value = 1.4164, mask_loss = 0.1325std_loss = 0.3103:  12%|█▏        | 644/5475 [2:53:11<20:57:49, 15.62s/it]

ratio tensor(1.4274, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4507, device='cuda:0') reward: tensor(1.3199, device='cuda:0') factor: tensor(0.1307, device='cuda:0')
reward_raw: tensor(0.4399, device='cuda:0') reward: tensor(1.3105, device='cuda:0') factor: tensor(0.1294, device='cuda:0')
reward_raw: tensor(0.4394, device='cuda:0') reward: tensor(1.3064, device='cuda:0') factor: tensor(0.1330, device='cuda:0')
reward_raw: tensor(0.4379, device='cuda:0') reward: tensor(1.3074, device='cuda:0') factor: tensor(0.1305, device='cuda:0')
reward_raw: tensor(0.4313, device='cuda:0') reward: tensor(1.3047, device='cuda:0') factor: tensor(0.1265, device='cuda:0')


Epoch 1, Step 645: Loss = 0.1227, Actor Loss = 0.0860, Critic Loss = 0.0735, Entropy = 0.0614, Returns = 1.2799, Value = 1.3913, mask_loss = 0.1510std_loss = 0.3272:  12%|█▏        | 645/5475 [2:53:26<20:50:25, 15.53s/it]  

ratio tensor(0.7492, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5709, device='cuda:0') reward: tensor(1.4374, device='cuda:0') factor: tensor(0.1335, device='cuda:0')
reward_raw: tensor(0.5838, device='cuda:0') reward: tensor(1.4519, device='cuda:0') factor: tensor(0.1319, device='cuda:0')
reward_raw: tensor(0.5873, device='cuda:0') reward: tensor(1.4562, device='cuda:0') factor: tensor(0.1311, device='cuda:0')
reward_raw: tensor(0.5664, device='cuda:0') reward: tensor(1.4342, device='cuda:0') factor: tensor(0.1322, device='cuda:0')
reward_raw: tensor(0.5822, device='cuda:0') reward: tensor(1.4529, device='cuda:0') factor: tensor(0.1292, device='cuda:0')


Epoch 1, Step 646: Loss = 0.1752, Actor Loss = 0.1481, Critic Loss = 0.0542, Entropy = 0.0492, Returns = 1.4287, Value = 1.3969, mask_loss = 0.1317std_loss = 0.3128:  12%|█▏        | 646/5475 [2:53:40<20:23:17, 15.20s/it]

ratio tensor(0.8483, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3971, device='cuda:0') reward: tensor(1.2679, device='cuda:0') factor: tensor(0.1292, device='cuda:0')
reward_raw: tensor(0.3747, device='cuda:0') reward: tensor(1.2485, device='cuda:0') factor: tensor(0.1262, device='cuda:0')
reward_raw: tensor(0.4029, device='cuda:0') reward: tensor(1.2748, device='cuda:0') factor: tensor(0.1281, device='cuda:0')
reward_raw: tensor(0.3923, device='cuda:0') reward: tensor(1.2660, device='cuda:0') factor: tensor(0.1263, device='cuda:0')
reward_raw: tensor(0.4008, device='cuda:0') reward: tensor(1.2751, device='cuda:0') factor: tensor(0.1256, device='cuda:0')


Epoch 1, Step 647: Loss = 0.2115, Actor Loss = 0.1573, Critic Loss = 0.1086, Entropy = 0.0483, Returns = 1.2482, Value = 1.2246, mask_loss = 0.1101std_loss = 0.2892:  12%|█▏        | 647/5475 [2:53:57<20:48:44, 15.52s/it]

ratio tensor(0.8488, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5765, device='cuda:0') reward: tensor(1.4315, device='cuda:0') factor: tensor(0.1450, device='cuda:0')
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.3665, device='cuda:0') factor: tensor(0.1445, device='cuda:0')
reward_raw: tensor(0.5800, device='cuda:0') reward: tensor(1.4389, device='cuda:0') factor: tensor(0.1411, device='cuda:0')
reward_raw: tensor(0.5594, device='cuda:0') reward: tensor(1.4194, device='cuda:0') factor: tensor(0.1400, device='cuda:0')
reward_raw: tensor(0.5626, device='cuda:0') reward: tensor(1.4190, device='cuda:0') factor: tensor(0.1436, device='cuda:0')


Epoch 1, Step 648: Loss = -0.2985, Actor Loss = -0.3181, Critic Loss = 0.0393, Entropy = 0.0494, Returns = 1.3867, Value = 1.4078, mask_loss = 0.1356std_loss = 0.3177:  12%|█▏        | 648/5475 [2:54:10<19:47:24, 14.76s/it]

ratio tensor(1.6068, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5582, device='cuda:0') reward: tensor(1.3942, device='cuda:0') factor: tensor(0.1640, device='cuda:0')
reward_raw: tensor(0.5546, device='cuda:0') reward: tensor(1.3942, device='cuda:0') factor: tensor(0.1604, device='cuda:0')
reward_raw: tensor(0.5395, device='cuda:0') reward: tensor(1.3824, device='cuda:0') factor: tensor(0.1571, device='cuda:0')
reward_raw: tensor(0.5510, device='cuda:0') reward: tensor(1.3912, device='cuda:0') factor: tensor(0.1598, device='cuda:0')
reward_raw: tensor(0.5389, device='cuda:0') reward: tensor(1.3796, device='cuda:0') factor: tensor(0.1593, device='cuda:0')


Epoch 1, Step 649: Loss = 0.1603, Actor Loss = 0.1283, Critic Loss = 0.0641, Entropy = 0.0663, Returns = 1.3671, Value = 1.4718, mask_loss = 0.1715std_loss = 0.3413:  12%|█▏        | 649/5475 [2:54:23<19:25:27, 14.49s/it]  

ratio tensor(0.8568, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4567, device='cuda:0') reward: tensor(1.3177, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.4520, device='cuda:0') reward: tensor(1.3141, device='cuda:0') factor: tensor(0.1379, device='cuda:0')
reward_raw: tensor(0.4672, device='cuda:0') reward: tensor(1.3284, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.4451, device='cuda:0') reward: tensor(1.3055, device='cuda:0') factor: tensor(0.1396, device='cuda:0')
reward_raw: tensor(0.4372, device='cuda:0') reward: tensor(1.3004, device='cuda:0') factor: tensor(0.1368, device='cuda:0')


Epoch 1, Step 650: Loss = 0.1477, Actor Loss = 0.1229, Critic Loss = 0.0498, Entropy = 0.0599, Returns = 1.3052, Value = 1.1988, mask_loss = 0.1526std_loss = 0.3238:  12%|█▏        | 650/5475 [2:54:41<20:33:05, 15.33s/it]

ratio tensor(0.7240, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4969, device='cuda:0') reward: tensor(1.3292, device='cuda:0') factor: tensor(0.1677, device='cuda:0')
reward_raw: tensor(0.5143, device='cuda:0') reward: tensor(1.3481, device='cuda:0') factor: tensor(0.1661, device='cuda:0')
reward_raw: tensor(0.4860, device='cuda:0') reward: tensor(1.3213, device='cuda:0') factor: tensor(0.1647, device='cuda:0')
reward_raw: tensor(0.4715, device='cuda:0') reward: tensor(1.3140, device='cuda:0') factor: tensor(0.1575, device='cuda:0')
reward_raw: tensor(0.5048, device='cuda:0') reward: tensor(1.3446, device='cuda:0') factor: tensor(0.1602, device='cuda:0')


Epoch 1, Step 651: Loss = -0.1841, Actor Loss = -0.2302, Critic Loss = 0.0923, Entropy = 0.0498, Returns = 1.3018, Value = 1.3826, mask_loss = 0.1527std_loss = 0.3289:  12%|█▏        | 651/5475 [2:54:55<20:14:47, 15.11s/it]

ratio tensor(1.4822, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4882, device='cuda:0') reward: tensor(1.3440, device='cuda:0') factor: tensor(0.1441, device='cuda:0')
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.3476, device='cuda:0') factor: tensor(0.1492, device='cuda:0')
reward_raw: tensor(0.4873, device='cuda:0') reward: tensor(1.3416, device='cuda:0') factor: tensor(0.1457, device='cuda:0')
reward_raw: tensor(0.4477, device='cuda:0') reward: tensor(1.3040, device='cuda:0') factor: tensor(0.1438, device='cuda:0')
reward_raw: tensor(0.4776, device='cuda:0') reward: tensor(1.3272, device='cuda:0') factor: tensor(0.1503, device='cuda:0')


Epoch 1, Step 652: Loss = 0.1990, Actor Loss = 0.1508, Critic Loss = 0.0964, Entropy = 0.0544, Returns = 1.3718, Value = 1.3337, mask_loss = 0.1756std_loss = 0.3426:  12%|█▏        | 652/5475 [2:55:09<19:29:38, 14.55s/it]  

ratio tensor(0.7580, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4910, device='cuda:0') reward: tensor(1.3412, device='cuda:0') factor: tensor(0.1498, device='cuda:0')
reward_raw: tensor(0.4914, device='cuda:0') reward: tensor(1.3398, device='cuda:0') factor: tensor(0.1516, device='cuda:0')
reward_raw: tensor(0.5013, device='cuda:0') reward: tensor(1.3537, device='cuda:0') factor: tensor(0.1475, device='cuda:0')
reward_raw: tensor(0.4906, device='cuda:0') reward: tensor(1.3434, device='cuda:0') factor: tensor(0.1472, device='cuda:0')
reward_raw: tensor(0.4503, device='cuda:0') reward: tensor(1.3070, device='cuda:0') factor: tensor(0.1434, device='cuda:0')


Epoch 1, Step 653: Loss = 0.0019, Actor Loss = -0.0436, Critic Loss = 0.0910, Entropy = 0.0526, Returns = 1.4028, Value = 1.3576, mask_loss = 0.1334std_loss = 0.3094:  12%|█▏        | 653/5475 [2:55:22<19:05:41, 14.26s/it]

ratio tensor(0.8047, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4329, device='cuda:0') reward: tensor(1.2922, device='cuda:0') factor: tensor(0.1407, device='cuda:0')
reward_raw: tensor(0.4395, device='cuda:0') reward: tensor(1.3032, device='cuda:0') factor: tensor(0.1364, device='cuda:0')
reward_raw: tensor(0.4374, device='cuda:0') reward: tensor(1.2956, device='cuda:0') factor: tensor(0.1418, device='cuda:0')
reward_raw: tensor(0.4494, device='cuda:0') reward: tensor(1.3072, device='cuda:0') factor: tensor(0.1422, device='cuda:0')
reward_raw: tensor(0.4470, device='cuda:0') reward: tensor(1.3096, device='cuda:0') factor: tensor(0.1373, device='cuda:0')


Epoch 1, Step 654: Loss = -0.0066, Actor Loss = -0.0499, Critic Loss = 0.0867, Entropy = 0.0508, Returns = 1.3752, Value = 1.3261, mask_loss = 0.1391std_loss = 0.3202:  12%|█▏        | 654/5475 [2:55:36<18:54:50, 14.12s/it]

ratio tensor(1.1768, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4295, device='cuda:0') reward: tensor(1.2980, device='cuda:0') factor: tensor(0.1315, device='cuda:0')
reward_raw: tensor(0.4121, device='cuda:0') reward: tensor(1.2818, device='cuda:0') factor: tensor(0.1303, device='cuda:0')
reward_raw: tensor(0.4109, device='cuda:0') reward: tensor(1.2848, device='cuda:0') factor: tensor(0.1260, device='cuda:0')
reward_raw: tensor(0.4262, device='cuda:0') reward: tensor(1.2970, device='cuda:0') factor: tensor(0.1292, device='cuda:0')
reward_raw: tensor(0.4356, device='cuda:0') reward: tensor(1.3072, device='cuda:0') factor: tensor(0.1285, device='cuda:0')


Epoch 1, Step 655: Loss = 0.1391, Actor Loss = 0.0505, Critic Loss = 0.1775, Entropy = 0.0567, Returns = 1.3467, Value = 1.3344, mask_loss = 0.1360std_loss = 0.3076:  12%|█▏        | 655/5475 [2:55:55<20:53:23, 15.60s/it]  

ratio tensor(0.8068, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5475, device='cuda:0') reward: tensor(1.4018, device='cuda:0') factor: tensor(0.1457, device='cuda:0')
reward_raw: tensor(0.5061, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.1502, device='cuda:0')
reward_raw: tensor(0.5270, device='cuda:0') reward: tensor(1.3824, device='cuda:0') factor: tensor(0.1445, device='cuda:0')
reward_raw: tensor(0.4965, device='cuda:0') reward: tensor(1.3507, device='cuda:0') factor: tensor(0.1459, device='cuda:0')
reward_raw: tensor(0.5246, device='cuda:0') reward: tensor(1.3795, device='cuda:0') factor: tensor(0.1450, device='cuda:0')


Epoch 1, Step 656: Loss = -0.0596, Actor Loss = -0.1097, Critic Loss = 0.1004, Entropy = 0.0553, Returns = 1.2855, Value = 1.2967, mask_loss = 0.1549std_loss = 0.3344:  12%|█▏        | 656/5475 [2:56:10<20:43:45, 15.49s/it]

ratio tensor(1.5374, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5743, device='cuda:0') reward: tensor(1.4379, device='cuda:0') factor: tensor(0.1365, device='cuda:0')
reward_raw: tensor(0.5697, device='cuda:0') reward: tensor(1.4323, device='cuda:0') factor: tensor(0.1374, device='cuda:0')
reward_raw: tensor(0.5855, device='cuda:0') reward: tensor(1.4472, device='cuda:0') factor: tensor(0.1383, device='cuda:0')
reward_raw: tensor(0.5643, device='cuda:0') reward: tensor(1.4247, device='cuda:0') factor: tensor(0.1396, device='cuda:0')
reward_raw: tensor(0.5613, device='cuda:0') reward: tensor(1.4235, device='cuda:0') factor: tensor(0.1377, device='cuda:0')


Epoch 1, Step 657: Loss = -0.2119, Actor Loss = -0.2353, Critic Loss = 0.0470, Entropy = 0.0531, Returns = 1.4735, Value = 1.4519, mask_loss = 0.1570std_loss = 0.3361:  12%|█▏        | 657/5475 [2:56:28<21:48:16, 16.29s/it]

ratio tensor(9.5053, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5422, device='cuda:0') reward: tensor(1.4011, device='cuda:0') factor: tensor(0.1411, device='cuda:0')
reward_raw: tensor(0.5378, device='cuda:0') reward: tensor(1.3990, device='cuda:0') factor: tensor(0.1388, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.3751, device='cuda:0') factor: tensor(0.1450, device='cuda:0')
reward_raw: tensor(0.5356, device='cuda:0') reward: tensor(1.3978, device='cuda:0') factor: tensor(0.1378, device='cuda:0')
reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.3912, device='cuda:0') factor: tensor(0.1394, device='cuda:0')


Epoch 1, Step 658: Loss = 0.2978, Actor Loss = 0.2472, Critic Loss = 0.1013, Entropy = 0.0367, Returns = 1.3840, Value = 1.4865, mask_loss = 0.1298std_loss = 0.3191:  12%|█▏        | 658/5475 [2:56:41<20:22:06, 15.22s/it]  

ratio tensor(0.5704, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4634, device='cuda:0') reward: tensor(1.3611, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.4657, device='cuda:0') reward: tensor(1.3628, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.4631, device='cuda:0') reward: tensor(1.3641, device='cuda:0') factor: tensor(0.0990, device='cuda:0')
reward_raw: tensor(0.4582, device='cuda:0') reward: tensor(1.3536, device='cuda:0') factor: tensor(0.1046, device='cuda:0')
reward_raw: tensor(0.4460, device='cuda:0') reward: tensor(1.3462, device='cuda:0') factor: tensor(0.0998, device='cuda:0')


Epoch 1, Step 659: Loss = 0.0950, Actor Loss = 0.0244, Critic Loss = 0.1412, Entropy = 0.0403, Returns = 1.3866, Value = 1.2035, mask_loss = 0.1071std_loss = 0.2834:  12%|█▏        | 659/5475 [2:57:06<24:06:19, 18.02s/it]

ratio tensor(0.8343, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5061, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.1630, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.3448, device='cuda:0') factor: tensor(0.1633, device='cuda:0')
reward_raw: tensor(0.4858, device='cuda:0') reward: tensor(1.3239, device='cuda:0') factor: tensor(0.1618, device='cuda:0')
reward_raw: tensor(0.4812, device='cuda:0') reward: tensor(1.3174, device='cuda:0') factor: tensor(0.1638, device='cuda:0')
reward_raw: tensor(0.5042, device='cuda:0') reward: tensor(1.3438, device='cuda:0') factor: tensor(0.1604, device='cuda:0')


Epoch 1, Step 660: Loss = -0.0211, Actor Loss = -0.1182, Critic Loss = 0.1944, Entropy = 0.0543, Returns = 1.3427, Value = 1.5081, mask_loss = 0.1663std_loss = 0.3435:  12%|█▏        | 660/5475 [2:57:19<22:23:52, 16.75s/it]

ratio tensor(1.6593, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3863, device='cuda:0') reward: tensor(1.2574, device='cuda:0') factor: tensor(0.1289, device='cuda:0')
reward_raw: tensor(0.3610, device='cuda:0') reward: tensor(1.2273, device='cuda:0') factor: tensor(0.1336, device='cuda:0')
reward_raw: tensor(0.3749, device='cuda:0') reward: tensor(1.2445, device='cuda:0') factor: tensor(0.1305, device='cuda:0')
reward_raw: tensor(0.3745, device='cuda:0') reward: tensor(1.2444, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.3881, device='cuda:0') reward: tensor(1.2554, device='cuda:0') factor: tensor(0.1327, device='cuda:0')


Epoch 1, Step 661: Loss = 0.3194, Actor Loss = 0.2734, Critic Loss = 0.0921, Entropy = 0.0609, Returns = 1.2128, Value = 1.1938, mask_loss = 0.1428std_loss = 0.3148:  12%|█▏        | 660/5475 [2:57:32<22:23:52, 16.75s/it]  

ratio tensor(1.4688, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 661: Loss = 0.3194, Actor Loss = 0.2734, Critic Loss = 0.0921, Entropy = 0.0609, Returns = 1.2128, Value = 1.1938, mask_loss = 0.1428std_loss = 0.3148:  12%|█▏        | 661/5475 [2:57:32<20:38:41, 15.44s/it]

reward_raw: tensor(0.4270, device='cuda:0') reward: tensor(1.2472, device='cuda:0') factor: tensor(0.1798, device='cuda:0')
reward_raw: tensor(0.4246, device='cuda:0') reward: tensor(1.2508, device='cuda:0') factor: tensor(0.1738, device='cuda:0')
reward_raw: tensor(0.4167, device='cuda:0') reward: tensor(1.2412, device='cuda:0') factor: tensor(0.1755, device='cuda:0')
reward_raw: tensor(0.4295, device='cuda:0') reward: tensor(1.2546, device='cuda:0') factor: tensor(0.1750, device='cuda:0')
reward_raw: tensor(0.4111, device='cuda:0') reward: tensor(1.2360, device='cuda:0') factor: tensor(0.1750, device='cuda:0')


Epoch 1, Step 662: Loss = -0.0650, Actor Loss = -0.0933, Critic Loss = 0.0567, Entropy = 0.0561, Returns = 1.2694, Value = 1.2294, mask_loss = 0.1738std_loss = 0.3412:  12%|█▏        | 662/5475 [2:57:50<21:54:55, 16.39s/it]

ratio tensor(1.0335, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4817, device='cuda:0') reward: tensor(1.3775, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.4854, device='cuda:0') reward: tensor(1.3833, device='cuda:0') factor: tensor(0.1021, device='cuda:0')
reward_raw: tensor(0.4925, device='cuda:0') reward: tensor(1.3865, device='cuda:0') factor: tensor(0.1060, device='cuda:0')
reward_raw: tensor(0.4988, device='cuda:0') reward: tensor(1.3940, device='cuda:0') factor: tensor(0.1048, device='cuda:0')
reward_raw: tensor(0.4931, device='cuda:0') reward: tensor(1.3912, device='cuda:0') factor: tensor(0.1018, device='cuda:0')


Epoch 1, Step 663: Loss = -0.0777, Actor Loss = -0.1438, Critic Loss = 0.1324, Entropy = 0.0248, Returns = 1.4260, Value = 1.4435, mask_loss = 0.0852std_loss = 0.2667:  12%|█▏        | 663/5475 [2:58:09<22:44:24, 17.01s/it]

ratio tensor(1.0870, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5413, device='cuda:0') reward: tensor(1.4247, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.5522, device='cuda:0') reward: tensor(1.4351, device='cuda:0') factor: tensor(0.1171, device='cuda:0')
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.4345, device='cuda:0') factor: tensor(0.1167, device='cuda:0')
reward_raw: tensor(0.5334, device='cuda:0') reward: tensor(1.4131, device='cuda:0') factor: tensor(0.1203, device='cuda:0')
reward_raw: tensor(0.5408, device='cuda:0') reward: tensor(1.4229, device='cuda:0') factor: tensor(0.1179, device='cuda:0')


Epoch 1, Step 664: Loss = 0.0357, Actor Loss = -0.0065, Critic Loss = 0.0845, Entropy = 0.0494, Returns = 1.3801, Value = 1.3537, mask_loss = 0.1240std_loss = 0.3063:  12%|█▏        | 664/5475 [2:58:29<24:05:15, 18.02s/it] 

ratio tensor(1.5347, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5361, device='cuda:0') reward: tensor(1.4027, device='cuda:0') factor: tensor(0.1334, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.4259, device='cuda:0') factor: tensor(0.1345, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.4129, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.5321, device='cuda:0') reward: tensor(1.4005, device='cuda:0') factor: tensor(0.1316, device='cuda:0')
reward_raw: tensor(0.5654, device='cuda:0') reward: tensor(1.4317, device='cuda:0') factor: tensor(0.1337, device='cuda:0')


Epoch 1, Step 665: Loss = 0.0235, Actor Loss = -0.0319, Critic Loss = 0.1107, Entropy = 0.0582, Returns = 1.4333, Value = 1.3373, mask_loss = 0.1507std_loss = 0.3239:  12%|█▏        | 665/5475 [2:58:46<23:38:55, 17.70s/it]

ratio tensor(1.0880, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.4184, device='cuda:0') factor: tensor(0.1409, device='cuda:0')
reward_raw: tensor(0.5506, device='cuda:0') reward: tensor(1.4081, device='cuda:0') factor: tensor(0.1425, device='cuda:0')
reward_raw: tensor(0.5498, device='cuda:0') reward: tensor(1.4101, device='cuda:0') factor: tensor(0.1397, device='cuda:0')
reward_raw: tensor(0.5480, device='cuda:0') reward: tensor(1.4094, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.4033, device='cuda:0') factor: tensor(0.1387, device='cuda:0')


Epoch 1, Step 666: Loss = -0.0534, Actor Loss = -0.1016, Critic Loss = 0.0966, Entropy = 0.0479, Returns = 1.4015, Value = 1.4000, mask_loss = 0.1391std_loss = 0.3135:  12%|█▏        | 666/5475 [2:59:02<22:56:59, 17.18s/it]

ratio tensor(3.3067, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5510, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.1638, device='cuda:0')
reward_raw: tensor(0.5316, device='cuda:0') reward: tensor(1.3668, device='cuda:0') factor: tensor(0.1647, device='cuda:0')
reward_raw: tensor(0.5384, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.1690, device='cuda:0')
reward_raw: tensor(0.5458, device='cuda:0') reward: tensor(1.3817, device='cuda:0') factor: tensor(0.1641, device='cuda:0')
reward_raw: tensor(0.5408, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.1609, device='cuda:0')


Epoch 1, Step 667: Loss = 0.2587, Actor Loss = 0.1602, Critic Loss = 0.1970, Entropy = 0.0380, Returns = 1.2825, Value = 1.3017, mask_loss = 0.1261std_loss = 0.3041:  12%|█▏        | 667/5475 [2:59:18<22:33:01, 16.88s/it]  

ratio tensor(0.8059, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4866, device='cuda:0') reward: tensor(1.3597, device='cuda:0') factor: tensor(0.1269, device='cuda:0')
reward_raw: tensor(0.4685, device='cuda:0') reward: tensor(1.3463, device='cuda:0') factor: tensor(0.1222, device='cuda:0')
reward_raw: tensor(0.4902, device='cuda:0') reward: tensor(1.3612, device='cuda:0') factor: tensor(0.1290, device='cuda:0')
reward_raw: tensor(0.4957, device='cuda:0') reward: tensor(1.3753, device='cuda:0') factor: tensor(0.1204, device='cuda:0')
reward_raw: tensor(0.4768, device='cuda:0') reward: tensor(1.3529, device='cuda:0') factor: tensor(0.1239, device='cuda:0')


Epoch 1, Step 668: Loss = -0.0413, Actor Loss = -0.0879, Critic Loss = 0.0934, Entropy = 0.0515, Returns = 1.3370, Value = 1.3800, mask_loss = 0.1245std_loss = 0.3030:  12%|█▏        | 668/5475 [2:59:34<22:06:18, 16.55s/it]

ratio tensor(1.3476, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5873, device='cuda:0') reward: tensor(1.4673, device='cuda:0') factor: tensor(0.1200, device='cuda:0')
reward_raw: tensor(0.5713, device='cuda:0') reward: tensor(1.4465, device='cuda:0') factor: tensor(0.1248, device='cuda:0')
reward_raw: tensor(0.5910, device='cuda:0') reward: tensor(1.4684, device='cuda:0') factor: tensor(0.1226, device='cuda:0')
reward_raw: tensor(0.5677, device='cuda:0') reward: tensor(1.4435, device='cuda:0') factor: tensor(0.1242, device='cuda:0')
reward_raw: tensor(0.5898, device='cuda:0') reward: tensor(1.4676, device='cuda:0') factor: tensor(0.1222, device='cuda:0')


Epoch 1, Step 669: Loss = -0.1748, Actor Loss = -0.2225, Critic Loss = 0.0955, Entropy = 0.0473, Returns = 1.3936, Value = 1.4805, mask_loss = 0.1399std_loss = 0.3197:  12%|█▏        | 669/5475 [2:59:50<21:44:03, 16.28s/it]

ratio tensor(2.2771, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5178, device='cuda:0') reward: tensor(1.4059, device='cuda:0') factor: tensor(0.1120, device='cuda:0')
reward_raw: tensor(0.4949, device='cuda:0') reward: tensor(1.3866, device='cuda:0') factor: tensor(0.1083, device='cuda:0')
reward_raw: tensor(0.5019, device='cuda:0') reward: tensor(1.3871, device='cuda:0') factor: tensor(0.1148, device='cuda:0')
reward_raw: tensor(0.5344, device='cuda:0') reward: tensor(1.4189, device='cuda:0') factor: tensor(0.1156, device='cuda:0')
reward_raw: tensor(0.4981, device='cuda:0') reward: tensor(1.3879, device='cuda:0') factor: tensor(0.1102, device='cuda:0')


Epoch 1, Step 670: Loss = 0.2732, Actor Loss = 0.2369, Critic Loss = 0.0726, Entropy = 0.0334, Returns = 1.3972, Value = 1.3519, mask_loss = 0.1047std_loss = 0.2849:  12%|█▏        | 670/5475 [3:00:12<23:57:08, 17.95s/it]  

ratio tensor(0.9794, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4214, device='cuda:0') reward: tensor(1.3012, device='cuda:0') factor: tensor(0.1202, device='cuda:0')
reward_raw: tensor(0.4291, device='cuda:0') reward: tensor(1.3082, device='cuda:0') factor: tensor(0.1210, device='cuda:0')
reward_raw: tensor(0.4039, device='cuda:0') reward: tensor(1.2858, device='cuda:0') factor: tensor(0.1181, device='cuda:0')
reward_raw: tensor(0.4162, device='cuda:0') reward: tensor(1.2977, device='cuda:0') factor: tensor(0.1184, device='cuda:0')
reward_raw: tensor(0.4251, device='cuda:0') reward: tensor(1.3057, device='cuda:0') factor: tensor(0.1194, device='cuda:0')


Epoch 1, Step 671: Loss = 0.0029, Actor Loss = -0.0331, Critic Loss = 0.0719, Entropy = 0.0313, Returns = 1.3244, Value = 1.1698, mask_loss = 0.0962std_loss = 0.2740:  12%|█▏        | 671/5475 [3:00:34<25:50:59, 19.37s/it]

ratio tensor(1.5541, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3522, device='cuda:0') factor: tensor(0.1153, device='cuda:0')
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.3501, device='cuda:0') factor: tensor(0.1155, device='cuda:0')
reward_raw: tensor(0.4567, device='cuda:0') reward: tensor(1.3450, device='cuda:0') factor: tensor(0.1117, device='cuda:0')
reward_raw: tensor(0.4746, device='cuda:0') reward: tensor(1.3590, device='cuda:0') factor: tensor(0.1155, device='cuda:0')
reward_raw: tensor(0.4756, device='cuda:0') reward: tensor(1.3612, device='cuda:0') factor: tensor(0.1144, device='cuda:0')


Epoch 1, Step 672: Loss = -0.1417, Actor Loss = -0.1852, Critic Loss = 0.0873, Entropy = 0.0389, Returns = 1.4141, Value = 1.3443, mask_loss = 0.1137std_loss = 0.2921:  12%|█▏        | 672/5475 [3:00:49<23:53:00, 17.90s/it]

ratio tensor(1.9319, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.3663, device='cuda:0') factor: tensor(0.1378, device='cuda:0')
reward_raw: tensor(0.4908, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.4959, device='cuda:0') reward: tensor(1.3547, device='cuda:0') factor: tensor(0.1412, device='cuda:0')
reward_raw: tensor(0.4882, device='cuda:0') reward: tensor(1.3499, device='cuda:0') factor: tensor(0.1383, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3593, device='cuda:0') factor: tensor(0.1350, device='cuda:0')


Epoch 1, Step 673: Loss = -0.0923, Actor Loss = -0.1291, Critic Loss = 0.0738, Entropy = 0.0396, Returns = 1.3917, Value = 1.3499, mask_loss = 0.1407std_loss = 0.3243:  12%|█▏        | 673/5475 [3:01:03<22:12:12, 16.65s/it]

ratio tensor(1.0019, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4473, device='cuda:0') reward: tensor(1.3442, device='cuda:0') factor: tensor(0.1031, device='cuda:0')
reward_raw: tensor(0.4444, device='cuda:0') reward: tensor(1.3457, device='cuda:0') factor: tensor(0.0987, device='cuda:0')
reward_raw: tensor(0.4429, device='cuda:0') reward: tensor(1.3411, device='cuda:0') factor: tensor(0.1019, device='cuda:0')
reward_raw: tensor(0.4336, device='cuda:0') reward: tensor(1.3300, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.4429, device='cuda:0') reward: tensor(1.3405, device='cuda:0') factor: tensor(0.1023, device='cuda:0')


Epoch 1, Step 674: Loss = 0.0267, Actor Loss = -0.0175, Critic Loss = 0.0884, Entropy = 0.0338, Returns = 1.3178, Value = 1.3699, mask_loss = 0.1003std_loss = 0.2770:  12%|█▏        | 674/5475 [3:01:23<23:36:47, 17.71s/it] 

ratio tensor(0.6641, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4869, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.1367, device='cuda:0')
reward_raw: tensor(0.4686, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.1336, device='cuda:0')
reward_raw: tensor(0.4834, device='cuda:0') reward: tensor(1.3469, device='cuda:0') factor: tensor(0.1365, device='cuda:0')
reward_raw: tensor(0.4961, device='cuda:0') reward: tensor(1.3615, device='cuda:0') factor: tensor(0.1346, device='cuda:0')
reward_raw: tensor(0.5066, device='cuda:0') reward: tensor(1.3729, device='cuda:0') factor: tensor(0.1337, device='cuda:0')


Epoch 1, Step 675: Loss = 0.0790, Actor Loss = 0.0449, Critic Loss = 0.0682, Entropy = 0.0408, Returns = 1.3135, Value = 1.4040, mask_loss = 0.1348std_loss = 0.3163:  12%|█▏        | 675/5475 [3:01:39<22:49:29, 17.12s/it] 

ratio tensor(0.8186, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5236, device='cuda:0') reward: tensor(1.3959, device='cuda:0') factor: tensor(0.1278, device='cuda:0')
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.3881, device='cuda:0') factor: tensor(0.1251, device='cuda:0')
reward_raw: tensor(0.5092, device='cuda:0') reward: tensor(1.3817, device='cuda:0') factor: tensor(0.1275, device='cuda:0')
reward_raw: tensor(0.5157, device='cuda:0') reward: tensor(1.3887, device='cuda:0') factor: tensor(0.1270, device='cuda:0')
reward_raw: tensor(0.5145, device='cuda:0') reward: tensor(1.3847, device='cuda:0') factor: tensor(0.1298, device='cuda:0')


Epoch 1, Step 676: Loss = 0.1191, Actor Loss = 0.0854, Critic Loss = 0.0674, Entropy = 0.0286, Returns = 1.3597, Value = 1.4032, mask_loss = 0.0961std_loss = 0.2750:  12%|█▏        | 676/5475 [3:01:56<22:52:45, 17.16s/it]

ratio tensor(0.8725, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4705, device='cuda:0') reward: tensor(1.3319, device='cuda:0') factor: tensor(0.1386, device='cuda:0')
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.3579, device='cuda:0') factor: tensor(0.1382, device='cuda:0')
reward_raw: tensor(0.4922, device='cuda:0') reward: tensor(1.3478, device='cuda:0') factor: tensor(0.1444, device='cuda:0')
reward_raw: tensor(0.4988, device='cuda:0') reward: tensor(1.3540, device='cuda:0') factor: tensor(0.1448, device='cuda:0')
reward_raw: tensor(0.4823, device='cuda:0') reward: tensor(1.3447, device='cuda:0') factor: tensor(0.1377, device='cuda:0')


Epoch 1, Step 677: Loss = 0.0707, Actor Loss = 0.0455, Critic Loss = 0.0503, Entropy = 0.0383, Returns = 1.2974, Value = 1.3076, mask_loss = 0.1297std_loss = 0.3129:  12%|█▏        | 677/5475 [3:02:16<24:17:40, 18.23s/it]

ratio tensor(0.8955, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5327, device='cuda:0') reward: tensor(1.4184, device='cuda:0') factor: tensor(0.1142, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.4022, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.4044, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.5400, device='cuda:0') reward: tensor(1.4246, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.5196, device='cuda:0') reward: tensor(1.4019, device='cuda:0') factor: tensor(0.1177, device='cuda:0')


Epoch 1, Step 678: Loss = -0.1272, Actor Loss = -0.1723, Critic Loss = 0.0904, Entropy = 0.0241, Returns = 1.4397, Value = 1.4384, mask_loss = 0.1124std_loss = 0.2980:  12%|█▏        | 678/5475 [3:02:33<23:45:20, 17.83s/it]

ratio tensor(1.6112, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5331, device='cuda:0') reward: tensor(1.4072, device='cuda:0') factor: tensor(0.1259, device='cuda:0')
reward_raw: tensor(0.5324, device='cuda:0') reward: tensor(1.4073, device='cuda:0') factor: tensor(0.1252, device='cuda:0')
reward_raw: tensor(0.5476, device='cuda:0') reward: tensor(1.4174, device='cuda:0') factor: tensor(0.1301, device='cuda:0')
reward_raw: tensor(0.5424, device='cuda:0') reward: tensor(1.4163, device='cuda:0') factor: tensor(0.1260, device='cuda:0')
reward_raw: tensor(0.5487, device='cuda:0') reward: tensor(1.4248, device='cuda:0') factor: tensor(0.1239, device='cuda:0')


Epoch 1, Step 679: Loss = -0.1413, Actor Loss = -0.1777, Critic Loss = 0.0729, Entropy = 0.0290, Returns = 1.4489, Value = 1.2984, mask_loss = 0.1186std_loss = 0.3034:  12%|█▏        | 679/5475 [3:02:49<22:48:39, 17.12s/it]

ratio tensor(1.0706, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4084, device='cuda:0') reward: tensor(1.2889, device='cuda:0') factor: tensor(0.1195, device='cuda:0')
reward_raw: tensor(0.4006, device='cuda:0') reward: tensor(1.2827, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.4129, device='cuda:0') reward: tensor(1.2948, device='cuda:0') factor: tensor(0.1181, device='cuda:0')
reward_raw: tensor(0.4025, device='cuda:0') reward: tensor(1.2865, device='cuda:0') factor: tensor(0.1160, device='cuda:0')
reward_raw: tensor(0.3996, device='cuda:0') reward: tensor(1.2829, device='cuda:0') factor: tensor(0.1167, device='cuda:0')


Epoch 1, Step 680: Loss = 0.2073, Actor Loss = 0.1838, Critic Loss = 0.0469, Entropy = 0.0337, Returns = 1.2767, Value = 1.1937, mask_loss = 0.1316std_loss = 0.3105:  12%|█▏        | 680/5475 [3:03:07<23:18:41, 17.50s/it]  

ratio tensor(0.5737, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5471, device='cuda:0') reward: tensor(1.4431, device='cuda:0') factor: tensor(0.1040, device='cuda:0')
reward_raw: tensor(0.5406, device='cuda:0') reward: tensor(1.4371, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.5497, device='cuda:0') reward: tensor(1.4459, device='cuda:0') factor: tensor(0.1039, device='cuda:0')
reward_raw: tensor(0.5472, device='cuda:0') reward: tensor(1.4427, device='cuda:0') factor: tensor(0.1045, device='cuda:0')
reward_raw: tensor(0.5418, device='cuda:0') reward: tensor(1.4362, device='cuda:0') factor: tensor(0.1056, device='cuda:0')


Epoch 1, Step 681: Loss = -0.2838, Actor Loss = -0.3285, Critic Loss = 0.0896, Entropy = 0.0335, Returns = 1.4597, Value = 1.2734, mask_loss = 0.1068std_loss = 0.2902:  12%|█▏        | 680/5475 [3:03:23<23:18:41, 17.50s/it]

ratio tensor(3.6414, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 681: Loss = -0.2838, Actor Loss = -0.3285, Critic Loss = 0.0896, Entropy = 0.0335, Returns = 1.4597, Value = 1.2734, mask_loss = 0.1068std_loss = 0.2902:  12%|█▏        | 681/5475 [3:03:23<22:34:20, 16.95s/it]

reward_raw: tensor(0.4443, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.4323, device='cuda:0') reward: tensor(1.3236, device='cuda:0') factor: tensor(0.1087, device='cuda:0')
reward_raw: tensor(0.4348, device='cuda:0') reward: tensor(1.3279, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.4451, device='cuda:0') reward: tensor(1.3348, device='cuda:0') factor: tensor(0.1103, device='cuda:0')
reward_raw: tensor(0.4497, device='cuda:0') reward: tensor(1.3402, device='cuda:0') factor: tensor(0.1095, device='cuda:0')


Epoch 1, Step 682: Loss = 0.0815, Actor Loss = -0.0047, Critic Loss = 0.1725, Entropy = 0.0247, Returns = 1.3056, Value = 1.4765, mask_loss = 0.0973std_loss = 0.2814:  12%|█▏        | 682/5475 [3:03:36<21:11:23, 15.92s/it] 

ratio tensor(0.7238, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.3663, device='cuda:0') factor: tensor(0.1211, device='cuda:0')
reward_raw: tensor(0.5244, device='cuda:0') reward: tensor(1.4036, device='cuda:0') factor: tensor(0.1208, device='cuda:0')
reward_raw: tensor(0.5049, device='cuda:0') reward: tensor(1.3833, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.5051, device='cuda:0') reward: tensor(1.3849, device='cuda:0') factor: tensor(0.1202, device='cuda:0')
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.3758, device='cuda:0') factor: tensor(0.1233, device='cuda:0')


Epoch 1, Step 683: Loss = 0.1752, Actor Loss = 0.1030, Critic Loss = 0.1444, Entropy = 0.0443, Returns = 1.4008, Value = 1.3729, mask_loss = 0.1426std_loss = 0.3270:  12%|█▏        | 683/5475 [3:03:50<20:08:23, 15.13s/it] 

ratio tensor(0.8813, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5349, device='cuda:0') reward: tensor(1.4222, device='cuda:0') factor: tensor(0.1127, device='cuda:0')
reward_raw: tensor(0.5241, device='cuda:0') reward: tensor(1.4103, device='cuda:0') factor: tensor(0.1137, device='cuda:0')
reward_raw: tensor(0.5309, device='cuda:0') reward: tensor(1.4141, device='cuda:0') factor: tensor(0.1168, device='cuda:0')
reward_raw: tensor(0.5287, device='cuda:0') reward: tensor(1.4143, device='cuda:0') factor: tensor(0.1144, device='cuda:0')
reward_raw: tensor(0.5173, device='cuda:0') reward: tensor(1.4026, device='cuda:0') factor: tensor(0.1146, device='cuda:0')


Epoch 1, Step 684: Loss = 0.3235, Actor Loss = 0.2320, Critic Loss = 0.1831, Entropy = 0.0320, Returns = 1.3226, Value = 1.5043, mask_loss = 0.1279std_loss = 0.3178:  12%|█▏        | 684/5475 [3:04:03<19:34:43, 14.71s/it]

ratio tensor(0.5228, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4448, device='cuda:0') reward: tensor(1.3206, device='cuda:0') factor: tensor(0.1242, device='cuda:0')
reward_raw: tensor(0.4579, device='cuda:0') reward: tensor(1.3313, device='cuda:0') factor: tensor(0.1267, device='cuda:0')
reward_raw: tensor(0.4534, device='cuda:0') reward: tensor(1.3286, device='cuda:0') factor: tensor(0.1249, device='cuda:0')
reward_raw: tensor(0.4613, device='cuda:0') reward: tensor(1.3364, device='cuda:0') factor: tensor(0.1249, device='cuda:0')
reward_raw: tensor(0.4446, device='cuda:0') reward: tensor(1.3195, device='cuda:0') factor: tensor(0.1251, device='cuda:0')


Epoch 1, Step 685: Loss = 0.0584, Actor Loss = 0.0105, Critic Loss = 0.0960, Entropy = 0.0322, Returns = 1.3496, Value = 1.2805, mask_loss = 0.1261std_loss = 0.3108:  13%|█▎        | 685/5475 [3:04:19<19:53:25, 14.95s/it]

ratio tensor(8.2750, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5287, device='cuda:0') reward: tensor(1.4059, device='cuda:0') factor: tensor(0.1228, device='cuda:0')
reward_raw: tensor(0.5466, device='cuda:0') reward: tensor(1.4207, device='cuda:0') factor: tensor(0.1259, device='cuda:0')
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.4095, device='cuda:0') factor: tensor(0.1257, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.4181, device='cuda:0') factor: tensor(0.1233, device='cuda:0')
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.4142, device='cuda:0') factor: tensor(0.1203, device='cuda:0')


Epoch 1, Step 686: Loss = -0.1649, Actor Loss = -0.2179, Critic Loss = 0.1060, Entropy = 0.0326, Returns = 1.3950, Value = 1.3110, mask_loss = 0.1296std_loss = 0.3185:  13%|█▎        | 686/5475 [3:04:32<19:11:37, 14.43s/it]

ratio tensor(1.3849, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5282, device='cuda:0') reward: tensor(1.4092, device='cuda:0') factor: tensor(0.1189, device='cuda:0')
reward_raw: tensor(0.5179, device='cuda:0') reward: tensor(1.4003, device='cuda:0') factor: tensor(0.1176, device='cuda:0')
reward_raw: tensor(0.5152, device='cuda:0') reward: tensor(1.4037, device='cuda:0') factor: tensor(0.1115, device='cuda:0')
reward_raw: tensor(0.5223, device='cuda:0') reward: tensor(1.4044, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.5321, device='cuda:0') reward: tensor(1.4146, device='cuda:0') factor: tensor(0.1175, device='cuda:0')


Epoch 1, Step 687: Loss = 0.1880, Actor Loss = 0.1185, Critic Loss = 0.1389, Entropy = 0.0288, Returns = 1.4417, Value = 1.4192, mask_loss = 0.1230std_loss = 0.3088:  13%|█▎        | 687/5475 [3:04:49<19:59:36, 15.03s/it]  

ratio tensor(0.6900, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.4078, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.4811, device='cuda:0') reward: tensor(1.3838, device='cuda:0') factor: tensor(0.0972, device='cuda:0')
reward_raw: tensor(0.4868, device='cuda:0') reward: tensor(1.3844, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.4858, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.0994, device='cuda:0')
reward_raw: tensor(0.4713, device='cuda:0') reward: tensor(1.3728, device='cuda:0') factor: tensor(0.0986, device='cuda:0')


Epoch 1, Step 688: Loss = -0.1968, Actor Loss = -0.2348, Critic Loss = 0.0759, Entropy = 0.0324, Returns = 1.4019, Value = 1.3348, mask_loss = 0.1171std_loss = 0.2998:  13%|█▎        | 688/5475 [3:05:10<22:42:00, 17.07s/it]

ratio tensor(0.9829, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5714, device='cuda:0') reward: tensor(1.4304, device='cuda:0') factor: tensor(0.1410, device='cuda:0')
reward_raw: tensor(0.5706, device='cuda:0') reward: tensor(1.4319, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.5473, device='cuda:0') reward: tensor(1.4078, device='cuda:0') factor: tensor(0.1394, device='cuda:0')
reward_raw: tensor(0.5707, device='cuda:0') reward: tensor(1.4327, device='cuda:0') factor: tensor(0.1381, device='cuda:0')
reward_raw: tensor(0.5692, device='cuda:0') reward: tensor(1.4288, device='cuda:0') factor: tensor(0.1404, device='cuda:0')


Epoch 1, Step 689: Loss = 0.0307, Actor Loss = -0.0049, Critic Loss = 0.0713, Entropy = 0.0519, Returns = 1.4558, Value = 1.5693, mask_loss = 0.1570std_loss = 0.3306:  13%|█▎        | 689/5475 [3:05:24<21:18:50, 16.03s/it] 

ratio tensor(0.7824, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5336, device='cuda:0') reward: tensor(1.4244, device='cuda:0') factor: tensor(0.1092, device='cuda:0')
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.4113, device='cuda:0') factor: tensor(0.1082, device='cuda:0')
reward_raw: tensor(0.5206, device='cuda:0') reward: tensor(1.4136, device='cuda:0') factor: tensor(0.1070, device='cuda:0')
reward_raw: tensor(0.5493, device='cuda:0') reward: tensor(1.4369, device='cuda:0') factor: tensor(0.1124, device='cuda:0')
reward_raw: tensor(0.5211, device='cuda:0') reward: tensor(1.4139, device='cuda:0') factor: tensor(0.1071, device='cuda:0')


Epoch 1, Step 690: Loss = 0.1654, Actor Loss = 0.1325, Critic Loss = 0.0658, Entropy = 0.0320, Returns = 1.3967, Value = 1.4031, mask_loss = 0.1079std_loss = 0.2912:  13%|█▎        | 690/5475 [3:05:38<20:28:12, 15.40s/it] 

ratio tensor(0.6624, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5162, device='cuda:0') reward: tensor(1.3900, device='cuda:0') factor: tensor(0.1262, device='cuda:0')
reward_raw: tensor(0.5122, device='cuda:0') reward: tensor(1.3845, device='cuda:0') factor: tensor(0.1277, device='cuda:0')
reward_raw: tensor(0.5168, device='cuda:0') reward: tensor(1.3846, device='cuda:0') factor: tensor(0.1321, device='cuda:0')
reward_raw: tensor(0.5355, device='cuda:0') reward: tensor(1.4066, device='cuda:0') factor: tensor(0.1289, device='cuda:0')
reward_raw: tensor(0.5375, device='cuda:0') reward: tensor(1.4101, device='cuda:0') factor: tensor(0.1274, device='cuda:0')


Epoch 1, Step 691: Loss = 0.0035, Actor Loss = -0.0362, Critic Loss = 0.0794, Entropy = 0.0324, Returns = 1.3592, Value = 1.3476, mask_loss = 0.1236std_loss = 0.3067:  13%|█▎        | 691/5475 [3:05:54<20:42:32, 15.58s/it]

ratio tensor(0.9641, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.4126, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.4933, device='cuda:0') reward: tensor(1.3973, device='cuda:0') factor: tensor(0.0960, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.4071, device='cuda:0') factor: tensor(0.0927, device='cuda:0')
reward_raw: tensor(0.4824, device='cuda:0') reward: tensor(1.3912, device='cuda:0') factor: tensor(0.0912, device='cuda:0')
reward_raw: tensor(0.4711, device='cuda:0') reward: tensor(1.3757, device='cuda:0') factor: tensor(0.0954, device='cuda:0')


Epoch 1, Step 692: Loss = 0.1340, Actor Loss = 0.0889, Critic Loss = 0.0904, Entropy = 0.0315, Returns = 1.3409, Value = 1.4258, mask_loss = 0.1048std_loss = 0.2873:  13%|█▎        | 692/5475 [3:06:10<20:44:20, 15.61s/it] 

ratio tensor(0.9689, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3917, device='cuda:0') reward: tensor(1.2879, device='cuda:0') factor: tensor(0.1038, device='cuda:0')
reward_raw: tensor(0.3819, device='cuda:0') reward: tensor(1.2812, device='cuda:0') factor: tensor(0.1007, device='cuda:0')
reward_raw: tensor(0.3833, device='cuda:0') reward: tensor(1.2788, device='cuda:0') factor: tensor(0.1045, device='cuda:0')
reward_raw: tensor(0.4004, device='cuda:0') reward: tensor(1.2934, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.3817, device='cuda:0') reward: tensor(1.2815, device='cuda:0') factor: tensor(0.1002, device='cuda:0')


Epoch 1, Step 693: Loss = 0.2047, Actor Loss = 0.1235, Critic Loss = 0.1625, Entropy = 0.0310, Returns = 1.3131, Value = 1.2306, mask_loss = 0.0919std_loss = 0.2662:  13%|█▎        | 693/5475 [3:06:29<22:06:10, 16.64s/it]

ratio tensor(0.9405, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4343, device='cuda:0') reward: tensor(1.3122, device='cuda:0') factor: tensor(0.1221, device='cuda:0')
reward_raw: tensor(0.4425, device='cuda:0') reward: tensor(1.3232, device='cuda:0') factor: tensor(0.1193, device='cuda:0')
reward_raw: tensor(0.4461, device='cuda:0') reward: tensor(1.3206, device='cuda:0') factor: tensor(0.1255, device='cuda:0')
reward_raw: tensor(0.4224, device='cuda:0') reward: tensor(1.3000, device='cuda:0') factor: tensor(0.1225, device='cuda:0')
reward_raw: tensor(0.4222, device='cuda:0') reward: tensor(1.2997, device='cuda:0') factor: tensor(0.1225, device='cuda:0')


Epoch 1, Step 694: Loss = 0.0102, Actor Loss = -0.0327, Critic Loss = 0.0859, Entropy = 0.0342, Returns = 1.3104, Value = 1.4512, mask_loss = 0.1173std_loss = 0.3011:  13%|█▎        | 694/5475 [3:06:41<20:11:58, 15.21s/it]

ratio tensor(1.0854, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.3627, device='cuda:0') factor: tensor(0.1028, device='cuda:0')
reward_raw: tensor(0.4409, device='cuda:0') reward: tensor(1.3387, device='cuda:0') factor: tensor(0.1022, device='cuda:0')
reward_raw: tensor(0.4534, device='cuda:0') reward: tensor(1.3542, device='cuda:0') factor: tensor(0.0993, device='cuda:0')
reward_raw: tensor(0.4467, device='cuda:0') reward: tensor(1.3447, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.4639, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.1021, device='cuda:0')


Epoch 1, Step 695: Loss = 0.1109, Actor Loss = 0.0789, Critic Loss = 0.0640, Entropy = 0.0301, Returns = 1.3368, Value = 1.3864, mask_loss = 0.0968std_loss = 0.2769:  13%|█▎        | 695/5475 [3:07:02<22:48:27, 17.18s/it] 

ratio tensor(0.8461, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4290, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.0995, device='cuda:0')
reward_raw: tensor(0.4257, device='cuda:0') reward: tensor(1.3282, device='cuda:0') factor: tensor(0.0975, device='cuda:0')
reward_raw: tensor(0.4339, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.0984, device='cuda:0')
reward_raw: tensor(0.4225, device='cuda:0') reward: tensor(1.3226, device='cuda:0') factor: tensor(0.1000, device='cuda:0')
reward_raw: tensor(0.4239, device='cuda:0') reward: tensor(1.3233, device='cuda:0') factor: tensor(0.1006, device='cuda:0')


Epoch 1, Step 696: Loss = -0.0034, Actor Loss = -0.0641, Critic Loss = 0.1215, Entropy = 0.0306, Returns = 1.3609, Value = 1.2316, mask_loss = 0.1012std_loss = 0.2822:  13%|█▎        | 696/5475 [3:07:22<23:54:42, 18.01s/it]

ratio tensor(4.2204, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4992, device='cuda:0') reward: tensor(1.3869, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.5166, device='cuda:0') reward: tensor(1.3989, device='cuda:0') factor: tensor(0.1177, device='cuda:0')
reward_raw: tensor(0.5023, device='cuda:0') reward: tensor(1.3894, device='cuda:0') factor: tensor(0.1129, device='cuda:0')
reward_raw: tensor(0.5131, device='cuda:0') reward: tensor(1.3972, device='cuda:0') factor: tensor(0.1159, device='cuda:0')
reward_raw: tensor(0.5299, device='cuda:0') reward: tensor(1.4127, device='cuda:0') factor: tensor(0.1172, device='cuda:0')


Epoch 1, Step 697: Loss = -0.0885, Actor Loss = -0.1309, Critic Loss = 0.0849, Entropy = 0.0280, Returns = 1.2439, Value = 1.3014, mask_loss = 0.1182std_loss = 0.2979:  13%|█▎        | 697/5475 [3:07:40<23:57:44, 18.05s/it]

ratio tensor(1.3111, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4841, device='cuda:0') reward: tensor(1.3464, device='cuda:0') factor: tensor(0.1377, device='cuda:0')
reward_raw: tensor(0.4695, device='cuda:0') reward: tensor(1.3352, device='cuda:0') factor: tensor(0.1343, device='cuda:0')
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.3715, device='cuda:0') factor: tensor(0.1326, device='cuda:0')
reward_raw: tensor(0.5023, device='cuda:0') reward: tensor(1.3656, device='cuda:0') factor: tensor(0.1367, device='cuda:0')
reward_raw: tensor(0.4700, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.1346, device='cuda:0')


Epoch 1, Step 698: Loss = 0.0884, Actor Loss = 0.0387, Critic Loss = 0.0996, Entropy = 0.0455, Returns = 1.2739, Value = 1.3678, mask_loss = 0.1504std_loss = 0.3132:  13%|█▎        | 698/5475 [3:07:55<22:33:01, 16.99s/it]  

ratio tensor(0.7913, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4013, device='cuda:0') reward: tensor(1.2801, device='cuda:0') factor: tensor(0.1212, device='cuda:0')
reward_raw: tensor(0.3938, device='cuda:0') reward: tensor(1.2742, device='cuda:0') factor: tensor(0.1196, device='cuda:0')
reward_raw: tensor(0.3936, device='cuda:0') reward: tensor(1.2800, device='cuda:0') factor: tensor(0.1136, device='cuda:0')
reward_raw: tensor(0.4003, device='cuda:0') reward: tensor(1.2802, device='cuda:0') factor: tensor(0.1202, device='cuda:0')
reward_raw: tensor(0.3992, device='cuda:0') reward: tensor(1.2824, device='cuda:0') factor: tensor(0.1168, device='cuda:0')


Epoch 1, Step 699: Loss = 0.0000, Actor Loss = -0.0459, Critic Loss = 0.0919, Entropy = 0.0286, Returns = 1.3548, Value = 1.2150, mask_loss = 0.1056std_loss = 0.2874:  13%|█▎        | 699/5475 [3:08:13<23:03:47, 17.38s/it]

ratio tensor(0.8898, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4695, device='cuda:0') reward: tensor(1.3584, device='cuda:0') factor: tensor(0.1112, device='cuda:0')
reward_raw: tensor(0.4752, device='cuda:0') reward: tensor(1.3671, device='cuda:0') factor: tensor(0.1081, device='cuda:0')
reward_raw: tensor(0.4632, device='cuda:0') reward: tensor(1.3510, device='cuda:0') factor: tensor(0.1122, device='cuda:0')
reward_raw: tensor(0.4518, device='cuda:0') reward: tensor(1.3405, device='cuda:0') factor: tensor(0.1113, device='cuda:0')
reward_raw: tensor(0.4580, device='cuda:0') reward: tensor(1.3472, device='cuda:0') factor: tensor(0.1108, device='cuda:0')


Epoch 1, Step 700: Loss = 0.0232, Actor Loss = -0.0183, Critic Loss = 0.0830, Entropy = 0.0324, Returns = 1.2856, Value = 1.3415, mask_loss = 0.1071std_loss = 0.2921:  13%|█▎        | 700/5475 [3:08:30<22:41:19, 17.11s/it]

ratio tensor(1.0604, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4748, device='cuda:0') reward: tensor(1.3301, device='cuda:0') factor: tensor(0.1447, device='cuda:0')
reward_raw: tensor(0.4819, device='cuda:0') reward: tensor(1.3379, device='cuda:0') factor: tensor(0.1440, device='cuda:0')
reward_raw: tensor(0.4837, device='cuda:0') reward: tensor(1.3429, device='cuda:0') factor: tensor(0.1408, device='cuda:0')
reward_raw: tensor(0.4745, device='cuda:0') reward: tensor(1.3361, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.4846, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.1423, device='cuda:0')


Epoch 1, Step 701: Loss = 0.1660, Actor Loss = 0.1251, Critic Loss = 0.0819, Entropy = 0.0547, Returns = 1.3368, Value = 1.3817, mask_loss = 0.1502std_loss = 0.3270:  13%|█▎        | 700/5475 [3:08:41<22:41:19, 17.11s/it] 

ratio tensor(0.5727, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 701: Loss = 0.1660, Actor Loss = 0.1251, Critic Loss = 0.0819, Entropy = 0.0547, Returns = 1.3368, Value = 1.3817, mask_loss = 0.1502std_loss = 0.3270:  13%|█▎        | 701/5475 [3:08:41<20:23:02, 15.37s/it]

reward_raw: tensor(0.4594, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.1136, device='cuda:0')
reward_raw: tensor(0.4709, device='cuda:0') reward: tensor(1.3577, device='cuda:0') factor: tensor(0.1132, device='cuda:0')
reward_raw: tensor(0.4572, device='cuda:0') reward: tensor(1.3454, device='cuda:0') factor: tensor(0.1118, device='cuda:0')
reward_raw: tensor(0.4629, device='cuda:0') reward: tensor(1.3500, device='cuda:0') factor: tensor(0.1129, device='cuda:0')
reward_raw: tensor(0.4732, device='cuda:0') reward: tensor(1.3583, device='cuda:0') factor: tensor(0.1150, device='cuda:0')


Epoch 1, Step 702: Loss = -0.0103, Actor Loss = -0.0421, Critic Loss = 0.0637, Entropy = 0.0378, Returns = 1.3140, Value = 1.2459, mask_loss = 0.1033std_loss = 0.2837:  13%|█▎        | 702/5475 [3:08:55<19:41:55, 14.86s/it]

ratio tensor(1.3212, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5047, device='cuda:0') reward: tensor(1.3677, device='cuda:0') factor: tensor(0.1369, device='cuda:0')
reward_raw: tensor(0.5138, device='cuda:0') reward: tensor(1.3783, device='cuda:0') factor: tensor(0.1355, device='cuda:0')
reward_raw: tensor(0.5152, device='cuda:0') reward: tensor(1.3803, device='cuda:0') factor: tensor(0.1348, device='cuda:0')
reward_raw: tensor(0.5063, device='cuda:0') reward: tensor(1.3697, device='cuda:0') factor: tensor(0.1366, device='cuda:0')
reward_raw: tensor(0.5134, device='cuda:0') reward: tensor(1.3755, device='cuda:0') factor: tensor(0.1378, device='cuda:0')


Epoch 1, Step 703: Loss = 0.0714, Actor Loss = 0.0076, Critic Loss = 0.1277, Entropy = 0.0499, Returns = 1.3166, Value = 1.5773, mask_loss = 0.1619std_loss = 0.3410:  13%|█▎        | 703/5475 [3:09:12<20:50:48, 15.73s/it]  

ratio tensor(0.6814, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4916, device='cuda:0') reward: tensor(1.3812, device='cuda:0') factor: tensor(0.1104, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.1107, device='cuda:0')
reward_raw: tensor(0.4895, device='cuda:0') reward: tensor(1.3804, device='cuda:0') factor: tensor(0.1091, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3837, device='cuda:0') factor: tensor(0.1106, device='cuda:0')
reward_raw: tensor(0.4822, device='cuda:0') reward: tensor(1.3721, device='cuda:0') factor: tensor(0.1102, device='cuda:0')


Epoch 1, Step 704: Loss = 0.0703, Actor Loss = 0.0327, Critic Loss = 0.0754, Entropy = 0.0407, Returns = 1.4228, Value = 1.3588, mask_loss = 0.1298std_loss = 0.3079:  13%|█▎        | 704/5475 [3:09:32<22:11:16, 16.74s/it]

ratio tensor(0.8717, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4266, device='cuda:0') reward: tensor(1.3112, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.4391, device='cuda:0') reward: tensor(1.3224, device='cuda:0') factor: tensor(0.1167, device='cuda:0')
reward_raw: tensor(0.4400, device='cuda:0') reward: tensor(1.3206, device='cuda:0') factor: tensor(0.1194, device='cuda:0')
reward_raw: tensor(0.4378, device='cuda:0') reward: tensor(1.3177, device='cuda:0') factor: tensor(0.1201, device='cuda:0')
reward_raw: tensor(0.4332, device='cuda:0') reward: tensor(1.3185, device='cuda:0') factor: tensor(0.1147, device='cuda:0')


Epoch 1, Step 705: Loss = 0.0704, Actor Loss = 0.0098, Critic Loss = 0.1212, Entropy = 0.0383, Returns = 1.2904, Value = 1.2396, mask_loss = 0.1229std_loss = 0.3063:  13%|█▎        | 705/5475 [3:09:43<20:06:02, 15.17s/it]

ratio tensor(0.5251, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4758, device='cuda:0') reward: tensor(1.3426, device='cuda:0') factor: tensor(0.1333, device='cuda:0')
reward_raw: tensor(0.4634, device='cuda:0') reward: tensor(1.3337, device='cuda:0') factor: tensor(0.1297, device='cuda:0')
reward_raw: tensor(0.4624, device='cuda:0') reward: tensor(1.3313, device='cuda:0') factor: tensor(0.1311, device='cuda:0')
reward_raw: tensor(0.4659, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.1303, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3465, device='cuda:0') factor: tensor(0.1314, device='cuda:0')


Epoch 1, Step 706: Loss = -0.0674, Actor Loss = -0.0994, Critic Loss = 0.0642, Entropy = 0.0345, Returns = 1.4366, Value = 1.3345, mask_loss = 0.1249std_loss = 0.3091:  13%|█▎        | 706/5475 [3:10:05<22:35:39, 17.06s/it]

ratio tensor(1.3567, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.3203, device='cuda:0') factor: tensor(0.1546, device='cuda:0')
reward_raw: tensor(0.4667, device='cuda:0') reward: tensor(1.3150, device='cuda:0') factor: tensor(0.1517, device='cuda:0')
reward_raw: tensor(0.4522, device='cuda:0') reward: tensor(1.3018, device='cuda:0') factor: tensor(0.1504, device='cuda:0')
reward_raw: tensor(0.4632, device='cuda:0') reward: tensor(1.3161, device='cuda:0') factor: tensor(0.1472, device='cuda:0')
reward_raw: tensor(0.4643, device='cuda:0') reward: tensor(1.3121, device='cuda:0') factor: tensor(0.1522, device='cuda:0')


Epoch 1, Step 707: Loss = 0.0044, Actor Loss = -0.0397, Critic Loss = 0.0884, Entropy = 0.0443, Returns = 1.2954, Value = 1.4034, mask_loss = 0.1520std_loss = 0.3321:  13%|█▎        | 707/5475 [3:10:20<22:03:07, 16.65s/it] 

ratio tensor(1.2307, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5488, device='cuda:0') reward: tensor(1.4095, device='cuda:0') factor: tensor(0.1394, device='cuda:0')
reward_raw: tensor(0.5597, device='cuda:0') reward: tensor(1.4294, device='cuda:0') factor: tensor(0.1304, device='cuda:0')
reward_raw: tensor(0.5532, device='cuda:0') reward: tensor(1.4192, device='cuda:0') factor: tensor(0.1340, device='cuda:0')
reward_raw: tensor(0.5572, device='cuda:0') reward: tensor(1.4213, device='cuda:0') factor: tensor(0.1359, device='cuda:0')
reward_raw: tensor(0.5471, device='cuda:0') reward: tensor(1.4102, device='cuda:0') factor: tensor(0.1369, device='cuda:0')


Epoch 1, Step 708: Loss = 0.0731, Actor Loss = 0.0379, Critic Loss = 0.0704, Entropy = 0.0454, Returns = 1.4502, Value = 1.3600, mask_loss = 0.1314std_loss = 0.3039:  13%|█▎        | 708/5475 [3:10:35<21:25:58, 16.19s/it] 

ratio tensor(0.7951, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6059, device='cuda:0') reward: tensor(1.4399, device='cuda:0') factor: tensor(0.1660, device='cuda:0')
reward_raw: tensor(0.6052, device='cuda:0') reward: tensor(1.4346, device='cuda:0') factor: tensor(0.1706, device='cuda:0')
reward_raw: tensor(0.5885, device='cuda:0') reward: tensor(1.4201, device='cuda:0') factor: tensor(0.1684, device='cuda:0')
reward_raw: tensor(0.5940, device='cuda:0') reward: tensor(1.4250, device='cuda:0') factor: tensor(0.1690, device='cuda:0')
reward_raw: tensor(0.5714, device='cuda:0') reward: tensor(1.4039, device='cuda:0') factor: tensor(0.1675, device='cuda:0')


Epoch 1, Step 709: Loss = -0.0297, Actor Loss = -0.0652, Critic Loss = 0.0712, Entropy = 0.0457, Returns = 1.4157, Value = 1.4835, mask_loss = 0.1759std_loss = 0.3546:  13%|█▎        | 709/5475 [3:10:49<20:27:53, 15.46s/it]

ratio tensor(1.1994, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5009, device='cuda:0') reward: tensor(1.3453, device='cuda:0') factor: tensor(0.1555, device='cuda:0')
reward_raw: tensor(0.4945, device='cuda:0') reward: tensor(1.3380, device='cuda:0') factor: tensor(0.1565, device='cuda:0')
reward_raw: tensor(0.4776, device='cuda:0') reward: tensor(1.3216, device='cuda:0') factor: tensor(0.1560, device='cuda:0')
reward_raw: tensor(0.4917, device='cuda:0') reward: tensor(1.3397, device='cuda:0') factor: tensor(0.1521, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.3351, device='cuda:0') factor: tensor(0.1561, device='cuda:0')


Epoch 1, Step 710: Loss = 0.1669, Actor Loss = 0.1281, Critic Loss = 0.0776, Entropy = 0.0497, Returns = 1.3236, Value = 1.2949, mask_loss = 0.1615std_loss = 0.3394:  13%|█▎        | 710/5475 [3:11:10<22:46:35, 17.21s/it]  

ratio tensor(0.9690, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5430, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.1656, device='cuda:0')
reward_raw: tensor(0.5524, device='cuda:0') reward: tensor(1.3797, device='cuda:0') factor: tensor(0.1727, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.3923, device='cuda:0') factor: tensor(0.1681, device='cuda:0')
reward_raw: tensor(0.5769, device='cuda:0') reward: tensor(1.4077, device='cuda:0') factor: tensor(0.1692, device='cuda:0')
reward_raw: tensor(0.5777, device='cuda:0') reward: tensor(1.4048, device='cuda:0') factor: tensor(0.1729, device='cuda:0')


Epoch 1, Step 711: Loss = -0.0837, Actor Loss = -0.1229, Critic Loss = 0.0785, Entropy = 0.0627, Returns = 1.3243, Value = 1.2752, mask_loss = 0.2074std_loss = 0.3570:  13%|█▎        | 711/5475 [3:11:24<21:20:46, 16.13s/it]

ratio tensor(1.4734, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5431, device='cuda:0') reward: tensor(1.4189, device='cuda:0') factor: tensor(0.1242, device='cuda:0')
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.4337, device='cuda:0') factor: tensor(0.1239, device='cuda:0')
reward_raw: tensor(0.5590, device='cuda:0') reward: tensor(1.4359, device='cuda:0') factor: tensor(0.1232, device='cuda:0')
reward_raw: tensor(0.5608, device='cuda:0') reward: tensor(1.4385, device='cuda:0') factor: tensor(0.1223, device='cuda:0')
reward_raw: tensor(0.5562, device='cuda:0') reward: tensor(1.4346, device='cuda:0') factor: tensor(0.1216, device='cuda:0')


Epoch 1, Step 712: Loss = -0.0863, Actor Loss = -0.1071, Critic Loss = 0.0415, Entropy = 0.0210, Returns = 1.5013, Value = 1.5342, mask_loss = 0.1037std_loss = 0.2910:  13%|█▎        | 712/5475 [3:11:37<20:04:42, 15.18s/it]

ratio tensor(1.0842, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4406, device='cuda:0') reward: tensor(1.3175, device='cuda:0') factor: tensor(0.1231, device='cuda:0')
reward_raw: tensor(0.4234, device='cuda:0') reward: tensor(1.2959, device='cuda:0') factor: tensor(0.1275, device='cuda:0')
reward_raw: tensor(0.4470, device='cuda:0') reward: tensor(1.3230, device='cuda:0') factor: tensor(0.1240, device='cuda:0')
reward_raw: tensor(0.4251, device='cuda:0') reward: tensor(1.3020, device='cuda:0') factor: tensor(0.1230, device='cuda:0')
reward_raw: tensor(0.4407, device='cuda:0') reward: tensor(1.3173, device='cuda:0') factor: tensor(0.1234, device='cuda:0')


Epoch 1, Step 713: Loss = 0.2050, Actor Loss = 0.1483, Critic Loss = 0.1135, Entropy = 0.0315, Returns = 1.2848, Value = 1.1973, mask_loss = 0.1333std_loss = 0.3209:  13%|█▎        | 713/5475 [3:12:01<23:24:20, 17.69s/it]  

ratio tensor(1.7320, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4504, device='cuda:0') reward: tensor(1.2937, device='cuda:0') factor: tensor(0.1568, device='cuda:0')
reward_raw: tensor(0.4387, device='cuda:0') reward: tensor(1.2831, device='cuda:0') factor: tensor(0.1556, device='cuda:0')
reward_raw: tensor(0.4218, device='cuda:0') reward: tensor(1.2665, device='cuda:0') factor: tensor(0.1552, device='cuda:0')
reward_raw: tensor(0.4276, device='cuda:0') reward: tensor(1.2712, device='cuda:0') factor: tensor(0.1564, device='cuda:0')
reward_raw: tensor(0.4304, device='cuda:0') reward: tensor(1.2727, device='cuda:0') factor: tensor(0.1577, device='cuda:0')


Epoch 1, Step 714: Loss = 0.0062, Actor Loss = -0.0340, Critic Loss = 0.0805, Entropy = 0.0454, Returns = 1.2690, Value = 1.3321, mask_loss = 0.1789std_loss = 0.3382:  13%|█▎        | 714/5475 [3:12:17<22:43:22, 17.18s/it]

ratio tensor(1.0443, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4835, device='cuda:0') reward: tensor(1.3414, device='cuda:0') factor: tensor(0.1421, device='cuda:0')
reward_raw: tensor(0.4865, device='cuda:0') reward: tensor(1.3457, device='cuda:0') factor: tensor(0.1408, device='cuda:0')
reward_raw: tensor(0.5211, device='cuda:0') reward: tensor(1.3811, device='cuda:0') factor: tensor(0.1400, device='cuda:0')
reward_raw: tensor(0.4824, device='cuda:0') reward: tensor(1.3449, device='cuda:0') factor: tensor(0.1375, device='cuda:0')
reward_raw: tensor(0.5130, device='cuda:0') reward: tensor(1.3699, device='cuda:0') factor: tensor(0.1432, device='cuda:0')


Epoch 1, Step 715: Loss = -0.0556, Actor Loss = -0.1072, Critic Loss = 0.1032, Entropy = 0.0319, Returns = 1.3273, Value = 1.3906, mask_loss = 0.1411std_loss = 0.3209:  13%|█▎        | 715/5475 [3:12:36<23:33:25, 17.82s/it]

ratio tensor(1.3302, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.3187, device='cuda:0') factor: tensor(0.1562, device='cuda:0')
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.3280, device='cuda:0') factor: tensor(0.1594, device='cuda:0')
reward_raw: tensor(0.4768, device='cuda:0') reward: tensor(1.3164, device='cuda:0') factor: tensor(0.1604, device='cuda:0')
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.3061, device='cuda:0') factor: tensor(0.1594, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.3300, device='cuda:0') factor: tensor(0.1543, device='cuda:0')


Epoch 1, Step 716: Loss = 0.2190, Actor Loss = 0.1800, Critic Loss = 0.0782, Entropy = 0.0658, Returns = 1.3153, Value = 1.3891, mask_loss = 0.1959std_loss = 0.3560:  13%|█▎        | 716/5475 [3:12:53<23:20:44, 17.66s/it]  

ratio tensor(0.2360, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5741, device='cuda:0') reward: tensor(1.4189, device='cuda:0') factor: tensor(0.1552, device='cuda:0')
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.4066, device='cuda:0') factor: tensor(0.1527, device='cuda:0')
reward_raw: tensor(0.5536, device='cuda:0') reward: tensor(1.4014, device='cuda:0') factor: tensor(0.1522, device='cuda:0')
reward_raw: tensor(0.5701, device='cuda:0') reward: tensor(1.4161, device='cuda:0') factor: tensor(0.1541, device='cuda:0')
reward_raw: tensor(0.5793, device='cuda:0') reward: tensor(1.4264, device='cuda:0') factor: tensor(0.1528, device='cuda:0')


Epoch 1, Step 717: Loss = -0.1054, Actor Loss = -0.1701, Critic Loss = 0.1294, Entropy = 0.0298, Returns = 1.4699, Value = 1.3649, mask_loss = 0.1469std_loss = 0.3339:  13%|█▎        | 717/5475 [3:13:05<20:51:45, 15.79s/it]

ratio tensor(1.1618, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5169, device='cuda:0') reward: tensor(1.3449, device='cuda:0') factor: tensor(0.1721, device='cuda:0')
reward_raw: tensor(0.4775, device='cuda:0') reward: tensor(1.3095, device='cuda:0') factor: tensor(0.1681, device='cuda:0')
reward_raw: tensor(0.5129, device='cuda:0') reward: tensor(1.3369, device='cuda:0') factor: tensor(0.1760, device='cuda:0')
reward_raw: tensor(0.5178, device='cuda:0') reward: tensor(1.3485, device='cuda:0') factor: tensor(0.1694, device='cuda:0')
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.3419, device='cuda:0') factor: tensor(0.1696, device='cuda:0')


Epoch 1, Step 718: Loss = 0.2023, Actor Loss = 0.1562, Critic Loss = 0.0922, Entropy = 0.0606, Returns = 1.3558, Value = 1.4490, mask_loss = 0.1974std_loss = 0.3590:  13%|█▎        | 718/5475 [3:13:20<20:37:11, 15.60s/it]  

ratio tensor(0.8259, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5166, device='cuda:0') reward: tensor(1.3798, device='cuda:0') factor: tensor(0.1368, device='cuda:0')
reward_raw: tensor(0.5137, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.1363, device='cuda:0')
reward_raw: tensor(0.5323, device='cuda:0') reward: tensor(1.3969, device='cuda:0') factor: tensor(0.1354, device='cuda:0')
reward_raw: tensor(0.5229, device='cuda:0') reward: tensor(1.3863, device='cuda:0') factor: tensor(0.1366, device='cuda:0')
reward_raw: tensor(0.5152, device='cuda:0') reward: tensor(1.3787, device='cuda:0') factor: tensor(0.1365, device='cuda:0')


Epoch 1, Step 719: Loss = 0.1287, Actor Loss = 0.0999, Critic Loss = 0.0578, Entropy = 0.0382, Returns = 1.3637, Value = 1.3819, mask_loss = 0.1141std_loss = 0.2930:  13%|█▎        | 719/5475 [3:13:36<20:51:47, 15.79s/it]

ratio tensor(0.6582, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4743, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.1389, device='cuda:0')
reward_raw: tensor(0.4922, device='cuda:0') reward: tensor(1.3553, device='cuda:0') factor: tensor(0.1369, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.3526, device='cuda:0') factor: tensor(0.1385, device='cuda:0')
reward_raw: tensor(0.4817, device='cuda:0') reward: tensor(1.3418, device='cuda:0') factor: tensor(0.1399, device='cuda:0')
reward_raw: tensor(0.4838, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.1380, device='cuda:0')


Epoch 1, Step 720: Loss = -0.0285, Actor Loss = -0.0613, Critic Loss = 0.0656, Entropy = 0.0289, Returns = 1.3302, Value = 1.2879, mask_loss = 0.1426std_loss = 0.3335:  13%|█▎        | 720/5475 [3:13:48<19:22:16, 14.67s/it]

ratio tensor(1.1249, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5249, device='cuda:0') reward: tensor(1.3861, device='cuda:0') factor: tensor(0.1388, device='cuda:0')
reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.3917, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.5128, device='cuda:0') reward: tensor(1.3741, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.3811, device='cuda:0') factor: tensor(0.1325, device='cuda:0')
reward_raw: tensor(0.5126, device='cuda:0') reward: tensor(1.3768, device='cuda:0') factor: tensor(0.1357, device='cuda:0')


Epoch 1, Step 721: Loss = -0.0212, Actor Loss = -0.0645, Critic Loss = 0.0867, Entropy = 0.0385, Returns = 1.3933, Value = 1.5061, mask_loss = 0.1259std_loss = 0.3017:  13%|█▎        | 720/5475 [3:14:05<19:22:16, 14.67s/it]

ratio tensor(0.9481, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 721: Loss = -0.0212, Actor Loss = -0.0645, Critic Loss = 0.0867, Entropy = 0.0385, Returns = 1.3933, Value = 1.5061, mask_loss = 0.1259std_loss = 0.3017:  13%|█▎        | 721/5475 [3:14:06<20:36:23, 15.60s/it]

reward_raw: tensor(0.5497, device='cuda:0') reward: tensor(1.4093, device='cuda:0') factor: tensor(0.1405, device='cuda:0')
reward_raw: tensor(0.5440, device='cuda:0') reward: tensor(1.4069, device='cuda:0') factor: tensor(0.1372, device='cuda:0')
reward_raw: tensor(0.5391, device='cuda:0') reward: tensor(1.4016, device='cuda:0') factor: tensor(0.1375, device='cuda:0')
reward_raw: tensor(0.5363, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.1383, device='cuda:0')
reward_raw: tensor(0.5341, device='cuda:0') reward: tensor(1.3947, device='cuda:0') factor: tensor(0.1394, device='cuda:0')


Epoch 1, Step 722: Loss = 0.3001, Actor Loss = 0.2384, Critic Loss = 0.1233, Entropy = 0.0405, Returns = 1.3823, Value = 1.4689, mask_loss = 0.1466std_loss = 0.3242:  13%|█▎        | 722/5475 [3:14:20<20:00:30, 15.15s/it]  

ratio tensor(0.7982, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4859, device='cuda:0') reward: tensor(1.3655, device='cuda:0') factor: tensor(0.1204, device='cuda:0')
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.1237, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.3789, device='cuda:0') factor: tensor(0.1232, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.3798, device='cuda:0') factor: tensor(0.1223, device='cuda:0')
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.3736, device='cuda:0') factor: tensor(0.1266, device='cuda:0')


Epoch 1, Step 723: Loss = 0.0093, Actor Loss = -0.0258, Critic Loss = 0.0703, Entropy = 0.0348, Returns = 1.3948, Value = 1.3412, mask_loss = 0.1250std_loss = 0.3094:  13%|█▎        | 723/5475 [3:14:43<23:12:44, 17.59s/it]

ratio tensor(0.8868, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5328, device='cuda:0') reward: tensor(1.3599, device='cuda:0') factor: tensor(0.1729, device='cuda:0')
reward_raw: tensor(0.5303, device='cuda:0') reward: tensor(1.3565, device='cuda:0') factor: tensor(0.1739, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.1781, device='cuda:0')
reward_raw: tensor(0.5259, device='cuda:0') reward: tensor(1.3541, device='cuda:0') factor: tensor(0.1717, device='cuda:0')
reward_raw: tensor(0.5239, device='cuda:0') reward: tensor(1.3511, device='cuda:0') factor: tensor(0.1728, device='cuda:0')


Epoch 1, Step 724: Loss = 0.0083, Actor Loss = -0.0211, Critic Loss = 0.0588, Entropy = 0.0561, Returns = 1.3639, Value = 1.3592, mask_loss = 0.2170std_loss = 0.3854:  13%|█▎        | 724/5475 [3:14:58<22:14:42, 16.86s/it]

ratio tensor(0.8908, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4505, device='cuda:0') reward: tensor(1.3255, device='cuda:0') factor: tensor(0.1250, device='cuda:0')
reward_raw: tensor(0.4246, device='cuda:0') reward: tensor(1.3008, device='cuda:0') factor: tensor(0.1239, device='cuda:0')
reward_raw: tensor(0.4606, device='cuda:0') reward: tensor(1.3386, device='cuda:0') factor: tensor(0.1220, device='cuda:0')
reward_raw: tensor(0.4432, device='cuda:0') reward: tensor(1.3173, device='cuda:0') factor: tensor(0.1259, device='cuda:0')
reward_raw: tensor(0.4482, device='cuda:0') reward: tensor(1.3213, device='cuda:0') factor: tensor(0.1269, device='cuda:0')


Epoch 1, Step 725: Loss = 0.0632, Actor Loss = 0.0126, Critic Loss = 0.1014, Entropy = 0.0369, Returns = 1.3581, Value = 1.3133, mask_loss = 0.1209std_loss = 0.3022:  13%|█▎        | 725/5475 [3:15:17<23:04:48, 17.49s/it] 

ratio tensor(1.2530, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4891, device='cuda:0') reward: tensor(1.3275, device='cuda:0') factor: tensor(0.1616, device='cuda:0')
reward_raw: tensor(0.5090, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.1599, device='cuda:0')
reward_raw: tensor(0.5027, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.1598, device='cuda:0')
reward_raw: tensor(0.5003, device='cuda:0') reward: tensor(1.3420, device='cuda:0') factor: tensor(0.1582, device='cuda:0')
reward_raw: tensor(0.4891, device='cuda:0') reward: tensor(1.3289, device='cuda:0') factor: tensor(0.1602, device='cuda:0')


Epoch 1, Step 726: Loss = 0.0937, Actor Loss = 0.0172, Critic Loss = 0.1532, Entropy = 0.0401, Returns = 1.3142, Value = 1.3867, mask_loss = 0.1492std_loss = 0.3291:  13%|█▎        | 726/5475 [3:15:33<22:17:53, 16.90s/it]

ratio tensor(0.9039, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4912, device='cuda:0') reward: tensor(1.3696, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.4865, device='cuda:0') reward: tensor(1.3662, device='cuda:0') factor: tensor(0.1203, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3764, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.4889, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.1196, device='cuda:0')
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.3813, device='cuda:0') factor: tensor(0.1220, device='cuda:0')


Epoch 1, Step 727: Loss = 0.2333, Actor Loss = 0.1743, Critic Loss = 0.1180, Entropy = 0.0303, Returns = 1.4418, Value = 1.4656, mask_loss = 0.1085std_loss = 0.2926:  13%|█▎        | 727/5475 [3:15:51<22:57:21, 17.41s/it]

ratio tensor(0.7043, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4885, device='cuda:0') reward: tensor(1.3641, device='cuda:0') factor: tensor(0.1244, device='cuda:0')
reward_raw: tensor(0.5015, device='cuda:0') reward: tensor(1.3709, device='cuda:0') factor: tensor(0.1305, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.3890, device='cuda:0') factor: tensor(0.1237, device='cuda:0')
reward_raw: tensor(0.4913, device='cuda:0') reward: tensor(1.3689, device='cuda:0') factor: tensor(0.1224, device='cuda:0')
reward_raw: tensor(0.5031, device='cuda:0') reward: tensor(1.3784, device='cuda:0') factor: tensor(0.1247, device='cuda:0')


Epoch 1, Step 728: Loss = -0.1226, Actor Loss = -0.1702, Critic Loss = 0.0951, Entropy = 0.0305, Returns = 1.3524, Value = 1.3886, mask_loss = 0.1189std_loss = 0.3063:  13%|█▎        | 728/5475 [3:16:08<22:30:57, 17.08s/it]

ratio tensor(0.9956, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4914, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.1210, device='cuda:0')
reward_raw: tensor(0.4789, device='cuda:0') reward: tensor(1.3576, device='cuda:0') factor: tensor(0.1212, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.3644, device='cuda:0') factor: tensor(0.1197, device='cuda:0')
reward_raw: tensor(0.4688, device='cuda:0') reward: tensor(1.3463, device='cuda:0') factor: tensor(0.1225, device='cuda:0')
reward_raw: tensor(0.4973, device='cuda:0') reward: tensor(1.3753, device='cuda:0') factor: tensor(0.1220, device='cuda:0')


Epoch 1, Step 729: Loss = 0.0513, Actor Loss = 0.0000, Critic Loss = 0.1027, Entropy = 0.0419, Returns = 1.3422, Value = 1.5044, mask_loss = 0.1527std_loss = 0.3342:  13%|█▎        | 729/5475 [3:16:27<23:20:38, 17.71s/it]  

ratio tensor(0.8907, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5394, device='cuda:0') reward: tensor(1.4259, device='cuda:0') factor: tensor(0.1135, device='cuda:0')
reward_raw: tensor(0.5313, device='cuda:0') reward: tensor(1.4169, device='cuda:0') factor: tensor(0.1144, device='cuda:0')
reward_raw: tensor(0.5475, device='cuda:0') reward: tensor(1.4300, device='cuda:0') factor: tensor(0.1175, device='cuda:0')
reward_raw: tensor(0.5401, device='cuda:0') reward: tensor(1.4280, device='cuda:0') factor: tensor(0.1121, device='cuda:0')
reward_raw: tensor(0.5400, device='cuda:0') reward: tensor(1.4259, device='cuda:0') factor: tensor(0.1141, device='cuda:0')


Epoch 1, Step 730: Loss = 0.1659, Actor Loss = 0.1362, Critic Loss = 0.0593, Entropy = 0.0244, Returns = 1.3877, Value = 1.3601, mask_loss = 0.1216std_loss = 0.3124:  13%|█▎        | 730/5475 [3:16:45<23:19:45, 17.70s/it]

ratio tensor(0.7214, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5559, device='cuda:0') reward: tensor(1.4358, device='cuda:0') factor: tensor(0.1201, device='cuda:0')
reward_raw: tensor(0.5594, device='cuda:0') reward: tensor(1.4365, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.5289, device='cuda:0') reward: tensor(1.4080, device='cuda:0') factor: tensor(0.1209, device='cuda:0')
reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.4316, device='cuda:0') factor: tensor(0.1181, device='cuda:0')
reward_raw: tensor(0.5485, device='cuda:0') reward: tensor(1.4301, device='cuda:0') factor: tensor(0.1184, device='cuda:0')


Epoch 1, Step 731: Loss = -0.0902, Actor Loss = -0.1428, Critic Loss = 0.1054, Entropy = 0.0366, Returns = 1.4077, Value = 1.4613, mask_loss = 0.1279std_loss = 0.3048:  13%|█▎        | 731/5475 [3:17:01<22:58:52, 17.44s/it]

ratio tensor(1.9722, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5103, device='cuda:0') reward: tensor(1.3895, device='cuda:0') factor: tensor(0.1208, device='cuda:0')
reward_raw: tensor(0.4929, device='cuda:0') reward: tensor(1.3745, device='cuda:0') factor: tensor(0.1185, device='cuda:0')
reward_raw: tensor(0.5043, device='cuda:0') reward: tensor(1.3866, device='cuda:0') factor: tensor(0.1177, device='cuda:0')
reward_raw: tensor(0.4931, device='cuda:0') reward: tensor(1.3749, device='cuda:0') factor: tensor(0.1182, device='cuda:0')
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.1185, device='cuda:0')


Epoch 1, Step 732: Loss = 0.1939, Actor Loss = 0.1600, Critic Loss = 0.0677, Entropy = 0.0361, Returns = 1.3383, Value = 1.4432, mask_loss = 0.1144std_loss = 0.2985:  13%|█▎        | 732/5475 [3:17:17<22:09:02, 16.81s/it]  

ratio tensor(0.6881, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5492, device='cuda:0') reward: tensor(1.4335, device='cuda:0') factor: tensor(0.1157, device='cuda:0')
reward_raw: tensor(0.5327, device='cuda:0') reward: tensor(1.4168, device='cuda:0') factor: tensor(0.1159, device='cuda:0')
reward_raw: tensor(0.5289, device='cuda:0') reward: tensor(1.4147, device='cuda:0') factor: tensor(0.1142, device='cuda:0')
reward_raw: tensor(0.5459, device='cuda:0') reward: tensor(1.4293, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.4398, device='cuda:0') factor: tensor(0.1165, device='cuda:0')


Epoch 1, Step 733: Loss = -0.0371, Actor Loss = -0.0769, Critic Loss = 0.0799, Entropy = 0.0315, Returns = 1.3558, Value = 1.4219, mask_loss = 0.1294std_loss = 0.3181:  13%|█▎        | 733/5475 [3:17:30<20:35:10, 15.63s/it]

ratio tensor(1.8393, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3937, device='cuda:0') reward: tensor(1.2858, device='cuda:0') factor: tensor(0.1079, device='cuda:0')
reward_raw: tensor(0.3936, device='cuda:0') reward: tensor(1.2826, device='cuda:0') factor: tensor(0.1110, device='cuda:0')
reward_raw: tensor(0.3730, device='cuda:0') reward: tensor(1.2683, device='cuda:0') factor: tensor(0.1047, device='cuda:0')
reward_raw: tensor(0.3898, device='cuda:0') reward: tensor(1.2785, device='cuda:0') factor: tensor(0.1113, device='cuda:0')
reward_raw: tensor(0.3862, device='cuda:0') reward: tensor(1.2763, device='cuda:0') factor: tensor(0.1098, device='cuda:0')


Epoch 1, Step 734: Loss = 0.2212, Actor Loss = 0.1729, Critic Loss = 0.0967, Entropy = 0.0315, Returns = 1.2850, Value = 1.3235, mask_loss = 0.1149std_loss = 0.3000:  13%|█▎        | 734/5475 [3:17:48<21:36:23, 16.41s/it]  

ratio tensor(0.7187, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5823, device='cuda:0') reward: tensor(1.4500, device='cuda:0') factor: tensor(0.1323, device='cuda:0')
reward_raw: tensor(0.5838, device='cuda:0') reward: tensor(1.4547, device='cuda:0') factor: tensor(0.1291, device='cuda:0')
reward_raw: tensor(0.5838, device='cuda:0') reward: tensor(1.4530, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.5908, device='cuda:0') reward: tensor(1.4575, device='cuda:0') factor: tensor(0.1332, device='cuda:0')
reward_raw: tensor(0.5882, device='cuda:0') reward: tensor(1.4556, device='cuda:0') factor: tensor(0.1326, device='cuda:0')


Epoch 1, Step 735: Loss = 0.0048, Actor Loss = -0.0273, Critic Loss = 0.0642, Entropy = 0.0441, Returns = 1.4200, Value = 1.3435, mask_loss = 0.1401std_loss = 0.3220:  13%|█▎        | 735/5475 [3:18:01<20:26:54, 15.53s/it]

ratio tensor(0.9460, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4384, device='cuda:0') reward: tensor(1.3016, device='cuda:0') factor: tensor(0.1369, device='cuda:0')
reward_raw: tensor(0.4490, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.1334, device='cuda:0')
reward_raw: tensor(0.4423, device='cuda:0') reward: tensor(1.3049, device='cuda:0') factor: tensor(0.1373, device='cuda:0')
reward_raw: tensor(0.4240, device='cuda:0') reward: tensor(1.2888, device='cuda:0') factor: tensor(0.1353, device='cuda:0')
reward_raw: tensor(0.4198, device='cuda:0') reward: tensor(1.2887, device='cuda:0') factor: tensor(0.1311, device='cuda:0')


Epoch 1, Step 736: Loss = 0.2573, Actor Loss = 0.2173, Critic Loss = 0.0801, Entropy = 0.0356, Returns = 1.2487, Value = 1.3068, mask_loss = 0.1374std_loss = 0.3255:  13%|█▎        | 736/5475 [3:18:16<20:09:23, 15.31s/it] 

ratio tensor(0.4253, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4710, device='cuda:0') reward: tensor(1.3232, device='cuda:0') factor: tensor(0.1478, device='cuda:0')
reward_raw: tensor(0.4729, device='cuda:0') reward: tensor(1.3285, device='cuda:0') factor: tensor(0.1444, device='cuda:0')
reward_raw: tensor(0.4726, device='cuda:0') reward: tensor(1.3268, device='cuda:0') factor: tensor(0.1457, device='cuda:0')
reward_raw: tensor(0.4453, device='cuda:0') reward: tensor(1.2995, device='cuda:0') factor: tensor(0.1459, device='cuda:0')
reward_raw: tensor(0.4532, device='cuda:0') reward: tensor(1.3095, device='cuda:0') factor: tensor(0.1438, device='cuda:0')


Epoch 1, Step 737: Loss = -0.1195, Actor Loss = -0.1741, Critic Loss = 0.1092, Entropy = 0.0606, Returns = 1.2747, Value = 1.2127, mask_loss = 0.1754std_loss = 0.3445:  13%|█▎        | 737/5475 [3:18:33<20:39:30, 15.70s/it]

ratio tensor(1.0189, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4803, device='cuda:0') reward: tensor(1.3462, device='cuda:0') factor: tensor(0.1341, device='cuda:0')
reward_raw: tensor(0.4664, device='cuda:0') reward: tensor(1.3334, device='cuda:0') factor: tensor(0.1330, device='cuda:0')
reward_raw: tensor(0.4736, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.1313, device='cuda:0')
reward_raw: tensor(0.4716, device='cuda:0') reward: tensor(1.3351, device='cuda:0') factor: tensor(0.1365, device='cuda:0')
reward_raw: tensor(0.4762, device='cuda:0') reward: tensor(1.3436, device='cuda:0') factor: tensor(0.1326, device='cuda:0')


Epoch 1, Step 738: Loss = 0.0336, Actor Loss = 0.0044, Critic Loss = 0.0583, Entropy = 0.0450, Returns = 1.3666, Value = 1.3146, mask_loss = 0.1355std_loss = 0.3216:  13%|█▎        | 738/5475 [3:18:47<20:05:43, 15.27s/it]  

ratio tensor(1.5233, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.3796, device='cuda:0') factor: tensor(0.1237, device='cuda:0')
reward_raw: tensor(0.5068, device='cuda:0') reward: tensor(1.3836, device='cuda:0') factor: tensor(0.1233, device='cuda:0')
reward_raw: tensor(0.5273, device='cuda:0') reward: tensor(1.4040, device='cuda:0') factor: tensor(0.1233, device='cuda:0')
reward_raw: tensor(0.5303, device='cuda:0') reward: tensor(1.4079, device='cuda:0') factor: tensor(0.1224, device='cuda:0')
reward_raw: tensor(0.5034, device='cuda:0') reward: tensor(1.3812, device='cuda:0') factor: tensor(0.1222, device='cuda:0')


Epoch 1, Step 739: Loss = 0.0237, Actor Loss = -0.0066, Critic Loss = 0.0607, Entropy = 0.0346, Returns = 1.3372, Value = 1.3124, mask_loss = 0.1403std_loss = 0.3240:  13%|█▎        | 739/5475 [3:19:03<20:33:35, 15.63s/it]

ratio tensor(0.7020, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4052, device='cuda:0') reward: tensor(1.2989, device='cuda:0') factor: tensor(0.1063, device='cuda:0')
reward_raw: tensor(0.4066, device='cuda:0') reward: tensor(1.2983, device='cuda:0') factor: tensor(0.1082, device='cuda:0')
reward_raw: tensor(0.4110, device='cuda:0') reward: tensor(1.3024, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.4130, device='cuda:0') reward: tensor(1.3024, device='cuda:0') factor: tensor(0.1106, device='cuda:0')
reward_raw: tensor(0.4035, device='cuda:0') reward: tensor(1.2946, device='cuda:0') factor: tensor(0.1088, device='cuda:0')


Epoch 1, Step 740: Loss = 0.0348, Actor Loss = 0.0037, Critic Loss = 0.0624, Entropy = 0.0375, Returns = 1.2943, Value = 1.3486, mask_loss = 0.1196std_loss = 0.3003:  14%|█▎        | 740/5475 [3:19:23<22:10:43, 16.86s/it] 

ratio tensor(2.0089, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4694, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.1373, device='cuda:0')
reward_raw: tensor(0.4667, device='cuda:0') reward: tensor(1.3327, device='cuda:0') factor: tensor(0.1340, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.3475, device='cuda:0') factor: tensor(0.1323, device='cuda:0')
reward_raw: tensor(0.4719, device='cuda:0') reward: tensor(1.3380, device='cuda:0') factor: tensor(0.1339, device='cuda:0')
reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3316, device='cuda:0') factor: tensor(0.1359, device='cuda:0')


Epoch 1, Step 741: Loss = -0.0833, Actor Loss = -0.1081, Critic Loss = 0.0497, Entropy = 0.0277, Returns = 1.3921, Value = 1.3754, mask_loss = 0.1364std_loss = 0.3251:  14%|█▎        | 740/5475 [3:19:40<22:10:43, 16.86s/it]

ratio tensor(1.1876, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 741: Loss = -0.0833, Actor Loss = -0.1081, Critic Loss = 0.0497, Entropy = 0.0277, Returns = 1.3921, Value = 1.3754, mask_loss = 0.1364std_loss = 0.3251:  14%|█▎        | 741/5475 [3:19:41<22:22:13, 17.01s/it]

reward_raw: tensor(0.5458, device='cuda:0') reward: tensor(1.4158, device='cuda:0') factor: tensor(0.1300, device='cuda:0')
reward_raw: tensor(0.5698, device='cuda:0') reward: tensor(1.4413, device='cuda:0') factor: tensor(0.1285, device='cuda:0')
reward_raw: tensor(0.5712, device='cuda:0') reward: tensor(1.4422, device='cuda:0') factor: tensor(0.1290, device='cuda:0')
reward_raw: tensor(0.5709, device='cuda:0') reward: tensor(1.4418, device='cuda:0') factor: tensor(0.1291, device='cuda:0')
reward_raw: tensor(0.5634, device='cuda:0') reward: tensor(1.4344, device='cuda:0') factor: tensor(0.1290, device='cuda:0')


Epoch 1, Step 742: Loss = -0.0824, Actor Loss = -0.1356, Critic Loss = 0.1067, Entropy = 0.0370, Returns = 1.5204, Value = 1.3540, mask_loss = 0.1466std_loss = 0.3283:  14%|█▎        | 742/5475 [3:19:56<21:53:35, 16.65s/it]

ratio tensor(2.9007, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4854, device='cuda:0') reward: tensor(1.3299, device='cuda:0') factor: tensor(0.1555, device='cuda:0')
reward_raw: tensor(0.5031, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.1529, device='cuda:0')
reward_raw: tensor(0.4944, device='cuda:0') reward: tensor(1.3432, device='cuda:0') factor: tensor(0.1512, device='cuda:0')
reward_raw: tensor(0.4945, device='cuda:0') reward: tensor(1.3375, device='cuda:0') factor: tensor(0.1570, device='cuda:0')
reward_raw: tensor(0.5011, device='cuda:0') reward: tensor(1.3450, device='cuda:0') factor: tensor(0.1561, device='cuda:0')


Epoch 1, Step 743: Loss = 0.0703, Actor Loss = 0.0245, Critic Loss = 0.0917, Entropy = 0.0562, Returns = 1.3173, Value = 1.4573, mask_loss = 0.1514std_loss = 0.3339:  14%|█▎        | 743/5475 [3:20:08<19:55:31, 15.16s/it]  

ratio tensor(1.1110, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4367, device='cuda:0') reward: tensor(1.3240, device='cuda:0') factor: tensor(0.1128, device='cuda:0')
reward_raw: tensor(0.4544, device='cuda:0') reward: tensor(1.3399, device='cuda:0') factor: tensor(0.1145, device='cuda:0')
reward_raw: tensor(0.4522, device='cuda:0') reward: tensor(1.3336, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.4437, device='cuda:0') reward: tensor(1.3287, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.4605, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.1174, device='cuda:0')


Epoch 1, Step 744: Loss = -0.0345, Actor Loss = -0.0988, Critic Loss = 0.1286, Entropy = 0.0262, Returns = 1.4547, Value = 1.2199, mask_loss = 0.0947std_loss = 0.2753:  14%|█▎        | 744/5475 [3:20:22<19:23:58, 14.76s/it]

ratio tensor(1.0755, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4264, device='cuda:0') reward: tensor(1.3116, device='cuda:0') factor: tensor(0.1148, device='cuda:0')
reward_raw: tensor(0.4249, device='cuda:0') reward: tensor(1.3109, device='cuda:0') factor: tensor(0.1140, device='cuda:0')
reward_raw: tensor(0.4262, device='cuda:0') reward: tensor(1.3114, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.4234, device='cuda:0') reward: tensor(1.3103, device='cuda:0') factor: tensor(0.1130, device='cuda:0')
reward_raw: tensor(0.4294, device='cuda:0') reward: tensor(1.3166, device='cuda:0') factor: tensor(0.1128, device='cuda:0')


Epoch 1, Step 745: Loss = 0.0483, Actor Loss = -0.0080, Critic Loss = 0.1126, Entropy = 0.0257, Returns = 1.3379, Value = 1.3855, mask_loss = 0.1004std_loss = 0.2832:  14%|█▎        | 745/5475 [3:20:42<21:26:56, 16.32s/it] 

ratio tensor(0.8360, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4948, device='cuda:0') reward: tensor(1.3732, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.4800, device='cuda:0') reward: tensor(1.3610, device='cuda:0') factor: tensor(0.1190, device='cuda:0')
reward_raw: tensor(0.4921, device='cuda:0') reward: tensor(1.3711, device='cuda:0') factor: tensor(0.1210, device='cuda:0')
reward_raw: tensor(0.5190, device='cuda:0') reward: tensor(1.4012, device='cuda:0') factor: tensor(0.1178, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3706, device='cuda:0') factor: tensor(0.1237, device='cuda:0')


Epoch 1, Step 746: Loss = 0.0505, Actor Loss = 0.0125, Critic Loss = 0.0761, Entropy = 0.0365, Returns = 1.3700, Value = 1.4767, mask_loss = 0.1298std_loss = 0.3168:  14%|█▎        | 746/5475 [3:21:00<21:59:57, 16.75s/it] 

ratio tensor(1.5971, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5416, device='cuda:0') reward: tensor(1.4197, device='cuda:0') factor: tensor(0.1219, device='cuda:0')
reward_raw: tensor(0.5758, device='cuda:0') reward: tensor(1.4556, device='cuda:0') factor: tensor(0.1202, device='cuda:0')
reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(1.4481, device='cuda:0') factor: tensor(0.1241, device='cuda:0')
reward_raw: tensor(0.5680, device='cuda:0') reward: tensor(1.4465, device='cuda:0') factor: tensor(0.1215, device='cuda:0')
reward_raw: tensor(0.5960, device='cuda:0') reward: tensor(1.4732, device='cuda:0') factor: tensor(0.1228, device='cuda:0')


Epoch 1, Step 747: Loss = 0.0444, Actor Loss = -0.0119, Critic Loss = 0.1125, Entropy = 0.0315, Returns = 1.3587, Value = 1.4960, mask_loss = 0.1482std_loss = 0.3208:  14%|█▎        | 747/5475 [3:21:18<22:37:19, 17.23s/it]

ratio tensor(0.6880, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4775, device='cuda:0') reward: tensor(1.3437, device='cuda:0') factor: tensor(0.1338, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3420, device='cuda:0') factor: tensor(0.1360, device='cuda:0')
reward_raw: tensor(0.4794, device='cuda:0') reward: tensor(1.3462, device='cuda:0') factor: tensor(0.1332, device='cuda:0')
reward_raw: tensor(0.4616, device='cuda:0') reward: tensor(1.3265, device='cuda:0') factor: tensor(0.1351, device='cuda:0')
reward_raw: tensor(0.4638, device='cuda:0') reward: tensor(1.3318, device='cuda:0') factor: tensor(0.1320, device='cuda:0')


Epoch 1, Step 748: Loss = 0.2839, Actor Loss = 0.2432, Critic Loss = 0.0815, Entropy = 0.0378, Returns = 1.3115, Value = 1.2886, mask_loss = 0.1263std_loss = 0.3127:  14%|█▎        | 748/5475 [3:21:33<21:47:26, 16.60s/it] 

ratio tensor(0.7461, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.3543, device='cuda:0') factor: tensor(0.1374, device='cuda:0')
reward_raw: tensor(0.4752, device='cuda:0') reward: tensor(1.3429, device='cuda:0') factor: tensor(0.1323, device='cuda:0')
reward_raw: tensor(0.4744, device='cuda:0') reward: tensor(1.3440, device='cuda:0') factor: tensor(0.1304, device='cuda:0')
reward_raw: tensor(0.4700, device='cuda:0') reward: tensor(1.3364, device='cuda:0') factor: tensor(0.1336, device='cuda:0')
reward_raw: tensor(0.5008, device='cuda:0') reward: tensor(1.3711, device='cuda:0') factor: tensor(0.1297, device='cuda:0')


Epoch 1, Step 749: Loss = 0.0065, Actor Loss = -0.0685, Critic Loss = 0.1501, Entropy = 0.0430, Returns = 1.3168, Value = 1.3929, mask_loss = 0.1277std_loss = 0.3066:  14%|█▎        | 749/5475 [3:21:48<21:08:35, 16.11s/it]

ratio tensor(1.2674, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4995, device='cuda:0') reward: tensor(1.3796, device='cuda:0') factor: tensor(0.1200, device='cuda:0')
reward_raw: tensor(0.5038, device='cuda:0') reward: tensor(1.3832, device='cuda:0') factor: tensor(0.1206, device='cuda:0')
reward_raw: tensor(0.5066, device='cuda:0') reward: tensor(1.3868, device='cuda:0') factor: tensor(0.1197, device='cuda:0')
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.3793, device='cuda:0') factor: tensor(0.1189, device='cuda:0')
reward_raw: tensor(0.4846, device='cuda:0') reward: tensor(1.3675, device='cuda:0') factor: tensor(0.1171, device='cuda:0')


Epoch 1, Step 750: Loss = 0.1283, Actor Loss = 0.0878, Critic Loss = 0.0812, Entropy = 0.0376, Returns = 1.3464, Value = 1.5049, mask_loss = 0.1133std_loss = 0.2947:  14%|█▎        | 750/5475 [3:22:03<20:39:25, 15.74s/it] 

ratio tensor(0.6245, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4743, device='cuda:0') reward: tensor(1.3597, device='cuda:0') factor: tensor(0.1146, device='cuda:0')
reward_raw: tensor(0.4669, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.4658, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.1155, device='cuda:0')
reward_raw: tensor(0.4649, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.4705, device='cuda:0') reward: tensor(1.3576, device='cuda:0') factor: tensor(0.1130, device='cuda:0')


Epoch 1, Step 751: Loss = 0.1031, Actor Loss = 0.0389, Critic Loss = 0.1285, Entropy = 0.0352, Returns = 1.4189, Value = 1.3727, mask_loss = 0.1105std_loss = 0.2912:  14%|█▎        | 751/5475 [3:22:21<21:33:35, 16.43s/it]

ratio tensor(0.7792, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4974, device='cuda:0') reward: tensor(1.3742, device='cuda:0') factor: tensor(0.1232, device='cuda:0')
reward_raw: tensor(0.4746, device='cuda:0') reward: tensor(1.3530, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.3866, device='cuda:0') factor: tensor(0.1222, device='cuda:0')
reward_raw: tensor(0.4800, device='cuda:0') reward: tensor(1.3574, device='cuda:0') factor: tensor(0.1225, device='cuda:0')
reward_raw: tensor(0.5015, device='cuda:0') reward: tensor(1.3797, device='cuda:0') factor: tensor(0.1218, device='cuda:0')


Epoch 1, Step 752: Loss = -0.0316, Actor Loss = -0.0834, Critic Loss = 0.1035, Entropy = 0.0274, Returns = 1.3565, Value = 1.3457, mask_loss = 0.1054std_loss = 0.2904:  14%|█▎        | 752/5475 [3:22:35<20:39:44, 15.75s/it]

ratio tensor(1.1029, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.4095, device='cuda:0') factor: tensor(0.1147, device='cuda:0')
reward_raw: tensor(0.5429, device='cuda:0') reward: tensor(1.4199, device='cuda:0') factor: tensor(0.1230, device='cuda:0')
reward_raw: tensor(0.5273, device='cuda:0') reward: tensor(1.4073, device='cuda:0') factor: tensor(0.1200, device='cuda:0')
reward_raw: tensor(0.5104, device='cuda:0') reward: tensor(1.3944, device='cuda:0') factor: tensor(0.1160, device='cuda:0')
reward_raw: tensor(0.5224, device='cuda:0') reward: tensor(1.4014, device='cuda:0') factor: tensor(0.1211, device='cuda:0')


Epoch 1, Step 753: Loss = 0.0047, Actor Loss = -0.0200, Critic Loss = 0.0495, Entropy = 0.0317, Returns = 1.4227, Value = 1.4735, mask_loss = 0.1214std_loss = 0.3095:  14%|█▍        | 753/5475 [3:22:47<19:16:08, 14.69s/it] 

ratio tensor(1.0525, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.3708, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.4673, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.1135, device='cuda:0')
reward_raw: tensor(0.4689, device='cuda:0') reward: tensor(1.3511, device='cuda:0') factor: tensor(0.1178, device='cuda:0')
reward_raw: tensor(0.4593, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.1135, device='cuda:0')
reward_raw: tensor(0.4688, device='cuda:0') reward: tensor(1.3506, device='cuda:0') factor: tensor(0.1182, device='cuda:0')


Epoch 1, Step 754: Loss = 0.2185, Actor Loss = 0.1854, Critic Loss = 0.0663, Entropy = 0.0299, Returns = 1.2565, Value = 1.3846, mask_loss = 0.1428std_loss = 0.3209:  14%|█▍        | 754/5475 [3:23:10<22:30:33, 17.16s/it] 

ratio tensor(0.6276, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5054, device='cuda:0') reward: tensor(1.4016, device='cuda:0') factor: tensor(0.1038, device='cuda:0')
reward_raw: tensor(0.5036, device='cuda:0') reward: tensor(1.4045, device='cuda:0') factor: tensor(0.0992, device='cuda:0')
reward_raw: tensor(0.5272, device='cuda:0') reward: tensor(1.4257, device='cuda:0') factor: tensor(0.1015, device='cuda:0')
reward_raw: tensor(0.5148, device='cuda:0') reward: tensor(1.4124, device='cuda:0') factor: tensor(0.1023, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3899, device='cuda:0') factor: tensor(0.1044, device='cuda:0')


Epoch 1, Step 755: Loss = 0.1986, Actor Loss = 0.1631, Critic Loss = 0.0710, Entropy = 0.0288, Returns = 1.4277, Value = 1.3283, mask_loss = 0.0898std_loss = 0.2642:  14%|█▍        | 755/5475 [3:23:31<23:47:37, 18.15s/it]

ratio tensor(1.3256, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.3375, device='cuda:0') factor: tensor(0.1471, device='cuda:0')
reward_raw: tensor(0.4875, device='cuda:0') reward: tensor(1.3380, device='cuda:0') factor: tensor(0.1495, device='cuda:0')
reward_raw: tensor(0.4774, device='cuda:0') reward: tensor(1.3303, device='cuda:0') factor: tensor(0.1471, device='cuda:0')
reward_raw: tensor(0.4826, device='cuda:0') reward: tensor(1.3326, device='cuda:0') factor: tensor(0.1500, device='cuda:0')
reward_raw: tensor(0.4810, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.1518, device='cuda:0')


Epoch 1, Step 756: Loss = -0.2526, Actor Loss = -0.3517, Critic Loss = 0.1982, Entropy = 0.0323, Returns = 1.4028, Value = 1.1785, mask_loss = 0.1389std_loss = 0.3257:  14%|█▍        | 756/5475 [3:23:46<22:45:31, 17.36s/it]

ratio tensor(1.2762, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5229, device='cuda:0') reward: tensor(1.4019, device='cuda:0') factor: tensor(0.1210, device='cuda:0')
reward_raw: tensor(0.5274, device='cuda:0') reward: tensor(1.4079, device='cuda:0') factor: tensor(0.1195, device='cuda:0')
reward_raw: tensor(0.5383, device='cuda:0') reward: tensor(1.4204, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.4151, device='cuda:0') factor: tensor(0.1202, device='cuda:0')
reward_raw: tensor(0.5323, device='cuda:0') reward: tensor(1.4133, device='cuda:0') factor: tensor(0.1190, device='cuda:0')


Epoch 1, Step 757: Loss = 0.6371, Actor Loss = 0.6130, Critic Loss = 0.0484, Entropy = 0.0379, Returns = 1.4383, Value = 1.5023, mask_loss = 0.1477std_loss = 0.3320:  14%|█▍        | 757/5475 [3:24:08<24:21:25, 18.59s/it]  

ratio tensor(3.0259, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3491, device='cuda:0') factor: tensor(0.1507, device='cuda:0')
reward_raw: tensor(0.5050, device='cuda:0') reward: tensor(1.3549, device='cuda:0') factor: tensor(0.1501, device='cuda:0')
reward_raw: tensor(0.5074, device='cuda:0') reward: tensor(1.3571, device='cuda:0') factor: tensor(0.1503, device='cuda:0')
reward_raw: tensor(0.4988, device='cuda:0') reward: tensor(1.3465, device='cuda:0') factor: tensor(0.1523, device='cuda:0')
reward_raw: tensor(0.5050, device='cuda:0') reward: tensor(1.3562, device='cuda:0') factor: tensor(0.1489, device='cuda:0')


Epoch 1, Step 758: Loss = 0.1984, Actor Loss = 0.1683, Critic Loss = 0.0603, Entropy = 0.0697, Returns = 1.2898, Value = 1.3255, mask_loss = 0.2709std_loss = 0.4060:  14%|█▍        | 758/5475 [3:24:23<23:10:06, 17.68s/it]

ratio tensor(9.3490e-14, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6011, device='cuda:0') reward: tensor(1.3459, device='cuda:0') factor: tensor(0.2552, device='cuda:0')
reward_raw: tensor(0.5851, device='cuda:0') reward: tensor(1.3351, device='cuda:0') factor: tensor(0.2500, device='cuda:0')
reward_raw: tensor(0.5811, device='cuda:0') reward: tensor(1.3250, device='cuda:0') factor: tensor(0.2561, device='cuda:0')
reward_raw: tensor(0.5960, device='cuda:0') reward: tensor(1.3415, device='cuda:0') factor: tensor(0.2546, device='cuda:0')
reward_raw: tensor(0.5678, device='cuda:0') reward: tensor(1.3186, device='cuda:0') factor: tensor(0.2492, device='cuda:0')


Epoch 1, Step 759: Loss = 0.1370, Actor Loss = 0.1051, Critic Loss = 0.0639, Entropy = 0.0967, Returns = 1.3215, Value = 1.3305, mask_loss = 0.3393std_loss = 0.4194:  14%|█▍        | 759/5475 [3:24:42<23:38:22, 18.05s/it]

ratio tensor(0.1224, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5905, device='cuda:0') reward: tensor(1.3112, device='cuda:0') factor: tensor(0.2793, device='cuda:0')
reward_raw: tensor(0.5894, device='cuda:0') reward: tensor(1.3101, device='cuda:0') factor: tensor(0.2794, device='cuda:0')
reward_raw: tensor(0.5851, device='cuda:0') reward: tensor(1.3090, device='cuda:0') factor: tensor(0.2761, device='cuda:0')
reward_raw: tensor(0.5860, device='cuda:0') reward: tensor(1.3070, device='cuda:0') factor: tensor(0.2791, device='cuda:0')
reward_raw: tensor(0.5777, device='cuda:0') reward: tensor(1.2951, device='cuda:0') factor: tensor(0.2826, device='cuda:0')


Epoch 1, Step 760: Loss = -0.1003, Actor Loss = -0.1522, Critic Loss = 0.1038, Entropy = 0.0790, Returns = 1.2995, Value = 1.1589, mask_loss = 0.2654std_loss = 0.4033:  14%|█▍        | 760/5475 [3:24:56<21:58:25, 16.78s/it]

ratio tensor(1.3313, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6005, device='cuda:0') reward: tensor(1.2313, device='cuda:0') factor: tensor(0.3692, device='cuda:0')
reward_raw: tensor(0.6211, device='cuda:0') reward: tensor(1.2499, device='cuda:0') factor: tensor(0.3711, device='cuda:0')
reward_raw: tensor(0.6237, device='cuda:0') reward: tensor(1.2478, device='cuda:0') factor: tensor(0.3760, device='cuda:0')
reward_raw: tensor(0.6240, device='cuda:0') reward: tensor(1.2432, device='cuda:0') factor: tensor(0.3808, device='cuda:0')
reward_raw: tensor(0.6259, device='cuda:0') reward: tensor(1.2482, device='cuda:0') factor: tensor(0.3778, device='cuda:0')


Epoch 1, Step 761: Loss = 0.0551, Actor Loss = -0.0036, Critic Loss = 0.1177, Entropy = 0.1123, Returns = 1.2204, Value = 1.2693, mask_loss = 0.3819std_loss = 0.4298:  14%|█▍        | 760/5475 [3:25:09<21:58:25, 16.78s/it] 

ratio tensor(0.9091, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 761: Loss = 0.0551, Actor Loss = -0.0036, Critic Loss = 0.1177, Entropy = 0.1123, Returns = 1.2204, Value = 1.2693, mask_loss = 0.3819std_loss = 0.4298:  14%|█▍        | 761/5475 [3:25:09<20:38:40, 15.77s/it]

reward_raw: tensor(0.6546, device='cuda:0') reward: tensor(1.3297, device='cuda:0') factor: tensor(0.3249, device='cuda:0')
reward_raw: tensor(0.6628, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.3169, device='cuda:0')
reward_raw: tensor(0.6470, device='cuda:0') reward: tensor(1.3313, device='cuda:0') factor: tensor(0.3157, device='cuda:0')
reward_raw: tensor(0.6765, device='cuda:0') reward: tensor(1.3511, device='cuda:0') factor: tensor(0.3254, device='cuda:0')
reward_raw: tensor(0.6517, device='cuda:0') reward: tensor(1.3334, device='cuda:0') factor: tensor(0.3182, device='cuda:0')


Epoch 1, Step 762: Loss = -0.1833, Actor Loss = -0.2251, Critic Loss = 0.0838, Entropy = 0.1055, Returns = 1.3435, Value = 1.2451, mask_loss = 0.3464std_loss = 0.4348:  14%|█▍        | 762/5475 [3:25:22<19:27:55, 14.87s/it]

ratio tensor(19.1330, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5725, device='cuda:0') reward: tensor(1.2828, device='cuda:0') factor: tensor(0.2896, device='cuda:0')
reward_raw: tensor(0.5729, device='cuda:0') reward: tensor(1.2811, device='cuda:0') factor: tensor(0.2917, device='cuda:0')
reward_raw: tensor(0.5742, device='cuda:0') reward: tensor(1.2827, device='cuda:0') factor: tensor(0.2916, device='cuda:0')
reward_raw: tensor(0.5856, device='cuda:0') reward: tensor(1.2857, device='cuda:0') factor: tensor(0.2998, device='cuda:0')
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.2712, device='cuda:0') factor: tensor(0.2949, device='cuda:0')


Epoch 1, Step 763: Loss = 0.1225, Actor Loss = 0.1029, Critic Loss = 0.0393, Entropy = 0.1085, Returns = 1.2654, Value = 1.2089, mask_loss = 0.3370std_loss = 0.4289:  14%|█▍        | 763/5475 [3:25:38<19:57:01, 15.24s/it]  

ratio tensor(0.3679, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5990, device='cuda:0') reward: tensor(1.2981, device='cuda:0') factor: tensor(0.3009, device='cuda:0')
reward_raw: tensor(0.5767, device='cuda:0') reward: tensor(1.2782, device='cuda:0') factor: tensor(0.2985, device='cuda:0')
reward_raw: tensor(0.5648, device='cuda:0') reward: tensor(1.2638, device='cuda:0') factor: tensor(0.3010, device='cuda:0')
reward_raw: tensor(0.5745, device='cuda:0') reward: tensor(1.2740, device='cuda:0') factor: tensor(0.3005, device='cuda:0')
reward_raw: tensor(0.5832, device='cuda:0') reward: tensor(1.2890, device='cuda:0') factor: tensor(0.2942, device='cuda:0')


Epoch 1, Step 764: Loss = -0.0442, Actor Loss = -0.0792, Critic Loss = 0.0702, Entropy = 0.1097, Returns = 1.3135, Value = 1.2919, mask_loss = 0.2919std_loss = 0.4040:  14%|█▍        | 764/5475 [3:25:55<20:31:45, 15.69s/it]

ratio tensor(0.8022, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5710, device='cuda:0') reward: tensor(1.2803, device='cuda:0') factor: tensor(0.2907, device='cuda:0')
reward_raw: tensor(0.5883, device='cuda:0') reward: tensor(1.2958, device='cuda:0') factor: tensor(0.2925, device='cuda:0')
reward_raw: tensor(0.5881, device='cuda:0') reward: tensor(1.2951, device='cuda:0') factor: tensor(0.2930, device='cuda:0')
reward_raw: tensor(0.5713, device='cuda:0') reward: tensor(1.2798, device='cuda:0') factor: tensor(0.2915, device='cuda:0')
reward_raw: tensor(0.5902, device='cuda:0') reward: tensor(1.2949, device='cuda:0') factor: tensor(0.2953, device='cuda:0')


Epoch 1, Step 765: Loss = 0.2147, Actor Loss = 0.1890, Critic Loss = 0.0516, Entropy = 0.1248, Returns = 1.2446, Value = 1.3614, mask_loss = 0.3743std_loss = 0.4374:  14%|█▍        | 765/5475 [3:26:11<20:28:47, 15.65s/it]  

ratio tensor(0.6936, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6804, device='cuda:0') reward: tensor(1.3402, device='cuda:0') factor: tensor(0.3402, device='cuda:0')
reward_raw: tensor(0.6681, device='cuda:0') reward: tensor(1.3282, device='cuda:0') factor: tensor(0.3398, device='cuda:0')
reward_raw: tensor(0.6862, device='cuda:0') reward: tensor(1.3540, device='cuda:0') factor: tensor(0.3322, device='cuda:0')
reward_raw: tensor(0.6801, device='cuda:0') reward: tensor(1.3419, device='cuda:0') factor: tensor(0.3383, device='cuda:0')
reward_raw: tensor(0.6829, device='cuda:0') reward: tensor(1.3483, device='cuda:0') factor: tensor(0.3346, device='cuda:0')


Epoch 1, Step 766: Loss = -0.0620, Actor Loss = -0.1042, Critic Loss = 0.0846, Entropy = 0.1170, Returns = 1.3571, Value = 1.3330, mask_loss = 0.3749std_loss = 0.4305:  14%|█▍        | 766/5475 [3:26:24<19:27:29, 14.88s/it]

ratio tensor(1.4812, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5795, device='cuda:0') reward: tensor(1.1844, device='cuda:0') factor: tensor(0.3951, device='cuda:0')
reward_raw: tensor(0.5594, device='cuda:0') reward: tensor(1.1566, device='cuda:0') factor: tensor(0.4028, device='cuda:0')
reward_raw: tensor(0.5535, device='cuda:0') reward: tensor(1.1612, device='cuda:0') factor: tensor(0.3923, device='cuda:0')
reward_raw: tensor(0.5508, device='cuda:0') reward: tensor(1.1523, device='cuda:0') factor: tensor(0.3984, device='cuda:0')
reward_raw: tensor(0.5674, device='cuda:0') reward: tensor(1.1629, device='cuda:0') factor: tensor(0.4046, device='cuda:0')


Epoch 1, Step 767: Loss = 0.1730, Actor Loss = 0.1398, Critic Loss = 0.0668, Entropy = 0.1265, Returns = 1.1686, Value = 1.0771, mask_loss = 0.4173std_loss = 0.4136:  14%|█▍        | 767/5475 [3:26:46<22:29:26, 17.20s/it]  

ratio tensor(0.1089, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6630, device='cuda:0') reward: tensor(1.2094, device='cuda:0') factor: tensor(0.4536, device='cuda:0')
reward_raw: tensor(0.6673, device='cuda:0') reward: tensor(1.2224, device='cuda:0') factor: tensor(0.4448, device='cuda:0')
reward_raw: tensor(0.6691, device='cuda:0') reward: tensor(1.2281, device='cuda:0') factor: tensor(0.4409, device='cuda:0')
reward_raw: tensor(0.6749, device='cuda:0') reward: tensor(1.2255, device='cuda:0') factor: tensor(0.4494, device='cuda:0')
reward_raw: tensor(0.6585, device='cuda:0') reward: tensor(1.2128, device='cuda:0') factor: tensor(0.4457, device='cuda:0')


Epoch 1, Step 768: Loss = -0.0022, Actor Loss = -0.0201, Critic Loss = 0.0359, Entropy = 0.1292, Returns = 1.2360, Value = 1.2137, mask_loss = 0.4811std_loss = 0.4460:  14%|█▍        | 768/5475 [3:27:01<21:33:29, 16.49s/it]

ratio tensor(0.5378, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6854, device='cuda:0') reward: tensor(1.0839, device='cuda:0') factor: tensor(0.6015, device='cuda:0')
reward_raw: tensor(0.6856, device='cuda:0') reward: tensor(1.0838, device='cuda:0') factor: tensor(0.6019, device='cuda:0')
reward_raw: tensor(0.6901, device='cuda:0') reward: tensor(1.0842, device='cuda:0') factor: tensor(0.6059, device='cuda:0')
reward_raw: tensor(0.6786, device='cuda:0') reward: tensor(1.0819, device='cuda:0') factor: tensor(0.5966, device='cuda:0')
reward_raw: tensor(0.6743, device='cuda:0') reward: tensor(1.0731, device='cuda:0') factor: tensor(0.6012, device='cuda:0')


Epoch 1, Step 769: Loss = 0.1415, Actor Loss = 0.0945, Critic Loss = 0.0942, Entropy = 0.1245, Returns = 1.1427, Value = 1.0959, mask_loss = 0.6289std_loss = 0.4151:  14%|█▍        | 769/5475 [3:27:14<20:05:58, 15.38s/it]  

ratio tensor(0.9543, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6651, device='cuda:0') reward: tensor(1.1804, device='cuda:0') factor: tensor(0.4847, device='cuda:0')
reward_raw: tensor(0.6852, device='cuda:0') reward: tensor(1.1998, device='cuda:0') factor: tensor(0.4855, device='cuda:0')
reward_raw: tensor(0.6624, device='cuda:0') reward: tensor(1.1601, device='cuda:0') factor: tensor(0.5023, device='cuda:0')
reward_raw: tensor(0.6554, device='cuda:0') reward: tensor(1.1640, device='cuda:0') factor: tensor(0.4914, device='cuda:0')
reward_raw: tensor(0.6744, device='cuda:0') reward: tensor(1.1921, device='cuda:0') factor: tensor(0.4824, device='cuda:0')


Epoch 1, Step 770: Loss = -0.0579, Actor Loss = -0.0944, Critic Loss = 0.0731, Entropy = 0.1252, Returns = 1.1398, Value = 1.1937, mask_loss = 0.5544std_loss = 0.4270:  14%|█▍        | 770/5475 [3:27:27<19:17:07, 14.76s/it]

ratio tensor(2.9508, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7398, device='cuda:0') reward: tensor(1.3131, device='cuda:0') factor: tensor(0.4267, device='cuda:0')
reward_raw: tensor(0.7622, device='cuda:0') reward: tensor(1.3379, device='cuda:0') factor: tensor(0.4243, device='cuda:0')
reward_raw: tensor(0.7411, device='cuda:0') reward: tensor(1.3190, device='cuda:0') factor: tensor(0.4221, device='cuda:0')
reward_raw: tensor(0.7555, device='cuda:0') reward: tensor(1.3346, device='cuda:0') factor: tensor(0.4209, device='cuda:0')
reward_raw: tensor(0.7402, device='cuda:0') reward: tensor(1.3183, device='cuda:0') factor: tensor(0.4219, device='cuda:0')


Epoch 1, Step 771: Loss = 0.0682, Actor Loss = 0.0284, Critic Loss = 0.0798, Entropy = 0.1260, Returns = 1.2833, Value = 1.3239, mask_loss = 0.4911std_loss = 0.4338:  14%|█▍        | 771/5475 [3:27:48<21:47:27, 16.68s/it]  

ratio tensor(0.1543, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.7074, device='cuda:0') reward: tensor(1.2039, device='cuda:0') factor: tensor(0.5035, device='cuda:0')
reward_raw: tensor(0.7139, device='cuda:0') reward: tensor(1.2043, device='cuda:0') factor: tensor(0.5097, device='cuda:0')
reward_raw: tensor(0.7199, device='cuda:0') reward: tensor(1.2075, device='cuda:0') factor: tensor(0.5124, device='cuda:0')
reward_raw: tensor(0.7049, device='cuda:0') reward: tensor(1.1998, device='cuda:0') factor: tensor(0.5050, device='cuda:0')
reward_raw: tensor(0.7170, device='cuda:0') reward: tensor(1.2108, device='cuda:0') factor: tensor(0.5062, device='cuda:0')


Epoch 1, Step 772: Loss = 0.1924, Actor Loss = 0.1563, Critic Loss = 0.0723, Entropy = 0.1139, Returns = 1.1844, Value = 1.0338, mask_loss = 0.4911std_loss = 0.4238:  14%|█▍        | 772/5475 [3:28:01<20:05:55, 15.39s/it]

ratio tensor(0.0908, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6027, device='cuda:0') reward: tensor(1.1916, device='cuda:0') factor: tensor(0.4112, device='cuda:0')
reward_raw: tensor(0.6183, device='cuda:0') reward: tensor(1.2043, device='cuda:0') factor: tensor(0.4140, device='cuda:0')
reward_raw: tensor(0.5922, device='cuda:0') reward: tensor(1.1809, device='cuda:0') factor: tensor(0.4112, device='cuda:0')
reward_raw: tensor(0.6079, device='cuda:0') reward: tensor(1.1999, device='cuda:0') factor: tensor(0.4080, device='cuda:0')
reward_raw: tensor(0.6058, device='cuda:0') reward: tensor(1.1919, device='cuda:0') factor: tensor(0.4138, device='cuda:0')


Epoch 1, Step 773: Loss = 0.0581, Actor Loss = 0.0354, Critic Loss = 0.0456, Entropy = 0.1272, Returns = 1.1017, Value = 1.1468, mask_loss = 0.4551std_loss = 0.4439:  14%|█▍        | 773/5475 [3:28:17<20:25:54, 15.64s/it]

ratio tensor(1.8309, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6573, device='cuda:0') reward: tensor(1.1931, device='cuda:0') factor: tensor(0.4641, device='cuda:0')
reward_raw: tensor(0.6704, device='cuda:0') reward: tensor(1.2078, device='cuda:0') factor: tensor(0.4625, device='cuda:0')
reward_raw: tensor(0.6584, device='cuda:0') reward: tensor(1.1920, device='cuda:0') factor: tensor(0.4665, device='cuda:0')
reward_raw: tensor(0.6524, device='cuda:0') reward: tensor(1.1807, device='cuda:0') factor: tensor(0.4717, device='cuda:0')
reward_raw: tensor(0.6683, device='cuda:0') reward: tensor(1.1990, device='cuda:0') factor: tensor(0.4692, device='cuda:0')


Epoch 1, Step 774: Loss = 0.0992, Actor Loss = 0.0617, Critic Loss = 0.0753, Entropy = 0.1393, Returns = 1.1783, Value = 1.3010, mask_loss = 0.5223std_loss = 0.4521:  14%|█▍        | 774/5475 [3:28:31<19:51:00, 15.20s/it]

ratio tensor(0.7365, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6630, device='cuda:0') reward: tensor(1.1714, device='cuda:0') factor: tensor(0.4916, device='cuda:0')
reward_raw: tensor(0.6736, device='cuda:0') reward: tensor(1.1768, device='cuda:0') factor: tensor(0.4968, device='cuda:0')
reward_raw: tensor(0.6663, device='cuda:0') reward: tensor(1.1762, device='cuda:0') factor: tensor(0.4901, device='cuda:0')
reward_raw: tensor(0.6722, device='cuda:0') reward: tensor(1.1751, device='cuda:0') factor: tensor(0.4972, device='cuda:0')
reward_raw: tensor(0.6642, device='cuda:0') reward: tensor(1.1799, device='cuda:0') factor: tensor(0.4843, device='cuda:0')


Epoch 1, Step 775: Loss = 0.0238, Actor Loss = -0.0086, Critic Loss = 0.0650, Entropy = 0.1221, Returns = 1.1622, Value = 1.1476, mask_loss = 0.4330std_loss = 0.4277:  14%|█▍        | 775/5475 [3:28:49<20:57:00, 16.05s/it]

ratio tensor(0.4583, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6719, device='cuda:0') reward: tensor(1.1996, device='cuda:0') factor: tensor(0.4722, device='cuda:0')
reward_raw: tensor(0.6946, device='cuda:0') reward: tensor(1.2198, device='cuda:0') factor: tensor(0.4749, device='cuda:0')
reward_raw: tensor(0.6822, device='cuda:0') reward: tensor(1.2086, device='cuda:0') factor: tensor(0.4736, device='cuda:0')
reward_raw: tensor(0.6713, device='cuda:0') reward: tensor(1.2024, device='cuda:0') factor: tensor(0.4689, device='cuda:0')
reward_raw: tensor(0.6940, device='cuda:0') reward: tensor(1.2144, device='cuda:0') factor: tensor(0.4796, device='cuda:0')


Epoch 1, Step 776: Loss = 0.0817, Actor Loss = 0.0422, Critic Loss = 0.0793, Entropy = 0.1346, Returns = 1.1279, Value = 1.1889, mask_loss = 0.4665std_loss = 0.4227:  14%|█▍        | 776/5475 [3:29:08<21:54:35, 16.79s/it] 

ratio tensor(0.1003, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6322, device='cuda:0') reward: tensor(1.2871, device='cuda:0') factor: tensor(0.3451, device='cuda:0')
reward_raw: tensor(0.6438, device='cuda:0') reward: tensor(1.2974, device='cuda:0') factor: tensor(0.3465, device='cuda:0')
reward_raw: tensor(0.6379, device='cuda:0') reward: tensor(1.2927, device='cuda:0') factor: tensor(0.3453, device='cuda:0')
reward_raw: tensor(0.6269, device='cuda:0') reward: tensor(1.2772, device='cuda:0') factor: tensor(0.3498, device='cuda:0')
reward_raw: tensor(0.6376, device='cuda:0') reward: tensor(1.2960, device='cuda:0') factor: tensor(0.3416, device='cuda:0')


Epoch 1, Step 777: Loss = -0.1053, Actor Loss = -0.1292, Critic Loss = 0.0480, Entropy = 0.0900, Returns = 1.3281, Value = 1.3591, mask_loss = 0.3111std_loss = 0.4184:  14%|█▍        | 777/5475 [3:29:26<22:32:42, 17.28s/it]

ratio tensor(1.5121, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6374, device='cuda:0') reward: tensor(1.2799, device='cuda:0') factor: tensor(0.3576, device='cuda:0')
reward_raw: tensor(0.6095, device='cuda:0') reward: tensor(1.2501, device='cuda:0') factor: tensor(0.3595, device='cuda:0')
reward_raw: tensor(0.6216, device='cuda:0') reward: tensor(1.2606, device='cuda:0') factor: tensor(0.3610, device='cuda:0')
reward_raw: tensor(0.6280, device='cuda:0') reward: tensor(1.2749, device='cuda:0') factor: tensor(0.3532, device='cuda:0')
reward_raw: tensor(0.6327, device='cuda:0') reward: tensor(1.2795, device='cuda:0') factor: tensor(0.3532, device='cuda:0')


Epoch 1, Step 778: Loss = 0.1465, Actor Loss = 0.1121, Critic Loss = 0.0688, Entropy = 0.0953, Returns = 1.3244, Value = 1.3638, mask_loss = 0.2625std_loss = 0.3909:  14%|█▍        | 778/5475 [3:29:48<24:28:03, 18.75s/it]  

ratio tensor(0.6477, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5863, device='cuda:0') reward: tensor(1.2180, device='cuda:0') factor: tensor(0.3683, device='cuda:0')
reward_raw: tensor(0.5916, device='cuda:0') reward: tensor(1.2166, device='cuda:0') factor: tensor(0.3750, device='cuda:0')
reward_raw: tensor(0.5942, device='cuda:0') reward: tensor(1.2281, device='cuda:0') factor: tensor(0.3661, device='cuda:0')
reward_raw: tensor(0.5930, device='cuda:0') reward: tensor(1.2224, device='cuda:0') factor: tensor(0.3706, device='cuda:0')
reward_raw: tensor(0.5798, device='cuda:0') reward: tensor(1.2078, device='cuda:0') factor: tensor(0.3720, device='cuda:0')


Epoch 1, Step 779: Loss = 0.1694, Actor Loss = 0.1420, Critic Loss = 0.0549, Entropy = 0.1345, Returns = 1.2270, Value = 1.2972, mask_loss = 0.3966std_loss = 0.4240:  14%|█▍        | 779/5475 [3:30:09<25:04:42, 19.23s/it]

ratio tensor(0.1766, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6811, device='cuda:0') reward: tensor(1.2668, device='cuda:0') factor: tensor(0.4142, device='cuda:0')
reward_raw: tensor(0.6770, device='cuda:0') reward: tensor(1.2578, device='cuda:0') factor: tensor(0.4192, device='cuda:0')
reward_raw: tensor(0.6796, device='cuda:0') reward: tensor(1.2594, device='cuda:0') factor: tensor(0.4201, device='cuda:0')
reward_raw: tensor(0.6732, device='cuda:0') reward: tensor(1.2612, device='cuda:0') factor: tensor(0.4120, device='cuda:0')
reward_raw: tensor(0.6821, device='cuda:0') reward: tensor(1.2668, device='cuda:0') factor: tensor(0.4153, device='cuda:0')


Epoch 1, Step 780: Loss = 0.2069, Actor Loss = 0.1432, Critic Loss = 0.1277, Entropy = 0.1222, Returns = 1.2232, Value = 1.1888, mask_loss = 0.3551std_loss = 0.4307:  14%|█▍        | 780/5475 [3:30:25<23:49:57, 18.27s/it]

ratio tensor(1.1866e-05, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6605, device='cuda:0') reward: tensor(1.2802, device='cuda:0') factor: tensor(0.3803, device='cuda:0')
reward_raw: tensor(0.6588, device='cuda:0') reward: tensor(1.2769, device='cuda:0') factor: tensor(0.3819, device='cuda:0')
reward_raw: tensor(0.6421, device='cuda:0') reward: tensor(1.2638, device='cuda:0') factor: tensor(0.3783, device='cuda:0')
reward_raw: tensor(0.6513, device='cuda:0') reward: tensor(1.2789, device='cuda:0') factor: tensor(0.3723, device='cuda:0')
reward_raw: tensor(0.6668, device='cuda:0') reward: tensor(1.2861, device='cuda:0') factor: tensor(0.3807, device='cuda:0')


Epoch 1, Step 781: Loss = 0.1436, Actor Loss = 0.0856, Critic Loss = 0.1163, Entropy = 0.1218, Returns = 1.3429, Value = 1.3854, mask_loss = 0.3614std_loss = 0.4235:  14%|█▍        | 780/5475 [3:30:38<23:49:57, 18.27s/it]

ratio tensor(1.3576, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 781: Loss = 0.1436, Actor Loss = 0.0856, Critic Loss = 0.1163, Entropy = 0.1218, Returns = 1.3429, Value = 1.3854, mask_loss = 0.3614std_loss = 0.4235:  14%|█▍        | 781/5475 [3:30:39<22:14:43, 17.06s/it]

reward_raw: tensor(0.6049, device='cuda:0') reward: tensor(1.2317, device='cuda:0') factor: tensor(0.3732, device='cuda:0')
reward_raw: tensor(0.6233, device='cuda:0') reward: tensor(1.2511, device='cuda:0') factor: tensor(0.3722, device='cuda:0')
reward_raw: tensor(0.6235, device='cuda:0') reward: tensor(1.2516, device='cuda:0') factor: tensor(0.3719, device='cuda:0')
reward_raw: tensor(0.6139, device='cuda:0') reward: tensor(1.2359, device='cuda:0') factor: tensor(0.3780, device='cuda:0')
reward_raw: tensor(0.6048, device='cuda:0') reward: tensor(1.2375, device='cuda:0') factor: tensor(0.3673, device='cuda:0')


Epoch 1, Step 782: Loss = 0.0411, Actor Loss = 0.0043, Critic Loss = 0.0740, Entropy = 0.1096, Returns = 1.2875, Value = 1.2583, mask_loss = 0.3768std_loss = 0.4298:  14%|█▍        | 782/5475 [3:30:54<21:19:58, 16.36s/it]

ratio tensor(0.4535, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6520, device='cuda:0') reward: tensor(1.2843, device='cuda:0') factor: tensor(0.3677, device='cuda:0')
reward_raw: tensor(0.6556, device='cuda:0') reward: tensor(1.2918, device='cuda:0') factor: tensor(0.3638, device='cuda:0')
reward_raw: tensor(0.6383, device='cuda:0') reward: tensor(1.2744, device='cuda:0') factor: tensor(0.3639, device='cuda:0')
reward_raw: tensor(0.6515, device='cuda:0') reward: tensor(1.2867, device='cuda:0') factor: tensor(0.3647, device='cuda:0')
reward_raw: tensor(0.6599, device='cuda:0') reward: tensor(1.2972, device='cuda:0') factor: tensor(0.3626, device='cuda:0')


Epoch 1, Step 783: Loss = 0.0988, Actor Loss = 0.0634, Critic Loss = 0.0711, Entropy = 0.1243, Returns = 1.2598, Value = 1.3095, mask_loss = 0.3848std_loss = 0.4242:  14%|█▍        | 783/5475 [3:31:11<21:47:12, 16.72s/it]

ratio tensor(1.2368, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6274, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.3119, device='cuda:0')
reward_raw: tensor(0.6232, device='cuda:0') reward: tensor(1.3136, device='cuda:0') factor: tensor(0.3096, device='cuda:0')
reward_raw: tensor(0.6362, device='cuda:0') reward: tensor(1.3237, device='cuda:0') factor: tensor(0.3125, device='cuda:0')
reward_raw: tensor(0.6539, device='cuda:0') reward: tensor(1.3417, device='cuda:0') factor: tensor(0.3123, device='cuda:0')
reward_raw: tensor(0.6366, device='cuda:0') reward: tensor(1.3207, device='cuda:0') factor: tensor(0.3159, device='cuda:0')


Epoch 1, Step 784: Loss = 0.0845, Actor Loss = 0.0617, Critic Loss = 0.0458, Entropy = 0.1053, Returns = 1.3122, Value = 1.3238, mask_loss = 0.3122std_loss = 0.4172:  14%|█▍        | 784/5475 [3:31:27<21:26:18, 16.45s/it]

ratio tensor(0.8829, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6588, device='cuda:0') reward: tensor(1.2457, device='cuda:0') factor: tensor(0.4131, device='cuda:0')
reward_raw: tensor(0.6715, device='cuda:0') reward: tensor(1.2556, device='cuda:0') factor: tensor(0.4159, device='cuda:0')
reward_raw: tensor(0.6553, device='cuda:0') reward: tensor(1.2372, device='cuda:0') factor: tensor(0.4181, device='cuda:0')
reward_raw: tensor(0.6668, device='cuda:0') reward: tensor(1.2539, device='cuda:0') factor: tensor(0.4129, device='cuda:0')
reward_raw: tensor(0.6743, device='cuda:0') reward: tensor(1.2683, device='cuda:0') factor: tensor(0.4060, device='cuda:0')


Epoch 1, Step 785: Loss = -0.0426, Actor Loss = -0.0713, Critic Loss = 0.0578, Entropy = 0.1129, Returns = 1.2814, Value = 1.2582, mask_loss = 0.3843std_loss = 0.4113:  14%|█▍        | 785/5475 [3:31:48<23:19:56, 17.91s/it]

ratio tensor(1.0360, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6197, device='cuda:0') reward: tensor(1.3135, device='cuda:0') factor: tensor(0.3062, device='cuda:0')
reward_raw: tensor(0.6537, device='cuda:0') reward: tensor(1.3524, device='cuda:0') factor: tensor(0.3013, device='cuda:0')
reward_raw: tensor(0.6396, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.3013, device='cuda:0')
reward_raw: tensor(0.6366, device='cuda:0') reward: tensor(1.3366, device='cuda:0') factor: tensor(0.3000, device='cuda:0')
reward_raw: tensor(0.6344, device='cuda:0') reward: tensor(1.3327, device='cuda:0') factor: tensor(0.3018, device='cuda:0')


Epoch 1, Step 786: Loss = -0.0593, Actor Loss = -0.0910, Critic Loss = 0.0638, Entropy = 0.1118, Returns = 1.3349, Value = 1.3660, mask_loss = 0.2954std_loss = 0.4025:  14%|█▍        | 786/5475 [3:32:06<23:13:50, 17.84s/it]

ratio tensor(1.2303, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5483, device='cuda:0') reward: tensor(1.1963, device='cuda:0') factor: tensor(0.3520, device='cuda:0')
reward_raw: tensor(0.5499, device='cuda:0') reward: tensor(1.2121, device='cuda:0') factor: tensor(0.3377, device='cuda:0')
reward_raw: tensor(0.5473, device='cuda:0') reward: tensor(1.2124, device='cuda:0') factor: tensor(0.3349, device='cuda:0')
reward_raw: tensor(0.5488, device='cuda:0') reward: tensor(1.2138, device='cuda:0') factor: tensor(0.3350, device='cuda:0')
reward_raw: tensor(0.5333, device='cuda:0') reward: tensor(1.1915, device='cuda:0') factor: tensor(0.3419, device='cuda:0')


Epoch 1, Step 787: Loss = 0.3867, Actor Loss = 0.3542, Critic Loss = 0.0653, Entropy = 0.1050, Returns = 1.2172, Value = 1.3190, mask_loss = 0.3546std_loss = 0.4307:  14%|█▍        | 787/5475 [3:32:21<22:17:00, 17.11s/it]  

ratio tensor(1.1345, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6616, device='cuda:0') reward: tensor(1.3892, device='cuda:0') factor: tensor(0.2724, device='cuda:0')
reward_raw: tensor(0.6607, device='cuda:0') reward: tensor(1.3866, device='cuda:0') factor: tensor(0.2741, device='cuda:0')
reward_raw: tensor(0.6670, device='cuda:0') reward: tensor(1.3909, device='cuda:0') factor: tensor(0.2761, device='cuda:0')
reward_raw: tensor(0.6363, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.2711, device='cuda:0')
reward_raw: tensor(0.6616, device='cuda:0') reward: tensor(1.3828, device='cuda:0') factor: tensor(0.2788, device='cuda:0')


Epoch 1, Step 788: Loss = 0.0344, Actor Loss = 0.0099, Critic Loss = 0.0492, Entropy = 0.0968, Returns = 1.3508, Value = 1.3077, mask_loss = 0.2965std_loss = 0.3991:  14%|█▍        | 788/5475 [3:32:39<22:41:25, 17.43s/it]

ratio tensor(0.6668, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5860, device='cuda:0') reward: tensor(1.3035, device='cuda:0') factor: tensor(0.2825, device='cuda:0')
reward_raw: tensor(0.5981, device='cuda:0') reward: tensor(1.3077, device='cuda:0') factor: tensor(0.2904, device='cuda:0')
reward_raw: tensor(0.5940, device='cuda:0') reward: tensor(1.3097, device='cuda:0') factor: tensor(0.2843, device='cuda:0')
reward_raw: tensor(0.5988, device='cuda:0') reward: tensor(1.3060, device='cuda:0') factor: tensor(0.2928, device='cuda:0')
reward_raw: tensor(0.5891, device='cuda:0') reward: tensor(1.3022, device='cuda:0') factor: tensor(0.2870, device='cuda:0')


Epoch 1, Step 789: Loss = 0.2329, Actor Loss = 0.1722, Critic Loss = 0.1217, Entropy = 0.1213, Returns = 1.2229, Value = 1.3962, mask_loss = 0.3348std_loss = 0.4158:  14%|█▍        | 789/5475 [3:32:58<22:59:30, 17.66s/it]

ratio tensor(0.9591, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6205, device='cuda:0') reward: tensor(1.2425, device='cuda:0') factor: tensor(0.3781, device='cuda:0')
reward_raw: tensor(0.6181, device='cuda:0') reward: tensor(1.2466, device='cuda:0') factor: tensor(0.3715, device='cuda:0')
reward_raw: tensor(0.6228, device='cuda:0') reward: tensor(1.2500, device='cuda:0') factor: tensor(0.3729, device='cuda:0')
reward_raw: tensor(0.6181, device='cuda:0') reward: tensor(1.2309, device='cuda:0') factor: tensor(0.3872, device='cuda:0')
reward_raw: tensor(0.6375, device='cuda:0') reward: tensor(1.2646, device='cuda:0') factor: tensor(0.3728, device='cuda:0')


Epoch 1, Step 790: Loss = 0.5627, Actor Loss = 0.4959, Critic Loss = 0.1339, Entropy = 0.1398, Returns = 1.2518, Value = 1.3758, mask_loss = 0.3703std_loss = 0.4171:  14%|█▍        | 790/5475 [3:33:09<20:31:09, 15.77s/it]

ratio tensor(2.6142, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6035, device='cuda:0') reward: tensor(1.3499, device='cuda:0') factor: tensor(0.2536, device='cuda:0')
reward_raw: tensor(0.6111, device='cuda:0') reward: tensor(1.3536, device='cuda:0') factor: tensor(0.2575, device='cuda:0')
reward_raw: tensor(0.6064, device='cuda:0') reward: tensor(1.3461, device='cuda:0') factor: tensor(0.2603, device='cuda:0')
reward_raw: tensor(0.5978, device='cuda:0') reward: tensor(1.3445, device='cuda:0') factor: tensor(0.2533, device='cuda:0')
reward_raw: tensor(0.6002, device='cuda:0') reward: tensor(1.3438, device='cuda:0') factor: tensor(0.2564, device='cuda:0')


Epoch 1, Step 791: Loss = 0.0729, Actor Loss = 0.0458, Critic Loss = 0.0543, Entropy = 0.0842, Returns = 1.3737, Value = 1.3038, mask_loss = 0.1833std_loss = 0.3511:  14%|█▍        | 791/5475 [3:33:26<20:54:33, 16.07s/it]

ratio tensor(0.0005, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.6687, device='cuda:0') reward: tensor(1.4093, device='cuda:0') factor: tensor(0.2593, device='cuda:0')
reward_raw: tensor(0.6505, device='cuda:0') reward: tensor(1.3992, device='cuda:0') factor: tensor(0.2513, device='cuda:0')
reward_raw: tensor(0.6662, device='cuda:0') reward: tensor(1.4146, device='cuda:0') factor: tensor(0.2516, device='cuda:0')
reward_raw: tensor(0.6774, device='cuda:0') reward: tensor(1.4228, device='cuda:0') factor: tensor(0.2546, device='cuda:0')
reward_raw: tensor(0.6750, device='cuda:0') reward: tensor(1.4195, device='cuda:0') factor: tensor(0.2555, device='cuda:0')


Epoch 1, Step 792: Loss = 0.0405, Actor Loss = 0.0017, Critic Loss = 0.0780, Entropy = 0.1009, Returns = 1.3487, Value = 1.5430, mask_loss = 0.2016std_loss = 0.3561:  14%|█▍        | 792/5475 [3:33:40<19:59:24, 15.37s/it]

ratio tensor(0.5446, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5763, device='cuda:0') reward: tensor(1.2724, device='cuda:0') factor: tensor(0.3040, device='cuda:0')
reward_raw: tensor(0.5686, device='cuda:0') reward: tensor(1.2713, device='cuda:0') factor: tensor(0.2973, device='cuda:0')
reward_raw: tensor(0.5917, device='cuda:0') reward: tensor(1.2961, device='cuda:0') factor: tensor(0.2956, device='cuda:0')
reward_raw: tensor(0.5806, device='cuda:0') reward: tensor(1.2801, device='cuda:0') factor: tensor(0.3005, device='cuda:0')
reward_raw: tensor(0.5587, device='cuda:0') reward: tensor(1.2613, device='cuda:0') factor: tensor(0.2974, device='cuda:0')


Epoch 1, Step 793: Loss = 0.2189, Actor Loss = 0.2001, Critic Loss = 0.0379, Entropy = 0.0986, Returns = 1.3285, Value = 1.2683, mask_loss = 0.3000std_loss = 0.4116:  14%|█▍        | 793/5475 [3:34:02<22:50:24, 17.56s/it]

ratio tensor(0.5306, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5595, device='cuda:0') reward: tensor(1.3000, device='cuda:0') factor: tensor(0.2595, device='cuda:0')
reward_raw: tensor(0.5607, device='cuda:0') reward: tensor(1.2947, device='cuda:0') factor: tensor(0.2660, device='cuda:0')
reward_raw: tensor(0.5539, device='cuda:0') reward: tensor(1.2937, device='cuda:0') factor: tensor(0.2602, device='cuda:0')
reward_raw: tensor(0.5793, device='cuda:0') reward: tensor(1.3190, device='cuda:0') factor: tensor(0.2603, device='cuda:0')
reward_raw: tensor(0.5550, device='cuda:0') reward: tensor(1.2957, device='cuda:0') factor: tensor(0.2592, device='cuda:0')


Epoch 1, Step 794: Loss = -0.1306, Actor Loss = -0.1729, Critic Loss = 0.0848, Entropy = 0.1182, Returns = 1.2956, Value = 1.2609, mask_loss = 0.2460std_loss = 0.3714:  15%|█▍        | 794/5475 [3:34:19<22:32:50, 17.34s/it]

ratio tensor(1.3120, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5174, device='cuda:0') reward: tensor(1.2968, device='cuda:0') factor: tensor(0.2206, device='cuda:0')
reward_raw: tensor(0.5311, device='cuda:0') reward: tensor(1.3078, device='cuda:0') factor: tensor(0.2233, device='cuda:0')
reward_raw: tensor(0.5141, device='cuda:0') reward: tensor(1.2893, device='cuda:0') factor: tensor(0.2249, device='cuda:0')
reward_raw: tensor(0.5119, device='cuda:0') reward: tensor(1.2869, device='cuda:0') factor: tensor(0.2250, device='cuda:0')
reward_raw: tensor(0.5211, device='cuda:0') reward: tensor(1.2962, device='cuda:0') factor: tensor(0.2248, device='cuda:0')


Epoch 1, Step 795: Loss = -0.0377, Actor Loss = -0.0746, Critic Loss = 0.0741, Entropy = 0.1144, Returns = 1.3370, Value = 1.3262, mask_loss = 0.1933std_loss = 0.3461:  15%|█▍        | 795/5475 [3:34:31<20:31:14, 15.79s/it]

ratio tensor(2.4825, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5824, device='cuda:0') reward: tensor(1.3381, device='cuda:0') factor: tensor(0.2442, device='cuda:0')
reward_raw: tensor(0.5955, device='cuda:0') reward: tensor(1.3322, device='cuda:0') factor: tensor(0.2633, device='cuda:0')
reward_raw: tensor(0.5792, device='cuda:0') reward: tensor(1.3276, device='cuda:0') factor: tensor(0.2516, device='cuda:0')
reward_raw: tensor(0.5727, device='cuda:0') reward: tensor(1.3230, device='cuda:0') factor: tensor(0.2497, device='cuda:0')
reward_raw: tensor(0.5719, device='cuda:0') reward: tensor(1.3141, device='cuda:0') factor: tensor(0.2578, device='cuda:0')


Epoch 1, Step 796: Loss = 0.0270, Actor Loss = -0.0123, Critic Loss = 0.0789, Entropy = 0.1142, Returns = 1.3389, Value = 1.3945, mask_loss = 0.2355std_loss = 0.3772:  15%|█▍        | 796/5475 [3:34:44<19:18:10, 14.85s/it] 

ratio tensor(1.1296, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5984, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.2291, device='cuda:0')
reward_raw: tensor(0.6150, device='cuda:0') reward: tensor(1.3873, device='cuda:0') factor: tensor(0.2277, device='cuda:0')
reward_raw: tensor(0.6229, device='cuda:0') reward: tensor(1.3898, device='cuda:0') factor: tensor(0.2331, device='cuda:0')
reward_raw: tensor(0.6092, device='cuda:0') reward: tensor(1.3859, device='cuda:0') factor: tensor(0.2233, device='cuda:0')
reward_raw: tensor(0.6124, device='cuda:0') reward: tensor(1.3902, device='cuda:0') factor: tensor(0.2222, device='cuda:0')


Epoch 1, Step 797: Loss = 0.0759, Actor Loss = 0.0453, Critic Loss = 0.0614, Entropy = 0.1109, Returns = 1.3785, Value = 1.3672, mask_loss = 0.2397std_loss = 0.3760:  15%|█▍        | 797/5475 [3:35:02<20:30:43, 15.79s/it] 

ratio tensor(0.6310, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.2108, device='cuda:0') factor: tensor(0.2641, device='cuda:0')
reward_raw: tensor(0.4732, device='cuda:0') reward: tensor(1.2115, device='cuda:0') factor: tensor(0.2618, device='cuda:0')
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.2216, device='cuda:0') factor: tensor(0.2712, device='cuda:0')
reward_raw: tensor(0.4704, device='cuda:0') reward: tensor(1.2023, device='cuda:0') factor: tensor(0.2681, device='cuda:0')
reward_raw: tensor(0.4746, device='cuda:0') reward: tensor(1.2054, device='cuda:0') factor: tensor(0.2692, device='cuda:0')


Epoch 1, Step 798: Loss = -0.0462, Actor Loss = -0.0765, Critic Loss = 0.0607, Entropy = 0.1211, Returns = 1.2291, Value = 1.1365, mask_loss = 0.2739std_loss = 0.3984:  15%|█▍        | 798/5475 [3:35:24<22:55:57, 17.65s/it]

ratio tensor(7.1808, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.2413, device='cuda:0') factor: tensor(0.2530, device='cuda:0')
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.2597, device='cuda:0') factor: tensor(0.2539, device='cuda:0')
reward_raw: tensor(0.4942, device='cuda:0') reward: tensor(1.2464, device='cuda:0') factor: tensor(0.2478, device='cuda:0')
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.2478, device='cuda:0') factor: tensor(0.2505, device='cuda:0')
reward_raw: tensor(0.5086, device='cuda:0') reward: tensor(1.2527, device='cuda:0') factor: tensor(0.2559, device='cuda:0')


Epoch 1, Step 799: Loss = 0.1048, Actor Loss = 0.0663, Critic Loss = 0.0773, Entropy = 0.1155, Returns = 1.3056, Value = 1.1791, mask_loss = 0.2822std_loss = 0.3783:  15%|█▍        | 799/5475 [3:35:46<24:38:42, 18.97s/it]  

ratio tensor(0.7449, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5748, device='cuda:0') reward: tensor(1.3393, device='cuda:0') factor: tensor(0.2355, device='cuda:0')
reward_raw: tensor(0.5659, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.2303, device='cuda:0')
reward_raw: tensor(0.5887, device='cuda:0') reward: tensor(1.3476, device='cuda:0') factor: tensor(0.2410, device='cuda:0')
reward_raw: tensor(0.5818, device='cuda:0') reward: tensor(1.3459, device='cuda:0') factor: tensor(0.2359, device='cuda:0')
reward_raw: tensor(0.5847, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.2315, device='cuda:0')


Epoch 1, Step 800: Loss = -0.3568, Actor Loss = -0.3823, Critic Loss = 0.0512, Entropy = 0.1146, Returns = 1.4480, Value = 1.4512, mask_loss = 0.2095std_loss = 0.3611:  15%|█▍        | 800/5475 [3:36:07<25:22:03, 19.53s/it]

ratio tensor(2.5588, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5084, device='cuda:0') reward: tensor(1.2866, device='cuda:0') factor: tensor(0.2218, device='cuda:0')
reward_raw: tensor(0.5277, device='cuda:0') reward: tensor(1.3065, device='cuda:0') factor: tensor(0.2211, device='cuda:0')
reward_raw: tensor(0.5208, device='cuda:0') reward: tensor(1.2971, device='cuda:0') factor: tensor(0.2237, device='cuda:0')
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.2997, device='cuda:0') factor: tensor(0.2198, device='cuda:0')
reward_raw: tensor(0.5089, device='cuda:0') reward: tensor(1.2837, device='cuda:0') factor: tensor(0.2252, device='cuda:0')


Epoch 1, Step 801: Loss = 0.1805, Actor Loss = 0.1338, Critic Loss = 0.0936, Entropy = 0.1158, Returns = 1.2879, Value = 1.3067, mask_loss = 0.2064std_loss = 0.3486:  15%|█▍        | 800/5475 [3:36:21<25:22:03, 19.53s/it]  

ratio tensor(0.9446, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 801: Loss = 0.1805, Actor Loss = 0.1338, Critic Loss = 0.0936, Entropy = 0.1158, Returns = 1.2879, Value = 1.3067, mask_loss = 0.2064std_loss = 0.3486:  15%|█▍        | 801/5475 [3:36:21<23:23:08, 18.01s/it]

reward_raw: tensor(0.5262, device='cuda:0') reward: tensor(1.2586, device='cuda:0') factor: tensor(0.2677, device='cuda:0')
reward_raw: tensor(0.5400, device='cuda:0') reward: tensor(1.2826, device='cuda:0') factor: tensor(0.2574, device='cuda:0')
reward_raw: tensor(0.5455, device='cuda:0') reward: tensor(1.2852, device='cuda:0') factor: tensor(0.2603, device='cuda:0')
reward_raw: tensor(0.5316, device='cuda:0') reward: tensor(1.2666, device='cuda:0') factor: tensor(0.2650, device='cuda:0')
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.2760, device='cuda:0') factor: tensor(0.2591, device='cuda:0')


Epoch 1, Step 802: Loss = 0.1048, Actor Loss = 0.0602, Critic Loss = 0.0895, Entropy = 0.1428, Returns = 1.2555, Value = 1.2008, mask_loss = 0.2704std_loss = 0.3854:  15%|█▍        | 802/5475 [3:36:36<22:14:28, 17.13s/it]

ratio tensor(1.0098, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5473, device='cuda:0') reward: tensor(1.2907, device='cuda:0') factor: tensor(0.2566, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.3064, device='cuda:0') factor: tensor(0.2540, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.2985, device='cuda:0') factor: tensor(0.2619, device='cuda:0')
reward_raw: tensor(0.5431, device='cuda:0') reward: tensor(1.2855, device='cuda:0') factor: tensor(0.2576, device='cuda:0')
reward_raw: tensor(0.5531, device='cuda:0') reward: tensor(1.2912, device='cuda:0') factor: tensor(0.2619, device='cuda:0')


Epoch 1, Step 803: Loss = -0.1730, Actor Loss = -0.2068, Critic Loss = 0.0678, Entropy = 0.1325, Returns = 1.3180, Value = 1.4063, mask_loss = 0.2477std_loss = 0.3778:  15%|█▍        | 803/5475 [3:36:52<21:46:37, 16.78s/it]

ratio tensor(1.5543, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5885, device='cuda:0') reward: tensor(1.3897, device='cuda:0') factor: tensor(0.1988, device='cuda:0')
reward_raw: tensor(0.5692, device='cuda:0') reward: tensor(1.3806, device='cuda:0') factor: tensor(0.1886, device='cuda:0')
reward_raw: tensor(0.5761, device='cuda:0') reward: tensor(1.3815, device='cuda:0') factor: tensor(0.1946, device='cuda:0')
reward_raw: tensor(0.5893, device='cuda:0') reward: tensor(1.3946, device='cuda:0') factor: tensor(0.1947, device='cuda:0')
reward_raw: tensor(0.5781, device='cuda:0') reward: tensor(1.3858, device='cuda:0') factor: tensor(0.1923, device='cuda:0')


Epoch 1, Step 804: Loss = 0.0831, Actor Loss = 0.0495, Critic Loss = 0.0675, Entropy = 0.1090, Returns = 1.4459, Value = 1.5131, mask_loss = 0.2026std_loss = 0.3541:  15%|█▍        | 804/5475 [3:37:07<20:50:34, 16.06s/it]  

ratio tensor(1.1045, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6177, device='cuda:0') reward: tensor(1.3846, device='cuda:0') factor: tensor(0.2331, device='cuda:0')
reward_raw: tensor(0.5855, device='cuda:0') reward: tensor(1.3499, device='cuda:0') factor: tensor(0.2356, device='cuda:0')
reward_raw: tensor(0.6333, device='cuda:0') reward: tensor(1.3966, device='cuda:0') factor: tensor(0.2367, device='cuda:0')
reward_raw: tensor(0.6081, device='cuda:0') reward: tensor(1.3621, device='cuda:0') factor: tensor(0.2460, device='cuda:0')
reward_raw: tensor(0.5899, device='cuda:0') reward: tensor(1.3586, device='cuda:0') factor: tensor(0.2313, device='cuda:0')


Epoch 1, Step 805: Loss = 0.0672, Actor Loss = 0.0066, Critic Loss = 0.1213, Entropy = 0.1207, Returns = 1.4596, Value = 1.3139, mask_loss = 0.2242std_loss = 0.3662:  15%|█▍        | 805/5475 [3:37:29<23:19:12, 17.98s/it]

ratio tensor(0.7376, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.2880, device='cuda:0') factor: tensor(0.2317, device='cuda:0')
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.2556, device='cuda:0') factor: tensor(0.2404, device='cuda:0')
reward_raw: tensor(0.5153, device='cuda:0') reward: tensor(1.2819, device='cuda:0') factor: tensor(0.2334, device='cuda:0')
reward_raw: tensor(0.5031, device='cuda:0') reward: tensor(1.2636, device='cuda:0') factor: tensor(0.2395, device='cuda:0')
reward_raw: tensor(0.5073, device='cuda:0') reward: tensor(1.2726, device='cuda:0') factor: tensor(0.2347, device='cuda:0')


Epoch 1, Step 806: Loss = 0.0837, Actor Loss = 0.0546, Critic Loss = 0.0585, Entropy = 0.1104, Returns = 1.2257, Value = 1.3239, mask_loss = 0.2014std_loss = 0.3505:  15%|█▍        | 806/5475 [3:37:45<22:37:32, 17.45s/it]

ratio tensor(0.9385, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5606, device='cuda:0') reward: tensor(1.3199, device='cuda:0') factor: tensor(0.2407, device='cuda:0')
reward_raw: tensor(0.5301, device='cuda:0') reward: tensor(1.3005, device='cuda:0') factor: tensor(0.2296, device='cuda:0')
reward_raw: tensor(0.5596, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.2302, device='cuda:0')
reward_raw: tensor(0.5348, device='cuda:0') reward: tensor(1.3025, device='cuda:0') factor: tensor(0.2323, device='cuda:0')
reward_raw: tensor(0.5566, device='cuda:0') reward: tensor(1.3163, device='cuda:0') factor: tensor(0.2404, device='cuda:0')


Epoch 1, Step 807: Loss = -0.0725, Actor Loss = -0.1117, Critic Loss = 0.0786, Entropy = 0.1118, Returns = 1.3138, Value = 1.2996, mask_loss = 0.2419std_loss = 0.3728:  15%|█▍        | 807/5475 [3:38:01<21:47:04, 16.80s/it]

ratio tensor(1.2192, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5018, device='cuda:0') reward: tensor(1.2592, device='cuda:0') factor: tensor(0.2426, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.2723, device='cuda:0') factor: tensor(0.2366, device='cuda:0')
reward_raw: tensor(0.5052, device='cuda:0') reward: tensor(1.2599, device='cuda:0') factor: tensor(0.2453, device='cuda:0')
reward_raw: tensor(0.5219, device='cuda:0') reward: tensor(1.2812, device='cuda:0') factor: tensor(0.2406, device='cuda:0')
reward_raw: tensor(0.5123, device='cuda:0') reward: tensor(1.2652, device='cuda:0') factor: tensor(0.2471, device='cuda:0')


Epoch 1, Step 808: Loss = 0.2582, Actor Loss = 0.2097, Critic Loss = 0.0972, Entropy = 0.1488, Returns = 1.2439, Value = 1.3814, mask_loss = 0.2766std_loss = 0.3904:  15%|█▍        | 808/5475 [3:38:13<20:01:10, 15.44s/it]  

ratio tensor(0.7490, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6003, device='cuda:0') reward: tensor(1.3413, device='cuda:0') factor: tensor(0.2590, device='cuda:0')
reward_raw: tensor(0.6293, device='cuda:0') reward: tensor(1.3770, device='cuda:0') factor: tensor(0.2523, device='cuda:0')
reward_raw: tensor(0.6265, device='cuda:0') reward: tensor(1.3645, device='cuda:0') factor: tensor(0.2620, device='cuda:0')
reward_raw: tensor(0.6173, device='cuda:0') reward: tensor(1.3607, device='cuda:0') factor: tensor(0.2566, device='cuda:0')
reward_raw: tensor(0.6160, device='cuda:0') reward: tensor(1.3596, device='cuda:0') factor: tensor(0.2565, device='cuda:0')


Epoch 1, Step 809: Loss = -0.1392, Actor Loss = -0.1774, Critic Loss = 0.0767, Entropy = 0.1192, Returns = 1.3626, Value = 1.4006, mask_loss = 0.2338std_loss = 0.3736:  15%|█▍        | 809/5475 [3:38:26<18:58:49, 14.64s/it]

ratio tensor(1.9903, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5505, device='cuda:0') reward: tensor(1.3451, device='cuda:0') factor: tensor(0.2053, device='cuda:0')
reward_raw: tensor(0.5790, device='cuda:0') reward: tensor(1.3681, device='cuda:0') factor: tensor(0.2109, device='cuda:0')
reward_raw: tensor(0.5531, device='cuda:0') reward: tensor(1.3469, device='cuda:0') factor: tensor(0.2062, device='cuda:0')
reward_raw: tensor(0.5571, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.5493, device='cuda:0') reward: tensor(1.3453, device='cuda:0') factor: tensor(0.2040, device='cuda:0')


Epoch 1, Step 810: Loss = 0.0793, Actor Loss = 0.0057, Critic Loss = 0.1475, Entropy = 0.1003, Returns = 1.3603, Value = 1.4208, mask_loss = 0.1983std_loss = 0.3413:  15%|█▍        | 810/5475 [3:38:47<21:31:02, 16.61s/it]  

ratio tensor(1.4584, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5499, device='cuda:0') reward: tensor(1.3385, device='cuda:0') factor: tensor(0.2115, device='cuda:0')
reward_raw: tensor(0.5633, device='cuda:0') reward: tensor(1.3524, device='cuda:0') factor: tensor(0.2109, device='cuda:0')
reward_raw: tensor(0.5514, device='cuda:0') reward: tensor(1.3377, device='cuda:0') factor: tensor(0.2137, device='cuda:0')
reward_raw: tensor(0.5715, device='cuda:0') reward: tensor(1.3599, device='cuda:0') factor: tensor(0.2116, device='cuda:0')
reward_raw: tensor(0.5759, device='cuda:0') reward: tensor(1.3641, device='cuda:0') factor: tensor(0.2118, device='cuda:0')


Epoch 1, Step 811: Loss = 0.2791, Actor Loss = 0.2457, Critic Loss = 0.0670, Entropy = 0.1150, Returns = 1.3428, Value = 1.2817, mask_loss = 0.2251std_loss = 0.3666:  15%|█▍        | 811/5475 [3:39:00<20:19:40, 15.69s/it]

ratio tensor(0.6675, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5387, device='cuda:0') reward: tensor(1.2626, device='cuda:0') factor: tensor(0.2761, device='cuda:0')
reward_raw: tensor(0.5778, device='cuda:0') reward: tensor(1.3002, device='cuda:0') factor: tensor(0.2776, device='cuda:0')
reward_raw: tensor(0.5460, device='cuda:0') reward: tensor(1.2677, device='cuda:0') factor: tensor(0.2783, device='cuda:0')
reward_raw: tensor(0.5627, device='cuda:0') reward: tensor(1.2797, device='cuda:0') factor: tensor(0.2830, device='cuda:0')
reward_raw: tensor(0.5643, device='cuda:0') reward: tensor(1.2793, device='cuda:0') factor: tensor(0.2850, device='cuda:0')


Epoch 1, Step 812: Loss = -0.1847, Actor Loss = -0.2022, Critic Loss = 0.0351, Entropy = 0.1278, Returns = 1.2550, Value = 1.2392, mask_loss = 0.3120std_loss = 0.4049:  15%|█▍        | 812/5475 [3:39:18<20:54:48, 16.15s/it]

ratio tensor(1.4168, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5799, device='cuda:0') reward: tensor(1.3755, device='cuda:0') factor: tensor(0.2044, device='cuda:0')
reward_raw: tensor(0.5817, device='cuda:0') reward: tensor(1.3732, device='cuda:0') factor: tensor(0.2085, device='cuda:0')
reward_raw: tensor(0.5897, device='cuda:0') reward: tensor(1.3733, device='cuda:0') factor: tensor(0.2164, device='cuda:0')
reward_raw: tensor(0.5703, device='cuda:0') reward: tensor(1.3600, device='cuda:0') factor: tensor(0.2103, device='cuda:0')
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.3445, device='cuda:0') factor: tensor(0.2062, device='cuda:0')


Epoch 1, Step 813: Loss = 0.0772, Actor Loss = 0.0356, Critic Loss = 0.0834, Entropy = 0.0869, Returns = 1.3286, Value = 1.4565, mask_loss = 0.1869std_loss = 0.3433:  15%|█▍        | 813/5475 [3:39:32<20:07:13, 15.54s/it]  

ratio tensor(0.7752, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6482, device='cuda:0') reward: tensor(1.4043, device='cuda:0') factor: tensor(0.2439, device='cuda:0')
reward_raw: tensor(0.6568, device='cuda:0') reward: tensor(1.4148, device='cuda:0') factor: tensor(0.2420, device='cuda:0')
reward_raw: tensor(0.6212, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.2349, device='cuda:0')
reward_raw: tensor(0.6473, device='cuda:0') reward: tensor(1.4055, device='cuda:0') factor: tensor(0.2418, device='cuda:0')
reward_raw: tensor(0.6523, device='cuda:0') reward: tensor(1.4114, device='cuda:0') factor: tensor(0.2409, device='cuda:0')


Epoch 1, Step 814: Loss = 0.0664, Actor Loss = 0.0125, Critic Loss = 0.1081, Entropy = 0.1061, Returns = 1.4244, Value = 1.4773, mask_loss = 0.2295std_loss = 0.3732:  15%|█▍        | 814/5475 [3:39:43<18:18:39, 14.14s/it]

ratio tensor(1.3154, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.3009, device='cuda:0') factor: tensor(0.2127, device='cuda:0')
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.3071, device='cuda:0') factor: tensor(0.2039, device='cuda:0')
reward_raw: tensor(0.5351, device='cuda:0') reward: tensor(1.3282, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(1.3136, device='cuda:0') factor: tensor(0.2080, device='cuda:0')
reward_raw: tensor(0.5193, device='cuda:0') reward: tensor(1.3099, device='cuda:0') factor: tensor(0.2094, device='cuda:0')


Epoch 1, Step 815: Loss = 0.2708, Actor Loss = 0.2511, Critic Loss = 0.0395, Entropy = 0.1030, Returns = 1.2751, Value = 1.2663, mask_loss = 0.2171std_loss = 0.3600:  15%|█▍        | 815/5475 [3:40:00<19:25:39, 15.01s/it]

ratio tensor(0.9710, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5187, device='cuda:0') reward: tensor(1.2941, device='cuda:0') factor: tensor(0.2247, device='cuda:0')
reward_raw: tensor(0.5273, device='cuda:0') reward: tensor(1.2993, device='cuda:0') factor: tensor(0.2280, device='cuda:0')
reward_raw: tensor(0.5095, device='cuda:0') reward: tensor(1.2802, device='cuda:0') factor: tensor(0.2294, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.2851, device='cuda:0') factor: tensor(0.2319, device='cuda:0')
reward_raw: tensor(0.5113, device='cuda:0') reward: tensor(1.2822, device='cuda:0') factor: tensor(0.2292, device='cuda:0')


Epoch 1, Step 816: Loss = 0.1017, Actor Loss = 0.0854, Critic Loss = 0.0327, Entropy = 0.1019, Returns = 1.2633, Value = 1.3328, mask_loss = 0.1950std_loss = 0.3495:  15%|█▍        | 816/5475 [3:40:18<20:32:25, 15.87s/it]

ratio tensor(0.7862, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5698, device='cuda:0') reward: tensor(1.3562, device='cuda:0') factor: tensor(0.2136, device='cuda:0')
reward_raw: tensor(0.5780, device='cuda:0') reward: tensor(1.3569, device='cuda:0') factor: tensor(0.2211, device='cuda:0')
reward_raw: tensor(0.5893, device='cuda:0') reward: tensor(1.3704, device='cuda:0') factor: tensor(0.2189, device='cuda:0')
reward_raw: tensor(0.5682, device='cuda:0') reward: tensor(1.3550, device='cuda:0') factor: tensor(0.2132, device='cuda:0')
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.2210, device='cuda:0')


Epoch 1, Step 817: Loss = 0.0713, Actor Loss = -0.0028, Critic Loss = 0.1486, Entropy = 0.1117, Returns = 1.3509, Value = 1.4802, mask_loss = 0.2272std_loss = 0.3657:  15%|█▍        | 817/5475 [3:40:36<21:43:33, 16.79s/it]

ratio tensor(1.5439, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5608, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.1845, device='cuda:0')
reward_raw: tensor(0.5690, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.1828, device='cuda:0')
reward_raw: tensor(0.5467, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.1789, device='cuda:0')
reward_raw: tensor(0.5482, device='cuda:0') reward: tensor(1.3599, device='cuda:0') factor: tensor(0.1883, device='cuda:0')
reward_raw: tensor(0.5587, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.1786, device='cuda:0')


Epoch 1, Step 818: Loss = 0.0926, Actor Loss = 0.0409, Critic Loss = 0.1036, Entropy = 0.0866, Returns = 1.3300, Value = 1.3870, mask_loss = 0.1943std_loss = 0.3470:  15%|█▍        | 818/5475 [3:40:53<21:37:02, 16.71s/it] 

ratio tensor(1.1872, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5470, device='cuda:0') reward: tensor(1.3289, device='cuda:0') factor: tensor(0.2182, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.3197, device='cuda:0') factor: tensor(0.2235, device='cuda:0')
reward_raw: tensor(0.5565, device='cuda:0') reward: tensor(1.3292, device='cuda:0') factor: tensor(0.2273, device='cuda:0')
reward_raw: tensor(0.5445, device='cuda:0') reward: tensor(1.3216, device='cuda:0') factor: tensor(0.2229, device='cuda:0')
reward_raw: tensor(0.5658, device='cuda:0') reward: tensor(1.3439, device='cuda:0') factor: tensor(0.2219, device='cuda:0')


Epoch 1, Step 819: Loss = 0.4656, Actor Loss = 0.4150, Critic Loss = 0.1014, Entropy = 0.1520, Returns = 1.2199, Value = 1.3942, mask_loss = 0.2550std_loss = 0.3755:  15%|█▍        | 819/5475 [3:41:07<20:45:36, 16.05s/it]

ratio tensor(0.4659, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6085, device='cuda:0') reward: tensor(1.3998, device='cuda:0') factor: tensor(0.2088, device='cuda:0')
reward_raw: tensor(0.5835, device='cuda:0') reward: tensor(1.3747, device='cuda:0') factor: tensor(0.2088, device='cuda:0')
reward_raw: tensor(0.5976, device='cuda:0') reward: tensor(1.3907, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.6151, device='cuda:0') reward: tensor(1.3959, device='cuda:0') factor: tensor(0.2193, device='cuda:0')
reward_raw: tensor(0.5993, device='cuda:0') reward: tensor(1.3844, device='cuda:0') factor: tensor(0.2149, device='cuda:0')


Epoch 1, Step 820: Loss = -0.0500, Actor Loss = -0.1485, Critic Loss = 0.1973, Entropy = 0.1220, Returns = 1.2623, Value = 1.4276, mask_loss = 0.2136std_loss = 0.3491:  15%|█▍        | 820/5475 [3:41:22<20:14:32, 15.65s/it]

ratio tensor(97.2126, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5037, device='cuda:0') reward: tensor(1.2984, device='cuda:0') factor: tensor(0.2053, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.3127, device='cuda:0') factor: tensor(0.2086, device='cuda:0')
reward_raw: tensor(0.5223, device='cuda:0') reward: tensor(1.3126, device='cuda:0') factor: tensor(0.2097, device='cuda:0')
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.3098, device='cuda:0') factor: tensor(0.2117, device='cuda:0')
reward_raw: tensor(0.5091, device='cuda:0') reward: tensor(1.2965, device='cuda:0') factor: tensor(0.2127, device='cuda:0')


Epoch 1, Step 821: Loss = 0.2309, Actor Loss = 0.1948, Critic Loss = 0.0725, Entropy = 0.1119, Returns = 1.2621, Value = 1.3084, mask_loss = 0.2136std_loss = 0.3585:  15%|█▍        | 820/5475 [3:41:45<20:14:32, 15.65s/it]  

ratio tensor(0.7015, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 821: Loss = 0.2309, Actor Loss = 0.1948, Critic Loss = 0.0725, Entropy = 0.1119, Returns = 1.2621, Value = 1.3084, mask_loss = 0.2136std_loss = 0.3585:  15%|█▍        | 821/5475 [3:41:45<22:56:42, 17.75s/it]

reward_raw: tensor(0.6331, device='cuda:0') reward: tensor(1.4091, device='cuda:0') factor: tensor(0.2240, device='cuda:0')
reward_raw: tensor(0.6179, device='cuda:0') reward: tensor(1.3954, device='cuda:0') factor: tensor(0.2225, device='cuda:0')
reward_raw: tensor(0.6217, device='cuda:0') reward: tensor(1.3983, device='cuda:0') factor: tensor(0.2234, device='cuda:0')
reward_raw: tensor(0.6250, device='cuda:0') reward: tensor(1.3968, device='cuda:0') factor: tensor(0.2283, device='cuda:0')
reward_raw: tensor(0.6200, device='cuda:0') reward: tensor(1.4006, device='cuda:0') factor: tensor(0.2195, device='cuda:0')


Epoch 1, Step 822: Loss = -0.0448, Actor Loss = -0.0790, Critic Loss = 0.0686, Entropy = 0.1120, Returns = 1.4494, Value = 1.3335, mask_loss = 0.2057std_loss = 0.3555:  15%|█▌        | 822/5475 [3:42:01<22:21:41, 17.30s/it]

ratio tensor(0.9985, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5963, device='cuda:0') reward: tensor(1.3656, device='cuda:0') factor: tensor(0.2306, device='cuda:0')
reward_raw: tensor(0.5870, device='cuda:0') reward: tensor(1.3593, device='cuda:0') factor: tensor(0.2277, device='cuda:0')
reward_raw: tensor(0.6059, device='cuda:0') reward: tensor(1.3687, device='cuda:0') factor: tensor(0.2372, device='cuda:0')
reward_raw: tensor(0.6114, device='cuda:0') reward: tensor(1.3812, device='cuda:0') factor: tensor(0.2302, device='cuda:0')
reward_raw: tensor(0.5853, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.2272, device='cuda:0')


Epoch 1, Step 823: Loss = 0.0698, Actor Loss = 0.0440, Critic Loss = 0.0517, Entropy = 0.1187, Returns = 1.3611, Value = 1.3569, mask_loss = 0.2218std_loss = 0.3607:  15%|█▌        | 823/5475 [3:42:20<22:50:00, 17.67s/it]  

ratio tensor(0.8123, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5745, device='cuda:0') reward: tensor(1.3051, device='cuda:0') factor: tensor(0.2694, device='cuda:0')
reward_raw: tensor(0.5600, device='cuda:0') reward: tensor(1.2898, device='cuda:0') factor: tensor(0.2703, device='cuda:0')
reward_raw: tensor(0.5907, device='cuda:0') reward: tensor(1.3183, device='cuda:0') factor: tensor(0.2724, device='cuda:0')
reward_raw: tensor(0.5768, device='cuda:0') reward: tensor(1.3063, device='cuda:0') factor: tensor(0.2705, device='cuda:0')
reward_raw: tensor(0.6001, device='cuda:0') reward: tensor(1.3236, device='cuda:0') factor: tensor(0.2765, device='cuda:0')


Epoch 1, Step 824: Loss = -0.0058, Actor Loss = -0.0433, Critic Loss = 0.0753, Entropy = 0.1607, Returns = 1.2451, Value = 1.1995, mask_loss = 0.2891std_loss = 0.3904:  15%|█▌        | 824/5475 [3:42:37<22:39:06, 17.53s/it]

ratio tensor(1.3143, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.2641, device='cuda:0') factor: tensor(0.2572, device='cuda:0')
reward_raw: tensor(0.5238, device='cuda:0') reward: tensor(1.2694, device='cuda:0') factor: tensor(0.2544, device='cuda:0')
reward_raw: tensor(0.5137, device='cuda:0') reward: tensor(1.2592, device='cuda:0') factor: tensor(0.2545, device='cuda:0')
reward_raw: tensor(0.5147, device='cuda:0') reward: tensor(1.2553, device='cuda:0') factor: tensor(0.2595, device='cuda:0')
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.2735, device='cuda:0') factor: tensor(0.2544, device='cuda:0')


Epoch 1, Step 825: Loss = -0.0595, Actor Loss = -0.0916, Critic Loss = 0.0647, Entropy = 0.1507, Returns = 1.2645, Value = 1.2619, mask_loss = 0.2518std_loss = 0.3731:  15%|█▌        | 825/5475 [3:42:58<24:06:36, 18.67s/it]

ratio tensor(1.3511, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6471, device='cuda:0') reward: tensor(1.4009, device='cuda:0') factor: tensor(0.2462, device='cuda:0')
reward_raw: tensor(0.6443, device='cuda:0') reward: tensor(1.3966, device='cuda:0') factor: tensor(0.2477, device='cuda:0')
reward_raw: tensor(0.6484, device='cuda:0') reward: tensor(1.4084, device='cuda:0') factor: tensor(0.2401, device='cuda:0')
reward_raw: tensor(0.6609, device='cuda:0') reward: tensor(1.4135, device='cuda:0') factor: tensor(0.2474, device='cuda:0')
reward_raw: tensor(0.6575, device='cuda:0') reward: tensor(1.4155, device='cuda:0') factor: tensor(0.2420, device='cuda:0')


Epoch 1, Step 826: Loss = -0.0400, Actor Loss = -0.0906, Critic Loss = 0.1014, Entropy = 0.1395, Returns = 1.4295, Value = 1.4327, mask_loss = 0.2663std_loss = 0.3856:  15%|█▌        | 826/5475 [3:43:17<24:14:23, 18.77s/it]

ratio tensor(1.0439, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5682, device='cuda:0') reward: tensor(1.2994, device='cuda:0') factor: tensor(0.2688, device='cuda:0')
reward_raw: tensor(0.5974, device='cuda:0') reward: tensor(1.3299, device='cuda:0') factor: tensor(0.2675, device='cuda:0')
reward_raw: tensor(0.5929, device='cuda:0') reward: tensor(1.3374, device='cuda:0') factor: tensor(0.2555, device='cuda:0')
reward_raw: tensor(0.5755, device='cuda:0') reward: tensor(1.3188, device='cuda:0') factor: tensor(0.2567, device='cuda:0')
reward_raw: tensor(0.6140, device='cuda:0') reward: tensor(1.3536, device='cuda:0') factor: tensor(0.2604, device='cuda:0')


Epoch 1, Step 827: Loss = -0.0067, Actor Loss = -0.0389, Critic Loss = 0.0647, Entropy = 0.1568, Returns = 1.3519, Value = 1.2429, mask_loss = 0.2457std_loss = 0.3565:  15%|█▌        | 827/5475 [3:43:32<22:31:30, 17.45s/it]

ratio tensor(1.0551, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6176, device='cuda:0') reward: tensor(1.3462, device='cuda:0') factor: tensor(0.2714, device='cuda:0')
reward_raw: tensor(0.5975, device='cuda:0') reward: tensor(1.3215, device='cuda:0') factor: tensor(0.2760, device='cuda:0')
reward_raw: tensor(0.5804, device='cuda:0') reward: tensor(1.3101, device='cuda:0') factor: tensor(0.2703, device='cuda:0')
reward_raw: tensor(0.5969, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.2726, device='cuda:0')
reward_raw: tensor(0.5941, device='cuda:0') reward: tensor(1.3184, device='cuda:0') factor: tensor(0.2756, device='cuda:0')


Epoch 1, Step 828: Loss = 0.2892, Actor Loss = 0.2088, Critic Loss = 0.1611, Entropy = 0.1347, Returns = 1.2739, Value = 1.3984, mask_loss = 0.2921std_loss = 0.3972:  15%|█▌        | 828/5475 [3:43:47<21:54:46, 16.98s/it]  

ratio tensor(0.5182, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5643, device='cuda:0') reward: tensor(1.2737, device='cuda:0') factor: tensor(0.2907, device='cuda:0')
reward_raw: tensor(0.5617, device='cuda:0') reward: tensor(1.2686, device='cuda:0') factor: tensor(0.2931, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.2606, device='cuda:0') factor: tensor(0.2810, device='cuda:0')
reward_raw: tensor(0.5476, device='cuda:0') reward: tensor(1.2710, device='cuda:0') factor: tensor(0.2765, device='cuda:0')
reward_raw: tensor(0.5462, device='cuda:0') reward: tensor(1.2669, device='cuda:0') factor: tensor(0.2794, device='cuda:0')


Epoch 1, Step 829: Loss = 0.1701, Actor Loss = 0.1320, Critic Loss = 0.0765, Entropy = 0.1925, Returns = 1.0867, Value = 1.2012, mask_loss = 0.3556std_loss = 0.3955:  15%|█▌        | 829/5475 [3:43:59<19:52:38, 15.40s/it]

ratio tensor(0.7119, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5309, device='cuda:0') reward: tensor(1.3070, device='cuda:0') factor: tensor(0.2239, device='cuda:0')
reward_raw: tensor(0.5063, device='cuda:0') reward: tensor(1.2932, device='cuda:0') factor: tensor(0.2131, device='cuda:0')
reward_raw: tensor(0.5057, device='cuda:0') reward: tensor(1.2899, device='cuda:0') factor: tensor(0.2159, device='cuda:0')
reward_raw: tensor(0.4905, device='cuda:0') reward: tensor(1.2700, device='cuda:0') factor: tensor(0.2205, device='cuda:0')
reward_raw: tensor(0.5022, device='cuda:0') reward: tensor(1.2876, device='cuda:0') factor: tensor(0.2146, device='cuda:0')


Epoch 1, Step 830: Loss = -0.1480, Actor Loss = -0.2253, Critic Loss = 0.1548, Entropy = 0.1171, Returns = 1.3115, Value = 1.3280, mask_loss = 0.2121std_loss = 0.3531:  15%|█▌        | 830/5475 [3:44:12<19:02:50, 14.76s/it]

ratio tensor(1.1660, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6444, device='cuda:0') reward: tensor(1.3520, device='cuda:0') factor: tensor(0.2924, device='cuda:0')
reward_raw: tensor(0.6177, device='cuda:0') reward: tensor(1.3265, device='cuda:0') factor: tensor(0.2912, device='cuda:0')
reward_raw: tensor(0.6282, device='cuda:0') reward: tensor(1.3457, device='cuda:0') factor: tensor(0.2826, device='cuda:0')
reward_raw: tensor(0.6253, device='cuda:0') reward: tensor(1.3358, device='cuda:0') factor: tensor(0.2895, device='cuda:0')
reward_raw: tensor(0.6185, device='cuda:0') reward: tensor(1.3275, device='cuda:0') factor: tensor(0.2910, device='cuda:0')


Epoch 1, Step 831: Loss = 0.1587, Actor Loss = 0.1113, Critic Loss = 0.0952, Entropy = 0.1254, Returns = 1.3037, Value = 1.2224, mask_loss = 0.2774std_loss = 0.3899:  15%|█▌        | 831/5475 [3:44:30<20:12:28, 15.66s/it]  

ratio tensor(0.6900, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5012, device='cuda:0') reward: tensor(1.2809, device='cuda:0') factor: tensor(0.2203, device='cuda:0')
reward_raw: tensor(0.4676, device='cuda:0') reward: tensor(1.2474, device='cuda:0') factor: tensor(0.2202, device='cuda:0')
reward_raw: tensor(0.5043, device='cuda:0') reward: tensor(1.2841, device='cuda:0') factor: tensor(0.2202, device='cuda:0')
reward_raw: tensor(0.5060, device='cuda:0') reward: tensor(1.2935, device='cuda:0') factor: tensor(0.2125, device='cuda:0')
reward_raw: tensor(0.5154, device='cuda:0') reward: tensor(1.2942, device='cuda:0') factor: tensor(0.2212, device='cuda:0')


Epoch 1, Step 832: Loss = -0.0271, Actor Loss = -0.0669, Critic Loss = 0.0799, Entropy = 0.1322, Returns = 1.3303, Value = 1.2241, mask_loss = 0.2145std_loss = 0.3580:  15%|█▌        | 832/5475 [3:44:45<19:52:31, 15.41s/it]

ratio tensor(1.4991, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5495, device='cuda:0') reward: tensor(1.3037, device='cuda:0') factor: tensor(0.2458, device='cuda:0')
reward_raw: tensor(0.5384, device='cuda:0') reward: tensor(1.2862, device='cuda:0') factor: tensor(0.2522, device='cuda:0')
reward_raw: tensor(0.5516, device='cuda:0') reward: tensor(1.3016, device='cuda:0') factor: tensor(0.2500, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.2987, device='cuda:0') factor: tensor(0.2534, device='cuda:0')
reward_raw: tensor(0.5462, device='cuda:0') reward: tensor(1.2950, device='cuda:0') factor: tensor(0.2513, device='cuda:0')


Epoch 1, Step 833: Loss = -0.1647, Actor Loss = -0.2243, Critic Loss = 0.1194, Entropy = 0.1275, Returns = 1.2181, Value = 1.2297, mask_loss = 0.3040std_loss = 0.4069:  15%|█▌        | 833/5475 [3:45:04<21:18:00, 16.52s/it]

ratio tensor(0.9567, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5988, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.2497, device='cuda:0')
reward_raw: tensor(0.5527, device='cuda:0') reward: tensor(1.3154, device='cuda:0') factor: tensor(0.2373, device='cuda:0')
reward_raw: tensor(0.5931, device='cuda:0') reward: tensor(1.3530, device='cuda:0') factor: tensor(0.2401, device='cuda:0')
reward_raw: tensor(0.5752, device='cuda:0') reward: tensor(1.3277, device='cuda:0') factor: tensor(0.2476, device='cuda:0')
reward_raw: tensor(0.5680, device='cuda:0') reward: tensor(1.3246, device='cuda:0') factor: tensor(0.2434, device='cuda:0')


Epoch 1, Step 834: Loss = 0.2096, Actor Loss = 0.1647, Critic Loss = 0.0901, Entropy = 0.1393, Returns = 1.3222, Value = 1.4298, mask_loss = 0.2693std_loss = 0.3851:  15%|█▌        | 834/5475 [3:45:17<19:56:05, 15.46s/it]  

ratio tensor(1.3118, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5847, device='cuda:0') reward: tensor(1.3299, device='cuda:0') factor: tensor(0.2548, device='cuda:0')
reward_raw: tensor(0.5675, device='cuda:0') reward: tensor(1.3174, device='cuda:0') factor: tensor(0.2500, device='cuda:0')
reward_raw: tensor(0.5657, device='cuda:0') reward: tensor(1.3170, device='cuda:0') factor: tensor(0.2487, device='cuda:0')
reward_raw: tensor(0.5648, device='cuda:0') reward: tensor(1.3101, device='cuda:0') factor: tensor(0.2546, device='cuda:0')
reward_raw: tensor(0.5572, device='cuda:0') reward: tensor(1.3055, device='cuda:0') factor: tensor(0.2517, device='cuda:0')


Epoch 1, Step 835: Loss = 0.2565, Actor Loss = 0.2173, Critic Loss = 0.0785, Entropy = 0.1074, Returns = 1.3774, Value = 1.2757, mask_loss = 0.2349std_loss = 0.3731:  15%|█▌        | 835/5475 [3:45:31<19:15:57, 14.95s/it]

ratio tensor(1.1751, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6278, device='cuda:0') reward: tensor(1.3869, device='cuda:0') factor: tensor(0.2409, device='cuda:0')
reward_raw: tensor(0.5967, device='cuda:0') reward: tensor(1.3497, device='cuda:0') factor: tensor(0.2470, device='cuda:0')
reward_raw: tensor(0.5937, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.2513, device='cuda:0')
reward_raw: tensor(0.5996, device='cuda:0') reward: tensor(1.3510, device='cuda:0') factor: tensor(0.2486, device='cuda:0')
reward_raw: tensor(0.6094, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.2436, device='cuda:0')


Epoch 1, Step 836: Loss = -0.0252, Actor Loss = -0.0679, Critic Loss = 0.0856, Entropy = 0.1445, Returns = 1.2891, Value = 1.4062, mask_loss = 0.2501std_loss = 0.3740:  15%|█▌        | 836/5475 [3:45:51<21:12:43, 16.46s/it]

ratio tensor(1.4531, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.2953, device='cuda:0') factor: tensor(0.2174, device='cuda:0')
reward_raw: tensor(0.5086, device='cuda:0') reward: tensor(1.2932, device='cuda:0') factor: tensor(0.2154, device='cuda:0')
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.2991, device='cuda:0') factor: tensor(0.2124, device='cuda:0')
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.2134, device='cuda:0')
reward_raw: tensor(0.5087, device='cuda:0') reward: tensor(1.2897, device='cuda:0') factor: tensor(0.2189, device='cuda:0')


Epoch 1, Step 837: Loss = 0.2294, Actor Loss = 0.1709, Critic Loss = 0.1173, Entropy = 0.1077, Returns = 1.2569, Value = 1.2556, mask_loss = 0.2167std_loss = 0.3625:  15%|█▌        | 837/5475 [3:46:11<22:40:57, 17.61s/it]  

ratio tensor(0.7838, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5518, device='cuda:0') reward: tensor(1.3210, device='cuda:0') factor: tensor(0.2309, device='cuda:0')
reward_raw: tensor(0.5540, device='cuda:0') reward: tensor(1.3155, device='cuda:0') factor: tensor(0.2385, device='cuda:0')
reward_raw: tensor(0.5609, device='cuda:0') reward: tensor(1.3276, device='cuda:0') factor: tensor(0.2333, device='cuda:0')
reward_raw: tensor(0.5546, device='cuda:0') reward: tensor(1.3204, device='cuda:0') factor: tensor(0.2342, device='cuda:0')
reward_raw: tensor(0.5523, device='cuda:0') reward: tensor(1.3224, device='cuda:0') factor: tensor(0.2299, device='cuda:0')


Epoch 1, Step 838: Loss = 0.0057, Actor Loss = -0.0304, Critic Loss = 0.0724, Entropy = 0.1007, Returns = 1.3008, Value = 1.2907, mask_loss = 0.2178std_loss = 0.3631:  15%|█▌        | 838/5475 [3:46:25<21:15:50, 16.51s/it]

ratio tensor(1.0175, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5076, device='cuda:0') reward: tensor(1.3125, device='cuda:0') factor: tensor(0.1951, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.3199, device='cuda:0') factor: tensor(0.1889, device='cuda:0')
reward_raw: tensor(0.5166, device='cuda:0') reward: tensor(1.3253, device='cuda:0') factor: tensor(0.1914, device='cuda:0')
reward_raw: tensor(0.5073, device='cuda:0') reward: tensor(1.3193, device='cuda:0') factor: tensor(0.1879, device='cuda:0')
reward_raw: tensor(0.5150, device='cuda:0') reward: tensor(1.3240, device='cuda:0') factor: tensor(0.1909, device='cuda:0')


Epoch 1, Step 839: Loss = -0.0719, Actor Loss = -0.1276, Critic Loss = 0.1116, Entropy = 0.1025, Returns = 1.2839, Value = 1.2920, mask_loss = 0.1974std_loss = 0.3471:  15%|█▌        | 839/5475 [3:46:42<21:31:31, 16.72s/it]

ratio tensor(1.8101, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4618, device='cuda:0') reward: tensor(1.2387, device='cuda:0') factor: tensor(0.2231, device='cuda:0')
reward_raw: tensor(0.4619, device='cuda:0') reward: tensor(1.2380, device='cuda:0') factor: tensor(0.2239, device='cuda:0')
reward_raw: tensor(0.4595, device='cuda:0') reward: tensor(1.2458, device='cuda:0') factor: tensor(0.2137, device='cuda:0')
reward_raw: tensor(0.4769, device='cuda:0') reward: tensor(1.2516, device='cuda:0') factor: tensor(0.2253, device='cuda:0')
reward_raw: tensor(0.4719, device='cuda:0') reward: tensor(1.2552, device='cuda:0') factor: tensor(0.2167, device='cuda:0')


Epoch 1, Step 840: Loss = -0.0885, Actor Loss = -0.1325, Critic Loss = 0.0883, Entropy = 0.1111, Returns = 1.2627, Value = 1.2228, mask_loss = 0.2265std_loss = 0.3629:  15%|█▌        | 840/5475 [3:46:58<21:03:37, 16.36s/it]

ratio tensor(1.2472, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6491, device='cuda:0') reward: tensor(1.4273, device='cuda:0') factor: tensor(0.2218, device='cuda:0')
reward_raw: tensor(0.6290, device='cuda:0') reward: tensor(1.4111, device='cuda:0') factor: tensor(0.2178, device='cuda:0')
reward_raw: tensor(0.5965, device='cuda:0') reward: tensor(1.3793, device='cuda:0') factor: tensor(0.2172, device='cuda:0')
reward_raw: tensor(0.6124, device='cuda:0') reward: tensor(1.3984, device='cuda:0') factor: tensor(0.2140, device='cuda:0')
reward_raw: tensor(0.6205, device='cuda:0') reward: tensor(1.4013, device='cuda:0') factor: tensor(0.2192, device='cuda:0')


Epoch 1, Step 841: Loss = 0.0297, Actor Loss = -0.0286, Critic Loss = 0.1170, Entropy = 0.1149, Returns = 1.3754, Value = 1.4369, mask_loss = 0.2065std_loss = 0.3536:  15%|█▌        | 840/5475 [3:47:18<21:03:37, 16.36s/it] 

ratio tensor(1.1199, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 841: Loss = 0.0297, Actor Loss = -0.0286, Critic Loss = 0.1170, Entropy = 0.1149, Returns = 1.3754, Value = 1.4369, mask_loss = 0.2065std_loss = 0.3536:  15%|█▌        | 841/5475 [3:47:18<22:41:01, 17.62s/it]

reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.3092, device='cuda:0') factor: tensor(0.2484, device='cuda:0')
reward_raw: tensor(0.5774, device='cuda:0') reward: tensor(1.3268, device='cuda:0') factor: tensor(0.2506, device='cuda:0')
reward_raw: tensor(0.5456, device='cuda:0') reward: tensor(1.2979, device='cuda:0') factor: tensor(0.2477, device='cuda:0')
reward_raw: tensor(0.5374, device='cuda:0') reward: tensor(1.2857, device='cuda:0') factor: tensor(0.2517, device='cuda:0')
reward_raw: tensor(0.5611, device='cuda:0') reward: tensor(1.3148, device='cuda:0') factor: tensor(0.2463, device='cuda:0')


Epoch 1, Step 842: Loss = 0.1064, Actor Loss = 0.0752, Critic Loss = 0.0626, Entropy = 0.0943, Returns = 1.3640, Value = 1.2592, mask_loss = 0.1881std_loss = 0.3407:  15%|█▌        | 842/5475 [3:47:35<22:21:05, 17.37s/it] 

ratio tensor(1.2912, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4351, device='cuda:0') reward: tensor(1.2238, device='cuda:0') factor: tensor(0.2113, device='cuda:0')
reward_raw: tensor(0.4494, device='cuda:0') reward: tensor(1.2331, device='cuda:0') factor: tensor(0.2163, device='cuda:0')
reward_raw: tensor(0.4686, device='cuda:0') reward: tensor(1.2560, device='cuda:0') factor: tensor(0.2127, device='cuda:0')
reward_raw: tensor(0.4560, device='cuda:0') reward: tensor(1.2413, device='cuda:0') factor: tensor(0.2147, device='cuda:0')
reward_raw: tensor(0.4576, device='cuda:0') reward: tensor(1.2379, device='cuda:0') factor: tensor(0.2198, device='cuda:0')


Epoch 1, Step 843: Loss = 0.0749, Actor Loss = 0.0418, Critic Loss = 0.0663, Entropy = 0.0916, Returns = 1.2590, Value = 1.1412, mask_loss = 0.2095std_loss = 0.3692:  15%|█▌        | 843/5475 [3:47:49<21:00:00, 16.32s/it]

ratio tensor(0.7441, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5901, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.2101, device='cuda:0')
reward_raw: tensor(0.6026, device='cuda:0') reward: tensor(1.3978, device='cuda:0') factor: tensor(0.2048, device='cuda:0')
reward_raw: tensor(0.6087, device='cuda:0') reward: tensor(1.4002, device='cuda:0') factor: tensor(0.2086, device='cuda:0')
reward_raw: tensor(0.6005, device='cuda:0') reward: tensor(1.3948, device='cuda:0') factor: tensor(0.2057, device='cuda:0')
reward_raw: tensor(0.5974, device='cuda:0') reward: tensor(1.3877, device='cuda:0') factor: tensor(0.2097, device='cuda:0')


Epoch 1, Step 844: Loss = -0.3315, Actor Loss = -0.3676, Critic Loss = 0.0726, Entropy = 0.1194, Returns = 1.4617, Value = 1.5141, mask_loss = 0.2251std_loss = 0.3720:  15%|█▌        | 844/5475 [3:48:05<20:45:30, 16.14s/it]

ratio tensor(49.1352, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6647, device='cuda:0') reward: tensor(1.4099, device='cuda:0') factor: tensor(0.2548, device='cuda:0')
reward_raw: tensor(0.6494, device='cuda:0') reward: tensor(1.3918, device='cuda:0') factor: tensor(0.2576, device='cuda:0')
reward_raw: tensor(0.6631, device='cuda:0') reward: tensor(1.4022, device='cuda:0') factor: tensor(0.2609, device='cuda:0')
reward_raw: tensor(0.6675, device='cuda:0') reward: tensor(1.4147, device='cuda:0') factor: tensor(0.2528, device='cuda:0')
reward_raw: tensor(0.6724, device='cuda:0') reward: tensor(1.4210, device='cuda:0') factor: tensor(0.2514, device='cuda:0')


Epoch 1, Step 845: Loss = 0.0922, Actor Loss = 0.0772, Critic Loss = 0.0303, Entropy = 0.1185, Returns = 1.4199, Value = 1.4052, mask_loss = 0.2857std_loss = 0.3864:  15%|█▌        | 845/5475 [3:48:18<19:42:59, 15.33s/it]  

ratio tensor(0.8652, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6106, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.2683, device='cuda:0')
reward_raw: tensor(0.5958, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.2676, device='cuda:0')
reward_raw: tensor(0.5804, device='cuda:0') reward: tensor(1.3152, device='cuda:0') factor: tensor(0.2651, device='cuda:0')
reward_raw: tensor(0.5883, device='cuda:0') reward: tensor(1.3178, device='cuda:0') factor: tensor(0.2705, device='cuda:0')
reward_raw: tensor(0.5984, device='cuda:0') reward: tensor(1.3337, device='cuda:0') factor: tensor(0.2647, device='cuda:0')


Epoch 1, Step 846: Loss = -0.0696, Actor Loss = -0.1220, Critic Loss = 0.1051, Entropy = 0.1519, Returns = 1.3907, Value = 1.1609, mask_loss = 0.2475std_loss = 0.3543:  15%|█▌        | 846/5475 [3:48:36<20:30:12, 15.95s/it]

ratio tensor(2.3707, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5715, device='cuda:0') reward: tensor(1.3442, device='cuda:0') factor: tensor(0.2272, device='cuda:0')
reward_raw: tensor(0.5578, device='cuda:0') reward: tensor(1.3338, device='cuda:0') factor: tensor(0.2239, device='cuda:0')
reward_raw: tensor(0.5548, device='cuda:0') reward: tensor(1.3234, device='cuda:0') factor: tensor(0.2314, device='cuda:0')
reward_raw: tensor(0.5491, device='cuda:0') reward: tensor(1.3338, device='cuda:0') factor: tensor(0.2153, device='cuda:0')
reward_raw: tensor(0.5528, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.2268, device='cuda:0')


Epoch 1, Step 847: Loss = 0.0701, Actor Loss = 0.0416, Critic Loss = 0.0573, Entropy = 0.1401, Returns = 1.3343, Value = 1.3953, mask_loss = 0.2368std_loss = 0.3691:  15%|█▌        | 847/5475 [3:48:53<20:56:37, 16.29s/it]  

ratio tensor(1.0528, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5103, device='cuda:0') reward: tensor(1.2606, device='cuda:0') factor: tensor(0.2497, device='cuda:0')
reward_raw: tensor(0.5162, device='cuda:0') reward: tensor(1.2598, device='cuda:0') factor: tensor(0.2564, device='cuda:0')
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.2592, device='cuda:0') factor: tensor(0.2563, device='cuda:0')
reward_raw: tensor(0.5138, device='cuda:0') reward: tensor(1.2686, device='cuda:0') factor: tensor(0.2452, device='cuda:0')
reward_raw: tensor(0.4944, device='cuda:0') reward: tensor(1.2464, device='cuda:0') factor: tensor(0.2479, device='cuda:0')


Epoch 1, Step 848: Loss = 0.1402, Actor Loss = 0.1170, Critic Loss = 0.0467, Entropy = 0.1319, Returns = 1.2523, Value = 1.2020, mask_loss = 0.2510std_loss = 0.3605:  15%|█▌        | 848/5475 [3:49:07<20:03:33, 15.61s/it]

ratio tensor(0.5168, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5497, device='cuda:0') reward: tensor(1.2700, device='cuda:0') factor: tensor(0.2797, device='cuda:0')
reward_raw: tensor(0.5320, device='cuda:0') reward: tensor(1.2557, device='cuda:0') factor: tensor(0.2764, device='cuda:0')
reward_raw: tensor(0.5278, device='cuda:0') reward: tensor(1.2567, device='cuda:0') factor: tensor(0.2712, device='cuda:0')
reward_raw: tensor(0.5552, device='cuda:0') reward: tensor(1.2759, device='cuda:0') factor: tensor(0.2793, device='cuda:0')
reward_raw: tensor(0.5557, device='cuda:0') reward: tensor(1.2805, device='cuda:0') factor: tensor(0.2751, device='cuda:0')


Epoch 1, Step 849: Loss = -0.0097, Actor Loss = -0.0336, Critic Loss = 0.0481, Entropy = 0.1675, Returns = 1.2427, Value = 1.3037, mask_loss = 0.3371std_loss = 0.3978:  16%|█▌        | 849/5475 [3:49:19<18:43:32, 14.57s/it]

ratio tensor(0.9871, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.3017, device='cuda:0') factor: tensor(0.2642, device='cuda:0')
reward_raw: tensor(0.5822, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.2666, device='cuda:0')
reward_raw: tensor(0.5600, device='cuda:0') reward: tensor(1.2963, device='cuda:0') factor: tensor(0.2637, device='cuda:0')
reward_raw: tensor(0.5707, device='cuda:0') reward: tensor(1.3044, device='cuda:0') factor: tensor(0.2663, device='cuda:0')
reward_raw: tensor(0.5796, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.2634, device='cuda:0')


Epoch 1, Step 850: Loss = 0.0359, Actor Loss = -0.0009, Critic Loss = 0.0738, Entropy = 0.1382, Returns = 1.1431, Value = 1.3329, mask_loss = 0.3222std_loss = 0.4070:  16%|█▌        | 850/5475 [3:49:37<20:09:20, 15.69s/it] 

ratio tensor(0.8100, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5296, device='cuda:0') reward: tensor(1.3179, device='cuda:0') factor: tensor(0.2117, device='cuda:0')
reward_raw: tensor(0.5383, device='cuda:0') reward: tensor(1.3220, device='cuda:0') factor: tensor(0.2163, device='cuda:0')
reward_raw: tensor(0.5448, device='cuda:0') reward: tensor(1.3286, device='cuda:0') factor: tensor(0.2162, device='cuda:0')
reward_raw: tensor(0.5547, device='cuda:0') reward: tensor(1.3393, device='cuda:0') factor: tensor(0.2154, device='cuda:0')
reward_raw: tensor(0.5531, device='cuda:0') reward: tensor(1.3303, device='cuda:0') factor: tensor(0.2228, device='cuda:0')


Epoch 1, Step 851: Loss = 0.0758, Actor Loss = 0.0288, Critic Loss = 0.0944, Entropy = 0.1230, Returns = 1.3521, Value = 1.4052, mask_loss = 0.2255std_loss = 0.3654:  16%|█▌        | 851/5475 [3:49:51<19:20:05, 15.05s/it] 

ratio tensor(1.1290, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.6068, device='cuda:0') reward: tensor(1.3507, device='cuda:0') factor: tensor(0.2561, device='cuda:0')
reward_raw: tensor(0.5791, device='cuda:0') reward: tensor(1.3254, device='cuda:0') factor: tensor(0.2537, device='cuda:0')
reward_raw: tensor(0.6128, device='cuda:0') reward: tensor(1.3578, device='cuda:0') factor: tensor(0.2551, device='cuda:0')
reward_raw: tensor(0.6317, device='cuda:0') reward: tensor(1.3725, device='cuda:0') factor: tensor(0.2593, device='cuda:0')
reward_raw: tensor(0.6161, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.2579, device='cuda:0')


Epoch 1, Step 852: Loss = 0.0136, Actor Loss = -0.0018, Critic Loss = 0.0311, Entropy = 0.1156, Returns = 1.3604, Value = 1.4210, mask_loss = 0.2270std_loss = 0.3657:  16%|█▌        | 852/5475 [3:50:08<20:19:20, 15.83s/it]

ratio tensor(1.3690, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5768, device='cuda:0') reward: tensor(1.2947, device='cuda:0') factor: tensor(0.2820, device='cuda:0')
reward_raw: tensor(0.5677, device='cuda:0') reward: tensor(1.2892, device='cuda:0') factor: tensor(0.2784, device='cuda:0')
reward_raw: tensor(0.5774, device='cuda:0') reward: tensor(1.2927, device='cuda:0') factor: tensor(0.2847, device='cuda:0')
reward_raw: tensor(0.5473, device='cuda:0') reward: tensor(1.2636, device='cuda:0') factor: tensor(0.2837, device='cuda:0')
reward_raw: tensor(0.5414, device='cuda:0') reward: tensor(1.2686, device='cuda:0') factor: tensor(0.2728, device='cuda:0')


Epoch 1, Step 853: Loss = 0.2627, Actor Loss = 0.2254, Critic Loss = 0.0747, Entropy = 0.1267, Returns = 1.3110, Value = 1.3503, mask_loss = 0.2799std_loss = 0.3952:  16%|█▌        | 853/5475 [3:50:20<18:38:45, 14.52s/it] 

ratio tensor(0.4799, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5623, device='cuda:0') reward: tensor(1.3161, device='cuda:0') factor: tensor(0.2462, device='cuda:0')
reward_raw: tensor(0.5726, device='cuda:0') reward: tensor(1.3232, device='cuda:0') factor: tensor(0.2494, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.2969, device='cuda:0') factor: tensor(0.2445, device='cuda:0')
reward_raw: tensor(0.5646, device='cuda:0') reward: tensor(1.3167, device='cuda:0') factor: tensor(0.2479, device='cuda:0')
reward_raw: tensor(0.5840, device='cuda:0') reward: tensor(1.3319, device='cuda:0') factor: tensor(0.2521, device='cuda:0')


Epoch 1, Step 854: Loss = 0.0278, Actor Loss = -0.0252, Critic Loss = 0.1061, Entropy = 0.1227, Returns = 1.2660, Value = 1.4170, mask_loss = 0.2305std_loss = 0.3617:  16%|█▌        | 854/5475 [3:50:35<19:00:20, 14.81s/it]

ratio tensor(1.3393, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5821, device='cuda:0') reward: tensor(1.3522, device='cuda:0') factor: tensor(0.2299, device='cuda:0')
reward_raw: tensor(0.5759, device='cuda:0') reward: tensor(1.3389, device='cuda:0') factor: tensor(0.2370, device='cuda:0')
reward_raw: tensor(0.5733, device='cuda:0') reward: tensor(1.3332, device='cuda:0') factor: tensor(0.2401, device='cuda:0')
reward_raw: tensor(0.5891, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.2402, device='cuda:0')
reward_raw: tensor(0.5956, device='cuda:0') reward: tensor(1.3569, device='cuda:0') factor: tensor(0.2387, device='cuda:0')


Epoch 1, Step 855: Loss = -0.0668, Actor Loss = -0.0988, Critic Loss = 0.0641, Entropy = 0.1083, Returns = 1.2768, Value = 1.2867, mask_loss = 0.2775std_loss = 0.3772:  16%|█▌        | 855/5475 [3:50:52<19:34:16, 15.25s/it]

ratio tensor(1.5531, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5339, device='cuda:0') reward: tensor(1.2797, device='cuda:0') factor: tensor(0.2542, device='cuda:0')
reward_raw: tensor(0.5106, device='cuda:0') reward: tensor(1.2712, device='cuda:0') factor: tensor(0.2394, device='cuda:0')
reward_raw: tensor(0.5205, device='cuda:0') reward: tensor(1.2772, device='cuda:0') factor: tensor(0.2433, device='cuda:0')
reward_raw: tensor(0.5320, device='cuda:0') reward: tensor(1.2880, device='cuda:0') factor: tensor(0.2440, device='cuda:0')
reward_raw: tensor(0.5379, device='cuda:0') reward: tensor(1.2910, device='cuda:0') factor: tensor(0.2470, device='cuda:0')


Epoch 1, Step 856: Loss = 0.3430, Actor Loss = 0.3063, Critic Loss = 0.0738, Entropy = 0.1190, Returns = 1.2002, Value = 1.3186, mask_loss = 0.2577std_loss = 0.3808:  16%|█▌        | 856/5475 [3:51:05<18:44:05, 14.60s/it]  

ratio tensor(0.7702, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5253, device='cuda:0') reward: tensor(1.2577, device='cuda:0') factor: tensor(0.2676, device='cuda:0')
reward_raw: tensor(0.5273, device='cuda:0') reward: tensor(1.2636, device='cuda:0') factor: tensor(0.2637, device='cuda:0')
reward_raw: tensor(0.5204, device='cuda:0') reward: tensor(1.2527, device='cuda:0') factor: tensor(0.2677, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.2721, device='cuda:0') factor: tensor(0.2695, device='cuda:0')
reward_raw: tensor(0.5423, device='cuda:0') reward: tensor(1.2758, device='cuda:0') factor: tensor(0.2664, device='cuda:0')


Epoch 1, Step 857: Loss = 0.1832, Actor Loss = 0.1235, Critic Loss = 0.1196, Entropy = 0.0973, Returns = 1.3086, Value = 1.4371, mask_loss = 0.2507std_loss = 0.3707:  16%|█▌        | 857/5475 [3:51:19<18:32:39, 14.46s/it]

ratio tensor(1.1047, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6157, device='cuda:0') reward: tensor(1.3731, device='cuda:0') factor: tensor(0.2426, device='cuda:0')
reward_raw: tensor(0.5990, device='cuda:0') reward: tensor(1.3609, device='cuda:0') factor: tensor(0.2381, device='cuda:0')
reward_raw: tensor(0.6116, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.2437, device='cuda:0')
reward_raw: tensor(0.6297, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.2425, device='cuda:0')
reward_raw: tensor(0.6186, device='cuda:0') reward: tensor(1.3700, device='cuda:0') factor: tensor(0.2486, device='cuda:0')


Epoch 1, Step 858: Loss = -0.2589, Actor Loss = -0.2965, Critic Loss = 0.0755, Entropy = 0.0974, Returns = 1.4074, Value = 1.4366, mask_loss = 0.2464std_loss = 0.3840:  16%|█▌        | 858/5475 [3:51:31<17:50:53, 13.92s/it]

ratio tensor(1.7791, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5555, device='cuda:0') reward: tensor(1.3534, device='cuda:0') factor: tensor(0.2020, device='cuda:0')
reward_raw: tensor(0.5533, device='cuda:0') reward: tensor(1.3489, device='cuda:0') factor: tensor(0.2044, device='cuda:0')
reward_raw: tensor(0.5812, device='cuda:0') reward: tensor(1.3737, device='cuda:0') factor: tensor(0.2075, device='cuda:0')
reward_raw: tensor(0.5642, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.2024, device='cuda:0')
reward_raw: tensor(0.5787, device='cuda:0') reward: tensor(1.3755, device='cuda:0') factor: tensor(0.2032, device='cuda:0')


Epoch 1, Step 859: Loss = 0.0949, Actor Loss = 0.0746, Critic Loss = 0.0407, Entropy = 0.0888, Returns = 1.3478, Value = 1.3804, mask_loss = 0.2084std_loss = 0.3725:  16%|█▌        | 859/5475 [3:51:52<20:33:55, 16.04s/it]  

ratio tensor(0.8344, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5964, device='cuda:0') reward: tensor(1.3903, device='cuda:0') factor: tensor(0.2061, device='cuda:0')
reward_raw: tensor(0.5946, device='cuda:0') reward: tensor(1.3882, device='cuda:0') factor: tensor(0.2064, device='cuda:0')
reward_raw: tensor(0.5915, device='cuda:0') reward: tensor(1.3888, device='cuda:0') factor: tensor(0.2027, device='cuda:0')
reward_raw: tensor(0.5984, device='cuda:0') reward: tensor(1.3882, device='cuda:0') factor: tensor(0.2102, device='cuda:0')
reward_raw: tensor(0.6072, device='cuda:0') reward: tensor(1.4041, device='cuda:0') factor: tensor(0.2030, device='cuda:0')


Epoch 1, Step 860: Loss = 0.2325, Actor Loss = 0.1721, Critic Loss = 0.1209, Entropy = 0.0940, Returns = 1.3683, Value = 1.3444, mask_loss = 0.2188std_loss = 0.3705:  16%|█▌        | 860/5475 [3:52:10<21:17:26, 16.61s/it]

ratio tensor(0.1890, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5732, device='cuda:0') reward: tensor(1.3369, device='cuda:0') factor: tensor(0.2362, device='cuda:0')
reward_raw: tensor(0.5705, device='cuda:0') reward: tensor(1.3278, device='cuda:0') factor: tensor(0.2427, device='cuda:0')
reward_raw: tensor(0.5841, device='cuda:0') reward: tensor(1.3451, device='cuda:0') factor: tensor(0.2390, device='cuda:0')
reward_raw: tensor(0.5786, device='cuda:0') reward: tensor(1.3421, device='cuda:0') factor: tensor(0.2365, device='cuda:0')
reward_raw: tensor(0.5528, device='cuda:0') reward: tensor(1.3095, device='cuda:0') factor: tensor(0.2433, device='cuda:0')


Epoch 1, Step 861: Loss = -0.0129, Actor Loss = -0.0482, Critic Loss = 0.0709, Entropy = 0.1032, Returns = 1.3248, Value = 1.3897, mask_loss = 0.2384std_loss = 0.3857:  16%|█▌        | 860/5475 [3:52:26<21:17:26, 16.61s/it]

ratio tensor(7.2326, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 861: Loss = -0.0129, Actor Loss = -0.0482, Critic Loss = 0.0709, Entropy = 0.1032, Returns = 1.3248, Value = 1.3897, mask_loss = 0.2384std_loss = 0.3857:  16%|█▌        | 861/5475 [3:52:27<21:07:28, 16.48s/it]

reward_raw: tensor(0.5512, device='cuda:0') reward: tensor(1.3084, device='cuda:0') factor: tensor(0.2428, device='cuda:0')
reward_raw: tensor(0.5446, device='cuda:0') reward: tensor(1.3092, device='cuda:0') factor: tensor(0.2355, device='cuda:0')
reward_raw: tensor(0.5412, device='cuda:0') reward: tensor(1.3004, device='cuda:0') factor: tensor(0.2408, device='cuda:0')
reward_raw: tensor(0.5687, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.2427, device='cuda:0')
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.3130, device='cuda:0') factor: tensor(0.2446, device='cuda:0')


Epoch 1, Step 862: Loss = 0.3872, Actor Loss = 0.3324, Critic Loss = 0.1099, Entropy = 0.0918, Returns = 1.3152, Value = 1.2764, mask_loss = 0.2257std_loss = 0.3659:  16%|█▌        | 862/5475 [3:52:45<21:47:06, 17.00s/it]  

ratio tensor(0.6010, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6405, device='cuda:0') reward: tensor(1.4013, device='cuda:0') factor: tensor(0.2391, device='cuda:0')
reward_raw: tensor(0.6295, device='cuda:0') reward: tensor(1.3838, device='cuda:0') factor: tensor(0.2456, device='cuda:0')
reward_raw: tensor(0.6445, device='cuda:0') reward: tensor(1.4005, device='cuda:0') factor: tensor(0.2441, device='cuda:0')
reward_raw: tensor(0.6364, device='cuda:0') reward: tensor(1.3888, device='cuda:0') factor: tensor(0.2475, device='cuda:0')
reward_raw: tensor(0.6221, device='cuda:0') reward: tensor(1.3762, device='cuda:0') factor: tensor(0.2460, device='cuda:0')


Epoch 1, Step 863: Loss = -0.2956, Actor Loss = -0.3215, Critic Loss = 0.0519, Entropy = 0.1119, Returns = 1.4012, Value = 1.4829, mask_loss = 0.2412std_loss = 0.3772:  16%|█▌        | 863/5475 [3:53:06<23:36:09, 18.42s/it]

ratio tensor(7.2681, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6046, device='cuda:0') reward: tensor(1.3359, device='cuda:0') factor: tensor(0.2686, device='cuda:0')
reward_raw: tensor(0.6193, device='cuda:0') reward: tensor(1.3509, device='cuda:0') factor: tensor(0.2684, device='cuda:0')
reward_raw: tensor(0.6190, device='cuda:0') reward: tensor(1.3495, device='cuda:0') factor: tensor(0.2695, device='cuda:0')
reward_raw: tensor(0.5896, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.2655, device='cuda:0')
reward_raw: tensor(0.6081, device='cuda:0') reward: tensor(1.3405, device='cuda:0') factor: tensor(0.2676, device='cuda:0')


Epoch 1, Step 864: Loss = 0.1313, Actor Loss = 0.0765, Critic Loss = 0.1098, Entropy = 0.1036, Returns = 1.3331, Value = 1.3401, mask_loss = 0.2076std_loss = 0.3613:  16%|█▌        | 864/5475 [3:53:20<21:34:03, 16.84s/it]  

ratio tensor(0.4556, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6482, device='cuda:0') reward: tensor(1.4188, device='cuda:0') factor: tensor(0.2294, device='cuda:0')
reward_raw: tensor(0.6454, device='cuda:0') reward: tensor(1.4157, device='cuda:0') factor: tensor(0.2296, device='cuda:0')
reward_raw: tensor(0.6425, device='cuda:0') reward: tensor(1.4120, device='cuda:0') factor: tensor(0.2305, device='cuda:0')
reward_raw: tensor(0.6817, device='cuda:0') reward: tensor(1.4446, device='cuda:0') factor: tensor(0.2371, device='cuda:0')
reward_raw: tensor(0.6591, device='cuda:0') reward: tensor(1.4302, device='cuda:0') factor: tensor(0.2289, device='cuda:0')


Epoch 1, Step 865: Loss = 0.0426, Actor Loss = 0.0048, Critic Loss = 0.0758, Entropy = 0.1115, Returns = 1.4647, Value = 1.3230, mask_loss = 0.2586std_loss = 0.3914:  16%|█▌        | 865/5475 [3:53:35<21:01:08, 16.41s/it]

ratio tensor(1.0706, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5266, device='cuda:0') reward: tensor(1.2974, device='cuda:0') factor: tensor(0.2292, device='cuda:0')
reward_raw: tensor(0.5344, device='cuda:0') reward: tensor(1.3020, device='cuda:0') factor: tensor(0.2324, device='cuda:0')
reward_raw: tensor(0.5209, device='cuda:0') reward: tensor(1.2870, device='cuda:0') factor: tensor(0.2339, device='cuda:0')
reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.2862, device='cuda:0') factor: tensor(0.2301, device='cuda:0')
reward_raw: tensor(0.5257, device='cuda:0') reward: tensor(1.2925, device='cuda:0') factor: tensor(0.2332, device='cuda:0')


Epoch 1, Step 866: Loss = -0.0549, Actor Loss = -0.0772, Critic Loss = 0.0448, Entropy = 0.0900, Returns = 1.3164, Value = 1.3943, mask_loss = 0.1794std_loss = 0.3356:  16%|█▌        | 866/5475 [3:53:50<20:26:25, 15.97s/it]

ratio tensor(1.0746, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5276, device='cuda:0') reward: tensor(1.3226, device='cuda:0') factor: tensor(0.2050, device='cuda:0')
reward_raw: tensor(0.5092, device='cuda:0') reward: tensor(1.3095, device='cuda:0') factor: tensor(0.1997, device='cuda:0')
reward_raw: tensor(0.5209, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.2080, device='cuda:0')
reward_raw: tensor(0.5349, device='cuda:0') reward: tensor(1.3276, device='cuda:0') factor: tensor(0.2073, device='cuda:0')
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.2015, device='cuda:0')


Epoch 1, Step 867: Loss = 0.0169, Actor Loss = -0.0173, Critic Loss = 0.0686, Entropy = 0.0989, Returns = 1.3990, Value = 1.3145, mask_loss = 0.2034std_loss = 0.3557:  16%|█▌        | 867/5475 [3:54:04<19:45:23, 15.43s/it] 

ratio tensor(1.0910, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5245, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.1925, device='cuda:0')
reward_raw: tensor(0.5429, device='cuda:0') reward: tensor(1.3498, device='cuda:0') factor: tensor(0.1931, device='cuda:0')
reward_raw: tensor(0.5306, device='cuda:0') reward: tensor(1.3385, device='cuda:0') factor: tensor(0.1922, device='cuda:0')
reward_raw: tensor(0.5247, device='cuda:0') reward: tensor(1.3289, device='cuda:0') factor: tensor(0.1958, device='cuda:0')
reward_raw: tensor(0.5226, device='cuda:0') reward: tensor(1.3220, device='cuda:0') factor: tensor(0.2006, device='cuda:0')


Epoch 1, Step 868: Loss = 0.0519, Actor Loss = 0.0130, Critic Loss = 0.0781, Entropy = 0.0838, Returns = 1.3156, Value = 1.3293, mask_loss = 0.2137std_loss = 0.3584:  16%|█▌        | 868/5475 [3:54:21<20:07:14, 15.72s/it] 

ratio tensor(0.7596, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6071, device='cuda:0') reward: tensor(1.3837, device='cuda:0') factor: tensor(0.2235, device='cuda:0')
reward_raw: tensor(0.6173, device='cuda:0') reward: tensor(1.3938, device='cuda:0') factor: tensor(0.2235, device='cuda:0')
reward_raw: tensor(0.6189, device='cuda:0') reward: tensor(1.3937, device='cuda:0') factor: tensor(0.2252, device='cuda:0')
reward_raw: tensor(0.6295, device='cuda:0') reward: tensor(1.4107, device='cuda:0') factor: tensor(0.2188, device='cuda:0')
reward_raw: tensor(0.6101, device='cuda:0') reward: tensor(1.3878, device='cuda:0') factor: tensor(0.2223, device='cuda:0')


Epoch 1, Step 869: Loss = -0.0627, Actor Loss = -0.0981, Critic Loss = 0.0710, Entropy = 0.0918, Returns = 1.4787, Value = 1.3715, mask_loss = 0.2299std_loss = 0.3756:  16%|█▌        | 869/5475 [3:54:39<20:59:27, 16.41s/it]

ratio tensor(1.0874, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5271, device='cuda:0') reward: tensor(1.2905, device='cuda:0') factor: tensor(0.2365, device='cuda:0')
reward_raw: tensor(0.5271, device='cuda:0') reward: tensor(1.2926, device='cuda:0') factor: tensor(0.2345, device='cuda:0')
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.3091, device='cuda:0') factor: tensor(0.2411, device='cuda:0')
reward_raw: tensor(0.5350, device='cuda:0') reward: tensor(1.2996, device='cuda:0') factor: tensor(0.2354, device='cuda:0')
reward_raw: tensor(0.5754, device='cuda:0') reward: tensor(1.3366, device='cuda:0') factor: tensor(0.2388, device='cuda:0')


Epoch 1, Step 870: Loss = 0.0318, Actor Loss = -0.0142, Critic Loss = 0.0922, Entropy = 0.0959, Returns = 1.3416, Value = 1.3677, mask_loss = 0.2337std_loss = 0.3789:  16%|█▌        | 870/5475 [3:54:54<20:35:31, 16.10s/it] 

ratio tensor(1.3133, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6131, device='cuda:0') reward: tensor(1.3755, device='cuda:0') factor: tensor(0.2376, device='cuda:0')
reward_raw: tensor(0.6376, device='cuda:0') reward: tensor(1.3976, device='cuda:0') factor: tensor(0.2400, device='cuda:0')
reward_raw: tensor(0.6208, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.2445, device='cuda:0')
reward_raw: tensor(0.6180, device='cuda:0') reward: tensor(1.3830, device='cuda:0') factor: tensor(0.2350, device='cuda:0')
reward_raw: tensor(0.6142, device='cuda:0') reward: tensor(1.3730, device='cuda:0') factor: tensor(0.2412, device='cuda:0')


Epoch 1, Step 871: Loss = 0.1663, Actor Loss = 0.0792, Critic Loss = 0.1745, Entropy = 0.1016, Returns = 1.4269, Value = 1.2600, mask_loss = 0.2272std_loss = 0.3594:  16%|█▌        | 871/5475 [3:55:08<19:58:59, 15.63s/it] 

ratio tensor(1.6052, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4745, device='cuda:0') reward: tensor(1.2589, device='cuda:0') factor: tensor(0.2156, device='cuda:0')
reward_raw: tensor(0.4712, device='cuda:0') reward: tensor(1.2518, device='cuda:0') factor: tensor(0.2195, device='cuda:0')
reward_raw: tensor(0.4881, device='cuda:0') reward: tensor(1.2657, device='cuda:0') factor: tensor(0.2224, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.2605, device='cuda:0') factor: tensor(0.2267, device='cuda:0')
reward_raw: tensor(0.4730, device='cuda:0') reward: tensor(1.2558, device='cuda:0') factor: tensor(0.2172, device='cuda:0')


Epoch 1, Step 872: Loss = 0.3135, Actor Loss = 0.1964, Critic Loss = 0.2344, Entropy = 0.1044, Returns = 1.1991, Value = 1.5026, mask_loss = 0.2210std_loss = 0.3673:  16%|█▌        | 872/5475 [3:55:31<22:39:40, 17.72s/it]

ratio tensor(1.0695, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5390, device='cuda:0') reward: tensor(1.3172, device='cuda:0') factor: tensor(0.2218, device='cuda:0')
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.3151, device='cuda:0') factor: tensor(0.2362, device='cuda:0')
reward_raw: tensor(0.5551, device='cuda:0') reward: tensor(1.3199, device='cuda:0') factor: tensor(0.2352, device='cuda:0')
reward_raw: tensor(0.5505, device='cuda:0') reward: tensor(1.3230, device='cuda:0') factor: tensor(0.2275, device='cuda:0')
reward_raw: tensor(0.5401, device='cuda:0') reward: tensor(1.3150, device='cuda:0') factor: tensor(0.2252, device='cuda:0')


Epoch 1, Step 873: Loss = 0.0660, Actor Loss = 0.0276, Critic Loss = 0.0770, Entropy = 0.0931, Returns = 1.4165, Value = 1.3133, mask_loss = 0.2353std_loss = 0.3791:  16%|█▌        | 873/5475 [3:55:45<21:03:10, 16.47s/it]

ratio tensor(0.6823, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6219, device='cuda:0') reward: tensor(1.2930, device='cuda:0') factor: tensor(0.3289, device='cuda:0')
reward_raw: tensor(0.6123, device='cuda:0') reward: tensor(1.2898, device='cuda:0') factor: tensor(0.3225, device='cuda:0')
reward_raw: tensor(0.6246, device='cuda:0') reward: tensor(1.2977, device='cuda:0') factor: tensor(0.3269, device='cuda:0')
reward_raw: tensor(0.6121, device='cuda:0') reward: tensor(1.2822, device='cuda:0') factor: tensor(0.3300, device='cuda:0')
reward_raw: tensor(0.6233, device='cuda:0') reward: tensor(1.2927, device='cuda:0') factor: tensor(0.3306, device='cuda:0')


Epoch 1, Step 874: Loss = 0.0678, Actor Loss = 0.0290, Critic Loss = 0.0779, Entropy = 0.1430, Returns = 1.2853, Value = 1.3438, mask_loss = 0.3130std_loss = 0.4076:  16%|█▌        | 874/5475 [3:55:56<19:09:37, 14.99s/it]

ratio tensor(0.8487, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5773, device='cuda:0') reward: tensor(1.3373, device='cuda:0') factor: tensor(0.2401, device='cuda:0')
reward_raw: tensor(0.5618, device='cuda:0') reward: tensor(1.3283, device='cuda:0') factor: tensor(0.2336, device='cuda:0')
reward_raw: tensor(0.5612, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.2369, device='cuda:0')
reward_raw: tensor(0.5562, device='cuda:0') reward: tensor(1.3137, device='cuda:0') factor: tensor(0.2424, device='cuda:0')
reward_raw: tensor(0.5338, device='cuda:0') reward: tensor(1.2964, device='cuda:0') factor: tensor(0.2373, device='cuda:0')


Epoch 1, Step 875: Loss = 0.0710, Actor Loss = 0.0434, Critic Loss = 0.0555, Entropy = 0.1129, Returns = 1.3544, Value = 1.3314, mask_loss = 0.2579std_loss = 0.3827:  16%|█▌        | 875/5475 [3:56:15<20:27:04, 16.01s/it]

ratio tensor(0.8293, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5927, device='cuda:0') reward: tensor(1.3530, device='cuda:0') factor: tensor(0.2397, device='cuda:0')
reward_raw: tensor(0.5838, device='cuda:0') reward: tensor(1.3484, device='cuda:0') factor: tensor(0.2354, device='cuda:0')
reward_raw: tensor(0.5779, device='cuda:0') reward: tensor(1.3340, device='cuda:0') factor: tensor(0.2439, device='cuda:0')
reward_raw: tensor(0.5719, device='cuda:0') reward: tensor(1.3311, device='cuda:0') factor: tensor(0.2408, device='cuda:0')
reward_raw: tensor(0.5712, device='cuda:0') reward: tensor(1.3314, device='cuda:0') factor: tensor(0.2399, device='cuda:0')


Epoch 1, Step 876: Loss = -0.0769, Actor Loss = -0.1196, Critic Loss = 0.0857, Entropy = 0.1247, Returns = 1.3795, Value = 1.3413, mask_loss = 0.2458std_loss = 0.3805:  16%|█▌        | 876/5475 [3:56:27<18:59:17, 14.86s/it]

ratio tensor(1.0189, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5815, device='cuda:0') reward: tensor(1.2827, device='cuda:0') factor: tensor(0.2988, device='cuda:0')
reward_raw: tensor(0.5866, device='cuda:0') reward: tensor(1.2888, device='cuda:0') factor: tensor(0.2978, device='cuda:0')
reward_raw: tensor(0.5944, device='cuda:0') reward: tensor(1.2922, device='cuda:0') factor: tensor(0.3022, device='cuda:0')
reward_raw: tensor(0.5969, device='cuda:0') reward: tensor(1.2967, device='cuda:0') factor: tensor(0.3002, device='cuda:0')
reward_raw: tensor(0.5870, device='cuda:0') reward: tensor(1.2873, device='cuda:0') factor: tensor(0.2996, device='cuda:0')


Epoch 1, Step 877: Loss = 0.0647, Actor Loss = 0.0272, Critic Loss = 0.0752, Entropy = 0.1164, Returns = 1.3046, Value = 1.2921, mask_loss = 0.3270std_loss = 0.3837:  16%|█▌        | 877/5475 [3:56:42<19:17:42, 15.11s/it]  

ratio tensor(0.7215, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6251, device='cuda:0') reward: tensor(1.3912, device='cuda:0') factor: tensor(0.2340, device='cuda:0')
reward_raw: tensor(0.6293, device='cuda:0') reward: tensor(1.3991, device='cuda:0') factor: tensor(0.2301, device='cuda:0')
reward_raw: tensor(0.6249, device='cuda:0') reward: tensor(1.3986, device='cuda:0') factor: tensor(0.2263, device='cuda:0')
reward_raw: tensor(0.6121, device='cuda:0') reward: tensor(1.3861, device='cuda:0') factor: tensor(0.2260, device='cuda:0')
reward_raw: tensor(0.6199, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.2329, device='cuda:0')


Epoch 1, Step 878: Loss = -0.0415, Actor Loss = -0.0761, Critic Loss = 0.0695, Entropy = 0.1056, Returns = 1.3835, Value = 1.3490, mask_loss = 0.2366std_loss = 0.3805:  16%|█▌        | 878/5475 [3:56:58<19:28:35, 15.25s/it]

ratio tensor(0.9589, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4922, device='cuda:0') reward: tensor(1.2658, device='cuda:0') factor: tensor(0.2265, device='cuda:0')
reward_raw: tensor(0.5059, device='cuda:0') reward: tensor(1.2817, device='cuda:0') factor: tensor(0.2242, device='cuda:0')
reward_raw: tensor(0.5022, device='cuda:0') reward: tensor(1.2789, device='cuda:0') factor: tensor(0.2233, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.2664, device='cuda:0') factor: tensor(0.2268, device='cuda:0')
reward_raw: tensor(0.4931, device='cuda:0') reward: tensor(1.2674, device='cuda:0') factor: tensor(0.2257, device='cuda:0')


Epoch 1, Step 879: Loss = 0.0581, Actor Loss = 0.0329, Critic Loss = 0.0506, Entropy = 0.1130, Returns = 1.3487, Value = 1.3000, mask_loss = 0.2102std_loss = 0.3540:  16%|█▌        | 879/5475 [3:57:22<22:38:44, 17.74s/it]  

ratio tensor(0.7285, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6592, device='cuda:0') reward: tensor(1.3699, device='cuda:0') factor: tensor(0.2893, device='cuda:0')
reward_raw: tensor(0.6572, device='cuda:0') reward: tensor(1.3725, device='cuda:0') factor: tensor(0.2847, device='cuda:0')
reward_raw: tensor(0.6800, device='cuda:0') reward: tensor(1.3922, device='cuda:0') factor: tensor(0.2878, device='cuda:0')
reward_raw: tensor(0.6686, device='cuda:0') reward: tensor(1.3858, device='cuda:0') factor: tensor(0.2828, device='cuda:0')
reward_raw: tensor(0.6689, device='cuda:0') reward: tensor(1.3832, device='cuda:0') factor: tensor(0.2857, device='cuda:0')


Epoch 1, Step 880: Loss = -0.0255, Actor Loss = -0.0751, Critic Loss = 0.0995, Entropy = 0.1414, Returns = 1.3468, Value = 1.4966, mask_loss = 0.3026std_loss = 0.4030:  16%|█▌        | 880/5475 [3:57:37<21:42:23, 17.01s/it]

ratio tensor(1.9733, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6120, device='cuda:0') reward: tensor(1.3101, device='cuda:0') factor: tensor(0.3018, device='cuda:0')
reward_raw: tensor(0.6053, device='cuda:0') reward: tensor(1.2996, device='cuda:0') factor: tensor(0.3057, device='cuda:0')
reward_raw: tensor(0.6261, device='cuda:0') reward: tensor(1.3232, device='cuda:0') factor: tensor(0.3029, device='cuda:0')
reward_raw: tensor(0.6183, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.3021, device='cuda:0')
reward_raw: tensor(0.6087, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.3031, device='cuda:0')


Epoch 1, Step 881: Loss = 0.2082, Actor Loss = 0.1624, Critic Loss = 0.0919, Entropy = 0.1260, Returns = 1.3466, Value = 1.2226, mask_loss = 0.3391std_loss = 0.4130:  16%|█▌        | 880/5475 [3:57:50<21:42:23, 17.01s/it]  

ratio tensor(0.2568, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 881: Loss = 0.2082, Actor Loss = 0.1624, Critic Loss = 0.0919, Entropy = 0.1260, Returns = 1.3466, Value = 1.2226, mask_loss = 0.3391std_loss = 0.4130:  16%|█▌        | 881/5475 [3:57:50<20:19:57, 15.93s/it]

reward_raw: tensor(0.6008, device='cuda:0') reward: tensor(1.3579, device='cuda:0') factor: tensor(0.2429, device='cuda:0')
reward_raw: tensor(0.5944, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.2454, device='cuda:0')
reward_raw: tensor(0.6243, device='cuda:0') reward: tensor(1.3785, device='cuda:0') factor: tensor(0.2458, device='cuda:0')
reward_raw: tensor(0.5854, device='cuda:0') reward: tensor(1.3391, device='cuda:0') factor: tensor(0.2463, device='cuda:0')
reward_raw: tensor(0.6279, device='cuda:0') reward: tensor(1.3806, device='cuda:0') factor: tensor(0.2473, device='cuda:0')


Epoch 1, Step 882: Loss = -0.2181, Actor Loss = -0.2393, Critic Loss = 0.0427, Entropy = 0.1072, Returns = 1.3718, Value = 1.3165, mask_loss = 0.2291std_loss = 0.3709:  16%|█▌        | 882/5475 [3:58:13<22:49:00, 17.88s/it]

ratio tensor(1.9081, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7234, device='cuda:0') reward: tensor(1.4061, device='cuda:0') factor: tensor(0.3173, device='cuda:0')
reward_raw: tensor(0.6982, device='cuda:0') reward: tensor(1.3898, device='cuda:0') factor: tensor(0.3084, device='cuda:0')
reward_raw: tensor(0.7122, device='cuda:0') reward: tensor(1.3950, device='cuda:0') factor: tensor(0.3171, device='cuda:0')
reward_raw: tensor(0.7375, device='cuda:0') reward: tensor(1.4311, device='cuda:0') factor: tensor(0.3063, device='cuda:0')
reward_raw: tensor(0.7121, device='cuda:0') reward: tensor(1.3907, device='cuda:0') factor: tensor(0.3214, device='cuda:0')


Epoch 1, Step 883: Loss = 0.0339, Actor Loss = -0.0128, Critic Loss = 0.0937, Entropy = 0.1492, Returns = 1.4071, Value = 1.4943, mask_loss = 0.3095std_loss = 0.3990:  16%|█▌        | 883/5475 [3:58:29<22:03:12, 17.29s/it] 

ratio tensor(1.1331, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6679, device='cuda:0') reward: tensor(1.3254, device='cuda:0') factor: tensor(0.3425, device='cuda:0')
reward_raw: tensor(0.6402, device='cuda:0') reward: tensor(1.2997, device='cuda:0') factor: tensor(0.3404, device='cuda:0')
reward_raw: tensor(0.6686, device='cuda:0') reward: tensor(1.3326, device='cuda:0') factor: tensor(0.3360, device='cuda:0')
reward_raw: tensor(0.6852, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.3421, device='cuda:0')
reward_raw: tensor(0.6693, device='cuda:0') reward: tensor(1.3265, device='cuda:0') factor: tensor(0.3427, device='cuda:0')


Epoch 1, Step 884: Loss = 0.1100, Actor Loss = 0.0612, Critic Loss = 0.0980, Entropy = 0.1604, Returns = 1.3555, Value = 1.2082, mask_loss = 0.3311std_loss = 0.3992:  16%|█▌        | 884/5475 [3:58:42<20:32:43, 16.11s/it] 

ratio tensor(0.7133, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6818, device='cuda:0') reward: tensor(1.4464, device='cuda:0') factor: tensor(0.2354, device='cuda:0')
reward_raw: tensor(0.6534, device='cuda:0') reward: tensor(1.4147, device='cuda:0') factor: tensor(0.2386, device='cuda:0')
reward_raw: tensor(0.6724, device='cuda:0') reward: tensor(1.4392, device='cuda:0') factor: tensor(0.2332, device='cuda:0')
reward_raw: tensor(0.6556, device='cuda:0') reward: tensor(1.4204, device='cuda:0') factor: tensor(0.2352, device='cuda:0')
reward_raw: tensor(0.6527, device='cuda:0') reward: tensor(1.4145, device='cuda:0') factor: tensor(0.2383, device='cuda:0')


Epoch 1, Step 885: Loss = -0.0479, Actor Loss = -0.0826, Critic Loss = 0.0696, Entropy = 0.1184, Returns = 1.3769, Value = 1.5047, mask_loss = 0.2272std_loss = 0.3715:  16%|█▌        | 885/5475 [3:59:06<23:24:16, 18.36s/it]

ratio tensor(1.3592, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5634, device='cuda:0') reward: tensor(1.3068, device='cuda:0') factor: tensor(0.2565, device='cuda:0')
reward_raw: tensor(0.5652, device='cuda:0') reward: tensor(1.3094, device='cuda:0') factor: tensor(0.2557, device='cuda:0')
reward_raw: tensor(0.5470, device='cuda:0') reward: tensor(1.2920, device='cuda:0') factor: tensor(0.2551, device='cuda:0')
reward_raw: tensor(0.5589, device='cuda:0') reward: tensor(1.3004, device='cuda:0') factor: tensor(0.2584, device='cuda:0')
reward_raw: tensor(0.5816, device='cuda:0') reward: tensor(1.3167, device='cuda:0') factor: tensor(0.2650, device='cuda:0')


Epoch 1, Step 886: Loss = 0.0243, Actor Loss = -0.0014, Critic Loss = 0.0516, Entropy = 0.1438, Returns = 1.3237, Value = 1.2923, mask_loss = 0.2760std_loss = 0.3914:  16%|█▌        | 886/5475 [3:59:23<23:05:23, 18.11s/it] 

ratio tensor(1.0461, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6182, device='cuda:0') reward: tensor(1.3958, device='cuda:0') factor: tensor(0.2223, device='cuda:0')
reward_raw: tensor(0.6176, device='cuda:0') reward: tensor(1.4030, device='cuda:0') factor: tensor(0.2146, device='cuda:0')
reward_raw: tensor(0.6290, device='cuda:0') reward: tensor(1.4054, device='cuda:0') factor: tensor(0.2237, device='cuda:0')
reward_raw: tensor(0.6163, device='cuda:0') reward: tensor(1.3963, device='cuda:0') factor: tensor(0.2200, device='cuda:0')
reward_raw: tensor(0.6230, device='cuda:0') reward: tensor(1.4068, device='cuda:0') factor: tensor(0.2162, device='cuda:0')


Epoch 1, Step 887: Loss = 0.0610, Actor Loss = 0.0408, Critic Loss = 0.0406, Entropy = 0.1408, Returns = 1.3536, Value = 1.3894, mask_loss = 0.2583std_loss = 0.3757:  16%|█▌        | 887/5475 [3:59:42<23:27:30, 18.41s/it] 

ratio tensor(0.7219, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6189, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.2757, device='cuda:0')
reward_raw: tensor(0.6119, device='cuda:0') reward: tensor(1.3272, device='cuda:0') factor: tensor(0.2847, device='cuda:0')
reward_raw: tensor(0.5947, device='cuda:0') reward: tensor(1.3120, device='cuda:0') factor: tensor(0.2827, device='cuda:0')
reward_raw: tensor(0.6004, device='cuda:0') reward: tensor(1.3239, device='cuda:0') factor: tensor(0.2765, device='cuda:0')
reward_raw: tensor(0.5918, device='cuda:0') reward: tensor(1.3196, device='cuda:0') factor: tensor(0.2722, device='cuda:0')


Epoch 1, Step 888: Loss = 0.0419, Actor Loss = 0.0125, Critic Loss = 0.0591, Entropy = 0.1385, Returns = 1.3606, Value = 1.3104, mask_loss = 0.2568std_loss = 0.3742:  16%|█▌        | 888/5475 [3:59:56<21:33:01, 16.91s/it]

ratio tensor(0.8278, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5622, device='cuda:0') reward: tensor(1.3091, device='cuda:0') factor: tensor(0.2531, device='cuda:0')
reward_raw: tensor(0.5728, device='cuda:0') reward: tensor(1.3169, device='cuda:0') factor: tensor(0.2558, device='cuda:0')
reward_raw: tensor(0.5611, device='cuda:0') reward: tensor(1.3109, device='cuda:0') factor: tensor(0.2503, device='cuda:0')
reward_raw: tensor(0.5991, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.2533, device='cuda:0')
reward_raw: tensor(0.5743, device='cuda:0') reward: tensor(1.3122, device='cuda:0') factor: tensor(0.2621, device='cuda:0')


Epoch 1, Step 889: Loss = -0.0935, Actor Loss = -0.1855, Critic Loss = 0.1843, Entropy = 0.1385, Returns = 1.2839, Value = 1.2499, mask_loss = 0.2557std_loss = 0.3460:  16%|█▌        | 889/5475 [4:00:10<20:28:38, 16.07s/it]

ratio tensor(0.9642, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6179, device='cuda:0') reward: tensor(1.3149, device='cuda:0') factor: tensor(0.3029, device='cuda:0')
reward_raw: tensor(0.6165, device='cuda:0') reward: tensor(1.3064, device='cuda:0') factor: tensor(0.3101, device='cuda:0')
reward_raw: tensor(0.6085, device='cuda:0') reward: tensor(1.3007, device='cuda:0') factor: tensor(0.3078, device='cuda:0')
reward_raw: tensor(0.6350, device='cuda:0') reward: tensor(1.3217, device='cuda:0') factor: tensor(0.3133, device='cuda:0')
reward_raw: tensor(0.6233, device='cuda:0') reward: tensor(1.3206, device='cuda:0') factor: tensor(0.3028, device='cuda:0')


Epoch 1, Step 890: Loss = 0.1067, Actor Loss = 0.0604, Critic Loss = 0.0928, Entropy = 0.1466, Returns = 1.2730, Value = 1.4060, mask_loss = 0.3603std_loss = 0.4136:  16%|█▋        | 890/5475 [4:00:24<19:53:05, 15.61s/it]  

ratio tensor(0.9429, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5661, device='cuda:0') reward: tensor(1.3236, device='cuda:0') factor: tensor(0.2426, device='cuda:0')
reward_raw: tensor(0.5508, device='cuda:0') reward: tensor(1.3129, device='cuda:0') factor: tensor(0.2379, device='cuda:0')
reward_raw: tensor(0.5696, device='cuda:0') reward: tensor(1.3309, device='cuda:0') factor: tensor(0.2387, device='cuda:0')
reward_raw: tensor(0.5475, device='cuda:0') reward: tensor(1.3026, device='cuda:0') factor: tensor(0.2448, device='cuda:0')
reward_raw: tensor(0.5653, device='cuda:0') reward: tensor(1.3223, device='cuda:0') factor: tensor(0.2430, device='cuda:0')


Epoch 1, Step 891: Loss = 0.2225, Actor Loss = 0.1681, Critic Loss = 0.1090, Entropy = 0.1334, Returns = 1.3595, Value = 1.2896, mask_loss = 0.2315std_loss = 0.3588:  16%|█▋        | 891/5475 [4:00:40<20:01:15, 15.72s/it]

ratio tensor(0.8968, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5793, device='cuda:0') reward: tensor(1.2967, device='cuda:0') factor: tensor(0.2826, device='cuda:0')
reward_raw: tensor(0.5997, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.2874, device='cuda:0')
reward_raw: tensor(0.5869, device='cuda:0') reward: tensor(1.3007, device='cuda:0') factor: tensor(0.2863, device='cuda:0')
reward_raw: tensor(0.5773, device='cuda:0') reward: tensor(1.2969, device='cuda:0') factor: tensor(0.2804, device='cuda:0')
reward_raw: tensor(0.5827, device='cuda:0') reward: tensor(1.2992, device='cuda:0') factor: tensor(0.2835, device='cuda:0')


Epoch 1, Step 892: Loss = 0.0168, Actor Loss = -0.0198, Critic Loss = 0.0733, Entropy = 0.1083, Returns = 1.3044, Value = 1.2549, mask_loss = 0.3105std_loss = 0.4106:  16%|█▋        | 892/5475 [4:01:01<21:46:14, 17.10s/it]

ratio tensor(0.9908, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5902, device='cuda:0') reward: tensor(1.2772, device='cuda:0') factor: tensor(0.3129, device='cuda:0')
reward_raw: tensor(0.5698, device='cuda:0') reward: tensor(1.2505, device='cuda:0') factor: tensor(0.3193, device='cuda:0')
reward_raw: tensor(0.5939, device='cuda:0') reward: tensor(1.2682, device='cuda:0') factor: tensor(0.3257, device='cuda:0')
reward_raw: tensor(0.5731, device='cuda:0') reward: tensor(1.2575, device='cuda:0') factor: tensor(0.3155, device='cuda:0')
reward_raw: tensor(0.5802, device='cuda:0') reward: tensor(1.2608, device='cuda:0') factor: tensor(0.3194, device='cuda:0')


Epoch 1, Step 893: Loss = 0.0619, Actor Loss = -0.0092, Critic Loss = 0.1423, Entropy = 0.0919, Returns = 1.2763, Value = 1.2792, mask_loss = 0.3305std_loss = 0.3765:  16%|█▋        | 893/5475 [4:01:19<22:10:27, 17.42s/it]

ratio tensor(3.2324, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6184, device='cuda:0') reward: tensor(1.3571, device='cuda:0') factor: tensor(0.2614, device='cuda:0')
reward_raw: tensor(0.6186, device='cuda:0') reward: tensor(1.3590, device='cuda:0') factor: tensor(0.2596, device='cuda:0')
reward_raw: tensor(0.6226, device='cuda:0') reward: tensor(1.3567, device='cuda:0') factor: tensor(0.2659, device='cuda:0')
reward_raw: tensor(0.6305, device='cuda:0') reward: tensor(1.3664, device='cuda:0') factor: tensor(0.2640, device='cuda:0')
reward_raw: tensor(0.6288, device='cuda:0') reward: tensor(1.3655, device='cuda:0') factor: tensor(0.2633, device='cuda:0')


Epoch 1, Step 894: Loss = 0.0104, Actor Loss = -0.0504, Critic Loss = 0.1217, Entropy = 0.0528, Returns = 1.3742, Value = 1.5329, mask_loss = 0.2531std_loss = 0.3895:  16%|█▋        | 894/5475 [4:01:39<23:20:19, 18.34s/it]

ratio tensor(1.7589, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6587, device='cuda:0') reward: tensor(1.3285, device='cuda:0') factor: tensor(0.3302, device='cuda:0')
reward_raw: tensor(0.6662, device='cuda:0') reward: tensor(1.3347, device='cuda:0') factor: tensor(0.3315, device='cuda:0')
reward_raw: tensor(0.6656, device='cuda:0') reward: tensor(1.3351, device='cuda:0') factor: tensor(0.3304, device='cuda:0')
reward_raw: tensor(0.6674, device='cuda:0') reward: tensor(1.3379, device='cuda:0') factor: tensor(0.3295, device='cuda:0')
reward_raw: tensor(0.6496, device='cuda:0') reward: tensor(1.3254, device='cuda:0') factor: tensor(0.3242, device='cuda:0')


Epoch 1, Step 895: Loss = 0.0785, Actor Loss = 0.0175, Critic Loss = 0.1222, Entropy = 0.0969, Returns = 1.3663, Value = 1.3851, mask_loss = 0.3019std_loss = 0.4076:  16%|█▋        | 895/5475 [4:01:52<21:15:30, 16.71s/it] 

ratio tensor(0.5325, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6027, device='cuda:0') reward: tensor(1.2945, device='cuda:0') factor: tensor(0.3083, device='cuda:0')
reward_raw: tensor(0.5993, device='cuda:0') reward: tensor(1.2918, device='cuda:0') factor: tensor(0.3075, device='cuda:0')
reward_raw: tensor(0.5994, device='cuda:0') reward: tensor(1.2929, device='cuda:0') factor: tensor(0.3065, device='cuda:0')
reward_raw: tensor(0.6004, device='cuda:0') reward: tensor(1.2956, device='cuda:0') factor: tensor(0.3049, device='cuda:0')
reward_raw: tensor(0.5972, device='cuda:0') reward: tensor(1.2934, device='cuda:0') factor: tensor(0.3037, device='cuda:0')


Epoch 1, Step 896: Loss = 0.2567, Actor Loss = 0.1995, Critic Loss = 0.1146, Entropy = 0.0844, Returns = 1.2987, Value = 1.3543, mask_loss = 0.3202std_loss = 0.4168:  16%|█▋        | 896/5475 [4:02:08<20:48:42, 16.36s/it]

ratio tensor(0.2972, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4561, device='cuda:0') reward: tensor(1.1123, device='cuda:0') factor: tensor(0.3439, device='cuda:0')
reward_raw: tensor(0.4673, device='cuda:0') reward: tensor(1.1272, device='cuda:0') factor: tensor(0.3401, device='cuda:0')
reward_raw: tensor(0.4543, device='cuda:0') reward: tensor(1.1163, device='cuda:0') factor: tensor(0.3380, device='cuda:0')
reward_raw: tensor(0.4655, device='cuda:0') reward: tensor(1.1195, device='cuda:0') factor: tensor(0.3461, device='cuda:0')
reward_raw: tensor(0.4487, device='cuda:0') reward: tensor(1.1095, device='cuda:0') factor: tensor(0.3392, device='cuda:0')


Epoch 1, Step 897: Loss = 0.1484, Actor Loss = 0.1192, Critic Loss = 0.0587, Entropy = 0.1056, Returns = 1.1078, Value = 1.1029, mask_loss = 0.3326std_loss = 0.4124:  16%|█▋        | 897/5475 [4:02:21<19:31:40, 15.36s/it]

ratio tensor(0.0907, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5594, device='cuda:0') reward: tensor(1.3201, device='cuda:0') factor: tensor(0.2393, device='cuda:0')
reward_raw: tensor(0.5491, device='cuda:0') reward: tensor(1.3037, device='cuda:0') factor: tensor(0.2454, device='cuda:0')
reward_raw: tensor(0.5414, device='cuda:0') reward: tensor(1.3007, device='cuda:0') factor: tensor(0.2408, device='cuda:0')
reward_raw: tensor(0.5461, device='cuda:0') reward: tensor(1.3018, device='cuda:0') factor: tensor(0.2443, device='cuda:0')
reward_raw: tensor(0.5543, device='cuda:0') reward: tensor(1.3110, device='cuda:0') factor: tensor(0.2433, device='cuda:0')


Epoch 1, Step 898: Loss = -0.1080, Actor Loss = -0.1334, Critic Loss = 0.0510, Entropy = 0.0776, Returns = 1.3327, Value = 1.3818, mask_loss = 0.2546std_loss = 0.3878:  16%|█▋        | 898/5475 [4:02:34<18:36:34, 14.64s/it]

ratio tensor(1.7822, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5730, device='cuda:0') reward: tensor(1.3117, device='cuda:0') factor: tensor(0.2613, device='cuda:0')
reward_raw: tensor(0.5533, device='cuda:0') reward: tensor(1.2960, device='cuda:0') factor: tensor(0.2573, device='cuda:0')
reward_raw: tensor(0.5585, device='cuda:0') reward: tensor(1.2963, device='cuda:0') factor: tensor(0.2621, device='cuda:0')
reward_raw: tensor(0.5642, device='cuda:0') reward: tensor(1.3036, device='cuda:0') factor: tensor(0.2607, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.2994, device='cuda:0') factor: tensor(0.2610, device='cuda:0')


Epoch 1, Step 899: Loss = 0.1598, Actor Loss = 0.1113, Critic Loss = 0.0970, Entropy = 0.0662, Returns = 1.3048, Value = 1.3976, mask_loss = 0.2354std_loss = 0.3715:  16%|█▋        | 899/5475 [4:02:49<19:02:21, 14.98s/it]  

ratio tensor(1.2394, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6058, device='cuda:0') reward: tensor(1.3474, device='cuda:0') factor: tensor(0.2584, device='cuda:0')
reward_raw: tensor(0.5890, device='cuda:0') reward: tensor(1.3278, device='cuda:0') factor: tensor(0.2612, device='cuda:0')
reward_raw: tensor(0.6021, device='cuda:0') reward: tensor(1.3387, device='cuda:0') factor: tensor(0.2633, device='cuda:0')
reward_raw: tensor(0.5933, device='cuda:0') reward: tensor(1.3301, device='cuda:0') factor: tensor(0.2632, device='cuda:0')
reward_raw: tensor(0.6041, device='cuda:0') reward: tensor(1.3404, device='cuda:0') factor: tensor(0.2637, device='cuda:0')


Epoch 1, Step 900: Loss = -0.0274, Actor Loss = -0.0996, Critic Loss = 0.1446, Entropy = 0.0751, Returns = 1.2668, Value = 1.2470, mask_loss = 0.3003std_loss = 0.3941:  16%|█▋        | 900/5475 [4:03:07<19:50:33, 15.61s/it]

ratio tensor(1.0881, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6108, device='cuda:0') reward: tensor(1.3729, device='cuda:0') factor: tensor(0.2379, device='cuda:0')
reward_raw: tensor(0.6143, device='cuda:0') reward: tensor(1.3727, device='cuda:0') factor: tensor(0.2415, device='cuda:0')
reward_raw: tensor(0.6039, device='cuda:0') reward: tensor(1.3685, device='cuda:0') factor: tensor(0.2354, device='cuda:0')
reward_raw: tensor(0.6180, device='cuda:0') reward: tensor(1.3777, device='cuda:0') factor: tensor(0.2402, device='cuda:0')
reward_raw: tensor(0.6052, device='cuda:0') reward: tensor(1.3655, device='cuda:0') factor: tensor(0.2396, device='cuda:0')


Epoch 1, Step 901: Loss = 0.1903, Actor Loss = 0.1469, Critic Loss = 0.0870, Entropy = 0.0617, Returns = 1.3349, Value = 1.4255, mask_loss = 0.2255std_loss = 0.3885:  16%|█▋        | 900/5475 [4:03:27<19:50:33, 15.61s/it]  

ratio tensor(0.3524, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 901: Loss = 0.1903, Actor Loss = 0.1469, Critic Loss = 0.0870, Entropy = 0.0617, Returns = 1.3349, Value = 1.4255, mask_loss = 0.2255std_loss = 0.3885:  16%|█▋        | 901/5475 [4:03:27<21:48:03, 17.16s/it]

reward_raw: tensor(0.6629, device='cuda:0') reward: tensor(1.3958, device='cuda:0') factor: tensor(0.2671, device='cuda:0')
reward_raw: tensor(0.6717, device='cuda:0') reward: tensor(1.4063, device='cuda:0') factor: tensor(0.2654, device='cuda:0')
reward_raw: tensor(0.6726, device='cuda:0') reward: tensor(1.4061, device='cuda:0') factor: tensor(0.2665, device='cuda:0')
reward_raw: tensor(0.6611, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.2660, device='cuda:0')
reward_raw: tensor(0.6713, device='cuda:0') reward: tensor(1.4088, device='cuda:0') factor: tensor(0.2625, device='cuda:0')


Epoch 1, Step 902: Loss = -0.0580, Actor Loss = -0.0869, Critic Loss = 0.0581, Entropy = 0.0629, Returns = 1.4983, Value = 1.4019, mask_loss = 0.2225std_loss = 0.3792:  16%|█▋        | 902/5475 [4:03:42<20:53:32, 16.45s/it]

ratio tensor(1.3936, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.3153, device='cuda:0') factor: tensor(0.2061, device='cuda:0')
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.3169, device='cuda:0') factor: tensor(0.2082, device='cuda:0')
reward_raw: tensor(0.5451, device='cuda:0') reward: tensor(1.3381, device='cuda:0') factor: tensor(0.2070, device='cuda:0')
reward_raw: tensor(0.5247, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.2084, device='cuda:0')
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.3041, device='cuda:0') factor: tensor(0.2072, device='cuda:0')


Epoch 1, Step 903: Loss = 0.0995, Actor Loss = 0.0465, Critic Loss = 0.1061, Entropy = 0.0754, Returns = 1.2993, Value = 1.3487, mask_loss = 0.2082std_loss = 0.3675:  16%|█▋        | 903/5475 [4:04:06<23:34:06, 18.56s/it]  

ratio tensor(0.3301, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6244, device='cuda:0') reward: tensor(1.3486, device='cuda:0') factor: tensor(0.2758, device='cuda:0')
reward_raw: tensor(0.6322, device='cuda:0') reward: tensor(1.3596, device='cuda:0') factor: tensor(0.2726, device='cuda:0')
reward_raw: tensor(0.6279, device='cuda:0') reward: tensor(1.3499, device='cuda:0') factor: tensor(0.2780, device='cuda:0')
reward_raw: tensor(0.6357, device='cuda:0') reward: tensor(1.3682, device='cuda:0') factor: tensor(0.2675, device='cuda:0')
reward_raw: tensor(0.6333, device='cuda:0') reward: tensor(1.3598, device='cuda:0') factor: tensor(0.2735, device='cuda:0')


Epoch 1, Step 904: Loss = -0.0134, Actor Loss = -0.0672, Critic Loss = 0.1079, Entropy = 0.0710, Returns = 1.3351, Value = 1.1989, mask_loss = 0.3048std_loss = 0.4186:  17%|█▋        | 904/5475 [4:04:21<22:14:58, 17.52s/it]

ratio tensor(1.4295, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5004, device='cuda:0') reward: tensor(1.2756, device='cuda:0') factor: tensor(0.2248, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.2796, device='cuda:0') factor: tensor(0.2286, device='cuda:0')
reward_raw: tensor(0.5081, device='cuda:0') reward: tensor(1.2874, device='cuda:0') factor: tensor(0.2206, device='cuda:0')
reward_raw: tensor(0.5096, device='cuda:0') reward: tensor(1.2830, device='cuda:0') factor: tensor(0.2267, device='cuda:0')
reward_raw: tensor(0.5121, device='cuda:0') reward: tensor(1.2852, device='cuda:0') factor: tensor(0.2269, device='cuda:0')


Epoch 1, Step 905: Loss = 0.0516, Actor Loss = 0.0312, Critic Loss = 0.0410, Entropy = 0.0842, Returns = 1.3006, Value = 1.2388, mask_loss = 0.2426std_loss = 0.3881:  17%|█▋        | 905/5475 [4:04:40<22:55:32, 18.06s/it]  

ratio tensor(1.4089, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5116, device='cuda:0') reward: tensor(1.2277, device='cuda:0') factor: tensor(0.2839, device='cuda:0')
reward_raw: tensor(0.5162, device='cuda:0') reward: tensor(1.2290, device='cuda:0') factor: tensor(0.2872, device='cuda:0')
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.2346, device='cuda:0') factor: tensor(0.2849, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.2274, device='cuda:0') factor: tensor(0.2814, device='cuda:0')
reward_raw: tensor(0.5001, device='cuda:0') reward: tensor(1.2144, device='cuda:0') factor: tensor(0.2858, device='cuda:0')


Epoch 1, Step 906: Loss = -0.2045, Actor Loss = -0.2383, Critic Loss = 0.0679, Entropy = 0.0866, Returns = 1.2595, Value = 1.1926, mask_loss = 0.3200std_loss = 0.4193:  17%|█▋        | 906/5475 [4:04:56<22:14:51, 17.53s/it]

ratio tensor(3.0824, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5720, device='cuda:0') reward: tensor(1.3302, device='cuda:0') factor: tensor(0.2418, device='cuda:0')
reward_raw: tensor(0.5689, device='cuda:0') reward: tensor(1.3318, device='cuda:0') factor: tensor(0.2371, device='cuda:0')
reward_raw: tensor(0.5842, device='cuda:0') reward: tensor(1.3416, device='cuda:0') factor: tensor(0.2427, device='cuda:0')
reward_raw: tensor(0.5789, device='cuda:0') reward: tensor(1.3370, device='cuda:0') factor: tensor(0.2419, device='cuda:0')
reward_raw: tensor(0.5917, device='cuda:0') reward: tensor(1.3484, device='cuda:0') factor: tensor(0.2433, device='cuda:0')


Epoch 1, Step 907: Loss = 0.0473, Actor Loss = 0.0067, Critic Loss = 0.0815, Entropy = 0.0959, Returns = 1.3139, Value = 1.4120, mask_loss = 0.2542std_loss = 0.3833:  17%|█▋        | 907/5475 [4:05:14<22:12:45, 17.51s/it]  

ratio tensor(1.0168, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5973, device='cuda:0') reward: tensor(1.3588, device='cuda:0') factor: tensor(0.2385, device='cuda:0')
reward_raw: tensor(0.5832, device='cuda:0') reward: tensor(1.3482, device='cuda:0') factor: tensor(0.2350, device='cuda:0')
reward_raw: tensor(0.5900, device='cuda:0') reward: tensor(1.3541, device='cuda:0') factor: tensor(0.2359, device='cuda:0')
reward_raw: tensor(0.5821, device='cuda:0') reward: tensor(1.3471, device='cuda:0') factor: tensor(0.2350, device='cuda:0')
reward_raw: tensor(0.5823, device='cuda:0') reward: tensor(1.3459, device='cuda:0') factor: tensor(0.2364, device='cuda:0')


Epoch 1, Step 908: Loss = 0.0441, Actor Loss = 0.0055, Critic Loss = 0.0773, Entropy = 0.0756, Returns = 1.3590, Value = 1.4027, mask_loss = 0.2250std_loss = 0.3860:  17%|█▋        | 908/5475 [4:05:32<22:23:04, 17.64s/it]

ratio tensor(1.3484, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5755, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.2627, device='cuda:0')
reward_raw: tensor(0.5786, device='cuda:0') reward: tensor(1.3172, device='cuda:0') factor: tensor(0.2614, device='cuda:0')
reward_raw: tensor(0.5821, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.2665, device='cuda:0')
reward_raw: tensor(0.5797, device='cuda:0') reward: tensor(1.3222, device='cuda:0') factor: tensor(0.2575, device='cuda:0')
reward_raw: tensor(0.5697, device='cuda:0') reward: tensor(1.3091, device='cuda:0') factor: tensor(0.2606, device='cuda:0')


Epoch 1, Step 909: Loss = 0.1364, Actor Loss = 0.0982, Critic Loss = 0.0767, Entropy = 0.0782, Returns = 1.2673, Value = 1.2922, mask_loss = 0.2606std_loss = 0.3955:  17%|█▋        | 909/5475 [4:05:54<24:14:04, 19.11s/it]

ratio tensor(0.9877, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6100, device='cuda:0') reward: tensor(1.3306, device='cuda:0') factor: tensor(0.2793, device='cuda:0')
reward_raw: tensor(0.6112, device='cuda:0') reward: tensor(1.3342, device='cuda:0') factor: tensor(0.2770, device='cuda:0')
reward_raw: tensor(0.6175, device='cuda:0') reward: tensor(1.3396, device='cuda:0') factor: tensor(0.2778, device='cuda:0')
reward_raw: tensor(0.6095, device='cuda:0') reward: tensor(1.3378, device='cuda:0') factor: tensor(0.2717, device='cuda:0')
reward_raw: tensor(0.6053, device='cuda:0') reward: tensor(1.3298, device='cuda:0') factor: tensor(0.2754, device='cuda:0')


Epoch 1, Step 910: Loss = 0.0530, Actor Loss = 0.0128, Critic Loss = 0.0808, Entropy = 0.1083, Returns = 1.2343, Value = 1.1991, mask_loss = 0.3516std_loss = 0.4335:  17%|█▋        | 910/5475 [4:06:10<23:05:54, 18.22s/it]

ratio tensor(1.3271, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5940, device='cuda:0') reward: tensor(1.3449, device='cuda:0') factor: tensor(0.2490, device='cuda:0')
reward_raw: tensor(0.6043, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.2524, device='cuda:0')
reward_raw: tensor(0.5896, device='cuda:0') reward: tensor(1.3400, device='cuda:0') factor: tensor(0.2496, device='cuda:0')
reward_raw: tensor(0.5928, device='cuda:0') reward: tensor(1.3381, device='cuda:0') factor: tensor(0.2547, device='cuda:0')
reward_raw: tensor(0.5906, device='cuda:0') reward: tensor(1.3343, device='cuda:0') factor: tensor(0.2564, device='cuda:0')


Epoch 1, Step 911: Loss = -0.0730, Actor Loss = -0.1158, Critic Loss = 0.0858, Entropy = 0.0836, Returns = 1.3029, Value = 1.3781, mask_loss = 0.2792std_loss = 0.4028:  17%|█▋        | 911/5475 [4:06:25<21:33:33, 17.01s/it]

ratio tensor(1.3744, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5656, device='cuda:0') reward: tensor(1.3009, device='cuda:0') factor: tensor(0.2646, device='cuda:0')
reward_raw: tensor(0.5809, device='cuda:0') reward: tensor(1.3179, device='cuda:0') factor: tensor(0.2631, device='cuda:0')
reward_raw: tensor(0.5863, device='cuda:0') reward: tensor(1.3239, device='cuda:0') factor: tensor(0.2624, device='cuda:0')
reward_raw: tensor(0.5859, device='cuda:0') reward: tensor(1.3244, device='cuda:0') factor: tensor(0.2615, device='cuda:0')
reward_raw: tensor(0.5956, device='cuda:0') reward: tensor(1.3359, device='cuda:0') factor: tensor(0.2596, device='cuda:0')


Epoch 1, Step 912: Loss = 0.1551, Actor Loss = 0.1298, Critic Loss = 0.0508, Entropy = 0.0945, Returns = 1.2925, Value = 1.3071, mask_loss = 0.2980std_loss = 0.4097:  17%|█▋        | 912/5475 [4:06:38<20:07:43, 15.88s/it]  

ratio tensor(0.2724, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5411, device='cuda:0') reward: tensor(1.2951, device='cuda:0') factor: tensor(0.2460, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.3005, device='cuda:0') factor: tensor(0.2411, device='cuda:0')
reward_raw: tensor(0.5435, device='cuda:0') reward: tensor(1.3028, device='cuda:0') factor: tensor(0.2407, device='cuda:0')
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.3007, device='cuda:0') factor: tensor(0.2413, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.3079, device='cuda:0') factor: tensor(0.2441, device='cuda:0')


Epoch 1, Step 913: Loss = -0.0611, Actor Loss = -0.0923, Critic Loss = 0.0626, Entropy = 0.0759, Returns = 1.2630, Value = 1.3313, mask_loss = 0.2961std_loss = 0.4065:  17%|█▋        | 913/5475 [4:06:57<21:33:02, 17.01s/it]

ratio tensor(0.7736, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5962, device='cuda:0') reward: tensor(1.3486, device='cuda:0') factor: tensor(0.2476, device='cuda:0')
reward_raw: tensor(0.6228, device='cuda:0') reward: tensor(1.3732, device='cuda:0') factor: tensor(0.2496, device='cuda:0')
reward_raw: tensor(0.6197, device='cuda:0') reward: tensor(1.3707, device='cuda:0') factor: tensor(0.2490, device='cuda:0')
reward_raw: tensor(0.6136, device='cuda:0') reward: tensor(1.3593, device='cuda:0') factor: tensor(0.2543, device='cuda:0')
reward_raw: tensor(0.6129, device='cuda:0') reward: tensor(1.3597, device='cuda:0') factor: tensor(0.2532, device='cuda:0')


Epoch 1, Step 914: Loss = 0.0605, Actor Loss = 0.0385, Critic Loss = 0.0440, Entropy = 0.0752, Returns = 1.2653, Value = 1.3810, mask_loss = 0.2274std_loss = 0.3784:  17%|█▋        | 914/5475 [4:07:13<20:49:04, 16.43s/it]  

ratio tensor(0.8087, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5715, device='cuda:0') reward: tensor(1.3063, device='cuda:0') factor: tensor(0.2652, device='cuda:0')
reward_raw: tensor(0.5686, device='cuda:0') reward: tensor(1.3040, device='cuda:0') factor: tensor(0.2646, device='cuda:0')
reward_raw: tensor(0.5667, device='cuda:0') reward: tensor(1.2962, device='cuda:0') factor: tensor(0.2705, device='cuda:0')
reward_raw: tensor(0.5703, device='cuda:0') reward: tensor(1.3068, device='cuda:0') factor: tensor(0.2635, device='cuda:0')
reward_raw: tensor(0.5764, device='cuda:0') reward: tensor(1.3088, device='cuda:0') factor: tensor(0.2675, device='cuda:0')


Epoch 1, Step 915: Loss = 0.2742, Actor Loss = 0.2473, Critic Loss = 0.0540, Entropy = 0.0893, Returns = 1.2096, Value = 1.3610, mask_loss = 0.3489std_loss = 0.4228:  17%|█▋        | 915/5475 [4:07:30<21:16:27, 16.80s/it]

ratio tensor(0.1696, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5853, device='cuda:0') reward: tensor(1.3338, device='cuda:0') factor: tensor(0.2516, device='cuda:0')
reward_raw: tensor(0.5805, device='cuda:0') reward: tensor(1.3301, device='cuda:0') factor: tensor(0.2504, device='cuda:0')
reward_raw: tensor(0.5609, device='cuda:0') reward: tensor(1.3049, device='cuda:0') factor: tensor(0.2559, device='cuda:0')
reward_raw: tensor(0.5877, device='cuda:0') reward: tensor(1.3308, device='cuda:0') factor: tensor(0.2570, device='cuda:0')
reward_raw: tensor(0.5762, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.2520, device='cuda:0')


Epoch 1, Step 916: Loss = -0.0380, Actor Loss = -0.0803, Critic Loss = 0.0847, Entropy = 0.0739, Returns = 1.3456, Value = 1.2005, mask_loss = 0.2383std_loss = 0.3946:  17%|█▋        | 916/5475 [4:07:47<21:18:14, 16.82s/it]

ratio tensor(0.7634, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5515, device='cuda:0') reward: tensor(1.2691, device='cuda:0') factor: tensor(0.2825, device='cuda:0')
reward_raw: tensor(0.5691, device='cuda:0') reward: tensor(1.2835, device='cuda:0') factor: tensor(0.2856, device='cuda:0')
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.2752, device='cuda:0') factor: tensor(0.2761, device='cuda:0')
reward_raw: tensor(0.5673, device='cuda:0') reward: tensor(1.2839, device='cuda:0') factor: tensor(0.2834, device='cuda:0')
reward_raw: tensor(0.5619, device='cuda:0') reward: tensor(1.2816, device='cuda:0') factor: tensor(0.2803, device='cuda:0')


Epoch 1, Step 917: Loss = 0.1845, Actor Loss = 0.1322, Critic Loss = 0.1047, Entropy = 0.0817, Returns = 1.2218, Value = 1.3422, mask_loss = 0.3084std_loss = 0.4083:  17%|█▋        | 917/5475 [4:08:01<20:18:49, 16.04s/it]  

ratio tensor(0.5225, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5134, device='cuda:0') reward: tensor(1.2986, device='cuda:0') factor: tensor(0.2148, device='cuda:0')
reward_raw: tensor(0.5073, device='cuda:0') reward: tensor(1.2964, device='cuda:0') factor: tensor(0.2109, device='cuda:0')
reward_raw: tensor(0.5009, device='cuda:0') reward: tensor(1.2946, device='cuda:0') factor: tensor(0.2063, device='cuda:0')
reward_raw: tensor(0.5087, device='cuda:0') reward: tensor(1.2921, device='cuda:0') factor: tensor(0.2166, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.3124, device='cuda:0') factor: tensor(0.2077, device='cuda:0')


Epoch 1, Step 918: Loss = 0.0407, Actor Loss = -0.0342, Critic Loss = 0.1499, Entropy = 0.0577, Returns = 1.3081, Value = 1.3404, mask_loss = 0.1919std_loss = 0.3567:  17%|█▋        | 918/5475 [4:08:29<24:50:25, 19.62s/it]

ratio tensor(0.5491, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5048, device='cuda:0') reward: tensor(1.2695, device='cuda:0') factor: tensor(0.2353, device='cuda:0')
reward_raw: tensor(0.4905, device='cuda:0') reward: tensor(1.2518, device='cuda:0') factor: tensor(0.2387, device='cuda:0')
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.2551, device='cuda:0') factor: tensor(0.2306, device='cuda:0')
reward_raw: tensor(0.4993, device='cuda:0') reward: tensor(1.2613, device='cuda:0') factor: tensor(0.2380, device='cuda:0')
reward_raw: tensor(0.4952, device='cuda:0') reward: tensor(1.2657, device='cuda:0') factor: tensor(0.2295, device='cuda:0')


Epoch 1, Step 919: Loss = 0.1262, Actor Loss = 0.0953, Critic Loss = 0.0620, Entropy = 0.0859, Returns = 1.1704, Value = 1.2383, mask_loss = 0.2569std_loss = 0.3909:  17%|█▋        | 919/5475 [4:08:43<22:38:24, 17.89s/it] 

ratio tensor(1.0892, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5821, device='cuda:0') reward: tensor(1.3498, device='cuda:0') factor: tensor(0.2323, device='cuda:0')
reward_raw: tensor(0.5942, device='cuda:0') reward: tensor(1.3636, device='cuda:0') factor: tensor(0.2306, device='cuda:0')
reward_raw: tensor(0.6080, device='cuda:0') reward: tensor(1.3752, device='cuda:0') factor: tensor(0.2328, device='cuda:0')
reward_raw: tensor(0.5965, device='cuda:0') reward: tensor(1.3655, device='cuda:0') factor: tensor(0.2311, device='cuda:0')
reward_raw: tensor(0.5905, device='cuda:0') reward: tensor(1.3576, device='cuda:0') factor: tensor(0.2329, device='cuda:0')


Epoch 1, Step 920: Loss = -0.1497, Actor Loss = -0.2270, Critic Loss = 0.1548, Entropy = 0.0768, Returns = 1.4793, Value = 1.2071, mask_loss = 0.2179std_loss = 0.3794:  17%|█▋        | 920/5475 [4:08:58<21:32:29, 17.03s/it]

ratio tensor(1.6708, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5198, device='cuda:0') reward: tensor(1.2819, device='cuda:0') factor: tensor(0.2379, device='cuda:0')
reward_raw: tensor(0.5296, device='cuda:0') reward: tensor(1.2941, device='cuda:0') factor: tensor(0.2355, device='cuda:0')
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.2928, device='cuda:0') factor: tensor(0.2306, device='cuda:0')
reward_raw: tensor(0.5182, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.2328, device='cuda:0')
reward_raw: tensor(0.5233, device='cuda:0') reward: tensor(1.2901, device='cuda:0') factor: tensor(0.2333, device='cuda:0')


Epoch 1, Step 921: Loss = 0.0113, Actor Loss = -0.0300, Critic Loss = 0.0828, Entropy = 0.0797, Returns = 1.2674, Value = 1.2850, mask_loss = 0.2496std_loss = 0.3927:  17%|█▋        | 920/5475 [4:09:14<21:32:29, 17.03s/it] 

ratio tensor(0.8051, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 921: Loss = 0.0113, Actor Loss = -0.0300, Critic Loss = 0.0828, Entropy = 0.0797, Returns = 1.2674, Value = 1.2850, mask_loss = 0.2496std_loss = 0.3927:  17%|█▋        | 921/5475 [4:09:14<21:13:24, 16.78s/it]

reward_raw: tensor(0.5573, device='cuda:0') reward: tensor(1.3276, device='cuda:0') factor: tensor(0.2298, device='cuda:0')
reward_raw: tensor(0.5565, device='cuda:0') reward: tensor(1.3240, device='cuda:0') factor: tensor(0.2325, device='cuda:0')
reward_raw: tensor(0.5328, device='cuda:0') reward: tensor(1.3024, device='cuda:0') factor: tensor(0.2304, device='cuda:0')
reward_raw: tensor(0.5501, device='cuda:0') reward: tensor(1.3159, device='cuda:0') factor: tensor(0.2342, device='cuda:0')
reward_raw: tensor(0.5540, device='cuda:0') reward: tensor(1.3224, device='cuda:0') factor: tensor(0.2316, device='cuda:0')


Epoch 1, Step 922: Loss = -0.0209, Actor Loss = -0.0521, Critic Loss = 0.0625, Entropy = 0.0863, Returns = 1.3109, Value = 1.3508, mask_loss = 0.2475std_loss = 0.3791:  17%|█▋        | 922/5475 [4:09:30<20:50:34, 16.48s/it]

ratio tensor(0.9968, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6327, device='cuda:0') reward: tensor(1.4151, device='cuda:0') factor: tensor(0.2176, device='cuda:0')
reward_raw: tensor(0.6324, device='cuda:0') reward: tensor(1.4161, device='cuda:0') factor: tensor(0.2163, device='cuda:0')
reward_raw: tensor(0.6312, device='cuda:0') reward: tensor(1.4195, device='cuda:0') factor: tensor(0.2117, device='cuda:0')
reward_raw: tensor(0.6512, device='cuda:0') reward: tensor(1.4363, device='cuda:0') factor: tensor(0.2149, device='cuda:0')
reward_raw: tensor(0.6054, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.2169, device='cuda:0')


Epoch 1, Step 923: Loss = -0.0343, Actor Loss = -0.0565, Critic Loss = 0.0445, Entropy = 0.0961, Returns = 1.4503, Value = 1.4757, mask_loss = 0.2208std_loss = 0.3704:  17%|█▋        | 923/5475 [4:09:44<19:59:34, 15.81s/it]

ratio tensor(2.2285, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5939, device='cuda:0') reward: tensor(1.2954, device='cuda:0') factor: tensor(0.2985, device='cuda:0')
reward_raw: tensor(0.6129, device='cuda:0') reward: tensor(1.3116, device='cuda:0') factor: tensor(0.3013, device='cuda:0')
reward_raw: tensor(0.6107, device='cuda:0') reward: tensor(1.3136, device='cuda:0') factor: tensor(0.2971, device='cuda:0')
reward_raw: tensor(0.5862, device='cuda:0') reward: tensor(1.2912, device='cuda:0') factor: tensor(0.2950, device='cuda:0')
reward_raw: tensor(0.5859, device='cuda:0') reward: tensor(1.2895, device='cuda:0') factor: tensor(0.2965, device='cuda:0')


Epoch 1, Step 924: Loss = 0.1853, Actor Loss = 0.1495, Critic Loss = 0.0718, Entropy = 0.0979, Returns = 1.3633, Value = 1.2806, mask_loss = 0.2652std_loss = 0.3900:  17%|█▋        | 924/5475 [4:09:59<19:37:37, 15.53s/it]  

ratio tensor(0.7041, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5990, device='cuda:0') reward: tensor(1.3803, device='cuda:0') factor: tensor(0.2187, device='cuda:0')
reward_raw: tensor(0.5953, device='cuda:0') reward: tensor(1.3775, device='cuda:0') factor: tensor(0.2178, device='cuda:0')
reward_raw: tensor(0.6052, device='cuda:0') reward: tensor(1.3876, device='cuda:0') factor: tensor(0.2177, device='cuda:0')
reward_raw: tensor(0.6027, device='cuda:0') reward: tensor(1.3842, device='cuda:0') factor: tensor(0.2185, device='cuda:0')
reward_raw: tensor(0.5953, device='cuda:0') reward: tensor(1.3792, device='cuda:0') factor: tensor(0.2161, device='cuda:0')


Epoch 1, Step 925: Loss = -0.1801, Actor Loss = -0.2113, Critic Loss = 0.0626, Entropy = 0.0662, Returns = 1.3455, Value = 1.3661, mask_loss = 0.2105std_loss = 0.3711:  17%|█▋        | 925/5475 [4:10:11<18:06:33, 14.33s/it]

ratio tensor(7.0232, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5403, device='cuda:0') reward: tensor(1.3305, device='cuda:0') factor: tensor(0.2098, device='cuda:0')
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.3459, device='cuda:0') factor: tensor(0.2118, device='cuda:0')
reward_raw: tensor(0.5613, device='cuda:0') reward: tensor(1.3437, device='cuda:0') factor: tensor(0.2176, device='cuda:0')
reward_raw: tensor(0.5470, device='cuda:0') reward: tensor(1.3345, device='cuda:0') factor: tensor(0.2125, device='cuda:0')
reward_raw: tensor(0.5598, device='cuda:0') reward: tensor(1.3504, device='cuda:0') factor: tensor(0.2094, device='cuda:0')


Epoch 1, Step 926: Loss = 0.1148, Actor Loss = 0.0699, Critic Loss = 0.0899, Entropy = 0.0681, Returns = 1.3161, Value = 1.4763, mask_loss = 0.1891std_loss = 0.3535:  17%|█▋        | 926/5475 [4:10:27<18:58:08, 15.01s/it]  

ratio tensor(1.0798, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5630, device='cuda:0') reward: tensor(1.3670, device='cuda:0') factor: tensor(0.1960, device='cuda:0')
reward_raw: tensor(0.5608, device='cuda:0') reward: tensor(1.3614, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.3551, device='cuda:0') factor: tensor(0.1956, device='cuda:0')
reward_raw: tensor(0.5675, device='cuda:0') reward: tensor(1.3726, device='cuda:0') factor: tensor(0.1949, device='cuda:0')
reward_raw: tensor(0.5534, device='cuda:0') reward: tensor(1.3515, device='cuda:0') factor: tensor(0.2019, device='cuda:0')


Epoch 1, Step 927: Loss = 0.0206, Actor Loss = -0.0205, Critic Loss = 0.0825, Entropy = 0.0597, Returns = 1.4358, Value = 1.3597, mask_loss = 0.1964std_loss = 0.3679:  17%|█▋        | 927/5475 [4:10:44<19:41:52, 15.59s/it]

ratio tensor(2.2144, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.3398, device='cuda:0') factor: tensor(0.2023, device='cuda:0')
reward_raw: tensor(0.5247, device='cuda:0') reward: tensor(1.3204, device='cuda:0') factor: tensor(0.2043, device='cuda:0')
reward_raw: tensor(0.5362, device='cuda:0') reward: tensor(1.3340, device='cuda:0') factor: tensor(0.2022, device='cuda:0')
reward_raw: tensor(0.5231, device='cuda:0') reward: tensor(1.3209, device='cuda:0') factor: tensor(0.2022, device='cuda:0')
reward_raw: tensor(0.5318, device='cuda:0') reward: tensor(1.3311, device='cuda:0') factor: tensor(0.2007, device='cuda:0')


Epoch 1, Step 928: Loss = 0.0649, Actor Loss = 0.0311, Critic Loss = 0.0678, Entropy = 0.0625, Returns = 1.3001, Value = 1.3088, mask_loss = 0.2100std_loss = 0.3696:  17%|█▋        | 928/5475 [4:11:03<20:43:18, 16.41s/it] 

ratio tensor(1.3225, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5265, device='cuda:0') reward: tensor(1.3114, device='cuda:0') factor: tensor(0.2151, device='cuda:0')
reward_raw: tensor(0.5072, device='cuda:0') reward: tensor(1.2953, device='cuda:0') factor: tensor(0.2119, device='cuda:0')
reward_raw: tensor(0.5236, device='cuda:0') reward: tensor(1.3071, device='cuda:0') factor: tensor(0.2165, device='cuda:0')
reward_raw: tensor(0.5413, device='cuda:0') reward: tensor(1.3263, device='cuda:0') factor: tensor(0.2150, device='cuda:0')
reward_raw: tensor(0.5276, device='cuda:0') reward: tensor(1.3154, device='cuda:0') factor: tensor(0.2122, device='cuda:0')


Epoch 1, Step 929: Loss = 0.1246, Actor Loss = 0.0729, Critic Loss = 0.1035, Entropy = 0.0532, Returns = 1.3271, Value = 1.2730, mask_loss = 0.1888std_loss = 0.3661:  17%|█▋        | 929/5475 [4:11:14<18:56:23, 15.00s/it]

ratio tensor(0.9290, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4923, device='cuda:0') reward: tensor(1.2910, device='cuda:0') factor: tensor(0.2012, device='cuda:0')
reward_raw: tensor(0.4863, device='cuda:0') reward: tensor(1.2896, device='cuda:0') factor: tensor(0.1967, device='cuda:0')
reward_raw: tensor(0.4869, device='cuda:0') reward: tensor(1.2928, device='cuda:0') factor: tensor(0.1941, device='cuda:0')
reward_raw: tensor(0.4684, device='cuda:0') reward: tensor(1.2739, device='cuda:0') factor: tensor(0.1945, device='cuda:0')
reward_raw: tensor(0.4809, device='cuda:0') reward: tensor(1.2823, device='cuda:0') factor: tensor(0.1987, device='cuda:0')


Epoch 1, Step 930: Loss = 0.0303, Actor Loss = -0.0121, Critic Loss = 0.0849, Entropy = 0.0633, Returns = 1.2765, Value = 1.2586, mask_loss = 0.2242std_loss = 0.3868:  17%|█▋        | 930/5475 [4:11:28<18:19:14, 14.51s/it]

ratio tensor(2.4782, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.2194, device='cuda:0') factor: tensor(0.2680, device='cuda:0')
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.2595, device='cuda:0') factor: tensor(0.2750, device='cuda:0')
reward_raw: tensor(0.5146, device='cuda:0') reward: tensor(1.2477, device='cuda:0') factor: tensor(0.2669, device='cuda:0')
reward_raw: tensor(0.5179, device='cuda:0') reward: tensor(1.2521, device='cuda:0') factor: tensor(0.2658, device='cuda:0')
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.2519, device='cuda:0') factor: tensor(0.2723, device='cuda:0')


Epoch 1, Step 931: Loss = 0.0027, Actor Loss = -0.0280, Critic Loss = 0.0614, Entropy = 0.0791, Returns = 1.2217, Value = 1.2748, mask_loss = 0.2839std_loss = 0.4088:  17%|█▋        | 931/5475 [4:11:42<18:20:58, 14.54s/it]

ratio tensor(0.8495, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.6084, device='cuda:0') reward: tensor(1.4148, device='cuda:0') factor: tensor(0.1936, device='cuda:0')
reward_raw: tensor(0.5903, device='cuda:0') reward: tensor(1.3901, device='cuda:0') factor: tensor(0.2002, device='cuda:0')
reward_raw: tensor(0.5816, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1932, device='cuda:0')
reward_raw: tensor(0.5779, device='cuda:0') reward: tensor(1.3859, device='cuda:0') factor: tensor(0.1920, device='cuda:0')
reward_raw: tensor(0.6056, device='cuda:0') reward: tensor(1.4084, device='cuda:0') factor: tensor(0.1972, device='cuda:0')


Epoch 1, Step 932: Loss = 0.0121, Actor Loss = -0.0149, Critic Loss = 0.0542, Entropy = 0.0534, Returns = 1.3629, Value = 1.3714, mask_loss = 0.1899std_loss = 0.3627:  17%|█▋        | 932/5475 [4:12:05<21:31:17, 17.05s/it]

ratio tensor(0.7996, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.3372, device='cuda:0') factor: tensor(0.1842, device='cuda:0')
reward_raw: tensor(0.5161, device='cuda:0') reward: tensor(1.3345, device='cuda:0') factor: tensor(0.1816, device='cuda:0')
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.3326, device='cuda:0') factor: tensor(0.1870, device='cuda:0')
reward_raw: tensor(0.5221, device='cuda:0') reward: tensor(1.3357, device='cuda:0') factor: tensor(0.1865, device='cuda:0')
reward_raw: tensor(0.5175, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.1880, device='cuda:0')


Epoch 1, Step 933: Loss = -0.0215, Actor Loss = -0.0533, Critic Loss = 0.0638, Entropy = 0.0590, Returns = 1.3091, Value = 1.2640, mask_loss = 0.2022std_loss = 0.3663:  17%|█▋        | 933/5475 [4:12:23<21:45:57, 17.25s/it]

ratio tensor(0.8753, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5727, device='cuda:0') reward: tensor(1.4194, device='cuda:0') factor: tensor(0.1533, device='cuda:0')
reward_raw: tensor(0.5938, device='cuda:0') reward: tensor(1.4381, device='cuda:0') factor: tensor(0.1557, device='cuda:0')
reward_raw: tensor(0.5980, device='cuda:0') reward: tensor(1.4420, device='cuda:0') factor: tensor(0.1559, device='cuda:0')
reward_raw: tensor(0.5843, device='cuda:0') reward: tensor(1.4315, device='cuda:0') factor: tensor(0.1528, device='cuda:0')
reward_raw: tensor(0.5958, device='cuda:0') reward: tensor(1.4459, device='cuda:0') factor: tensor(0.1500, device='cuda:0')


Epoch 1, Step 934: Loss = -0.1042, Actor Loss = -0.1377, Critic Loss = 0.0672, Entropy = 0.0606, Returns = 1.4494, Value = 1.3786, mask_loss = 0.1674std_loss = 0.3404:  17%|█▋        | 934/5475 [4:12:38<20:49:56, 16.52s/it]

ratio tensor(1.7643, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5282, device='cuda:0') reward: tensor(1.2511, device='cuda:0') factor: tensor(0.2771, device='cuda:0')
reward_raw: tensor(0.5044, device='cuda:0') reward: tensor(1.2304, device='cuda:0') factor: tensor(0.2740, device='cuda:0')
reward_raw: tensor(0.5276, device='cuda:0') reward: tensor(1.2483, device='cuda:0') factor: tensor(0.2793, device='cuda:0')
reward_raw: tensor(0.5071, device='cuda:0') reward: tensor(1.2313, device='cuda:0') factor: tensor(0.2758, device='cuda:0')
reward_raw: tensor(0.5245, device='cuda:0') reward: tensor(1.2552, device='cuda:0') factor: tensor(0.2693, device='cuda:0')


Epoch 1, Step 935: Loss = 0.1652, Actor Loss = 0.1239, Critic Loss = 0.0829, Entropy = 0.0945, Returns = 1.2314, Value = 1.1854, mask_loss = 0.3168std_loss = 0.4145:  17%|█▋        | 935/5475 [4:12:55<21:11:39, 16.81s/it]  

ratio tensor(0.5099, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.3644, device='cuda:0') factor: tensor(0.1949, device='cuda:0')
reward_raw: tensor(0.5724, device='cuda:0') reward: tensor(1.3813, device='cuda:0') factor: tensor(0.1912, device='cuda:0')
reward_raw: tensor(0.5690, device='cuda:0') reward: tensor(1.3735, device='cuda:0') factor: tensor(0.1955, device='cuda:0')
reward_raw: tensor(0.5760, device='cuda:0') reward: tensor(1.3810, device='cuda:0') factor: tensor(0.1950, device='cuda:0')
reward_raw: tensor(0.5630, device='cuda:0') reward: tensor(1.3672, device='cuda:0') factor: tensor(0.1958, device='cuda:0')


Epoch 1, Step 936: Loss = -0.0457, Actor Loss = -0.0757, Critic Loss = 0.0601, Entropy = 0.0770, Returns = 1.3832, Value = 1.4000, mask_loss = 0.1895std_loss = 0.3518:  17%|█▋        | 936/5475 [4:13:13<21:34:16, 17.11s/it]

ratio tensor(0.8117, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5398, device='cuda:0') reward: tensor(1.2858, device='cuda:0') factor: tensor(0.2540, device='cuda:0')
reward_raw: tensor(0.5262, device='cuda:0') reward: tensor(1.2675, device='cuda:0') factor: tensor(0.2587, device='cuda:0')
reward_raw: tensor(0.5368, device='cuda:0') reward: tensor(1.2797, device='cuda:0') factor: tensor(0.2572, device='cuda:0')
reward_raw: tensor(0.5422, device='cuda:0') reward: tensor(1.2839, device='cuda:0') factor: tensor(0.2583, device='cuda:0')
reward_raw: tensor(0.5466, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.2612, device='cuda:0')


Epoch 1, Step 937: Loss = -0.0011, Actor Loss = -0.0240, Critic Loss = 0.0459, Entropy = 0.0745, Returns = 1.2758, Value = 1.3884, mask_loss = 0.2348std_loss = 0.3602:  17%|█▋        | 937/5475 [4:13:31<21:54:07, 17.37s/it]

ratio tensor(1.0407, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5426, device='cuda:0') reward: tensor(1.3436, device='cuda:0') factor: tensor(0.1990, device='cuda:0')
reward_raw: tensor(0.5477, device='cuda:0') reward: tensor(1.3462, device='cuda:0') factor: tensor(0.2015, device='cuda:0')
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.3457, device='cuda:0') factor: tensor(0.2047, device='cuda:0')
reward_raw: tensor(0.5552, device='cuda:0') reward: tensor(1.3531, device='cuda:0') factor: tensor(0.2020, device='cuda:0')
reward_raw: tensor(0.5799, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.2034, device='cuda:0')


Epoch 1, Step 938: Loss = 0.0667, Actor Loss = 0.0114, Critic Loss = 0.1109, Entropy = 0.0453, Returns = 1.3628, Value = 1.2599, mask_loss = 0.1887std_loss = 0.3661:  17%|█▋        | 938/5475 [4:13:48<21:52:38, 17.36s/it]  

ratio tensor(1.1332, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6175, device='cuda:0') reward: tensor(1.3850, device='cuda:0') factor: tensor(0.2325, device='cuda:0')
reward_raw: tensor(0.5999, device='cuda:0') reward: tensor(1.3628, device='cuda:0') factor: tensor(0.2371, device='cuda:0')
reward_raw: tensor(0.6151, device='cuda:0') reward: tensor(1.3784, device='cuda:0') factor: tensor(0.2367, device='cuda:0')
reward_raw: tensor(0.6094, device='cuda:0') reward: tensor(1.3764, device='cuda:0') factor: tensor(0.2330, device='cuda:0')
reward_raw: tensor(0.6200, device='cuda:0') reward: tensor(1.3852, device='cuda:0') factor: tensor(0.2347, device='cuda:0')


Epoch 1, Step 939: Loss = -0.0099, Actor Loss = -0.0677, Critic Loss = 0.1157, Entropy = 0.0797, Returns = 1.3950, Value = 1.4899, mask_loss = 0.2465std_loss = 0.3894:  17%|█▋        | 939/5475 [4:14:05<21:39:27, 17.19s/it]

ratio tensor(2.0483, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5743, device='cuda:0') reward: tensor(1.2657, device='cuda:0') factor: tensor(0.3087, device='cuda:0')
reward_raw: tensor(0.5674, device='cuda:0') reward: tensor(1.2589, device='cuda:0') factor: tensor(0.3085, device='cuda:0')
reward_raw: tensor(0.5805, device='cuda:0') reward: tensor(1.2640, device='cuda:0') factor: tensor(0.3164, device='cuda:0')
reward_raw: tensor(0.5787, device='cuda:0') reward: tensor(1.2662, device='cuda:0') factor: tensor(0.3125, device='cuda:0')
reward_raw: tensor(0.5690, device='cuda:0') reward: tensor(1.2637, device='cuda:0') factor: tensor(0.3053, device='cuda:0')


Epoch 1, Step 940: Loss = 0.1340, Actor Loss = 0.0856, Critic Loss = 0.0969, Entropy = 0.0812, Returns = 1.3717, Value = 1.2170, mask_loss = 0.2763std_loss = 0.3873:  17%|█▋        | 940/5475 [4:14:21<21:07:55, 16.78s/it]  

ratio tensor(0.9631, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4950, device='cuda:0') reward: tensor(1.2895, device='cuda:0') factor: tensor(0.2055, device='cuda:0')
reward_raw: tensor(0.4887, device='cuda:0') reward: tensor(1.2807, device='cuda:0') factor: tensor(0.2079, device='cuda:0')
reward_raw: tensor(0.4703, device='cuda:0') reward: tensor(1.2636, device='cuda:0') factor: tensor(0.2067, device='cuda:0')
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.2797, device='cuda:0') factor: tensor(0.2060, device='cuda:0')
reward_raw: tensor(0.4898, device='cuda:0') reward: tensor(1.2843, device='cuda:0') factor: tensor(0.2055, device='cuda:0')


Epoch 1, Step 941: Loss = -0.0206, Actor Loss = -0.0634, Critic Loss = 0.0857, Entropy = 0.0466, Returns = 1.3048, Value = 1.3067, mask_loss = 0.1943std_loss = 0.3493:  17%|█▋        | 940/5475 [4:14:35<21:07:55, 16.78s/it]

ratio tensor(0.7979, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 941: Loss = -0.0206, Actor Loss = -0.0634, Critic Loss = 0.0857, Entropy = 0.0466, Returns = 1.3048, Value = 1.3067, mask_loss = 0.1943std_loss = 0.3493:  17%|█▋        | 941/5475 [4:14:35<20:09:30, 16.01s/it]

reward_raw: tensor(0.5960, device='cuda:0') reward: tensor(1.4048, device='cuda:0') factor: tensor(0.1912, device='cuda:0')
reward_raw: tensor(0.5948, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.1891, device='cuda:0')
reward_raw: tensor(0.5856, device='cuda:0') reward: tensor(1.3922, device='cuda:0') factor: tensor(0.1935, device='cuda:0')
reward_raw: tensor(0.5726, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.1982, device='cuda:0')
reward_raw: tensor(0.6004, device='cuda:0') reward: tensor(1.4026, device='cuda:0') factor: tensor(0.1978, device='cuda:0')


Epoch 1, Step 942: Loss = -0.1034, Actor Loss = -0.1216, Critic Loss = 0.0366, Entropy = 0.0573, Returns = 1.3924, Value = 1.4191, mask_loss = 0.1943std_loss = 0.3622:  17%|█▋        | 942/5475 [4:14:53<20:44:56, 16.48s/it]

ratio tensor(0.9162, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.3338, device='cuda:0') factor: tensor(0.2226, device='cuda:0')
reward_raw: tensor(0.5377, device='cuda:0') reward: tensor(1.3107, device='cuda:0') factor: tensor(0.2270, device='cuda:0')
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.3348, device='cuda:0') factor: tensor(0.2244, device='cuda:0')
reward_raw: tensor(0.5485, device='cuda:0') reward: tensor(1.3222, device='cuda:0') factor: tensor(0.2263, device='cuda:0')
reward_raw: tensor(0.5623, device='cuda:0') reward: tensor(1.3360, device='cuda:0') factor: tensor(0.2262, device='cuda:0')


Epoch 1, Step 943: Loss = 0.1234, Actor Loss = 0.0789, Critic Loss = 0.0892, Entropy = 0.0784, Returns = 1.3244, Value = 1.2662, mask_loss = 0.2311std_loss = 0.3817:  17%|█▋        | 943/5475 [4:15:06<19:35:26, 15.56s/it]  

ratio tensor(0.7402, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5218, device='cuda:0') reward: tensor(1.2842, device='cuda:0') factor: tensor(0.2376, device='cuda:0')
reward_raw: tensor(0.5286, device='cuda:0') reward: tensor(1.2897, device='cuda:0') factor: tensor(0.2389, device='cuda:0')
reward_raw: tensor(0.5233, device='cuda:0') reward: tensor(1.2840, device='cuda:0') factor: tensor(0.2393, device='cuda:0')
reward_raw: tensor(0.5092, device='cuda:0') reward: tensor(1.2715, device='cuda:0') factor: tensor(0.2377, device='cuda:0')
reward_raw: tensor(0.5147, device='cuda:0') reward: tensor(1.2795, device='cuda:0') factor: tensor(0.2352, device='cuda:0')


Epoch 1, Step 944: Loss = -0.0209, Actor Loss = -0.0362, Critic Loss = 0.0307, Entropy = 0.0594, Returns = 1.2796, Value = 1.2825, mask_loss = 0.2224std_loss = 0.3800:  17%|█▋        | 944/5475 [4:15:21<19:18:15, 15.34s/it]

ratio tensor(2.7411, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5022, device='cuda:0') reward: tensor(1.3374, device='cuda:0') factor: tensor(0.1647, device='cuda:0')
reward_raw: tensor(0.4867, device='cuda:0') reward: tensor(1.3165, device='cuda:0') factor: tensor(0.1702, device='cuda:0')
reward_raw: tensor(0.4964, device='cuda:0') reward: tensor(1.3263, device='cuda:0') factor: tensor(0.1701, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.3194, device='cuda:0') factor: tensor(0.1717, device='cuda:0')
reward_raw: tensor(0.4789, device='cuda:0') reward: tensor(1.3057, device='cuda:0') factor: tensor(0.1732, device='cuda:0')


Epoch 1, Step 945: Loss = -0.1122, Actor Loss = -0.1489, Critic Loss = 0.0735, Entropy = 0.0540, Returns = 1.4230, Value = 1.2952, mask_loss = 0.1673std_loss = 0.3472:  17%|█▋        | 945/5475 [4:15:38<19:55:41, 15.84s/it]

ratio tensor(0.9974, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5419, device='cuda:0') reward: tensor(1.3398, device='cuda:0') factor: tensor(0.2021, device='cuda:0')
reward_raw: tensor(0.5260, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.2018, device='cuda:0')
reward_raw: tensor(0.5283, device='cuda:0') reward: tensor(1.3336, device='cuda:0') factor: tensor(0.1947, device='cuda:0')
reward_raw: tensor(0.5189, device='cuda:0') reward: tensor(1.3212, device='cuda:0') factor: tensor(0.1977, device='cuda:0')
reward_raw: tensor(0.5165, device='cuda:0') reward: tensor(1.3183, device='cuda:0') factor: tensor(0.1982, device='cuda:0')


Epoch 1, Step 946: Loss = 0.2271, Actor Loss = 0.1723, Critic Loss = 0.1097, Entropy = 0.0448, Returns = 1.3136, Value = 1.4449, mask_loss = 0.1662std_loss = 0.3453:  17%|█▋        | 946/5475 [4:15:50<18:28:22, 14.68s/it]  

ratio tensor(0.9163, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5792, device='cuda:0') reward: tensor(1.3577, device='cuda:0') factor: tensor(0.2215, device='cuda:0')
reward_raw: tensor(0.5556, device='cuda:0') reward: tensor(1.3341, device='cuda:0') factor: tensor(0.2216, device='cuda:0')
reward_raw: tensor(0.5750, device='cuda:0') reward: tensor(1.3508, device='cuda:0') factor: tensor(0.2242, device='cuda:0')
reward_raw: tensor(0.5656, device='cuda:0') reward: tensor(1.3422, device='cuda:0') factor: tensor(0.2233, device='cuda:0')
reward_raw: tensor(0.5755, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.2238, device='cuda:0')


Epoch 1, Step 947: Loss = 0.1188, Actor Loss = 0.0907, Critic Loss = 0.0563, Entropy = 0.0573, Returns = 1.2772, Value = 1.2442, mask_loss = 0.2200std_loss = 0.3717:  17%|█▋        | 947/5475 [4:16:05<18:28:34, 14.69s/it]

ratio tensor(0.5145, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5888, device='cuda:0') reward: tensor(1.3942, device='cuda:0') factor: tensor(0.1946, device='cuda:0')
reward_raw: tensor(0.5745, device='cuda:0') reward: tensor(1.3806, device='cuda:0') factor: tensor(0.1939, device='cuda:0')
reward_raw: tensor(0.5839, device='cuda:0') reward: tensor(1.3880, device='cuda:0') factor: tensor(0.1959, device='cuda:0')
reward_raw: tensor(0.5897, device='cuda:0') reward: tensor(1.3938, device='cuda:0') factor: tensor(0.1959, device='cuda:0')
reward_raw: tensor(0.5862, device='cuda:0') reward: tensor(1.3887, device='cuda:0') factor: tensor(0.1975, device='cuda:0')


Epoch 1, Step 948: Loss = -0.2281, Actor Loss = -0.2461, Critic Loss = 0.0360, Entropy = 0.0545, Returns = 1.3758, Value = 1.3975, mask_loss = 0.2029std_loss = 0.3662:  17%|█▋        | 948/5475 [4:16:26<21:03:19, 16.74s/it]

ratio tensor(1.7285, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5725, device='cuda:0') reward: tensor(1.3294, device='cuda:0') factor: tensor(0.2431, device='cuda:0')
reward_raw: tensor(0.5722, device='cuda:0') reward: tensor(1.3297, device='cuda:0') factor: tensor(0.2426, device='cuda:0')
reward_raw: tensor(0.5800, device='cuda:0') reward: tensor(1.3411, device='cuda:0') factor: tensor(0.2389, device='cuda:0')
reward_raw: tensor(0.5680, device='cuda:0') reward: tensor(1.3277, device='cuda:0') factor: tensor(0.2403, device='cuda:0')
reward_raw: tensor(0.5674, device='cuda:0') reward: tensor(1.3223, device='cuda:0') factor: tensor(0.2451, device='cuda:0')


Epoch 1, Step 949: Loss = 0.1270, Actor Loss = 0.0986, Critic Loss = 0.0569, Entropy = 0.0642, Returns = 1.3285, Value = 1.3998, mask_loss = 0.2139std_loss = 0.3777:  17%|█▋        | 949/5475 [4:16:40<19:54:22, 15.83s/it]  

ratio tensor(1.5682, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6111, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.2054, device='cuda:0')
reward_raw: tensor(0.6414, device='cuda:0') reward: tensor(1.4307, device='cuda:0') factor: tensor(0.2108, device='cuda:0')
reward_raw: tensor(0.6444, device='cuda:0') reward: tensor(1.4344, device='cuda:0') factor: tensor(0.2100, device='cuda:0')
reward_raw: tensor(0.6016, device='cuda:0') reward: tensor(1.3932, device='cuda:0') factor: tensor(0.2084, device='cuda:0')
reward_raw: tensor(0.6309, device='cuda:0') reward: tensor(1.4172, device='cuda:0') factor: tensor(0.2136, device='cuda:0')


Epoch 1, Step 950: Loss = 0.0740, Actor Loss = -0.0284, Critic Loss = 0.2050, Entropy = 0.0695, Returns = 1.3939, Value = 1.4088, mask_loss = 0.2155std_loss = 0.3747:  17%|█▋        | 950/5475 [4:16:57<20:30:53, 16.32s/it]

ratio tensor(1.5164, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5861, device='cuda:0') reward: tensor(1.3729, device='cuda:0') factor: tensor(0.2131, device='cuda:0')
reward_raw: tensor(0.6137, device='cuda:0') reward: tensor(1.4018, device='cuda:0') factor: tensor(0.2119, device='cuda:0')
reward_raw: tensor(0.5795, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.2121, device='cuda:0')
reward_raw: tensor(0.5886, device='cuda:0') reward: tensor(1.3755, device='cuda:0') factor: tensor(0.2131, device='cuda:0')
reward_raw: tensor(0.5963, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.2110, device='cuda:0')


Epoch 1, Step 951: Loss = -0.0427, Actor Loss = -0.0855, Critic Loss = 0.0856, Entropy = 0.0654, Returns = 1.4430, Value = 1.4811, mask_loss = 0.2178std_loss = 0.3696:  17%|█▋        | 951/5475 [4:17:16<21:15:37, 16.92s/it]

ratio tensor(1.9936, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5934, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.2476, device='cuda:0')
reward_raw: tensor(0.5732, device='cuda:0') reward: tensor(1.3218, device='cuda:0') factor: tensor(0.2514, device='cuda:0')
reward_raw: tensor(0.5724, device='cuda:0') reward: tensor(1.3215, device='cuda:0') factor: tensor(0.2508, device='cuda:0')
reward_raw: tensor(0.5787, device='cuda:0') reward: tensor(1.3237, device='cuda:0') factor: tensor(0.2550, device='cuda:0')
reward_raw: tensor(0.5810, device='cuda:0') reward: tensor(1.3306, device='cuda:0') factor: tensor(0.2503, device='cuda:0')


Epoch 1, Step 952: Loss = 0.1883, Actor Loss = 0.1588, Critic Loss = 0.0593, Entropy = 0.1191, Returns = 1.2513, Value = 1.1550, mask_loss = 0.3090std_loss = 0.4035:  17%|█▋        | 952/5475 [4:17:27<19:17:42, 15.36s/it]  

ratio tensor(0.4559, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4397, device='cuda:0') reward: tensor(1.2750, device='cuda:0') factor: tensor(0.1647, device='cuda:0')
reward_raw: tensor(0.4157, device='cuda:0') reward: tensor(1.2512, device='cuda:0') factor: tensor(0.1645, device='cuda:0')
reward_raw: tensor(0.4186, device='cuda:0') reward: tensor(1.2542, device='cuda:0') factor: tensor(0.1644, device='cuda:0')
reward_raw: tensor(0.4254, device='cuda:0') reward: tensor(1.2587, device='cuda:0') factor: tensor(0.1666, device='cuda:0')
reward_raw: tensor(0.4309, device='cuda:0') reward: tensor(1.2663, device='cuda:0') factor: tensor(0.1646, device='cuda:0')


Epoch 1, Step 953: Loss = 0.0363, Actor Loss = 0.0214, Critic Loss = 0.0299, Entropy = 0.0483, Returns = 1.1744, Value = 1.1690, mask_loss = 0.1655std_loss = 0.3482:  17%|█▋        | 953/5475 [4:17:45<20:10:52, 16.07s/it]

ratio tensor(0.9374, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5522, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.2399, device='cuda:0')
reward_raw: tensor(0.5391, device='cuda:0') reward: tensor(1.2990, device='cuda:0') factor: tensor(0.2401, device='cuda:0')
reward_raw: tensor(0.5436, device='cuda:0') reward: tensor(1.3064, device='cuda:0') factor: tensor(0.2372, device='cuda:0')
reward_raw: tensor(0.5700, device='cuda:0') reward: tensor(1.3311, device='cuda:0') factor: tensor(0.2390, device='cuda:0')
reward_raw: tensor(0.5558, device='cuda:0') reward: tensor(1.3240, device='cuda:0') factor: tensor(0.2319, device='cuda:0')


Epoch 1, Step 954: Loss = -0.1951, Actor Loss = -0.2163, Critic Loss = 0.0426, Entropy = 0.0612, Returns = 1.3152, Value = 1.3952, mask_loss = 0.2148std_loss = 0.3788:  17%|█▋        | 954/5475 [4:17:58<18:51:35, 15.02s/it]

ratio tensor(2.3776, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5278, device='cuda:0') reward: tensor(1.3613, device='cuda:0') factor: tensor(0.1665, device='cuda:0')
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.1670, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.3735, device='cuda:0') factor: tensor(0.1642, device='cuda:0')
reward_raw: tensor(0.5372, device='cuda:0') reward: tensor(1.3729, device='cuda:0') factor: tensor(0.1643, device='cuda:0')
reward_raw: tensor(0.5297, device='cuda:0') reward: tensor(1.3641, device='cuda:0') factor: tensor(0.1656, device='cuda:0')


Epoch 1, Step 955: Loss = 0.0899, Actor Loss = 0.0726, Critic Loss = 0.0347, Entropy = 0.0534, Returns = 1.3644, Value = 1.3565, mask_loss = 0.1948std_loss = 0.3627:  17%|█▋        | 955/5475 [4:18:15<19:54:30, 15.86s/it]  

ratio tensor(0.8673, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.2847, device='cuda:0') factor: tensor(0.2082, device='cuda:0')
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.3001, device='cuda:0') factor: tensor(0.2109, device='cuda:0')
reward_raw: tensor(0.5092, device='cuda:0') reward: tensor(1.2989, device='cuda:0') factor: tensor(0.2103, device='cuda:0')
reward_raw: tensor(0.5090, device='cuda:0') reward: tensor(1.3006, device='cuda:0') factor: tensor(0.2084, device='cuda:0')
reward_raw: tensor(0.5076, device='cuda:0') reward: tensor(1.2981, device='cuda:0') factor: tensor(0.2095, device='cuda:0')


Epoch 1, Step 956: Loss = 0.0874, Actor Loss = 0.0525, Critic Loss = 0.0698, Entropy = 0.0580, Returns = 1.2966, Value = 1.2644, mask_loss = 0.2005std_loss = 0.3608:  17%|█▋        | 956/5475 [4:18:29<18:56:20, 15.09s/it]

ratio tensor(0.7154, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5727, device='cuda:0') reward: tensor(1.3358, device='cuda:0') factor: tensor(0.2369, device='cuda:0')
reward_raw: tensor(0.5713, device='cuda:0') reward: tensor(1.3373, device='cuda:0') factor: tensor(0.2340, device='cuda:0')
reward_raw: tensor(0.5745, device='cuda:0') reward: tensor(1.3337, device='cuda:0') factor: tensor(0.2408, device='cuda:0')
reward_raw: tensor(0.5754, device='cuda:0') reward: tensor(1.3332, device='cuda:0') factor: tensor(0.2422, device='cuda:0')
reward_raw: tensor(0.5693, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.2310, device='cuda:0')


Epoch 1, Step 957: Loss = -0.0203, Actor Loss = -0.0431, Critic Loss = 0.0458, Entropy = 0.0622, Returns = 1.3381, Value = 1.3059, mask_loss = 0.2329std_loss = 0.3776:  17%|█▋        | 957/5475 [4:18:42<18:10:05, 14.48s/it]

ratio tensor(1.1785, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5253, device='cuda:0') reward: tensor(1.3342, device='cuda:0') factor: tensor(0.1911, device='cuda:0')
reward_raw: tensor(0.5258, device='cuda:0') reward: tensor(1.3385, device='cuda:0') factor: tensor(0.1873, device='cuda:0')
reward_raw: tensor(0.5228, device='cuda:0') reward: tensor(1.3324, device='cuda:0') factor: tensor(0.1904, device='cuda:0')
reward_raw: tensor(0.5259, device='cuda:0') reward: tensor(1.3366, device='cuda:0') factor: tensor(0.1893, device='cuda:0')
reward_raw: tensor(0.5181, device='cuda:0') reward: tensor(1.3307, device='cuda:0') factor: tensor(0.1873, device='cuda:0')


Epoch 1, Step 958: Loss = -0.1530, Actor Loss = -0.1798, Critic Loss = 0.0536, Entropy = 0.0594, Returns = 1.3411, Value = 1.2597, mask_loss = 0.1874std_loss = 0.3612:  17%|█▋        | 958/5475 [4:18:57<18:16:58, 14.57s/it]

ratio tensor(8.1612, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4488, device='cuda:0') reward: tensor(1.2400, device='cuda:0') factor: tensor(0.2088, device='cuda:0')
reward_raw: tensor(0.4393, device='cuda:0') reward: tensor(1.2269, device='cuda:0') factor: tensor(0.2124, device='cuda:0')
reward_raw: tensor(0.4540, device='cuda:0') reward: tensor(1.2426, device='cuda:0') factor: tensor(0.2114, device='cuda:0')
reward_raw: tensor(0.4340, device='cuda:0') reward: tensor(1.2270, device='cuda:0') factor: tensor(0.2070, device='cuda:0')
reward_raw: tensor(0.4383, device='cuda:0') reward: tensor(1.2285, device='cuda:0') factor: tensor(0.2099, device='cuda:0')


Epoch 1, Step 959: Loss = 0.2494, Actor Loss = 0.1984, Critic Loss = 0.1021, Entropy = 0.0646, Returns = 1.1644, Value = 1.1179, mask_loss = 0.2095std_loss = 0.3649:  18%|█▊        | 959/5475 [4:19:11<18:04:38, 14.41s/it]  

ratio tensor(0.6563, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5684, device='cuda:0') reward: tensor(1.3888, device='cuda:0') factor: tensor(0.1795, device='cuda:0')
reward_raw: tensor(0.5558, device='cuda:0') reward: tensor(1.3789, device='cuda:0') factor: tensor(0.1769, device='cuda:0')
reward_raw: tensor(0.5553, device='cuda:0') reward: tensor(1.3743, device='cuda:0') factor: tensor(0.1810, device='cuda:0')
reward_raw: tensor(0.5438, device='cuda:0') reward: tensor(1.3613, device='cuda:0') factor: tensor(0.1826, device='cuda:0')
reward_raw: tensor(0.5558, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.1793, device='cuda:0')


Epoch 1, Step 960: Loss = -0.1174, Actor Loss = -0.1399, Critic Loss = 0.0452, Entropy = 0.0623, Returns = 1.3817, Value = 1.4092, mask_loss = 0.1754std_loss = 0.3402:  18%|█▊        | 960/5475 [4:19:28<19:21:26, 15.43s/it]

ratio tensor(1.8220, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5911, device='cuda:0') reward: tensor(1.4107, device='cuda:0') factor: tensor(0.1804, device='cuda:0')
reward_raw: tensor(0.5956, device='cuda:0') reward: tensor(1.4093, device='cuda:0') factor: tensor(0.1863, device='cuda:0')
reward_raw: tensor(0.6196, device='cuda:0') reward: tensor(1.4352, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.5976, device='cuda:0') reward: tensor(1.4099, device='cuda:0') factor: tensor(0.1877, device='cuda:0')
reward_raw: tensor(0.6082, device='cuda:0') reward: tensor(1.4273, device='cuda:0') factor: tensor(0.1808, device='cuda:0')


Epoch 1, Step 961: Loss = -0.0845, Actor Loss = -0.1287, Critic Loss = 0.0885, Entropy = 0.0643, Returns = 1.4061, Value = 1.5430, mask_loss = 0.1900std_loss = 0.3556:  18%|█▊        | 960/5475 [4:19:42<19:21:26, 15.43s/it]

ratio tensor(1.6575, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 961: Loss = -0.0845, Actor Loss = -0.1287, Critic Loss = 0.0885, Entropy = 0.0643, Returns = 1.4061, Value = 1.5430, mask_loss = 0.1900std_loss = 0.3556:  18%|█▊        | 961/5475 [4:19:42<18:40:54, 14.90s/it]

reward_raw: tensor(0.5740, device='cuda:0') reward: tensor(1.3672, device='cuda:0') factor: tensor(0.2068, device='cuda:0')
reward_raw: tensor(0.5788, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.2054, device='cuda:0')
reward_raw: tensor(0.5814, device='cuda:0') reward: tensor(1.3735, device='cuda:0') factor: tensor(0.2079, device='cuda:0')
reward_raw: tensor(0.5899, device='cuda:0') reward: tensor(1.3793, device='cuda:0') factor: tensor(0.2105, device='cuda:0')
reward_raw: tensor(0.5805, device='cuda:0') reward: tensor(1.3739, device='cuda:0') factor: tensor(0.2066, device='cuda:0')


Epoch 1, Step 962: Loss = 0.2080, Actor Loss = 0.1572, Critic Loss = 0.1018, Entropy = 0.0671, Returns = 1.3989, Value = 1.3401, mask_loss = 0.2153std_loss = 0.3750:  18%|█▊        | 962/5475 [4:19:55<17:51:25, 14.24s/it]  

ratio tensor(0.6041, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5385, device='cuda:0') reward: tensor(1.2987, device='cuda:0') factor: tensor(0.2398, device='cuda:0')
reward_raw: tensor(0.5259, device='cuda:0') reward: tensor(1.2852, device='cuda:0') factor: tensor(0.2407, device='cuda:0')
reward_raw: tensor(0.5541, device='cuda:0') reward: tensor(1.3144, device='cuda:0') factor: tensor(0.2397, device='cuda:0')
reward_raw: tensor(0.5265, device='cuda:0') reward: tensor(1.2777, device='cuda:0') factor: tensor(0.2488, device='cuda:0')
reward_raw: tensor(0.5429, device='cuda:0') reward: tensor(1.3017, device='cuda:0') factor: tensor(0.2412, device='cuda:0')


Epoch 1, Step 963: Loss = 0.1629, Actor Loss = 0.1221, Critic Loss = 0.0816, Entropy = 0.0748, Returns = 1.2557, Value = 1.2760, mask_loss = 0.2424std_loss = 0.3741:  18%|█▊        | 963/5475 [4:20:10<18:16:07, 14.58s/it]

ratio tensor(1.1575, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4737, device='cuda:0') reward: tensor(1.3116, device='cuda:0') factor: tensor(0.1621, device='cuda:0')
reward_raw: tensor(0.4512, device='cuda:0') reward: tensor(1.2950, device='cuda:0') factor: tensor(0.1562, device='cuda:0')
reward_raw: tensor(0.4790, device='cuda:0') reward: tensor(1.3166, device='cuda:0') factor: tensor(0.1624, device='cuda:0')
reward_raw: tensor(0.4703, device='cuda:0') reward: tensor(1.3096, device='cuda:0') factor: tensor(0.1607, device='cuda:0')
reward_raw: tensor(0.4668, device='cuda:0') reward: tensor(1.3015, device='cuda:0') factor: tensor(0.1653, device='cuda:0')


Epoch 1, Step 964: Loss = 0.0239, Actor Loss = -0.0132, Critic Loss = 0.0743, Entropy = 0.0417, Returns = 1.3616, Value = 1.4567, mask_loss = 0.1642std_loss = 0.3437:  18%|█▊        | 964/5475 [4:20:30<20:12:10, 16.12s/it]

ratio tensor(1.1246, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5149, device='cuda:0') reward: tensor(1.3496, device='cuda:0') factor: tensor(0.1653, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.3428, device='cuda:0') factor: tensor(0.1666, device='cuda:0')
reward_raw: tensor(0.5028, device='cuda:0') reward: tensor(1.3334, device='cuda:0') factor: tensor(0.1695, device='cuda:0')
reward_raw: tensor(0.5183, device='cuda:0') reward: tensor(1.3478, device='cuda:0') factor: tensor(0.1705, device='cuda:0')
reward_raw: tensor(0.4982, device='cuda:0') reward: tensor(1.3279, device='cuda:0') factor: tensor(0.1703, device='cuda:0')


Epoch 1, Step 965: Loss = 0.0600, Actor Loss = 0.0046, Critic Loss = 0.1109, Entropy = 0.0623, Returns = 1.2762, Value = 1.4258, mask_loss = 0.1800std_loss = 0.3505:  18%|█▊        | 965/5475 [4:20:50<21:35:44, 17.24s/it] 

ratio tensor(1.1022, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4840, device='cuda:0') reward: tensor(1.3099, device='cuda:0') factor: tensor(0.1741, device='cuda:0')
reward_raw: tensor(0.4662, device='cuda:0') reward: tensor(1.2943, device='cuda:0') factor: tensor(0.1719, device='cuda:0')
reward_raw: tensor(0.4777, device='cuda:0') reward: tensor(1.3080, device='cuda:0') factor: tensor(0.1697, device='cuda:0')
reward_raw: tensor(0.4753, device='cuda:0') reward: tensor(1.3015, device='cuda:0') factor: tensor(0.1739, device='cuda:0')
reward_raw: tensor(0.4817, device='cuda:0') reward: tensor(1.3065, device='cuda:0') factor: tensor(0.1752, device='cuda:0')


Epoch 1, Step 966: Loss = 0.2819, Actor Loss = 0.2304, Critic Loss = 0.1030, Entropy = 0.0701, Returns = 1.2924, Value = 1.3535, mask_loss = 0.1681std_loss = 0.3372:  18%|█▊        | 966/5475 [4:21:07<21:40:15, 17.30s/it]

ratio tensor(0.7048, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5097, device='cuda:0') reward: tensor(1.3175, device='cuda:0') factor: tensor(0.1922, device='cuda:0')
reward_raw: tensor(0.5039, device='cuda:0') reward: tensor(1.3120, device='cuda:0') factor: tensor(0.1919, device='cuda:0')
reward_raw: tensor(0.5015, device='cuda:0') reward: tensor(1.3121, device='cuda:0') factor: tensor(0.1893, device='cuda:0')
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.3165, device='cuda:0') factor: tensor(0.1850, device='cuda:0')
reward_raw: tensor(0.5062, device='cuda:0') reward: tensor(1.3142, device='cuda:0') factor: tensor(0.1921, device='cuda:0')


Epoch 1, Step 967: Loss = -0.1654, Actor Loss = -0.2062, Critic Loss = 0.0819, Entropy = 0.0643, Returns = 1.3040, Value = 1.3295, mask_loss = 0.1939std_loss = 0.3625:  18%|█▊        | 967/5475 [4:21:28<22:55:25, 18.31s/it]

ratio tensor(1.5396, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5391, device='cuda:0') reward: tensor(1.3601, device='cuda:0') factor: tensor(0.1791, device='cuda:0')
reward_raw: tensor(0.5290, device='cuda:0') reward: tensor(1.3491, device='cuda:0') factor: tensor(0.1799, device='cuda:0')
reward_raw: tensor(0.5392, device='cuda:0') reward: tensor(1.3602, device='cuda:0') factor: tensor(0.1790, device='cuda:0')
reward_raw: tensor(0.5421, device='cuda:0') reward: tensor(1.3647, device='cuda:0') factor: tensor(0.1774, device='cuda:0')
reward_raw: tensor(0.5236, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.1813, device='cuda:0')


Epoch 1, Step 968: Loss = 0.0208, Actor Loss = -0.0279, Critic Loss = 0.0976, Entropy = 0.0637, Returns = 1.3854, Value = 1.2958, mask_loss = 0.1819std_loss = 0.3439:  18%|█▊        | 968/5475 [4:21:43<21:53:24, 17.49s/it] 

ratio tensor(0.9608, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5587, device='cuda:0') reward: tensor(1.4005, device='cuda:0') factor: tensor(0.1582, device='cuda:0')
reward_raw: tensor(0.5632, device='cuda:0') reward: tensor(1.4071, device='cuda:0') factor: tensor(0.1561, device='cuda:0')
reward_raw: tensor(0.5545, device='cuda:0') reward: tensor(1.4003, device='cuda:0') factor: tensor(0.1542, device='cuda:0')
reward_raw: tensor(0.5642, device='cuda:0') reward: tensor(1.4085, device='cuda:0') factor: tensor(0.1557, device='cuda:0')
reward_raw: tensor(0.5538, device='cuda:0') reward: tensor(1.4061, device='cuda:0') factor: tensor(0.1476, device='cuda:0')


Epoch 1, Step 969: Loss = 0.1126, Actor Loss = 0.0778, Critic Loss = 0.0696, Entropy = 0.0487, Returns = 1.3890, Value = 1.3933, mask_loss = 0.1561std_loss = 0.3336:  18%|█▊        | 969/5475 [4:22:01<21:50:39, 17.45s/it] 

ratio tensor(1.9950, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5363, device='cuda:0') reward: tensor(1.3444, device='cuda:0') factor: tensor(0.1919, device='cuda:0')
reward_raw: tensor(0.5298, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.1948, device='cuda:0')
reward_raw: tensor(0.5070, device='cuda:0') reward: tensor(1.3104, device='cuda:0') factor: tensor(0.1966, device='cuda:0')
reward_raw: tensor(0.5251, device='cuda:0') reward: tensor(1.3306, device='cuda:0') factor: tensor(0.1945, device='cuda:0')
reward_raw: tensor(0.5156, device='cuda:0') reward: tensor(1.3219, device='cuda:0') factor: tensor(0.1937, device='cuda:0')


Epoch 1, Step 970: Loss = -0.1288, Actor Loss = -0.1637, Critic Loss = 0.0698, Entropy = 0.0625, Returns = 1.3749, Value = 1.3543, mask_loss = 0.1782std_loss = 0.3518:  18%|█▊        | 970/5475 [4:22:17<21:11:29, 16.93s/it]

ratio tensor(0.9634, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5695, device='cuda:0') reward: tensor(1.3801, device='cuda:0') factor: tensor(0.1894, device='cuda:0')
reward_raw: tensor(0.5720, device='cuda:0') reward: tensor(1.3832, device='cuda:0') factor: tensor(0.1888, device='cuda:0')
reward_raw: tensor(0.5744, device='cuda:0') reward: tensor(1.3863, device='cuda:0') factor: tensor(0.1881, device='cuda:0')
reward_raw: tensor(0.5772, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.1900, device='cuda:0')
reward_raw: tensor(0.5640, device='cuda:0') reward: tensor(1.3735, device='cuda:0') factor: tensor(0.1906, device='cuda:0')


Epoch 1, Step 971: Loss = -0.0185, Actor Loss = -0.1180, Critic Loss = 0.1991, Entropy = 0.0585, Returns = 1.2782, Value = 1.2056, mask_loss = 0.2135std_loss = 0.3606:  18%|█▊        | 971/5475 [4:22:32<20:36:01, 16.47s/it]

ratio tensor(0.8760, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4714, device='cuda:0') reward: tensor(1.2903, device='cuda:0') factor: tensor(0.1811, device='cuda:0')
reward_raw: tensor(0.4701, device='cuda:0') reward: tensor(1.2865, device='cuda:0') factor: tensor(0.1836, device='cuda:0')
reward_raw: tensor(0.4671, device='cuda:0') reward: tensor(1.2879, device='cuda:0') factor: tensor(0.1791, device='cuda:0')
reward_raw: tensor(0.4689, device='cuda:0') reward: tensor(1.2885, device='cuda:0') factor: tensor(0.1804, device='cuda:0')
reward_raw: tensor(0.4658, device='cuda:0') reward: tensor(1.2851, device='cuda:0') factor: tensor(0.1807, device='cuda:0')


Epoch 1, Step 972: Loss = 0.2462, Actor Loss = 0.2215, Critic Loss = 0.0495, Entropy = 0.0535, Returns = 1.2182, Value = 1.2995, mask_loss = 0.1703std_loss = 0.3324:  18%|█▊        | 972/5475 [4:22:50<21:11:05, 16.94s/it]  

ratio tensor(0.4214, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5354, device='cuda:0') reward: tensor(1.3942, device='cuda:0') factor: tensor(0.1412, device='cuda:0')
reward_raw: tensor(0.5195, device='cuda:0') reward: tensor(1.3773, device='cuda:0') factor: tensor(0.1422, device='cuda:0')
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.3628, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.3829, device='cuda:0') factor: tensor(0.1372, device='cuda:0')
reward_raw: tensor(0.5455, device='cuda:0') reward: tensor(1.4059, device='cuda:0') factor: tensor(0.1396, device='cuda:0')


Epoch 1, Step 973: Loss = 0.0723, Actor Loss = 0.0277, Critic Loss = 0.0895, Entropy = 0.0653, Returns = 1.3287, Value = 1.4294, mask_loss = 0.1587std_loss = 0.3289:  18%|█▊        | 973/5475 [4:23:13<23:17:25, 18.62s/it]

ratio tensor(1.0833, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.3096, device='cuda:0') factor: tensor(0.2019, device='cuda:0')
reward_raw: tensor(0.4751, device='cuda:0') reward: tensor(1.2766, device='cuda:0') factor: tensor(0.1985, device='cuda:0')
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.2984, device='cuda:0') factor: tensor(0.2007, device='cuda:0')
reward_raw: tensor(0.5046, device='cuda:0') reward: tensor(1.2986, device='cuda:0') factor: tensor(0.2060, device='cuda:0')
reward_raw: tensor(0.5051, device='cuda:0') reward: tensor(1.3034, device='cuda:0') factor: tensor(0.2018, device='cuda:0')


Epoch 1, Step 974: Loss = -0.0295, Actor Loss = -0.0890, Critic Loss = 0.1192, Entropy = 0.0653, Returns = 1.2999, Value = 1.2099, mask_loss = 0.1831std_loss = 0.3425:  18%|█▊        | 974/5475 [4:23:28<22:07:13, 17.69s/it]

ratio tensor(1.1908, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5107, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.1617, device='cuda:0')
reward_raw: tensor(0.5018, device='cuda:0') reward: tensor(1.3359, device='cuda:0') factor: tensor(0.1658, device='cuda:0')
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.3360, device='cuda:0') factor: tensor(0.1681, device='cuda:0')
reward_raw: tensor(0.5137, device='cuda:0') reward: tensor(1.3479, device='cuda:0') factor: tensor(0.1657, device='cuda:0')
reward_raw: tensor(0.5024, device='cuda:0') reward: tensor(1.3360, device='cuda:0') factor: tensor(0.1663, device='cuda:0')


Epoch 1, Step 975: Loss = -0.0999, Actor Loss = -0.1205, Critic Loss = 0.0414, Entropy = 0.0586, Returns = 1.3492, Value = 1.3348, mask_loss = 0.1612std_loss = 0.3349:  18%|█▊        | 975/5475 [4:23:47<22:40:04, 18.13s/it]

ratio tensor(1.6350, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5367, device='cuda:0') reward: tensor(1.3274, device='cuda:0') factor: tensor(0.2093, device='cuda:0')
reward_raw: tensor(0.5287, device='cuda:0') reward: tensor(1.3226, device='cuda:0') factor: tensor(0.2061, device='cuda:0')
reward_raw: tensor(0.5497, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.2039, device='cuda:0')
reward_raw: tensor(0.5184, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.2028, device='cuda:0')
reward_raw: tensor(0.5444, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.2123, device='cuda:0')


Epoch 1, Step 976: Loss = 0.2019, Actor Loss = 0.1840, Critic Loss = 0.0359, Entropy = 0.0728, Returns = 1.2261, Value = 1.2855, mask_loss = 0.2201std_loss = 0.3743:  18%|█▊        | 976/5475 [4:24:08<23:37:39, 18.91s/it]  

ratio tensor(1.0724, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.3560, device='cuda:0') factor: tensor(0.2015, device='cuda:0')
reward_raw: tensor(0.5594, device='cuda:0') reward: tensor(1.3652, device='cuda:0') factor: tensor(0.1943, device='cuda:0')
reward_raw: tensor(0.5523, device='cuda:0') reward: tensor(1.3601, device='cuda:0') factor: tensor(0.1922, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.3511, device='cuda:0') factor: tensor(0.2009, device='cuda:0')
reward_raw: tensor(0.5704, device='cuda:0') reward: tensor(1.3731, device='cuda:0') factor: tensor(0.1973, device='cuda:0')


Epoch 1, Step 977: Loss = -0.0643, Actor Loss = -0.1078, Critic Loss = 0.0870, Entropy = 0.0679, Returns = 1.4719, Value = 1.3794, mask_loss = 0.1788std_loss = 0.3477:  18%|█▊        | 977/5475 [4:24:23<22:18:41, 17.86s/it]

ratio tensor(1.2742, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5751, device='cuda:0') reward: tensor(1.3706, device='cuda:0') factor: tensor(0.2045, device='cuda:0')
reward_raw: tensor(0.5595, device='cuda:0') reward: tensor(1.3572, device='cuda:0') factor: tensor(0.2024, device='cuda:0')
reward_raw: tensor(0.5783, device='cuda:0') reward: tensor(1.3792, device='cuda:0') factor: tensor(0.1992, device='cuda:0')
reward_raw: tensor(0.5698, device='cuda:0') reward: tensor(1.3661, device='cuda:0') factor: tensor(0.2037, device='cuda:0')
reward_raw: tensor(0.5706, device='cuda:0') reward: tensor(1.3626, device='cuda:0') factor: tensor(0.2080, device='cuda:0')


Epoch 1, Step 978: Loss = 0.0280, Actor Loss = -0.0125, Critic Loss = 0.0811, Entropy = 0.0762, Returns = 1.3006, Value = 1.4910, mask_loss = 0.2048std_loss = 0.3579:  18%|█▊        | 978/5475 [4:24:36<20:23:07, 16.32s/it] 

ratio tensor(1.4631, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6742, device='cuda:0') reward: tensor(1.4911, device='cuda:0') factor: tensor(0.1831, device='cuda:0')
reward_raw: tensor(0.6706, device='cuda:0') reward: tensor(1.4796, device='cuda:0') factor: tensor(0.1910, device='cuda:0')
reward_raw: tensor(0.6682, device='cuda:0') reward: tensor(1.4820, device='cuda:0') factor: tensor(0.1862, device='cuda:0')
reward_raw: tensor(0.6792, device='cuda:0') reward: tensor(1.4944, device='cuda:0') factor: tensor(0.1849, device='cuda:0')
reward_raw: tensor(0.7068, device='cuda:0') reward: tensor(1.5172, device='cuda:0') factor: tensor(0.1896, device='cuda:0')


Epoch 1, Step 979: Loss = -0.0466, Actor Loss = -0.0646, Critic Loss = 0.0361, Entropy = 0.0866, Returns = 1.4841, Value = 1.5458, mask_loss = 0.1647std_loss = 0.3327:  18%|█▊        | 979/5475 [4:24:49<19:05:23, 15.29s/it]

ratio tensor(1.1402, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3185, device='cuda:0') factor: tensor(0.1687, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.3135, device='cuda:0') factor: tensor(0.1663, device='cuda:0')
reward_raw: tensor(0.4933, device='cuda:0') reward: tensor(1.3207, device='cuda:0') factor: tensor(0.1726, device='cuda:0')
reward_raw: tensor(0.4625, device='cuda:0') reward: tensor(1.2931, device='cuda:0') factor: tensor(0.1695, device='cuda:0')
reward_raw: tensor(0.5072, device='cuda:0') reward: tensor(1.3345, device='cuda:0') factor: tensor(0.1727, device='cuda:0')


Epoch 1, Step 980: Loss = 0.2511, Actor Loss = 0.2016, Critic Loss = 0.0993, Entropy = 0.0793, Returns = 1.2988, Value = 1.1804, mask_loss = 0.1824std_loss = 0.3492:  18%|█▊        | 980/5475 [4:25:06<19:39:09, 15.74s/it]  

ratio tensor(0.4645, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5464, device='cuda:0') reward: tensor(1.3337, device='cuda:0') factor: tensor(0.2127, device='cuda:0')
reward_raw: tensor(0.5713, device='cuda:0') reward: tensor(1.3590, device='cuda:0') factor: tensor(0.2122, device='cuda:0')
reward_raw: tensor(0.5709, device='cuda:0') reward: tensor(1.3585, device='cuda:0') factor: tensor(0.2125, device='cuda:0')
reward_raw: tensor(0.5746, device='cuda:0') reward: tensor(1.3542, device='cuda:0') factor: tensor(0.2204, device='cuda:0')
reward_raw: tensor(0.5548, device='cuda:0') reward: tensor(1.3358, device='cuda:0') factor: tensor(0.2191, device='cuda:0')


Epoch 1, Step 981: Loss = -0.1878, Actor Loss = -0.2264, Critic Loss = 0.0773, Entropy = 0.0846, Returns = 1.3392, Value = 1.2335, mask_loss = 0.2296std_loss = 0.3723:  18%|█▊        | 980/5475 [4:25:19<19:39:09, 15.74s/it]

ratio tensor(1.4447, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 981: Loss = -0.1878, Actor Loss = -0.2264, Critic Loss = 0.0773, Entropy = 0.0846, Returns = 1.3392, Value = 1.2335, mask_loss = 0.2296std_loss = 0.3723:  18%|█▊        | 981/5475 [4:25:20<18:59:38, 15.22s/it]

reward_raw: tensor(0.4900, device='cuda:0') reward: tensor(1.2772, device='cuda:0') factor: tensor(0.2128, device='cuda:0')
reward_raw: tensor(0.4914, device='cuda:0') reward: tensor(1.2770, device='cuda:0') factor: tensor(0.2143, device='cuda:0')
reward_raw: tensor(0.4723, device='cuda:0') reward: tensor(1.2596, device='cuda:0') factor: tensor(0.2127, device='cuda:0')
reward_raw: tensor(0.4924, device='cuda:0') reward: tensor(1.2820, device='cuda:0') factor: tensor(0.2103, device='cuda:0')
reward_raw: tensor(0.4961, device='cuda:0') reward: tensor(1.2918, device='cuda:0') factor: tensor(0.2043, device='cuda:0')


Epoch 1, Step 982: Loss = 0.0253, Actor Loss = -0.0063, Critic Loss = 0.0635, Entropy = 0.0762, Returns = 1.2622, Value = 1.3314, mask_loss = 0.2142std_loss = 0.3640:  18%|█▊        | 982/5475 [4:25:32<17:51:17, 14.31s/it] 

ratio tensor(0.6730, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6633, device='cuda:0') reward: tensor(1.4536, device='cuda:0') factor: tensor(0.2097, device='cuda:0')
reward_raw: tensor(0.6555, device='cuda:0') reward: tensor(1.4415, device='cuda:0') factor: tensor(0.2139, device='cuda:0')
reward_raw: tensor(0.6359, device='cuda:0') reward: tensor(1.4290, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.6789, device='cuda:0') reward: tensor(1.4695, device='cuda:0') factor: tensor(0.2094, device='cuda:0')
reward_raw: tensor(0.6528, device='cuda:0') reward: tensor(1.4428, device='cuda:0') factor: tensor(0.2099, device='cuda:0')


Epoch 1, Step 983: Loss = -0.1101, Actor Loss = -0.1391, Critic Loss = 0.0581, Entropy = 0.0832, Returns = 1.4998, Value = 1.5418, mask_loss = 0.1954std_loss = 0.3553:  18%|█▊        | 983/5475 [4:25:47<18:02:16, 14.46s/it]

ratio tensor(1.2258, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3411, device='cuda:0') factor: tensor(0.1764, device='cuda:0')
reward_raw: tensor(0.5077, device='cuda:0') reward: tensor(1.3301, device='cuda:0') factor: tensor(0.1775, device='cuda:0')
reward_raw: tensor(0.5140, device='cuda:0') reward: tensor(1.3387, device='cuda:0') factor: tensor(0.1753, device='cuda:0')
reward_raw: tensor(0.5092, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.1741, device='cuda:0')
reward_raw: tensor(0.5073, device='cuda:0') reward: tensor(1.3325, device='cuda:0') factor: tensor(0.1748, device='cuda:0')


Epoch 1, Step 984: Loss = 0.3091, Actor Loss = 0.2845, Critic Loss = 0.0494, Entropy = 0.0705, Returns = 1.2919, Value = 1.2721, mask_loss = 0.1723std_loss = 0.3430:  18%|█▊        | 984/5475 [4:26:01<18:07:42, 14.53s/it]  

ratio tensor(0.9320, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5948, device='cuda:0') reward: tensor(1.3869, device='cuda:0') factor: tensor(0.2079, device='cuda:0')
reward_raw: tensor(0.6008, device='cuda:0') reward: tensor(1.3999, device='cuda:0') factor: tensor(0.2009, device='cuda:0')
reward_raw: tensor(0.5777, device='cuda:0') reward: tensor(1.3795, device='cuda:0') factor: tensor(0.1981, device='cuda:0')
reward_raw: tensor(0.5676, device='cuda:0') reward: tensor(1.3732, device='cuda:0') factor: tensor(0.1944, device='cuda:0')
reward_raw: tensor(0.6024, device='cuda:0') reward: tensor(1.4033, device='cuda:0') factor: tensor(0.1991, device='cuda:0')


Epoch 1, Step 985: Loss = 0.0061, Actor Loss = -0.0451, Critic Loss = 0.1026, Entropy = 0.0753, Returns = 1.4128, Value = 1.3832, mask_loss = 0.1858std_loss = 0.3443:  18%|█▊        | 985/5475 [4:26:14<17:31:57, 14.06s/it]

ratio tensor(1.1693, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5801, device='cuda:0') reward: tensor(1.4081, device='cuda:0') factor: tensor(0.1720, device='cuda:0')
reward_raw: tensor(0.5841, device='cuda:0') reward: tensor(1.4058, device='cuda:0') factor: tensor(0.1784, device='cuda:0')
reward_raw: tensor(0.5840, device='cuda:0') reward: tensor(1.4120, device='cuda:0') factor: tensor(0.1720, device='cuda:0')
reward_raw: tensor(0.5883, device='cuda:0') reward: tensor(1.4131, device='cuda:0') factor: tensor(0.1752, device='cuda:0')
reward_raw: tensor(0.5980, device='cuda:0') reward: tensor(1.4243, device='cuda:0') factor: tensor(0.1738, device='cuda:0')


Epoch 1, Step 986: Loss = -0.1095, Actor Loss = -0.1514, Critic Loss = 0.0840, Entropy = 0.0641, Returns = 1.3928, Value = 1.4513, mask_loss = 0.1741std_loss = 0.3475:  18%|█▊        | 986/5475 [4:26:32<18:52:19, 15.13s/it]

ratio tensor(0.9458, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5015, device='cuda:0') reward: tensor(1.3194, device='cuda:0') factor: tensor(0.1821, device='cuda:0')
reward_raw: tensor(0.5105, device='cuda:0') reward: tensor(1.3290, device='cuda:0') factor: tensor(0.1815, device='cuda:0')
reward_raw: tensor(0.5061, device='cuda:0') reward: tensor(1.3256, device='cuda:0') factor: tensor(0.1805, device='cuda:0')
reward_raw: tensor(0.5053, device='cuda:0') reward: tensor(1.3293, device='cuda:0') factor: tensor(0.1760, device='cuda:0')
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.3329, device='cuda:0') factor: tensor(0.1780, device='cuda:0')


Epoch 1, Step 987: Loss = 0.2572, Actor Loss = 0.2264, Critic Loss = 0.0618, Entropy = 0.0657, Returns = 1.3208, Value = 1.3372, mask_loss = 0.1705std_loss = 0.3417:  18%|█▊        | 987/5475 [4:26:55<21:48:20, 17.49s/it]  

ratio tensor(0.4963, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6210, device='cuda:0') reward: tensor(1.4105, device='cuda:0') factor: tensor(0.2105, device='cuda:0')
reward_raw: tensor(0.6389, device='cuda:0') reward: tensor(1.4351, device='cuda:0') factor: tensor(0.2038, device='cuda:0')
reward_raw: tensor(0.6445, device='cuda:0') reward: tensor(1.4379, device='cuda:0') factor: tensor(0.2066, device='cuda:0')
reward_raw: tensor(0.6340, device='cuda:0') reward: tensor(1.4229, device='cuda:0') factor: tensor(0.2111, device='cuda:0')
reward_raw: tensor(0.6299, device='cuda:0') reward: tensor(1.4236, device='cuda:0') factor: tensor(0.2064, device='cuda:0')


Epoch 1, Step 988: Loss = -0.0133, Actor Loss = -0.0566, Critic Loss = 0.0868, Entropy = 0.0794, Returns = 1.4145, Value = 1.4815, mask_loss = 0.1847std_loss = 0.3469:  18%|█▊        | 988/5475 [4:27:17<23:21:35, 18.74s/it]

ratio tensor(1.1578, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5120, device='cuda:0') reward: tensor(1.3186, device='cuda:0') factor: tensor(0.1934, device='cuda:0')
reward_raw: tensor(0.5150, device='cuda:0') reward: tensor(1.3261, device='cuda:0') factor: tensor(0.1889, device='cuda:0')
reward_raw: tensor(0.5149, device='cuda:0') reward: tensor(1.3247, device='cuda:0') factor: tensor(0.1902, device='cuda:0')
reward_raw: tensor(0.5064, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.1907, device='cuda:0')
reward_raw: tensor(0.5246, device='cuda:0') reward: tensor(1.3337, device='cuda:0') factor: tensor(0.1908, device='cuda:0')


Epoch 1, Step 989: Loss = -0.0115, Actor Loss = -0.0345, Critic Loss = 0.0461, Entropy = 0.0665, Returns = 1.3775, Value = 1.3746, mask_loss = 0.1964std_loss = 0.3616:  18%|█▊        | 989/5475 [4:27:35<23:09:17, 18.58s/it]

ratio tensor(1.2007, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.1851, device='cuda:0')
reward_raw: tensor(0.5443, device='cuda:0') reward: tensor(1.3631, device='cuda:0') factor: tensor(0.1812, device='cuda:0')
reward_raw: tensor(0.5652, device='cuda:0') reward: tensor(1.3785, device='cuda:0') factor: tensor(0.1867, device='cuda:0')
reward_raw: tensor(0.5470, device='cuda:0') reward: tensor(1.3606, device='cuda:0') factor: tensor(0.1864, device='cuda:0')
reward_raw: tensor(0.5544, device='cuda:0') reward: tensor(1.3667, device='cuda:0') factor: tensor(0.1877, device='cuda:0')


Epoch 1, Step 990: Loss = 0.1296, Actor Loss = 0.0821, Critic Loss = 0.0952, Entropy = 0.0747, Returns = 1.3374, Value = 1.3793, mask_loss = 0.2127std_loss = 0.3633:  18%|█▊        | 990/5475 [4:27:56<23:58:41, 19.25s/it]  

ratio tensor(0.8027, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4952, device='cuda:0') reward: tensor(1.3105, device='cuda:0') factor: tensor(0.1847, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.1846, device='cuda:0')
reward_raw: tensor(0.4868, device='cuda:0') reward: tensor(1.3039, device='cuda:0') factor: tensor(0.1829, device='cuda:0')
reward_raw: tensor(0.4868, device='cuda:0') reward: tensor(1.3036, device='cuda:0') factor: tensor(0.1832, device='cuda:0')
reward_raw: tensor(0.4958, device='cuda:0') reward: tensor(1.3157, device='cuda:0') factor: tensor(0.1801, device='cuda:0')


Epoch 1, Step 991: Loss = -0.0984, Actor Loss = -0.1383, Critic Loss = 0.0801, Entropy = 0.0786, Returns = 1.2584, Value = 1.2955, mask_loss = 0.1798std_loss = 0.3434:  18%|█▊        | 991/5475 [4:28:17<24:42:11, 19.83s/it]

ratio tensor(1.1190, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.3244, device='cuda:0') factor: tensor(0.1907, device='cuda:0')
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.3274, device='cuda:0') factor: tensor(0.1838, device='cuda:0')
reward_raw: tensor(0.4900, device='cuda:0') reward: tensor(1.3053, device='cuda:0') factor: tensor(0.1847, device='cuda:0')
reward_raw: tensor(0.5182, device='cuda:0') reward: tensor(1.3299, device='cuda:0') factor: tensor(0.1883, device='cuda:0')
reward_raw: tensor(0.5165, device='cuda:0') reward: tensor(1.3334, device='cuda:0') factor: tensor(0.1831, device='cuda:0')


Epoch 1, Step 992: Loss = 0.0902, Actor Loss = 0.0357, Critic Loss = 0.1092, Entropy = 0.0772, Returns = 1.3207, Value = 1.2600, mask_loss = 0.1841std_loss = 0.3541:  18%|█▊        | 992/5475 [4:28:31<22:28:27, 18.05s/it]  

ratio tensor(0.7978, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5550, device='cuda:0') reward: tensor(1.3974, device='cuda:0') factor: tensor(0.1577, device='cuda:0')
reward_raw: tensor(0.5510, device='cuda:0') reward: tensor(1.3968, device='cuda:0') factor: tensor(0.1542, device='cuda:0')
reward_raw: tensor(0.5557, device='cuda:0') reward: tensor(1.4011, device='cuda:0') factor: tensor(0.1547, device='cuda:0')
reward_raw: tensor(0.5390, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.1525, device='cuda:0')
reward_raw: tensor(0.5466, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.1487, device='cuda:0')


Epoch 1, Step 993: Loss = -0.2423, Actor Loss = -0.2801, Critic Loss = 0.0758, Entropy = 0.0774, Returns = 1.3990, Value = 1.3234, mask_loss = 0.1580std_loss = 0.3286:  18%|█▊        | 993/5475 [4:28:46<21:14:48, 17.07s/it]

ratio tensor(5.2671, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4733, device='cuda:0') reward: tensor(1.2903, device='cuda:0') factor: tensor(0.1830, device='cuda:0')
reward_raw: tensor(0.5024, device='cuda:0') reward: tensor(1.3222, device='cuda:0') factor: tensor(0.1802, device='cuda:0')
reward_raw: tensor(0.4848, device='cuda:0') reward: tensor(1.3061, device='cuda:0') factor: tensor(0.1787, device='cuda:0')
reward_raw: tensor(0.4864, device='cuda:0') reward: tensor(1.3091, device='cuda:0') factor: tensor(0.1773, device='cuda:0')
reward_raw: tensor(0.4719, device='cuda:0') reward: tensor(1.2913, device='cuda:0') factor: tensor(0.1806, device='cuda:0')


Epoch 1, Step 994: Loss = 0.3355, Actor Loss = 0.2707, Critic Loss = 0.1298, Entropy = 0.0654, Returns = 1.3196, Value = 1.2936, mask_loss = 0.1841std_loss = 0.3499:  18%|█▊        | 994/5475 [4:29:07<22:48:33, 18.32s/it]  

ratio tensor(0.6105, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.2834, device='cuda:0') factor: tensor(0.2329, device='cuda:0')
reward_raw: tensor(0.5278, device='cuda:0') reward: tensor(1.2968, device='cuda:0') factor: tensor(0.2310, device='cuda:0')
reward_raw: tensor(0.5203, device='cuda:0') reward: tensor(1.2891, device='cuda:0') factor: tensor(0.2312, device='cuda:0')
reward_raw: tensor(0.5149, device='cuda:0') reward: tensor(1.2919, device='cuda:0') factor: tensor(0.2230, device='cuda:0')
reward_raw: tensor(0.5113, device='cuda:0') reward: tensor(1.2782, device='cuda:0') factor: tensor(0.2331, device='cuda:0')


Epoch 1, Step 995: Loss = -0.0159, Actor Loss = -0.0407, Critic Loss = 0.0496, Entropy = 0.0770, Returns = 1.2866, Value = 1.1944, mask_loss = 0.2245std_loss = 0.3765:  18%|█▊        | 995/5475 [4:29:22<21:42:00, 17.44s/it]

ratio tensor(0.7322, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.3170, device='cuda:0') factor: tensor(0.1687, device='cuda:0')
reward_raw: tensor(0.5011, device='cuda:0') reward: tensor(1.3344, device='cuda:0') factor: tensor(0.1667, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3372, device='cuda:0') factor: tensor(0.1626, device='cuda:0')
reward_raw: tensor(0.4971, device='cuda:0') reward: tensor(1.3315, device='cuda:0') factor: tensor(0.1656, device='cuda:0')
reward_raw: tensor(0.5007, device='cuda:0') reward: tensor(1.3347, device='cuda:0') factor: tensor(0.1660, device='cuda:0')


Epoch 1, Step 996: Loss = -0.0851, Actor Loss = -0.1242, Critic Loss = 0.0782, Entropy = 0.0679, Returns = 1.3353, Value = 1.4517, mask_loss = 0.1745std_loss = 0.3476:  18%|█▊        | 996/5475 [4:29:37<20:35:29, 16.55s/it]

ratio tensor(1.1472, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5551, device='cuda:0') reward: tensor(1.4005, device='cuda:0') factor: tensor(0.1546, device='cuda:0')
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.3839, device='cuda:0') factor: tensor(0.1581, device='cuda:0')
reward_raw: tensor(0.5452, device='cuda:0') reward: tensor(1.3865, device='cuda:0') factor: tensor(0.1587, device='cuda:0')
reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.3942, device='cuda:0') factor: tensor(0.1544, device='cuda:0')
reward_raw: tensor(0.5602, device='cuda:0') reward: tensor(1.4009, device='cuda:0') factor: tensor(0.1593, device='cuda:0')


Epoch 1, Step 997: Loss = 0.0379, Actor Loss = -0.0006, Critic Loss = 0.0771, Entropy = 0.0525, Returns = 1.4690, Value = 1.4354, mask_loss = 0.1374std_loss = 0.3137:  18%|█▊        | 997/5475 [4:29:48<18:47:08, 15.10s/it] 

ratio tensor(1.1329, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5473, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.1815, device='cuda:0')
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.3686, device='cuda:0') factor: tensor(0.1827, device='cuda:0')
reward_raw: tensor(0.5525, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.1770, device='cuda:0')
reward_raw: tensor(0.5650, device='cuda:0') reward: tensor(1.3843, device='cuda:0') factor: tensor(0.1807, device='cuda:0')
reward_raw: tensor(0.5685, device='cuda:0') reward: tensor(1.3859, device='cuda:0') factor: tensor(0.1826, device='cuda:0')


Epoch 1, Step 998: Loss = -0.0703, Actor Loss = -0.1163, Critic Loss = 0.0922, Entropy = 0.0646, Returns = 1.4139, Value = 1.3536, mask_loss = 0.1886std_loss = 0.3498:  18%|█▊        | 998/5475 [4:30:01<17:55:39, 14.42s/it]

ratio tensor(1.1022, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5339, device='cuda:0') reward: tensor(1.3819, device='cuda:0') factor: tensor(0.1520, device='cuda:0')
reward_raw: tensor(0.5502, device='cuda:0') reward: tensor(1.4006, device='cuda:0') factor: tensor(0.1496, device='cuda:0')
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.3602, device='cuda:0') factor: tensor(0.1510, device='cuda:0')
reward_raw: tensor(0.5379, device='cuda:0') reward: tensor(1.3883, device='cuda:0') factor: tensor(0.1496, device='cuda:0')
reward_raw: tensor(0.5184, device='cuda:0') reward: tensor(1.3708, device='cuda:0') factor: tensor(0.1477, device='cuda:0')


Epoch 1, Step 999: Loss = 0.1290, Actor Loss = 0.0942, Critic Loss = 0.0697, Entropy = 0.0595, Returns = 1.4181, Value = 1.4213, mask_loss = 0.1509std_loss = 0.3273:  18%|█▊        | 999/5475 [4:30:17<18:23:11, 14.79s/it]  

ratio tensor(0.8750, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5615, device='cuda:0') reward: tensor(1.4150, device='cuda:0') factor: tensor(0.1465, device='cuda:0')
reward_raw: tensor(0.5764, device='cuda:0') reward: tensor(1.4302, device='cuda:0') factor: tensor(0.1463, device='cuda:0')
reward_raw: tensor(0.5519, device='cuda:0') reward: tensor(1.4027, device='cuda:0') factor: tensor(0.1492, device='cuda:0')
reward_raw: tensor(0.5782, device='cuda:0') reward: tensor(1.4335, device='cuda:0') factor: tensor(0.1447, device='cuda:0')
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.4044, device='cuda:0') factor: tensor(0.1459, device='cuda:0')


Epoch 1, Step 1000: Loss = 0.0117, Actor Loss = -0.0043, Critic Loss = 0.0322, Entropy = 0.0673, Returns = 1.3789, Value = 1.3738, mask_loss = 0.1677std_loss = 0.3451:  18%|█▊        | 1000/5475 [4:30:30<17:53:58, 14.40s/it]

ratio tensor(1.0788, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5315, device='cuda:0') reward: tensor(1.3661, device='cuda:0') factor: tensor(0.1654, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3459, device='cuda:0') factor: tensor(0.1635, device='cuda:0')
reward_raw: tensor(0.5206, device='cuda:0') reward: tensor(1.3607, device='cuda:0') factor: tensor(0.1599, device='cuda:0')
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.3370, device='cuda:0') factor: tensor(0.1626, device='cuda:0')
reward_raw: tensor(0.5343, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1576, device='cuda:0')


Epoch 1, Step 1001: Loss = -0.1071, Actor Loss = -0.1386, Critic Loss = 0.0632, Entropy = 0.0698, Returns = 1.3969, Value = 1.3986, mask_loss = 0.1709std_loss = 0.3372:  18%|█▊        | 1000/5475 [4:30:48<17:53:58, 14.40s/it]

ratio tensor(7.7238, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1001: Loss = -0.1071, Actor Loss = -0.1386, Critic Loss = 0.0632, Entropy = 0.0698, Returns = 1.3969, Value = 1.3986, mask_loss = 0.1709std_loss = 0.3372:  18%|█▊        | 1001/5475 [4:30:48<19:11:43, 15.45s/it]

reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.3408, device='cuda:0') factor: tensor(0.1707, device='cuda:0')
reward_raw: tensor(0.5028, device='cuda:0') reward: tensor(1.3318, device='cuda:0') factor: tensor(0.1711, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.1697, device='cuda:0')
reward_raw: tensor(0.5099, device='cuda:0') reward: tensor(1.3338, device='cuda:0') factor: tensor(0.1761, device='cuda:0')
reward_raw: tensor(0.5086, device='cuda:0') reward: tensor(1.3345, device='cuda:0') factor: tensor(0.1742, device='cuda:0')


Epoch 1, Step 1002: Loss = 0.2842, Actor Loss = 0.2402, Critic Loss = 0.0881, Entropy = 0.0732, Returns = 1.3076, Value = 1.2667, mask_loss = 0.1490std_loss = 0.3235:  18%|█▊        | 1002/5475 [4:31:00<17:54:10, 14.41s/it]  

ratio tensor(0.7853, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5590, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.2071, device='cuda:0')
reward_raw: tensor(0.5615, device='cuda:0') reward: tensor(1.3571, device='cuda:0') factor: tensor(0.2044, device='cuda:0')
reward_raw: tensor(0.5566, device='cuda:0') reward: tensor(1.3583, device='cuda:0') factor: tensor(0.1983, device='cuda:0')
reward_raw: tensor(0.5541, device='cuda:0') reward: tensor(1.3533, device='cuda:0') factor: tensor(0.2008, device='cuda:0')
reward_raw: tensor(0.5488, device='cuda:0') reward: tensor(1.3456, device='cuda:0') factor: tensor(0.2032, device='cuda:0')


Epoch 1, Step 1003: Loss = -0.1215, Actor Loss = -0.1505, Critic Loss = 0.0582, Entropy = 0.0767, Returns = 1.4236, Value = 1.3312, mask_loss = 0.1694std_loss = 0.3319:  18%|█▊        | 1003/5475 [4:31:18<19:10:52, 15.44s/it]

ratio tensor(0.7471, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5040, device='cuda:0') reward: tensor(1.3130, device='cuda:0') factor: tensor(0.1910, device='cuda:0')
reward_raw: tensor(0.5196, device='cuda:0') reward: tensor(1.3315, device='cuda:0') factor: tensor(0.1880, device='cuda:0')
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.3175, device='cuda:0') factor: tensor(0.1858, device='cuda:0')
reward_raw: tensor(0.4964, device='cuda:0') reward: tensor(1.3028, device='cuda:0') factor: tensor(0.1936, device='cuda:0')
reward_raw: tensor(0.5001, device='cuda:0') reward: tensor(1.3091, device='cuda:0') factor: tensor(0.1911, device='cuda:0')


Epoch 1, Step 1004: Loss = 0.1295, Actor Loss = 0.1017, Critic Loss = 0.0558, Entropy = 0.0849, Returns = 1.2736, Value = 1.4241, mask_loss = 0.2038std_loss = 0.3518:  18%|█▊        | 1004/5475 [4:31:34<19:22:59, 15.61s/it]  

ratio tensor(0.9880, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4942, device='cuda:0') reward: tensor(1.3136, device='cuda:0') factor: tensor(0.1806, device='cuda:0')
reward_raw: tensor(0.4774, device='cuda:0') reward: tensor(1.3005, device='cuda:0') factor: tensor(0.1769, device='cuda:0')
reward_raw: tensor(0.4849, device='cuda:0') reward: tensor(1.3060, device='cuda:0') factor: tensor(0.1789, device='cuda:0')
reward_raw: tensor(0.4997, device='cuda:0') reward: tensor(1.3168, device='cuda:0') factor: tensor(0.1829, device='cuda:0')
reward_raw: tensor(0.5065, device='cuda:0') reward: tensor(1.3202, device='cuda:0') factor: tensor(0.1863, device='cuda:0')


Epoch 1, Step 1005: Loss = 0.0986, Actor Loss = 0.0412, Critic Loss = 0.1151, Entropy = 0.1034, Returns = 1.3580, Value = 1.2079, mask_loss = 0.2000std_loss = 0.3503:  18%|█▊        | 1005/5475 [4:31:51<19:54:54, 16.04s/it]

ratio tensor(0.8841, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.3149, device='cuda:0') factor: tensor(0.1984, device='cuda:0')
reward_raw: tensor(0.5193, device='cuda:0') reward: tensor(1.3178, device='cuda:0') factor: tensor(0.2015, device='cuda:0')
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.3013, device='cuda:0') factor: tensor(0.1955, device='cuda:0')
reward_raw: tensor(0.5022, device='cuda:0') reward: tensor(1.3051, device='cuda:0') factor: tensor(0.1971, device='cuda:0')
reward_raw: tensor(0.5123, device='cuda:0') reward: tensor(1.3168, device='cuda:0') factor: tensor(0.1955, device='cuda:0')


Epoch 1, Step 1006: Loss = -0.2019, Actor Loss = -0.2341, Critic Loss = 0.0645, Entropy = 0.0722, Returns = 1.3177, Value = 1.3613, mask_loss = 0.1703std_loss = 0.3438:  18%|█▊        | 1006/5475 [4:32:07<19:45:32, 15.92s/it]

ratio tensor(1.7004, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6650, device='cuda:0') reward: tensor(1.4405, device='cuda:0') factor: tensor(0.2245, device='cuda:0')
reward_raw: tensor(0.6706, device='cuda:0') reward: tensor(1.4499, device='cuda:0') factor: tensor(0.2207, device='cuda:0')
reward_raw: tensor(0.6813, device='cuda:0') reward: tensor(1.4525, device='cuda:0') factor: tensor(0.2288, device='cuda:0')
reward_raw: tensor(0.6935, device='cuda:0') reward: tensor(1.4676, device='cuda:0') factor: tensor(0.2259, device='cuda:0')
reward_raw: tensor(0.6815, device='cuda:0') reward: tensor(1.4604, device='cuda:0') factor: tensor(0.2211, device='cuda:0')


Epoch 1, Step 1007: Loss = 0.1421, Actor Loss = 0.1074, Critic Loss = 0.0696, Entropy = 0.0961, Returns = 1.3961, Value = 1.5163, mask_loss = 0.2199std_loss = 0.3739:  18%|█▊        | 1007/5475 [4:32:18<17:55:40, 14.45s/it]  

ratio tensor(0.7306, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5270, device='cuda:0') reward: tensor(1.3395, device='cuda:0') factor: tensor(0.1875, device='cuda:0')
reward_raw: tensor(0.4959, device='cuda:0') reward: tensor(1.3171, device='cuda:0') factor: tensor(0.1788, device='cuda:0')
reward_raw: tensor(0.5086, device='cuda:0') reward: tensor(1.3265, device='cuda:0') factor: tensor(0.1821, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.3336, device='cuda:0') factor: tensor(0.1834, device='cuda:0')
reward_raw: tensor(0.5064, device='cuda:0') reward: tensor(1.3168, device='cuda:0') factor: tensor(0.1896, device='cuda:0')


Epoch 1, Step 1008: Loss = 0.0697, Actor Loss = -0.0026, Critic Loss = 0.1448, Entropy = 0.0802, Returns = 1.3681, Value = 1.2677, mask_loss = 0.1745std_loss = 0.3423:  18%|█▊        | 1008/5475 [4:32:35<18:56:30, 15.27s/it]

ratio tensor(0.7751, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5333, device='cuda:0') reward: tensor(1.2978, device='cuda:0') factor: tensor(0.2355, device='cuda:0')
reward_raw: tensor(0.5422, device='cuda:0') reward: tensor(1.3121, device='cuda:0') factor: tensor(0.2301, device='cuda:0')
reward_raw: tensor(0.5392, device='cuda:0') reward: tensor(1.3067, device='cuda:0') factor: tensor(0.2325, device='cuda:0')
reward_raw: tensor(0.5196, device='cuda:0') reward: tensor(1.2925, device='cuda:0') factor: tensor(0.2270, device='cuda:0')
reward_raw: tensor(0.5312, device='cuda:0') reward: tensor(1.2978, device='cuda:0') factor: tensor(0.2334, device='cuda:0')


Epoch 1, Step 1009: Loss = -0.0060, Actor Loss = -0.0495, Critic Loss = 0.0872, Entropy = 0.1027, Returns = 1.2894, Value = 1.3010, mask_loss = 0.2344std_loss = 0.3705:  18%|█▊        | 1009/5475 [4:32:49<18:21:30, 14.80s/it]

ratio tensor(0.7486, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5314, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.1696, device='cuda:0')
reward_raw: tensor(0.5241, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.1622, device='cuda:0')
reward_raw: tensor(0.5276, device='cuda:0') reward: tensor(1.3622, device='cuda:0') factor: tensor(0.1654, device='cuda:0')
reward_raw: tensor(0.5390, device='cuda:0') reward: tensor(1.3678, device='cuda:0') factor: tensor(0.1712, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3467, device='cuda:0') factor: tensor(0.1709, device='cuda:0')


Epoch 1, Step 1010: Loss = -0.0146, Actor Loss = -0.0414, Critic Loss = 0.0539, Entropy = 0.0886, Returns = 1.3361, Value = 1.3794, mask_loss = 0.1921std_loss = 0.3506:  18%|█▊        | 1010/5475 [4:33:02<17:40:26, 14.25s/it]

ratio tensor(0.9415, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6257, device='cuda:0') reward: tensor(1.4271, device='cuda:0') factor: tensor(0.1986, device='cuda:0')
reward_raw: tensor(0.6116, device='cuda:0') reward: tensor(1.4070, device='cuda:0') factor: tensor(0.2046, device='cuda:0')
reward_raw: tensor(0.6154, device='cuda:0') reward: tensor(1.4128, device='cuda:0') factor: tensor(0.2026, device='cuda:0')
reward_raw: tensor(0.6180, device='cuda:0') reward: tensor(1.4149, device='cuda:0') factor: tensor(0.2031, device='cuda:0')
reward_raw: tensor(0.6247, device='cuda:0') reward: tensor(1.4214, device='cuda:0') factor: tensor(0.2033, device='cuda:0')


Epoch 1, Step 1011: Loss = -0.0096, Actor Loss = -0.0534, Critic Loss = 0.0877, Entropy = 0.1140, Returns = 1.4611, Value = 1.4919, mask_loss = 0.2226std_loss = 0.3531:  18%|█▊        | 1011/5475 [4:33:19<19:01:46, 15.35s/it]

ratio tensor(1.6804, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3058, device='cuda:0') factor: tensor(0.1940, device='cuda:0')
reward_raw: tensor(0.5159, device='cuda:0') reward: tensor(1.3301, device='cuda:0') factor: tensor(0.1859, device='cuda:0')
reward_raw: tensor(0.4936, device='cuda:0') reward: tensor(1.3069, device='cuda:0') factor: tensor(0.1867, device='cuda:0')
reward_raw: tensor(0.4953, device='cuda:0') reward: tensor(1.3109, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.4861, device='cuda:0') reward: tensor(1.2973, device='cuda:0') factor: tensor(0.1887, device='cuda:0')


Epoch 1, Step 1012: Loss = 0.5999, Actor Loss = 0.5092, Critic Loss = 0.1816, Entropy = 0.0987, Returns = 1.3111, Value = 1.4904, mask_loss = 0.2276std_loss = 0.3767:  18%|█▊        | 1012/5475 [4:33:35<19:02:33, 15.36s/it]  

ratio tensor(0.6285, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5266, device='cuda:0') reward: tensor(1.3212, device='cuda:0') factor: tensor(0.2055, device='cuda:0')
reward_raw: tensor(0.5456, device='cuda:0') reward: tensor(1.3374, device='cuda:0') factor: tensor(0.2082, device='cuda:0')
reward_raw: tensor(0.5129, device='cuda:0') reward: tensor(1.3071, device='cuda:0') factor: tensor(0.2058, device='cuda:0')
reward_raw: tensor(0.5117, device='cuda:0') reward: tensor(1.3022, device='cuda:0') factor: tensor(0.2095, device='cuda:0')
reward_raw: tensor(0.5138, device='cuda:0') reward: tensor(1.3010, device='cuda:0') factor: tensor(0.2129, device='cuda:0')


Epoch 1, Step 1013: Loss = -0.0897, Actor Loss = -0.1289, Critic Loss = 0.0785, Entropy = 0.0956, Returns = 1.3379, Value = 1.3053, mask_loss = 0.2201std_loss = 0.3713:  19%|█▊        | 1013/5475 [4:33:49<18:35:33, 15.00s/it]

ratio tensor(1.3706, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5823, device='cuda:0') reward: tensor(1.3713, device='cuda:0') factor: tensor(0.2110, device='cuda:0')
reward_raw: tensor(0.5853, device='cuda:0') reward: tensor(1.3794, device='cuda:0') factor: tensor(0.2059, device='cuda:0')
reward_raw: tensor(0.5724, device='cuda:0') reward: tensor(1.3580, device='cuda:0') factor: tensor(0.2144, device='cuda:0')
reward_raw: tensor(0.5668, device='cuda:0') reward: tensor(1.3600, device='cuda:0') factor: tensor(0.2068, device='cuda:0')
reward_raw: tensor(0.5659, device='cuda:0') reward: tensor(1.3609, device='cuda:0') factor: tensor(0.2050, device='cuda:0')


Epoch 1, Step 1014: Loss = -0.0123, Actor Loss = -0.0468, Critic Loss = 0.0692, Entropy = 0.0851, Returns = 1.4260, Value = 1.3706, mask_loss = 0.1897std_loss = 0.3547:  19%|█▊        | 1014/5475 [4:34:02<17:46:57, 14.35s/it]

ratio tensor(1.1080, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5139, device='cuda:0') reward: tensor(1.3399, device='cuda:0') factor: tensor(0.1739, device='cuda:0')
reward_raw: tensor(0.5182, device='cuda:0') reward: tensor(1.3479, device='cuda:0') factor: tensor(0.1704, device='cuda:0')
reward_raw: tensor(0.5284, device='cuda:0') reward: tensor(1.3603, device='cuda:0') factor: tensor(0.1681, device='cuda:0')
reward_raw: tensor(0.5148, device='cuda:0') reward: tensor(1.3442, device='cuda:0') factor: tensor(0.1706, device='cuda:0')
reward_raw: tensor(0.4993, device='cuda:0') reward: tensor(1.3284, device='cuda:0') factor: tensor(0.1709, device='cuda:0')


Epoch 1, Step 1015: Loss = -0.0570, Actor Loss = -0.0907, Critic Loss = 0.0676, Entropy = 0.0900, Returns = 1.3483, Value = 1.3649, mask_loss = 0.1649std_loss = 0.3283:  19%|█▊        | 1015/5475 [4:34:16<17:38:57, 14.25s/it]

ratio tensor(0.8671, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5341, device='cuda:0') reward: tensor(1.3567, device='cuda:0') factor: tensor(0.1773, device='cuda:0')
reward_raw: tensor(0.5065, device='cuda:0') reward: tensor(1.3258, device='cuda:0') factor: tensor(0.1807, device='cuda:0')
reward_raw: tensor(0.5250, device='cuda:0') reward: tensor(1.3460, device='cuda:0') factor: tensor(0.1790, device='cuda:0')
reward_raw: tensor(0.5296, device='cuda:0') reward: tensor(1.3472, device='cuda:0') factor: tensor(0.1824, device='cuda:0')
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.1777, device='cuda:0')


Epoch 1, Step 1016: Loss = -0.0177, Actor Loss = -0.0475, Critic Loss = 0.0599, Entropy = 0.0988, Returns = 1.3188, Value = 1.3382, mask_loss = 0.1984std_loss = 0.3550:  19%|█▊        | 1016/5475 [4:34:29<17:20:41, 14.00s/it]

ratio tensor(1.4539, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5686, device='cuda:0') reward: tensor(1.3444, device='cuda:0') factor: tensor(0.2242, device='cuda:0')
reward_raw: tensor(0.5715, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.2196, device='cuda:0')
reward_raw: tensor(0.5655, device='cuda:0') reward: tensor(1.3448, device='cuda:0') factor: tensor(0.2207, device='cuda:0')
reward_raw: tensor(0.5801, device='cuda:0') reward: tensor(1.3554, device='cuda:0') factor: tensor(0.2247, device='cuda:0')
reward_raw: tensor(0.5829, device='cuda:0') reward: tensor(1.3639, device='cuda:0') factor: tensor(0.2190, device='cuda:0')


Epoch 1, Step 1017: Loss = 0.2106, Actor Loss = 0.1596, Critic Loss = 0.1022, Entropy = 0.1367, Returns = 1.2989, Value = 1.2714, mask_loss = 0.2816std_loss = 0.3890:  19%|█▊        | 1017/5475 [4:34:43<17:16:53, 13.96s/it]  

ratio tensor(0.6233, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.3665, device='cuda:0') factor: tensor(0.1846, device='cuda:0')
reward_raw: tensor(0.5580, device='cuda:0') reward: tensor(1.3722, device='cuda:0') factor: tensor(0.1858, device='cuda:0')
reward_raw: tensor(0.5387, device='cuda:0') reward: tensor(1.3560, device='cuda:0') factor: tensor(0.1827, device='cuda:0')
reward_raw: tensor(0.5491, device='cuda:0') reward: tensor(1.3655, device='cuda:0') factor: tensor(0.1836, device='cuda:0')
reward_raw: tensor(0.5537, device='cuda:0') reward: tensor(1.3743, device='cuda:0') factor: tensor(0.1794, device='cuda:0')


Epoch 1, Step 1018: Loss = -0.0708, Actor Loss = -0.1213, Critic Loss = 0.1012, Entropy = 0.0951, Returns = 1.3761, Value = 1.3559, mask_loss = 0.2033std_loss = 0.3536:  19%|█▊        | 1018/5475 [4:34:58<17:42:44, 14.31s/it]

ratio tensor(1.0667, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5638, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.1884, device='cuda:0')
reward_raw: tensor(0.5639, device='cuda:0') reward: tensor(1.3699, device='cuda:0') factor: tensor(0.1941, device='cuda:0')
reward_raw: tensor(0.5497, device='cuda:0') reward: tensor(1.3629, device='cuda:0') factor: tensor(0.1868, device='cuda:0')
reward_raw: tensor(0.5707, device='cuda:0') reward: tensor(1.3770, device='cuda:0') factor: tensor(0.1936, device='cuda:0')
reward_raw: tensor(0.5683, device='cuda:0') reward: tensor(1.3760, device='cuda:0') factor: tensor(0.1922, device='cuda:0')


Epoch 1, Step 1019: Loss = -0.0787, Actor Loss = -0.1084, Critic Loss = 0.0596, Entropy = 0.0956, Returns = 1.4214, Value = 1.3874, mask_loss = 0.1566std_loss = 0.3189:  19%|█▊        | 1019/5475 [4:35:12<17:33:00, 14.18s/it]

ratio tensor(1.0607, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6365, device='cuda:0') reward: tensor(1.4443, device='cuda:0') factor: tensor(0.1922, device='cuda:0')
reward_raw: tensor(0.6505, device='cuda:0') reward: tensor(1.4547, device='cuda:0') factor: tensor(0.1958, device='cuda:0')
reward_raw: tensor(0.6493, device='cuda:0') reward: tensor(1.4509, device='cuda:0') factor: tensor(0.1984, device='cuda:0')
reward_raw: tensor(0.6513, device='cuda:0') reward: tensor(1.4524, device='cuda:0') factor: tensor(0.1990, device='cuda:0')
reward_raw: tensor(0.6512, device='cuda:0') reward: tensor(1.4585, device='cuda:0') factor: tensor(0.1927, device='cuda:0')


Epoch 1, Step 1020: Loss = -0.0253, Actor Loss = -0.0438, Critic Loss = 0.0371, Entropy = 0.0897, Returns = 1.4730, Value = 1.4610, mask_loss = 0.1939std_loss = 0.3545:  19%|█▊        | 1020/5475 [4:35:24<16:40:44, 13.48s/it]

ratio tensor(1.1702, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6163, device='cuda:0') reward: tensor(1.4068, device='cuda:0') factor: tensor(0.2095, device='cuda:0')
reward_raw: tensor(0.6398, device='cuda:0') reward: tensor(1.4272, device='cuda:0') factor: tensor(0.2126, device='cuda:0')
reward_raw: tensor(0.6137, device='cuda:0') reward: tensor(1.3950, device='cuda:0') factor: tensor(0.2187, device='cuda:0')
reward_raw: tensor(0.6125, device='cuda:0') reward: tensor(1.4018, device='cuda:0') factor: tensor(0.2106, device='cuda:0')
reward_raw: tensor(0.6123, device='cuda:0') reward: tensor(1.4004, device='cuda:0') factor: tensor(0.2118, device='cuda:0')


Epoch 1, Step 1021: Loss = -0.0451, Actor Loss = -0.0883, Critic Loss = 0.0866, Entropy = 0.1042, Returns = 1.4562, Value = 1.4235, mask_loss = 0.2063std_loss = 0.3615:  19%|█▊        | 1020/5475 [4:35:38<16:40:44, 13.48s/it]

ratio tensor(1.6044, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1021: Loss = -0.0451, Actor Loss = -0.0883, Critic Loss = 0.0866, Entropy = 0.1042, Returns = 1.4562, Value = 1.4235, mask_loss = 0.2063std_loss = 0.3615:  19%|█▊        | 1021/5475 [4:35:38<16:57:19, 13.70s/it]

reward_raw: tensor(0.5816, device='cuda:0') reward: tensor(1.4046, device='cuda:0') factor: tensor(0.1770, device='cuda:0')
reward_raw: tensor(0.5775, device='cuda:0') reward: tensor(1.4013, device='cuda:0') factor: tensor(0.1762, device='cuda:0')
reward_raw: tensor(0.5832, device='cuda:0') reward: tensor(1.4144, device='cuda:0') factor: tensor(0.1688, device='cuda:0')
reward_raw: tensor(0.5832, device='cuda:0') reward: tensor(1.4127, device='cuda:0') factor: tensor(0.1704, device='cuda:0')
reward_raw: tensor(0.5853, device='cuda:0') reward: tensor(1.4063, device='cuda:0') factor: tensor(0.1790, device='cuda:0')


Epoch 1, Step 1022: Loss = 0.0474, Actor Loss = -0.0044, Critic Loss = 0.1038, Entropy = 0.1085, Returns = 1.3675, Value = 1.3295, mask_loss = 0.2027std_loss = 0.3471:  19%|█▊        | 1022/5475 [4:35:58<19:22:42, 15.67s/it] 

ratio tensor(3.0786, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5811, device='cuda:0') reward: tensor(1.3447, device='cuda:0') factor: tensor(0.2364, device='cuda:0')
reward_raw: tensor(0.5687, device='cuda:0') reward: tensor(1.3331, device='cuda:0') factor: tensor(0.2356, device='cuda:0')
reward_raw: tensor(0.5638, device='cuda:0') reward: tensor(1.3332, device='cuda:0') factor: tensor(0.2306, device='cuda:0')
reward_raw: tensor(0.5949, device='cuda:0') reward: tensor(1.3604, device='cuda:0') factor: tensor(0.2345, device='cuda:0')
reward_raw: tensor(0.5762, device='cuda:0') reward: tensor(1.3451, device='cuda:0') factor: tensor(0.2311, device='cuda:0')


Epoch 1, Step 1023: Loss = -0.0151, Actor Loss = -0.0625, Critic Loss = 0.0951, Entropy = 0.1048, Returns = 1.4136, Value = 1.3613, mask_loss = 0.2182std_loss = 0.3637:  19%|█▊        | 1023/5475 [4:36:12<18:36:51, 15.05s/it]

ratio tensor(1.1402, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5757, device='cuda:0') reward: tensor(1.3547, device='cuda:0') factor: tensor(0.2210, device='cuda:0')
reward_raw: tensor(0.5745, device='cuda:0') reward: tensor(1.3534, device='cuda:0') factor: tensor(0.2211, device='cuda:0')
reward_raw: tensor(0.5659, device='cuda:0') reward: tensor(1.3503, device='cuda:0') factor: tensor(0.2156, device='cuda:0')
reward_raw: tensor(0.5734, device='cuda:0') reward: tensor(1.3540, device='cuda:0') factor: tensor(0.2194, device='cuda:0')
reward_raw: tensor(0.5912, device='cuda:0') reward: tensor(1.3681, device='cuda:0') factor: tensor(0.2231, device='cuda:0')


Epoch 1, Step 1024: Loss = 0.0039, Actor Loss = -0.0148, Critic Loss = 0.0377, Entropy = 0.1289, Returns = 1.4255, Value = 1.3598, mask_loss = 0.2310std_loss = 0.3583:  19%|█▊        | 1024/5475 [4:36:28<18:56:06, 15.31s/it] 

ratio tensor(0.8110, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4956, device='cuda:0') reward: tensor(1.2988, device='cuda:0') factor: tensor(0.1968, device='cuda:0')
reward_raw: tensor(0.4837, device='cuda:0') reward: tensor(1.2864, device='cuda:0') factor: tensor(0.1972, device='cuda:0')
reward_raw: tensor(0.5005, device='cuda:0') reward: tensor(1.3023, device='cuda:0') factor: tensor(0.1982, device='cuda:0')
reward_raw: tensor(0.4889, device='cuda:0') reward: tensor(1.2866, device='cuda:0') factor: tensor(0.2022, device='cuda:0')
reward_raw: tensor(0.5058, device='cuda:0') reward: tensor(1.3024, device='cuda:0') factor: tensor(0.2033, device='cuda:0')


Epoch 1, Step 1025: Loss = 0.1279, Actor Loss = 0.0941, Critic Loss = 0.0677, Entropy = 0.1040, Returns = 1.1983, Value = 1.1698, mask_loss = 0.1967std_loss = 0.3443:  19%|█▊        | 1025/5475 [4:36:45<19:35:52, 15.85s/it] 

ratio tensor(0.7863, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5067, device='cuda:0') reward: tensor(1.3224, device='cuda:0') factor: tensor(0.1843, device='cuda:0')
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.3276, device='cuda:0') factor: tensor(0.1897, device='cuda:0')
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.3247, device='cuda:0') factor: tensor(0.1856, device='cuda:0')
reward_raw: tensor(0.4964, device='cuda:0') reward: tensor(1.3066, device='cuda:0') factor: tensor(0.1898, device='cuda:0')
reward_raw: tensor(0.5015, device='cuda:0') reward: tensor(1.3132, device='cuda:0') factor: tensor(0.1883, device='cuda:0')


Epoch 1, Step 1026: Loss = -0.1526, Actor Loss = -0.1848, Critic Loss = 0.0647, Entropy = 0.0925, Returns = 1.2578, Value = 1.3745, mask_loss = 0.1879std_loss = 0.3446:  19%|█▊        | 1026/5475 [4:37:05<21:06:46, 17.08s/it]

ratio tensor(2.6450, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5793, device='cuda:0') reward: tensor(1.3809, device='cuda:0') factor: tensor(0.1984, device='cuda:0')
reward_raw: tensor(0.5571, device='cuda:0') reward: tensor(1.3569, device='cuda:0') factor: tensor(0.2002, device='cuda:0')
reward_raw: tensor(0.5737, device='cuda:0') reward: tensor(1.3743, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5677, device='cuda:0') reward: tensor(1.3662, device='cuda:0') factor: tensor(0.2015, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.3605, device='cuda:0') factor: tensor(0.1999, device='cuda:0')


Epoch 1, Step 1027: Loss = -0.0211, Actor Loss = -0.0449, Critic Loss = 0.0477, Entropy = 0.1044, Returns = 1.4292, Value = 1.4456, mask_loss = 0.2082std_loss = 0.3606:  19%|█▉        | 1027/5475 [4:37:21<20:35:53, 16.67s/it]

ratio tensor(0.8463, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5698, device='cuda:0') reward: tensor(1.3642, device='cuda:0') factor: tensor(0.2056, device='cuda:0')
reward_raw: tensor(0.5726, device='cuda:0') reward: tensor(1.3669, device='cuda:0') factor: tensor(0.2057, device='cuda:0')
reward_raw: tensor(0.5539, device='cuda:0') reward: tensor(1.3466, device='cuda:0') factor: tensor(0.2073, device='cuda:0')
reward_raw: tensor(0.5664, device='cuda:0') reward: tensor(1.3600, device='cuda:0') factor: tensor(0.2064, device='cuda:0')
reward_raw: tensor(0.5622, device='cuda:0') reward: tensor(1.3622, device='cuda:0') factor: tensor(0.1999, device='cuda:0')


Epoch 1, Step 1028: Loss = 0.2260, Actor Loss = 0.1877, Critic Loss = 0.0770, Entropy = 0.1402, Returns = 1.2376, Value = 1.2679, mask_loss = 0.2643std_loss = 0.3734:  19%|█▉        | 1028/5475 [4:37:39<21:12:23, 17.17s/it]  

ratio tensor(0.4084, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.3177, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5400, device='cuda:0') reward: tensor(1.3401, device='cuda:0') factor: tensor(0.1999, device='cuda:0')
reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.3155, device='cuda:0') factor: tensor(0.2008, device='cuda:0')
reward_raw: tensor(0.5230, device='cuda:0') reward: tensor(1.3280, device='cuda:0') factor: tensor(0.1950, device='cuda:0')
reward_raw: tensor(0.4964, device='cuda:0') reward: tensor(1.3109, device='cuda:0') factor: tensor(0.1856, device='cuda:0')


Epoch 1, Step 1029: Loss = -0.0118, Actor Loss = -0.0441, Critic Loss = 0.0648, Entropy = 0.1163, Returns = 1.2940, Value = 1.2996, mask_loss = 0.1824std_loss = 0.3354:  19%|█▉        | 1029/5475 [4:37:58<21:48:53, 17.66s/it]

ratio tensor(1.2691, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5205, device='cuda:0') reward: tensor(1.3522, device='cuda:0') factor: tensor(0.1683, device='cuda:0')
reward_raw: tensor(0.5256, device='cuda:0') reward: tensor(1.3591, device='cuda:0') factor: tensor(0.1665, device='cuda:0')
reward_raw: tensor(0.5344, device='cuda:0') reward: tensor(1.3691, device='cuda:0') factor: tensor(0.1653, device='cuda:0')
reward_raw: tensor(0.5164, device='cuda:0') reward: tensor(1.3547, device='cuda:0') factor: tensor(0.1618, device='cuda:0')
reward_raw: tensor(0.5356, device='cuda:0') reward: tensor(1.3745, device='cuda:0') factor: tensor(0.1611, device='cuda:0')


Epoch 1, Step 1030: Loss = 0.0665, Actor Loss = 0.0465, Critic Loss = 0.0402, Entropy = 0.0950, Returns = 1.3191, Value = 1.4388, mask_loss = 0.1520std_loss = 0.3159:  19%|█▉        | 1030/5475 [4:38:13<20:59:06, 17.00s/it]  

ratio tensor(0.8571, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6038, device='cuda:0') reward: tensor(1.3916, device='cuda:0') factor: tensor(0.2122, device='cuda:0')
reward_raw: tensor(0.5973, device='cuda:0') reward: tensor(1.3981, device='cuda:0') factor: tensor(0.1991, device='cuda:0')
reward_raw: tensor(0.5953, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.2100, device='cuda:0')
reward_raw: tensor(0.5822, device='cuda:0') reward: tensor(1.3789, device='cuda:0') factor: tensor(0.2033, device='cuda:0')
reward_raw: tensor(0.5707, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.2087, device='cuda:0')


Epoch 1, Step 1031: Loss = 0.0997, Actor Loss = 0.0637, Critic Loss = 0.0723, Entropy = 0.1240, Returns = 1.3728, Value = 1.3275, mask_loss = 0.2081std_loss = 0.3405:  19%|█▉        | 1031/5475 [4:38:26<19:32:08, 15.83s/it]

ratio tensor(0.7733, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4328, device='cuda:0') reward: tensor(1.2496, device='cuda:0') factor: tensor(0.1832, device='cuda:0')
reward_raw: tensor(0.4448, device='cuda:0') reward: tensor(1.2638, device='cuda:0') factor: tensor(0.1810, device='cuda:0')
reward_raw: tensor(0.4474, device='cuda:0') reward: tensor(1.2638, device='cuda:0') factor: tensor(0.1836, device='cuda:0')
reward_raw: tensor(0.4349, device='cuda:0') reward: tensor(1.2540, device='cuda:0') factor: tensor(0.1809, device='cuda:0')
reward_raw: tensor(0.4408, device='cuda:0') reward: tensor(1.2600, device='cuda:0') factor: tensor(0.1808, device='cuda:0')


Epoch 1, Step 1032: Loss = 0.2974, Actor Loss = 0.1729, Critic Loss = 0.2492, Entropy = 0.0950, Returns = 1.2351, Value = 1.1283, mask_loss = 0.1749std_loss = 0.3356:  19%|█▉        | 1032/5475 [4:38:47<21:09:52, 17.15s/it]

ratio tensor(0.6136, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5188, device='cuda:0') reward: tensor(1.3277, device='cuda:0') factor: tensor(0.1911, device='cuda:0')
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.3315, device='cuda:0') factor: tensor(0.1925, device='cuda:0')
reward_raw: tensor(0.5364, device='cuda:0') reward: tensor(1.3504, device='cuda:0') factor: tensor(0.1860, device='cuda:0')
reward_raw: tensor(0.5066, device='cuda:0') reward: tensor(1.3175, device='cuda:0') factor: tensor(0.1891, device='cuda:0')
reward_raw: tensor(0.5038, device='cuda:0') reward: tensor(1.3144, device='cuda:0') factor: tensor(0.1894, device='cuda:0')


Epoch 1, Step 1033: Loss = -0.2654, Actor Loss = -0.3032, Critic Loss = 0.0758, Entropy = 0.0695, Returns = 1.4785, Value = 1.3611, mask_loss = 0.1745std_loss = 0.3394:  19%|█▉        | 1033/5475 [4:39:08<22:41:06, 18.39s/it]

ratio tensor(1.5281, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5226, device='cuda:0') reward: tensor(1.3357, device='cuda:0') factor: tensor(0.1869, device='cuda:0')
reward_raw: tensor(0.5178, device='cuda:0') reward: tensor(1.3326, device='cuda:0') factor: tensor(0.1852, device='cuda:0')
reward_raw: tensor(0.5392, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.1811, device='cuda:0')
reward_raw: tensor(0.5251, device='cuda:0') reward: tensor(1.3409, device='cuda:0') factor: tensor(0.1842, device='cuda:0')
reward_raw: tensor(0.5425, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.1845, device='cuda:0')


Epoch 1, Step 1034: Loss = -0.2466, Actor Loss = -0.2667, Critic Loss = 0.0403, Entropy = 0.0765, Returns = 1.3597, Value = 1.3648, mask_loss = 0.1316std_loss = 0.2897:  19%|█▉        | 1034/5475 [4:39:25<22:15:52, 18.05s/it]

ratio tensor(1.7121, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5773, device='cuda:0') reward: tensor(1.4107, device='cuda:0') factor: tensor(0.1667, device='cuda:0')
reward_raw: tensor(0.5933, device='cuda:0') reward: tensor(1.4289, device='cuda:0') factor: tensor(0.1644, device='cuda:0')
reward_raw: tensor(0.5917, device='cuda:0') reward: tensor(1.4318, device='cuda:0') factor: tensor(0.1599, device='cuda:0')
reward_raw: tensor(0.5848, device='cuda:0') reward: tensor(1.4229, device='cuda:0') factor: tensor(0.1620, device='cuda:0')
reward_raw: tensor(0.5735, device='cuda:0') reward: tensor(1.4113, device='cuda:0') factor: tensor(0.1623, device='cuda:0')


Epoch 1, Step 1035: Loss = -0.0379, Actor Loss = -0.1186, Critic Loss = 0.1615, Entropy = 0.0846, Returns = 1.5644, Value = 1.3540, mask_loss = 0.1536std_loss = 0.3199:  19%|█▉        | 1035/5475 [4:39:41<21:20:07, 17.30s/it]

ratio tensor(1.7617, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5694, device='cuda:0') reward: tensor(1.3999, device='cuda:0') factor: tensor(0.1695, device='cuda:0')
reward_raw: tensor(0.5530, device='cuda:0') reward: tensor(1.3971, device='cuda:0') factor: tensor(0.1559, device='cuda:0')
reward_raw: tensor(0.5624, device='cuda:0') reward: tensor(1.3982, device='cuda:0') factor: tensor(0.1642, device='cuda:0')
reward_raw: tensor(0.5628, device='cuda:0') reward: tensor(1.4056, device='cuda:0') factor: tensor(0.1572, device='cuda:0')
reward_raw: tensor(0.5808, device='cuda:0') reward: tensor(1.4151, device='cuda:0') factor: tensor(0.1657, device='cuda:0')


Epoch 1, Step 1036: Loss = 0.0002, Actor Loss = -0.0310, Critic Loss = 0.0626, Entropy = 0.0857, Returns = 1.4126, Value = 1.3624, mask_loss = 0.1573std_loss = 0.3248:  19%|█▉        | 1036/5475 [4:40:00<22:00:15, 17.85s/it] 

ratio tensor(1.2251, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4792, device='cuda:0') reward: tensor(1.2922, device='cuda:0') factor: tensor(0.1870, device='cuda:0')
reward_raw: tensor(0.4669, device='cuda:0') reward: tensor(1.2830, device='cuda:0') factor: tensor(0.1839, device='cuda:0')
reward_raw: tensor(0.4531, device='cuda:0') reward: tensor(1.2687, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.4456, device='cuda:0') reward: tensor(1.2576, device='cuda:0') factor: tensor(0.1880, device='cuda:0')
reward_raw: tensor(0.4695, device='cuda:0') reward: tensor(1.2776, device='cuda:0') factor: tensor(0.1919, device='cuda:0')


Epoch 1, Step 1037: Loss = 0.1444, Actor Loss = 0.0889, Critic Loss = 0.1112, Entropy = 0.1332, Returns = 1.2537, Value = 1.2700, mask_loss = 0.2064std_loss = 0.3445:  19%|█▉        | 1037/5475 [4:40:18<22:15:44, 18.06s/it] 

ratio tensor(0.6497, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.3753, device='cuda:0') factor: tensor(0.1906, device='cuda:0')
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.3820, device='cuda:0') factor: tensor(0.1840, device='cuda:0')
reward_raw: tensor(0.5753, device='cuda:0') reward: tensor(1.3827, device='cuda:0') factor: tensor(0.1926, device='cuda:0')
reward_raw: tensor(0.5675, device='cuda:0') reward: tensor(1.3808, device='cuda:0') factor: tensor(0.1867, device='cuda:0')
reward_raw: tensor(0.5657, device='cuda:0') reward: tensor(1.3784, device='cuda:0') factor: tensor(0.1873, device='cuda:0')


Epoch 1, Step 1038: Loss = -0.0563, Actor Loss = -0.1054, Critic Loss = 0.0983, Entropy = 0.0924, Returns = 1.4625, Value = 1.4153, mask_loss = 0.2150std_loss = 0.3582:  19%|█▉        | 1038/5475 [4:40:37<22:15:57, 18.07s/it]

ratio tensor(2.7763, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6565, device='cuda:0') reward: tensor(1.4640, device='cuda:0') factor: tensor(0.1924, device='cuda:0')
reward_raw: tensor(0.6571, device='cuda:0') reward: tensor(1.4694, device='cuda:0') factor: tensor(0.1877, device='cuda:0')
reward_raw: tensor(0.6432, device='cuda:0') reward: tensor(1.4516, device='cuda:0') factor: tensor(0.1915, device='cuda:0')
reward_raw: tensor(0.6617, device='cuda:0') reward: tensor(1.4736, device='cuda:0') factor: tensor(0.1881, device='cuda:0')
reward_raw: tensor(0.6422, device='cuda:0') reward: tensor(1.4513, device='cuda:0') factor: tensor(0.1908, device='cuda:0')


Epoch 1, Step 1039: Loss = -0.1336, Actor Loss = -0.1777, Critic Loss = 0.0883, Entropy = 0.1137, Returns = 1.5363, Value = 1.5058, mask_loss = 0.2099std_loss = 0.3577:  19%|█▉        | 1039/5475 [4:40:52<21:21:48, 17.34s/it]

ratio tensor(1.2690, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5837, device='cuda:0') reward: tensor(1.3947, device='cuda:0') factor: tensor(0.1891, device='cuda:0')
reward_raw: tensor(0.5841, device='cuda:0') reward: tensor(1.3973, device='cuda:0') factor: tensor(0.1868, device='cuda:0')
reward_raw: tensor(0.5650, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.1850, device='cuda:0')
reward_raw: tensor(0.5806, device='cuda:0') reward: tensor(1.3945, device='cuda:0') factor: tensor(0.1861, device='cuda:0')
reward_raw: tensor(0.5783, device='cuda:0') reward: tensor(1.3943, device='cuda:0') factor: tensor(0.1841, device='cuda:0')


Epoch 1, Step 1040: Loss = 0.1171, Actor Loss = 0.0968, Critic Loss = 0.0408, Entropy = 0.1045, Returns = 1.3673, Value = 1.3497, mask_loss = 0.1752std_loss = 0.3330:  19%|█▉        | 1040/5475 [4:41:08<20:37:15, 16.74s/it]  

ratio tensor(0.7565, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5878, device='cuda:0') reward: tensor(1.4008, device='cuda:0') factor: tensor(0.1870, device='cuda:0')
reward_raw: tensor(0.6007, device='cuda:0') reward: tensor(1.4043, device='cuda:0') factor: tensor(0.1964, device='cuda:0')
reward_raw: tensor(0.5869, device='cuda:0') reward: tensor(1.3875, device='cuda:0') factor: tensor(0.1994, device='cuda:0')
reward_raw: tensor(0.5994, device='cuda:0') reward: tensor(1.4051, device='cuda:0') factor: tensor(0.1943, device='cuda:0')
reward_raw: tensor(0.6009, device='cuda:0') reward: tensor(1.4006, device='cuda:0') factor: tensor(0.2003, device='cuda:0')


Epoch 1, Step 1041: Loss = -0.2005, Actor Loss = -0.2563, Critic Loss = 0.1117, Entropy = 0.0936, Returns = 1.4382, Value = 1.3618, mask_loss = 0.1827std_loss = 0.3405:  19%|█▉        | 1040/5475 [4:41:23<20:37:15, 16.74s/it]

ratio tensor(1.2802, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1041: Loss = -0.2005, Actor Loss = -0.2563, Critic Loss = 0.1117, Entropy = 0.0936, Returns = 1.4382, Value = 1.3618, mask_loss = 0.1827std_loss = 0.3405:  19%|█▉        | 1041/5475 [4:41:23<20:06:07, 16.32s/it]

reward_raw: tensor(0.5366, device='cuda:0') reward: tensor(1.3223, device='cuda:0') factor: tensor(0.2143, device='cuda:0')
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.3266, device='cuda:0') factor: tensor(0.2079, device='cuda:0')
reward_raw: tensor(0.5326, device='cuda:0') reward: tensor(1.3176, device='cuda:0') factor: tensor(0.2150, device='cuda:0')
reward_raw: tensor(0.5326, device='cuda:0') reward: tensor(1.3222, device='cuda:0') factor: tensor(0.2104, device='cuda:0')
reward_raw: tensor(0.5390, device='cuda:0') reward: tensor(1.3235, device='cuda:0') factor: tensor(0.2155, device='cuda:0')


Epoch 1, Step 1042: Loss = 0.1056, Actor Loss = 0.0599, Critic Loss = 0.0916, Entropy = 0.0871, Returns = 1.4323, Value = 1.3032, mask_loss = 0.1774std_loss = 0.3403:  19%|█▉        | 1042/5475 [4:41:38<19:49:54, 16.11s/it]  

ratio tensor(0.7079, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.3073, device='cuda:0') factor: tensor(0.1956, device='cuda:0')
reward_raw: tensor(0.4878, device='cuda:0') reward: tensor(1.2964, device='cuda:0') factor: tensor(0.1914, device='cuda:0')
reward_raw: tensor(0.4935, device='cuda:0') reward: tensor(1.2975, device='cuda:0') factor: tensor(0.1960, device='cuda:0')
reward_raw: tensor(0.4915, device='cuda:0') reward: tensor(1.2997, device='cuda:0') factor: tensor(0.1918, device='cuda:0')
reward_raw: tensor(0.4788, device='cuda:0') reward: tensor(1.2857, device='cuda:0') factor: tensor(0.1931, device='cuda:0')


Epoch 1, Step 1043: Loss = 0.0534, Actor Loss = 0.0057, Critic Loss = 0.0956, Entropy = 0.1083, Returns = 1.3336, Value = 1.3137, mask_loss = 0.1988std_loss = 0.3363:  19%|█▉        | 1043/5475 [4:42:00<21:46:11, 17.68s/it]

ratio tensor(1.3162, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4457, device='cuda:0') reward: tensor(1.2737, device='cuda:0') factor: tensor(0.1720, device='cuda:0')
reward_raw: tensor(0.5064, device='cuda:0') reward: tensor(1.3275, device='cuda:0') factor: tensor(0.1790, device='cuda:0')
reward_raw: tensor(0.4682, device='cuda:0') reward: tensor(1.2989, device='cuda:0') factor: tensor(0.1693, device='cuda:0')
reward_raw: tensor(0.4594, device='cuda:0') reward: tensor(1.2869, device='cuda:0') factor: tensor(0.1725, device='cuda:0')
reward_raw: tensor(0.4670, device='cuda:0') reward: tensor(1.2937, device='cuda:0') factor: tensor(0.1733, device='cuda:0')


Epoch 1, Step 1044: Loss = -0.0528, Actor Loss = -0.0954, Critic Loss = 0.0852, Entropy = 0.1006, Returns = 1.3364, Value = 1.3569, mask_loss = 0.1647std_loss = 0.3145:  19%|█▉        | 1044/5475 [4:42:16<21:12:01, 17.22s/it]

ratio tensor(0.9862, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6006, device='cuda:0') reward: tensor(1.4103, device='cuda:0') factor: tensor(0.1903, device='cuda:0')
reward_raw: tensor(0.6366, device='cuda:0') reward: tensor(1.4347, device='cuda:0') factor: tensor(0.2019, device='cuda:0')
reward_raw: tensor(0.6236, device='cuda:0') reward: tensor(1.4275, device='cuda:0') factor: tensor(0.1960, device='cuda:0')
reward_raw: tensor(0.6257, device='cuda:0') reward: tensor(1.4329, device='cuda:0') factor: tensor(0.1928, device='cuda:0')
reward_raw: tensor(0.6273, device='cuda:0') reward: tensor(1.4338, device='cuda:0') factor: tensor(0.1935, device='cuda:0')


Epoch 1, Step 1045: Loss = -0.1388, Actor Loss = -0.1582, Critic Loss = 0.0391, Entropy = 0.0966, Returns = 1.4788, Value = 1.4669, mask_loss = 0.1640std_loss = 0.3271:  19%|█▉        | 1045/5475 [4:42:35<21:49:22, 17.73s/it]

ratio tensor(1.5304, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4864, device='cuda:0') reward: tensor(1.3330, device='cuda:0') factor: tensor(0.1534, device='cuda:0')
reward_raw: tensor(0.4956, device='cuda:0') reward: tensor(1.3317, device='cuda:0') factor: tensor(0.1638, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3586, device='cuda:0') factor: tensor(0.1591, device='cuda:0')
reward_raw: tensor(0.5153, device='cuda:0') reward: tensor(1.3563, device='cuda:0') factor: tensor(0.1590, device='cuda:0')
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.3344, device='cuda:0') factor: tensor(0.1585, device='cuda:0')


Epoch 1, Step 1046: Loss = 0.1633, Actor Loss = 0.1039, Critic Loss = 0.1189, Entropy = 0.0848, Returns = 1.3268, Value = 1.3248, mask_loss = 0.1557std_loss = 0.3153:  19%|█▉        | 1046/5475 [4:42:53<21:52:53, 17.79s/it]  

ratio tensor(0.7412, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5607, device='cuda:0') reward: tensor(1.3840, device='cuda:0') factor: tensor(0.1767, device='cuda:0')
reward_raw: tensor(0.5625, device='cuda:0') reward: tensor(1.3928, device='cuda:0') factor: tensor(0.1697, device='cuda:0')
reward_raw: tensor(0.5693, device='cuda:0') reward: tensor(1.3945, device='cuda:0') factor: tensor(0.1748, device='cuda:0')
reward_raw: tensor(0.5618, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1734, device='cuda:0')
reward_raw: tensor(0.5670, device='cuda:0') reward: tensor(1.3930, device='cuda:0') factor: tensor(0.1739, device='cuda:0')


Epoch 1, Step 1047: Loss = -0.1315, Actor Loss = -0.1601, Critic Loss = 0.0573, Entropy = 0.0776, Returns = 1.3978, Value = 1.3438, mask_loss = 0.1475std_loss = 0.3156:  19%|█▉        | 1047/5475 [4:43:06<20:12:11, 16.43s/it]

ratio tensor(1.3238, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6252, device='cuda:0') reward: tensor(1.4270, device='cuda:0') factor: tensor(0.1982, device='cuda:0')
reward_raw: tensor(0.6354, device='cuda:0') reward: tensor(1.4368, device='cuda:0') factor: tensor(0.1986, device='cuda:0')
reward_raw: tensor(0.6626, device='cuda:0') reward: tensor(1.4612, device='cuda:0') factor: tensor(0.2014, device='cuda:0')
reward_raw: tensor(0.6374, device='cuda:0') reward: tensor(1.4301, device='cuda:0') factor: tensor(0.2073, device='cuda:0')
reward_raw: tensor(0.6405, device='cuda:0') reward: tensor(1.4374, device='cuda:0') factor: tensor(0.2031, device='cuda:0')


Epoch 1, Step 1048: Loss = 0.1145, Actor Loss = 0.0831, Critic Loss = 0.0630, Entropy = 0.1061, Returns = 1.3977, Value = 1.3690, mask_loss = 0.2106std_loss = 0.3549:  19%|█▉        | 1048/5475 [4:43:19<19:04:36, 15.51s/it]  

ratio tensor(0.7595, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5229, device='cuda:0') reward: tensor(1.4009, device='cuda:0') factor: tensor(0.1220, device='cuda:0')
reward_raw: tensor(0.4979, device='cuda:0') reward: tensor(1.3828, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.5096, device='cuda:0') reward: tensor(1.3859, device='cuda:0') factor: tensor(0.1237, device='cuda:0')
reward_raw: tensor(0.5229, device='cuda:0') reward: tensor(1.4038, device='cuda:0') factor: tensor(0.1191, device='cuda:0')
reward_raw: tensor(0.5022, device='cuda:0') reward: tensor(1.3795, device='cuda:0') factor: tensor(0.1228, device='cuda:0')


Epoch 1, Step 1049: Loss = -0.0245, Actor Loss = -0.0315, Critic Loss = 0.0142, Entropy = 0.0649, Returns = 1.3659, Value = 1.3792, mask_loss = 0.1078std_loss = 0.2695:  19%|█▉        | 1049/5475 [4:43:37<19:59:19, 16.26s/it]

ratio tensor(0.8894, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4685, device='cuda:0') reward: tensor(1.2941, device='cuda:0') factor: tensor(0.1744, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.3046, device='cuda:0') factor: tensor(0.1886, device='cuda:0')
reward_raw: tensor(0.4705, device='cuda:0') reward: tensor(1.2884, device='cuda:0') factor: tensor(0.1821, device='cuda:0')
reward_raw: tensor(0.4723, device='cuda:0') reward: tensor(1.2941, device='cuda:0') factor: tensor(0.1781, device='cuda:0')
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.3038, device='cuda:0') factor: tensor(0.1819, device='cuda:0')


Epoch 1, Step 1050: Loss = 0.1037, Actor Loss = 0.0588, Critic Loss = 0.0901, Entropy = 0.1062, Returns = 1.3313, Value = 1.2023, mask_loss = 0.1672std_loss = 0.3228:  19%|█▉        | 1050/5475 [4:43:58<21:42:53, 17.67s/it]  

ratio tensor(0.7958, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5966, device='cuda:0') reward: tensor(1.4084, device='cuda:0') factor: tensor(0.1882, device='cuda:0')
reward_raw: tensor(0.5955, device='cuda:0') reward: tensor(1.4145, device='cuda:0') factor: tensor(0.1811, device='cuda:0')
reward_raw: tensor(0.5807, device='cuda:0') reward: tensor(1.3974, device='cuda:0') factor: tensor(0.1833, device='cuda:0')
reward_raw: tensor(0.5840, device='cuda:0') reward: tensor(1.3995, device='cuda:0') factor: tensor(0.1845, device='cuda:0')
reward_raw: tensor(0.5849, device='cuda:0') reward: tensor(1.4026, device='cuda:0') factor: tensor(0.1823, device='cuda:0')


Epoch 1, Step 1051: Loss = 0.0602, Actor Loss = 0.0117, Critic Loss = 0.0972, Entropy = 0.1110, Returns = 1.2798, Value = 1.4397, mask_loss = 0.1779std_loss = 0.3310:  19%|█▉        | 1051/5475 [4:44:14<20:49:54, 16.95s/it]

ratio tensor(3.5938, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5396, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1747, device='cuda:0')
reward_raw: tensor(0.5311, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.1753, device='cuda:0')
reward_raw: tensor(0.5413, device='cuda:0') reward: tensor(1.3553, device='cuda:0') factor: tensor(0.1860, device='cuda:0')
reward_raw: tensor(0.5565, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1916, device='cuda:0')
reward_raw: tensor(0.5370, device='cuda:0') reward: tensor(1.3531, device='cuda:0') factor: tensor(0.1840, device='cuda:0')


Epoch 1, Step 1052: Loss = -0.0512, Actor Loss = -0.1188, Critic Loss = 0.1354, Entropy = 0.0799, Returns = 1.3575, Value = 1.3880, mask_loss = 0.1858std_loss = 0.3407:  19%|█▉        | 1052/5475 [4:44:27<19:35:27, 15.95s/it]

ratio tensor(0.9461, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6359, device='cuda:0') reward: tensor(1.4196, device='cuda:0') factor: tensor(0.2163, device='cuda:0')
reward_raw: tensor(0.6309, device='cuda:0') reward: tensor(1.4120, device='cuda:0') factor: tensor(0.2189, device='cuda:0')
reward_raw: tensor(0.6591, device='cuda:0') reward: tensor(1.4407, device='cuda:0') factor: tensor(0.2184, device='cuda:0')
reward_raw: tensor(0.6333, device='cuda:0') reward: tensor(1.4162, device='cuda:0') factor: tensor(0.2171, device='cuda:0')
reward_raw: tensor(0.6242, device='cuda:0') reward: tensor(1.4084, device='cuda:0') factor: tensor(0.2157, device='cuda:0')


Epoch 1, Step 1053: Loss = -0.0235, Actor Loss = -0.0479, Critic Loss = 0.0490, Entropy = 0.1142, Returns = 1.3958, Value = 1.3057, mask_loss = 0.2115std_loss = 0.3457:  19%|█▉        | 1053/5475 [4:44:46<20:37:16, 16.79s/it]

ratio tensor(1.5478, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6120, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.2140, device='cuda:0')
reward_raw: tensor(0.6194, device='cuda:0') reward: tensor(1.4067, device='cuda:0') factor: tensor(0.2127, device='cuda:0')
reward_raw: tensor(0.6093, device='cuda:0') reward: tensor(1.3914, device='cuda:0') factor: tensor(0.2179, device='cuda:0')
reward_raw: tensor(0.6412, device='cuda:0') reward: tensor(1.4232, device='cuda:0') factor: tensor(0.2180, device='cuda:0')
reward_raw: tensor(0.6366, device='cuda:0') reward: tensor(1.4197, device='cuda:0') factor: tensor(0.2169, device='cuda:0')


Epoch 1, Step 1054: Loss = 0.0744, Actor Loss = 0.0428, Critic Loss = 0.0635, Entropy = 0.1333, Returns = 1.3841, Value = 1.3900, mask_loss = 0.2277std_loss = 0.3536:  19%|█▉        | 1054/5475 [4:45:04<20:54:04, 17.02s/it]  

ratio tensor(0.8253, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5222, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.1440, device='cuda:0')
reward_raw: tensor(0.5219, device='cuda:0') reward: tensor(1.3741, device='cuda:0') factor: tensor(0.1478, device='cuda:0')
reward_raw: tensor(0.5157, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.1464, device='cuda:0')
reward_raw: tensor(0.5047, device='cuda:0') reward: tensor(1.3614, device='cuda:0') factor: tensor(0.1433, device='cuda:0')
reward_raw: tensor(0.5218, device='cuda:0') reward: tensor(1.3771, device='cuda:0') factor: tensor(0.1448, device='cuda:0')


Epoch 1, Step 1055: Loss = -0.0611, Actor Loss = -0.1000, Critic Loss = 0.0779, Entropy = 0.0888, Returns = 1.4191, Value = 1.3674, mask_loss = 0.1494std_loss = 0.3166:  19%|█▉        | 1055/5475 [4:45:19<20:17:30, 16.53s/it]

ratio tensor(1.4970, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5067, device='cuda:0') reward: tensor(1.3088, device='cuda:0') factor: tensor(0.1979, device='cuda:0')
reward_raw: tensor(0.5036, device='cuda:0') reward: tensor(1.3060, device='cuda:0') factor: tensor(0.1975, device='cuda:0')
reward_raw: tensor(0.5010, device='cuda:0') reward: tensor(1.3022, device='cuda:0') factor: tensor(0.1988, device='cuda:0')
reward_raw: tensor(0.5039, device='cuda:0') reward: tensor(1.3096, device='cuda:0') factor: tensor(0.1943, device='cuda:0')
reward_raw: tensor(0.4939, device='cuda:0') reward: tensor(1.2989, device='cuda:0') factor: tensor(0.1950, device='cuda:0')


Epoch 1, Step 1056: Loss = 0.1621, Actor Loss = 0.1476, Critic Loss = 0.0293, Entropy = 0.1145, Returns = 1.2870, Value = 1.2742, mask_loss = 0.2032std_loss = 0.3515:  19%|█▉        | 1056/5475 [4:45:36<20:30:43, 16.71s/it]  

ratio tensor(0.4978, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5906, device='cuda:0') reward: tensor(1.4004, device='cuda:0') factor: tensor(0.1901, device='cuda:0')
reward_raw: tensor(0.5711, device='cuda:0') reward: tensor(1.3813, device='cuda:0') factor: tensor(0.1899, device='cuda:0')
reward_raw: tensor(0.5728, device='cuda:0') reward: tensor(1.3822, device='cuda:0') factor: tensor(0.1905, device='cuda:0')
reward_raw: tensor(0.5801, device='cuda:0') reward: tensor(1.3941, device='cuda:0') factor: tensor(0.1860, device='cuda:0')
reward_raw: tensor(0.5686, device='cuda:0') reward: tensor(1.3792, device='cuda:0') factor: tensor(0.1894, device='cuda:0')


Epoch 1, Step 1057: Loss = -0.0561, Actor Loss = -0.0863, Critic Loss = 0.0607, Entropy = 0.1052, Returns = 1.4147, Value = 1.3973, mask_loss = 0.1634std_loss = 0.3227:  19%|█▉        | 1057/5475 [4:45:50<19:24:52, 15.82s/it]

ratio tensor(2.6362, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5068, device='cuda:0') reward: tensor(1.2742, device='cuda:0') factor: tensor(0.2326, device='cuda:0')
reward_raw: tensor(0.5062, device='cuda:0') reward: tensor(1.2782, device='cuda:0') factor: tensor(0.2281, device='cuda:0')
reward_raw: tensor(0.5156, device='cuda:0') reward: tensor(1.2883, device='cuda:0') factor: tensor(0.2273, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.2966, device='cuda:0') factor: tensor(0.2241, device='cuda:0')
reward_raw: tensor(0.5144, device='cuda:0') reward: tensor(1.2829, device='cuda:0') factor: tensor(0.2315, device='cuda:0')


Epoch 1, Step 1058: Loss = -0.0278, Actor Loss = -0.0921, Critic Loss = 0.1289, Entropy = 0.1090, Returns = 1.2090, Value = 1.3065, mask_loss = 0.2315std_loss = 0.3713:  19%|█▉        | 1058/5475 [4:46:14<22:19:31, 18.20s/it]

ratio tensor(0.9627, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5594, device='cuda:0') reward: tensor(1.3311, device='cuda:0') factor: tensor(0.2283, device='cuda:0')
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.3320, device='cuda:0') factor: tensor(0.2187, device='cuda:0')
reward_raw: tensor(0.5651, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.2357, device='cuda:0')
reward_raw: tensor(0.5680, device='cuda:0') reward: tensor(1.3446, device='cuda:0') factor: tensor(0.2233, device='cuda:0')
reward_raw: tensor(0.5584, device='cuda:0') reward: tensor(1.3297, device='cuda:0') factor: tensor(0.2286, device='cuda:0')


Epoch 1, Step 1059: Loss = 0.1590, Actor Loss = 0.1155, Critic Loss = 0.0872, Entropy = 0.0775, Returns = 1.3887, Value = 1.3052, mask_loss = 0.1579std_loss = 0.3254:  19%|█▉        | 1059/5475 [4:46:26<20:12:15, 16.47s/it]  

ratio tensor(0.0152, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4940, device='cuda:0') reward: tensor(1.3520, device='cuda:0') factor: tensor(0.1420, device='cuda:0')
reward_raw: tensor(0.4969, device='cuda:0') reward: tensor(1.3544, device='cuda:0') factor: tensor(0.1424, device='cuda:0')
reward_raw: tensor(0.4906, device='cuda:0') reward: tensor(1.3440, device='cuda:0') factor: tensor(0.1466, device='cuda:0')
reward_raw: tensor(0.4695, device='cuda:0') reward: tensor(1.3247, device='cuda:0') factor: tensor(0.1448, device='cuda:0')
reward_raw: tensor(0.4603, device='cuda:0') reward: tensor(1.3135, device='cuda:0') factor: tensor(0.1468, device='cuda:0')


Epoch 1, Step 1060: Loss = -0.0346, Actor Loss = -0.0878, Critic Loss = 0.1065, Entropy = 0.0707, Returns = 1.3536, Value = 1.3196, mask_loss = 0.1203std_loss = 0.2728:  19%|█▉        | 1060/5475 [4:46:44<20:49:58, 16.99s/it]

ratio tensor(1.2198, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5231, device='cuda:0') reward: tensor(1.3965, device='cuda:0') factor: tensor(0.1266, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.3817, device='cuda:0') factor: tensor(0.1263, device='cuda:0')
reward_raw: tensor(0.5268, device='cuda:0') reward: tensor(1.4060, device='cuda:0') factor: tensor(0.1208, device='cuda:0')
reward_raw: tensor(0.5137, device='cuda:0') reward: tensor(1.3923, device='cuda:0') factor: tensor(0.1214, device='cuda:0')
reward_raw: tensor(0.5373, device='cuda:0') reward: tensor(1.4124, device='cuda:0') factor: tensor(0.1248, device='cuda:0')


Epoch 1, Step 1061: Loss = 0.1344, Actor Loss = 0.1001, Critic Loss = 0.0687, Entropy = 0.0887, Returns = 1.4325, Value = 1.4794, mask_loss = 0.1243std_loss = 0.2745:  19%|█▉        | 1060/5475 [4:47:02<20:49:58, 16.99s/it]  

ratio tensor(1.4630, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1061: Loss = 0.1344, Actor Loss = 0.1001, Critic Loss = 0.0687, Entropy = 0.0887, Returns = 1.4325, Value = 1.4794, mask_loss = 0.1243std_loss = 0.2745:  19%|█▉        | 1061/5475 [4:47:02<21:11:27, 17.28s/it]

reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.4156, device='cuda:0') factor: tensor(0.0999, device='cuda:0')
reward_raw: tensor(0.4863, device='cuda:0') reward: tensor(1.3858, device='cuda:0') factor: tensor(0.1005, device='cuda:0')
reward_raw: tensor(0.5059, device='cuda:0') reward: tensor(1.4111, device='cuda:0') factor: tensor(0.0948, device='cuda:0')
reward_raw: tensor(0.4794, device='cuda:0') reward: tensor(1.3848, device='cuda:0') factor: tensor(0.0947, device='cuda:0')
reward_raw: tensor(0.5288, device='cuda:0') reward: tensor(1.4306, device='cuda:0') factor: tensor(0.0982, device='cuda:0')


Epoch 1, Step 1062: Loss = 0.1710, Actor Loss = 0.1403, Critic Loss = 0.0615, Entropy = 0.0602, Returns = 1.3633, Value = 1.4524, mask_loss = 0.0947std_loss = 0.2565:  19%|█▉        | 1062/5475 [4:47:20<21:16:21, 17.35s/it]

ratio tensor(0.8664, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3763, device='cuda:0') reward: tensor(1.2697, device='cuda:0') factor: tensor(0.1066, device='cuda:0')
reward_raw: tensor(0.4050, device='cuda:0') reward: tensor(1.2952, device='cuda:0') factor: tensor(0.1098, device='cuda:0')
reward_raw: tensor(0.3894, device='cuda:0') reward: tensor(1.2819, device='cuda:0') factor: tensor(0.1075, device='cuda:0')
reward_raw: tensor(0.3818, device='cuda:0') reward: tensor(1.2793, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.3888, device='cuda:0') reward: tensor(1.2799, device='cuda:0') factor: tensor(0.1089, device='cuda:0')


Epoch 1, Step 1063: Loss = 0.1862, Actor Loss = 0.1228, Critic Loss = 0.1269, Entropy = 0.0729, Returns = 1.3395, Value = 1.2575, mask_loss = 0.1280std_loss = 0.2949:  19%|█▉        | 1063/5475 [4:47:34<20:03:40, 16.37s/it]

ratio tensor(1.0256, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4512, device='cuda:0') reward: tensor(1.3076, device='cuda:0') factor: tensor(0.1436, device='cuda:0')
reward_raw: tensor(0.4576, device='cuda:0') reward: tensor(1.3114, device='cuda:0') factor: tensor(0.1461, device='cuda:0')
reward_raw: tensor(0.4407, device='cuda:0') reward: tensor(1.2979, device='cuda:0') factor: tensor(0.1427, device='cuda:0')
reward_raw: tensor(0.4583, device='cuda:0') reward: tensor(1.3196, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.4723, device='cuda:0') reward: tensor(1.3235, device='cuda:0') factor: tensor(0.1488, device='cuda:0')


Epoch 1, Step 1064: Loss = -0.1416, Actor Loss = -0.1802, Critic Loss = 0.0773, Entropy = 0.0513, Returns = 1.3954, Value = 1.3723, mask_loss = 0.0883std_loss = 0.2504:  19%|█▉        | 1064/5475 [4:47:52<20:32:31, 16.77s/it]

ratio tensor(1.0410, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4146, device='cuda:0') reward: tensor(1.2997, device='cuda:0') factor: tensor(0.1148, device='cuda:0')
reward_raw: tensor(0.3781, device='cuda:0') reward: tensor(1.2648, device='cuda:0') factor: tensor(0.1133, device='cuda:0')
reward_raw: tensor(0.3832, device='cuda:0') reward: tensor(1.2636, device='cuda:0') factor: tensor(0.1195, device='cuda:0')
reward_raw: tensor(0.3713, device='cuda:0') reward: tensor(1.2553, device='cuda:0') factor: tensor(0.1160, device='cuda:0')
reward_raw: tensor(0.3907, device='cuda:0') reward: tensor(1.2675, device='cuda:0') factor: tensor(0.1232, device='cuda:0')


Epoch 1, Step 1065: Loss = 0.0742, Actor Loss = 0.0271, Critic Loss = 0.0944, Entropy = 0.0735, Returns = 1.2786, Value = 1.3296, mask_loss = 0.1218std_loss = 0.2802:  19%|█▉        | 1065/5475 [4:48:08<20:26:22, 16.69s/it]  

ratio tensor(0.9103, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4282, device='cuda:0') reward: tensor(1.3309, device='cuda:0') factor: tensor(0.0972, device='cuda:0')
reward_raw: tensor(0.4429, device='cuda:0') reward: tensor(1.3473, device='cuda:0') factor: tensor(0.0956, device='cuda:0')
reward_raw: tensor(0.4343, device='cuda:0') reward: tensor(1.3380, device='cuda:0') factor: tensor(0.0963, device='cuda:0')
reward_raw: tensor(0.4484, device='cuda:0') reward: tensor(1.3509, device='cuda:0') factor: tensor(0.0975, device='cuda:0')
reward_raw: tensor(0.4365, device='cuda:0') reward: tensor(1.3349, device='cuda:0') factor: tensor(0.1016, device='cuda:0')


Epoch 1, Step 1066: Loss = -0.0408, Actor Loss = -0.0600, Critic Loss = 0.0386, Entropy = 0.0570, Returns = 1.3406, Value = 1.2979, mask_loss = 0.0959std_loss = 0.2570:  19%|█▉        | 1066/5475 [4:48:31<22:35:42, 18.45s/it]

ratio tensor(0.9827, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3656, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3478, device='cuda:0') factor: tensor(0.1136, device='cuda:0')
reward_raw: tensor(0.4497, device='cuda:0') reward: tensor(1.3374, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.4485, device='cuda:0') reward: tensor(1.3363, device='cuda:0') factor: tensor(0.1122, device='cuda:0')
reward_raw: tensor(0.4750, device='cuda:0') reward: tensor(1.3617, device='cuda:0') factor: tensor(0.1133, device='cuda:0')


Epoch 1, Step 1067: Loss = -0.0267, Actor Loss = -0.0572, Critic Loss = 0.0611, Entropy = 0.0606, Returns = 1.3505, Value = 1.3501, mask_loss = 0.1130std_loss = 0.2805:  19%|█▉        | 1067/5475 [4:48:45<21:17:39, 17.39s/it]

ratio tensor(1.0395, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4739, device='cuda:0') reward: tensor(1.3642, device='cuda:0') factor: tensor(0.1097, device='cuda:0')
reward_raw: tensor(0.4885, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.5050, device='cuda:0') reward: tensor(1.3883, device='cuda:0') factor: tensor(0.1167, device='cuda:0')
reward_raw: tensor(0.5183, device='cuda:0') reward: tensor(1.4004, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.3841, device='cuda:0') factor: tensor(0.1189, device='cuda:0')


Epoch 1, Step 1068: Loss = -0.0276, Actor Loss = -0.0487, Critic Loss = 0.0425, Entropy = 0.0703, Returns = 1.4319, Value = 1.3912, mask_loss = 0.1115std_loss = 0.2682:  20%|█▉        | 1068/5475 [4:49:00<20:06:12, 16.42s/it]

ratio tensor(1.3012, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4520, device='cuda:0') reward: tensor(1.3160, device='cuda:0') factor: tensor(0.1360, device='cuda:0')
reward_raw: tensor(0.4657, device='cuda:0') reward: tensor(1.3330, device='cuda:0') factor: tensor(0.1327, device='cuda:0')
reward_raw: tensor(0.4320, device='cuda:0') reward: tensor(1.3029, device='cuda:0') factor: tensor(0.1292, device='cuda:0')
reward_raw: tensor(0.4562, device='cuda:0') reward: tensor(1.3279, device='cuda:0') factor: tensor(0.1283, device='cuda:0')
reward_raw: tensor(0.4524, device='cuda:0') reward: tensor(1.3195, device='cuda:0') factor: tensor(0.1329, device='cuda:0')


Epoch 1, Step 1069: Loss = 0.0814, Actor Loss = 0.0620, Critic Loss = 0.0389, Entropy = 0.0752, Returns = 1.2572, Value = 1.3633, mask_loss = 0.1273std_loss = 0.2918:  20%|█▉        | 1069/5475 [4:49:18<20:40:55, 16.90s/it]  

ratio tensor(1.0129, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4980, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.1108, device='cuda:0')
reward_raw: tensor(0.4605, device='cuda:0') reward: tensor(1.3575, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.4907, device='cuda:0') reward: tensor(1.3838, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.4657, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.1006, device='cuda:0')
reward_raw: tensor(0.4751, device='cuda:0') reward: tensor(1.3690, device='cuda:0') factor: tensor(0.1061, device='cuda:0')


Epoch 1, Step 1070: Loss = -0.1366, Actor Loss = -0.1834, Critic Loss = 0.0938, Entropy = 0.0818, Returns = 1.4622, Value = 1.3477, mask_loss = 0.1302std_loss = 0.2942:  20%|█▉        | 1070/5475 [4:49:34<20:18:09, 16.59s/it]

ratio tensor(1.2628, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5690, device='cuda:0') reward: tensor(1.4454, device='cuda:0') factor: tensor(0.1237, device='cuda:0')
reward_raw: tensor(0.5867, device='cuda:0') reward: tensor(1.4736, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.5587, device='cuda:0') reward: tensor(1.4396, device='cuda:0') factor: tensor(0.1191, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.4196, device='cuda:0') factor: tensor(0.1180, device='cuda:0')
reward_raw: tensor(0.5655, device='cuda:0') reward: tensor(1.4507, device='cuda:0') factor: tensor(0.1148, device='cuda:0')


Epoch 1, Step 1071: Loss = 0.0041, Actor Loss = -0.0143, Critic Loss = 0.0370, Entropy = 0.0626, Returns = 1.4805, Value = 1.4990, mask_loss = 0.1211std_loss = 0.2915:  20%|█▉        | 1071/5475 [4:49:47<19:19:33, 15.80s/it] 

ratio tensor(1.0315, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4468, device='cuda:0') reward: tensor(1.3364, device='cuda:0') factor: tensor(0.1104, device='cuda:0')
reward_raw: tensor(0.4360, device='cuda:0') reward: tensor(1.3231, device='cuda:0') factor: tensor(0.1129, device='cuda:0')
reward_raw: tensor(0.4540, device='cuda:0') reward: tensor(1.3402, device='cuda:0') factor: tensor(0.1139, device='cuda:0')
reward_raw: tensor(0.4343, device='cuda:0') reward: tensor(1.3286, device='cuda:0') factor: tensor(0.1057, device='cuda:0')
reward_raw: tensor(0.4580, device='cuda:0') reward: tensor(1.3474, device='cuda:0') factor: tensor(0.1106, device='cuda:0')


Epoch 1, Step 1072: Loss = 0.1994, Actor Loss = 0.1671, Critic Loss = 0.0647, Entropy = 0.0700, Returns = 1.3149, Value = 1.2910, mask_loss = 0.1237std_loss = 0.2851:  20%|█▉        | 1072/5475 [4:50:02<18:51:44, 15.42s/it] 

ratio tensor(0.8862, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5006, device='cuda:0') reward: tensor(1.3929, device='cuda:0') factor: tensor(0.1077, device='cuda:0')
reward_raw: tensor(0.4852, device='cuda:0') reward: tensor(1.3787, device='cuda:0') factor: tensor(0.1065, device='cuda:0')
reward_raw: tensor(0.4741, device='cuda:0') reward: tensor(1.3705, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.4009, device='cuda:0') factor: tensor(0.1084, device='cuda:0')
reward_raw: tensor(0.5023, device='cuda:0') reward: tensor(1.3956, device='cuda:0') factor: tensor(0.1067, device='cuda:0')


Epoch 1, Step 1073: Loss = -0.1826, Actor Loss = -0.2016, Critic Loss = 0.0380, Entropy = 0.0508, Returns = 1.4131, Value = 1.4102, mask_loss = 0.1046std_loss = 0.2742:  20%|█▉        | 1073/5475 [4:50:20<19:47:22, 16.18s/it]

ratio tensor(1.1470, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4897, device='cuda:0') reward: tensor(1.3724, device='cuda:0') factor: tensor(0.1173, device='cuda:0')
reward_raw: tensor(0.4462, device='cuda:0') reward: tensor(1.3338, device='cuda:0') factor: tensor(0.1124, device='cuda:0')
reward_raw: tensor(0.4624, device='cuda:0') reward: tensor(1.3466, device='cuda:0') factor: tensor(0.1158, device='cuda:0')
reward_raw: tensor(0.4538, device='cuda:0') reward: tensor(1.3336, device='cuda:0') factor: tensor(0.1201, device='cuda:0')
reward_raw: tensor(0.4724, device='cuda:0') reward: tensor(1.3530, device='cuda:0') factor: tensor(0.1194, device='cuda:0')


Epoch 1, Step 1074: Loss = 0.1706, Actor Loss = 0.1328, Critic Loss = 0.0757, Entropy = 0.0581, Returns = 1.3771, Value = 1.5122, mask_loss = 0.0996std_loss = 0.2651:  20%|█▉        | 1074/5475 [4:50:40<21:03:53, 17.23s/it]  

ratio tensor(1.2344, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5042, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.1189, device='cuda:0')
reward_raw: tensor(0.4910, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.4869, device='cuda:0') reward: tensor(1.3673, device='cuda:0') factor: tensor(0.1196, device='cuda:0')
reward_raw: tensor(0.4948, device='cuda:0') reward: tensor(1.3769, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.5057, device='cuda:0') reward: tensor(1.3925, device='cuda:0') factor: tensor(0.1132, device='cuda:0')


Epoch 1, Step 1075: Loss = 0.1045, Actor Loss = 0.0577, Critic Loss = 0.0938, Entropy = 0.0703, Returns = 1.3816, Value = 1.2807, mask_loss = 0.1017std_loss = 0.2583:  20%|█▉        | 1075/5475 [4:50:57<21:14:49, 17.38s/it]

ratio tensor(1.5864, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4790, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.1173, device='cuda:0')
reward_raw: tensor(0.5051, device='cuda:0') reward: tensor(1.3887, device='cuda:0') factor: tensor(0.1164, device='cuda:0')
reward_raw: tensor(0.4831, device='cuda:0') reward: tensor(1.3631, device='cuda:0') factor: tensor(0.1200, device='cuda:0')
reward_raw: tensor(0.4526, device='cuda:0') reward: tensor(1.3372, device='cuda:0') factor: tensor(0.1155, device='cuda:0')
reward_raw: tensor(0.4942, device='cuda:0') reward: tensor(1.3748, device='cuda:0') factor: tensor(0.1195, device='cuda:0')


Epoch 1, Step 1076: Loss = -0.1440, Actor Loss = -0.1651, Critic Loss = 0.0425, Entropy = 0.0826, Returns = 1.2706, Value = 1.2903, mask_loss = 0.1194std_loss = 0.2781:  20%|█▉        | 1076/5475 [4:51:11<19:53:46, 16.28s/it]

ratio tensor(1.1590, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4486, device='cuda:0') reward: tensor(1.3367, device='cuda:0') factor: tensor(0.1119, device='cuda:0')
reward_raw: tensor(0.4534, device='cuda:0') reward: tensor(1.3456, device='cuda:0') factor: tensor(0.1078, device='cuda:0')
reward_raw: tensor(0.4514, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.1084, device='cuda:0')
reward_raw: tensor(0.4465, device='cuda:0') reward: tensor(1.3411, device='cuda:0') factor: tensor(0.1054, device='cuda:0')
reward_raw: tensor(0.4298, device='cuda:0') reward: tensor(1.3190, device='cuda:0') factor: tensor(0.1108, device='cuda:0')


Epoch 1, Step 1077: Loss = 0.1047, Actor Loss = 0.0758, Critic Loss = 0.0578, Entropy = 0.0572, Returns = 1.2790, Value = 1.3226, mask_loss = 0.0989std_loss = 0.2647:  20%|█▉        | 1077/5475 [4:51:28<19:59:01, 16.36s/it]  

ratio tensor(0.9947, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5488, device='cuda:0') reward: tensor(1.4301, device='cuda:0') factor: tensor(0.1187, device='cuda:0')
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.4189, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.5450, device='cuda:0') reward: tensor(1.4284, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.5472, device='cuda:0') reward: tensor(1.4327, device='cuda:0') factor: tensor(0.1144, device='cuda:0')
reward_raw: tensor(0.5290, device='cuda:0') reward: tensor(1.4175, device='cuda:0') factor: tensor(0.1115, device='cuda:0')


Epoch 1, Step 1078: Loss = -0.0308, Actor Loss = -0.0642, Critic Loss = 0.0668, Entropy = 0.0543, Returns = 1.4479, Value = 1.4244, mask_loss = 0.0987std_loss = 0.2618:  20%|█▉        | 1078/5475 [4:51:45<20:22:25, 16.68s/it]

ratio tensor(1.1236, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4443, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.1176, device='cuda:0')
reward_raw: tensor(0.4356, device='cuda:0') reward: tensor(1.3208, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.4473, device='cuda:0') reward: tensor(1.3379, device='cuda:0') factor: tensor(0.1094, device='cuda:0')
reward_raw: tensor(0.4580, device='cuda:0') reward: tensor(1.3424, device='cuda:0') factor: tensor(0.1155, device='cuda:0')
reward_raw: tensor(0.4734, device='cuda:0') reward: tensor(1.3579, device='cuda:0') factor: tensor(0.1155, device='cuda:0')


Epoch 1, Step 1079: Loss = -0.0345, Actor Loss = -0.0628, Critic Loss = 0.0569, Entropy = 0.0729, Returns = 1.3256, Value = 1.3284, mask_loss = 0.1209std_loss = 0.2838:  20%|█▉        | 1079/5475 [4:52:05<21:28:58, 17.59s/it]

ratio tensor(1.0785, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4274, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.3771, device='cuda:0') reward: tensor(1.2715, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.4014, device='cuda:0') reward: tensor(1.2982, device='cuda:0') factor: tensor(0.1032, device='cuda:0')
reward_raw: tensor(0.3825, device='cuda:0') reward: tensor(1.2746, device='cuda:0') factor: tensor(0.1079, device='cuda:0')
reward_raw: tensor(0.3799, device='cuda:0') reward: tensor(1.2728, device='cuda:0') factor: tensor(0.1071, device='cuda:0')


Epoch 1, Step 1080: Loss = 0.2930, Actor Loss = 0.2546, Critic Loss = 0.0769, Entropy = 0.0649, Returns = 1.2542, Value = 1.2563, mask_loss = 0.1035std_loss = 0.2676:  20%|█▉        | 1080/5475 [4:52:21<20:48:06, 17.04s/it]  

ratio tensor(0.7695, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4978, device='cuda:0') reward: tensor(1.3508, device='cuda:0') factor: tensor(0.1469, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3537, device='cuda:0') factor: tensor(0.1406, device='cuda:0')
reward_raw: tensor(0.4900, device='cuda:0') reward: tensor(1.3523, device='cuda:0') factor: tensor(0.1377, device='cuda:0')
reward_raw: tensor(0.5181, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.1416, device='cuda:0')
reward_raw: tensor(0.4948, device='cuda:0') reward: tensor(1.3605, device='cuda:0') factor: tensor(0.1343, device='cuda:0')


Epoch 1, Step 1081: Loss = -0.0071, Actor Loss = -0.0403, Critic Loss = 0.0665, Entropy = 0.0773, Returns = 1.3209, Value = 1.4185, mask_loss = 0.1354std_loss = 0.2993:  20%|█▉        | 1080/5475 [4:52:36<20:48:06, 17.04s/it]

ratio tensor(1.6686, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1081: Loss = -0.0071, Actor Loss = -0.0403, Critic Loss = 0.0665, Entropy = 0.0773, Returns = 1.3209, Value = 1.4185, mask_loss = 0.1354std_loss = 0.2993:  20%|█▉        | 1081/5475 [4:52:36<20:09:05, 16.51s/it]

reward_raw: tensor(0.4274, device='cuda:0') reward: tensor(1.2956, device='cuda:0') factor: tensor(0.1319, device='cuda:0')
reward_raw: tensor(0.4544, device='cuda:0') reward: tensor(1.3237, device='cuda:0') factor: tensor(0.1307, device='cuda:0')
reward_raw: tensor(0.4323, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.1267, device='cuda:0')
reward_raw: tensor(0.4452, device='cuda:0') reward: tensor(1.3155, device='cuda:0') factor: tensor(0.1297, device='cuda:0')
reward_raw: tensor(0.4437, device='cuda:0') reward: tensor(1.3152, device='cuda:0') factor: tensor(0.1286, device='cuda:0')


Epoch 1, Step 1082: Loss = 0.0191, Actor Loss = 0.0011, Critic Loss = 0.0362, Entropy = 0.0702, Returns = 1.3105, Value = 1.3457, mask_loss = 0.1336std_loss = 0.2977:  20%|█▉        | 1082/5475 [4:52:52<20:07:37, 16.49s/it]  

ratio tensor(1.2186, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4709, device='cuda:0') reward: tensor(1.3408, device='cuda:0') factor: tensor(0.1301, device='cuda:0')
reward_raw: tensor(0.4535, device='cuda:0') reward: tensor(1.3214, device='cuda:0') factor: tensor(0.1321, device='cuda:0')
reward_raw: tensor(0.4400, device='cuda:0') reward: tensor(1.3070, device='cuda:0') factor: tensor(0.1330, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3529, device='cuda:0') factor: tensor(0.1343, device='cuda:0')
reward_raw: tensor(0.4544, device='cuda:0') reward: tensor(1.3190, device='cuda:0') factor: tensor(0.1354, device='cuda:0')


Epoch 1, Step 1083: Loss = -0.0625, Actor Loss = -0.0821, Critic Loss = 0.0393, Entropy = 0.0694, Returns = 1.3473, Value = 1.3481, mask_loss = 0.1392std_loss = 0.3012:  20%|█▉        | 1083/5475 [4:53:12<21:21:52, 17.51s/it]

ratio tensor(1.0076, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.4018, device='cuda:0') factor: tensor(0.1094, device='cuda:0')
reward_raw: tensor(0.5057, device='cuda:0') reward: tensor(1.3900, device='cuda:0') factor: tensor(0.1157, device='cuda:0')
reward_raw: tensor(0.4993, device='cuda:0') reward: tensor(1.3848, device='cuda:0') factor: tensor(0.1146, device='cuda:0')
reward_raw: tensor(0.5008, device='cuda:0') reward: tensor(1.3891, device='cuda:0') factor: tensor(0.1117, device='cuda:0')
reward_raw: tensor(0.5111, device='cuda:0') reward: tensor(1.3954, device='cuda:0') factor: tensor(0.1157, device='cuda:0')


Epoch 1, Step 1084: Loss = 0.0010, Actor Loss = -0.0502, Critic Loss = 0.1025, Entropy = 0.0545, Returns = 1.3374, Value = 1.4966, mask_loss = 0.0943std_loss = 0.2606:  20%|█▉        | 1084/5475 [4:53:25<19:45:15, 16.20s/it] 

ratio tensor(1.3577, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3679, device='cuda:0') reward: tensor(1.2630, device='cuda:0') factor: tensor(0.1049, device='cuda:0')
reward_raw: tensor(0.4066, device='cuda:0') reward: tensor(1.3039, device='cuda:0') factor: tensor(0.1027, device='cuda:0')
reward_raw: tensor(0.3779, device='cuda:0') reward: tensor(1.2714, device='cuda:0') factor: tensor(0.1065, device='cuda:0')
reward_raw: tensor(0.4235, device='cuda:0') reward: tensor(1.3114, device='cuda:0') factor: tensor(0.1121, device='cuda:0')
reward_raw: tensor(0.3899, device='cuda:0') reward: tensor(1.2829, device='cuda:0') factor: tensor(0.1070, device='cuda:0')


Epoch 1, Step 1085: Loss = 0.4687, Actor Loss = 0.4169, Critic Loss = 0.1039, Entropy = 0.0678, Returns = 1.2595, Value = 1.1994, mask_loss = 0.1210std_loss = 0.2877:  20%|█▉        | 1085/5475 [4:53:41<19:32:27, 16.02s/it] 

ratio tensor(0.8649, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3452, device='cuda:0') reward: tensor(1.2254, device='cuda:0') factor: tensor(0.1199, device='cuda:0')
reward_raw: tensor(0.3812, device='cuda:0') reward: tensor(1.2641, device='cuda:0') factor: tensor(0.1171, device='cuda:0')
reward_raw: tensor(0.4027, device='cuda:0') reward: tensor(1.2806, device='cuda:0') factor: tensor(0.1221, device='cuda:0')
reward_raw: tensor(0.3746, device='cuda:0') reward: tensor(1.2561, device='cuda:0') factor: tensor(0.1185, device='cuda:0')
reward_raw: tensor(0.4174, device='cuda:0') reward: tensor(1.2941, device='cuda:0') factor: tensor(0.1234, device='cuda:0')


Epoch 1, Step 1086: Loss = -0.2136, Actor Loss = -0.2539, Critic Loss = 0.0808, Entropy = 0.0730, Returns = 1.2297, Value = 1.2621, mask_loss = 0.1311std_loss = 0.2938:  20%|█▉        | 1086/5475 [4:53:54<18:33:01, 15.22s/it]

ratio tensor(2.7450, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4506, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.1215, device='cuda:0')
reward_raw: tensor(0.4442, device='cuda:0') reward: tensor(1.3200, device='cuda:0') factor: tensor(0.1241, device='cuda:0')
reward_raw: tensor(0.4379, device='cuda:0') reward: tensor(1.3149, device='cuda:0') factor: tensor(0.1230, device='cuda:0')
reward_raw: tensor(0.4727, device='cuda:0') reward: tensor(1.3486, device='cuda:0') factor: tensor(0.1241, device='cuda:0')
reward_raw: tensor(0.4634, device='cuda:0') reward: tensor(1.3450, device='cuda:0') factor: tensor(0.1184, device='cuda:0')


Epoch 1, Step 1087: Loss = -0.0288, Actor Loss = -0.0638, Critic Loss = 0.0701, Entropy = 0.0865, Returns = 1.3203, Value = 1.3492, mask_loss = 0.1317std_loss = 0.2937:  20%|█▉        | 1087/5475 [4:54:08<18:01:18, 14.79s/it]

ratio tensor(1.2152, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4279, device='cuda:0') reward: tensor(1.2905, device='cuda:0') factor: tensor(0.1373, device='cuda:0')
reward_raw: tensor(0.4737, device='cuda:0') reward: tensor(1.3417, device='cuda:0') factor: tensor(0.1319, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.1350, device='cuda:0')
reward_raw: tensor(0.4680, device='cuda:0') reward: tensor(1.3304, device='cuda:0') factor: tensor(0.1376, device='cuda:0')
reward_raw: tensor(0.4460, device='cuda:0') reward: tensor(1.3169, device='cuda:0') factor: tensor(0.1291, device='cuda:0')


Epoch 1, Step 1088: Loss = -0.0546, Actor Loss = -0.0886, Critic Loss = 0.0681, Entropy = 0.0729, Returns = 1.3720, Value = 1.2474, mask_loss = 0.1453std_loss = 0.3081:  20%|█▉        | 1088/5475 [4:54:22<17:35:42, 14.44s/it]

ratio tensor(1.0277, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4726, device='cuda:0') reward: tensor(1.3503, device='cuda:0') factor: tensor(0.1223, device='cuda:0')
reward_raw: tensor(0.5165, device='cuda:0') reward: tensor(1.3935, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.4922, device='cuda:0') reward: tensor(1.3727, device='cuda:0') factor: tensor(0.1195, device='cuda:0')
reward_raw: tensor(0.4941, device='cuda:0') reward: tensor(1.3732, device='cuda:0') factor: tensor(0.1209, device='cuda:0')
reward_raw: tensor(0.4929, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.1175, device='cuda:0')


Epoch 1, Step 1089: Loss = -0.0109, Actor Loss = -0.0366, Critic Loss = 0.0517, Entropy = 0.0619, Returns = 1.4078, Value = 1.4276, mask_loss = 0.1211std_loss = 0.2917:  20%|█▉        | 1089/5475 [4:54:35<17:03:40, 14.00s/it]

ratio tensor(1.0050, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4625, device='cuda:0') reward: tensor(1.3517, device='cuda:0') factor: tensor(0.1107, device='cuda:0')
reward_raw: tensor(0.4478, device='cuda:0') reward: tensor(1.3363, device='cuda:0') factor: tensor(0.1115, device='cuda:0')
reward_raw: tensor(0.4411, device='cuda:0') reward: tensor(1.3313, device='cuda:0') factor: tensor(0.1098, device='cuda:0')
reward_raw: tensor(0.4919, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1093, device='cuda:0')
reward_raw: tensor(0.4760, device='cuda:0') reward: tensor(1.3668, device='cuda:0') factor: tensor(0.1092, device='cuda:0')


Epoch 1, Step 1090: Loss = -0.0930, Actor Loss = -0.1464, Critic Loss = 0.1069, Entropy = 0.0488, Returns = 1.3607, Value = 1.2822, mask_loss = 0.0910std_loss = 0.2575:  20%|█▉        | 1090/5475 [4:54:48<16:44:47, 13.75s/it]

ratio tensor(0.9779, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3976, device='cuda:0') reward: tensor(1.2871, device='cuda:0') factor: tensor(0.1106, device='cuda:0')
reward_raw: tensor(0.4195, device='cuda:0') reward: tensor(1.3087, device='cuda:0') factor: tensor(0.1108, device='cuda:0')
reward_raw: tensor(0.4105, device='cuda:0') reward: tensor(1.3001, device='cuda:0') factor: tensor(0.1104, device='cuda:0')
reward_raw: tensor(0.3851, device='cuda:0') reward: tensor(1.2789, device='cuda:0') factor: tensor(0.1062, device='cuda:0')
reward_raw: tensor(0.3993, device='cuda:0') reward: tensor(1.2867, device='cuda:0') factor: tensor(0.1126, device='cuda:0')


Epoch 1, Step 1091: Loss = 0.2604, Actor Loss = 0.2206, Critic Loss = 0.0798, Entropy = 0.0717, Returns = 1.2978, Value = 1.3101, mask_loss = 0.1193std_loss = 0.2812:  20%|█▉        | 1091/5475 [4:55:05<18:00:19, 14.79s/it]  

ratio tensor(1.0739, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.4093, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.5581, device='cuda:0') reward: tensor(1.4345, device='cuda:0') factor: tensor(0.1236, device='cuda:0')
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.3939, device='cuda:0') factor: tensor(0.1197, device='cuda:0')
reward_raw: tensor(0.5227, device='cuda:0') reward: tensor(1.4058, device='cuda:0') factor: tensor(0.1169, device='cuda:0')
reward_raw: tensor(0.5419, device='cuda:0') reward: tensor(1.4217, device='cuda:0') factor: tensor(0.1202, device='cuda:0')


Epoch 1, Step 1092: Loss = -0.2150, Actor Loss = -0.2497, Critic Loss = 0.0696, Entropy = 0.0644, Returns = 1.4494, Value = 1.4153, mask_loss = 0.1150std_loss = 0.2819:  20%|█▉        | 1092/5475 [4:55:23<19:15:22, 15.82s/it]

ratio tensor(1.3138, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5445, device='cuda:0') reward: tensor(1.4069, device='cuda:0') factor: tensor(0.1376, device='cuda:0')
reward_raw: tensor(0.5535, device='cuda:0') reward: tensor(1.4138, device='cuda:0') factor: tensor(0.1397, device='cuda:0')
reward_raw: tensor(0.5615, device='cuda:0') reward: tensor(1.4264, device='cuda:0') factor: tensor(0.1351, device='cuda:0')
reward_raw: tensor(0.5429, device='cuda:0') reward: tensor(1.4038, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.5348, device='cuda:0') reward: tensor(1.3978, device='cuda:0') factor: tensor(0.1370, device='cuda:0')


Epoch 1, Step 1093: Loss = 0.0958, Actor Loss = 0.0576, Critic Loss = 0.0767, Entropy = 0.0691, Returns = 1.4499, Value = 1.5113, mask_loss = 0.1108std_loss = 0.2706:  20%|█▉        | 1093/5475 [4:55:39<19:15:59, 15.83s/it]  

ratio tensor(0.8606, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3677, device='cuda:0') reward: tensor(1.2540, device='cuda:0') factor: tensor(0.1138, device='cuda:0')
reward_raw: tensor(0.3660, device='cuda:0') reward: tensor(1.2518, device='cuda:0') factor: tensor(0.1142, device='cuda:0')
reward_raw: tensor(0.3769, device='cuda:0') reward: tensor(1.2627, device='cuda:0') factor: tensor(0.1142, device='cuda:0')
reward_raw: tensor(0.3809, device='cuda:0') reward: tensor(1.2673, device='cuda:0') factor: tensor(0.1137, device='cuda:0')
reward_raw: tensor(0.3754, device='cuda:0') reward: tensor(1.2624, device='cuda:0') factor: tensor(0.1130, device='cuda:0')


Epoch 1, Step 1094: Loss = 0.1366, Actor Loss = 0.0902, Critic Loss = 0.0928, Entropy = 0.0730, Returns = 1.2176, Value = 1.0805, mask_loss = 0.1147std_loss = 0.2769:  20%|█▉        | 1094/5475 [4:55:56<19:38:42, 16.14s/it]

ratio tensor(0.8294, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4203, device='cuda:0') reward: tensor(1.3038, device='cuda:0') factor: tensor(0.1165, device='cuda:0')
reward_raw: tensor(0.4204, device='cuda:0') reward: tensor(1.3115, device='cuda:0') factor: tensor(0.1089, device='cuda:0')
reward_raw: tensor(0.4378, device='cuda:0') reward: tensor(1.3269, device='cuda:0') factor: tensor(0.1109, device='cuda:0')
reward_raw: tensor(0.4196, device='cuda:0') reward: tensor(1.3062, device='cuda:0') factor: tensor(0.1134, device='cuda:0')
reward_raw: tensor(0.4515, device='cuda:0') reward: tensor(1.3339, device='cuda:0') factor: tensor(0.1176, device='cuda:0')


Epoch 1, Step 1095: Loss = 0.0465, Actor Loss = -0.0088, Critic Loss = 0.1107, Entropy = 0.0861, Returns = 1.2753, Value = 1.2190, mask_loss = 0.1178std_loss = 0.2640:  20%|██        | 1095/5475 [4:56:12<19:25:42, 15.97s/it]

ratio tensor(1.0469, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5402, device='cuda:0') reward: tensor(1.4144, device='cuda:0') factor: tensor(0.1258, device='cuda:0')
reward_raw: tensor(0.5570, device='cuda:0') reward: tensor(1.4325, device='cuda:0') factor: tensor(0.1246, device='cuda:0')
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.4221, device='cuda:0') factor: tensor(0.1290, device='cuda:0')
reward_raw: tensor(0.5320, device='cuda:0') reward: tensor(1.4144, device='cuda:0') factor: tensor(0.1176, device='cuda:0')
reward_raw: tensor(0.5408, device='cuda:0') reward: tensor(1.4186, device='cuda:0') factor: tensor(0.1222, device='cuda:0')


Epoch 1, Step 1096: Loss = -0.1514, Actor Loss = -0.2187, Critic Loss = 0.1348, Entropy = 0.0714, Returns = 1.3893, Value = 1.5119, mask_loss = 0.1204std_loss = 0.2787:  20%|██        | 1096/5475 [4:56:29<19:56:28, 16.39s/it]

ratio tensor(1.6550, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3965, device='cuda:0') reward: tensor(1.2892, device='cuda:0') factor: tensor(0.1073, device='cuda:0')
reward_raw: tensor(0.3669, device='cuda:0') reward: tensor(1.2569, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.3863, device='cuda:0') reward: tensor(1.2861, device='cuda:0') factor: tensor(0.1003, device='cuda:0')
reward_raw: tensor(0.3792, device='cuda:0') reward: tensor(1.2787, device='cuda:0') factor: tensor(0.1005, device='cuda:0')
reward_raw: tensor(0.3864, device='cuda:0') reward: tensor(1.2769, device='cuda:0') factor: tensor(0.1094, device='cuda:0')


Epoch 1, Step 1097: Loss = 0.3069, Actor Loss = 0.2776, Critic Loss = 0.0588, Entropy = 0.0529, Returns = 1.3156, Value = 1.3877, mask_loss = 0.0996std_loss = 0.2674:  20%|██        | 1097/5475 [4:56:43<19:04:55, 15.69s/it]  

ratio tensor(1.0948, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3995, device='cuda:0') reward: tensor(1.3000, device='cuda:0') factor: tensor(0.0995, device='cuda:0')
reward_raw: tensor(0.4206, device='cuda:0') reward: tensor(1.3216, device='cuda:0') factor: tensor(0.0990, device='cuda:0')
reward_raw: tensor(0.3955, device='cuda:0') reward: tensor(1.2925, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.4121, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.0993, device='cuda:0')
reward_raw: tensor(0.4265, device='cuda:0') reward: tensor(1.3290, device='cuda:0') factor: tensor(0.0975, device='cuda:0')


Epoch 1, Step 1098: Loss = -0.0124, Actor Loss = -0.0359, Critic Loss = 0.0472, Entropy = 0.0530, Returns = 1.2308, Value = 1.2858, mask_loss = 0.0928std_loss = 0.2576:  20%|██        | 1098/5475 [4:56:56<18:10:57, 14.95s/it]

ratio tensor(0.7568, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5295, device='cuda:0') reward: tensor(1.4132, device='cuda:0') factor: tensor(0.1163, device='cuda:0')
reward_raw: tensor(0.5443, device='cuda:0') reward: tensor(1.4272, device='cuda:0') factor: tensor(0.1171, device='cuda:0')
reward_raw: tensor(0.5473, device='cuda:0') reward: tensor(1.4196, device='cuda:0') factor: tensor(0.1277, device='cuda:0')
reward_raw: tensor(0.5487, device='cuda:0') reward: tensor(1.4248, device='cuda:0') factor: tensor(0.1240, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.4185, device='cuda:0') factor: tensor(0.1233, device='cuda:0')


Epoch 1, Step 1099: Loss = 0.0922, Actor Loss = 0.0064, Critic Loss = 0.1717, Entropy = 0.0661, Returns = 1.3919, Value = 1.2819, mask_loss = 0.1129std_loss = 0.2792:  20%|██        | 1099/5475 [4:57:11<17:59:01, 14.79s/it]  

ratio tensor(0.8865, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5469, device='cuda:0') reward: tensor(1.4257, device='cuda:0') factor: tensor(0.1213, device='cuda:0')
reward_raw: tensor(0.5541, device='cuda:0') reward: tensor(1.4306, device='cuda:0') factor: tensor(0.1235, device='cuda:0')
reward_raw: tensor(0.5608, device='cuda:0') reward: tensor(1.4411, device='cuda:0') factor: tensor(0.1197, device='cuda:0')
reward_raw: tensor(0.5771, device='cuda:0') reward: tensor(1.4521, device='cuda:0') factor: tensor(0.1250, device='cuda:0')
reward_raw: tensor(0.5444, device='cuda:0') reward: tensor(1.4226, device='cuda:0') factor: tensor(0.1218, device='cuda:0')


Epoch 1, Step 1100: Loss = -0.0614, Actor Loss = -0.0938, Critic Loss = 0.0650, Entropy = 0.0815, Returns = 1.4280, Value = 1.4592, mask_loss = 0.1540std_loss = 0.3057:  20%|██        | 1100/5475 [4:57:25<17:43:59, 14.59s/it]

ratio tensor(1.1667, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4927, device='cuda:0') reward: tensor(1.3857, device='cuda:0') factor: tensor(0.1070, device='cuda:0')
reward_raw: tensor(0.4978, device='cuda:0') reward: tensor(1.3871, device='cuda:0') factor: tensor(0.1107, device='cuda:0')
reward_raw: tensor(0.5122, device='cuda:0') reward: tensor(1.4009, device='cuda:0') factor: tensor(0.1113, device='cuda:0')
reward_raw: tensor(0.5205, device='cuda:0') reward: tensor(1.4116, device='cuda:0') factor: tensor(0.1089, device='cuda:0')
reward_raw: tensor(0.5025, device='cuda:0') reward: tensor(1.3958, device='cuda:0') factor: tensor(0.1066, device='cuda:0')


Epoch 1, Step 1101: Loss = -0.0289, Actor Loss = -0.0598, Critic Loss = 0.0620, Entropy = 0.0865, Returns = 1.3590, Value = 1.3894, mask_loss = 0.1317std_loss = 0.2820:  20%|██        | 1100/5475 [4:57:40<17:43:59, 14.59s/it]

ratio tensor(1.2239, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1101: Loss = -0.0289, Actor Loss = -0.0598, Critic Loss = 0.0620, Entropy = 0.0865, Returns = 1.3590, Value = 1.3894, mask_loss = 0.1317std_loss = 0.2820:  20%|██        | 1101/5475 [4:57:40<17:57:33, 14.78s/it]

reward_raw: tensor(0.5414, device='cuda:0') reward: tensor(1.4143, device='cuda:0') factor: tensor(0.1271, device='cuda:0')
reward_raw: tensor(0.5283, device='cuda:0') reward: tensor(1.3929, device='cuda:0') factor: tensor(0.1353, device='cuda:0')
reward_raw: tensor(0.5140, device='cuda:0') reward: tensor(1.3936, device='cuda:0') factor: tensor(0.1205, device='cuda:0')
reward_raw: tensor(0.5026, device='cuda:0') reward: tensor(1.3785, device='cuda:0') factor: tensor(0.1241, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.3883, device='cuda:0') factor: tensor(0.1244, device='cuda:0')


Epoch 1, Step 1102: Loss = 0.0748, Actor Loss = 0.0301, Critic Loss = 0.0895, Entropy = 0.0582, Returns = 1.4504, Value = 1.3892, mask_loss = 0.0920std_loss = 0.2559:  20%|██        | 1102/5475 [4:57:54<17:37:26, 14.51s/it]  

ratio tensor(0.8323, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4436, device='cuda:0') reward: tensor(1.3300, device='cuda:0') factor: tensor(0.1135, device='cuda:0')
reward_raw: tensor(0.4445, device='cuda:0') reward: tensor(1.3338, device='cuda:0') factor: tensor(0.1107, device='cuda:0')
reward_raw: tensor(0.4309, device='cuda:0') reward: tensor(1.3161, device='cuda:0') factor: tensor(0.1148, device='cuda:0')
reward_raw: tensor(0.4406, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.1115, device='cuda:0')
reward_raw: tensor(0.4500, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.1117, device='cuda:0')


Epoch 1, Step 1103: Loss = 0.1142, Actor Loss = 0.0843, Critic Loss = 0.0599, Entropy = 0.0733, Returns = 1.2971, Value = 1.2670, mask_loss = 0.1314std_loss = 0.2841:  20%|██        | 1103/5475 [4:58:11<18:34:15, 15.29s/it]

ratio tensor(0.9609, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5493, device='cuda:0') reward: tensor(1.4504, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.5196, device='cuda:0') reward: tensor(1.4192, device='cuda:0') factor: tensor(0.1004, device='cuda:0')
reward_raw: tensor(0.5365, device='cuda:0') reward: tensor(1.4342, device='cuda:0') factor: tensor(0.1023, device='cuda:0')
reward_raw: tensor(0.5268, device='cuda:0') reward: tensor(1.4266, device='cuda:0') factor: tensor(0.1002, device='cuda:0')
reward_raw: tensor(0.5337, device='cuda:0') reward: tensor(1.4308, device='cuda:0') factor: tensor(0.1028, device='cuda:0')


Epoch 1, Step 1104: Loss = -0.0030, Actor Loss = -0.0815, Critic Loss = 0.1570, Entropy = 0.0665, Returns = 1.4223, Value = 1.6511, mask_loss = 0.1054std_loss = 0.2721:  20%|██        | 1104/5475 [4:58:24<17:43:37, 14.60s/it]

ratio tensor(1.2116, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.1095, device='cuda:0')
reward_raw: tensor(0.4330, device='cuda:0') reward: tensor(1.3232, device='cuda:0') factor: tensor(0.1097, device='cuda:0')
reward_raw: tensor(0.4466, device='cuda:0') reward: tensor(1.3435, device='cuda:0') factor: tensor(0.1031, device='cuda:0')
reward_raw: tensor(0.4558, device='cuda:0') reward: tensor(1.3509, device='cuda:0') factor: tensor(0.1049, device='cuda:0')
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.3796, device='cuda:0') factor: tensor(0.1079, device='cuda:0')


Epoch 1, Step 1105: Loss = 0.0104, Actor Loss = -0.0324, Critic Loss = 0.0858, Entropy = 0.0653, Returns = 1.3786, Value = 1.2975, mask_loss = 0.0952std_loss = 0.2507:  20%|██        | 1105/5475 [4:58:41<18:47:42, 15.48s/it] 

ratio tensor(1.1219, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4394, device='cuda:0') reward: tensor(1.3342, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.4594, device='cuda:0') reward: tensor(1.3513, device='cuda:0') factor: tensor(0.1081, device='cuda:0')
reward_raw: tensor(0.4682, device='cuda:0') reward: tensor(1.3626, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.4514, device='cuda:0') reward: tensor(1.3436, device='cuda:0') factor: tensor(0.1078, device='cuda:0')
reward_raw: tensor(0.4676, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.1095, device='cuda:0')


Epoch 1, Step 1106: Loss = 0.1252, Actor Loss = 0.0907, Critic Loss = 0.0691, Entropy = 0.0832, Returns = 1.3502, Value = 1.3217, mask_loss = 0.1136std_loss = 0.2677:  20%|██        | 1106/5475 [4:58:57<18:39:10, 15.37s/it] 

ratio tensor(0.9110, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4885, device='cuda:0') reward: tensor(1.3903, device='cuda:0') factor: tensor(0.0982, device='cuda:0')
reward_raw: tensor(0.4696, device='cuda:0') reward: tensor(1.3681, device='cuda:0') factor: tensor(0.1016, device='cuda:0')
reward_raw: tensor(0.4680, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.1028, device='cuda:0')
reward_raw: tensor(0.4827, device='cuda:0') reward: tensor(1.3789, device='cuda:0') factor: tensor(0.1038, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.1020, device='cuda:0')


Epoch 1, Step 1107: Loss = -0.2279, Actor Loss = -0.2800, Critic Loss = 0.1043, Entropy = 0.0492, Returns = 1.3800, Value = 1.4816, mask_loss = 0.0932std_loss = 0.2599:  20%|██        | 1107/5475 [4:59:11<18:27:22, 15.21s/it]

ratio tensor(1.4428, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.3738, device='cuda:0') factor: tensor(0.1296, device='cuda:0')
reward_raw: tensor(0.4625, device='cuda:0') reward: tensor(1.3364, device='cuda:0') factor: tensor(0.1261, device='cuda:0')
reward_raw: tensor(0.4919, device='cuda:0') reward: tensor(1.3578, device='cuda:0') factor: tensor(0.1340, device='cuda:0')
reward_raw: tensor(0.4903, device='cuda:0') reward: tensor(1.3554, device='cuda:0') factor: tensor(0.1348, device='cuda:0')
reward_raw: tensor(0.4745, device='cuda:0') reward: tensor(1.3426, device='cuda:0') factor: tensor(0.1319, device='cuda:0')


Epoch 1, Step 1108: Loss = 0.2336, Actor Loss = 0.1829, Critic Loss = 0.1016, Entropy = 0.0743, Returns = 1.3958, Value = 1.2729, mask_loss = 0.1357std_loss = 0.3027:  20%|██        | 1108/5475 [4:59:23<17:09:29, 14.14s/it]  

ratio tensor(0.8285, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3679, device='cuda:0') reward: tensor(1.2610, device='cuda:0') factor: tensor(0.1068, device='cuda:0')
reward_raw: tensor(0.4076, device='cuda:0') reward: tensor(1.2946, device='cuda:0') factor: tensor(0.1130, device='cuda:0')
reward_raw: tensor(0.4007, device='cuda:0') reward: tensor(1.2880, device='cuda:0') factor: tensor(0.1128, device='cuda:0')
reward_raw: tensor(0.4193, device='cuda:0') reward: tensor(1.3052, device='cuda:0') factor: tensor(0.1141, device='cuda:0')
reward_raw: tensor(0.4043, device='cuda:0') reward: tensor(1.2935, device='cuda:0') factor: tensor(0.1109, device='cuda:0')


Epoch 1, Step 1109: Loss = -0.0821, Actor Loss = -0.1090, Critic Loss = 0.0539, Entropy = 0.0773, Returns = 1.2859, Value = 1.2705, mask_loss = 0.1140std_loss = 0.2705:  20%|██        | 1109/5475 [4:59:37<16:57:24, 13.98s/it]

ratio tensor(1.2477, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5602, device='cuda:0') reward: tensor(1.4358, device='cuda:0') factor: tensor(0.1244, device='cuda:0')
reward_raw: tensor(0.5525, device='cuda:0') reward: tensor(1.4344, device='cuda:0') factor: tensor(0.1181, device='cuda:0')
reward_raw: tensor(0.5837, device='cuda:0') reward: tensor(1.4648, device='cuda:0') factor: tensor(0.1189, device='cuda:0')
reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(1.4492, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.5598, device='cuda:0') reward: tensor(1.4414, device='cuda:0') factor: tensor(0.1183, device='cuda:0')


Epoch 1, Step 1110: Loss = -0.1296, Actor Loss = -0.1546, Critic Loss = 0.0500, Entropy = 0.0735, Returns = 1.4600, Value = 1.5538, mask_loss = 0.1263std_loss = 0.2872:  20%|██        | 1110/5475 [4:59:47<15:33:06, 12.83s/it]

ratio tensor(1.0375, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4709, device='cuda:0') reward: tensor(1.3741, device='cuda:0') factor: tensor(0.0968, device='cuda:0')
reward_raw: tensor(0.4499, device='cuda:0') reward: tensor(1.3497, device='cuda:0') factor: tensor(0.1002, device='cuda:0')
reward_raw: tensor(0.4297, device='cuda:0') reward: tensor(1.3339, device='cuda:0') factor: tensor(0.0958, device='cuda:0')
reward_raw: tensor(0.4796, device='cuda:0') reward: tensor(1.3780, device='cuda:0') factor: tensor(0.1016, device='cuda:0')
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.3947, device='cuda:0') factor: tensor(0.1016, device='cuda:0')


Epoch 1, Step 1111: Loss = 0.3423, Actor Loss = 0.2970, Critic Loss = 0.0907, Entropy = 0.0573, Returns = 1.3402, Value = 1.3672, mask_loss = 0.0932std_loss = 0.2536:  20%|██        | 1111/5475 [5:00:05<17:20:56, 14.31s/it]  

ratio tensor(1.1877, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4389, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.4188, device='cuda:0') reward: tensor(1.3185, device='cuda:0') factor: tensor(0.1004, device='cuda:0')
reward_raw: tensor(0.4318, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.1027, device='cuda:0')
reward_raw: tensor(0.4449, device='cuda:0') reward: tensor(1.3408, device='cuda:0') factor: tensor(0.1041, device='cuda:0')
reward_raw: tensor(0.4401, device='cuda:0') reward: tensor(1.3334, device='cuda:0') factor: tensor(0.1067, device='cuda:0')


Epoch 1, Step 1112: Loss = 0.0174, Actor Loss = -0.0165, Critic Loss = 0.0679, Entropy = 0.0746, Returns = 1.3516, Value = 1.2501, mask_loss = 0.1176std_loss = 0.2807:  20%|██        | 1112/5475 [5:00:22<18:33:48, 15.32s/it]

ratio tensor(1.2597, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5627, device='cuda:0') reward: tensor(1.4471, device='cuda:0') factor: tensor(0.1156, device='cuda:0')
reward_raw: tensor(0.5437, device='cuda:0') reward: tensor(1.4265, device='cuda:0') factor: tensor(0.1171, device='cuda:0')
reward_raw: tensor(0.5425, device='cuda:0') reward: tensor(1.4351, device='cuda:0') factor: tensor(0.1073, device='cuda:0')
reward_raw: tensor(0.5682, device='cuda:0') reward: tensor(1.4498, device='cuda:0') factor: tensor(0.1184, device='cuda:0')
reward_raw: tensor(0.5801, device='cuda:0') reward: tensor(1.4656, device='cuda:0') factor: tensor(0.1146, device='cuda:0')


Epoch 1, Step 1113: Loss = -0.1655, Actor Loss = -0.2011, Critic Loss = 0.0715, Entropy = 0.0710, Returns = 1.4982, Value = 1.4948, mask_loss = 0.1240std_loss = 0.2909:  20%|██        | 1113/5475 [5:00:41<19:56:50, 16.46s/it]

ratio tensor(1.2480, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4846, device='cuda:0') reward: tensor(1.3623, device='cuda:0') factor: tensor(0.1223, device='cuda:0')
reward_raw: tensor(0.5069, device='cuda:0') reward: tensor(1.3846, device='cuda:0') factor: tensor(0.1223, device='cuda:0')
reward_raw: tensor(0.4863, device='cuda:0') reward: tensor(1.3637, device='cuda:0') factor: tensor(0.1226, device='cuda:0')
reward_raw: tensor(0.4693, device='cuda:0') reward: tensor(1.3476, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.5004, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.1203, device='cuda:0')


Epoch 1, Step 1114: Loss = 0.0746, Actor Loss = 0.0491, Critic Loss = 0.0512, Entropy = 0.0724, Returns = 1.3800, Value = 1.3570, mask_loss = 0.1242std_loss = 0.2877:  20%|██        | 1114/5475 [5:00:55<19:02:37, 15.72s/it]  

ratio tensor(1.0266, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5611, device='cuda:0') reward: tensor(1.4296, device='cuda:0') factor: tensor(0.1315, device='cuda:0')
reward_raw: tensor(0.5506, device='cuda:0') reward: tensor(1.4187, device='cuda:0') factor: tensor(0.1319, device='cuda:0')
reward_raw: tensor(0.6063, device='cuda:0') reward: tensor(1.4714, device='cuda:0') factor: tensor(0.1348, device='cuda:0')
reward_raw: tensor(0.5718, device='cuda:0') reward: tensor(1.4404, device='cuda:0') factor: tensor(0.1314, device='cuda:0')
reward_raw: tensor(0.5714, device='cuda:0') reward: tensor(1.4384, device='cuda:0') factor: tensor(0.1329, device='cuda:0')


Epoch 1, Step 1115: Loss = -0.0722, Actor Loss = -0.1100, Critic Loss = 0.0757, Entropy = 0.0725, Returns = 1.4916, Value = 1.4252, mask_loss = 0.1197std_loss = 0.2745:  20%|██        | 1115/5475 [5:01:10<18:33:56, 15.33s/it]

ratio tensor(1.0153, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5015, device='cuda:0') reward: tensor(1.3737, device='cuda:0') factor: tensor(0.1277, device='cuda:0')
reward_raw: tensor(0.5217, device='cuda:0') reward: tensor(1.3931, device='cuda:0') factor: tensor(0.1287, device='cuda:0')
reward_raw: tensor(0.5001, device='cuda:0') reward: tensor(1.3748, device='cuda:0') factor: tensor(0.1254, device='cuda:0')
reward_raw: tensor(0.5429, device='cuda:0') reward: tensor(1.4158, device='cuda:0') factor: tensor(0.1271, device='cuda:0')
reward_raw: tensor(0.4926, device='cuda:0') reward: tensor(1.3719, device='cuda:0') factor: tensor(0.1207, device='cuda:0')


Epoch 1, Step 1116: Loss = -0.0100, Actor Loss = -0.0421, Critic Loss = 0.0644, Entropy = 0.0848, Returns = 1.3821, Value = 1.3593, mask_loss = 0.1288std_loss = 0.2854:  20%|██        | 1116/5475 [5:01:24<17:59:10, 14.85s/it]

ratio tensor(1.0746, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4194, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.0771, device='cuda:0')
reward_raw: tensor(0.4446, device='cuda:0') reward: tensor(1.3673, device='cuda:0') factor: tensor(0.0773, device='cuda:0')
reward_raw: tensor(0.4345, device='cuda:0') reward: tensor(1.3591, device='cuda:0') factor: tensor(0.0754, device='cuda:0')
reward_raw: tensor(0.4270, device='cuda:0') reward: tensor(1.3528, device='cuda:0') factor: tensor(0.0742, device='cuda:0')
reward_raw: tensor(0.4287, device='cuda:0') reward: tensor(1.3515, device='cuda:0') factor: tensor(0.0772, device='cuda:0')


Epoch 1, Step 1117: Loss = -0.0309, Actor Loss = -0.0765, Critic Loss = 0.0914, Entropy = 0.0378, Returns = 1.3759, Value = 1.4699, mask_loss = 0.0699std_loss = 0.2308:  20%|██        | 1117/5475 [5:01:44<20:05:06, 16.59s/it]

ratio tensor(1.0771, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4820, device='cuda:0') reward: tensor(1.3715, device='cuda:0') factor: tensor(0.1106, device='cuda:0')
reward_raw: tensor(0.4816, device='cuda:0') reward: tensor(1.3789, device='cuda:0') factor: tensor(0.1027, device='cuda:0')
reward_raw: tensor(0.4984, device='cuda:0') reward: tensor(1.3838, device='cuda:0') factor: tensor(0.1146, device='cuda:0')
reward_raw: tensor(0.4887, device='cuda:0') reward: tensor(1.3736, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.4936, device='cuda:0') reward: tensor(1.3809, device='cuda:0') factor: tensor(0.1127, device='cuda:0')


Epoch 1, Step 1118: Loss = 0.1694, Actor Loss = 0.1425, Critic Loss = 0.0539, Entropy = 0.0637, Returns = 1.3569, Value = 1.3051, mask_loss = 0.1050std_loss = 0.2672:  20%|██        | 1118/5475 [5:02:01<20:13:08, 16.71s/it]  

ratio tensor(0.8644, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4579, device='cuda:0') reward: tensor(1.3566, device='cuda:0') factor: tensor(0.1013, device='cuda:0')
reward_raw: tensor(0.4473, device='cuda:0') reward: tensor(1.3442, device='cuda:0') factor: tensor(0.1031, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.3897, device='cuda:0') factor: tensor(0.1014, device='cuda:0')
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.3756, device='cuda:0') factor: tensor(0.1076, device='cuda:0')
reward_raw: tensor(0.4714, device='cuda:0') reward: tensor(1.3672, device='cuda:0') factor: tensor(0.1042, device='cuda:0')


Epoch 1, Step 1119: Loss = -0.1502, Actor Loss = -0.1751, Critic Loss = 0.0500, Entropy = 0.0568, Returns = 1.3953, Value = 1.3282, mask_loss = 0.0959std_loss = 0.2621:  20%|██        | 1119/5475 [5:02:14<18:55:27, 15.64s/it]

ratio tensor(1.4853, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4898, device='cuda:0') reward: tensor(1.3952, device='cuda:0') factor: tensor(0.0947, device='cuda:0')
reward_raw: tensor(0.4804, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.0952, device='cuda:0')
reward_raw: tensor(0.4732, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.0978, device='cuda:0')
reward_raw: tensor(0.4370, device='cuda:0') reward: tensor(1.3412, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.4691, device='cuda:0') reward: tensor(1.3736, device='cuda:0') factor: tensor(0.0955, device='cuda:0')


Epoch 1, Step 1120: Loss = 0.0858, Actor Loss = 0.0543, Critic Loss = 0.0631, Entropy = 0.0503, Returns = 1.2802, Value = 1.4153, mask_loss = 0.0921std_loss = 0.2599:  20%|██        | 1120/5475 [5:02:35<20:49:30, 17.21s/it]  

ratio tensor(0.9868, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4678, device='cuda:0') reward: tensor(1.3511, device='cuda:0') factor: tensor(0.1167, device='cuda:0')
reward_raw: tensor(0.4512, device='cuda:0') reward: tensor(1.3305, device='cuda:0') factor: tensor(0.1207, device='cuda:0')
reward_raw: tensor(0.4442, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.1182, device='cuda:0')
reward_raw: tensor(0.4807, device='cuda:0') reward: tensor(1.3664, device='cuda:0') factor: tensor(0.1143, device='cuda:0')
reward_raw: tensor(0.5111, device='cuda:0') reward: tensor(1.3894, device='cuda:0') factor: tensor(0.1216, device='cuda:0')


Epoch 1, Step 1121: Loss = 0.0924, Actor Loss = 0.0552, Critic Loss = 0.0746, Entropy = 0.0758, Returns = 1.3874, Value = 1.3219, mask_loss = 0.1278std_loss = 0.2861:  20%|██        | 1120/5475 [5:02:57<20:49:30, 17.21s/it]

ratio tensor(0.9157, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1121: Loss = 0.0924, Actor Loss = 0.0552, Critic Loss = 0.0746, Entropy = 0.0758, Returns = 1.3874, Value = 1.3219, mask_loss = 0.1278std_loss = 0.2861:  20%|██        | 1121/5475 [5:02:58<22:42:20, 18.77s/it]

reward_raw: tensor(0.5214, device='cuda:0') reward: tensor(1.4078, device='cuda:0') factor: tensor(0.1135, device='cuda:0')
reward_raw: tensor(0.5306, device='cuda:0') reward: tensor(1.4105, device='cuda:0') factor: tensor(0.1201, device='cuda:0')
reward_raw: tensor(0.5277, device='cuda:0') reward: tensor(1.4109, device='cuda:0') factor: tensor(0.1168, device='cuda:0')
reward_raw: tensor(0.4959, device='cuda:0') reward: tensor(1.3829, device='cuda:0') factor: tensor(0.1130, device='cuda:0')
reward_raw: tensor(0.5238, device='cuda:0') reward: tensor(1.4087, device='cuda:0') factor: tensor(0.1151, device='cuda:0')


Epoch 1, Step 1122: Loss = 0.0440, Actor Loss = -0.0005, Critic Loss = 0.0891, Entropy = 0.0747, Returns = 1.3867, Value = 1.4492, mask_loss = 0.1294std_loss = 0.2986:  20%|██        | 1122/5475 [5:03:13<21:18:05, 17.62s/it]

ratio tensor(0.7909, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4504, device='cuda:0') reward: tensor(1.3513, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.4623, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.4478, device='cuda:0') reward: tensor(1.3454, device='cuda:0') factor: tensor(0.1025, device='cuda:0')
reward_raw: tensor(0.4556, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.4405, device='cuda:0') reward: tensor(1.3444, device='cuda:0') factor: tensor(0.0962, device='cuda:0')


Epoch 1, Step 1123: Loss = -0.0593, Actor Loss = -0.1021, Critic Loss = 0.0855, Entropy = 0.0453, Returns = 1.3637, Value = 1.3823, mask_loss = 0.1081std_loss = 0.2789:  21%|██        | 1123/5475 [5:03:30<21:20:49, 17.66s/it]

ratio tensor(1.8943, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4530, device='cuda:0') reward: tensor(1.3290, device='cuda:0') factor: tensor(0.1240, device='cuda:0')
reward_raw: tensor(0.4623, device='cuda:0') reward: tensor(1.3364, device='cuda:0') factor: tensor(0.1259, device='cuda:0')
reward_raw: tensor(0.4881, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.1322, device='cuda:0')
reward_raw: tensor(0.4535, device='cuda:0') reward: tensor(1.3258, device='cuda:0') factor: tensor(0.1277, device='cuda:0')
reward_raw: tensor(0.4711, device='cuda:0') reward: tensor(1.3445, device='cuda:0') factor: tensor(0.1266, device='cuda:0')


Epoch 1, Step 1124: Loss = 0.0268, Actor Loss = -0.0040, Critic Loss = 0.0618, Entropy = 0.0651, Returns = 1.3709, Value = 1.4458, mask_loss = 0.1255std_loss = 0.2935:  21%|██        | 1124/5475 [5:03:44<19:47:57, 16.38s/it] 

ratio tensor(0.9746, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4281, device='cuda:0') reward: tensor(1.3235, device='cuda:0') factor: tensor(0.1046, device='cuda:0')
reward_raw: tensor(0.4030, device='cuda:0') reward: tensor(1.2978, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.4339, device='cuda:0') reward: tensor(1.3318, device='cuda:0') factor: tensor(0.1021, device='cuda:0')
reward_raw: tensor(0.4188, device='cuda:0') reward: tensor(1.3117, device='cuda:0') factor: tensor(0.1072, device='cuda:0')
reward_raw: tensor(0.4080, device='cuda:0') reward: tensor(1.3062, device='cuda:0') factor: tensor(0.1017, device='cuda:0')


Epoch 1, Step 1125: Loss = 0.1116, Actor Loss = 0.0850, Critic Loss = 0.0534, Entropy = 0.0531, Returns = 1.3372, Value = 1.1992, mask_loss = 0.1079std_loss = 0.2806:  21%|██        | 1125/5475 [5:03:58<19:07:10, 15.82s/it] 

ratio tensor(0.9691, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5134, device='cuda:0') reward: tensor(1.4076, device='cuda:0') factor: tensor(0.1057, device='cuda:0')
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.3935, device='cuda:0') factor: tensor(0.1081, device='cuda:0')
reward_raw: tensor(0.5283, device='cuda:0') reward: tensor(1.4223, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.4000, device='cuda:0') factor: tensor(0.1080, device='cuda:0')
reward_raw: tensor(0.5379, device='cuda:0') reward: tensor(1.4315, device='cuda:0') factor: tensor(0.1064, device='cuda:0')


Epoch 1, Step 1126: Loss = -0.1177, Actor Loss = -0.1700, Critic Loss = 0.1047, Entropy = 0.0707, Returns = 1.4498, Value = 1.4100, mask_loss = 0.1124std_loss = 0.2722:  21%|██        | 1126/5475 [5:04:10<17:39:15, 14.61s/it]

ratio tensor(1.4067, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4285, device='cuda:0') reward: tensor(1.3251, device='cuda:0') factor: tensor(0.1034, device='cuda:0')
reward_raw: tensor(0.4101, device='cuda:0') reward: tensor(1.3131, device='cuda:0') factor: tensor(0.0970, device='cuda:0')
reward_raw: tensor(0.4319, device='cuda:0') reward: tensor(1.3302, device='cuda:0') factor: tensor(0.1017, device='cuda:0')
reward_raw: tensor(0.3947, device='cuda:0') reward: tensor(1.2946, device='cuda:0') factor: tensor(0.1001, device='cuda:0')
reward_raw: tensor(0.4407, device='cuda:0') reward: tensor(1.3380, device='cuda:0') factor: tensor(0.1027, device='cuda:0')


Epoch 1, Step 1127: Loss = 0.0076, Actor Loss = -0.0187, Critic Loss = 0.0528, Entropy = 0.0623, Returns = 1.2977, Value = 1.2914, mask_loss = 0.1134std_loss = 0.2774:  21%|██        | 1127/5475 [5:04:29<19:13:51, 15.92s/it] 

ratio tensor(1.0452, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.3876, device='cuda:0') factor: tensor(0.1120, device='cuda:0')
reward_raw: tensor(0.5289, device='cuda:0') reward: tensor(1.4106, device='cuda:0') factor: tensor(0.1183, device='cuda:0')
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.3850, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.5043, device='cuda:0') reward: tensor(1.3833, device='cuda:0') factor: tensor(0.1209, device='cuda:0')
reward_raw: tensor(0.4944, device='cuda:0') reward: tensor(1.3780, device='cuda:0') factor: tensor(0.1164, device='cuda:0')


Epoch 1, Step 1128: Loss = -0.0004, Actor Loss = -0.0194, Critic Loss = 0.0381, Entropy = 0.0607, Returns = 1.4030, Value = 1.4153, mask_loss = 0.1118std_loss = 0.2831:  21%|██        | 1128/5475 [5:04:43<18:24:53, 15.25s/it]

ratio tensor(0.9662, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4533, device='cuda:0') reward: tensor(1.3596, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.4610, device='cuda:0') reward: tensor(1.3614, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.4354, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.1004, device='cuda:0')
reward_raw: tensor(0.4538, device='cuda:0') reward: tensor(1.3565, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.4643, device='cuda:0') reward: tensor(1.3664, device='cuda:0') factor: tensor(0.0979, device='cuda:0')


Epoch 1, Step 1129: Loss = 0.1606, Actor Loss = 0.1237, Critic Loss = 0.0740, Entropy = 0.0569, Returns = 1.3079, Value = 1.4146, mask_loss = 0.1008std_loss = 0.2631:  21%|██        | 1129/5475 [5:04:57<17:59:47, 14.91s/it]  

ratio tensor(0.9214, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4821, device='cuda:0') reward: tensor(1.3445, device='cuda:0') factor: tensor(0.1376, device='cuda:0')
reward_raw: tensor(0.4821, device='cuda:0') reward: tensor(1.3498, device='cuda:0') factor: tensor(0.1323, device='cuda:0')
reward_raw: tensor(0.4990, device='cuda:0') reward: tensor(1.3586, device='cuda:0') factor: tensor(0.1405, device='cuda:0')
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.3555, device='cuda:0') factor: tensor(0.1373, device='cuda:0')
reward_raw: tensor(0.4720, device='cuda:0') reward: tensor(1.3415, device='cuda:0') factor: tensor(0.1305, device='cuda:0')


Epoch 1, Step 1130: Loss = -0.2417, Actor Loss = -0.2826, Critic Loss = 0.0819, Entropy = 0.0578, Returns = 1.4398, Value = 1.3402, mask_loss = 0.0964std_loss = 0.2530:  21%|██        | 1130/5475 [5:05:13<18:26:41, 15.28s/it]

ratio tensor(2.0252, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3717, device='cuda:0') reward: tensor(1.2792, device='cuda:0') factor: tensor(0.0925, device='cuda:0')
reward_raw: tensor(0.3902, device='cuda:0') reward: tensor(1.2926, device='cuda:0') factor: tensor(0.0976, device='cuda:0')
reward_raw: tensor(0.3660, device='cuda:0') reward: tensor(1.2699, device='cuda:0') factor: tensor(0.0961, device='cuda:0')
reward_raw: tensor(0.4011, device='cuda:0') reward: tensor(1.3034, device='cuda:0') factor: tensor(0.0977, device='cuda:0')
reward_raw: tensor(0.3875, device='cuda:0') reward: tensor(1.2943, device='cuda:0') factor: tensor(0.0932, device='cuda:0')


Epoch 1, Step 1131: Loss = 0.1088, Actor Loss = 0.0741, Critic Loss = 0.0696, Entropy = 0.0513, Returns = 1.2820, Value = 1.3799, mask_loss = 0.0881std_loss = 0.2527:  21%|██        | 1131/5475 [5:05:26<17:37:00, 14.60s/it]  

ratio tensor(1.0436, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.4174, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.5079, device='cuda:0') reward: tensor(1.4137, device='cuda:0') factor: tensor(0.0942, device='cuda:0')
reward_raw: tensor(0.4981, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.0924, device='cuda:0')
reward_raw: tensor(0.5333, device='cuda:0') reward: tensor(1.4429, device='cuda:0') factor: tensor(0.0904, device='cuda:0')
reward_raw: tensor(0.5296, device='cuda:0') reward: tensor(1.4291, device='cuda:0') factor: tensor(0.1005, device='cuda:0')


Epoch 1, Step 1132: Loss = 0.1064, Actor Loss = 0.0404, Critic Loss = 0.1320, Entropy = 0.0415, Returns = 1.4226, Value = 1.3045, mask_loss = 0.0872std_loss = 0.2515:  21%|██        | 1132/5475 [5:05:43<18:38:56, 15.46s/it]

ratio tensor(0.9887, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4600, device='cuda:0') reward: tensor(1.3455, device='cuda:0') factor: tensor(0.1145, device='cuda:0')
reward_raw: tensor(0.4923, device='cuda:0') reward: tensor(1.3787, device='cuda:0') factor: tensor(0.1136, device='cuda:0')
reward_raw: tensor(0.4781, device='cuda:0') reward: tensor(1.3636, device='cuda:0') factor: tensor(0.1145, device='cuda:0')
reward_raw: tensor(0.4819, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.1161, device='cuda:0')
reward_raw: tensor(0.4715, device='cuda:0') reward: tensor(1.3562, device='cuda:0') factor: tensor(0.1153, device='cuda:0')


Epoch 1, Step 1133: Loss = 0.2150, Actor Loss = 0.1246, Critic Loss = 0.1810, Entropy = 0.0640, Returns = 1.2363, Value = 1.3188, mask_loss = 0.1298std_loss = 0.3001:  21%|██        | 1133/5475 [5:05:59<18:51:21, 15.63s/it]

ratio tensor(0.8030, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4203, device='cuda:0') reward: tensor(1.3143, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.4230, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.1073, device='cuda:0')
reward_raw: tensor(0.4233, device='cuda:0') reward: tensor(1.3167, device='cuda:0') factor: tensor(0.1066, device='cuda:0')
reward_raw: tensor(0.4346, device='cuda:0') reward: tensor(1.3280, device='cuda:0') factor: tensor(0.1065, device='cuda:0')
reward_raw: tensor(0.4416, device='cuda:0') reward: tensor(1.3340, device='cuda:0') factor: tensor(0.1076, device='cuda:0')


Epoch 1, Step 1134: Loss = -0.0498, Actor Loss = -0.0730, Critic Loss = 0.0464, Entropy = 0.0649, Returns = 1.3305, Value = 1.3345, mask_loss = 0.1054std_loss = 0.2692:  21%|██        | 1134/5475 [5:06:12<17:43:54, 14.71s/it]

ratio tensor(1.2315, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5845, device='cuda:0') reward: tensor(1.4536, device='cuda:0') factor: tensor(0.1309, device='cuda:0')
reward_raw: tensor(0.5695, device='cuda:0') reward: tensor(1.4479, device='cuda:0') factor: tensor(0.1215, device='cuda:0')
reward_raw: tensor(0.5548, device='cuda:0') reward: tensor(1.4252, device='cuda:0') factor: tensor(0.1296, device='cuda:0')
reward_raw: tensor(0.5730, device='cuda:0') reward: tensor(1.4452, device='cuda:0') factor: tensor(0.1277, device='cuda:0')
reward_raw: tensor(0.5819, device='cuda:0') reward: tensor(1.4496, device='cuda:0') factor: tensor(0.1323, device='cuda:0')


Epoch 1, Step 1135: Loss = 0.0433, Actor Loss = 0.0063, Critic Loss = 0.0741, Entropy = 0.0755, Returns = 1.3899, Value = 1.4452, mask_loss = 0.1449std_loss = 0.3148:  21%|██        | 1135/5475 [5:06:27<17:55:49, 14.87s/it]  

ratio tensor(0.9826, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3710, device='cuda:0') reward: tensor(1.2549, device='cuda:0') factor: tensor(0.1160, device='cuda:0')
reward_raw: tensor(0.3722, device='cuda:0') reward: tensor(1.2577, device='cuda:0') factor: tensor(0.1145, device='cuda:0')
reward_raw: tensor(0.3672, device='cuda:0') reward: tensor(1.2554, device='cuda:0') factor: tensor(0.1118, device='cuda:0')
reward_raw: tensor(0.3684, device='cuda:0') reward: tensor(1.2521, device='cuda:0') factor: tensor(0.1163, device='cuda:0')
reward_raw: tensor(0.3587, device='cuda:0') reward: tensor(1.2408, device='cuda:0') factor: tensor(0.1179, device='cuda:0')


Epoch 1, Step 1136: Loss = 0.0454, Actor Loss = 0.0094, Critic Loss = 0.0722, Entropy = 0.0716, Returns = 1.3103, Value = 1.1573, mask_loss = 0.1102std_loss = 0.2722:  21%|██        | 1136/5475 [5:06:44<18:46:08, 15.57s/it]

ratio tensor(0.8810, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4967, device='cuda:0') reward: tensor(1.3857, device='cuda:0') factor: tensor(0.1110, device='cuda:0')
reward_raw: tensor(0.4860, device='cuda:0') reward: tensor(1.3768, device='cuda:0') factor: tensor(0.1092, device='cuda:0')
reward_raw: tensor(0.4926, device='cuda:0') reward: tensor(1.3792, device='cuda:0') factor: tensor(0.1134, device='cuda:0')
reward_raw: tensor(0.4993, device='cuda:0') reward: tensor(1.3815, device='cuda:0') factor: tensor(0.1178, device='cuda:0')
reward_raw: tensor(0.5015, device='cuda:0') reward: tensor(1.3893, device='cuda:0') factor: tensor(0.1122, device='cuda:0')


Epoch 1, Step 1137: Loss = -0.1282, Actor Loss = -0.1560, Critic Loss = 0.0556, Entropy = 0.0533, Returns = 1.4104, Value = 1.4195, mask_loss = 0.0950std_loss = 0.2564:  21%|██        | 1137/5475 [5:07:02<19:17:51, 16.01s/it]

ratio tensor(0.9472, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4907, device='cuda:0') reward: tensor(1.3807, device='cuda:0') factor: tensor(0.1101, device='cuda:0')
reward_raw: tensor(0.5117, device='cuda:0') reward: tensor(1.3929, device='cuda:0') factor: tensor(0.1188, device='cuda:0')
reward_raw: tensor(0.4992, device='cuda:0') reward: tensor(1.3880, device='cuda:0') factor: tensor(0.1112, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.3982, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.4858, device='cuda:0') reward: tensor(1.3755, device='cuda:0') factor: tensor(0.1102, device='cuda:0')


Epoch 1, Step 1138: Loss = 0.1644, Actor Loss = 0.1019, Critic Loss = 0.1251, Entropy = 0.0494, Returns = 1.4915, Value = 1.5359, mask_loss = 0.0923std_loss = 0.2561:  21%|██        | 1138/5475 [5:07:15<18:21:26, 15.24s/it]  

ratio tensor(1.3678, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4082, device='cuda:0') reward: tensor(1.3219, device='cuda:0') factor: tensor(0.0863, device='cuda:0')
reward_raw: tensor(0.4191, device='cuda:0') reward: tensor(1.3290, device='cuda:0') factor: tensor(0.0901, device='cuda:0')
reward_raw: tensor(0.4247, device='cuda:0') reward: tensor(1.3333, device='cuda:0') factor: tensor(0.0914, device='cuda:0')
reward_raw: tensor(0.4383, device='cuda:0') reward: tensor(1.3510, device='cuda:0') factor: tensor(0.0874, device='cuda:0')
reward_raw: tensor(0.4104, device='cuda:0') reward: tensor(1.3239, device='cuda:0') factor: tensor(0.0866, device='cuda:0')


Epoch 1, Step 1139: Loss = 0.0648, Actor Loss = 0.0339, Critic Loss = 0.0619, Entropy = 0.0580, Returns = 1.3399, Value = 1.3196, mask_loss = 0.0928std_loss = 0.2552:  21%|██        | 1139/5475 [5:07:33<19:23:56, 16.11s/it]

ratio tensor(0.9007, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5050, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.1187, device='cuda:0')
reward_raw: tensor(0.4901, device='cuda:0') reward: tensor(1.3723, device='cuda:0') factor: tensor(0.1178, device='cuda:0')
reward_raw: tensor(0.4727, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1209, device='cuda:0')
reward_raw: tensor(0.5043, device='cuda:0') reward: tensor(1.3886, device='cuda:0') factor: tensor(0.1157, device='cuda:0')
reward_raw: tensor(0.4938, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.1264, device='cuda:0')


Epoch 1, Step 1140: Loss = 0.0973, Actor Loss = 0.0550, Critic Loss = 0.0847, Entropy = 0.0580, Returns = 1.3113, Value = 1.4054, mask_loss = 0.1251std_loss = 0.2964:  21%|██        | 1140/5475 [5:07:47<18:45:19, 15.58s/it]

ratio tensor(0.8694, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4799, device='cuda:0') reward: tensor(1.3406, device='cuda:0') factor: tensor(0.1393, device='cuda:0')
reward_raw: tensor(0.4764, device='cuda:0') reward: tensor(1.3342, device='cuda:0') factor: tensor(0.1422, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3356, device='cuda:0') factor: tensor(0.1423, device='cuda:0')
reward_raw: tensor(0.4883, device='cuda:0') reward: tensor(1.3434, device='cuda:0') factor: tensor(0.1450, device='cuda:0')
reward_raw: tensor(0.4764, device='cuda:0') reward: tensor(1.3362, device='cuda:0') factor: tensor(0.1401, device='cuda:0')


Epoch 1, Step 1141: Loss = 0.1832, Actor Loss = 0.1537, Critic Loss = 0.0592, Entropy = 0.0695, Returns = 1.3228, Value = 1.3684, mask_loss = 0.1431std_loss = 0.2977:  21%|██        | 1140/5475 [5:08:01<18:45:19, 15.58s/it]

ratio tensor(0.7440, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1141: Loss = 0.1832, Actor Loss = 0.1537, Critic Loss = 0.0592, Entropy = 0.0695, Returns = 1.3228, Value = 1.3684, mask_loss = 0.1431std_loss = 0.2977:  21%|██        | 1141/5475 [5:08:02<18:17:46, 15.20s/it]

reward_raw: tensor(0.5298, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.1241, device='cuda:0')
reward_raw: tensor(0.5327, device='cuda:0') reward: tensor(1.4051, device='cuda:0') factor: tensor(0.1276, device='cuda:0')
reward_raw: tensor(0.5385, device='cuda:0') reward: tensor(1.4152, device='cuda:0') factor: tensor(0.1233, device='cuda:0')
reward_raw: tensor(0.5247, device='cuda:0') reward: tensor(1.3995, device='cuda:0') factor: tensor(0.1252, device='cuda:0')
reward_raw: tensor(0.5515, device='cuda:0') reward: tensor(1.4252, device='cuda:0') factor: tensor(0.1263, device='cuda:0')


Epoch 1, Step 1142: Loss = -0.1945, Actor Loss = -0.2275, Critic Loss = 0.0661, Entropy = 0.0551, Returns = 1.4082, Value = 1.3997, mask_loss = 0.1111std_loss = 0.2806:  21%|██        | 1142/5475 [5:08:18<18:31:16, 15.39s/it]

ratio tensor(1.5418, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5184, device='cuda:0') reward: tensor(1.3904, device='cuda:0') factor: tensor(0.1280, device='cuda:0')
reward_raw: tensor(0.5039, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.1393, device='cuda:0')
reward_raw: tensor(0.4801, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.5059, device='cuda:0') reward: tensor(1.3700, device='cuda:0') factor: tensor(0.1359, device='cuda:0')
reward_raw: tensor(0.4930, device='cuda:0') reward: tensor(1.3597, device='cuda:0') factor: tensor(0.1333, device='cuda:0')


Epoch 1, Step 1143: Loss = 0.0236, Actor Loss = -0.0301, Critic Loss = 0.1075, Entropy = 0.0596, Returns = 1.4017, Value = 1.3228, mask_loss = 0.1045std_loss = 0.2520:  21%|██        | 1143/5475 [5:08:33<18:41:22, 15.53s/it] 

ratio tensor(1.1657, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4482, device='cuda:0') reward: tensor(1.3521, device='cuda:0') factor: tensor(0.0961, device='cuda:0')
reward_raw: tensor(0.4942, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.4634, device='cuda:0') reward: tensor(1.3694, device='cuda:0') factor: tensor(0.0940, device='cuda:0')
reward_raw: tensor(0.4835, device='cuda:0') reward: tensor(1.3906, device='cuda:0') factor: tensor(0.0928, device='cuda:0')
reward_raw: tensor(0.4423, device='cuda:0') reward: tensor(1.3514, device='cuda:0') factor: tensor(0.0909, device='cuda:0')


Epoch 1, Step 1144: Loss = 0.1264, Actor Loss = 0.0951, Critic Loss = 0.0628, Entropy = 0.0548, Returns = 1.3206, Value = 1.2511, mask_loss = 0.1058std_loss = 0.2721:  21%|██        | 1144/5475 [5:08:53<19:59:04, 16.61s/it] 

ratio tensor(1.0522, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4809, device='cuda:0') reward: tensor(1.3735, device='cuda:0') factor: tensor(0.1074, device='cuda:0')
reward_raw: tensor(0.4649, device='cuda:0') reward: tensor(1.3604, device='cuda:0') factor: tensor(0.1045, device='cuda:0')
reward_raw: tensor(0.4853, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.1079, device='cuda:0')
reward_raw: tensor(0.4733, device='cuda:0') reward: tensor(1.3633, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.4607, device='cuda:0') reward: tensor(1.3575, device='cuda:0') factor: tensor(0.1032, device='cuda:0')


Epoch 1, Step 1145: Loss = -0.0819, Actor Loss = -0.1150, Critic Loss = 0.0662, Entropy = 0.0611, Returns = 1.3828, Value = 1.4473, mask_loss = 0.0960std_loss = 0.2566:  21%|██        | 1145/5475 [5:09:08<19:26:28, 16.16s/it]

ratio tensor(1.2400, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4575, device='cuda:0') reward: tensor(1.3692, device='cuda:0') factor: tensor(0.0883, device='cuda:0')
reward_raw: tensor(0.4537, device='cuda:0') reward: tensor(1.3622, device='cuda:0') factor: tensor(0.0915, device='cuda:0')
reward_raw: tensor(0.4729, device='cuda:0') reward: tensor(1.3816, device='cuda:0') factor: tensor(0.0913, device='cuda:0')
reward_raw: tensor(0.4636, device='cuda:0') reward: tensor(1.3727, device='cuda:0') factor: tensor(0.0909, device='cuda:0')
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.3751, device='cuda:0') factor: tensor(0.0905, device='cuda:0')


Epoch 1, Step 1146: Loss = -0.1155, Actor Loss = -0.1533, Critic Loss = 0.0758, Entropy = 0.0585, Returns = 1.4332, Value = 1.4378, mask_loss = 0.0891std_loss = 0.2475:  21%|██        | 1146/5475 [5:09:22<18:42:30, 15.56s/it]

ratio tensor(1.2740, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4069, device='cuda:0') reward: tensor(1.2939, device='cuda:0') factor: tensor(0.1129, device='cuda:0')
reward_raw: tensor(0.4269, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.1142, device='cuda:0')
reward_raw: tensor(0.4135, device='cuda:0') reward: tensor(1.3003, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.4190, device='cuda:0') reward: tensor(1.3058, device='cuda:0') factor: tensor(0.1132, device='cuda:0')
reward_raw: tensor(0.4161, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.1106, device='cuda:0')


Epoch 1, Step 1147: Loss = 0.2296, Actor Loss = 0.2037, Critic Loss = 0.0519, Entropy = 0.0572, Returns = 1.3062, Value = 1.2597, mask_loss = 0.1075std_loss = 0.2679:  21%|██        | 1147/5475 [5:09:34<17:37:11, 14.66s/it]  

ratio tensor(0.8575, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4526, device='cuda:0') reward: tensor(1.3582, device='cuda:0') factor: tensor(0.0943, device='cuda:0')
reward_raw: tensor(0.4427, device='cuda:0') reward: tensor(1.3418, device='cuda:0') factor: tensor(0.1009, device='cuda:0')
reward_raw: tensor(0.4541, device='cuda:0') reward: tensor(1.3568, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.4547, device='cuda:0') reward: tensor(1.3572, device='cuda:0') factor: tensor(0.0975, device='cuda:0')
reward_raw: tensor(0.4629, device='cuda:0') reward: tensor(1.3661, device='cuda:0') factor: tensor(0.0968, device='cuda:0')


Epoch 1, Step 1148: Loss = -0.1689, Actor Loss = -0.1886, Critic Loss = 0.0396, Entropy = 0.0510, Returns = 1.3820, Value = 1.3889, mask_loss = 0.0931std_loss = 0.2532:  21%|██        | 1148/5475 [5:09:57<20:25:25, 16.99s/it]

ratio tensor(1.7328, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4388, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.1227, device='cuda:0')
reward_raw: tensor(0.4567, device='cuda:0') reward: tensor(1.3303, device='cuda:0') factor: tensor(0.1264, device='cuda:0')
reward_raw: tensor(0.4696, device='cuda:0') reward: tensor(1.3426, device='cuda:0') factor: tensor(0.1270, device='cuda:0')
reward_raw: tensor(0.4448, device='cuda:0') reward: tensor(1.3171, device='cuda:0') factor: tensor(0.1277, device='cuda:0')
reward_raw: tensor(0.4469, device='cuda:0') reward: tensor(1.3261, device='cuda:0') factor: tensor(0.1208, device='cuda:0')


Epoch 1, Step 1149: Loss = 0.2069, Actor Loss = 0.1727, Critic Loss = 0.0684, Entropy = 0.0593, Returns = 1.2930, Value = 1.3212, mask_loss = 0.1123std_loss = 0.2823:  21%|██        | 1149/5475 [5:10:11<19:26:12, 16.17s/it]  

ratio tensor(0.8280, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4620, device='cuda:0') reward: tensor(1.3424, device='cuda:0') factor: tensor(0.1196, device='cuda:0')
reward_raw: tensor(0.4640, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.4569, device='cuda:0') reward: tensor(1.3409, device='cuda:0') factor: tensor(0.1160, device='cuda:0')
reward_raw: tensor(0.4727, device='cuda:0') reward: tensor(1.3573, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.4663, device='cuda:0') reward: tensor(1.3506, device='cuda:0') factor: tensor(0.1157, device='cuda:0')


Epoch 1, Step 1150: Loss = 0.0261, Actor Loss = 0.0076, Critic Loss = 0.0371, Entropy = 0.0532, Returns = 1.3776, Value = 1.3492, mask_loss = 0.1313std_loss = 0.3004:  21%|██        | 1150/5475 [5:10:26<19:07:18, 15.92s/it]

ratio tensor(0.9208, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5019, device='cuda:0') reward: tensor(1.4147, device='cuda:0') factor: tensor(0.0873, device='cuda:0')
reward_raw: tensor(0.4896, device='cuda:0') reward: tensor(1.4002, device='cuda:0') factor: tensor(0.0894, device='cuda:0')
reward_raw: tensor(0.4616, device='cuda:0') reward: tensor(1.3730, device='cuda:0') factor: tensor(0.0887, device='cuda:0')
reward_raw: tensor(0.5160, device='cuda:0') reward: tensor(1.4279, device='cuda:0') factor: tensor(0.0881, device='cuda:0')
reward_raw: tensor(0.4554, device='cuda:0') reward: tensor(1.3614, device='cuda:0') factor: tensor(0.0940, device='cuda:0')


Epoch 1, Step 1151: Loss = -0.0195, Actor Loss = -0.0526, Critic Loss = 0.0663, Entropy = 0.0477, Returns = 1.3395, Value = 1.4550, mask_loss = 0.0861std_loss = 0.2447:  21%|██        | 1151/5475 [5:10:41<18:32:41, 15.44s/it]

ratio tensor(1.0197, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4717, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.1177, device='cuda:0')
reward_raw: tensor(0.4604, device='cuda:0') reward: tensor(1.3357, device='cuda:0') factor: tensor(0.1247, device='cuda:0')
reward_raw: tensor(0.4760, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.1201, device='cuda:0')
reward_raw: tensor(0.4876, device='cuda:0') reward: tensor(1.3634, device='cuda:0') factor: tensor(0.1243, device='cuda:0')
reward_raw: tensor(0.4649, device='cuda:0') reward: tensor(1.3429, device='cuda:0') factor: tensor(0.1220, device='cuda:0')


Epoch 1, Step 1152: Loss = 0.1546, Actor Loss = 0.1061, Critic Loss = 0.0972, Entropy = 0.0604, Returns = 1.3291, Value = 1.4548, mask_loss = 0.1204std_loss = 0.2790:  21%|██        | 1152/5475 [5:10:55<18:11:50, 15.15s/it]  

ratio tensor(0.9071, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4975, device='cuda:0') reward: tensor(1.3940, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.5257, device='cuda:0') reward: tensor(1.4211, device='cuda:0') factor: tensor(0.1046, device='cuda:0')
reward_raw: tensor(0.5056, device='cuda:0') reward: tensor(1.4020, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.4056, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.5072, device='cuda:0') reward: tensor(1.4049, device='cuda:0') factor: tensor(0.1024, device='cuda:0')


Epoch 1, Step 1153: Loss = 0.0147, Actor Loss = -0.0176, Critic Loss = 0.0648, Entropy = 0.0669, Returns = 1.3826, Value = 1.3714, mask_loss = 0.1163std_loss = 0.2847:  21%|██        | 1153/5475 [5:11:18<20:57:56, 17.46s/it]

ratio tensor(1.1041, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4285, device='cuda:0') reward: tensor(1.3119, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.4023, device='cuda:0') reward: tensor(1.2887, device='cuda:0') factor: tensor(0.1136, device='cuda:0')
reward_raw: tensor(0.4009, device='cuda:0') reward: tensor(1.2859, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.4167, device='cuda:0') reward: tensor(1.3030, device='cuda:0') factor: tensor(0.1137, device='cuda:0')
reward_raw: tensor(0.4234, device='cuda:0') reward: tensor(1.3093, device='cuda:0') factor: tensor(0.1141, device='cuda:0')


Epoch 1, Step 1154: Loss = 0.0229, Actor Loss = 0.0054, Critic Loss = 0.0351, Entropy = 0.0739, Returns = 1.2848, Value = 1.2866, mask_loss = 0.1302std_loss = 0.2894:  21%|██        | 1154/5475 [5:11:31<19:21:43, 16.13s/it] 

ratio tensor(0.9298, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5369, device='cuda:0') reward: tensor(1.4430, device='cuda:0') factor: tensor(0.0939, device='cuda:0')
reward_raw: tensor(0.5509, device='cuda:0') reward: tensor(1.4559, device='cuda:0') factor: tensor(0.0950, device='cuda:0')
reward_raw: tensor(0.5338, device='cuda:0') reward: tensor(1.4396, device='cuda:0') factor: tensor(0.0943, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.4282, device='cuda:0') factor: tensor(0.0924, device='cuda:0')
reward_raw: tensor(0.5529, device='cuda:0') reward: tensor(1.4615, device='cuda:0') factor: tensor(0.0914, device='cuda:0')


Epoch 1, Step 1155: Loss = -0.1829, Actor Loss = -0.2446, Critic Loss = 0.1236, Entropy = 0.0433, Returns = 1.4866, Value = 1.3657, mask_loss = 0.0826std_loss = 0.2456:  21%|██        | 1155/5475 [5:11:46<18:59:30, 15.83s/it]

ratio tensor(0.9723, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4617, device='cuda:0') reward: tensor(1.3450, device='cuda:0') factor: tensor(0.1167, device='cuda:0')
reward_raw: tensor(0.4674, device='cuda:0') reward: tensor(1.3607, device='cuda:0') factor: tensor(0.1067, device='cuda:0')
reward_raw: tensor(0.4613, device='cuda:0') reward: tensor(1.3488, device='cuda:0') factor: tensor(0.1125, device='cuda:0')
reward_raw: tensor(0.4746, device='cuda:0') reward: tensor(1.3663, device='cuda:0') factor: tensor(0.1083, device='cuda:0')
reward_raw: tensor(0.4615, device='cuda:0') reward: tensor(1.3551, device='cuda:0') factor: tensor(0.1064, device='cuda:0')


Epoch 1, Step 1156: Loss = 0.4176, Actor Loss = 0.3850, Critic Loss = 0.0655, Entropy = 0.0636, Returns = 1.3013, Value = 1.4244, mask_loss = 0.0998std_loss = 0.2616:  21%|██        | 1156/5475 [5:11:59<17:50:01, 14.86s/it]  

ratio tensor(0.7769, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.4237, device='cuda:0') factor: tensor(0.1139, device='cuda:0')
reward_raw: tensor(0.5189, device='cuda:0') reward: tensor(1.4065, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.4086, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.5203, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.1147, device='cuda:0')
reward_raw: tensor(0.5443, device='cuda:0') reward: tensor(1.4361, device='cuda:0') factor: tensor(0.1082, device='cuda:0')


Epoch 1, Step 1157: Loss = -0.1324, Actor Loss = -0.1712, Critic Loss = 0.0777, Entropy = 0.0560, Returns = 1.4144, Value = 1.3507, mask_loss = 0.1063std_loss = 0.2706:  21%|██        | 1157/5475 [5:12:12<17:09:13, 14.30s/it]

ratio tensor(1.1520, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3951, device='cuda:0') reward: tensor(1.2887, device='cuda:0') factor: tensor(0.1063, device='cuda:0')
reward_raw: tensor(0.4065, device='cuda:0') reward: tensor(1.3007, device='cuda:0') factor: tensor(0.1058, device='cuda:0')
reward_raw: tensor(0.4281, device='cuda:0') reward: tensor(1.3200, device='cuda:0') factor: tensor(0.1081, device='cuda:0')
reward_raw: tensor(0.4084, device='cuda:0') reward: tensor(1.2996, device='cuda:0') factor: tensor(0.1088, device='cuda:0')
reward_raw: tensor(0.4056, device='cuda:0') reward: tensor(1.2972, device='cuda:0') factor: tensor(0.1084, device='cuda:0')


Epoch 1, Step 1158: Loss = 0.0172, Actor Loss = -0.0220, Critic Loss = 0.0786, Entropy = 0.0509, Returns = 1.3339, Value = 1.3741, mask_loss = 0.1006std_loss = 0.2701:  21%|██        | 1158/5475 [5:12:27<17:27:52, 14.56s/it] 

ratio tensor(1.3816, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5017, device='cuda:0') reward: tensor(1.3942, device='cuda:0') factor: tensor(0.1075, device='cuda:0')
reward_raw: tensor(0.4690, device='cuda:0') reward: tensor(1.3696, device='cuda:0') factor: tensor(0.0995, device='cuda:0')
reward_raw: tensor(0.5047, device='cuda:0') reward: tensor(1.3959, device='cuda:0') factor: tensor(0.1088, device='cuda:0')
reward_raw: tensor(0.5095, device='cuda:0') reward: tensor(1.3981, device='cuda:0') factor: tensor(0.1114, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.4062, device='cuda:0') factor: tensor(0.1065, device='cuda:0')


Epoch 1, Step 1159: Loss = -0.0706, Actor Loss = -0.1195, Critic Loss = 0.0981, Entropy = 0.0602, Returns = 1.3408, Value = 1.3391, mask_loss = 0.1216std_loss = 0.2856:  21%|██        | 1159/5475 [5:12:42<17:32:15, 14.63s/it]

ratio tensor(1.0244, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4939, device='cuda:0') reward: tensor(1.3828, device='cuda:0') factor: tensor(0.1111, device='cuda:0')
reward_raw: tensor(0.5028, device='cuda:0') reward: tensor(1.3886, device='cuda:0') factor: tensor(0.1142, device='cuda:0')
reward_raw: tensor(0.5185, device='cuda:0') reward: tensor(1.4026, device='cuda:0') factor: tensor(0.1159, device='cuda:0')
reward_raw: tensor(0.5167, device='cuda:0') reward: tensor(1.4012, device='cuda:0') factor: tensor(0.1156, device='cuda:0')
reward_raw: tensor(0.5075, device='cuda:0') reward: tensor(1.3940, device='cuda:0') factor: tensor(0.1135, device='cuda:0')


Epoch 1, Step 1160: Loss = 0.0590, Actor Loss = 0.0182, Critic Loss = 0.0818, Entropy = 0.0754, Returns = 1.4174, Value = 1.3684, mask_loss = 0.1270std_loss = 0.2888:  21%|██        | 1160/5475 [5:12:55<17:04:44, 14.25s/it]  

ratio tensor(1.1961, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4589, device='cuda:0') reward: tensor(1.3312, device='cuda:0') factor: tensor(0.1277, device='cuda:0')
reward_raw: tensor(0.4717, device='cuda:0') reward: tensor(1.3432, device='cuda:0') factor: tensor(0.1285, device='cuda:0')
reward_raw: tensor(0.4741, device='cuda:0') reward: tensor(1.3413, device='cuda:0') factor: tensor(0.1328, device='cuda:0')
reward_raw: tensor(0.4733, device='cuda:0') reward: tensor(1.3397, device='cuda:0') factor: tensor(0.1337, device='cuda:0')
reward_raw: tensor(0.4449, device='cuda:0') reward: tensor(1.3173, device='cuda:0') factor: tensor(0.1276, device='cuda:0')


Epoch 1, Step 1161: Loss = 0.0569, Actor Loss = 0.0336, Critic Loss = 0.0467, Entropy = 0.0740, Returns = 1.2979, Value = 1.3111, mask_loss = 0.1336std_loss = 0.3030:  21%|██        | 1160/5475 [5:13:11<17:04:44, 14.25s/it]

ratio tensor(0.9565, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1161: Loss = 0.0569, Actor Loss = 0.0336, Critic Loss = 0.0467, Entropy = 0.0740, Returns = 1.2979, Value = 1.3111, mask_loss = 0.1336std_loss = 0.3030:  21%|██        | 1161/5475 [5:13:11<17:48:07, 14.86s/it]

reward_raw: tensor(0.5820, device='cuda:0') reward: tensor(1.4614, device='cuda:0') factor: tensor(0.1207, device='cuda:0')
reward_raw: tensor(0.5880, device='cuda:0') reward: tensor(1.4694, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.6137, device='cuda:0') reward: tensor(1.4909, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.5892, device='cuda:0') reward: tensor(1.4690, device='cuda:0') factor: tensor(0.1201, device='cuda:0')
reward_raw: tensor(0.6019, device='cuda:0') reward: tensor(1.4792, device='cuda:0') factor: tensor(0.1227, device='cuda:0')


Epoch 1, Step 1162: Loss = -0.1201, Actor Loss = -0.1695, Critic Loss = 0.0990, Entropy = 0.0561, Returns = 1.4908, Value = 1.5002, mask_loss = 0.1160std_loss = 0.2883:  21%|██        | 1162/5475 [5:13:24<17:07:53, 14.30s/it]

ratio tensor(1.3213, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.4594, device='cuda:0') factor: tensor(0.0913, device='cuda:0')
reward_raw: tensor(0.5751, device='cuda:0') reward: tensor(1.4755, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.5501, device='cuda:0') reward: tensor(1.4487, device='cuda:0') factor: tensor(0.1013, device='cuda:0')
reward_raw: tensor(0.5777, device='cuda:0') reward: tensor(1.4798, device='cuda:0') factor: tensor(0.0979, device='cuda:0')
reward_raw: tensor(0.5670, device='cuda:0') reward: tensor(1.4697, device='cuda:0') factor: tensor(0.0973, device='cuda:0')


Epoch 1, Step 1163: Loss = 0.2650, Actor Loss = 0.2266, Critic Loss = 0.0768, Entropy = 0.0489, Returns = 1.4613, Value = 1.5843, mask_loss = 0.1012std_loss = 0.2719:  21%|██        | 1163/5475 [5:13:45<19:19:17, 16.13s/it]  

ratio tensor(0.7532, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4458, device='cuda:0') reward: tensor(1.3398, device='cuda:0') factor: tensor(0.1060, device='cuda:0')
reward_raw: tensor(0.4489, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.4439, device='cuda:0') reward: tensor(1.3362, device='cuda:0') factor: tensor(0.1077, device='cuda:0')
reward_raw: tensor(0.4405, device='cuda:0') reward: tensor(1.3320, device='cuda:0') factor: tensor(0.1085, device='cuda:0')
reward_raw: tensor(0.4357, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.1066, device='cuda:0')


Epoch 1, Step 1164: Loss = -0.0546, Actor Loss = -0.1098, Critic Loss = 0.1105, Entropy = 0.0534, Returns = 1.4110, Value = 1.2367, mask_loss = 0.0956std_loss = 0.2607:  21%|██▏       | 1164/5475 [5:13:59<18:31:26, 15.47s/it]

ratio tensor(0.9805, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4941, device='cuda:0') reward: tensor(1.4131, device='cuda:0') factor: tensor(0.0810, device='cuda:0')
reward_raw: tensor(0.4937, device='cuda:0') reward: tensor(1.4109, device='cuda:0') factor: tensor(0.0828, device='cuda:0')
reward_raw: tensor(0.4784, device='cuda:0') reward: tensor(1.3948, device='cuda:0') factor: tensor(0.0836, device='cuda:0')
reward_raw: tensor(0.4724, device='cuda:0') reward: tensor(1.3897, device='cuda:0') factor: tensor(0.0827, device='cuda:0')
reward_raw: tensor(0.4770, device='cuda:0') reward: tensor(1.3966, device='cuda:0') factor: tensor(0.0804, device='cuda:0')


Epoch 1, Step 1165: Loss = -0.2234, Actor Loss = -0.2400, Critic Loss = 0.0333, Entropy = 0.0478, Returns = 1.4562, Value = 1.4976, mask_loss = 0.0821std_loss = 0.2444:  21%|██▏       | 1165/5475 [5:14:16<19:10:21, 16.01s/it]

ratio tensor(1.1704, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5477, device='cuda:0') reward: tensor(1.4617, device='cuda:0') factor: tensor(0.0860, device='cuda:0')
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.4294, device='cuda:0') factor: tensor(0.0878, device='cuda:0')
reward_raw: tensor(0.5117, device='cuda:0') reward: tensor(1.4261, device='cuda:0') factor: tensor(0.0857, device='cuda:0')
reward_raw: tensor(0.5243, device='cuda:0') reward: tensor(1.4345, device='cuda:0') factor: tensor(0.0898, device='cuda:0')
reward_raw: tensor(0.5188, device='cuda:0') reward: tensor(1.4274, device='cuda:0') factor: tensor(0.0914, device='cuda:0')


Epoch 1, Step 1166: Loss = 0.2298, Actor Loss = 0.2142, Critic Loss = 0.0313, Entropy = 0.0541, Returns = 1.4237, Value = 1.4014, mask_loss = 0.0972std_loss = 0.2657:  21%|██▏       | 1166/5475 [5:14:31<18:42:36, 15.63s/it]  

ratio tensor(0.7346, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5027, device='cuda:0') reward: tensor(1.4006, device='cuda:0') factor: tensor(0.1021, device='cuda:0')
reward_raw: tensor(0.4825, device='cuda:0') reward: tensor(1.3848, device='cuda:0') factor: tensor(0.0977, device='cuda:0')
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.4033, device='cuda:0') factor: tensor(0.1008, device='cuda:0')
reward_raw: tensor(0.5177, device='cuda:0') reward: tensor(1.4173, device='cuda:0') factor: tensor(0.1004, device='cuda:0')
reward_raw: tensor(0.4933, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.0953, device='cuda:0')


Epoch 1, Step 1167: Loss = 0.0232, Actor Loss = -0.0093, Critic Loss = 0.0651, Entropy = 0.0425, Returns = 1.3725, Value = 1.4439, mask_loss = 0.0881std_loss = 0.2555:  21%|██▏       | 1167/5475 [5:14:53<21:01:38, 17.57s/it]

ratio tensor(1.0767, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4984, device='cuda:0') reward: tensor(1.3901, device='cuda:0') factor: tensor(0.1082, device='cuda:0')
reward_raw: tensor(0.5319, device='cuda:0') reward: tensor(1.4230, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.5344, device='cuda:0') reward: tensor(1.4285, device='cuda:0') factor: tensor(0.1060, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3986, device='cuda:0') factor: tensor(0.1108, device='cuda:0')
reward_raw: tensor(0.5299, device='cuda:0') reward: tensor(1.4249, device='cuda:0') factor: tensor(0.1051, device='cuda:0')


Epoch 1, Step 1168: Loss = -0.1725, Actor Loss = -0.1972, Critic Loss = 0.0495, Entropy = 0.0551, Returns = 1.4457, Value = 1.4179, mask_loss = 0.0988std_loss = 0.2668:  21%|██▏       | 1168/5475 [5:15:09<20:33:09, 17.18s/it]

ratio tensor(1.1819, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4837, device='cuda:0') reward: tensor(1.3705, device='cuda:0') factor: tensor(0.1132, device='cuda:0')
reward_raw: tensor(0.4711, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1063, device='cuda:0')
reward_raw: tensor(0.5132, device='cuda:0') reward: tensor(1.3975, device='cuda:0') factor: tensor(0.1157, device='cuda:0')
reward_raw: tensor(0.4802, device='cuda:0') reward: tensor(1.3740, device='cuda:0') factor: tensor(0.1061, device='cuda:0')
reward_raw: tensor(0.4929, device='cuda:0') reward: tensor(1.3857, device='cuda:0') factor: tensor(0.1072, device='cuda:0')


Epoch 1, Step 1169: Loss = 0.1948, Actor Loss = 0.1649, Critic Loss = 0.0600, Entropy = 0.0637, Returns = 1.3677, Value = 1.3460, mask_loss = 0.1317std_loss = 0.2990:  21%|██▏       | 1169/5475 [5:15:27<20:53:57, 17.47s/it]  

ratio tensor(1.0829, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4348, device='cuda:0') reward: tensor(1.3381, device='cuda:0') factor: tensor(0.0967, device='cuda:0')
reward_raw: tensor(0.4267, device='cuda:0') reward: tensor(1.3257, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.4203, device='cuda:0') reward: tensor(1.3221, device='cuda:0') factor: tensor(0.0982, device='cuda:0')
reward_raw: tensor(0.4324, device='cuda:0') reward: tensor(1.3328, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.4253, device='cuda:0') reward: tensor(1.3288, device='cuda:0') factor: tensor(0.0965, device='cuda:0')


Epoch 1, Step 1170: Loss = -0.2060, Actor Loss = -0.2374, Critic Loss = 0.0628, Entropy = 0.0436, Returns = 1.3446, Value = 1.2802, mask_loss = 0.0964std_loss = 0.2617:  21%|██▏       | 1170/5475 [5:15:43<20:06:23, 16.81s/it]

ratio tensor(1.3956, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4548, device='cuda:0') reward: tensor(1.3705, device='cuda:0') factor: tensor(0.0844, device='cuda:0')
reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3796, device='cuda:0') factor: tensor(0.0879, device='cuda:0')
reward_raw: tensor(0.4424, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.0844, device='cuda:0')
reward_raw: tensor(0.4747, device='cuda:0') reward: tensor(1.3906, device='cuda:0') factor: tensor(0.0841, device='cuda:0')
reward_raw: tensor(0.4622, device='cuda:0') reward: tensor(1.3777, device='cuda:0') factor: tensor(0.0845, device='cuda:0')


Epoch 1, Step 1171: Loss = 0.0361, Actor Loss = 0.0084, Critic Loss = 0.0556, Entropy = 0.0359, Returns = 1.3891, Value = 1.4029, mask_loss = 0.0785std_loss = 0.2439:  21%|██▏       | 1171/5475 [5:16:00<20:23:12, 17.05s/it]  

ratio tensor(0.9318, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4797, device='cuda:0') reward: tensor(1.3730, device='cuda:0') factor: tensor(0.1067, device='cuda:0')
reward_raw: tensor(0.4742, device='cuda:0') reward: tensor(1.3655, device='cuda:0') factor: tensor(0.1087, device='cuda:0')
reward_raw: tensor(0.4877, device='cuda:0') reward: tensor(1.3756, device='cuda:0') factor: tensor(0.1121, device='cuda:0')
reward_raw: tensor(0.4729, device='cuda:0') reward: tensor(1.3640, device='cuda:0') factor: tensor(0.1089, device='cuda:0')
reward_raw: tensor(0.4788, device='cuda:0') reward: tensor(1.3689, device='cuda:0') factor: tensor(0.1099, device='cuda:0')


Epoch 1, Step 1172: Loss = 0.1744, Actor Loss = 0.1110, Critic Loss = 0.1268, Entropy = 0.0669, Returns = 1.3615, Value = 1.3240, mask_loss = 0.1169std_loss = 0.2814:  21%|██▏       | 1172/5475 [5:16:14<19:21:52, 16.20s/it]

ratio tensor(0.9656, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4368, device='cuda:0') reward: tensor(1.3306, device='cuda:0') factor: tensor(0.1062, device='cuda:0')
reward_raw: tensor(0.4457, device='cuda:0') reward: tensor(1.3417, device='cuda:0') factor: tensor(0.1040, device='cuda:0')
reward_raw: tensor(0.4165, device='cuda:0') reward: tensor(1.3061, device='cuda:0') factor: tensor(0.1104, device='cuda:0')
reward_raw: tensor(0.4033, device='cuda:0') reward: tensor(1.2917, device='cuda:0') factor: tensor(0.1116, device='cuda:0')
reward_raw: tensor(0.4330, device='cuda:0') reward: tensor(1.3239, device='cuda:0') factor: tensor(0.1091, device='cuda:0')


Epoch 1, Step 1173: Loss = -0.0956, Actor Loss = -0.1211, Critic Loss = 0.0510, Entropy = 0.0653, Returns = 1.3768, Value = 1.2299, mask_loss = 0.1044std_loss = 0.2690:  21%|██▏       | 1173/5475 [5:16:30<19:10:06, 16.04s/it]

ratio tensor(1.4064, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4879, device='cuda:0') reward: tensor(1.4017, device='cuda:0') factor: tensor(0.0862, device='cuda:0')
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.4275, device='cuda:0') factor: tensor(0.0897, device='cuda:0')
reward_raw: tensor(0.5269, device='cuda:0') reward: tensor(1.4364, device='cuda:0') factor: tensor(0.0905, device='cuda:0')
reward_raw: tensor(0.5085, device='cuda:0') reward: tensor(1.4198, device='cuda:0') factor: tensor(0.0887, device='cuda:0')
reward_raw: tensor(0.4962, device='cuda:0') reward: tensor(1.4096, device='cuda:0') factor: tensor(0.0866, device='cuda:0')


Epoch 1, Step 1174: Loss = -0.0604, Actor Loss = -0.0930, Critic Loss = 0.0653, Entropy = 0.0462, Returns = 1.4177, Value = 1.5226, mask_loss = 0.0856std_loss = 0.2523:  21%|██▏       | 1174/5475 [5:16:45<18:41:10, 15.64s/it]

ratio tensor(0.9268, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4567, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.0863, device='cuda:0')
reward_raw: tensor(0.4481, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.0886, device='cuda:0')
reward_raw: tensor(0.4635, device='cuda:0') reward: tensor(1.3726, device='cuda:0') factor: tensor(0.0908, device='cuda:0')
reward_raw: tensor(0.4689, device='cuda:0') reward: tensor(1.3797, device='cuda:0') factor: tensor(0.0892, device='cuda:0')
reward_raw: tensor(0.4693, device='cuda:0') reward: tensor(1.3805, device='cuda:0') factor: tensor(0.0887, device='cuda:0')


Epoch 1, Step 1175: Loss = 0.0598, Actor Loss = 0.0298, Critic Loss = 0.0601, Entropy = 0.0419, Returns = 1.3643, Value = 1.2851, mask_loss = 0.0833std_loss = 0.2488:  21%|██▏       | 1175/5475 [5:17:03<19:38:01, 16.44s/it]  

ratio tensor(1.0419, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5057, device='cuda:0') reward: tensor(1.3730, device='cuda:0') factor: tensor(0.1327, device='cuda:0')
reward_raw: tensor(0.5158, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.1306, device='cuda:0')
reward_raw: tensor(0.5000, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.1297, device='cuda:0')
reward_raw: tensor(0.5212, device='cuda:0') reward: tensor(1.3938, device='cuda:0') factor: tensor(0.1274, device='cuda:0')
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.3932, device='cuda:0') factor: tensor(0.1308, device='cuda:0')


Epoch 1, Step 1176: Loss = -0.0102, Actor Loss = -0.0508, Critic Loss = 0.0813, Entropy = 0.0532, Returns = 1.3701, Value = 1.3660, mask_loss = 0.1076std_loss = 0.2757:  21%|██▏       | 1176/5475 [5:17:16<18:26:08, 15.44s/it]

ratio tensor(0.9877, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4365, device='cuda:0') reward: tensor(1.3328, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.4460, device='cuda:0') reward: tensor(1.3395, device='cuda:0') factor: tensor(0.1065, device='cuda:0')
reward_raw: tensor(0.4466, device='cuda:0') reward: tensor(1.3392, device='cuda:0') factor: tensor(0.1074, device='cuda:0')
reward_raw: tensor(0.4034, device='cuda:0') reward: tensor(1.2998, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.4628, device='cuda:0') reward: tensor(1.3606, device='cuda:0') factor: tensor(0.1022, device='cuda:0')


Epoch 1, Step 1177: Loss = -0.0911, Actor Loss = -0.1295, Critic Loss = 0.0770, Entropy = 0.0462, Returns = 1.3881, Value = 1.3896, mask_loss = 0.0920std_loss = 0.2627:  21%|██▏       | 1177/5475 [5:17:29<17:31:51, 14.68s/it]

ratio tensor(1.4187, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5178, device='cuda:0') reward: tensor(1.4092, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.5188, device='cuda:0') reward: tensor(1.4020, device='cuda:0') factor: tensor(0.1167, device='cuda:0')
reward_raw: tensor(0.5220, device='cuda:0') reward: tensor(1.4054, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.5230, device='cuda:0') reward: tensor(1.4121, device='cuda:0') factor: tensor(0.1109, device='cuda:0')
reward_raw: tensor(0.5153, device='cuda:0') reward: tensor(1.4049, device='cuda:0') factor: tensor(0.1103, device='cuda:0')


Epoch 1, Step 1178: Loss = -0.1148, Actor Loss = -0.1555, Critic Loss = 0.0816, Entropy = 0.0455, Returns = 1.4377, Value = 1.3578, mask_loss = 0.1007std_loss = 0.2758:  22%|██▏       | 1178/5475 [5:17:40<16:11:37, 13.57s/it]

ratio tensor(1.1497, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4507, device='cuda:0') reward: tensor(1.3530, device='cuda:0') factor: tensor(0.0977, device='cuda:0')
reward_raw: tensor(0.4635, device='cuda:0') reward: tensor(1.3628, device='cuda:0') factor: tensor(0.1007, device='cuda:0')
reward_raw: tensor(0.4711, device='cuda:0') reward: tensor(1.3675, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.4860, device='cuda:0') reward: tensor(1.3811, device='cuda:0') factor: tensor(0.1049, device='cuda:0')
reward_raw: tensor(0.4802, device='cuda:0') reward: tensor(1.3762, device='cuda:0') factor: tensor(0.1040, device='cuda:0')


Epoch 1, Step 1179: Loss = -0.0747, Actor Loss = -0.1182, Critic Loss = 0.0871, Entropy = 0.0493, Returns = 1.4420, Value = 1.2873, mask_loss = 0.0867std_loss = 0.2383:  22%|██▏       | 1179/5475 [5:17:56<16:53:30, 14.16s/it]

ratio tensor(1.2525, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4544, device='cuda:0') reward: tensor(1.3470, device='cuda:0') factor: tensor(0.1073, device='cuda:0')
reward_raw: tensor(0.4569, device='cuda:0') reward: tensor(1.3473, device='cuda:0') factor: tensor(0.1096, device='cuda:0')
reward_raw: tensor(0.4714, device='cuda:0') reward: tensor(1.3666, device='cuda:0') factor: tensor(0.1049, device='cuda:0')
reward_raw: tensor(0.4748, device='cuda:0') reward: tensor(1.3714, device='cuda:0') factor: tensor(0.1034, device='cuda:0')
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.3680, device='cuda:0') factor: tensor(0.1069, device='cuda:0')


Epoch 1, Step 1180: Loss = 0.0007, Actor Loss = -0.0242, Critic Loss = 0.0497, Entropy = 0.0510, Returns = 1.2832, Value = 1.3251, mask_loss = 0.0978std_loss = 0.2669:  22%|██▏       | 1180/5475 [5:18:14<18:15:05, 15.30s/it] 

ratio tensor(1.5557, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4825, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.1305, device='cuda:0')
reward_raw: tensor(0.4881, device='cuda:0') reward: tensor(1.3670, device='cuda:0') factor: tensor(0.1211, device='cuda:0')
reward_raw: tensor(0.5065, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.1287, device='cuda:0')
reward_raw: tensor(0.5068, device='cuda:0') reward: tensor(1.3809, device='cuda:0') factor: tensor(0.1259, device='cuda:0')
reward_raw: tensor(0.5020, device='cuda:0') reward: tensor(1.3688, device='cuda:0') factor: tensor(0.1332, device='cuda:0')


Epoch 1, Step 1181: Loss = 0.0935, Actor Loss = 0.0607, Critic Loss = 0.0656, Entropy = 0.0612, Returns = 1.3482, Value = 1.4013, mask_loss = 0.1060std_loss = 0.2722:  22%|██▏       | 1180/5475 [5:18:28<18:15:05, 15.30s/it] 

ratio tensor(1.0274, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1181: Loss = 0.0935, Actor Loss = 0.0607, Critic Loss = 0.0656, Entropy = 0.0612, Returns = 1.3482, Value = 1.4013, mask_loss = 0.1060std_loss = 0.2722:  22%|██▏       | 1181/5475 [5:18:29<18:11:33, 15.25s/it]

reward_raw: tensor(0.4755, device='cuda:0') reward: tensor(1.3807, device='cuda:0') factor: tensor(0.0949, device='cuda:0')
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.3894, device='cuda:0') factor: tensor(0.0952, device='cuda:0')
reward_raw: tensor(0.5017, device='cuda:0') reward: tensor(1.4042, device='cuda:0') factor: tensor(0.0975, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.4042, device='cuda:0') factor: tensor(0.0980, device='cuda:0')
reward_raw: tensor(0.5054, device='cuda:0') reward: tensor(1.4030, device='cuda:0') factor: tensor(0.1024, device='cuda:0')


Epoch 1, Step 1182: Loss = -0.1045, Actor Loss = -0.1483, Critic Loss = 0.0876, Entropy = 0.0561, Returns = 1.4612, Value = 1.3141, mask_loss = 0.0932std_loss = 0.2584:  22%|██▏       | 1182/5475 [5:18:46<18:50:28, 15.80s/it]

ratio tensor(1.1793, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4952, device='cuda:0') reward: tensor(1.3871, device='cuda:0') factor: tensor(0.1081, device='cuda:0')
reward_raw: tensor(0.4812, device='cuda:0') reward: tensor(1.3702, device='cuda:0') factor: tensor(0.1110, device='cuda:0')
reward_raw: tensor(0.5117, device='cuda:0') reward: tensor(1.4008, device='cuda:0') factor: tensor(0.1109, device='cuda:0')
reward_raw: tensor(0.5087, device='cuda:0') reward: tensor(1.3936, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.3740, device='cuda:0') factor: tensor(0.1105, device='cuda:0')


Epoch 1, Step 1183: Loss = -0.0089, Actor Loss = -0.0221, Critic Loss = 0.0264, Entropy = 0.0482, Returns = 1.4233, Value = 1.3771, mask_loss = 0.1016std_loss = 0.2745:  22%|██▏       | 1183/5475 [5:19:01<18:47:37, 15.76s/it]

ratio tensor(0.9175, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3807, device='cuda:0') factor: tensor(0.1137, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.3940, device='cuda:0') factor: tensor(0.1153, device='cuda:0')
reward_raw: tensor(0.4879, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.1101, device='cuda:0')
reward_raw: tensor(0.5284, device='cuda:0') reward: tensor(1.4126, device='cuda:0') factor: tensor(0.1158, device='cuda:0')
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.4133, device='cuda:0') factor: tensor(0.1160, device='cuda:0')


Epoch 1, Step 1184: Loss = 0.0343, Actor Loss = 0.0160, Critic Loss = 0.0369, Entropy = 0.0690, Returns = 1.3760, Value = 1.3817, mask_loss = 0.1302std_loss = 0.2982:  22%|██▏       | 1184/5475 [5:19:16<18:29:01, 15.51s/it]  

ratio tensor(1.0750, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5803, device='cuda:0') reward: tensor(1.4748, device='cuda:0') factor: tensor(0.1055, device='cuda:0')
reward_raw: tensor(0.5724, device='cuda:0') reward: tensor(1.4622, device='cuda:0') factor: tensor(0.1102, device='cuda:0')
reward_raw: tensor(0.5692, device='cuda:0') reward: tensor(1.4629, device='cuda:0') factor: tensor(0.1064, device='cuda:0')
reward_raw: tensor(0.5693, device='cuda:0') reward: tensor(1.4621, device='cuda:0') factor: tensor(0.1072, device='cuda:0')
reward_raw: tensor(0.5849, device='cuda:0') reward: tensor(1.4753, device='cuda:0') factor: tensor(0.1095, device='cuda:0')


Epoch 1, Step 1185: Loss = -0.1773, Actor Loss = -0.1973, Critic Loss = 0.0402, Entropy = 0.0419, Returns = 1.5022, Value = 1.5082, mask_loss = 0.0905std_loss = 0.2605:  22%|██▏       | 1185/5475 [5:19:32<18:32:40, 15.56s/it]

ratio tensor(0.9584, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4816, device='cuda:0') reward: tensor(1.3795, device='cuda:0') factor: tensor(0.1021, device='cuda:0')
reward_raw: tensor(0.4774, device='cuda:0') reward: tensor(1.3818, device='cuda:0') factor: tensor(0.0956, device='cuda:0')
reward_raw: tensor(0.5281, device='cuda:0') reward: tensor(1.4285, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.5099, device='cuda:0') reward: tensor(1.4084, device='cuda:0') factor: tensor(0.1015, device='cuda:0')
reward_raw: tensor(0.5220, device='cuda:0') reward: tensor(1.4182, device='cuda:0') factor: tensor(0.1038, device='cuda:0')


Epoch 1, Step 1186: Loss = 0.1602, Actor Loss = 0.1396, Critic Loss = 0.0412, Entropy = 0.0415, Returns = 1.3853, Value = 1.3487, mask_loss = 0.1003std_loss = 0.2741:  22%|██▏       | 1186/5475 [5:19:47<18:23:27, 15.44s/it]  

ratio tensor(1.0220, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4566, device='cuda:0') reward: tensor(1.3689, device='cuda:0') factor: tensor(0.0877, device='cuda:0')
reward_raw: tensor(0.4875, device='cuda:0') reward: tensor(1.3912, device='cuda:0') factor: tensor(0.0963, device='cuda:0')
reward_raw: tensor(0.4620, device='cuda:0') reward: tensor(1.3684, device='cuda:0') factor: tensor(0.0936, device='cuda:0')
reward_raw: tensor(0.4611, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.0993, device='cuda:0')
reward_raw: tensor(0.4462, device='cuda:0') reward: tensor(1.3546, device='cuda:0') factor: tensor(0.0916, device='cuda:0')


Epoch 1, Step 1187: Loss = 0.0384, Actor Loss = -0.0131, Critic Loss = 0.1031, Entropy = 0.0542, Returns = 1.3925, Value = 1.4757, mask_loss = 0.0976std_loss = 0.2616:  22%|██▏       | 1187/5475 [5:20:03<18:34:26, 15.59s/it]

ratio tensor(1.4002, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5603, device='cuda:0') reward: tensor(1.4503, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.5406, device='cuda:0') reward: tensor(1.4328, device='cuda:0') factor: tensor(0.1078, device='cuda:0')
reward_raw: tensor(0.5424, device='cuda:0') reward: tensor(1.4334, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.4288, device='cuda:0') factor: tensor(0.1128, device='cuda:0')
reward_raw: tensor(0.5325, device='cuda:0') reward: tensor(1.4232, device='cuda:0') factor: tensor(0.1093, device='cuda:0')


Epoch 1, Step 1188: Loss = -0.0938, Actor Loss = -0.1346, Critic Loss = 0.0817, Entropy = 0.0419, Returns = 1.3729, Value = 1.3819, mask_loss = 0.0918std_loss = 0.2611:  22%|██▏       | 1188/5475 [5:20:18<18:25:49, 15.48s/it]

ratio tensor(1.3478, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5031, device='cuda:0') reward: tensor(1.4047, device='cuda:0') factor: tensor(0.0984, device='cuda:0')
reward_raw: tensor(0.5052, device='cuda:0') reward: tensor(1.4042, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.5063, device='cuda:0') reward: tensor(1.4097, device='cuda:0') factor: tensor(0.0966, device='cuda:0')
reward_raw: tensor(0.4830, device='cuda:0') reward: tensor(1.3811, device='cuda:0') factor: tensor(0.1019, device='cuda:0')
reward_raw: tensor(0.5051, device='cuda:0') reward: tensor(1.4044, device='cuda:0') factor: tensor(0.1007, device='cuda:0')


Epoch 1, Step 1189: Loss = 0.2119, Actor Loss = 0.1802, Critic Loss = 0.0635, Entropy = 0.0436, Returns = 1.3787, Value = 1.3013, mask_loss = 0.0903std_loss = 0.2580:  22%|██▏       | 1189/5475 [5:20:35<18:47:49, 15.79s/it]  

ratio tensor(0.9429, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4965, device='cuda:0') reward: tensor(1.4064, device='cuda:0') factor: tensor(0.0901, device='cuda:0')
reward_raw: tensor(0.5055, device='cuda:0') reward: tensor(1.4167, device='cuda:0') factor: tensor(0.0888, device='cuda:0')
reward_raw: tensor(0.4871, device='cuda:0') reward: tensor(1.3981, device='cuda:0') factor: tensor(0.0890, device='cuda:0')
reward_raw: tensor(0.4822, device='cuda:0') reward: tensor(1.3927, device='cuda:0') factor: tensor(0.0895, device='cuda:0')
reward_raw: tensor(0.4962, device='cuda:0') reward: tensor(1.4060, device='cuda:0') factor: tensor(0.0902, device='cuda:0')


Epoch 1, Step 1190: Loss = -0.0711, Actor Loss = -0.1023, Critic Loss = 0.0624, Entropy = 0.0425, Returns = 1.3845, Value = 1.3942, mask_loss = 0.0878std_loss = 0.2535:  22%|██▏       | 1190/5475 [5:20:48<17:47:23, 14.95s/it]

ratio tensor(1.0758, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3849, device='cuda:0') reward: tensor(1.2729, device='cuda:0') factor: tensor(0.1119, device='cuda:0')
reward_raw: tensor(0.4146, device='cuda:0') reward: tensor(1.2992, device='cuda:0') factor: tensor(0.1155, device='cuda:0')
reward_raw: tensor(0.4125, device='cuda:0') reward: tensor(1.2966, device='cuda:0') factor: tensor(0.1158, device='cuda:0')
reward_raw: tensor(0.4196, device='cuda:0') reward: tensor(1.3067, device='cuda:0') factor: tensor(0.1130, device='cuda:0')
reward_raw: tensor(0.4083, device='cuda:0') reward: tensor(1.2953, device='cuda:0') factor: tensor(0.1131, device='cuda:0')


Epoch 1, Step 1191: Loss = 0.1605, Actor Loss = 0.1175, Critic Loss = 0.0861, Entropy = 0.0565, Returns = 1.1917, Value = 1.2718, mask_loss = 0.0888std_loss = 0.2439:  22%|██▏       | 1191/5475 [5:21:04<18:13:43, 15.32s/it]  

ratio tensor(0.7527, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5647, device='cuda:0') reward: tensor(1.4611, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.5436, device='cuda:0') reward: tensor(1.4451, device='cuda:0') factor: tensor(0.0985, device='cuda:0')
reward_raw: tensor(0.5380, device='cuda:0') reward: tensor(1.4339, device='cuda:0') factor: tensor(0.1041, device='cuda:0')
reward_raw: tensor(0.5265, device='cuda:0') reward: tensor(1.4230, device='cuda:0') factor: tensor(0.1034, device='cuda:0')
reward_raw: tensor(0.5494, device='cuda:0') reward: tensor(1.4489, device='cuda:0') factor: tensor(0.1006, device='cuda:0')


Epoch 1, Step 1192: Loss = -0.3014, Actor Loss = -0.3582, Critic Loss = 0.1137, Entropy = 0.0360, Returns = 1.5559, Value = 1.4456, mask_loss = 0.0832std_loss = 0.2518:  22%|██▏       | 1192/5475 [5:21:21<18:56:56, 15.93s/it]

ratio tensor(1.2644, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4769, device='cuda:0') reward: tensor(1.3698, device='cuda:0') factor: tensor(0.1071, device='cuda:0')
reward_raw: tensor(0.4540, device='cuda:0') reward: tensor(1.3503, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.4652, device='cuda:0') reward: tensor(1.3595, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3593, device='cuda:0') factor: tensor(0.1021, device='cuda:0')
reward_raw: tensor(0.4465, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.1042, device='cuda:0')


Epoch 1, Step 1193: Loss = 0.1272, Actor Loss = 0.1017, Critic Loss = 0.0512, Entropy = 0.0530, Returns = 1.2863, Value = 1.3391, mask_loss = 0.1003std_loss = 0.2639:  22%|██▏       | 1193/5475 [5:21:35<18:16:16, 15.36s/it]  

ratio tensor(0.8551, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4735, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1001, device='cuda:0')
reward_raw: tensor(0.5006, device='cuda:0') reward: tensor(1.4014, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.5035, device='cuda:0') reward: tensor(1.4043, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.4010, device='cuda:0') factor: tensor(0.1011, device='cuda:0')
reward_raw: tensor(0.5185, device='cuda:0') reward: tensor(1.4212, device='cuda:0') factor: tensor(0.0973, device='cuda:0')


Epoch 1, Step 1194: Loss = 0.2103, Actor Loss = 0.1843, Critic Loss = 0.0522, Entropy = 0.0459, Returns = 1.4416, Value = 1.3246, mask_loss = 0.0925std_loss = 0.2616:  22%|██▏       | 1194/5475 [5:21:48<17:26:20, 14.66s/it]

ratio tensor(0.8870, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4823, device='cuda:0') reward: tensor(1.3856, device='cuda:0') factor: tensor(0.0967, device='cuda:0')
reward_raw: tensor(0.4767, device='cuda:0') reward: tensor(1.3780, device='cuda:0') factor: tensor(0.0987, device='cuda:0')
reward_raw: tensor(0.4804, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.0934, device='cuda:0')
reward_raw: tensor(0.4814, device='cuda:0') reward: tensor(1.3824, device='cuda:0') factor: tensor(0.0990, device='cuda:0')
reward_raw: tensor(0.4783, device='cuda:0') reward: tensor(1.3847, device='cuda:0') factor: tensor(0.0936, device='cuda:0')


Epoch 1, Step 1195: Loss = -0.0355, Actor Loss = -0.0685, Critic Loss = 0.0661, Entropy = 0.0427, Returns = 1.3198, Value = 1.4216, mask_loss = 0.1048std_loss = 0.2802:  22%|██▏       | 1195/5475 [5:22:03<17:20:42, 14.59s/it]

ratio tensor(0.9080, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4234, device='cuda:0') reward: tensor(1.3375, device='cuda:0') factor: tensor(0.0858, device='cuda:0')
reward_raw: tensor(0.4119, device='cuda:0') reward: tensor(1.3257, device='cuda:0') factor: tensor(0.0862, device='cuda:0')
reward_raw: tensor(0.4164, device='cuda:0') reward: tensor(1.3265, device='cuda:0') factor: tensor(0.0899, device='cuda:0')
reward_raw: tensor(0.4140, device='cuda:0') reward: tensor(1.3269, device='cuda:0') factor: tensor(0.0871, device='cuda:0')
reward_raw: tensor(0.4147, device='cuda:0') reward: tensor(1.3283, device='cuda:0') factor: tensor(0.0865, device='cuda:0')


Epoch 1, Step 1196: Loss = 0.1197, Actor Loss = 0.0850, Critic Loss = 0.0695, Entropy = 0.0385, Returns = 1.3206, Value = 1.2969, mask_loss = 0.0822std_loss = 0.2488:  22%|██▏       | 1196/5475 [5:22:18<17:31:54, 14.75s/it]  

ratio tensor(0.8563, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4119, device='cuda:0') reward: tensor(1.3240, device='cuda:0') factor: tensor(0.0879, device='cuda:0')
reward_raw: tensor(0.4150, device='cuda:0') reward: tensor(1.3298, device='cuda:0') factor: tensor(0.0852, device='cuda:0')
reward_raw: tensor(0.4423, device='cuda:0') reward: tensor(1.3498, device='cuda:0') factor: tensor(0.0926, device='cuda:0')
reward_raw: tensor(0.4283, device='cuda:0') reward: tensor(1.3379, device='cuda:0') factor: tensor(0.0905, device='cuda:0')
reward_raw: tensor(0.4339, device='cuda:0') reward: tensor(1.3433, device='cuda:0') factor: tensor(0.0906, device='cuda:0')


Epoch 1, Step 1197: Loss = -0.1191, Actor Loss = -0.1457, Critic Loss = 0.0533, Entropy = 0.0333, Returns = 1.2937, Value = 1.3220, mask_loss = 0.0780std_loss = 0.2434:  22%|██▏       | 1197/5475 [5:22:32<17:11:27, 14.47s/it]

ratio tensor(0.9902, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4902, device='cuda:0') reward: tensor(1.3935, device='cuda:0') factor: tensor(0.0967, device='cuda:0')
reward_raw: tensor(0.4875, device='cuda:0') reward: tensor(1.3898, device='cuda:0') factor: tensor(0.0978, device='cuda:0')
reward_raw: tensor(0.4677, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.0982, device='cuda:0')
reward_raw: tensor(0.4997, device='cuda:0') reward: tensor(1.4018, device='cuda:0') factor: tensor(0.0979, device='cuda:0')
reward_raw: tensor(0.4978, device='cuda:0') reward: tensor(1.3996, device='cuda:0') factor: tensor(0.0982, device='cuda:0')


Epoch 1, Step 1198: Loss = 0.1591, Actor Loss = 0.1322, Critic Loss = 0.0538, Entropy = 0.0543, Returns = 1.3953, Value = 1.4200, mask_loss = 0.1175std_loss = 0.2864:  22%|██▏       | 1198/5475 [5:22:46<16:58:57, 14.29s/it]  

ratio tensor(0.8660, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4312, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.0882, device='cuda:0')
reward_raw: tensor(0.4269, device='cuda:0') reward: tensor(1.3401, device='cuda:0') factor: tensor(0.0868, device='cuda:0')
reward_raw: tensor(0.4327, device='cuda:0') reward: tensor(1.3429, device='cuda:0') factor: tensor(0.0898, device='cuda:0')
reward_raw: tensor(0.4468, device='cuda:0') reward: tensor(1.3575, device='cuda:0') factor: tensor(0.0893, device='cuda:0')
reward_raw: tensor(0.4185, device='cuda:0') reward: tensor(1.3303, device='cuda:0') factor: tensor(0.0882, device='cuda:0')


Epoch 1, Step 1199: Loss = -0.0157, Actor Loss = -0.0487, Critic Loss = 0.0660, Entropy = 0.0387, Returns = 1.3791, Value = 1.3610, mask_loss = 0.0894std_loss = 0.2592:  22%|██▏       | 1199/5475 [5:23:03<18:13:26, 15.34s/it]

ratio tensor(0.9574, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5168, device='cuda:0') reward: tensor(1.4147, device='cuda:0') factor: tensor(0.1021, device='cuda:0')
reward_raw: tensor(0.5049, device='cuda:0') reward: tensor(1.4054, device='cuda:0') factor: tensor(0.0995, device='cuda:0')
reward_raw: tensor(0.5141, device='cuda:0') reward: tensor(1.4083, device='cuda:0') factor: tensor(0.1058, device='cuda:0')
reward_raw: tensor(0.5135, device='cuda:0') reward: tensor(1.4123, device='cuda:0') factor: tensor(0.1012, device='cuda:0')
reward_raw: tensor(0.5161, device='cuda:0') reward: tensor(1.4144, device='cuda:0') factor: tensor(0.1016, device='cuda:0')


Epoch 1, Step 1200: Loss = -0.1934, Actor Loss = -0.2246, Critic Loss = 0.0625, Entropy = 0.0490, Returns = 1.4923, Value = 1.4717, mask_loss = 0.0914std_loss = 0.2538:  22%|██▏       | 1200/5475 [5:23:20<18:37:18, 15.68s/it]

ratio tensor(2.7586, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.4174, device='cuda:0') factor: tensor(0.0938, device='cuda:0')
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.4077, device='cuda:0') factor: tensor(0.0922, device='cuda:0')
reward_raw: tensor(0.4678, device='cuda:0') reward: tensor(1.3797, device='cuda:0') factor: tensor(0.0880, device='cuda:0')
reward_raw: tensor(0.4951, device='cuda:0') reward: tensor(1.4037, device='cuda:0') factor: tensor(0.0915, device='cuda:0')
reward_raw: tensor(0.4808, device='cuda:0') reward: tensor(1.3889, device='cuda:0') factor: tensor(0.0919, device='cuda:0')


Epoch 1, Step 1201: Loss = 0.1833, Actor Loss = 0.1602, Critic Loss = 0.0462, Entropy = 0.0382, Returns = 1.4752, Value = 1.4362, mask_loss = 0.0838std_loss = 0.2525:  22%|██▏       | 1200/5475 [5:23:33<18:37:18, 15.68s/it]  

ratio tensor(1.0574, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1201: Loss = 0.1833, Actor Loss = 0.1602, Critic Loss = 0.0462, Entropy = 0.0382, Returns = 1.4752, Value = 1.4362, mask_loss = 0.0838std_loss = 0.2525:  22%|██▏       | 1201/5475 [5:23:33<17:43:31, 14.93s/it]

reward_raw: tensor(0.4495, device='cuda:0') reward: tensor(1.3432, device='cuda:0') factor: tensor(0.1062, device='cuda:0')
reward_raw: tensor(0.5074, device='cuda:0') reward: tensor(1.3960, device='cuda:0') factor: tensor(0.1115, device='cuda:0')
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.3563, device='cuda:0') factor: tensor(0.1093, device='cuda:0')
reward_raw: tensor(0.5075, device='cuda:0') reward: tensor(1.3982, device='cuda:0') factor: tensor(0.1092, device='cuda:0')
reward_raw: tensor(0.4893, device='cuda:0') reward: tensor(1.3835, device='cuda:0') factor: tensor(0.1058, device='cuda:0')


Epoch 1, Step 1202: Loss = 0.0985, Actor Loss = 0.0799, Critic Loss = 0.0372, Entropy = 0.0639, Returns = 1.3380, Value = 1.2802, mask_loss = 0.1332std_loss = 0.3001:  22%|██▏       | 1202/5475 [5:23:44<16:26:47, 13.86s/it]

ratio tensor(1.0650, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4708, device='cuda:0') reward: tensor(1.3596, device='cuda:0') factor: tensor(0.1112, device='cuda:0')
reward_raw: tensor(0.4827, device='cuda:0') reward: tensor(1.3684, device='cuda:0') factor: tensor(0.1143, device='cuda:0')
reward_raw: tensor(0.4921, device='cuda:0') reward: tensor(1.3756, device='cuda:0') factor: tensor(0.1165, device='cuda:0')
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.3591, device='cuda:0') factor: tensor(0.1158, device='cuda:0')
reward_raw: tensor(0.4781, device='cuda:0') reward: tensor(1.3632, device='cuda:0') factor: tensor(0.1149, device='cuda:0')


Epoch 1, Step 1203: Loss = -0.2451, Actor Loss = -0.2642, Critic Loss = 0.0384, Entropy = 0.0488, Returns = 1.4567, Value = 1.4091, mask_loss = 0.1123std_loss = 0.2832:  22%|██▏       | 1203/5475 [5:24:00<17:07:05, 14.43s/it]

ratio tensor(2.4556, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4646, device='cuda:0') reward: tensor(1.3630, device='cuda:0') factor: tensor(0.1016, device='cuda:0')
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.3731, device='cuda:0') factor: tensor(0.0926, device='cuda:0')
reward_raw: tensor(0.4833, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1007, device='cuda:0')
reward_raw: tensor(0.4607, device='cuda:0') reward: tensor(1.3607, device='cuda:0') factor: tensor(0.1000, device='cuda:0')
reward_raw: tensor(0.4626, device='cuda:0') reward: tensor(1.3587, device='cuda:0') factor: tensor(0.1038, device='cuda:0')


Epoch 1, Step 1204: Loss = -0.0447, Actor Loss = -0.0717, Critic Loss = 0.0540, Entropy = 0.0409, Returns = 1.3922, Value = 1.4241, mask_loss = 0.1003std_loss = 0.2743:  22%|██▏       | 1204/5475 [5:24:15<17:08:04, 14.44s/it]

ratio tensor(1.0188, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.3972, device='cuda:0') factor: tensor(0.0988, device='cuda:0')
reward_raw: tensor(0.5456, device='cuda:0') reward: tensor(1.4483, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.5069, device='cuda:0') reward: tensor(1.4071, device='cuda:0') factor: tensor(0.0999, device='cuda:0')
reward_raw: tensor(0.4546, device='cuda:0') reward: tensor(1.3529, device='cuda:0') factor: tensor(0.1017, device='cuda:0')


Epoch 1, Step 1205: Loss = 0.0744, Actor Loss = 0.0412, Critic Loss = 0.0664, Entropy = 0.0512, Returns = 1.4219, Value = 1.2885, mask_loss = 0.0929std_loss = 0.2531:  22%|██▏       | 1205/5475 [5:24:31<17:42:26, 14.93s/it]  

ratio tensor(0.9949, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4981, device='cuda:0') reward: tensor(1.3991, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.5089, device='cuda:0') reward: tensor(1.4099, device='cuda:0') factor: tensor(0.0990, device='cuda:0')
reward_raw: tensor(0.4934, device='cuda:0') reward: tensor(1.3945, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.4898, device='cuda:0') reward: tensor(1.3918, device='cuda:0') factor: tensor(0.0980, device='cuda:0')
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.4232, device='cuda:0') factor: tensor(0.1007, device='cuda:0')


Epoch 1, Step 1206: Loss = -0.0854, Actor Loss = -0.1166, Critic Loss = 0.0625, Entropy = 0.0402, Returns = 1.4373, Value = 1.3693, mask_loss = 0.0958std_loss = 0.2696:  22%|██▏       | 1206/5475 [5:24:47<18:01:47, 15.20s/it]

ratio tensor(0.9870, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4770, device='cuda:0') reward: tensor(1.3960, device='cuda:0') factor: tensor(0.0810, device='cuda:0')
reward_raw: tensor(0.4862, device='cuda:0') reward: tensor(1.4006, device='cuda:0') factor: tensor(0.0856, device='cuda:0')
reward_raw: tensor(0.5027, device='cuda:0') reward: tensor(1.4181, device='cuda:0') factor: tensor(0.0846, device='cuda:0')
reward_raw: tensor(0.4868, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.0883, device='cuda:0')
reward_raw: tensor(0.4850, device='cuda:0') reward: tensor(1.4038, device='cuda:0') factor: tensor(0.0812, device='cuda:0')


Epoch 1, Step 1207: Loss = 0.1114, Actor Loss = 0.0893, Critic Loss = 0.0444, Entropy = 0.0386, Returns = 1.3864, Value = 1.4650, mask_loss = 0.0836std_loss = 0.2512:  22%|██▏       | 1207/5475 [5:24:59<16:51:27, 14.22s/it]  

ratio tensor(0.8478, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4200, device='cuda:0') reward: tensor(1.3296, device='cuda:0') factor: tensor(0.0904, device='cuda:0')
reward_raw: tensor(0.4267, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.0972, device='cuda:0')
reward_raw: tensor(0.4275, device='cuda:0') reward: tensor(1.3339, device='cuda:0') factor: tensor(0.0935, device='cuda:0')
reward_raw: tensor(0.4468, device='cuda:0') reward: tensor(1.3512, device='cuda:0') factor: tensor(0.0956, device='cuda:0')
reward_raw: tensor(0.4406, device='cuda:0') reward: tensor(1.3470, device='cuda:0') factor: tensor(0.0936, device='cuda:0')


Epoch 1, Step 1208: Loss = 0.0307, Actor Loss = -0.0021, Critic Loss = 0.0657, Entropy = 0.0516, Returns = 1.3781, Value = 1.3452, mask_loss = 0.1000std_loss = 0.2597:  22%|██▏       | 1208/5475 [5:25:18<18:33:37, 15.66s/it]

ratio tensor(1.0550, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4229, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.0907, device='cuda:0')
reward_raw: tensor(0.4381, device='cuda:0') reward: tensor(1.3450, device='cuda:0') factor: tensor(0.0931, device='cuda:0')
reward_raw: tensor(0.4395, device='cuda:0') reward: tensor(1.3449, device='cuda:0') factor: tensor(0.0946, device='cuda:0')
reward_raw: tensor(0.4398, device='cuda:0') reward: tensor(1.3428, device='cuda:0') factor: tensor(0.0969, device='cuda:0')
reward_raw: tensor(0.4385, device='cuda:0') reward: tensor(1.3445, device='cuda:0') factor: tensor(0.0940, device='cuda:0')


Epoch 1, Step 1209: Loss = 0.0135, Actor Loss = -0.0305, Critic Loss = 0.0881, Entropy = 0.0356, Returns = 1.2387, Value = 1.2845, mask_loss = 0.0879std_loss = 0.2611:  22%|██▏       | 1209/5475 [5:25:31<17:47:40, 15.02s/it]

ratio tensor(1.0230, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5540, device='cuda:0') reward: tensor(1.4542, device='cuda:0') factor: tensor(0.0998, device='cuda:0')
reward_raw: tensor(0.5369, device='cuda:0') reward: tensor(1.4371, device='cuda:0') factor: tensor(0.0998, device='cuda:0')
reward_raw: tensor(0.5444, device='cuda:0') reward: tensor(1.4413, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.5411, device='cuda:0') reward: tensor(1.4360, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.4249, device='cuda:0') factor: tensor(0.0966, device='cuda:0')


Epoch 1, Step 1210: Loss = 0.1185, Actor Loss = 0.0860, Critic Loss = 0.0650, Entropy = 0.0427, Returns = 1.3388, Value = 1.4671, mask_loss = 0.0923std_loss = 0.2572:  22%|██▏       | 1210/5475 [5:25:48<18:23:37, 15.53s/it] 

ratio tensor(0.9741, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.4002, device='cuda:0') factor: tensor(0.0872, device='cuda:0')
reward_raw: tensor(0.4663, device='cuda:0') reward: tensor(1.3762, device='cuda:0') factor: tensor(0.0901, device='cuda:0')
reward_raw: tensor(0.4864, device='cuda:0') reward: tensor(1.3993, device='cuda:0') factor: tensor(0.0871, device='cuda:0')
reward_raw: tensor(0.4651, device='cuda:0') reward: tensor(1.3784, device='cuda:0') factor: tensor(0.0867, device='cuda:0')
reward_raw: tensor(0.4904, device='cuda:0') reward: tensor(1.4035, device='cuda:0') factor: tensor(0.0869, device='cuda:0')


Epoch 1, Step 1211: Loss = 0.0965, Actor Loss = 0.0507, Critic Loss = 0.0917, Entropy = 0.0386, Returns = 1.4096, Value = 1.4557, mask_loss = 0.0792std_loss = 0.2447:  22%|██▏       | 1211/5475 [5:26:01<17:43:53, 14.97s/it]

ratio tensor(1.0295, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.3899, device='cuda:0') reward: tensor(1.3034, device='cuda:0') factor: tensor(0.0865, device='cuda:0')
reward_raw: tensor(0.3713, device='cuda:0') reward: tensor(1.2821, device='cuda:0') factor: tensor(0.0892, device='cuda:0')
reward_raw: tensor(0.3957, device='cuda:0') reward: tensor(1.3064, device='cuda:0') factor: tensor(0.0894, device='cuda:0')
reward_raw: tensor(0.3897, device='cuda:0') reward: tensor(1.3019, device='cuda:0') factor: tensor(0.0878, device='cuda:0')
reward_raw: tensor(0.3939, device='cuda:0') reward: tensor(1.3023, device='cuda:0') factor: tensor(0.0916, device='cuda:0')


Epoch 1, Step 1212: Loss = -0.0521, Actor Loss = -0.0761, Critic Loss = 0.0483, Entropy = 0.0585, Returns = 1.3646, Value = 1.2483, mask_loss = 0.0940std_loss = 0.2573:  22%|██▏       | 1212/5475 [5:26:24<20:29:13, 17.30s/it]

ratio tensor(2.7577, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4590, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1055, device='cuda:0')
reward_raw: tensor(0.4673, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.1054, device='cuda:0')
reward_raw: tensor(0.4633, device='cuda:0') reward: tensor(1.3571, device='cuda:0') factor: tensor(0.1061, device='cuda:0')
reward_raw: tensor(0.4657, device='cuda:0') reward: tensor(1.3627, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.4708, device='cuda:0') reward: tensor(1.3710, device='cuda:0') factor: tensor(0.0998, device='cuda:0')


Epoch 1, Step 1213: Loss = 0.1022, Actor Loss = 0.0533, Critic Loss = 0.0981, Entropy = 0.0552, Returns = 1.3322, Value = 1.2313, mask_loss = 0.1094std_loss = 0.2682:  22%|██▏       | 1213/5475 [5:26:43<20:52:46, 17.64s/it]  

ratio tensor(0.7916, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3729, device='cuda:0') reward: tensor(1.2783, device='cuda:0') factor: tensor(0.0946, device='cuda:0')
reward_raw: tensor(0.4110, device='cuda:0') reward: tensor(1.3144, device='cuda:0') factor: tensor(0.0966, device='cuda:0')
reward_raw: tensor(0.3607, device='cuda:0') reward: tensor(1.2619, device='cuda:0') factor: tensor(0.0988, device='cuda:0')
reward_raw: tensor(0.3584, device='cuda:0') reward: tensor(1.2660, device='cuda:0') factor: tensor(0.0924, device='cuda:0')
reward_raw: tensor(0.3901, device='cuda:0') reward: tensor(1.2965, device='cuda:0') factor: tensor(0.0936, device='cuda:0')


Epoch 1, Step 1214: Loss = -0.2101, Actor Loss = -0.2358, Critic Loss = 0.0516, Entropy = 0.0506, Returns = 1.3443, Value = 1.2693, mask_loss = 0.0954std_loss = 0.2656:  22%|██▏       | 1214/5475 [5:26:56<19:12:29, 16.23s/it]

ratio tensor(1.2724, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4883, device='cuda:0') reward: tensor(1.4080, device='cuda:0') factor: tensor(0.0804, device='cuda:0')
reward_raw: tensor(0.4759, device='cuda:0') reward: tensor(1.3918, device='cuda:0') factor: tensor(0.0841, device='cuda:0')
reward_raw: tensor(0.4634, device='cuda:0') reward: tensor(1.3799, device='cuda:0') factor: tensor(0.0835, device='cuda:0')
reward_raw: tensor(0.4666, device='cuda:0') reward: tensor(1.3848, device='cuda:0') factor: tensor(0.0817, device='cuda:0')
reward_raw: tensor(0.4967, device='cuda:0') reward: tensor(1.4104, device='cuda:0') factor: tensor(0.0863, device='cuda:0')


Epoch 1, Step 1215: Loss = -0.0305, Actor Loss = -0.0658, Critic Loss = 0.0707, Entropy = 0.0340, Returns = 1.3773, Value = 1.5603, mask_loss = 0.0838std_loss = 0.2540:  22%|██▏       | 1215/5475 [5:27:10<18:26:03, 15.58s/it]

ratio tensor(1.3039, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5050, device='cuda:0') reward: tensor(1.3804, device='cuda:0') factor: tensor(0.1246, device='cuda:0')
reward_raw: tensor(0.5031, device='cuda:0') reward: tensor(1.3770, device='cuda:0') factor: tensor(0.1261, device='cuda:0')
reward_raw: tensor(0.4976, device='cuda:0') reward: tensor(1.3678, device='cuda:0') factor: tensor(0.1298, device='cuda:0')
reward_raw: tensor(0.5210, device='cuda:0') reward: tensor(1.3922, device='cuda:0') factor: tensor(0.1288, device='cuda:0')
reward_raw: tensor(0.4897, device='cuda:0') reward: tensor(1.3685, device='cuda:0') factor: tensor(0.1212, device='cuda:0')


Epoch 1, Step 1216: Loss = 0.0780, Actor Loss = 0.0292, Critic Loss = 0.0978, Entropy = 0.0585, Returns = 1.4092, Value = 1.2707, mask_loss = 0.1164std_loss = 0.2855:  22%|██▏       | 1216/5475 [5:27:29<19:42:48, 16.66s/it]  

ratio tensor(0.7815, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4629, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.0936, device='cuda:0')
reward_raw: tensor(0.4646, device='cuda:0') reward: tensor(1.3699, device='cuda:0') factor: tensor(0.0947, device='cuda:0')
reward_raw: tensor(0.4669, device='cuda:0') reward: tensor(1.3691, device='cuda:0') factor: tensor(0.0978, device='cuda:0')
reward_raw: tensor(0.4762, device='cuda:0') reward: tensor(1.3837, device='cuda:0') factor: tensor(0.0925, device='cuda:0')
reward_raw: tensor(0.4640, device='cuda:0') reward: tensor(1.3692, device='cuda:0') factor: tensor(0.0948, device='cuda:0')


Epoch 1, Step 1217: Loss = -0.0206, Actor Loss = -0.0389, Critic Loss = 0.0367, Entropy = 0.0536, Returns = 1.3925, Value = 1.4025, mask_loss = 0.0977std_loss = 0.2598:  22%|██▏       | 1217/5475 [5:27:44<19:16:29, 16.30s/it]

ratio tensor(0.8354, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4669, device='cuda:0') reward: tensor(1.3608, device='cuda:0') factor: tensor(0.1061, device='cuda:0')
reward_raw: tensor(0.4639, device='cuda:0') reward: tensor(1.3573, device='cuda:0') factor: tensor(0.1067, device='cuda:0')
reward_raw: tensor(0.4660, device='cuda:0') reward: tensor(1.3684, device='cuda:0') factor: tensor(0.0976, device='cuda:0')
reward_raw: tensor(0.4732, device='cuda:0') reward: tensor(1.3690, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.4706, device='cuda:0') reward: tensor(1.3697, device='cuda:0') factor: tensor(0.1009, device='cuda:0')


Epoch 1, Step 1218: Loss = -0.0410, Actor Loss = -0.0753, Critic Loss = 0.0686, Entropy = 0.0505, Returns = 1.3598, Value = 1.2893, mask_loss = 0.0992std_loss = 0.2680:  22%|██▏       | 1218/5475 [5:27:59<18:37:04, 15.74s/it]

ratio tensor(1.0260, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4754, device='cuda:0') reward: tensor(1.3768, device='cuda:0') factor: tensor(0.0986, device='cuda:0')
reward_raw: tensor(0.4898, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1014, device='cuda:0')
reward_raw: tensor(0.4715, device='cuda:0') reward: tensor(1.3713, device='cuda:0') factor: tensor(0.1002, device='cuda:0')
reward_raw: tensor(0.4718, device='cuda:0') reward: tensor(1.3733, device='cuda:0') factor: tensor(0.0985, device='cuda:0')
reward_raw: tensor(0.4772, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.1009, device='cuda:0')


Epoch 1, Step 1219: Loss = -0.0417, Actor Loss = -0.0832, Critic Loss = 0.0830, Entropy = 0.0407, Returns = 1.3402, Value = 1.4146, mask_loss = 0.0947std_loss = 0.2660:  22%|██▏       | 1219/5475 [5:28:20<20:24:23, 17.26s/it]

ratio tensor(1.2515, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3994, device='cuda:0') factor: tensor(0.1004, device='cuda:0')
reward_raw: tensor(0.5024, device='cuda:0') reward: tensor(1.4055, device='cuda:0') factor: tensor(0.0969, device='cuda:0')
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.4134, device='cuda:0') factor: tensor(0.0981, device='cuda:0')
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.4061, device='cuda:0') factor: tensor(0.0968, device='cuda:0')
reward_raw: tensor(0.5073, device='cuda:0') reward: tensor(1.4118, device='cuda:0') factor: tensor(0.0955, device='cuda:0')


Epoch 1, Step 1220: Loss = 0.0417, Actor Loss = -0.0006, Critic Loss = 0.0847, Entropy = 0.0463, Returns = 1.4002, Value = 1.3996, mask_loss = 0.0835std_loss = 0.2393:  22%|██▏       | 1220/5475 [5:28:34<19:32:56, 16.54s/it] 

ratio tensor(0.7836, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4527, device='cuda:0') reward: tensor(1.3269, device='cuda:0') factor: tensor(0.1258, device='cuda:0')
reward_raw: tensor(0.4347, device='cuda:0') reward: tensor(1.3105, device='cuda:0') factor: tensor(0.1242, device='cuda:0')
reward_raw: tensor(0.4433, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.1278, device='cuda:0')
reward_raw: tensor(0.4520, device='cuda:0') reward: tensor(1.3276, device='cuda:0') factor: tensor(0.1244, device='cuda:0')
reward_raw: tensor(0.4447, device='cuda:0') reward: tensor(1.3171, device='cuda:0') factor: tensor(0.1276, device='cuda:0')


Epoch 1, Step 1221: Loss = 0.1159, Actor Loss = 0.0859, Critic Loss = 0.0602, Entropy = 0.0571, Returns = 1.2811, Value = 1.2706, mask_loss = 0.1339std_loss = 0.3040:  22%|██▏       | 1220/5475 [5:28:47<19:32:56, 16.54s/it] 

ratio tensor(0.8978, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1221: Loss = 0.1159, Actor Loss = 0.0859, Critic Loss = 0.0602, Entropy = 0.0571, Returns = 1.2811, Value = 1.2706, mask_loss = 0.1339std_loss = 0.3040:  22%|██▏       | 1221/5475 [5:28:47<18:10:44, 15.38s/it]

reward_raw: tensor(0.5132, device='cuda:0') reward: tensor(1.4352, device='cuda:0') factor: tensor(0.0780, device='cuda:0')
reward_raw: tensor(0.5205, device='cuda:0') reward: tensor(1.4414, device='cuda:0') factor: tensor(0.0791, device='cuda:0')
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.4330, device='cuda:0') factor: tensor(0.0803, device='cuda:0')
reward_raw: tensor(0.5193, device='cuda:0') reward: tensor(1.4362, device='cuda:0') factor: tensor(0.0831, device='cuda:0')
reward_raw: tensor(0.5152, device='cuda:0') reward: tensor(1.4322, device='cuda:0') factor: tensor(0.0830, device='cuda:0')


Epoch 1, Step 1222: Loss = -0.1045, Actor Loss = -0.1528, Critic Loss = 0.0966, Entropy = 0.0380, Returns = 1.4198, Value = 1.5572, mask_loss = 0.0867std_loss = 0.2581:  22%|██▏       | 1222/5475 [5:29:02<18:10:56, 15.39s/it]

ratio tensor(1.0947, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5024, device='cuda:0') reward: tensor(1.3923, device='cuda:0') factor: tensor(0.1101, device='cuda:0')
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.3726, device='cuda:0') factor: tensor(0.1022, device='cuda:0')
reward_raw: tensor(0.4876, device='cuda:0') reward: tensor(1.3766, device='cuda:0') factor: tensor(0.1110, device='cuda:0')
reward_raw: tensor(0.4984, device='cuda:0') reward: tensor(1.3901, device='cuda:0') factor: tensor(0.1083, device='cuda:0')
reward_raw: tensor(0.4471, device='cuda:0') reward: tensor(1.3403, device='cuda:0') factor: tensor(0.1068, device='cuda:0')


Epoch 1, Step 1223: Loss = 0.2218, Actor Loss = 0.1640, Critic Loss = 0.1156, Entropy = 0.0404, Returns = 1.4315, Value = 1.4752, mask_loss = 0.0910std_loss = 0.2539:  22%|██▏       | 1223/5475 [5:29:18<18:12:56, 15.42s/it]  

ratio tensor(0.8423, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4645, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.1350, device='cuda:0')
reward_raw: tensor(0.4797, device='cuda:0') reward: tensor(1.3466, device='cuda:0') factor: tensor(0.1331, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.3597, device='cuda:0') factor: tensor(0.1313, device='cuda:0')
reward_raw: tensor(0.4913, device='cuda:0') reward: tensor(1.3475, device='cuda:0') factor: tensor(0.1438, device='cuda:0')
reward_raw: tensor(0.4804, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.1373, device='cuda:0')


Epoch 1, Step 1224: Loss = -0.0585, Actor Loss = -0.0889, Critic Loss = 0.0611, Entropy = 0.0716, Returns = 1.3682, Value = 1.2637, mask_loss = 0.1343std_loss = 0.3024:  22%|██▏       | 1224/5475 [5:29:33<18:08:37, 15.37s/it]

ratio tensor(1.1762, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4926, device='cuda:0') reward: tensor(1.3889, device='cuda:0') factor: tensor(0.1038, device='cuda:0')
reward_raw: tensor(0.5062, device='cuda:0') reward: tensor(1.4041, device='cuda:0') factor: tensor(0.1021, device='cuda:0')
reward_raw: tensor(0.4967, device='cuda:0') reward: tensor(1.3986, device='cuda:0') factor: tensor(0.0980, device='cuda:0')
reward_raw: tensor(0.5214, device='cuda:0') reward: tensor(1.4239, device='cuda:0') factor: tensor(0.0975, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.4179, device='cuda:0') factor: tensor(0.1029, device='cuda:0')


Epoch 1, Step 1225: Loss = -0.0750, Actor Loss = -0.1032, Critic Loss = 0.0566, Entropy = 0.0698, Returns = 1.4277, Value = 1.4144, mask_loss = 0.1237std_loss = 0.2858:  22%|██▏       | 1225/5475 [5:29:45<16:55:18, 14.33s/it]

ratio tensor(1.0588, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5092, device='cuda:0') reward: tensor(1.3962, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.3943, device='cuda:0') factor: tensor(0.1158, device='cuda:0')
reward_raw: tensor(0.5030, device='cuda:0') reward: tensor(1.3909, device='cuda:0') factor: tensor(0.1121, device='cuda:0')
reward_raw: tensor(0.5188, device='cuda:0') reward: tensor(1.4022, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.5249, device='cuda:0') reward: tensor(1.4048, device='cuda:0') factor: tensor(0.1201, device='cuda:0')


Epoch 1, Step 1226: Loss = 0.0585, Actor Loss = 0.0025, Critic Loss = 0.1122, Entropy = 0.0569, Returns = 1.3668, Value = 1.3291, mask_loss = 0.1428std_loss = 0.3189:  22%|██▏       | 1226/5475 [5:30:00<17:13:39, 14.60s/it]  

ratio tensor(1.0310, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4204, device='cuda:0') reward: tensor(1.3346, device='cuda:0') factor: tensor(0.0858, device='cuda:0')
reward_raw: tensor(0.4206, device='cuda:0') reward: tensor(1.3298, device='cuda:0') factor: tensor(0.0908, device='cuda:0')
reward_raw: tensor(0.4105, device='cuda:0') reward: tensor(1.3178, device='cuda:0') factor: tensor(0.0927, device='cuda:0')
reward_raw: tensor(0.4245, device='cuda:0') reward: tensor(1.3333, device='cuda:0') factor: tensor(0.0912, device='cuda:0')
reward_raw: tensor(0.4159, device='cuda:0') reward: tensor(1.3232, device='cuda:0') factor: tensor(0.0927, device='cuda:0')


Epoch 1, Step 1227: Loss = 0.1699, Actor Loss = 0.1369, Critic Loss = 0.0660, Entropy = 0.0426, Returns = 1.3463, Value = 1.4083, mask_loss = 0.0890std_loss = 0.2534:  22%|██▏       | 1227/5475 [5:30:16<17:43:34, 15.02s/it]

ratio tensor(0.7602, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4511, device='cuda:0') reward: tensor(1.3577, device='cuda:0') factor: tensor(0.0934, device='cuda:0')
reward_raw: tensor(0.4559, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.0885, device='cuda:0')
reward_raw: tensor(0.4452, device='cuda:0') reward: tensor(1.3564, device='cuda:0') factor: tensor(0.0888, device='cuda:0')
reward_raw: tensor(0.4655, device='cuda:0') reward: tensor(1.3770, device='cuda:0') factor: tensor(0.0886, device='cuda:0')
reward_raw: tensor(0.4368, device='cuda:0') reward: tensor(1.3474, device='cuda:0') factor: tensor(0.0895, device='cuda:0')


Epoch 1, Step 1228: Loss = 0.0467, Actor Loss = 0.0033, Critic Loss = 0.0868, Entropy = 0.0364, Returns = 1.3468, Value = 1.3933, mask_loss = 0.0903std_loss = 0.2637:  22%|██▏       | 1228/5475 [5:30:31<17:45:12, 15.05s/it]

ratio tensor(1.1174, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5361, device='cuda:0') reward: tensor(1.4244, device='cuda:0') factor: tensor(0.1117, device='cuda:0')
reward_raw: tensor(0.5464, device='cuda:0') reward: tensor(1.4349, device='cuda:0') factor: tensor(0.1116, device='cuda:0')
reward_raw: tensor(0.5383, device='cuda:0') reward: tensor(1.4271, device='cuda:0') factor: tensor(0.1112, device='cuda:0')
reward_raw: tensor(0.5367, device='cuda:0') reward: tensor(1.4266, device='cuda:0') factor: tensor(0.1102, device='cuda:0')
reward_raw: tensor(0.5616, device='cuda:0') reward: tensor(1.4551, device='cuda:0') factor: tensor(0.1065, device='cuda:0')


Epoch 1, Step 1229: Loss = 0.0486, Actor Loss = 0.0166, Critic Loss = 0.0641, Entropy = 0.0603, Returns = 1.3609, Value = 1.5256, mask_loss = 0.1118std_loss = 0.2751:  22%|██▏       | 1229/5475 [5:30:50<19:02:59, 16.15s/it]

ratio tensor(0.9743, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4717, device='cuda:0') reward: tensor(1.3647, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.4484, device='cuda:0') reward: tensor(1.3374, device='cuda:0') factor: tensor(0.1110, device='cuda:0')
reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3596, device='cuda:0') factor: tensor(0.1080, device='cuda:0')
reward_raw: tensor(0.4573, device='cuda:0') reward: tensor(1.3499, device='cuda:0') factor: tensor(0.1075, device='cuda:0')
reward_raw: tensor(0.4647, device='cuda:0') reward: tensor(1.3569, device='cuda:0') factor: tensor(0.1079, device='cuda:0')


Epoch 1, Step 1230: Loss = 0.1097, Actor Loss = 0.0848, Critic Loss = 0.0499, Entropy = 0.0377, Returns = 1.3311, Value = 1.4580, mask_loss = 0.0950std_loss = 0.2728:  22%|██▏       | 1230/5475 [5:31:09<19:54:54, 16.89s/it]

ratio tensor(0.9382, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4267, device='cuda:0') reward: tensor(1.3246, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.4120, device='cuda:0') reward: tensor(1.3053, device='cuda:0') factor: tensor(0.1068, device='cuda:0')
reward_raw: tensor(0.3781, device='cuda:0') reward: tensor(1.2770, device='cuda:0') factor: tensor(0.1011, device='cuda:0')
reward_raw: tensor(0.3974, device='cuda:0') reward: tensor(1.2933, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.3937, device='cuda:0') reward: tensor(1.2839, device='cuda:0') factor: tensor(0.1098, device='cuda:0')


Epoch 1, Step 1231: Loss = 0.0816, Actor Loss = 0.0430, Critic Loss = 0.0774, Entropy = 0.0688, Returns = 1.3446, Value = 1.1639, mask_loss = 0.1225std_loss = 0.2808:  22%|██▏       | 1231/5475 [5:31:25<19:29:56, 16.54s/it]

ratio tensor(1.1751, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.4157, device='cuda:0') factor: tensor(0.1151, device='cuda:0')
reward_raw: tensor(0.5283, device='cuda:0') reward: tensor(1.4177, device='cuda:0') factor: tensor(0.1105, device='cuda:0')
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.3822, device='cuda:0') factor: tensor(0.1177, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.3984, device='cuda:0') factor: tensor(0.1144, device='cuda:0')
reward_raw: tensor(0.4901, device='cuda:0') reward: tensor(1.3785, device='cuda:0') factor: tensor(0.1116, device='cuda:0')


Epoch 1, Step 1232: Loss = -0.0730, Actor Loss = -0.1316, Critic Loss = 0.1174, Entropy = 0.0546, Returns = 1.4248, Value = 1.4692, mask_loss = 0.1317std_loss = 0.3008:  23%|██▎       | 1232/5475 [5:31:40<19:14:31, 16.33s/it]

ratio tensor(1.2888, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.4139, device='cuda:0') factor: tensor(0.1012, device='cuda:0')
reward_raw: tensor(0.5282, device='cuda:0') reward: tensor(1.4271, device='cuda:0') factor: tensor(0.1011, device='cuda:0')
reward_raw: tensor(0.5300, device='cuda:0') reward: tensor(1.4266, device='cuda:0') factor: tensor(0.1033, device='cuda:0')
reward_raw: tensor(0.5271, device='cuda:0') reward: tensor(1.4286, device='cuda:0') factor: tensor(0.0985, device='cuda:0')
reward_raw: tensor(0.5015, device='cuda:0') reward: tensor(1.4032, device='cuda:0') factor: tensor(0.0984, device='cuda:0')


Epoch 1, Step 1233: Loss = 0.0143, Actor Loss = -0.0013, Critic Loss = 0.0313, Entropy = 0.0546, Returns = 1.4647, Value = 1.4903, mask_loss = 0.1037std_loss = 0.2714:  23%|██▎       | 1233/5475 [5:31:56<19:03:58, 16.18s/it] 

ratio tensor(1.1286, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.4258, device='cuda:0') factor: tensor(0.0976, device='cuda:0')
reward_raw: tensor(0.5124, device='cuda:0') reward: tensor(1.4126, device='cuda:0') factor: tensor(0.0999, device='cuda:0')
reward_raw: tensor(0.5304, device='cuda:0') reward: tensor(1.4310, device='cuda:0') factor: tensor(0.0994, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.4106, device='cuda:0') factor: tensor(0.0974, device='cuda:0')
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.4211, device='cuda:0') factor: tensor(0.0925, device='cuda:0')


Epoch 1, Step 1234: Loss = 0.2023, Actor Loss = 0.1637, Critic Loss = 0.0773, Entropy = 0.0484, Returns = 1.4294, Value = 1.4458, mask_loss = 0.0992std_loss = 0.2707:  23%|██▎       | 1234/5475 [5:32:13<19:15:06, 16.34s/it] 

ratio tensor(1.1039, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5290, device='cuda:0') reward: tensor(1.4176, device='cuda:0') factor: tensor(0.1114, device='cuda:0')
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.4058, device='cuda:0') factor: tensor(0.1113, device='cuda:0')
reward_raw: tensor(0.5304, device='cuda:0') reward: tensor(1.4196, device='cuda:0') factor: tensor(0.1108, device='cuda:0')
reward_raw: tensor(0.5324, device='cuda:0') reward: tensor(1.4185, device='cuda:0') factor: tensor(0.1139, device='cuda:0')
reward_raw: tensor(0.5286, device='cuda:0') reward: tensor(1.4127, device='cuda:0') factor: tensor(0.1158, device='cuda:0')


Epoch 1, Step 1235: Loss = 0.0012, Actor Loss = -0.0364, Critic Loss = 0.0753, Entropy = 0.0450, Returns = 1.4435, Value = 1.5363, mask_loss = 0.0979std_loss = 0.2689:  23%|██▎       | 1235/5475 [5:32:25<17:40:58, 15.01s/it]

ratio tensor(1.0837, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5006, device='cuda:0') reward: tensor(1.3852, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.4995, device='cuda:0') reward: tensor(1.3822, device='cuda:0') factor: tensor(0.1173, device='cuda:0')
reward_raw: tensor(0.4838, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.1219, device='cuda:0')
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.3823, device='cuda:0') factor: tensor(0.1178, device='cuda:0')
reward_raw: tensor(0.5039, device='cuda:0') reward: tensor(1.3908, device='cuda:0') factor: tensor(0.1131, device='cuda:0')


Epoch 1, Step 1236: Loss = 0.0336, Actor Loss = 0.0124, Critic Loss = 0.0424, Entropy = 0.0546, Returns = 1.3532, Value = 1.2848, mask_loss = 0.1111std_loss = 0.2800:  23%|██▎       | 1236/5475 [5:32:40<17:48:07, 15.12s/it] 

ratio tensor(0.9245, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4824, device='cuda:0') reward: tensor(1.3794, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3588, device='cuda:0') factor: tensor(0.1026, device='cuda:0')
reward_raw: tensor(0.4584, device='cuda:0') reward: tensor(1.3555, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.4441, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.4726, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.1068, device='cuda:0')


Epoch 1, Step 1237: Loss = -0.0044, Actor Loss = -0.0458, Critic Loss = 0.0830, Entropy = 0.0379, Returns = 1.3493, Value = 1.4990, mask_loss = 0.0893std_loss = 0.2587:  23%|██▎       | 1237/5475 [5:32:56<18:11:56, 15.46s/it]

ratio tensor(0.8517, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5915, device='cuda:0') reward: tensor(1.4892, device='cuda:0') factor: tensor(0.1023, device='cuda:0')
reward_raw: tensor(0.5587, device='cuda:0') reward: tensor(1.4578, device='cuda:0') factor: tensor(0.1009, device='cuda:0')
reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.4526, device='cuda:0') factor: tensor(0.0970, device='cuda:0')
reward_raw: tensor(0.5610, device='cuda:0') reward: tensor(1.4608, device='cuda:0') factor: tensor(0.1002, device='cuda:0')
reward_raw: tensor(0.5764, device='cuda:0') reward: tensor(1.4773, device='cuda:0') factor: tensor(0.0991, device='cuda:0')


Epoch 1, Step 1238: Loss = 0.0085, Actor Loss = -0.0255, Critic Loss = 0.0681, Entropy = 0.0499, Returns = 1.4966, Value = 1.4836, mask_loss = 0.1043std_loss = 0.2753:  23%|██▎       | 1238/5475 [5:33:11<17:56:38, 15.25s/it] 

ratio tensor(0.9938, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5098, device='cuda:0') reward: tensor(1.3682, device='cuda:0') factor: tensor(0.1416, device='cuda:0')
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.3480, device='cuda:0') factor: tensor(0.1377, device='cuda:0')
reward_raw: tensor(0.4912, device='cuda:0') reward: tensor(1.3564, device='cuda:0') factor: tensor(0.1348, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.1388, device='cuda:0')
reward_raw: tensor(0.5190, device='cuda:0') reward: tensor(1.3796, device='cuda:0') factor: tensor(0.1394, device='cuda:0')


Epoch 1, Step 1239: Loss = 0.1261, Actor Loss = 0.0724, Critic Loss = 0.1076, Entropy = 0.0520, Returns = 1.4486, Value = 1.2703, mask_loss = 0.1282std_loss = 0.3060:  23%|██▎       | 1239/5475 [5:33:23<16:44:08, 14.22s/it] 

ratio tensor(0.8520, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4939, device='cuda:0') reward: tensor(1.3981, device='cuda:0') factor: tensor(0.0958, device='cuda:0')
reward_raw: tensor(0.4564, device='cuda:0') reward: tensor(1.3592, device='cuda:0') factor: tensor(0.0972, device='cuda:0')
reward_raw: tensor(0.4745, device='cuda:0') reward: tensor(1.3784, device='cuda:0') factor: tensor(0.0961, device='cuda:0')
reward_raw: tensor(0.4772, device='cuda:0') reward: tensor(1.3786, device='cuda:0') factor: tensor(0.0986, device='cuda:0')
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.4024, device='cuda:0') factor: tensor(0.0944, device='cuda:0')


Epoch 1, Step 1240: Loss = -0.0962, Actor Loss = -0.1295, Critic Loss = 0.0667, Entropy = 0.0571, Returns = 1.3906, Value = 1.4516, mask_loss = 0.1069std_loss = 0.2760:  23%|██▎       | 1240/5475 [5:33:36<16:12:05, 13.77s/it]

ratio tensor(0.9506, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5142, device='cuda:0') reward: tensor(1.3984, device='cuda:0') factor: tensor(0.1158, device='cuda:0')
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.1158, device='cuda:0')
reward_raw: tensor(0.5007, device='cuda:0') reward: tensor(1.3889, device='cuda:0') factor: tensor(0.1117, device='cuda:0')
reward_raw: tensor(0.5010, device='cuda:0') reward: tensor(1.3909, device='cuda:0') factor: tensor(0.1101, device='cuda:0')
reward_raw: tensor(0.5217, device='cuda:0') reward: tensor(1.4107, device='cuda:0') factor: tensor(0.1110, device='cuda:0')


Epoch 1, Step 1241: Loss = 0.0575, Actor Loss = -0.0117, Critic Loss = 0.1386, Entropy = 0.0560, Returns = 1.4229, Value = 1.5041, mask_loss = 0.1372std_loss = 0.3054:  23%|██▎       | 1240/5475 [5:33:48<16:12:05, 13.77s/it] 

ratio tensor(1.0824, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1241: Loss = 0.0575, Actor Loss = -0.0117, Critic Loss = 0.1386, Entropy = 0.0560, Returns = 1.4229, Value = 1.5041, mask_loss = 0.1372std_loss = 0.3054:  23%|██▎       | 1241/5475 [5:33:48<15:44:53, 13.39s/it]

reward_raw: tensor(0.4319, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.1076, device='cuda:0')
reward_raw: tensor(0.4330, device='cuda:0') reward: tensor(1.3221, device='cuda:0') factor: tensor(0.1109, device='cuda:0')
reward_raw: tensor(0.4212, device='cuda:0') reward: tensor(1.3152, device='cuda:0') factor: tensor(0.1060, device='cuda:0')
reward_raw: tensor(0.4200, device='cuda:0') reward: tensor(1.3124, device='cuda:0') factor: tensor(0.1076, device='cuda:0')
reward_raw: tensor(0.4369, device='cuda:0') reward: tensor(1.3259, device='cuda:0') factor: tensor(0.1111, device='cuda:0')


Epoch 1, Step 1242: Loss = 0.2704, Actor Loss = 0.2271, Critic Loss = 0.0867, Entropy = 0.0457, Returns = 1.3345, Value = 1.3362, mask_loss = 0.1000std_loss = 0.2698:  23%|██▎       | 1242/5475 [5:34:03<16:15:21, 13.83s/it] 

ratio tensor(0.6943, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4688, device='cuda:0') reward: tensor(1.3548, device='cuda:0') factor: tensor(0.1141, device='cuda:0')
reward_raw: tensor(0.4666, device='cuda:0') reward: tensor(1.3531, device='cuda:0') factor: tensor(0.1135, device='cuda:0')
reward_raw: tensor(0.4640, device='cuda:0') reward: tensor(1.3501, device='cuda:0') factor: tensor(0.1139, device='cuda:0')
reward_raw: tensor(0.4799, device='cuda:0') reward: tensor(1.3592, device='cuda:0') factor: tensor(0.1206, device='cuda:0')
reward_raw: tensor(0.4654, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.1135, device='cuda:0')


Epoch 1, Step 1243: Loss = -0.0688, Actor Loss = -0.1053, Critic Loss = 0.0731, Entropy = 0.0474, Returns = 1.3574, Value = 1.2824, mask_loss = 0.1198std_loss = 0.3003:  23%|██▎       | 1243/5475 [5:34:16<16:04:16, 13.67s/it]

ratio tensor(1.0049, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5474, device='cuda:0') reward: tensor(1.4253, device='cuda:0') factor: tensor(0.1222, device='cuda:0')
reward_raw: tensor(0.5493, device='cuda:0') reward: tensor(1.4241, device='cuda:0') factor: tensor(0.1252, device='cuda:0')
reward_raw: tensor(0.5257, device='cuda:0') reward: tensor(1.4060, device='cuda:0') factor: tensor(0.1197, device='cuda:0')
reward_raw: tensor(0.5390, device='cuda:0') reward: tensor(1.4159, device='cuda:0') factor: tensor(0.1231, device='cuda:0')
reward_raw: tensor(0.5465, device='cuda:0') reward: tensor(1.4205, device='cuda:0') factor: tensor(0.1260, device='cuda:0')


Epoch 1, Step 1244: Loss = -0.2330, Actor Loss = -0.2841, Critic Loss = 0.1024, Entropy = 0.0696, Returns = 1.4227, Value = 1.5343, mask_loss = 0.1270std_loss = 0.2914:  23%|██▎       | 1244/5475 [5:34:32<16:55:35, 14.40s/it]

ratio tensor(1.8168, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5018, device='cuda:0') reward: tensor(1.3906, device='cuda:0') factor: tensor(0.1111, device='cuda:0')
reward_raw: tensor(0.5498, device='cuda:0') reward: tensor(1.4344, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.5348, device='cuda:0') reward: tensor(1.4246, device='cuda:0') factor: tensor(0.1102, device='cuda:0')
reward_raw: tensor(0.5461, device='cuda:0') reward: tensor(1.4323, device='cuda:0') factor: tensor(0.1138, device='cuda:0')
reward_raw: tensor(0.5426, device='cuda:0') reward: tensor(1.4302, device='cuda:0') factor: tensor(0.1124, device='cuda:0')


Epoch 1, Step 1245: Loss = 0.1380, Actor Loss = 0.1093, Critic Loss = 0.0575, Entropy = 0.0492, Returns = 1.4412, Value = 1.3594, mask_loss = 0.0994std_loss = 0.2611:  23%|██▎       | 1245/5475 [5:34:48<17:09:16, 14.60s/it]  

ratio tensor(0.7221, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5137, device='cuda:0') reward: tensor(1.4262, device='cuda:0') factor: tensor(0.0875, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.4080, device='cuda:0') factor: tensor(0.0863, device='cuda:0')
reward_raw: tensor(0.5001, device='cuda:0') reward: tensor(1.4154, device='cuda:0') factor: tensor(0.0848, device='cuda:0')
reward_raw: tensor(0.5108, device='cuda:0') reward: tensor(1.4269, device='cuda:0') factor: tensor(0.0839, device='cuda:0')
reward_raw: tensor(0.5154, device='cuda:0') reward: tensor(1.4277, device='cuda:0') factor: tensor(0.0877, device='cuda:0')


Epoch 1, Step 1246: Loss = -0.0386, Actor Loss = -0.0581, Critic Loss = 0.0392, Entropy = 0.0422, Returns = 1.3891, Value = 1.3967, mask_loss = 0.0907std_loss = 0.2616:  23%|██▎       | 1246/5475 [5:35:05<18:19:32, 15.60s/it]

ratio tensor(1.3844, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4446, device='cuda:0') reward: tensor(1.3222, device='cuda:0') factor: tensor(0.1224, device='cuda:0')
reward_raw: tensor(0.4323, device='cuda:0') reward: tensor(1.3132, device='cuda:0') factor: tensor(0.1191, device='cuda:0')
reward_raw: tensor(0.4423, device='cuda:0') reward: tensor(1.3233, device='cuda:0') factor: tensor(0.1190, device='cuda:0')
reward_raw: tensor(0.4346, device='cuda:0') reward: tensor(1.3132, device='cuda:0') factor: tensor(0.1213, device='cuda:0')
reward_raw: tensor(0.4361, device='cuda:0') reward: tensor(1.3138, device='cuda:0') factor: tensor(0.1222, device='cuda:0')


Epoch 1, Step 1247: Loss = 0.0880, Actor Loss = 0.0589, Critic Loss = 0.0585, Entropy = 0.0478, Returns = 1.3162, Value = 1.3437, mask_loss = 0.1116std_loss = 0.2844:  23%|██▎       | 1247/5475 [5:35:18<17:18:18, 14.73s/it]  

ratio tensor(1.0089, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4799, device='cuda:0') reward: tensor(1.3847, device='cuda:0') factor: tensor(0.0952, device='cuda:0')
reward_raw: tensor(0.4900, device='cuda:0') reward: tensor(1.3956, device='cuda:0') factor: tensor(0.0944, device='cuda:0')
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.4161, device='cuda:0') factor: tensor(0.0949, device='cuda:0')
reward_raw: tensor(0.5211, device='cuda:0') reward: tensor(1.4211, device='cuda:0') factor: tensor(0.1000, device='cuda:0')
reward_raw: tensor(0.4669, device='cuda:0') reward: tensor(1.3725, device='cuda:0') factor: tensor(0.0944, device='cuda:0')


Epoch 1, Step 1248: Loss = -0.0905, Actor Loss = -0.1221, Critic Loss = 0.0632, Entropy = 0.0521, Returns = 1.4342, Value = 1.3435, mask_loss = 0.0986std_loss = 0.2668:  23%|██▎       | 1248/5475 [5:35:36<18:15:14, 15.55s/it]

ratio tensor(1.1080, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5070, device='cuda:0') reward: tensor(1.4040, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.4293, device='cuda:0') factor: tensor(0.1052, device='cuda:0')
reward_raw: tensor(0.5146, device='cuda:0') reward: tensor(1.4102, device='cuda:0') factor: tensor(0.1044, device='cuda:0')
reward_raw: tensor(0.5095, device='cuda:0') reward: tensor(1.4029, device='cuda:0') factor: tensor(0.1066, device='cuda:0')
reward_raw: tensor(0.5232, device='cuda:0') reward: tensor(1.4197, device='cuda:0') factor: tensor(0.1035, device='cuda:0')


Epoch 1, Step 1249: Loss = -0.0061, Actor Loss = -0.0323, Critic Loss = 0.0525, Entropy = 0.0440, Returns = 1.5105, Value = 1.4362, mask_loss = 0.0999std_loss = 0.2679:  23%|██▎       | 1249/5475 [5:35:53<18:43:09, 15.95s/it]

ratio tensor(0.9248, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4771, device='cuda:0') reward: tensor(1.3642, device='cuda:0') factor: tensor(0.1129, device='cuda:0')
reward_raw: tensor(0.4683, device='cuda:0') reward: tensor(1.3584, device='cuda:0') factor: tensor(0.1099, device='cuda:0')
reward_raw: tensor(0.4748, device='cuda:0') reward: tensor(1.3624, device='cuda:0') factor: tensor(0.1124, device='cuda:0')
reward_raw: tensor(0.4561, device='cuda:0') reward: tensor(1.3509, device='cuda:0') factor: tensor(0.1052, device='cuda:0')
reward_raw: tensor(0.4701, device='cuda:0') reward: tensor(1.3568, device='cuda:0') factor: tensor(0.1133, device='cuda:0')


Epoch 1, Step 1250: Loss = -0.0220, Actor Loss = -0.0543, Critic Loss = 0.0647, Entropy = 0.0401, Returns = 1.3458, Value = 1.3400, mask_loss = 0.0907std_loss = 0.2593:  23%|██▎       | 1250/5475 [5:36:10<19:19:51, 16.47s/it]

ratio tensor(1.0700, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4951, device='cuda:0') reward: tensor(1.3914, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.5004, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.1025, device='cuda:0')
reward_raw: tensor(0.5159, device='cuda:0') reward: tensor(1.4130, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.3972, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.4119, device='cuda:0') factor: tensor(0.1037, device='cuda:0')


Epoch 1, Step 1251: Loss = -0.0621, Actor Loss = -0.0958, Critic Loss = 0.0675, Entropy = 0.0483, Returns = 1.3362, Value = 1.4107, mask_loss = 0.1114std_loss = 0.2837:  23%|██▎       | 1251/5475 [5:36:27<19:21:45, 16.50s/it]

ratio tensor(1.1951, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4612, device='cuda:0') reward: tensor(1.3418, device='cuda:0') factor: tensor(0.1193, device='cuda:0')
reward_raw: tensor(0.4540, device='cuda:0') reward: tensor(1.3405, device='cuda:0') factor: tensor(0.1134, device='cuda:0')
reward_raw: tensor(0.4699, device='cuda:0') reward: tensor(1.3509, device='cuda:0') factor: tensor(0.1190, device='cuda:0')
reward_raw: tensor(0.4584, device='cuda:0') reward: tensor(1.3384, device='cuda:0') factor: tensor(0.1200, device='cuda:0')
reward_raw: tensor(0.4317, device='cuda:0') reward: tensor(1.3124, device='cuda:0') factor: tensor(0.1193, device='cuda:0')


Epoch 1, Step 1252: Loss = 0.2010, Actor Loss = 0.1633, Critic Loss = 0.0754, Entropy = 0.0441, Returns = 1.3990, Value = 1.3574, mask_loss = 0.1106std_loss = 0.2858:  23%|██▎       | 1252/5475 [5:36:42<18:50:16, 16.06s/it]  

ratio tensor(0.8829, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5749, device='cuda:0') reward: tensor(1.4849, device='cuda:0') factor: tensor(0.0900, device='cuda:0')
reward_raw: tensor(0.5640, device='cuda:0') reward: tensor(1.4759, device='cuda:0') factor: tensor(0.0880, device='cuda:0')
reward_raw: tensor(0.5800, device='cuda:0') reward: tensor(1.4882, device='cuda:0') factor: tensor(0.0918, device='cuda:0')
reward_raw: tensor(0.5566, device='cuda:0') reward: tensor(1.4659, device='cuda:0') factor: tensor(0.0907, device='cuda:0')
reward_raw: tensor(0.5695, device='cuda:0') reward: tensor(1.4810, device='cuda:0') factor: tensor(0.0885, device='cuda:0')


Epoch 1, Step 1253: Loss = -0.2607, Actor Loss = -0.3141, Critic Loss = 0.1069, Entropy = 0.0309, Returns = 1.5052, Value = 1.4754, mask_loss = 0.1117std_loss = 0.2912:  23%|██▎       | 1253/5475 [5:37:00<19:38:19, 16.75s/it]

ratio tensor(1.1108, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4563, device='cuda:0') reward: tensor(1.3460, device='cuda:0') factor: tensor(0.1104, device='cuda:0')
reward_raw: tensor(0.4369, device='cuda:0') reward: tensor(1.3256, device='cuda:0') factor: tensor(0.1114, device='cuda:0')
reward_raw: tensor(0.4064, device='cuda:0') reward: tensor(1.2977, device='cuda:0') factor: tensor(0.1087, device='cuda:0')
reward_raw: tensor(0.4384, device='cuda:0') reward: tensor(1.3277, device='cuda:0') factor: tensor(0.1107, device='cuda:0')
reward_raw: tensor(0.4135, device='cuda:0') reward: tensor(1.3053, device='cuda:0') factor: tensor(0.1082, device='cuda:0')


Epoch 1, Step 1254: Loss = 0.1298, Actor Loss = 0.1009, Critic Loss = 0.0579, Entropy = 0.0519, Returns = 1.3834, Value = 1.2741, mask_loss = 0.1269std_loss = 0.2905:  23%|██▎       | 1254/5475 [5:37:13<18:23:00, 15.68s/it]  

ratio tensor(0.7953, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4652, device='cuda:0') reward: tensor(1.3661, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.4747, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.1002, device='cuda:0')
reward_raw: tensor(0.4620, device='cuda:0') reward: tensor(1.3629, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.4799, device='cuda:0') reward: tensor(1.3779, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.4668, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.0989, device='cuda:0')


Epoch 1, Step 1255: Loss = -0.0902, Actor Loss = -0.1109, Critic Loss = 0.0415, Entropy = 0.0457, Returns = 1.3990, Value = 1.3253, mask_loss = 0.1116std_loss = 0.2861:  23%|██▎       | 1255/5475 [5:37:28<17:58:23, 15.33s/it]

ratio tensor(1.4705, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4466, device='cuda:0') reward: tensor(1.3438, device='cuda:0') factor: tensor(0.1028, device='cuda:0')
reward_raw: tensor(0.4419, device='cuda:0') reward: tensor(1.3371, device='cuda:0') factor: tensor(0.1049, device='cuda:0')
reward_raw: tensor(0.4456, device='cuda:0') reward: tensor(1.3460, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.4636, device='cuda:0') reward: tensor(1.3612, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.4466, device='cuda:0') reward: tensor(1.3432, device='cuda:0') factor: tensor(0.1033, device='cuda:0')


Epoch 1, Step 1256: Loss = 0.0901, Actor Loss = 0.0549, Critic Loss = 0.0705, Entropy = 0.0425, Returns = 1.4018, Value = 1.3419, mask_loss = 0.1010std_loss = 0.2743:  23%|██▎       | 1256/5475 [5:37:44<18:15:41, 15.58s/it]  

ratio tensor(1.0041, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4191, device='cuda:0') reward: tensor(1.3230, device='cuda:0') factor: tensor(0.0962, device='cuda:0')
reward_raw: tensor(0.4171, device='cuda:0') reward: tensor(1.3226, device='cuda:0') factor: tensor(0.0945, device='cuda:0')
reward_raw: tensor(0.4139, device='cuda:0') reward: tensor(1.3221, device='cuda:0') factor: tensor(0.0919, device='cuda:0')
reward_raw: tensor(0.4127, device='cuda:0') reward: tensor(1.3222, device='cuda:0') factor: tensor(0.0906, device='cuda:0')
reward_raw: tensor(0.4208, device='cuda:0') reward: tensor(1.3254, device='cuda:0') factor: tensor(0.0954, device='cuda:0')


Epoch 1, Step 1257: Loss = 0.2185, Actor Loss = 0.1434, Critic Loss = 0.1501, Entropy = 0.0421, Returns = 1.3017, Value = 1.4093, mask_loss = 0.1069std_loss = 0.2829:  23%|██▎       | 1257/5475 [5:38:04<19:55:40, 17.01s/it]

ratio tensor(1.1489, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4858, device='cuda:0') reward: tensor(1.3735, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.4843, device='cuda:0') reward: tensor(1.3718, device='cuda:0') factor: tensor(0.1125, device='cuda:0')
reward_raw: tensor(0.4825, device='cuda:0') reward: tensor(1.3808, device='cuda:0') factor: tensor(0.1017, device='cuda:0')
reward_raw: tensor(0.4737, device='cuda:0') reward: tensor(1.3653, device='cuda:0') factor: tensor(0.1083, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1044, device='cuda:0')


Epoch 1, Step 1258: Loss = -0.2394, Actor Loss = -0.2663, Critic Loss = 0.0540, Entropy = 0.0463, Returns = 1.3647, Value = 1.3793, mask_loss = 0.1178std_loss = 0.2923:  23%|██▎       | 1258/5475 [5:38:20<19:16:05, 16.45s/it]

ratio tensor(1.0777, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5004, device='cuda:0') reward: tensor(1.3809, device='cuda:0') factor: tensor(0.1195, device='cuda:0')
reward_raw: tensor(0.5031, device='cuda:0') reward: tensor(1.3790, device='cuda:0') factor: tensor(0.1241, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.1222, device='cuda:0')
reward_raw: tensor(0.5131, device='cuda:0') reward: tensor(1.3867, device='cuda:0') factor: tensor(0.1264, device='cuda:0')
reward_raw: tensor(0.5166, device='cuda:0') reward: tensor(1.3944, device='cuda:0') factor: tensor(0.1221, device='cuda:0')


Epoch 1, Step 1259: Loss = 0.2215, Actor Loss = 0.1653, Critic Loss = 0.1125, Entropy = 0.0435, Returns = 1.3838, Value = 1.5086, mask_loss = 0.1106std_loss = 0.2842:  23%|██▎       | 1259/5475 [5:38:33<18:04:57, 15.44s/it]  

ratio tensor(0.7403, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3826, device='cuda:0') reward: tensor(1.2695, device='cuda:0') factor: tensor(0.1132, device='cuda:0')
reward_raw: tensor(0.3828, device='cuda:0') reward: tensor(1.2674, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.3980, device='cuda:0') reward: tensor(1.2808, device='cuda:0') factor: tensor(0.1172, device='cuda:0')
reward_raw: tensor(0.3990, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.1137, device='cuda:0')
reward_raw: tensor(0.4002, device='cuda:0') reward: tensor(1.2862, device='cuda:0') factor: tensor(0.1140, device='cuda:0')


Epoch 1, Step 1260: Loss = 0.1797, Actor Loss = 0.1210, Critic Loss = 0.1175, Entropy = 0.0349, Returns = 1.2596, Value = 1.1364, mask_loss = 0.1003std_loss = 0.2797:  23%|██▎       | 1260/5475 [5:38:45<17:08:41, 14.64s/it]

ratio tensor(0.9175, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5953, device='cuda:0') reward: tensor(1.4766, device='cuda:0') factor: tensor(0.1187, device='cuda:0')
reward_raw: tensor(0.5925, device='cuda:0') reward: tensor(1.4709, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.6138, device='cuda:0') reward: tensor(1.4946, device='cuda:0') factor: tensor(0.1192, device='cuda:0')
reward_raw: tensor(0.6005, device='cuda:0') reward: tensor(1.4753, device='cuda:0') factor: tensor(0.1252, device='cuda:0')
reward_raw: tensor(0.5939, device='cuda:0') reward: tensor(1.4748, device='cuda:0') factor: tensor(0.1191, device='cuda:0')


Epoch 1, Step 1261: Loss = -0.2840, Actor Loss = -0.3056, Critic Loss = 0.0434, Entropy = 0.0506, Returns = 1.4555, Value = 1.6021, mask_loss = 0.1307std_loss = 0.3030:  23%|██▎       | 1260/5475 [5:39:00<17:08:41, 14.64s/it]

ratio tensor(1.7817, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1261: Loss = -0.2840, Actor Loss = -0.3056, Critic Loss = 0.0434, Entropy = 0.0506, Returns = 1.4555, Value = 1.6021, mask_loss = 0.1307std_loss = 0.3030:  23%|██▎       | 1261/5475 [5:39:01<17:21:23, 14.83s/it]

reward_raw: tensor(0.5231, device='cuda:0') reward: tensor(1.4118, device='cuda:0') factor: tensor(0.1113, device='cuda:0')
reward_raw: tensor(0.5277, device='cuda:0') reward: tensor(1.4177, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.5167, device='cuda:0') reward: tensor(1.4045, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.5168, device='cuda:0') reward: tensor(1.4049, device='cuda:0') factor: tensor(0.1119, device='cuda:0')
reward_raw: tensor(0.5144, device='cuda:0') reward: tensor(1.4049, device='cuda:0') factor: tensor(0.1096, device='cuda:0')


Epoch 1, Step 1262: Loss = 0.2070, Actor Loss = 0.1946, Critic Loss = 0.0250, Entropy = 0.0436, Returns = 1.4043, Value = 1.4100, mask_loss = 0.1171std_loss = 0.2976:  23%|██▎       | 1262/5475 [5:39:16<17:30:21, 14.96s/it]  

ratio tensor(0.6669, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5691, device='cuda:0') reward: tensor(1.4614, device='cuda:0') factor: tensor(0.1077, device='cuda:0')
reward_raw: tensor(0.5418, device='cuda:0') reward: tensor(1.4332, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.5753, device='cuda:0') reward: tensor(1.4710, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.5746, device='cuda:0') reward: tensor(1.4707, device='cuda:0') factor: tensor(0.1039, device='cuda:0')
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.4567, device='cuda:0') factor: tensor(0.1025, device='cuda:0')


Epoch 1, Step 1263: Loss = 0.1057, Actor Loss = 0.0500, Critic Loss = 0.1115, Entropy = 0.0457, Returns = 1.3239, Value = 1.5004, mask_loss = 0.0944std_loss = 0.2653:  23%|██▎       | 1263/5475 [5:39:33<18:18:25, 15.65s/it]

ratio tensor(1.0076, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5411, device='cuda:0') reward: tensor(1.4343, device='cuda:0') factor: tensor(0.1068, device='cuda:0')
reward_raw: tensor(0.5483, device='cuda:0') reward: tensor(1.4408, device='cuda:0') factor: tensor(0.1075, device='cuda:0')
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.4293, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.5472, device='cuda:0') reward: tensor(1.4415, device='cuda:0') factor: tensor(0.1057, device='cuda:0')
reward_raw: tensor(0.5517, device='cuda:0') reward: tensor(1.4468, device='cuda:0') factor: tensor(0.1049, device='cuda:0')


Epoch 1, Step 1264: Loss = 0.1518, Actor Loss = 0.1218, Critic Loss = 0.0600, Entropy = 0.0500, Returns = 1.4461, Value = 1.4803, mask_loss = 0.1078std_loss = 0.2804:  23%|██▎       | 1264/5475 [5:39:51<19:02:34, 16.28s/it]

ratio tensor(1.2638, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5187, device='cuda:0') reward: tensor(1.4153, device='cuda:0') factor: tensor(0.1034, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.4194, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.5209, device='cuda:0') reward: tensor(1.4213, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.4334, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.4020, device='cuda:0') factor: tensor(0.0982, device='cuda:0')


Epoch 1, Step 1265: Loss = 0.1138, Actor Loss = 0.0572, Critic Loss = 0.1133, Entropy = 0.0349, Returns = 1.3310, Value = 1.4268, mask_loss = 0.0821std_loss = 0.2491:  23%|██▎       | 1265/5475 [5:40:08<19:15:42, 16.47s/it]

ratio tensor(0.8670, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4834, device='cuda:0') reward: tensor(1.3401, device='cuda:0') factor: tensor(0.1432, device='cuda:0')
reward_raw: tensor(0.4851, device='cuda:0') reward: tensor(1.3516, device='cuda:0') factor: tensor(0.1335, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3406, device='cuda:0') factor: tensor(0.1373, device='cuda:0')
reward_raw: tensor(0.4876, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.1319, device='cuda:0')
reward_raw: tensor(0.4712, device='cuda:0') reward: tensor(1.3313, device='cuda:0') factor: tensor(0.1399, device='cuda:0')


Epoch 1, Step 1266: Loss = -0.0140, Actor Loss = -0.0366, Critic Loss = 0.0455, Entropy = 0.0545, Returns = 1.4045, Value = 1.2913, mask_loss = 0.1181std_loss = 0.2929:  23%|██▎       | 1266/5475 [5:40:21<18:13:11, 15.58s/it]

ratio tensor(1.3848, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5631, device='cuda:0') reward: tensor(1.4382, device='cuda:0') factor: tensor(0.1249, device='cuda:0')
reward_raw: tensor(0.5646, device='cuda:0') reward: tensor(1.4347, device='cuda:0') factor: tensor(0.1299, device='cuda:0')
reward_raw: tensor(0.5792, device='cuda:0') reward: tensor(1.4552, device='cuda:0') factor: tensor(0.1240, device='cuda:0')
reward_raw: tensor(0.5407, device='cuda:0') reward: tensor(1.4137, device='cuda:0') factor: tensor(0.1270, device='cuda:0')
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.4162, device='cuda:0') factor: tensor(0.1258, device='cuda:0')


Epoch 1, Step 1267: Loss = -0.1246, Actor Loss = -0.1635, Critic Loss = 0.0780, Entropy = 0.0529, Returns = 1.3663, Value = 1.4013, mask_loss = 0.1349std_loss = 0.3071:  23%|██▎       | 1267/5475 [5:40:44<20:42:17, 17.71s/it]

ratio tensor(1.6415, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4734, device='cuda:0') reward: tensor(1.3715, device='cuda:0') factor: tensor(0.1019, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.3840, device='cuda:0') factor: tensor(0.1017, device='cuda:0')
reward_raw: tensor(0.4683, device='cuda:0') reward: tensor(1.3636, device='cuda:0') factor: tensor(0.1046, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3844, device='cuda:0') factor: tensor(0.1028, device='cuda:0')
reward_raw: tensor(0.4816, device='cuda:0') reward: tensor(1.3805, device='cuda:0') factor: tensor(0.1011, device='cuda:0')


Epoch 1, Step 1268: Loss = 0.1577, Actor Loss = 0.1037, Critic Loss = 0.1079, Entropy = 0.0365, Returns = 1.4062, Value = 1.3399, mask_loss = 0.0982std_loss = 0.2699:  23%|██▎       | 1268/5475 [5:40:57<19:00:32, 16.27s/it]  

ratio tensor(1.0817, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5986, device='cuda:0') reward: tensor(1.4824, device='cuda:0') factor: tensor(0.1162, device='cuda:0')
reward_raw: tensor(0.6013, device='cuda:0') reward: tensor(1.4882, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.6213, device='cuda:0') reward: tensor(1.5031, device='cuda:0') factor: tensor(0.1182, device='cuda:0')
reward_raw: tensor(0.6011, device='cuda:0') reward: tensor(1.4837, device='cuda:0') factor: tensor(0.1174, device='cuda:0')
reward_raw: tensor(0.6072, device='cuda:0') reward: tensor(1.4922, device='cuda:0') factor: tensor(0.1150, device='cuda:0')


Epoch 1, Step 1269: Loss = -0.2474, Actor Loss = -0.2649, Critic Loss = 0.0351, Entropy = 0.0384, Returns = 1.4848, Value = 1.5993, mask_loss = 0.1023std_loss = 0.2790:  23%|██▎       | 1269/5475 [5:41:10<17:51:52, 15.29s/it]

ratio tensor(1.4261, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3750, device='cuda:0') reward: tensor(1.2676, device='cuda:0') factor: tensor(0.1074, device='cuda:0')
reward_raw: tensor(0.3761, device='cuda:0') reward: tensor(1.2706, device='cuda:0') factor: tensor(0.1054, device='cuda:0')
reward_raw: tensor(0.3679, device='cuda:0') reward: tensor(1.2663, device='cuda:0') factor: tensor(0.1015, device='cuda:0')
reward_raw: tensor(0.3741, device='cuda:0') reward: tensor(1.2713, device='cuda:0') factor: tensor(0.1027, device='cuda:0')
reward_raw: tensor(0.3614, device='cuda:0') reward: tensor(1.2528, device='cuda:0') factor: tensor(0.1086, device='cuda:0')


Epoch 1, Step 1270: Loss = 0.5186, Actor Loss = 0.4740, Critic Loss = 0.0891, Entropy = 0.0429, Returns = 1.1872, Value = 1.2858, mask_loss = 0.1030std_loss = 0.2779:  23%|██▎       | 1270/5475 [5:41:25<17:36:39, 15.08s/it]  

ratio tensor(0.5955, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4893, device='cuda:0') reward: tensor(1.3973, device='cuda:0') factor: tensor(0.0921, device='cuda:0')
reward_raw: tensor(0.4879, device='cuda:0') reward: tensor(1.3909, device='cuda:0') factor: tensor(0.0970, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.3983, device='cuda:0') factor: tensor(0.0949, device='cuda:0')
reward_raw: tensor(0.4901, device='cuda:0') reward: tensor(1.3930, device='cuda:0') factor: tensor(0.0972, device='cuda:0')
reward_raw: tensor(0.4795, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.0891, device='cuda:0')


Epoch 1, Step 1271: Loss = -0.0625, Actor Loss = -0.1088, Critic Loss = 0.0928, Entropy = 0.0526, Returns = 1.3361, Value = 1.4427, mask_loss = 0.1122std_loss = 0.2856:  23%|██▎       | 1271/5475 [5:41:40<17:37:39, 15.10s/it]

ratio tensor(1.1595, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5179, device='cuda:0') reward: tensor(1.4249, device='cuda:0') factor: tensor(0.0930, device='cuda:0')
reward_raw: tensor(0.5090, device='cuda:0') reward: tensor(1.4171, device='cuda:0') factor: tensor(0.0919, device='cuda:0')
reward_raw: tensor(0.5261, device='cuda:0') reward: tensor(1.4365, device='cuda:0') factor: tensor(0.0896, device='cuda:0')
reward_raw: tensor(0.5249, device='cuda:0') reward: tensor(1.4341, device='cuda:0') factor: tensor(0.0909, device='cuda:0')
reward_raw: tensor(0.5114, device='cuda:0') reward: tensor(1.4212, device='cuda:0') factor: tensor(0.0902, device='cuda:0')


Epoch 1, Step 1272: Loss = 0.0367, Actor Loss = -0.0131, Critic Loss = 0.0997, Entropy = 0.0319, Returns = 1.3867, Value = 1.5569, mask_loss = 0.0840std_loss = 0.2544:  23%|██▎       | 1272/5475 [5:41:55<17:41:58, 15.16s/it] 

ratio tensor(0.9348, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4578, device='cuda:0') reward: tensor(1.3248, device='cuda:0') factor: tensor(0.1330, device='cuda:0')
reward_raw: tensor(0.4585, device='cuda:0') reward: tensor(1.3246, device='cuda:0') factor: tensor(0.1339, device='cuda:0')
reward_raw: tensor(0.4725, device='cuda:0') reward: tensor(1.3391, device='cuda:0') factor: tensor(0.1334, device='cuda:0')
reward_raw: tensor(0.4769, device='cuda:0') reward: tensor(1.3434, device='cuda:0') factor: tensor(0.1335, device='cuda:0')
reward_raw: tensor(0.4703, device='cuda:0') reward: tensor(1.3372, device='cuda:0') factor: tensor(0.1331, device='cuda:0')


Epoch 1, Step 1273: Loss = 0.1076, Actor Loss = 0.0385, Critic Loss = 0.1382, Entropy = 0.0565, Returns = 1.3123, Value = 1.2122, mask_loss = 0.1505std_loss = 0.3249:  23%|██▎       | 1273/5475 [5:42:09<17:24:55, 14.92s/it] 

ratio tensor(1.5051, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.4087, device='cuda:0') factor: tensor(0.1110, device='cuda:0')
reward_raw: tensor(0.5297, device='cuda:0') reward: tensor(1.4133, device='cuda:0') factor: tensor(0.1164, device='cuda:0')
reward_raw: tensor(0.5232, device='cuda:0') reward: tensor(1.4055, device='cuda:0') factor: tensor(0.1178, device='cuda:0')
reward_raw: tensor(0.5412, device='cuda:0') reward: tensor(1.4245, device='cuda:0') factor: tensor(0.1167, device='cuda:0')
reward_raw: tensor(0.5243, device='cuda:0') reward: tensor(1.4089, device='cuda:0') factor: tensor(0.1154, device='cuda:0')


Epoch 1, Step 1274: Loss = -0.1845, Actor Loss = -0.2149, Critic Loss = 0.0610, Entropy = 0.0388, Returns = 1.4505, Value = 1.4817, mask_loss = 0.1264std_loss = 0.3102:  23%|██▎       | 1274/5475 [5:42:27<18:32:38, 15.89s/it]

ratio tensor(1.5780, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.3460, device='cuda:0') factor: tensor(0.1414, device='cuda:0')
reward_raw: tensor(0.4716, device='cuda:0') reward: tensor(1.3305, device='cuda:0') factor: tensor(0.1411, device='cuda:0')
reward_raw: tensor(0.4647, device='cuda:0') reward: tensor(1.3198, device='cuda:0') factor: tensor(0.1449, device='cuda:0')
reward_raw: tensor(0.4848, device='cuda:0') reward: tensor(1.3421, device='cuda:0') factor: tensor(0.1428, device='cuda:0')
reward_raw: tensor(0.4975, device='cuda:0') reward: tensor(1.3559, device='cuda:0') factor: tensor(0.1416, device='cuda:0')


Epoch 1, Step 1275: Loss = 0.2071, Actor Loss = 0.1838, Critic Loss = 0.0466, Entropy = 0.0513, Returns = 1.2871, Value = 1.2504, mask_loss = 0.1583std_loss = 0.3325:  23%|██▎       | 1275/5475 [5:42:45<19:10:09, 16.43s/it]  

ratio tensor(0.4811, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5424, device='cuda:0') reward: tensor(1.4379, device='cuda:0') factor: tensor(0.1045, device='cuda:0')
reward_raw: tensor(0.5480, device='cuda:0') reward: tensor(1.4430, device='cuda:0') factor: tensor(0.1050, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.4356, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.5410, device='cuda:0') reward: tensor(1.4363, device='cuda:0') factor: tensor(0.1048, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.4333, device='cuda:0') factor: tensor(0.1043, device='cuda:0')


Epoch 1, Step 1276: Loss = -0.1756, Actor Loss = -0.2089, Critic Loss = 0.0665, Entropy = 0.0418, Returns = 1.5552, Value = 1.4397, mask_loss = 0.1055std_loss = 0.2802:  23%|██▎       | 1276/5475 [5:43:03<19:30:23, 16.72s/it]

ratio tensor(1.0697, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4247, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.1089, device='cuda:0')
reward_raw: tensor(0.4481, device='cuda:0') reward: tensor(1.3332, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.4549, device='cuda:0') reward: tensor(1.3400, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.4383, device='cuda:0') reward: tensor(1.3292, device='cuda:0') factor: tensor(0.1091, device='cuda:0')
reward_raw: tensor(0.4572, device='cuda:0') reward: tensor(1.3448, device='cuda:0') factor: tensor(0.1124, device='cuda:0')


Epoch 1, Step 1277: Loss = 0.1365, Actor Loss = 0.0900, Critic Loss = 0.0930, Entropy = 0.0441, Returns = 1.2964, Value = 1.3905, mask_loss = 0.1117std_loss = 0.2871:  23%|██▎       | 1277/5475 [5:43:19<19:16:55, 16.54s/it]  

ratio tensor(0.9957, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4819, device='cuda:0') reward: tensor(1.3513, device='cuda:0') factor: tensor(0.1307, device='cuda:0')
reward_raw: tensor(0.4824, device='cuda:0') reward: tensor(1.3510, device='cuda:0') factor: tensor(0.1314, device='cuda:0')
reward_raw: tensor(0.4792, device='cuda:0') reward: tensor(1.3544, device='cuda:0') factor: tensor(0.1248, device='cuda:0')
reward_raw: tensor(0.4818, device='cuda:0') reward: tensor(1.3487, device='cuda:0') factor: tensor(0.1331, device='cuda:0')
reward_raw: tensor(0.4879, device='cuda:0') reward: tensor(1.3595, device='cuda:0') factor: tensor(0.1284, device='cuda:0')


Epoch 1, Step 1278: Loss = 0.0822, Actor Loss = 0.0398, Critic Loss = 0.0847, Entropy = 0.0430, Returns = 1.3318, Value = 1.2124, mask_loss = 0.1373std_loss = 0.3194:  23%|██▎       | 1278/5475 [5:43:36<19:35:09, 16.80s/it]

ratio tensor(1.1961, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4366, device='cuda:0') reward: tensor(1.3174, device='cuda:0') factor: tensor(0.1192, device='cuda:0')
reward_raw: tensor(0.4206, device='cuda:0') reward: tensor(1.3010, device='cuda:0') factor: tensor(0.1196, device='cuda:0')
reward_raw: tensor(0.4222, device='cuda:0') reward: tensor(1.3001, device='cuda:0') factor: tensor(0.1221, device='cuda:0')
reward_raw: tensor(0.4263, device='cuda:0') reward: tensor(1.3033, device='cuda:0') factor: tensor(0.1230, device='cuda:0')
reward_raw: tensor(0.4349, device='cuda:0') reward: tensor(1.3129, device='cuda:0') factor: tensor(0.1219, device='cuda:0')


Epoch 1, Step 1279: Loss = 0.0607, Actor Loss = 0.0059, Critic Loss = 0.1096, Entropy = 0.0497, Returns = 1.2512, Value = 1.3387, mask_loss = 0.1194std_loss = 0.2910:  23%|██▎       | 1279/5475 [5:43:57<21:00:58, 18.03s/it]

ratio tensor(1.0170, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5146, device='cuda:0') reward: tensor(1.3821, device='cuda:0') factor: tensor(0.1325, device='cuda:0')
reward_raw: tensor(0.5330, device='cuda:0') reward: tensor(1.3946, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.5267, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.1361, device='cuda:0')
reward_raw: tensor(0.5263, device='cuda:0') reward: tensor(1.3894, device='cuda:0') factor: tensor(0.1370, device='cuda:0')
reward_raw: tensor(0.5263, device='cuda:0') reward: tensor(1.3932, device='cuda:0') factor: tensor(0.1331, device='cuda:0')


Epoch 1, Step 1280: Loss = 0.0945, Actor Loss = 0.0589, Critic Loss = 0.0714, Entropy = 0.0606, Returns = 1.3205, Value = 1.4454, mask_loss = 0.1290std_loss = 0.3018:  23%|██▎       | 1280/5475 [5:44:14<20:47:14, 17.84s/it]

ratio tensor(0.7808, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5716, device='cuda:0') reward: tensor(1.4304, device='cuda:0') factor: tensor(0.1412, device='cuda:0')
reward_raw: tensor(0.5769, device='cuda:0') reward: tensor(1.4383, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.5707, device='cuda:0') reward: tensor(1.4309, device='cuda:0') factor: tensor(0.1398, device='cuda:0')
reward_raw: tensor(0.5750, device='cuda:0') reward: tensor(1.4379, device='cuda:0') factor: tensor(0.1371, device='cuda:0')
reward_raw: tensor(0.5614, device='cuda:0') reward: tensor(1.4205, device='cuda:0') factor: tensor(0.1409, device='cuda:0')


Epoch 1, Step 1281: Loss = -0.0791, Actor Loss = -0.0985, Critic Loss = 0.0390, Entropy = 0.0483, Returns = 1.4466, Value = 1.4541, mask_loss = 0.1102std_loss = 0.2779:  23%|██▎       | 1280/5475 [5:44:33<20:47:14, 17.84s/it]

ratio tensor(1.2607, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1281: Loss = -0.0791, Actor Loss = -0.0985, Critic Loss = 0.0390, Entropy = 0.0483, Returns = 1.4466, Value = 1.4541, mask_loss = 0.1102std_loss = 0.2779:  23%|██▎       | 1281/5475 [5:44:33<21:07:11, 18.13s/it]

reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.1382, device='cuda:0')
reward_raw: tensor(0.4836, device='cuda:0') reward: tensor(1.3381, device='cuda:0') factor: tensor(0.1455, device='cuda:0')
reward_raw: tensor(0.4880, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.1378, device='cuda:0')
reward_raw: tensor(0.4671, device='cuda:0') reward: tensor(1.3256, device='cuda:0') factor: tensor(0.1415, device='cuda:0')
reward_raw: tensor(0.4933, device='cuda:0') reward: tensor(1.3495, device='cuda:0') factor: tensor(0.1439, device='cuda:0')


Epoch 1, Step 1282: Loss = -0.0144, Actor Loss = -0.0414, Critic Loss = 0.0540, Entropy = 0.0465, Returns = 1.3396, Value = 1.2931, mask_loss = 0.1505std_loss = 0.3312:  23%|██▎       | 1282/5475 [5:44:51<20:52:45, 17.93s/it]

ratio tensor(1.2344, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4882, device='cuda:0') reward: tensor(1.3787, device='cuda:0') factor: tensor(0.1096, device='cuda:0')
reward_raw: tensor(0.4831, device='cuda:0') reward: tensor(1.3705, device='cuda:0') factor: tensor(0.1126, device='cuda:0')
reward_raw: tensor(0.4631, device='cuda:0') reward: tensor(1.3489, device='cuda:0') factor: tensor(0.1142, device='cuda:0')
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.3667, device='cuda:0') factor: tensor(0.1165, device='cuda:0')
reward_raw: tensor(0.4954, device='cuda:0') reward: tensor(1.3806, device='cuda:0') factor: tensor(0.1148, device='cuda:0')


Epoch 1, Step 1283: Loss = 0.1436, Actor Loss = 0.1154, Critic Loss = 0.0564, Entropy = 0.0571, Returns = 1.3154, Value = 1.3302, mask_loss = 0.1243std_loss = 0.2957:  23%|██▎       | 1283/5475 [5:45:14<22:41:29, 19.49s/it]  

ratio tensor(0.6788, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6298, device='cuda:0') reward: tensor(1.5137, device='cuda:0') factor: tensor(0.1161, device='cuda:0')
reward_raw: tensor(0.6196, device='cuda:0') reward: tensor(1.5033, device='cuda:0') factor: tensor(0.1163, device='cuda:0')
reward_raw: tensor(0.6190, device='cuda:0') reward: tensor(1.5003, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.6268, device='cuda:0') reward: tensor(1.5060, device='cuda:0') factor: tensor(0.1208, device='cuda:0')
reward_raw: tensor(0.6039, device='cuda:0') reward: tensor(1.4849, device='cuda:0') factor: tensor(0.1190, device='cuda:0')


Epoch 1, Step 1284: Loss = -0.2850, Actor Loss = -0.3427, Critic Loss = 0.1156, Entropy = 0.0502, Returns = 1.5663, Value = 1.3755, mask_loss = 0.1231std_loss = 0.3007:  23%|██▎       | 1284/5475 [5:45:35<23:26:55, 20.14s/it]

ratio tensor(1.4638, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(1.3945, device='cuda:0') factor: tensor(0.1271, device='cuda:0')
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.3977, device='cuda:0') factor: tensor(0.1264, device='cuda:0')
reward_raw: tensor(0.5211, device='cuda:0') reward: tensor(1.3927, device='cuda:0') factor: tensor(0.1284, device='cuda:0')
reward_raw: tensor(0.5150, device='cuda:0') reward: tensor(1.3847, device='cuda:0') factor: tensor(0.1303, device='cuda:0')
reward_raw: tensor(0.5263, device='cuda:0') reward: tensor(1.3967, device='cuda:0') factor: tensor(0.1296, device='cuda:0')


Epoch 1, Step 1285: Loss = 0.1365, Actor Loss = 0.0968, Critic Loss = 0.0795, Entropy = 0.0549, Returns = 1.4223, Value = 1.4895, mask_loss = 0.1204std_loss = 0.2798:  23%|██▎       | 1285/5475 [5:45:51<21:48:00, 18.73s/it]  

ratio tensor(0.8092, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4147, device='cuda:0') reward: tensor(1.2957, device='cuda:0') factor: tensor(0.1190, device='cuda:0')
reward_raw: tensor(0.3953, device='cuda:0') reward: tensor(1.2760, device='cuda:0') factor: tensor(0.1193, device='cuda:0')
reward_raw: tensor(0.4145, device='cuda:0') reward: tensor(1.2954, device='cuda:0') factor: tensor(0.1192, device='cuda:0')
reward_raw: tensor(0.4285, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.4114, device='cuda:0') reward: tensor(1.2885, device='cuda:0') factor: tensor(0.1228, device='cuda:0')


Epoch 1, Step 1286: Loss = -0.0014, Actor Loss = -0.0417, Critic Loss = 0.0806, Entropy = 0.0653, Returns = 1.2933, Value = 1.1573, mask_loss = 0.1324std_loss = 0.2945:  23%|██▎       | 1286/5475 [5:46:05<20:13:39, 17.38s/it]

ratio tensor(1.2880, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5603, device='cuda:0') reward: tensor(1.4317, device='cuda:0') factor: tensor(0.1286, device='cuda:0')
reward_raw: tensor(0.5749, device='cuda:0') reward: tensor(1.4609, device='cuda:0') factor: tensor(0.1140, device='cuda:0')
reward_raw: tensor(0.5812, device='cuda:0') reward: tensor(1.4584, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.5747, device='cuda:0') reward: tensor(1.4533, device='cuda:0') factor: tensor(0.1214, device='cuda:0')
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.4369, device='cuda:0') factor: tensor(0.1194, device='cuda:0')


Epoch 1, Step 1287: Loss = -0.0837, Actor Loss = -0.1173, Critic Loss = 0.0673, Entropy = 0.0613, Returns = 1.4567, Value = 1.5556, mask_loss = 0.1190std_loss = 0.2885:  24%|██▎       | 1287/5475 [5:46:26<21:29:25, 18.47s/it]

ratio tensor(1.0266, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5694, device='cuda:0') reward: tensor(1.4407, device='cuda:0') factor: tensor(0.1287, device='cuda:0')
reward_raw: tensor(0.5446, device='cuda:0') reward: tensor(1.4156, device='cuda:0') factor: tensor(0.1289, device='cuda:0')
reward_raw: tensor(0.5530, device='cuda:0') reward: tensor(1.4216, device='cuda:0') factor: tensor(0.1314, device='cuda:0')
reward_raw: tensor(0.5732, device='cuda:0') reward: tensor(1.4419, device='cuda:0') factor: tensor(0.1313, device='cuda:0')
reward_raw: tensor(0.5629, device='cuda:0') reward: tensor(1.4339, device='cuda:0') factor: tensor(0.1290, device='cuda:0')


Epoch 1, Step 1288: Loss = 0.0489, Actor Loss = 0.0225, Critic Loss = 0.0528, Entropy = 0.0525, Returns = 1.4942, Value = 1.3459, mask_loss = 0.1076std_loss = 0.2782:  24%|██▎       | 1288/5475 [5:46:43<20:47:55, 17.88s/it]  

ratio tensor(0.8148, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5295, device='cuda:0') reward: tensor(1.4132, device='cuda:0') factor: tensor(0.1163, device='cuda:0')
reward_raw: tensor(0.5422, device='cuda:0') reward: tensor(1.4265, device='cuda:0') factor: tensor(0.1157, device='cuda:0')
reward_raw: tensor(0.5204, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.1147, device='cuda:0')
reward_raw: tensor(0.5331, device='cuda:0') reward: tensor(1.4160, device='cuda:0') factor: tensor(0.1171, device='cuda:0')
reward_raw: tensor(0.5463, device='cuda:0') reward: tensor(1.4338, device='cuda:0') factor: tensor(0.1124, device='cuda:0')


Epoch 1, Step 1289: Loss = -0.0619, Actor Loss = -0.1033, Critic Loss = 0.0829, Entropy = 0.0430, Returns = 1.3887, Value = 1.4201, mask_loss = 0.1278std_loss = 0.3086:  24%|██▎       | 1289/5475 [5:46:56<19:18:10, 16.60s/it]

ratio tensor(1.0325, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4543, device='cuda:0') reward: tensor(1.3472, device='cuda:0') factor: tensor(0.1071, device='cuda:0')
reward_raw: tensor(0.4652, device='cuda:0') reward: tensor(1.3567, device='cuda:0') factor: tensor(0.1085, device='cuda:0')
reward_raw: tensor(0.4566, device='cuda:0') reward: tensor(1.3481, device='cuda:0') factor: tensor(0.1085, device='cuda:0')
reward_raw: tensor(0.4443, device='cuda:0') reward: tensor(1.3363, device='cuda:0') factor: tensor(0.1080, device='cuda:0')
reward_raw: tensor(0.4464, device='cuda:0') reward: tensor(1.3368, device='cuda:0') factor: tensor(0.1096, device='cuda:0')


Epoch 1, Step 1290: Loss = -0.0740, Actor Loss = -0.1094, Critic Loss = 0.0709, Entropy = 0.0568, Returns = 1.4523, Value = 1.2873, mask_loss = 0.1138std_loss = 0.2842:  24%|██▎       | 1290/5475 [5:47:15<19:57:13, 17.16s/it]

ratio tensor(5.3916, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4609, device='cuda:0') reward: tensor(1.3456, device='cuda:0') factor: tensor(0.1152, device='cuda:0')
reward_raw: tensor(0.4532, device='cuda:0') reward: tensor(1.3384, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.4574, device='cuda:0') reward: tensor(1.3408, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.4633, device='cuda:0') reward: tensor(1.3461, device='cuda:0') factor: tensor(0.1172, device='cuda:0')
reward_raw: tensor(0.4683, device='cuda:0') reward: tensor(1.3494, device='cuda:0') factor: tensor(0.1189, device='cuda:0')


Epoch 1, Step 1291: Loss = 0.0854, Actor Loss = 0.0625, Critic Loss = 0.0458, Entropy = 0.0484, Returns = 1.3557, Value = 1.3305, mask_loss = 0.1364std_loss = 0.3078:  24%|██▎       | 1291/5475 [5:47:36<21:22:14, 18.39s/it]  

ratio tensor(0.6207, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5131, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.1485, device='cuda:0')
reward_raw: tensor(0.4897, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.1467, device='cuda:0')
reward_raw: tensor(0.5079, device='cuda:0') reward: tensor(1.3623, device='cuda:0') factor: tensor(0.1457, device='cuda:0')
reward_raw: tensor(0.5079, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1431, device='cuda:0')
reward_raw: tensor(0.4969, device='cuda:0') reward: tensor(1.3568, device='cuda:0') factor: tensor(0.1401, device='cuda:0')


Epoch 1, Step 1292: Loss = -0.0273, Actor Loss = -0.0485, Critic Loss = 0.0425, Entropy = 0.0654, Returns = 1.3731, Value = 1.3987, mask_loss = 0.1777std_loss = 0.3494:  24%|██▎       | 1292/5475 [5:47:51<20:16:15, 17.45s/it]

ratio tensor(0.8998, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5624, device='cuda:0') reward: tensor(1.4282, device='cuda:0') factor: tensor(0.1342, device='cuda:0')
reward_raw: tensor(0.5227, device='cuda:0') reward: tensor(1.3904, device='cuda:0') factor: tensor(0.1323, device='cuda:0')
reward_raw: tensor(0.5650, device='cuda:0') reward: tensor(1.4328, device='cuda:0') factor: tensor(0.1321, device='cuda:0')
reward_raw: tensor(0.5339, device='cuda:0') reward: tensor(1.4065, device='cuda:0') factor: tensor(0.1274, device='cuda:0')
reward_raw: tensor(0.5527, device='cuda:0') reward: tensor(1.4233, device='cuda:0') factor: tensor(0.1294, device='cuda:0')


Epoch 1, Step 1293: Loss = 0.0069, Actor Loss = -0.0386, Critic Loss = 0.0912, Entropy = 0.0557, Returns = 1.4117, Value = 1.4967, mask_loss = 0.1196std_loss = 0.2938:  24%|██▎       | 1293/5475 [5:48:08<20:09:36, 17.35s/it] 

ratio tensor(0.9232, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5800, device='cuda:0') reward: tensor(1.4544, device='cuda:0') factor: tensor(0.1256, device='cuda:0')
reward_raw: tensor(0.5573, device='cuda:0') reward: tensor(1.4333, device='cuda:0') factor: tensor(0.1240, device='cuda:0')
reward_raw: tensor(0.5598, device='cuda:0') reward: tensor(1.4350, device='cuda:0') factor: tensor(0.1249, device='cuda:0')
reward_raw: tensor(0.5718, device='cuda:0') reward: tensor(1.4453, device='cuda:0') factor: tensor(0.1265, device='cuda:0')
reward_raw: tensor(0.5725, device='cuda:0') reward: tensor(1.4453, device='cuda:0') factor: tensor(0.1271, device='cuda:0')


Epoch 1, Step 1294: Loss = -0.0196, Actor Loss = -0.0641, Critic Loss = 0.0892, Entropy = 0.0523, Returns = 1.4243, Value = 1.3236, mask_loss = 0.1292std_loss = 0.3080:  24%|██▎       | 1294/5475 [5:48:26<20:16:18, 17.45s/it]

ratio tensor(0.7810, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5309, device='cuda:0') reward: tensor(1.4157, device='cuda:0') factor: tensor(0.1152, device='cuda:0')
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.4109, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.5411, device='cuda:0') reward: tensor(1.4291, device='cuda:0') factor: tensor(0.1120, device='cuda:0')
reward_raw: tensor(0.5194, device='cuda:0') reward: tensor(1.4100, device='cuda:0') factor: tensor(0.1095, device='cuda:0')
reward_raw: tensor(0.5406, device='cuda:0') reward: tensor(1.4253, device='cuda:0') factor: tensor(0.1152, device='cuda:0')


Epoch 1, Step 1295: Loss = -0.0143, Actor Loss = -0.0577, Critic Loss = 0.0869, Entropy = 0.0556, Returns = 1.4007, Value = 1.4554, mask_loss = 0.1174std_loss = 0.2847:  24%|██▎       | 1295/5475 [5:48:46<21:08:36, 18.21s/it]

ratio tensor(0.9546, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.4030, device='cuda:0') factor: tensor(0.1072, device='cuda:0')
reward_raw: tensor(0.5081, device='cuda:0') reward: tensor(1.3973, device='cuda:0') factor: tensor(0.1108, device='cuda:0')
reward_raw: tensor(0.5144, device='cuda:0') reward: tensor(1.4028, device='cuda:0') factor: tensor(0.1116, device='cuda:0')
reward_raw: tensor(0.5178, device='cuda:0') reward: tensor(1.4102, device='cuda:0') factor: tensor(0.1076, device='cuda:0')
reward_raw: tensor(0.5057, device='cuda:0') reward: tensor(1.4028, device='cuda:0') factor: tensor(0.1029, device='cuda:0')


Epoch 1, Step 1296: Loss = 0.0587, Actor Loss = 0.0240, Critic Loss = 0.0694, Entropy = 0.0654, Returns = 1.3994, Value = 1.3121, mask_loss = 0.1158std_loss = 0.2845:  24%|██▎       | 1296/5475 [5:49:02<20:24:28, 17.58s/it]  

ratio tensor(0.5520, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5087, device='cuda:0') reward: tensor(1.3768, device='cuda:0') factor: tensor(0.1319, device='cuda:0')
reward_raw: tensor(0.5158, device='cuda:0') reward: tensor(1.3811, device='cuda:0') factor: tensor(0.1347, device='cuda:0')
reward_raw: tensor(0.5440, device='cuda:0') reward: tensor(1.4076, device='cuda:0') factor: tensor(0.1364, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.1323, device='cuda:0')
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.3866, device='cuda:0') factor: tensor(0.1331, device='cuda:0')


Epoch 1, Step 1297: Loss = -0.0760, Actor Loss = -0.1159, Critic Loss = 0.0800, Entropy = 0.0418, Returns = 1.4237, Value = 1.3121, mask_loss = 0.1068std_loss = 0.2842:  24%|██▎       | 1297/5475 [5:49:16<19:05:12, 16.45s/it]

ratio tensor(1.0075, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5369, device='cuda:0') reward: tensor(1.4362, device='cuda:0') factor: tensor(0.1007, device='cuda:0')
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.4205, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.5267, device='cuda:0') reward: tensor(1.4277, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.5396, device='cuda:0') reward: tensor(1.4371, device='cuda:0') factor: tensor(0.1025, device='cuda:0')
reward_raw: tensor(0.5364, device='cuda:0') reward: tensor(1.4310, device='cuda:0') factor: tensor(0.1054, device='cuda:0')


Epoch 1, Step 1298: Loss = -0.0525, Actor Loss = -0.0674, Critic Loss = 0.0299, Entropy = 0.0368, Returns = 1.4402, Value = 1.4496, mask_loss = 0.0952std_loss = 0.2735:  24%|██▎       | 1298/5475 [5:49:32<18:54:03, 16.29s/it]

ratio tensor(0.8711, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5994, device='cuda:0') reward: tensor(1.4613, device='cuda:0') factor: tensor(0.1381, device='cuda:0')
reward_raw: tensor(0.5988, device='cuda:0') reward: tensor(1.4641, device='cuda:0') factor: tensor(0.1348, device='cuda:0')
reward_raw: tensor(0.5930, device='cuda:0') reward: tensor(1.4610, device='cuda:0') factor: tensor(0.1320, device='cuda:0')
reward_raw: tensor(0.5950, device='cuda:0') reward: tensor(1.4627, device='cuda:0') factor: tensor(0.1324, device='cuda:0')
reward_raw: tensor(0.5868, device='cuda:0') reward: tensor(1.4530, device='cuda:0') factor: tensor(0.1338, device='cuda:0')


Epoch 1, Step 1299: Loss = 0.0298, Actor Loss = 0.0038, Critic Loss = 0.0522, Entropy = 0.0705, Returns = 1.3911, Value = 1.4037, mask_loss = 0.1570std_loss = 0.3265:  24%|██▎       | 1299/5475 [5:49:46<18:19:04, 15.79s/it]  

ratio tensor(1.0689, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5160, device='cuda:0') reward: tensor(1.3893, device='cuda:0') factor: tensor(0.1267, device='cuda:0')
reward_raw: tensor(0.5284, device='cuda:0') reward: tensor(1.4045, device='cuda:0') factor: tensor(0.1239, device='cuda:0')
reward_raw: tensor(0.5306, device='cuda:0') reward: tensor(1.4032, device='cuda:0') factor: tensor(0.1274, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.3950, device='cuda:0') factor: tensor(0.1251, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.3838, device='cuda:0') factor: tensor(0.1250, device='cuda:0')


Epoch 1, Step 1300: Loss = -0.0518, Actor Loss = -0.0808, Critic Loss = 0.0582, Entropy = 0.0744, Returns = 1.3771, Value = 1.2917, mask_loss = 0.1431std_loss = 0.3059:  24%|██▎       | 1300/5475 [5:50:05<19:23:56, 16.73s/it]

ratio tensor(1.3444, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4904, device='cuda:0') reward: tensor(1.3689, device='cuda:0') factor: tensor(0.1215, device='cuda:0')
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.3707, device='cuda:0') factor: tensor(0.1211, device='cuda:0')
reward_raw: tensor(0.4827, device='cuda:0') reward: tensor(1.3605, device='cuda:0') factor: tensor(0.1222, device='cuda:0')
reward_raw: tensor(0.4693, device='cuda:0') reward: tensor(1.3470, device='cuda:0') factor: tensor(0.1223, device='cuda:0')
reward_raw: tensor(0.4818, device='cuda:0') reward: tensor(1.3606, device='cuda:0') factor: tensor(0.1212, device='cuda:0')


Epoch 1, Step 1301: Loss = 0.0528, Actor Loss = 0.0176, Critic Loss = 0.0706, Entropy = 0.0660, Returns = 1.4067, Value = 1.3230, mask_loss = 0.1236std_loss = 0.2918:  24%|██▎       | 1300/5475 [5:50:20<19:23:56, 16.73s/it]  

ratio tensor(1.0563, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1301: Loss = 0.0528, Actor Loss = 0.0176, Critic Loss = 0.0706, Entropy = 0.0660, Returns = 1.4067, Value = 1.3230, mask_loss = 0.1236std_loss = 0.2918:  24%|██▍       | 1301/5475 [5:50:21<18:54:35, 16.31s/it]

reward_raw: tensor(0.4668, device='cuda:0') reward: tensor(1.3315, device='cuda:0') factor: tensor(0.1353, device='cuda:0')
reward_raw: tensor(0.4612, device='cuda:0') reward: tensor(1.3257, device='cuda:0') factor: tensor(0.1355, device='cuda:0')
reward_raw: tensor(0.4553, device='cuda:0') reward: tensor(1.3235, device='cuda:0') factor: tensor(0.1318, device='cuda:0')
reward_raw: tensor(0.4670, device='cuda:0') reward: tensor(1.3337, device='cuda:0') factor: tensor(0.1333, device='cuda:0')
reward_raw: tensor(0.4520, device='cuda:0') reward: tensor(1.3137, device='cuda:0') factor: tensor(0.1383, device='cuda:0')


Epoch 1, Step 1302: Loss = -0.0216, Actor Loss = -0.0511, Critic Loss = 0.0590, Entropy = 0.0629, Returns = 1.3231, Value = 1.3353, mask_loss = 0.1143std_loss = 0.2856:  24%|██▍       | 1302/5475 [5:50:34<17:52:53, 15.43s/it]

ratio tensor(0.9953, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4622, device='cuda:0') reward: tensor(1.3204, device='cuda:0') factor: tensor(0.1418, device='cuda:0')
reward_raw: tensor(0.4476, device='cuda:0') reward: tensor(1.3068, device='cuda:0') factor: tensor(0.1407, device='cuda:0')
reward_raw: tensor(0.4628, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.1347, device='cuda:0')
reward_raw: tensor(0.4737, device='cuda:0') reward: tensor(1.3385, device='cuda:0') factor: tensor(0.1352, device='cuda:0')
reward_raw: tensor(0.4764, device='cuda:0') reward: tensor(1.3360, device='cuda:0') factor: tensor(0.1405, device='cuda:0')


Epoch 1, Step 1303: Loss = 0.0407, Actor Loss = 0.0014, Critic Loss = 0.0788, Entropy = 0.0617, Returns = 1.3102, Value = 1.3833, mask_loss = 0.1233std_loss = 0.2945:  24%|██▍       | 1303/5475 [5:50:51<18:31:07, 15.98s/it]  

ratio tensor(1.6043, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5740, device='cuda:0') reward: tensor(1.4483, device='cuda:0') factor: tensor(0.1257, device='cuda:0')
reward_raw: tensor(0.5824, device='cuda:0') reward: tensor(1.4581, device='cuda:0') factor: tensor(0.1244, device='cuda:0')
reward_raw: tensor(0.5530, device='cuda:0') reward: tensor(1.4267, device='cuda:0') factor: tensor(0.1263, device='cuda:0')
reward_raw: tensor(0.5890, device='cuda:0') reward: tensor(1.4629, device='cuda:0') factor: tensor(0.1261, device='cuda:0')
reward_raw: tensor(0.5588, device='cuda:0') reward: tensor(1.4331, device='cuda:0') factor: tensor(0.1256, device='cuda:0')


Epoch 1, Step 1304: Loss = -0.0823, Actor Loss = -0.1085, Critic Loss = 0.0525, Entropy = 0.0731, Returns = 1.4255, Value = 1.4282, mask_loss = 0.1485std_loss = 0.3147:  24%|██▍       | 1304/5475 [5:51:07<18:21:48, 15.85s/it]

ratio tensor(32.2441, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5042, device='cuda:0') reward: tensor(1.3802, device='cuda:0') factor: tensor(0.1239, device='cuda:0')
reward_raw: tensor(0.4896, device='cuda:0') reward: tensor(1.3698, device='cuda:0') factor: tensor(0.1198, device='cuda:0')
reward_raw: tensor(0.5086, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1260, device='cuda:0')
reward_raw: tensor(0.4908, device='cuda:0') reward: tensor(1.3623, device='cuda:0') factor: tensor(0.1284, device='cuda:0')
reward_raw: tensor(0.4956, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.1253, device='cuda:0')


Epoch 1, Step 1305: Loss = 0.1817, Actor Loss = 0.1612, Critic Loss = 0.0411, Entropy = 0.0540, Returns = 1.3764, Value = 1.4027, mask_loss = 0.1108std_loss = 0.2765:  24%|██▍       | 1305/5475 [5:51:22<18:07:43, 15.65s/it]  

ratio tensor(0.6992, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.4056, device='cuda:0') factor: tensor(0.1184, device='cuda:0')
reward_raw: tensor(0.5476, device='cuda:0') reward: tensor(1.4295, device='cuda:0') factor: tensor(0.1181, device='cuda:0')
reward_raw: tensor(0.5262, device='cuda:0') reward: tensor(1.4117, device='cuda:0') factor: tensor(0.1144, device='cuda:0')
reward_raw: tensor(0.5406, device='cuda:0') reward: tensor(1.4321, device='cuda:0') factor: tensor(0.1085, device='cuda:0')
reward_raw: tensor(0.5645, device='cuda:0') reward: tensor(1.4448, device='cuda:0') factor: tensor(0.1197, device='cuda:0')


Epoch 1, Step 1306: Loss = -0.0632, Actor Loss = -0.0950, Critic Loss = 0.0637, Entropy = 0.0710, Returns = 1.4053, Value = 1.4205, mask_loss = 0.1349std_loss = 0.3003:  24%|██▍       | 1306/5475 [5:51:37<17:58:58, 15.53s/it]

ratio tensor(1.3347, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5087, device='cuda:0') reward: tensor(1.3577, device='cuda:0') factor: tensor(0.1511, device='cuda:0')
reward_raw: tensor(0.5044, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1509, device='cuda:0')
reward_raw: tensor(0.5043, device='cuda:0') reward: tensor(1.3536, device='cuda:0') factor: tensor(0.1506, device='cuda:0')
reward_raw: tensor(0.5190, device='cuda:0') reward: tensor(1.3629, device='cuda:0') factor: tensor(0.1560, device='cuda:0')
reward_raw: tensor(0.4917, device='cuda:0') reward: tensor(1.3377, device='cuda:0') factor: tensor(0.1540, device='cuda:0')


Epoch 1, Step 1307: Loss = 0.1054, Actor Loss = 0.0818, Critic Loss = 0.0472, Entropy = 0.0558, Returns = 1.3247, Value = 1.4243, mask_loss = 0.1134std_loss = 0.2891:  24%|██▍       | 1307/5475 [5:51:51<17:18:57, 14.96s/it]  

ratio tensor(0.8436, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5267, device='cuda:0') reward: tensor(1.4081, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.5448, device='cuda:0') reward: tensor(1.4221, device='cuda:0') factor: tensor(0.1228, device='cuda:0')
reward_raw: tensor(0.5142, device='cuda:0') reward: tensor(1.3959, device='cuda:0') factor: tensor(0.1182, device='cuda:0')
reward_raw: tensor(0.5313, device='cuda:0') reward: tensor(1.4055, device='cuda:0') factor: tensor(0.1259, device='cuda:0')
reward_raw: tensor(0.5164, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.1213, device='cuda:0')


Epoch 1, Step 1308: Loss = -0.0810, Actor Loss = -0.1147, Critic Loss = 0.0676, Entropy = 0.0653, Returns = 1.4265, Value = 1.3657, mask_loss = 0.1182std_loss = 0.2880:  24%|██▍       | 1308/5475 [5:52:07<17:34:36, 15.19s/it]

ratio tensor(1.2052, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4863, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.1217, device='cuda:0')
reward_raw: tensor(0.4742, device='cuda:0') reward: tensor(1.3421, device='cuda:0') factor: tensor(0.1321, device='cuda:0')
reward_raw: tensor(0.4532, device='cuda:0') reward: tensor(1.3173, device='cuda:0') factor: tensor(0.1359, device='cuda:0')
reward_raw: tensor(0.4643, device='cuda:0') reward: tensor(1.3352, device='cuda:0') factor: tensor(0.1291, device='cuda:0')
reward_raw: tensor(0.4838, device='cuda:0') reward: tensor(1.3546, device='cuda:0') factor: tensor(0.1292, device='cuda:0')


Epoch 1, Step 1309: Loss = 0.0545, Actor Loss = 0.0284, Critic Loss = 0.0522, Entropy = 0.0623, Returns = 1.3775, Value = 1.2852, mask_loss = 0.1241std_loss = 0.2998:  24%|██▍       | 1309/5475 [5:52:24<18:12:44, 15.74s/it]  

ratio tensor(1.0259, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(1.3844, device='cuda:0') factor: tensor(0.1371, device='cuda:0')
reward_raw: tensor(0.5078, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.1382, device='cuda:0')
reward_raw: tensor(0.5097, device='cuda:0') reward: tensor(1.3713, device='cuda:0') factor: tensor(0.1383, device='cuda:0')
reward_raw: tensor(0.5115, device='cuda:0') reward: tensor(1.3676, device='cuda:0') factor: tensor(0.1439, device='cuda:0')
reward_raw: tensor(0.5066, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.1387, device='cuda:0')


Epoch 1, Step 1310: Loss = -0.1589, Actor Loss = -0.1924, Critic Loss = 0.0672, Entropy = 0.0757, Returns = 1.4008, Value = 1.3754, mask_loss = 0.1502std_loss = 0.3158:  24%|██▍       | 1310/5475 [5:52:38<17:51:55, 15.44s/it]

ratio tensor(1.2791, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4209, device='cuda:0') reward: tensor(1.2888, device='cuda:0') factor: tensor(0.1321, device='cuda:0')
reward_raw: tensor(0.4608, device='cuda:0') reward: tensor(1.3269, device='cuda:0') factor: tensor(0.1339, device='cuda:0')
reward_raw: tensor(0.4537, device='cuda:0') reward: tensor(1.3196, device='cuda:0') factor: tensor(0.1341, device='cuda:0')
reward_raw: tensor(0.4387, device='cuda:0') reward: tensor(1.3106, device='cuda:0') factor: tensor(0.1281, device='cuda:0')
reward_raw: tensor(0.4464, device='cuda:0') reward: tensor(1.3147, device='cuda:0') factor: tensor(0.1317, device='cuda:0')


Epoch 1, Step 1311: Loss = 0.0743, Actor Loss = 0.0487, Critic Loss = 0.0514, Entropy = 0.0630, Returns = 1.3132, Value = 1.3008, mask_loss = 0.1293std_loss = 0.3002:  24%|██▍       | 1311/5475 [5:52:52<17:10:00, 14.84s/it]  

ratio tensor(0.7372, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5769, device='cuda:0') reward: tensor(1.4688, device='cuda:0') factor: tensor(0.1081, device='cuda:0')
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.4567, device='cuda:0') factor: tensor(0.1093, device='cuda:0')
reward_raw: tensor(0.5824, device='cuda:0') reward: tensor(1.4727, device='cuda:0') factor: tensor(0.1098, device='cuda:0')
reward_raw: tensor(0.5668, device='cuda:0') reward: tensor(1.4599, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.5818, device='cuda:0') reward: tensor(1.4748, device='cuda:0') factor: tensor(0.1070, device='cuda:0')


Epoch 1, Step 1312: Loss = -0.1629, Actor Loss = -0.1905, Critic Loss = 0.0552, Entropy = 0.0539, Returns = 1.5093, Value = 1.5463, mask_loss = 0.1010std_loss = 0.2708:  24%|██▍       | 1312/5475 [5:53:11<18:28:44, 15.98s/it]

ratio tensor(1.1751, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4270, device='cuda:0') reward: tensor(1.3215, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.4355, device='cuda:0') reward: tensor(1.3305, device='cuda:0') factor: tensor(0.1049, device='cuda:0')
reward_raw: tensor(0.4166, device='cuda:0') reward: tensor(1.3107, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.4168, device='cuda:0') reward: tensor(1.3132, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.4260, device='cuda:0') reward: tensor(1.3214, device='cuda:0') factor: tensor(0.1047, device='cuda:0')


Epoch 1, Step 1313: Loss = 0.2035, Actor Loss = 0.1673, Critic Loss = 0.0726, Entropy = 0.0572, Returns = 1.3230, Value = 1.3229, mask_loss = 0.1065std_loss = 0.2794:  24%|██▍       | 1313/5475 [5:53:27<18:31:30, 16.02s/it]  

ratio tensor(0.9874, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5309, device='cuda:0') reward: tensor(1.3896, device='cuda:0') factor: tensor(0.1412, device='cuda:0')
reward_raw: tensor(0.5056, device='cuda:0') reward: tensor(1.3664, device='cuda:0') factor: tensor(0.1392, device='cuda:0')
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.3706, device='cuda:0') factor: tensor(0.1426, device='cuda:0')
reward_raw: tensor(0.5304, device='cuda:0') reward: tensor(1.3879, device='cuda:0') factor: tensor(0.1425, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.3766, device='cuda:0') factor: tensor(0.1434, device='cuda:0')


Epoch 1, Step 1314: Loss = -0.0306, Actor Loss = -0.0591, Critic Loss = 0.0571, Entropy = 0.0840, Returns = 1.3573, Value = 1.2999, mask_loss = 0.1562std_loss = 0.3185:  24%|██▍       | 1314/5475 [5:53:41<17:51:54, 15.46s/it]

ratio tensor(1.3226, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4863, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.1212, device='cuda:0')
reward_raw: tensor(0.4834, device='cuda:0') reward: tensor(1.3617, device='cuda:0') factor: tensor(0.1217, device='cuda:0')
reward_raw: tensor(0.4971, device='cuda:0') reward: tensor(1.3750, device='cuda:0') factor: tensor(0.1221, device='cuda:0')
reward_raw: tensor(0.4837, device='cuda:0') reward: tensor(1.3585, device='cuda:0') factor: tensor(0.1251, device='cuda:0')
reward_raw: tensor(0.4718, device='cuda:0') reward: tensor(1.3476, device='cuda:0') factor: tensor(0.1242, device='cuda:0')


Epoch 1, Step 1315: Loss = -0.0515, Actor Loss = -0.0855, Critic Loss = 0.0681, Entropy = 0.0784, Returns = 1.3896, Value = 1.3423, mask_loss = 0.1269std_loss = 0.2918:  24%|██▍       | 1315/5475 [5:53:57<17:59:33, 15.57s/it]

ratio tensor(1.5325, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4640, device='cuda:0') reward: tensor(1.3119, device='cuda:0') factor: tensor(0.1521, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.3349, device='cuda:0') factor: tensor(0.1493, device='cuda:0')
reward_raw: tensor(0.4653, device='cuda:0') reward: tensor(1.3155, device='cuda:0') factor: tensor(0.1498, device='cuda:0')
reward_raw: tensor(0.4823, device='cuda:0') reward: tensor(1.3324, device='cuda:0') factor: tensor(0.1498, device='cuda:0')
reward_raw: tensor(0.4754, device='cuda:0') reward: tensor(1.3297, device='cuda:0') factor: tensor(0.1457, device='cuda:0')


Epoch 1, Step 1316: Loss = 0.0033, Actor Loss = -0.0237, Critic Loss = 0.0541, Entropy = 0.0554, Returns = 1.3671, Value = 1.3197, mask_loss = 0.1213std_loss = 0.2931:  24%|██▍       | 1316/5475 [5:54:11<17:25:54, 15.09s/it] 

ratio tensor(1.0399, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5406, device='cuda:0') reward: tensor(1.3923, device='cuda:0') factor: tensor(0.1483, device='cuda:0')
reward_raw: tensor(0.5305, device='cuda:0') reward: tensor(1.3753, device='cuda:0') factor: tensor(0.1552, device='cuda:0')
reward_raw: tensor(0.5368, device='cuda:0') reward: tensor(1.3856, device='cuda:0') factor: tensor(0.1513, device='cuda:0')
reward_raw: tensor(0.5485, device='cuda:0') reward: tensor(1.4018, device='cuda:0') factor: tensor(0.1467, device='cuda:0')
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.4040, device='cuda:0') factor: tensor(0.1552, device='cuda:0')


Epoch 1, Step 1317: Loss = -0.0986, Actor Loss = -0.1339, Critic Loss = 0.0707, Entropy = 0.0861, Returns = 1.3991, Value = 1.4386, mask_loss = 0.1645std_loss = 0.3252:  24%|██▍       | 1317/5475 [5:54:28<18:14:56, 15.80s/it]

ratio tensor(1.2064, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5838, device='cuda:0') reward: tensor(1.4655, device='cuda:0') factor: tensor(0.1183, device='cuda:0')
reward_raw: tensor(0.5814, device='cuda:0') reward: tensor(1.4568, device='cuda:0') factor: tensor(0.1246, device='cuda:0')
reward_raw: tensor(0.5904, device='cuda:0') reward: tensor(1.4649, device='cuda:0') factor: tensor(0.1255, device='cuda:0')
reward_raw: tensor(0.6052, device='cuda:0') reward: tensor(1.4846, device='cuda:0') factor: tensor(0.1206, device='cuda:0')
reward_raw: tensor(0.5934, device='cuda:0') reward: tensor(1.4684, device='cuda:0') factor: tensor(0.1250, device='cuda:0')


Epoch 1, Step 1318: Loss = 0.1265, Actor Loss = 0.0758, Critic Loss = 0.1015, Entropy = 0.0643, Returns = 1.3880, Value = 1.4985, mask_loss = 0.1422std_loss = 0.3153:  24%|██▍       | 1318/5475 [5:54:42<17:37:23, 15.26s/it]  

ratio tensor(1.4120, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4652, device='cuda:0') reward: tensor(1.2742, device='cuda:0') factor: tensor(0.1910, device='cuda:0')
reward_raw: tensor(0.4676, device='cuda:0') reward: tensor(1.2728, device='cuda:0') factor: tensor(0.1948, device='cuda:0')
reward_raw: tensor(0.4661, device='cuda:0') reward: tensor(1.2757, device='cuda:0') factor: tensor(0.1904, device='cuda:0')
reward_raw: tensor(0.4869, device='cuda:0') reward: tensor(1.2936, device='cuda:0') factor: tensor(0.1933, device='cuda:0')
reward_raw: tensor(0.4810, device='cuda:0') reward: tensor(1.2837, device='cuda:0') factor: tensor(0.1974, device='cuda:0')


Epoch 1, Step 1319: Loss = 0.3744, Actor Loss = 0.3500, Critic Loss = 0.0490, Entropy = 0.0993, Returns = 1.2117, Value = 1.2012, mask_loss = 0.1851std_loss = 0.3144:  24%|██▍       | 1319/5475 [5:54:59<18:18:06, 15.85s/it]

ratio tensor(0.6491, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5288, device='cuda:0') reward: tensor(1.3926, device='cuda:0') factor: tensor(0.1363, device='cuda:0')
reward_raw: tensor(0.5342, device='cuda:0') reward: tensor(1.3990, device='cuda:0') factor: tensor(0.1352, device='cuda:0')
reward_raw: tensor(0.5294, device='cuda:0') reward: tensor(1.3936, device='cuda:0') factor: tensor(0.1358, device='cuda:0')
reward_raw: tensor(0.5099, device='cuda:0') reward: tensor(1.3841, device='cuda:0') factor: tensor(0.1258, device='cuda:0')
reward_raw: tensor(0.5166, device='cuda:0') reward: tensor(1.3832, device='cuda:0') factor: tensor(0.1334, device='cuda:0')


Epoch 1, Step 1320: Loss = -0.3432, Actor Loss = -0.3632, Critic Loss = 0.0400, Entropy = 0.0700, Returns = 1.4171, Value = 1.4447, mask_loss = 0.1221std_loss = 0.2914:  24%|██▍       | 1320/5475 [5:55:11<17:01:01, 14.74s/it]

ratio tensor(1.6438, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5589, device='cuda:0') reward: tensor(1.4207, device='cuda:0') factor: tensor(0.1382, device='cuda:0')
reward_raw: tensor(0.5390, device='cuda:0') reward: tensor(1.4003, device='cuda:0') factor: tensor(0.1387, device='cuda:0')
reward_raw: tensor(0.5423, device='cuda:0') reward: tensor(1.4038, device='cuda:0') factor: tensor(0.1385, device='cuda:0')
reward_raw: tensor(0.5549, device='cuda:0') reward: tensor(1.4146, device='cuda:0') factor: tensor(0.1403, device='cuda:0')
reward_raw: tensor(0.5453, device='cuda:0') reward: tensor(1.4092, device='cuda:0') factor: tensor(0.1361, device='cuda:0')


Epoch 1, Step 1321: Loss = 0.1419, Actor Loss = 0.1148, Critic Loss = 0.0544, Entropy = 0.0764, Returns = 1.4265, Value = 1.4314, mask_loss = 0.1478std_loss = 0.3095:  24%|██▍       | 1320/5475 [5:55:29<17:01:01, 14.74s/it]  

ratio tensor(0.7783, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1321: Loss = 0.1419, Actor Loss = 0.1148, Critic Loss = 0.0544, Entropy = 0.0764, Returns = 1.4265, Value = 1.4314, mask_loss = 0.1478std_loss = 0.3095:  24%|██▍       | 1321/5475 [5:55:30<18:09:27, 15.74s/it]

reward_raw: tensor(0.5649, device='cuda:0') reward: tensor(1.4032, device='cuda:0') factor: tensor(0.1617, device='cuda:0')
reward_raw: tensor(0.5582, device='cuda:0') reward: tensor(1.3950, device='cuda:0') factor: tensor(0.1632, device='cuda:0')
reward_raw: tensor(0.5561, device='cuda:0') reward: tensor(1.3897, device='cuda:0') factor: tensor(0.1664, device='cuda:0')
reward_raw: tensor(0.5204, device='cuda:0') reward: tensor(1.3586, device='cuda:0') factor: tensor(0.1618, device='cuda:0')
reward_raw: tensor(0.5533, device='cuda:0') reward: tensor(1.3849, device='cuda:0') factor: tensor(0.1684, device='cuda:0')


Epoch 1, Step 1322: Loss = 0.1669, Actor Loss = 0.1226, Critic Loss = 0.0888, Entropy = 0.1017, Returns = 1.3409, Value = 1.2989, mask_loss = 0.1743std_loss = 0.3261:  24%|██▍       | 1322/5475 [5:55:47<18:43:32, 16.23s/it]

ratio tensor(0.8177, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5715, device='cuda:0') reward: tensor(1.4266, device='cuda:0') factor: tensor(0.1448, device='cuda:0')
reward_raw: tensor(0.5550, device='cuda:0') reward: tensor(1.4136, device='cuda:0') factor: tensor(0.1414, device='cuda:0')
reward_raw: tensor(0.5527, device='cuda:0') reward: tensor(1.4089, device='cuda:0') factor: tensor(0.1439, device='cuda:0')
reward_raw: tensor(0.5615, device='cuda:0') reward: tensor(1.4200, device='cuda:0') factor: tensor(0.1414, device='cuda:0')
reward_raw: tensor(0.5699, device='cuda:0') reward: tensor(1.4265, device='cuda:0') factor: tensor(0.1434, device='cuda:0')


Epoch 1, Step 1323: Loss = -0.1186, Actor Loss = -0.1577, Critic Loss = 0.0782, Entropy = 0.0549, Returns = 1.4113, Value = 1.5824, mask_loss = 0.1306std_loss = 0.2985:  24%|██▍       | 1323/5475 [5:56:01<18:09:07, 15.74s/it]

ratio tensor(1.5743, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5179, device='cuda:0') reward: tensor(1.3966, device='cuda:0') factor: tensor(0.1213, device='cuda:0')
reward_raw: tensor(0.4859, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.1156, device='cuda:0')
reward_raw: tensor(0.4790, device='cuda:0') reward: tensor(1.3623, device='cuda:0') factor: tensor(0.1167, device='cuda:0')
reward_raw: tensor(0.5135, device='cuda:0') reward: tensor(1.3891, device='cuda:0') factor: tensor(0.1244, device='cuda:0')
reward_raw: tensor(0.4974, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.1209, device='cuda:0')


Epoch 1, Step 1324: Loss = 0.1461, Actor Loss = 0.1218, Critic Loss = 0.0488, Entropy = 0.0668, Returns = 1.3633, Value = 1.2902, mask_loss = 0.1334std_loss = 0.3042:  24%|██▍       | 1324/5475 [5:56:16<17:35:20, 15.25s/it]  

ratio tensor(0.9240, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5229, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.1451, device='cuda:0')
reward_raw: tensor(0.4752, device='cuda:0') reward: tensor(1.3257, device='cuda:0') factor: tensor(0.1495, device='cuda:0')
reward_raw: tensor(0.4774, device='cuda:0') reward: tensor(1.3352, device='cuda:0') factor: tensor(0.1423, device='cuda:0')
reward_raw: tensor(0.4885, device='cuda:0') reward: tensor(1.3453, device='cuda:0') factor: tensor(0.1432, device='cuda:0')
reward_raw: tensor(0.4664, device='cuda:0') reward: tensor(1.3209, device='cuda:0') factor: tensor(0.1455, device='cuda:0')


Epoch 1, Step 1325: Loss = 0.0012, Actor Loss = -0.0539, Critic Loss = 0.1103, Entropy = 0.0701, Returns = 1.3554, Value = 1.2229, mask_loss = 0.1365std_loss = 0.3002:  24%|██▍       | 1325/5475 [5:56:33<18:16:03, 15.85s/it]

ratio tensor(1.2460, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4730, device='cuda:0') reward: tensor(1.3702, device='cuda:0') factor: tensor(0.1027, device='cuda:0')
reward_raw: tensor(0.4618, device='cuda:0') reward: tensor(1.3622, device='cuda:0') factor: tensor(0.0995, device='cuda:0')
reward_raw: tensor(0.4802, device='cuda:0') reward: tensor(1.3799, device='cuda:0') factor: tensor(0.1003, device='cuda:0')
reward_raw: tensor(0.4914, device='cuda:0') reward: tensor(1.3934, device='cuda:0') factor: tensor(0.0980, device='cuda:0')
reward_raw: tensor(0.4821, device='cuda:0') reward: tensor(1.3818, device='cuda:0') factor: tensor(0.1003, device='cuda:0')


Epoch 1, Step 1326: Loss = -0.2300, Actor Loss = -0.2786, Critic Loss = 0.0973, Entropy = 0.0492, Returns = 1.3683, Value = 1.4622, mask_loss = 0.0970std_loss = 0.2697:  24%|██▍       | 1326/5475 [5:56:51<19:00:01, 16.49s/it]

ratio tensor(1.5626, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6060, device='cuda:0') reward: tensor(1.4702, device='cuda:0') factor: tensor(0.1358, device='cuda:0')
reward_raw: tensor(0.6044, device='cuda:0') reward: tensor(1.4714, device='cuda:0') factor: tensor(0.1330, device='cuda:0')
reward_raw: tensor(0.5735, device='cuda:0') reward: tensor(1.4460, device='cuda:0') factor: tensor(0.1275, device='cuda:0')
reward_raw: tensor(0.5885, device='cuda:0') reward: tensor(1.4594, device='cuda:0') factor: tensor(0.1291, device='cuda:0')
reward_raw: tensor(0.5728, device='cuda:0') reward: tensor(1.4446, device='cuda:0') factor: tensor(0.1283, device='cuda:0')


Epoch 1, Step 1327: Loss = 0.1438, Actor Loss = 0.1084, Critic Loss = 0.0708, Entropy = 0.0643, Returns = 1.4452, Value = 1.4376, mask_loss = 0.1420std_loss = 0.3145:  24%|██▍       | 1327/5475 [5:57:03<17:40:26, 15.34s/it]  

ratio tensor(0.8775, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.3697, device='cuda:0') factor: tensor(0.1454, device='cuda:0')
reward_raw: tensor(0.5014, device='cuda:0') reward: tensor(1.3572, device='cuda:0') factor: tensor(0.1443, device='cuda:0')
reward_raw: tensor(0.4893, device='cuda:0') reward: tensor(1.3463, device='cuda:0') factor: tensor(0.1431, device='cuda:0')
reward_raw: tensor(0.5288, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1462, device='cuda:0')
reward_raw: tensor(0.4973, device='cuda:0') reward: tensor(1.3546, device='cuda:0') factor: tensor(0.1427, device='cuda:0')


Epoch 1, Step 1328: Loss = 0.2214, Actor Loss = 0.1379, Critic Loss = 0.1671, Entropy = 0.0795, Returns = 1.3370, Value = 1.3151, mask_loss = 0.1344std_loss = 0.2929:  24%|██▍       | 1328/5475 [5:57:18<17:22:58, 15.09s/it]

ratio tensor(1.5777, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5347, device='cuda:0') reward: tensor(1.3442, device='cuda:0') factor: tensor(0.1905, device='cuda:0')
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.3346, device='cuda:0') factor: tensor(0.1906, device='cuda:0')
reward_raw: tensor(0.5025, device='cuda:0') reward: tensor(1.3187, device='cuda:0') factor: tensor(0.1838, device='cuda:0')
reward_raw: tensor(0.5114, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.1823, device='cuda:0')
reward_raw: tensor(0.5019, device='cuda:0') reward: tensor(1.3161, device='cuda:0') factor: tensor(0.1858, device='cuda:0')


Epoch 1, Step 1329: Loss = 0.2519, Actor Loss = 0.2191, Critic Loss = 0.0658, Entropy = 0.1005, Returns = 1.3062, Value = 1.3622, mask_loss = 0.1935std_loss = 0.3453:  24%|██▍       | 1329/5475 [5:57:30<16:21:27, 14.20s/it]

ratio tensor(0.5014, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4852, device='cuda:0') reward: tensor(1.3369, device='cuda:0') factor: tensor(0.1484, device='cuda:0')
reward_raw: tensor(0.4726, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.1458, device='cuda:0')
reward_raw: tensor(0.4758, device='cuda:0') reward: tensor(1.3305, device='cuda:0') factor: tensor(0.1452, device='cuda:0')
reward_raw: tensor(0.4836, device='cuda:0') reward: tensor(1.3327, device='cuda:0') factor: tensor(0.1510, device='cuda:0')
reward_raw: tensor(0.4770, device='cuda:0') reward: tensor(1.3276, device='cuda:0') factor: tensor(0.1494, device='cuda:0')


Epoch 1, Step 1330: Loss = 0.1343, Actor Loss = 0.1015, Critic Loss = 0.0657, Entropy = 0.0911, Returns = 1.2731, Value = 1.3645, mask_loss = 0.1510std_loss = 0.3104:  24%|██▍       | 1330/5475 [5:57:48<17:46:05, 15.43s/it]

ratio tensor(1.2566, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4958, device='cuda:0') reward: tensor(1.3523, device='cuda:0') factor: tensor(0.1435, device='cuda:0')
reward_raw: tensor(0.4930, device='cuda:0') reward: tensor(1.3534, device='cuda:0') factor: tensor(0.1396, device='cuda:0')
reward_raw: tensor(0.5028, device='cuda:0') reward: tensor(1.3644, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.5049, device='cuda:0') reward: tensor(1.3684, device='cuda:0') factor: tensor(0.1365, device='cuda:0')
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.3555, device='cuda:0') factor: tensor(0.1436, device='cuda:0')


Epoch 1, Step 1331: Loss = 0.0288, Actor Loss = 0.0009, Critic Loss = 0.0559, Entropy = 0.0760, Returns = 1.2976, Value = 1.4150, mask_loss = 0.1505std_loss = 0.3143:  24%|██▍       | 1331/5475 [5:58:06<18:28:57, 16.06s/it]

ratio tensor(0.7644, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.4206, device='cuda:0') factor: tensor(0.1291, device='cuda:0')
reward_raw: tensor(0.5495, device='cuda:0') reward: tensor(1.4180, device='cuda:0') factor: tensor(0.1315, device='cuda:0')
reward_raw: tensor(0.5591, device='cuda:0') reward: tensor(1.4274, device='cuda:0') factor: tensor(0.1317, device='cuda:0')
reward_raw: tensor(0.5450, device='cuda:0') reward: tensor(1.4066, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.5270, device='cuda:0') reward: tensor(1.3934, device='cuda:0') factor: tensor(0.1336, device='cuda:0')


Epoch 1, Step 1332: Loss = -0.0249, Actor Loss = -0.0554, Critic Loss = 0.0613, Entropy = 0.0564, Returns = 1.3948, Value = 1.3049, mask_loss = 0.1223std_loss = 0.2985:  24%|██▍       | 1332/5475 [5:58:19<17:34:26, 15.27s/it]

ratio tensor(1.8616, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6212, device='cuda:0') reward: tensor(1.4856, device='cuda:0') factor: tensor(0.1355, device='cuda:0')
reward_raw: tensor(0.5970, device='cuda:0') reward: tensor(1.4611, device='cuda:0') factor: tensor(0.1358, device='cuda:0')
reward_raw: tensor(0.6024, device='cuda:0') reward: tensor(1.4659, device='cuda:0') factor: tensor(0.1364, device='cuda:0')
reward_raw: tensor(0.5956, device='cuda:0') reward: tensor(1.4574, device='cuda:0') factor: tensor(0.1381, device='cuda:0')
reward_raw: tensor(0.5923, device='cuda:0') reward: tensor(1.4601, device='cuda:0') factor: tensor(0.1323, device='cuda:0')


Epoch 1, Step 1333: Loss = -0.0529, Actor Loss = -0.0757, Critic Loss = 0.0457, Entropy = 0.0679, Returns = 1.5177, Value = 1.4951, mask_loss = 0.1325std_loss = 0.3053:  24%|██▍       | 1333/5475 [5:58:32<16:44:27, 14.55s/it]

ratio tensor(1.7001, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5027, device='cuda:0') reward: tensor(1.3814, device='cuda:0') factor: tensor(0.1213, device='cuda:0')
reward_raw: tensor(0.4811, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.1137, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.3662, device='cuda:0') factor: tensor(0.1194, device='cuda:0')
reward_raw: tensor(0.4851, device='cuda:0') reward: tensor(1.3657, device='cuda:0') factor: tensor(0.1194, device='cuda:0')
reward_raw: tensor(0.4728, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1210, device='cuda:0')


Epoch 1, Step 1334: Loss = 0.1181, Actor Loss = 0.0936, Critic Loss = 0.0492, Entropy = 0.0847, Returns = 1.3440, Value = 1.4252, mask_loss = 0.1368std_loss = 0.3023:  24%|██▍       | 1334/5475 [5:58:49<17:39:20, 15.35s/it]  

ratio tensor(0.6026, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4889, device='cuda:0') reward: tensor(1.3280, device='cuda:0') factor: tensor(0.1609, device='cuda:0')
reward_raw: tensor(0.4921, device='cuda:0') reward: tensor(1.3259, device='cuda:0') factor: tensor(0.1662, device='cuda:0')
reward_raw: tensor(0.4823, device='cuda:0') reward: tensor(1.3186, device='cuda:0') factor: tensor(0.1636, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3142, device='cuda:0') factor: tensor(0.1636, device='cuda:0')
reward_raw: tensor(0.4779, device='cuda:0') reward: tensor(1.3171, device='cuda:0') factor: tensor(0.1608, device='cuda:0')


Epoch 1, Step 1335: Loss = 0.1063, Actor Loss = 0.0846, Critic Loss = 0.0437, Entropy = 0.1169, Returns = 1.3585, Value = 1.2938, mask_loss = 0.1857std_loss = 0.3321:  24%|██▍       | 1335/5475 [5:59:03<16:51:52, 14.66s/it]

ratio tensor(0.8133, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5512, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.1650, device='cuda:0')
reward_raw: tensor(0.5586, device='cuda:0') reward: tensor(1.3940, device='cuda:0') factor: tensor(0.1646, device='cuda:0')
reward_raw: tensor(0.5398, device='cuda:0') reward: tensor(1.3823, device='cuda:0') factor: tensor(0.1575, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.3875, device='cuda:0') factor: tensor(0.1542, device='cuda:0')
reward_raw: tensor(0.5384, device='cuda:0') reward: tensor(1.3804, device='cuda:0') factor: tensor(0.1580, device='cuda:0')


Epoch 1, Step 1336: Loss = -0.0045, Actor Loss = -0.0403, Critic Loss = 0.0718, Entropy = 0.0779, Returns = 1.3440, Value = 1.4489, mask_loss = 0.1347std_loss = 0.2981:  24%|██▍       | 1336/5475 [5:59:18<16:57:59, 14.76s/it]

ratio tensor(0.9979, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5342, device='cuda:0') reward: tensor(1.3901, device='cuda:0') factor: tensor(0.1441, device='cuda:0')
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.3886, device='cuda:0') factor: tensor(0.1468, device='cuda:0')
reward_raw: tensor(0.5411, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.1426, device='cuda:0')
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.3852, device='cuda:0') factor: tensor(0.1439, device='cuda:0')
reward_raw: tensor(0.5365, device='cuda:0') reward: tensor(1.3891, device='cuda:0') factor: tensor(0.1474, device='cuda:0')


Epoch 1, Step 1337: Loss = -0.0260, Actor Loss = -0.0510, Critic Loss = 0.0501, Entropy = 0.0605, Returns = 1.4069, Value = 1.3712, mask_loss = 0.1333std_loss = 0.2989:  24%|██▍       | 1337/5475 [5:59:42<20:09:24, 17.54s/it]

ratio tensor(0.9756, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5552, device='cuda:0') reward: tensor(1.4141, device='cuda:0') factor: tensor(0.1410, device='cuda:0')
reward_raw: tensor(0.5495, device='cuda:0') reward: tensor(1.4101, device='cuda:0') factor: tensor(0.1394, device='cuda:0')
reward_raw: tensor(0.5378, device='cuda:0') reward: tensor(1.3987, device='cuda:0') factor: tensor(0.1391, device='cuda:0')
reward_raw: tensor(0.5688, device='cuda:0') reward: tensor(1.4285, device='cuda:0') factor: tensor(0.1404, device='cuda:0')
reward_raw: tensor(0.5382, device='cuda:0') reward: tensor(1.3962, device='cuda:0') factor: tensor(0.1420, device='cuda:0')


Epoch 1, Step 1338: Loss = 0.0504, Actor Loss = 0.0202, Critic Loss = 0.0606, Entropy = 0.0769, Returns = 1.4445, Value = 1.4056, mask_loss = 0.1405std_loss = 0.3111:  24%|██▍       | 1338/5475 [5:59:59<20:06:14, 17.49s/it]  

ratio tensor(1.2696, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5667, device='cuda:0') reward: tensor(1.4295, device='cuda:0') factor: tensor(0.1372, device='cuda:0')
reward_raw: tensor(0.5621, device='cuda:0') reward: tensor(1.4244, device='cuda:0') factor: tensor(0.1377, device='cuda:0')
reward_raw: tensor(0.5689, device='cuda:0') reward: tensor(1.4250, device='cuda:0') factor: tensor(0.1439, device='cuda:0')
reward_raw: tensor(0.5485, device='cuda:0') reward: tensor(1.4133, device='cuda:0') factor: tensor(0.1353, device='cuda:0')
reward_raw: tensor(0.5675, device='cuda:0') reward: tensor(1.4314, device='cuda:0') factor: tensor(0.1361, device='cuda:0')


Epoch 1, Step 1339: Loss = -0.1185, Actor Loss = -0.1604, Critic Loss = 0.0840, Entropy = 0.0662, Returns = 1.4469, Value = 1.4939, mask_loss = 0.1576std_loss = 0.3299:  24%|██▍       | 1339/5475 [6:00:13<18:53:00, 16.44s/it]

ratio tensor(1.8643, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5001, device='cuda:0') reward: tensor(1.3633, device='cuda:0') factor: tensor(0.1368, device='cuda:0')
reward_raw: tensor(0.5037, device='cuda:0') reward: tensor(1.3697, device='cuda:0') factor: tensor(0.1340, device='cuda:0')
reward_raw: tensor(0.4897, device='cuda:0') reward: tensor(1.3552, device='cuda:0') factor: tensor(0.1345, device='cuda:0')
reward_raw: tensor(0.4890, device='cuda:0') reward: tensor(1.3588, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.5078, device='cuda:0') reward: tensor(1.3726, device='cuda:0') factor: tensor(0.1353, device='cuda:0')


Epoch 1, Step 1340: Loss = 0.2432, Actor Loss = 0.2174, Critic Loss = 0.0518, Entropy = 0.0645, Returns = 1.2745, Value = 1.3121, mask_loss = 0.1289std_loss = 0.2952:  24%|██▍       | 1340/5475 [6:00:31<19:25:26, 16.91s/it]  

ratio tensor(0.9107, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5973, device='cuda:0') reward: tensor(1.4663, device='cuda:0') factor: tensor(0.1310, device='cuda:0')
reward_raw: tensor(0.5666, device='cuda:0') reward: tensor(1.4296, device='cuda:0') factor: tensor(0.1370, device='cuda:0')
reward_raw: tensor(0.5737, device='cuda:0') reward: tensor(1.4382, device='cuda:0') factor: tensor(0.1355, device='cuda:0')
reward_raw: tensor(0.5622, device='cuda:0') reward: tensor(1.4251, device='cuda:0') factor: tensor(0.1371, device='cuda:0')
reward_raw: tensor(0.5820, device='cuda:0') reward: tensor(1.4487, device='cuda:0') factor: tensor(0.1334, device='cuda:0')


Epoch 1, Step 1341: Loss = -0.0748, Actor Loss = -0.1075, Critic Loss = 0.0654, Entropy = 0.0845, Returns = 1.4119, Value = 1.4429, mask_loss = 0.1358std_loss = 0.3002:  24%|██▍       | 1340/5475 [6:00:46<19:25:26, 16.91s/it]

ratio tensor(0.9503, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1341: Loss = -0.0748, Actor Loss = -0.1075, Critic Loss = 0.0654, Entropy = 0.0845, Returns = 1.4119, Value = 1.4429, mask_loss = 0.1358std_loss = 0.3002:  24%|██▍       | 1341/5475 [6:00:47<19:00:20, 16.55s/it]

reward_raw: tensor(0.4957, device='cuda:0') reward: tensor(1.3624, device='cuda:0') factor: tensor(0.1333, device='cuda:0')
reward_raw: tensor(0.4752, device='cuda:0') reward: tensor(1.3444, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.4731, device='cuda:0') reward: tensor(1.3397, device='cuda:0') factor: tensor(0.1334, device='cuda:0')
reward_raw: tensor(0.4774, device='cuda:0') reward: tensor(1.3408, device='cuda:0') factor: tensor(0.1366, device='cuda:0')
reward_raw: tensor(0.4738, device='cuda:0') reward: tensor(1.3380, device='cuda:0') factor: tensor(0.1357, device='cuda:0')


Epoch 1, Step 1342: Loss = 0.0074, Actor Loss = -0.0307, Critic Loss = 0.0764, Entropy = 0.0871, Returns = 1.4052, Value = 1.3030, mask_loss = 0.1311std_loss = 0.2948:  25%|██▍       | 1342/5475 [6:01:01<18:24:30, 16.03s/it] 

ratio tensor(1.3120, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5196, device='cuda:0') reward: tensor(1.3564, device='cuda:0') factor: tensor(0.1632, device='cuda:0')
reward_raw: tensor(0.5192, device='cuda:0') reward: tensor(1.3495, device='cuda:0') factor: tensor(0.1697, device='cuda:0')
reward_raw: tensor(0.5510, device='cuda:0') reward: tensor(1.3794, device='cuda:0') factor: tensor(0.1716, device='cuda:0')
reward_raw: tensor(0.5247, device='cuda:0') reward: tensor(1.3587, device='cuda:0') factor: tensor(0.1659, device='cuda:0')
reward_raw: tensor(0.5400, device='cuda:0') reward: tensor(1.3698, device='cuda:0') factor: tensor(0.1702, device='cuda:0')


Epoch 1, Step 1343: Loss = 0.0152, Actor Loss = -0.0258, Critic Loss = 0.0823, Entropy = 0.1026, Returns = 1.2470, Value = 1.2034, mask_loss = 0.2255std_loss = 0.3703:  25%|██▍       | 1343/5475 [6:01:13<16:49:35, 14.66s/it]

ratio tensor(1.0616, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5400, device='cuda:0') reward: tensor(1.4040, device='cuda:0') factor: tensor(0.1359, device='cuda:0')
reward_raw: tensor(0.5347, device='cuda:0') reward: tensor(1.4041, device='cuda:0') factor: tensor(0.1306, device='cuda:0')
reward_raw: tensor(0.5291, device='cuda:0') reward: tensor(1.3956, device='cuda:0') factor: tensor(0.1335, device='cuda:0')
reward_raw: tensor(0.5287, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.1425, device='cuda:0')
reward_raw: tensor(0.5198, device='cuda:0') reward: tensor(1.3882, device='cuda:0') factor: tensor(0.1316, device='cuda:0')


Epoch 1, Step 1344: Loss = -0.0644, Actor Loss = -0.1003, Critic Loss = 0.0720, Entropy = 0.0895, Returns = 1.3496, Value = 1.3955, mask_loss = 0.1475std_loss = 0.3086:  25%|██▍       | 1344/5475 [6:01:28<16:52:03, 14.70s/it]

ratio tensor(1.3037, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4477, device='cuda:0') reward: tensor(1.3139, device='cuda:0') factor: tensor(0.1337, device='cuda:0')
reward_raw: tensor(0.4638, device='cuda:0') reward: tensor(1.3302, device='cuda:0') factor: tensor(0.1337, device='cuda:0')
reward_raw: tensor(0.4760, device='cuda:0') reward: tensor(1.3408, device='cuda:0') factor: tensor(0.1353, device='cuda:0')
reward_raw: tensor(0.4721, device='cuda:0') reward: tensor(1.3413, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.4631, device='cuda:0') reward: tensor(1.3307, device='cuda:0') factor: tensor(0.1324, device='cuda:0')


Epoch 1, Step 1345: Loss = 0.2244, Actor Loss = 0.1377, Critic Loss = 0.1735, Entropy = 0.0659, Returns = 1.3732, Value = 1.3717, mask_loss = 0.1244std_loss = 0.2971:  25%|██▍       | 1345/5475 [6:01:46<18:06:15, 15.78s/it]  

ratio tensor(0.9604, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.3462, device='cuda:0') factor: tensor(0.1773, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3330, device='cuda:0') factor: tensor(0.1846, device='cuda:0')
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.3464, device='cuda:0') factor: tensor(0.1770, device='cuda:0')
reward_raw: tensor(0.5317, device='cuda:0') reward: tensor(1.3495, device='cuda:0') factor: tensor(0.1821, device='cuda:0')
reward_raw: tensor(0.5340, device='cuda:0') reward: tensor(1.3576, device='cuda:0') factor: tensor(0.1764, device='cuda:0')


Epoch 1, Step 1346: Loss = 0.1338, Actor Loss = 0.0965, Critic Loss = 0.0747, Entropy = 0.1030, Returns = 1.2827, Value = 1.4092, mask_loss = 0.1925std_loss = 0.3475:  25%|██▍       | 1346/5475 [6:02:05<19:20:38, 16.87s/it]

ratio tensor(3.4714, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5253, device='cuda:0') reward: tensor(1.3513, device='cuda:0') factor: tensor(0.1740, device='cuda:0')
reward_raw: tensor(0.5248, device='cuda:0') reward: tensor(1.3514, device='cuda:0') factor: tensor(0.1734, device='cuda:0')
reward_raw: tensor(0.4914, device='cuda:0') reward: tensor(1.3234, device='cuda:0') factor: tensor(0.1680, device='cuda:0')
reward_raw: tensor(0.5289, device='cuda:0') reward: tensor(1.3531, device='cuda:0') factor: tensor(0.1759, device='cuda:0')
reward_raw: tensor(0.5263, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.1804, device='cuda:0')


Epoch 1, Step 1347: Loss = -0.1175, Actor Loss = -0.1314, Critic Loss = 0.0280, Entropy = 0.0888, Returns = 1.3643, Value = 1.4199, mask_loss = 0.1781std_loss = 0.3404:  25%|██▍       | 1347/5475 [6:02:20<18:28:07, 16.11s/it]

ratio tensor(1.3996, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5107, device='cuda:0') reward: tensor(1.3577, device='cuda:0') factor: tensor(0.1530, device='cuda:0')
reward_raw: tensor(0.5095, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.1476, device='cuda:0')
reward_raw: tensor(0.5308, device='cuda:0') reward: tensor(1.3786, device='cuda:0') factor: tensor(0.1522, device='cuda:0')
reward_raw: tensor(0.5122, device='cuda:0') reward: tensor(1.3649, device='cuda:0') factor: tensor(0.1473, device='cuda:0')
reward_raw: tensor(0.5418, device='cuda:0') reward: tensor(1.3935, device='cuda:0') factor: tensor(0.1483, device='cuda:0')


Epoch 1, Step 1348: Loss = 0.2450, Actor Loss = 0.1954, Critic Loss = 0.0994, Entropy = 0.0947, Returns = 1.3523, Value = 1.4843, mask_loss = 0.1713std_loss = 0.3256:  25%|██▍       | 1348/5475 [6:02:33<17:30:35, 15.27s/it]  

ratio tensor(0.4621, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5348, device='cuda:0') reward: tensor(1.4020, device='cuda:0') factor: tensor(0.1328, device='cuda:0')
reward_raw: tensor(0.5324, device='cuda:0') reward: tensor(1.3987, device='cuda:0') factor: tensor(0.1337, device='cuda:0')
reward_raw: tensor(0.5226, device='cuda:0') reward: tensor(1.3914, device='cuda:0') factor: tensor(0.1312, device='cuda:0')
reward_raw: tensor(0.5588, device='cuda:0') reward: tensor(1.4252, device='cuda:0') factor: tensor(0.1336, device='cuda:0')
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.3944, device='cuda:0') factor: tensor(0.1336, device='cuda:0')


Epoch 1, Step 1349: Loss = -0.1435, Actor Loss = -0.1704, Critic Loss = 0.0538, Entropy = 0.0731, Returns = 1.3513, Value = 1.3116, mask_loss = 0.1519std_loss = 0.3121:  25%|██▍       | 1349/5475 [6:02:49<17:48:14, 15.53s/it]

ratio tensor(0.8626, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4974, device='cuda:0') reward: tensor(1.3628, device='cuda:0') factor: tensor(0.1346, device='cuda:0')
reward_raw: tensor(0.5003, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.5129, device='cuda:0') reward: tensor(1.3815, device='cuda:0') factor: tensor(0.1314, device='cuda:0')
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.3642, device='cuda:0') factor: tensor(0.1391, device='cuda:0')
reward_raw: tensor(0.5006, device='cuda:0') reward: tensor(1.3670, device='cuda:0') factor: tensor(0.1335, device='cuda:0')


Epoch 1, Step 1350: Loss = 0.2181, Actor Loss = 0.1839, Critic Loss = 0.0684, Entropy = 0.0473, Returns = 1.2967, Value = 1.4126, mask_loss = 0.0980std_loss = 0.2690:  25%|██▍       | 1350/5475 [6:03:04<17:31:11, 15.29s/it]  

ratio tensor(0.9584, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.3514, device='cuda:0') factor: tensor(0.1418, device='cuda:0')
reward_raw: tensor(0.4954, device='cuda:0') reward: tensor(1.3542, device='cuda:0') factor: tensor(0.1412, device='cuda:0')
reward_raw: tensor(0.4994, device='cuda:0') reward: tensor(1.3500, device='cuda:0') factor: tensor(0.1494, device='cuda:0')
reward_raw: tensor(0.4935, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1417, device='cuda:0')
reward_raw: tensor(0.4924, device='cuda:0') reward: tensor(1.3452, device='cuda:0') factor: tensor(0.1472, device='cuda:0')


Epoch 1, Step 1351: Loss = 0.1296, Actor Loss = 0.0859, Critic Loss = 0.0877, Entropy = 0.0924, Returns = 1.3487, Value = 1.3392, mask_loss = 0.1577std_loss = 0.3185:  25%|██▍       | 1351/5475 [6:03:16<16:29:59, 14.40s/it]

ratio tensor(0.6259, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5901, device='cuda:0') reward: tensor(1.4499, device='cuda:0') factor: tensor(0.1402, device='cuda:0')
reward_raw: tensor(0.5806, device='cuda:0') reward: tensor(1.4449, device='cuda:0') factor: tensor(0.1357, device='cuda:0')
reward_raw: tensor(0.5552, device='cuda:0') reward: tensor(1.4121, device='cuda:0') factor: tensor(0.1431, device='cuda:0')
reward_raw: tensor(0.5729, device='cuda:0') reward: tensor(1.4350, device='cuda:0') factor: tensor(0.1379, device='cuda:0')
reward_raw: tensor(0.5902, device='cuda:0') reward: tensor(1.4439, device='cuda:0') factor: tensor(0.1463, device='cuda:0')


Epoch 1, Step 1352: Loss = -0.0751, Actor Loss = -0.1278, Critic Loss = 0.1056, Entropy = 0.0960, Returns = 1.4982, Value = 1.5540, mask_loss = 0.1653std_loss = 0.3206:  25%|██▍       | 1352/5475 [6:03:30<16:11:26, 14.14s/it]

ratio tensor(2.3721, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4690, device='cuda:0') reward: tensor(1.2859, device='cuda:0') factor: tensor(0.1832, device='cuda:0')
reward_raw: tensor(0.4458, device='cuda:0') reward: tensor(1.2659, device='cuda:0') factor: tensor(0.1799, device='cuda:0')
reward_raw: tensor(0.4583, device='cuda:0') reward: tensor(1.2785, device='cuda:0') factor: tensor(0.1798, device='cuda:0')
reward_raw: tensor(0.4652, device='cuda:0') reward: tensor(1.2853, device='cuda:0') factor: tensor(0.1800, device='cuda:0')
reward_raw: tensor(0.4424, device='cuda:0') reward: tensor(1.2650, device='cuda:0') factor: tensor(0.1774, device='cuda:0')


Epoch 1, Step 1353: Loss = 0.1953, Actor Loss = 0.1466, Critic Loss = 0.0974, Entropy = 0.0624, Returns = 1.2652, Value = 1.2343, mask_loss = 0.1291std_loss = 0.3031:  25%|██▍       | 1353/5475 [6:03:56<20:20:18, 17.76s/it]  

ratio tensor(0.9158, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6597, device='cuda:0') reward: tensor(1.5272, device='cuda:0') factor: tensor(0.1325, device='cuda:0')
reward_raw: tensor(0.6594, device='cuda:0') reward: tensor(1.5302, device='cuda:0') factor: tensor(0.1291, device='cuda:0')
reward_raw: tensor(0.6669, device='cuda:0') reward: tensor(1.5379, device='cuda:0') factor: tensor(0.1290, device='cuda:0')
reward_raw: tensor(0.6819, device='cuda:0') reward: tensor(1.5516, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.6624, device='cuda:0') reward: tensor(1.5329, device='cuda:0') factor: tensor(0.1295, device='cuda:0')


Epoch 1, Step 1354: Loss = -0.1022, Actor Loss = -0.1879, Critic Loss = 0.1716, Entropy = 0.0676, Returns = 1.4905, Value = 1.7430, mask_loss = 0.1193std_loss = 0.2871:  25%|██▍       | 1354/5475 [6:04:13<20:12:53, 17.66s/it]

ratio tensor(1.4508, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4750, device='cuda:0') reward: tensor(1.3524, device='cuda:0') factor: tensor(0.1226, device='cuda:0')
reward_raw: tensor(0.4648, device='cuda:0') reward: tensor(1.3332, device='cuda:0') factor: tensor(0.1316, device='cuda:0')
reward_raw: tensor(0.4840, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.1315, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3611, device='cuda:0') factor: tensor(0.1332, device='cuda:0')
reward_raw: tensor(0.4833, device='cuda:0') reward: tensor(1.3559, device='cuda:0') factor: tensor(0.1273, device='cuda:0')


Epoch 1, Step 1355: Loss = 0.2592, Actor Loss = 0.2403, Critic Loss = 0.0379, Entropy = 0.0619, Returns = 1.3852, Value = 1.3186, mask_loss = 0.1029std_loss = 0.2691:  25%|██▍       | 1355/5475 [6:04:30<19:53:53, 17.39s/it]  

ratio tensor(0.6105, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4976, device='cuda:0') reward: tensor(1.3668, device='cuda:0') factor: tensor(0.1309, device='cuda:0')
reward_raw: tensor(0.4935, device='cuda:0') reward: tensor(1.3639, device='cuda:0') factor: tensor(0.1295, device='cuda:0')
reward_raw: tensor(0.5049, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.1286, device='cuda:0')
reward_raw: tensor(0.4984, device='cuda:0') reward: tensor(1.3718, device='cuda:0') factor: tensor(0.1266, device='cuda:0')
reward_raw: tensor(0.5014, device='cuda:0') reward: tensor(1.3733, device='cuda:0') factor: tensor(0.1282, device='cuda:0')


Epoch 1, Step 1356: Loss = -0.1662, Actor Loss = -0.1855, Critic Loss = 0.0389, Entropy = 0.0886, Returns = 1.3807, Value = 1.3679, mask_loss = 0.1328std_loss = 0.2931:  25%|██▍       | 1356/5475 [6:04:51<20:55:42, 18.29s/it]

ratio tensor(1.2276, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4796, device='cuda:0') reward: tensor(1.3108, device='cuda:0') factor: tensor(0.1688, device='cuda:0')
reward_raw: tensor(0.4661, device='cuda:0') reward: tensor(1.2956, device='cuda:0') factor: tensor(0.1705, device='cuda:0')
reward_raw: tensor(0.4500, device='cuda:0') reward: tensor(1.2776, device='cuda:0') factor: tensor(0.1724, device='cuda:0')
reward_raw: tensor(0.4570, device='cuda:0') reward: tensor(1.2865, device='cuda:0') factor: tensor(0.1705, device='cuda:0')
reward_raw: tensor(0.4557, device='cuda:0') reward: tensor(1.2848, device='cuda:0') factor: tensor(0.1709, device='cuda:0')


Epoch 1, Step 1357: Loss = 0.1364, Actor Loss = 0.1045, Critic Loss = 0.0641, Entropy = 0.1134, Returns = 1.2776, Value = 1.3369, mask_loss = 0.1949std_loss = 0.3352:  25%|██▍       | 1357/5475 [6:05:06<19:55:16, 17.42s/it]  

ratio tensor(0.8087, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4164, device='cuda:0') reward: tensor(1.2990, device='cuda:0') factor: tensor(0.1174, device='cuda:0')
reward_raw: tensor(0.4168, device='cuda:0') reward: tensor(1.2989, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.4281, device='cuda:0') reward: tensor(1.3118, device='cuda:0') factor: tensor(0.1163, device='cuda:0')
reward_raw: tensor(0.4292, device='cuda:0') reward: tensor(1.3156, device='cuda:0') factor: tensor(0.1135, device='cuda:0')
reward_raw: tensor(0.4282, device='cuda:0') reward: tensor(1.3134, device='cuda:0') factor: tensor(0.1148, device='cuda:0')


Epoch 1, Step 1358: Loss = 0.1354, Actor Loss = 0.1141, Critic Loss = 0.0427, Entropy = 0.0707, Returns = 1.2448, Value = 1.2330, mask_loss = 0.1415std_loss = 0.3085:  25%|██▍       | 1358/5475 [6:05:19<18:22:29, 16.07s/it]

ratio tensor(1.0901, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4337, device='cuda:0') reward: tensor(1.2511, device='cuda:0') factor: tensor(0.1827, device='cuda:0')
reward_raw: tensor(0.4341, device='cuda:0') reward: tensor(1.2500, device='cuda:0') factor: tensor(0.1841, device='cuda:0')
reward_raw: tensor(0.4364, device='cuda:0') reward: tensor(1.2461, device='cuda:0') factor: tensor(0.1903, device='cuda:0')
reward_raw: tensor(0.4404, device='cuda:0') reward: tensor(1.2517, device='cuda:0') factor: tensor(0.1887, device='cuda:0')
reward_raw: tensor(0.4343, device='cuda:0') reward: tensor(1.2520, device='cuda:0') factor: tensor(0.1823, device='cuda:0')


Epoch 1, Step 1359: Loss = -0.0274, Actor Loss = -0.0762, Critic Loss = 0.0979, Entropy = 0.0920, Returns = 1.3001, Value = 1.1685, mask_loss = 0.1822std_loss = 0.3295:  25%|██▍       | 1359/5475 [6:05:33<17:41:18, 15.47s/it]

ratio tensor(1.4697, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4574, device='cuda:0') reward: tensor(1.3161, device='cuda:0') factor: tensor(0.1413, device='cuda:0')
reward_raw: tensor(0.4489, device='cuda:0') reward: tensor(1.3080, device='cuda:0') factor: tensor(0.1409, device='cuda:0')
reward_raw: tensor(0.4467, device='cuda:0') reward: tensor(1.3069, device='cuda:0') factor: tensor(0.1398, device='cuda:0')
reward_raw: tensor(0.4604, device='cuda:0') reward: tensor(1.3167, device='cuda:0') factor: tensor(0.1437, device='cuda:0')
reward_raw: tensor(0.4477, device='cuda:0') reward: tensor(1.3029, device='cuda:0') factor: tensor(0.1449, device='cuda:0')


Epoch 1, Step 1360: Loss = -0.2467, Actor Loss = -0.2831, Critic Loss = 0.0730, Entropy = 0.0832, Returns = 1.3649, Value = 1.4473, mask_loss = 0.1383std_loss = 0.3049:  25%|██▍       | 1360/5475 [6:05:52<18:57:13, 16.58s/it]

ratio tensor(2.7673, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4458, device='cuda:0') reward: tensor(1.2630, device='cuda:0') factor: tensor(0.1828, device='cuda:0')
reward_raw: tensor(0.4562, device='cuda:0') reward: tensor(1.2770, device='cuda:0') factor: tensor(0.1792, device='cuda:0')
reward_raw: tensor(0.4538, device='cuda:0') reward: tensor(1.2805, device='cuda:0') factor: tensor(0.1733, device='cuda:0')
reward_raw: tensor(0.4721, device='cuda:0') reward: tensor(1.2894, device='cuda:0') factor: tensor(0.1827, device='cuda:0')
reward_raw: tensor(0.4544, device='cuda:0') reward: tensor(1.2733, device='cuda:0') factor: tensor(0.1811, device='cuda:0')


Epoch 1, Step 1361: Loss = 0.4312, Actor Loss = 0.4082, Critic Loss = 0.0463, Entropy = 0.1079, Returns = 1.2464, Value = 1.2162, mask_loss = 0.2206std_loss = 0.3490:  25%|██▍       | 1360/5475 [6:06:08<18:57:13, 16.58s/it]  

ratio tensor(0.8451, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1361: Loss = 0.4312, Actor Loss = 0.4082, Critic Loss = 0.0463, Entropy = 0.1079, Returns = 1.2464, Value = 1.2162, mask_loss = 0.2206std_loss = 0.3490:  25%|██▍       | 1361/5475 [6:06:08<18:51:59, 16.51s/it]

reward_raw: tensor(0.5674, device='cuda:0') reward: tensor(1.4216, device='cuda:0') factor: tensor(0.1457, device='cuda:0')
reward_raw: tensor(0.5637, device='cuda:0') reward: tensor(1.4141, device='cuda:0') factor: tensor(0.1496, device='cuda:0')
reward_raw: tensor(0.5753, device='cuda:0') reward: tensor(1.4284, device='cuda:0') factor: tensor(0.1469, device='cuda:0')
reward_raw: tensor(0.5754, device='cuda:0') reward: tensor(1.4253, device='cuda:0') factor: tensor(0.1501, device='cuda:0')
reward_raw: tensor(0.5559, device='cuda:0') reward: tensor(1.4125, device='cuda:0') factor: tensor(0.1434, device='cuda:0')


Epoch 1, Step 1362: Loss = -0.2535, Actor Loss = -0.2872, Critic Loss = 0.0676, Entropy = 0.0873, Returns = 1.3572, Value = 1.4292, mask_loss = 0.1547std_loss = 0.3056:  25%|██▍       | 1362/5475 [6:06:24<18:25:33, 16.13s/it]

ratio tensor(1.9430, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5982, device='cuda:0') reward: tensor(1.4592, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.5767, device='cuda:0') reward: tensor(1.4389, device='cuda:0') factor: tensor(0.1379, device='cuda:0')
reward_raw: tensor(0.5863, device='cuda:0') reward: tensor(1.4482, device='cuda:0') factor: tensor(0.1381, device='cuda:0')
reward_raw: tensor(0.6038, device='cuda:0') reward: tensor(1.4645, device='cuda:0') factor: tensor(0.1393, device='cuda:0')
reward_raw: tensor(0.5910, device='cuda:0') reward: tensor(1.4532, device='cuda:0') factor: tensor(0.1378, device='cuda:0')


Epoch 1, Step 1363: Loss = 0.1857, Actor Loss = 0.1336, Critic Loss = 0.1044, Entropy = 0.1084, Returns = 1.4011, Value = 1.5754, mask_loss = 0.1371std_loss = 0.2892:  25%|██▍       | 1363/5475 [6:06:41<18:52:22, 16.52s/it]  

ratio tensor(1.1717, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5682, device='cuda:0') reward: tensor(1.4028, device='cuda:0') factor: tensor(0.1653, device='cuda:0')
reward_raw: tensor(0.5725, device='cuda:0') reward: tensor(1.3992, device='cuda:0') factor: tensor(0.1733, device='cuda:0')
reward_raw: tensor(0.5560, device='cuda:0') reward: tensor(1.3911, device='cuda:0') factor: tensor(0.1650, device='cuda:0')
reward_raw: tensor(0.5627, device='cuda:0') reward: tensor(1.3942, device='cuda:0') factor: tensor(0.1685, device='cuda:0')
reward_raw: tensor(0.5781, device='cuda:0') reward: tensor(1.4097, device='cuda:0') factor: tensor(0.1684, device='cuda:0')


Epoch 1, Step 1364: Loss = 0.1453, Actor Loss = 0.1237, Critic Loss = 0.0434, Entropy = 0.0794, Returns = 1.3878, Value = 1.3502, mask_loss = 0.1725std_loss = 0.3345:  25%|██▍       | 1364/5475 [6:06:57<18:45:18, 16.42s/it]

ratio tensor(0.6519, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.3568, device='cuda:0') factor: tensor(0.1448, device='cuda:0')
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.3368, device='cuda:0') factor: tensor(0.1477, device='cuda:0')
reward_raw: tensor(0.5071, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.1490, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.1441, device='cuda:0')
reward_raw: tensor(0.5034, device='cuda:0') reward: tensor(1.3588, device='cuda:0') factor: tensor(0.1447, device='cuda:0')


Epoch 1, Step 1365: Loss = -0.0861, Actor Loss = -0.1134, Critic Loss = 0.0548, Entropy = 0.0885, Returns = 1.3614, Value = 1.3235, mask_loss = 0.1603std_loss = 0.3201:  25%|██▍       | 1365/5475 [6:07:18<20:15:50, 17.75s/it]

ratio tensor(6.9503, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5556, device='cuda:0') reward: tensor(1.4061, device='cuda:0') factor: tensor(0.1495, device='cuda:0')
reward_raw: tensor(0.5467, device='cuda:0') reward: tensor(1.3976, device='cuda:0') factor: tensor(0.1492, device='cuda:0')
reward_raw: tensor(0.5442, device='cuda:0') reward: tensor(1.3936, device='cuda:0') factor: tensor(0.1506, device='cuda:0')
reward_raw: tensor(0.5554, device='cuda:0') reward: tensor(1.4036, device='cuda:0') factor: tensor(0.1518, device='cuda:0')
reward_raw: tensor(0.5604, device='cuda:0') reward: tensor(1.4121, device='cuda:0') factor: tensor(0.1483, device='cuda:0')


Epoch 1, Step 1366: Loss = 0.0638, Actor Loss = 0.0396, Critic Loss = 0.0486, Entropy = 0.0713, Returns = 1.3545, Value = 1.4188, mask_loss = 0.1427std_loss = 0.3139:  25%|██▍       | 1366/5475 [6:07:30<18:07:43, 15.88s/it]  

ratio tensor(1.8508, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.1511, device='cuda:0')
reward_raw: tensor(0.5606, device='cuda:0') reward: tensor(1.4072, device='cuda:0') factor: tensor(0.1534, device='cuda:0')
reward_raw: tensor(0.5701, device='cuda:0') reward: tensor(1.4234, device='cuda:0') factor: tensor(0.1467, device='cuda:0')
reward_raw: tensor(0.5736, device='cuda:0') reward: tensor(1.4199, device='cuda:0') factor: tensor(0.1538, device='cuda:0')
reward_raw: tensor(0.5586, device='cuda:0') reward: tensor(1.4052, device='cuda:0') factor: tensor(0.1534, device='cuda:0')


Epoch 1, Step 1367: Loss = -0.0686, Actor Loss = -0.0968, Critic Loss = 0.0566, Entropy = 0.1077, Returns = 1.4261, Value = 1.3461, mask_loss = 0.1750std_loss = 0.3290:  25%|██▍       | 1367/5475 [6:07:51<20:06:34, 17.62s/it]

ratio tensor(1.1205, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.3468, device='cuda:0') factor: tensor(0.1562, device='cuda:0')
reward_raw: tensor(0.5247, device='cuda:0') reward: tensor(1.3735, device='cuda:0') factor: tensor(0.1512, device='cuda:0')
reward_raw: tensor(0.5144, device='cuda:0') reward: tensor(1.3607, device='cuda:0') factor: tensor(0.1537, device='cuda:0')
reward_raw: tensor(0.5363, device='cuda:0') reward: tensor(1.3760, device='cuda:0') factor: tensor(0.1603, device='cuda:0')
reward_raw: tensor(0.5315, device='cuda:0') reward: tensor(1.3698, device='cuda:0') factor: tensor(0.1617, device='cuda:0')


Epoch 1, Step 1368: Loss = 0.0676, Actor Loss = 0.0282, Critic Loss = 0.0790, Entropy = 0.0844, Returns = 1.3304, Value = 1.4280, mask_loss = 0.1508std_loss = 0.3163:  25%|██▍       | 1368/5475 [6:08:05<18:39:59, 16.36s/it]  

ratio tensor(1.3957, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5122, device='cuda:0') reward: tensor(1.3873, device='cuda:0') factor: tensor(0.1250, device='cuda:0')
reward_raw: tensor(0.4884, device='cuda:0') reward: tensor(1.3628, device='cuda:0') factor: tensor(0.1256, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.3711, device='cuda:0') factor: tensor(0.1310, device='cuda:0')
reward_raw: tensor(0.4836, device='cuda:0') reward: tensor(1.3541, device='cuda:0') factor: tensor(0.1295, device='cuda:0')
reward_raw: tensor(0.4850, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.1269, device='cuda:0')


Epoch 1, Step 1369: Loss = -0.0932, Actor Loss = -0.1463, Critic Loss = 0.1063, Entropy = 0.0800, Returns = 1.4198, Value = 1.2936, mask_loss = 0.1271std_loss = 0.2888:  25%|██▌       | 1369/5475 [6:08:19<17:49:01, 15.62s/it]

ratio tensor(1.1162, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4849, device='cuda:0') reward: tensor(1.3337, device='cuda:0') factor: tensor(0.1512, device='cuda:0')
reward_raw: tensor(0.4937, device='cuda:0') reward: tensor(1.3399, device='cuda:0') factor: tensor(0.1539, device='cuda:0')
reward_raw: tensor(0.4820, device='cuda:0') reward: tensor(1.3293, device='cuda:0') factor: tensor(0.1527, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.3522, device='cuda:0') factor: tensor(0.1571, device='cuda:0')
reward_raw: tensor(0.4931, device='cuda:0') reward: tensor(1.3364, device='cuda:0') factor: tensor(0.1567, device='cuda:0')


Epoch 1, Step 1370: Loss = -0.0499, Actor Loss = -0.0783, Critic Loss = 0.0570, Entropy = 0.0796, Returns = 1.3562, Value = 1.3273, mask_loss = 0.1465std_loss = 0.3127:  25%|██▌       | 1370/5475 [6:08:35<17:53:06, 15.69s/it]

ratio tensor(1.3469, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4693, device='cuda:0') reward: tensor(1.3384, device='cuda:0') factor: tensor(0.1309, device='cuda:0')
reward_raw: tensor(0.4900, device='cuda:0') reward: tensor(1.3584, device='cuda:0') factor: tensor(0.1316, device='cuda:0')
reward_raw: tensor(0.4695, device='cuda:0') reward: tensor(1.3403, device='cuda:0') factor: tensor(0.1292, device='cuda:0')
reward_raw: tensor(0.4838, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.1319, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3564, device='cuda:0') factor: tensor(0.1308, device='cuda:0')


Epoch 1, Step 1371: Loss = 0.0381, Actor Loss = -0.0069, Critic Loss = 0.0902, Entropy = 0.1023, Returns = 1.3875, Value = 1.2142, mask_loss = 0.1465std_loss = 0.2880:  25%|██▌       | 1371/5475 [6:08:50<17:40:58, 15.51s/it] 

ratio tensor(0.9007, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4573, device='cuda:0') reward: tensor(1.3149, device='cuda:0') factor: tensor(0.1424, device='cuda:0')
reward_raw: tensor(0.4665, device='cuda:0') reward: tensor(1.3250, device='cuda:0') factor: tensor(0.1415, device='cuda:0')
reward_raw: tensor(0.4560, device='cuda:0') reward: tensor(1.3113, device='cuda:0') factor: tensor(0.1447, device='cuda:0')
reward_raw: tensor(0.4369, device='cuda:0') reward: tensor(1.2935, device='cuda:0') factor: tensor(0.1434, device='cuda:0')
reward_raw: tensor(0.4680, device='cuda:0') reward: tensor(1.3183, device='cuda:0') factor: tensor(0.1497, device='cuda:0')


Epoch 1, Step 1372: Loss = -0.1827, Actor Loss = -0.2106, Critic Loss = 0.0559, Entropy = 0.0821, Returns = 1.3717, Value = 1.3459, mask_loss = 0.1430std_loss = 0.3120:  25%|██▌       | 1372/5475 [6:09:05<17:42:58, 15.54s/it]

ratio tensor(1.4059, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5210, device='cuda:0') reward: tensor(1.3515, device='cuda:0') factor: tensor(0.1695, device='cuda:0')
reward_raw: tensor(0.5302, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.1656, device='cuda:0')
reward_raw: tensor(0.5470, device='cuda:0') reward: tensor(1.3824, device='cuda:0') factor: tensor(0.1646, device='cuda:0')
reward_raw: tensor(0.5398, device='cuda:0') reward: tensor(1.3731, device='cuda:0') factor: tensor(0.1666, device='cuda:0')
reward_raw: tensor(0.5760, device='cuda:0') reward: tensor(1.4146, device='cuda:0') factor: tensor(0.1614, device='cuda:0')


Epoch 1, Step 1373: Loss = 0.1573, Actor Loss = 0.1159, Critic Loss = 0.0832, Entropy = 0.1170, Returns = 1.3365, Value = 1.4442, mask_loss = 0.1801std_loss = 0.3237:  25%|██▌       | 1373/5475 [6:09:20<17:19:11, 15.20s/it]  

ratio tensor(0.8144, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4383, device='cuda:0') reward: tensor(1.2616, device='cuda:0') factor: tensor(0.1766, device='cuda:0')
reward_raw: tensor(0.4561, device='cuda:0') reward: tensor(1.2798, device='cuda:0') factor: tensor(0.1764, device='cuda:0')
reward_raw: tensor(0.4417, device='cuda:0') reward: tensor(1.2671, device='cuda:0') factor: tensor(0.1746, device='cuda:0')
reward_raw: tensor(0.4382, device='cuda:0') reward: tensor(1.2640, device='cuda:0') factor: tensor(0.1742, device='cuda:0')
reward_raw: tensor(0.4266, device='cuda:0') reward: tensor(1.2513, device='cuda:0') factor: tensor(0.1752, device='cuda:0')


Epoch 1, Step 1374: Loss = 0.2889, Actor Loss = 0.2464, Critic Loss = 0.0852, Entropy = 0.1018, Returns = 1.1997, Value = 1.3144, mask_loss = 0.1721std_loss = 0.3206:  25%|██▌       | 1374/5475 [6:09:35<17:26:48, 15.32s/it]

ratio tensor(0.7586, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5532, device='cuda:0') reward: tensor(1.3915, device='cuda:0') factor: tensor(0.1617, device='cuda:0')
reward_raw: tensor(0.5744, device='cuda:0') reward: tensor(1.4092, device='cuda:0') factor: tensor(0.1652, device='cuda:0')
reward_raw: tensor(0.5522, device='cuda:0') reward: tensor(1.3871, device='cuda:0') factor: tensor(0.1651, device='cuda:0')
reward_raw: tensor(0.5687, device='cuda:0') reward: tensor(1.4013, device='cuda:0') factor: tensor(0.1674, device='cuda:0')
reward_raw: tensor(0.5680, device='cuda:0') reward: tensor(1.3995, device='cuda:0') factor: tensor(0.1684, device='cuda:0')


Epoch 1, Step 1375: Loss = -0.4588, Actor Loss = -0.4927, Critic Loss = 0.0682, Entropy = 0.1029, Returns = 1.4967, Value = 1.3528, mask_loss = 0.1955std_loss = 0.3474:  25%|██▌       | 1375/5475 [6:09:53<18:09:01, 15.94s/it]

ratio tensor(6.7215, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4644, device='cuda:0') reward: tensor(1.2941, device='cuda:0') factor: tensor(0.1703, device='cuda:0')
reward_raw: tensor(0.4764, device='cuda:0') reward: tensor(1.3124, device='cuda:0') factor: tensor(0.1640, device='cuda:0')
reward_raw: tensor(0.4659, device='cuda:0') reward: tensor(1.2963, device='cuda:0') factor: tensor(0.1697, device='cuda:0')
reward_raw: tensor(0.4629, device='cuda:0') reward: tensor(1.2961, device='cuda:0') factor: tensor(0.1668, device='cuda:0')
reward_raw: tensor(0.4644, device='cuda:0') reward: tensor(1.2989, device='cuda:0') factor: tensor(0.1654, device='cuda:0')


Epoch 1, Step 1376: Loss = 0.3833, Actor Loss = 0.3287, Critic Loss = 0.1094, Entropy = 0.1022, Returns = 1.1976, Value = 1.3247, mask_loss = 0.1850std_loss = 0.3388:  25%|██▌       | 1376/5475 [6:10:09<18:07:56, 15.92s/it]  

ratio tensor(0.6576, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5165, device='cuda:0') reward: tensor(1.3529, device='cuda:0') factor: tensor(0.1636, device='cuda:0')
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.3553, device='cuda:0') factor: tensor(0.1602, device='cuda:0')
reward_raw: tensor(0.5153, device='cuda:0') reward: tensor(1.3591, device='cuda:0') factor: tensor(0.1563, device='cuda:0')
reward_raw: tensor(0.5148, device='cuda:0') reward: tensor(1.3522, device='cuda:0') factor: tensor(0.1627, device='cuda:0')
reward_raw: tensor(0.4953, device='cuda:0') reward: tensor(1.3320, device='cuda:0') factor: tensor(0.1633, device='cuda:0')


Epoch 1, Step 1377: Loss = -0.0022, Actor Loss = -0.0685, Critic Loss = 0.1328, Entropy = 0.0706, Returns = 1.3998, Value = 1.1368, mask_loss = 0.1493std_loss = 0.3199:  25%|██▌       | 1377/5475 [6:10:31<20:29:10, 18.00s/it]

ratio tensor(0.9216, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6668, device='cuda:0') reward: tensor(1.5249, device='cuda:0') factor: tensor(0.1419, device='cuda:0')
reward_raw: tensor(0.6661, device='cuda:0') reward: tensor(1.5299, device='cuda:0') factor: tensor(0.1361, device='cuda:0')
reward_raw: tensor(0.6635, device='cuda:0') reward: tensor(1.5255, device='cuda:0') factor: tensor(0.1380, device='cuda:0')
reward_raw: tensor(0.6707, device='cuda:0') reward: tensor(1.5347, device='cuda:0') factor: tensor(0.1360, device='cuda:0')
reward_raw: tensor(0.6394, device='cuda:0') reward: tensor(1.5083, device='cuda:0') factor: tensor(0.1311, device='cuda:0')


Epoch 1, Step 1378: Loss = -0.1670, Actor Loss = -0.2134, Critic Loss = 0.0931, Entropy = 0.0682, Returns = 1.4792, Value = 1.6739, mask_loss = 0.1520std_loss = 0.3018:  25%|██▌       | 1378/5475 [6:10:49<20:15:11, 17.80s/it]

ratio tensor(3.3360, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4482, device='cuda:0') reward: tensor(1.3082, device='cuda:0') factor: tensor(0.1400, device='cuda:0')
reward_raw: tensor(0.4577, device='cuda:0') reward: tensor(1.3140, device='cuda:0') factor: tensor(0.1437, device='cuda:0')
reward_raw: tensor(0.4706, device='cuda:0') reward: tensor(1.3263, device='cuda:0') factor: tensor(0.1444, device='cuda:0')
reward_raw: tensor(0.4548, device='cuda:0') reward: tensor(1.3133, device='cuda:0') factor: tensor(0.1414, device='cuda:0')
reward_raw: tensor(0.4619, device='cuda:0') reward: tensor(1.3172, device='cuda:0') factor: tensor(0.1448, device='cuda:0')


Epoch 1, Step 1379: Loss = 0.2669, Actor Loss = 0.2544, Critic Loss = 0.0251, Entropy = 0.0775, Returns = 1.2786, Value = 1.2431, mask_loss = 0.1612std_loss = 0.3142:  25%|██▌       | 1379/5475 [6:11:09<21:04:40, 18.53s/it]  

ratio tensor(0.5931, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.3600, device='cuda:0') factor: tensor(0.1274, device='cuda:0')
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.3726, device='cuda:0') factor: tensor(0.1265, device='cuda:0')
reward_raw: tensor(0.4885, device='cuda:0') reward: tensor(1.3592, device='cuda:0') factor: tensor(0.1293, device='cuda:0')
reward_raw: tensor(0.5054, device='cuda:0') reward: tensor(1.3775, device='cuda:0') factor: tensor(0.1279, device='cuda:0')
reward_raw: tensor(0.5000, device='cuda:0') reward: tensor(1.3689, device='cuda:0') factor: tensor(0.1311, device='cuda:0')


Epoch 1, Step 1380: Loss = -0.0053, Actor Loss = -0.0544, Critic Loss = 0.0983, Entropy = 0.0687, Returns = 1.3324, Value = 1.3429, mask_loss = 0.1400std_loss = 0.3128:  25%|██▌       | 1380/5475 [6:11:34<23:11:26, 20.39s/it]

ratio tensor(1.1698, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5594, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.1609, device='cuda:0')
reward_raw: tensor(0.5640, device='cuda:0') reward: tensor(1.4042, device='cuda:0') factor: tensor(0.1599, device='cuda:0')
reward_raw: tensor(0.5595, device='cuda:0') reward: tensor(1.3953, device='cuda:0') factor: tensor(0.1642, device='cuda:0')
reward_raw: tensor(0.5589, device='cuda:0') reward: tensor(1.3943, device='cuda:0') factor: tensor(0.1646, device='cuda:0')
reward_raw: tensor(0.5626, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.1569, device='cuda:0')


Epoch 1, Step 1381: Loss = -0.0530, Actor Loss = -0.0956, Critic Loss = 0.0854, Entropy = 0.0761, Returns = 1.3849, Value = 1.4399, mask_loss = 0.1471std_loss = 0.3113:  25%|██▌       | 1380/5475 [6:11:49<23:11:26, 20.39s/it]

ratio tensor(1.1898, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1381: Loss = -0.0530, Actor Loss = -0.0956, Critic Loss = 0.0854, Entropy = 0.0761, Returns = 1.3849, Value = 1.4399, mask_loss = 0.1471std_loss = 0.3113:  25%|██▌       | 1381/5475 [6:11:49<21:35:26, 18.99s/it]

reward_raw: tensor(0.5626, device='cuda:0') reward: tensor(1.4409, device='cuda:0') factor: tensor(0.1217, device='cuda:0')
reward_raw: tensor(0.5714, device='cuda:0') reward: tensor(1.4462, device='cuda:0') factor: tensor(0.1251, device='cuda:0')
reward_raw: tensor(0.5617, device='cuda:0') reward: tensor(1.4333, device='cuda:0') factor: tensor(0.1283, device='cuda:0')
reward_raw: tensor(0.5871, device='cuda:0') reward: tensor(1.4616, device='cuda:0') factor: tensor(0.1255, device='cuda:0')
reward_raw: tensor(0.5674, device='cuda:0') reward: tensor(1.4481, device='cuda:0') factor: tensor(0.1193, device='cuda:0')


Epoch 1, Step 1382: Loss = 0.2343, Actor Loss = 0.2199, Critic Loss = 0.0289, Entropy = 0.0614, Returns = 1.3471, Value = 1.3528, mask_loss = 0.1311std_loss = 0.3000:  25%|██▌       | 1382/5475 [6:12:03<19:54:56, 17.52s/it]  

ratio tensor(0.9789, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5836, device='cuda:0') reward: tensor(1.4417, device='cuda:0') factor: tensor(0.1419, device='cuda:0')
reward_raw: tensor(0.5803, device='cuda:0') reward: tensor(1.4395, device='cuda:0') factor: tensor(0.1409, device='cuda:0')
reward_raw: tensor(0.5782, device='cuda:0') reward: tensor(1.4405, device='cuda:0') factor: tensor(0.1377, device='cuda:0')
reward_raw: tensor(0.5474, device='cuda:0') reward: tensor(1.4020, device='cuda:0') factor: tensor(0.1454, device='cuda:0')
reward_raw: tensor(0.5708, device='cuda:0') reward: tensor(1.4328, device='cuda:0') factor: tensor(0.1380, device='cuda:0')


Epoch 1, Step 1383: Loss = -0.1004, Actor Loss = -0.1341, Critic Loss = 0.0676, Entropy = 0.0673, Returns = 1.4089, Value = 1.4419, mask_loss = 0.1486std_loss = 0.3154:  25%|██▌       | 1383/5475 [6:12:19<19:09:15, 16.85s/it]

ratio tensor(3.1996, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5452, device='cuda:0') reward: tensor(1.4167, device='cuda:0') factor: tensor(0.1286, device='cuda:0')
reward_raw: tensor(0.5406, device='cuda:0') reward: tensor(1.4102, device='cuda:0') factor: tensor(0.1303, device='cuda:0')
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.4307, device='cuda:0') factor: tensor(0.1256, device='cuda:0')
reward_raw: tensor(0.5476, device='cuda:0') reward: tensor(1.4185, device='cuda:0') factor: tensor(0.1291, device='cuda:0')
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.4071, device='cuda:0') factor: tensor(0.1274, device='cuda:0')


Epoch 1, Step 1384: Loss = 0.0730, Actor Loss = 0.0311, Critic Loss = 0.0839, Entropy = 0.0597, Returns = 1.3546, Value = 1.3828, mask_loss = 0.1009std_loss = 0.2664:  25%|██▌       | 1384/5475 [6:12:32<17:57:47, 15.81s/it]  

ratio tensor(1.2066, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4649, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.1382, device='cuda:0')
reward_raw: tensor(0.4660, device='cuda:0') reward: tensor(1.3254, device='cuda:0') factor: tensor(0.1406, device='cuda:0')
reward_raw: tensor(0.4627, device='cuda:0') reward: tensor(1.3244, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.4476, device='cuda:0') reward: tensor(1.3143, device='cuda:0') factor: tensor(0.1333, device='cuda:0')
reward_raw: tensor(0.4528, device='cuda:0') reward: tensor(1.3174, device='cuda:0') factor: tensor(0.1353, device='cuda:0')


Epoch 1, Step 1385: Loss = 0.2496, Actor Loss = 0.2217, Critic Loss = 0.0559, Entropy = 0.0798, Returns = 1.2823, Value = 1.3289, mask_loss = 0.1403std_loss = 0.2972:  25%|██▌       | 1385/5475 [6:12:43<16:21:20, 14.40s/it]

ratio tensor(0.8091, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4743, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1209, device='cuda:0')
reward_raw: tensor(0.4733, device='cuda:0') reward: tensor(1.3487, device='cuda:0') factor: tensor(0.1246, device='cuda:0')
reward_raw: tensor(0.4617, device='cuda:0') reward: tensor(1.3429, device='cuda:0') factor: tensor(0.1188, device='cuda:0')
reward_raw: tensor(0.4637, device='cuda:0') reward: tensor(1.3439, device='cuda:0') factor: tensor(0.1197, device='cuda:0')
reward_raw: tensor(0.4621, device='cuda:0') reward: tensor(1.3402, device='cuda:0') factor: tensor(0.1219, device='cuda:0')


Epoch 1, Step 1386: Loss = -0.3161, Actor Loss = -0.3385, Critic Loss = 0.0449, Entropy = 0.0551, Returns = 1.3828, Value = 1.3609, mask_loss = 0.1004std_loss = 0.2621:  25%|██▌       | 1386/5475 [6:13:00<17:09:35, 15.11s/it]

ratio tensor(2.6892, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4808, device='cuda:0') reward: tensor(1.3403, device='cuda:0') factor: tensor(0.1405, device='cuda:0')
reward_raw: tensor(0.4789, device='cuda:0') reward: tensor(1.3368, device='cuda:0') factor: tensor(0.1421, device='cuda:0')
reward_raw: tensor(0.4680, device='cuda:0') reward: tensor(1.3298, device='cuda:0') factor: tensor(0.1382, device='cuda:0')
reward_raw: tensor(0.4760, device='cuda:0') reward: tensor(1.3381, device='cuda:0') factor: tensor(0.1379, device='cuda:0')
reward_raw: tensor(0.4765, device='cuda:0') reward: tensor(1.3400, device='cuda:0') factor: tensor(0.1365, device='cuda:0')


Epoch 1, Step 1387: Loss = 0.1211, Actor Loss = 0.0733, Critic Loss = 0.0957, Entropy = 0.0548, Returns = 1.2701, Value = 1.3683, mask_loss = 0.1337std_loss = 0.3040:  25%|██▌       | 1387/5475 [6:13:15<17:16:12, 15.21s/it]  

ratio tensor(0.9172, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4067, device='cuda:0') reward: tensor(1.2802, device='cuda:0') factor: tensor(0.1266, device='cuda:0')
reward_raw: tensor(0.3973, device='cuda:0') reward: tensor(1.2691, device='cuda:0') factor: tensor(0.1282, device='cuda:0')
reward_raw: tensor(0.3904, device='cuda:0') reward: tensor(1.2663, device='cuda:0') factor: tensor(0.1241, device='cuda:0')
reward_raw: tensor(0.3967, device='cuda:0') reward: tensor(1.2716, device='cuda:0') factor: tensor(0.1251, device='cuda:0')
reward_raw: tensor(0.4016, device='cuda:0') reward: tensor(1.2754, device='cuda:0') factor: tensor(0.1262, device='cuda:0')


Epoch 1, Step 1388: Loss = 0.0886, Actor Loss = 0.0533, Critic Loss = 0.0707, Entropy = 0.0618, Returns = 1.3111, Value = 1.1651, mask_loss = 0.1261std_loss = 0.2877:  25%|██▌       | 1388/5475 [6:13:31<17:20:24, 15.27s/it]

ratio tensor(0.8867, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5209, device='cuda:0') reward: tensor(1.4056, device='cuda:0') factor: tensor(0.1153, device='cuda:0')
reward_raw: tensor(0.5221, device='cuda:0') reward: tensor(1.4096, device='cuda:0') factor: tensor(0.1126, device='cuda:0')
reward_raw: tensor(0.5145, device='cuda:0') reward: tensor(1.4055, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.5224, device='cuda:0') reward: tensor(1.4074, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.5248, device='cuda:0') reward: tensor(1.4108, device='cuda:0') factor: tensor(0.1140, device='cuda:0')


Epoch 1, Step 1389: Loss = -0.2027, Actor Loss = -0.2332, Critic Loss = 0.0611, Entropy = 0.0482, Returns = 1.4304, Value = 1.4517, mask_loss = 0.1072std_loss = 0.2814:  25%|██▌       | 1389/5475 [6:13:49<18:21:12, 16.17s/it]

ratio tensor(1.3812, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5458, device='cuda:0') reward: tensor(1.4380, device='cuda:0') factor: tensor(0.1078, device='cuda:0')
reward_raw: tensor(0.5349, device='cuda:0') reward: tensor(1.4315, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.4162, device='cuda:0') factor: tensor(0.1072, device='cuda:0')
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.4238, device='cuda:0') factor: tensor(0.1115, device='cuda:0')
reward_raw: tensor(0.5224, device='cuda:0') reward: tensor(1.4132, device='cuda:0') factor: tensor(0.1092, device='cuda:0')


Epoch 1, Step 1390: Loss = 0.1459, Actor Loss = 0.1192, Critic Loss = 0.0535, Entropy = 0.0496, Returns = 1.4222, Value = 1.4845, mask_loss = 0.1074std_loss = 0.2830:  25%|██▌       | 1390/5475 [6:14:03<17:34:09, 15.48s/it]  

ratio tensor(0.9998, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5711, device='cuda:0') reward: tensor(1.4376, device='cuda:0') factor: tensor(0.1335, device='cuda:0')
reward_raw: tensor(0.5896, device='cuda:0') reward: tensor(1.4502, device='cuda:0') factor: tensor(0.1395, device='cuda:0')
reward_raw: tensor(0.5675, device='cuda:0') reward: tensor(1.4340, device='cuda:0') factor: tensor(0.1335, device='cuda:0')
reward_raw: tensor(0.5692, device='cuda:0') reward: tensor(1.4308, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.5841, device='cuda:0') reward: tensor(1.4464, device='cuda:0') factor: tensor(0.1377, device='cuda:0')


Epoch 1, Step 1391: Loss = 0.0637, Actor Loss = 0.0245, Critic Loss = 0.0785, Entropy = 0.0563, Returns = 1.4332, Value = 1.3708, mask_loss = 0.1190std_loss = 0.2928:  25%|██▌       | 1391/5475 [6:14:18<17:19:59, 15.28s/it]

ratio tensor(1.0118, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5381, device='cuda:0') reward: tensor(1.4248, device='cuda:0') factor: tensor(0.1133, device='cuda:0')
reward_raw: tensor(0.5382, device='cuda:0') reward: tensor(1.4248, device='cuda:0') factor: tensor(0.1134, device='cuda:0')
reward_raw: tensor(0.4947, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.1085, device='cuda:0')
reward_raw: tensor(0.5463, device='cuda:0') reward: tensor(1.4371, device='cuda:0') factor: tensor(0.1092, device='cuda:0')
reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.4372, device='cuda:0') factor: tensor(0.1114, device='cuda:0')


Epoch 1, Step 1392: Loss = -0.1007, Actor Loss = -0.1521, Critic Loss = 0.1029, Entropy = 0.0406, Returns = 1.4270, Value = 1.3065, mask_loss = 0.1186std_loss = 0.3011:  25%|██▌       | 1392/5475 [6:14:36<18:27:00, 16.27s/it]

ratio tensor(1.3766, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5003, device='cuda:0') reward: tensor(1.3900, device='cuda:0') factor: tensor(0.1103, device='cuda:0')
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.4062, device='cuda:0') factor: tensor(0.1135, device='cuda:0')
reward_raw: tensor(0.5173, device='cuda:0') reward: tensor(1.4050, device='cuda:0') factor: tensor(0.1122, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.3916, device='cuda:0') factor: tensor(0.1105, device='cuda:0')
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.3885, device='cuda:0') factor: tensor(0.1116, device='cuda:0')


Epoch 1, Step 1393: Loss = -0.0168, Actor Loss = -0.0460, Critic Loss = 0.0584, Entropy = 0.0463, Returns = 1.3963, Value = 1.3788, mask_loss = 0.0992std_loss = 0.2730:  25%|██▌       | 1393/5475 [6:14:52<18:13:05, 16.07s/it]

ratio tensor(0.9115, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6166, device='cuda:0') reward: tensor(1.4667, device='cuda:0') factor: tensor(0.1499, device='cuda:0')
reward_raw: tensor(0.6091, device='cuda:0') reward: tensor(1.4690, device='cuda:0') factor: tensor(0.1400, device='cuda:0')
reward_raw: tensor(0.6158, device='cuda:0') reward: tensor(1.4665, device='cuda:0') factor: tensor(0.1493, device='cuda:0')
reward_raw: tensor(0.6215, device='cuda:0') reward: tensor(1.4724, device='cuda:0') factor: tensor(0.1491, device='cuda:0')
reward_raw: tensor(0.6276, device='cuda:0') reward: tensor(1.4827, device='cuda:0') factor: tensor(0.1449, device='cuda:0')


Epoch 1, Step 1394: Loss = -0.0340, Actor Loss = -0.0632, Critic Loss = 0.0585, Entropy = 0.0577, Returns = 1.5495, Value = 1.4564, mask_loss = 0.1345std_loss = 0.3055:  25%|██▌       | 1394/5475 [6:15:08<18:10:54, 16.04s/it]

ratio tensor(1.1670, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5130, device='cuda:0') reward: tensor(1.3916, device='cuda:0') factor: tensor(0.1214, device='cuda:0')
reward_raw: tensor(0.5014, device='cuda:0') reward: tensor(1.3828, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.5108, device='cuda:0') reward: tensor(1.3886, device='cuda:0') factor: tensor(0.1222, device='cuda:0')
reward_raw: tensor(0.5030, device='cuda:0') reward: tensor(1.3837, device='cuda:0') factor: tensor(0.1193, device='cuda:0')
reward_raw: tensor(0.4866, device='cuda:0') reward: tensor(1.3714, device='cuda:0') factor: tensor(0.1152, device='cuda:0')


Epoch 1, Step 1395: Loss = 0.0121, Actor Loss = -0.0045, Critic Loss = 0.0332, Entropy = 0.0507, Returns = 1.4461, Value = 1.4129, mask_loss = 0.1061std_loss = 0.2810:  25%|██▌       | 1395/5475 [6:15:27<19:20:19, 17.06s/it] 

ratio tensor(1.5942, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.4062, device='cuda:0') factor: tensor(0.1071, device='cuda:0')
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.4121, device='cuda:0') factor: tensor(0.1121, device='cuda:0')
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.4121, device='cuda:0') factor: tensor(0.1113, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3953, device='cuda:0') factor: tensor(0.1141, device='cuda:0')
reward_raw: tensor(0.4875, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.1110, device='cuda:0')


Epoch 1, Step 1396: Loss = -0.0230, Actor Loss = -0.0507, Critic Loss = 0.0555, Entropy = 0.0514, Returns = 1.3982, Value = 1.3895, mask_loss = 0.1081std_loss = 0.2734:  25%|██▌       | 1396/5475 [6:15:45<19:22:18, 17.10s/it]

ratio tensor(0.9734, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4629, device='cuda:0') reward: tensor(1.3460, device='cuda:0') factor: tensor(0.1169, device='cuda:0')
reward_raw: tensor(0.4648, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.1129, device='cuda:0')
reward_raw: tensor(0.4644, device='cuda:0') reward: tensor(1.3564, device='cuda:0') factor: tensor(0.1080, device='cuda:0')
reward_raw: tensor(0.4613, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.1122, device='cuda:0')
reward_raw: tensor(0.4631, device='cuda:0') reward: tensor(1.3450, device='cuda:0') factor: tensor(0.1181, device='cuda:0')


Epoch 1, Step 1397: Loss = 0.1419, Actor Loss = 0.1166, Critic Loss = 0.0508, Entropy = 0.0523, Returns = 1.3439, Value = 1.3448, mask_loss = 0.1178std_loss = 0.2945:  26%|██▌       | 1397/5475 [6:15:59<18:30:50, 16.34s/it]  

ratio tensor(0.6910, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4814, device='cuda:0') reward: tensor(1.3625, device='cuda:0') factor: tensor(0.1189, device='cuda:0')
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.1178, device='cuda:0')
reward_raw: tensor(0.4776, device='cuda:0') reward: tensor(1.3612, device='cuda:0') factor: tensor(0.1164, device='cuda:0')
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.3643, device='cuda:0') factor: tensor(0.1189, device='cuda:0')
reward_raw: tensor(0.4813, device='cuda:0') reward: tensor(1.3666, device='cuda:0') factor: tensor(0.1147, device='cuda:0')


Epoch 1, Step 1398: Loss = -0.0819, Actor Loss = -0.1065, Critic Loss = 0.0492, Entropy = 0.0468, Returns = 1.3164, Value = 1.3724, mask_loss = 0.1192std_loss = 0.3017:  26%|██▌       | 1398/5475 [6:16:18<19:25:34, 17.15s/it]

ratio tensor(2.8984, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.3751, device='cuda:0') factor: tensor(0.1404, device='cuda:0')
reward_raw: tensor(0.5439, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.1455, device='cuda:0')
reward_raw: tensor(0.5263, device='cuda:0') reward: tensor(1.3819, device='cuda:0') factor: tensor(0.1444, device='cuda:0')
reward_raw: tensor(0.5269, device='cuda:0') reward: tensor(1.3805, device='cuda:0') factor: tensor(0.1464, device='cuda:0')
reward_raw: tensor(0.5191, device='cuda:0') reward: tensor(1.3745, device='cuda:0') factor: tensor(0.1446, device='cuda:0')


Epoch 1, Step 1399: Loss = 0.1353, Actor Loss = 0.0907, Critic Loss = 0.0893, Entropy = 0.0658, Returns = 1.2955, Value = 1.4078, mask_loss = 0.1286std_loss = 0.3030:  26%|██▌       | 1399/5475 [6:16:31<17:56:49, 15.85s/it]  

ratio tensor(1.5017, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.4015, device='cuda:0') factor: tensor(0.1120, device='cuda:0')
reward_raw: tensor(0.5028, device='cuda:0') reward: tensor(1.3902, device='cuda:0') factor: tensor(0.1126, device='cuda:0')
reward_raw: tensor(0.5061, device='cuda:0') reward: tensor(1.3904, device='cuda:0') factor: tensor(0.1157, device='cuda:0')
reward_raw: tensor(0.5077, device='cuda:0') reward: tensor(1.3947, device='cuda:0') factor: tensor(0.1130, device='cuda:0')
reward_raw: tensor(0.5103, device='cuda:0') reward: tensor(1.3948, device='cuda:0') factor: tensor(0.1155, device='cuda:0')


Epoch 1, Step 1400: Loss = 0.1649, Actor Loss = 0.1079, Critic Loss = 0.1140, Entropy = 0.0363, Returns = 1.3534, Value = 1.4292, mask_loss = 0.0975std_loss = 0.2749:  26%|██▌       | 1400/5475 [6:16:48<18:22:35, 16.23s/it]

ratio tensor(0.6131, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.3584, device='cuda:0') factor: tensor(0.1271, device='cuda:0')
reward_raw: tensor(0.4481, device='cuda:0') reward: tensor(1.3196, device='cuda:0') factor: tensor(0.1285, device='cuda:0')
reward_raw: tensor(0.4742, device='cuda:0') reward: tensor(1.3452, device='cuda:0') factor: tensor(0.1290, device='cuda:0')
reward_raw: tensor(0.4821, device='cuda:0') reward: tensor(1.3510, device='cuda:0') factor: tensor(0.1311, device='cuda:0')
reward_raw: tensor(0.4603, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.1322, device='cuda:0')


Epoch 1, Step 1401: Loss = 0.1285, Actor Loss = 0.0727, Critic Loss = 0.1118, Entropy = 0.0655, Returns = 1.4282, Value = 1.3995, mask_loss = 0.1287std_loss = 0.3024:  26%|██▌       | 1400/5475 [6:17:04<18:22:35, 16.23s/it]

ratio tensor(0.8990, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1401: Loss = 0.1285, Actor Loss = 0.0727, Critic Loss = 0.1118, Entropy = 0.0655, Returns = 1.4282, Value = 1.3995, mask_loss = 0.1287std_loss = 0.3024:  26%|██▌       | 1401/5475 [6:17:04<18:15:47, 16.14s/it]

reward_raw: tensor(0.5866, device='cuda:0') reward: tensor(1.4813, device='cuda:0') factor: tensor(0.1053, device='cuda:0')
reward_raw: tensor(0.5567, device='cuda:0') reward: tensor(1.4538, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.5684, device='cuda:0') reward: tensor(1.4643, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.5713, device='cuda:0') reward: tensor(1.4671, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.5693, device='cuda:0') reward: tensor(1.4632, device='cuda:0') factor: tensor(0.1061, device='cuda:0')


Epoch 1, Step 1402: Loss = -0.1518, Actor Loss = -0.1936, Critic Loss = 0.0836, Entropy = 0.0327, Returns = 1.4238, Value = 1.5000, mask_loss = 0.0990std_loss = 0.2731:  26%|██▌       | 1402/5475 [6:17:21<18:22:39, 16.24s/it]

ratio tensor(1.4984, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5553, device='cuda:0') reward: tensor(1.4494, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.5434, device='cuda:0') reward: tensor(1.4333, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.5450, device='cuda:0') reward: tensor(1.4382, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.4496, device='cuda:0') factor: tensor(0.1067, device='cuda:0')
reward_raw: tensor(0.5443, device='cuda:0') reward: tensor(1.4316, device='cuda:0') factor: tensor(0.1126, device='cuda:0')


Epoch 1, Step 1403: Loss = 0.1405, Actor Loss = 0.1094, Critic Loss = 0.0623, Entropy = 0.0371, Returns = 1.4862, Value = 1.5222, mask_loss = 0.0990std_loss = 0.2778:  26%|██▌       | 1403/5475 [6:17:34<17:31:57, 15.50s/it]  

ratio tensor(0.7193, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5063, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.1360, device='cuda:0')
reward_raw: tensor(0.5049, device='cuda:0') reward: tensor(1.3700, device='cuda:0') factor: tensor(0.1349, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.3896, device='cuda:0') factor: tensor(0.1317, device='cuda:0')
reward_raw: tensor(0.4951, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.1357, device='cuda:0')
reward_raw: tensor(0.5076, device='cuda:0') reward: tensor(1.3716, device='cuda:0') factor: tensor(0.1361, device='cuda:0')


Epoch 1, Step 1404: Loss = 0.1934, Actor Loss = 0.1458, Critic Loss = 0.0953, Entropy = 0.0563, Returns = 1.2950, Value = 1.4727, mask_loss = 0.1525std_loss = 0.3250:  26%|██▌       | 1404/5475 [6:17:47<16:41:20, 14.76s/it]

ratio tensor(0.6531, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.4425, device='cuda:0') factor: tensor(0.1088, device='cuda:0')
reward_raw: tensor(0.5339, device='cuda:0') reward: tensor(1.4215, device='cuda:0') factor: tensor(0.1124, device='cuda:0')
reward_raw: tensor(0.5831, device='cuda:0') reward: tensor(1.4748, device='cuda:0') factor: tensor(0.1083, device='cuda:0')
reward_raw: tensor(0.5648, device='cuda:0') reward: tensor(1.4526, device='cuda:0') factor: tensor(0.1122, device='cuda:0')
reward_raw: tensor(0.5631, device='cuda:0') reward: tensor(1.4493, device='cuda:0') factor: tensor(0.1138, device='cuda:0')


Epoch 1, Step 1405: Loss = -0.1255, Actor Loss = -0.1737, Critic Loss = 0.0966, Entropy = 0.0431, Returns = 1.5000, Value = 1.4097, mask_loss = 0.1032std_loss = 0.2821:  26%|██▌       | 1405/5475 [6:18:01<16:20:04, 14.45s/it]

ratio tensor(1.4367, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4320, device='cuda:0') reward: tensor(1.2976, device='cuda:0') factor: tensor(0.1344, device='cuda:0')
reward_raw: tensor(0.4264, device='cuda:0') reward: tensor(1.2928, device='cuda:0') factor: tensor(0.1336, device='cuda:0')
reward_raw: tensor(0.4262, device='cuda:0') reward: tensor(1.2915, device='cuda:0') factor: tensor(0.1347, device='cuda:0')
reward_raw: tensor(0.4261, device='cuda:0') reward: tensor(1.2928, device='cuda:0') factor: tensor(0.1333, device='cuda:0')
reward_raw: tensor(0.4359, device='cuda:0') reward: tensor(1.3018, device='cuda:0') factor: tensor(0.1341, device='cuda:0')


Epoch 1, Step 1406: Loss = 0.1658, Actor Loss = 0.1456, Critic Loss = 0.0406, Entropy = 0.0519, Returns = 1.2253, Value = 1.2092, mask_loss = 0.1241std_loss = 0.2942:  26%|██▌       | 1406/5475 [6:18:14<15:53:36, 14.06s/it]  

ratio tensor(0.5065, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5509, device='cuda:0') reward: tensor(1.4352, device='cuda:0') factor: tensor(0.1157, device='cuda:0')
reward_raw: tensor(0.5517, device='cuda:0') reward: tensor(1.4364, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.5528, device='cuda:0') reward: tensor(1.4385, device='cuda:0') factor: tensor(0.1143, device='cuda:0')
reward_raw: tensor(0.5588, device='cuda:0') reward: tensor(1.4416, device='cuda:0') factor: tensor(0.1172, device='cuda:0')
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.4313, device='cuda:0') factor: tensor(0.1193, device='cuda:0')


Epoch 1, Step 1407: Loss = -0.2589, Actor Loss = -0.2840, Critic Loss = 0.0503, Entropy = 0.0394, Returns = 1.4560, Value = 1.4508, mask_loss = 0.1194std_loss = 0.3005:  26%|██▌       | 1407/5475 [6:18:28<15:46:56, 13.97s/it]

ratio tensor(1.1566, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5642, device='cuda:0') reward: tensor(1.4605, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.5546, device='cuda:0') reward: tensor(1.4462, device='cuda:0') factor: tensor(0.1084, device='cuda:0')
reward_raw: tensor(0.5577, device='cuda:0') reward: tensor(1.4540, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.5499, device='cuda:0') reward: tensor(1.4502, device='cuda:0') factor: tensor(0.0997, device='cuda:0')
reward_raw: tensor(0.5676, device='cuda:0') reward: tensor(1.4649, device='cuda:0') factor: tensor(0.1027, device='cuda:0')


Epoch 1, Step 1408: Loss = 0.1284, Actor Loss = 0.0871, Critic Loss = 0.0827, Entropy = 0.0395, Returns = 1.4525, Value = 1.5580, mask_loss = 0.0978std_loss = 0.2726:  26%|██▌       | 1408/5475 [6:18:45<16:54:52, 14.97s/it]  

ratio tensor(0.7584, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4479, device='cuda:0') reward: tensor(1.3455, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.4735, device='cuda:0') reward: tensor(1.3726, device='cuda:0') factor: tensor(0.1009, device='cuda:0')
reward_raw: tensor(0.4600, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.4604, device='cuda:0') reward: tensor(1.3561, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.4482, device='cuda:0') reward: tensor(1.3426, device='cuda:0') factor: tensor(0.1055, device='cuda:0')


Epoch 1, Step 1409: Loss = 0.0735, Actor Loss = 0.0329, Critic Loss = 0.0813, Entropy = 0.0428, Returns = 1.3986, Value = 1.3038, mask_loss = 0.0932std_loss = 0.2605:  26%|██▌       | 1409/5475 [6:19:04<18:20:43, 16.24s/it]

ratio tensor(1.1300, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4514, device='cuda:0') reward: tensor(1.3428, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.4593, device='cuda:0') reward: tensor(1.3473, device='cuda:0') factor: tensor(0.1120, device='cuda:0')
reward_raw: tensor(0.4453, device='cuda:0') reward: tensor(1.3336, device='cuda:0') factor: tensor(0.1117, device='cuda:0')
reward_raw: tensor(0.4575, device='cuda:0') reward: tensor(1.3419, device='cuda:0') factor: tensor(0.1156, device='cuda:0')
reward_raw: tensor(0.4618, device='cuda:0') reward: tensor(1.3489, device='cuda:0') factor: tensor(0.1129, device='cuda:0')


Epoch 1, Step 1410: Loss = 0.1896, Actor Loss = 0.1472, Critic Loss = 0.0848, Entropy = 0.0503, Returns = 1.2515, Value = 1.3416, mask_loss = 0.1205std_loss = 0.2959:  26%|██▌       | 1410/5475 [6:19:25<19:52:42, 17.60s/it]

ratio tensor(0.8142, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5407, device='cuda:0') reward: tensor(1.4341, device='cuda:0') factor: tensor(0.1066, device='cuda:0')
reward_raw: tensor(0.5624, device='cuda:0') reward: tensor(1.4572, device='cuda:0') factor: tensor(0.1052, device='cuda:0')
reward_raw: tensor(0.5470, device='cuda:0') reward: tensor(1.4408, device='cuda:0') factor: tensor(0.1061, device='cuda:0')
reward_raw: tensor(0.5309, device='cuda:0') reward: tensor(1.4291, device='cuda:0') factor: tensor(0.1018, device='cuda:0')
reward_raw: tensor(0.5475, device='cuda:0') reward: tensor(1.4433, device='cuda:0') factor: tensor(0.1042, device='cuda:0')


Epoch 1, Step 1411: Loss = -0.0788, Actor Loss = -0.1247, Critic Loss = 0.0918, Entropy = 0.0377, Returns = 1.4765, Value = 1.4273, mask_loss = 0.0922std_loss = 0.2663:  26%|██▌       | 1411/5475 [6:19:38<18:16:31, 16.19s/it]

ratio tensor(0.9438, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4878, device='cuda:0') reward: tensor(1.3631, device='cuda:0') factor: tensor(0.1247, device='cuda:0')
reward_raw: tensor(0.4715, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.1191, device='cuda:0')
reward_raw: tensor(0.4654, device='cuda:0') reward: tensor(1.3425, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.4922, device='cuda:0') reward: tensor(1.3611, device='cuda:0') factor: tensor(0.1311, device='cuda:0')
reward_raw: tensor(0.4733, device='cuda:0') reward: tensor(1.3467, device='cuda:0') factor: tensor(0.1266, device='cuda:0')


Epoch 1, Step 1412: Loss = -0.0319, Actor Loss = -0.0565, Critic Loss = 0.0492, Entropy = 0.0463, Returns = 1.3688, Value = 1.3302, mask_loss = 0.1106std_loss = 0.2866:  26%|██▌       | 1412/5475 [6:19:52<17:30:45, 15.52s/it]

ratio tensor(1.3178, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4812, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1077, device='cuda:0')
reward_raw: tensor(0.4811, device='cuda:0') reward: tensor(1.3808, device='cuda:0') factor: tensor(0.1003, device='cuda:0')
reward_raw: tensor(0.4713, device='cuda:0') reward: tensor(1.3683, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.4846, device='cuda:0') reward: tensor(1.3814, device='cuda:0') factor: tensor(0.1031, device='cuda:0')
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.3975, device='cuda:0') factor: tensor(0.1024, device='cuda:0')


Epoch 1, Step 1413: Loss = -0.0714, Actor Loss = -0.1270, Critic Loss = 0.1113, Entropy = 0.0423, Returns = 1.4065, Value = 1.3935, mask_loss = 0.1111std_loss = 0.2898:  26%|██▌       | 1413/5475 [6:20:05<16:40:42, 14.78s/it]

ratio tensor(0.9678, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5439, device='cuda:0') reward: tensor(1.4143, device='cuda:0') factor: tensor(0.1295, device='cuda:0')
reward_raw: tensor(0.5154, device='cuda:0') reward: tensor(1.3897, device='cuda:0') factor: tensor(0.1258, device='cuda:0')
reward_raw: tensor(0.5132, device='cuda:0') reward: tensor(1.3889, device='cuda:0') factor: tensor(0.1242, device='cuda:0')
reward_raw: tensor(0.5391, device='cuda:0') reward: tensor(1.4100, device='cuda:0') factor: tensor(0.1291, device='cuda:0')
reward_raw: tensor(0.5081, device='cuda:0') reward: tensor(1.3795, device='cuda:0') factor: tensor(0.1286, device='cuda:0')


Epoch 1, Step 1414: Loss = 0.1981, Actor Loss = 0.1699, Critic Loss = 0.0564, Entropy = 0.0643, Returns = 1.3951, Value = 1.4240, mask_loss = 0.1431std_loss = 0.3130:  26%|██▌       | 1414/5475 [6:20:21<17:07:47, 15.19s/it]  

ratio tensor(0.7060, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1279, device='cuda:0')
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.3868, device='cuda:0') factor: tensor(0.1234, device='cuda:0')
reward_raw: tensor(0.5137, device='cuda:0') reward: tensor(1.3926, device='cuda:0') factor: tensor(0.1211, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.3653, device='cuda:0') factor: tensor(0.1188, device='cuda:0')
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.3788, device='cuda:0') factor: tensor(0.1228, device='cuda:0')


Epoch 1, Step 1415: Loss = -0.0211, Actor Loss = -0.0447, Critic Loss = 0.0472, Entropy = 0.0438, Returns = 1.3750, Value = 1.3894, mask_loss = 0.1214std_loss = 0.3042:  26%|██▌       | 1415/5475 [6:20:36<17:00:56, 15.09s/it]

ratio tensor(1.0517, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4612, device='cuda:0') reward: tensor(1.3433, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.4583, device='cuda:0') reward: tensor(1.3393, device='cuda:0') factor: tensor(0.1190, device='cuda:0')
reward_raw: tensor(0.4465, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.1144, device='cuda:0')
reward_raw: tensor(0.4431, device='cuda:0') reward: tensor(1.3292, device='cuda:0') factor: tensor(0.1140, device='cuda:0')
reward_raw: tensor(0.4769, device='cuda:0') reward: tensor(1.3603, device='cuda:0') factor: tensor(0.1165, device='cuda:0')


Epoch 1, Step 1416: Loss = -0.1206, Actor Loss = -0.1611, Critic Loss = 0.0810, Entropy = 0.0447, Returns = 1.4596, Value = 1.4208, mask_loss = 0.1123std_loss = 0.2910:  26%|██▌       | 1416/5475 [6:20:52<17:22:50, 15.42s/it]

ratio tensor(1.0459, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5231, device='cuda:0') reward: tensor(1.4327, device='cuda:0') factor: tensor(0.0904, device='cuda:0')
reward_raw: tensor(0.5602, device='cuda:0') reward: tensor(1.4687, device='cuda:0') factor: tensor(0.0915, device='cuda:0')
reward_raw: tensor(0.5505, device='cuda:0') reward: tensor(1.4588, device='cuda:0') factor: tensor(0.0917, device='cuda:0')
reward_raw: tensor(0.5434, device='cuda:0') reward: tensor(1.4519, device='cuda:0') factor: tensor(0.0915, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.4486, device='cuda:0') factor: tensor(0.0932, device='cuda:0')


Epoch 1, Step 1417: Loss = 0.0265, Actor Loss = -0.0119, Critic Loss = 0.0767, Entropy = 0.0449, Returns = 1.4089, Value = 1.4644, mask_loss = 0.0852std_loss = 0.2495:  26%|██▌       | 1417/5475 [6:21:13<19:13:30, 17.06s/it] 

ratio tensor(1.0489, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.4153, device='cuda:0') factor: tensor(0.0950, device='cuda:0')
reward_raw: tensor(0.5169, device='cuda:0') reward: tensor(1.4242, device='cuda:0') factor: tensor(0.0927, device='cuda:0')
reward_raw: tensor(0.4985, device='cuda:0') reward: tensor(1.4050, device='cuda:0') factor: tensor(0.0934, device='cuda:0')
reward_raw: tensor(0.4971, device='cuda:0') reward: tensor(1.4034, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.4943, device='cuda:0') reward: tensor(1.3995, device='cuda:0') factor: tensor(0.0948, device='cuda:0')


Epoch 1, Step 1418: Loss = 0.0910, Actor Loss = 0.0729, Critic Loss = 0.0362, Entropy = 0.0366, Returns = 1.4101, Value = 1.3751, mask_loss = 0.0864std_loss = 0.2575:  26%|██▌       | 1418/5475 [6:21:28<18:19:10, 16.26s/it] 

ratio tensor(0.9446, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4655, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.1130, device='cuda:0')
reward_raw: tensor(0.4596, device='cuda:0') reward: tensor(1.3448, device='cuda:0') factor: tensor(0.1148, device='cuda:0')
reward_raw: tensor(0.4603, device='cuda:0') reward: tensor(1.3484, device='cuda:0') factor: tensor(0.1119, device='cuda:0')
reward_raw: tensor(0.4350, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.4576, device='cuda:0') reward: tensor(1.3482, device='cuda:0') factor: tensor(0.1095, device='cuda:0')


Epoch 1, Step 1419: Loss = -0.0001, Actor Loss = -0.0359, Critic Loss = 0.0715, Entropy = 0.0467, Returns = 1.3714, Value = 1.3172, mask_loss = 0.1124std_loss = 0.2903:  26%|██▌       | 1419/5475 [6:21:43<17:57:28, 15.94s/it]

ratio tensor(1.2913, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4865, device='cuda:0') reward: tensor(1.3786, device='cuda:0') factor: tensor(0.1078, device='cuda:0')
reward_raw: tensor(0.4897, device='cuda:0') reward: tensor(1.3820, device='cuda:0') factor: tensor(0.1077, device='cuda:0')
reward_raw: tensor(0.5011, device='cuda:0') reward: tensor(1.3924, device='cuda:0') factor: tensor(0.1087, device='cuda:0')
reward_raw: tensor(0.4827, device='cuda:0') reward: tensor(1.3741, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.4917, device='cuda:0') reward: tensor(1.3841, device='cuda:0') factor: tensor(0.1076, device='cuda:0')


Epoch 1, Step 1420: Loss = -0.0804, Actor Loss = -0.1222, Critic Loss = 0.0835, Entropy = 0.0378, Returns = 1.3930, Value = 1.4426, mask_loss = 0.1113std_loss = 0.2904:  26%|██▌       | 1420/5475 [6:22:05<20:05:14, 17.83s/it]

ratio tensor(2.2944, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4748, device='cuda:0') reward: tensor(1.3546, device='cuda:0') factor: tensor(0.1202, device='cuda:0')
reward_raw: tensor(0.5070, device='cuda:0') reward: tensor(1.3879, device='cuda:0') factor: tensor(0.1192, device='cuda:0')
reward_raw: tensor(0.4827, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.4653, device='cuda:0') reward: tensor(1.3512, device='cuda:0') factor: tensor(0.1141, device='cuda:0')
reward_raw: tensor(0.4803, device='cuda:0') reward: tensor(1.3636, device='cuda:0') factor: tensor(0.1167, device='cuda:0')


Epoch 1, Step 1421: Loss = 0.1832, Actor Loss = 0.1632, Critic Loss = 0.0402, Entropy = 0.0450, Returns = 1.3555, Value = 1.3627, mask_loss = 0.1119std_loss = 0.2907:  26%|██▌       | 1420/5475 [6:22:23<20:05:14, 17.83s/it]  

ratio tensor(0.5581, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1421: Loss = 0.1832, Actor Loss = 0.1632, Critic Loss = 0.0402, Entropy = 0.0450, Returns = 1.3555, Value = 1.3627, mask_loss = 0.1119std_loss = 0.2907:  26%|██▌       | 1421/5475 [6:22:23<20:14:09, 17.97s/it]

reward_raw: tensor(0.5118, device='cuda:0') reward: tensor(1.4059, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.5003, device='cuda:0') reward: tensor(1.3988, device='cuda:0') factor: tensor(0.1015, device='cuda:0')
reward_raw: tensor(0.5263, device='cuda:0') reward: tensor(1.4221, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.5108, device='cuda:0') reward: tensor(1.4084, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.4085, device='cuda:0') factor: tensor(0.1026, device='cuda:0')


Epoch 1, Step 1422: Loss = -0.1305, Actor Loss = -0.1658, Critic Loss = 0.0708, Entropy = 0.0347, Returns = 1.4381, Value = 1.3800, mask_loss = 0.1016std_loss = 0.2786:  26%|██▌       | 1422/5475 [6:22:38<19:14:43, 17.09s/it]

ratio tensor(1.2335, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4389, device='cuda:0') reward: tensor(1.3293, device='cuda:0') factor: tensor(0.1096, device='cuda:0')
reward_raw: tensor(0.4423, device='cuda:0') reward: tensor(1.3304, device='cuda:0') factor: tensor(0.1119, device='cuda:0')
reward_raw: tensor(0.4275, device='cuda:0') reward: tensor(1.3222, device='cuda:0') factor: tensor(0.1053, device='cuda:0')
reward_raw: tensor(0.4213, device='cuda:0') reward: tensor(1.3146, device='cuda:0') factor: tensor(0.1067, device='cuda:0')
reward_raw: tensor(0.4464, device='cuda:0') reward: tensor(1.3387, device='cuda:0') factor: tensor(0.1077, device='cuda:0')


Epoch 1, Step 1423: Loss = 0.2254, Actor Loss = 0.1915, Critic Loss = 0.0680, Entropy = 0.0539, Returns = 1.2998, Value = 1.3776, mask_loss = 0.1233std_loss = 0.2921:  26%|██▌       | 1423/5475 [6:22:54<18:41:22, 16.60s/it]  

ratio tensor(0.5269, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4257, device='cuda:0') reward: tensor(1.3241, device='cuda:0') factor: tensor(0.1016, device='cuda:0')
reward_raw: tensor(0.4233, device='cuda:0') reward: tensor(1.3190, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.4341, device='cuda:0') reward: tensor(1.3289, device='cuda:0') factor: tensor(0.1052, device='cuda:0')
reward_raw: tensor(0.4129, device='cuda:0') reward: tensor(1.3139, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.4130, device='cuda:0') reward: tensor(1.3122, device='cuda:0') factor: tensor(0.1008, device='cuda:0')


Epoch 1, Step 1424: Loss = -0.0514, Actor Loss = -0.0884, Critic Loss = 0.0741, Entropy = 0.0432, Returns = 1.3392, Value = 1.3399, mask_loss = 0.1130std_loss = 0.2902:  26%|██▌       | 1424/5475 [6:23:10<18:30:09, 16.44s/it]

ratio tensor(1.4242, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5455, device='cuda:0') reward: tensor(1.4120, device='cuda:0') factor: tensor(0.1334, device='cuda:0')
reward_raw: tensor(0.5290, device='cuda:0') reward: tensor(1.3911, device='cuda:0') factor: tensor(0.1379, device='cuda:0')
reward_raw: tensor(0.5277, device='cuda:0') reward: tensor(1.3917, device='cuda:0') factor: tensor(0.1359, device='cuda:0')
reward_raw: tensor(0.5239, device='cuda:0') reward: tensor(1.3831, device='cuda:0') factor: tensor(0.1408, device='cuda:0')
reward_raw: tensor(0.5527, device='cuda:0') reward: tensor(1.4183, device='cuda:0') factor: tensor(0.1344, device='cuda:0')


Epoch 1, Step 1425: Loss = 0.1935, Actor Loss = 0.1258, Critic Loss = 0.1354, Entropy = 0.0429, Returns = 1.3607, Value = 1.5523, mask_loss = 0.1337std_loss = 0.3165:  26%|██▌       | 1425/5475 [6:23:26<18:16:30, 16.24s/it]  

ratio tensor(0.9388, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4152, device='cuda:0') reward: tensor(1.3020, device='cuda:0') factor: tensor(0.1133, device='cuda:0')
reward_raw: tensor(0.4145, device='cuda:0') reward: tensor(1.3024, device='cuda:0') factor: tensor(0.1121, device='cuda:0')
reward_raw: tensor(0.4252, device='cuda:0') reward: tensor(1.3118, device='cuda:0') factor: tensor(0.1134, device='cuda:0')
reward_raw: tensor(0.4198, device='cuda:0') reward: tensor(1.3107, device='cuda:0') factor: tensor(0.1091, device='cuda:0')
reward_raw: tensor(0.4220, device='cuda:0') reward: tensor(1.3127, device='cuda:0') factor: tensor(0.1093, device='cuda:0')


Epoch 1, Step 1426: Loss = 0.3456, Actor Loss = 0.3121, Critic Loss = 0.0671, Entropy = 0.0356, Returns = 1.2883, Value = 1.2392, mask_loss = 0.1089std_loss = 0.2888:  26%|██▌       | 1426/5475 [6:23:43<18:29:16, 16.44s/it]

ratio tensor(0.8299, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5770, device='cuda:0') reward: tensor(1.4794, device='cuda:0') factor: tensor(0.0976, device='cuda:0')
reward_raw: tensor(0.5474, device='cuda:0') reward: tensor(1.4493, device='cuda:0') factor: tensor(0.0981, device='cuda:0')
reward_raw: tensor(0.5648, device='cuda:0') reward: tensor(1.4682, device='cuda:0') factor: tensor(0.0966, device='cuda:0')
reward_raw: tensor(0.5652, device='cuda:0') reward: tensor(1.4720, device='cuda:0') factor: tensor(0.0932, device='cuda:0')
reward_raw: tensor(0.5758, device='cuda:0') reward: tensor(1.4770, device='cuda:0') factor: tensor(0.0987, device='cuda:0')


Epoch 1, Step 1427: Loss = -0.4203, Actor Loss = -0.4454, Critic Loss = 0.0501, Entropy = 0.0311, Returns = 1.4681, Value = 1.6078, mask_loss = 0.0989std_loss = 0.2797:  26%|██▌       | 1427/5475 [6:23:59<18:30:34, 16.46s/it]

ratio tensor(1.3328, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5581, device='cuda:0') reward: tensor(1.4676, device='cuda:0') factor: tensor(0.0905, device='cuda:0')
reward_raw: tensor(0.5688, device='cuda:0') reward: tensor(1.4750, device='cuda:0') factor: tensor(0.0938, device='cuda:0')
reward_raw: tensor(0.5413, device='cuda:0') reward: tensor(1.4500, device='cuda:0') factor: tensor(0.0913, device='cuda:0')
reward_raw: tensor(0.5447, device='cuda:0') reward: tensor(1.4509, device='cuda:0') factor: tensor(0.0938, device='cuda:0')
reward_raw: tensor(0.5521, device='cuda:0') reward: tensor(1.4566, device='cuda:0') factor: tensor(0.0955, device='cuda:0')


Epoch 1, Step 1428: Loss = 0.1754, Actor Loss = 0.1439, Critic Loss = 0.0629, Entropy = 0.0378, Returns = 1.4884, Value = 1.5302, mask_loss = 0.1091std_loss = 0.2833:  26%|██▌       | 1428/5475 [6:24:20<19:55:47, 17.73s/it]  

ratio tensor(0.7832, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4051, device='cuda:0') reward: tensor(1.3164, device='cuda:0') factor: tensor(0.0887, device='cuda:0')
reward_raw: tensor(0.3874, device='cuda:0') reward: tensor(1.2978, device='cuda:0') factor: tensor(0.0897, device='cuda:0')
reward_raw: tensor(0.3972, device='cuda:0') reward: tensor(1.3103, device='cuda:0') factor: tensor(0.0869, device='cuda:0')
reward_raw: tensor(0.3919, device='cuda:0') reward: tensor(1.3008, device='cuda:0') factor: tensor(0.0911, device='cuda:0')
reward_raw: tensor(0.3885, device='cuda:0') reward: tensor(1.2983, device='cuda:0') factor: tensor(0.0903, device='cuda:0')


Epoch 1, Step 1429: Loss = 0.2405, Actor Loss = 0.2036, Critic Loss = 0.0738, Entropy = 0.0364, Returns = 1.3224, Value = 1.2463, mask_loss = 0.0833std_loss = 0.2482:  26%|██▌       | 1429/5475 [6:24:37<19:43:27, 17.55s/it]

ratio tensor(0.7310, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4353, device='cuda:0') reward: tensor(1.3349, device='cuda:0') factor: tensor(0.1005, device='cuda:0')
reward_raw: tensor(0.4554, device='cuda:0') reward: tensor(1.3562, device='cuda:0') factor: tensor(0.0993, device='cuda:0')
reward_raw: tensor(0.4459, device='cuda:0') reward: tensor(1.3486, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.1005, device='cuda:0')
reward_raw: tensor(0.4500, device='cuda:0') reward: tensor(1.3482, device='cuda:0') factor: tensor(0.1018, device='cuda:0')


Epoch 1, Step 1430: Loss = -0.3146, Actor Loss = -0.3360, Critic Loss = 0.0428, Entropy = 0.0352, Returns = 1.3979, Value = 1.3625, mask_loss = 0.1005std_loss = 0.2803:  26%|██▌       | 1430/5475 [6:24:57<20:41:50, 18.42s/it]

ratio tensor(1.9779, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4741, device='cuda:0') reward: tensor(1.3731, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.4582, device='cuda:0') reward: tensor(1.3517, device='cuda:0') factor: tensor(0.1065, device='cuda:0')
reward_raw: tensor(0.4678, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.1005, device='cuda:0')
reward_raw: tensor(0.4625, device='cuda:0') reward: tensor(1.3570, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.4587, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.1062, device='cuda:0')


Epoch 1, Step 1431: Loss = 0.1036, Actor Loss = 0.0730, Critic Loss = 0.0612, Entropy = 0.0356, Returns = 1.3804, Value = 1.3680, mask_loss = 0.0977std_loss = 0.2762:  26%|██▌       | 1431/5475 [6:25:11<19:10:32, 17.07s/it]  

ratio tensor(0.8225, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.3954, device='cuda:0') reward: tensor(1.3094, device='cuda:0') factor: tensor(0.0859, device='cuda:0')
reward_raw: tensor(0.3833, device='cuda:0') reward: tensor(1.2993, device='cuda:0') factor: tensor(0.0840, device='cuda:0')
reward_raw: tensor(0.4041, device='cuda:0') reward: tensor(1.3222, device='cuda:0') factor: tensor(0.0819, device='cuda:0')
reward_raw: tensor(0.3929, device='cuda:0') reward: tensor(1.3065, device='cuda:0') factor: tensor(0.0865, device='cuda:0')
reward_raw: tensor(0.4076, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.0833, device='cuda:0')


Epoch 1, Step 1432: Loss = 0.0868, Actor Loss = 0.0551, Critic Loss = 0.0635, Entropy = 0.0388, Returns = 1.2728, Value = 1.3095, mask_loss = 0.0964std_loss = 0.2673:  26%|██▌       | 1432/5475 [6:25:26<18:26:45, 16.42s/it]

ratio tensor(1.1520, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3839, device='cuda:0') reward: tensor(1.2907, device='cuda:0') factor: tensor(0.0931, device='cuda:0')
reward_raw: tensor(0.4251, device='cuda:0') reward: tensor(1.3333, device='cuda:0') factor: tensor(0.0919, device='cuda:0')
reward_raw: tensor(0.4195, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.0904, device='cuda:0')
reward_raw: tensor(0.4080, device='cuda:0') reward: tensor(1.3189, device='cuda:0') factor: tensor(0.0892, device='cuda:0')
reward_raw: tensor(0.4258, device='cuda:0') reward: tensor(1.3346, device='cuda:0') factor: tensor(0.0912, device='cuda:0')


Epoch 1, Step 1433: Loss = 0.0589, Actor Loss = 0.0414, Critic Loss = 0.0351, Entropy = 0.0390, Returns = 1.2212, Value = 1.3104, mask_loss = 0.1023std_loss = 0.2740:  26%|██▌       | 1433/5475 [6:25:44<18:58:48, 16.90s/it]

ratio tensor(0.8499, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4520, device='cuda:0') reward: tensor(1.3503, device='cuda:0') factor: tensor(0.1017, device='cuda:0')
reward_raw: tensor(0.4250, device='cuda:0') reward: tensor(1.3196, device='cuda:0') factor: tensor(0.1054, device='cuda:0')
reward_raw: tensor(0.4468, device='cuda:0') reward: tensor(1.3465, device='cuda:0') factor: tensor(0.1003, device='cuda:0')
reward_raw: tensor(0.4350, device='cuda:0') reward: tensor(1.3315, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.4309, device='cuda:0') reward: tensor(1.3268, device='cuda:0') factor: tensor(0.1041, device='cuda:0')


Epoch 1, Step 1434: Loss = -0.1477, Actor Loss = -0.1952, Critic Loss = 0.0950, Entropy = 0.0429, Returns = 1.3790, Value = 1.2353, mask_loss = 0.0989std_loss = 0.2697:  26%|██▌       | 1434/5475 [6:25:59<18:07:27, 16.15s/it]

ratio tensor(1.5317, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4905, device='cuda:0') reward: tensor(1.3918, device='cuda:0') factor: tensor(0.0987, device='cuda:0')
reward_raw: tensor(0.4815, device='cuda:0') reward: tensor(1.3846, device='cuda:0') factor: tensor(0.0968, device='cuda:0')
reward_raw: tensor(0.4977, device='cuda:0') reward: tensor(1.3991, device='cuda:0') factor: tensor(0.0987, device='cuda:0')
reward_raw: tensor(0.4926, device='cuda:0') reward: tensor(1.3967, device='cuda:0') factor: tensor(0.0959, device='cuda:0')
reward_raw: tensor(0.5038, device='cuda:0') reward: tensor(1.4086, device='cuda:0') factor: tensor(0.0952, device='cuda:0')


Epoch 1, Step 1435: Loss = -0.0829, Actor Loss = -0.1201, Critic Loss = 0.0746, Entropy = 0.0314, Returns = 1.4263, Value = 1.5218, mask_loss = 0.1059std_loss = 0.2867:  26%|██▌       | 1435/5475 [6:26:11<16:57:40, 15.11s/it]

ratio tensor(1.1942, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5351, device='cuda:0') reward: tensor(1.4460, device='cuda:0') factor: tensor(0.0891, device='cuda:0')
reward_raw: tensor(0.5710, device='cuda:0') reward: tensor(1.4813, device='cuda:0') factor: tensor(0.0897, device='cuda:0')
reward_raw: tensor(0.5641, device='cuda:0') reward: tensor(1.4732, device='cuda:0') factor: tensor(0.0910, device='cuda:0')
reward_raw: tensor(0.5727, device='cuda:0') reward: tensor(1.4838, device='cuda:0') factor: tensor(0.0889, device='cuda:0')
reward_raw: tensor(0.5791, device='cuda:0') reward: tensor(1.4916, device='cuda:0') factor: tensor(0.0875, device='cuda:0')


Epoch 1, Step 1436: Loss = -0.0059, Actor Loss = -0.0354, Critic Loss = 0.0592, Entropy = 0.0317, Returns = 1.5157, Value = 1.4750, mask_loss = 0.0931std_loss = 0.2683:  26%|██▌       | 1436/5475 [6:26:25<16:24:02, 14.62s/it]

ratio tensor(1.0585, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4541, device='cuda:0') reward: tensor(1.3529, device='cuda:0') factor: tensor(0.1012, device='cuda:0')
reward_raw: tensor(0.4454, device='cuda:0') reward: tensor(1.3422, device='cuda:0') factor: tensor(0.1032, device='cuda:0')
reward_raw: tensor(0.4457, device='cuda:0') reward: tensor(1.3429, device='cuda:0') factor: tensor(0.1028, device='cuda:0')
reward_raw: tensor(0.4611, device='cuda:0') reward: tensor(1.3598, device='cuda:0') factor: tensor(0.1013, device='cuda:0')
reward_raw: tensor(0.4566, device='cuda:0') reward: tensor(1.3553, device='cuda:0') factor: tensor(0.1012, device='cuda:0')


Epoch 1, Step 1437: Loss = 0.2137, Actor Loss = 0.1756, Critic Loss = 0.0764, Entropy = 0.0342, Returns = 1.3010, Value = 1.4024, mask_loss = 0.1068std_loss = 0.2820:  26%|██▌       | 1437/5475 [6:26:38<15:54:59, 14.19s/it]  

ratio tensor(0.7750, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4792, device='cuda:0') reward: tensor(1.3758, device='cuda:0') factor: tensor(0.1034, device='cuda:0')
reward_raw: tensor(0.4636, device='cuda:0') reward: tensor(1.3579, device='cuda:0') factor: tensor(0.1057, device='cuda:0')
reward_raw: tensor(0.4632, device='cuda:0') reward: tensor(1.3584, device='cuda:0') factor: tensor(0.1048, device='cuda:0')
reward_raw: tensor(0.4369, device='cuda:0') reward: tensor(1.3353, device='cuda:0') factor: tensor(0.1015, device='cuda:0')
reward_raw: tensor(0.4721, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.1075, device='cuda:0')


Epoch 1, Step 1438: Loss = 0.0289, Actor Loss = -0.0365, Critic Loss = 0.1310, Entropy = 0.0413, Returns = 1.4053, Value = 1.3030, mask_loss = 0.1046std_loss = 0.2685:  26%|██▋       | 1438/5475 [6:26:54<16:39:39, 14.86s/it]

ratio tensor(0.7656, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4452, device='cuda:0') reward: tensor(1.3393, device='cuda:0') factor: tensor(0.1058, device='cuda:0')
reward_raw: tensor(0.4336, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.1046, device='cuda:0')
reward_raw: tensor(0.4380, device='cuda:0') reward: tensor(1.3264, device='cuda:0') factor: tensor(0.1116, device='cuda:0')
reward_raw: tensor(0.4171, device='cuda:0') reward: tensor(1.3099, device='cuda:0') factor: tensor(0.1072, device='cuda:0')
reward_raw: tensor(0.4348, device='cuda:0') reward: tensor(1.3262, device='cuda:0') factor: tensor(0.1085, device='cuda:0')


Epoch 1, Step 1439: Loss = -0.0753, Actor Loss = -0.1199, Critic Loss = 0.0892, Entropy = 0.0334, Returns = 1.3607, Value = 1.2616, mask_loss = 0.0880std_loss = 0.2527:  26%|██▋       | 1439/5475 [6:27:09<16:41:41, 14.89s/it]

ratio tensor(1.2222, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4566, device='cuda:0') reward: tensor(1.3574, device='cuda:0') factor: tensor(0.0992, device='cuda:0')
reward_raw: tensor(0.4587, device='cuda:0') reward: tensor(1.3591, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.4451, device='cuda:0') reward: tensor(1.3478, device='cuda:0') factor: tensor(0.0974, device='cuda:0')
reward_raw: tensor(0.4622, device='cuda:0') reward: tensor(1.3567, device='cuda:0') factor: tensor(0.1055, device='cuda:0')
reward_raw: tensor(0.4540, device='cuda:0') reward: tensor(1.3534, device='cuda:0') factor: tensor(0.1006, device='cuda:0')


Epoch 1, Step 1440: Loss = -0.0897, Actor Loss = -0.1230, Critic Loss = 0.0666, Entropy = 0.0419, Returns = 1.4437, Value = 1.3715, mask_loss = 0.1086std_loss = 0.2830:  26%|██▋       | 1440/5475 [6:27:28<18:05:16, 16.14s/it]

ratio tensor(1.1935, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4670, device='cuda:0') reward: tensor(1.3713, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.4826, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.0942, device='cuda:0')
reward_raw: tensor(0.4591, device='cuda:0') reward: tensor(1.3630, device='cuda:0') factor: tensor(0.0961, device='cuda:0')
reward_raw: tensor(0.4994, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.4481, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.0946, device='cuda:0')


Epoch 1, Step 1441: Loss = 0.0098, Actor Loss = -0.0211, Critic Loss = 0.0618, Entropy = 0.0269, Returns = 1.4276, Value = 1.3859, mask_loss = 0.0950std_loss = 0.2757:  26%|██▋       | 1440/5475 [6:27:44<18:05:16, 16.14s/it] 

ratio tensor(1.2537, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1441: Loss = 0.0098, Actor Loss = -0.0211, Critic Loss = 0.0618, Entropy = 0.0269, Returns = 1.4276, Value = 1.3859, mask_loss = 0.0950std_loss = 0.2757:  26%|██▋       | 1441/5475 [6:27:44<18:03:27, 16.12s/it]

reward_raw: tensor(0.4585, device='cuda:0') reward: tensor(1.3372, device='cuda:0') factor: tensor(0.1213, device='cuda:0')
reward_raw: tensor(0.4690, device='cuda:0') reward: tensor(1.3460, device='cuda:0') factor: tensor(0.1230, device='cuda:0')
reward_raw: tensor(0.4787, device='cuda:0') reward: tensor(1.3571, device='cuda:0') factor: tensor(0.1217, device='cuda:0')
reward_raw: tensor(0.4641, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.1182, device='cuda:0')
reward_raw: tensor(0.4433, device='cuda:0') reward: tensor(1.3243, device='cuda:0') factor: tensor(0.1190, device='cuda:0')


Epoch 1, Step 1442: Loss = -0.1011, Actor Loss = -0.1287, Critic Loss = 0.0552, Entropy = 0.0300, Returns = 1.3515, Value = 1.3763, mask_loss = 0.0985std_loss = 0.2793:  26%|██▋       | 1442/5475 [6:28:00<17:51:38, 15.94s/it]

ratio tensor(0.9508, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4517, device='cuda:0') reward: tensor(1.3506, device='cuda:0') factor: tensor(0.1011, device='cuda:0')
reward_raw: tensor(0.4457, device='cuda:0') reward: tensor(1.3489, device='cuda:0') factor: tensor(0.0968, device='cuda:0')
reward_raw: tensor(0.4595, device='cuda:0') reward: tensor(1.3656, device='cuda:0') factor: tensor(0.0939, device='cuda:0')
reward_raw: tensor(0.4588, device='cuda:0') reward: tensor(1.3639, device='cuda:0') factor: tensor(0.0949, device='cuda:0')
reward_raw: tensor(0.4547, device='cuda:0') reward: tensor(1.3586, device='cuda:0') factor: tensor(0.0961, device='cuda:0')


Epoch 1, Step 1443: Loss = 0.1488, Actor Loss = 0.1136, Critic Loss = 0.0704, Entropy = 0.0451, Returns = 1.3834, Value = 1.4649, mask_loss = 0.0980std_loss = 0.2689:  26%|██▋       | 1443/5475 [6:28:19<18:51:12, 16.83s/it]  

ratio tensor(1.1335, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.4794, device='cuda:0') factor: tensor(0.0770, device='cuda:0')
reward_raw: tensor(0.5425, device='cuda:0') reward: tensor(1.4730, device='cuda:0') factor: tensor(0.0695, device='cuda:0')
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.4755, device='cuda:0') factor: tensor(0.0756, device='cuda:0')
reward_raw: tensor(0.5681, device='cuda:0') reward: tensor(1.4904, device='cuda:0') factor: tensor(0.0777, device='cuda:0')
reward_raw: tensor(0.5725, device='cuda:0') reward: tensor(1.4982, device='cuda:0') factor: tensor(0.0743, device='cuda:0')


Epoch 1, Step 1444: Loss = -0.0088, Actor Loss = -0.0652, Critic Loss = 0.1128, Entropy = 0.0269, Returns = 1.4350, Value = 1.4673, mask_loss = 0.0887std_loss = 0.2668:  26%|██▋       | 1444/5475 [6:28:41<20:45:20, 18.54s/it]

ratio tensor(0.8863, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3987, device='cuda:0') reward: tensor(1.2989, device='cuda:0') factor: tensor(0.0999, device='cuda:0')
reward_raw: tensor(0.3739, device='cuda:0') reward: tensor(1.2786, device='cuda:0') factor: tensor(0.0953, device='cuda:0')
reward_raw: tensor(0.3904, device='cuda:0') reward: tensor(1.2882, device='cuda:0') factor: tensor(0.1021, device='cuda:0')
reward_raw: tensor(0.3966, device='cuda:0') reward: tensor(1.2956, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.3892, device='cuda:0') reward: tensor(1.2882, device='cuda:0') factor: tensor(0.1010, device='cuda:0')


Epoch 1, Step 1445: Loss = 0.2190, Actor Loss = 0.2006, Critic Loss = 0.0369, Entropy = 0.0462, Returns = 1.3023, Value = 1.3389, mask_loss = 0.1261std_loss = 0.2979:  26%|██▋       | 1445/5475 [6:28:54<18:52:47, 16.87s/it]  

ratio tensor(0.6132, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4858, device='cuda:0') reward: tensor(1.3972, device='cuda:0') factor: tensor(0.0886, device='cuda:0')
reward_raw: tensor(0.4672, device='cuda:0') reward: tensor(1.3813, device='cuda:0') factor: tensor(0.0858, device='cuda:0')
reward_raw: tensor(0.4483, device='cuda:0') reward: tensor(1.3595, device='cuda:0') factor: tensor(0.0887, device='cuda:0')
reward_raw: tensor(0.4772, device='cuda:0') reward: tensor(1.3882, device='cuda:0') factor: tensor(0.0890, device='cuda:0')
reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.0893, device='cuda:0')


Epoch 1, Step 1446: Loss = -0.0672, Actor Loss = -0.1173, Critic Loss = 0.1002, Entropy = 0.0298, Returns = 1.4542, Value = 1.3244, mask_loss = 0.0824std_loss = 0.2565:  26%|██▋       | 1446/5475 [6:29:08<17:45:28, 15.87s/it]

ratio tensor(1.2148, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5124, device='cuda:0') reward: tensor(1.4078, device='cuda:0') factor: tensor(0.1046, device='cuda:0')
reward_raw: tensor(0.5508, device='cuda:0') reward: tensor(1.4467, device='cuda:0') factor: tensor(0.1041, device='cuda:0')
reward_raw: tensor(0.5407, device='cuda:0') reward: tensor(1.4396, device='cuda:0') factor: tensor(0.1011, device='cuda:0')
reward_raw: tensor(0.5278, device='cuda:0') reward: tensor(1.4219, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.5340, device='cuda:0') reward: tensor(1.4270, device='cuda:0') factor: tensor(0.1069, device='cuda:0')


Epoch 1, Step 1447: Loss = -0.0694, Actor Loss = -0.0963, Critic Loss = 0.0538, Entropy = 0.0383, Returns = 1.4762, Value = 1.5462, mask_loss = 0.1134std_loss = 0.2934:  26%|██▋       | 1447/5475 [6:29:19<16:14:19, 14.51s/it]

ratio tensor(1.0315, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.4664, device='cuda:0') factor: tensor(0.0832, device='cuda:0')
reward_raw: tensor(0.5674, device='cuda:0') reward: tensor(1.4759, device='cuda:0') factor: tensor(0.0915, device='cuda:0')
reward_raw: tensor(0.5392, device='cuda:0') reward: tensor(1.4524, device='cuda:0') factor: tensor(0.0868, device='cuda:0')
reward_raw: tensor(0.5616, device='cuda:0') reward: tensor(1.4734, device='cuda:0') factor: tensor(0.0882, device='cuda:0')
reward_raw: tensor(0.5275, device='cuda:0') reward: tensor(1.4385, device='cuda:0') factor: tensor(0.0890, device='cuda:0')


Epoch 1, Step 1448: Loss = 0.0746, Actor Loss = 0.0349, Critic Loss = 0.0795, Entropy = 0.0184, Returns = 1.5363, Value = 1.4525, mask_loss = 0.0750std_loss = 0.2496:  26%|██▋       | 1448/5475 [6:29:35<16:29:36, 14.74s/it]  

ratio tensor(1.0971, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4794, device='cuda:0') reward: tensor(1.3952, device='cuda:0') factor: tensor(0.0842, device='cuda:0')
reward_raw: tensor(0.4855, device='cuda:0') reward: tensor(1.3999, device='cuda:0') factor: tensor(0.0856, device='cuda:0')
reward_raw: tensor(0.4759, device='cuda:0') reward: tensor(1.3895, device='cuda:0') factor: tensor(0.0864, device='cuda:0')
reward_raw: tensor(0.4726, device='cuda:0') reward: tensor(1.3878, device='cuda:0') factor: tensor(0.0848, device='cuda:0')
reward_raw: tensor(0.4881, device='cuda:0') reward: tensor(1.4016, device='cuda:0') factor: tensor(0.0864, device='cuda:0')


Epoch 1, Step 1449: Loss = 0.0507, Actor Loss = 0.0089, Critic Loss = 0.0836, Entropy = 0.0288, Returns = 1.3409, Value = 1.3551, mask_loss = 0.0867std_loss = 0.2568:  26%|██▋       | 1449/5475 [6:29:52<17:19:34, 15.49s/it]

ratio tensor(1.1914, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3974, device='cuda:0') reward: tensor(1.3178, device='cuda:0') factor: tensor(0.0796, device='cuda:0')
reward_raw: tensor(0.4172, device='cuda:0') reward: tensor(1.3342, device='cuda:0') factor: tensor(0.0830, device='cuda:0')
reward_raw: tensor(0.4079, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.0836, device='cuda:0')
reward_raw: tensor(0.3886, device='cuda:0') reward: tensor(1.3014, device='cuda:0') factor: tensor(0.0872, device='cuda:0')
reward_raw: tensor(0.3933, device='cuda:0') reward: tensor(1.3115, device='cuda:0') factor: tensor(0.0818, device='cuda:0')


Epoch 1, Step 1450: Loss = 0.1636, Actor Loss = 0.1426, Critic Loss = 0.0420, Entropy = 0.0288, Returns = 1.2946, Value = 1.3390, mask_loss = 0.0854std_loss = 0.2593:  26%|██▋       | 1450/5475 [6:30:06<16:56:23, 15.15s/it]

ratio tensor(0.9483, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4839, device='cuda:0') reward: tensor(1.3834, device='cuda:0') factor: tensor(0.1005, device='cuda:0')
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.1048, device='cuda:0')
reward_raw: tensor(0.4964, device='cuda:0') reward: tensor(1.3926, device='cuda:0') factor: tensor(0.1038, device='cuda:0')
reward_raw: tensor(0.4927, device='cuda:0') reward: tensor(1.3874, device='cuda:0') factor: tensor(0.1053, device='cuda:0')
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.3937, device='cuda:0') factor: tensor(0.1031, device='cuda:0')


Epoch 1, Step 1451: Loss = -0.1501, Actor Loss = -0.1843, Critic Loss = 0.0685, Entropy = 0.0255, Returns = 1.3882, Value = 1.4591, mask_loss = 0.1097std_loss = 0.2957:  27%|██▋       | 1451/5475 [6:30:23<17:35:10, 15.73s/it]

ratio tensor(1.1413, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5132, device='cuda:0') reward: tensor(1.4120, device='cuda:0') factor: tensor(0.1012, device='cuda:0')
reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.4172, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.5210, device='cuda:0') reward: tensor(1.4219, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.4977, device='cuda:0') reward: tensor(1.3971, device='cuda:0') factor: tensor(0.1006, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.4092, device='cuda:0') factor: tensor(0.1001, device='cuda:0')


Epoch 1, Step 1452: Loss = 0.0896, Actor Loss = 0.0688, Critic Loss = 0.0416, Entropy = 0.0376, Returns = 1.4016, Value = 1.4203, mask_loss = 0.1019std_loss = 0.2787:  27%|██▋       | 1452/5475 [6:30:39<17:31:20, 15.68s/it]  

ratio tensor(0.8513, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4072, device='cuda:0') reward: tensor(1.3132, device='cuda:0') factor: tensor(0.0940, device='cuda:0')
reward_raw: tensor(0.4009, device='cuda:0') reward: tensor(1.3035, device='cuda:0') factor: tensor(0.0975, device='cuda:0')
reward_raw: tensor(0.3964, device='cuda:0') reward: tensor(1.2987, device='cuda:0') factor: tensor(0.0977, device='cuda:0')
reward_raw: tensor(0.4127, device='cuda:0') reward: tensor(1.3173, device='cuda:0') factor: tensor(0.0954, device='cuda:0')
reward_raw: tensor(0.3969, device='cuda:0') reward: tensor(1.3065, device='cuda:0') factor: tensor(0.0904, device='cuda:0')


Epoch 1, Step 1453: Loss = 0.1303, Actor Loss = 0.1050, Critic Loss = 0.0506, Entropy = 0.0323, Returns = 1.2913, Value = 1.2087, mask_loss = 0.0884std_loss = 0.2635:  27%|██▋       | 1453/5475 [6:30:54<17:29:03, 15.65s/it]

ratio tensor(0.8096, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.3874, device='cuda:0') factor: tensor(0.0968, device='cuda:0')
reward_raw: tensor(0.4936, device='cuda:0') reward: tensor(1.4010, device='cuda:0') factor: tensor(0.0926, device='cuda:0')
reward_raw: tensor(0.4695, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.0921, device='cuda:0')
reward_raw: tensor(0.4853, device='cuda:0') reward: tensor(1.3896, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.4906, device='cuda:0') reward: tensor(1.3955, device='cuda:0') factor: tensor(0.0951, device='cuda:0')


Epoch 1, Step 1454: Loss = -0.1500, Actor Loss = -0.2038, Critic Loss = 0.1078, Entropy = 0.0402, Returns = 1.3363, Value = 1.3364, mask_loss = 0.0977std_loss = 0.2732:  27%|██▋       | 1454/5475 [6:31:07<16:21:51, 14.65s/it]

ratio tensor(2.0462, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4242, device='cuda:0') reward: tensor(1.3391, device='cuda:0') factor: tensor(0.0851, device='cuda:0')
reward_raw: tensor(0.4425, device='cuda:0') reward: tensor(1.3569, device='cuda:0') factor: tensor(0.0856, device='cuda:0')
reward_raw: tensor(0.4597, device='cuda:0') reward: tensor(1.3709, device='cuda:0') factor: tensor(0.0889, device='cuda:0')
reward_raw: tensor(0.4246, device='cuda:0') reward: tensor(1.3367, device='cuda:0') factor: tensor(0.0879, device='cuda:0')
reward_raw: tensor(0.4444, device='cuda:0') reward: tensor(1.3560, device='cuda:0') factor: tensor(0.0883, device='cuda:0')


Epoch 1, Step 1455: Loss = 0.0418, Actor Loss = -0.0044, Critic Loss = 0.0925, Entropy = 0.0233, Returns = 1.4272, Value = 1.3366, mask_loss = 0.0814std_loss = 0.2556:  27%|██▋       | 1455/5475 [6:31:23<17:02:27, 15.26s/it] 

ratio tensor(1.0534, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4588, device='cuda:0') reward: tensor(1.3825, device='cuda:0') factor: tensor(0.0763, device='cuda:0')
reward_raw: tensor(0.5193, device='cuda:0') reward: tensor(1.4387, device='cuda:0') factor: tensor(0.0806, device='cuda:0')
reward_raw: tensor(0.4728, device='cuda:0') reward: tensor(1.3911, device='cuda:0') factor: tensor(0.0817, device='cuda:0')
reward_raw: tensor(0.5248, device='cuda:0') reward: tensor(1.4410, device='cuda:0') factor: tensor(0.0838, device='cuda:0')
reward_raw: tensor(0.4720, device='cuda:0') reward: tensor(1.3940, device='cuda:0') factor: tensor(0.0780, device='cuda:0')


Epoch 1, Step 1456: Loss = -0.1031, Actor Loss = -0.1386, Critic Loss = 0.0711, Entropy = 0.0272, Returns = 1.4073, Value = 1.4310, mask_loss = 0.0874std_loss = 0.2620:  27%|██▋       | 1456/5475 [6:31:39<17:06:38, 15.33s/it]

ratio tensor(1.3222, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5712, device='cuda:0') reward: tensor(1.4946, device='cuda:0') factor: tensor(0.0766, device='cuda:0')
reward_raw: tensor(0.5636, device='cuda:0') reward: tensor(1.4846, device='cuda:0') factor: tensor(0.0789, device='cuda:0')
reward_raw: tensor(0.5577, device='cuda:0') reward: tensor(1.4801, device='cuda:0') factor: tensor(0.0776, device='cuda:0')
reward_raw: tensor(0.5570, device='cuda:0') reward: tensor(1.4774, device='cuda:0') factor: tensor(0.0796, device='cuda:0')
reward_raw: tensor(0.5340, device='cuda:0') reward: tensor(1.4543, device='cuda:0') factor: tensor(0.0797, device='cuda:0')


Epoch 1, Step 1457: Loss = 0.0355, Actor Loss = -0.0076, Critic Loss = 0.0862, Entropy = 0.0204, Returns = 1.4839, Value = 1.5037, mask_loss = 0.0844std_loss = 0.2639:  27%|██▋       | 1457/5475 [6:31:58<18:24:38, 16.50s/it] 

ratio tensor(1.0078, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4671, device='cuda:0') reward: tensor(1.3881, device='cuda:0') factor: tensor(0.0790, device='cuda:0')
reward_raw: tensor(0.4713, device='cuda:0') reward: tensor(1.3915, device='cuda:0') factor: tensor(0.0799, device='cuda:0')
reward_raw: tensor(0.4660, device='cuda:0') reward: tensor(1.3888, device='cuda:0') factor: tensor(0.0772, device='cuda:0')
reward_raw: tensor(0.4734, device='cuda:0') reward: tensor(1.3937, device='cuda:0') factor: tensor(0.0798, device='cuda:0')
reward_raw: tensor(0.4893, device='cuda:0') reward: tensor(1.4072, device='cuda:0') factor: tensor(0.0821, device='cuda:0')


Epoch 1, Step 1458: Loss = 0.0987, Actor Loss = 0.0757, Critic Loss = 0.0460, Entropy = 0.0231, Returns = 1.4258, Value = 1.2953, mask_loss = 0.0726std_loss = 0.2416:  27%|██▋       | 1458/5475 [6:32:19<19:42:31, 17.66s/it] 

ratio tensor(0.8719, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4544, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.0925, device='cuda:0')
reward_raw: tensor(0.4273, device='cuda:0') reward: tensor(1.3394, device='cuda:0') factor: tensor(0.0879, device='cuda:0')
reward_raw: tensor(0.4088, device='cuda:0') reward: tensor(1.3221, device='cuda:0') factor: tensor(0.0867, device='cuda:0')
reward_raw: tensor(0.4429, device='cuda:0') reward: tensor(1.3557, device='cuda:0') factor: tensor(0.0872, device='cuda:0')
reward_raw: tensor(0.4484, device='cuda:0') reward: tensor(1.3626, device='cuda:0') factor: tensor(0.0859, device='cuda:0')


Epoch 1, Step 1459: Loss = -0.0487, Actor Loss = -0.0700, Critic Loss = 0.0428, Entropy = 0.0257, Returns = 1.3406, Value = 1.4090, mask_loss = 0.0835std_loss = 0.2556:  27%|██▋       | 1459/5475 [6:32:35<19:11:18, 17.20s/it]

ratio tensor(1.0407, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4299, device='cuda:0') reward: tensor(1.3480, device='cuda:0') factor: tensor(0.0819, device='cuda:0')
reward_raw: tensor(0.4396, device='cuda:0') reward: tensor(1.3571, device='cuda:0') factor: tensor(0.0825, device='cuda:0')
reward_raw: tensor(0.4382, device='cuda:0') reward: tensor(1.3545, device='cuda:0') factor: tensor(0.0837, device='cuda:0')
reward_raw: tensor(0.4175, device='cuda:0') reward: tensor(1.3361, device='cuda:0') factor: tensor(0.0814, device='cuda:0')
reward_raw: tensor(0.4476, device='cuda:0') reward: tensor(1.3671, device='cuda:0') factor: tensor(0.0805, device='cuda:0')


Epoch 1, Step 1460: Loss = 0.1010, Actor Loss = 0.0779, Critic Loss = 0.0462, Entropy = 0.0260, Returns = 1.3280, Value = 1.3558, mask_loss = 0.0802std_loss = 0.2548:  27%|██▋       | 1460/5475 [6:32:49<18:21:39, 16.46s/it]  

ratio tensor(1.3028, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5409, device='cuda:0') reward: tensor(1.4456, device='cuda:0') factor: tensor(0.0954, device='cuda:0')
reward_raw: tensor(0.5361, device='cuda:0') reward: tensor(1.4439, device='cuda:0') factor: tensor(0.0922, device='cuda:0')
reward_raw: tensor(0.5341, device='cuda:0') reward: tensor(1.4404, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.5473, device='cuda:0') reward: tensor(1.4559, device='cuda:0') factor: tensor(0.0914, device='cuda:0')
reward_raw: tensor(0.5321, device='cuda:0') reward: tensor(1.4420, device='cuda:0') factor: tensor(0.0901, device='cuda:0')


Epoch 1, Step 1461: Loss = -0.1617, Actor Loss = -0.1929, Critic Loss = 0.0626, Entropy = 0.0240, Returns = 1.4841, Value = 1.4085, mask_loss = 0.0966std_loss = 0.2768:  27%|██▋       | 1460/5475 [6:33:04<18:21:39, 16.46s/it]

ratio tensor(1.2070, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1461: Loss = -0.1617, Actor Loss = -0.1929, Critic Loss = 0.0626, Entropy = 0.0240, Returns = 1.4841, Value = 1.4085, mask_loss = 0.0966std_loss = 0.2768:  27%|██▋       | 1461/5475 [6:33:04<17:46:22, 15.94s/it]

reward_raw: tensor(0.4841, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.0970, device='cuda:0')
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.4092, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.3997, device='cuda:0') factor: tensor(0.0931, device='cuda:0')
reward_raw: tensor(0.5011, device='cuda:0') reward: tensor(1.4060, device='cuda:0') factor: tensor(0.0952, device='cuda:0')
reward_raw: tensor(0.4882, device='cuda:0') reward: tensor(1.3949, device='cuda:0') factor: tensor(0.0933, device='cuda:0')


Epoch 1, Step 1462: Loss = 0.0332, Actor Loss = 0.0096, Critic Loss = 0.0474, Entropy = 0.0284, Returns = 1.4014, Value = 1.3669, mask_loss = 0.1058std_loss = 0.2851:  27%|██▋       | 1462/5475 [6:33:23<18:35:56, 16.68s/it]  

ratio tensor(0.9097, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4631, device='cuda:0') reward: tensor(1.3729, device='cuda:0') factor: tensor(0.0902, device='cuda:0')
reward_raw: tensor(0.4778, device='cuda:0') reward: tensor(1.3871, device='cuda:0') factor: tensor(0.0908, device='cuda:0')
reward_raw: tensor(0.4512, device='cuda:0') reward: tensor(1.3595, device='cuda:0') factor: tensor(0.0917, device='cuda:0')
reward_raw: tensor(0.4763, device='cuda:0') reward: tensor(1.3830, device='cuda:0') factor: tensor(0.0933, device='cuda:0')
reward_raw: tensor(0.4597, device='cuda:0') reward: tensor(1.3678, device='cuda:0') factor: tensor(0.0920, device='cuda:0')


Epoch 1, Step 1463: Loss = -0.0523, Actor Loss = -0.0853, Critic Loss = 0.0662, Entropy = 0.0270, Returns = 1.3245, Value = 1.3706, mask_loss = 0.0851std_loss = 0.2636:  27%|██▋       | 1463/5475 [6:33:36<17:28:50, 15.69s/it]

ratio tensor(1.2413, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4556, device='cuda:0') reward: tensor(1.3710, device='cuda:0') factor: tensor(0.0846, device='cuda:0')
reward_raw: tensor(0.4247, device='cuda:0') reward: tensor(1.3404, device='cuda:0') factor: tensor(0.0842, device='cuda:0')
reward_raw: tensor(0.4332, device='cuda:0') reward: tensor(1.3475, device='cuda:0') factor: tensor(0.0857, device='cuda:0')
reward_raw: tensor(0.4288, device='cuda:0') reward: tensor(1.3446, device='cuda:0') factor: tensor(0.0843, device='cuda:0')
reward_raw: tensor(0.4321, device='cuda:0') reward: tensor(1.3471, device='cuda:0') factor: tensor(0.0850, device='cuda:0')


Epoch 1, Step 1464: Loss = 0.1684, Actor Loss = 0.1254, Critic Loss = 0.0861, Entropy = 0.0226, Returns = 1.3777, Value = 1.3850, mask_loss = 0.1005std_loss = 0.2832:  27%|██▋       | 1464/5475 [6:33:54<18:16:40, 16.41s/it]  

ratio tensor(0.9201, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4039, device='cuda:0') reward: tensor(1.3332, device='cuda:0') factor: tensor(0.0708, device='cuda:0')
reward_raw: tensor(0.4086, device='cuda:0') reward: tensor(1.3375, device='cuda:0') factor: tensor(0.0711, device='cuda:0')
reward_raw: tensor(0.3940, device='cuda:0') reward: tensor(1.3217, device='cuda:0') factor: tensor(0.0723, device='cuda:0')
reward_raw: tensor(0.4077, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.0722, device='cuda:0')
reward_raw: tensor(0.3997, device='cuda:0') reward: tensor(1.3261, device='cuda:0') factor: tensor(0.0736, device='cuda:0')


Epoch 1, Step 1465: Loss = 0.1021, Actor Loss = 0.0692, Critic Loss = 0.0658, Entropy = 0.0236, Returns = 1.2664, Value = 1.3445, mask_loss = 0.0755std_loss = 0.2462:  27%|██▋       | 1465/5475 [6:34:12<18:59:28, 17.05s/it]

ratio tensor(0.7394, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5795, device='cuda:0') reward: tensor(1.4962, device='cuda:0') factor: tensor(0.0833, device='cuda:0')
reward_raw: tensor(0.4939, device='cuda:0') reward: tensor(1.4091, device='cuda:0') factor: tensor(0.0848, device='cuda:0')
reward_raw: tensor(0.6076, device='cuda:0') reward: tensor(1.5220, device='cuda:0') factor: tensor(0.0856, device='cuda:0')
reward_raw: tensor(0.5882, device='cuda:0') reward: tensor(1.4985, device='cuda:0') factor: tensor(0.0898, device='cuda:0')
reward_raw: tensor(0.5955, device='cuda:0') reward: tensor(1.5094, device='cuda:0') factor: tensor(0.0861, device='cuda:0')


Epoch 1, Step 1466: Loss = -0.2535, Actor Loss = -0.2883, Critic Loss = 0.0696, Entropy = 0.0320, Returns = 1.5390, Value = 1.6286, mask_loss = 0.0840std_loss = 0.2561:  27%|██▋       | 1466/5475 [6:34:27<17:58:35, 16.14s/it]

ratio tensor(1.4880, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5394, device='cuda:0') reward: tensor(1.4533, device='cuda:0') factor: tensor(0.0860, device='cuda:0')
reward_raw: tensor(0.5502, device='cuda:0') reward: tensor(1.4606, device='cuda:0') factor: tensor(0.0896, device='cuda:0')
reward_raw: tensor(0.5506, device='cuda:0') reward: tensor(1.4639, device='cuda:0') factor: tensor(0.0867, device='cuda:0')
reward_raw: tensor(0.5626, device='cuda:0') reward: tensor(1.4752, device='cuda:0') factor: tensor(0.0874, device='cuda:0')
reward_raw: tensor(0.5421, device='cuda:0') reward: tensor(1.4556, device='cuda:0') factor: tensor(0.0865, device='cuda:0')


Epoch 1, Step 1467: Loss = 0.3338, Actor Loss = 0.2902, Critic Loss = 0.0873, Entropy = 0.0360, Returns = 1.4141, Value = 1.5445, mask_loss = 0.0892std_loss = 0.2585:  27%|██▋       | 1467/5475 [6:34:39<16:49:22, 15.11s/it]  

ratio tensor(0.7640, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4160, device='cuda:0') reward: tensor(1.3312, device='cuda:0') factor: tensor(0.0848, device='cuda:0')
reward_raw: tensor(0.4227, device='cuda:0') reward: tensor(1.3372, device='cuda:0') factor: tensor(0.0855, device='cuda:0')
reward_raw: tensor(0.4256, device='cuda:0') reward: tensor(1.3409, device='cuda:0') factor: tensor(0.0847, device='cuda:0')
reward_raw: tensor(0.4476, device='cuda:0') reward: tensor(1.3634, device='cuda:0') factor: tensor(0.0842, device='cuda:0')
reward_raw: tensor(0.4309, device='cuda:0') reward: tensor(1.3473, device='cuda:0') factor: tensor(0.0836, device='cuda:0')


Epoch 1, Step 1468: Loss = -0.0205, Actor Loss = -0.0797, Critic Loss = 0.1185, Entropy = 0.0241, Returns = 1.4055, Value = 1.1124, mask_loss = 0.0820std_loss = 0.2518:  27%|██▋       | 1468/5475 [6:35:01<18:52:40, 16.96s/it]

ratio tensor(1.0994, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4766, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.0815, device='cuda:0')
reward_raw: tensor(0.4965, device='cuda:0') reward: tensor(1.4126, device='cuda:0') factor: tensor(0.0839, device='cuda:0')
reward_raw: tensor(0.4686, device='cuda:0') reward: tensor(1.3849, device='cuda:0') factor: tensor(0.0837, device='cuda:0')
reward_raw: tensor(0.4800, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.0815, device='cuda:0')
reward_raw: tensor(0.4887, device='cuda:0') reward: tensor(1.4023, device='cuda:0') factor: tensor(0.0863, device='cuda:0')


Epoch 1, Step 1469: Loss = -0.1063, Actor Loss = -0.1377, Critic Loss = 0.0628, Entropy = 0.0264, Returns = 1.4410, Value = 1.4312, mask_loss = 0.0813std_loss = 0.2549:  27%|██▋       | 1469/5475 [6:35:13<17:22:02, 15.61s/it]

ratio tensor(1.3923, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4014, device='cuda:0') reward: tensor(1.3194, device='cuda:0') factor: tensor(0.0820, device='cuda:0')
reward_raw: tensor(0.4060, device='cuda:0') reward: tensor(1.3215, device='cuda:0') factor: tensor(0.0845, device='cuda:0')
reward_raw: tensor(0.3938, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.0809, device='cuda:0')
reward_raw: tensor(0.3797, device='cuda:0') reward: tensor(1.2972, device='cuda:0') factor: tensor(0.0825, device='cuda:0')
reward_raw: tensor(0.3607, device='cuda:0') reward: tensor(1.2842, device='cuda:0') factor: tensor(0.0766, device='cuda:0')


Epoch 1, Step 1470: Loss = 0.0090, Actor Loss = -0.0367, Critic Loss = 0.0915, Entropy = 0.0243, Returns = 1.3708, Value = 1.3330, mask_loss = 0.0677std_loss = 0.2259:  27%|██▋       | 1470/5475 [6:35:30<17:54:17, 16.09s/it] 

ratio tensor(0.8128, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3281, device='cuda:0') reward: tensor(1.2601, device='cuda:0') factor: tensor(0.0680, device='cuda:0')
reward_raw: tensor(0.3478, device='cuda:0') reward: tensor(1.2799, device='cuda:0') factor: tensor(0.0679, device='cuda:0')
reward_raw: tensor(0.3386, device='cuda:0') reward: tensor(1.2690, device='cuda:0') factor: tensor(0.0695, device='cuda:0')
reward_raw: tensor(0.3342, device='cuda:0') reward: tensor(1.2612, device='cuda:0') factor: tensor(0.0730, device='cuda:0')
reward_raw: tensor(0.3342, device='cuda:0') reward: tensor(1.2675, device='cuda:0') factor: tensor(0.0668, device='cuda:0')


Epoch 1, Step 1471: Loss = -0.0449, Actor Loss = -0.0693, Critic Loss = 0.0488, Entropy = 0.0206, Returns = 1.3078, Value = 1.2196, mask_loss = 0.0742std_loss = 0.2453:  27%|██▋       | 1471/5475 [6:35:47<18:04:02, 16.24s/it]

ratio tensor(1.0360, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.4366, device='cuda:0') factor: tensor(0.0746, device='cuda:0')
reward_raw: tensor(0.4711, device='cuda:0') reward: tensor(1.3972, device='cuda:0') factor: tensor(0.0739, device='cuda:0')
reward_raw: tensor(0.4894, device='cuda:0') reward: tensor(1.4161, device='cuda:0') factor: tensor(0.0732, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.4231, device='cuda:0') factor: tensor(0.0767, device='cuda:0')
reward_raw: tensor(0.5188, device='cuda:0') reward: tensor(1.4443, device='cuda:0') factor: tensor(0.0744, device='cuda:0')


Epoch 1, Step 1472: Loss = -0.1244, Actor Loss = -0.1496, Critic Loss = 0.0503, Entropy = 0.0202, Returns = 1.4307, Value = 1.3772, mask_loss = 0.0695std_loss = 0.2392:  27%|██▋       | 1472/5475 [6:36:03<17:55:15, 16.12s/it]

ratio tensor(1.4376, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4838, device='cuda:0') reward: tensor(1.4049, device='cuda:0') factor: tensor(0.0788, device='cuda:0')
reward_raw: tensor(0.4747, device='cuda:0') reward: tensor(1.3983, device='cuda:0') factor: tensor(0.0764, device='cuda:0')
reward_raw: tensor(0.4841, device='cuda:0') reward: tensor(1.4027, device='cuda:0') factor: tensor(0.0814, device='cuda:0')
reward_raw: tensor(0.4598, device='cuda:0') reward: tensor(1.3828, device='cuda:0') factor: tensor(0.0770, device='cuda:0')
reward_raw: tensor(0.4780, device='cuda:0') reward: tensor(1.3990, device='cuda:0') factor: tensor(0.0791, device='cuda:0')


Epoch 1, Step 1473: Loss = -0.0675, Actor Loss = -0.0902, Critic Loss = 0.0454, Entropy = 0.0212, Returns = 1.3854, Value = 1.3214, mask_loss = 0.0783std_loss = 0.2552:  27%|██▋       | 1473/5475 [6:36:19<18:05:50, 16.28s/it]

ratio tensor(0.9658, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.4321, device='cuda:0') factor: tensor(0.0931, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.4488, device='cuda:0') factor: tensor(0.0929, device='cuda:0')
reward_raw: tensor(0.5421, device='cuda:0') reward: tensor(1.4478, device='cuda:0') factor: tensor(0.0943, device='cuda:0')
reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(1.4313, device='cuda:0') factor: tensor(0.0903, device='cuda:0')
reward_raw: tensor(0.5165, device='cuda:0') reward: tensor(1.4265, device='cuda:0') factor: tensor(0.0900, device='cuda:0')


Epoch 1, Step 1474: Loss = -0.1491, Actor Loss = -0.1852, Critic Loss = 0.0723, Entropy = 0.0281, Returns = 1.4919, Value = 1.5341, mask_loss = 0.1017std_loss = 0.2819:  27%|██▋       | 1474/5475 [6:36:32<16:57:16, 15.26s/it]

ratio tensor(1.2435, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4601, device='cuda:0') reward: tensor(1.3569, device='cuda:0') factor: tensor(0.1032, device='cuda:0')
reward_raw: tensor(0.4616, device='cuda:0') reward: tensor(1.3621, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.4682, device='cuda:0') reward: tensor(1.3662, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.4529, device='cuda:0') reward: tensor(1.3515, device='cuda:0') factor: tensor(0.1015, device='cuda:0')
reward_raw: tensor(0.4357, device='cuda:0') reward: tensor(1.3340, device='cuda:0') factor: tensor(0.1016, device='cuda:0')


Epoch 1, Step 1475: Loss = 0.1412, Actor Loss = 0.0948, Critic Loss = 0.0929, Entropy = 0.0389, Returns = 1.3840, Value = 1.2458, mask_loss = 0.1211std_loss = 0.2985:  27%|██▋       | 1475/5475 [6:36:46<16:23:09, 14.75s/it]  

ratio tensor(0.4708, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5012, device='cuda:0') reward: tensor(1.4083, device='cuda:0') factor: tensor(0.0929, device='cuda:0')
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.4012, device='cuda:0') factor: tensor(0.0984, device='cuda:0')
reward_raw: tensor(0.4829, device='cuda:0') reward: tensor(1.3890, device='cuda:0') factor: tensor(0.0938, device='cuda:0')
reward_raw: tensor(0.4912, device='cuda:0') reward: tensor(1.3959, device='cuda:0') factor: tensor(0.0953, device='cuda:0')
reward_raw: tensor(0.4838, device='cuda:0') reward: tensor(1.3894, device='cuda:0') factor: tensor(0.0944, device='cuda:0')


Epoch 1, Step 1476: Loss = -0.0977, Actor Loss = -0.1438, Critic Loss = 0.0922, Entropy = 0.0339, Returns = 1.4207, Value = 1.5063, mask_loss = 0.0915std_loss = 0.2679:  27%|██▋       | 1476/5475 [6:37:03<17:09:36, 15.45s/it]

ratio tensor(2.0656, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4797, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.1053, device='cuda:0')
reward_raw: tensor(0.4563, device='cuda:0') reward: tensor(1.3538, device='cuda:0') factor: tensor(0.1025, device='cuda:0')
reward_raw: tensor(0.4755, device='cuda:0') reward: tensor(1.3704, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.4629, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.1097, device='cuda:0')
reward_raw: tensor(0.4579, device='cuda:0') reward: tensor(1.3554, device='cuda:0') factor: tensor(0.1025, device='cuda:0')


Epoch 1, Step 1477: Loss = 0.2300, Actor Loss = 0.1935, Critic Loss = 0.0731, Entropy = 0.0259, Returns = 1.3596, Value = 1.3423, mask_loss = 0.0835std_loss = 0.2572:  27%|██▋       | 1477/5475 [6:37:20<17:39:55, 15.91s/it]  

ratio tensor(0.9039, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5139, device='cuda:0') reward: tensor(1.4310, device='cuda:0') factor: tensor(0.0830, device='cuda:0')
reward_raw: tensor(0.4962, device='cuda:0') reward: tensor(1.4137, device='cuda:0') factor: tensor(0.0825, device='cuda:0')
reward_raw: tensor(0.4977, device='cuda:0') reward: tensor(1.4126, device='cuda:0') factor: tensor(0.0851, device='cuda:0')
reward_raw: tensor(0.5287, device='cuda:0') reward: tensor(1.4455, device='cuda:0') factor: tensor(0.0832, device='cuda:0')
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.4200, device='cuda:0') factor: tensor(0.0796, device='cuda:0')


Epoch 1, Step 1478: Loss = -0.2054, Actor Loss = -0.2475, Critic Loss = 0.0843, Entropy = 0.0288, Returns = 1.4215, Value = 1.4086, mask_loss = 0.0897std_loss = 0.2674:  27%|██▋       | 1478/5475 [6:37:34<17:06:55, 15.42s/it]

ratio tensor(1.2476, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5375, device='cuda:0') reward: tensor(1.4465, device='cuda:0') factor: tensor(0.0910, device='cuda:0')
reward_raw: tensor(0.5500, device='cuda:0') reward: tensor(1.4599, device='cuda:0') factor: tensor(0.0901, device='cuda:0')
reward_raw: tensor(0.5385, device='cuda:0') reward: tensor(1.4474, device='cuda:0') factor: tensor(0.0911, device='cuda:0')
reward_raw: tensor(0.5531, device='cuda:0') reward: tensor(1.4587, device='cuda:0') factor: tensor(0.0944, device='cuda:0')
reward_raw: tensor(0.5479, device='cuda:0') reward: tensor(1.4581, device='cuda:0') factor: tensor(0.0897, device='cuda:0')


Epoch 1, Step 1479: Loss = 0.2142, Actor Loss = 0.1604, Critic Loss = 0.1075, Entropy = 0.0229, Returns = 1.4238, Value = 1.4799, mask_loss = 0.0867std_loss = 0.2611:  27%|██▋       | 1479/5475 [6:37:51<17:29:51, 15.76s/it]  

ratio tensor(0.8955, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4656, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.0874, device='cuda:0')
reward_raw: tensor(0.4510, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.0890, device='cuda:0')
reward_raw: tensor(0.4650, device='cuda:0') reward: tensor(1.3792, device='cuda:0') factor: tensor(0.0858, device='cuda:0')
reward_raw: tensor(0.4650, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.0875, device='cuda:0')
reward_raw: tensor(0.4622, device='cuda:0') reward: tensor(1.3743, device='cuda:0') factor: tensor(0.0879, device='cuda:0')


Epoch 1, Step 1480: Loss = 0.0808, Actor Loss = 0.0598, Critic Loss = 0.0420, Entropy = 0.0222, Returns = 1.3447, Value = 1.2314, mask_loss = 0.0805std_loss = 0.2569:  27%|██▋       | 1480/5475 [6:38:07<17:36:02, 15.86s/it]

ratio tensor(0.8356, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4597, device='cuda:0') reward: tensor(1.3733, device='cuda:0') factor: tensor(0.0864, device='cuda:0')
reward_raw: tensor(0.4714, device='cuda:0') reward: tensor(1.3839, device='cuda:0') factor: tensor(0.0874, device='cuda:0')
reward_raw: tensor(0.4535, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.0832, device='cuda:0')
reward_raw: tensor(0.4566, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.0821, device='cuda:0')
reward_raw: tensor(0.4554, device='cuda:0') reward: tensor(1.3704, device='cuda:0') factor: tensor(0.0850, device='cuda:0')


Epoch 1, Step 1481: Loss = 0.0244, Actor Loss = -0.0182, Critic Loss = 0.0852, Entropy = 0.0331, Returns = 1.3414, Value = 1.4101, mask_loss = 0.0903std_loss = 0.2647:  27%|██▋       | 1480/5475 [6:38:21<17:36:02, 15.86s/it]

ratio tensor(0.9941, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1481: Loss = 0.0244, Actor Loss = -0.0182, Critic Loss = 0.0852, Entropy = 0.0331, Returns = 1.3414, Value = 1.4101, mask_loss = 0.0903std_loss = 0.2647:  27%|██▋       | 1481/5475 [6:38:22<17:16:37, 15.57s/it]

reward_raw: tensor(0.4762, device='cuda:0') reward: tensor(1.4052, device='cuda:0') factor: tensor(0.0710, device='cuda:0')
reward_raw: tensor(0.4626, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.0675, device='cuda:0')
reward_raw: tensor(0.4448, device='cuda:0') reward: tensor(1.3785, device='cuda:0') factor: tensor(0.0662, device='cuda:0')
reward_raw: tensor(0.4553, device='cuda:0') reward: tensor(1.3856, device='cuda:0') factor: tensor(0.0697, device='cuda:0')
reward_raw: tensor(0.4677, device='cuda:0') reward: tensor(1.4016, device='cuda:0') factor: tensor(0.0661, device='cuda:0')


Epoch 1, Step 1482: Loss = -0.0145, Actor Loss = -0.0508, Critic Loss = 0.0726, Entropy = 0.0214, Returns = 1.4067, Value = 1.4537, mask_loss = 0.0804std_loss = 0.2434:  27%|██▋       | 1482/5475 [6:38:39<17:59:17, 16.22s/it]

ratio tensor(1.0858, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5311, device='cuda:0') reward: tensor(1.4485, device='cuda:0') factor: tensor(0.0825, device='cuda:0')
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.4556, device='cuda:0') factor: tensor(0.0796, device='cuda:0')
reward_raw: tensor(0.5285, device='cuda:0') reward: tensor(1.4492, device='cuda:0') factor: tensor(0.0793, device='cuda:0')
reward_raw: tensor(0.5595, device='cuda:0') reward: tensor(1.4771, device='cuda:0') factor: tensor(0.0825, device='cuda:0')
reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.4691, device='cuda:0') factor: tensor(0.0805, device='cuda:0')


Epoch 1, Step 1483: Loss = -0.0489, Actor Loss = -0.0961, Critic Loss = 0.0945, Entropy = 0.0190, Returns = 1.5552, Value = 1.5398, mask_loss = 0.0699std_loss = 0.2393:  27%|██▋       | 1483/5475 [6:38:55<17:49:00, 16.07s/it]

ratio tensor(1.0028, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5610, device='cuda:0') reward: tensor(1.4719, device='cuda:0') factor: tensor(0.0891, device='cuda:0')
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.4744, device='cuda:0') factor: tensor(0.0917, device='cuda:0')
reward_raw: tensor(0.5673, device='cuda:0') reward: tensor(1.4742, device='cuda:0') factor: tensor(0.0931, device='cuda:0')
reward_raw: tensor(0.5753, device='cuda:0') reward: tensor(1.4867, device='cuda:0') factor: tensor(0.0886, device='cuda:0')
reward_raw: tensor(0.5668, device='cuda:0') reward: tensor(1.4773, device='cuda:0') factor: tensor(0.0896, device='cuda:0')


Epoch 1, Step 1484: Loss = 0.1065, Actor Loss = 0.0778, Critic Loss = 0.0575, Entropy = 0.0371, Returns = 1.4613, Value = 1.5352, mask_loss = 0.1008std_loss = 0.2725:  27%|██▋       | 1484/5475 [6:39:18<20:05:39, 18.13s/it]  

ratio tensor(0.9114, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4443, device='cuda:0') reward: tensor(1.3243, device='cuda:0') factor: tensor(0.1200, device='cuda:0')
reward_raw: tensor(0.4362, device='cuda:0') reward: tensor(1.3166, device='cuda:0') factor: tensor(0.1195, device='cuda:0')
reward_raw: tensor(0.4233, device='cuda:0') reward: tensor(1.3027, device='cuda:0') factor: tensor(0.1207, device='cuda:0')
reward_raw: tensor(0.4384, device='cuda:0') reward: tensor(1.3219, device='cuda:0') factor: tensor(0.1164, device='cuda:0')
reward_raw: tensor(0.4462, device='cuda:0') reward: tensor(1.3251, device='cuda:0') factor: tensor(0.1211, device='cuda:0')


Epoch 1, Step 1485: Loss = 0.2135, Actor Loss = 0.1912, Critic Loss = 0.0446, Entropy = 0.0344, Returns = 1.3118, Value = 1.2661, mask_loss = 0.1172std_loss = 0.2949:  27%|██▋       | 1485/5475 [6:39:34<19:14:06, 17.36s/it]

ratio tensor(0.7915, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4982, device='cuda:0') reward: tensor(1.4120, device='cuda:0') factor: tensor(0.0862, device='cuda:0')
reward_raw: tensor(0.5223, device='cuda:0') reward: tensor(1.4394, device='cuda:0') factor: tensor(0.0829, device='cuda:0')
reward_raw: tensor(0.5071, device='cuda:0') reward: tensor(1.4237, device='cuda:0') factor: tensor(0.0834, device='cuda:0')
reward_raw: tensor(0.4891, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.0834, device='cuda:0')
reward_raw: tensor(0.5009, device='cuda:0') reward: tensor(1.4166, device='cuda:0') factor: tensor(0.0844, device='cuda:0')


Epoch 1, Step 1486: Loss = -0.3357, Actor Loss = -0.3563, Critic Loss = 0.0411, Entropy = 0.0208, Returns = 1.5134, Value = 1.4654, mask_loss = 0.0796std_loss = 0.2576:  27%|██▋       | 1486/5475 [6:39:48<18:18:55, 16.53s/it]

ratio tensor(1.2856, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4773, device='cuda:0') reward: tensor(1.3751, device='cuda:0') factor: tensor(0.1023, device='cuda:0')
reward_raw: tensor(0.4748, device='cuda:0') reward: tensor(1.3777, device='cuda:0') factor: tensor(0.0971, device='cuda:0')
reward_raw: tensor(0.4610, device='cuda:0') reward: tensor(1.3616, device='cuda:0') factor: tensor(0.0993, device='cuda:0')
reward_raw: tensor(0.4784, device='cuda:0') reward: tensor(1.3747, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.4847, device='cuda:0') reward: tensor(1.3921, device='cuda:0') factor: tensor(0.0926, device='cuda:0')


Epoch 1, Step 1487: Loss = -0.0518, Actor Loss = -0.0871, Critic Loss = 0.0705, Entropy = 0.0292, Returns = 1.4136, Value = 1.4414, mask_loss = 0.1092std_loss = 0.2816:  27%|██▋       | 1487/5475 [6:40:04<18:00:33, 16.26s/it]

ratio tensor(0.9239, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4359, device='cuda:0') reward: tensor(1.3394, device='cuda:0') factor: tensor(0.0965, device='cuda:0')
reward_raw: tensor(0.4380, device='cuda:0') reward: tensor(1.3395, device='cuda:0') factor: tensor(0.0986, device='cuda:0')
reward_raw: tensor(0.4365, device='cuda:0') reward: tensor(1.3385, device='cuda:0') factor: tensor(0.0980, device='cuda:0')
reward_raw: tensor(0.4314, device='cuda:0') reward: tensor(1.3310, device='cuda:0') factor: tensor(0.1004, device='cuda:0')
reward_raw: tensor(0.4464, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.0972, device='cuda:0')


Epoch 1, Step 1488: Loss = 0.3436, Actor Loss = 0.3219, Critic Loss = 0.0435, Entropy = 0.0294, Returns = 1.2939, Value = 1.1850, mask_loss = 0.0936std_loss = 0.2709:  27%|██▋       | 1488/5475 [6:40:18<17:25:52, 15.74s/it]  

ratio tensor(0.4594, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4819, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.0914, device='cuda:0')
reward_raw: tensor(0.4796, device='cuda:0') reward: tensor(1.3888, device='cuda:0') factor: tensor(0.0908, device='cuda:0')
reward_raw: tensor(0.4803, device='cuda:0') reward: tensor(1.3857, device='cuda:0') factor: tensor(0.0946, device='cuda:0')
reward_raw: tensor(0.4788, device='cuda:0') reward: tensor(1.3860, device='cuda:0') factor: tensor(0.0928, device='cuda:0')
reward_raw: tensor(0.4704, device='cuda:0') reward: tensor(1.3782, device='cuda:0') factor: tensor(0.0921, device='cuda:0')


Epoch 1, Step 1489: Loss = -0.2331, Actor Loss = -0.2732, Critic Loss = 0.0802, Entropy = 0.0259, Returns = 1.3669, Value = 1.3846, mask_loss = 0.1033std_loss = 0.2877:  27%|██▋       | 1489/5475 [6:40:34<17:23:57, 15.71s/it]

ratio tensor(1.4159, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3840, device='cuda:0') reward: tensor(1.3069, device='cuda:0') factor: tensor(0.0772, device='cuda:0')
reward_raw: tensor(0.3978, device='cuda:0') reward: tensor(1.3192, device='cuda:0') factor: tensor(0.0785, device='cuda:0')
reward_raw: tensor(0.3902, device='cuda:0') reward: tensor(1.3110, device='cuda:0') factor: tensor(0.0791, device='cuda:0')
reward_raw: tensor(0.4266, device='cuda:0') reward: tensor(1.3441, device='cuda:0') factor: tensor(0.0825, device='cuda:0')
reward_raw: tensor(0.4015, device='cuda:0') reward: tensor(1.3192, device='cuda:0') factor: tensor(0.0823, device='cuda:0')


Epoch 1, Step 1490: Loss = 0.1313, Actor Loss = 0.1041, Critic Loss = 0.0544, Entropy = 0.0272, Returns = 1.3313, Value = 1.3483, mask_loss = 0.0827std_loss = 0.2553:  27%|██▋       | 1490/5475 [6:40:52<18:19:54, 16.56s/it]  

ratio tensor(0.6598, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5281, device='cuda:0') reward: tensor(1.4230, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.5038, device='cuda:0') reward: tensor(1.4024, device='cuda:0') factor: tensor(0.1014, device='cuda:0')
reward_raw: tensor(0.5019, device='cuda:0') reward: tensor(1.3999, device='cuda:0') factor: tensor(0.1019, device='cuda:0')
reward_raw: tensor(0.4793, device='cuda:0') reward: tensor(1.3777, device='cuda:0') factor: tensor(0.1016, device='cuda:0')
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.0995, device='cuda:0')


Epoch 1, Step 1491: Loss = -0.0369, Actor Loss = -0.0605, Critic Loss = 0.0473, Entropy = 0.0316, Returns = 1.3005, Value = 1.3554, mask_loss = 0.1031std_loss = 0.2841:  27%|██▋       | 1491/5475 [6:41:12<19:25:29, 17.55s/it]

ratio tensor(3.4992, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4838, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.0887, device='cuda:0')
reward_raw: tensor(0.4800, device='cuda:0') reward: tensor(1.3926, device='cuda:0') factor: tensor(0.0874, device='cuda:0')
reward_raw: tensor(0.4740, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.0869, device='cuda:0')
reward_raw: tensor(0.4959, device='cuda:0') reward: tensor(1.4081, device='cuda:0') factor: tensor(0.0878, device='cuda:0')
reward_raw: tensor(0.4826, device='cuda:0') reward: tensor(1.3930, device='cuda:0') factor: tensor(0.0895, device='cuda:0')


Epoch 1, Step 1492: Loss = 0.0566, Actor Loss = 0.0285, Critic Loss = 0.0563, Entropy = 0.0323, Returns = 1.3824, Value = 1.5071, mask_loss = 0.0817std_loss = 0.2469:  27%|██▋       | 1492/5475 [6:41:33<20:32:48, 18.57s/it]  

ratio tensor(1.1191, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4792, device='cuda:0') reward: tensor(1.3839, device='cuda:0') factor: tensor(0.0953, device='cuda:0')
reward_raw: tensor(0.4573, device='cuda:0') reward: tensor(1.3652, device='cuda:0') factor: tensor(0.0920, device='cuda:0')
reward_raw: tensor(0.4603, device='cuda:0') reward: tensor(1.3681, device='cuda:0') factor: tensor(0.0923, device='cuda:0')
reward_raw: tensor(0.4939, device='cuda:0') reward: tensor(1.4013, device='cuda:0') factor: tensor(0.0926, device='cuda:0')
reward_raw: tensor(0.4800, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.0930, device='cuda:0')


Epoch 1, Step 1493: Loss = 0.1833, Actor Loss = 0.1338, Critic Loss = 0.0991, Entropy = 0.0219, Returns = 1.4342, Value = 1.3465, mask_loss = 0.0858std_loss = 0.2612:  27%|██▋       | 1493/5475 [6:41:47<18:54:56, 17.10s/it]

ratio tensor(0.7718, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5244, device='cuda:0') reward: tensor(1.4256, device='cuda:0') factor: tensor(0.0988, device='cuda:0')
reward_raw: tensor(0.5373, device='cuda:0') reward: tensor(1.4379, device='cuda:0') factor: tensor(0.0994, device='cuda:0')
reward_raw: tensor(0.5149, device='cuda:0') reward: tensor(1.4161, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.5190, device='cuda:0') reward: tensor(1.4213, device='cuda:0') factor: tensor(0.0977, device='cuda:0')
reward_raw: tensor(0.5246, device='cuda:0') reward: tensor(1.4260, device='cuda:0') factor: tensor(0.0985, device='cuda:0')


Epoch 1, Step 1494: Loss = -0.0759, Actor Loss = -0.1112, Critic Loss = 0.0707, Entropy = 0.0283, Returns = 1.4130, Value = 1.4489, mask_loss = 0.1012std_loss = 0.2800:  27%|██▋       | 1494/5475 [6:42:02<18:19:50, 16.58s/it]

ratio tensor(0.9695, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4962, device='cuda:0') reward: tensor(1.3997, device='cuda:0') factor: tensor(0.0965, device='cuda:0')
reward_raw: tensor(0.4931, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.0980, device='cuda:0')
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.4076, device='cuda:0') factor: tensor(0.0958, device='cuda:0')
reward_raw: tensor(0.5164, device='cuda:0') reward: tensor(1.4203, device='cuda:0') factor: tensor(0.0961, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.4133, device='cuda:0') factor: tensor(0.0960, device='cuda:0')


Epoch 1, Step 1495: Loss = -0.0105, Actor Loss = -0.0366, Critic Loss = 0.0522, Entropy = 0.0345, Returns = 1.4046, Value = 1.4210, mask_loss = 0.1005std_loss = 0.2769:  27%|██▋       | 1495/5475 [6:42:16<17:22:09, 15.71s/it]

ratio tensor(1.0918, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4890, device='cuda:0') reward: tensor(1.4094, device='cuda:0') factor: tensor(0.0797, device='cuda:0')
reward_raw: tensor(0.4755, device='cuda:0') reward: tensor(1.3949, device='cuda:0') factor: tensor(0.0806, device='cuda:0')
reward_raw: tensor(0.4858, device='cuda:0') reward: tensor(1.4075, device='cuda:0') factor: tensor(0.0783, device='cuda:0')
reward_raw: tensor(0.4876, device='cuda:0') reward: tensor(1.4077, device='cuda:0') factor: tensor(0.0800, device='cuda:0')
reward_raw: tensor(0.4876, device='cuda:0') reward: tensor(1.4073, device='cuda:0') factor: tensor(0.0804, device='cuda:0')


Epoch 1, Step 1496: Loss = 0.1101, Actor Loss = 0.0838, Critic Loss = 0.0528, Entropy = 0.0201, Returns = 1.3962, Value = 1.2821, mask_loss = 0.0891std_loss = 0.2705:  27%|██▋       | 1496/5475 [6:42:36<18:51:03, 17.06s/it]  

ratio tensor(0.9742, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4376, device='cuda:0') reward: tensor(1.3285, device='cuda:0') factor: tensor(0.1091, device='cuda:0')
reward_raw: tensor(0.4397, device='cuda:0') reward: tensor(1.3235, device='cuda:0') factor: tensor(0.1162, device='cuda:0')
reward_raw: tensor(0.4381, device='cuda:0') reward: tensor(1.3283, device='cuda:0') factor: tensor(0.1097, device='cuda:0')
reward_raw: tensor(0.4373, device='cuda:0') reward: tensor(1.3294, device='cuda:0') factor: tensor(0.1079, device='cuda:0')
reward_raw: tensor(0.4333, device='cuda:0') reward: tensor(1.3264, device='cuda:0') factor: tensor(0.1069, device='cuda:0')


Epoch 1, Step 1497: Loss = -0.2743, Actor Loss = -0.3130, Critic Loss = 0.0776, Entropy = 0.0257, Returns = 1.3935, Value = 1.2320, mask_loss = 0.0966std_loss = 0.2601:  27%|██▋       | 1497/5475 [6:42:57<20:07:53, 18.22s/it]

ratio tensor(1.6264, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6118, device='cuda:0') reward: tensor(1.5090, device='cuda:0') factor: tensor(0.1028, device='cuda:0')
reward_raw: tensor(0.6315, device='cuda:0') reward: tensor(1.5215, device='cuda:0') factor: tensor(0.1099, device='cuda:0')
reward_raw: tensor(0.6166, device='cuda:0') reward: tensor(1.5068, device='cuda:0') factor: tensor(0.1098, device='cuda:0')
reward_raw: tensor(0.6424, device='cuda:0') reward: tensor(1.5364, device='cuda:0') factor: tensor(0.1060, device='cuda:0')
reward_raw: tensor(0.6219, device='cuda:0') reward: tensor(1.5167, device='cuda:0') factor: tensor(0.1052, device='cuda:0')


Epoch 1, Step 1498: Loss = -0.0910, Actor Loss = -0.1163, Critic Loss = 0.0507, Entropy = 0.0284, Returns = 1.4712, Value = 1.6504, mask_loss = 0.0961std_loss = 0.2741:  27%|██▋       | 1498/5475 [6:43:12<18:55:45, 17.13s/it]

ratio tensor(1.2240, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5301, device='cuda:0') reward: tensor(1.4264, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.5251, device='cuda:0') reward: tensor(1.4208, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.5365, device='cuda:0') reward: tensor(1.4308, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.5423, device='cuda:0') reward: tensor(1.4362, device='cuda:0') factor: tensor(0.1062, device='cuda:0')
reward_raw: tensor(0.5349, device='cuda:0') reward: tensor(1.4328, device='cuda:0') factor: tensor(0.1021, device='cuda:0')


Epoch 1, Step 1499: Loss = 0.2390, Actor Loss = 0.2032, Critic Loss = 0.0717, Entropy = 0.0332, Returns = 1.4080, Value = 1.3866, mask_loss = 0.0911std_loss = 0.2614:  27%|██▋       | 1499/5475 [6:43:26<17:54:35, 16.22s/it]  

ratio tensor(0.8708, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5193, device='cuda:0') reward: tensor(1.4275, device='cuda:0') factor: tensor(0.0918, device='cuda:0')
reward_raw: tensor(0.5404, device='cuda:0') reward: tensor(1.4452, device='cuda:0') factor: tensor(0.0952, device='cuda:0')
reward_raw: tensor(0.5452, device='cuda:0') reward: tensor(1.4460, device='cuda:0') factor: tensor(0.0992, device='cuda:0')
reward_raw: tensor(0.5281, device='cuda:0') reward: tensor(1.4322, device='cuda:0') factor: tensor(0.0959, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.4256, device='cuda:0') factor: tensor(0.0920, device='cuda:0')


Epoch 1, Step 1500: Loss = 0.0241, Actor Loss = -0.0063, Critic Loss = 0.0608, Entropy = 0.0259, Returns = 1.3952, Value = 1.3704, mask_loss = 0.1031std_loss = 0.2818:  27%|██▋       | 1500/5475 [6:43:45<18:54:24, 17.12s/it]

ratio tensor(0.9408, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4972, device='cuda:0') reward: tensor(1.4229, device='cuda:0') factor: tensor(0.0743, device='cuda:0')
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.4226, device='cuda:0') factor: tensor(0.0776, device='cuda:0')
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.4166, device='cuda:0') factor: tensor(0.0761, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.4140, device='cuda:0') factor: tensor(0.0772, device='cuda:0')
reward_raw: tensor(0.4789, device='cuda:0') reward: tensor(1.4031, device='cuda:0') factor: tensor(0.0758, device='cuda:0')


Epoch 1, Step 1501: Loss = -0.0849, Actor Loss = -0.1231, Critic Loss = 0.0764, Entropy = 0.0226, Returns = 1.4558, Value = 1.4642, mask_loss = 0.0732std_loss = 0.2441:  27%|██▋       | 1500/5475 [6:43:58<18:54:24, 17.12s/it]

ratio tensor(1.0015, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1501: Loss = -0.0849, Actor Loss = -0.1231, Critic Loss = 0.0764, Entropy = 0.0226, Returns = 1.4558, Value = 1.4642, mask_loss = 0.0732std_loss = 0.2441:  27%|██▋       | 1501/5475 [6:43:58<17:39:04, 15.99s/it]

reward_raw: tensor(0.4699, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1164, device='cuda:0')
reward_raw: tensor(0.4616, device='cuda:0') reward: tensor(1.3495, device='cuda:0') factor: tensor(0.1121, device='cuda:0')
reward_raw: tensor(0.4576, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.1118, device='cuda:0')
reward_raw: tensor(0.4514, device='cuda:0') reward: tensor(1.3416, device='cuda:0') factor: tensor(0.1098, device='cuda:0')
reward_raw: tensor(0.4522, device='cuda:0') reward: tensor(1.3399, device='cuda:0') factor: tensor(0.1123, device='cuda:0')


Epoch 1, Step 1502: Loss = 0.2292, Actor Loss = 0.2042, Critic Loss = 0.0500, Entropy = 0.0375, Returns = 1.2874, Value = 1.2109, mask_loss = 0.1257std_loss = 0.3043:  27%|██▋       | 1502/5475 [6:44:10<16:16:44, 14.75s/it]  

ratio tensor(0.7143, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4557, device='cuda:0') reward: tensor(1.3517, device='cuda:0') factor: tensor(0.1040, device='cuda:0')
reward_raw: tensor(0.4359, device='cuda:0') reward: tensor(1.3302, device='cuda:0') factor: tensor(0.1057, device='cuda:0')
reward_raw: tensor(0.4090, device='cuda:0') reward: tensor(1.3086, device='cuda:0') factor: tensor(0.1003, device='cuda:0')
reward_raw: tensor(0.4290, device='cuda:0') reward: tensor(1.3275, device='cuda:0') factor: tensor(0.1015, device='cuda:0')
reward_raw: tensor(0.3948, device='cuda:0') reward: tensor(1.2946, device='cuda:0') factor: tensor(0.1003, device='cuda:0')


Epoch 1, Step 1503: Loss = -0.2187, Actor Loss = -0.2534, Critic Loss = 0.0696, Entropy = 0.0325, Returns = 1.3591, Value = 1.2967, mask_loss = 0.1016std_loss = 0.2817:  27%|██▋       | 1503/5475 [6:44:22<15:21:14, 13.92s/it]

ratio tensor(1.0194, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5427, device='cuda:0') reward: tensor(1.4392, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.5416, device='cuda:0') reward: tensor(1.4378, device='cuda:0') factor: tensor(0.1039, device='cuda:0')
reward_raw: tensor(0.5433, device='cuda:0') reward: tensor(1.4415, device='cuda:0') factor: tensor(0.1018, device='cuda:0')
reward_raw: tensor(0.5528, device='cuda:0') reward: tensor(1.4501, device='cuda:0') factor: tensor(0.1027, device='cuda:0')
reward_raw: tensor(0.5439, device='cuda:0') reward: tensor(1.4425, device='cuda:0') factor: tensor(0.1014, device='cuda:0')


Epoch 1, Step 1504: Loss = -0.0156, Actor Loss = -0.0368, Critic Loss = 0.0426, Entropy = 0.0433, Returns = 1.4431, Value = 1.5778, mask_loss = 0.0897std_loss = 0.2593:  27%|██▋       | 1504/5475 [6:44:33<14:19:23, 12.98s/it]

ratio tensor(1.1111, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5976, device='cuda:0') reward: tensor(1.4930, device='cuda:0') factor: tensor(0.1046, device='cuda:0')
reward_raw: tensor(0.5638, device='cuda:0') reward: tensor(1.4578, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.5804, device='cuda:0') reward: tensor(1.4768, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.6029, device='cuda:0') reward: tensor(1.4930, device='cuda:0') factor: tensor(0.1099, device='cuda:0')
reward_raw: tensor(0.5854, device='cuda:0') reward: tensor(1.4809, device='cuda:0') factor: tensor(0.1045, device='cuda:0')


Epoch 1, Step 1505: Loss = 0.2459, Actor Loss = 0.2264, Critic Loss = 0.0390, Entropy = 0.0302, Returns = 1.4202, Value = 1.4110, mask_loss = 0.1019std_loss = 0.2849:  27%|██▋       | 1505/5475 [6:44:46<14:21:34, 13.02s/it]  

ratio tensor(0.7757, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4668, device='cuda:0') reward: tensor(1.3715, device='cuda:0') factor: tensor(0.0954, device='cuda:0')
reward_raw: tensor(0.4826, device='cuda:0') reward: tensor(1.3834, device='cuda:0') factor: tensor(0.0992, device='cuda:0')
reward_raw: tensor(0.4563, device='cuda:0') reward: tensor(1.3560, device='cuda:0') factor: tensor(0.1003, device='cuda:0')
reward_raw: tensor(0.4655, device='cuda:0') reward: tensor(1.3698, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.4507, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.0975, device='cuda:0')


Epoch 1, Step 1506: Loss = -0.0436, Actor Loss = -0.0690, Critic Loss = 0.0509, Entropy = 0.0402, Returns = 1.3865, Value = 1.3642, mask_loss = 0.1041std_loss = 0.2797:  28%|██▊       | 1506/5475 [6:45:00<14:45:38, 13.39s/it]

ratio tensor(1.0806, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5354, device='cuda:0') reward: tensor(1.4300, device='cuda:0') factor: tensor(0.1054, device='cuda:0')
reward_raw: tensor(0.5332, device='cuda:0') reward: tensor(1.4301, device='cuda:0') factor: tensor(0.1032, device='cuda:0')
reward_raw: tensor(0.5389, device='cuda:0') reward: tensor(1.4341, device='cuda:0') factor: tensor(0.1049, device='cuda:0')
reward_raw: tensor(0.5340, device='cuda:0') reward: tensor(1.4314, device='cuda:0') factor: tensor(0.1026, device='cuda:0')
reward_raw: tensor(0.5374, device='cuda:0') reward: tensor(1.4336, device='cuda:0') factor: tensor(0.1038, device='cuda:0')


Epoch 1, Step 1507: Loss = 0.0180, Actor Loss = -0.0284, Critic Loss = 0.0930, Entropy = 0.0462, Returns = 1.5300, Value = 1.5011, mask_loss = 0.1082std_loss = 0.2753:  28%|██▊       | 1507/5475 [6:45:16<15:32:13, 14.10s/it] 

ratio tensor(1.1572, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4277, device='cuda:0') reward: tensor(1.3283, device='cuda:0') factor: tensor(0.0994, device='cuda:0')
reward_raw: tensor(0.4159, device='cuda:0') reward: tensor(1.3121, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.4440, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.4290, device='cuda:0') reward: tensor(1.3303, device='cuda:0') factor: tensor(0.0987, device='cuda:0')
reward_raw: tensor(0.4384, device='cuda:0') reward: tensor(1.3377, device='cuda:0') factor: tensor(0.1007, device='cuda:0')


Epoch 1, Step 1508: Loss = 0.1663, Actor Loss = 0.1431, Critic Loss = 0.0466, Entropy = 0.0575, Returns = 1.3734, Value = 1.3062, mask_loss = 0.1089std_loss = 0.2802:  28%|██▊       | 1508/5475 [6:45:33<16:19:44, 14.82s/it] 

ratio tensor(1.9257, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5259, device='cuda:0') reward: tensor(1.4234, device='cuda:0') factor: tensor(0.1025, device='cuda:0')
reward_raw: tensor(0.5482, device='cuda:0') reward: tensor(1.4451, device='cuda:0') factor: tensor(0.1031, device='cuda:0')
reward_raw: tensor(0.5228, device='cuda:0') reward: tensor(1.4219, device='cuda:0') factor: tensor(0.1009, device='cuda:0')
reward_raw: tensor(0.5124, device='cuda:0') reward: tensor(1.4126, device='cuda:0') factor: tensor(0.0998, device='cuda:0')
reward_raw: tensor(0.4928, device='cuda:0') reward: tensor(1.3958, device='cuda:0') factor: tensor(0.0969, device='cuda:0')


Epoch 1, Step 1509: Loss = 0.0017, Actor Loss = -0.0336, Critic Loss = 0.0708, Entropy = 0.0436, Returns = 1.3754, Value = 1.4639, mask_loss = 0.0937std_loss = 0.2616:  28%|██▊       | 1509/5475 [6:45:46<15:58:15, 14.50s/it]

ratio tensor(2.4305, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.3868, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.4867, device='cuda:0') reward: tensor(1.3678, device='cuda:0') factor: tensor(0.1188, device='cuda:0')
reward_raw: tensor(0.4801, device='cuda:0') reward: tensor(1.3643, device='cuda:0') factor: tensor(0.1158, device='cuda:0')
reward_raw: tensor(0.4985, device='cuda:0') reward: tensor(1.3829, device='cuda:0') factor: tensor(0.1155, device='cuda:0')
reward_raw: tensor(0.4674, device='cuda:0') reward: tensor(1.3554, device='cuda:0') factor: tensor(0.1120, device='cuda:0')


Epoch 1, Step 1510: Loss = 0.0632, Actor Loss = 0.0272, Critic Loss = 0.0721, Entropy = 0.0449, Returns = 1.3728, Value = 1.3446, mask_loss = 0.1155std_loss = 0.2931:  28%|██▊       | 1510/5475 [6:46:02<16:24:42, 14.90s/it] 

ratio tensor(0.9653, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5065, device='cuda:0') reward: tensor(1.4023, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.4617, device='cuda:0') reward: tensor(1.3560, device='cuda:0') factor: tensor(0.1057, device='cuda:0')
reward_raw: tensor(0.4545, device='cuda:0') reward: tensor(1.3521, device='cuda:0') factor: tensor(0.1025, device='cuda:0')
reward_raw: tensor(0.4710, device='cuda:0') reward: tensor(1.3650, device='cuda:0') factor: tensor(0.1061, device='cuda:0')
reward_raw: tensor(0.4804, device='cuda:0') reward: tensor(1.3750, device='cuda:0') factor: tensor(0.1054, device='cuda:0')


Epoch 1, Step 1511: Loss = 0.0335, Actor Loss = 0.0086, Critic Loss = 0.0501, Entropy = 0.0468, Returns = 1.3076, Value = 1.3221, mask_loss = 0.0990std_loss = 0.2734:  28%|██▊       | 1511/5475 [6:46:14<15:17:18, 13.88s/it]

ratio tensor(1.0204, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5253, device='cuda:0') reward: tensor(1.3992, device='cuda:0') factor: tensor(0.1261, device='cuda:0')
reward_raw: tensor(0.5184, device='cuda:0') reward: tensor(1.3988, device='cuda:0') factor: tensor(0.1196, device='cuda:0')
reward_raw: tensor(0.4868, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.1166, device='cuda:0')
reward_raw: tensor(0.4796, device='cuda:0') reward: tensor(1.3580, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.5019, device='cuda:0') reward: tensor(1.3821, device='cuda:0') factor: tensor(0.1199, device='cuda:0')


Epoch 1, Step 1512: Loss = 0.1270, Actor Loss = 0.0855, Critic Loss = 0.0829, Entropy = 0.0567, Returns = 1.3808, Value = 1.4846, mask_loss = 0.1249std_loss = 0.2884:  28%|██▊       | 1512/5475 [6:46:30<16:09:16, 14.67s/it]

ratio tensor(1.5514, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4813, device='cuda:0') reward: tensor(1.3770, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.4785, device='cuda:0') reward: tensor(1.3797, device='cuda:0') factor: tensor(0.0988, device='cuda:0')
reward_raw: tensor(0.4680, device='cuda:0') reward: tensor(1.3728, device='cuda:0') factor: tensor(0.0953, device='cuda:0')
reward_raw: tensor(0.5043, device='cuda:0') reward: tensor(1.3960, device='cuda:0') factor: tensor(0.1082, device='cuda:0')
reward_raw: tensor(0.4623, device='cuda:0') reward: tensor(1.3623, device='cuda:0') factor: tensor(0.1000, device='cuda:0')


Epoch 1, Step 1513: Loss = 0.0279, Actor Loss = 0.0026, Critic Loss = 0.0506, Entropy = 0.0474, Returns = 1.4095, Value = 1.3822, mask_loss = 0.0994std_loss = 0.2620:  28%|██▊       | 1513/5475 [6:46:50<17:59:09, 16.34s/it]

ratio tensor(1.0852, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5267, device='cuda:0') reward: tensor(1.3831, device='cuda:0') factor: tensor(0.1436, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.4007, device='cuda:0') factor: tensor(0.1410, device='cuda:0')
reward_raw: tensor(0.5463, device='cuda:0') reward: tensor(1.4075, device='cuda:0') factor: tensor(0.1388, device='cuda:0')
reward_raw: tensor(0.5409, device='cuda:0') reward: tensor(1.4038, device='cuda:0') factor: tensor(0.1371, device='cuda:0')
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1463, device='cuda:0')


Epoch 1, Step 1514: Loss = -0.1410, Actor Loss = -0.1882, Critic Loss = 0.0945, Entropy = 0.0504, Returns = 1.4621, Value = 1.4251, mask_loss = 0.1098std_loss = 0.2800:  28%|██▊       | 1514/5475 [6:47:03<16:35:48, 15.08s/it]

ratio tensor(1.3980, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4339, device='cuda:0') reward: tensor(1.3488, device='cuda:0') factor: tensor(0.0851, device='cuda:0')
reward_raw: tensor(0.4375, device='cuda:0') reward: tensor(1.3574, device='cuda:0') factor: tensor(0.0801, device='cuda:0')
reward_raw: tensor(0.4227, device='cuda:0') reward: tensor(1.3417, device='cuda:0') factor: tensor(0.0810, device='cuda:0')
reward_raw: tensor(0.4371, device='cuda:0') reward: tensor(1.3504, device='cuda:0') factor: tensor(0.0867, device='cuda:0')
reward_raw: tensor(0.4269, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.0838, device='cuda:0')


Epoch 1, Step 1515: Loss = 0.0919, Actor Loss = 0.0230, Critic Loss = 0.1378, Entropy = 0.0305, Returns = 1.3617, Value = 1.3427, mask_loss = 0.0825std_loss = 0.2536:  28%|██▊       | 1515/5475 [6:47:17<16:12:27, 14.73s/it]  

ratio tensor(0.8629, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5457, device='cuda:0') reward: tensor(1.4593, device='cuda:0') factor: tensor(0.0865, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.4646, device='cuda:0') factor: tensor(0.0874, device='cuda:0')
reward_raw: tensor(0.5693, device='cuda:0') reward: tensor(1.4837, device='cuda:0') factor: tensor(0.0855, device='cuda:0')
reward_raw: tensor(0.5481, device='cuda:0') reward: tensor(1.4631, device='cuda:0') factor: tensor(0.0850, device='cuda:0')
reward_raw: tensor(0.5506, device='cuda:0') reward: tensor(1.4644, device='cuda:0') factor: tensor(0.0862, device='cuda:0')


Epoch 1, Step 1516: Loss = -0.1939, Actor Loss = -0.2486, Critic Loss = 0.1093, Entropy = 0.0345, Returns = 1.4394, Value = 1.3169, mask_loss = 0.0771std_loss = 0.2443:  28%|██▊       | 1516/5475 [6:47:33<16:53:16, 15.36s/it]

ratio tensor(1.3994, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5318, device='cuda:0') reward: tensor(1.4243, device='cuda:0') factor: tensor(0.1074, device='cuda:0')
reward_raw: tensor(0.5415, device='cuda:0') reward: tensor(1.4372, device='cuda:0') factor: tensor(0.1044, device='cuda:0')
reward_raw: tensor(0.5330, device='cuda:0') reward: tensor(1.4290, device='cuda:0') factor: tensor(0.1040, device='cuda:0')
reward_raw: tensor(0.5395, device='cuda:0') reward: tensor(1.4336, device='cuda:0') factor: tensor(0.1060, device='cuda:0')
reward_raw: tensor(0.5319, device='cuda:0') reward: tensor(1.4228, device='cuda:0') factor: tensor(0.1090, device='cuda:0')


Epoch 1, Step 1517: Loss = 0.0685, Actor Loss = 0.0173, Critic Loss = 0.1026, Entropy = 0.0280, Returns = 1.3446, Value = 1.5002, mask_loss = 0.0851std_loss = 0.2600:  28%|██▊       | 1517/5475 [6:47:46<16:00:39, 14.56s/it]  

ratio tensor(0.9655, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4777, device='cuda:0') reward: tensor(1.3530, device='cuda:0') factor: tensor(0.1247, device='cuda:0')
reward_raw: tensor(0.4933, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.1238, device='cuda:0')
reward_raw: tensor(0.4692, device='cuda:0') reward: tensor(1.3459, device='cuda:0') factor: tensor(0.1233, device='cuda:0')
reward_raw: tensor(0.4912, device='cuda:0') reward: tensor(1.3690, device='cuda:0') factor: tensor(0.1222, device='cuda:0')
reward_raw: tensor(0.4888, device='cuda:0') reward: tensor(1.3667, device='cuda:0') factor: tensor(0.1220, device='cuda:0')


Epoch 1, Step 1518: Loss = 0.2266, Actor Loss = 0.1818, Critic Loss = 0.0898, Entropy = 0.0512, Returns = 1.3849, Value = 1.2367, mask_loss = 0.1267std_loss = 0.2925:  28%|██▊       | 1518/5475 [6:48:01<16:09:14, 14.70s/it]

ratio tensor(0.8579, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4879, device='cuda:0') reward: tensor(1.3600, device='cuda:0') factor: tensor(0.1279, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.1352, device='cuda:0')
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.3789, device='cuda:0') factor: tensor(0.1313, device='cuda:0')
reward_raw: tensor(0.5030, device='cuda:0') reward: tensor(1.3716, device='cuda:0') factor: tensor(0.1314, device='cuda:0')
reward_raw: tensor(0.4951, device='cuda:0') reward: tensor(1.3642, device='cuda:0') factor: tensor(0.1309, device='cuda:0')


Epoch 1, Step 1519: Loss = -0.2506, Actor Loss = -0.2848, Critic Loss = 0.0686, Entropy = 0.0570, Returns = 1.3849, Value = 1.2951, mask_loss = 0.1483std_loss = 0.3229:  28%|██▊       | 1519/5475 [6:48:14<15:34:52, 14.18s/it]

ratio tensor(1.1199, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5558, device='cuda:0') reward: tensor(1.4521, device='cuda:0') factor: tensor(0.1038, device='cuda:0')
reward_raw: tensor(0.5664, device='cuda:0') reward: tensor(1.4597, device='cuda:0') factor: tensor(0.1067, device='cuda:0')
reward_raw: tensor(0.5837, device='cuda:0') reward: tensor(1.4793, device='cuda:0') factor: tensor(0.1044, device='cuda:0')
reward_raw: tensor(0.5641, device='cuda:0') reward: tensor(1.4612, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.5669, device='cuda:0') reward: tensor(1.4604, device='cuda:0') factor: tensor(0.1065, device='cuda:0')


Epoch 1, Step 1520: Loss = 0.0628, Actor Loss = 0.0215, Critic Loss = 0.0828, Entropy = 0.0372, Returns = 1.4184, Value = 1.5848, mask_loss = 0.1000std_loss = 0.2786:  28%|██▊       | 1520/5475 [6:48:28<15:21:23, 13.98s/it]  

ratio tensor(0.9316, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5402, device='cuda:0') reward: tensor(1.4366, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.4527, device='cuda:0') factor: tensor(0.1049, device='cuda:0')
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.4292, device='cuda:0') factor: tensor(0.1061, device='cuda:0')
reward_raw: tensor(0.5470, device='cuda:0') reward: tensor(1.4422, device='cuda:0') factor: tensor(0.1048, device='cuda:0')
reward_raw: tensor(0.5412, device='cuda:0') reward: tensor(1.4354, device='cuda:0') factor: tensor(0.1057, device='cuda:0')


Epoch 1, Step 1521: Loss = 0.0345, Actor Loss = 0.0040, Critic Loss = 0.0611, Entropy = 0.0340, Returns = 1.5451, Value = 1.4077, mask_loss = 0.0980std_loss = 0.2674:  28%|██▊       | 1520/5475 [6:48:47<15:21:23, 13.98s/it]

ratio tensor(0.9783, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1521: Loss = 0.0345, Actor Loss = 0.0040, Critic Loss = 0.0611, Entropy = 0.0340, Returns = 1.5451, Value = 1.4077, mask_loss = 0.0980std_loss = 0.2674:  28%|██▊       | 1521/5475 [6:48:47<17:10:00, 15.63s/it]

reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.4102, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.4079, device='cuda:0') factor: tensor(0.1015, device='cuda:0')
reward_raw: tensor(0.4997, device='cuda:0') reward: tensor(1.4010, device='cuda:0') factor: tensor(0.0986, device='cuda:0')
reward_raw: tensor(0.5085, device='cuda:0') reward: tensor(1.4075, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.4981, device='cuda:0') reward: tensor(1.3976, device='cuda:0') factor: tensor(0.1005, device='cuda:0')


Epoch 1, Step 1522: Loss = -0.1497, Actor Loss = -0.1704, Critic Loss = 0.0415, Entropy = 0.0441, Returns = 1.4358, Value = 1.4361, mask_loss = 0.0959std_loss = 0.2666:  28%|██▊       | 1522/5475 [6:49:02<16:49:49, 15.33s/it]

ratio tensor(1.1951, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4672, device='cuda:0') reward: tensor(1.3829, device='cuda:0') factor: tensor(0.0844, device='cuda:0')
reward_raw: tensor(0.4830, device='cuda:0') reward: tensor(1.3949, device='cuda:0') factor: tensor(0.0881, device='cuda:0')
reward_raw: tensor(0.4681, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.0817, device='cuda:0')
reward_raw: tensor(0.4975, device='cuda:0') reward: tensor(1.4160, device='cuda:0') factor: tensor(0.0815, device='cuda:0')
reward_raw: tensor(0.4904, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.0847, device='cuda:0')


Epoch 1, Step 1523: Loss = 0.0941, Actor Loss = 0.0601, Critic Loss = 0.0680, Entropy = 0.0571, Returns = 1.4253, Value = 1.3403, mask_loss = 0.0874std_loss = 0.2486:  28%|██▊       | 1523/5475 [6:49:15<16:15:42, 14.81s/it]  

ratio tensor(0.8674, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5605, device='cuda:0') reward: tensor(1.4500, device='cuda:0') factor: tensor(0.1105, device='cuda:0')
reward_raw: tensor(0.5436, device='cuda:0') reward: tensor(1.4336, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.4375, device='cuda:0') factor: tensor(0.1137, device='cuda:0')
reward_raw: tensor(0.5493, device='cuda:0') reward: tensor(1.4415, device='cuda:0') factor: tensor(0.1079, device='cuda:0')
reward_raw: tensor(0.5642, device='cuda:0') reward: tensor(1.4560, device='cuda:0') factor: tensor(0.1081, device='cuda:0')


Epoch 1, Step 1524: Loss = 0.0174, Actor Loss = -0.0012, Critic Loss = 0.0372, Entropy = 0.0535, Returns = 1.3923, Value = 1.5116, mask_loss = 0.1244std_loss = 0.3007:  28%|██▊       | 1524/5475 [6:49:31<16:31:15, 15.05s/it]

ratio tensor(0.9288, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5602, device='cuda:0') reward: tensor(1.4382, device='cuda:0') factor: tensor(0.1220, device='cuda:0')
reward_raw: tensor(0.5383, device='cuda:0') reward: tensor(1.4104, device='cuda:0') factor: tensor(0.1279, device='cuda:0')
reward_raw: tensor(0.5447, device='cuda:0') reward: tensor(1.4211, device='cuda:0') factor: tensor(0.1236, device='cuda:0')
reward_raw: tensor(0.5277, device='cuda:0') reward: tensor(1.4016, device='cuda:0') factor: tensor(0.1261, device='cuda:0')
reward_raw: tensor(0.5258, device='cuda:0') reward: tensor(1.4011, device='cuda:0') factor: tensor(0.1247, device='cuda:0')


Epoch 1, Step 1525: Loss = 0.0923, Actor Loss = 0.0589, Critic Loss = 0.0669, Entropy = 0.0529, Returns = 1.4060, Value = 1.3919, mask_loss = 0.1290std_loss = 0.3082:  28%|██▊       | 1525/5475 [6:49:48<17:07:22, 15.61s/it] 

ratio tensor(0.8134, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4769, device='cuda:0') reward: tensor(1.3764, device='cuda:0') factor: tensor(0.1005, device='cuda:0')
reward_raw: tensor(0.4696, device='cuda:0') reward: tensor(1.3760, device='cuda:0') factor: tensor(0.0935, device='cuda:0')
reward_raw: tensor(0.4965, device='cuda:0') reward: tensor(1.4008, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.4640, device='cuda:0') reward: tensor(1.3661, device='cuda:0') factor: tensor(0.0979, device='cuda:0')
reward_raw: tensor(0.4750, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.0976, device='cuda:0')


Epoch 1, Step 1526: Loss = 0.0213, Actor Loss = -0.0086, Critic Loss = 0.0599, Entropy = 0.0333, Returns = 1.2756, Value = 1.3424, mask_loss = 0.0862std_loss = 0.2552:  28%|██▊       | 1526/5475 [6:50:02<16:49:29, 15.34s/it]

ratio tensor(2.2994, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.4177, device='cuda:0') factor: tensor(0.1199, device='cuda:0')
reward_raw: tensor(0.5459, device='cuda:0') reward: tensor(1.4226, device='cuda:0') factor: tensor(0.1233, device='cuda:0')
reward_raw: tensor(0.5557, device='cuda:0') reward: tensor(1.4354, device='cuda:0') factor: tensor(0.1203, device='cuda:0')
reward_raw: tensor(0.5451, device='cuda:0') reward: tensor(1.4225, device='cuda:0') factor: tensor(0.1226, device='cuda:0')
reward_raw: tensor(0.5458, device='cuda:0') reward: tensor(1.4278, device='cuda:0') factor: tensor(0.1180, device='cuda:0')


Epoch 1, Step 1527: Loss = 0.0160, Actor Loss = -0.0259, Critic Loss = 0.0840, Entropy = 0.0444, Returns = 1.4502, Value = 1.4624, mask_loss = 0.1083std_loss = 0.2806:  28%|██▊       | 1527/5475 [6:50:17<16:32:18, 15.08s/it]

ratio tensor(1.8273, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5220, device='cuda:0') reward: tensor(1.4241, device='cuda:0') factor: tensor(0.0979, device='cuda:0')
reward_raw: tensor(0.5525, device='cuda:0') reward: tensor(1.4530, device='cuda:0') factor: tensor(0.0995, device='cuda:0')
reward_raw: tensor(0.5291, device='cuda:0') reward: tensor(1.4252, device='cuda:0') factor: tensor(0.1039, device='cuda:0')
reward_raw: tensor(0.5395, device='cuda:0') reward: tensor(1.4341, device='cuda:0') factor: tensor(0.1054, device='cuda:0')
reward_raw: tensor(0.5069, device='cuda:0') reward: tensor(1.4104, device='cuda:0') factor: tensor(0.0966, device='cuda:0')


Epoch 1, Step 1528: Loss = 0.1550, Actor Loss = 0.1170, Critic Loss = 0.0761, Entropy = 0.0517, Returns = 1.3586, Value = 1.4467, mask_loss = 0.0997std_loss = 0.2649:  28%|██▊       | 1528/5475 [6:50:40<19:07:06, 17.44s/it] 

ratio tensor(1.0779, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5270, device='cuda:0') reward: tensor(1.3968, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.5399, device='cuda:0') reward: tensor(1.4057, device='cuda:0') factor: tensor(0.1343, device='cuda:0')
reward_raw: tensor(0.5436, device='cuda:0') reward: tensor(1.4076, device='cuda:0') factor: tensor(0.1361, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.3801, device='cuda:0') factor: tensor(0.1326, device='cuda:0')
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.3869, device='cuda:0') factor: tensor(0.1286, device='cuda:0')


Epoch 1, Step 1529: Loss = 0.0250, Actor Loss = -0.0101, Critic Loss = 0.0703, Entropy = 0.0673, Returns = 1.3714, Value = 1.4310, mask_loss = 0.1280std_loss = 0.2954:  28%|██▊       | 1529/5475 [6:50:51<16:59:15, 15.50s/it]

ratio tensor(1.2571, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4881, device='cuda:0') reward: tensor(1.3762, device='cuda:0') factor: tensor(0.1119, device='cuda:0')
reward_raw: tensor(0.5070, device='cuda:0') reward: tensor(1.3910, device='cuda:0') factor: tensor(0.1161, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.3662, device='cuda:0') factor: tensor(0.1193, device='cuda:0')
reward_raw: tensor(0.4888, device='cuda:0') reward: tensor(1.3728, device='cuda:0') factor: tensor(0.1160, device='cuda:0')
reward_raw: tensor(0.4960, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1134, device='cuda:0')


Epoch 1, Step 1530: Loss = 0.0634, Actor Loss = 0.0394, Critic Loss = 0.0481, Entropy = 0.0567, Returns = 1.3770, Value = 1.3903, mask_loss = 0.1278std_loss = 0.2985:  28%|██▊       | 1530/5475 [6:51:12<18:49:16, 17.18s/it] 

ratio tensor(0.8987, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5627, device='cuda:0') reward: tensor(1.4429, device='cuda:0') factor: tensor(0.1199, device='cuda:0')
reward_raw: tensor(0.5550, device='cuda:0') reward: tensor(1.4386, device='cuda:0') factor: tensor(0.1165, device='cuda:0')
reward_raw: tensor(0.5449, device='cuda:0') reward: tensor(1.4305, device='cuda:0') factor: tensor(0.1145, device='cuda:0')
reward_raw: tensor(0.5362, device='cuda:0') reward: tensor(1.4208, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.5479, device='cuda:0') reward: tensor(1.4299, device='cuda:0') factor: tensor(0.1180, device='cuda:0')


Epoch 1, Step 1531: Loss = -0.0447, Actor Loss = -0.0696, Critic Loss = 0.0501, Entropy = 0.0480, Returns = 1.4879, Value = 1.3905, mask_loss = 0.0866std_loss = 0.2467:  28%|██▊       | 1531/5475 [6:51:26<17:38:01, 16.10s/it]

ratio tensor(1.1058, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5128, device='cuda:0') reward: tensor(1.3827, device='cuda:0') factor: tensor(0.1301, device='cuda:0')
reward_raw: tensor(0.5077, device='cuda:0') reward: tensor(1.3817, device='cuda:0') factor: tensor(0.1260, device='cuda:0')
reward_raw: tensor(0.5104, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.1330, device='cuda:0')
reward_raw: tensor(0.4967, device='cuda:0') reward: tensor(1.3665, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.3721, device='cuda:0') factor: tensor(0.1319, device='cuda:0')


Epoch 1, Step 1532: Loss = 0.1637, Actor Loss = 0.1196, Critic Loss = 0.0884, Entropy = 0.0568, Returns = 1.3157, Value = 1.4464, mask_loss = 0.1205std_loss = 0.2913:  28%|██▊       | 1532/5475 [6:51:49<20:07:32, 18.37s/it]  

ratio tensor(0.8301, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4078, device='cuda:0') reward: tensor(1.2905, device='cuda:0') factor: tensor(0.1173, device='cuda:0')
reward_raw: tensor(0.4208, device='cuda:0') reward: tensor(1.3018, device='cuda:0') factor: tensor(0.1190, device='cuda:0')
reward_raw: tensor(0.4285, device='cuda:0') reward: tensor(1.3106, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.4054, device='cuda:0') reward: tensor(1.2924, device='cuda:0') factor: tensor(0.1129, device='cuda:0')
reward_raw: tensor(0.4409, device='cuda:0') reward: tensor(1.3295, device='cuda:0') factor: tensor(0.1114, device='cuda:0')


Epoch 1, Step 1533: Loss = -0.0584, Actor Loss = -0.0794, Critic Loss = 0.0421, Entropy = 0.0595, Returns = 1.3427, Value = 1.3065, mask_loss = 0.1169std_loss = 0.2852:  28%|██▊       | 1533/5475 [6:52:04<19:05:02, 17.43s/it]

ratio tensor(0.7653, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5355, device='cuda:0') reward: tensor(1.4215, device='cuda:0') factor: tensor(0.1140, device='cuda:0')
reward_raw: tensor(0.5317, device='cuda:0') reward: tensor(1.4238, device='cuda:0') factor: tensor(0.1079, device='cuda:0')
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.4498, device='cuda:0') factor: tensor(0.1094, device='cuda:0')
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1112, device='cuda:0')
reward_raw: tensor(0.5263, device='cuda:0') reward: tensor(1.4170, device='cuda:0') factor: tensor(0.1093, device='cuda:0')


Epoch 1, Step 1534: Loss = 0.1093, Actor Loss = 0.0553, Critic Loss = 0.1082, Entropy = 0.0864, Returns = 1.4193, Value = 1.4960, mask_loss = 0.1558std_loss = 0.3069:  28%|██▊       | 1534/5475 [6:52:22<18:59:34, 17.35s/it]  

ratio tensor(0.8167, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4524, device='cuda:0') reward: tensor(1.3140, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.4657, device='cuda:0') reward: tensor(1.3369, device='cuda:0') factor: tensor(0.1288, device='cuda:0')
reward_raw: tensor(0.4452, device='cuda:0') reward: tensor(1.3055, device='cuda:0') factor: tensor(0.1396, device='cuda:0')
reward_raw: tensor(0.4687, device='cuda:0') reward: tensor(1.3398, device='cuda:0') factor: tensor(0.1289, device='cuda:0')
reward_raw: tensor(0.4474, device='cuda:0') reward: tensor(1.3101, device='cuda:0') factor: tensor(0.1373, device='cuda:0')


Epoch 1, Step 1535: Loss = 0.2348, Actor Loss = 0.2068, Critic Loss = 0.0563, Entropy = 0.1339, Returns = 1.3632, Value = 1.2419, mask_loss = 0.1805std_loss = 0.3115:  28%|██▊       | 1535/5475 [6:52:37<18:11:18, 16.62s/it]

ratio tensor(0.0249, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4971, device='cuda:0') reward: tensor(1.3105, device='cuda:0') factor: tensor(0.1866, device='cuda:0')
reward_raw: tensor(0.4746, device='cuda:0') reward: tensor(1.2947, device='cuda:0') factor: tensor(0.1799, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.2999, device='cuda:0') factor: tensor(0.1857, device='cuda:0')
reward_raw: tensor(0.4995, device='cuda:0') reward: tensor(1.3056, device='cuda:0') factor: tensor(0.1939, device='cuda:0')
reward_raw: tensor(0.4797, device='cuda:0') reward: tensor(1.2989, device='cuda:0') factor: tensor(0.1809, device='cuda:0')


Epoch 1, Step 1536: Loss = 0.0306, Actor Loss = -0.0323, Critic Loss = 0.1261, Entropy = 0.1248, Returns = 1.3494, Value = 1.3749, mask_loss = 0.2070std_loss = 0.3450:  28%|██▊       | 1536/5475 [6:52:50<17:10:27, 15.70s/it]

ratio tensor(0.6947, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(1.3153, device='cuda:0') factor: tensor(0.1837, device='cuda:0')
reward_raw: tensor(0.5154, device='cuda:0') reward: tensor(1.3190, device='cuda:0') factor: tensor(0.1964, device='cuda:0')
reward_raw: tensor(0.5030, device='cuda:0') reward: tensor(1.3096, device='cuda:0') factor: tensor(0.1935, device='cuda:0')
reward_raw: tensor(0.5128, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.1807, device='cuda:0')
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.3293, device='cuda:0') factor: tensor(0.1842, device='cuda:0')


Epoch 1, Step 1537: Loss = 0.0903, Actor Loss = 0.0619, Critic Loss = 0.0569, Entropy = 0.1309, Returns = 1.2776, Value = 1.3581, mask_loss = 0.2057std_loss = 0.3458:  28%|██▊       | 1537/5475 [6:53:04<16:32:55, 15.13s/it] 

ratio tensor(0.7844, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4967, device='cuda:0') reward: tensor(1.3452, device='cuda:0') factor: tensor(0.1514, device='cuda:0')
reward_raw: tensor(0.4945, device='cuda:0') reward: tensor(1.3388, device='cuda:0') factor: tensor(0.1557, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3566, device='cuda:0') factor: tensor(0.1528, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.3415, device='cuda:0') factor: tensor(0.1517, device='cuda:0')
reward_raw: tensor(0.4986, device='cuda:0') reward: tensor(1.3387, device='cuda:0') factor: tensor(0.1600, device='cuda:0')


Epoch 1, Step 1538: Loss = -0.1157, Actor Loss = -0.1439, Critic Loss = 0.0566, Entropy = 0.0963, Returns = 1.3590, Value = 1.2294, mask_loss = 0.1650std_loss = 0.3250:  28%|██▊       | 1538/5475 [6:53:20<16:57:59, 15.51s/it]

ratio tensor(2.3546, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5692, device='cuda:0') reward: tensor(1.3808, device='cuda:0') factor: tensor(0.1884, device='cuda:0')
reward_raw: tensor(0.5704, device='cuda:0') reward: tensor(1.3863, device='cuda:0') factor: tensor(0.1841, device='cuda:0')
reward_raw: tensor(0.5644, device='cuda:0') reward: tensor(1.3812, device='cuda:0') factor: tensor(0.1832, device='cuda:0')
reward_raw: tensor(0.5872, device='cuda:0') reward: tensor(1.4004, device='cuda:0') factor: tensor(0.1868, device='cuda:0')
reward_raw: tensor(0.5673, device='cuda:0') reward: tensor(1.3841, device='cuda:0') factor: tensor(0.1832, device='cuda:0')


Epoch 1, Step 1539: Loss = -0.1184, Actor Loss = -0.1417, Critic Loss = 0.0467, Entropy = 0.0969, Returns = 1.4218, Value = 1.4420, mask_loss = 0.2153std_loss = 0.3676:  28%|██▊       | 1539/5475 [6:53:38<17:50:31, 16.32s/it]

ratio tensor(2.8348, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5365, device='cuda:0') reward: tensor(1.3677, device='cuda:0') factor: tensor(0.1688, device='cuda:0')
reward_raw: tensor(0.5442, device='cuda:0') reward: tensor(1.3753, device='cuda:0') factor: tensor(0.1688, device='cuda:0')
reward_raw: tensor(0.5299, device='cuda:0') reward: tensor(1.3578, device='cuda:0') factor: tensor(0.1720, device='cuda:0')
reward_raw: tensor(0.5143, device='cuda:0') reward: tensor(1.3418, device='cuda:0') factor: tensor(0.1725, device='cuda:0')
reward_raw: tensor(0.5282, device='cuda:0') reward: tensor(1.3572, device='cuda:0') factor: tensor(0.1711, device='cuda:0')


Epoch 1, Step 1540: Loss = 0.4931, Actor Loss = 0.4566, Critic Loss = 0.0733, Entropy = 0.1065, Returns = 1.3327, Value = 1.5108, mask_loss = 0.1964std_loss = 0.3384:  28%|██▊       | 1540/5475 [6:53:55<17:44:37, 16.23s/it]  

ratio tensor(1.0530, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5535, device='cuda:0') reward: tensor(1.3773, device='cuda:0') factor: tensor(0.1763, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.3518, device='cuda:0') factor: tensor(0.1858, device='cuda:0')
reward_raw: tensor(0.5335, device='cuda:0') reward: tensor(1.3440, device='cuda:0') factor: tensor(0.1896, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.5426, device='cuda:0') reward: tensor(1.3639, device='cuda:0') factor: tensor(0.1787, device='cuda:0')


Epoch 1, Step 1541: Loss = 0.0738, Actor Loss = 0.0259, Critic Loss = 0.0961, Entropy = 0.1012, Returns = 1.3828, Value = 1.2997, mask_loss = 0.2192std_loss = 0.3368:  28%|██▊       | 1540/5475 [6:54:13<17:44:37, 16.23s/it]

ratio tensor(0.7217, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1541: Loss = 0.0738, Actor Loss = 0.0259, Critic Loss = 0.0961, Entropy = 0.1012, Returns = 1.3828, Value = 1.2997, mask_loss = 0.2192std_loss = 0.3368:  28%|██▊       | 1541/5475 [6:54:14<18:42:01, 17.11s/it]

reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(1.3676, device='cuda:0') factor: tensor(0.1539, device='cuda:0')
reward_raw: tensor(0.5234, device='cuda:0') reward: tensor(1.3745, device='cuda:0') factor: tensor(0.1489, device='cuda:0')
reward_raw: tensor(0.5410, device='cuda:0') reward: tensor(1.3950, device='cuda:0') factor: tensor(0.1460, device='cuda:0')
reward_raw: tensor(0.5270, device='cuda:0') reward: tensor(1.3790, device='cuda:0') factor: tensor(0.1480, device='cuda:0')
reward_raw: tensor(0.5391, device='cuda:0') reward: tensor(1.3956, device='cuda:0') factor: tensor(0.1435, device='cuda:0')


Epoch 1, Step 1542: Loss = -0.1497, Actor Loss = -0.1615, Critic Loss = 0.0238, Entropy = 0.1141, Returns = 1.4050, Value = 1.3958, mask_loss = 0.1446std_loss = 0.2911:  28%|██▊       | 1542/5475 [6:54:32<19:01:28, 17.41s/it]

ratio tensor(10.0488, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5994, device='cuda:0') reward: tensor(1.3935, device='cuda:0') factor: tensor(0.2059, device='cuda:0')
reward_raw: tensor(0.5806, device='cuda:0') reward: tensor(1.3738, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.5837, device='cuda:0') reward: tensor(1.3720, device='cuda:0') factor: tensor(0.2117, device='cuda:0')
reward_raw: tensor(0.5916, device='cuda:0') reward: tensor(1.3743, device='cuda:0') factor: tensor(0.2172, device='cuda:0')
reward_raw: tensor(0.5956, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.2130, device='cuda:0')


Epoch 1, Step 1543: Loss = -0.0648, Actor Loss = -0.0945, Critic Loss = 0.0596, Entropy = 0.1076, Returns = 1.3928, Value = 1.4248, mask_loss = 0.1759std_loss = 0.3164:  28%|██▊       | 1543/5475 [6:54:44<17:25:39, 15.96s/it]

ratio tensor(5.3903, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5125, device='cuda:0') reward: tensor(1.3470, device='cuda:0') factor: tensor(0.1655, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.3533, device='cuda:0') factor: tensor(0.1680, device='cuda:0')
reward_raw: tensor(0.4901, device='cuda:0') reward: tensor(1.3271, device='cuda:0') factor: tensor(0.1630, device='cuda:0')
reward_raw: tensor(0.5297, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.1739, device='cuda:0')
reward_raw: tensor(0.5167, device='cuda:0') reward: tensor(1.3516, device='cuda:0') factor: tensor(0.1651, device='cuda:0')


Epoch 1, Step 1544: Loss = 0.1483, Actor Loss = 0.1230, Critic Loss = 0.0509, Entropy = 0.1030, Returns = 1.3789, Value = 1.3168, mask_loss = 0.1559std_loss = 0.3034:  28%|██▊       | 1544/5475 [6:54:58<16:32:25, 15.15s/it]  

ratio tensor(0.6174, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4741, device='cuda:0') reward: tensor(1.2546, device='cuda:0') factor: tensor(0.2195, device='cuda:0')
reward_raw: tensor(0.4720, device='cuda:0') reward: tensor(1.2498, device='cuda:0') factor: tensor(0.2222, device='cuda:0')
reward_raw: tensor(0.4835, device='cuda:0') reward: tensor(1.2593, device='cuda:0') factor: tensor(0.2242, device='cuda:0')
reward_raw: tensor(0.4723, device='cuda:0') reward: tensor(1.2505, device='cuda:0') factor: tensor(0.2218, device='cuda:0')
reward_raw: tensor(0.4679, device='cuda:0') reward: tensor(1.2475, device='cuda:0') factor: tensor(0.2204, device='cuda:0')


Epoch 1, Step 1545: Loss = 0.0755, Actor Loss = 0.0551, Critic Loss = 0.0411, Entropy = 0.1609, Returns = 1.2252, Value = 1.2829, mask_loss = 0.2449std_loss = 0.3637:  28%|██▊       | 1545/5475 [6:55:17<17:48:05, 16.31s/it]

ratio tensor(0.2900, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4936, device='cuda:0') reward: tensor(1.2715, device='cuda:0') factor: tensor(0.2221, device='cuda:0')
reward_raw: tensor(0.5193, device='cuda:0') reward: tensor(1.2899, device='cuda:0') factor: tensor(0.2294, device='cuda:0')
reward_raw: tensor(0.5218, device='cuda:0') reward: tensor(1.2859, device='cuda:0') factor: tensor(0.2359, device='cuda:0')
reward_raw: tensor(0.5161, device='cuda:0') reward: tensor(1.2938, device='cuda:0') factor: tensor(0.2223, device='cuda:0')
reward_raw: tensor(0.5055, device='cuda:0') reward: tensor(1.2787, device='cuda:0') factor: tensor(0.2268, device='cuda:0')


Epoch 1, Step 1546: Loss = 0.0885, Actor Loss = 0.0643, Critic Loss = 0.0485, Entropy = 0.1075, Returns = 1.2390, Value = 1.2631, mask_loss = 0.1915std_loss = 0.3465:  28%|██▊       | 1546/5475 [6:55:31<17:04:02, 15.64s/it]

ratio tensor(0.8453, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5329, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.1465, device='cuda:0')
reward_raw: tensor(0.5338, device='cuda:0') reward: tensor(1.3891, device='cuda:0') factor: tensor(0.1447, device='cuda:0')
reward_raw: tensor(0.5297, device='cuda:0') reward: tensor(1.3842, device='cuda:0') factor: tensor(0.1456, device='cuda:0')
reward_raw: tensor(0.5450, device='cuda:0') reward: tensor(1.4002, device='cuda:0') factor: tensor(0.1449, device='cuda:0')
reward_raw: tensor(0.4916, device='cuda:0') reward: tensor(1.3525, device='cuda:0') factor: tensor(0.1391, device='cuda:0')


Epoch 1, Step 1547: Loss = -0.0435, Actor Loss = -0.0904, Critic Loss = 0.0940, Entropy = 0.0950, Returns = 1.3253, Value = 1.4201, mask_loss = 0.1378std_loss = 0.2950:  28%|██▊       | 1547/5475 [6:55:47<17:25:01, 15.96s/it]

ratio tensor(0.8556, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5593, device='cuda:0') reward: tensor(1.4326, device='cuda:0') factor: tensor(0.1267, device='cuda:0')
reward_raw: tensor(0.5578, device='cuda:0') reward: tensor(1.4380, device='cuda:0') factor: tensor(0.1198, device='cuda:0')
reward_raw: tensor(0.5574, device='cuda:0') reward: tensor(1.4387, device='cuda:0') factor: tensor(0.1187, device='cuda:0')
reward_raw: tensor(0.5575, device='cuda:0') reward: tensor(1.4308, device='cuda:0') factor: tensor(0.1267, device='cuda:0')
reward_raw: tensor(0.5614, device='cuda:0') reward: tensor(1.4353, device='cuda:0') factor: tensor(0.1261, device='cuda:0')


Epoch 1, Step 1548: Loss = 0.0263, Actor Loss = 0.0093, Critic Loss = 0.0341, Entropy = 0.0643, Returns = 1.4148, Value = 1.4515, mask_loss = 0.1213std_loss = 0.2916:  28%|██▊       | 1548/5475 [6:56:03<17:11:59, 15.77s/it]  

ratio tensor(1.0306, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6258, device='cuda:0') reward: tensor(1.4497, device='cuda:0') factor: tensor(0.1761, device='cuda:0')
reward_raw: tensor(0.5872, device='cuda:0') reward: tensor(1.4115, device='cuda:0') factor: tensor(0.1757, device='cuda:0')
reward_raw: tensor(0.6209, device='cuda:0') reward: tensor(1.4505, device='cuda:0') factor: tensor(0.1704, device='cuda:0')
reward_raw: tensor(0.5858, device='cuda:0') reward: tensor(1.4090, device='cuda:0') factor: tensor(0.1768, device='cuda:0')
reward_raw: tensor(0.6006, device='cuda:0') reward: tensor(1.4239, device='cuda:0') factor: tensor(0.1766, device='cuda:0')


Epoch 1, Step 1549: Loss = -0.1772, Actor Loss = -0.2031, Critic Loss = 0.0520, Entropy = 0.0903, Returns = 1.4203, Value = 1.3375, mask_loss = 0.1473std_loss = 0.2966:  28%|██▊       | 1549/5475 [6:56:18<16:59:23, 15.58s/it]

ratio tensor(2.0374, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5844, device='cuda:0') reward: tensor(1.4561, device='cuda:0') factor: tensor(0.1283, device='cuda:0')
reward_raw: tensor(0.5731, device='cuda:0') reward: tensor(1.4417, device='cuda:0') factor: tensor(0.1314, device='cuda:0')
reward_raw: tensor(0.5959, device='cuda:0') reward: tensor(1.4709, device='cuda:0') factor: tensor(0.1250, device='cuda:0')
reward_raw: tensor(0.5478, device='cuda:0') reward: tensor(1.4149, device='cuda:0') factor: tensor(0.1329, device='cuda:0')
reward_raw: tensor(0.5792, device='cuda:0') reward: tensor(1.4444, device='cuda:0') factor: tensor(0.1347, device='cuda:0')


Epoch 1, Step 1550: Loss = 0.0288, Actor Loss = -0.0012, Critic Loss = 0.0601, Entropy = 0.0627, Returns = 1.4766, Value = 1.5308, mask_loss = 0.1065std_loss = 0.2732:  28%|██▊       | 1550/5475 [6:56:37<18:13:08, 16.71s/it] 

ratio tensor(1.3340, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5307, device='cuda:0') reward: tensor(1.4026, device='cuda:0') factor: tensor(0.1282, device='cuda:0')
reward_raw: tensor(0.5316, device='cuda:0') reward: tensor(1.4049, device='cuda:0') factor: tensor(0.1267, device='cuda:0')
reward_raw: tensor(0.5315, device='cuda:0') reward: tensor(1.4102, device='cuda:0') factor: tensor(0.1213, device='cuda:0')
reward_raw: tensor(0.5232, device='cuda:0') reward: tensor(1.3976, device='cuda:0') factor: tensor(0.1256, device='cuda:0')
reward_raw: tensor(0.5201, device='cuda:0') reward: tensor(1.3981, device='cuda:0') factor: tensor(0.1220, device='cuda:0')


Epoch 1, Step 1551: Loss = 0.2884, Actor Loss = 0.2538, Critic Loss = 0.0693, Entropy = 0.0621, Returns = 1.4028, Value = 1.4484, mask_loss = 0.1195std_loss = 0.2904:  28%|██▊       | 1551/5475 [6:56:49<16:42:59, 15.34s/it] 

ratio tensor(0.9307, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4777, device='cuda:0') reward: tensor(1.3062, device='cuda:0') factor: tensor(0.1714, device='cuda:0')
reward_raw: tensor(0.4636, device='cuda:0') reward: tensor(1.2991, device='cuda:0') factor: tensor(0.1646, device='cuda:0')
reward_raw: tensor(0.4896, device='cuda:0') reward: tensor(1.3210, device='cuda:0') factor: tensor(0.1686, device='cuda:0')
reward_raw: tensor(0.4796, device='cuda:0') reward: tensor(1.3135, device='cuda:0') factor: tensor(0.1661, device='cuda:0')
reward_raw: tensor(0.4754, device='cuda:0') reward: tensor(1.3067, device='cuda:0') factor: tensor(0.1687, device='cuda:0')


Epoch 1, Step 1552: Loss = -0.0399, Actor Loss = -0.0675, Critic Loss = 0.0555, Entropy = 0.0903, Returns = 1.3149, Value = 1.3061, mask_loss = 0.1752std_loss = 0.3268:  28%|██▊       | 1552/5475 [6:57:09<18:02:42, 16.56s/it]

ratio tensor(2.4197, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.4123, device='cuda:0') factor: tensor(0.1380, device='cuda:0')
reward_raw: tensor(0.5367, device='cuda:0') reward: tensor(1.3964, device='cuda:0') factor: tensor(0.1403, device='cuda:0')
reward_raw: tensor(0.5410, device='cuda:0') reward: tensor(1.3992, device='cuda:0') factor: tensor(0.1418, device='cuda:0')
reward_raw: tensor(0.5393, device='cuda:0') reward: tensor(1.4008, device='cuda:0') factor: tensor(0.1385, device='cuda:0')
reward_raw: tensor(0.5419, device='cuda:0') reward: tensor(1.4036, device='cuda:0') factor: tensor(0.1383, device='cuda:0')


Epoch 1, Step 1553: Loss = 0.0949, Actor Loss = -0.0211, Critic Loss = 0.2323, Entropy = 0.1330, Returns = 1.3064, Value = 1.2442, mask_loss = 0.1848std_loss = 0.3124:  28%|██▊       | 1553/5475 [6:57:22<17:02:45, 15.65s/it] 

ratio tensor(0.7107, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4576, device='cuda:0') reward: tensor(1.2740, device='cuda:0') factor: tensor(0.1836, device='cuda:0')
reward_raw: tensor(0.4520, device='cuda:0') reward: tensor(1.2802, device='cuda:0') factor: tensor(0.1719, device='cuda:0')
reward_raw: tensor(0.4610, device='cuda:0') reward: tensor(1.2830, device='cuda:0') factor: tensor(0.1780, device='cuda:0')
reward_raw: tensor(0.4820, device='cuda:0') reward: tensor(1.3059, device='cuda:0') factor: tensor(0.1761, device='cuda:0')
reward_raw: tensor(0.4639, device='cuda:0') reward: tensor(1.2869, device='cuda:0') factor: tensor(0.1770, device='cuda:0')


Epoch 1, Step 1554: Loss = 0.1455, Actor Loss = 0.0786, Critic Loss = 0.1339, Entropy = 0.0944, Returns = 1.2427, Value = 1.2102, mask_loss = 0.1860std_loss = 0.3402:  28%|██▊       | 1554/5475 [6:57:36<16:24:31, 15.07s/it] 

ratio tensor(1.4277, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6297, device='cuda:0') reward: tensor(1.4733, device='cuda:0') factor: tensor(0.1565, device='cuda:0')
reward_raw: tensor(0.6572, device='cuda:0') reward: tensor(1.5059, device='cuda:0') factor: tensor(0.1513, device='cuda:0')
reward_raw: tensor(0.6395, device='cuda:0') reward: tensor(1.4874, device='cuda:0') factor: tensor(0.1521, device='cuda:0')
reward_raw: tensor(0.6594, device='cuda:0') reward: tensor(1.5021, device='cuda:0') factor: tensor(0.1573, device='cuda:0')
reward_raw: tensor(0.6496, device='cuda:0') reward: tensor(1.4993, device='cuda:0') factor: tensor(0.1503, device='cuda:0')


Epoch 1, Step 1555: Loss = -0.1979, Actor Loss = -0.2132, Critic Loss = 0.0308, Entropy = 0.1075, Returns = 1.4855, Value = 1.5839, mask_loss = 0.1740std_loss = 0.3198:  28%|██▊       | 1555/5475 [6:57:50<15:57:48, 14.66s/it]

ratio tensor(11.4078, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5276, device='cuda:0') reward: tensor(1.3290, device='cuda:0') factor: tensor(0.1986, device='cuda:0')
reward_raw: tensor(0.5048, device='cuda:0') reward: tensor(1.3066, device='cuda:0') factor: tensor(0.1982, device='cuda:0')
reward_raw: tensor(0.5061, device='cuda:0') reward: tensor(1.3066, device='cuda:0') factor: tensor(0.1995, device='cuda:0')
reward_raw: tensor(0.5069, device='cuda:0') reward: tensor(1.3113, device='cuda:0') factor: tensor(0.1956, device='cuda:0')
reward_raw: tensor(0.5028, device='cuda:0') reward: tensor(1.3042, device='cuda:0') factor: tensor(0.1987, device='cuda:0')


Epoch 1, Step 1556: Loss = 0.2684, Actor Loss = 0.2481, Critic Loss = 0.0409, Entropy = 0.1326, Returns = 1.2773, Value = 1.3551, mask_loss = 0.2227std_loss = 0.3583:  28%|██▊       | 1556/5475 [6:58:09<17:26:37, 16.02s/it]  

ratio tensor(0.2274, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5483, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.1790, device='cuda:0')
reward_raw: tensor(0.5509, device='cuda:0') reward: tensor(1.3723, device='cuda:0') factor: tensor(0.1786, device='cuda:0')
reward_raw: tensor(0.5409, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.1750, device='cuda:0')
reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1729, device='cuda:0')
reward_raw: tensor(0.5362, device='cuda:0') reward: tensor(1.3582, device='cuda:0') factor: tensor(0.1780, device='cuda:0')


Epoch 1, Step 1557: Loss = 0.0836, Actor Loss = 0.0509, Critic Loss = 0.0656, Entropy = 0.0936, Returns = 1.3277, Value = 1.3247, mask_loss = 0.1746std_loss = 0.3250:  28%|██▊       | 1557/5475 [6:58:26<17:48:57, 16.37s/it]

ratio tensor(0.6166, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4980, device='cuda:0') reward: tensor(1.3603, device='cuda:0') factor: tensor(0.1377, device='cuda:0')
reward_raw: tensor(0.5052, device='cuda:0') reward: tensor(1.3733, device='cuda:0') factor: tensor(0.1318, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.1399, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.3569, device='cuda:0') factor: tensor(0.1363, device='cuda:0')
reward_raw: tensor(0.5013, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.1394, device='cuda:0')


Epoch 1, Step 1558: Loss = -0.0021, Actor Loss = -0.0358, Critic Loss = 0.0676, Entropy = 0.0694, Returns = 1.3311, Value = 1.4094, mask_loss = 0.1381std_loss = 0.3119:  28%|██▊       | 1558/5475 [6:58:39<16:42:43, 15.36s/it]

ratio tensor(2.1021, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5802, device='cuda:0') reward: tensor(1.4200, device='cuda:0') factor: tensor(0.1602, device='cuda:0')
reward_raw: tensor(0.5654, device='cuda:0') reward: tensor(1.4087, device='cuda:0') factor: tensor(0.1567, device='cuda:0')
reward_raw: tensor(0.5630, device='cuda:0') reward: tensor(1.4047, device='cuda:0') factor: tensor(0.1583, device='cuda:0')
reward_raw: tensor(0.5768, device='cuda:0') reward: tensor(1.4231, device='cuda:0') factor: tensor(0.1537, device='cuda:0')
reward_raw: tensor(0.5776, device='cuda:0') reward: tensor(1.4181, device='cuda:0') factor: tensor(0.1595, device='cuda:0')


Epoch 1, Step 1559: Loss = 0.0181, Actor Loss = -0.0166, Critic Loss = 0.0695, Entropy = 0.1044, Returns = 1.4154, Value = 1.3352, mask_loss = 0.1910std_loss = 0.3405:  28%|██▊       | 1559/5475 [6:58:58<17:43:20, 16.29s/it] 

ratio tensor(0.7188, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5018, device='cuda:0') reward: tensor(1.3283, device='cuda:0') factor: tensor(0.1735, device='cuda:0')
reward_raw: tensor(0.4956, device='cuda:0') reward: tensor(1.3236, device='cuda:0') factor: tensor(0.1720, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.1744, device='cuda:0')
reward_raw: tensor(0.4740, device='cuda:0') reward: tensor(1.3042, device='cuda:0') factor: tensor(0.1697, device='cuda:0')
reward_raw: tensor(0.4822, device='cuda:0') reward: tensor(1.3088, device='cuda:0') factor: tensor(0.1734, device='cuda:0')


Epoch 1, Step 1560: Loss = 0.0347, Actor Loss = -0.0000, Critic Loss = 0.0697, Entropy = 0.0869, Returns = 1.3130, Value = 1.3159, mask_loss = 0.1716std_loss = 0.3289:  28%|██▊       | 1560/5475 [6:59:13<17:17:33, 15.90s/it]

ratio tensor(1.8506, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5342, device='cuda:0') reward: tensor(1.3950, device='cuda:0') factor: tensor(0.1392, device='cuda:0')
reward_raw: tensor(0.5389, device='cuda:0') reward: tensor(1.4021, device='cuda:0') factor: tensor(0.1368, device='cuda:0')
reward_raw: tensor(0.5222, device='cuda:0') reward: tensor(1.3842, device='cuda:0') factor: tensor(0.1380, device='cuda:0')
reward_raw: tensor(0.5299, device='cuda:0') reward: tensor(1.3908, device='cuda:0') factor: tensor(0.1391, device='cuda:0')
reward_raw: tensor(0.5178, device='cuda:0') reward: tensor(1.3818, device='cuda:0') factor: tensor(0.1360, device='cuda:0')


Epoch 1, Step 1561: Loss = 0.0063, Actor Loss = -0.0194, Critic Loss = 0.0516, Entropy = 0.0714, Returns = 1.3890, Value = 1.4142, mask_loss = 0.1386std_loss = 0.3077:  28%|██▊       | 1560/5475 [6:59:27<17:17:33, 15.90s/it]

ratio tensor(1.8153, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1561: Loss = 0.0063, Actor Loss = -0.0194, Critic Loss = 0.0516, Entropy = 0.0714, Returns = 1.3890, Value = 1.4142, mask_loss = 0.1386std_loss = 0.3077:  29%|██▊       | 1561/5475 [6:59:27<16:54:35, 15.55s/it]

reward_raw: tensor(0.5716, device='cuda:0') reward: tensor(1.3647, device='cuda:0') factor: tensor(0.2069, device='cuda:0')
reward_raw: tensor(0.5662, device='cuda:0') reward: tensor(1.3645, device='cuda:0') factor: tensor(0.2018, device='cuda:0')
reward_raw: tensor(0.5696, device='cuda:0') reward: tensor(1.3670, device='cuda:0') factor: tensor(0.2026, device='cuda:0')
reward_raw: tensor(0.5517, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.2025, device='cuda:0')
reward_raw: tensor(0.5751, device='cuda:0') reward: tensor(1.3723, device='cuda:0') factor: tensor(0.2028, device='cuda:0')


Epoch 1, Step 1562: Loss = -0.0040, Actor Loss = -0.0416, Critic Loss = 0.0754, Entropy = 0.0979, Returns = 1.4238, Value = 1.3785, mask_loss = 0.2242std_loss = 0.3630:  29%|██▊       | 1562/5475 [6:59:41<16:13:19, 14.92s/it]

ratio tensor(2.0441, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5477, device='cuda:0') reward: tensor(1.3600, device='cuda:0') factor: tensor(0.1877, device='cuda:0')
reward_raw: tensor(0.5478, device='cuda:0') reward: tensor(1.3638, device='cuda:0') factor: tensor(0.1840, device='cuda:0')
reward_raw: tensor(0.5476, device='cuda:0') reward: tensor(1.3615, device='cuda:0') factor: tensor(0.1860, device='cuda:0')
reward_raw: tensor(0.5487, device='cuda:0') reward: tensor(1.3644, device='cuda:0') factor: tensor(0.1843, device='cuda:0')
reward_raw: tensor(0.5285, device='cuda:0') reward: tensor(1.3466, device='cuda:0') factor: tensor(0.1819, device='cuda:0')


Epoch 1, Step 1563: Loss = 0.0640, Actor Loss = 0.0472, Critic Loss = 0.0339, Entropy = 0.0829, Returns = 1.3361, Value = 1.4037, mask_loss = 0.1861std_loss = 0.3447:  29%|██▊       | 1563/5475 [6:59:59<17:21:05, 15.97s/it]  

ratio tensor(0.7067, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5869, device='cuda:0') reward: tensor(1.4087, device='cuda:0') factor: tensor(0.1782, device='cuda:0')
reward_raw: tensor(0.5872, device='cuda:0') reward: tensor(1.4122, device='cuda:0') factor: tensor(0.1750, device='cuda:0')
reward_raw: tensor(0.5932, device='cuda:0') reward: tensor(1.4221, device='cuda:0') factor: tensor(0.1711, device='cuda:0')
reward_raw: tensor(0.5976, device='cuda:0') reward: tensor(1.4225, device='cuda:0') factor: tensor(0.1751, device='cuda:0')
reward_raw: tensor(0.5947, device='cuda:0') reward: tensor(1.4198, device='cuda:0') factor: tensor(0.1749, device='cuda:0')


Epoch 1, Step 1564: Loss = 0.0693, Actor Loss = 0.0400, Critic Loss = 0.0587, Entropy = 0.0781, Returns = 1.2917, Value = 1.3991, mask_loss = 0.1794std_loss = 0.3396:  29%|██▊       | 1564/5475 [7:00:19<18:42:51, 17.23s/it]

ratio tensor(1.1433, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4709, device='cuda:0') reward: tensor(1.3018, device='cuda:0') factor: tensor(0.1691, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3312, device='cuda:0') factor: tensor(0.1686, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.3059, device='cuda:0') factor: tensor(0.1739, device='cuda:0')
reward_raw: tensor(0.4657, device='cuda:0') reward: tensor(1.2936, device='cuda:0') factor: tensor(0.1721, device='cuda:0')
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(1.3362, device='cuda:0') factor: tensor(0.1627, device='cuda:0')


Epoch 1, Step 1565: Loss = 0.2506, Actor Loss = 0.2272, Critic Loss = 0.0471, Entropy = 0.0689, Returns = 1.2885, Value = 1.1677, mask_loss = 0.1341std_loss = 0.3019:  29%|██▊       | 1565/5475 [7:00:32<17:17:53, 15.93s/it]

ratio tensor(0.3958, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5801, device='cuda:0') reward: tensor(1.4457, device='cuda:0') factor: tensor(0.1344, device='cuda:0')
reward_raw: tensor(0.5904, device='cuda:0') reward: tensor(1.4572, device='cuda:0') factor: tensor(0.1332, device='cuda:0')
reward_raw: tensor(0.5673, device='cuda:0') reward: tensor(1.4335, device='cuda:0') factor: tensor(0.1338, device='cuda:0')
reward_raw: tensor(0.5413, device='cuda:0') reward: tensor(1.4132, device='cuda:0') factor: tensor(0.1282, device='cuda:0')
reward_raw: tensor(0.5770, device='cuda:0') reward: tensor(1.4445, device='cuda:0') factor: tensor(0.1325, device='cuda:0')


Epoch 1, Step 1566: Loss = -0.2635, Actor Loss = -0.3116, Critic Loss = 0.0964, Entropy = 0.0612, Returns = 1.4783, Value = 1.6352, mask_loss = 0.1217std_loss = 0.2967:  29%|██▊       | 1566/5475 [7:00:47<16:59:05, 15.64s/it]

ratio tensor(1.6690, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5223, device='cuda:0') reward: tensor(1.3899, device='cuda:0') factor: tensor(0.1325, device='cuda:0')
reward_raw: tensor(0.4833, device='cuda:0') reward: tensor(1.3564, device='cuda:0') factor: tensor(0.1269, device='cuda:0')
reward_raw: tensor(0.5109, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1283, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.3888, device='cuda:0') factor: tensor(0.1318, device='cuda:0')
reward_raw: tensor(0.5312, device='cuda:0') reward: tensor(1.3984, device='cuda:0') factor: tensor(0.1328, device='cuda:0')


Epoch 1, Step 1567: Loss = 0.0689, Actor Loss = 0.0393, Critic Loss = 0.0592, Entropy = 0.0838, Returns = 1.3601, Value = 1.2872, mask_loss = 0.1421std_loss = 0.3034:  29%|██▊       | 1567/5475 [7:01:04<17:23:04, 16.01s/it]  

ratio tensor(0.6551, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5344, device='cuda:0') reward: tensor(1.3917, device='cuda:0') factor: tensor(0.1427, device='cuda:0')
reward_raw: tensor(0.5122, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.1357, device='cuda:0')
reward_raw: tensor(0.5364, device='cuda:0') reward: tensor(1.3946, device='cuda:0') factor: tensor(0.1418, device='cuda:0')
reward_raw: tensor(0.5399, device='cuda:0') reward: tensor(1.3999, device='cuda:0') factor: tensor(0.1401, device='cuda:0')
reward_raw: tensor(0.5341, device='cuda:0') reward: tensor(1.3924, device='cuda:0') factor: tensor(0.1417, device='cuda:0')


Epoch 1, Step 1568: Loss = 0.1681, Actor Loss = 0.1385, Critic Loss = 0.0593, Entropy = 0.0727, Returns = 1.3170, Value = 1.3012, mask_loss = 0.1381std_loss = 0.3093:  29%|██▊       | 1568/5475 [7:01:18<16:34:16, 15.27s/it]

ratio tensor(2.7601, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5774, device='cuda:0') reward: tensor(1.4152, device='cuda:0') factor: tensor(0.1622, device='cuda:0')
reward_raw: tensor(0.5930, device='cuda:0') reward: tensor(1.4283, device='cuda:0') factor: tensor(0.1647, device='cuda:0')
reward_raw: tensor(0.5897, device='cuda:0') reward: tensor(1.4238, device='cuda:0') factor: tensor(0.1659, device='cuda:0')
reward_raw: tensor(0.5943, device='cuda:0') reward: tensor(1.4242, device='cuda:0') factor: tensor(0.1701, device='cuda:0')
reward_raw: tensor(0.5732, device='cuda:0') reward: tensor(1.4125, device='cuda:0') factor: tensor(0.1607, device='cuda:0')


Epoch 1, Step 1569: Loss = 0.0072, Actor Loss = -0.0279, Critic Loss = 0.0704, Entropy = 0.1198, Returns = 1.3980, Value = 1.4639, mask_loss = 0.2075std_loss = 0.3409:  29%|██▊       | 1569/5475 [7:01:31<16:00:32, 14.75s/it]

ratio tensor(1.0764, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6150, device='cuda:0') reward: tensor(1.4293, device='cuda:0') factor: tensor(0.1856, device='cuda:0')
reward_raw: tensor(0.5982, device='cuda:0') reward: tensor(1.4147, device='cuda:0') factor: tensor(0.1835, device='cuda:0')
reward_raw: tensor(0.5921, device='cuda:0') reward: tensor(1.4088, device='cuda:0') factor: tensor(0.1833, device='cuda:0')
reward_raw: tensor(0.5926, device='cuda:0') reward: tensor(1.4104, device='cuda:0') factor: tensor(0.1821, device='cuda:0')
reward_raw: tensor(0.6126, device='cuda:0') reward: tensor(1.4284, device='cuda:0') factor: tensor(0.1842, device='cuda:0')


Epoch 1, Step 1570: Loss = -0.0210, Actor Loss = -0.0542, Critic Loss = 0.0666, Entropy = 0.0714, Returns = 1.4382, Value = 1.4394, mask_loss = 0.1589std_loss = 0.3328:  29%|██▊       | 1570/5475 [7:01:43<15:07:55, 13.95s/it]

ratio tensor(0.7163, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5458, device='cuda:0') reward: tensor(1.3941, device='cuda:0') factor: tensor(0.1517, device='cuda:0')
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.3834, device='cuda:0') factor: tensor(0.1518, device='cuda:0')
reward_raw: tensor(0.5670, device='cuda:0') reward: tensor(1.4189, device='cuda:0') factor: tensor(0.1480, device='cuda:0')
reward_raw: tensor(0.5679, device='cuda:0') reward: tensor(1.4140, device='cuda:0') factor: tensor(0.1540, device='cuda:0')
reward_raw: tensor(0.5404, device='cuda:0') reward: tensor(1.3979, device='cuda:0') factor: tensor(0.1424, device='cuda:0')


Epoch 1, Step 1571: Loss = 0.0787, Actor Loss = 0.0493, Critic Loss = 0.0589, Entropy = 0.1017, Returns = 1.4151, Value = 1.3372, mask_loss = 0.1635std_loss = 0.3137:  29%|██▊       | 1571/5475 [7:02:06<18:07:13, 16.71s/it]  

ratio tensor(0.5564, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5617, device='cuda:0') reward: tensor(1.4086, device='cuda:0') factor: tensor(0.1531, device='cuda:0')
reward_raw: tensor(0.5489, device='cuda:0') reward: tensor(1.3926, device='cuda:0') factor: tensor(0.1563, device='cuda:0')
reward_raw: tensor(0.5186, device='cuda:0') reward: tensor(1.3640, device='cuda:0') factor: tensor(0.1546, device='cuda:0')
reward_raw: tensor(0.5284, device='cuda:0') reward: tensor(1.3768, device='cuda:0') factor: tensor(0.1516, device='cuda:0')
reward_raw: tensor(0.5182, device='cuda:0') reward: tensor(1.3705, device='cuda:0') factor: tensor(0.1477, device='cuda:0')


Epoch 1, Step 1572: Loss = -0.0292, Actor Loss = -0.0720, Critic Loss = 0.0859, Entropy = 0.1060, Returns = 1.4152, Value = 1.3975, mask_loss = 0.1827std_loss = 0.3273:  29%|██▊       | 1572/5475 [7:02:24<18:28:27, 17.04s/it]

ratio tensor(1.0973, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5192, device='cuda:0') reward: tensor(1.3462, device='cuda:0') factor: tensor(0.1729, device='cuda:0')
reward_raw: tensor(0.5246, device='cuda:0') reward: tensor(1.3487, device='cuda:0') factor: tensor(0.1759, device='cuda:0')
reward_raw: tensor(0.5109, device='cuda:0') reward: tensor(1.3426, device='cuda:0') factor: tensor(0.1684, device='cuda:0')
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.3516, device='cuda:0') factor: tensor(0.1736, device='cuda:0')
reward_raw: tensor(0.5272, device='cuda:0') reward: tensor(1.3480, device='cuda:0') factor: tensor(0.1791, device='cuda:0')


Epoch 1, Step 1573: Loss = 0.1314, Actor Loss = 0.0897, Critic Loss = 0.0835, Entropy = 0.0787, Returns = 1.3541, Value = 1.3660, mask_loss = 0.1596std_loss = 0.3240:  29%|██▊       | 1573/5475 [7:02:36<16:46:08, 15.47s/it]  

ratio tensor(0.8633, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6074, device='cuda:0') reward: tensor(1.4299, device='cuda:0') factor: tensor(0.1775, device='cuda:0')
reward_raw: tensor(0.5885, device='cuda:0') reward: tensor(1.4074, device='cuda:0') factor: tensor(0.1812, device='cuda:0')
reward_raw: tensor(0.5964, device='cuda:0') reward: tensor(1.4135, device='cuda:0') factor: tensor(0.1830, device='cuda:0')
reward_raw: tensor(0.5916, device='cuda:0') reward: tensor(1.4067, device='cuda:0') factor: tensor(0.1849, device='cuda:0')
reward_raw: tensor(0.6135, device='cuda:0') reward: tensor(1.4286, device='cuda:0') factor: tensor(0.1849, device='cuda:0')


Epoch 1, Step 1574: Loss = -0.1062, Actor Loss = -0.1186, Critic Loss = 0.0250, Entropy = 0.0735, Returns = 1.4437, Value = 1.4755, mask_loss = 0.1531std_loss = 0.3162:  29%|██▊       | 1574/5475 [7:02:55<17:46:31, 16.40s/it]

ratio tensor(1.4920, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6085, device='cuda:0') reward: tensor(1.3940, device='cuda:0') factor: tensor(0.2145, device='cuda:0')
reward_raw: tensor(0.6031, device='cuda:0') reward: tensor(1.3919, device='cuda:0') factor: tensor(0.2112, device='cuda:0')
reward_raw: tensor(0.5908, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.2130, device='cuda:0')
reward_raw: tensor(0.5613, device='cuda:0') reward: tensor(1.3477, device='cuda:0') factor: tensor(0.2136, device='cuda:0')
reward_raw: tensor(0.5864, device='cuda:0') reward: tensor(1.3727, device='cuda:0') factor: tensor(0.2137, device='cuda:0')


Epoch 1, Step 1575: Loss = 0.2290, Actor Loss = 0.2016, Critic Loss = 0.0551, Entropy = 0.0997, Returns = 1.3799, Value = 1.4057, mask_loss = 0.1859std_loss = 0.3303:  29%|██▉       | 1575/5475 [7:03:17<19:46:32, 18.25s/it]  

ratio tensor(0.7287, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4767, device='cuda:0') reward: tensor(1.3031, device='cuda:0') factor: tensor(0.1736, device='cuda:0')
reward_raw: tensor(0.4806, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.1644, device='cuda:0')
reward_raw: tensor(0.4716, device='cuda:0') reward: tensor(1.3001, device='cuda:0') factor: tensor(0.1715, device='cuda:0')
reward_raw: tensor(0.4699, device='cuda:0') reward: tensor(1.3027, device='cuda:0') factor: tensor(0.1671, device='cuda:0')
reward_raw: tensor(0.4795, device='cuda:0') reward: tensor(1.3139, device='cuda:0') factor: tensor(0.1656, device='cuda:0')


Epoch 1, Step 1576: Loss = -0.1535, Actor Loss = -0.1921, Critic Loss = 0.0774, Entropy = 0.0909, Returns = 1.2960, Value = 1.1923, mask_loss = 0.1597std_loss = 0.3144:  29%|██▉       | 1576/5475 [7:03:33<19:08:53, 17.68s/it]

ratio tensor(2.5953, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5567, device='cuda:0') reward: tensor(1.4098, device='cuda:0') factor: tensor(0.1469, device='cuda:0')
reward_raw: tensor(0.5731, device='cuda:0') reward: tensor(1.4305, device='cuda:0') factor: tensor(0.1426, device='cuda:0')
reward_raw: tensor(0.5748, device='cuda:0') reward: tensor(1.4268, device='cuda:0') factor: tensor(0.1481, device='cuda:0')
reward_raw: tensor(0.5542, device='cuda:0') reward: tensor(1.4031, device='cuda:0') factor: tensor(0.1510, device='cuda:0')
reward_raw: tensor(0.5435, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.1530, device='cuda:0')


Epoch 1, Step 1577: Loss = 0.0386, Actor Loss = -0.0025, Critic Loss = 0.0824, Entropy = 0.1074, Returns = 1.3654, Value = 1.4979, mask_loss = 0.1648std_loss = 0.3248:  29%|██▉       | 1577/5475 [7:03:51<19:07:20, 17.66s/it] 

ratio tensor(1.3619, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5747, device='cuda:0') reward: tensor(1.3400, device='cuda:0') factor: tensor(0.2348, device='cuda:0')
reward_raw: tensor(0.5643, device='cuda:0') reward: tensor(1.3474, device='cuda:0') factor: tensor(0.2169, device='cuda:0')
reward_raw: tensor(0.5676, device='cuda:0') reward: tensor(1.3405, device='cuda:0') factor: tensor(0.2271, device='cuda:0')
reward_raw: tensor(0.5566, device='cuda:0') reward: tensor(1.3277, device='cuda:0') factor: tensor(0.2290, device='cuda:0')
reward_raw: tensor(0.5632, device='cuda:0') reward: tensor(1.3339, device='cuda:0') factor: tensor(0.2293, device='cuda:0')


Epoch 1, Step 1578: Loss = 0.2092, Actor Loss = 0.1733, Critic Loss = 0.0723, Entropy = 0.2020, Returns = 1.3097, Value = 1.2911, mask_loss = 0.2842std_loss = 0.3556:  29%|██▉       | 1578/5475 [7:04:04<17:38:55, 16.30s/it] 

ratio tensor(0.3086, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5958, device='cuda:0') reward: tensor(1.3221, device='cuda:0') factor: tensor(0.2737, device='cuda:0')
reward_raw: tensor(0.6022, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.2741, device='cuda:0')
reward_raw: tensor(0.5927, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.2646, device='cuda:0')
reward_raw: tensor(0.5762, device='cuda:0') reward: tensor(1.3030, device='cuda:0') factor: tensor(0.2732, device='cuda:0')
reward_raw: tensor(0.5917, device='cuda:0') reward: tensor(1.3163, device='cuda:0') factor: tensor(0.2754, device='cuda:0')


Epoch 1, Step 1579: Loss = -0.2453, Actor Loss = -0.2712, Critic Loss = 0.0521, Entropy = 0.1878, Returns = 1.3224, Value = 1.2920, mask_loss = 0.2499std_loss = 0.3470:  29%|██▉       | 1579/5475 [7:04:18<16:47:48, 15.52s/it]

ratio tensor(2.0001, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5652, device='cuda:0') reward: tensor(1.2861, device='cuda:0') factor: tensor(0.2791, device='cuda:0')
reward_raw: tensor(0.5518, device='cuda:0') reward: tensor(1.2850, device='cuda:0') factor: tensor(0.2669, device='cuda:0')
reward_raw: tensor(0.5838, device='cuda:0') reward: tensor(1.3091, device='cuda:0') factor: tensor(0.2747, device='cuda:0')
reward_raw: tensor(0.5797, device='cuda:0') reward: tensor(1.2997, device='cuda:0') factor: tensor(0.2800, device='cuda:0')
reward_raw: tensor(0.5621, device='cuda:0') reward: tensor(1.2891, device='cuda:0') factor: tensor(0.2730, device='cuda:0')


Epoch 1, Step 1580: Loss = 0.1442, Actor Loss = 0.1159, Critic Loss = 0.0571, Entropy = 0.1859, Returns = 1.3148, Value = 1.3866, mask_loss = 0.2627std_loss = 0.3506:  29%|██▉       | 1580/5475 [7:04:34<16:54:46, 15.63s/it]  

ratio tensor(1.2527, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5932, device='cuda:0') reward: tensor(1.4160, device='cuda:0') factor: tensor(0.1771, device='cuda:0')
reward_raw: tensor(0.6020, device='cuda:0') reward: tensor(1.4284, device='cuda:0') factor: tensor(0.1736, device='cuda:0')
reward_raw: tensor(0.6105, device='cuda:0') reward: tensor(1.4269, device='cuda:0') factor: tensor(0.1835, device='cuda:0')
reward_raw: tensor(0.5829, device='cuda:0') reward: tensor(1.4042, device='cuda:0') factor: tensor(0.1787, device='cuda:0')
reward_raw: tensor(0.5769, device='cuda:0') reward: tensor(1.4026, device='cuda:0') factor: tensor(0.1744, device='cuda:0')


Epoch 1, Step 1581: Loss = 0.0337, Actor Loss = 0.0047, Critic Loss = 0.0585, Entropy = 0.1793, Returns = 1.3492, Value = 1.3932, mask_loss = 0.1902std_loss = 0.3117:  29%|██▉       | 1580/5475 [7:04:47<16:54:46, 15.63s/it]

ratio tensor(1.1219, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1581: Loss = 0.0337, Actor Loss = 0.0047, Critic Loss = 0.0585, Entropy = 0.1793, Returns = 1.3492, Value = 1.3932, mask_loss = 0.1902std_loss = 0.3117:  29%|██▉       | 1581/5475 [7:04:48<16:18:05, 15.07s/it]

reward_raw: tensor(0.5186, device='cuda:0') reward: tensor(1.3167, device='cuda:0') factor: tensor(0.2019, device='cuda:0')
reward_raw: tensor(0.5023, device='cuda:0') reward: tensor(1.3043, device='cuda:0') factor: tensor(0.1980, device='cuda:0')
reward_raw: tensor(0.5221, device='cuda:0') reward: tensor(1.3265, device='cuda:0') factor: tensor(0.1957, device='cuda:0')
reward_raw: tensor(0.5171, device='cuda:0') reward: tensor(1.3140, device='cuda:0') factor: tensor(0.2031, device='cuda:0')
reward_raw: tensor(0.4935, device='cuda:0') reward: tensor(1.2905, device='cuda:0') factor: tensor(0.2030, device='cuda:0')


Epoch 1, Step 1582: Loss = 0.2815, Actor Loss = 0.2503, Critic Loss = 0.0626, Entropy = 0.1350, Returns = 1.2532, Value = 1.3144, mask_loss = 0.1683std_loss = 0.3168:  29%|██▉       | 1582/5475 [7:05:04<16:36:55, 15.36s/it]

ratio tensor(0.2303, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5404, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.1865, device='cuda:0')
reward_raw: tensor(0.5229, device='cuda:0') reward: tensor(1.3386, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.5574, device='cuda:0') reward: tensor(1.3678, device='cuda:0') factor: tensor(0.1896, device='cuda:0')
reward_raw: tensor(0.5648, device='cuda:0') reward: tensor(1.3797, device='cuda:0') factor: tensor(0.1851, device='cuda:0')
reward_raw: tensor(0.5670, device='cuda:0') reward: tensor(1.3740, device='cuda:0') factor: tensor(0.1931, device='cuda:0')


Epoch 1, Step 1583: Loss = -0.0861, Actor Loss = -0.1147, Critic Loss = 0.0574, Entropy = 0.1234, Returns = 1.3546, Value = 1.3597, mask_loss = 0.1761std_loss = 0.3141:  29%|██▉       | 1583/5475 [7:05:24<18:07:21, 16.76s/it]

ratio tensor(1.2308, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5834, device='cuda:0') reward: tensor(1.3600, device='cuda:0') factor: tensor(0.2233, device='cuda:0')
reward_raw: tensor(0.5994, device='cuda:0') reward: tensor(1.3762, device='cuda:0') factor: tensor(0.2232, device='cuda:0')
reward_raw: tensor(0.5682, device='cuda:0') reward: tensor(1.3500, device='cuda:0') factor: tensor(0.2182, device='cuda:0')
reward_raw: tensor(0.5918, device='cuda:0') reward: tensor(1.3661, device='cuda:0') factor: tensor(0.2257, device='cuda:0')
reward_raw: tensor(0.5867, device='cuda:0') reward: tensor(1.3600, device='cuda:0') factor: tensor(0.2267, device='cuda:0')


Epoch 1, Step 1584: Loss = 0.1423, Actor Loss = 0.1052, Critic Loss = 0.0743, Entropy = 0.1242, Returns = 1.3248, Value = 1.5306, mask_loss = 0.2263std_loss = 0.3625:  29%|██▉       | 1584/5475 [7:05:39<17:42:18, 16.38s/it]  

ratio tensor(0.9884, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6164, device='cuda:0') reward: tensor(1.3846, device='cuda:0') factor: tensor(0.2318, device='cuda:0')
reward_raw: tensor(0.6249, device='cuda:0') reward: tensor(1.3898, device='cuda:0') factor: tensor(0.2351, device='cuda:0')
reward_raw: tensor(0.5994, device='cuda:0') reward: tensor(1.3637, device='cuda:0') factor: tensor(0.2357, device='cuda:0')
reward_raw: tensor(0.6318, device='cuda:0') reward: tensor(1.3941, device='cuda:0') factor: tensor(0.2377, device='cuda:0')
reward_raw: tensor(0.6181, device='cuda:0') reward: tensor(1.3875, device='cuda:0') factor: tensor(0.2306, device='cuda:0')


Epoch 1, Step 1585: Loss = 0.1292, Actor Loss = 0.0813, Critic Loss = 0.0961, Entropy = 0.1544, Returns = 1.4143, Value = 1.3291, mask_loss = 0.2515std_loss = 0.3467:  29%|██▉       | 1585/5475 [7:05:57<18:13:20, 16.86s/it]

ratio tensor(0.6635, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4982, device='cuda:0') reward: tensor(1.3143, device='cuda:0') factor: tensor(0.1839, device='cuda:0')
reward_raw: tensor(0.5270, device='cuda:0') reward: tensor(1.3430, device='cuda:0') factor: tensor(0.1840, device='cuda:0')
reward_raw: tensor(0.5217, device='cuda:0') reward: tensor(1.3317, device='cuda:0') factor: tensor(0.1899, device='cuda:0')
reward_raw: tensor(0.5261, device='cuda:0') reward: tensor(1.3441, device='cuda:0') factor: tensor(0.1820, device='cuda:0')
reward_raw: tensor(0.5114, device='cuda:0') reward: tensor(1.3301, device='cuda:0') factor: tensor(0.1813, device='cuda:0')


Epoch 1, Step 1586: Loss = 0.0437, Actor Loss = 0.0008, Critic Loss = 0.0860, Entropy = 0.1068, Returns = 1.3265, Value = 1.3204, mask_loss = 0.1587std_loss = 0.3070:  29%|██▉       | 1586/5475 [7:06:19<19:40:43, 18.22s/it]

ratio tensor(0.6043, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5610, device='cuda:0') reward: tensor(1.3790, device='cuda:0') factor: tensor(0.1820, device='cuda:0')
reward_raw: tensor(0.5835, device='cuda:0') reward: tensor(1.4052, device='cuda:0') factor: tensor(0.1783, device='cuda:0')
reward_raw: tensor(0.5502, device='cuda:0') reward: tensor(1.3692, device='cuda:0') factor: tensor(0.1810, device='cuda:0')
reward_raw: tensor(0.5723, device='cuda:0') reward: tensor(1.3999, device='cuda:0') factor: tensor(0.1724, device='cuda:0')
reward_raw: tensor(0.5745, device='cuda:0') reward: tensor(1.3931, device='cuda:0') factor: tensor(0.1814, device='cuda:0')


Epoch 1, Step 1587: Loss = -0.0905, Actor Loss = -0.1246, Critic Loss = 0.0685, Entropy = 0.1419, Returns = 1.4031, Value = 1.4239, mask_loss = 0.1930std_loss = 0.3216:  29%|██▉       | 1587/5475 [7:06:32<18:11:14, 16.84s/it]

ratio tensor(1.5136, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6150, device='cuda:0') reward: tensor(1.4511, device='cuda:0') factor: tensor(0.1639, device='cuda:0')
reward_raw: tensor(0.6129, device='cuda:0') reward: tensor(1.4408, device='cuda:0') factor: tensor(0.1721, device='cuda:0')
reward_raw: tensor(0.6172, device='cuda:0') reward: tensor(1.4504, device='cuda:0') factor: tensor(0.1668, device='cuda:0')
reward_raw: tensor(0.6294, device='cuda:0') reward: tensor(1.4579, device='cuda:0') factor: tensor(0.1715, device='cuda:0')
reward_raw: tensor(0.6229, device='cuda:0') reward: tensor(1.4569, device='cuda:0') factor: tensor(0.1660, device='cuda:0')


Epoch 1, Step 1588: Loss = 0.1099, Actor Loss = 0.0672, Critic Loss = 0.0857, Entropy = 0.1282, Returns = 1.3515, Value = 1.4319, mask_loss = 0.1587std_loss = 0.3018:  29%|██▉       | 1588/5475 [7:06:49<18:16:23, 16.92s/it]  

ratio tensor(1.3782, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5794, device='cuda:0') reward: tensor(1.3685, device='cuda:0') factor: tensor(0.2109, device='cuda:0')
reward_raw: tensor(0.5868, device='cuda:0') reward: tensor(1.3704, device='cuda:0') factor: tensor(0.2165, device='cuda:0')
reward_raw: tensor(0.5782, device='cuda:0') reward: tensor(1.3669, device='cuda:0') factor: tensor(0.2112, device='cuda:0')
reward_raw: tensor(0.5842, device='cuda:0') reward: tensor(1.3721, device='cuda:0') factor: tensor(0.2121, device='cuda:0')
reward_raw: tensor(0.5725, device='cuda:0') reward: tensor(1.3567, device='cuda:0') factor: tensor(0.2158, device='cuda:0')


Epoch 1, Step 1589: Loss = 0.0895, Actor Loss = 0.0674, Critic Loss = 0.0445, Entropy = 0.1440, Returns = 1.3965, Value = 1.3472, mask_loss = 0.2211std_loss = 0.3557:  29%|██▉       | 1589/5475 [7:07:07<18:40:09, 17.30s/it]

ratio tensor(0.5047, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4915, device='cuda:0') reward: tensor(1.3126, device='cuda:0') factor: tensor(0.1789, device='cuda:0')
reward_raw: tensor(0.4687, device='cuda:0') reward: tensor(1.2930, device='cuda:0') factor: tensor(0.1757, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.3074, device='cuda:0') factor: tensor(0.1782, device='cuda:0')
reward_raw: tensor(0.4785, device='cuda:0') reward: tensor(1.2996, device='cuda:0') factor: tensor(0.1789, device='cuda:0')
reward_raw: tensor(0.4714, device='cuda:0') reward: tensor(1.2986, device='cuda:0') factor: tensor(0.1728, device='cuda:0')


Epoch 1, Step 1590: Loss = 0.0389, Actor Loss = 0.0139, Critic Loss = 0.0502, Entropy = 0.1285, Returns = 1.2575, Value = 1.3112, mask_loss = 0.1820std_loss = 0.3229:  29%|██▉       | 1590/5475 [7:07:30<20:28:51, 18.98s/it]

ratio tensor(0.9950, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5688, device='cuda:0') reward: tensor(1.3654, device='cuda:0') factor: tensor(0.2035, device='cuda:0')
reward_raw: tensor(0.5847, device='cuda:0') reward: tensor(1.3859, device='cuda:0') factor: tensor(0.1988, device='cuda:0')
reward_raw: tensor(0.5596, device='cuda:0') reward: tensor(1.3655, device='cuda:0') factor: tensor(0.1941, device='cuda:0')
reward_raw: tensor(0.5900, device='cuda:0') reward: tensor(1.3977, device='cuda:0') factor: tensor(0.1923, device='cuda:0')
reward_raw: tensor(0.5627, device='cuda:0') reward: tensor(1.3730, device='cuda:0') factor: tensor(0.1897, device='cuda:0')


Epoch 1, Step 1591: Loss = 0.0744, Actor Loss = 0.0422, Critic Loss = 0.0645, Entropy = 0.1342, Returns = 1.3667, Value = 1.4171, mask_loss = 0.1711std_loss = 0.3172:  29%|██▉       | 1591/5475 [7:07:46<19:18:50, 17.90s/it]

ratio tensor(200.8894, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5766, device='cuda:0') reward: tensor(1.3784, device='cuda:0') factor: tensor(0.1982, device='cuda:0')
reward_raw: tensor(0.5849, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.1996, device='cuda:0')
reward_raw: tensor(0.6002, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.2021, device='cuda:0')
reward_raw: tensor(0.5896, device='cuda:0') reward: tensor(1.3831, device='cuda:0') factor: tensor(0.2065, device='cuda:0')
reward_raw: tensor(0.6093, device='cuda:0') reward: tensor(1.4060, device='cuda:0') factor: tensor(0.2032, device='cuda:0')


Epoch 1, Step 1592: Loss = 0.0374, Actor Loss = 0.0016, Critic Loss = 0.0719, Entropy = 0.1201, Returns = 1.3658, Value = 1.3932, mask_loss = 0.2036std_loss = 0.3501:  29%|██▉       | 1592/5475 [7:08:00<18:16:34, 16.94s/it]

ratio tensor(1.0910, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5636, device='cuda:0') reward: tensor(1.3699, device='cuda:0') factor: tensor(0.1937, device='cuda:0')
reward_raw: tensor(0.5655, device='cuda:0') reward: tensor(1.3684, device='cuda:0') factor: tensor(0.1971, device='cuda:0')
reward_raw: tensor(0.5839, device='cuda:0') reward: tensor(1.3906, device='cuda:0') factor: tensor(0.1932, device='cuda:0')
reward_raw: tensor(0.5747, device='cuda:0') reward: tensor(1.3796, device='cuda:0') factor: tensor(0.1950, device='cuda:0')
reward_raw: tensor(0.5668, device='cuda:0') reward: tensor(1.3669, device='cuda:0') factor: tensor(0.1998, device='cuda:0')


Epoch 1, Step 1593: Loss = 0.0122, Actor Loss = -0.0267, Critic Loss = 0.0782, Entropy = 0.1243, Returns = 1.4045, Value = 1.3262, mask_loss = 0.1814std_loss = 0.3268:  29%|██▉       | 1593/5475 [7:08:16<17:40:09, 16.39s/it]

ratio tensor(1.3961, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.2457, device='cuda:0') factor: tensor(0.2889, device='cuda:0')
reward_raw: tensor(0.5410, device='cuda:0') reward: tensor(1.2531, device='cuda:0') factor: tensor(0.2879, device='cuda:0')
reward_raw: tensor(0.5439, device='cuda:0') reward: tensor(1.2551, device='cuda:0') factor: tensor(0.2888, device='cuda:0')
reward_raw: tensor(0.5266, device='cuda:0') reward: tensor(1.2384, device='cuda:0') factor: tensor(0.2882, device='cuda:0')
reward_raw: tensor(0.5423, device='cuda:0') reward: tensor(1.2518, device='cuda:0') factor: tensor(0.2905, device='cuda:0')


Epoch 1, Step 1594: Loss = 0.1153, Actor Loss = 0.0809, Critic Loss = 0.0692, Entropy = 0.1719, Returns = 1.2378, Value = 1.2596, mask_loss = 0.2552std_loss = 0.3469:  29%|██▉       | 1594/5475 [7:08:36<19:01:01, 17.64s/it] 

ratio tensor(0.6794, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5695, device='cuda:0') reward: tensor(1.3650, device='cuda:0') factor: tensor(0.2045, device='cuda:0')
reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(1.3653, device='cuda:0') factor: tensor(0.2068, device='cuda:0')
reward_raw: tensor(0.5743, device='cuda:0') reward: tensor(1.3664, device='cuda:0') factor: tensor(0.2080, device='cuda:0')
reward_raw: tensor(0.5598, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.2078, device='cuda:0')
reward_raw: tensor(0.5711, device='cuda:0') reward: tensor(1.3612, device='cuda:0') factor: tensor(0.2099, device='cuda:0')


Epoch 1, Step 1595: Loss = 0.0221, Actor Loss = -0.0294, Critic Loss = 0.1032, Entropy = 0.1341, Returns = 1.3975, Value = 1.3388, mask_loss = 0.1744std_loss = 0.3037:  29%|██▉       | 1595/5475 [7:08:59<20:39:41, 19.17s/it]

ratio tensor(1.1650, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5821, device='cuda:0') reward: tensor(1.3328, device='cuda:0') factor: tensor(0.2493, device='cuda:0')
reward_raw: tensor(0.5996, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.2457, device='cuda:0')
reward_raw: tensor(0.5904, device='cuda:0') reward: tensor(1.3406, device='cuda:0') factor: tensor(0.2498, device='cuda:0')
reward_raw: tensor(0.5793, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.2439, device='cuda:0')
reward_raw: tensor(0.5902, device='cuda:0') reward: tensor(1.3396, device='cuda:0') factor: tensor(0.2506, device='cuda:0')


Epoch 1, Step 1596: Loss = -0.0280, Actor Loss = -0.0471, Critic Loss = 0.0385, Entropy = 0.1759, Returns = 1.3162, Value = 1.3402, mask_loss = 0.2553std_loss = 0.3424:  29%|██▉       | 1596/5475 [7:09:15<19:36:56, 18.20s/it]

ratio tensor(1.0922, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5854, device='cuda:0') reward: tensor(1.4094, device='cuda:0') factor: tensor(0.1760, device='cuda:0')
reward_raw: tensor(0.5992, device='cuda:0') reward: tensor(1.4214, device='cuda:0') factor: tensor(0.1777, device='cuda:0')
reward_raw: tensor(0.5638, device='cuda:0') reward: tensor(1.3886, device='cuda:0') factor: tensor(0.1751, device='cuda:0')
reward_raw: tensor(0.5891, device='cuda:0') reward: tensor(1.4200, device='cuda:0') factor: tensor(0.1692, device='cuda:0')
reward_raw: tensor(0.5860, device='cuda:0') reward: tensor(1.4102, device='cuda:0') factor: tensor(0.1758, device='cuda:0')


Epoch 1, Step 1597: Loss = 0.1020, Actor Loss = 0.0259, Critic Loss = 0.1526, Entropy = 0.1657, Returns = 1.3815, Value = 1.5424, mask_loss = 0.2266std_loss = 0.3376:  29%|██▉       | 1597/5475 [7:09:31<19:05:04, 17.72s/it]  

ratio tensor(1.0958, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6034, device='cuda:0') reward: tensor(1.4333, device='cuda:0') factor: tensor(0.1701, device='cuda:0')
reward_raw: tensor(0.6213, device='cuda:0') reward: tensor(1.4451, device='cuda:0') factor: tensor(0.1762, device='cuda:0')
reward_raw: tensor(0.5722, device='cuda:0') reward: tensor(1.4041, device='cuda:0') factor: tensor(0.1681, device='cuda:0')
reward_raw: tensor(0.5898, device='cuda:0') reward: tensor(1.4175, device='cuda:0') factor: tensor(0.1723, device='cuda:0')
reward_raw: tensor(0.5960, device='cuda:0') reward: tensor(1.4281, device='cuda:0') factor: tensor(0.1679, device='cuda:0')


Epoch 1, Step 1598: Loss = 0.1992, Actor Loss = 0.1832, Critic Loss = 0.0324, Entropy = 0.1365, Returns = 1.4081, Value = 1.3840, mask_loss = 0.1917std_loss = 0.3299:  29%|██▉       | 1598/5475 [7:09:45<17:45:24, 16.49s/it]

ratio tensor(1.1058, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4910, device='cuda:0') reward: tensor(1.3045, device='cuda:0') factor: tensor(0.1864, device='cuda:0')
reward_raw: tensor(0.5016, device='cuda:0') reward: tensor(1.3133, device='cuda:0') factor: tensor(0.1883, device='cuda:0')
reward_raw: tensor(0.5168, device='cuda:0') reward: tensor(1.3169, device='cuda:0') factor: tensor(0.1999, device='cuda:0')
reward_raw: tensor(0.5173, device='cuda:0') reward: tensor(1.3237, device='cuda:0') factor: tensor(0.1936, device='cuda:0')
reward_raw: tensor(0.5192, device='cuda:0') reward: tensor(1.3261, device='cuda:0') factor: tensor(0.1931, device='cuda:0')


Epoch 1, Step 1599: Loss = 0.1174, Actor Loss = 0.0756, Critic Loss = 0.0840, Entropy = 0.1561, Returns = 1.3617, Value = 1.2079, mask_loss = 0.2088std_loss = 0.3239:  29%|██▉       | 1599/5475 [7:09:58<16:39:36, 15.47s/it]

ratio tensor(0.9638, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5621, device='cuda:0') reward: tensor(1.3420, device='cuda:0') factor: tensor(0.2201, device='cuda:0')
reward_raw: tensor(0.5567, device='cuda:0') reward: tensor(1.3453, device='cuda:0') factor: tensor(0.2114, device='cuda:0')
reward_raw: tensor(0.5571, device='cuda:0') reward: tensor(1.3465, device='cuda:0') factor: tensor(0.2105, device='cuda:0')
reward_raw: tensor(0.5543, device='cuda:0') reward: tensor(1.3526, device='cuda:0') factor: tensor(0.2017, device='cuda:0')
reward_raw: tensor(0.5879, device='cuda:0') reward: tensor(1.3740, device='cuda:0') factor: tensor(0.2138, device='cuda:0')


Epoch 1, Step 1600: Loss = -0.0731, Actor Loss = -0.0948, Critic Loss = 0.0437, Entropy = 0.1629, Returns = 1.3565, Value = 1.3794, mask_loss = 0.2001std_loss = 0.3205:  29%|██▉       | 1600/5475 [7:10:17<17:40:20, 16.42s/it]

ratio tensor(1.1457, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5530, device='cuda:0') reward: tensor(1.3878, device='cuda:0') factor: tensor(0.1652, device='cuda:0')
reward_raw: tensor(0.5754, device='cuda:0') reward: tensor(1.4129, device='cuda:0') factor: tensor(0.1625, device='cuda:0')
reward_raw: tensor(0.5752, device='cuda:0') reward: tensor(1.4130, device='cuda:0') factor: tensor(0.1622, device='cuda:0')
reward_raw: tensor(0.5641, device='cuda:0') reward: tensor(1.4038, device='cuda:0') factor: tensor(0.1603, device='cuda:0')
reward_raw: tensor(0.5639, device='cuda:0') reward: tensor(1.3996, device='cuda:0') factor: tensor(0.1643, device='cuda:0')


Epoch 1, Step 1601: Loss = -0.0998, Actor Loss = -0.1178, Critic Loss = 0.0362, Entropy = 0.0893, Returns = 1.4337, Value = 1.4644, mask_loss = 0.1311std_loss = 0.2867:  29%|██▉       | 1600/5475 [7:10:34<17:40:20, 16.42s/it]

ratio tensor(1.2305, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1601: Loss = -0.0998, Actor Loss = -0.1178, Critic Loss = 0.0362, Entropy = 0.0893, Returns = 1.4337, Value = 1.4644, mask_loss = 0.1311std_loss = 0.2867:  29%|██▉       | 1601/5475 [7:10:34<18:00:07, 16.73s/it]

reward_raw: tensor(0.5727, device='cuda:0') reward: tensor(1.3908, device='cuda:0') factor: tensor(0.1819, device='cuda:0')
reward_raw: tensor(0.5634, device='cuda:0') reward: tensor(1.3751, device='cuda:0') factor: tensor(0.1883, device='cuda:0')
reward_raw: tensor(0.5749, device='cuda:0') reward: tensor(1.3839, device='cuda:0') factor: tensor(0.1910, device='cuda:0')
reward_raw: tensor(0.5737, device='cuda:0') reward: tensor(1.3843, device='cuda:0') factor: tensor(0.1894, device='cuda:0')
reward_raw: tensor(0.5674, device='cuda:0') reward: tensor(1.3848, device='cuda:0') factor: tensor(0.1826, device='cuda:0')


Epoch 1, Step 1602: Loss = 0.0456, Actor Loss = 0.0162, Critic Loss = 0.0591, Entropy = 0.1311, Returns = 1.4026, Value = 1.3612, mask_loss = 0.1872std_loss = 0.3316:  29%|██▉       | 1602/5475 [7:10:49<17:19:36, 16.11s/it]  

ratio tensor(0.8618, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6121, device='cuda:0') reward: tensor(1.4346, device='cuda:0') factor: tensor(0.1774, device='cuda:0')
reward_raw: tensor(0.5917, device='cuda:0') reward: tensor(1.4180, device='cuda:0') factor: tensor(0.1737, device='cuda:0')
reward_raw: tensor(0.6026, device='cuda:0') reward: tensor(1.4240, device='cuda:0') factor: tensor(0.1786, device='cuda:0')
reward_raw: tensor(0.5908, device='cuda:0') reward: tensor(1.4067, device='cuda:0') factor: tensor(0.1841, device='cuda:0')
reward_raw: tensor(0.5841, device='cuda:0') reward: tensor(1.4114, device='cuda:0') factor: tensor(0.1727, device='cuda:0')


Epoch 1, Step 1603: Loss = 0.0528, Actor Loss = 0.0232, Critic Loss = 0.0595, Entropy = 0.1212, Returns = 1.4073, Value = 1.4596, mask_loss = 0.1954std_loss = 0.3447:  29%|██▉       | 1603/5475 [7:11:06<17:38:25, 16.40s/it]

ratio tensor(3.2433, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5494, device='cuda:0') reward: tensor(1.3270, device='cuda:0') factor: tensor(0.2224, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.3312, device='cuda:0') factor: tensor(0.2105, device='cuda:0')
reward_raw: tensor(0.5538, device='cuda:0') reward: tensor(1.3373, device='cuda:0') factor: tensor(0.2165, device='cuda:0')
reward_raw: tensor(0.5262, device='cuda:0') reward: tensor(1.3097, device='cuda:0') factor: tensor(0.2166, device='cuda:0')
reward_raw: tensor(0.5580, device='cuda:0') reward: tensor(1.3509, device='cuda:0') factor: tensor(0.2071, device='cuda:0')


Epoch 1, Step 1604: Loss = 0.0526, Actor Loss = 0.0176, Critic Loss = 0.0702, Entropy = 0.1460, Returns = 1.2566, Value = 1.3264, mask_loss = 0.2314std_loss = 0.3313:  29%|██▉       | 1604/5475 [7:11:26<18:44:12, 17.43s/it]

ratio tensor(3.4435, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5439, device='cuda:0') reward: tensor(1.3664, device='cuda:0') factor: tensor(0.1776, device='cuda:0')
reward_raw: tensor(0.5360, device='cuda:0') reward: tensor(1.3629, device='cuda:0') factor: tensor(0.1730, device='cuda:0')
reward_raw: tensor(0.5696, device='cuda:0') reward: tensor(1.3946, device='cuda:0') factor: tensor(0.1750, device='cuda:0')
reward_raw: tensor(0.5645, device='cuda:0') reward: tensor(1.3876, device='cuda:0') factor: tensor(0.1769, device='cuda:0')
reward_raw: tensor(0.5564, device='cuda:0') reward: tensor(1.3762, device='cuda:0') factor: tensor(0.1802, device='cuda:0')


Epoch 1, Step 1605: Loss = -0.0769, Actor Loss = -0.1287, Critic Loss = 0.1038, Entropy = 0.1276, Returns = 1.3886, Value = 1.2968, mask_loss = 0.1571std_loss = 0.2958:  29%|██▉       | 1605/5475 [7:11:45<19:13:34, 17.88s/it]

ratio tensor(3.5194, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.2909, device='cuda:0') factor: tensor(0.1963, device='cuda:0')
reward_raw: tensor(0.4778, device='cuda:0') reward: tensor(1.2786, device='cuda:0') factor: tensor(0.1992, device='cuda:0')
reward_raw: tensor(0.4913, device='cuda:0') reward: tensor(1.2903, device='cuda:0') factor: tensor(0.2009, device='cuda:0')
reward_raw: tensor(0.4791, device='cuda:0') reward: tensor(1.2836, device='cuda:0') factor: tensor(0.1954, device='cuda:0')
reward_raw: tensor(0.5101, device='cuda:0') reward: tensor(1.3046, device='cuda:0') factor: tensor(0.2056, device='cuda:0')


Epoch 1, Step 1606: Loss = 0.3126, Actor Loss = 0.2685, Critic Loss = 0.0884, Entropy = 0.1744, Returns = 1.1917, Value = 1.2913, mask_loss = 0.2337std_loss = 0.3279:  29%|██▉       | 1606/5475 [7:12:00<18:24:48, 17.13s/it]  

ratio tensor(0.8822, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4766, device='cuda:0') reward: tensor(1.2505, device='cuda:0') factor: tensor(0.2261, device='cuda:0')
reward_raw: tensor(0.4784, device='cuda:0') reward: tensor(1.2511, device='cuda:0') factor: tensor(0.2274, device='cuda:0')
reward_raw: tensor(0.5147, device='cuda:0') reward: tensor(1.2944, device='cuda:0') factor: tensor(0.2203, device='cuda:0')
reward_raw: tensor(0.4974, device='cuda:0') reward: tensor(1.2732, device='cuda:0') factor: tensor(0.2242, device='cuda:0')
reward_raw: tensor(0.5109, device='cuda:0') reward: tensor(1.2883, device='cuda:0') factor: tensor(0.2225, device='cuda:0')


Epoch 1, Step 1607: Loss = -0.1336, Actor Loss = -0.1720, Critic Loss = 0.0771, Entropy = 0.1386, Returns = 1.3016, Value = 1.2036, mask_loss = 0.2126std_loss = 0.3489:  29%|██▉       | 1607/5475 [7:12:15<17:36:59, 16.40s/it]

ratio tensor(1.1325, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6019, device='cuda:0') reward: tensor(1.4457, device='cuda:0') factor: tensor(0.1562, device='cuda:0')
reward_raw: tensor(0.5852, device='cuda:0') reward: tensor(1.4381, device='cuda:0') factor: tensor(0.1470, device='cuda:0')
reward_raw: tensor(0.5882, device='cuda:0') reward: tensor(1.4323, device='cuda:0') factor: tensor(0.1559, device='cuda:0')
reward_raw: tensor(0.5912, device='cuda:0') reward: tensor(1.4396, device='cuda:0') factor: tensor(0.1516, device='cuda:0')
reward_raw: tensor(0.5780, device='cuda:0') reward: tensor(1.4205, device='cuda:0') factor: tensor(0.1575, device='cuda:0')


Epoch 1, Step 1608: Loss = -0.1723, Actor Loss = -0.2062, Critic Loss = 0.0679, Entropy = 0.1230, Returns = 1.4267, Value = 1.5376, mask_loss = 0.1610std_loss = 0.3000:  29%|██▉       | 1608/5475 [7:12:28<16:33:43, 15.42s/it]

ratio tensor(2.6607, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5090, device='cuda:0') reward: tensor(1.3365, device='cuda:0') factor: tensor(0.1725, device='cuda:0')
reward_raw: tensor(0.5169, device='cuda:0') reward: tensor(1.3477, device='cuda:0') factor: tensor(0.1691, device='cuda:0')
reward_raw: tensor(0.5131, device='cuda:0') reward: tensor(1.3471, device='cuda:0') factor: tensor(0.1661, device='cuda:0')
reward_raw: tensor(0.5098, device='cuda:0') reward: tensor(1.3411, device='cuda:0') factor: tensor(0.1687, device='cuda:0')
reward_raw: tensor(0.5172, device='cuda:0') reward: tensor(1.3520, device='cuda:0') factor: tensor(0.1652, device='cuda:0')


Epoch 1, Step 1609: Loss = 0.2126, Actor Loss = 0.1899, Critic Loss = 0.0457, Entropy = 0.1159, Returns = 1.2896, Value = 1.2823, mask_loss = 0.1708std_loss = 0.3230:  29%|██▉       | 1609/5475 [7:12:45<17:11:28, 16.01s/it]  

ratio tensor(0.5713, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4693, device='cuda:0') reward: tensor(1.2873, device='cuda:0') factor: tensor(0.1820, device='cuda:0')
reward_raw: tensor(0.4881, device='cuda:0') reward: tensor(1.3037, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.4795, device='cuda:0') reward: tensor(1.2879, device='cuda:0') factor: tensor(0.1915, device='cuda:0')
reward_raw: tensor(0.4616, device='cuda:0') reward: tensor(1.2826, device='cuda:0') factor: tensor(0.1791, device='cuda:0')
reward_raw: tensor(0.4841, device='cuda:0') reward: tensor(1.2981, device='cuda:0') factor: tensor(0.1859, device='cuda:0')


Epoch 1, Step 1610: Loss = -0.0746, Actor Loss = -0.1039, Critic Loss = 0.0590, Entropy = 0.1342, Returns = 1.3699, Value = 1.2371, mask_loss = 0.1847std_loss = 0.3201:  29%|██▉       | 1610/5475 [7:13:03<17:50:37, 16.62s/it]

ratio tensor(1.0957, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5797, device='cuda:0') reward: tensor(1.3695, device='cuda:0') factor: tensor(0.2102, device='cuda:0')
reward_raw: tensor(0.6040, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.2136, device='cuda:0')
reward_raw: tensor(0.6042, device='cuda:0') reward: tensor(1.3943, device='cuda:0') factor: tensor(0.2099, device='cuda:0')
reward_raw: tensor(0.5890, device='cuda:0') reward: tensor(1.3807, device='cuda:0') factor: tensor(0.2084, device='cuda:0')
reward_raw: tensor(0.6184, device='cuda:0') reward: tensor(1.4029, device='cuda:0') factor: tensor(0.2155, device='cuda:0')


Epoch 1, Step 1611: Loss = -0.0137, Actor Loss = -0.0380, Critic Loss = 0.0490, Entropy = 0.1691, Returns = 1.3806, Value = 1.4646, mask_loss = 0.2229std_loss = 0.3241:  29%|██▉       | 1611/5475 [7:13:18<17:19:07, 16.14s/it]

ratio tensor(0.5344, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5579, device='cuda:0') reward: tensor(1.3161, device='cuda:0') factor: tensor(0.2418, device='cuda:0')
reward_raw: tensor(0.5446, device='cuda:0') reward: tensor(1.3108, device='cuda:0') factor: tensor(0.2338, device='cuda:0')
reward_raw: tensor(0.5465, device='cuda:0') reward: tensor(1.3046, device='cuda:0') factor: tensor(0.2420, device='cuda:0')
reward_raw: tensor(0.5448, device='cuda:0') reward: tensor(1.2961, device='cuda:0') factor: tensor(0.2486, device='cuda:0')
reward_raw: tensor(0.5537, device='cuda:0') reward: tensor(1.3049, device='cuda:0') factor: tensor(0.2488, device='cuda:0')


Epoch 1, Step 1612: Loss = 0.0706, Actor Loss = 0.0411, Critic Loss = 0.0591, Entropy = 0.1417, Returns = 1.3542, Value = 1.2784, mask_loss = 0.2332std_loss = 0.3575:  29%|██▉       | 1612/5475 [7:13:31<16:05:52, 15.00s/it]  

ratio tensor(1.2627, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.3910, device='cuda:0') factor: tensor(0.1653, device='cuda:0')
reward_raw: tensor(0.5685, device='cuda:0') reward: tensor(1.3978, device='cuda:0') factor: tensor(0.1707, device='cuda:0')
reward_raw: tensor(0.5584, device='cuda:0') reward: tensor(1.3955, device='cuda:0') factor: tensor(0.1629, device='cuda:0')
reward_raw: tensor(0.5506, device='cuda:0') reward: tensor(1.3857, device='cuda:0') factor: tensor(0.1649, device='cuda:0')
reward_raw: tensor(0.5463, device='cuda:0') reward: tensor(1.3721, device='cuda:0') factor: tensor(0.1742, device='cuda:0')


Epoch 1, Step 1613: Loss = -0.0976, Actor Loss = -0.1409, Critic Loss = 0.0870, Entropy = 0.1406, Returns = 1.3717, Value = 1.3300, mask_loss = 0.1724std_loss = 0.3142:  29%|██▉       | 1613/5475 [7:13:46<16:03:52, 14.97s/it]

ratio tensor(1.7261, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5859, device='cuda:0') reward: tensor(1.3433, device='cuda:0') factor: tensor(0.2426, device='cuda:0')
reward_raw: tensor(0.5848, device='cuda:0') reward: tensor(1.3517, device='cuda:0') factor: tensor(0.2331, device='cuda:0')
reward_raw: tensor(0.5965, device='cuda:0') reward: tensor(1.3501, device='cuda:0') factor: tensor(0.2464, device='cuda:0')
reward_raw: tensor(0.5825, device='cuda:0') reward: tensor(1.3397, device='cuda:0') factor: tensor(0.2428, device='cuda:0')
reward_raw: tensor(0.6006, device='cuda:0') reward: tensor(1.3660, device='cuda:0') factor: tensor(0.2346, device='cuda:0')


Epoch 1, Step 1614: Loss = 0.2161, Actor Loss = 0.1843, Critic Loss = 0.0641, Entropy = 0.1864, Returns = 1.2782, Value = 1.3263, mask_loss = 0.2464std_loss = 0.3388:  29%|██▉       | 1614/5475 [7:13:59<15:40:16, 14.61s/it]  

ratio tensor(0.5203, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4808, device='cuda:0') reward: tensor(1.2561, device='cuda:0') factor: tensor(0.2248, device='cuda:0')
reward_raw: tensor(0.4720, device='cuda:0') reward: tensor(1.2437, device='cuda:0') factor: tensor(0.2282, device='cuda:0')
reward_raw: tensor(0.4676, device='cuda:0') reward: tensor(1.2451, device='cuda:0') factor: tensor(0.2225, device='cuda:0')
reward_raw: tensor(0.4524, device='cuda:0') reward: tensor(1.2196, device='cuda:0') factor: tensor(0.2328, device='cuda:0')
reward_raw: tensor(0.4634, device='cuda:0') reward: tensor(1.2298, device='cuda:0') factor: tensor(0.2336, device='cuda:0')


Epoch 1, Step 1615: Loss = -0.0520, Actor Loss = -0.0858, Critic Loss = 0.0680, Entropy = 0.1388, Returns = 1.3193, Value = 1.1628, mask_loss = 0.1825std_loss = 0.3139:  29%|██▉       | 1615/5475 [7:14:14<15:41:43, 14.64s/it]

ratio tensor(1.2869, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5707, device='cuda:0') reward: tensor(1.3627, device='cuda:0') factor: tensor(0.2080, device='cuda:0')
reward_raw: tensor(0.5577, device='cuda:0') reward: tensor(1.3472, device='cuda:0') factor: tensor(0.2105, device='cuda:0')
reward_raw: tensor(0.5607, device='cuda:0') reward: tensor(1.3526, device='cuda:0') factor: tensor(0.2081, device='cuda:0')
reward_raw: tensor(0.5647, device='cuda:0') reward: tensor(1.3549, device='cuda:0') factor: tensor(0.2098, device='cuda:0')
reward_raw: tensor(0.5629, device='cuda:0') reward: tensor(1.3455, device='cuda:0') factor: tensor(0.2174, device='cuda:0')


Epoch 1, Step 1616: Loss = -0.1744, Actor Loss = -0.2049, Critic Loss = 0.0612, Entropy = 0.1769, Returns = 1.3108, Value = 1.4005, mask_loss = 0.2141std_loss = 0.3373:  30%|██▉       | 1616/5475 [7:14:30<16:09:03, 15.07s/it]

ratio tensor(1.5177, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5926, device='cuda:0') reward: tensor(1.4154, device='cuda:0') factor: tensor(0.1772, device='cuda:0')
reward_raw: tensor(0.5905, device='cuda:0') reward: tensor(1.4006, device='cuda:0') factor: tensor(0.1899, device='cuda:0')
reward_raw: tensor(0.6027, device='cuda:0') reward: tensor(1.4197, device='cuda:0') factor: tensor(0.1830, device='cuda:0')
reward_raw: tensor(0.5765, device='cuda:0') reward: tensor(1.3976, device='cuda:0') factor: tensor(0.1789, device='cuda:0')
reward_raw: tensor(0.5809, device='cuda:0') reward: tensor(1.4055, device='cuda:0') factor: tensor(0.1754, device='cuda:0')


Epoch 1, Step 1617: Loss = 0.1739, Actor Loss = 0.1351, Critic Loss = 0.0777, Entropy = 0.1198, Returns = 1.4455, Value = 1.4261, mask_loss = 0.1515std_loss = 0.3033:  30%|██▉       | 1617/5475 [7:14:48<16:57:02, 15.82s/it]  

ratio tensor(1.1426, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5737, device='cuda:0') reward: tensor(1.3407, device='cuda:0') factor: tensor(0.2330, device='cuda:0')
reward_raw: tensor(0.5729, device='cuda:0') reward: tensor(1.3369, device='cuda:0') factor: tensor(0.2360, device='cuda:0')
reward_raw: tensor(0.5726, device='cuda:0') reward: tensor(1.3379, device='cuda:0') factor: tensor(0.2347, device='cuda:0')
reward_raw: tensor(0.5954, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.2336, device='cuda:0')
reward_raw: tensor(0.5895, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.2314, device='cuda:0')


Epoch 1, Step 1618: Loss = 0.2614, Actor Loss = 0.2307, Critic Loss = 0.0617, Entropy = 0.1675, Returns = 1.2885, Value = 1.3250, mask_loss = 0.2297std_loss = 0.3428:  30%|██▉       | 1618/5475 [7:15:03<16:38:07, 15.53s/it]

ratio tensor(0.7096, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5439, device='cuda:0') reward: tensor(1.3405, device='cuda:0') factor: tensor(0.2034, device='cuda:0')
reward_raw: tensor(0.5602, device='cuda:0') reward: tensor(1.3559, device='cuda:0') factor: tensor(0.2043, device='cuda:0')
reward_raw: tensor(0.5629, device='cuda:0') reward: tensor(1.3645, device='cuda:0') factor: tensor(0.1983, device='cuda:0')
reward_raw: tensor(0.5651, device='cuda:0') reward: tensor(1.3599, device='cuda:0') factor: tensor(0.2052, device='cuda:0')
reward_raw: tensor(0.5318, device='cuda:0') reward: tensor(1.3338, device='cuda:0') factor: tensor(0.1981, device='cuda:0')


Epoch 1, Step 1619: Loss = -0.0702, Actor Loss = -0.1066, Critic Loss = 0.0730, Entropy = 0.1535, Returns = 1.3220, Value = 1.4049, mask_loss = 0.2122std_loss = 0.3359:  30%|██▉       | 1619/5475 [7:15:20<17:06:03, 15.97s/it]

ratio tensor(2.5347, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5500, device='cuda:0') reward: tensor(1.3713, device='cuda:0') factor: tensor(0.1787, device='cuda:0')
reward_raw: tensor(0.5456, device='cuda:0') reward: tensor(1.3730, device='cuda:0') factor: tensor(0.1726, device='cuda:0')
reward_raw: tensor(0.5711, device='cuda:0') reward: tensor(1.3867, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.5402, device='cuda:0') reward: tensor(1.3662, device='cuda:0') factor: tensor(0.1740, device='cuda:0')
reward_raw: tensor(0.5628, device='cuda:0') reward: tensor(1.3841, device='cuda:0') factor: tensor(0.1787, device='cuda:0')


Epoch 1, Step 1620: Loss = 0.2019, Actor Loss = 0.1702, Critic Loss = 0.0637, Entropy = 0.1291, Returns = 1.3349, Value = 1.3355, mask_loss = 0.2032std_loss = 0.3403:  30%|██▉       | 1620/5475 [7:15:33<16:26:58, 15.36s/it]  

ratio tensor(0.5743, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6071, device='cuda:0') reward: tensor(1.4262, device='cuda:0') factor: tensor(0.1809, device='cuda:0')
reward_raw: tensor(0.6026, device='cuda:0') reward: tensor(1.4281, device='cuda:0') factor: tensor(0.1745, device='cuda:0')
reward_raw: tensor(0.6170, device='cuda:0') reward: tensor(1.4430, device='cuda:0') factor: tensor(0.1740, device='cuda:0')
reward_raw: tensor(0.6066, device='cuda:0') reward: tensor(1.4249, device='cuda:0') factor: tensor(0.1817, device='cuda:0')
reward_raw: tensor(0.6306, device='cuda:0') reward: tensor(1.4493, device='cuda:0') factor: tensor(0.1813, device='cuda:0')


Epoch 1, Step 1621: Loss = -0.3132, Actor Loss = -0.3573, Critic Loss = 0.0885, Entropy = 0.1264, Returns = 1.4136, Value = 1.4624, mask_loss = 0.1570std_loss = 0.3054:  30%|██▉       | 1620/5475 [7:15:49<16:26:58, 15.36s/it]

ratio tensor(2.2225, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1621: Loss = -0.3132, Actor Loss = -0.3573, Critic Loss = 0.0885, Entropy = 0.1264, Returns = 1.4136, Value = 1.4624, mask_loss = 0.1570std_loss = 0.3054:  30%|██▉       | 1621/5475 [7:15:49<16:37:15, 15.53s/it]

reward_raw: tensor(0.6101, device='cuda:0') reward: tensor(1.4227, device='cuda:0') factor: tensor(0.1874, device='cuda:0')
reward_raw: tensor(0.6229, device='cuda:0') reward: tensor(1.4360, device='cuda:0') factor: tensor(0.1869, device='cuda:0')
reward_raw: tensor(0.6095, device='cuda:0') reward: tensor(1.4192, device='cuda:0') factor: tensor(0.1903, device='cuda:0')
reward_raw: tensor(0.6267, device='cuda:0') reward: tensor(1.4399, device='cuda:0') factor: tensor(0.1868, device='cuda:0')
reward_raw: tensor(0.6053, device='cuda:0') reward: tensor(1.4229, device='cuda:0') factor: tensor(0.1824, device='cuda:0')


Epoch 1, Step 1622: Loss = 0.3100, Actor Loss = 0.2765, Critic Loss = 0.0672, Entropy = 0.1584, Returns = 1.3481, Value = 1.4875, mask_loss = 0.2107std_loss = 0.3444:  30%|██▉       | 1622/5475 [7:16:02<15:43:14, 14.69s/it]  

ratio tensor(0.7205, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5980, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.2207, device='cuda:0')
reward_raw: tensor(0.5986, device='cuda:0') reward: tensor(1.3770, device='cuda:0') factor: tensor(0.2216, device='cuda:0')
reward_raw: tensor(0.5748, device='cuda:0') reward: tensor(1.3549, device='cuda:0') factor: tensor(0.2199, device='cuda:0')
reward_raw: tensor(0.6209, device='cuda:0') reward: tensor(1.4014, device='cuda:0') factor: tensor(0.2195, device='cuda:0')
reward_raw: tensor(0.5873, device='cuda:0') reward: tensor(1.3586, device='cuda:0') factor: tensor(0.2287, device='cuda:0')


Epoch 1, Step 1623: Loss = 0.0512, Actor Loss = 0.0011, Critic Loss = 0.1004, Entropy = 0.1207, Returns = 1.4133, Value = 1.2820, mask_loss = 0.1991std_loss = 0.3327:  30%|██▉       | 1623/5475 [7:16:16<15:31:20, 14.51s/it]

ratio tensor(0.9367, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4767, device='cuda:0') reward: tensor(1.2944, device='cuda:0') factor: tensor(0.1823, device='cuda:0')
reward_raw: tensor(0.4746, device='cuda:0') reward: tensor(1.2944, device='cuda:0') factor: tensor(0.1802, device='cuda:0')
reward_raw: tensor(0.5013, device='cuda:0') reward: tensor(1.3155, device='cuda:0') factor: tensor(0.1858, device='cuda:0')
reward_raw: tensor(0.4789, device='cuda:0') reward: tensor(1.2988, device='cuda:0') factor: tensor(0.1800, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.3220, device='cuda:0') factor: tensor(0.1860, device='cuda:0')


Epoch 1, Step 1624: Loss = -0.0174, Actor Loss = -0.0510, Critic Loss = 0.0676, Entropy = 0.1420, Returns = 1.3995, Value = 1.3957, mask_loss = 0.2063std_loss = 0.3442:  30%|██▉       | 1624/5475 [7:16:36<17:22:43, 16.25s/it]

ratio tensor(2.1530, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5489, device='cuda:0') reward: tensor(1.3210, device='cuda:0') factor: tensor(0.2279, device='cuda:0')
reward_raw: tensor(0.5339, device='cuda:0') reward: tensor(1.3024, device='cuda:0') factor: tensor(0.2315, device='cuda:0')
reward_raw: tensor(0.5561, device='cuda:0') reward: tensor(1.3218, device='cuda:0') factor: tensor(0.2343, device='cuda:0')
reward_raw: tensor(0.5386, device='cuda:0') reward: tensor(1.3158, device='cuda:0') factor: tensor(0.2228, device='cuda:0')
reward_raw: tensor(0.5164, device='cuda:0') reward: tensor(1.2882, device='cuda:0') factor: tensor(0.2282, device='cuda:0')


Epoch 1, Step 1625: Loss = -0.0135, Actor Loss = -0.0968, Critic Loss = 0.1668, Entropy = 0.1293, Returns = 1.2899, Value = 1.2776, mask_loss = 0.2156std_loss = 0.3461:  30%|██▉       | 1625/5475 [7:16:52<17:12:13, 16.09s/it]

ratio tensor(1.7244, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5677, device='cuda:0') reward: tensor(1.3378, device='cuda:0') factor: tensor(0.2299, device='cuda:0')
reward_raw: tensor(0.5805, device='cuda:0') reward: tensor(1.3499, device='cuda:0') factor: tensor(0.2307, device='cuda:0')
reward_raw: tensor(0.5764, device='cuda:0') reward: tensor(1.3423, device='cuda:0') factor: tensor(0.2340, device='cuda:0')
reward_raw: tensor(0.5825, device='cuda:0') reward: tensor(1.3529, device='cuda:0') factor: tensor(0.2296, device='cuda:0')
reward_raw: tensor(0.5816, device='cuda:0') reward: tensor(1.3531, device='cuda:0') factor: tensor(0.2285, device='cuda:0')


Epoch 1, Step 1626: Loss = 0.2670, Actor Loss = 0.2310, Critic Loss = 0.0722, Entropy = 0.1556, Returns = 1.2705, Value = 1.3976, mask_loss = 0.2886std_loss = 0.3881:  30%|██▉       | 1626/5475 [7:17:05<16:06:36, 15.07s/it]  

ratio tensor(0.5232, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4605, device='cuda:0') reward: tensor(1.3067, device='cuda:0') factor: tensor(0.1538, device='cuda:0')
reward_raw: tensor(0.4639, device='cuda:0') reward: tensor(1.3061, device='cuda:0') factor: tensor(0.1578, device='cuda:0')
reward_raw: tensor(0.4534, device='cuda:0') reward: tensor(1.2956, device='cuda:0') factor: tensor(0.1578, device='cuda:0')
reward_raw: tensor(0.4435, device='cuda:0') reward: tensor(1.2858, device='cuda:0') factor: tensor(0.1577, device='cuda:0')
reward_raw: tensor(0.4601, device='cuda:0') reward: tensor(1.3033, device='cuda:0') factor: tensor(0.1568, device='cuda:0')


Epoch 1, Step 1627: Loss = -0.1026, Actor Loss = -0.1455, Critic Loss = 0.0860, Entropy = 0.1263, Returns = 1.2820, Value = 1.3305, mask_loss = 0.1731std_loss = 0.3219:  30%|██▉       | 1627/5475 [7:17:19<15:46:12, 14.75s/it]

ratio tensor(2.2233, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5616, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.1744, device='cuda:0')
reward_raw: tensor(0.5564, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.1711, device='cuda:0')
reward_raw: tensor(0.5545, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.1768, device='cuda:0')
reward_raw: tensor(0.5701, device='cuda:0') reward: tensor(1.3938, device='cuda:0') factor: tensor(0.1763, device='cuda:0')
reward_raw: tensor(0.5714, device='cuda:0') reward: tensor(1.4008, device='cuda:0') factor: tensor(0.1705, device='cuda:0')


Epoch 1, Step 1628: Loss = 0.0637, Actor Loss = 0.0213, Critic Loss = 0.0850, Entropy = 0.1245, Returns = 1.3626, Value = 1.3774, mask_loss = 0.1968std_loss = 0.3359:  30%|██▉       | 1628/5475 [7:17:38<17:11:21, 16.09s/it]  

ratio tensor(0.6888, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5051, device='cuda:0') reward: tensor(1.2839, device='cuda:0') factor: tensor(0.2212, device='cuda:0')
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.3020, device='cuda:0') factor: tensor(0.2259, device='cuda:0')
reward_raw: tensor(0.5117, device='cuda:0') reward: tensor(1.2926, device='cuda:0') factor: tensor(0.2191, device='cuda:0')
reward_raw: tensor(0.5154, device='cuda:0') reward: tensor(1.2957, device='cuda:0') factor: tensor(0.2197, device='cuda:0')
reward_raw: tensor(0.5087, device='cuda:0') reward: tensor(1.2856, device='cuda:0') factor: tensor(0.2231, device='cuda:0')


Epoch 1, Step 1629: Loss = 0.0234, Actor Loss = -0.0017, Critic Loss = 0.0505, Entropy = 0.1196, Returns = 1.4200, Value = 1.3419, mask_loss = 0.1849std_loss = 0.3293:  30%|██▉       | 1629/5475 [7:17:58<18:21:12, 17.18s/it]

ratio tensor(0.9365, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6001, device='cuda:0') reward: tensor(1.3925, device='cuda:0') factor: tensor(0.2076, device='cuda:0')
reward_raw: tensor(0.6017, device='cuda:0') reward: tensor(1.4109, device='cuda:0') factor: tensor(0.1908, device='cuda:0')
reward_raw: tensor(0.6033, device='cuda:0') reward: tensor(1.4003, device='cuda:0') factor: tensor(0.2030, device='cuda:0')
reward_raw: tensor(0.6073, device='cuda:0') reward: tensor(1.4072, device='cuda:0') factor: tensor(0.2001, device='cuda:0')
reward_raw: tensor(0.6206, device='cuda:0') reward: tensor(1.4166, device='cuda:0') factor: tensor(0.2039, device='cuda:0')


Epoch 1, Step 1630: Loss = -0.1582, Actor Loss = -0.1848, Critic Loss = 0.0536, Entropy = 0.1334, Returns = 1.3951, Value = 1.4457, mask_loss = 0.1983std_loss = 0.3421:  30%|██▉       | 1630/5475 [7:18:11<17:07:55, 16.04s/it]

ratio tensor(1.4785, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5121, device='cuda:0') reward: tensor(1.2954, device='cuda:0') factor: tensor(0.2167, device='cuda:0')
reward_raw: tensor(0.5306, device='cuda:0') reward: tensor(1.3175, device='cuda:0') factor: tensor(0.2131, device='cuda:0')
reward_raw: tensor(0.5207, device='cuda:0') reward: tensor(1.3025, device='cuda:0') factor: tensor(0.2182, device='cuda:0')
reward_raw: tensor(0.5076, device='cuda:0') reward: tensor(1.2907, device='cuda:0') factor: tensor(0.2169, device='cuda:0')
reward_raw: tensor(0.4970, device='cuda:0') reward: tensor(1.2847, device='cuda:0') factor: tensor(0.2124, device='cuda:0')


Epoch 1, Step 1631: Loss = 0.1844, Actor Loss = 0.1484, Critic Loss = 0.0724, Entropy = 0.1898, Returns = 1.3176, Value = 1.2861, mask_loss = 0.2379std_loss = 0.3407:  30%|██▉       | 1631/5475 [7:18:29<17:35:06, 16.47s/it]  

ratio tensor(0.7322, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.3233, device='cuda:0') factor: tensor(0.1854, device='cuda:0')
reward_raw: tensor(0.5323, device='cuda:0') reward: tensor(1.3544, device='cuda:0') factor: tensor(0.1780, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.3359, device='cuda:0') factor: tensor(0.1854, device='cuda:0')
reward_raw: tensor(0.5343, device='cuda:0') reward: tensor(1.3452, device='cuda:0') factor: tensor(0.1891, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.3388, device='cuda:0') factor: tensor(0.1825, device='cuda:0')


Epoch 1, Step 1632: Loss = 0.0338, Actor Loss = -0.0053, Critic Loss = 0.0784, Entropy = 0.1043, Returns = 1.4050, Value = 1.2634, mask_loss = 0.1511std_loss = 0.3075:  30%|██▉       | 1632/5475 [7:18:51<19:30:25, 18.27s/it]

ratio tensor(1.1920, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4883, device='cuda:0') reward: tensor(1.3110, device='cuda:0') factor: tensor(0.1773, device='cuda:0')
reward_raw: tensor(0.4751, device='cuda:0') reward: tensor(1.2948, device='cuda:0') factor: tensor(0.1804, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.3129, device='cuda:0') factor: tensor(0.1782, device='cuda:0')
reward_raw: tensor(0.5183, device='cuda:0') reward: tensor(1.3413, device='cuda:0') factor: tensor(0.1770, device='cuda:0')
reward_raw: tensor(0.4859, device='cuda:0') reward: tensor(1.3065, device='cuda:0') factor: tensor(0.1795, device='cuda:0')


Epoch 1, Step 1633: Loss = -0.1385, Actor Loss = -0.1557, Critic Loss = 0.0346, Entropy = 0.1146, Returns = 1.2930, Value = 1.3370, mask_loss = 0.1858std_loss = 0.3351:  30%|██▉       | 1633/5475 [7:19:08<19:09:46, 17.96s/it]

ratio tensor(2.1091, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6107, device='cuda:0') reward: tensor(1.3975, device='cuda:0') factor: tensor(0.2132, device='cuda:0')
reward_raw: tensor(0.6143, device='cuda:0') reward: tensor(1.3921, device='cuda:0') factor: tensor(0.2223, device='cuda:0')
reward_raw: tensor(0.6224, device='cuda:0') reward: tensor(1.4032, device='cuda:0') factor: tensor(0.2192, device='cuda:0')
reward_raw: tensor(0.6202, device='cuda:0') reward: tensor(1.3992, device='cuda:0') factor: tensor(0.2210, device='cuda:0')
reward_raw: tensor(0.6052, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.2168, device='cuda:0')


Epoch 1, Step 1634: Loss = 0.1397, Actor Loss = 0.0813, Critic Loss = 0.1170, Entropy = 0.1528, Returns = 1.3874, Value = 1.4876, mask_loss = 0.2124std_loss = 0.3319:  30%|██▉       | 1634/5475 [7:19:28<19:45:45, 18.52s/it]  

ratio tensor(1.9207, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5830, device='cuda:0') reward: tensor(1.4062, device='cuda:0') factor: tensor(0.1768, device='cuda:0')
reward_raw: tensor(0.5777, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1894, device='cuda:0')
reward_raw: tensor(0.5768, device='cuda:0') reward: tensor(1.3940, device='cuda:0') factor: tensor(0.1828, device='cuda:0')
reward_raw: tensor(0.5806, device='cuda:0') reward: tensor(1.3948, device='cuda:0') factor: tensor(0.1858, device='cuda:0')
reward_raw: tensor(0.5681, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.1776, device='cuda:0')


Epoch 1, Step 1635: Loss = -0.0195, Actor Loss = -0.1008, Critic Loss = 0.1628, Entropy = 0.1142, Returns = 1.3735, Value = 1.3246, mask_loss = 0.1941std_loss = 0.3397:  30%|██▉       | 1635/5475 [7:19:46<19:34:18, 18.35s/it]

ratio tensor(1.3575, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5323, device='cuda:0') reward: tensor(1.3050, device='cuda:0') factor: tensor(0.2273, device='cuda:0')
reward_raw: tensor(0.5552, device='cuda:0') reward: tensor(1.3348, device='cuda:0') factor: tensor(0.2204, device='cuda:0')
reward_raw: tensor(0.5344, device='cuda:0') reward: tensor(1.3062, device='cuda:0') factor: tensor(0.2283, device='cuda:0')
reward_raw: tensor(0.5333, device='cuda:0') reward: tensor(1.3123, device='cuda:0') factor: tensor(0.2210, device='cuda:0')
reward_raw: tensor(0.5518, device='cuda:0') reward: tensor(1.3319, device='cuda:0') factor: tensor(0.2199, device='cuda:0')


Epoch 1, Step 1636: Loss = 0.1590, Actor Loss = 0.1177, Critic Loss = 0.0829, Entropy = 0.1349, Returns = 1.3482, Value = 1.1955, mask_loss = 0.1976std_loss = 0.3294:  30%|██▉       | 1636/5475 [7:20:05<19:36:25, 18.39s/it]  

ratio tensor(3.7989, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5057, device='cuda:0') reward: tensor(1.2850, device='cuda:0') factor: tensor(0.2207, device='cuda:0')
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.2843, device='cuda:0') factor: tensor(0.2189, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.2912, device='cuda:0') factor: tensor(0.2171, device='cuda:0')
reward_raw: tensor(0.5160, device='cuda:0') reward: tensor(1.2958, device='cuda:0') factor: tensor(0.2203, device='cuda:0')
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.2870, device='cuda:0') factor: tensor(0.2126, device='cuda:0')


Epoch 1, Step 1637: Loss = -0.0890, Actor Loss = -0.1141, Critic Loss = 0.0506, Entropy = 0.1277, Returns = 1.2507, Value = 1.2891, mask_loss = 0.2212std_loss = 0.3330:  30%|██▉       | 1637/5475 [7:20:20<18:45:21, 17.59s/it]

ratio tensor(1.0406, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.3262, device='cuda:0') factor: tensor(0.2245, device='cuda:0')
reward_raw: tensor(0.5485, device='cuda:0') reward: tensor(1.3201, device='cuda:0') factor: tensor(0.2285, device='cuda:0')
reward_raw: tensor(0.5479, device='cuda:0') reward: tensor(1.3220, device='cuda:0') factor: tensor(0.2259, device='cuda:0')
reward_raw: tensor(0.5487, device='cuda:0') reward: tensor(1.3320, device='cuda:0') factor: tensor(0.2167, device='cuda:0')
reward_raw: tensor(0.5445, device='cuda:0') reward: tensor(1.3180, device='cuda:0') factor: tensor(0.2265, device='cuda:0')


Epoch 1, Step 1638: Loss = 0.0579, Actor Loss = 0.0291, Critic Loss = 0.0578, Entropy = 0.1298, Returns = 1.3597, Value = 1.3290, mask_loss = 0.2348std_loss = 0.3573:  30%|██▉       | 1638/5475 [7:20:38<18:44:57, 17.59s/it]  

ratio tensor(0.6780, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4813, device='cuda:0') reward: tensor(1.3249, device='cuda:0') factor: tensor(0.1564, device='cuda:0')
reward_raw: tensor(0.4682, device='cuda:0') reward: tensor(1.3101, device='cuda:0') factor: tensor(0.1581, device='cuda:0')
reward_raw: tensor(0.4916, device='cuda:0') reward: tensor(1.3280, device='cuda:0') factor: tensor(0.1636, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.3286, device='cuda:0') factor: tensor(0.1625, device='cuda:0')
reward_raw: tensor(0.4693, device='cuda:0') reward: tensor(1.3128, device='cuda:0') factor: tensor(0.1565, device='cuda:0')


Epoch 1, Step 1639: Loss = 0.0861, Actor Loss = 0.0605, Critic Loss = 0.0514, Entropy = 0.1256, Returns = 1.2525, Value = 1.3011, mask_loss = 0.1688std_loss = 0.3144:  30%|██▉       | 1639/5475 [7:20:59<19:57:43, 18.73s/it]

ratio tensor(2.9960, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6433, device='cuda:0') reward: tensor(1.4478, device='cuda:0') factor: tensor(0.1956, device='cuda:0')
reward_raw: tensor(0.6414, device='cuda:0') reward: tensor(1.4404, device='cuda:0') factor: tensor(0.2010, device='cuda:0')
reward_raw: tensor(0.6496, device='cuda:0') reward: tensor(1.4600, device='cuda:0') factor: tensor(0.1896, device='cuda:0')
reward_raw: tensor(0.6596, device='cuda:0') reward: tensor(1.4578, device='cuda:0') factor: tensor(0.2018, device='cuda:0')
reward_raw: tensor(0.6439, device='cuda:0') reward: tensor(1.4448, device='cuda:0') factor: tensor(0.1991, device='cuda:0')


Epoch 1, Step 1640: Loss = -0.0796, Actor Loss = -0.1062, Critic Loss = 0.0536, Entropy = 0.1423, Returns = 1.4182, Value = 1.5141, mask_loss = 0.2381std_loss = 0.3606:  30%|██▉       | 1640/5475 [7:21:14<18:40:23, 17.53s/it]

ratio tensor(1.6242, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5561, device='cuda:0') reward: tensor(1.3852, device='cuda:0') factor: tensor(0.1709, device='cuda:0')
reward_raw: tensor(0.5472, device='cuda:0') reward: tensor(1.3797, device='cuda:0') factor: tensor(0.1675, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.3722, device='cuda:0') factor: tensor(0.1710, device='cuda:0')
reward_raw: tensor(0.5431, device='cuda:0') reward: tensor(1.3706, device='cuda:0') factor: tensor(0.1725, device='cuda:0')
reward_raw: tensor(0.5457, device='cuda:0') reward: tensor(1.3788, device='cuda:0') factor: tensor(0.1669, device='cuda:0')


Epoch 1, Step 1641: Loss = 0.1949, Actor Loss = 0.1635, Critic Loss = 0.0629, Entropy = 0.1323, Returns = 1.3465, Value = 1.4038, mask_loss = 0.2116std_loss = 0.3513:  30%|██▉       | 1640/5475 [7:21:36<18:40:23, 17.53s/it]  

ratio tensor(0.4737, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1641: Loss = 0.1949, Actor Loss = 0.1635, Critic Loss = 0.0629, Entropy = 0.1323, Returns = 1.3465, Value = 1.4038, mask_loss = 0.2116std_loss = 0.3513:  30%|██▉       | 1641/5475 [7:21:36<20:10:57, 18.95s/it]

reward_raw: tensor(0.5194, device='cuda:0') reward: tensor(1.2980, device='cuda:0') factor: tensor(0.2215, device='cuda:0')
reward_raw: tensor(0.5112, device='cuda:0') reward: tensor(1.2939, device='cuda:0') factor: tensor(0.2173, device='cuda:0')
reward_raw: tensor(0.5135, device='cuda:0') reward: tensor(1.2984, device='cuda:0') factor: tensor(0.2151, device='cuda:0')
reward_raw: tensor(0.5130, device='cuda:0') reward: tensor(1.3028, device='cuda:0') factor: tensor(0.2102, device='cuda:0')
reward_raw: tensor(0.5164, device='cuda:0') reward: tensor(1.3023, device='cuda:0') factor: tensor(0.2141, device='cuda:0')


Epoch 1, Step 1642: Loss = -0.0068, Actor Loss = -0.0440, Critic Loss = 0.0746, Entropy = 0.1159, Returns = 1.2928, Value = 1.2613, mask_loss = 0.2043std_loss = 0.3381:  30%|██▉       | 1642/5475 [7:21:58<20:52:20, 19.60s/it]

ratio tensor(1.0048, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5061, device='cuda:0') reward: tensor(1.3363, device='cuda:0') factor: tensor(0.1698, device='cuda:0')
reward_raw: tensor(0.5047, device='cuda:0') reward: tensor(1.3351, device='cuda:0') factor: tensor(0.1696, device='cuda:0')
reward_raw: tensor(0.4952, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.1692, device='cuda:0')
reward_raw: tensor(0.5125, device='cuda:0') reward: tensor(1.3377, device='cuda:0') factor: tensor(0.1748, device='cuda:0')
reward_raw: tensor(0.4961, device='cuda:0') reward: tensor(1.3187, device='cuda:0') factor: tensor(0.1774, device='cuda:0')


Epoch 1, Step 1643: Loss = 0.0323, Actor Loss = -0.0032, Critic Loss = 0.0712, Entropy = 0.1038, Returns = 1.3796, Value = 1.3446, mask_loss = 0.1743std_loss = 0.3297:  30%|███       | 1643/5475 [7:22:11<19:01:43, 17.88s/it] 

ratio tensor(1.0657, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5032, device='cuda:0') reward: tensor(1.3112, device='cuda:0') factor: tensor(0.1920, device='cuda:0')
reward_raw: tensor(0.5022, device='cuda:0') reward: tensor(1.3124, device='cuda:0') factor: tensor(0.1898, device='cuda:0')
reward_raw: tensor(0.5068, device='cuda:0') reward: tensor(1.3155, device='cuda:0') factor: tensor(0.1913, device='cuda:0')
reward_raw: tensor(0.5126, device='cuda:0') reward: tensor(1.3271, device='cuda:0') factor: tensor(0.1854, device='cuda:0')
reward_raw: tensor(0.5158, device='cuda:0') reward: tensor(1.3212, device='cuda:0') factor: tensor(0.1946, device='cuda:0')


Epoch 1, Step 1644: Loss = -0.0038, Actor Loss = -0.0294, Critic Loss = 0.0516, Entropy = 0.1302, Returns = 1.2918, Value = 1.3786, mask_loss = 0.1995std_loss = 0.3350:  30%|███       | 1644/5475 [7:22:27<18:28:02, 17.35s/it]

ratio tensor(1.3730, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6002, device='cuda:0') reward: tensor(1.4191, device='cuda:0') factor: tensor(0.1811, device='cuda:0')
reward_raw: tensor(0.6188, device='cuda:0') reward: tensor(1.4346, device='cuda:0') factor: tensor(0.1841, device='cuda:0')
reward_raw: tensor(0.6052, device='cuda:0') reward: tensor(1.4236, device='cuda:0') factor: tensor(0.1816, device='cuda:0')
reward_raw: tensor(0.5832, device='cuda:0') reward: tensor(1.4010, device='cuda:0') factor: tensor(0.1822, device='cuda:0')
reward_raw: tensor(0.5900, device='cuda:0') reward: tensor(1.4075, device='cuda:0') factor: tensor(0.1825, device='cuda:0')


Epoch 1, Step 1645: Loss = -0.0059, Actor Loss = -0.0336, Critic Loss = 0.0556, Entropy = 0.0779, Returns = 1.3885, Value = 1.3726, mask_loss = 0.1816std_loss = 0.3538:  30%|███       | 1645/5475 [7:22:41<17:17:42, 16.26s/it]

ratio tensor(2.7329, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5293, device='cuda:0') reward: tensor(1.3391, device='cuda:0') factor: tensor(0.1902, device='cuda:0')
reward_raw: tensor(0.5330, device='cuda:0') reward: tensor(1.3457, device='cuda:0') factor: tensor(0.1874, device='cuda:0')
reward_raw: tensor(0.5430, device='cuda:0') reward: tensor(1.3495, device='cuda:0') factor: tensor(0.1935, device='cuda:0')
reward_raw: tensor(0.5291, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.1938, device='cuda:0')
reward_raw: tensor(0.5319, device='cuda:0') reward: tensor(1.3431, device='cuda:0') factor: tensor(0.1888, device='cuda:0')


Epoch 1, Step 1646: Loss = 0.0674, Actor Loss = 0.0432, Critic Loss = 0.0486, Entropy = 0.1240, Returns = 1.2680, Value = 1.2623, mask_loss = 0.1672std_loss = 0.3166:  30%|███       | 1646/5475 [7:22:56<16:51:12, 15.85s/it]  

ratio tensor(0.9271, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5619, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.1876, device='cuda:0')
reward_raw: tensor(0.5702, device='cuda:0') reward: tensor(1.3820, device='cuda:0') factor: tensor(0.1883, device='cuda:0')
reward_raw: tensor(0.5624, device='cuda:0') reward: tensor(1.3749, device='cuda:0') factor: tensor(0.1875, device='cuda:0')
reward_raw: tensor(0.5577, device='cuda:0') reward: tensor(1.3626, device='cuda:0') factor: tensor(0.1951, device='cuda:0')
reward_raw: tensor(0.5482, device='cuda:0') reward: tensor(1.3621, device='cuda:0') factor: tensor(0.1861, device='cuda:0')


Epoch 1, Step 1647: Loss = -0.0511, Actor Loss = -0.0862, Critic Loss = 0.0703, Entropy = 0.1181, Returns = 1.2710, Value = 1.4133, mask_loss = 0.1918std_loss = 0.3397:  30%|███       | 1647/5475 [7:23:14<17:22:22, 16.34s/it]

ratio tensor(1.4651, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5575, device='cuda:0') reward: tensor(1.3524, device='cuda:0') factor: tensor(0.2051, device='cuda:0')
reward_raw: tensor(0.5678, device='cuda:0') reward: tensor(1.3545, device='cuda:0') factor: tensor(0.2133, device='cuda:0')
reward_raw: tensor(0.5631, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.2139, device='cuda:0')
reward_raw: tensor(0.5651, device='cuda:0') reward: tensor(1.3581, device='cuda:0') factor: tensor(0.2070, device='cuda:0')
reward_raw: tensor(0.5606, device='cuda:0') reward: tensor(1.3503, device='cuda:0') factor: tensor(0.2103, device='cuda:0')


Epoch 1, Step 1648: Loss = 0.2482, Actor Loss = 0.2239, Critic Loss = 0.0487, Entropy = 0.0974, Returns = 1.3691, Value = 1.4226, mask_loss = 0.1808std_loss = 0.3310:  30%|███       | 1648/5475 [7:23:28<16:51:23, 15.86s/it]  

ratio tensor(0.6396, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5587, device='cuda:0') reward: tensor(1.3774, device='cuda:0') factor: tensor(0.1812, device='cuda:0')
reward_raw: tensor(0.5621, device='cuda:0') reward: tensor(1.3777, device='cuda:0') factor: tensor(0.1844, device='cuda:0')
reward_raw: tensor(0.5591, device='cuda:0') reward: tensor(1.3759, device='cuda:0') factor: tensor(0.1832, device='cuda:0')
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.3700, device='cuda:0') factor: tensor(0.1803, device='cuda:0')
reward_raw: tensor(0.5553, device='cuda:0') reward: tensor(1.3735, device='cuda:0') factor: tensor(0.1818, device='cuda:0')


Epoch 1, Step 1649: Loss = -0.0431, Actor Loss = -0.0723, Critic Loss = 0.0585, Entropy = 0.0794, Returns = 1.4058, Value = 1.3203, mask_loss = 0.1682std_loss = 0.3314:  30%|███       | 1649/5475 [7:23:42<16:18:59, 15.35s/it]

ratio tensor(1.1431, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5100, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.1441, device='cuda:0')
reward_raw: tensor(0.5168, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.1465, device='cuda:0')
reward_raw: tensor(0.5210, device='cuda:0') reward: tensor(1.3766, device='cuda:0') factor: tensor(0.1444, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.1415, device='cuda:0')
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.1440, device='cuda:0')


Epoch 1, Step 1650: Loss = -0.0422, Actor Loss = -0.0674, Critic Loss = 0.0505, Entropy = 0.0734, Returns = 1.3728, Value = 1.3996, mask_loss = 0.1443std_loss = 0.3117:  30%|███       | 1650/5475 [7:23:57<16:07:32, 15.18s/it]

ratio tensor(1.0092, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5210, device='cuda:0') reward: tensor(1.3248, device='cuda:0') factor: tensor(0.1962, device='cuda:0')
reward_raw: tensor(0.5277, device='cuda:0') reward: tensor(1.3374, device='cuda:0') factor: tensor(0.1903, device='cuda:0')
reward_raw: tensor(0.5218, device='cuda:0') reward: tensor(1.3307, device='cuda:0') factor: tensor(0.1911, device='cuda:0')
reward_raw: tensor(0.5325, device='cuda:0') reward: tensor(1.3437, device='cuda:0') factor: tensor(0.1888, device='cuda:0')
reward_raw: tensor(0.5224, device='cuda:0') reward: tensor(1.3309, device='cuda:0') factor: tensor(0.1915, device='cuda:0')


Epoch 1, Step 1651: Loss = 0.1170, Actor Loss = 0.0765, Critic Loss = 0.0812, Entropy = 0.1132, Returns = 1.3139, Value = 1.2247, mask_loss = 0.1859std_loss = 0.3354:  30%|███       | 1651/5475 [7:24:14<16:28:31, 15.51s/it]  

ratio tensor(1.6461, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4683, device='cuda:0') reward: tensor(1.2947, device='cuda:0') factor: tensor(0.1736, device='cuda:0')
reward_raw: tensor(0.4811, device='cuda:0') reward: tensor(1.3068, device='cuda:0') factor: tensor(0.1743, device='cuda:0')
reward_raw: tensor(0.4667, device='cuda:0') reward: tensor(1.2961, device='cuda:0') factor: tensor(0.1706, device='cuda:0')
reward_raw: tensor(0.4742, device='cuda:0') reward: tensor(1.2990, device='cuda:0') factor: tensor(0.1752, device='cuda:0')
reward_raw: tensor(0.4737, device='cuda:0') reward: tensor(1.2948, device='cuda:0') factor: tensor(0.1789, device='cuda:0')


Epoch 1, Step 1652: Loss = 0.0232, Actor Loss = 0.0050, Critic Loss = 0.0366, Entropy = 0.1248, Returns = 1.3068, Value = 1.2884, mask_loss = 0.1886std_loss = 0.3256:  30%|███       | 1652/5475 [7:24:32<17:17:23, 16.28s/it]

ratio tensor(1.7993, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5608, device='cuda:0') reward: tensor(1.3990, device='cuda:0') factor: tensor(0.1619, device='cuda:0')
reward_raw: tensor(0.5491, device='cuda:0') reward: tensor(1.3893, device='cuda:0') factor: tensor(0.1597, device='cuda:0')
reward_raw: tensor(0.5411, device='cuda:0') reward: tensor(1.3828, device='cuda:0') factor: tensor(0.1583, device='cuda:0')
reward_raw: tensor(0.5595, device='cuda:0') reward: tensor(1.3958, device='cuda:0') factor: tensor(0.1637, device='cuda:0')
reward_raw: tensor(0.5675, device='cuda:0') reward: tensor(1.4090, device='cuda:0') factor: tensor(0.1585, device='cuda:0')


Epoch 1, Step 1653: Loss = -0.2068, Actor Loss = -0.2299, Critic Loss = 0.0464, Entropy = 0.0791, Returns = 1.3969, Value = 1.3450, mask_loss = 0.1394std_loss = 0.2982:  30%|███       | 1653/5475 [7:24:51<18:20:07, 17.27s/it]

ratio tensor(6.2866, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5748, device='cuda:0') reward: tensor(1.3960, device='cuda:0') factor: tensor(0.1788, device='cuda:0')
reward_raw: tensor(0.5727, device='cuda:0') reward: tensor(1.3977, device='cuda:0') factor: tensor(0.1750, device='cuda:0')
reward_raw: tensor(0.5697, device='cuda:0') reward: tensor(1.3968, device='cuda:0') factor: tensor(0.1730, device='cuda:0')
reward_raw: tensor(0.5789, device='cuda:0') reward: tensor(1.3973, device='cuda:0') factor: tensor(0.1815, device='cuda:0')
reward_raw: tensor(0.5742, device='cuda:0') reward: tensor(1.3931, device='cuda:0') factor: tensor(0.1811, device='cuda:0')


Epoch 1, Step 1654: Loss = -0.0778, Actor Loss = -0.0996, Critic Loss = 0.0438, Entropy = 0.1097, Returns = 1.3774, Value = 1.3948, mask_loss = 0.1588std_loss = 0.3039:  30%|███       | 1654/5475 [7:25:06<17:26:39, 16.44s/it]

ratio tensor(1.8230, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.3346, device='cuda:0') factor: tensor(0.1787, device='cuda:0')
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.3356, device='cuda:0') factor: tensor(0.1778, device='cuda:0')
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.3294, device='cuda:0') factor: tensor(0.1840, device='cuda:0')
reward_raw: tensor(0.5141, device='cuda:0') reward: tensor(1.3346, device='cuda:0') factor: tensor(0.1794, device='cuda:0')
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.3571, device='cuda:0') factor: tensor(0.1721, device='cuda:0')


Epoch 1, Step 1655: Loss = 0.4432, Actor Loss = 0.4082, Critic Loss = 0.0704, Entropy = 0.1265, Returns = 1.3419, Value = 1.3004, mask_loss = 0.2282std_loss = 0.3401:  30%|███       | 1655/5475 [7:25:22<17:24:02, 16.40s/it]  

ratio tensor(1.2037, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5178, device='cuda:0') reward: tensor(1.3079, device='cuda:0') factor: tensor(0.2099, device='cuda:0')
reward_raw: tensor(0.5136, device='cuda:0') reward: tensor(1.3075, device='cuda:0') factor: tensor(0.2061, device='cuda:0')
reward_raw: tensor(0.5167, device='cuda:0') reward: tensor(1.3065, device='cuda:0') factor: tensor(0.2103, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3027, device='cuda:0') factor: tensor(0.2067, device='cuda:0')
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.3160, device='cuda:0') factor: tensor(0.2054, device='cuda:0')


Epoch 1, Step 1656: Loss = -0.0540, Actor Loss = -0.0779, Critic Loss = 0.0481, Entropy = 0.1494, Returns = 1.2981, Value = 1.2863, mask_loss = 0.2414std_loss = 0.3663:  30%|███       | 1656/5475 [7:25:37<16:50:48, 15.88s/it]

ratio tensor(0.9904, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4876, device='cuda:0') reward: tensor(1.3099, device='cuda:0') factor: tensor(0.1777, device='cuda:0')
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.3325, device='cuda:0') factor: tensor(0.1777, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.3364, device='cuda:0') factor: tensor(0.1806, device='cuda:0')
reward_raw: tensor(0.4964, device='cuda:0') reward: tensor(1.3232, device='cuda:0') factor: tensor(0.1732, device='cuda:0')
reward_raw: tensor(0.4920, device='cuda:0') reward: tensor(1.3162, device='cuda:0') factor: tensor(0.1758, device='cuda:0')


Epoch 1, Step 1657: Loss = -0.1150, Actor Loss = -0.1317, Critic Loss = 0.0336, Entropy = 0.1205, Returns = 1.2844, Value = 1.3119, mask_loss = 0.1680std_loss = 0.3206:  30%|███       | 1657/5475 [7:25:50<16:08:50, 15.23s/it]

ratio tensor(1.8560, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5614, device='cuda:0') reward: tensor(1.3674, device='cuda:0') factor: tensor(0.1941, device='cuda:0')
reward_raw: tensor(0.5797, device='cuda:0') reward: tensor(1.3791, device='cuda:0') factor: tensor(0.2006, device='cuda:0')
reward_raw: tensor(0.5786, device='cuda:0') reward: tensor(1.3704, device='cuda:0') factor: tensor(0.2082, device='cuda:0')
reward_raw: tensor(0.5728, device='cuda:0') reward: tensor(1.3667, device='cuda:0') factor: tensor(0.2061, device='cuda:0')
reward_raw: tensor(0.5751, device='cuda:0') reward: tensor(1.3752, device='cuda:0') factor: tensor(0.1998, device='cuda:0')


Epoch 1, Step 1658: Loss = 0.0149, Actor Loss = -0.0324, Critic Loss = 0.0949, Entropy = 0.1248, Returns = 1.4309, Value = 1.2974, mask_loss = 0.1883std_loss = 0.3315:  30%|███       | 1658/5475 [7:26:07<16:44:15, 15.79s/it] 

ratio tensor(1.0580, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5128, device='cuda:0') reward: tensor(1.2998, device='cuda:0') factor: tensor(0.2130, device='cuda:0')
reward_raw: tensor(0.5087, device='cuda:0') reward: tensor(1.2987, device='cuda:0') factor: tensor(0.2100, device='cuda:0')
reward_raw: tensor(0.5167, device='cuda:0') reward: tensor(1.3113, device='cuda:0') factor: tensor(0.2054, device='cuda:0')
reward_raw: tensor(0.5114, device='cuda:0') reward: tensor(1.3030, device='cuda:0') factor: tensor(0.2084, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.3010, device='cuda:0') factor: tensor(0.2072, device='cuda:0')


Epoch 1, Step 1659: Loss = 0.0752, Actor Loss = 0.0312, Critic Loss = 0.0882, Entropy = 0.1141, Returns = 1.2956, Value = 1.3445, mask_loss = 0.1603std_loss = 0.3142:  30%|███       | 1659/5475 [7:26:21<15:57:26, 15.05s/it] 

ratio tensor(2.6513, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5533, device='cuda:0') reward: tensor(1.3472, device='cuda:0') factor: tensor(0.2061, device='cuda:0')
reward_raw: tensor(0.5788, device='cuda:0') reward: tensor(1.3711, device='cuda:0') factor: tensor(0.2077, device='cuda:0')
reward_raw: tensor(0.5649, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.2031, device='cuda:0')
reward_raw: tensor(0.5727, device='cuda:0') reward: tensor(1.3719, device='cuda:0') factor: tensor(0.2008, device='cuda:0')
reward_raw: tensor(0.5632, device='cuda:0') reward: tensor(1.3604, device='cuda:0') factor: tensor(0.2028, device='cuda:0')


Epoch 1, Step 1660: Loss = -0.0365, Actor Loss = -0.0744, Critic Loss = 0.0761, Entropy = 0.1097, Returns = 1.3492, Value = 1.4596, mask_loss = 0.1988std_loss = 0.3458:  30%|███       | 1660/5475 [7:26:34<15:30:27, 14.63s/it]

ratio tensor(1.3197, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5421, device='cuda:0') reward: tensor(1.3764, device='cuda:0') factor: tensor(0.1658, device='cuda:0')
reward_raw: tensor(0.5357, device='cuda:0') reward: tensor(1.3754, device='cuda:0') factor: tensor(0.1602, device='cuda:0')
reward_raw: tensor(0.5530, device='cuda:0') reward: tensor(1.3840, device='cuda:0') factor: tensor(0.1690, device='cuda:0')
reward_raw: tensor(0.5515, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.1662, device='cuda:0')
reward_raw: tensor(0.5398, device='cuda:0') reward: tensor(1.3706, device='cuda:0') factor: tensor(0.1692, device='cuda:0')


Epoch 1, Step 1661: Loss = 0.0163, Actor Loss = -0.0034, Critic Loss = 0.0397, Entropy = 0.1241, Returns = 1.3804, Value = 1.3797, mask_loss = 0.1808std_loss = 0.3127:  30%|███       | 1660/5475 [7:26:50<15:30:27, 14.63s/it] 

ratio tensor(1.0852, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1661: Loss = 0.0163, Actor Loss = -0.0034, Critic Loss = 0.0397, Entropy = 0.1241, Returns = 1.3804, Value = 1.3797, mask_loss = 0.1808std_loss = 0.3127:  30%|███       | 1661/5475 [7:26:50<15:47:28, 14.91s/it]

reward_raw: tensor(0.5687, device='cuda:0') reward: tensor(1.4047, device='cuda:0') factor: tensor(0.1641, device='cuda:0')
reward_raw: tensor(0.5618, device='cuda:0') reward: tensor(1.3943, device='cuda:0') factor: tensor(0.1675, device='cuda:0')
reward_raw: tensor(0.5414, device='cuda:0') reward: tensor(1.3749, device='cuda:0') factor: tensor(0.1665, device='cuda:0')
reward_raw: tensor(0.5720, device='cuda:0') reward: tensor(1.4108, device='cuda:0') factor: tensor(0.1613, device='cuda:0')
reward_raw: tensor(0.5687, device='cuda:0') reward: tensor(1.4027, device='cuda:0') factor: tensor(0.1660, device='cuda:0')


Epoch 1, Step 1662: Loss = -0.1484, Actor Loss = -0.1819, Critic Loss = 0.0670, Entropy = 0.1021, Returns = 1.4121, Value = 1.2776, mask_loss = 0.1593std_loss = 0.3196:  30%|███       | 1662/5475 [7:27:03<15:17:37, 14.44s/it]

ratio tensor(1.4762, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6250, device='cuda:0') reward: tensor(1.4529, device='cuda:0') factor: tensor(0.1721, device='cuda:0')
reward_raw: tensor(0.6178, device='cuda:0') reward: tensor(1.4488, device='cuda:0') factor: tensor(0.1689, device='cuda:0')
reward_raw: tensor(0.6268, device='cuda:0') reward: tensor(1.4641, device='cuda:0') factor: tensor(0.1628, device='cuda:0')
reward_raw: tensor(0.6257, device='cuda:0') reward: tensor(1.4543, device='cuda:0') factor: tensor(0.1713, device='cuda:0')
reward_raw: tensor(0.6277, device='cuda:0') reward: tensor(1.4609, device='cuda:0') factor: tensor(0.1669, device='cuda:0')


Epoch 1, Step 1663: Loss = 0.0090, Actor Loss = -0.0001, Critic Loss = 0.0184, Entropy = 0.1085, Returns = 1.4311, Value = 1.4904, mask_loss = 0.1590std_loss = 0.3163:  30%|███       | 1663/5475 [7:27:23<16:56:50, 16.00s/it] 

ratio tensor(0.8858, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5347, device='cuda:0') reward: tensor(1.3859, device='cuda:0') factor: tensor(0.1489, device='cuda:0')
reward_raw: tensor(0.5555, device='cuda:0') reward: tensor(1.4056, device='cuda:0') factor: tensor(0.1498, device='cuda:0')
reward_raw: tensor(0.5518, device='cuda:0') reward: tensor(1.4021, device='cuda:0') factor: tensor(0.1497, device='cuda:0')
reward_raw: tensor(0.5338, device='cuda:0') reward: tensor(1.3897, device='cuda:0') factor: tensor(0.1441, device='cuda:0')
reward_raw: tensor(0.5433, device='cuda:0') reward: tensor(1.3939, device='cuda:0') factor: tensor(0.1494, device='cuda:0')


Epoch 1, Step 1664: Loss = 0.3719, Actor Loss = 0.3229, Critic Loss = 0.0982, Entropy = 0.1050, Returns = 1.3027, Value = 1.3283, mask_loss = 0.1556std_loss = 0.3032:  30%|███       | 1664/5475 [7:27:38<16:43:14, 15.79s/it] 

ratio tensor(1.6935, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5560, device='cuda:0') reward: tensor(1.3496, device='cuda:0') factor: tensor(0.2064, device='cuda:0')
reward_raw: tensor(0.5524, device='cuda:0') reward: tensor(1.3378, device='cuda:0') factor: tensor(0.2146, device='cuda:0')
reward_raw: tensor(0.5646, device='cuda:0') reward: tensor(1.3557, device='cuda:0') factor: tensor(0.2089, device='cuda:0')
reward_raw: tensor(0.5739, device='cuda:0') reward: tensor(1.3602, device='cuda:0') factor: tensor(0.2137, device='cuda:0')
reward_raw: tensor(0.5542, device='cuda:0') reward: tensor(1.3496, device='cuda:0') factor: tensor(0.2046, device='cuda:0')


Epoch 1, Step 1665: Loss = 0.0030, Actor Loss = -0.0190, Critic Loss = 0.0441, Entropy = 0.1107, Returns = 1.3936, Value = 1.3315, mask_loss = 0.2137std_loss = 0.3520:  30%|███       | 1665/5475 [7:27:52<16:04:32, 15.19s/it]

ratio tensor(0.7285, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5123, device='cuda:0') reward: tensor(1.3515, device='cuda:0') factor: tensor(0.1609, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.3476, device='cuda:0') factor: tensor(0.1612, device='cuda:0')
reward_raw: tensor(0.5184, device='cuda:0') reward: tensor(1.3489, device='cuda:0') factor: tensor(0.1695, device='cuda:0')
reward_raw: tensor(0.4952, device='cuda:0') reward: tensor(1.3358, device='cuda:0') factor: tensor(0.1594, device='cuda:0')
reward_raw: tensor(0.5097, device='cuda:0') reward: tensor(1.3484, device='cuda:0') factor: tensor(0.1613, device='cuda:0')


Epoch 1, Step 1666: Loss = 0.0295, Actor Loss = 0.0100, Critic Loss = 0.0392, Entropy = 0.0924, Returns = 1.3267, Value = 1.4123, mask_loss = 0.1531std_loss = 0.3176:  30%|███       | 1666/5475 [7:28:06<15:39:04, 14.79s/it] 

ratio tensor(0.9257, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5096, device='cuda:0') reward: tensor(1.3364, device='cuda:0') factor: tensor(0.1732, device='cuda:0')
reward_raw: tensor(0.4915, device='cuda:0') reward: tensor(1.3172, device='cuda:0') factor: tensor(0.1742, device='cuda:0')
reward_raw: tensor(0.5150, device='cuda:0') reward: tensor(1.3390, device='cuda:0') factor: tensor(0.1760, device='cuda:0')
reward_raw: tensor(0.5044, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.1691, device='cuda:0')
reward_raw: tensor(0.5079, device='cuda:0') reward: tensor(1.3331, device='cuda:0') factor: tensor(0.1748, device='cuda:0')


Epoch 1, Step 1667: Loss = 0.0908, Actor Loss = 0.0594, Critic Loss = 0.0630, Entropy = 0.1114, Returns = 1.2720, Value = 1.2940, mask_loss = 0.1707std_loss = 0.3260:  30%|███       | 1667/5475 [7:28:19<15:11:29, 14.36s/it]

ratio tensor(1.1161, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6986, device='cuda:0') reward: tensor(1.5309, device='cuda:0') factor: tensor(0.1677, device='cuda:0')
reward_raw: tensor(0.6915, device='cuda:0') reward: tensor(1.5248, device='cuda:0') factor: tensor(0.1667, device='cuda:0')
reward_raw: tensor(0.6796, device='cuda:0') reward: tensor(1.5045, device='cuda:0') factor: tensor(0.1751, device='cuda:0')
reward_raw: tensor(0.6787, device='cuda:0') reward: tensor(1.5111, device='cuda:0') factor: tensor(0.1676, device='cuda:0')
reward_raw: tensor(0.6800, device='cuda:0') reward: tensor(1.5122, device='cuda:0') factor: tensor(0.1678, device='cuda:0')


Epoch 1, Step 1668: Loss = -0.1139, Actor Loss = -0.1368, Critic Loss = 0.0459, Entropy = 0.1051, Returns = 1.5082, Value = 1.6400, mask_loss = 0.1690std_loss = 0.3272:  30%|███       | 1668/5475 [7:28:33<14:57:08, 14.14s/it]

ratio tensor(1.4287, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5651, device='cuda:0') reward: tensor(1.3868, device='cuda:0') factor: tensor(0.1784, device='cuda:0')
reward_raw: tensor(0.5833, device='cuda:0') reward: tensor(1.4073, device='cuda:0') factor: tensor(0.1761, device='cuda:0')
reward_raw: tensor(0.5907, device='cuda:0') reward: tensor(1.4133, device='cuda:0') factor: tensor(0.1774, device='cuda:0')
reward_raw: tensor(0.5647, device='cuda:0') reward: tensor(1.3895, device='cuda:0') factor: tensor(0.1752, device='cuda:0')
reward_raw: tensor(0.5585, device='cuda:0') reward: tensor(1.3816, device='cuda:0') factor: tensor(0.1769, device='cuda:0')


Epoch 1, Step 1669: Loss = 0.1373, Actor Loss = 0.1177, Critic Loss = 0.0395, Entropy = 0.0936, Returns = 1.3826, Value = 1.4070, mask_loss = 0.1463std_loss = 0.3046:  30%|███       | 1669/5475 [7:28:54<17:05:52, 16.17s/it]  

ratio tensor(1.2162, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5916, device='cuda:0') reward: tensor(1.4265, device='cuda:0') factor: tensor(0.1651, device='cuda:0')
reward_raw: tensor(0.5982, device='cuda:0') reward: tensor(1.4289, device='cuda:0') factor: tensor(0.1693, device='cuda:0')
reward_raw: tensor(0.6005, device='cuda:0') reward: tensor(1.4336, device='cuda:0') factor: tensor(0.1668, device='cuda:0')
reward_raw: tensor(0.5873, device='cuda:0') reward: tensor(1.4243, device='cuda:0') factor: tensor(0.1629, device='cuda:0')
reward_raw: tensor(0.5939, device='cuda:0') reward: tensor(1.4323, device='cuda:0') factor: tensor(0.1616, device='cuda:0')


Epoch 1, Step 1670: Loss = 0.0754, Actor Loss = 0.0524, Critic Loss = 0.0461, Entropy = 0.0739, Returns = 1.3543, Value = 1.2738, mask_loss = 0.1273std_loss = 0.2994:  31%|███       | 1670/5475 [7:29:08<16:26:33, 15.56s/it]

ratio tensor(0.6371, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5387, device='cuda:0') reward: tensor(1.3784, device='cuda:0') factor: tensor(0.1603, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.3541, device='cuda:0') factor: tensor(0.1541, device='cuda:0')
reward_raw: tensor(0.5427, device='cuda:0') reward: tensor(1.3820, device='cuda:0') factor: tensor(0.1607, device='cuda:0')
reward_raw: tensor(0.5327, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.1549, device='cuda:0')
reward_raw: tensor(0.5271, device='cuda:0') reward: tensor(1.3702, device='cuda:0') factor: tensor(0.1569, device='cuda:0')


Epoch 1, Step 1671: Loss = -0.1446, Actor Loss = -0.1648, Critic Loss = 0.0406, Entropy = 0.0723, Returns = 1.4570, Value = 1.4485, mask_loss = 0.1284std_loss = 0.2973:  31%|███       | 1671/5475 [7:29:27<17:25:16, 16.49s/it]

ratio tensor(1.3981, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4909, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.1667, device='cuda:0')
reward_raw: tensor(0.5018, device='cuda:0') reward: tensor(1.3343, device='cuda:0') factor: tensor(0.1675, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.3364, device='cuda:0') factor: tensor(0.1658, device='cuda:0')
reward_raw: tensor(0.4785, device='cuda:0') reward: tensor(1.3101, device='cuda:0') factor: tensor(0.1684, device='cuda:0')
reward_raw: tensor(0.4863, device='cuda:0') reward: tensor(1.3163, device='cuda:0') factor: tensor(0.1700, device='cuda:0')


Epoch 1, Step 1672: Loss = 0.3397, Actor Loss = 0.3177, Critic Loss = 0.0443, Entropy = 0.1009, Returns = 1.3355, Value = 1.2727, mask_loss = 0.1828std_loss = 0.3369:  31%|███       | 1672/5475 [7:29:44<17:50:14, 16.89s/it]  

ratio tensor(1.2495, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6010, device='cuda:0') reward: tensor(1.4401, device='cuda:0') factor: tensor(0.1609, device='cuda:0')
reward_raw: tensor(0.5977, device='cuda:0') reward: tensor(1.4463, device='cuda:0') factor: tensor(0.1514, device='cuda:0')
reward_raw: tensor(0.5946, device='cuda:0') reward: tensor(1.4395, device='cuda:0') factor: tensor(0.1552, device='cuda:0')
reward_raw: tensor(0.5974, device='cuda:0') reward: tensor(1.4427, device='cuda:0') factor: tensor(0.1547, device='cuda:0')
reward_raw: tensor(0.5983, device='cuda:0') reward: tensor(1.4426, device='cuda:0') factor: tensor(0.1557, device='cuda:0')


Epoch 1, Step 1673: Loss = -0.1345, Actor Loss = -0.1514, Critic Loss = 0.0339, Entropy = 0.0932, Returns = 1.4493, Value = 1.4840, mask_loss = 0.1645std_loss = 0.3197:  31%|███       | 1673/5475 [7:29:59<17:13:56, 16.32s/it]

ratio tensor(1.5493, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5537, device='cuda:0') reward: tensor(1.4104, device='cuda:0') factor: tensor(0.1433, device='cuda:0')
reward_raw: tensor(0.5256, device='cuda:0') reward: tensor(1.3876, device='cuda:0') factor: tensor(0.1380, device='cuda:0')
reward_raw: tensor(0.5537, device='cuda:0') reward: tensor(1.4102, device='cuda:0') factor: tensor(0.1435, device='cuda:0')
reward_raw: tensor(0.5550, device='cuda:0') reward: tensor(1.4104, device='cuda:0') factor: tensor(0.1446, device='cuda:0')
reward_raw: tensor(0.5559, device='cuda:0') reward: tensor(1.4219, device='cuda:0') factor: tensor(0.1340, device='cuda:0')


Epoch 1, Step 1674: Loss = -0.0049, Actor Loss = -0.0389, Critic Loss = 0.0681, Entropy = 0.1156, Returns = 1.4059, Value = 1.5028, mask_loss = 0.1652std_loss = 0.3120:  31%|███       | 1674/5475 [7:30:13<16:14:48, 15.39s/it]

ratio tensor(0.8874, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5302, device='cuda:0') reward: tensor(1.3684, device='cuda:0') factor: tensor(0.1618, device='cuda:0')
reward_raw: tensor(0.5319, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1551, device='cuda:0')
reward_raw: tensor(0.5259, device='cuda:0') reward: tensor(1.3655, device='cuda:0') factor: tensor(0.1603, device='cuda:0')
reward_raw: tensor(0.5239, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.1593, device='cuda:0')
reward_raw: tensor(0.5304, device='cuda:0') reward: tensor(1.3708, device='cuda:0') factor: tensor(0.1596, device='cuda:0')


Epoch 1, Step 1675: Loss = 0.3032, Actor Loss = 0.2818, Critic Loss = 0.0431, Entropy = 0.1226, Returns = 1.3009, Value = 1.3333, mask_loss = 0.1876std_loss = 0.3346:  31%|███       | 1675/5475 [7:30:27<15:48:02, 14.97s/it]  

ratio tensor(0.4853, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5119, device='cuda:0') reward: tensor(1.3550, device='cuda:0') factor: tensor(0.1568, device='cuda:0')
reward_raw: tensor(0.5159, device='cuda:0') reward: tensor(1.3555, device='cuda:0') factor: tensor(0.1604, device='cuda:0')
reward_raw: tensor(0.5122, device='cuda:0') reward: tensor(1.3564, device='cuda:0') factor: tensor(0.1558, device='cuda:0')
reward_raw: tensor(0.5062, device='cuda:0') reward: tensor(1.3445, device='cuda:0') factor: tensor(0.1617, device='cuda:0')
reward_raw: tensor(0.5293, device='cuda:0') reward: tensor(1.3672, device='cuda:0') factor: tensor(0.1621, device='cuda:0')


Epoch 1, Step 1676: Loss = -0.1029, Actor Loss = -0.1346, Critic Loss = 0.0635, Entropy = 0.0856, Returns = 1.3472, Value = 1.2941, mask_loss = 0.1572std_loss = 0.3196:  31%|███       | 1676/5475 [7:30:40<15:15:52, 14.46s/it]

ratio tensor(1.5799, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.4024, device='cuda:0') factor: tensor(0.1539, device='cuda:0')
reward_raw: tensor(0.5560, device='cuda:0') reward: tensor(1.4013, device='cuda:0') factor: tensor(0.1547, device='cuda:0')
reward_raw: tensor(0.5772, device='cuda:0') reward: tensor(1.4212, device='cuda:0') factor: tensor(0.1559, device='cuda:0')
reward_raw: tensor(0.5426, device='cuda:0') reward: tensor(1.3858, device='cuda:0') factor: tensor(0.1569, device='cuda:0')
reward_raw: tensor(0.5612, device='cuda:0') reward: tensor(1.4013, device='cuda:0') factor: tensor(0.1598, device='cuda:0')


Epoch 1, Step 1677: Loss = -0.1976, Actor Loss = -0.2209, Critic Loss = 0.0468, Entropy = 0.0924, Returns = 1.4498, Value = 1.3983, mask_loss = 0.1608std_loss = 0.3238:  31%|███       | 1677/5475 [7:30:53<14:55:33, 14.15s/it]

ratio tensor(1.7297, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5689, device='cuda:0') reward: tensor(1.4278, device='cuda:0') factor: tensor(0.1411, device='cuda:0')
reward_raw: tensor(0.5512, device='cuda:0') reward: tensor(1.4080, device='cuda:0') factor: tensor(0.1432, device='cuda:0')
reward_raw: tensor(0.5432, device='cuda:0') reward: tensor(1.4078, device='cuda:0') factor: tensor(0.1353, device='cuda:0')
reward_raw: tensor(0.5499, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.1519, device='cuda:0')
reward_raw: tensor(0.5586, device='cuda:0') reward: tensor(1.4093, device='cuda:0') factor: tensor(0.1492, device='cuda:0')


Epoch 1, Step 1678: Loss = 0.1205, Actor Loss = 0.0748, Critic Loss = 0.0916, Entropy = 0.0899, Returns = 1.2910, Value = 1.3671, mask_loss = 0.1450std_loss = 0.3066:  31%|███       | 1678/5475 [7:31:12<16:28:19, 15.62s/it]  

ratio tensor(1.0049, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5221, device='cuda:0') reward: tensor(1.3853, device='cuda:0') factor: tensor(0.1368, device='cuda:0')
reward_raw: tensor(0.5186, device='cuda:0') reward: tensor(1.3889, device='cuda:0') factor: tensor(0.1296, device='cuda:0')
reward_raw: tensor(0.5333, device='cuda:0') reward: tensor(1.3938, device='cuda:0') factor: tensor(0.1394, device='cuda:0')
reward_raw: tensor(0.5199, device='cuda:0') reward: tensor(1.3839, device='cuda:0') factor: tensor(0.1359, device='cuda:0')
reward_raw: tensor(0.5249, device='cuda:0') reward: tensor(1.3912, device='cuda:0') factor: tensor(0.1337, device='cuda:0')


Epoch 1, Step 1679: Loss = 0.0710, Actor Loss = 0.0447, Critic Loss = 0.0527, Entropy = 0.0884, Returns = 1.3768, Value = 1.3593, mask_loss = 0.1460std_loss = 0.3103:  31%|███       | 1679/5475 [7:31:26<15:49:17, 15.00s/it]

ratio tensor(0.8132, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5141, device='cuda:0') reward: tensor(1.3776, device='cuda:0') factor: tensor(0.1365, device='cuda:0')
reward_raw: tensor(0.5086, device='cuda:0') reward: tensor(1.3667, device='cuda:0') factor: tensor(0.1419, device='cuda:0')
reward_raw: tensor(0.4924, device='cuda:0') reward: tensor(1.3555, device='cuda:0') factor: tensor(0.1369, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.3859, device='cuda:0') factor: tensor(0.1354, device='cuda:0')
reward_raw: tensor(0.5165, device='cuda:0') reward: tensor(1.3812, device='cuda:0') factor: tensor(0.1353, device='cuda:0')


Epoch 1, Step 1680: Loss = -0.0045, Actor Loss = -0.0346, Critic Loss = 0.0604, Entropy = 0.0810, Returns = 1.3994, Value = 1.3726, mask_loss = 0.1611std_loss = 0.3198:  31%|███       | 1680/5475 [7:31:44<16:45:49, 15.90s/it]

ratio tensor(0.8012, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6088, device='cuda:0') reward: tensor(1.4421, device='cuda:0') factor: tensor(0.1667, device='cuda:0')
reward_raw: tensor(0.6102, device='cuda:0') reward: tensor(1.4464, device='cuda:0') factor: tensor(0.1638, device='cuda:0')
reward_raw: tensor(0.6081, device='cuda:0') reward: tensor(1.4416, device='cuda:0') factor: tensor(0.1665, device='cuda:0')
reward_raw: tensor(0.5981, device='cuda:0') reward: tensor(1.4323, device='cuda:0') factor: tensor(0.1658, device='cuda:0')
reward_raw: tensor(0.6172, device='cuda:0') reward: tensor(1.4563, device='cuda:0') factor: tensor(0.1609, device='cuda:0')


Epoch 1, Step 1681: Loss = -0.0907, Actor Loss = -0.1287, Critic Loss = 0.0763, Entropy = 0.0839, Returns = 1.4021, Value = 1.4584, mask_loss = 0.1526std_loss = 0.3142:  31%|███       | 1680/5475 [7:31:56<16:45:49, 15.90s/it]

ratio tensor(0.9297, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1681: Loss = -0.0907, Actor Loss = -0.1287, Critic Loss = 0.0763, Entropy = 0.0839, Returns = 1.4021, Value = 1.4584, mask_loss = 0.1526std_loss = 0.3142:  31%|███       | 1681/5475 [7:31:56<15:40:52, 14.88s/it]

reward_raw: tensor(0.4800, device='cuda:0') reward: tensor(1.3497, device='cuda:0') factor: tensor(0.1302, device='cuda:0')
reward_raw: tensor(0.4793, device='cuda:0') reward: tensor(1.3485, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.5039, device='cuda:0') reward: tensor(1.3733, device='cuda:0') factor: tensor(0.1306, device='cuda:0')
reward_raw: tensor(0.4938, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.1245, device='cuda:0')
reward_raw: tensor(0.4534, device='cuda:0') reward: tensor(1.3200, device='cuda:0') factor: tensor(0.1334, device='cuda:0')


Epoch 1, Step 1682: Loss = 0.0693, Actor Loss = 0.0192, Critic Loss = 0.1004, Entropy = 0.0738, Returns = 1.3990, Value = 1.3559, mask_loss = 0.1379std_loss = 0.2963:  31%|███       | 1682/5475 [7:32:11<15:40:47, 14.88s/it]  

ratio tensor(0.9458, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5089, device='cuda:0') reward: tensor(1.3538, device='cuda:0') factor: tensor(0.1550, device='cuda:0')
reward_raw: tensor(0.5056, device='cuda:0') reward: tensor(1.3502, device='cuda:0') factor: tensor(0.1554, device='cuda:0')
reward_raw: tensor(0.5039, device='cuda:0') reward: tensor(1.3516, device='cuda:0') factor: tensor(0.1522, device='cuda:0')
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.1515, device='cuda:0')
reward_raw: tensor(0.5053, device='cuda:0') reward: tensor(1.3555, device='cuda:0') factor: tensor(0.1498, device='cuda:0')


Epoch 1, Step 1683: Loss = 0.0683, Actor Loss = 0.0336, Critic Loss = 0.0696, Entropy = 0.0788, Returns = 1.3184, Value = 1.3131, mask_loss = 0.1601std_loss = 0.3254:  31%|███       | 1683/5475 [7:32:25<15:23:58, 14.62s/it]

ratio tensor(0.8877, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6094, device='cuda:0') reward: tensor(1.4644, device='cuda:0') factor: tensor(0.1451, device='cuda:0')
reward_raw: tensor(0.6105, device='cuda:0') reward: tensor(1.4638, device='cuda:0') factor: tensor(0.1467, device='cuda:0')
reward_raw: tensor(0.6245, device='cuda:0') reward: tensor(1.4826, device='cuda:0') factor: tensor(0.1419, device='cuda:0')
reward_raw: tensor(0.6033, device='cuda:0') reward: tensor(1.4583, device='cuda:0') factor: tensor(0.1450, device='cuda:0')
reward_raw: tensor(0.6028, device='cuda:0') reward: tensor(1.4572, device='cuda:0') factor: tensor(0.1456, device='cuda:0')


Epoch 1, Step 1684: Loss = -0.1636, Actor Loss = -0.1911, Critic Loss = 0.0552, Entropy = 0.0630, Returns = 1.4651, Value = 1.5202, mask_loss = 0.1412std_loss = 0.3190:  31%|███       | 1684/5475 [7:32:41<15:49:25, 15.03s/it]

ratio tensor(1.3784, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4252, device='cuda:0') reward: tensor(1.2648, device='cuda:0') factor: tensor(0.1604, device='cuda:0')
reward_raw: tensor(0.4363, device='cuda:0') reward: tensor(1.2791, device='cuda:0') factor: tensor(0.1571, device='cuda:0')
reward_raw: tensor(0.4469, device='cuda:0') reward: tensor(1.2849, device='cuda:0') factor: tensor(0.1620, device='cuda:0')
reward_raw: tensor(0.4460, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.1605, device='cuda:0')
reward_raw: tensor(0.4358, device='cuda:0') reward: tensor(1.2783, device='cuda:0') factor: tensor(0.1575, device='cuda:0')


Epoch 1, Step 1685: Loss = 0.2686, Actor Loss = 0.2484, Critic Loss = 0.0407, Entropy = 0.1005, Returns = 1.2902, Value = 1.3624, mask_loss = 0.1661std_loss = 0.3257:  31%|███       | 1685/5475 [7:32:57<16:04:39, 15.27s/it]  

ratio tensor(0.7078, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5146, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1498, device='cuda:0')
reward_raw: tensor(0.4924, device='cuda:0') reward: tensor(1.3411, device='cuda:0') factor: tensor(0.1513, device='cuda:0')
reward_raw: tensor(0.5113, device='cuda:0') reward: tensor(1.3623, device='cuda:0') factor: tensor(0.1490, device='cuda:0')
reward_raw: tensor(0.5228, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.1484, device='cuda:0')
reward_raw: tensor(0.5413, device='cuda:0') reward: tensor(1.3906, device='cuda:0') factor: tensor(0.1508, device='cuda:0')


Epoch 1, Step 1686: Loss = -0.0316, Actor Loss = -0.0630, Critic Loss = 0.0630, Entropy = 0.0832, Returns = 1.4324, Value = 1.3498, mask_loss = 0.1566std_loss = 0.3208:  31%|███       | 1686/5475 [7:33:13<16:07:20, 15.32s/it]

ratio tensor(1.0387, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4284, device='cuda:0') reward: tensor(1.3035, device='cuda:0') factor: tensor(0.1249, device='cuda:0')
reward_raw: tensor(0.4025, device='cuda:0') reward: tensor(1.2775, device='cuda:0') factor: tensor(0.1250, device='cuda:0')
reward_raw: tensor(0.4131, device='cuda:0') reward: tensor(1.2878, device='cuda:0') factor: tensor(0.1253, device='cuda:0')
reward_raw: tensor(0.3928, device='cuda:0') reward: tensor(1.2674, device='cuda:0') factor: tensor(0.1254, device='cuda:0')
reward_raw: tensor(0.4165, device='cuda:0') reward: tensor(1.2912, device='cuda:0') factor: tensor(0.1253, device='cuda:0')


Epoch 1, Step 1687: Loss = 0.0299, Actor Loss = 0.0072, Critic Loss = 0.0456, Entropy = 0.0828, Returns = 1.2622, Value = 1.3764, mask_loss = 0.1282std_loss = 0.2934:  31%|███       | 1687/5475 [7:33:28<16:16:39, 15.47s/it]  

ratio tensor(0.9425, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5268, device='cuda:0') reward: tensor(1.3785, device='cuda:0') factor: tensor(0.1483, device='cuda:0')
reward_raw: tensor(0.5545, device='cuda:0') reward: tensor(1.4021, device='cuda:0') factor: tensor(0.1524, device='cuda:0')
reward_raw: tensor(0.5401, device='cuda:0') reward: tensor(1.3883, device='cuda:0') factor: tensor(0.1518, device='cuda:0')
reward_raw: tensor(0.5143, device='cuda:0') reward: tensor(1.3671, device='cuda:0') factor: tensor(0.1472, device='cuda:0')
reward_raw: tensor(0.5490, device='cuda:0') reward: tensor(1.3989, device='cuda:0') factor: tensor(0.1501, device='cuda:0')


Epoch 1, Step 1688: Loss = 0.0616, Actor Loss = 0.0276, Critic Loss = 0.0681, Entropy = 0.0718, Returns = 1.3324, Value = 1.2735, mask_loss = 0.1240std_loss = 0.2875:  31%|███       | 1688/5475 [7:33:49<17:55:28, 17.04s/it]

ratio tensor(0.8094, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5648, device='cuda:0') reward: tensor(1.4393, device='cuda:0') factor: tensor(0.1255, device='cuda:0')
reward_raw: tensor(0.5854, device='cuda:0') reward: tensor(1.4610, device='cuda:0') factor: tensor(0.1244, device='cuda:0')
reward_raw: tensor(0.5698, device='cuda:0') reward: tensor(1.4496, device='cuda:0') factor: tensor(0.1202, device='cuda:0')
reward_raw: tensor(0.5904, device='cuda:0') reward: tensor(1.4671, device='cuda:0') factor: tensor(0.1233, device='cuda:0')
reward_raw: tensor(0.5583, device='cuda:0') reward: tensor(1.4305, device='cuda:0') factor: tensor(0.1278, device='cuda:0')


Epoch 1, Step 1689: Loss = -0.1104, Actor Loss = -0.1553, Critic Loss = 0.0898, Entropy = 0.0542, Returns = 1.3818, Value = 1.4128, mask_loss = 0.1160std_loss = 0.2924:  31%|███       | 1689/5475 [7:34:05<17:33:08, 16.69s/it]

ratio tensor(1.0362, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4923, device='cuda:0') reward: tensor(1.3570, device='cuda:0') factor: tensor(0.1354, device='cuda:0')
reward_raw: tensor(0.4899, device='cuda:0') reward: tensor(1.3543, device='cuda:0') factor: tensor(0.1356, device='cuda:0')
reward_raw: tensor(0.4903, device='cuda:0') reward: tensor(1.3586, device='cuda:0') factor: tensor(0.1316, device='cuda:0')
reward_raw: tensor(0.5048, device='cuda:0') reward: tensor(1.3732, device='cuda:0') factor: tensor(0.1315, device='cuda:0')
reward_raw: tensor(0.4776, device='cuda:0') reward: tensor(1.3458, device='cuda:0') factor: tensor(0.1318, device='cuda:0')


Epoch 1, Step 1690: Loss = -0.0192, Actor Loss = -0.0527, Critic Loss = 0.0670, Entropy = 0.0721, Returns = 1.3317, Value = 1.3136, mask_loss = 0.1264std_loss = 0.2936:  31%|███       | 1690/5475 [7:34:27<19:06:59, 18.18s/it]

ratio tensor(0.9664, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4996, device='cuda:0') reward: tensor(1.3412, device='cuda:0') factor: tensor(0.1584, device='cuda:0')
reward_raw: tensor(0.5189, device='cuda:0') reward: tensor(1.3628, device='cuda:0') factor: tensor(0.1561, device='cuda:0')
reward_raw: tensor(0.5192, device='cuda:0') reward: tensor(1.3615, device='cuda:0') factor: tensor(0.1577, device='cuda:0')
reward_raw: tensor(0.5352, device='cuda:0') reward: tensor(1.3747, device='cuda:0') factor: tensor(0.1605, device='cuda:0')
reward_raw: tensor(0.5459, device='cuda:0') reward: tensor(1.3833, device='cuda:0') factor: tensor(0.1626, device='cuda:0')


Epoch 1, Step 1691: Loss = 0.0948, Actor Loss = 0.0629, Critic Loss = 0.0640, Entropy = 0.0797, Returns = 1.2978, Value = 1.1972, mask_loss = 0.1665std_loss = 0.3235:  31%|███       | 1691/5475 [7:34:40<17:38:55, 16.79s/it]  

ratio tensor(0.7470, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5691, device='cuda:0') reward: tensor(1.4445, device='cuda:0') factor: tensor(0.1245, device='cuda:0')
reward_raw: tensor(0.5805, device='cuda:0') reward: tensor(1.4517, device='cuda:0') factor: tensor(0.1288, device='cuda:0')
reward_raw: tensor(0.5813, device='cuda:0') reward: tensor(1.4538, device='cuda:0') factor: tensor(0.1274, device='cuda:0')
reward_raw: tensor(0.5601, device='cuda:0') reward: tensor(1.4348, device='cuda:0') factor: tensor(0.1253, device='cuda:0')
reward_raw: tensor(0.5785, device='cuda:0') reward: tensor(1.4499, device='cuda:0') factor: tensor(0.1286, device='cuda:0')


Epoch 1, Step 1692: Loss = -0.0684, Actor Loss = -0.1156, Critic Loss = 0.0944, Entropy = 0.0471, Returns = 1.3386, Value = 1.5081, mask_loss = 0.1129std_loss = 0.2915:  31%|███       | 1692/5475 [7:34:55<17:02:34, 16.22s/it]

ratio tensor(1.1377, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6227, device='cuda:0') reward: tensor(1.4697, device='cuda:0') factor: tensor(0.1530, device='cuda:0')
reward_raw: tensor(0.6360, device='cuda:0') reward: tensor(1.4901, device='cuda:0') factor: tensor(0.1459, device='cuda:0')
reward_raw: tensor(0.6460, device='cuda:0') reward: tensor(1.4941, device='cuda:0') factor: tensor(0.1518, device='cuda:0')
reward_raw: tensor(0.6708, device='cuda:0') reward: tensor(1.5168, device='cuda:0') factor: tensor(0.1541, device='cuda:0')
reward_raw: tensor(0.6335, device='cuda:0') reward: tensor(1.4789, device='cuda:0') factor: tensor(0.1547, device='cuda:0')


Epoch 1, Step 1693: Loss = 0.0326, Actor Loss = 0.0173, Critic Loss = 0.0306, Entropy = 0.0497, Returns = 1.5279, Value = 1.4949, mask_loss = 0.1371std_loss = 0.3104:  31%|███       | 1693/5475 [7:35:09<16:19:56, 15.55s/it]  

ratio tensor(1.0661, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4895, device='cuda:0') reward: tensor(1.3260, device='cuda:0') factor: tensor(0.1636, device='cuda:0')
reward_raw: tensor(0.4855, device='cuda:0') reward: tensor(1.3251, device='cuda:0') factor: tensor(0.1603, device='cuda:0')
reward_raw: tensor(0.4648, device='cuda:0') reward: tensor(1.2973, device='cuda:0') factor: tensor(0.1674, device='cuda:0')
reward_raw: tensor(0.4889, device='cuda:0') reward: tensor(1.3241, device='cuda:0') factor: tensor(0.1648, device='cuda:0')
reward_raw: tensor(0.4764, device='cuda:0') reward: tensor(1.3116, device='cuda:0') factor: tensor(0.1648, device='cuda:0')


Epoch 1, Step 1694: Loss = 0.1677, Actor Loss = 0.1387, Critic Loss = 0.0581, Entropy = 0.0891, Returns = 1.3337, Value = 1.2650, mask_loss = 0.1661std_loss = 0.3266:  31%|███       | 1694/5475 [7:35:23<15:42:07, 14.95s/it]

ratio tensor(0.9526, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4239, device='cuda:0') reward: tensor(1.2731, device='cuda:0') factor: tensor(0.1508, device='cuda:0')
reward_raw: tensor(0.4153, device='cuda:0') reward: tensor(1.2620, device='cuda:0') factor: tensor(0.1533, device='cuda:0')
reward_raw: tensor(0.4027, device='cuda:0') reward: tensor(1.2518, device='cuda:0') factor: tensor(0.1509, device='cuda:0')
reward_raw: tensor(0.4090, device='cuda:0') reward: tensor(1.2622, device='cuda:0') factor: tensor(0.1467, device='cuda:0')
reward_raw: tensor(0.4140, device='cuda:0') reward: tensor(1.2620, device='cuda:0') factor: tensor(0.1520, device='cuda:0')


Epoch 1, Step 1695: Loss = -0.0002, Actor Loss = -0.0200, Critic Loss = 0.0397, Entropy = 0.0820, Returns = 1.2402, Value = 1.2486, mask_loss = 0.1465std_loss = 0.3151:  31%|███       | 1695/5475 [7:35:40<16:20:04, 15.56s/it]

ratio tensor(1.0654, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6166, device='cuda:0') reward: tensor(1.4761, device='cuda:0') factor: tensor(0.1405, device='cuda:0')
reward_raw: tensor(0.6066, device='cuda:0') reward: tensor(1.4639, device='cuda:0') factor: tensor(0.1426, device='cuda:0')
reward_raw: tensor(0.6200, device='cuda:0') reward: tensor(1.4748, device='cuda:0') factor: tensor(0.1452, device='cuda:0')
reward_raw: tensor(0.6081, device='cuda:0') reward: tensor(1.4684, device='cuda:0') factor: tensor(0.1397, device='cuda:0')
reward_raw: tensor(0.6056, device='cuda:0') reward: tensor(1.4645, device='cuda:0') factor: tensor(0.1411, device='cuda:0')


Epoch 1, Step 1696: Loss = -0.2001, Actor Loss = -0.2492, Critic Loss = 0.0985, Entropy = 0.0778, Returns = 1.4863, Value = 1.6082, mask_loss = 0.1420std_loss = 0.3051:  31%|███       | 1696/5475 [7:35:58<17:11:08, 16.37s/it]

ratio tensor(2.6920, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.3922, device='cuda:0') factor: tensor(0.1357, device='cuda:0')
reward_raw: tensor(0.5335, device='cuda:0') reward: tensor(1.3933, device='cuda:0') factor: tensor(0.1402, device='cuda:0')
reward_raw: tensor(0.5372, device='cuda:0') reward: tensor(1.3924, device='cuda:0') factor: tensor(0.1448, device='cuda:0')
reward_raw: tensor(0.5039, device='cuda:0') reward: tensor(1.3624, device='cuda:0') factor: tensor(0.1415, device='cuda:0')
reward_raw: tensor(0.5221, device='cuda:0') reward: tensor(1.3834, device='cuda:0') factor: tensor(0.1387, device='cuda:0')


Epoch 1, Step 1697: Loss = 0.1800, Actor Loss = 0.1630, Critic Loss = 0.0342, Entropy = 0.0960, Returns = 1.3298, Value = 1.3502, mask_loss = 0.1666std_loss = 0.3207:  31%|███       | 1697/5475 [7:36:14<17:05:21, 16.28s/it]  

ratio tensor(0.7064, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5660, device='cuda:0') reward: tensor(1.4079, device='cuda:0') factor: tensor(0.1581, device='cuda:0')
reward_raw: tensor(0.5747, device='cuda:0') reward: tensor(1.4148, device='cuda:0') factor: tensor(0.1598, device='cuda:0')
reward_raw: tensor(0.5561, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.1581, device='cuda:0')
reward_raw: tensor(0.5543, device='cuda:0') reward: tensor(1.3988, device='cuda:0') factor: tensor(0.1555, device='cuda:0')
reward_raw: tensor(0.5952, device='cuda:0') reward: tensor(1.4354, device='cuda:0') factor: tensor(0.1598, device='cuda:0')


Epoch 1, Step 1698: Loss = -0.0478, Actor Loss = -0.0694, Critic Loss = 0.0434, Entropy = 0.0899, Returns = 1.4029, Value = 1.3820, mask_loss = 0.1626std_loss = 0.3255:  31%|███       | 1698/5475 [7:36:25<15:28:02, 14.74s/it]

ratio tensor(1.0382, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5704, device='cuda:0') reward: tensor(1.4113, device='cuda:0') factor: tensor(0.1591, device='cuda:0')
reward_raw: tensor(0.5733, device='cuda:0') reward: tensor(1.4116, device='cuda:0') factor: tensor(0.1617, device='cuda:0')
reward_raw: tensor(0.5712, device='cuda:0') reward: tensor(1.4165, device='cuda:0') factor: tensor(0.1547, device='cuda:0')
reward_raw: tensor(0.5951, device='cuda:0') reward: tensor(1.4393, device='cuda:0') factor: tensor(0.1557, device='cuda:0')
reward_raw: tensor(0.5695, device='cuda:0') reward: tensor(1.4113, device='cuda:0') factor: tensor(0.1582, device='cuda:0')


Epoch 1, Step 1699: Loss = -0.0388, Actor Loss = -0.0570, Critic Loss = 0.0366, Entropy = 0.0971, Returns = 1.4272, Value = 1.4719, mask_loss = 0.2064std_loss = 0.3597:  31%|███       | 1699/5475 [7:36:49<18:19:21, 17.47s/it]

ratio tensor(1.4722, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5671, device='cuda:0') reward: tensor(1.4019, device='cuda:0') factor: tensor(0.1651, device='cuda:0')
reward_raw: tensor(0.5808, device='cuda:0') reward: tensor(1.4165, device='cuda:0') factor: tensor(0.1643, device='cuda:0')
reward_raw: tensor(0.5823, device='cuda:0') reward: tensor(1.4204, device='cuda:0') factor: tensor(0.1619, device='cuda:0')
reward_raw: tensor(0.5862, device='cuda:0') reward: tensor(1.4217, device='cuda:0') factor: tensor(0.1645, device='cuda:0')
reward_raw: tensor(0.5823, device='cuda:0') reward: tensor(1.4184, device='cuda:0') factor: tensor(0.1638, device='cuda:0')


Epoch 1, Step 1700: Loss = 0.0284, Actor Loss = -0.0066, Critic Loss = 0.0700, Entropy = 0.0750, Returns = 1.3851, Value = 1.3134, mask_loss = 0.1676std_loss = 0.3399:  31%|███       | 1700/5475 [7:37:13<20:19:49, 19.39s/it] 

ratio tensor(0.9324, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6108, device='cuda:0') reward: tensor(1.4567, device='cuda:0') factor: tensor(0.1541, device='cuda:0')
reward_raw: tensor(0.6216, device='cuda:0') reward: tensor(1.4616, device='cuda:0') factor: tensor(0.1599, device='cuda:0')
reward_raw: tensor(0.5959, device='cuda:0') reward: tensor(1.4317, device='cuda:0') factor: tensor(0.1642, device='cuda:0')
reward_raw: tensor(0.6138, device='cuda:0') reward: tensor(1.4548, device='cuda:0') factor: tensor(0.1590, device='cuda:0')
reward_raw: tensor(0.6125, device='cuda:0') reward: tensor(1.4478, device='cuda:0') factor: tensor(0.1646, device='cuda:0')


Epoch 1, Step 1701: Loss = 0.0297, Actor Loss = 0.0046, Critic Loss = 0.0502, Entropy = 0.0653, Returns = 1.4727, Value = 1.4960, mask_loss = 0.1690std_loss = 0.3339:  31%|███       | 1700/5475 [7:37:27<20:19:49, 19.39s/it] 

ratio tensor(0.9941, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1701: Loss = 0.0297, Actor Loss = 0.0046, Critic Loss = 0.0502, Entropy = 0.0653, Returns = 1.4727, Value = 1.4960, mask_loss = 0.1690std_loss = 0.3339:  31%|███       | 1701/5475 [7:37:28<18:57:45, 18.09s/it]

reward_raw: tensor(0.5386, device='cuda:0') reward: tensor(1.3775, device='cuda:0') factor: tensor(0.1611, device='cuda:0')
reward_raw: tensor(0.5492, device='cuda:0') reward: tensor(1.3885, device='cuda:0') factor: tensor(0.1607, device='cuda:0')
reward_raw: tensor(0.5566, device='cuda:0') reward: tensor(1.3921, device='cuda:0') factor: tensor(0.1644, device='cuda:0')
reward_raw: tensor(0.5639, device='cuda:0') reward: tensor(1.3996, device='cuda:0') factor: tensor(0.1644, device='cuda:0')
reward_raw: tensor(0.5558, device='cuda:0') reward: tensor(1.3894, device='cuda:0') factor: tensor(0.1664, device='cuda:0')


Epoch 1, Step 1702: Loss = 0.1334, Actor Loss = 0.1076, Critic Loss = 0.0516, Entropy = 0.0573, Returns = 1.3288, Value = 1.3930, mask_loss = 0.1723std_loss = 0.3341:  31%|███       | 1702/5475 [7:37:46<18:50:43, 17.98s/it]

ratio tensor(0.9586, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5243, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1509, device='cuda:0')
reward_raw: tensor(0.5410, device='cuda:0') reward: tensor(1.3941, device='cuda:0') factor: tensor(0.1469, device='cuda:0')
reward_raw: tensor(0.5362, device='cuda:0') reward: tensor(1.3787, device='cuda:0') factor: tensor(0.1575, device='cuda:0')
reward_raw: tensor(0.5300, device='cuda:0') reward: tensor(1.3799, device='cuda:0') factor: tensor(0.1502, device='cuda:0')
reward_raw: tensor(0.5265, device='cuda:0') reward: tensor(1.3741, device='cuda:0') factor: tensor(0.1525, device='cuda:0')


Epoch 1, Step 1703: Loss = -0.0487, Actor Loss = -0.0815, Critic Loss = 0.0657, Entropy = 0.0640, Returns = 1.3696, Value = 1.3356, mask_loss = 0.1559std_loss = 0.3298:  31%|███       | 1703/5475 [7:38:01<18:07:32, 17.30s/it]

ratio tensor(1.0014, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6414, device='cuda:0') reward: tensor(1.4610, device='cuda:0') factor: tensor(0.1804, device='cuda:0')
reward_raw: tensor(0.6326, device='cuda:0') reward: tensor(1.4552, device='cuda:0') factor: tensor(0.1774, device='cuda:0')
reward_raw: tensor(0.6318, device='cuda:0') reward: tensor(1.4589, device='cuda:0') factor: tensor(0.1729, device='cuda:0')
reward_raw: tensor(0.6319, device='cuda:0') reward: tensor(1.4478, device='cuda:0') factor: tensor(0.1841, device='cuda:0')
reward_raw: tensor(0.6299, device='cuda:0') reward: tensor(1.4525, device='cuda:0') factor: tensor(0.1773, device='cuda:0')


Epoch 1, Step 1704: Loss = 0.1325, Actor Loss = 0.0998, Critic Loss = 0.0655, Entropy = 0.0739, Returns = 1.4987, Value = 1.5241, mask_loss = 0.1747std_loss = 0.3403:  31%|███       | 1704/5475 [7:38:16<17:15:24, 16.47s/it]  

ratio tensor(0.8778, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4592, device='cuda:0') reward: tensor(1.3167, device='cuda:0') factor: tensor(0.1426, device='cuda:0')
reward_raw: tensor(0.4540, device='cuda:0') reward: tensor(1.3114, device='cuda:0') factor: tensor(0.1426, device='cuda:0')
reward_raw: tensor(0.4546, device='cuda:0') reward: tensor(1.3119, device='cuda:0') factor: tensor(0.1427, device='cuda:0')
reward_raw: tensor(0.4605, device='cuda:0') reward: tensor(1.3248, device='cuda:0') factor: tensor(0.1357, device='cuda:0')
reward_raw: tensor(0.4641, device='cuda:0') reward: tensor(1.3234, device='cuda:0') factor: tensor(0.1407, device='cuda:0')


Epoch 1, Step 1705: Loss = 0.0314, Actor Loss = 0.0073, Critic Loss = 0.0483, Entropy = 0.0673, Returns = 1.2782, Value = 1.2663, mask_loss = 0.1462std_loss = 0.3157:  31%|███       | 1705/5475 [7:38:32<17:05:04, 16.31s/it]

ratio tensor(0.9708, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5824, device='cuda:0') reward: tensor(1.4496, device='cuda:0') factor: tensor(0.1328, device='cuda:0')
reward_raw: tensor(0.5821, device='cuda:0') reward: tensor(1.4475, device='cuda:0') factor: tensor(0.1346, device='cuda:0')
reward_raw: tensor(0.5804, device='cuda:0') reward: tensor(1.4454, device='cuda:0') factor: tensor(0.1349, device='cuda:0')
reward_raw: tensor(0.5727, device='cuda:0') reward: tensor(1.4429, device='cuda:0') factor: tensor(0.1298, device='cuda:0')
reward_raw: tensor(0.5762, device='cuda:0') reward: tensor(1.4432, device='cuda:0') factor: tensor(0.1330, device='cuda:0')


Epoch 1, Step 1706: Loss = -0.0001, Actor Loss = -0.0289, Critic Loss = 0.0576, Entropy = 0.0506, Returns = 1.4519, Value = 1.4679, mask_loss = 0.1341std_loss = 0.3168:  31%|███       | 1706/5475 [7:38:45<16:01:53, 15.31s/it]

ratio tensor(1.1694, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4988, device='cuda:0') reward: tensor(1.3093, device='cuda:0') factor: tensor(0.1894, device='cuda:0')
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3186, device='cuda:0') factor: tensor(0.1908, device='cuda:0')
reward_raw: tensor(0.5280, device='cuda:0') reward: tensor(1.3378, device='cuda:0') factor: tensor(0.1903, device='cuda:0')
reward_raw: tensor(0.5000, device='cuda:0') reward: tensor(1.3112, device='cuda:0') factor: tensor(0.1888, device='cuda:0')
reward_raw: tensor(0.5222, device='cuda:0') reward: tensor(1.3315, device='cuda:0') factor: tensor(0.1907, device='cuda:0')


Epoch 1, Step 1707: Loss = 0.0517, Actor Loss = 0.0353, Critic Loss = 0.0332, Entropy = 0.0794, Returns = 1.2997, Value = 1.3197, mask_loss = 0.1934std_loss = 0.3535:  31%|███       | 1707/5475 [7:38:59<15:40:32, 14.98s/it]  

ratio tensor(1.2546, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6416, device='cuda:0') reward: tensor(1.4881, device='cuda:0') factor: tensor(0.1535, device='cuda:0')
reward_raw: tensor(0.6399, device='cuda:0') reward: tensor(1.4875, device='cuda:0') factor: tensor(0.1524, device='cuda:0')
reward_raw: tensor(0.6239, device='cuda:0') reward: tensor(1.4723, device='cuda:0') factor: tensor(0.1515, device='cuda:0')
reward_raw: tensor(0.6397, device='cuda:0') reward: tensor(1.4887, device='cuda:0') factor: tensor(0.1510, device='cuda:0')
reward_raw: tensor(0.6276, device='cuda:0') reward: tensor(1.4801, device='cuda:0') factor: tensor(0.1474, device='cuda:0')


Epoch 1, Step 1708: Loss = 0.0464, Actor Loss = 0.0014, Critic Loss = 0.0902, Entropy = 0.0565, Returns = 1.4869, Value = 1.6411, mask_loss = 0.1363std_loss = 0.3120:  31%|███       | 1708/5475 [7:39:15<15:54:47, 15.21s/it]

ratio tensor(1.0165, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5372, device='cuda:0') reward: tensor(1.3937, device='cuda:0') factor: tensor(0.1435, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.3625, device='cuda:0') factor: tensor(0.1455, device='cuda:0')
reward_raw: tensor(0.5096, device='cuda:0') reward: tensor(1.3739, device='cuda:0') factor: tensor(0.1357, device='cuda:0')
reward_raw: tensor(0.5074, device='cuda:0') reward: tensor(1.3707, device='cuda:0') factor: tensor(0.1366, device='cuda:0')
reward_raw: tensor(0.5032, device='cuda:0') reward: tensor(1.3624, device='cuda:0') factor: tensor(0.1408, device='cuda:0')


Epoch 1, Step 1709: Loss = 0.2743, Actor Loss = 0.2524, Critic Loss = 0.0439, Entropy = 0.0611, Returns = 1.3434, Value = 1.2939, mask_loss = 0.1504std_loss = 0.3181:  31%|███       | 1709/5475 [7:39:32<16:38:52, 15.91s/it]

ratio tensor(0.6870, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4359, device='cuda:0') reward: tensor(1.3100, device='cuda:0') factor: tensor(0.1260, device='cuda:0')
reward_raw: tensor(0.4477, device='cuda:0') reward: tensor(1.3199, device='cuda:0') factor: tensor(0.1277, device='cuda:0')
reward_raw: tensor(0.4432, device='cuda:0') reward: tensor(1.3103, device='cuda:0') factor: tensor(0.1329, device='cuda:0')
reward_raw: tensor(0.4398, device='cuda:0') reward: tensor(1.3124, device='cuda:0') factor: tensor(0.1274, device='cuda:0')
reward_raw: tensor(0.4296, device='cuda:0') reward: tensor(1.3001, device='cuda:0') factor: tensor(0.1295, device='cuda:0')


Epoch 1, Step 1710: Loss = -0.0717, Actor Loss = -0.1017, Critic Loss = 0.0600, Entropy = 0.0742, Returns = 1.3141, Value = 1.2389, mask_loss = 0.1460std_loss = 0.3114:  31%|███       | 1710/5475 [7:39:50<17:16:16, 16.51s/it]

ratio tensor(1.5925, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5372, device='cuda:0') reward: tensor(1.3921, device='cuda:0') factor: tensor(0.1451, device='cuda:0')
reward_raw: tensor(0.5262, device='cuda:0') reward: tensor(1.3797, device='cuda:0') factor: tensor(0.1465, device='cuda:0')
reward_raw: tensor(0.5032, device='cuda:0') reward: tensor(1.3576, device='cuda:0') factor: tensor(0.1456, device='cuda:0')
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.3627, device='cuda:0') factor: tensor(0.1414, device='cuda:0')
reward_raw: tensor(0.5232, device='cuda:0') reward: tensor(1.3764, device='cuda:0') factor: tensor(0.1469, device='cuda:0')


Epoch 1, Step 1711: Loss = -0.1238, Actor Loss = -0.1504, Critic Loss = 0.0533, Entropy = 0.0862, Returns = 1.4369, Value = 1.4283, mask_loss = 0.1807std_loss = 0.3385:  31%|███▏      | 1711/5475 [7:40:06<16:54:16, 16.17s/it]

ratio tensor(8.1066, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4579, device='cuda:0') reward: tensor(1.3059, device='cuda:0') factor: tensor(0.1520, device='cuda:0')
reward_raw: tensor(0.4463, device='cuda:0') reward: tensor(1.2905, device='cuda:0') factor: tensor(0.1558, device='cuda:0')
reward_raw: tensor(0.4604, device='cuda:0') reward: tensor(1.3039, device='cuda:0') factor: tensor(0.1565, device='cuda:0')
reward_raw: tensor(0.4638, device='cuda:0') reward: tensor(1.3114, device='cuda:0') factor: tensor(0.1524, device='cuda:0')
reward_raw: tensor(0.4688, device='cuda:0') reward: tensor(1.3087, device='cuda:0') factor: tensor(0.1600, device='cuda:0')


Epoch 1, Step 1712: Loss = 0.2178, Actor Loss = 0.1856, Critic Loss = 0.0647, Entropy = 0.0691, Returns = 1.2362, Value = 1.2011, mask_loss = 0.1417std_loss = 0.3040:  31%|███▏      | 1712/5475 [7:40:20<16:23:05, 15.68s/it]  

ratio tensor(0.6120, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5211, device='cuda:0') reward: tensor(1.3827, device='cuda:0') factor: tensor(0.1384, device='cuda:0')
reward_raw: tensor(0.5088, device='cuda:0') reward: tensor(1.3694, device='cuda:0') factor: tensor(0.1393, device='cuda:0')
reward_raw: tensor(0.5156, device='cuda:0') reward: tensor(1.3765, device='cuda:0') factor: tensor(0.1391, device='cuda:0')
reward_raw: tensor(0.5149, device='cuda:0') reward: tensor(1.3791, device='cuda:0') factor: tensor(0.1359, device='cuda:0')
reward_raw: tensor(0.5233, device='cuda:0') reward: tensor(1.3835, device='cuda:0') factor: tensor(0.1398, device='cuda:0')


Epoch 1, Step 1713: Loss = -0.1227, Actor Loss = -0.1567, Critic Loss = 0.0683, Entropy = 0.0803, Returns = 1.3525, Value = 1.3456, mask_loss = 0.1245std_loss = 0.2852:  31%|███▏      | 1713/5475 [7:40:41<18:01:05, 17.24s/it]

ratio tensor(3.4480, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4860, device='cuda:0') reward: tensor(1.3143, device='cuda:0') factor: tensor(0.1717, device='cuda:0')
reward_raw: tensor(0.5224, device='cuda:0') reward: tensor(1.3457, device='cuda:0') factor: tensor(0.1767, device='cuda:0')
reward_raw: tensor(0.4826, device='cuda:0') reward: tensor(1.3083, device='cuda:0') factor: tensor(0.1743, device='cuda:0')
reward_raw: tensor(0.4783, device='cuda:0') reward: tensor(1.3061, device='cuda:0') factor: tensor(0.1722, device='cuda:0')
reward_raw: tensor(0.5072, device='cuda:0') reward: tensor(1.3331, device='cuda:0') factor: tensor(0.1741, device='cuda:0')


Epoch 1, Step 1714: Loss = 0.1758, Actor Loss = 0.1432, Critic Loss = 0.0653, Entropy = 0.0868, Returns = 1.3332, Value = 1.3763, mask_loss = 0.1617std_loss = 0.3295:  31%|███▏      | 1714/5475 [7:40:52<16:07:14, 15.43s/it]  

ratio tensor(0.5503, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4914, device='cuda:0') reward: tensor(1.3063, device='cuda:0') factor: tensor(0.1850, device='cuda:0')
reward_raw: tensor(0.4947, device='cuda:0') reward: tensor(1.3084, device='cuda:0') factor: tensor(0.1863, device='cuda:0')
reward_raw: tensor(0.4984, device='cuda:0') reward: tensor(1.3149, device='cuda:0') factor: tensor(0.1835, device='cuda:0')
reward_raw: tensor(0.4972, device='cuda:0') reward: tensor(1.3106, device='cuda:0') factor: tensor(0.1865, device='cuda:0')
reward_raw: tensor(0.5045, device='cuda:0') reward: tensor(1.3188, device='cuda:0') factor: tensor(0.1856, device='cuda:0')


Epoch 1, Step 1715: Loss = 0.0895, Actor Loss = 0.0591, Critic Loss = 0.0610, Entropy = 0.0954, Returns = 1.2820, Value = 1.1777, mask_loss = 0.1688std_loss = 0.3293:  31%|███▏      | 1715/5475 [7:41:05<15:26:04, 14.78s/it]

ratio tensor(0.5770, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6277, device='cuda:0') reward: tensor(1.4577, device='cuda:0') factor: tensor(0.1700, device='cuda:0')
reward_raw: tensor(0.6126, device='cuda:0') reward: tensor(1.4474, device='cuda:0') factor: tensor(0.1652, device='cuda:0')
reward_raw: tensor(0.6223, device='cuda:0') reward: tensor(1.4567, device='cuda:0') factor: tensor(0.1656, device='cuda:0')
reward_raw: tensor(0.6307, device='cuda:0') reward: tensor(1.4553, device='cuda:0') factor: tensor(0.1754, device='cuda:0')
reward_raw: tensor(0.6199, device='cuda:0') reward: tensor(1.4517, device='cuda:0') factor: tensor(0.1682, device='cuda:0')


Epoch 1, Step 1716: Loss = -0.1980, Actor Loss = -0.2303, Critic Loss = 0.0647, Entropy = 0.0861, Returns = 1.4651, Value = 1.6277, mask_loss = 0.1522std_loss = 0.3187:  31%|███▏      | 1716/5475 [7:41:28<17:44:55, 17.00s/it]

ratio tensor(1.3646, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5174, device='cuda:0') reward: tensor(1.3590, device='cuda:0') factor: tensor(0.1584, device='cuda:0')
reward_raw: tensor(0.4889, device='cuda:0') reward: tensor(1.3323, device='cuda:0') factor: tensor(0.1566, device='cuda:0')
reward_raw: tensor(0.5054, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1519, device='cuda:0')
reward_raw: tensor(0.5327, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1593, device='cuda:0')
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.3909, device='cuda:0') factor: tensor(0.1602, device='cuda:0')


Epoch 1, Step 1717: Loss = 0.1161, Actor Loss = 0.0961, Critic Loss = 0.0401, Entropy = 0.0861, Returns = 1.3168, Value = 1.2473, mask_loss = 0.1499std_loss = 0.3104:  31%|███▏      | 1717/5475 [7:41:46<18:08:59, 17.39s/it]  

ratio tensor(0.7347, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4101, device='cuda:0') reward: tensor(1.2524, device='cuda:0') factor: tensor(0.1576, device='cuda:0')
reward_raw: tensor(0.4093, device='cuda:0') reward: tensor(1.2431, device='cuda:0') factor: tensor(0.1662, device='cuda:0')
reward_raw: tensor(0.4226, device='cuda:0') reward: tensor(1.2692, device='cuda:0') factor: tensor(0.1534, device='cuda:0')
reward_raw: tensor(0.3983, device='cuda:0') reward: tensor(1.2369, device='cuda:0') factor: tensor(0.1614, device='cuda:0')
reward_raw: tensor(0.3982, device='cuda:0') reward: tensor(1.2418, device='cuda:0') factor: tensor(0.1563, device='cuda:0')


Epoch 1, Step 1718: Loss = 0.0936, Actor Loss = 0.0658, Critic Loss = 0.0557, Entropy = 0.0844, Returns = 1.3366, Value = 1.2835, mask_loss = 0.1408std_loss = 0.3010:  31%|███▏      | 1718/5475 [7:42:06<18:59:45, 18.20s/it]

ratio tensor(0.7161, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6302, device='cuda:0') reward: tensor(1.4743, device='cuda:0') factor: tensor(0.1560, device='cuda:0')
reward_raw: tensor(0.5995, device='cuda:0') reward: tensor(1.4418, device='cuda:0') factor: tensor(0.1576, device='cuda:0')
reward_raw: tensor(0.5899, device='cuda:0') reward: tensor(1.4332, device='cuda:0') factor: tensor(0.1567, device='cuda:0')
reward_raw: tensor(0.6074, device='cuda:0') reward: tensor(1.4543, device='cuda:0') factor: tensor(0.1532, device='cuda:0')
reward_raw: tensor(0.5863, device='cuda:0') reward: tensor(1.4304, device='cuda:0') factor: tensor(0.1559, device='cuda:0')


Epoch 1, Step 1719: Loss = -0.1532, Actor Loss = -0.1752, Critic Loss = 0.0442, Entropy = 0.0900, Returns = 1.3759, Value = 1.5370, mask_loss = 0.1697std_loss = 0.3242:  31%|███▏      | 1719/5475 [7:42:21<17:53:17, 17.15s/it]

ratio tensor(1.4419, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5202, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.1509, device='cuda:0')
reward_raw: tensor(0.5235, device='cuda:0') reward: tensor(1.3738, device='cuda:0') factor: tensor(0.1497, device='cuda:0')
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.1463, device='cuda:0')
reward_raw: tensor(0.5122, device='cuda:0') reward: tensor(1.3633, device='cuda:0') factor: tensor(0.1490, device='cuda:0')
reward_raw: tensor(0.5141, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.1490, device='cuda:0')


Epoch 1, Step 1720: Loss = 0.0703, Actor Loss = 0.0447, Critic Loss = 0.0514, Entropy = 0.0838, Returns = 1.3450, Value = 1.2881, mask_loss = 0.1478std_loss = 0.3155:  31%|███▏      | 1720/5475 [7:42:36<17:27:31, 16.74s/it]  

ratio tensor(0.8349, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5456, device='cuda:0') reward: tensor(1.3860, device='cuda:0') factor: tensor(0.1596, device='cuda:0')
reward_raw: tensor(0.5476, device='cuda:0') reward: tensor(1.3819, device='cuda:0') factor: tensor(0.1657, device='cuda:0')
reward_raw: tensor(0.5275, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.1624, device='cuda:0')
reward_raw: tensor(0.5250, device='cuda:0') reward: tensor(1.3730, device='cuda:0') factor: tensor(0.1520, device='cuda:0')
reward_raw: tensor(0.5422, device='cuda:0') reward: tensor(1.3847, device='cuda:0') factor: tensor(0.1575, device='cuda:0')


Epoch 1, Step 1721: Loss = 0.0399, Actor Loss = -0.0226, Critic Loss = 0.1252, Entropy = 0.0931, Returns = 1.3359, Value = 1.3154, mask_loss = 0.1959std_loss = 0.3570:  31%|███▏      | 1720/5475 [7:43:00<17:27:31, 16.74s/it]

ratio tensor(0.8963, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1721: Loss = 0.0399, Actor Loss = -0.0226, Critic Loss = 0.1252, Entropy = 0.0931, Returns = 1.3359, Value = 1.3154, mask_loss = 0.1959std_loss = 0.3570:  31%|███▏      | 1721/5475 [7:43:00<19:37:22, 18.82s/it]

reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.3849, device='cuda:0') factor: tensor(0.1636, device='cuda:0')
reward_raw: tensor(0.5402, device='cuda:0') reward: tensor(1.3779, device='cuda:0') factor: tensor(0.1623, device='cuda:0')
reward_raw: tensor(0.5611, device='cuda:0') reward: tensor(1.3969, device='cuda:0') factor: tensor(0.1642, device='cuda:0')
reward_raw: tensor(0.5599, device='cuda:0') reward: tensor(1.3975, device='cuda:0') factor: tensor(0.1624, device='cuda:0')
reward_raw: tensor(0.5675, device='cuda:0') reward: tensor(1.4065, device='cuda:0') factor: tensor(0.1610, device='cuda:0')


Epoch 1, Step 1722: Loss = -0.0592, Actor Loss = -0.0867, Critic Loss = 0.0551, Entropy = 0.0815, Returns = 1.4422, Value = 1.4405, mask_loss = 0.1291std_loss = 0.2931:  31%|███▏      | 1722/5475 [7:43:15<18:31:13, 17.77s/it]

ratio tensor(1.6267, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1475, device='cuda:0')
reward_raw: tensor(0.5140, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1492, device='cuda:0')
reward_raw: tensor(0.5165, device='cuda:0') reward: tensor(1.3627, device='cuda:0') factor: tensor(0.1537, device='cuda:0')
reward_raw: tensor(0.5162, device='cuda:0') reward: tensor(1.3739, device='cuda:0') factor: tensor(0.1422, device='cuda:0')
reward_raw: tensor(0.5127, device='cuda:0') reward: tensor(1.3629, device='cuda:0') factor: tensor(0.1499, device='cuda:0')


Epoch 1, Step 1723: Loss = -0.0092, Actor Loss = -0.0306, Critic Loss = 0.0428, Entropy = 0.0811, Returns = 1.3967, Value = 1.3959, mask_loss = 0.1464std_loss = 0.3157:  31%|███▏      | 1723/5475 [7:43:31<17:44:00, 17.01s/it]

ratio tensor(0.9786, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5052, device='cuda:0') reward: tensor(1.3470, device='cuda:0') factor: tensor(0.1582, device='cuda:0')
reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.3482, device='cuda:0') factor: tensor(0.1669, device='cuda:0')
reward_raw: tensor(0.5194, device='cuda:0') reward: tensor(1.3626, device='cuda:0') factor: tensor(0.1568, device='cuda:0')
reward_raw: tensor(0.5230, device='cuda:0') reward: tensor(1.3673, device='cuda:0') factor: tensor(0.1557, device='cuda:0')
reward_raw: tensor(0.5145, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.1655, device='cuda:0')


Epoch 1, Step 1724: Loss = 0.1377, Actor Loss = 0.1056, Critic Loss = 0.0642, Entropy = 0.0909, Returns = 1.2955, Value = 1.2128, mask_loss = 0.1570std_loss = 0.3174:  31%|███▏      | 1724/5475 [7:43:44<16:38:18, 15.97s/it]  

ratio tensor(0.7102, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.3958, device='cuda:0') factor: tensor(0.1197, device='cuda:0')
reward_raw: tensor(0.5425, device='cuda:0') reward: tensor(1.4153, device='cuda:0') factor: tensor(0.1272, device='cuda:0')
reward_raw: tensor(0.5490, device='cuda:0') reward: tensor(1.4242, device='cuda:0') factor: tensor(0.1248, device='cuda:0')
reward_raw: tensor(0.5202, device='cuda:0') reward: tensor(1.4019, device='cuda:0') factor: tensor(0.1183, device='cuda:0')
reward_raw: tensor(0.5525, device='cuda:0') reward: tensor(1.4284, device='cuda:0') factor: tensor(0.1241, device='cuda:0')


Epoch 1, Step 1725: Loss = -0.2367, Actor Loss = -0.2574, Critic Loss = 0.0415, Entropy = 0.0719, Returns = 1.3926, Value = 1.3925, mask_loss = 0.1373std_loss = 0.3102:  32%|███▏      | 1725/5475 [7:44:00<16:34:56, 15.92s/it]

ratio tensor(1.4278, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4986, device='cuda:0') reward: tensor(1.3296, device='cuda:0') factor: tensor(0.1690, device='cuda:0')
reward_raw: tensor(0.5030, device='cuda:0') reward: tensor(1.3301, device='cuda:0') factor: tensor(0.1728, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3315, device='cuda:0') factor: tensor(0.1683, device='cuda:0')
reward_raw: tensor(0.4961, device='cuda:0') reward: tensor(1.3268, device='cuda:0') factor: tensor(0.1692, device='cuda:0')
reward_raw: tensor(0.5013, device='cuda:0') reward: tensor(1.3322, device='cuda:0') factor: tensor(0.1692, device='cuda:0')


Epoch 1, Step 1726: Loss = 0.1807, Actor Loss = 0.1523, Critic Loss = 0.0568, Entropy = 0.0856, Returns = 1.2944, Value = 1.3607, mask_loss = 0.1584std_loss = 0.3200:  32%|███▏      | 1726/5475 [7:44:16<16:27:38, 15.81s/it]  

ratio tensor(0.5741, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.3687, device='cuda:0') factor: tensor(0.1527, device='cuda:0')
reward_raw: tensor(0.5144, device='cuda:0') reward: tensor(1.3613, device='cuda:0') factor: tensor(0.1532, device='cuda:0')
reward_raw: tensor(0.5161, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.1509, device='cuda:0')
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.3533, device='cuda:0') factor: tensor(0.1508, device='cuda:0')
reward_raw: tensor(0.5190, device='cuda:0') reward: tensor(1.3722, device='cuda:0') factor: tensor(0.1468, device='cuda:0')


Epoch 1, Step 1727: Loss = -0.0382, Actor Loss = -0.0684, Critic Loss = 0.0607, Entropy = 0.0732, Returns = 1.3563, Value = 1.2938, mask_loss = 0.1699std_loss = 0.3297:  32%|███▏      | 1727/5475 [7:44:28<15:18:31, 14.70s/it]

ratio tensor(1.1413, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5571, device='cuda:0') reward: tensor(1.4229, device='cuda:0') factor: tensor(0.1342, device='cuda:0')
reward_raw: tensor(0.5436, device='cuda:0') reward: tensor(1.4093, device='cuda:0') factor: tensor(0.1343, device='cuda:0')
reward_raw: tensor(0.5521, device='cuda:0') reward: tensor(1.4189, device='cuda:0') factor: tensor(0.1331, device='cuda:0')
reward_raw: tensor(0.5563, device='cuda:0') reward: tensor(1.4255, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.5547, device='cuda:0') reward: tensor(1.4202, device='cuda:0') factor: tensor(0.1345, device='cuda:0')


Epoch 1, Step 1728: Loss = 0.0007, Actor Loss = -0.0141, Critic Loss = 0.0296, Entropy = 0.0628, Returns = 1.4665, Value = 1.5428, mask_loss = 0.1191std_loss = 0.2922:  32%|███▏      | 1728/5475 [7:44:43<15:21:24, 14.75s/it] 

ratio tensor(1.0476, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5356, device='cuda:0') reward: tensor(1.4032, device='cuda:0') factor: tensor(0.1324, device='cuda:0')
reward_raw: tensor(0.5462, device='cuda:0') reward: tensor(1.4111, device='cuda:0') factor: tensor(0.1351, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.5260, device='cuda:0') reward: tensor(1.3982, device='cuda:0') factor: tensor(0.1279, device='cuda:0')
reward_raw: tensor(0.5181, device='cuda:0') reward: tensor(1.3848, device='cuda:0') factor: tensor(0.1333, device='cuda:0')


Epoch 1, Step 1729: Loss = 0.1656, Actor Loss = 0.1200, Critic Loss = 0.0913, Entropy = 0.0758, Returns = 1.3770, Value = 1.3761, mask_loss = 0.1276std_loss = 0.2924:  32%|███▏      | 1729/5475 [7:45:01<16:22:18, 15.73s/it] 

ratio tensor(0.9146, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.3497, device='cuda:0') factor: tensor(0.1502, device='cuda:0')
reward_raw: tensor(0.5130, device='cuda:0') reward: tensor(1.3713, device='cuda:0') factor: tensor(0.1417, device='cuda:0')
reward_raw: tensor(0.4944, device='cuda:0') reward: tensor(1.3491, device='cuda:0') factor: tensor(0.1454, device='cuda:0')
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.3385, device='cuda:0') factor: tensor(0.1447, device='cuda:0')
reward_raw: tensor(0.4806, device='cuda:0') reward: tensor(1.3359, device='cuda:0') factor: tensor(0.1447, device='cuda:0')


Epoch 1, Step 1730: Loss = 0.1079, Actor Loss = 0.0784, Critic Loss = 0.0591, Entropy = 0.0784, Returns = 1.3015, Value = 1.3706, mask_loss = 0.1406std_loss = 0.3058:  32%|███▏      | 1730/5475 [7:45:16<16:19:26, 15.69s/it]

ratio tensor(1.1800, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4340, device='cuda:0') reward: tensor(1.3034, device='cuda:0') factor: tensor(0.1306, device='cuda:0')
reward_raw: tensor(0.4498, device='cuda:0') reward: tensor(1.3237, device='cuda:0') factor: tensor(0.1261, device='cuda:0')
reward_raw: tensor(0.4427, device='cuda:0') reward: tensor(1.3213, device='cuda:0') factor: tensor(0.1214, device='cuda:0')
reward_raw: tensor(0.4289, device='cuda:0') reward: tensor(1.3024, device='cuda:0') factor: tensor(0.1265, device='cuda:0')
reward_raw: tensor(0.4452, device='cuda:0') reward: tensor(1.3233, device='cuda:0') factor: tensor(0.1219, device='cuda:0')


Epoch 1, Step 1731: Loss = 0.0718, Actor Loss = 0.0574, Critic Loss = 0.0290, Entropy = 0.0843, Returns = 1.2724, Value = 1.3321, mask_loss = 0.1538std_loss = 0.3078:  32%|███▏      | 1731/5475 [7:45:33<16:34:54, 15.94s/it]

ratio tensor(0.9409, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4310, device='cuda:0') reward: tensor(1.2972, device='cuda:0') factor: tensor(0.1339, device='cuda:0')
reward_raw: tensor(0.4397, device='cuda:0') reward: tensor(1.3049, device='cuda:0') factor: tensor(0.1347, device='cuda:0')
reward_raw: tensor(0.4290, device='cuda:0') reward: tensor(1.2986, device='cuda:0') factor: tensor(0.1304, device='cuda:0')
reward_raw: tensor(0.4409, device='cuda:0') reward: tensor(1.3068, device='cuda:0') factor: tensor(0.1341, device='cuda:0')
reward_raw: tensor(0.4396, device='cuda:0') reward: tensor(1.3029, device='cuda:0') factor: tensor(0.1367, device='cuda:0')


Epoch 1, Step 1732: Loss = -0.0116, Actor Loss = -0.0538, Critic Loss = 0.0845, Entropy = 0.0659, Returns = 1.3229, Value = 1.2795, mask_loss = 0.1427std_loss = 0.3032:  32%|███▏      | 1732/5475 [7:45:48<16:18:01, 15.68s/it]

ratio tensor(0.9390, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5065, device='cuda:0') reward: tensor(1.3523, device='cuda:0') factor: tensor(0.1542, device='cuda:0')
reward_raw: tensor(0.5180, device='cuda:0') reward: tensor(1.3619, device='cuda:0') factor: tensor(0.1561, device='cuda:0')
reward_raw: tensor(0.5130, device='cuda:0') reward: tensor(1.3603, device='cuda:0') factor: tensor(0.1527, device='cuda:0')
reward_raw: tensor(0.5223, device='cuda:0') reward: tensor(1.3643, device='cuda:0') factor: tensor(0.1579, device='cuda:0')
reward_raw: tensor(0.5119, device='cuda:0') reward: tensor(1.3576, device='cuda:0') factor: tensor(0.1543, device='cuda:0')


Epoch 1, Step 1733: Loss = -0.0741, Actor Loss = -0.0975, Critic Loss = 0.0470, Entropy = 0.0634, Returns = 1.3467, Value = 1.3484, mask_loss = 0.1446std_loss = 0.3153:  32%|███▏      | 1733/5475 [7:46:03<16:09:04, 15.54s/it]

ratio tensor(1.0907, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5094, device='cuda:0') reward: tensor(1.3632, device='cuda:0') factor: tensor(0.1462, device='cuda:0')
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(1.3556, device='cuda:0') factor: tensor(0.1433, device='cuda:0')
reward_raw: tensor(0.4925, device='cuda:0') reward: tensor(1.3515, device='cuda:0') factor: tensor(0.1410, device='cuda:0')
reward_raw: tensor(0.5038, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.1420, device='cuda:0')
reward_raw: tensor(0.5255, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.1383, device='cuda:0')


Epoch 1, Step 1734: Loss = 0.0636, Actor Loss = 0.0238, Critic Loss = 0.0797, Entropy = 0.0824, Returns = 1.3975, Value = 1.4247, mask_loss = 0.1345std_loss = 0.2985:  32%|███▏      | 1734/5475 [7:46:26<18:23:18, 17.70s/it]  

ratio tensor(3.3044, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5397, device='cuda:0') reward: tensor(1.4098, device='cuda:0') factor: tensor(0.1299, device='cuda:0')
reward_raw: tensor(0.5489, device='cuda:0') reward: tensor(1.4052, device='cuda:0') factor: tensor(0.1437, device='cuda:0')
reward_raw: tensor(0.5552, device='cuda:0') reward: tensor(1.4180, device='cuda:0') factor: tensor(0.1372, device='cuda:0')
reward_raw: tensor(0.5623, device='cuda:0') reward: tensor(1.4210, device='cuda:0') factor: tensor(0.1413, device='cuda:0')
reward_raw: tensor(0.5453, device='cuda:0') reward: tensor(1.4090, device='cuda:0') factor: tensor(0.1362, device='cuda:0')


Epoch 1, Step 1735: Loss = 0.0278, Actor Loss = -0.0258, Critic Loss = 0.1074, Entropy = 0.0796, Returns = 1.4837, Value = 1.4654, mask_loss = 0.1522std_loss = 0.3148:  32%|███▏      | 1735/5475 [7:46:39<16:56:46, 16.31s/it]

ratio tensor(0.9580, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5663, device='cuda:0') reward: tensor(1.4128, device='cuda:0') factor: tensor(0.1535, device='cuda:0')
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.3989, device='cuda:0') factor: tensor(0.1524, device='cuda:0')
reward_raw: tensor(0.5538, device='cuda:0') reward: tensor(1.4023, device='cuda:0') factor: tensor(0.1515, device='cuda:0')
reward_raw: tensor(0.5678, device='cuda:0') reward: tensor(1.4117, device='cuda:0') factor: tensor(0.1561, device='cuda:0')
reward_raw: tensor(0.5607, device='cuda:0') reward: tensor(1.4100, device='cuda:0') factor: tensor(0.1507, device='cuda:0')


Epoch 1, Step 1736: Loss = -0.0956, Actor Loss = -0.1194, Critic Loss = 0.0476, Entropy = 0.0789, Returns = 1.4102, Value = 1.4396, mask_loss = 0.1678std_loss = 0.3296:  32%|███▏      | 1736/5475 [7:46:51<15:38:43, 15.06s/it]

ratio tensor(1.2349, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5431, device='cuda:0') reward: tensor(1.4073, device='cuda:0') factor: tensor(0.1359, device='cuda:0')
reward_raw: tensor(0.5451, device='cuda:0') reward: tensor(1.4072, device='cuda:0') factor: tensor(0.1380, device='cuda:0')
reward_raw: tensor(0.5472, device='cuda:0') reward: tensor(1.4082, device='cuda:0') factor: tensor(0.1390, device='cuda:0')
reward_raw: tensor(0.5289, device='cuda:0') reward: tensor(1.3990, device='cuda:0') factor: tensor(0.1299, device='cuda:0')
reward_raw: tensor(0.5417, device='cuda:0') reward: tensor(1.4015, device='cuda:0') factor: tensor(0.1402, device='cuda:0')


Epoch 1, Step 1737: Loss = 0.1733, Actor Loss = 0.1431, Critic Loss = 0.0605, Entropy = 0.0554, Returns = 1.3479, Value = 1.4220, mask_loss = 0.1427std_loss = 0.3196:  32%|███▏      | 1737/5475 [7:47:05<15:14:39, 14.68s/it]  

ratio tensor(1.0763, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5483, device='cuda:0') reward: tensor(1.3766, device='cuda:0') factor: tensor(0.1716, device='cuda:0')
reward_raw: tensor(0.5517, device='cuda:0') reward: tensor(1.3803, device='cuda:0') factor: tensor(0.1714, device='cuda:0')
reward_raw: tensor(0.5518, device='cuda:0') reward: tensor(1.3821, device='cuda:0') factor: tensor(0.1698, device='cuda:0')
reward_raw: tensor(0.5406, device='cuda:0') reward: tensor(1.3671, device='cuda:0') factor: tensor(0.1735, device='cuda:0')
reward_raw: tensor(0.5511, device='cuda:0') reward: tensor(1.3781, device='cuda:0') factor: tensor(0.1730, device='cuda:0')


Epoch 1, Step 1738: Loss = 0.0323, Actor Loss = 0.0128, Critic Loss = 0.0391, Entropy = 0.0793, Returns = 1.3595, Value = 1.3733, mask_loss = 0.1799std_loss = 0.3378:  32%|███▏      | 1738/5475 [7:47:19<15:13:26, 14.67s/it]

ratio tensor(1.0042, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4969, device='cuda:0') reward: tensor(1.3487, device='cuda:0') factor: tensor(0.1482, device='cuda:0')
reward_raw: tensor(0.5233, device='cuda:0') reward: tensor(1.3705, device='cuda:0') factor: tensor(0.1528, device='cuda:0')
reward_raw: tensor(0.5209, device='cuda:0') reward: tensor(1.3760, device='cuda:0') factor: tensor(0.1450, device='cuda:0')
reward_raw: tensor(0.5301, device='cuda:0') reward: tensor(1.3810, device='cuda:0') factor: tensor(0.1491, device='cuda:0')
reward_raw: tensor(0.5227, device='cuda:0') reward: tensor(1.3689, device='cuda:0') factor: tensor(0.1538, device='cuda:0')


Epoch 1, Step 1739: Loss = -0.0030, Actor Loss = -0.0299, Critic Loss = 0.0539, Entropy = 0.0671, Returns = 1.3585, Value = 1.3761, mask_loss = 0.1592std_loss = 0.3271:  32%|███▏      | 1739/5475 [7:47:41<17:26:03, 16.80s/it]

ratio tensor(0.7733, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4851, device='cuda:0') reward: tensor(1.3432, device='cuda:0') factor: tensor(0.1419, device='cuda:0')
reward_raw: tensor(0.4966, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.1346, device='cuda:0')
reward_raw: tensor(0.5062, device='cuda:0') reward: tensor(1.3652, device='cuda:0') factor: tensor(0.1410, device='cuda:0')
reward_raw: tensor(0.5155, device='cuda:0') reward: tensor(1.3776, device='cuda:0') factor: tensor(0.1379, device='cuda:0')
reward_raw: tensor(0.5010, device='cuda:0') reward: tensor(1.3646, device='cuda:0') factor: tensor(0.1364, device='cuda:0')


Epoch 1, Step 1740: Loss = 0.1210, Actor Loss = 0.0934, Critic Loss = 0.0553, Entropy = 0.0643, Returns = 1.3300, Value = 1.2974, mask_loss = 0.1349std_loss = 0.3066:  32%|███▏      | 1740/5475 [7:47:55<16:35:46, 16.00s/it]  

ratio tensor(0.8842, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5365, device='cuda:0') reward: tensor(1.4200, device='cuda:0') factor: tensor(0.1165, device='cuda:0')
reward_raw: tensor(0.5367, device='cuda:0') reward: tensor(1.4191, device='cuda:0') factor: tensor(0.1175, device='cuda:0')
reward_raw: tensor(0.5393, device='cuda:0') reward: tensor(1.4210, device='cuda:0') factor: tensor(0.1183, device='cuda:0')
reward_raw: tensor(0.5473, device='cuda:0') reward: tensor(1.4267, device='cuda:0') factor: tensor(0.1205, device='cuda:0')
reward_raw: tensor(0.5558, device='cuda:0') reward: tensor(1.4398, device='cuda:0') factor: tensor(0.1160, device='cuda:0')


Epoch 1, Step 1741: Loss = -0.1567, Actor Loss = -0.1875, Critic Loss = 0.0617, Entropy = 0.0419, Returns = 1.4383, Value = 1.4755, mask_loss = 0.1246std_loss = 0.3072:  32%|███▏      | 1740/5475 [7:48:13<16:35:46, 16.00s/it]

ratio tensor(1.3781, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1741: Loss = -0.1567, Actor Loss = -0.1875, Critic Loss = 0.0617, Entropy = 0.0419, Returns = 1.4383, Value = 1.4755, mask_loss = 0.1246std_loss = 0.3072:  32%|███▏      | 1741/5475 [7:48:14<17:18:19, 16.68s/it]

reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3247, device='cuda:0') factor: tensor(0.1428, device='cuda:0')
reward_raw: tensor(0.4580, device='cuda:0') reward: tensor(1.3177, device='cuda:0') factor: tensor(0.1403, device='cuda:0')
reward_raw: tensor(0.4797, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.1447, device='cuda:0')
reward_raw: tensor(0.4743, device='cuda:0') reward: tensor(1.3296, device='cuda:0') factor: tensor(0.1447, device='cuda:0')
reward_raw: tensor(0.4659, device='cuda:0') reward: tensor(1.3247, device='cuda:0') factor: tensor(0.1412, device='cuda:0')


Epoch 1, Step 1742: Loss = 0.1800, Actor Loss = 0.1552, Critic Loss = 0.0498, Entropy = 0.0588, Returns = 1.3352, Value = 1.3050, mask_loss = 0.1270std_loss = 0.2998:  32%|███▏      | 1742/5475 [7:48:33<18:03:51, 17.42s/it]  

ratio tensor(0.7900, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.3972, device='cuda:0') factor: tensor(0.1271, device='cuda:0')
reward_raw: tensor(0.5572, device='cuda:0') reward: tensor(1.4272, device='cuda:0') factor: tensor(0.1300, device='cuda:0')
reward_raw: tensor(0.5323, device='cuda:0') reward: tensor(1.4039, device='cuda:0') factor: tensor(0.1283, device='cuda:0')
reward_raw: tensor(0.5486, device='cuda:0') reward: tensor(1.4201, device='cuda:0') factor: tensor(0.1285, device='cuda:0')
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.4235, device='cuda:0') factor: tensor(0.1269, device='cuda:0')


Epoch 1, Step 1743: Loss = 0.0219, Actor Loss = -0.0112, Critic Loss = 0.0662, Entropy = 0.0656, Returns = 1.3414, Value = 1.4060, mask_loss = 0.1283std_loss = 0.2951:  32%|███▏      | 1743/5475 [7:48:46<16:54:58, 16.32s/it]

ratio tensor(1.5452, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6192, device='cuda:0') reward: tensor(1.5047, device='cuda:0') factor: tensor(0.1145, device='cuda:0')
reward_raw: tensor(0.6110, device='cuda:0') reward: tensor(1.4940, device='cuda:0') factor: tensor(0.1171, device='cuda:0')
reward_raw: tensor(0.6064, device='cuda:0') reward: tensor(1.4925, device='cuda:0') factor: tensor(0.1138, device='cuda:0')
reward_raw: tensor(0.6261, device='cuda:0') reward: tensor(1.5109, device='cuda:0') factor: tensor(0.1152, device='cuda:0')
reward_raw: tensor(0.6393, device='cuda:0') reward: tensor(1.5239, device='cuda:0') factor: tensor(0.1154, device='cuda:0')


Epoch 1, Step 1744: Loss = -0.1801, Actor Loss = -0.2361, Critic Loss = 0.1121, Entropy = 0.0433, Returns = 1.5237, Value = 1.5925, mask_loss = 0.1060std_loss = 0.2825:  32%|███▏      | 1744/5475 [7:49:10<19:02:46, 18.38s/it]

ratio tensor(2.6415, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5337, device='cuda:0') reward: tensor(1.3972, device='cuda:0') factor: tensor(0.1365, device='cuda:0')
reward_raw: tensor(0.5420, device='cuda:0') reward: tensor(1.4052, device='cuda:0') factor: tensor(0.1368, device='cuda:0')
reward_raw: tensor(0.5356, device='cuda:0') reward: tensor(1.3974, device='cuda:0') factor: tensor(0.1382, device='cuda:0')
reward_raw: tensor(0.5261, device='cuda:0') reward: tensor(1.3897, device='cuda:0') factor: tensor(0.1364, device='cuda:0')
reward_raw: tensor(0.5445, device='cuda:0') reward: tensor(1.4085, device='cuda:0') factor: tensor(0.1360, device='cuda:0')


Epoch 1, Step 1745: Loss = 0.1463, Actor Loss = 0.1111, Critic Loss = 0.0706, Entropy = 0.0600, Returns = 1.4174, Value = 1.3470, mask_loss = 0.1230std_loss = 0.2944:  32%|███▏      | 1745/5475 [7:49:25<18:03:42, 17.43s/it]  

ratio tensor(0.8721, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4803, device='cuda:0') reward: tensor(1.3270, device='cuda:0') factor: tensor(0.1534, device='cuda:0')
reward_raw: tensor(0.4763, device='cuda:0') reward: tensor(1.3206, device='cuda:0') factor: tensor(0.1557, device='cuda:0')
reward_raw: tensor(0.4891, device='cuda:0') reward: tensor(1.3343, device='cuda:0') factor: tensor(0.1548, device='cuda:0')
reward_raw: tensor(0.4906, device='cuda:0') reward: tensor(1.3368, device='cuda:0') factor: tensor(0.1538, device='cuda:0')
reward_raw: tensor(0.4844, device='cuda:0') reward: tensor(1.3311, device='cuda:0') factor: tensor(0.1533, device='cuda:0')


Epoch 1, Step 1746: Loss = 0.1310, Actor Loss = 0.1075, Critic Loss = 0.0471, Entropy = 0.0880, Returns = 1.3822, Value = 1.3082, mask_loss = 0.1465std_loss = 0.3073:  32%|███▏      | 1746/5475 [7:49:39<16:59:17, 16.40s/it]

ratio tensor(0.5560, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5370, device='cuda:0') reward: tensor(1.3919, device='cuda:0') factor: tensor(0.1451, device='cuda:0')
reward_raw: tensor(0.5345, device='cuda:0') reward: tensor(1.3951, device='cuda:0') factor: tensor(0.1394, device='cuda:0')
reward_raw: tensor(0.5467, device='cuda:0') reward: tensor(1.4053, device='cuda:0') factor: tensor(0.1414, device='cuda:0')
reward_raw: tensor(0.5343, device='cuda:0') reward: tensor(1.3963, device='cuda:0') factor: tensor(0.1380, device='cuda:0')
reward_raw: tensor(0.5379, device='cuda:0') reward: tensor(1.3959, device='cuda:0') factor: tensor(0.1420, device='cuda:0')


Epoch 1, Step 1747: Loss = -0.2022, Actor Loss = -0.2586, Critic Loss = 0.1130, Entropy = 0.0858, Returns = 1.4266, Value = 1.4471, mask_loss = 0.1504std_loss = 0.3093:  32%|███▏      | 1747/5475 [7:49:58<17:43:53, 17.12s/it]

ratio tensor(2.3946, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6037, device='cuda:0') reward: tensor(1.4804, device='cuda:0') factor: tensor(0.1234, device='cuda:0')
reward_raw: tensor(0.6095, device='cuda:0') reward: tensor(1.4897, device='cuda:0') factor: tensor(0.1198, device='cuda:0')
reward_raw: tensor(0.6285, device='cuda:0') reward: tensor(1.5026, device='cuda:0') factor: tensor(0.1259, device='cuda:0')
reward_raw: tensor(0.6028, device='cuda:0') reward: tensor(1.4804, device='cuda:0') factor: tensor(0.1225, device='cuda:0')
reward_raw: tensor(0.6118, device='cuda:0') reward: tensor(1.4881, device='cuda:0') factor: tensor(0.1237, device='cuda:0')


Epoch 1, Step 1748: Loss = 0.0146, Actor Loss = -0.0109, Critic Loss = 0.0510, Entropy = 0.0492, Returns = 1.5448, Value = 1.5688, mask_loss = 0.1175std_loss = 0.2961:  32%|███▏      | 1748/5475 [7:50:11<16:28:15, 15.91s/it] 

ratio tensor(1.5055, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5656, device='cuda:0') reward: tensor(1.4344, device='cuda:0') factor: tensor(0.1311, device='cuda:0')
reward_raw: tensor(0.5624, device='cuda:0') reward: tensor(1.4360, device='cuda:0') factor: tensor(0.1264, device='cuda:0')
reward_raw: tensor(0.5607, device='cuda:0') reward: tensor(1.4299, device='cuda:0') factor: tensor(0.1308, device='cuda:0')
reward_raw: tensor(0.5617, device='cuda:0') reward: tensor(1.4292, device='cuda:0') factor: tensor(0.1324, device='cuda:0')
reward_raw: tensor(0.5554, device='cuda:0') reward: tensor(1.4259, device='cuda:0') factor: tensor(0.1295, device='cuda:0')


Epoch 1, Step 1749: Loss = 0.0072, Actor Loss = -0.0143, Critic Loss = 0.0431, Entropy = 0.0554, Returns = 1.4884, Value = 1.4718, mask_loss = 0.1249std_loss = 0.3018:  32%|███▏      | 1749/5475 [7:50:29<17:05:05, 16.51s/it]

ratio tensor(7.6784, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5549, device='cuda:0') reward: tensor(1.3933, device='cuda:0') factor: tensor(0.1616, device='cuda:0')
reward_raw: tensor(0.5314, device='cuda:0') reward: tensor(1.3647, device='cuda:0') factor: tensor(0.1667, device='cuda:0')
reward_raw: tensor(0.5396, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1570, device='cuda:0')
reward_raw: tensor(0.5284, device='cuda:0') reward: tensor(1.3701, device='cuda:0') factor: tensor(0.1583, device='cuda:0')
reward_raw: tensor(0.5502, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1618, device='cuda:0')


Epoch 1, Step 1750: Loss = 0.1055, Actor Loss = 0.0945, Critic Loss = 0.0222, Entropy = 0.0803, Returns = 1.3662, Value = 1.4199, mask_loss = 0.1882std_loss = 0.3512:  32%|███▏      | 1750/5475 [7:50:42<15:59:35, 15.46s/it] 

ratio tensor(0.9462, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.3822, device='cuda:0') factor: tensor(0.1375, device='cuda:0')
reward_raw: tensor(0.5128, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1360, device='cuda:0')
reward_raw: tensor(0.5175, device='cuda:0') reward: tensor(1.3871, device='cuda:0') factor: tensor(0.1303, device='cuda:0')
reward_raw: tensor(0.5200, device='cuda:0') reward: tensor(1.3779, device='cuda:0') factor: tensor(0.1420, device='cuda:0')
reward_raw: tensor(0.5184, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.1421, device='cuda:0')


Epoch 1, Step 1751: Loss = 0.1382, Actor Loss = 0.0954, Critic Loss = 0.0857, Entropy = 0.0535, Returns = 1.3637, Value = 1.2810, mask_loss = 0.1264std_loss = 0.3069:  32%|███▏      | 1751/5475 [7:50:54<15:10:14, 14.67s/it]

ratio tensor(0.6107, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.6074, device='cuda:0') reward: tensor(1.4814, device='cuda:0') factor: tensor(0.1260, device='cuda:0')
reward_raw: tensor(0.5977, device='cuda:0') reward: tensor(1.4737, device='cuda:0') factor: tensor(0.1240, device='cuda:0')
reward_raw: tensor(0.6054, device='cuda:0') reward: tensor(1.4749, device='cuda:0') factor: tensor(0.1304, device='cuda:0')
reward_raw: tensor(0.6050, device='cuda:0') reward: tensor(1.4808, device='cuda:0') factor: tensor(0.1243, device='cuda:0')
reward_raw: tensor(0.6051, device='cuda:0') reward: tensor(1.4710, device='cuda:0') factor: tensor(0.1341, device='cuda:0')


Epoch 1, Step 1752: Loss = -0.3547, Actor Loss = -0.3698, Critic Loss = 0.0302, Entropy = 0.0660, Returns = 1.4857, Value = 1.4745, mask_loss = 0.1470std_loss = 0.3174:  32%|███▏      | 1752/5475 [7:51:15<16:50:38, 16.29s/it]

ratio tensor(6.7818, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5462, device='cuda:0') reward: tensor(1.4118, device='cuda:0') factor: tensor(0.1344, device='cuda:0')
reward_raw: tensor(0.5771, device='cuda:0') reward: tensor(1.4474, device='cuda:0') factor: tensor(0.1297, device='cuda:0')
reward_raw: tensor(0.5572, device='cuda:0') reward: tensor(1.4251, device='cuda:0') factor: tensor(0.1322, device='cuda:0')
reward_raw: tensor(0.5730, device='cuda:0') reward: tensor(1.4391, device='cuda:0') factor: tensor(0.1340, device='cuda:0')
reward_raw: tensor(0.5795, device='cuda:0') reward: tensor(1.4435, device='cuda:0') factor: tensor(0.1359, device='cuda:0')


Epoch 1, Step 1753: Loss = 0.2661, Actor Loss = 0.2239, Critic Loss = 0.0845, Entropy = 0.0664, Returns = 1.3764, Value = 1.4384, mask_loss = 0.1328std_loss = 0.3046:  32%|███▏      | 1753/5475 [7:51:28<15:59:35, 15.47s/it]  

ratio tensor(1.3904, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5428, device='cuda:0') reward: tensor(1.4120, device='cuda:0') factor: tensor(0.1307, device='cuda:0')
reward_raw: tensor(0.5371, device='cuda:0') reward: tensor(1.4101, device='cuda:0') factor: tensor(0.1270, device='cuda:0')
reward_raw: tensor(0.5367, device='cuda:0') reward: tensor(1.4092, device='cuda:0') factor: tensor(0.1275, device='cuda:0')
reward_raw: tensor(0.5390, device='cuda:0') reward: tensor(1.4107, device='cuda:0') factor: tensor(0.1282, device='cuda:0')
reward_raw: tensor(0.5372, device='cuda:0') reward: tensor(1.4090, device='cuda:0') factor: tensor(0.1282, device='cuda:0')


Epoch 1, Step 1754: Loss = 0.1704, Actor Loss = 0.1326, Critic Loss = 0.0756, Entropy = 0.0433, Returns = 1.3653, Value = 1.3497, mask_loss = 0.1133std_loss = 0.2898:  32%|███▏      | 1754/5475 [7:51:42<15:38:19, 15.13s/it]

ratio tensor(0.1623, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4895, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1162, device='cuda:0')
reward_raw: tensor(0.4977, device='cuda:0') reward: tensor(1.3828, device='cuda:0') factor: tensor(0.1149, device='cuda:0')
reward_raw: tensor(0.4982, device='cuda:0') reward: tensor(1.3812, device='cuda:0') factor: tensor(0.1170, device='cuda:0')
reward_raw: tensor(0.4966, device='cuda:0') reward: tensor(1.3803, device='cuda:0') factor: tensor(0.1163, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.3832, device='cuda:0') factor: tensor(0.1189, device='cuda:0')


Epoch 1, Step 1755: Loss = -0.0800, Actor Loss = -0.1071, Critic Loss = 0.0542, Entropy = 0.0437, Returns = 1.3389, Value = 1.4253, mask_loss = 0.1259std_loss = 0.3053:  32%|███▏      | 1755/5475 [7:51:54<14:37:48, 14.16s/it]

ratio tensor(1.0336, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4340, device='cuda:0') reward: tensor(1.3383, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.4580, device='cuda:0') reward: tensor(1.3588, device='cuda:0') factor: tensor(0.0992, device='cuda:0')
reward_raw: tensor(0.4451, device='cuda:0') reward: tensor(1.3485, device='cuda:0') factor: tensor(0.0967, device='cuda:0')
reward_raw: tensor(0.4439, device='cuda:0') reward: tensor(1.3495, device='cuda:0') factor: tensor(0.0945, device='cuda:0')
reward_raw: tensor(0.4295, device='cuda:0') reward: tensor(1.3352, device='cuda:0') factor: tensor(0.0942, device='cuda:0')


Epoch 1, Step 1756: Loss = 0.0224, Actor Loss = -0.0132, Critic Loss = 0.0713, Entropy = 0.0385, Returns = 1.3892, Value = 1.2937, mask_loss = 0.0872std_loss = 0.2590:  32%|███▏      | 1756/5475 [7:52:07<14:17:39, 13.84s/it] 

ratio tensor(0.8593, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4386, device='cuda:0') reward: tensor(1.3339, device='cuda:0') factor: tensor(0.1047, device='cuda:0')
reward_raw: tensor(0.4586, device='cuda:0') reward: tensor(1.3509, device='cuda:0') factor: tensor(0.1078, device='cuda:0')
reward_raw: tensor(0.4530, device='cuda:0') reward: tensor(1.3453, device='cuda:0') factor: tensor(0.1076, device='cuda:0')
reward_raw: tensor(0.4555, device='cuda:0') reward: tensor(1.3457, device='cuda:0') factor: tensor(0.1099, device='cuda:0')
reward_raw: tensor(0.4540, device='cuda:0') reward: tensor(1.3451, device='cuda:0') factor: tensor(0.1089, device='cuda:0')


Epoch 1, Step 1757: Loss = -0.0336, Actor Loss = -0.0556, Critic Loss = 0.0442, Entropy = 0.0443, Returns = 1.3371, Value = 1.4638, mask_loss = 0.0957std_loss = 0.2602:  32%|███▏      | 1757/5475 [7:52:22<14:26:34, 13.98s/it]

ratio tensor(0.9187, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4747, device='cuda:0') reward: tensor(1.3571, device='cuda:0') factor: tensor(0.1176, device='cuda:0')
reward_raw: tensor(0.4990, device='cuda:0') reward: tensor(1.3811, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.5151, device='cuda:0') reward: tensor(1.3999, device='cuda:0') factor: tensor(0.1152, device='cuda:0')
reward_raw: tensor(0.5183, device='cuda:0') reward: tensor(1.4007, device='cuda:0') factor: tensor(0.1176, device='cuda:0')
reward_raw: tensor(0.5461, device='cuda:0') reward: tensor(1.4260, device='cuda:0') factor: tensor(0.1201, device='cuda:0')


Epoch 1, Step 1758: Loss = 0.1173, Actor Loss = 0.0956, Critic Loss = 0.0436, Entropy = 0.0573, Returns = 1.3884, Value = 1.4391, mask_loss = 0.1553std_loss = 0.3114:  32%|███▏      | 1758/5475 [7:52:41<16:03:41, 15.56s/it]  

ratio tensor(0.7880, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4780, device='cuda:0') reward: tensor(1.3780, device='cuda:0') factor: tensor(0.1000, device='cuda:0')
reward_raw: tensor(0.5027, device='cuda:0') reward: tensor(1.4044, device='cuda:0') factor: tensor(0.0983, device='cuda:0')
reward_raw: tensor(0.5054, device='cuda:0') reward: tensor(1.4042, device='cuda:0') factor: tensor(0.1012, device='cuda:0')
reward_raw: tensor(0.4994, device='cuda:0') reward: tensor(1.4018, device='cuda:0') factor: tensor(0.0975, device='cuda:0')
reward_raw: tensor(0.5222, device='cuda:0') reward: tensor(1.4229, device='cuda:0') factor: tensor(0.0993, device='cuda:0')


Epoch 1, Step 1759: Loss = -0.0010, Actor Loss = -0.0252, Critic Loss = 0.0484, Entropy = 0.0213, Returns = 1.4259, Value = 1.3084, mask_loss = 0.0890std_loss = 0.2679:  32%|███▏      | 1759/5475 [7:52:55<15:35:03, 15.10s/it]

ratio tensor(1.0694, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4939, device='cuda:0') reward: tensor(1.3830, device='cuda:0') factor: tensor(0.1109, device='cuda:0')
reward_raw: tensor(0.4988, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.1124, device='cuda:0')
reward_raw: tensor(0.4813, device='cuda:0') reward: tensor(1.3699, device='cuda:0') factor: tensor(0.1114, device='cuda:0')
reward_raw: tensor(0.5028, device='cuda:0') reward: tensor(1.3874, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.4969, device='cuda:0') reward: tensor(1.3869, device='cuda:0') factor: tensor(0.1100, device='cuda:0')


Epoch 1, Step 1760: Loss = -0.1587, Actor Loss = -0.1841, Critic Loss = 0.0509, Entropy = 0.0542, Returns = 1.4619, Value = 1.4216, mask_loss = 0.1344std_loss = 0.3090:  32%|███▏      | 1760/5475 [7:53:07<14:29:52, 14.05s/it]

ratio tensor(1.1963, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4875, device='cuda:0') reward: tensor(1.3625, device='cuda:0') factor: tensor(0.1250, device='cuda:0')
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.3593, device='cuda:0') factor: tensor(0.1249, device='cuda:0')
reward_raw: tensor(0.5029, device='cuda:0') reward: tensor(1.3758, device='cuda:0') factor: tensor(0.1271, device='cuda:0')
reward_raw: tensor(0.5036, device='cuda:0') reward: tensor(1.3736, device='cuda:0') factor: tensor(0.1299, device='cuda:0')
reward_raw: tensor(0.4883, device='cuda:0') reward: tensor(1.3610, device='cuda:0') factor: tensor(0.1273, device='cuda:0')


Epoch 1, Step 1761: Loss = 0.0430, Actor Loss = 0.0153, Critic Loss = 0.0553, Entropy = 0.0499, Returns = 1.3697, Value = 1.3858, mask_loss = 0.1580std_loss = 0.3339:  32%|███▏      | 1760/5475 [7:53:19<14:29:52, 14.05s/it]  

ratio tensor(0.9541, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1761: Loss = 0.0430, Actor Loss = 0.0153, Critic Loss = 0.0553, Entropy = 0.0499, Returns = 1.3697, Value = 1.3858, mask_loss = 0.1580std_loss = 0.3339:  32%|███▏      | 1761/5475 [7:53:20<14:10:15, 13.74s/it]

reward_raw: tensor(0.5071, device='cuda:0') reward: tensor(1.3898, device='cuda:0') factor: tensor(0.1173, device='cuda:0')
reward_raw: tensor(0.5048, device='cuda:0') reward: tensor(1.3921, device='cuda:0') factor: tensor(0.1127, device='cuda:0')
reward_raw: tensor(0.4961, device='cuda:0') reward: tensor(1.3839, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.5023, device='cuda:0') reward: tensor(1.3885, device='cuda:0') factor: tensor(0.1138, device='cuda:0')
reward_raw: tensor(0.4955, device='cuda:0') reward: tensor(1.3834, device='cuda:0') factor: tensor(0.1121, device='cuda:0')


Epoch 1, Step 1762: Loss = 0.0838, Actor Loss = 0.0274, Critic Loss = 0.1129, Entropy = 0.0470, Returns = 1.4319, Value = 1.3040, mask_loss = 0.1074std_loss = 0.2796:  32%|███▏      | 1762/5475 [7:53:33<13:59:28, 13.57s/it]

ratio tensor(0.8335, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5300, device='cuda:0') reward: tensor(1.4337, device='cuda:0') factor: tensor(0.0963, device='cuda:0')
reward_raw: tensor(0.5381, device='cuda:0') reward: tensor(1.4390, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.5490, device='cuda:0') reward: tensor(1.4458, device='cuda:0') factor: tensor(0.1033, device='cuda:0')
reward_raw: tensor(0.5520, device='cuda:0') reward: tensor(1.4502, device='cuda:0') factor: tensor(0.1018, device='cuda:0')
reward_raw: tensor(0.5577, device='cuda:0') reward: tensor(1.4535, device='cuda:0') factor: tensor(0.1042, device='cuda:0')


Epoch 1, Step 1763: Loss = -0.2245, Actor Loss = -0.2434, Critic Loss = 0.0379, Entropy = 0.0364, Returns = 1.4746, Value = 1.6044, mask_loss = 0.0943std_loss = 0.2694:  32%|███▏      | 1763/5475 [7:53:46<13:52:12, 13.45s/it]

ratio tensor(1.4558, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3961, device='cuda:0') reward: tensor(1.2926, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.3989, device='cuda:0') reward: tensor(1.2909, device='cuda:0') factor: tensor(0.1080, device='cuda:0')
reward_raw: tensor(0.3814, device='cuda:0') reward: tensor(1.2778, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.3930, device='cuda:0') reward: tensor(1.2872, device='cuda:0') factor: tensor(0.1058, device='cuda:0')
reward_raw: tensor(0.3943, device='cuda:0') reward: tensor(1.2868, device='cuda:0') factor: tensor(0.1074, device='cuda:0')


Epoch 1, Step 1764: Loss = 0.2829, Actor Loss = 0.2693, Critic Loss = 0.0272, Entropy = 0.0546, Returns = 1.2817, Value = 1.2224, mask_loss = 0.1384std_loss = 0.3102:  32%|███▏      | 1764/5475 [7:54:05<15:32:28, 15.08s/it]  

ratio tensor(0.6084, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4756, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.0894, device='cuda:0')
reward_raw: tensor(0.4625, device='cuda:0') reward: tensor(1.3702, device='cuda:0') factor: tensor(0.0923, device='cuda:0')
reward_raw: tensor(0.4620, device='cuda:0') reward: tensor(1.3728, device='cuda:0') factor: tensor(0.0892, device='cuda:0')
reward_raw: tensor(0.4671, device='cuda:0') reward: tensor(1.3773, device='cuda:0') factor: tensor(0.0898, device='cuda:0')
reward_raw: tensor(0.4748, device='cuda:0') reward: tensor(1.3809, device='cuda:0') factor: tensor(0.0939, device='cuda:0')


Epoch 1, Step 1765: Loss = -0.0685, Actor Loss = -0.1156, Critic Loss = 0.0942, Entropy = 0.0437, Returns = 1.3615, Value = 1.4742, mask_loss = 0.1245std_loss = 0.2910:  32%|███▏      | 1765/5475 [7:54:19<15:18:42, 14.86s/it]

ratio tensor(1.0591, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5150, device='cuda:0') reward: tensor(1.3959, device='cuda:0') factor: tensor(0.1190, device='cuda:0')
reward_raw: tensor(0.5280, device='cuda:0') reward: tensor(1.4084, device='cuda:0') factor: tensor(0.1196, device='cuda:0')
reward_raw: tensor(0.5253, device='cuda:0') reward: tensor(1.4073, device='cuda:0') factor: tensor(0.1180, device='cuda:0')
reward_raw: tensor(0.5052, device='cuda:0') reward: tensor(1.3867, device='cuda:0') factor: tensor(0.1186, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.3973, device='cuda:0') factor: tensor(0.1203, device='cuda:0')


Epoch 1, Step 1766: Loss = 0.1567, Actor Loss = 0.1256, Critic Loss = 0.0623, Entropy = 0.0396, Returns = 1.3704, Value = 1.4301, mask_loss = 0.1199std_loss = 0.2958:  32%|███▏      | 1766/5475 [7:54:34<15:21:40, 14.91s/it]  

ratio tensor(0.9607, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4657, device='cuda:0') reward: tensor(1.3479, device='cuda:0') factor: tensor(0.1179, device='cuda:0')
reward_raw: tensor(0.4465, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.1111, device='cuda:0')
reward_raw: tensor(0.4576, device='cuda:0') reward: tensor(1.3416, device='cuda:0') factor: tensor(0.1160, device='cuda:0')
reward_raw: tensor(0.4636, device='cuda:0') reward: tensor(1.3511, device='cuda:0') factor: tensor(0.1125, device='cuda:0')
reward_raw: tensor(0.4642, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.1110, device='cuda:0')


Epoch 1, Step 1767: Loss = 0.0294, Actor Loss = -0.0032, Critic Loss = 0.0652, Entropy = 0.0222, Returns = 1.3903, Value = 1.3227, mask_loss = 0.0893std_loss = 0.2669:  32%|███▏      | 1767/5475 [7:54:54<16:52:14, 16.38s/it]

ratio tensor(1.1040, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5650, device='cuda:0') reward: tensor(1.4550, device='cuda:0') factor: tensor(0.1100, device='cuda:0')
reward_raw: tensor(0.5584, device='cuda:0') reward: tensor(1.4467, device='cuda:0') factor: tensor(0.1117, device='cuda:0')
reward_raw: tensor(0.5603, device='cuda:0') reward: tensor(1.4498, device='cuda:0') factor: tensor(0.1105, device='cuda:0')
reward_raw: tensor(0.5831, device='cuda:0') reward: tensor(1.4705, device='cuda:0') factor: tensor(0.1126, device='cuda:0')
reward_raw: tensor(0.5599, device='cuda:0') reward: tensor(1.4495, device='cuda:0') factor: tensor(0.1105, device='cuda:0')


Epoch 1, Step 1768: Loss = -0.1123, Actor Loss = -0.1439, Critic Loss = 0.0631, Entropy = 0.0356, Returns = 1.4343, Value = 1.4351, mask_loss = 0.1036std_loss = 0.2830:  32%|███▏      | 1768/5475 [7:55:05<15:08:53, 14.71s/it]

ratio tensor(1.6592, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5300, device='cuda:0') reward: tensor(1.4416, device='cuda:0') factor: tensor(0.0884, device='cuda:0')
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.4366, device='cuda:0') factor: tensor(0.0886, device='cuda:0')
reward_raw: tensor(0.5576, device='cuda:0') reward: tensor(1.4666, device='cuda:0') factor: tensor(0.0910, device='cuda:0')
reward_raw: tensor(0.5365, device='cuda:0') reward: tensor(1.4486, device='cuda:0') factor: tensor(0.0879, device='cuda:0')
reward_raw: tensor(0.5227, device='cuda:0') reward: tensor(1.4363, device='cuda:0') factor: tensor(0.0864, device='cuda:0')


Epoch 1, Step 1769: Loss = -0.0367, Actor Loss = -0.0591, Critic Loss = 0.0448, Entropy = 0.0207, Returns = 1.5004, Value = 1.4082, mask_loss = 0.0931std_loss = 0.2737:  32%|███▏      | 1769/5475 [7:55:20<15:24:23, 14.97s/it]

ratio tensor(0.8653, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5262, device='cuda:0') reward: tensor(1.4183, device='cuda:0') factor: tensor(0.1079, device='cuda:0')
reward_raw: tensor(0.5252, device='cuda:0') reward: tensor(1.4159, device='cuda:0') factor: tensor(0.1093, device='cuda:0')
reward_raw: tensor(0.5133, device='cuda:0') reward: tensor(1.4056, device='cuda:0') factor: tensor(0.1076, device='cuda:0')
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.4195, device='cuda:0') factor: tensor(0.1084, device='cuda:0')
reward_raw: tensor(0.5251, device='cuda:0') reward: tensor(1.4208, device='cuda:0') factor: tensor(0.1043, device='cuda:0')


Epoch 1, Step 1770: Loss = 0.1755, Actor Loss = 0.1366, Critic Loss = 0.0781, Entropy = 0.0443, Returns = 1.3735, Value = 1.4514, mask_loss = 0.1252std_loss = 0.2971:  32%|███▏      | 1770/5475 [7:55:43<17:46:53, 17.28s/it]  

ratio tensor(0.8560, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5057, device='cuda:0') reward: tensor(1.4184, device='cuda:0') factor: tensor(0.0872, device='cuda:0')
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.4305, device='cuda:0') factor: tensor(0.0892, device='cuda:0')
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(1.4091, device='cuda:0') factor: tensor(0.0898, device='cuda:0')
reward_raw: tensor(0.5264, device='cuda:0') reward: tensor(1.4378, device='cuda:0') factor: tensor(0.0887, device='cuda:0')
reward_raw: tensor(0.4761, device='cuda:0') reward: tensor(1.3883, device='cuda:0') factor: tensor(0.0878, device='cuda:0')


Epoch 1, Step 1771: Loss = -0.0047, Actor Loss = -0.0402, Critic Loss = 0.0711, Entropy = 0.0397, Returns = 1.3518, Value = 1.4261, mask_loss = 0.0951std_loss = 0.2680:  32%|███▏      | 1771/5475 [7:55:59<17:30:51, 17.02s/it]

ratio tensor(1.1456, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4376, device='cuda:0') reward: tensor(1.3363, device='cuda:0') factor: tensor(0.1013, device='cuda:0')
reward_raw: tensor(0.4370, device='cuda:0') reward: tensor(1.3347, device='cuda:0') factor: tensor(0.1023, device='cuda:0')
reward_raw: tensor(0.4523, device='cuda:0') reward: tensor(1.3489, device='cuda:0') factor: tensor(0.1034, device='cuda:0')
reward_raw: tensor(0.4542, device='cuda:0') reward: tensor(1.3504, device='cuda:0') factor: tensor(0.1038, device='cuda:0')
reward_raw: tensor(0.4449, device='cuda:0') reward: tensor(1.3493, device='cuda:0') factor: tensor(0.0956, device='cuda:0')


Epoch 1, Step 1772: Loss = 0.0659, Actor Loss = 0.0409, Critic Loss = 0.0500, Entropy = 0.0506, Returns = 1.3493, Value = 1.3119, mask_loss = 0.0999std_loss = 0.2698:  32%|███▏      | 1772/5475 [7:56:13<16:20:44, 15.89s/it]  

ratio tensor(1.1838, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.3992, device='cuda:0') factor: tensor(0.0880, device='cuda:0')
reward_raw: tensor(0.4969, device='cuda:0') reward: tensor(1.4087, device='cuda:0') factor: tensor(0.0881, device='cuda:0')
reward_raw: tensor(0.5222, device='cuda:0') reward: tensor(1.4373, device='cuda:0') factor: tensor(0.0849, device='cuda:0')
reward_raw: tensor(0.4975, device='cuda:0') reward: tensor(1.4109, device='cuda:0') factor: tensor(0.0867, device='cuda:0')
reward_raw: tensor(0.5126, device='cuda:0') reward: tensor(1.4229, device='cuda:0') factor: tensor(0.0897, device='cuda:0')


Epoch 1, Step 1773: Loss = 0.0866, Actor Loss = 0.0493, Critic Loss = 0.0747, Entropy = 0.0354, Returns = 1.4014, Value = 1.3852, mask_loss = 0.0889std_loss = 0.2638:  32%|███▏      | 1773/5475 [7:56:28<16:14:27, 15.79s/it]

ratio tensor(0.9865, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5524, device='cuda:0') reward: tensor(1.4676, device='cuda:0') factor: tensor(0.0849, device='cuda:0')
reward_raw: tensor(0.5569, device='cuda:0') reward: tensor(1.4672, device='cuda:0') factor: tensor(0.0896, device='cuda:0')
reward_raw: tensor(0.5601, device='cuda:0') reward: tensor(1.4724, device='cuda:0') factor: tensor(0.0877, device='cuda:0')
reward_raw: tensor(0.5521, device='cuda:0') reward: tensor(1.4618, device='cuda:0') factor: tensor(0.0903, device='cuda:0')
reward_raw: tensor(0.5469, device='cuda:0') reward: tensor(1.4598, device='cuda:0') factor: tensor(0.0870, device='cuda:0')


Epoch 1, Step 1774: Loss = -0.0392, Actor Loss = -0.0873, Critic Loss = 0.0963, Entropy = 0.0310, Returns = 1.3875, Value = 1.5546, mask_loss = 0.0770std_loss = 0.2462:  32%|███▏      | 1774/5475 [7:56:43<15:50:49, 15.41s/it]

ratio tensor(1.2406, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3724, device='cuda:0') reward: tensor(1.2602, device='cuda:0') factor: tensor(0.1122, device='cuda:0')
reward_raw: tensor(0.3949, device='cuda:0') reward: tensor(1.2854, device='cuda:0') factor: tensor(0.1095, device='cuda:0')
reward_raw: tensor(0.3903, device='cuda:0') reward: tensor(1.2769, device='cuda:0') factor: tensor(0.1134, device='cuda:0')
reward_raw: tensor(0.3776, device='cuda:0') reward: tensor(1.2656, device='cuda:0') factor: tensor(0.1120, device='cuda:0')
reward_raw: tensor(0.3981, device='cuda:0') reward: tensor(1.2860, device='cuda:0') factor: tensor(0.1121, device='cuda:0')


Epoch 1, Step 1775: Loss = 0.3312, Actor Loss = 0.2987, Critic Loss = 0.0650, Entropy = 0.0438, Returns = 1.2252, Value = 1.1398, mask_loss = 0.1265std_loss = 0.2903:  32%|███▏      | 1775/5475 [7:57:01<16:38:16, 16.19s/it]  

ratio tensor(0.7987, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4970, device='cuda:0') reward: tensor(1.3884, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.4993, device='cuda:0') reward: tensor(1.3896, device='cuda:0') factor: tensor(0.1097, device='cuda:0')
reward_raw: tensor(0.5035, device='cuda:0') reward: tensor(1.3915, device='cuda:0') factor: tensor(0.1121, device='cuda:0')
reward_raw: tensor(0.5017, device='cuda:0') reward: tensor(1.3862, device='cuda:0') factor: tensor(0.1155, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.3842, device='cuda:0') factor: tensor(0.1090, device='cuda:0')


Epoch 1, Step 1776: Loss = -0.2205, Actor Loss = -0.2522, Critic Loss = 0.0634, Entropy = 0.0369, Returns = 1.4039, Value = 1.3713, mask_loss = 0.1129std_loss = 0.2942:  32%|███▏      | 1776/5475 [7:57:17<16:39:44, 16.22s/it]

ratio tensor(1.1091, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5205, device='cuda:0') reward: tensor(1.4225, device='cuda:0') factor: tensor(0.0979, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.4251, device='cuda:0') factor: tensor(0.0963, device='cuda:0')
reward_raw: tensor(0.5213, device='cuda:0') reward: tensor(1.4261, device='cuda:0') factor: tensor(0.0952, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.4225, device='cuda:0') factor: tensor(0.0945, device='cuda:0')
reward_raw: tensor(0.5291, device='cuda:0') reward: tensor(1.4337, device='cuda:0') factor: tensor(0.0954, device='cuda:0')


Epoch 1, Step 1777: Loss = 0.1182, Actor Loss = 0.0971, Critic Loss = 0.0423, Entropy = 0.0231, Returns = 1.4226, Value = 1.4956, mask_loss = 0.0936std_loss = 0.2764:  32%|███▏      | 1777/5475 [7:57:32<16:24:11, 15.97s/it]  

ratio tensor(0.8987, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4452, device='cuda:0') reward: tensor(1.3609, device='cuda:0') factor: tensor(0.0842, device='cuda:0')
reward_raw: tensor(0.4638, device='cuda:0') reward: tensor(1.3780, device='cuda:0') factor: tensor(0.0859, device='cuda:0')
reward_raw: tensor(0.4801, device='cuda:0') reward: tensor(1.3964, device='cuda:0') factor: tensor(0.0837, device='cuda:0')
reward_raw: tensor(0.4694, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.0824, device='cuda:0')
reward_raw: tensor(0.4682, device='cuda:0') reward: tensor(1.3825, device='cuda:0') factor: tensor(0.0857, device='cuda:0')


Epoch 1, Step 1778: Loss = -0.0825, Actor Loss = -0.1146, Critic Loss = 0.0643, Entropy = 0.0392, Returns = 1.4124, Value = 1.3480, mask_loss = 0.0964std_loss = 0.2712:  32%|███▏      | 1778/5475 [7:57:47<16:00:01, 15.58s/it]

ratio tensor(1.3019, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4775, device='cuda:0') reward: tensor(1.3757, device='cuda:0') factor: tensor(0.1018, device='cuda:0')
reward_raw: tensor(0.4543, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1008, device='cuda:0')
reward_raw: tensor(0.4783, device='cuda:0') reward: tensor(1.3749, device='cuda:0') factor: tensor(0.1034, device='cuda:0')
reward_raw: tensor(0.4650, device='cuda:0') reward: tensor(1.3662, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.4560, device='cuda:0') reward: tensor(1.3571, device='cuda:0') factor: tensor(0.0989, device='cuda:0')


Epoch 1, Step 1779: Loss = 0.0184, Actor Loss = -0.0110, Critic Loss = 0.0588, Entropy = 0.0357, Returns = 1.3212, Value = 1.3595, mask_loss = 0.1065std_loss = 0.2836:  32%|███▏      | 1779/5475 [7:58:00<15:18:27, 14.91s/it] 

ratio tensor(0.8660, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4814, device='cuda:0') reward: tensor(1.3629, device='cuda:0') factor: tensor(0.1185, device='cuda:0')
reward_raw: tensor(0.4823, device='cuda:0') reward: tensor(1.3623, device='cuda:0') factor: tensor(0.1200, device='cuda:0')
reward_raw: tensor(0.4859, device='cuda:0') reward: tensor(1.3654, device='cuda:0') factor: tensor(0.1205, device='cuda:0')
reward_raw: tensor(0.4735, device='cuda:0') reward: tensor(1.3524, device='cuda:0') factor: tensor(0.1211, device='cuda:0')
reward_raw: tensor(0.4723, device='cuda:0') reward: tensor(1.3520, device='cuda:0') factor: tensor(0.1202, device='cuda:0')


Epoch 1, Step 1780: Loss = 0.1310, Actor Loss = 0.1112, Critic Loss = 0.0395, Entropy = 0.0451, Returns = 1.3520, Value = 1.3717, mask_loss = 0.1524std_loss = 0.3316:  33%|███▎      | 1780/5475 [7:58:17<15:40:22, 15.27s/it] 

ratio tensor(0.7618, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4240, device='cuda:0') reward: tensor(1.3396, device='cuda:0') factor: tensor(0.0844, device='cuda:0')
reward_raw: tensor(0.4107, device='cuda:0') reward: tensor(1.3267, device='cuda:0') factor: tensor(0.0840, device='cuda:0')
reward_raw: tensor(0.4184, device='cuda:0') reward: tensor(1.3381, device='cuda:0') factor: tensor(0.0803, device='cuda:0')
reward_raw: tensor(0.4212, device='cuda:0') reward: tensor(1.3377, device='cuda:0') factor: tensor(0.0835, device='cuda:0')
reward_raw: tensor(0.4131, device='cuda:0') reward: tensor(1.3275, device='cuda:0') factor: tensor(0.0856, device='cuda:0')


Epoch 1, Step 1781: Loss = -0.1524, Actor Loss = -0.1816, Critic Loss = 0.0586, Entropy = 0.0342, Returns = 1.3473, Value = 1.3717, mask_loss = 0.0887std_loss = 0.2644:  33%|███▎      | 1780/5475 [7:58:31<15:40:22, 15.27s/it]

ratio tensor(1.2620, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1781: Loss = -0.1524, Actor Loss = -0.1816, Critic Loss = 0.0586, Entropy = 0.0342, Returns = 1.3473, Value = 1.3717, mask_loss = 0.0887std_loss = 0.2644:  33%|███▎      | 1781/5475 [7:58:31<15:28:49, 15.09s/it]

reward_raw: tensor(0.4613, device='cuda:0') reward: tensor(1.3630, device='cuda:0') factor: tensor(0.0983, device='cuda:0')
reward_raw: tensor(0.4822, device='cuda:0') reward: tensor(1.3843, device='cuda:0') factor: tensor(0.0979, device='cuda:0')
reward_raw: tensor(0.4684, device='cuda:0') reward: tensor(1.3678, device='cuda:0') factor: tensor(0.1006, device='cuda:0')
reward_raw: tensor(0.4558, device='cuda:0') reward: tensor(1.3557, device='cuda:0') factor: tensor(0.1001, device='cuda:0')
reward_raw: tensor(0.4664, device='cuda:0') reward: tensor(1.3701, device='cuda:0') factor: tensor(0.0963, device='cuda:0')


Epoch 1, Step 1782: Loss = 0.0338, Actor Loss = 0.0071, Critic Loss = 0.0534, Entropy = 0.0360, Returns = 1.4039, Value = 1.3674, mask_loss = 0.1153std_loss = 0.3003:  33%|███▎      | 1782/5475 [7:58:49<16:18:19, 15.89s/it]  

ratio tensor(0.8745, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3919, device='cuda:0') reward: tensor(1.2923, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.4021, device='cuda:0') reward: tensor(1.3035, device='cuda:0') factor: tensor(0.0987, device='cuda:0')
reward_raw: tensor(0.3891, device='cuda:0') reward: tensor(1.2905, device='cuda:0') factor: tensor(0.0987, device='cuda:0')
reward_raw: tensor(0.3852, device='cuda:0') reward: tensor(1.2868, device='cuda:0') factor: tensor(0.0983, device='cuda:0')
reward_raw: tensor(0.4217, device='cuda:0') reward: tensor(1.3240, device='cuda:0') factor: tensor(0.0978, device='cuda:0')


Epoch 1, Step 1783: Loss = 0.0677, Actor Loss = 0.0462, Critic Loss = 0.0431, Entropy = 0.0266, Returns = 1.3137, Value = 1.2410, mask_loss = 0.0833std_loss = 0.2600:  33%|███▎      | 1783/5475 [7:59:04<15:58:29, 15.58s/it]

ratio tensor(0.9813, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4902, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.1138, device='cuda:0')
reward_raw: tensor(0.4776, device='cuda:0') reward: tensor(1.3649, device='cuda:0') factor: tensor(0.1127, device='cuda:0')
reward_raw: tensor(0.4859, device='cuda:0') reward: tensor(1.3769, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.4452, device='cuda:0') reward: tensor(1.3417, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.4726, device='cuda:0') reward: tensor(1.3642, device='cuda:0') factor: tensor(0.1084, device='cuda:0')


Epoch 1, Step 1784: Loss = 0.0096, Actor Loss = -0.0271, Critic Loss = 0.0735, Entropy = 0.0491, Returns = 1.3125, Value = 1.3945, mask_loss = 0.1224std_loss = 0.2959:  33%|███▎      | 1784/5475 [7:59:18<15:28:45, 15.10s/it]

ratio tensor(1.4466, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4734, device='cuda:0') reward: tensor(1.3676, device='cuda:0') factor: tensor(0.1058, device='cuda:0')
reward_raw: tensor(0.4925, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1099, device='cuda:0')
reward_raw: tensor(0.5019, device='cuda:0') reward: tensor(1.3955, device='cuda:0') factor: tensor(0.1064, device='cuda:0')
reward_raw: tensor(0.4995, device='cuda:0') reward: tensor(1.3889, device='cuda:0') factor: tensor(0.1106, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3915, device='cuda:0') factor: tensor(0.1083, device='cuda:0')


Epoch 1, Step 1785: Loss = 0.0149, Actor Loss = -0.0214, Critic Loss = 0.0726, Entropy = 0.0445, Returns = 1.4209, Value = 1.3889, mask_loss = 0.1038std_loss = 0.2778:  33%|███▎      | 1785/5475 [7:59:37<16:34:22, 16.17s/it]

ratio tensor(1.2537, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5953, device='cuda:0') reward: tensor(1.4734, device='cuda:0') factor: tensor(0.1219, device='cuda:0')
reward_raw: tensor(0.5764, device='cuda:0') reward: tensor(1.4576, device='cuda:0') factor: tensor(0.1188, device='cuda:0')
reward_raw: tensor(0.5914, device='cuda:0') reward: tensor(1.4711, device='cuda:0') factor: tensor(0.1203, device='cuda:0')
reward_raw: tensor(0.5737, device='cuda:0') reward: tensor(1.4511, device='cuda:0') factor: tensor(0.1226, device='cuda:0')
reward_raw: tensor(0.5818, device='cuda:0') reward: tensor(1.4579, device='cuda:0') factor: tensor(0.1239, device='cuda:0')


Epoch 1, Step 1786: Loss = -0.2025, Actor Loss = -0.2250, Critic Loss = 0.0452, Entropy = 0.0483, Returns = 1.4759, Value = 1.5091, mask_loss = 0.1145std_loss = 0.2922:  33%|███▎      | 1786/5475 [7:59:51<16:05:49, 15.71s/it]

ratio tensor(1.3501, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4278, device='cuda:0') reward: tensor(1.3284, device='cuda:0') factor: tensor(0.0994, device='cuda:0')
reward_raw: tensor(0.4334, device='cuda:0') reward: tensor(1.3332, device='cuda:0') factor: tensor(0.1002, device='cuda:0')
reward_raw: tensor(0.4294, device='cuda:0') reward: tensor(1.3321, device='cuda:0') factor: tensor(0.0972, device='cuda:0')
reward_raw: tensor(0.4221, device='cuda:0') reward: tensor(1.3237, device='cuda:0') factor: tensor(0.0984, device='cuda:0')
reward_raw: tensor(0.4395, device='cuda:0') reward: tensor(1.3401, device='cuda:0') factor: tensor(0.0994, device='cuda:0')


Epoch 1, Step 1787: Loss = 0.2453, Actor Loss = 0.2212, Critic Loss = 0.0483, Entropy = 0.0360, Returns = 1.3366, Value = 1.3338, mask_loss = 0.1030std_loss = 0.2829:  33%|███▎      | 1787/5475 [8:00:05<15:27:31, 15.09s/it]  

ratio tensor(0.7969, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4827, device='cuda:0') reward: tensor(1.3421, device='cuda:0') factor: tensor(0.1406, device='cuda:0')
reward_raw: tensor(0.4782, device='cuda:0') reward: tensor(1.3373, device='cuda:0') factor: tensor(0.1410, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.3467, device='cuda:0') factor: tensor(0.1389, device='cuda:0')
reward_raw: tensor(0.4835, device='cuda:0') reward: tensor(1.3416, device='cuda:0') factor: tensor(0.1419, device='cuda:0')
reward_raw: tensor(0.4889, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.1398, device='cuda:0')


Epoch 1, Step 1788: Loss = -0.1049, Actor Loss = -0.1470, Critic Loss = 0.0843, Entropy = 0.0427, Returns = 1.3062, Value = 1.1662, mask_loss = 0.1234std_loss = 0.3015:  33%|███▎      | 1788/5475 [8:00:18<14:54:33, 14.56s/it]

ratio tensor(0.9884, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4945, device='cuda:0') reward: tensor(1.3666, device='cuda:0') factor: tensor(0.1279, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.3584, device='cuda:0') factor: tensor(0.1214, device='cuda:0')
reward_raw: tensor(0.4792, device='cuda:0') reward: tensor(1.3594, device='cuda:0') factor: tensor(0.1198, device='cuda:0')
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.3558, device='cuda:0') factor: tensor(0.1192, device='cuda:0')
reward_raw: tensor(0.4798, device='cuda:0') reward: tensor(1.3633, device='cuda:0') factor: tensor(0.1165, device='cuda:0')


Epoch 1, Step 1789: Loss = 0.0213, Actor Loss = -0.0345, Critic Loss = 0.1116, Entropy = 0.0329, Returns = 1.3249, Value = 1.4543, mask_loss = 0.1085std_loss = 0.2839:  33%|███▎      | 1789/5475 [8:00:39<16:58:02, 16.57s/it] 

ratio tensor(0.9264, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.4578, device='cuda:0') factor: tensor(0.0918, device='cuda:0')
reward_raw: tensor(0.5496, device='cuda:0') reward: tensor(1.4537, device='cuda:0') factor: tensor(0.0960, device='cuda:0')
reward_raw: tensor(0.5430, device='cuda:0') reward: tensor(1.4478, device='cuda:0') factor: tensor(0.0953, device='cuda:0')
reward_raw: tensor(0.5571, device='cuda:0') reward: tensor(1.4600, device='cuda:0') factor: tensor(0.0971, device='cuda:0')
reward_raw: tensor(0.5463, device='cuda:0') reward: tensor(1.4526, device='cuda:0') factor: tensor(0.0936, device='cuda:0')


Epoch 1, Step 1790: Loss = 0.0186, Actor Loss = -0.0025, Critic Loss = 0.0422, Entropy = 0.0278, Returns = 1.4188, Value = 1.4415, mask_loss = 0.0935std_loss = 0.2684:  33%|███▎      | 1790/5475 [8:00:55<16:36:15, 16.22s/it]

ratio tensor(0.9277, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4842, device='cuda:0') reward: tensor(1.3791, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.4504, device='cuda:0') reward: tensor(1.3480, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.4585, device='cuda:0') reward: tensor(1.3519, device='cuda:0') factor: tensor(0.1066, device='cuda:0')
reward_raw: tensor(0.4683, device='cuda:0') reward: tensor(1.3662, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.4744, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.1051, device='cuda:0')


Epoch 1, Step 1791: Loss = 0.0558, Actor Loss = -0.0035, Critic Loss = 0.1186, Entropy = 0.0327, Returns = 1.3981, Value = 1.3100, mask_loss = 0.0923std_loss = 0.2694:  33%|███▎      | 1791/5475 [8:01:08<15:45:52, 15.41s/it]

ratio tensor(0.9695, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4953, device='cuda:0') reward: tensor(1.4049, device='cuda:0') factor: tensor(0.0904, device='cuda:0')
reward_raw: tensor(0.5077, device='cuda:0') reward: tensor(1.4152, device='cuda:0') factor: tensor(0.0925, device='cuda:0')
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.4068, device='cuda:0') factor: tensor(0.0923, device='cuda:0')
reward_raw: tensor(0.5074, device='cuda:0') reward: tensor(1.4143, device='cuda:0') factor: tensor(0.0931, device='cuda:0')
reward_raw: tensor(0.4937, device='cuda:0') reward: tensor(1.4028, device='cuda:0') factor: tensor(0.0909, device='cuda:0')


Epoch 1, Step 1792: Loss = -0.0535, Actor Loss = -0.0758, Critic Loss = 0.0448, Entropy = 0.0266, Returns = 1.3290, Value = 1.3571, mask_loss = 0.0966std_loss = 0.2777:  33%|███▎      | 1792/5475 [8:01:20<14:34:51, 14.25s/it]

ratio tensor(1.1930, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4242, device='cuda:0') reward: tensor(1.3202, device='cuda:0') factor: tensor(0.1040, device='cuda:0')
reward_raw: tensor(0.4293, device='cuda:0') reward: tensor(1.3262, device='cuda:0') factor: tensor(0.1031, device='cuda:0')
reward_raw: tensor(0.4298, device='cuda:0') reward: tensor(1.3236, device='cuda:0') factor: tensor(0.1062, device='cuda:0')
reward_raw: tensor(0.4203, device='cuda:0') reward: tensor(1.3147, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.4151, device='cuda:0') reward: tensor(1.3077, device='cuda:0') factor: tensor(0.1074, device='cuda:0')


Epoch 1, Step 1793: Loss = 0.1969, Actor Loss = 0.1586, Critic Loss = 0.0768, Entropy = 0.0753, Returns = 1.2549, Value = 1.2874, mask_loss = 0.1659std_loss = 0.3029:  33%|███▎      | 1793/5475 [8:01:42<16:54:08, 16.53s/it]  

ratio tensor(1.0498, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4690, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.1033, device='cuda:0')
reward_raw: tensor(0.4854, device='cuda:0') reward: tensor(1.3851, device='cuda:0') factor: tensor(0.1003, device='cuda:0')
reward_raw: tensor(0.4549, device='cuda:0') reward: tensor(1.3506, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.4677, device='cuda:0') reward: tensor(1.3609, device='cuda:0') factor: tensor(0.1068, device='cuda:0')
reward_raw: tensor(0.4568, device='cuda:0') reward: tensor(1.3497, device='cuda:0') factor: tensor(0.1071, device='cuda:0')


Epoch 1, Step 1794: Loss = -0.0760, Actor Loss = -0.0977, Critic Loss = 0.0434, Entropy = 0.0406, Returns = 1.3767, Value = 1.3626, mask_loss = 0.1087std_loss = 0.2866:  33%|███▎      | 1794/5475 [8:01:59<17:15:43, 16.88s/it]

ratio tensor(1.2516, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4501, device='cuda:0') reward: tensor(1.3512, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.4696, device='cuda:0') reward: tensor(1.3747, device='cuda:0') factor: tensor(0.0949, device='cuda:0')
reward_raw: tensor(0.4651, device='cuda:0') reward: tensor(1.3692, device='cuda:0') factor: tensor(0.0959, device='cuda:0')
reward_raw: tensor(0.4479, device='cuda:0') reward: tensor(1.3567, device='cuda:0') factor: tensor(0.0912, device='cuda:0')
reward_raw: tensor(0.4445, device='cuda:0') reward: tensor(1.3493, device='cuda:0') factor: tensor(0.0952, device='cuda:0')


Epoch 1, Step 1795: Loss = 0.1509, Actor Loss = 0.1113, Critic Loss = 0.0792, Entropy = 0.0306, Returns = 1.2908, Value = 1.4052, mask_loss = 0.0903std_loss = 0.2686:  33%|███▎      | 1795/5475 [8:02:13<16:05:54, 15.75s/it]  

ratio tensor(0.8052, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.4453, device='cuda:0') factor: tensor(0.1139, device='cuda:0')
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.4461, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.5351, device='cuda:0') reward: tensor(1.4259, device='cuda:0') factor: tensor(0.1092, device='cuda:0')
reward_raw: tensor(0.5434, device='cuda:0') reward: tensor(1.4314, device='cuda:0') factor: tensor(0.1120, device='cuda:0')
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.4420, device='cuda:0') factor: tensor(0.1093, device='cuda:0')


Epoch 1, Step 1796: Loss = -0.1277, Actor Loss = -0.1555, Critic Loss = 0.0556, Entropy = 0.0288, Returns = 1.4976, Value = 1.4196, mask_loss = 0.0962std_loss = 0.2771:  33%|███▎      | 1796/5475 [8:02:26<15:18:39, 14.98s/it]

ratio tensor(0.9637, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4334, device='cuda:0') reward: tensor(1.3323, device='cuda:0') factor: tensor(0.1011, device='cuda:0')
reward_raw: tensor(0.4559, device='cuda:0') reward: tensor(1.3527, device='cuda:0') factor: tensor(0.1032, device='cuda:0')
reward_raw: tensor(0.4486, device='cuda:0') reward: tensor(1.3489, device='cuda:0') factor: tensor(0.0997, device='cuda:0')
reward_raw: tensor(0.4527, device='cuda:0') reward: tensor(1.3501, device='cuda:0') factor: tensor(0.1026, device='cuda:0')
reward_raw: tensor(0.4145, device='cuda:0') reward: tensor(1.3125, device='cuda:0') factor: tensor(0.1020, device='cuda:0')


Epoch 1, Step 1797: Loss = 0.1045, Actor Loss = 0.0750, Critic Loss = 0.0591, Entropy = 0.0569, Returns = 1.2915, Value = 1.2768, mask_loss = 0.1257std_loss = 0.2856:  33%|███▎      | 1797/5475 [8:02:43<15:57:40, 15.62s/it]  

ratio tensor(0.7800, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3966, device='cuda:0') reward: tensor(1.2930, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.3864, device='cuda:0') reward: tensor(1.2844, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.3895, device='cuda:0') reward: tensor(1.2826, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.3961, device='cuda:0') reward: tensor(1.2926, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.3760, device='cuda:0') reward: tensor(1.2770, device='cuda:0') factor: tensor(0.0990, device='cuda:0')


Epoch 1, Step 1798: Loss = 0.1901, Actor Loss = 0.1602, Critic Loss = 0.0600, Entropy = 0.0336, Returns = 1.2034, Value = 1.3220, mask_loss = 0.1047std_loss = 0.2849:  33%|███▎      | 1798/5475 [8:02:59<16:03:43, 15.73s/it]

ratio tensor(0.7933, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5835, device='cuda:0') reward: tensor(1.4746, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.5749, device='cuda:0') reward: tensor(1.4685, device='cuda:0') factor: tensor(0.1064, device='cuda:0')
reward_raw: tensor(0.5654, device='cuda:0') reward: tensor(1.4527, device='cuda:0') factor: tensor(0.1127, device='cuda:0')
reward_raw: tensor(0.5746, device='cuda:0') reward: tensor(1.4650, device='cuda:0') factor: tensor(0.1095, device='cuda:0')
reward_raw: tensor(0.5513, device='cuda:0') reward: tensor(1.4442, device='cuda:0') factor: tensor(0.1072, device='cuda:0')


Epoch 1, Step 1799: Loss = -0.1545, Actor Loss = -0.1789, Critic Loss = 0.0489, Entropy = 0.0467, Returns = 1.4473, Value = 1.5180, mask_loss = 0.1070std_loss = 0.2808:  33%|███▎      | 1799/5475 [8:03:14<15:54:07, 15.57s/it]

ratio tensor(5.9471, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5060, device='cuda:0') reward: tensor(1.3974, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.5170, device='cuda:0') reward: tensor(1.4111, device='cuda:0') factor: tensor(0.1059, device='cuda:0')
reward_raw: tensor(0.5103, device='cuda:0') reward: tensor(1.4018, device='cuda:0') factor: tensor(0.1084, device='cuda:0')
reward_raw: tensor(0.5090, device='cuda:0') reward: tensor(1.3982, device='cuda:0') factor: tensor(0.1108, device='cuda:0')
reward_raw: tensor(0.4909, device='cuda:0') reward: tensor(1.3868, device='cuda:0') factor: tensor(0.1042, device='cuda:0')


Epoch 1, Step 1800: Loss = 0.0485, Actor Loss = 0.0055, Critic Loss = 0.0862, Entropy = 0.0479, Returns = 1.3209, Value = 1.2911, mask_loss = 0.1155std_loss = 0.2945:  33%|███▎      | 1800/5475 [8:03:31<16:12:33, 15.88s/it]  

ratio tensor(0.9123, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4342, device='cuda:0') reward: tensor(1.3188, device='cuda:0') factor: tensor(0.1154, device='cuda:0')
reward_raw: tensor(0.4882, device='cuda:0') reward: tensor(1.3734, device='cuda:0') factor: tensor(0.1148, device='cuda:0')
reward_raw: tensor(0.4766, device='cuda:0') reward: tensor(1.3642, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.4604, device='cuda:0') reward: tensor(1.3514, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.4680, device='cuda:0') reward: tensor(1.3569, device='cuda:0') factor: tensor(0.1112, device='cuda:0')


Epoch 1, Step 1801: Loss = -0.0570, Actor Loss = -0.0813, Critic Loss = 0.0487, Entropy = 0.0404, Returns = 1.3139, Value = 1.3127, mask_loss = 0.1233std_loss = 0.3023:  33%|███▎      | 1800/5475 [8:03:45<16:12:33, 15.88s/it]

ratio tensor(0.8805, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1801: Loss = -0.0570, Actor Loss = -0.0813, Critic Loss = 0.0487, Entropy = 0.0404, Returns = 1.3139, Value = 1.3127, mask_loss = 0.1233std_loss = 0.3023:  33%|███▎      | 1801/5475 [8:03:45<15:46:28, 15.46s/it]

reward_raw: tensor(0.4880, device='cuda:0') reward: tensor(1.3748, device='cuda:0') factor: tensor(0.1132, device='cuda:0')
reward_raw: tensor(0.4864, device='cuda:0') reward: tensor(1.3751, device='cuda:0') factor: tensor(0.1113, device='cuda:0')
reward_raw: tensor(0.4905, device='cuda:0') reward: tensor(1.3757, device='cuda:0') factor: tensor(0.1148, device='cuda:0')
reward_raw: tensor(0.4793, device='cuda:0') reward: tensor(1.3626, device='cuda:0') factor: tensor(0.1167, device='cuda:0')
reward_raw: tensor(0.4651, device='cuda:0') reward: tensor(1.3538, device='cuda:0') factor: tensor(0.1113, device='cuda:0')


Epoch 1, Step 1802: Loss = 0.0224, Actor Loss = -0.0168, Critic Loss = 0.0785, Entropy = 0.0397, Returns = 1.4083, Value = 1.3376, mask_loss = 0.1252std_loss = 0.2987:  33%|███▎      | 1802/5475 [8:04:01<16:01:11, 15.70s/it] 

ratio tensor(1.4987, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4701, device='cuda:0') reward: tensor(1.3703, device='cuda:0') factor: tensor(0.0998, device='cuda:0')
reward_raw: tensor(0.4907, device='cuda:0') reward: tensor(1.3885, device='cuda:0') factor: tensor(0.1021, device='cuda:0')
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.3954, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.4680, device='cuda:0') reward: tensor(1.3618, device='cuda:0') factor: tensor(0.1062, device='cuda:0')
reward_raw: tensor(0.4825, device='cuda:0') reward: tensor(1.3776, device='cuda:0') factor: tensor(0.1048, device='cuda:0')


Epoch 1, Step 1803: Loss = 0.0451, Actor Loss = 0.0043, Critic Loss = 0.0817, Entropy = 0.0341, Returns = 1.3541, Value = 1.4418, mask_loss = 0.0936std_loss = 0.2694:  33%|███▎      | 1803/5475 [8:04:17<16:05:55, 15.78s/it] 

ratio tensor(1.4227, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5713, device='cuda:0') reward: tensor(1.4757, device='cuda:0') factor: tensor(0.0956, device='cuda:0')
reward_raw: tensor(0.5908, device='cuda:0') reward: tensor(1.4953, device='cuda:0') factor: tensor(0.0955, device='cuda:0')
reward_raw: tensor(0.5662, device='cuda:0') reward: tensor(1.4735, device='cuda:0') factor: tensor(0.0927, device='cuda:0')
reward_raw: tensor(0.5749, device='cuda:0') reward: tensor(1.4815, device='cuda:0') factor: tensor(0.0934, device='cuda:0')
reward_raw: tensor(0.5823, device='cuda:0') reward: tensor(1.4857, device='cuda:0') factor: tensor(0.0967, device='cuda:0')


Epoch 1, Step 1804: Loss = -0.0671, Actor Loss = -0.0854, Critic Loss = 0.0366, Entropy = 0.0299, Returns = 1.4639, Value = 1.4714, mask_loss = 0.1002std_loss = 0.2733:  33%|███▎      | 1804/5475 [8:04:40<18:19:45, 17.97s/it]

ratio tensor(1.1325, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5485, device='cuda:0') reward: tensor(1.4369, device='cuda:0') factor: tensor(0.1115, device='cuda:0')
reward_raw: tensor(0.5670, device='cuda:0') reward: tensor(1.4523, device='cuda:0') factor: tensor(0.1146, device='cuda:0')
reward_raw: tensor(0.5767, device='cuda:0') reward: tensor(1.4622, device='cuda:0') factor: tensor(0.1145, device='cuda:0')
reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(1.4574, device='cuda:0') factor: tensor(0.1148, device='cuda:0')
reward_raw: tensor(0.5533, device='cuda:0') reward: tensor(1.4419, device='cuda:0') factor: tensor(0.1114, device='cuda:0')


Epoch 1, Step 1805: Loss = -0.0276, Actor Loss = -0.0621, Critic Loss = 0.0691, Entropy = 0.0441, Returns = 1.5451, Value = 1.4469, mask_loss = 0.1321std_loss = 0.3082:  33%|███▎      | 1805/5475 [8:04:58<18:03:19, 17.71s/it]

ratio tensor(1.1741, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4809, device='cuda:0') reward: tensor(1.3551, device='cuda:0') factor: tensor(0.1258, device='cuda:0')
reward_raw: tensor(0.4870, device='cuda:0') reward: tensor(1.3641, device='cuda:0') factor: tensor(0.1229, device='cuda:0')
reward_raw: tensor(0.4936, device='cuda:0') reward: tensor(1.3686, device='cuda:0') factor: tensor(0.1250, device='cuda:0')
reward_raw: tensor(0.4940, device='cuda:0') reward: tensor(1.3683, device='cuda:0') factor: tensor(0.1257, device='cuda:0')
reward_raw: tensor(0.4905, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.1226, device='cuda:0')


Epoch 1, Step 1806: Loss = 0.0285, Actor Loss = -0.0037, Critic Loss = 0.0646, Entropy = 0.0274, Returns = 1.3686, Value = 1.3570, mask_loss = 0.0962std_loss = 0.2766:  33%|███▎      | 1806/5475 [8:05:21<19:45:05, 19.38s/it] 

ratio tensor(1.1715, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4848, device='cuda:0') reward: tensor(1.3849, device='cuda:0') factor: tensor(0.1000, device='cuda:0')
reward_raw: tensor(0.4730, device='cuda:0') reward: tensor(1.3740, device='cuda:0') factor: tensor(0.0990, device='cuda:0')
reward_raw: tensor(0.4697, device='cuda:0') reward: tensor(1.3709, device='cuda:0') factor: tensor(0.0988, device='cuda:0')
reward_raw: tensor(0.4736, device='cuda:0') reward: tensor(1.3760, device='cuda:0') factor: tensor(0.0976, device='cuda:0')
reward_raw: tensor(0.4708, device='cuda:0') reward: tensor(1.3713, device='cuda:0') factor: tensor(0.0995, device='cuda:0')


Epoch 1, Step 1807: Loss = -0.0114, Actor Loss = -0.0210, Critic Loss = 0.0191, Entropy = 0.0539, Returns = 1.3573, Value = 1.3412, mask_loss = 0.1127std_loss = 0.2875:  33%|███▎      | 1807/5475 [8:05:38<19:08:59, 18.79s/it]

ratio tensor(0.9149, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4756, device='cuda:0') reward: tensor(1.3648, device='cuda:0') factor: tensor(0.1108, device='cuda:0')
reward_raw: tensor(0.4802, device='cuda:0') reward: tensor(1.3679, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.4687, device='cuda:0') reward: tensor(1.3564, device='cuda:0') factor: tensor(0.1123, device='cuda:0')
reward_raw: tensor(0.4721, device='cuda:0') reward: tensor(1.3606, device='cuda:0') factor: tensor(0.1115, device='cuda:0')
reward_raw: tensor(0.4896, device='cuda:0') reward: tensor(1.3766, device='cuda:0') factor: tensor(0.1130, device='cuda:0')


Epoch 1, Step 1808: Loss = -0.0410, Actor Loss = -0.0716, Critic Loss = 0.0614, Entropy = 0.0434, Returns = 1.4113, Value = 1.3537, mask_loss = 0.0991std_loss = 0.2730:  33%|███▎      | 1808/5475 [8:05:54<18:17:15, 17.95s/it]

ratio tensor(1.0657, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5685, device='cuda:0') reward: tensor(1.4640, device='cuda:0') factor: tensor(0.1044, device='cuda:0')
reward_raw: tensor(0.5546, device='cuda:0') reward: tensor(1.4509, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.5589, device='cuda:0') reward: tensor(1.4557, device='cuda:0') factor: tensor(0.1032, device='cuda:0')
reward_raw: tensor(0.5588, device='cuda:0') reward: tensor(1.4530, device='cuda:0') factor: tensor(0.1058, device='cuda:0')
reward_raw: tensor(0.5624, device='cuda:0') reward: tensor(1.4594, device='cuda:0') factor: tensor(0.1030, device='cuda:0')


Epoch 1, Step 1809: Loss = 0.0417, Actor Loss = 0.0157, Critic Loss = 0.0522, Entropy = 0.0312, Returns = 1.4715, Value = 1.4668, mask_loss = 0.1077std_loss = 0.2884:  33%|███▎      | 1809/5475 [8:06:08<17:03:06, 16.74s/it]  

ratio tensor(0.9925, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6092, device='cuda:0') reward: tensor(1.4871, device='cuda:0') factor: tensor(0.1220, device='cuda:0')
reward_raw: tensor(0.5814, device='cuda:0') reward: tensor(1.4598, device='cuda:0') factor: tensor(0.1216, device='cuda:0')
reward_raw: tensor(0.5968, device='cuda:0') reward: tensor(1.4744, device='cuda:0') factor: tensor(0.1224, device='cuda:0')
reward_raw: tensor(0.5977, device='cuda:0') reward: tensor(1.4750, device='cuda:0') factor: tensor(0.1227, device='cuda:0')
reward_raw: tensor(0.5922, device='cuda:0') reward: tensor(1.4722, device='cuda:0') factor: tensor(0.1200, device='cuda:0')


Epoch 1, Step 1810: Loss = 0.0500, Actor Loss = 0.0200, Critic Loss = 0.0601, Entropy = 0.0364, Returns = 1.4631, Value = 1.5477, mask_loss = 0.1352std_loss = 0.3201:  33%|███▎      | 1810/5475 [8:06:22<16:18:08, 16.01s/it]

ratio tensor(0.9578, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5311, device='cuda:0') reward: tensor(1.4193, device='cuda:0') factor: tensor(0.1117, device='cuda:0')
reward_raw: tensor(0.5129, device='cuda:0') reward: tensor(1.3937, device='cuda:0') factor: tensor(0.1193, device='cuda:0')
reward_raw: tensor(0.5156, device='cuda:0') reward: tensor(1.4015, device='cuda:0') factor: tensor(0.1141, device='cuda:0')
reward_raw: tensor(0.5235, device='cuda:0') reward: tensor(1.4118, device='cuda:0') factor: tensor(0.1117, device='cuda:0')
reward_raw: tensor(0.4936, device='cuda:0') reward: tensor(1.3812, device='cuda:0') factor: tensor(0.1124, device='cuda:0')


Epoch 1, Step 1811: Loss = -0.0092, Actor Loss = -0.0559, Critic Loss = 0.0934, Entropy = 0.0448, Returns = 1.4265, Value = 1.3145, mask_loss = 0.1104std_loss = 0.2900:  33%|███▎      | 1811/5475 [8:06:45<18:17:52, 17.98s/it]

ratio tensor(1.5416, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4363, device='cuda:0') reward: tensor(1.3243, device='cuda:0') factor: tensor(0.1121, device='cuda:0')
reward_raw: tensor(0.4518, device='cuda:0') reward: tensor(1.3373, device='cuda:0') factor: tensor(0.1145, device='cuda:0')
reward_raw: tensor(0.4371, device='cuda:0') reward: tensor(1.3246, device='cuda:0') factor: tensor(0.1125, device='cuda:0')
reward_raw: tensor(0.4336, device='cuda:0') reward: tensor(1.3205, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.4569, device='cuda:0') reward: tensor(1.3468, device='cuda:0') factor: tensor(0.1101, device='cuda:0')


Epoch 1, Step 1812: Loss = 0.2364, Actor Loss = 0.1967, Critic Loss = 0.0795, Entropy = 0.0421, Returns = 1.2660, Value = 1.2536, mask_loss = 0.1131std_loss = 0.2868:  33%|███▎      | 1812/5475 [8:06:58<16:41:54, 16.41s/it]  

ratio tensor(0.7516, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4871, device='cuda:0') reward: tensor(1.4065, device='cuda:0') factor: tensor(0.0806, device='cuda:0')
reward_raw: tensor(0.4937, device='cuda:0') reward: tensor(1.4098, device='cuda:0') factor: tensor(0.0839, device='cuda:0')
reward_raw: tensor(0.4921, device='cuda:0') reward: tensor(1.4095, device='cuda:0') factor: tensor(0.0826, device='cuda:0')
reward_raw: tensor(0.4745, device='cuda:0') reward: tensor(1.3896, device='cuda:0') factor: tensor(0.0849, device='cuda:0')
reward_raw: tensor(0.4823, device='cuda:0') reward: tensor(1.3977, device='cuda:0') factor: tensor(0.0847, device='cuda:0')


Epoch 1, Step 1813: Loss = -0.3005, Actor Loss = -0.3237, Critic Loss = 0.0464, Entropy = 0.0265, Returns = 1.4085, Value = 1.4267, mask_loss = 0.0889std_loss = 0.2640:  33%|███▎      | 1813/5475 [8:07:11<15:49:26, 15.56s/it]

ratio tensor(1.2272, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4782, device='cuda:0') reward: tensor(1.3697, device='cuda:0') factor: tensor(0.1085, device='cuda:0')
reward_raw: tensor(0.4880, device='cuda:0') reward: tensor(1.3804, device='cuda:0') factor: tensor(0.1076, device='cuda:0')
reward_raw: tensor(0.4612, device='cuda:0') reward: tensor(1.3535, device='cuda:0') factor: tensor(0.1077, device='cuda:0')
reward_raw: tensor(0.4423, device='cuda:0') reward: tensor(1.3377, device='cuda:0') factor: tensor(0.1046, device='cuda:0')
reward_raw: tensor(0.4565, device='cuda:0') reward: tensor(1.3467, device='cuda:0') factor: tensor(0.1099, device='cuda:0')


Epoch 1, Step 1814: Loss = 0.1653, Actor Loss = 0.1096, Critic Loss = 0.1114, Entropy = 0.0357, Returns = 1.3993, Value = 1.3666, mask_loss = 0.1086std_loss = 0.2893:  33%|███▎      | 1814/5475 [8:07:26<15:32:04, 15.28s/it]  

ratio tensor(1.2091, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.3919, device='cuda:0') factor: tensor(0.1278, device='cuda:0')
reward_raw: tensor(0.5046, device='cuda:0') reward: tensor(1.3770, device='cuda:0') factor: tensor(0.1275, device='cuda:0')
reward_raw: tensor(0.5020, device='cuda:0') reward: tensor(1.3749, device='cuda:0') factor: tensor(0.1271, device='cuda:0')
reward_raw: tensor(0.5225, device='cuda:0') reward: tensor(1.3977, device='cuda:0') factor: tensor(0.1247, device='cuda:0')
reward_raw: tensor(0.4911, device='cuda:0') reward: tensor(1.3611, device='cuda:0') factor: tensor(0.1300, device='cuda:0')


Epoch 1, Step 1815: Loss = 0.0027, Actor Loss = -0.0154, Critic Loss = 0.0363, Entropy = 0.0421, Returns = 1.3945, Value = 1.3479, mask_loss = 0.1261std_loss = 0.3076:  33%|███▎      | 1815/5475 [8:07:38<14:29:41, 14.26s/it]

ratio tensor(1.2595, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4507, device='cuda:0') reward: tensor(1.3464, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.4495, device='cuda:0') reward: tensor(1.3443, device='cuda:0') factor: tensor(0.1053, device='cuda:0')
reward_raw: tensor(0.4424, device='cuda:0') reward: tensor(1.3397, device='cuda:0') factor: tensor(0.1027, device='cuda:0')
reward_raw: tensor(0.4610, device='cuda:0') reward: tensor(1.3562, device='cuda:0') factor: tensor(0.1048, device='cuda:0')
reward_raw: tensor(0.4530, device='cuda:0') reward: tensor(1.3498, device='cuda:0') factor: tensor(0.1032, device='cuda:0')


Epoch 1, Step 1816: Loss = -0.0045, Actor Loss = -0.0364, Critic Loss = 0.0639, Entropy = 0.0291, Returns = 1.2922, Value = 1.4306, mask_loss = 0.1073std_loss = 0.2921:  33%|███▎      | 1816/5475 [8:07:51<14:16:26, 14.04s/it]

ratio tensor(0.9732, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4785, device='cuda:0') reward: tensor(1.3851, device='cuda:0') factor: tensor(0.0934, device='cuda:0')
reward_raw: tensor(0.4607, device='cuda:0') reward: tensor(1.3678, device='cuda:0') factor: tensor(0.0929, device='cuda:0')
reward_raw: tensor(0.4521, device='cuda:0') reward: tensor(1.3602, device='cuda:0') factor: tensor(0.0919, device='cuda:0')
reward_raw: tensor(0.4509, device='cuda:0') reward: tensor(1.3592, device='cuda:0') factor: tensor(0.0917, device='cuda:0')
reward_raw: tensor(0.4750, device='cuda:0') reward: tensor(1.3813, device='cuda:0') factor: tensor(0.0937, device='cuda:0')


Epoch 1, Step 1817: Loss = 0.0635, Actor Loss = 0.0151, Critic Loss = 0.0969, Entropy = 0.0184, Returns = 1.4163, Value = 1.3556, mask_loss = 0.0878std_loss = 0.2599:  33%|███▎      | 1817/5475 [8:08:09<15:15:40, 15.02s/it]  

ratio tensor(0.9279, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5067, device='cuda:0') reward: tensor(1.4015, device='cuda:0') factor: tensor(0.1052, device='cuda:0')
reward_raw: tensor(0.4808, device='cuda:0') reward: tensor(1.3773, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.4955, device='cuda:0') reward: tensor(1.3902, device='cuda:0') factor: tensor(0.1053, device='cuda:0')
reward_raw: tensor(0.4907, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.3880, device='cuda:0') factor: tensor(0.1038, device='cuda:0')


Epoch 1, Step 1818: Loss = -0.0313, Actor Loss = -0.0606, Critic Loss = 0.0586, Entropy = 0.0348, Returns = 1.3988, Value = 1.4079, mask_loss = 0.0991std_loss = 0.2763:  33%|███▎      | 1818/5475 [8:08:24<15:24:49, 15.17s/it]

ratio tensor(1.1009, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4761, device='cuda:0') reward: tensor(1.3786, device='cuda:0') factor: tensor(0.0975, device='cuda:0')
reward_raw: tensor(0.4867, device='cuda:0') reward: tensor(1.3916, device='cuda:0') factor: tensor(0.0952, device='cuda:0')
reward_raw: tensor(0.4776, device='cuda:0') reward: tensor(1.3810, device='cuda:0') factor: tensor(0.0966, device='cuda:0')
reward_raw: tensor(0.4873, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.1009, device='cuda:0')
reward_raw: tensor(0.4740, device='cuda:0') reward: tensor(1.3737, device='cuda:0') factor: tensor(0.1003, device='cuda:0')


Epoch 1, Step 1819: Loss = 0.0259, Actor Loss = 0.0052, Critic Loss = 0.0414, Entropy = 0.0346, Returns = 1.3470, Value = 1.3719, mask_loss = 0.0963std_loss = 0.2743:  33%|███▎      | 1819/5475 [8:08:38<15:03:40, 14.83s/it]  

ratio tensor(1.0110, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5542, device='cuda:0') reward: tensor(1.4479, device='cuda:0') factor: tensor(0.1063, device='cuda:0')
reward_raw: tensor(0.5728, device='cuda:0') reward: tensor(1.4676, device='cuda:0') factor: tensor(0.1052, device='cuda:0')
reward_raw: tensor(0.5805, device='cuda:0') reward: tensor(1.4738, device='cuda:0') factor: tensor(0.1067, device='cuda:0')
reward_raw: tensor(0.5607, device='cuda:0') reward: tensor(1.4545, device='cuda:0') factor: tensor(0.1062, device='cuda:0')
reward_raw: tensor(0.5676, device='cuda:0') reward: tensor(1.4596, device='cuda:0') factor: tensor(0.1080, device='cuda:0')


Epoch 1, Step 1820: Loss = 0.0456, Actor Loss = 0.0077, Critic Loss = 0.0757, Entropy = 0.0268, Returns = 1.4735, Value = 1.5360, mask_loss = 0.0912std_loss = 0.2680:  33%|███▎      | 1820/5475 [8:08:53<15:09:38, 14.93s/it]

ratio tensor(1.0818, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5073, device='cuda:0') reward: tensor(1.4143, device='cuda:0') factor: tensor(0.0930, device='cuda:0')
reward_raw: tensor(0.5152, device='cuda:0') reward: tensor(1.4252, device='cuda:0') factor: tensor(0.0900, device='cuda:0')
reward_raw: tensor(0.4952, device='cuda:0') reward: tensor(1.4020, device='cuda:0') factor: tensor(0.0932, device='cuda:0')
reward_raw: tensor(0.4934, device='cuda:0') reward: tensor(1.4016, device='cuda:0') factor: tensor(0.0919, device='cuda:0')
reward_raw: tensor(0.5124, device='cuda:0') reward: tensor(1.4172, device='cuda:0') factor: tensor(0.0952, device='cuda:0')


Epoch 1, Step 1821: Loss = -0.0032, Actor Loss = -0.0298, Critic Loss = 0.0532, Entropy = 0.0364, Returns = 1.4110, Value = 1.3841, mask_loss = 0.0795std_loss = 0.2462:  33%|███▎      | 1820/5475 [8:09:08<15:09:38, 14.93s/it]

ratio tensor(1.5245, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1821: Loss = -0.0032, Actor Loss = -0.0298, Critic Loss = 0.0532, Entropy = 0.0364, Returns = 1.4110, Value = 1.3841, mask_loss = 0.0795std_loss = 0.2462:  33%|███▎      | 1821/5475 [8:09:08<15:10:28, 14.95s/it]

reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.3852, device='cuda:0') factor: tensor(0.1067, device='cuda:0')
reward_raw: tensor(0.5058, device='cuda:0') reward: tensor(1.4017, device='cuda:0') factor: tensor(0.1041, device='cuda:0')
reward_raw: tensor(0.4674, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.1054, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.3947, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.1074, device='cuda:0')


Epoch 1, Step 1822: Loss = 0.1307, Actor Loss = 0.1042, Critic Loss = 0.0531, Entropy = 0.0266, Returns = 1.3428, Value = 1.4014, mask_loss = 0.0980std_loss = 0.2773:  33%|███▎      | 1822/5475 [8:09:23<14:58:46, 14.76s/it]  

ratio tensor(0.9876, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5489, device='cuda:0') reward: tensor(1.4569, device='cuda:0') factor: tensor(0.0920, device='cuda:0')
reward_raw: tensor(0.5286, device='cuda:0') reward: tensor(1.4381, device='cuda:0') factor: tensor(0.0905, device='cuda:0')
reward_raw: tensor(0.5554, device='cuda:0') reward: tensor(1.4681, device='cuda:0') factor: tensor(0.0873, device='cuda:0')
reward_raw: tensor(0.5472, device='cuda:0') reward: tensor(1.4592, device='cuda:0') factor: tensor(0.0880, device='cuda:0')
reward_raw: tensor(0.5388, device='cuda:0') reward: tensor(1.4491, device='cuda:0') factor: tensor(0.0897, device='cuda:0')


Epoch 1, Step 1823: Loss = -0.0806, Actor Loss = -0.0973, Critic Loss = 0.0336, Entropy = 0.0323, Returns = 1.4839, Value = 1.4740, mask_loss = 0.1059std_loss = 0.2877:  33%|███▎      | 1823/5475 [8:09:40<15:42:25, 15.48s/it]

ratio tensor(1.1490, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5153, device='cuda:0') reward: tensor(1.4102, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.5110, device='cuda:0') reward: tensor(1.4023, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.5253, device='cuda:0') reward: tensor(1.4159, device='cuda:0') factor: tensor(0.1094, device='cuda:0')
reward_raw: tensor(0.5337, device='cuda:0') reward: tensor(1.4271, device='cuda:0') factor: tensor(0.1065, device='cuda:0')
reward_raw: tensor(0.5286, device='cuda:0') reward: tensor(1.4269, device='cuda:0') factor: tensor(0.1018, device='cuda:0')


Epoch 1, Step 1824: Loss = 0.1067, Actor Loss = 0.0761, Critic Loss = 0.0612, Entropy = 0.0410, Returns = 1.3688, Value = 1.3395, mask_loss = 0.1094std_loss = 0.2748:  33%|███▎      | 1824/5475 [8:09:53<15:01:42, 14.82s/it]  

ratio tensor(0.8530, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5286, device='cuda:0') reward: tensor(1.4197, device='cuda:0') factor: tensor(0.1089, device='cuda:0')
reward_raw: tensor(0.5093, device='cuda:0') reward: tensor(1.4025, device='cuda:0') factor: tensor(0.1068, device='cuda:0')
reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.4077, device='cuda:0') factor: tensor(0.1086, device='cuda:0')
reward_raw: tensor(0.4925, device='cuda:0') reward: tensor(1.3837, device='cuda:0') factor: tensor(0.1088, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.3980, device='cuda:0') factor: tensor(0.1100, device='cuda:0')


Epoch 1, Step 1825: Loss = -0.1110, Actor Loss = -0.1472, Critic Loss = 0.0726, Entropy = 0.0301, Returns = 1.3984, Value = 1.4375, mask_loss = 0.1040std_loss = 0.2795:  33%|███▎      | 1825/5475 [8:10:07<14:49:43, 14.63s/it]

ratio tensor(0.8928, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4975, device='cuda:0') reward: tensor(1.3981, device='cuda:0') factor: tensor(0.0994, device='cuda:0')
reward_raw: tensor(0.4727, device='cuda:0') reward: tensor(1.3725, device='cuda:0') factor: tensor(0.1002, device='cuda:0')
reward_raw: tensor(0.4653, device='cuda:0') reward: tensor(1.3671, device='cuda:0') factor: tensor(0.0982, device='cuda:0')
reward_raw: tensor(0.4858, device='cuda:0') reward: tensor(1.3872, device='cuda:0') factor: tensor(0.0986, device='cuda:0')
reward_raw: tensor(0.4841, device='cuda:0') reward: tensor(1.3855, device='cuda:0') factor: tensor(0.0986, device='cuda:0')


Epoch 1, Step 1826: Loss = 0.2246, Actor Loss = 0.2087, Critic Loss = 0.0318, Entropy = 0.0307, Returns = 1.3662, Value = 1.3331, mask_loss = 0.1036std_loss = 0.2829:  33%|███▎      | 1826/5475 [8:10:20<14:21:19, 14.16s/it]  

ratio tensor(0.7953, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5671, device='cuda:0') reward: tensor(1.4424, device='cuda:0') factor: tensor(0.1247, device='cuda:0')
reward_raw: tensor(0.5661, device='cuda:0') reward: tensor(1.4413, device='cuda:0') factor: tensor(0.1248, device='cuda:0')
reward_raw: tensor(0.5709, device='cuda:0') reward: tensor(1.4447, device='cuda:0') factor: tensor(0.1262, device='cuda:0')
reward_raw: tensor(0.5532, device='cuda:0') reward: tensor(1.4285, device='cuda:0') factor: tensor(0.1247, device='cuda:0')
reward_raw: tensor(0.5519, device='cuda:0') reward: tensor(1.4289, device='cuda:0') factor: tensor(0.1230, device='cuda:0')


Epoch 1, Step 1827: Loss = -0.0193, Actor Loss = -0.0549, Critic Loss = 0.0711, Entropy = 0.0421, Returns = 1.5161, Value = 1.5208, mask_loss = 0.1327std_loss = 0.3166:  33%|███▎      | 1827/5475 [8:10:33<13:56:58, 13.77s/it]

ratio tensor(0.8803, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4507, device='cuda:0') reward: tensor(1.3335, device='cuda:0') factor: tensor(0.1171, device='cuda:0')
reward_raw: tensor(0.4476, device='cuda:0') reward: tensor(1.3273, device='cuda:0') factor: tensor(0.1203, device='cuda:0')
reward_raw: tensor(0.4612, device='cuda:0') reward: tensor(1.3429, device='cuda:0') factor: tensor(0.1183, device='cuda:0')
reward_raw: tensor(0.4452, device='cuda:0') reward: tensor(1.3249, device='cuda:0') factor: tensor(0.1203, device='cuda:0')
reward_raw: tensor(0.4435, device='cuda:0') reward: tensor(1.3278, device='cuda:0') factor: tensor(0.1157, device='cuda:0')


Epoch 1, Step 1828: Loss = 0.0902, Actor Loss = 0.0356, Critic Loss = 0.1093, Entropy = 0.0308, Returns = 1.4112, Value = 1.4515, mask_loss = 0.1059std_loss = 0.2906:  33%|███▎      | 1828/5475 [8:10:46<13:40:20, 13.50s/it]  

ratio tensor(0.8449, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.3892, device='cuda:0') factor: tensor(0.1071, device='cuda:0')
reward_raw: tensor(0.5090, device='cuda:0') reward: tensor(1.4049, device='cuda:0') factor: tensor(0.1041, device='cuda:0')
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.3854, device='cuda:0') factor: tensor(0.1064, device='cuda:0')
reward_raw: tensor(0.5212, device='cuda:0') reward: tensor(1.4148, device='cuda:0') factor: tensor(0.1064, device='cuda:0')
reward_raw: tensor(0.4824, device='cuda:0') reward: tensor(1.3747, device='cuda:0') factor: tensor(0.1077, device='cuda:0')


Epoch 1, Step 1829: Loss = 0.0824, Actor Loss = 0.0422, Critic Loss = 0.0804, Entropy = 0.0210, Returns = 1.4143, Value = 1.4016, mask_loss = 0.0824std_loss = 0.2593:  33%|███▎      | 1829/5475 [8:11:10<16:43:04, 16.51s/it]

ratio tensor(0.7081, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5017, device='cuda:0') reward: tensor(1.4058, device='cuda:0') factor: tensor(0.0959, device='cuda:0')
reward_raw: tensor(0.5043, device='cuda:0') reward: tensor(1.4071, device='cuda:0') factor: tensor(0.0972, device='cuda:0')
reward_raw: tensor(0.5021, device='cuda:0') reward: tensor(1.4053, device='cuda:0') factor: tensor(0.0968, device='cuda:0')
reward_raw: tensor(0.5067, device='cuda:0') reward: tensor(1.4073, device='cuda:0') factor: tensor(0.0994, device='cuda:0')
reward_raw: tensor(0.5156, device='cuda:0') reward: tensor(1.4193, device='cuda:0') factor: tensor(0.0963, device='cuda:0')


Epoch 1, Step 1830: Loss = 0.0084, Actor Loss = -0.0203, Critic Loss = 0.0574, Entropy = 0.0262, Returns = 1.4746, Value = 1.4579, mask_loss = 0.1001std_loss = 0.2826:  33%|███▎      | 1830/5475 [8:11:26<16:31:16, 16.32s/it]

ratio tensor(0.8456, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4678, device='cuda:0') reward: tensor(1.3702, device='cuda:0') factor: tensor(0.0976, device='cuda:0')
reward_raw: tensor(0.4759, device='cuda:0') reward: tensor(1.3793, device='cuda:0') factor: tensor(0.0967, device='cuda:0')
reward_raw: tensor(0.4704, device='cuda:0') reward: tensor(1.3745, device='cuda:0') factor: tensor(0.0959, device='cuda:0')
reward_raw: tensor(0.4757, device='cuda:0') reward: tensor(1.3783, device='cuda:0') factor: tensor(0.0974, device='cuda:0')
reward_raw: tensor(0.4690, device='cuda:0') reward: tensor(1.3737, device='cuda:0') factor: tensor(0.0953, device='cuda:0')


Epoch 1, Step 1831: Loss = 0.1446, Actor Loss = 0.1176, Critic Loss = 0.0540, Entropy = 0.0257, Returns = 1.2713, Value = 1.4308, mask_loss = 0.1018std_loss = 0.2815:  33%|███▎      | 1831/5475 [8:11:41<16:24:33, 16.21s/it] 

ratio tensor(0.7651, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5264, device='cuda:0') reward: tensor(1.4325, device='cuda:0') factor: tensor(0.0939, device='cuda:0')
reward_raw: tensor(0.5239, device='cuda:0') reward: tensor(1.4326, device='cuda:0') factor: tensor(0.0914, device='cuda:0')
reward_raw: tensor(0.5284, device='cuda:0') reward: tensor(1.4326, device='cuda:0') factor: tensor(0.0958, device='cuda:0')
reward_raw: tensor(0.5167, device='cuda:0') reward: tensor(1.4236, device='cuda:0') factor: tensor(0.0932, device='cuda:0')
reward_raw: tensor(0.4831, device='cuda:0') reward: tensor(1.3917, device='cuda:0') factor: tensor(0.0914, device='cuda:0')


Epoch 1, Step 1832: Loss = 0.0097, Actor Loss = -0.0120, Critic Loss = 0.0435, Entropy = 0.0411, Returns = 1.4701, Value = 1.4005, mask_loss = 0.1205std_loss = 0.2915:  33%|███▎      | 1832/5475 [8:11:59<16:54:44, 16.71s/it]

ratio tensor(0.7700, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4312, device='cuda:0') reward: tensor(1.3414, device='cuda:0') factor: tensor(0.0898, device='cuda:0')
reward_raw: tensor(0.4627, device='cuda:0') reward: tensor(1.3678, device='cuda:0') factor: tensor(0.0948, device='cuda:0')
reward_raw: tensor(0.4484, device='cuda:0') reward: tensor(1.3559, device='cuda:0') factor: tensor(0.0925, device='cuda:0')
reward_raw: tensor(0.4519, device='cuda:0') reward: tensor(1.3566, device='cuda:0') factor: tensor(0.0953, device='cuda:0')
reward_raw: tensor(0.4486, device='cuda:0') reward: tensor(1.3563, device='cuda:0') factor: tensor(0.0923, device='cuda:0')


Epoch 1, Step 1833: Loss = -0.0486, Actor Loss = -0.0853, Critic Loss = 0.0734, Entropy = 0.0215, Returns = 1.3788, Value = 1.2960, mask_loss = 0.0808std_loss = 0.2569:  33%|███▎      | 1833/5475 [8:12:15<16:36:16, 16.41s/it]

ratio tensor(1.2027, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4865, device='cuda:0') reward: tensor(1.3983, device='cuda:0') factor: tensor(0.0882, device='cuda:0')
reward_raw: tensor(0.4680, device='cuda:0') reward: tensor(1.3858, device='cuda:0') factor: tensor(0.0822, device='cuda:0')
reward_raw: tensor(0.4677, device='cuda:0') reward: tensor(1.3845, device='cuda:0') factor: tensor(0.0832, device='cuda:0')
reward_raw: tensor(0.4711, device='cuda:0') reward: tensor(1.3849, device='cuda:0') factor: tensor(0.0862, device='cuda:0')
reward_raw: tensor(0.5025, device='cuda:0') reward: tensor(1.4160, device='cuda:0') factor: tensor(0.0865, device='cuda:0')


Epoch 1, Step 1834: Loss = 0.1578, Actor Loss = 0.1351, Critic Loss = 0.0456, Entropy = 0.0229, Returns = 1.3578, Value = 1.4991, mask_loss = 0.0939std_loss = 0.2692:  33%|███▎      | 1834/5475 [8:12:33<17:05:31, 16.90s/it]  

ratio tensor(0.7840, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4938, device='cuda:0') reward: tensor(1.3850, device='cuda:0') factor: tensor(0.1089, device='cuda:0')
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.3952, device='cuda:0') factor: tensor(0.1050, device='cuda:0')
reward_raw: tensor(0.5156, device='cuda:0') reward: tensor(1.4052, device='cuda:0') factor: tensor(0.1104, device='cuda:0')
reward_raw: tensor(0.4956, device='cuda:0') reward: tensor(1.3885, device='cuda:0') factor: tensor(0.1072, device='cuda:0')
reward_raw: tensor(0.4862, device='cuda:0') reward: tensor(1.3737, device='cuda:0') factor: tensor(0.1126, device='cuda:0')


Epoch 1, Step 1835: Loss = -0.0183, Actor Loss = -0.0585, Critic Loss = 0.0805, Entropy = 0.0379, Returns = 1.3577, Value = 1.3142, mask_loss = 0.1068std_loss = 0.2797:  34%|███▎      | 1835/5475 [8:12:54<18:12:41, 18.01s/it]

ratio tensor(0.9215, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5014, device='cuda:0') reward: tensor(1.3921, device='cuda:0') factor: tensor(0.1093, device='cuda:0')
reward_raw: tensor(0.4806, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.1063, device='cuda:0')
reward_raw: tensor(0.5063, device='cuda:0') reward: tensor(1.3997, device='cuda:0') factor: tensor(0.1066, device='cuda:0')
reward_raw: tensor(0.4959, device='cuda:0') reward: tensor(1.3913, device='cuda:0') factor: tensor(0.1045, device='cuda:0')
reward_raw: tensor(0.4895, device='cuda:0') reward: tensor(1.3781, device='cuda:0') factor: tensor(0.1114, device='cuda:0')


Epoch 1, Step 1836: Loss = 0.0143, Actor Loss = -0.0303, Critic Loss = 0.0891, Entropy = 0.0324, Returns = 1.4100, Value = 1.3537, mask_loss = 0.1017std_loss = 0.2806:  34%|███▎      | 1836/5475 [8:13:10<17:45:24, 17.57s/it] 

ratio tensor(0.9171, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5475, device='cuda:0') reward: tensor(1.4586, device='cuda:0') factor: tensor(0.0889, device='cuda:0')
reward_raw: tensor(0.5140, device='cuda:0') reward: tensor(1.4219, device='cuda:0') factor: tensor(0.0921, device='cuda:0')
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.4424, device='cuda:0') factor: tensor(0.0867, device='cuda:0')
reward_raw: tensor(0.5585, device='cuda:0') reward: tensor(1.4648, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.5548, device='cuda:0') reward: tensor(1.4602, device='cuda:0') factor: tensor(0.0946, device='cuda:0')


Epoch 1, Step 1837: Loss = 0.0322, Actor Loss = 0.0053, Critic Loss = 0.0538, Entropy = 0.0231, Returns = 1.4400, Value = 1.5748, mask_loss = 0.0999std_loss = 0.2763:  34%|███▎      | 1837/5475 [8:13:29<18:13:37, 18.04s/it] 

ratio tensor(1.0467, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4866, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.0961, device='cuda:0')
reward_raw: tensor(0.4824, device='cuda:0') reward: tensor(1.3858, device='cuda:0') factor: tensor(0.0965, device='cuda:0')
reward_raw: tensor(0.4864, device='cuda:0') reward: tensor(1.3901, device='cuda:0') factor: tensor(0.0962, device='cuda:0')
reward_raw: tensor(0.5043, device='cuda:0') reward: tensor(1.4045, device='cuda:0') factor: tensor(0.0997, device='cuda:0')
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.4012, device='cuda:0') factor: tensor(0.0990, device='cuda:0')


Epoch 1, Step 1838: Loss = 0.0947, Actor Loss = 0.0485, Critic Loss = 0.0925, Entropy = 0.0245, Returns = 1.4108, Value = 1.2731, mask_loss = 0.0817std_loss = 0.2565:  34%|███▎      | 1838/5475 [8:13:50<19:02:19, 18.85s/it]

ratio tensor(0.9467, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5280, device='cuda:0') reward: tensor(1.4353, device='cuda:0') factor: tensor(0.0927, device='cuda:0')
reward_raw: tensor(0.5247, device='cuda:0') reward: tensor(1.4288, device='cuda:0') factor: tensor(0.0959, device='cuda:0')
reward_raw: tensor(0.5233, device='cuda:0') reward: tensor(1.4288, device='cuda:0') factor: tensor(0.0945, device='cuda:0')
reward_raw: tensor(0.5256, device='cuda:0') reward: tensor(1.4317, device='cuda:0') factor: tensor(0.0939, device='cuda:0')
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.4170, device='cuda:0') factor: tensor(0.0932, device='cuda:0')


Epoch 1, Step 1839: Loss = -0.0415, Actor Loss = -0.0933, Critic Loss = 0.1037, Entropy = 0.0248, Returns = 1.3244, Value = 1.5171, mask_loss = 0.1042std_loss = 0.2874:  34%|███▎      | 1839/5475 [8:14:14<20:33:16, 20.35s/it]

ratio tensor(1.0812, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4596, device='cuda:0') reward: tensor(1.3657, device='cuda:0') factor: tensor(0.0939, device='cuda:0')
reward_raw: tensor(0.4772, device='cuda:0') reward: tensor(1.3825, device='cuda:0') factor: tensor(0.0947, device='cuda:0')
reward_raw: tensor(0.4640, device='cuda:0') reward: tensor(1.3677, device='cuda:0') factor: tensor(0.0964, device='cuda:0')
reward_raw: tensor(0.4754, device='cuda:0') reward: tensor(1.3805, device='cuda:0') factor: tensor(0.0949, device='cuda:0')
reward_raw: tensor(0.4641, device='cuda:0') reward: tensor(1.3714, device='cuda:0') factor: tensor(0.0926, device='cuda:0')


Epoch 1, Step 1840: Loss = 0.2163, Actor Loss = 0.1920, Critic Loss = 0.0487, Entropy = 0.0369, Returns = 1.3732, Value = 1.3755, mask_loss = 0.0909std_loss = 0.2661:  34%|███▎      | 1840/5475 [8:14:27<18:27:15, 18.28s/it]  

ratio tensor(0.5700, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4944, device='cuda:0') reward: tensor(1.3863, device='cuda:0') factor: tensor(0.1081, device='cuda:0')
reward_raw: tensor(0.4757, device='cuda:0') reward: tensor(1.3707, device='cuda:0') factor: tensor(0.1050, device='cuda:0')
reward_raw: tensor(0.4839, device='cuda:0') reward: tensor(1.3745, device='cuda:0') factor: tensor(0.1094, device='cuda:0')
reward_raw: tensor(0.4785, device='cuda:0') reward: tensor(1.3696, device='cuda:0') factor: tensor(0.1090, device='cuda:0')
reward_raw: tensor(0.4843, device='cuda:0') reward: tensor(1.3767, device='cuda:0') factor: tensor(0.1076, device='cuda:0')


Epoch 1, Step 1841: Loss = -0.0061, Actor Loss = -0.0520, Critic Loss = 0.0918, Entropy = 0.0223, Returns = 1.4142, Value = 1.3440, mask_loss = 0.1068std_loss = 0.2959:  34%|███▎      | 1840/5475 [8:14:43<18:27:15, 18.28s/it]

ratio tensor(0.9033, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1841: Loss = -0.0061, Actor Loss = -0.0520, Critic Loss = 0.0918, Entropy = 0.0223, Returns = 1.4142, Value = 1.3440, mask_loss = 0.1068std_loss = 0.2959:  34%|███▎      | 1841/5475 [8:14:43<17:47:14, 17.62s/it]

reward_raw: tensor(0.4061, device='cuda:0') reward: tensor(1.3072, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.4042, device='cuda:0') reward: tensor(1.3026, device='cuda:0') factor: tensor(0.1017, device='cuda:0')
reward_raw: tensor(0.3875, device='cuda:0') reward: tensor(1.2893, device='cuda:0') factor: tensor(0.0982, device='cuda:0')
reward_raw: tensor(0.4055, device='cuda:0') reward: tensor(1.3050, device='cuda:0') factor: tensor(0.1005, device='cuda:0')
reward_raw: tensor(0.3977, device='cuda:0') reward: tensor(1.2955, device='cuda:0') factor: tensor(0.1022, device='cuda:0')


Epoch 1, Step 1842: Loss = -0.0113, Actor Loss = -0.0615, Critic Loss = 0.1006, Entropy = 0.0303, Returns = 1.3451, Value = 1.3478, mask_loss = 0.1116std_loss = 0.2899:  34%|███▎      | 1842/5475 [8:15:01<17:51:27, 17.70s/it]

ratio tensor(0.8031, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4958, device='cuda:0') reward: tensor(1.4043, device='cuda:0') factor: tensor(0.0914, device='cuda:0')
reward_raw: tensor(0.5058, device='cuda:0') reward: tensor(1.4139, device='cuda:0') factor: tensor(0.0918, device='cuda:0')
reward_raw: tensor(0.5033, device='cuda:0') reward: tensor(1.4099, device='cuda:0') factor: tensor(0.0933, device='cuda:0')
reward_raw: tensor(0.5007, device='cuda:0') reward: tensor(1.4065, device='cuda:0') factor: tensor(0.0942, device='cuda:0')
reward_raw: tensor(0.4860, device='cuda:0') reward: tensor(1.3898, device='cuda:0') factor: tensor(0.0961, device='cuda:0')


Epoch 1, Step 1843: Loss = -0.0817, Actor Loss = -0.1021, Critic Loss = 0.0409, Entropy = 0.0240, Returns = 1.4216, Value = 1.3815, mask_loss = 0.0868std_loss = 0.2627:  34%|███▎      | 1843/5475 [8:15:18<17:34:27, 17.42s/it]

ratio tensor(1.0681, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5279, device='cuda:0') reward: tensor(1.4288, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.4638, device='cuda:0') reward: tensor(1.3640, device='cuda:0') factor: tensor(0.0998, device='cuda:0')
reward_raw: tensor(0.4923, device='cuda:0') reward: tensor(1.3888, device='cuda:0') factor: tensor(0.1034, device='cuda:0')
reward_raw: tensor(0.4898, device='cuda:0') reward: tensor(1.3857, device='cuda:0') factor: tensor(0.1041, device='cuda:0')
reward_raw: tensor(0.4964, device='cuda:0') reward: tensor(1.3933, device='cuda:0') factor: tensor(0.1030, device='cuda:0')


Epoch 1, Step 1844: Loss = 0.0141, Actor Loss = -0.0153, Critic Loss = 0.0589, Entropy = 0.0286, Returns = 1.3984, Value = 1.4214, mask_loss = 0.0876std_loss = 0.2620:  34%|███▎      | 1844/5475 [8:15:33<16:54:53, 16.77s/it] 

ratio tensor(0.8515, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5332, device='cuda:0') reward: tensor(1.4220, device='cuda:0') factor: tensor(0.1111, device='cuda:0')
reward_raw: tensor(0.5333, device='cuda:0') reward: tensor(1.4175, device='cuda:0') factor: tensor(0.1157, device='cuda:0')
reward_raw: tensor(0.5393, device='cuda:0') reward: tensor(1.4283, device='cuda:0') factor: tensor(0.1110, device='cuda:0')
reward_raw: tensor(0.5424, device='cuda:0') reward: tensor(1.4337, device='cuda:0') factor: tensor(0.1087, device='cuda:0')
reward_raw: tensor(0.5322, device='cuda:0') reward: tensor(1.4219, device='cuda:0') factor: tensor(0.1103, device='cuda:0')


Epoch 1, Step 1845: Loss = 0.0261, Actor Loss = 0.0091, Critic Loss = 0.0340, Entropy = 0.0329, Returns = 1.4323, Value = 1.4186, mask_loss = 0.0958std_loss = 0.2722:  34%|███▎      | 1845/5475 [8:15:47<15:54:32, 15.78s/it] 

ratio tensor(0.8563, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4423, device='cuda:0') reward: tensor(1.3534, device='cuda:0') factor: tensor(0.0888, device='cuda:0')
reward_raw: tensor(0.4433, device='cuda:0') reward: tensor(1.3547, device='cuda:0') factor: tensor(0.0885, device='cuda:0')
reward_raw: tensor(0.4425, device='cuda:0') reward: tensor(1.3534, device='cuda:0') factor: tensor(0.0891, device='cuda:0')
reward_raw: tensor(0.4433, device='cuda:0') reward: tensor(1.3551, device='cuda:0') factor: tensor(0.0882, device='cuda:0')
reward_raw: tensor(0.4272, device='cuda:0') reward: tensor(1.3362, device='cuda:0') factor: tensor(0.0909, device='cuda:0')


Epoch 1, Step 1846: Loss = 0.0235, Actor Loss = 0.0079, Critic Loss = 0.0314, Entropy = 0.0239, Returns = 1.3560, Value = 1.3757, mask_loss = 0.0763std_loss = 0.2492:  34%|███▎      | 1846/5475 [8:16:04<16:25:26, 16.29s/it]

ratio tensor(0.9577, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5083, device='cuda:0') reward: tensor(1.4136, device='cuda:0') factor: tensor(0.0947, device='cuda:0')
reward_raw: tensor(0.5283, device='cuda:0') reward: tensor(1.4348, device='cuda:0') factor: tensor(0.0935, device='cuda:0')
reward_raw: tensor(0.5223, device='cuda:0') reward: tensor(1.4251, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.5299, device='cuda:0') reward: tensor(1.4353, device='cuda:0') factor: tensor(0.0946, device='cuda:0')
reward_raw: tensor(0.5140, device='cuda:0') reward: tensor(1.4232, device='cuda:0') factor: tensor(0.0908, device='cuda:0')


Epoch 1, Step 1847: Loss = -0.1116, Actor Loss = -0.1418, Critic Loss = 0.0605, Entropy = 0.0289, Returns = 1.4438, Value = 1.3975, mask_loss = 0.0922std_loss = 0.2704:  34%|███▎      | 1847/5475 [8:16:17<15:11:27, 15.07s/it]

ratio tensor(1.0464, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4670, device='cuda:0') reward: tensor(1.3800, device='cuda:0') factor: tensor(0.0870, device='cuda:0')
reward_raw: tensor(0.4775, device='cuda:0') reward: tensor(1.3915, device='cuda:0') factor: tensor(0.0860, device='cuda:0')
reward_raw: tensor(0.4598, device='cuda:0') reward: tensor(1.3738, device='cuda:0') factor: tensor(0.0860, device='cuda:0')
reward_raw: tensor(0.4674, device='cuda:0') reward: tensor(1.3814, device='cuda:0') factor: tensor(0.0860, device='cuda:0')
reward_raw: tensor(0.4453, device='cuda:0') reward: tensor(1.3615, device='cuda:0') factor: tensor(0.0838, device='cuda:0')


Epoch 1, Step 1848: Loss = 0.1946, Actor Loss = 0.1644, Critic Loss = 0.0606, Entropy = 0.0323, Returns = 1.3267, Value = 1.4080, mask_loss = 0.0794std_loss = 0.2489:  34%|███▍      | 1848/5475 [8:16:32<15:12:23, 15.09s/it]  

ratio tensor(0.5339, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5550, device='cuda:0') reward: tensor(1.4692, device='cuda:0') factor: tensor(0.0858, device='cuda:0')
reward_raw: tensor(0.5374, device='cuda:0') reward: tensor(1.4551, device='cuda:0') factor: tensor(0.0823, device='cuda:0')
reward_raw: tensor(0.5653, device='cuda:0') reward: tensor(1.4749, device='cuda:0') factor: tensor(0.0904, device='cuda:0')
reward_raw: tensor(0.5507, device='cuda:0') reward: tensor(1.4660, device='cuda:0') factor: tensor(0.0848, device='cuda:0')
reward_raw: tensor(0.5589, device='cuda:0') reward: tensor(1.4737, device='cuda:0') factor: tensor(0.0853, device='cuda:0')


Epoch 1, Step 1849: Loss = -0.1068, Actor Loss = -0.1462, Critic Loss = 0.0790, Entropy = 0.0206, Returns = 1.4548, Value = 1.5662, mask_loss = 0.0909std_loss = 0.2724:  34%|███▍      | 1849/5475 [8:16:45<14:44:30, 14.64s/it]

ratio tensor(1.4498, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4969, device='cuda:0') reward: tensor(1.4100, device='cuda:0') factor: tensor(0.0869, device='cuda:0')
reward_raw: tensor(0.5050, device='cuda:0') reward: tensor(1.4190, device='cuda:0') factor: tensor(0.0860, device='cuda:0')
reward_raw: tensor(0.5065, device='cuda:0') reward: tensor(1.4221, device='cuda:0') factor: tensor(0.0843, device='cuda:0')
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.4200, device='cuda:0') factor: tensor(0.0840, device='cuda:0')
reward_raw: tensor(0.5007, device='cuda:0') reward: tensor(1.4138, device='cuda:0') factor: tensor(0.0869, device='cuda:0')


Epoch 1, Step 1850: Loss = 0.1165, Actor Loss = 0.0868, Critic Loss = 0.0594, Entropy = 0.0288, Returns = 1.4044, Value = 1.4103, mask_loss = 0.0821std_loss = 0.2523:  34%|███▍      | 1850/5475 [8:17:05<16:14:41, 16.13s/it]  

ratio tensor(2.6454, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5300, device='cuda:0') reward: tensor(1.4258, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.4320, device='cuda:0') factor: tensor(0.1032, device='cuda:0')
reward_raw: tensor(0.5343, device='cuda:0') reward: tensor(1.4334, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.5454, device='cuda:0') reward: tensor(1.4401, device='cuda:0') factor: tensor(0.1053, device='cuda:0')
reward_raw: tensor(0.5342, device='cuda:0') reward: tensor(1.4320, device='cuda:0') factor: tensor(0.1022, device='cuda:0')


Epoch 1, Step 1851: Loss = -0.0684, Actor Loss = -0.1233, Critic Loss = 0.1099, Entropy = 0.0277, Returns = 1.4369, Value = 1.4212, mask_loss = 0.0947std_loss = 0.2683:  34%|███▍      | 1851/5475 [8:17:23<16:49:52, 16.72s/it]

ratio tensor(1.0323, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.3857, device='cuda:0') reward: tensor(1.2881, device='cuda:0') factor: tensor(0.0976, device='cuda:0')
reward_raw: tensor(0.3911, device='cuda:0') reward: tensor(1.2923, device='cuda:0') factor: tensor(0.0988, device='cuda:0')
reward_raw: tensor(0.3816, device='cuda:0') reward: tensor(1.2847, device='cuda:0') factor: tensor(0.0969, device='cuda:0')
reward_raw: tensor(0.3859, device='cuda:0') reward: tensor(1.2880, device='cuda:0') factor: tensor(0.0979, device='cuda:0')
reward_raw: tensor(0.3813, device='cuda:0') reward: tensor(1.2862, device='cuda:0') factor: tensor(0.0951, device='cuda:0')


Epoch 1, Step 1852: Loss = 0.1199, Actor Loss = 0.0874, Critic Loss = 0.0651, Entropy = 0.0301, Returns = 1.3763, Value = 1.2447, mask_loss = 0.1088std_loss = 0.2933:  34%|███▍      | 1852/5475 [8:17:39<16:32:42, 16.44s/it]  

ratio tensor(1.0374, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4880, device='cuda:0') reward: tensor(1.3815, device='cuda:0') factor: tensor(0.1064, device='cuda:0')
reward_raw: tensor(0.4873, device='cuda:0') reward: tensor(1.3845, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.5065, device='cuda:0') reward: tensor(1.4050, device='cuda:0') factor: tensor(0.1015, device='cuda:0')
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.4059, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.4834, device='cuda:0') reward: tensor(1.3769, device='cuda:0') factor: tensor(0.1065, device='cuda:0')


Epoch 1, Step 1853: Loss = -0.1500, Actor Loss = -0.1833, Critic Loss = 0.0667, Entropy = 0.0293, Returns = 1.4130, Value = 1.5162, mask_loss = 0.0973std_loss = 0.2750:  34%|███▍      | 1853/5475 [8:17:52<15:36:53, 15.52s/it]

ratio tensor(1.4129, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4289, device='cuda:0') reward: tensor(1.3258, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.4300, device='cuda:0') reward: tensor(1.3284, device='cuda:0') factor: tensor(0.1016, device='cuda:0')
reward_raw: tensor(0.4216, device='cuda:0') reward: tensor(1.3235, device='cuda:0') factor: tensor(0.0981, device='cuda:0')
reward_raw: tensor(0.4475, device='cuda:0') reward: tensor(1.3463, device='cuda:0') factor: tensor(0.1012, device='cuda:0')
reward_raw: tensor(0.4369, device='cuda:0') reward: tensor(1.3350, device='cuda:0') factor: tensor(0.1019, device='cuda:0')


Epoch 1, Step 1854: Loss = 0.1524, Actor Loss = 0.1147, Critic Loss = 0.0755, Entropy = 0.0418, Returns = 1.3782, Value = 1.3781, mask_loss = 0.1079std_loss = 0.2855:  34%|███▍      | 1854/5475 [8:18:08<15:36:31, 15.52s/it]  

ratio tensor(0.8132, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5700, device='cuda:0') reward: tensor(1.4610, device='cuda:0') factor: tensor(0.1089, device='cuda:0')
reward_raw: tensor(0.5579, device='cuda:0') reward: tensor(1.4569, device='cuda:0') factor: tensor(0.1009, device='cuda:0')
reward_raw: tensor(0.5639, device='cuda:0') reward: tensor(1.4619, device='cuda:0') factor: tensor(0.1019, device='cuda:0')
reward_raw: tensor(0.5672, device='cuda:0') reward: tensor(1.4652, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.5522, device='cuda:0') reward: tensor(1.4491, device='cuda:0') factor: tensor(0.1032, device='cuda:0')


Epoch 1, Step 1855: Loss = -0.0441, Actor Loss = -0.0719, Critic Loss = 0.0558, Entropy = 0.0423, Returns = 1.4636, Value = 1.4290, mask_loss = 0.0968std_loss = 0.2713:  34%|███▍      | 1855/5475 [8:18:21<14:55:28, 14.84s/it]

ratio tensor(1.5278, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4276, device='cuda:0') reward: tensor(1.3232, device='cuda:0') factor: tensor(0.1043, device='cuda:0')
reward_raw: tensor(0.4304, device='cuda:0') reward: tensor(1.3292, device='cuda:0') factor: tensor(0.1012, device='cuda:0')
reward_raw: tensor(0.4321, device='cuda:0') reward: tensor(1.3274, device='cuda:0') factor: tensor(0.1047, device='cuda:0')
reward_raw: tensor(0.4331, device='cuda:0') reward: tensor(1.3284, device='cuda:0') factor: tensor(0.1047, device='cuda:0')
reward_raw: tensor(0.4349, device='cuda:0') reward: tensor(1.3335, device='cuda:0') factor: tensor(0.1013, device='cuda:0')


Epoch 1, Step 1856: Loss = 0.1793, Actor Loss = 0.1595, Critic Loss = 0.0397, Entropy = 0.0337, Returns = 1.2483, Value = 1.2668, mask_loss = 0.1104std_loss = 0.2936:  34%|███▍      | 1856/5475 [8:18:36<15:01:55, 14.95s/it]  

ratio tensor(0.7815, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5592, device='cuda:0') reward: tensor(1.4542, device='cuda:0') factor: tensor(0.1050, device='cuda:0')
reward_raw: tensor(0.5449, device='cuda:0') reward: tensor(1.4420, device='cuda:0') factor: tensor(0.1029, device='cuda:0')
reward_raw: tensor(0.5450, device='cuda:0') reward: tensor(1.4402, device='cuda:0') factor: tensor(0.1048, device='cuda:0')
reward_raw: tensor(0.5544, device='cuda:0') reward: tensor(1.4508, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.5406, device='cuda:0') reward: tensor(1.4344, device='cuda:0') factor: tensor(0.1062, device='cuda:0')


Epoch 1, Step 1857: Loss = -0.1949, Actor Loss = -0.2439, Critic Loss = 0.0980, Entropy = 0.0388, Returns = 1.4424, Value = 1.5414, mask_loss = 0.0962std_loss = 0.2693:  34%|███▍      | 1857/5475 [8:18:51<14:50:54, 14.77s/it]

ratio tensor(1.5967, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5041, device='cuda:0') reward: tensor(1.4029, device='cuda:0') factor: tensor(0.1012, device='cuda:0')
reward_raw: tensor(0.4992, device='cuda:0') reward: tensor(1.3983, device='cuda:0') factor: tensor(0.1008, device='cuda:0')
reward_raw: tensor(0.4846, device='cuda:0') reward: tensor(1.3837, device='cuda:0') factor: tensor(0.1009, device='cuda:0')
reward_raw: tensor(0.5095, device='cuda:0') reward: tensor(1.4063, device='cuda:0') factor: tensor(0.1032, device='cuda:0')
reward_raw: tensor(0.5108, device='cuda:0') reward: tensor(1.4106, device='cuda:0') factor: tensor(0.1002, device='cuda:0')


Epoch 1, Step 1858: Loss = 0.1810, Actor Loss = 0.1587, Critic Loss = 0.0447, Entropy = 0.0324, Returns = 1.4287, Value = 1.4332, mask_loss = 0.1129std_loss = 0.2964:  34%|███▍      | 1858/5475 [8:19:09<15:52:08, 15.79s/it]  

ratio tensor(0.8377, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5186, device='cuda:0') reward: tensor(1.4117, device='cuda:0') factor: tensor(0.1068, device='cuda:0')
reward_raw: tensor(0.5249, device='cuda:0') reward: tensor(1.4211, device='cuda:0') factor: tensor(0.1039, device='cuda:0')
reward_raw: tensor(0.5153, device='cuda:0') reward: tensor(1.4082, device='cuda:0') factor: tensor(0.1072, device='cuda:0')
reward_raw: tensor(0.5298, device='cuda:0') reward: tensor(1.4242, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.5310, device='cuda:0') reward: tensor(1.4246, device='cuda:0') factor: tensor(0.1064, device='cuda:0')


Epoch 1, Step 1859: Loss = -0.1949, Actor Loss = -0.2434, Critic Loss = 0.0971, Entropy = 0.0309, Returns = 1.5815, Value = 1.4248, mask_loss = 0.1200std_loss = 0.3024:  34%|███▍      | 1859/5475 [8:19:23<15:32:31, 15.47s/it]

ratio tensor(1.3159, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4752, device='cuda:0') reward: tensor(1.3829, device='cuda:0') factor: tensor(0.0923, device='cuda:0')
reward_raw: tensor(0.4956, device='cuda:0') reward: tensor(1.4032, device='cuda:0') factor: tensor(0.0923, device='cuda:0')
reward_raw: tensor(0.4830, device='cuda:0') reward: tensor(1.3899, device='cuda:0') factor: tensor(0.0931, device='cuda:0')
reward_raw: tensor(0.4822, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.0958, device='cuda:0')
reward_raw: tensor(0.4901, device='cuda:0') reward: tensor(1.3958, device='cuda:0') factor: tensor(0.0944, device='cuda:0')


Epoch 1, Step 1860: Loss = 0.1223, Actor Loss = 0.1012, Critic Loss = 0.0423, Entropy = 0.0263, Returns = 1.3656, Value = 1.3407, mask_loss = 0.0992std_loss = 0.2788:  34%|███▍      | 1860/5475 [8:19:38<15:23:18, 15.32s/it]  

ratio tensor(0.6860, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4454, device='cuda:0') reward: tensor(1.3514, device='cuda:0') factor: tensor(0.0940, device='cuda:0')
reward_raw: tensor(0.4264, device='cuda:0') reward: tensor(1.3342, device='cuda:0') factor: tensor(0.0922, device='cuda:0')
reward_raw: tensor(0.4451, device='cuda:0') reward: tensor(1.3547, device='cuda:0') factor: tensor(0.0904, device='cuda:0')
reward_raw: tensor(0.4304, device='cuda:0') reward: tensor(1.3369, device='cuda:0') factor: tensor(0.0936, device='cuda:0')
reward_raw: tensor(0.4273, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.0918, device='cuda:0')


Epoch 1, Step 1861: Loss = 0.0147, Actor Loss = -0.0145, Critic Loss = 0.0584, Entropy = 0.0282, Returns = 1.3292, Value = 1.3393, mask_loss = 0.0876std_loss = 0.2622:  34%|███▍      | 1860/5475 [8:19:53<15:23:18, 15.32s/it]

ratio tensor(0.8950, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1861: Loss = 0.0147, Actor Loss = -0.0145, Critic Loss = 0.0584, Entropy = 0.0282, Returns = 1.3292, Value = 1.3393, mask_loss = 0.0876std_loss = 0.2622:  34%|███▍      | 1861/5475 [8:19:54<15:19:15, 15.26s/it]

reward_raw: tensor(0.5150, device='cuda:0') reward: tensor(1.4220, device='cuda:0') factor: tensor(0.0931, device='cuda:0')
reward_raw: tensor(0.5403, device='cuda:0') reward: tensor(1.4439, device='cuda:0') factor: tensor(0.0964, device='cuda:0')
reward_raw: tensor(0.5286, device='cuda:0') reward: tensor(1.4329, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.5349, device='cuda:0') reward: tensor(1.4395, device='cuda:0') factor: tensor(0.0954, device='cuda:0')
reward_raw: tensor(0.5324, device='cuda:0') reward: tensor(1.4369, device='cuda:0') factor: tensor(0.0955, device='cuda:0')


Epoch 1, Step 1862: Loss = 0.0063, Actor Loss = -0.0312, Critic Loss = 0.0751, Entropy = 0.0228, Returns = 1.4804, Value = 1.4339, mask_loss = 0.0916std_loss = 0.2705:  34%|███▍      | 1862/5475 [8:20:07<14:49:50, 14.78s/it]

ratio tensor(0.9488, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4845, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1019, device='cuda:0')
reward_raw: tensor(0.5092, device='cuda:0') reward: tensor(1.4027, device='cuda:0') factor: tensor(0.1064, device='cuda:0')
reward_raw: tensor(0.5078, device='cuda:0') reward: tensor(1.4009, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.3917, device='cuda:0') factor: tensor(0.1083, device='cuda:0')
reward_raw: tensor(0.5001, device='cuda:0') reward: tensor(1.3937, device='cuda:0') factor: tensor(0.1063, device='cuda:0')


Epoch 1, Step 1863: Loss = 0.0425, Actor Loss = -0.0186, Critic Loss = 0.1222, Entropy = 0.0325, Returns = 1.3048, Value = 1.4856, mask_loss = 0.1022std_loss = 0.2840:  34%|███▍      | 1863/5475 [8:20:19<13:52:30, 13.83s/it]

ratio tensor(0.9722, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5401, device='cuda:0') reward: tensor(1.4522, device='cuda:0') factor: tensor(0.0879, device='cuda:0')
reward_raw: tensor(0.5335, device='cuda:0') reward: tensor(1.4436, device='cuda:0') factor: tensor(0.0900, device='cuda:0')
reward_raw: tensor(0.5306, device='cuda:0') reward: tensor(1.4390, device='cuda:0') factor: tensor(0.0916, device='cuda:0')
reward_raw: tensor(0.5229, device='cuda:0') reward: tensor(1.4365, device='cuda:0') factor: tensor(0.0864, device='cuda:0')
reward_raw: tensor(0.5410, device='cuda:0') reward: tensor(1.4528, device='cuda:0') factor: tensor(0.0882, device='cuda:0')


Epoch 1, Step 1864: Loss = 0.0726, Actor Loss = 0.0332, Critic Loss = 0.0790, Entropy = 0.0252, Returns = 1.4658, Value = 1.4119, mask_loss = 0.0746std_loss = 0.2427:  34%|███▍      | 1864/5475 [8:20:34<14:19:50, 14.29s/it] 

ratio tensor(0.9249, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4874, device='cuda:0') reward: tensor(1.3744, device='cuda:0') factor: tensor(0.1130, device='cuda:0')
reward_raw: tensor(0.4854, device='cuda:0') reward: tensor(1.3702, device='cuda:0') factor: tensor(0.1152, device='cuda:0')
reward_raw: tensor(0.4881, device='cuda:0') reward: tensor(1.3750, device='cuda:0') factor: tensor(0.1131, device='cuda:0')
reward_raw: tensor(0.4623, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.1084, device='cuda:0')
reward_raw: tensor(0.4809, device='cuda:0') reward: tensor(1.3706, device='cuda:0') factor: tensor(0.1103, device='cuda:0')


Epoch 1, Step 1865: Loss = 0.0844, Actor Loss = 0.0503, Critic Loss = 0.0681, Entropy = 0.0352, Returns = 1.3915, Value = 1.3054, mask_loss = 0.1201std_loss = 0.3014:  34%|███▍      | 1865/5475 [8:20:48<14:14:11, 14.20s/it]

ratio tensor(1.1453, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5163, device='cuda:0') reward: tensor(1.4135, device='cuda:0') factor: tensor(0.1028, device='cuda:0')
reward_raw: tensor(0.5000, device='cuda:0') reward: tensor(1.3990, device='cuda:0') factor: tensor(0.1010, device='cuda:0')
reward_raw: tensor(0.4933, device='cuda:0') reward: tensor(1.3943, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.4980, device='cuda:0') reward: tensor(1.3991, device='cuda:0') factor: tensor(0.0989, device='cuda:0')
reward_raw: tensor(0.4955, device='cuda:0') reward: tensor(1.3973, device='cuda:0') factor: tensor(0.0983, device='cuda:0')


Epoch 1, Step 1866: Loss = -0.1594, Actor Loss = -0.1735, Critic Loss = 0.0282, Entropy = 0.0263, Returns = 1.4482, Value = 1.4339, mask_loss = 0.0900std_loss = 0.2699:  34%|███▍      | 1866/5475 [8:21:01<13:57:48, 13.93s/it]

ratio tensor(1.2580, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5447, device='cuda:0') reward: tensor(1.4489, device='cuda:0') factor: tensor(0.0958, device='cuda:0')
reward_raw: tensor(0.5442, device='cuda:0') reward: tensor(1.4521, device='cuda:0') factor: tensor(0.0921, device='cuda:0')
reward_raw: tensor(0.5292, device='cuda:0') reward: tensor(1.4383, device='cuda:0') factor: tensor(0.0909, device='cuda:0')
reward_raw: tensor(0.5362, device='cuda:0') reward: tensor(1.4437, device='cuda:0') factor: tensor(0.0926, device='cuda:0')
reward_raw: tensor(0.5613, device='cuda:0') reward: tensor(1.4648, device='cuda:0') factor: tensor(0.0965, device='cuda:0')


Epoch 1, Step 1867: Loss = -0.0591, Actor Loss = -0.0980, Critic Loss = 0.0778, Entropy = 0.0338, Returns = 1.4690, Value = 1.4822, mask_loss = 0.0990std_loss = 0.2791:  34%|███▍      | 1867/5475 [8:21:17<14:19:56, 14.30s/it]

ratio tensor(1.4888, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5374, device='cuda:0') reward: tensor(1.4358, device='cuda:0') factor: tensor(0.1016, device='cuda:0')
reward_raw: tensor(0.5373, device='cuda:0') reward: tensor(1.4340, device='cuda:0') factor: tensor(0.1032, device='cuda:0')
reward_raw: tensor(0.5376, device='cuda:0') reward: tensor(1.4343, device='cuda:0') factor: tensor(0.1033, device='cuda:0')
reward_raw: tensor(0.5204, device='cuda:0') reward: tensor(1.4174, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.5341, device='cuda:0') reward: tensor(1.4287, device='cuda:0') factor: tensor(0.1055, device='cuda:0')


Epoch 1, Step 1868: Loss = 0.1595, Actor Loss = 0.1277, Critic Loss = 0.0636, Entropy = 0.0240, Returns = 1.4234, Value = 1.3704, mask_loss = 0.0954std_loss = 0.2755:  34%|███▍      | 1868/5475 [8:21:33<14:50:43, 14.82s/it]  

ratio tensor(1.0671, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5109, device='cuda:0') reward: tensor(1.4151, device='cuda:0') factor: tensor(0.0958, device='cuda:0')
reward_raw: tensor(0.4873, device='cuda:0') reward: tensor(1.3891, device='cuda:0') factor: tensor(0.0982, device='cuda:0')
reward_raw: tensor(0.4808, device='cuda:0') reward: tensor(1.3871, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.4973, device='cuda:0') reward: tensor(1.4024, device='cuda:0') factor: tensor(0.0949, device='cuda:0')
reward_raw: tensor(0.5001, device='cuda:0') reward: tensor(1.4017, device='cuda:0') factor: tensor(0.0985, device='cuda:0')


Epoch 1, Step 1869: Loss = -0.0982, Actor Loss = -0.1198, Critic Loss = 0.0434, Entropy = 0.0375, Returns = 1.4577, Value = 1.4295, mask_loss = 0.0998std_loss = 0.2754:  34%|███▍      | 1869/5475 [8:21:48<15:07:33, 15.10s/it]

ratio tensor(1.2087, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4536, device='cuda:0') reward: tensor(1.3586, device='cuda:0') factor: tensor(0.0950, device='cuda:0')
reward_raw: tensor(0.4727, device='cuda:0') reward: tensor(1.3761, device='cuda:0') factor: tensor(0.0965, device='cuda:0')
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3647, device='cuda:0') factor: tensor(0.0968, device='cuda:0')
reward_raw: tensor(0.4494, device='cuda:0') reward: tensor(1.3522, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.4894, device='cuda:0') reward: tensor(1.3917, device='cuda:0') factor: tensor(0.0977, device='cuda:0')


Epoch 1, Step 1870: Loss = 0.2784, Actor Loss = 0.2492, Critic Loss = 0.0584, Entropy = 0.0338, Returns = 1.4500, Value = 1.4508, mask_loss = 0.1068std_loss = 0.2817:  34%|███▍      | 1870/5475 [8:22:04<15:09:03, 15.13s/it]  

ratio tensor(1.5867, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4828, device='cuda:0') reward: tensor(1.3794, device='cuda:0') factor: tensor(0.1034, device='cuda:0')
reward_raw: tensor(0.4836, device='cuda:0') reward: tensor(1.3801, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.4994, device='cuda:0') reward: tensor(1.3925, device='cuda:0') factor: tensor(0.1069, device='cuda:0')
reward_raw: tensor(0.4834, device='cuda:0') reward: tensor(1.3778, device='cuda:0') factor: tensor(0.1056, device='cuda:0')
reward_raw: tensor(0.4918, device='cuda:0') reward: tensor(1.3845, device='cuda:0') factor: tensor(0.1073, device='cuda:0')


Epoch 1, Step 1871: Loss = 0.0830, Actor Loss = 0.0496, Critic Loss = 0.0668, Entropy = 0.0339, Returns = 1.3254, Value = 1.3546, mask_loss = 0.1052std_loss = 0.2828:  34%|███▍      | 1871/5475 [8:22:18<14:59:23, 14.97s/it]

ratio tensor(1.2308, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4602, device='cuda:0') reward: tensor(1.3620, device='cuda:0') factor: tensor(0.0982, device='cuda:0')
reward_raw: tensor(0.4557, device='cuda:0') reward: tensor(1.3548, device='cuda:0') factor: tensor(0.1009, device='cuda:0')
reward_raw: tensor(0.4684, device='cuda:0') reward: tensor(1.3642, device='cuda:0') factor: tensor(0.1042, device='cuda:0')
reward_raw: tensor(0.4747, device='cuda:0') reward: tensor(1.3731, device='cuda:0') factor: tensor(0.1016, device='cuda:0')
reward_raw: tensor(0.4615, device='cuda:0') reward: tensor(1.3634, device='cuda:0') factor: tensor(0.0981, device='cuda:0')


Epoch 1, Step 1872: Loss = 0.2308, Actor Loss = 0.1934, Critic Loss = 0.0748, Entropy = 0.0264, Returns = 1.2933, Value = 1.3885, mask_loss = 0.0913std_loss = 0.2695:  34%|███▍      | 1872/5475 [8:22:34<15:19:26, 15.31s/it]

ratio tensor(0.9934, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.3979, device='cuda:0') factor: tensor(0.1004, device='cuda:0')
reward_raw: tensor(0.5049, device='cuda:0') reward: tensor(1.4016, device='cuda:0') factor: tensor(0.1033, device='cuda:0')
reward_raw: tensor(0.5105, device='cuda:0') reward: tensor(1.4112, device='cuda:0') factor: tensor(0.0994, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.4105, device='cuda:0') factor: tensor(0.0975, device='cuda:0')
reward_raw: tensor(0.5131, device='cuda:0') reward: tensor(1.4107, device='cuda:0') factor: tensor(0.1025, device='cuda:0')


Epoch 1, Step 1873: Loss = -0.1095, Actor Loss = -0.1636, Critic Loss = 0.1083, Entropy = 0.0298, Returns = 1.4800, Value = 1.2974, mask_loss = 0.1087std_loss = 0.2843:  34%|███▍      | 1873/5475 [8:22:50<15:18:01, 15.29s/it]

ratio tensor(1.0249, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4515, device='cuda:0') reward: tensor(1.3675, device='cuda:0') factor: tensor(0.0840, device='cuda:0')
reward_raw: tensor(0.4962, device='cuda:0') reward: tensor(1.4130, device='cuda:0') factor: tensor(0.0832, device='cuda:0')
reward_raw: tensor(0.4490, device='cuda:0') reward: tensor(1.3650, device='cuda:0') factor: tensor(0.0840, device='cuda:0')
reward_raw: tensor(0.4840, device='cuda:0') reward: tensor(1.3997, device='cuda:0') factor: tensor(0.0842, device='cuda:0')
reward_raw: tensor(0.4659, device='cuda:0') reward: tensor(1.3832, device='cuda:0') factor: tensor(0.0828, device='cuda:0')


Epoch 1, Step 1874: Loss = 0.0272, Actor Loss = 0.0012, Critic Loss = 0.0520, Entropy = 0.0311, Returns = 1.3164, Value = 1.4303, mask_loss = 0.0739std_loss = 0.2392:  34%|███▍      | 1874/5475 [8:23:05<15:23:46, 15.39s/it]  

ratio tensor(0.8414, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4487, device='cuda:0') reward: tensor(1.3534, device='cuda:0') factor: tensor(0.0953, device='cuda:0')
reward_raw: tensor(0.4233, device='cuda:0') reward: tensor(1.3294, device='cuda:0') factor: tensor(0.0940, device='cuda:0')
reward_raw: tensor(0.4737, device='cuda:0') reward: tensor(1.3821, device='cuda:0') factor: tensor(0.0916, device='cuda:0')
reward_raw: tensor(0.4528, device='cuda:0') reward: tensor(1.3574, device='cuda:0') factor: tensor(0.0954, device='cuda:0')
reward_raw: tensor(0.4539, device='cuda:0') reward: tensor(1.3591, device='cuda:0') factor: tensor(0.0948, device='cuda:0')


Epoch 1, Step 1875: Loss = 0.0761, Actor Loss = 0.0391, Critic Loss = 0.0741, Entropy = 0.0253, Returns = 1.3729, Value = 1.3079, mask_loss = 0.0855std_loss = 0.2614:  34%|███▍      | 1875/5475 [8:23:24<16:22:15, 16.37s/it]

ratio tensor(0.7661, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4799, device='cuda:0') reward: tensor(1.4125, device='cuda:0') factor: tensor(0.0674, device='cuda:0')
reward_raw: tensor(0.4484, device='cuda:0') reward: tensor(1.3818, device='cuda:0') factor: tensor(0.0665, device='cuda:0')
reward_raw: tensor(0.4722, device='cuda:0') reward: tensor(1.4025, device='cuda:0') factor: tensor(0.0697, device='cuda:0')
reward_raw: tensor(0.4611, device='cuda:0') reward: tensor(1.3899, device='cuda:0') factor: tensor(0.0712, device='cuda:0')
reward_raw: tensor(0.4451, device='cuda:0') reward: tensor(1.3758, device='cuda:0') factor: tensor(0.0693, device='cuda:0')


Epoch 1, Step 1876: Loss = -0.0146, Actor Loss = -0.0559, Critic Loss = 0.0827, Entropy = 0.0261, Returns = 1.4252, Value = 1.5023, mask_loss = 0.0762std_loss = 0.2463:  34%|███▍      | 1876/5475 [8:23:43<17:04:44, 17.08s/it]

ratio tensor(1.5229, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5010, device='cuda:0') reward: tensor(1.4130, device='cuda:0') factor: tensor(0.0879, device='cuda:0')
reward_raw: tensor(0.5099, device='cuda:0') reward: tensor(1.4241, device='cuda:0') factor: tensor(0.0858, device='cuda:0')
reward_raw: tensor(0.4924, device='cuda:0') reward: tensor(1.4076, device='cuda:0') factor: tensor(0.0848, device='cuda:0')
reward_raw: tensor(0.4868, device='cuda:0') reward: tensor(1.4021, device='cuda:0') factor: tensor(0.0846, device='cuda:0')
reward_raw: tensor(0.5111, device='cuda:0') reward: tensor(1.4262, device='cuda:0') factor: tensor(0.0849, device='cuda:0')


Epoch 1, Step 1877: Loss = 0.1492, Actor Loss = 0.1282, Critic Loss = 0.0420, Entropy = 0.0222, Returns = 1.3609, Value = 1.3972, mask_loss = 0.0835std_loss = 0.2590:  34%|███▍      | 1877/5475 [8:23:59<16:55:22, 16.93s/it]  

ratio tensor(0.9710, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4829, device='cuda:0') reward: tensor(1.3878, device='cuda:0') factor: tensor(0.0952, device='cuda:0')
reward_raw: tensor(0.4990, device='cuda:0') reward: tensor(1.4071, device='cuda:0') factor: tensor(0.0919, device='cuda:0')
reward_raw: tensor(0.5059, device='cuda:0') reward: tensor(1.4119, device='cuda:0') factor: tensor(0.0940, device='cuda:0')
reward_raw: tensor(0.5120, device='cuda:0') reward: tensor(1.4179, device='cuda:0') factor: tensor(0.0940, device='cuda:0')
reward_raw: tensor(0.5129, device='cuda:0') reward: tensor(1.4179, device='cuda:0') factor: tensor(0.0950, device='cuda:0')


Epoch 1, Step 1878: Loss = -0.1266, Actor Loss = -0.1455, Critic Loss = 0.0379, Entropy = 0.0234, Returns = 1.4189, Value = 1.3793, mask_loss = 0.0834std_loss = 0.2598:  34%|███▍      | 1878/5475 [8:24:14<16:25:52, 16.44s/it]

ratio tensor(1.1872, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5641, device='cuda:0') reward: tensor(1.4689, device='cuda:0') factor: tensor(0.0952, device='cuda:0')
reward_raw: tensor(0.5696, device='cuda:0') reward: tensor(1.4766, device='cuda:0') factor: tensor(0.0930, device='cuda:0')
reward_raw: tensor(0.5934, device='cuda:0') reward: tensor(1.5014, device='cuda:0') factor: tensor(0.0920, device='cuda:0')
reward_raw: tensor(0.5498, device='cuda:0') reward: tensor(1.4581, device='cuda:0') factor: tensor(0.0918, device='cuda:0')
reward_raw: tensor(0.5742, device='cuda:0') reward: tensor(1.4824, device='cuda:0') factor: tensor(0.0918, device='cuda:0')


Epoch 1, Step 1879: Loss = -0.0356, Actor Loss = -0.0648, Critic Loss = 0.0584, Entropy = 0.0305, Returns = 1.4287, Value = 1.4732, mask_loss = 0.0800std_loss = 0.2514:  34%|███▍      | 1879/5475 [8:24:34<17:19:53, 17.35s/it]

ratio tensor(0.9253, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5502, device='cuda:0') reward: tensor(1.4600, device='cuda:0') factor: tensor(0.0902, device='cuda:0')
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.4471, device='cuda:0') factor: tensor(0.0881, device='cuda:0')
reward_raw: tensor(0.5332, device='cuda:0') reward: tensor(1.4448, device='cuda:0') factor: tensor(0.0884, device='cuda:0')
reward_raw: tensor(0.5294, device='cuda:0') reward: tensor(1.4429, device='cuda:0') factor: tensor(0.0865, device='cuda:0')
reward_raw: tensor(0.5330, device='cuda:0') reward: tensor(1.4439, device='cuda:0') factor: tensor(0.0892, device='cuda:0')


Epoch 1, Step 1880: Loss = 0.0997, Actor Loss = 0.0578, Critic Loss = 0.0839, Entropy = 0.0310, Returns = 1.5333, Value = 1.4726, mask_loss = 0.0895std_loss = 0.2653:  34%|███▍      | 1880/5475 [8:24:48<16:13:22, 16.25s/it]  

ratio tensor(0.8221, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5457, device='cuda:0') reward: tensor(1.4442, device='cuda:0') factor: tensor(0.1014, device='cuda:0')
reward_raw: tensor(0.5418, device='cuda:0') reward: tensor(1.4413, device='cuda:0') factor: tensor(0.1005, device='cuda:0')
reward_raw: tensor(0.5482, device='cuda:0') reward: tensor(1.4405, device='cuda:0') factor: tensor(0.1077, device='cuda:0')
reward_raw: tensor(0.5483, device='cuda:0') reward: tensor(1.4463, device='cuda:0') factor: tensor(0.1020, device='cuda:0')
reward_raw: tensor(0.5289, device='cuda:0') reward: tensor(1.4238, device='cuda:0') factor: tensor(0.1051, device='cuda:0')


Epoch 1, Step 1881: Loss = -0.0503, Actor Loss = -0.0952, Critic Loss = 0.0899, Entropy = 0.0358, Returns = 1.4249, Value = 1.3846, mask_loss = 0.1235std_loss = 0.3071:  34%|███▍      | 1880/5475 [8:24:59<16:13:22, 16.25s/it]

ratio tensor(1.5895, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1881: Loss = -0.0503, Actor Loss = -0.0952, Critic Loss = 0.0899, Entropy = 0.0358, Returns = 1.4249, Value = 1.3846, mask_loss = 0.1235std_loss = 0.3071:  34%|███▍      | 1881/5475 [8:24:59<14:55:14, 14.95s/it]

reward_raw: tensor(0.5384, device='cuda:0') reward: tensor(1.4475, device='cuda:0') factor: tensor(0.0909, device='cuda:0')
reward_raw: tensor(0.5687, device='cuda:0') reward: tensor(1.4773, device='cuda:0') factor: tensor(0.0914, device='cuda:0')
reward_raw: tensor(0.5303, device='cuda:0') reward: tensor(1.4404, device='cuda:0') factor: tensor(0.0899, device='cuda:0')
reward_raw: tensor(0.5295, device='cuda:0') reward: tensor(1.4413, device='cuda:0') factor: tensor(0.0882, device='cuda:0')
reward_raw: tensor(0.5301, device='cuda:0') reward: tensor(1.4403, device='cuda:0') factor: tensor(0.0899, device='cuda:0')


Epoch 1, Step 1882: Loss = 0.0319, Actor Loss = 0.0014, Critic Loss = 0.0610, Entropy = 0.0312, Returns = 1.4857, Value = 1.5468, mask_loss = 0.1020std_loss = 0.2838:  34%|███▍      | 1882/5475 [8:25:11<14:00:28, 14.04s/it]  

ratio tensor(0.8545, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4906, device='cuda:0') reward: tensor(1.4038, device='cuda:0') factor: tensor(0.0867, device='cuda:0')
reward_raw: tensor(0.4967, device='cuda:0') reward: tensor(1.4093, device='cuda:0') factor: tensor(0.0874, device='cuda:0')
reward_raw: tensor(0.4920, device='cuda:0') reward: tensor(1.4082, device='cuda:0') factor: tensor(0.0839, device='cuda:0')
reward_raw: tensor(0.4984, device='cuda:0') reward: tensor(1.4110, device='cuda:0') factor: tensor(0.0874, device='cuda:0')
reward_raw: tensor(0.4993, device='cuda:0') reward: tensor(1.4126, device='cuda:0') factor: tensor(0.0867, device='cuda:0')


Epoch 1, Step 1883: Loss = -0.0038, Actor Loss = -0.0429, Critic Loss = 0.0782, Entropy = 0.0303, Returns = 1.4228, Value = 1.2497, mask_loss = 0.0900std_loss = 0.2669:  34%|███▍      | 1883/5475 [8:25:29<15:05:31, 15.13s/it]

ratio tensor(1.0858, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4592, device='cuda:0') reward: tensor(1.3777, device='cuda:0') factor: tensor(0.0815, device='cuda:0')
reward_raw: tensor(0.4594, device='cuda:0') reward: tensor(1.3779, device='cuda:0') factor: tensor(0.0815, device='cuda:0')
reward_raw: tensor(0.4782, device='cuda:0') reward: tensor(1.3983, device='cuda:0') factor: tensor(0.0799, device='cuda:0')
reward_raw: tensor(0.4642, device='cuda:0') reward: tensor(1.3860, device='cuda:0') factor: tensor(0.0782, device='cuda:0')
reward_raw: tensor(0.4647, device='cuda:0') reward: tensor(1.3864, device='cuda:0') factor: tensor(0.0783, device='cuda:0')


Epoch 1, Step 1884: Loss = 0.0246, Actor Loss = -0.0216, Critic Loss = 0.0925, Entropy = 0.0222, Returns = 1.4023, Value = 1.5348, mask_loss = 0.0718std_loss = 0.2433:  34%|███▍      | 1884/5475 [8:25:43<14:39:02, 14.69s/it] 

ratio tensor(1.4708, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5208, device='cuda:0') reward: tensor(1.4217, device='cuda:0') factor: tensor(0.0991, device='cuda:0')
reward_raw: tensor(0.4934, device='cuda:0') reward: tensor(1.3979, device='cuda:0') factor: tensor(0.0956, device='cuda:0')
reward_raw: tensor(0.5104, device='cuda:0') reward: tensor(1.4112, device='cuda:0') factor: tensor(0.0992, device='cuda:0')
reward_raw: tensor(0.4999, device='cuda:0') reward: tensor(1.4002, device='cuda:0') factor: tensor(0.0998, device='cuda:0')
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.3994, device='cuda:0') factor: tensor(0.0974, device='cuda:0')


Epoch 1, Step 1885: Loss = 0.2739, Actor Loss = 0.2364, Critic Loss = 0.0751, Entropy = 0.0273, Returns = 1.3484, Value = 1.3695, mask_loss = 0.0943std_loss = 0.2733:  34%|███▍      | 1885/5475 [8:26:00<15:19:09, 15.36s/it] 

ratio tensor(0.8121, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4241, device='cuda:0') reward: tensor(1.3257, device='cuda:0') factor: tensor(0.0983, device='cuda:0')
reward_raw: tensor(0.3983, device='cuda:0') reward: tensor(1.3040, device='cuda:0') factor: tensor(0.0942, device='cuda:0')
reward_raw: tensor(0.4193, device='cuda:0') reward: tensor(1.3242, device='cuda:0') factor: tensor(0.0951, device='cuda:0')
reward_raw: tensor(0.4165, device='cuda:0') reward: tensor(1.3250, device='cuda:0') factor: tensor(0.0915, device='cuda:0')
reward_raw: tensor(0.4310, device='cuda:0') reward: tensor(1.3376, device='cuda:0') factor: tensor(0.0934, device='cuda:0')


Epoch 1, Step 1886: Loss = 0.0776, Actor Loss = 0.0559, Critic Loss = 0.0435, Entropy = 0.0257, Returns = 1.2437, Value = 1.3245, mask_loss = 0.0933std_loss = 0.2687:  34%|███▍      | 1886/5475 [8:26:15<15:12:33, 15.26s/it]

ratio tensor(0.6701, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4325, device='cuda:0') reward: tensor(1.3539, device='cuda:0') factor: tensor(0.0786, device='cuda:0')
reward_raw: tensor(0.4387, device='cuda:0') reward: tensor(1.3610, device='cuda:0') factor: tensor(0.0777, device='cuda:0')
reward_raw: tensor(0.4278, device='cuda:0') reward: tensor(1.3481, device='cuda:0') factor: tensor(0.0798, device='cuda:0')
reward_raw: tensor(0.4341, device='cuda:0') reward: tensor(1.3532, device='cuda:0') factor: tensor(0.0809, device='cuda:0')
reward_raw: tensor(0.4401, device='cuda:0') reward: tensor(1.3597, device='cuda:0') factor: tensor(0.0804, device='cuda:0')


Epoch 1, Step 1887: Loss = -0.0467, Actor Loss = -0.0851, Critic Loss = 0.0768, Entropy = 0.0225, Returns = 1.3704, Value = 1.4657, mask_loss = 0.0745std_loss = 0.2467:  34%|███▍      | 1887/5475 [8:26:31<15:26:02, 15.49s/it]

ratio tensor(1.6485, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4768, device='cuda:0') reward: tensor(1.3837, device='cuda:0') factor: tensor(0.0931, device='cuda:0')
reward_raw: tensor(0.4773, device='cuda:0') reward: tensor(1.3819, device='cuda:0') factor: tensor(0.0954, device='cuda:0')
reward_raw: tensor(0.4726, device='cuda:0') reward: tensor(1.3786, device='cuda:0') factor: tensor(0.0940, device='cuda:0')
reward_raw: tensor(0.4729, device='cuda:0') reward: tensor(1.3817, device='cuda:0') factor: tensor(0.0912, device='cuda:0')
reward_raw: tensor(0.4646, device='cuda:0') reward: tensor(1.3721, device='cuda:0') factor: tensor(0.0925, device='cuda:0')


Epoch 1, Step 1888: Loss = 0.1412, Actor Loss = 0.1214, Critic Loss = 0.0397, Entropy = 0.0215, Returns = 1.4567, Value = 1.3975, mask_loss = 0.1032std_loss = 0.2932:  34%|███▍      | 1888/5475 [8:26:51<16:55:10, 16.98s/it]  

ratio tensor(0.7680, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4426, device='cuda:0') reward: tensor(1.3281, device='cuda:0') factor: tensor(0.1145, device='cuda:0')
reward_raw: tensor(0.4401, device='cuda:0') reward: tensor(1.3239, device='cuda:0') factor: tensor(0.1162, device='cuda:0')
reward_raw: tensor(0.4562, device='cuda:0') reward: tensor(1.3382, device='cuda:0') factor: tensor(0.1180, device='cuda:0')
reward_raw: tensor(0.4430, device='cuda:0') reward: tensor(1.3249, device='cuda:0') factor: tensor(0.1181, device='cuda:0')
reward_raw: tensor(0.4683, device='cuda:0') reward: tensor(1.3516, device='cuda:0') factor: tensor(0.1168, device='cuda:0')


Epoch 1, Step 1889: Loss = -0.0334, Actor Loss = -0.0545, Critic Loss = 0.0421, Entropy = 0.0265, Returns = 1.3333, Value = 1.3229, mask_loss = 0.1211std_loss = 0.3084:  35%|███▍      | 1889/5475 [8:27:05<15:50:53, 15.91s/it]

ratio tensor(1.0056, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5082, device='cuda:0') reward: tensor(1.4125, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.5046, device='cuda:0') reward: tensor(1.4068, device='cuda:0') factor: tensor(0.0979, device='cuda:0')
reward_raw: tensor(0.5138, device='cuda:0') reward: tensor(1.4165, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.5271, device='cuda:0') reward: tensor(1.4308, device='cuda:0') factor: tensor(0.0963, device='cuda:0')
reward_raw: tensor(0.5269, device='cuda:0') reward: tensor(1.4284, device='cuda:0') factor: tensor(0.0985, device='cuda:0')


Epoch 1, Step 1890: Loss = 0.0422, Actor Loss = 0.0227, Critic Loss = 0.0391, Entropy = 0.0271, Returns = 1.3613, Value = 1.4487, mask_loss = 0.1210std_loss = 0.3086:  35%|███▍      | 1890/5475 [8:27:20<15:50:01, 15.90s/it]  

ratio tensor(0.8570, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3771, device='cuda:0') reward: tensor(1.2851, device='cuda:0') factor: tensor(0.0920, device='cuda:0')
reward_raw: tensor(0.3700, device='cuda:0') reward: tensor(1.2743, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.3958, device='cuda:0') reward: tensor(1.2985, device='cuda:0') factor: tensor(0.0974, device='cuda:0')
reward_raw: tensor(0.3970, device='cuda:0') reward: tensor(1.3031, device='cuda:0') factor: tensor(0.0939, device='cuda:0')
reward_raw: tensor(0.3889, device='cuda:0') reward: tensor(1.2967, device='cuda:0') factor: tensor(0.0922, device='cuda:0')


Epoch 1, Step 1891: Loss = 0.2713, Actor Loss = 0.2270, Critic Loss = 0.0885, Entropy = 0.0277, Returns = 1.1873, Value = 1.2566, mask_loss = 0.0912std_loss = 0.2618:  35%|███▍      | 1891/5475 [8:27:36<15:37:17, 15.69s/it]

ratio tensor(0.6137, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4740, device='cuda:0') reward: tensor(1.3811, device='cuda:0') factor: tensor(0.0929, device='cuda:0')
reward_raw: tensor(0.4730, device='cuda:0') reward: tensor(1.3813, device='cuda:0') factor: tensor(0.0917, device='cuda:0')
reward_raw: tensor(0.4797, device='cuda:0') reward: tensor(1.3885, device='cuda:0') factor: tensor(0.0912, device='cuda:0')
reward_raw: tensor(0.4891, device='cuda:0') reward: tensor(1.3979, device='cuda:0') factor: tensor(0.0912, device='cuda:0')
reward_raw: tensor(0.4870, device='cuda:0') reward: tensor(1.3939, device='cuda:0') factor: tensor(0.0931, device='cuda:0')


Epoch 1, Step 1892: Loss = -0.1828, Actor Loss = -0.2525, Critic Loss = 0.1395, Entropy = 0.0172, Returns = 1.2850, Value = 1.2703, mask_loss = 0.0905std_loss = 0.2693:  35%|███▍      | 1892/5475 [8:27:55<16:39:16, 16.73s/it]

ratio tensor(1.1022, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4816, device='cuda:0') reward: tensor(1.3904, device='cuda:0') factor: tensor(0.0912, device='cuda:0')
reward_raw: tensor(0.4693, device='cuda:0') reward: tensor(1.3812, device='cuda:0') factor: tensor(0.0881, device='cuda:0')
reward_raw: tensor(0.4847, device='cuda:0') reward: tensor(1.3968, device='cuda:0') factor: tensor(0.0879, device='cuda:0')
reward_raw: tensor(0.4677, device='cuda:0') reward: tensor(1.3781, device='cuda:0') factor: tensor(0.0896, device='cuda:0')
reward_raw: tensor(0.4770, device='cuda:0') reward: tensor(1.3839, device='cuda:0') factor: tensor(0.0931, device='cuda:0')


Epoch 1, Step 1893: Loss = 0.1702, Actor Loss = 0.1361, Critic Loss = 0.0681, Entropy = 0.0204, Returns = 1.4281, Value = 1.5062, mask_loss = 0.0955std_loss = 0.2780:  35%|███▍      | 1893/5475 [8:28:07<15:08:42, 15.22s/it]  

ratio tensor(0.9842, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4371, device='cuda:0') reward: tensor(1.3457, device='cuda:0') factor: tensor(0.0914, device='cuda:0')
reward_raw: tensor(0.4413, device='cuda:0') reward: tensor(1.3503, device='cuda:0') factor: tensor(0.0910, device='cuda:0')
reward_raw: tensor(0.4437, device='cuda:0') reward: tensor(1.3550, device='cuda:0') factor: tensor(0.0887, device='cuda:0')
reward_raw: tensor(0.4435, device='cuda:0') reward: tensor(1.3508, device='cuda:0') factor: tensor(0.0927, device='cuda:0')
reward_raw: tensor(0.4424, device='cuda:0') reward: tensor(1.3499, device='cuda:0') factor: tensor(0.0925, device='cuda:0')


Epoch 1, Step 1894: Loss = 0.2316, Actor Loss = 0.1954, Critic Loss = 0.0724, Entropy = 0.0358, Returns = 1.3966, Value = 1.2489, mask_loss = 0.1041std_loss = 0.2805:  35%|███▍      | 1894/5475 [8:28:27<16:37:30, 16.71s/it]

ratio tensor(1.3161, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4917, device='cuda:0') reward: tensor(1.4151, device='cuda:0') factor: tensor(0.0766, device='cuda:0')
reward_raw: tensor(0.4786, device='cuda:0') reward: tensor(1.3985, device='cuda:0') factor: tensor(0.0801, device='cuda:0')
reward_raw: tensor(0.4741, device='cuda:0') reward: tensor(1.3990, device='cuda:0') factor: tensor(0.0751, device='cuda:0')
reward_raw: tensor(0.4850, device='cuda:0') reward: tensor(1.4072, device='cuda:0') factor: tensor(0.0778, device='cuda:0')
reward_raw: tensor(0.4882, device='cuda:0') reward: tensor(1.4106, device='cuda:0') factor: tensor(0.0776, device='cuda:0')


Epoch 1, Step 1895: Loss = -0.1613, Actor Loss = -0.2022, Critic Loss = 0.0819, Entropy = 0.0206, Returns = 1.4278, Value = 1.5542, mask_loss = 0.0782std_loss = 0.2535:  35%|███▍      | 1895/5475 [8:28:41<15:45:53, 15.85s/it]

ratio tensor(1.0232, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5433, device='cuda:0') reward: tensor(1.4628, device='cuda:0') factor: tensor(0.0805, device='cuda:0')
reward_raw: tensor(0.5283, device='cuda:0') reward: tensor(1.4498, device='cuda:0') factor: tensor(0.0784, device='cuda:0')
reward_raw: tensor(0.5416, device='cuda:0') reward: tensor(1.4589, device='cuda:0') factor: tensor(0.0826, device='cuda:0')
reward_raw: tensor(0.5412, device='cuda:0') reward: tensor(1.4612, device='cuda:0') factor: tensor(0.0799, device='cuda:0')
reward_raw: tensor(0.5382, device='cuda:0') reward: tensor(1.4552, device='cuda:0') factor: tensor(0.0831, device='cuda:0')


Epoch 1, Step 1896: Loss = 0.1557, Actor Loss = 0.1233, Critic Loss = 0.0647, Entropy = 0.0130, Returns = 1.4604, Value = 1.4679, mask_loss = 0.0732std_loss = 0.2515:  35%|███▍      | 1896/5475 [8:28:55<15:20:11, 15.43s/it]  

ratio tensor(0.9779, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5289, device='cuda:0') reward: tensor(1.4444, device='cuda:0') factor: tensor(0.0845, device='cuda:0')
reward_raw: tensor(0.5270, device='cuda:0') reward: tensor(1.4413, device='cuda:0') factor: tensor(0.0857, device='cuda:0')
reward_raw: tensor(0.5539, device='cuda:0') reward: tensor(1.4708, device='cuda:0') factor: tensor(0.0831, device='cuda:0')
reward_raw: tensor(0.5402, device='cuda:0') reward: tensor(1.4563, device='cuda:0') factor: tensor(0.0839, device='cuda:0')
reward_raw: tensor(0.5290, device='cuda:0') reward: tensor(1.4430, device='cuda:0') factor: tensor(0.0859, device='cuda:0')


Epoch 1, Step 1897: Loss = -0.0257, Actor Loss = -0.0528, Critic Loss = 0.0541, Entropy = 0.0232, Returns = 1.4696, Value = 1.5005, mask_loss = 0.0881std_loss = 0.2661:  35%|███▍      | 1897/5475 [8:29:08<14:45:04, 14.84s/it]

ratio tensor(1.1163, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4487, device='cuda:0') reward: tensor(1.3464, device='cuda:0') factor: tensor(0.1023, device='cuda:0')
reward_raw: tensor(0.4294, device='cuda:0') reward: tensor(1.3256, device='cuda:0') factor: tensor(0.1038, device='cuda:0')
reward_raw: tensor(0.4197, device='cuda:0') reward: tensor(1.3201, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.4312, device='cuda:0') reward: tensor(1.3299, device='cuda:0') factor: tensor(0.1013, device='cuda:0')
reward_raw: tensor(0.4336, device='cuda:0') reward: tensor(1.3347, device='cuda:0') factor: tensor(0.0989, device='cuda:0')


Epoch 1, Step 1898: Loss = 0.1222, Actor Loss = 0.0956, Critic Loss = 0.0532, Entropy = 0.0302, Returns = 1.3450, Value = 1.2847, mask_loss = 0.1046std_loss = 0.2790:  35%|███▍      | 1898/5475 [8:29:26<15:41:18, 15.79s/it]  

ratio tensor(0.6911, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4777, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.0951, device='cuda:0')
reward_raw: tensor(0.4755, device='cuda:0') reward: tensor(1.3804, device='cuda:0') factor: tensor(0.0951, device='cuda:0')
reward_raw: tensor(0.4649, device='cuda:0') reward: tensor(1.3693, device='cuda:0') factor: tensor(0.0957, device='cuda:0')
reward_raw: tensor(0.4707, device='cuda:0') reward: tensor(1.3751, device='cuda:0') factor: tensor(0.0956, device='cuda:0')
reward_raw: tensor(0.4723, device='cuda:0') reward: tensor(1.3792, device='cuda:0') factor: tensor(0.0931, device='cuda:0')


Epoch 1, Step 1899: Loss = -0.0578, Actor Loss = -0.0912, Critic Loss = 0.0668, Entropy = 0.0224, Returns = 1.4171, Value = 1.4270, mask_loss = 0.1077std_loss = 0.2939:  35%|███▍      | 1899/5475 [8:29:42<15:44:50, 15.85s/it]

ratio tensor(1.3086, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4225, device='cuda:0') reward: tensor(1.3317, device='cuda:0') factor: tensor(0.0908, device='cuda:0')
reward_raw: tensor(0.4207, device='cuda:0') reward: tensor(1.3326, device='cuda:0') factor: tensor(0.0882, device='cuda:0')
reward_raw: tensor(0.4168, device='cuda:0') reward: tensor(1.3287, device='cuda:0') factor: tensor(0.0881, device='cuda:0')
reward_raw: tensor(0.4311, device='cuda:0') reward: tensor(1.3409, device='cuda:0') factor: tensor(0.0902, device='cuda:0')
reward_raw: tensor(0.4281, device='cuda:0') reward: tensor(1.3358, device='cuda:0') factor: tensor(0.0923, device='cuda:0')


Epoch 1, Step 1900: Loss = 0.1068, Actor Loss = 0.0663, Critic Loss = 0.0809, Entropy = 0.0231, Returns = 1.3304, Value = 1.2376, mask_loss = 0.1028std_loss = 0.2863:  35%|███▍      | 1900/5475 [8:29:58<15:41:25, 15.80s/it]  

ratio tensor(1.0129, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5353, device='cuda:0') reward: tensor(1.4316, device='cuda:0') factor: tensor(0.1037, device='cuda:0')
reward_raw: tensor(0.5271, device='cuda:0') reward: tensor(1.4244, device='cuda:0') factor: tensor(0.1027, device='cuda:0')
reward_raw: tensor(0.5215, device='cuda:0') reward: tensor(1.4168, device='cuda:0') factor: tensor(0.1047, device='cuda:0')
reward_raw: tensor(0.5254, device='cuda:0') reward: tensor(1.4218, device='cuda:0') factor: tensor(0.1036, device='cuda:0')
reward_raw: tensor(0.5164, device='cuda:0') reward: tensor(1.4156, device='cuda:0') factor: tensor(0.1008, device='cuda:0')


Epoch 1, Step 1901: Loss = -0.2315, Actor Loss = -0.2908, Critic Loss = 0.1187, Entropy = 0.0272, Returns = 1.4556, Value = 1.5352, mask_loss = 0.0900std_loss = 0.2640:  35%|███▍      | 1900/5475 [8:30:19<15:41:25, 15.80s/it]

ratio tensor(1.6964, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1901: Loss = -0.2315, Actor Loss = -0.2908, Critic Loss = 0.1187, Entropy = 0.0272, Returns = 1.4556, Value = 1.5352, mask_loss = 0.0900std_loss = 0.2640:  35%|███▍      | 1901/5475 [8:30:19<17:12:42, 17.34s/it]

reward_raw: tensor(0.5144, device='cuda:0') reward: tensor(1.4140, device='cuda:0') factor: tensor(0.1004, device='cuda:0')
reward_raw: tensor(0.5204, device='cuda:0') reward: tensor(1.4208, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.4905, device='cuda:0') reward: tensor(1.3932, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.5226, device='cuda:0') reward: tensor(1.4230, device='cuda:0') factor: tensor(0.0996, device='cuda:0')
reward_raw: tensor(0.5057, device='cuda:0') reward: tensor(1.4035, device='cuda:0') factor: tensor(0.1022, device='cuda:0')


Epoch 1, Step 1902: Loss = 0.1716, Actor Loss = 0.1448, Critic Loss = 0.0536, Entropy = 0.0254, Returns = 1.4502, Value = 1.3904, mask_loss = 0.0967std_loss = 0.2716:  35%|███▍      | 1902/5475 [8:30:41<18:25:57, 18.57s/it]  

ratio tensor(0.8949, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5031, device='cuda:0') reward: tensor(1.3965, device='cuda:0') factor: tensor(0.1066, device='cuda:0')
reward_raw: tensor(0.5017, device='cuda:0') reward: tensor(1.3969, device='cuda:0') factor: tensor(0.1047, device='cuda:0')
reward_raw: tensor(0.4940, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.1035, device='cuda:0')
reward_raw: tensor(0.4859, device='cuda:0') reward: tensor(1.3794, device='cuda:0') factor: tensor(0.1066, device='cuda:0')
reward_raw: tensor(0.5013, device='cuda:0') reward: tensor(1.3910, device='cuda:0') factor: tensor(0.1103, device='cuda:0')


Epoch 1, Step 1903: Loss = 0.0481, Actor Loss = 0.0205, Critic Loss = 0.0551, Entropy = 0.0318, Returns = 1.3795, Value = 1.3527, mask_loss = 0.1208std_loss = 0.3046:  35%|███▍      | 1903/5475 [8:31:02<19:22:58, 19.53s/it]

ratio tensor(0.9078, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5541, device='cuda:0') reward: tensor(1.4621, device='cuda:0') factor: tensor(0.0920, device='cuda:0')
reward_raw: tensor(0.5197, device='cuda:0') reward: tensor(1.4268, device='cuda:0') factor: tensor(0.0929, device='cuda:0')
reward_raw: tensor(0.5283, device='cuda:0') reward: tensor(1.4403, device='cuda:0') factor: tensor(0.0880, device='cuda:0')
reward_raw: tensor(0.5475, device='cuda:0') reward: tensor(1.4502, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.5224, device='cuda:0') reward: tensor(1.4299, device='cuda:0') factor: tensor(0.0924, device='cuda:0')


Epoch 1, Step 1904: Loss = -0.1899, Actor Loss = -0.2283, Critic Loss = 0.0768, Entropy = 0.0201, Returns = 1.5046, Value = 1.4330, mask_loss = 0.0933std_loss = 0.2779:  35%|███▍      | 1904/5475 [8:31:18<18:18:45, 18.46s/it]

ratio tensor(1.3397, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5255, device='cuda:0') reward: tensor(1.4184, device='cuda:0') factor: tensor(0.1071, device='cuda:0')
reward_raw: tensor(0.5229, device='cuda:0') reward: tensor(1.4151, device='cuda:0') factor: tensor(0.1079, device='cuda:0')
reward_raw: tensor(0.5102, device='cuda:0') reward: tensor(1.4007, device='cuda:0') factor: tensor(0.1095, device='cuda:0')
reward_raw: tensor(0.5338, device='cuda:0') reward: tensor(1.4216, device='cuda:0') factor: tensor(0.1122, device='cuda:0')
reward_raw: tensor(0.5152, device='cuda:0') reward: tensor(1.4076, device='cuda:0') factor: tensor(0.1076, device='cuda:0')


Epoch 1, Step 1905: Loss = 0.0359, Actor Loss = 0.0100, Critic Loss = 0.0518, Entropy = 0.0308, Returns = 1.4058, Value = 1.4791, mask_loss = 0.1173std_loss = 0.2990:  35%|███▍      | 1905/5475 [8:31:35<17:51:39, 18.01s/it]  

ratio tensor(1.0118, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4428, device='cuda:0') reward: tensor(1.3531, device='cuda:0') factor: tensor(0.0897, device='cuda:0')
reward_raw: tensor(0.4347, device='cuda:0') reward: tensor(1.3470, device='cuda:0') factor: tensor(0.0878, device='cuda:0')
reward_raw: tensor(0.4493, device='cuda:0') reward: tensor(1.3576, device='cuda:0') factor: tensor(0.0917, device='cuda:0')
reward_raw: tensor(0.4471, device='cuda:0') reward: tensor(1.3585, device='cuda:0') factor: tensor(0.0886, device='cuda:0')
reward_raw: tensor(0.4426, device='cuda:0') reward: tensor(1.3556, device='cuda:0') factor: tensor(0.0870, device='cuda:0')


Epoch 1, Step 1906: Loss = 0.1171, Actor Loss = 0.0933, Critic Loss = 0.0476, Entropy = 0.0349, Returns = 1.3646, Value = 1.2828, mask_loss = 0.1062std_loss = 0.2792:  35%|███▍      | 1906/5475 [8:31:49<16:43:25, 16.87s/it]

ratio tensor(1.1607, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4333, device='cuda:0') reward: tensor(1.3466, device='cuda:0') factor: tensor(0.0867, device='cuda:0')
reward_raw: tensor(0.4347, device='cuda:0') reward: tensor(1.3480, device='cuda:0') factor: tensor(0.0867, device='cuda:0')
reward_raw: tensor(0.4443, device='cuda:0') reward: tensor(1.3561, device='cuda:0') factor: tensor(0.0882, device='cuda:0')
reward_raw: tensor(0.4301, device='cuda:0') reward: tensor(1.3435, device='cuda:0') factor: tensor(0.0866, device='cuda:0')
reward_raw: tensor(0.4468, device='cuda:0') reward: tensor(1.3577, device='cuda:0') factor: tensor(0.0892, device='cuda:0')


Epoch 1, Step 1907: Loss = 0.0250, Actor Loss = 0.0038, Critic Loss = 0.0425, Entropy = 0.0197, Returns = 1.3906, Value = 1.3892, mask_loss = 0.0841std_loss = 0.2628:  35%|███▍      | 1907/5475 [8:32:03<15:50:52, 15.99s/it]

ratio tensor(1.0404, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4983, device='cuda:0') reward: tensor(1.4095, device='cuda:0') factor: tensor(0.0888, device='cuda:0')
reward_raw: tensor(0.4998, device='cuda:0') reward: tensor(1.4095, device='cuda:0') factor: tensor(0.0903, device='cuda:0')
reward_raw: tensor(0.5208, device='cuda:0') reward: tensor(1.4296, device='cuda:0') factor: tensor(0.0912, device='cuda:0')
reward_raw: tensor(0.4976, device='cuda:0') reward: tensor(1.4039, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.5109, device='cuda:0') reward: tensor(1.4198, device='cuda:0') factor: tensor(0.0911, device='cuda:0')


Epoch 1, Step 1908: Loss = -0.0962, Actor Loss = -0.1270, Critic Loss = 0.0616, Entropy = 0.0204, Returns = 1.4143, Value = 1.5804, mask_loss = 0.0826std_loss = 0.2611:  35%|███▍      | 1908/5475 [8:32:19<15:39:11, 15.80s/it]

ratio tensor(0.9691, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4772, device='cuda:0') reward: tensor(1.3714, device='cuda:0') factor: tensor(0.1058, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.1030, device='cuda:0')
reward_raw: tensor(0.4932, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.1026, device='cuda:0')
reward_raw: tensor(0.4678, device='cuda:0') reward: tensor(1.3659, device='cuda:0') factor: tensor(0.1019, device='cuda:0')
reward_raw: tensor(0.4948, device='cuda:0') reward: tensor(1.3875, device='cuda:0') factor: tensor(0.1073, device='cuda:0')


Epoch 1, Step 1909: Loss = 0.2352, Actor Loss = 0.1988, Critic Loss = 0.0729, Entropy = 0.0249, Returns = 1.3976, Value = 1.2738, mask_loss = 0.1129std_loss = 0.2895:  35%|███▍      | 1909/5475 [8:32:34<15:24:04, 15.55s/it]  

ratio tensor(0.8799, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5283, device='cuda:0') reward: tensor(1.4441, device='cuda:0') factor: tensor(0.0842, device='cuda:0')
reward_raw: tensor(0.5064, device='cuda:0') reward: tensor(1.4244, device='cuda:0') factor: tensor(0.0820, device='cuda:0')
reward_raw: tensor(0.5339, device='cuda:0') reward: tensor(1.4509, device='cuda:0') factor: tensor(0.0830, device='cuda:0')
reward_raw: tensor(0.5396, device='cuda:0') reward: tensor(1.4573, device='cuda:0') factor: tensor(0.0823, device='cuda:0')
reward_raw: tensor(0.5334, device='cuda:0') reward: tensor(1.4512, device='cuda:0') factor: tensor(0.0822, device='cuda:0')


Epoch 1, Step 1910: Loss = -0.0608, Actor Loss = -0.0995, Critic Loss = 0.0774, Entropy = 0.0203, Returns = 1.3963, Value = 1.5723, mask_loss = 0.0904std_loss = 0.2713:  35%|███▍      | 1910/5475 [8:32:47<14:41:31, 14.84s/it]

ratio tensor(1.0876, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5219, device='cuda:0') reward: tensor(1.4356, device='cuda:0') factor: tensor(0.0863, device='cuda:0')
reward_raw: tensor(0.5150, device='cuda:0') reward: tensor(1.4263, device='cuda:0') factor: tensor(0.0887, device='cuda:0')
reward_raw: tensor(0.5322, device='cuda:0') reward: tensor(1.4417, device='cuda:0') factor: tensor(0.0904, device='cuda:0')
reward_raw: tensor(0.4995, device='cuda:0') reward: tensor(1.4138, device='cuda:0') factor: tensor(0.0857, device='cuda:0')
reward_raw: tensor(0.5329, device='cuda:0') reward: tensor(1.4432, device='cuda:0') factor: tensor(0.0897, device='cuda:0')


Epoch 1, Step 1911: Loss = 0.1516, Actor Loss = 0.1279, Critic Loss = 0.0474, Entropy = 0.0240, Returns = 1.3786, Value = 1.4229, mask_loss = 0.1015std_loss = 0.2809:  35%|███▍      | 1911/5475 [8:33:06<15:49:27, 15.98s/it]  

ratio tensor(0.8424, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4563, device='cuda:0') reward: tensor(1.3492, device='cuda:0') factor: tensor(0.1072, device='cuda:0')
reward_raw: tensor(0.4580, device='cuda:0') reward: tensor(1.3505, device='cuda:0') factor: tensor(0.1075, device='cuda:0')
reward_raw: tensor(0.4405, device='cuda:0') reward: tensor(1.3354, device='cuda:0') factor: tensor(0.1051, device='cuda:0')
reward_raw: tensor(0.4515, device='cuda:0') reward: tensor(1.3514, device='cuda:0') factor: tensor(0.1001, device='cuda:0')
reward_raw: tensor(0.4563, device='cuda:0') reward: tensor(1.3505, device='cuda:0') factor: tensor(0.1058, device='cuda:0')


Epoch 1, Step 1912: Loss = 0.0248, Actor Loss = 0.0044, Critic Loss = 0.0408, Entropy = 0.0311, Returns = 1.3467, Value = 1.2856, mask_loss = 0.1503std_loss = 0.3377:  35%|███▍      | 1912/5475 [8:33:19<15:03:58, 15.22s/it]

ratio tensor(0.9989, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4389, device='cuda:0') reward: tensor(1.3446, device='cuda:0') factor: tensor(0.0942, device='cuda:0')
reward_raw: tensor(0.4245, device='cuda:0') reward: tensor(1.3291, device='cuda:0') factor: tensor(0.0954, device='cuda:0')
reward_raw: tensor(0.4348, device='cuda:0') reward: tensor(1.3388, device='cuda:0') factor: tensor(0.0961, device='cuda:0')
reward_raw: tensor(0.4311, device='cuda:0') reward: tensor(1.3366, device='cuda:0') factor: tensor(0.0945, device='cuda:0')
reward_raw: tensor(0.4181, device='cuda:0') reward: tensor(1.3223, device='cuda:0') factor: tensor(0.0958, device='cuda:0')


Epoch 1, Step 1913: Loss = 0.0433, Actor Loss = 0.0083, Critic Loss = 0.0702, Entropy = 0.0239, Returns = 1.4162, Value = 1.3619, mask_loss = 0.0935std_loss = 0.2722:  35%|███▍      | 1913/5475 [8:33:33<14:37:48, 14.79s/it]

ratio tensor(0.9634, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5002, device='cuda:0') reward: tensor(1.4210, device='cuda:0') factor: tensor(0.0792, device='cuda:0')
reward_raw: tensor(0.5066, device='cuda:0') reward: tensor(1.4240, device='cuda:0') factor: tensor(0.0826, device='cuda:0')
reward_raw: tensor(0.5240, device='cuda:0') reward: tensor(1.4436, device='cuda:0') factor: tensor(0.0805, device='cuda:0')
reward_raw: tensor(0.5072, device='cuda:0') reward: tensor(1.4291, device='cuda:0') factor: tensor(0.0781, device='cuda:0')
reward_raw: tensor(0.5056, device='cuda:0') reward: tensor(1.4268, device='cuda:0') factor: tensor(0.0788, device='cuda:0')


Epoch 1, Step 1914: Loss = -0.1176, Actor Loss = -0.1427, Critic Loss = 0.0503, Entropy = 0.0305, Returns = 1.3821, Value = 1.4088, mask_loss = 0.0780std_loss = 0.2424:  35%|███▍      | 1914/5475 [8:33:52<16:01:30, 16.20s/it]

ratio tensor(0.9810, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4569, device='cuda:0') reward: tensor(1.3623, device='cuda:0') factor: tensor(0.0946, device='cuda:0')
reward_raw: tensor(0.4607, device='cuda:0') reward: tensor(1.3670, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.4589, device='cuda:0') reward: tensor(1.3640, device='cuda:0') factor: tensor(0.0949, device='cuda:0')
reward_raw: tensor(0.4599, device='cuda:0') reward: tensor(1.3634, device='cuda:0') factor: tensor(0.0966, device='cuda:0')
reward_raw: tensor(0.4553, device='cuda:0') reward: tensor(1.3614, device='cuda:0') factor: tensor(0.0939, device='cuda:0')


Epoch 1, Step 1915: Loss = 0.0411, Actor Loss = 0.0142, Critic Loss = 0.0538, Entropy = 0.0284, Returns = 1.3737, Value = 1.3180, mask_loss = 0.1019std_loss = 0.2839:  35%|███▍      | 1915/5475 [8:34:06<15:10:02, 15.34s/it]  

ratio tensor(1.1059, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4519, device='cuda:0') reward: tensor(1.3589, device='cuda:0') factor: tensor(0.0930, device='cuda:0')
reward_raw: tensor(0.4637, device='cuda:0') reward: tensor(1.3747, device='cuda:0') factor: tensor(0.0890, device='cuda:0')
reward_raw: tensor(0.4749, device='cuda:0') reward: tensor(1.3826, device='cuda:0') factor: tensor(0.0924, device='cuda:0')
reward_raw: tensor(0.4709, device='cuda:0') reward: tensor(1.3824, device='cuda:0') factor: tensor(0.0885, device='cuda:0')
reward_raw: tensor(0.4661, device='cuda:0') reward: tensor(1.3779, device='cuda:0') factor: tensor(0.0882, device='cuda:0')


Epoch 1, Step 1916: Loss = -0.0256, Actor Loss = -0.0474, Critic Loss = 0.0436, Entropy = 0.0364, Returns = 1.3761, Value = 1.3076, mask_loss = 0.0973std_loss = 0.2725:  35%|███▍      | 1916/5475 [8:34:21<15:08:58, 15.32s/it]

ratio tensor(0.8557, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5236, device='cuda:0') reward: tensor(1.4385, device='cuda:0') factor: tensor(0.0851, device='cuda:0')
reward_raw: tensor(0.5371, device='cuda:0') reward: tensor(1.4494, device='cuda:0') factor: tensor(0.0878, device='cuda:0')
reward_raw: tensor(0.5242, device='cuda:0') reward: tensor(1.4385, device='cuda:0') factor: tensor(0.0858, device='cuda:0')
reward_raw: tensor(0.5283, device='cuda:0') reward: tensor(1.4431, device='cuda:0') factor: tensor(0.0852, device='cuda:0')
reward_raw: tensor(0.5183, device='cuda:0') reward: tensor(1.4322, device='cuda:0') factor: tensor(0.0861, device='cuda:0')


Epoch 1, Step 1917: Loss = -0.0545, Actor Loss = -0.0730, Critic Loss = 0.0371, Entropy = 0.0228, Returns = 1.5058, Value = 1.5739, mask_loss = 0.0839std_loss = 0.2624:  35%|███▌      | 1917/5475 [8:34:33<14:05:57, 14.27s/it]

ratio tensor(0.9874, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.4001, device='cuda:0') factor: tensor(0.0962, device='cuda:0')
reward_raw: tensor(0.4992, device='cuda:0') reward: tensor(1.4020, device='cuda:0') factor: tensor(0.0972, device='cuda:0')
reward_raw: tensor(0.4858, device='cuda:0') reward: tensor(1.3904, device='cuda:0') factor: tensor(0.0955, device='cuda:0')
reward_raw: tensor(0.4909, device='cuda:0') reward: tensor(1.3944, device='cuda:0') factor: tensor(0.0965, device='cuda:0')
reward_raw: tensor(0.4963, device='cuda:0') reward: tensor(1.3975, device='cuda:0') factor: tensor(0.0988, device='cuda:0')


Epoch 1, Step 1918: Loss = -0.0610, Actor Loss = -0.1023, Critic Loss = 0.0827, Entropy = 0.0236, Returns = 1.3568, Value = 1.3441, mask_loss = 0.0912std_loss = 0.2710:  35%|███▌      | 1918/5475 [8:34:47<13:58:52, 14.15s/it]

ratio tensor(1.2013, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4722, device='cuda:0') reward: tensor(1.3792, device='cuda:0') factor: tensor(0.0930, device='cuda:0')
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.3913, device='cuda:0') factor: tensor(0.0918, device='cuda:0')
reward_raw: tensor(0.4608, device='cuda:0') reward: tensor(1.3684, device='cuda:0') factor: tensor(0.0924, device='cuda:0')
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3658, device='cuda:0') factor: tensor(0.0956, device='cuda:0')
reward_raw: tensor(0.4735, device='cuda:0') reward: tensor(1.3789, device='cuda:0') factor: tensor(0.0946, device='cuda:0')


Epoch 1, Step 1919: Loss = 0.2689, Actor Loss = 0.2350, Critic Loss = 0.0680, Entropy = 0.0242, Returns = 1.3516, Value = 1.2631, mask_loss = 0.1044std_loss = 0.2879:  35%|███▌      | 1919/5475 [8:35:01<14:00:59, 14.19s/it]  

ratio tensor(0.7412, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4805, device='cuda:0') reward: tensor(1.3490, device='cuda:0') factor: tensor(0.1315, device='cuda:0')
reward_raw: tensor(0.4800, device='cuda:0') reward: tensor(1.3522, device='cuda:0') factor: tensor(0.1278, device='cuda:0')
reward_raw: tensor(0.4658, device='cuda:0') reward: tensor(1.3355, device='cuda:0') factor: tensor(0.1303, device='cuda:0')
reward_raw: tensor(0.4731, device='cuda:0') reward: tensor(1.3455, device='cuda:0') factor: tensor(0.1276, device='cuda:0')
reward_raw: tensor(0.4628, device='cuda:0') reward: tensor(1.3369, device='cuda:0') factor: tensor(0.1259, device='cuda:0')


Epoch 1, Step 1920: Loss = -0.1687, Actor Loss = -0.2155, Critic Loss = 0.0937, Entropy = 0.0260, Returns = 1.3303, Value = 1.4450, mask_loss = 0.1281std_loss = 0.3059:  35%|███▌      | 1920/5475 [8:35:14<13:41:45, 13.87s/it]

ratio tensor(1.0208, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6205, device='cuda:0') reward: tensor(1.5344, device='cuda:0') factor: tensor(0.0860, device='cuda:0')
reward_raw: tensor(0.6075, device='cuda:0') reward: tensor(1.5200, device='cuda:0') factor: tensor(0.0875, device='cuda:0')
reward_raw: tensor(0.6070, device='cuda:0') reward: tensor(1.5212, device='cuda:0') factor: tensor(0.0858, device='cuda:0')
reward_raw: tensor(0.6106, device='cuda:0') reward: tensor(1.5224, device='cuda:0') factor: tensor(0.0881, device='cuda:0')
reward_raw: tensor(0.6079, device='cuda:0') reward: tensor(1.5207, device='cuda:0') factor: tensor(0.0872, device='cuda:0')


Epoch 1, Step 1921: Loss = 0.0765, Actor Loss = 0.0330, Critic Loss = 0.0870, Entropy = 0.0313, Returns = 1.5330, Value = 1.6665, mask_loss = 0.1037std_loss = 0.2818:  35%|███▌      | 1920/5475 [8:35:31<13:41:45, 13.87s/it]  

ratio tensor(1.1472, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 1921: Loss = 0.0765, Actor Loss = 0.0330, Critic Loss = 0.0870, Entropy = 0.0313, Returns = 1.5330, Value = 1.6665, mask_loss = 0.1037std_loss = 0.2818:  35%|███▌      | 1921/5475 [8:35:31<14:47:18, 14.98s/it]

reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.4145, device='cuda:0') factor: tensor(0.0846, device='cuda:0')
reward_raw: tensor(0.4872, device='cuda:0') reward: tensor(1.4028, device='cuda:0') factor: tensor(0.0844, device='cuda:0')
reward_raw: tensor(0.4888, device='cuda:0') reward: tensor(1.4040, device='cuda:0') factor: tensor(0.0847, device='cuda:0')
reward_raw: tensor(0.5008, device='cuda:0') reward: tensor(1.4172, device='cuda:0') factor: tensor(0.0836, device='cuda:0')
reward_raw: tensor(0.4767, device='cuda:0') reward: tensor(1.3937, device='cuda:0') factor: tensor(0.0831, device='cuda:0')


Epoch 1, Step 1922: Loss = 0.2496, Actor Loss = 0.2137, Critic Loss = 0.0719, Entropy = 0.0233, Returns = 1.3954, Value = 1.2907, mask_loss = 0.0823std_loss = 0.2579:  35%|███▌      | 1922/5475 [8:35:45<14:18:21, 14.50s/it]

ratio tensor(0.7772, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5713, device='cuda:0') reward: tensor(1.4778, device='cuda:0') factor: tensor(0.0935, device='cuda:0')
reward_raw: tensor(0.6059, device='cuda:0') reward: tensor(1.5076, device='cuda:0') factor: tensor(0.0983, device='cuda:0')
reward_raw: tensor(0.5836, device='cuda:0') reward: tensor(1.4881, device='cuda:0') factor: tensor(0.0955, device='cuda:0')
reward_raw: tensor(0.5626, device='cuda:0') reward: tensor(1.4689, device='cuda:0') factor: tensor(0.0937, device='cuda:0')
reward_raw: tensor(0.5802, device='cuda:0') reward: tensor(1.4881, device='cuda:0') factor: tensor(0.0921, device='cuda:0')


Epoch 1, Step 1923: Loss = -0.2277, Actor Loss = -0.2410, Critic Loss = 0.0267, Entropy = 0.0346, Returns = 1.4917, Value = 1.5821, mask_loss = 0.1098std_loss = 0.2859:  35%|███▌      | 1923/5475 [8:35:56<13:21:47, 13.54s/it]

ratio tensor(1.9563, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.4022, device='cuda:0') factor: tensor(0.0946, device='cuda:0')
reward_raw: tensor(0.4832, device='cuda:0') reward: tensor(1.3911, device='cuda:0') factor: tensor(0.0921, device='cuda:0')
reward_raw: tensor(0.4924, device='cuda:0') reward: tensor(1.4000, device='cuda:0') factor: tensor(0.0924, device='cuda:0')
reward_raw: tensor(0.4799, device='cuda:0') reward: tensor(1.3863, device='cuda:0') factor: tensor(0.0936, device='cuda:0')
reward_raw: tensor(0.4830, device='cuda:0') reward: tensor(1.3893, device='cuda:0') factor: tensor(0.0936, device='cuda:0')


Epoch 1, Step 1924: Loss = 0.1717, Actor Loss = 0.1443, Critic Loss = 0.0549, Entropy = 0.0313, Returns = 1.3895, Value = 1.3451, mask_loss = 0.1060std_loss = 0.2876:  35%|███▌      | 1924/5475 [8:36:10<13:19:40, 13.51s/it]  

ratio tensor(0.7783, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4951, device='cuda:0') reward: tensor(1.3870, device='cuda:0') factor: tensor(0.1081, device='cuda:0')
reward_raw: tensor(0.4991, device='cuda:0') reward: tensor(1.3882, device='cuda:0') factor: tensor(0.1109, device='cuda:0')
reward_raw: tensor(0.5118, device='cuda:0') reward: tensor(1.3999, device='cuda:0') factor: tensor(0.1119, device='cuda:0')
reward_raw: tensor(0.4950, device='cuda:0') reward: tensor(1.3840, device='cuda:0') factor: tensor(0.1110, device='cuda:0')
reward_raw: tensor(0.4847, device='cuda:0') reward: tensor(1.3751, device='cuda:0') factor: tensor(0.1096, device='cuda:0')


Epoch 1, Step 1925: Loss = -0.0994, Actor Loss = -0.1547, Critic Loss = 0.1106, Entropy = 0.0345, Returns = 1.4860, Value = 1.3703, mask_loss = 0.1075std_loss = 0.2824:  35%|███▌      | 1925/5475 [8:36:27<14:19:29, 14.53s/it]

ratio tensor(1.0006, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6087, device='cuda:0') reward: tensor(1.5180, device='cuda:0') factor: tensor(0.0907, device='cuda:0')
reward_raw: tensor(0.5833, device='cuda:0') reward: tensor(1.4947, device='cuda:0') factor: tensor(0.0886, device='cuda:0')
reward_raw: tensor(0.5996, device='cuda:0') reward: tensor(1.5124, device='cuda:0') factor: tensor(0.0872, device='cuda:0')
reward_raw: tensor(0.5875, device='cuda:0') reward: tensor(1.5026, device='cuda:0') factor: tensor(0.0850, device='cuda:0')
reward_raw: tensor(0.5860, device='cuda:0') reward: tensor(1.4981, device='cuda:0') factor: tensor(0.0879, device='cuda:0')


Epoch 1, Step 1926: Loss = -0.1028, Actor Loss = -0.1294, Critic Loss = 0.0532, Entropy = 0.0243, Returns = 1.5487, Value = 1.6171, mask_loss = 0.0767std_loss = 0.2455:  35%|███▌      | 1926/5475 [8:36:41<14:25:36, 14.63s/it]

ratio tensor(1.3310, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4857, device='cuda:0') reward: tensor(1.3905, device='cuda:0') factor: tensor(0.0951, device='cuda:0')
reward_raw: tensor(0.4885, device='cuda:0') reward: tensor(1.3938, device='cuda:0') factor: tensor(0.0946, device='cuda:0')
reward_raw: tensor(0.4988, device='cuda:0') reward: tensor(1.3997, device='cuda:0') factor: tensor(0.0990, device='cuda:0')
reward_raw: tensor(0.4930, device='cuda:0') reward: tensor(1.3967, device='cuda:0') factor: tensor(0.0964, device='cuda:0')
reward_raw: tensor(0.4899, device='cuda:0') reward: tensor(1.3949, device='cuda:0') factor: tensor(0.0950, device='cuda:0')


Epoch 1, Step 1927: Loss = 0.0818, Actor Loss = 0.0518, Critic Loss = 0.0599, Entropy = 0.0337, Returns = 1.4447, Value = 1.2867, mask_loss = 0.0934std_loss = 0.2615:  35%|███▌      | 1927/5475 [8:36:57<14:33:47, 14.78s/it]  

ratio tensor(0.7902, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4917, device='cuda:0') reward: tensor(1.4044, device='cuda:0') factor: tensor(0.0873, device='cuda:0')
reward_raw: tensor(0.4784, device='cuda:0') reward: tensor(1.3913, device='cuda:0') factor: tensor(0.0871, device='cuda:0')
reward_raw: tensor(0.5191, device='cuda:0') reward: tensor(1.4325, device='cuda:0') factor: tensor(0.0865, device='cuda:0')
reward_raw: tensor(0.4948, device='cuda:0') reward: tensor(1.4063, device='cuda:0') factor: tensor(0.0886, device='cuda:0')
reward_raw: tensor(0.5072, device='cuda:0') reward: tensor(1.4179, device='cuda:0') factor: tensor(0.0892, device='cuda:0')


Epoch 1, Step 1928: Loss = -0.1993, Actor Loss = -0.2204, Critic Loss = 0.0423, Entropy = 0.0347, Returns = 1.3987, Value = 1.4061, mask_loss = 0.0920std_loss = 0.2692:  35%|███▌      | 1928/5475 [8:37:13<14:59:38, 15.22s/it]

ratio tensor(1.5953, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5126, device='cuda:0') reward: tensor(1.4157, device='cuda:0') factor: tensor(0.0970, device='cuda:0')
reward_raw: tensor(0.5205, device='cuda:0') reward: tensor(1.4198, device='cuda:0') factor: tensor(0.1007, device='cuda:0')
reward_raw: tensor(0.5129, device='cuda:0') reward: tensor(1.4145, device='cuda:0') factor: tensor(0.0984, device='cuda:0')
reward_raw: tensor(0.5080, device='cuda:0') reward: tensor(1.4087, device='cuda:0') factor: tensor(0.0993, device='cuda:0')
reward_raw: tensor(0.5321, device='cuda:0') reward: tensor(1.4311, device='cuda:0') factor: tensor(0.1009, device='cuda:0')


Epoch 1, Step 1929: Loss = 0.0886, Actor Loss = 0.0590, Critic Loss = 0.0592, Entropy = 0.0290, Returns = 1.4468, Value = 1.4361, mask_loss = 0.1098std_loss = 0.2955:  35%|███▌      | 1929/5475 [8:37:28<14:51:12, 15.08s/it]  

ratio tensor(0.8258, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4672, device='cuda:0') reward: tensor(1.3649, device='cuda:0') factor: tensor(0.1024, device='cuda:0')
reward_raw: tensor(0.4626, device='cuda:0') reward: tensor(1.3643, device='cuda:0') factor: tensor(0.0983, device='cuda:0')
reward_raw: tensor(0.4666, device='cuda:0') reward: tensor(1.3688, device='cuda:0') factor: tensor(0.0978, device='cuda:0')
reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(1.3680, device='cuda:0') factor: tensor(0.0995, device='cuda:0')
reward_raw: tensor(0.4770, device='cuda:0') reward: tensor(1.3775, device='cuda:0') factor: tensor(0.0995, device='cuda:0')


Epoch 1, Step 1930: Loss = 0.2186, Actor Loss = 0.1880, Critic Loss = 0.0613, Entropy = 0.0409, Returns = 1.2510, Value = 1.3350, mask_loss = 0.0989std_loss = 0.2705:  35%|███▌      | 1930/5475 [8:37:47<16:01:38, 16.28s/it]

ratio tensor(0.7764, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5796, device='cuda:0') reward: tensor(1.4802, device='cuda:0') factor: tensor(0.0994, device='cuda:0')
reward_raw: tensor(0.5673, device='cuda:0') reward: tensor(1.4701, device='cuda:0') factor: tensor(0.0973, device='cuda:0')
reward_raw: tensor(0.5843, device='cuda:0') reward: tensor(1.4802, device='cuda:0') factor: tensor(0.1041, device='cuda:0')
reward_raw: tensor(0.5683, device='cuda:0') reward: tensor(1.4700, device='cuda:0') factor: tensor(0.0984, device='cuda:0')
reward_raw: tensor(0.5651, device='cuda:0') reward: tensor(1.4650, device='cuda:0') factor: tensor(0.1001, device='cuda:0')


Epoch 1, Step 1931: Loss = -0.1563, Actor Loss = -0.1878, Critic Loss = 0.0632, Entropy = 0.0369, Returns = 1.4394, Value = 1.4516, mask_loss = 0.1010std_loss = 0.2790:  35%|███▌      | 1931/5475 [8:38:00<15:10:08, 15.41s/it]

ratio tensor(1.3228, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5026, device='cuda:0') reward: tensor(1.4137, device='cuda:0') factor: tensor(0.0889, device='cuda:0')
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.4095, device='cuda:0') factor: tensor(0.0873, device='cuda:0')
reward_raw: tensor(0.5185, device='cuda:0') reward: tensor(1.4285, device='cuda:0') factor: tensor(0.0899, device='cuda:0')
reward_raw: tensor(0.4607, device='cuda:0') reward: tensor(1.3755, device='cuda:0') factor: tensor(0.0852, device='cuda:0')
reward_raw: tensor(0.5150, device='cuda:0') reward: tensor(1.4218, device='cuda:0') factor: tensor(0.0932, device='cuda:0')


Epoch 1, Step 1932: Loss = 0.1720, Actor Loss = 0.1533, Critic Loss = 0.0373, Entropy = 0.0265, Returns = 1.4057, Value = 1.3833, mask_loss = 0.0840std_loss = 0.2590:  35%|███▌      | 1932/5475 [8:38:17<15:35:19, 15.84s/it]  

ratio tensor(0.7098, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5061, device='cuda:0') reward: tensor(1.4214, device='cuda:0') factor: tensor(0.0847, device='cuda:0')
reward_raw: tensor(0.5025, device='cuda:0') reward: tensor(1.4203, device='cuda:0') factor: tensor(0.0823, device='cuda:0')
reward_raw: tensor(0.5244, device='cuda:0') reward: tensor(1.4404, device='cuda:0') factor: tensor(0.0840, device='cuda:0')
reward_raw: tensor(0.5097, device='cuda:0') reward: tensor(1.4271, device='cuda:0') factor: tensor(0.0826, device='cuda:0')
reward_raw: tensor(0.4968, device='cuda:0') reward: tensor(1.4138, device='cuda:0') factor: tensor(0.0830, device='cuda:0')


Epoch 1, Step 1933: Loss = -0.0224, Actor Loss = -0.0547, Critic Loss = 0.0648, Entropy = 0.0267, Returns = 1.4162, Value = 1.4376, mask_loss = 0.0831std_loss = 0.2556:  35%|███▌      | 1933/5475 [8:38:37<16:58:15, 17.25s/it]

ratio tensor(0.9126, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4546, device='cuda:0') reward: tensor(1.3479, device='cuda:0') factor: tensor(0.1068, device='cuda:0')
reward_raw: tensor(0.4640, device='cuda:0') reward: tensor(1.3567, device='cuda:0') factor: tensor(0.1073, device='cuda:0')
reward_raw: tensor(0.4510, device='cuda:0') reward: tensor(1.3449, device='cuda:0') factor: tensor(0.1061, device='cuda:0')
reward_raw: tensor(0.4484, device='cuda:0') reward: tensor(1.3427, device='cuda:0') factor: tensor(0.1057, device='cuda:0')
reward_raw: tensor(0.4654, device='cuda:0') reward: tensor(1.3577, device='cuda:0') factor: tensor(0.1077, device='cuda:0')


Epoch 1, Step 1934: Loss = 0.1679, Actor Loss = 0.1346, Critic Loss = 0.0667, Entropy = 0.0345, Returns = 1.2343, Value = 1.4069, mask_loss = 0.1237std_loss = 0.2851:  35%|███▌      | 1934/5475 [8:38:52<16:16:47, 16.55s/it]  

ratio tensor(0.8292, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4437, device='cuda:0') reward: tensor(1.3370, device='cuda:0') factor: tensor(0.1066, device='cuda:0')
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3486, device='cuda:0') factor: tensor(0.1128, device='cuda:0')
reward_raw: tensor(0.4483, device='cuda:0') reward: tensor(1.3375, device='cuda:0') factor: tensor(0.1109, device='cuda:0')
reward_raw: tensor(0.4503, device='cuda:0') reward: tensor(1.3362, device='cuda:0') factor: tensor(0.1141, device='cuda:0')
reward_raw: tensor(0.4561, device='cuda:0') reward: tensor(1.3480, device='cuda:0') factor: tensor(0.1082, device='cuda:0')


Epoch 1, Step 1935: Loss = -0.0216, Actor Loss = -0.0488, Critic Loss = 0.0544, Entropy = 0.0255, Returns = 1.2954, Value = 1.2574, mask_loss = 0.0886std_loss = 0.2669:  35%|███▌      | 1935/5475 [8:39:08<15:59:38, 16.26s/it]

ratio tensor(1.1216, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4502, device='cuda:0') reward: tensor(1.3627, device='cuda:0') factor: tensor(0.0874, device='cuda:0')
reward_raw: tensor(0.4653, device='cuda:0') reward: tensor(1.3766, device='cuda:0') factor: tensor(0.0888, device='cuda:0')
reward_raw: tensor(0.4856, device='cuda:0') reward: tensor(1.3963, device='cuda:0') factor: tensor(0.0893, device='cuda:0')
reward_raw: tensor(0.4636, device='cuda:0') reward: tensor(1.3727, device='cuda:0') factor: tensor(0.0908, device='cuda:0')
reward_raw: tensor(0.4614, device='cuda:0') reward: tensor(1.3708, device='cuda:0') factor: tensor(0.0907, device='cuda:0')


Epoch 1, Step 1936: Loss = 0.1126, Actor Loss = 0.0970, Critic Loss = 0.0314, Entropy = 0.0283, Returns = 1.3207, Value = 1.3525, mask_loss = 0.0788std_loss = 0.2494:  35%|███▌      | 1936/5475 [8:39:20<14:47:37, 15.05s/it]  

ratio tensor(0.7608, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4671, device='cuda:0') reward: tensor(1.3763, device='cuda:0') factor: tensor(0.0908, device='cuda:0')
reward_raw: tensor(0.4546, device='cuda:0') reward: tensor(1.3651, device='cuda:0') factor: tensor(0.0895, device='cuda:0')
reward_raw: tensor(0.4581, device='cuda:0') reward: tensor(1.3698, device='cuda:0') factor: tensor(0.0883, device='cuda:0')
reward_raw: tensor(0.4735, device='cuda:0') reward: tensor(1.3834, device='cuda:0') factor: tensor(0.0901, device='cuda:0')
reward_raw: tensor(0.4642, device='cuda:0') reward: tensor(1.3745, device='cuda:0') factor: tensor(0.0898, device='cuda:0')


Epoch 1, Step 1937: Loss = 0.0162, Actor Loss = -0.0199, Critic Loss = 0.0723, Entropy = 0.0461, Returns = 1.4168, Value = 1.3970, mask_loss = 0.0969std_loss = 0.2687:  35%|███▌      | 1937/5475 [8:39:39<15:55:29, 16.20s/it]

ratio tensor(2.7099, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5682, device='cuda:0') reward: tensor(1.4823, device='cuda:0') factor: tensor(0.0859, device='cuda:0')
reward_raw: tensor(0.5865, device='cuda:0') reward: tensor(1.4986, device='cuda:0') factor: tensor(0.0879, device='cuda:0')


Epoch 1, Step 1937: Loss = 0.0162, Actor Loss = -0.0199, Critic Loss = 0.0723, Entropy = 0.0461, Returns = 1.4168, Value = 1.3970, mask_loss = 0.0969std_loss = 0.2687:  35%|███▌      | 1937/5475 [8:39:44<15:49:20, 16.10s/it]


KeyboardInterrupt: 

In [71]:
import numpy as np
import torch.nn.functional as F
from datasets import Dataset

# Restore the mask generator from a saved checkpoint.
# weights_only=True limits unpickling to tensors and plain containers, which is
# all a state_dict needs, so a tampered checkpoint file cannot run arbitrary code.
mask_gen_model.load_state_dict(
    torch.load('saved_model/squad_kldv_0_1900.pth', map_location=device, weights_only=True)
)

mask_gen_model.eval()

# NOTE(review): this loader is built from train_ds (not test_ds) and is not used
# further down in this cell -- confirm whether test_ds was intended.
test_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# Take one batch to inspect. It comes from train_dataloader, so which examples are
# inspected depends on that loader's shuffling.
# test_inputs = next(iter(test_dataloader)).to(device)
test_inputs = next(iter(train_dataloader)).to(device)

# tokens = tokenizer.convert_ids_to_tokens(test_inputs['input_ids'][idx])



# data_dict = {
#     'sentence': ["I absolutely love this product! It exceeded all my expectations.", 
#              "The movie was fantastic, and the acting was top-notch.",
#              "This restaurant offers great service and delicious food. Highly recommend!",
#              "The product works as advertised, nothing more, nothing less.",
#              "The event was well-organized, but it didn’t leave a lasting impression.",
#              "t’s an average phone, nothing special but it does the job.",
#              "I’m really disappointed with this purchase. It broke within a week.",
#              "The movie was too long and boring, I wouldn’t recommend it.",
#              "Terrible customer service, I won’t be coming back to this place."],
#     'label': [1, 1, 1, 0, 0, 0, -1, -1, -1]
# }
# manual_test_data = Dataset.from_dict(data_dict)

# manual_test_dataloader = DataLoader(manual_test_data, batch_size=9, collate_fn=collate_fn, shuffle=False)
# #
# test_inputs = next(iter(manual_test_dataloader)).to(device)


# generate the answer for the test inputs (sampling, so outputs vary between runs)
gen_outputs = model.generate(
            input_ids=test_inputs['input_ids'],
            attention_mask=test_inputs['attention_mask'],
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
input_ids = test_inputs['input_ids']
attention_mask = test_inputs['attention_mask']
gen_tokens = gen_outputs.sequences
# Number of columns generate() appended after the prompt.
prompt_len = input_ids.size(1)
pad_length = gen_tokens.size(1) - prompt_len
# get the attention mask for the generated tokens, and also mask the padding tokens
gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
# The appended (generated) columns are never part of the context, hence value=0.
context_mask = F.pad(test_inputs['context_mask'], (0, pad_length), mode='constant', value=0)
# (gen_tokens != pad_token_id).long() is the tokens mask, 1 for real tokens and 0 for padding tokens
unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
# Leave the prompt columns to the original attention mask. Slicing by prompt_len
# instead of `:-pad_length` stays correct when pad_length == 0 (a `:-0` slice is empty).
unpaded_token_mask[:, :prompt_len] = 1
# NOTE(review): any generated token whose id equals pad_token_id is masked out here
# as well -- confirm that is intended for the terminator tokens.
gen_attention_mask = gen_attention_mask * unpaded_token_mask

# Response mask: the attended positions restricted to the generated columns.
response_mask = gen_attention_mask.clone()
response_mask[:, :prompt_len] = 0 # TODO(review): flagged by the author as possibly wrong -- is it?

# context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

# with torch.no_grad():
#     # prompt_outputs = model(input_ids=test_inputs['input_ids'], attention_mask=test_inputs['attention_mask'], output_hidden_states=True, return_dict=True)
#     prompt_outputs = model(input_ids=gen_tokens, attention_mask=gen_attention_mask, output_hidden_states=True, return_dict=True)

#     last_hidden_state = prompt_outputs.hidden_states[-1].float()
#     mask_logits = mask_gen_model(last_hidden_state)


# Query the mask generator without tracking gradients; only the logits of the
# returned distribution are used by the following cells.
with torch.no_grad():
    state = gen_tokens, gen_attention_mask, context_mask, response_mask
    dist, value = mask_gen_model.get_dist_critic(model, state)

mask_logits = dist.logits

  mask_gen_model.load_state_dict(torch.load('saved_model/squad_kldv_0_1900.pth',map_location=device))


In [72]:
import random

# Pick one example of the current batch at random. The bound is taken from the
# batch itself: the previous hard-coded `randint(0, 8)` only fit a 9-example batch,
# so it ignored the rest of a larger batch and could overrun a smaller one.
idx = random.randrange(gen_tokens.size(0))
# idx = 0
test_ids = gen_tokens[idx]
test_mask = gen_attention_mask[idx]
# Turn the mask logits of the chosen example into per-token probabilities.
test_mask_prob = torch.sigmoid(mask_logits[idx])
# inverse TODO
# test_mask_prob = 1 - test_mask_prob
test_context_mask = context_mask[idx]

test_tokens = tokenizer.convert_ids_to_tokens(test_ids)
# Zero the probability of every position where the context mask is 0.
scores = test_mask_prob * test_context_mask

def normalize_except_zeros(array):
    """Min-max scale the strictly positive entries of ``array`` into [0, 1].

    Entries that are not greater than zero are copied through unchanged, and the
    input itself is never modified.

    Args:
        array: NumPy array (or array-like) of scores.

    Returns:
        A new array with the same shape. Floating-point inputs keep their dtype;
        any other dtype is promoted to float64 so fractional results are not
        truncated. If there are no positive entries the copy is returned as is.
        If all positive entries are equal they are mapped to 1.0 (instead of the
        NaN a 0/0 division would produce).
    """
    values = np.asarray(array)
    if not np.issubdtype(values.dtype, np.floating):
        # Integer/bool storage cannot hold the fractional normalized values.
        values = values.astype(np.float64)
    normalized_array = values.copy()

    # Only strictly positive entries take part in the scaling.
    mask = values > 0
    if not mask.any():
        # Nothing to scale; min()/max() of an empty selection would raise.
        return normalized_array

    positives = values[mask]
    min_val = positives.min()
    span = positives.max() - min_val
    if span == 0:
        # A single positive value, or all positives equal: avoid dividing by zero.
        normalized_array[mask] = 1.0
    else:
        normalized_array[mask] = (positives - min_val) / span

    return normalized_array
# Detach the scores, move them to the CPU as a NumPy array, then rescale them with
# normalize_except_zeros.
scores_np = scores.detach().cpu().numpy()
scores = normalize_except_zeros(scores_np)

# # remove special tokens
# filtered_token_scores = [(token, score) for token, score in zip(test_tokens, scores) if token not in tokenizer.all_special_tokens]
# Pair every token with its score; special tokens are kept for now.
filtered_token_scores = list(zip(test_tokens, scores))

In [73]:
# Print every token of the sampled example next to its score, one pair per line.
for token, score in filtered_token_scores:
    line = f"Token: {token}, Score: {score}"
    print(line)

Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|e

In [74]:
import re

def clean_token(token):
    """Return ``token`` with every 'Ġ' and 'Ċ' character removed.

    These two characters appear to be the tokenizer's whitespace markers; all
    other characters are kept in their original order.
    """
    return "".join(ch for ch in token if ch not in ("Ġ", "Ċ"))

# Accumulators for merging sub-word tokens: the finished (word, score) pairs, plus
# the text, score sum and piece count of the word currently being assembled.
merged_tokens_scores = list()
current_token, current_score, count = "", 0, 0

def is_special_token(token):
    """Tell whether ``token`` is a standalone special token such as '<|start_header_id|>'.

    A token qualifies when it both starts with '<|' and ends with '|>'.
    """
    if not token.startswith("<|"):
        return False
    return token.endswith("|>")

# Merge sub-word tokens into whole words; a word's score is the mean of the scores
# of its pieces. Special tokens are emitted on their own, unmerged.
previous_ended_word = False  # True when the last merged piece ended with 'Ġ'/'Ċ'
for token, score in filtered_token_scores:
    special = is_special_token(token)
    # Strip the whitespace markers from ordinary tokens.
    cleaned_token = "" if special else clean_token(token)

    # The word in progress ends when the next token is special, starts with a
    # whitespace marker, consists only of markers (e.g. a bare 'Ċ'), or follows a
    # piece that itself ended with a marker (e.g. '.ĊĊ'). Without the last two
    # cases the words on either side of a line break are glued together.
    word_boundary = (
        special
        or previous_ended_word
        or not cleaned_token
        or token.startswith(("Ġ", "Ċ"))
    )
    if current_token and word_boundary:
        # Keep the averaged score at or below 1.0.
        average_score = min(current_score / count, 1.0)
        merged_tokens_scores.append((current_token, average_score))
        current_token = ""
        current_score = 0
        count = 0
    previous_ended_word = False

    if special:
        # Special tokens are added directly and never merged.
        merged_tokens_scores.append((token, score))
        continue

    if not cleaned_token:
        # Marker-only token: it has no visible text of its own.
        continue

    # Start a new word or extend the current one with this piece.
    current_token += cleaned_token
    current_score += score
    count += 1
    previous_ended_word = token.endswith(("Ġ", "Ċ"))

# Flush the last word still being assembled.
if current_token:
    # Keep the averaged score at or below 1.0.
    average_score = min(current_score / count, 1.0)
    merged_tokens_scores.append((current_token, average_score))

# # Print the merged results
# for token, score in merged_tokens_scores:
#     print(f"Token: {token}, Score: {score}")





# Render the merged tokens as HTML, shading each one by its score.
from html import escape as escape_html
from IPython.display import display, HTML

highlighted_spans = []
for token, score in merged_tokens_scores:
    # Background fades from white (score 0) to green (score 1): the green channel
    # stays at 255 while red and blue shrink as the score grows.
    fade = int((1 - score) * 255)
    color = f'rgb({fade}, 255, {fade})'
    # Escape the token text so characters such as '<' or '&' coming from the data
    # are shown literally instead of being parsed as markup.
    highlighted_spans.append(
        f'<span style="background-color: {color}; color: black;">{escape_html(token)}</span>'
    )
highlighted_text = " ".join(highlighted_spans)

# Show the highlighted text in the notebook.
display(HTML(highlighted_text))

In [67]:
# Sanity check: largest value of the context-masked probabilities for the sampled example.
(test_mask_prob * test_context_mask).max()

tensor(1.0000, device='cuda:0')

In [None]:
# Display the full list of (token, score) pairs for inspection.
filtered_token_scores

In [None]:
# Scratch inspection: list the fields present in the collated batch.
test_inputs.keys()

# NOTE(review): called with no arguments — presumably collate_fn expects a batch of
# examples, in which case this call raises; confirm or remove.
collate_fn()

# Scratch check of the tokenizer output for a short sample sentence.
tokenizer("this is a sentence")

In [22]:
# Small scratch subset: the training rows at indices 0, 1 and 2.
new_ds = ds['train'].select(range(3))

In [26]:
from datasets import Dataset

# Two-row toy dataset with a text column and an integer label column.
data_dict = dict(
    text=["This is the first example.", "This is the second example."],
    label=[0, 1],
)
dataset = Dataset.from_dict(data_dict)

In [None]:
# Display the first row of the scratch subset.
new_ds[0]