In [1]:
# Configuring environment parameters
import os
import json 
import logging

# --- Logging -----------------------------------------------------------------
# logging's file handler does not create missing parent directories, so make
# sure the log directory exists before basicConfig opens the file; otherwise a
# fresh checkout fails here with FileNotFoundError.
log_path = 'log/app.log'
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(
    filename=log_path,             # Log file name
    level=logging.DEBUG,           # Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Log record format
)

# --- Environment configuration -------------------------------------------------
# env_config.json is expected to provide at least the keys 'HF_HOME' and
# 'access_token' (both are read below; a missing key raises KeyError).
json_path = 'env_config.json'
with open(json_path, 'r', encoding='utf-8') as file:
    env_config = json.load(file)

# Point the Hugging Face cache at the configured directory. This is set here,
# before the Hugging Face libraries are imported in the next cell, so that the
# setting is already in the environment when they load.
hf_home = env_config['HF_HOME']
os.environ['HF_HOME'] = hf_home

# Expose the Hugging Face Hub access token.
access_token = env_config['access_token']
# Kept for backward compatibility with anything that reads this exact name.
os.environ['HUGGINGFACE_HUB_TOKEN'] = access_token
# HF_TOKEN is the variable huggingface_hub actually reads, so calls that are
# not given an explicit `token=` argument are authenticated as well.
os.environ['HF_TOKEN'] = access_token

In [2]:
# Loading necessary packages
import transformers 
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, LlamaForTokenClassification #, LlamaRotaryEmbedding
# from transformers import LlamaTokenizerFast
import torch.nn.functional as F

from llmexp.helper import DataHelper
from datasets import load_dataset
from torch.utils.data import DataLoader

# TODO: make sure the correct model class is imported/loaded here
from llmexp.imdb_model import MaskGeneratingModelForIMDB
from tqdm import tqdm

In [3]:
# Load the tokenizer, the IMDB dataset and the train/test dataloaders
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Pad on the left so that, in a batch, every prompt ends at the last column and
# the tokens produced by `generate` directly follow the prompt.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token, padding_side='left')
# Reuse the EOS token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

ds = load_dataset("imdb")
# Alternative datasets:
# ds = load_dataset("rajpurkar/squad")
# ds = load_dataset("stanfordnlp/sst2")
train_ds = ds['train']
test_ds = ds['test']

# The collate function is provided by the project helper for the 'imdb' task.
llm_exp_helper = DataHelper(tokenizer)
collate_fn = llm_exp_helper.get_collate_fun('imdb')

# Define batch size here!
batch_size = 16
train_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
# Fixed: the test dataloader must iterate over the held-out test split. It was
# previously built from `train_ds`, so evaluation would have run on training data.
test_dataloader = DataLoader(test_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [4]:
# Load the base causal language model onto the accelerator's device.
# (Swap in "meta-llama/Meta-Llama-3-8B" for the non-instruct variant.)
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Let Accelerate choose the device; to pin one manually use e.g.
# torch.device("cuda:1" if torch.cuda.is_available() else "cpu").
accelerator = Accelerator()
device = accelerator.device

# Weights are loaded in bfloat16 and placed entirely on `device`
# (instead of being sharded with device_map="auto").
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    device_map=device,
    torch_dtype=torch.bfloat16,
)

# Keep a handle on the model configuration.
config = model.config

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Build the mask-generating model, its optimizer, and the generation stop tokens.

# Make sure the tokenizer has a padding id; fall back to EOS when it is unset.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Generation stops on either the regular EOS token or the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Mask generator lives on the same device as the language model.
mask_gen_model = MaskGeneratingModelForIMDB()
mask_gen_model.to(device)

# Only the mask generator's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(mask_gen_model.parameters(), lr=1e-5)

In [6]:
# Train the mask-generating model: for every batch, sample a response from the
# language model, build the attention/response/context masks over the full
# (prompt + response) sequence, and run one training step of the mask generator.

# Make sure the checkpoint directory exists up front; torch.save does not create
# it, and failing at the first checkpoint would waste the steps already run.
os.makedirs('saved_model', exist_ok=True)

mask_gen_model.train()
for epoch in range(1):
    pbar = tqdm(train_dataloader)
    for idx, data in enumerate(pbar):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        context_mask = data['context_mask'].to(device)

        # Sample a response for every prompt in the batch.
        # NOTE(review): only `.sequences` is read below; `output_scores=True`
        # is kept as-is but the scores are not used in this cell.
        gen_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
        gen_tokens = gen_outputs.sequences

        # The returned sequences are [prompt | generated tokens]; `pad_length`
        # is the number of generated columns appended after the prompt.
        prompt_length = input_ids.size(1)
        pad_length = gen_tokens.size(1) - prompt_length

        # Extend the prompt attention mask over the generated columns (1 = attend).
        gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
        # Token mask over the whole sequence: 1 for real tokens, 0 for padding
        # emitted after a sequence has finished. The prompt columns are forced
        # to 1 here because prompt padding is already encoded in
        # `attention_mask`. Slicing by `prompt_length` (rather than
        # `:-pad_length`) stays correct even if no new column was generated,
        # where `:-0` would select nothing.
        # NOTE(review): the pad token was set to the EOS token when the tokenizer
        # was configured, so a generated EOS token is also treated as padding
        # here — confirm this is intended.
        unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
        unpaded_token_mask[:, :prompt_length] = 1
        gen_attention_mask = gen_attention_mask * unpaded_token_mask

        # Response mask: 1 only on generated (non-padding) tokens; the prompt
        # columns are zeroed. This resolves the earlier TODO asking whether the
        # slice was wrong: it matches the prompt columns exactly whenever at
        # least one token is generated, and the prompt_length form also covers
        # the degenerate zero-length case.
        response_mask = gen_attention_mask.clone()
        response_mask[:, :prompt_length] = 0

        # The generated columns are never part of the context.
        context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

        loss_dict = mask_gen_model.train_one_batch(model, gen_tokens, gen_attention_mask, context_mask, response_mask, optimizer,
                                                   num_steps=3, mini_batch_size=16, ppo_epochs=1)

        # Progress-bar summary. Parenthesised concatenation replaces the fragile
        # backslash continuations, and the missing ", " between mask_loss and
        # std_loss is added (the bar used to read "mask_loss = 0.5146std_loss = ...").
        log = (
            f"Epoch {epoch+1}, Step {idx+1}: Loss = {loss_dict['loss']:.4f}, "
            f"Actor Loss = {loss_dict['actor_loss']:.4f}, "
            f"Critic Loss = {loss_dict['critic_loss']:.4f}, "
            f"Entropy = {loss_dict['entropy']:.4f}, "
            f"Returns = {loss_dict['returns']:.4f}, "
            f"Value = {loss_dict['value']:.4f}, "
            f"mask_loss = {loss_dict['mask_loss']:.4f}, "
            f"std_loss = {loss_dict['std_loss']:.4f}"
        )
        pbar.set_description(log)

        # Checkpoint the mask generator every 10 steps (skipping step 0).
        if idx % 10 == 0 and idx != 0:
            torch.save(mask_gen_model.state_dict(), f'saved_model/imdb_mask_gen_model_{epoch}_{idx}.pth')

  0%|          | 0/1563 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


reward: tensor(1.5380, device='cuda:0') factor: tensor(0.5083, device='cuda:0')
reward: tensor(1.6864, device='cuda:0') factor: tensor(0.5304, device='cuda:0')
reward: tensor(1.6671, device='cuda:0') factor: tensor(0.5149, device='cuda:0')


Epoch 1, Step 1: Loss = -0.1157, Actor Loss = -0.5801, Critic Loss = 0.9300, Entropy = 0.6913, Returns = 1.5633, Value = 1.1257, mask_loss = 0.5146std_loss = 0.0192:   0%|          | 1/1563 [00:15<6:42:38, 15.47s/it]

reward: tensor(1.6364, device='cuda:0') factor: tensor(0.4993, device='cuda:0')
reward: tensor(1.6157, device='cuda:0') factor: tensor(0.5032, device='cuda:0')
reward: tensor(1.4340, device='cuda:0') factor: tensor(0.5140, device='cuda:0')


Epoch 1, Step 2: Loss = -0.2253, Actor Loss = -0.7037, Critic Loss = 0.9582, Entropy = 0.6899, Returns = 1.7024, Value = 1.0706, mask_loss = 0.5339std_loss = 0.0203:   0%|          | 2/1563 [00:33<7:17:58, 16.83s/it]

reward: tensor(1.5145, device='cuda:0') factor: tensor(0.5094, device='cuda:0')
reward: tensor(1.6045, device='cuda:0') factor: tensor(0.5238, device='cuda:0')
reward: tensor(1.4763, device='cuda:0') factor: tensor(0.5228, device='cuda:0')


Epoch 1, Step 3: Loss = -0.0612, Actor Loss = -0.3916, Critic Loss = 0.6621, Entropy = 0.6912, Returns = 1.5550, Value = 1.2415, mask_loss = 0.5225std_loss = 0.0202:   0%|          | 3/1563 [00:48<6:58:59, 16.12s/it]

reward: tensor(1.5857, device='cuda:0') factor: tensor(0.5094, device='cuda:0')
reward: tensor(1.6098, device='cuda:0') factor: tensor(0.5165, device='cuda:0')
reward: tensor(1.6883, device='cuda:0') factor: tensor(0.5293, device='cuda:0')


Epoch 1, Step 4: Loss = -0.1289, Actor Loss = -0.4894, Critic Loss = 0.7224, Entropy = 0.6919, Returns = 1.5549, Value = 1.1510, mask_loss = 0.5022std_loss = 0.0224:   0%|          | 4/1563 [01:03<6:46:09, 15.63s/it]

reward: tensor(1.3154, device='cuda:0') factor: tensor(0.5291, device='cuda:0')
reward: tensor(1.4978, device='cuda:0') factor: tensor(0.5246, device='cuda:0')
reward: tensor(1.4795, device='cuda:0') factor: tensor(0.5259, device='cuda:0')


Epoch 1, Step 5: Loss = 0.0469, Actor Loss = -0.3893, Critic Loss = 0.8739, Entropy = 0.6917, Returns = 1.7015, Value = 1.4005, mask_loss = 0.5112std_loss = 0.0226:   0%|          | 5/1563 [01:21<7:12:06, 16.64s/it] 

reward: tensor(1.5792, device='cuda:0') factor: tensor(0.4956, device='cuda:0')
reward: tensor(1.5999, device='cuda:0') factor: tensor(0.4977, device='cuda:0')
reward: tensor(1.7681, device='cuda:0') factor: tensor(0.4978, device='cuda:0')


Epoch 1, Step 6: Loss = -0.1030, Actor Loss = -0.3285, Critic Loss = 0.4525, Entropy = 0.6912, Returns = 1.6030, Value = 1.3736, mask_loss = 0.4901std_loss = 0.0241:   0%|          | 6/1563 [01:36<6:55:34, 16.01s/it]

reward: tensor(1.7921, device='cuda:0') factor: tensor(0.4949, device='cuda:0')
reward: tensor(1.6425, device='cuda:0') factor: tensor(0.4944, device='cuda:0')
reward: tensor(1.7861, device='cuda:0') factor: tensor(0.5000, device='cuda:0')


Epoch 1, Step 7: Loss = -0.0090, Actor Loss = -0.1334, Critic Loss = 0.2501, Entropy = 0.6910, Returns = 1.6532, Value = 1.5837, mask_loss = 0.4812std_loss = 0.0244:   0%|          | 7/1563 [01:54<7:08:23, 16.52s/it]

reward: tensor(1.7313, device='cuda:0') factor: tensor(0.4776, device='cuda:0')
reward: tensor(1.7636, device='cuda:0') factor: tensor(0.4693, device='cuda:0')
reward: tensor(1.5467, device='cuda:0') factor: tensor(0.4882, device='cuda:0')


Epoch 1, Step 8: Loss = 0.0338, Actor Loss = -0.1914, Critic Loss = 0.4518, Entropy = 0.6862, Returns = 1.7410, Value = 1.6049, mask_loss = 0.4473std_loss = 0.0244:   1%|          | 8/1563 [02:12<7:23:50, 17.13s/it] 

reward: tensor(1.7126, device='cuda:0') factor: tensor(0.4299, device='cuda:0')
reward: tensor(1.8064, device='cuda:0') factor: tensor(0.4275, device='cuda:0')
reward: tensor(1.5403, device='cuda:0') factor: tensor(0.4369, device='cuda:0')


Epoch 1, Step 9: Loss = 0.7621, Actor Loss = 0.3680, Critic Loss = 0.7895, Entropy = 0.6736, Returns = 1.4727, Value = 1.8810, mask_loss = 0.4047std_loss = 0.0223:   1%|          | 9/1563 [02:27<7:07:55, 16.52s/it] 

reward: tensor(2.2912, device='cuda:0') factor: tensor(0.3819, device='cuda:0')
reward: tensor(1.9767, device='cuda:0') factor: tensor(0.3990, device='cuda:0')
reward: tensor(1.6576, device='cuda:0') factor: tensor(0.4056, device='cuda:0')


Epoch 1, Step 10: Loss = 0.8033, Actor Loss = 0.0286, Critic Loss = 1.5508, Entropy = 0.6596, Returns = 1.8767, Value = 1.9132, mask_loss = 0.3741std_loss = 0.0238:   1%|          | 10/1563 [02:51<8:08:46, 18.88s/it]

reward: tensor(2.1109, device='cuda:0') factor: tensor(0.3415, device='cuda:0')
reward: tensor(2.1135, device='cuda:0') factor: tensor(0.3497, device='cuda:0')
reward: tensor(1.7634, device='cuda:0') factor: tensor(0.3484, device='cuda:0')


Epoch 1, Step 11: Loss = -0.0423, Actor Loss = -0.4709, Critic Loss = 0.8586, Entropy = 0.6343, Returns = 2.2441, Value = 1.7886, mask_loss = 0.3323std_loss = 0.0219:   1%|          | 11/1563 [03:06<7:34:51, 17.58s/it]

reward: tensor(1.6964, device='cuda:0') factor: tensor(0.3171, device='cuda:0')
reward: tensor(1.9449, device='cuda:0') factor: tensor(0.3175, device='cuda:0')
reward: tensor(1.8616, device='cuda:0') factor: tensor(0.3210, device='cuda:0')


Epoch 1, Step 12: Loss = 0.6242, Actor Loss = -0.0401, Critic Loss = 1.3298, Entropy = 0.6147, Returns = 1.6235, Value = 1.6379, mask_loss = 0.3084std_loss = 0.0210:   1%|          | 12/1563 [03:20<7:09:12, 16.60s/it] 

reward: tensor(1.9623, device='cuda:0') factor: tensor(0.2954, device='cuda:0')
reward: tensor(1.9369, device='cuda:0') factor: tensor(0.3046, device='cuda:0')
reward: tensor(2.0724, device='cuda:0') factor: tensor(0.2890, device='cuda:0')


Epoch 1, Step 13: Loss = 0.6656, Actor Loss = 0.1010, Critic Loss = 1.1303, Entropy = 0.5976, Returns = 2.0015, Value = 2.1573, mask_loss = 0.2887std_loss = 0.0226:   1%|          | 13/1563 [03:36<6:59:54, 16.25s/it] 

reward: tensor(2.5880, device='cuda:0') factor: tensor(0.2622, device='cuda:0')
reward: tensor(2.3970, device='cuda:0') factor: tensor(0.2825, device='cuda:0')
reward: tensor(1.7616, device='cuda:0') factor: tensor(0.2711, device='cuda:0')


Epoch 1, Step 14: Loss = 0.3270, Actor Loss = -0.3459, Critic Loss = 1.3470, Entropy = 0.5587, Returns = 2.2232, Value = 1.9476, mask_loss = 0.2491std_loss = 0.0184:   1%|          | 14/1563 [03:51<6:46:41, 15.75s/it]

reward: tensor(2.7010, device='cuda:0') factor: tensor(0.2313, device='cuda:0')
reward: tensor(1.6990, device='cuda:0') factor: tensor(0.2298, device='cuda:0')
reward: tensor(2.5210, device='cuda:0') factor: tensor(0.2350, device='cuda:0')


Epoch 1, Step 15: Loss = 0.6764, Actor Loss = -0.1502, Critic Loss = 1.6541, Entropy = 0.5176, Returns = 2.2066, Value = 2.1313, mask_loss = 0.2141std_loss = 0.0203:   1%|          | 15/1563 [04:11<7:25:52, 17.28s/it]

reward: tensor(2.9459, device='cuda:0') factor: tensor(0.2081, device='cuda:0')
reward: tensor(1.9598, device='cuda:0') factor: tensor(0.2193, device='cuda:0')
reward: tensor(2.5353, device='cuda:0') factor: tensor(0.2212, device='cuda:0')


Epoch 1, Step 16: Loss = 1.1799, Actor Loss = 0.2909, Critic Loss = 1.7790, Entropy = 0.4993, Returns = 1.8464, Value = 2.2158, mask_loss = 0.2013std_loss = 0.0209:   1%|          | 16/1563 [04:26<7:05:28, 16.50s/it] 

reward: tensor(2.8087, device='cuda:0') factor: tensor(0.1782, device='cuda:0')
reward: tensor(2.9438, device='cuda:0') factor: tensor(0.1938, device='cuda:0')
reward: tensor(3.0162, device='cuda:0') factor: tensor(0.1833, device='cuda:0')


Epoch 1, Step 17: Loss = 0.3467, Actor Loss = -1.1406, Critic Loss = 2.9755, Entropy = 0.4622, Returns = 3.4835, Value = 2.4236, mask_loss = 0.1755std_loss = 0.0199:   1%|          | 17/1563 [04:41<6:53:38, 16.05s/it]

reward: tensor(3.6214, device='cuda:0') factor: tensor(0.1654, device='cuda:0')
reward: tensor(2.7439, device='cuda:0') factor: tensor(0.1822, device='cuda:0')
reward: tensor(2.8902, device='cuda:0') factor: tensor(0.1759, device='cuda:0')


Epoch 1, Step 18: Loss = 0.9968, Actor Loss = -0.1150, Critic Loss = 2.2244, Entropy = 0.4586, Returns = 2.7235, Value = 2.7284, mask_loss = 0.1743std_loss = 0.0183:   1%|          | 18/1563 [04:55<6:39:23, 15.51s/it]

reward: tensor(3.5824, device='cuda:0') factor: tensor(0.1650, device='cuda:0')
reward: tensor(3.0215, device='cuda:0') factor: tensor(0.1653, device='cuda:0')
reward: tensor(2.8565, device='cuda:0') factor: tensor(0.1615, device='cuda:0')


Epoch 1, Step 19: Loss = 1.3652, Actor Loss = -0.0749, Critic Loss = 2.8812, Entropy = 0.4388, Returns = 2.6896, Value = 2.7086, mask_loss = 0.1610std_loss = 0.0201:   1%|          | 19/1563 [05:11<6:36:58, 15.43s/it]

reward: tensor(3.2223, device='cuda:0') factor: tensor(0.1586, device='cuda:0')
reward: tensor(3.2268, device='cuda:0') factor: tensor(0.1594, device='cuda:0')
reward: tensor(2.6327, device='cuda:0') factor: tensor(0.1678, device='cuda:0')


Epoch 1, Step 20: Loss = 2.4959, Actor Loss = 0.2791, Critic Loss = 4.4344, Entropy = 0.4539, Returns = 2.7188, Value = 3.0661, mask_loss = 0.1711std_loss = 0.0202:   1%|▏         | 20/1563 [05:30<7:11:19, 16.77s/it] 

reward: tensor(2.4983, device='cuda:0') factor: tensor(0.1599, device='cuda:0')
reward: tensor(3.3434, device='cuda:0') factor: tensor(0.1558, device='cuda:0')
reward: tensor(2.8065, device='cuda:0') factor: tensor(0.1575, device='cuda:0')


Epoch 1, Step 21: Loss = 4.3327, Actor Loss = 1.4087, Critic Loss = 5.8488, Entropy = 0.4152, Returns = 1.7490, Value = 3.1947, mask_loss = 0.1466std_loss = 0.0184:   1%|▏         | 21/1563 [05:49<7:22:42, 17.23s/it]

reward: tensor(3.7849, device='cuda:0') factor: tensor(0.1456, device='cuda:0')
reward: tensor(2.5127, device='cuda:0') factor: tensor(0.1410, device='cuda:0')
reward: tensor(2.7120, device='cuda:0') factor: tensor(0.1446, device='cuda:0')


Epoch 1, Step 22: Loss = 3.6007, Actor Loss = 0.3117, Critic Loss = 6.5787, Entropy = 0.3757, Returns = 2.5942, Value = 2.9023, mask_loss = 0.1254std_loss = 0.0184:   1%|▏         | 22/1563 [06:09<7:49:26, 18.28s/it]

reward: tensor(4.2633, device='cuda:0') factor: tensor(0.1335, device='cuda:0')
reward: tensor(3.7281, device='cuda:0') factor: tensor(0.1333, device='cuda:0')
reward: tensor(3.1398, device='cuda:0') factor: tensor(0.1321, device='cuda:0')


Epoch 1, Step 23: Loss = 1.7329, Actor Loss = -0.2899, Critic Loss = 4.0463, Entropy = 0.3589, Returns = 3.2282, Value = 2.9871, mask_loss = 0.1183std_loss = 0.0187:   1%|▏         | 23/1563 [06:24<7:23:03, 17.26s/it]

reward: tensor(3.6026, device='cuda:0') factor: tensor(0.0888, device='cuda:0')
reward: tensor(3.5352, device='cuda:0') factor: tensor(0.1073, device='cuda:0')
reward: tensor(4.7292, device='cuda:0') factor: tensor(0.1007, device='cuda:0')


Epoch 1, Step 24: Loss = 1.6596, Actor Loss = -0.6291, Critic Loss = 4.5779, Entropy = 0.2865, Returns = 3.7218, Value = 3.1934, mask_loss = 0.0839std_loss = 0.0153:   2%|▏         | 24/1563 [06:37<6:46:12, 15.84s/it]

reward: tensor(6.8022, device='cuda:0') factor: tensor(0.0904, device='cuda:0')
reward: tensor(5.5775, device='cuda:0') factor: tensor(0.0868, device='cuda:0')
reward: tensor(5.0773, device='cuda:0') factor: tensor(0.0845, device='cuda:0')


Epoch 1, Step 25: Loss = 4.0888, Actor Loss = -0.9630, Critic Loss = 10.1040, Entropy = 0.2581, Returns = 4.1265, Value = 3.3234, mask_loss = 0.0731std_loss = 0.0165:   2%|▏         | 25/1563 [06:52<6:40:37, 15.63s/it]

reward: tensor(6.7615, device='cuda:0') factor: tensor(0.0624, device='cuda:0')
reward: tensor(6.0452, device='cuda:0') factor: tensor(0.0688, device='cuda:0')
reward: tensor(7.6667, device='cuda:0') factor: tensor(0.0678, device='cuda:0')


Epoch 1, Step 26: Loss = 7.8120, Actor Loss = -3.4756, Critic Loss = 22.5757, Entropy = 0.2221, Returns = 6.8846, Value = 3.6261, mask_loss = 0.0589std_loss = 0.0141:   2%|▏         | 26/1563 [07:04<6:15:37, 14.66s/it]

reward: tensor(7.1833, device='cuda:0') factor: tensor(0.0562, device='cuda:0')
reward: tensor(6.8493, device='cuda:0') factor: tensor(0.0643, device='cuda:0')
reward: tensor(5.8219, device='cuda:0') factor: tensor(0.0606, device='cuda:0')


Epoch 1, Step 27: Loss = 25.4035, Actor Loss = -3.5823, Critic Loss = 57.9720, Entropy = 0.2080, Returns = 7.2137, Value = 3.8518, mask_loss = 0.0539std_loss = 0.0136:   2%|▏         | 27/1563 [07:18<6:08:25, 14.39s/it]

reward: tensor(8.4034, device='cuda:0') factor: tensor(0.0604, device='cuda:0')
reward: tensor(6.9603, device='cuda:0') factor: tensor(0.0563, device='cuda:0')
reward: tensor(8.0636, device='cuda:0') factor: tensor(0.0495, device='cuda:0')


Epoch 1, Step 28: Loss = 18.5572, Actor Loss = -0.8573, Critic Loss = 38.8294, Entropy = 0.1939, Returns = 4.9626, Value = 4.3425, mask_loss = 0.0490std_loss = 0.0114:   2%|▏         | 28/1563 [07:39<7:01:03, 16.46s/it]

reward: tensor(7.2954, device='cuda:0') factor: tensor(0.0455, device='cuda:0')
reward: tensor(6.1986, device='cuda:0') factor: tensor(0.0495, device='cuda:0')
reward: tensor(6.2877, device='cuda:0') factor: tensor(0.0505, device='cuda:0')


Epoch 1, Step 29: Loss = 15.2378, Actor Loss = -1.9850, Critic Loss = 34.4459, Entropy = 0.1750, Returns = 6.3531, Value = 4.5932, mask_loss = 0.0428std_loss = 0.0110:   2%|▏         | 29/1563 [08:04<8:00:48, 18.81s/it]

reward: tensor(11.6862, device='cuda:0') factor: tensor(0.0426, device='cuda:0')
reward: tensor(7.6192, device='cuda:0') factor: tensor(0.0455, device='cuda:0')
reward: tensor(8.8855, device='cuda:0') factor: tensor(0.0384, device='cuda:0')


Epoch 1, Step 30: Loss = 30.9360, Actor Loss = -2.6863, Critic Loss = 67.2449, Entropy = 0.1682, Returns = 7.1318, Value = 4.6787, mask_loss = 0.0411std_loss = 0.0119:   2%|▏         | 30/1563 [08:23<8:02:41, 18.89s/it]

reward: tensor(11.8130, device='cuda:0') factor: tensor(0.0375, device='cuda:0')
reward: tensor(11.3437, device='cuda:0') factor: tensor(0.0380, device='cuda:0')
reward: tensor(6.6605, device='cuda:0') factor: tensor(0.0442, device='cuda:0')


Epoch 1, Step 31: Loss = 118.4556, Actor Loss = -9.9366, Critic Loss = 256.7848, Entropy = 0.1491, Returns = 15.0159, Value = 5.3480, mask_loss = 0.0352std_loss = 0.0103:   2%|▏         | 31/1563 [08:38<7:35:49, 17.85s/it]

reward: tensor(21.0085, device='cuda:0') factor: tensor(0.0231, device='cuda:0')
reward: tensor(18.3638, device='cuda:0') factor: tensor(0.0174, device='cuda:0')
reward: tensor(15.0131, device='cuda:0') factor: tensor(0.0241, device='cuda:0')


Epoch 1, Step 32: Loss = 139.4557, Actor Loss = -8.7569, Critic Loss = 296.4255, Entropy = 0.0833, Returns = 14.3902, Value = 5.9264, mask_loss = 0.0166std_loss = 0.0061:   2%|▏         | 32/1563 [08:58<7:52:13, 18.51s/it]

reward: tensor(16.0892, device='cuda:0') factor: tensor(0.0154, device='cuda:0')
reward: tensor(14.6518, device='cuda:0') factor: tensor(0.0181, device='cuda:0')
reward: tensor(17.1467, device='cuda:0') factor: tensor(0.0169, device='cuda:0')


Epoch 1, Step 33: Loss = 140.8087, Actor Loss = -11.6963, Critic Loss = 305.0101, Entropy = 0.0835, Returns = 17.5682, Value = 6.1680, mask_loss = 0.0171std_loss = 0.0075:   2%|▏         | 33/1563 [09:13<7:23:06, 17.38s/it]

reward: tensor(31.2553, device='cuda:0') factor: tensor(0.0137, device='cuda:0')
reward: tensor(22.0869, device='cuda:0') factor: tensor(0.0138, device='cuda:0')
reward: tensor(26.6620, device='cuda:0') factor: tensor(0.0176, device='cuda:0')


Epoch 1, Step 34: Loss = 810.0765, Actor Loss = -26.9804, Critic Loss = 1674.1140, Entropy = 0.0623, Returns = 33.2925, Value = 6.6073, mask_loss = 0.0117std_loss = 0.0048:   2%|▏         | 34/1563 [09:28<7:01:09, 16.53s/it]

reward: tensor(13.6050, device='cuda:0') factor: tensor(0.0134, device='cuda:0')
reward: tensor(22.0008, device='cuda:0') factor: tensor(0.0127, device='cuda:0')
reward: tensor(22.2717, device='cuda:0') factor: tensor(0.0146, device='cuda:0')


Epoch 1, Step 35: Loss = 350.8876, Actor Loss = -11.8120, Critic Loss = 725.3993, Entropy = 0.0597, Returns = 18.6145, Value = 7.0876, mask_loss = 0.0112std_loss = 0.0045:   2%|▏         | 35/1563 [09:42<6:47:31, 16.00s/it] 

reward: tensor(25.2471, device='cuda:0') factor: tensor(0.0158, device='cuda:0')
reward: tensor(31.4333, device='cuda:0') factor: tensor(0.0118, device='cuda:0')
reward: tensor(21.8732, device='cuda:0') factor: tensor(0.0149, device='cuda:0')


Epoch 1, Step 36: Loss = 904.9410, Actor Loss = -31.4309, Critic Loss = 1872.7438, Entropy = 0.0789, Returns = 38.5110, Value = 7.3716, mask_loss = 0.0159std_loss = 0.0071:   2%|▏         | 36/1563 [09:58<6:41:05, 15.76s/it]

reward: tensor(34.7755, device='cuda:0') factor: tensor(0.0129, device='cuda:0')
reward: tensor(28.3973, device='cuda:0') factor: tensor(0.0126, device='cuda:0')
reward: tensor(14.6143, device='cuda:0') factor: tensor(0.0127, device='cuda:0')


Epoch 1, Step 37: Loss = 376.8659, Actor Loss = -12.6599, Critic Loss = 779.0519, Entropy = 0.0703, Returns = 20.3795, Value = 8.0008, mask_loss = 0.0135std_loss = 0.0052:   2%|▏         | 37/1563 [10:17<7:08:26, 16.85s/it] 

reward: tensor(20.3575, device='cuda:0') factor: tensor(0.0146, device='cuda:0')
reward: tensor(12.4424, device='cuda:0') factor: tensor(0.0131, device='cuda:0')
reward: tensor(16.5063, device='cuda:0') factor: tensor(0.0159, device='cuda:0')


Epoch 1, Step 38: Loss = 428.5129, Actor Loss = -19.1565, Critic Loss = 895.3392, Entropy = 0.0879, Returns = 27.1642, Value = 8.2568, mask_loss = 0.0180std_loss = 0.0069:   2%|▏         | 38/1563 [10:35<7:18:32, 17.25s/it]

reward: tensor(18.7630, device='cuda:0') factor: tensor(0.0195, device='cuda:0')
reward: tensor(13.4945, device='cuda:0') factor: tensor(0.0234, device='cuda:0')
reward: tensor(19.8217, device='cuda:0') factor: tensor(0.0219, device='cuda:0')


Epoch 1, Step 39: Loss = 50.7469, Actor Loss = -3.6513, Critic Loss = 108.7967, Entropy = 0.1009, Returns = 12.3323, Value = 8.9320, mask_loss = 0.0217std_loss = 0.0072:   2%|▏         | 39/1563 [10:50<6:57:36, 16.44s/it]  

reward: tensor(11.6956, device='cuda:0') factor: tensor(0.0285, device='cuda:0')
reward: tensor(13.7665, device='cuda:0') factor: tensor(0.0303, device='cuda:0')
reward: tensor(13.6269, device='cuda:0') factor: tensor(0.0328, device='cuda:0')


Epoch 1, Step 40: Loss = 15.5094, Actor Loss = 0.8899, Critic Loss = 29.2393, Entropy = 0.1340, Returns = 7.8244, Value = 8.9070, mask_loss = 0.0310std_loss = 0.0106:   3%|▎         | 40/1563 [11:05<6:46:16, 16.01s/it]   

reward: tensor(15.8239, device='cuda:0') factor: tensor(0.0269, device='cuda:0')
reward: tensor(22.7586, device='cuda:0') factor: tensor(0.0253, device='cuda:0')
reward: tensor(19.4650, device='cuda:0') factor: tensor(0.0230, device='cuda:0')


Epoch 1, Step 41: Loss = 524.8269, Actor Loss = -9.8188, Critic Loss = 1069.2917, Entropy = 0.1034, Returns = 19.0120, Value = 9.3647, mask_loss = 0.0221std_loss = 0.0077:   3%|▎         | 41/1563 [11:20<6:42:45, 15.88s/it]

reward: tensor(16.0687, device='cuda:0') factor: tensor(0.0247, device='cuda:0')
reward: tensor(10.8355, device='cuda:0') factor: tensor(0.0193, device='cuda:0')
reward: tensor(17.5117, device='cuda:0') factor: tensor(0.0209, device='cuda:0')


Epoch 1, Step 42: Loss = 39.1091, Actor Loss = 1.2425, Critic Loss = 75.7333, Entropy = 0.0881, Returns = 7.5681, Value = 8.9538, mask_loss = 0.0181std_loss = 0.0079:   3%|▎         | 42/1563 [11:40<7:12:20, 17.05s/it]     

reward: tensor(12.8807, device='cuda:0') factor: tensor(0.0133, device='cuda:0')
reward: tensor(13.9140, device='cuda:0') factor: tensor(0.0135, device='cuda:0')
reward: tensor(15.6350, device='cuda:0') factor: tensor(0.0144, device='cuda:0')


Epoch 1, Step 43: Loss = 63.5888, Actor Loss = -2.6270, Critic Loss = 132.4319, Entropy = 0.0662, Returns = 11.9466, Value = 9.4311, mask_loss = 0.0126std_loss = 0.0054:   3%|▎         | 43/1563 [12:00<7:35:06, 17.96s/it]

reward: tensor(13.9008, device='cuda:0') factor: tensor(0.0071, device='cuda:0')
reward: tensor(14.8151, device='cuda:0') factor: tensor(0.0125, device='cuda:0')
reward: tensor(26.0754, device='cuda:0') factor: tensor(0.0108, device='cuda:0')


Epoch 1, Step 44: Loss = 1323.4825, Actor Loss = -13.5790, Critic Loss = 2674.1230, Entropy = 0.0547, Returns = 23.0581, Value = 9.6195, mask_loss = 0.0099std_loss = 0.0038:   3%|▎         | 44/1563 [12:17<7:24:47, 17.57s/it]

reward: tensor(36.8034, device='cuda:0') factor: tensor(0.0130, device='cuda:0')
reward: tensor(33.0683, device='cuda:0') factor: tensor(0.0109, device='cuda:0')
reward: tensor(31.7192, device='cuda:0') factor: tensor(0.0080, device='cuda:0')


Epoch 1, Step 45: Loss = 2053.0107, Actor Loss = -36.5602, Critic Loss = 4179.1421, Entropy = 0.0535, Returns = 46.4313, Value = 10.0778, mask_loss = 0.0096std_loss = 0.0041:   3%|▎         | 45/1563 [12:35<7:29:14, 17.76s/it]

reward: tensor(18.7468, device='cuda:0') factor: tensor(0.0123, device='cuda:0')
reward: tensor(25.4275, device='cuda:0') factor: tensor(0.0103, device='cuda:0')
reward: tensor(20.9681, device='cuda:0') factor: tensor(0.0120, device='cuda:0')


Epoch 1, Step 46: Loss = 375.2245, Actor Loss = -14.4640, Critic Loss = 779.3772, Entropy = 0.0561, Returns = 24.7702, Value = 10.5382, mask_loss = 0.0107std_loss = 0.0053:   3%|▎         | 46/1563 [12:49<7:03:24, 16.75s/it]  

reward: tensor(13.3740, device='cuda:0') factor: tensor(0.0083, device='cuda:0')
reward: tensor(43.1214, device='cuda:0') factor: tensor(0.0081, device='cuda:0')
reward: tensor(25.3280, device='cuda:0') factor: tensor(0.0087, device='cuda:0')


Epoch 1, Step 47: Loss = 787.3333, Actor Loss = -7.6546, Critic Loss = 1589.9760, Entropy = 0.0469, Returns = 18.2970, Value = 10.8679, mask_loss = 0.0084std_loss = 0.0044:   3%|▎         | 47/1563 [13:04<6:49:12, 16.20s/it]

reward: tensor(33.9242, device='cuda:0') factor: tensor(0.0070, device='cuda:0')
reward: tensor(11.8496, device='cuda:0') factor: tensor(0.0065, device='cuda:0')
reward: tensor(24.2233, device='cuda:0') factor: tensor(0.0070, device='cuda:0')


Epoch 1, Step 48: Loss = 1386.5775, Actor Loss = -12.8134, Critic Loss = 2798.7820, Entropy = 0.0417, Returns = 23.4386, Value = 10.8463, mask_loss = 0.0072std_loss = 0.0042:   3%|▎         | 48/1563 [13:22<7:03:01, 16.75s/it]

reward: tensor(14.6811, device='cuda:0') factor: tensor(0.0047, device='cuda:0')
reward: tensor(20.3663, device='cuda:0') factor: tensor(0.0086, device='cuda:0')
reward: tensor(36.8935, device='cuda:0') factor: tensor(0.0068, device='cuda:0')


Epoch 1, Step 49: Loss = 2474.4712, Actor Loss = -25.7969, Critic Loss = 5000.5361, Entropy = 0.0406, Returns = 36.8992, Value = 11.2755, mask_loss = 0.0071std_loss = 0.0051:   3%|▎         | 49/1563 [13:39<7:04:34, 16.83s/it]

reward: tensor(23.5709, device='cuda:0') factor: tensor(0.0074, device='cuda:0')
reward: tensor(15.1381, device='cuda:0') factor: tensor(0.0106, device='cuda:0')
reward: tensor(16.3590, device='cuda:0') factor: tensor(0.0083, device='cuda:0')


Epoch 1, Step 50: Loss = 353.3450, Actor Loss = -6.3964, Critic Loss = 719.4830, Entropy = 0.0429, Returns = 17.9745, Value = 11.7528, mask_loss = 0.0077std_loss = 0.0066:   3%|▎         | 50/1563 [13:54<6:51:00, 16.30s/it]   

reward: tensor(42.3714, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(34.8987, device='cuda:0') factor: tensor(0.0110, device='cuda:0')
reward: tensor(28.1950, device='cuda:0') factor: tensor(0.0104, device='cuda:0')


Epoch 1, Step 51: Loss = 1303.2195, Actor Loss = -30.4884, Critic Loss = 2667.4160, Entropy = 0.0617, Returns = 42.3175, Value = 12.0003, mask_loss = 0.0119std_loss = 0.0080:   3%|▎         | 51/1563 [14:09<6:41:04, 15.92s/it]

reward: tensor(22.3405, device='cuda:0') factor: tensor(0.0079, device='cuda:0')
reward: tensor(31.6883, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(20.4848, device='cuda:0') factor: tensor(0.0073, device='cuda:0')


Epoch 1, Step 52: Loss = 796.2226, Actor Loss = -15.8481, Critic Loss = 1624.1415, Entropy = 0.0442, Returns = 27.9914, Value = 12.3264, mask_loss = 0.0079std_loss = 0.0051:   3%|▎         | 52/1563 [14:24<6:29:49, 15.48s/it] 

reward: tensor(24.5096, device='cuda:0') factor: tensor(0.0025, device='cuda:0')
reward: tensor(14.0290, device='cuda:0') factor: tensor(0.0086, device='cuda:0')
reward: tensor(11.2535, device='cuda:0') factor: tensor(0.0075, device='cuda:0')


Epoch 1, Step 53: Loss = 2505.1814, Actor Loss = -9.0380, Critic Loss = 5028.4390, Entropy = 0.0404, Returns = 21.4474, Value = 12.5701, mask_loss = 0.0071std_loss = 0.0057:   3%|▎         | 53/1563 [14:38<6:22:03, 15.18s/it]

reward: tensor(35.8368, device='cuda:0') factor: tensor(0.0085, device='cuda:0')
reward: tensor(18.1592, device='cuda:0') factor: tensor(0.0069, device='cuda:0')
reward: tensor(22.1242, device='cuda:0') factor: tensor(0.0095, device='cuda:0')


Epoch 1, Step 54: Loss = 1873.8152, Actor Loss = -16.5608, Critic Loss = 3780.7520, Entropy = 0.0416, Returns = 29.1690, Value = 12.7296, mask_loss = 0.0075std_loss = 0.0088:   3%|▎         | 54/1563 [14:52<6:08:21, 14.65s/it]

reward: tensor(69.2348, device='cuda:0') factor: tensor(0.0079, device='cuda:0')
reward: tensor(31.2221, device='cuda:0') factor: tensor(0.0079, device='cuda:0')
reward: tensor(58.5322, device='cuda:0') factor: tensor(0.0064, device='cuda:0')


Epoch 1, Step 55: Loss = 1098.7103, Actor Loss = -19.3988, Critic Loss = 2236.2183, Entropy = 0.0387, Returns = 32.1973, Value = 13.0081, mask_loss = 0.0067std_loss = 0.0052:   4%|▎         | 55/1563 [15:07<6:10:35, 14.74s/it]

reward: tensor(12.6996, device='cuda:0') factor: tensor(0.0062, device='cuda:0')
reward: tensor(31.4306, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(13.8398, device='cuda:0') factor: tensor(0.0071, device='cuda:0')


Epoch 1, Step 56: Loss = 2333.9714, Actor Loss = -14.6905, Critic Loss = 4697.3237, Entropy = 0.0408, Returns = 27.5395, Value = 13.0213, mask_loss = 0.0072std_loss = 0.0058:   4%|▎         | 56/1563 [15:22<6:12:05, 14.81s/it]

reward: tensor(20.9032, device='cuda:0') factor: tensor(0.0066, device='cuda:0')
reward: tensor(61.3005, device='cuda:0') factor: tensor(0.0051, device='cuda:0')
reward: tensor(26.5847, device='cuda:0') factor: tensor(0.0074, device='cuda:0')


Epoch 1, Step 57: Loss = 351.0789, Actor Loss = -8.6989, Critic Loss = 719.5555, Entropy = 0.0381, Returns = 21.8772, Value = 13.3321, mask_loss = 0.0066std_loss = 0.0056:   4%|▎         | 57/1563 [15:37<6:14:47, 14.93s/it]   

reward: tensor(29.1549, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(45.0369, device='cuda:0') factor: tensor(0.0071, device='cuda:0')
reward: tensor(40.8324, device='cuda:0') factor: tensor(0.0079, device='cuda:0')


Epoch 1, Step 58: Loss = 1595.9780, Actor Loss = -33.1265, Critic Loss = 3258.2090, Entropy = 0.0397, Returns = 46.5404, Value = 13.5861, mask_loss = 0.0069std_loss = 0.0057:   4%|▎         | 58/1563 [15:51<6:10:06, 14.76s/it]

reward: tensor(28.8100, device='cuda:0') factor: tensor(0.0068, device='cuda:0')
reward: tensor(11.3124, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(33.2644, device='cuda:0') factor: tensor(0.0053, device='cuda:0')


Epoch 1, Step 59: Loss = 2160.3667, Actor Loss = -19.2002, Critic Loss = 4359.1338, Entropy = 0.0376, Returns = 32.5103, Value = 13.5210, mask_loss = 0.0064std_loss = 0.0042:   4%|▍         | 59/1563 [16:07<6:14:03, 14.92s/it]

reward: tensor(39.6698, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(22.3227, device='cuda:0') factor: tensor(0.0068, device='cuda:0')
reward: tensor(23.2151, device='cuda:0') factor: tensor(0.0052, device='cuda:0')


Epoch 1, Step 60: Loss = 117.5118, Actor Loss = 0.7818, Critic Loss = 233.4601, Entropy = 0.0366, Returns = 12.9822, Value = 13.9480, mask_loss = 0.0063std_loss = 0.0054:   4%|▍         | 60/1563 [16:22<6:14:03, 14.93s/it]    

reward: tensor(17.9991, device='cuda:0') factor: tensor(0.0060, device='cuda:0')
reward: tensor(32.8209, device='cuda:0') factor: tensor(0.0061, device='cuda:0')
reward: tensor(18.9253, device='cuda:0') factor: tensor(0.0045, device='cuda:0')


Epoch 1, Step 61: Loss = 259.1884, Actor Loss = -1.9307, Critic Loss = 522.2383, Entropy = 0.0391, Returns = 16.1562, Value = 14.3754, mask_loss = 0.0068std_loss = 0.0069:   4%|▍         | 61/1563 [17:22<11:58:48, 28.71s/it]

reward: tensor(25.4361, device='cuda:0') factor: tensor(0.0086, device='cuda:0')
reward: tensor(31.4304, device='cuda:0') factor: tensor(0.0085, device='cuda:0')
reward: tensor(54.2380, device='cuda:0') factor: tensor(0.0088, device='cuda:0')


Epoch 1, Step 62: Loss = 1339.1646, Actor Loss = -23.7469, Critic Loss = 2725.8230, Entropy = 0.0505, Returns = 37.5349, Value = 13.9363, mask_loss = 0.0094std_loss = 0.0086:   4%|▍         | 62/1563 [17:42<10:52:27, 26.08s/it]

reward: tensor(14.2696, device='cuda:0') factor: tensor(0.0114, device='cuda:0')
reward: tensor(21.3090, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(53.9841, device='cuda:0') factor: tensor(0.0083, device='cuda:0')


Epoch 1, Step 63: Loss = 546.9957, Actor Loss = -9.1555, Critic Loss = 1112.3025, Entropy = 0.0489, Returns = 23.9088, Value = 14.9308, mask_loss = 0.0091std_loss = 0.0089:   4%|▍         | 63/1563 [17:57<9:23:45, 22.55s/it]   

reward: tensor(16.1620, device='cuda:0') factor: tensor(0.0081, device='cuda:0')
reward: tensor(24.4159, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(37.5630, device='cuda:0') factor: tensor(0.0067, device='cuda:0')


Epoch 1, Step 64: Loss = 498.2679, Actor Loss = -8.3241, Critic Loss = 1013.1841, Entropy = 0.0518, Returns = 23.1275, Value = 14.9701, mask_loss = 0.0095std_loss = 0.0063:   4%|▍         | 64/1563 [18:18<9:16:31, 22.28s/it]

reward: tensor(21.8180, device='cuda:0') factor: tensor(0.0124, device='cuda:0')
reward: tensor(21.6753, device='cuda:0') factor: tensor(0.0114, device='cuda:0')
reward: tensor(23.1416, device='cuda:0') factor: tensor(0.0128, device='cuda:0')


Epoch 1, Step 65: Loss = 176.7197, Actor Loss = -4.6532, Critic Loss = 362.7459, Entropy = 0.0665, Returns = 20.1930, Value = 15.7019, mask_loss = 0.0129std_loss = 0.0088:   4%|▍         | 65/1563 [18:31<8:08:07, 19.55s/it] 

reward: tensor(20.5947, device='cuda:0') factor: tensor(0.0182, device='cuda:0')
reward: tensor(30.0265, device='cuda:0') factor: tensor(0.0131, device='cuda:0')
reward: tensor(33.5271, device='cuda:0') factor: tensor(0.0108, device='cuda:0')


Epoch 1, Step 66: Loss = 768.5159, Actor Loss = -16.1059, Critic Loss = 1569.2437, Entropy = 0.0640, Returns = 31.6383, Value = 15.6877, mask_loss = 0.0125std_loss = 0.0097:   4%|▍         | 66/1563 [18:46<7:33:28, 18.18s/it]

reward: tensor(31.1790, device='cuda:0') factor: tensor(0.0128, device='cuda:0')
reward: tensor(39.8355, device='cuda:0') factor: tensor(0.0099, device='cuda:0')
reward: tensor(45.1689, device='cuda:0') factor: tensor(0.0135, device='cuda:0')


Epoch 1, Step 67: Loss = 884.7184, Actor Loss = -23.4269, Critic Loss = 1816.2906, Entropy = 0.0524, Returns = 39.1218, Value = 15.8498, mask_loss = 0.0098std_loss = 0.0078:   4%|▍         | 67/1563 [19:01<7:08:13, 17.17s/it]

reward: tensor(51.9641, device='cuda:0') factor: tensor(0.0081, device='cuda:0')
reward: tensor(38.2256, device='cuda:0') factor: tensor(0.0083, device='cuda:0')
reward: tensor(16.9127, device='cuda:0') factor: tensor(0.0086, device='cuda:0')


Epoch 1, Step 68: Loss = 1979.2991, Actor Loss = -31.9204, Critic Loss = 4022.4390, Entropy = 0.0438, Returns = 47.5855, Value = 15.8308, mask_loss = 0.0077std_loss = 0.0051:   4%|▍         | 68/1563 [19:16<6:52:38, 16.56s/it]

reward: tensor(16.0912, device='cuda:0') factor: tensor(0.0083, device='cuda:0')
reward: tensor(20.1560, device='cuda:0') factor: tensor(0.0080, device='cuda:0')
reward: tensor(19.6638, device='cuda:0') factor: tensor(0.0063, device='cuda:0')


Epoch 1, Step 69: Loss = 148.4414, Actor Loss = 4.4669, Critic Loss = 287.9491, Entropy = 0.0368, Returns = 11.7265, Value = 16.3459, mask_loss = 0.0064std_loss = 0.0057:   4%|▍         | 69/1563 [19:36<7:13:39, 17.42s/it]    

reward: tensor(17.6169, device='cuda:0') factor: tensor(0.0069, device='cuda:0')
reward: tensor(10.7743, device='cuda:0') factor: tensor(0.0066, device='cuda:0')
reward: tensor(27.8438, device='cuda:0') factor: tensor(0.0071, device='cuda:0')


Epoch 1, Step 70: Loss = 432.1025, Actor Loss = -1.7712, Critic Loss = 867.7474, Entropy = 0.0405, Returns = 18.2520, Value = 16.5947, mask_loss = 0.0072std_loss = 0.0069:   4%|▍         | 70/1563 [19:49<6:44:09, 16.24s/it]

reward: tensor(68.3303, device='cuda:0') factor: tensor(0.0100, device='cuda:0')
reward: tensor(38.4099, device='cuda:0') factor: tensor(0.0055, device='cuda:0')
reward: tensor(53.3728, device='cuda:0') factor: tensor(0.0084, device='cuda:0')


Epoch 1, Step 71: Loss = 3403.3196, Actor Loss = -33.3674, Critic Loss = 6873.3740, Entropy = 0.0387, Returns = 49.6988, Value = 16.5064, mask_loss = 0.0067std_loss = 0.0054:   5%|▍         | 71/1563 [20:04<6:35:05, 15.89s/it]

reward: tensor(16.4672, device='cuda:0') factor: tensor(0.0086, device='cuda:0')
reward: tensor(19.9631, device='cuda:0') factor: tensor(0.0091, device='cuda:0')
reward: tensor(19.3372, device='cuda:0') factor: tensor(0.0042, device='cuda:0')


Epoch 1, Step 72: Loss = 422.7591, Actor Loss = -3.0431, Critic Loss = 851.6044, Entropy = 0.0521, Returns = 19.2497, Value = 16.3607, mask_loss = 0.0099std_loss = 0.0094:   5%|▍         | 72/1563 [20:21<6:42:00, 16.18s/it]   

reward: tensor(29.2277, device='cuda:0') factor: tensor(0.0118, device='cuda:0')
reward: tensor(28.1436, device='cuda:0') factor: tensor(0.0087, device='cuda:0')
reward: tensor(23.1071, device='cuda:0') factor: tensor(0.0103, device='cuda:0')


Epoch 1, Step 73: Loss = 915.4856, Actor Loss = -11.2637, Critic Loss = 1853.4987, Entropy = 0.0582, Returns = 27.3731, Value = 16.2075, mask_loss = 0.0113std_loss = 0.0112:   5%|▍         | 73/1563 [20:45<7:39:43, 18.51s/it]

reward: tensor(33.8181, device='cuda:0') factor: tensor(0.0107, device='cuda:0')
reward: tensor(10.9882, device='cuda:0') factor: tensor(0.0119, device='cuda:0')
reward: tensor(20.6511, device='cuda:0') factor: tensor(0.0138, device='cuda:0')


Epoch 1, Step 74: Loss = 589.6132, Actor Loss = -5.6918, Critic Loss = 1190.6101, Entropy = 0.0694, Returns = 23.0352, Value = 17.4422, mask_loss = 0.0144std_loss = 0.0158:   5%|▍         | 74/1563 [21:02<7:30:12, 18.14s/it] 

reward: tensor(24.4105, device='cuda:0') factor: tensor(0.0122, device='cuda:0')
reward: tensor(28.7507, device='cuda:0') factor: tensor(0.0132, device='cuda:0')
reward: tensor(29.0640, device='cuda:0') factor: tensor(0.0101, device='cuda:0')


Epoch 1, Step 75: Loss = 2008.9312, Actor Loss = -36.7602, Critic Loss = 4091.3826, Entropy = 0.0607, Returns = 53.8752, Value = 17.2035, mask_loss = 0.0117std_loss = 0.0108:   5%|▍         | 75/1563 [21:21<7:32:11, 18.23s/it]

reward: tensor(22.5868, device='cuda:0') factor: tensor(0.0084, device='cuda:0')
reward: tensor(17.5740, device='cuda:0') factor: tensor(0.0134, device='cuda:0')
reward: tensor(19.1501, device='cuda:0') factor: tensor(0.0222, device='cuda:0')


Epoch 1, Step 76: Loss = 183.6037, Actor Loss = -3.8712, Critic Loss = 374.9498, Entropy = 0.0624, Returns = 20.9768, Value = 17.2054, mask_loss = 0.0127std_loss = 0.0169:   5%|▍         | 76/1563 [21:39<7:31:09, 18.20s/it]   

reward: tensor(42.5655, device='cuda:0') factor: tensor(0.0112, device='cuda:0')
reward: tensor(62.1673, device='cuda:0') factor: tensor(0.0141, device='cuda:0')
reward: tensor(40.9146, device='cuda:0') factor: tensor(0.0112, device='cuda:0')


Epoch 1, Step 77: Loss = 2934.1169, Actor Loss = -37.5560, Critic Loss = 5943.3457, Entropy = 0.0505, Returns = 54.8052, Value = 17.3951, mask_loss = 0.0106std_loss = 0.0202:   5%|▍         | 77/1563 [21:54<7:07:41, 17.27s/it]

reward: tensor(17.3500, device='cuda:0') factor: tensor(0.0114, device='cuda:0')
reward: tensor(19.8660, device='cuda:0') factor: tensor(0.0116, device='cuda:0')
reward: tensor(17.5442, device='cuda:0') factor: tensor(0.0086, device='cuda:0')


Epoch 1, Step 78: Loss = 220.2575, Actor Loss = -3.5645, Critic Loss = 447.6441, Entropy = 0.0354, Returns = 20.1237, Value = 16.6877, mask_loss = 0.0066std_loss = 0.0128:   5%|▍         | 78/1563 [22:09<6:51:01, 16.61s/it]   

reward: tensor(29.6757, device='cuda:0') factor: tensor(0.0078, device='cuda:0')
reward: tensor(30.3135, device='cuda:0') factor: tensor(0.0121, device='cuda:0')
reward: tensor(30.2303, device='cuda:0') factor: tensor(0.0071, device='cuda:0')


Epoch 1, Step 79: Loss = 835.9184, Actor Loss = -16.5398, Critic Loss = 1704.9165, Entropy = 0.0309, Returns = 34.5074, Value = 18.0799, mask_loss = 0.0062std_loss = 0.0173:   5%|▌         | 79/1563 [22:29<7:15:39, 17.61s/it]

reward: tensor(40.2707, device='cuda:0') factor: tensor(0.0087, device='cuda:0')
reward: tensor(15.3405, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(50.4307, device='cuda:0') factor: tensor(0.0044, device='cuda:0')


Epoch 1, Step 80: Loss = 9384.5938, Actor Loss = -49.9792, Critic Loss = 18869.1465, Entropy = 0.0399, Returns = 67.9194, Value = 18.0488, mask_loss = 0.0081std_loss = 0.0150:   5%|▌         | 80/1563 [22:47<7:18:09, 17.73s/it]

reward: tensor(25.6401, device='cuda:0') factor: tensor(0.0054, device='cuda:0')
reward: tensor(50.0242, device='cuda:0') factor: tensor(0.0107, device='cuda:0')
reward: tensor(46.9559, device='cuda:0') factor: tensor(0.0058, device='cuda:0')


Epoch 1, Step 81: Loss = 3234.5078, Actor Loss = -33.2851, Critic Loss = 6535.5859, Entropy = 0.0375, Returns = 51.3510, Value = 18.2283, mask_loss = 0.0087std_loss = 0.0233:   5%|▌         | 81/1563 [25:10<22:42:50, 55.18s/it]

reward: tensor(18.1715, device='cuda:0') factor: tensor(0.0052, device='cuda:0')
reward: tensor(18.0733, device='cuda:0') factor: tensor(0.0043, device='cuda:0')
reward: tensor(33.9001, device='cuda:0') factor: tensor(0.0066, device='cuda:0')


Epoch 1, Step 82: Loss = 577.1131, Actor Loss = -8.0377, Critic Loss = 1170.3018, Entropy = 0.0306, Returns = 26.6357, Value = 18.7612, mask_loss = 0.0067std_loss = 0.0249:   5%|▌         | 82/1563 [25:25<17:43:47, 43.10s/it]  

reward: tensor(61.9707, device='cuda:0') factor: tensor(0.0093, device='cuda:0')
reward: tensor(40.6007, device='cuda:0') factor: tensor(0.0090, device='cuda:0')
reward: tensor(61.3403, device='cuda:0') factor: tensor(0.0121, device='cuda:0')


Epoch 1, Step 83: Loss = 139.8927, Actor Loss = -0.3463, Critic Loss = 280.4783, Entropy = 0.0686, Returns = 18.8492, Value = 18.6718, mask_loss = 0.0210std_loss = 0.0557:   5%|▌         | 83/1563 [25:45<14:53:00, 36.20s/it] 

reward: tensor(63.5776, device='cuda:0') factor: tensor(0.0104, device='cuda:0')
reward: tensor(35.0036, device='cuda:0') factor: tensor(0.0133, device='cuda:0')
reward: tensor(52.1643, device='cuda:0') factor: tensor(0.0083, device='cuda:0')


Epoch 1, Step 84: Loss = 17381.3848, Actor Loss = -92.7946, Critic Loss = 34948.3594, Entropy = 0.0450, Returns = 111.5271, Value = 18.9129, mask_loss = 0.0132std_loss = 0.0428:   5%|▌         | 84/1563 [26:00<12:18:12, 29.95s/it]

reward: tensor(29.5616, device='cuda:0') factor: tensor(0.0131, device='cuda:0')
reward: tensor(64.5591, device='cuda:0') factor: tensor(0.0098, device='cuda:0')
reward: tensor(81.6914, device='cuda:0') factor: tensor(0.0085, device='cuda:0')


Epoch 1, Step 85: Loss = 6671.3037, Actor Loss = -57.8191, Critic Loss = 13458.2451, Entropy = 0.0229, Returns = 76.7148, Value = 19.1150, mask_loss = 0.0079std_loss = 0.0412:   5%|▌         | 85/1563 [26:15<10:27:57, 25.49s/it]  

reward: tensor(22.8177, device='cuda:0') factor: tensor(0.0069, device='cuda:0')
reward: tensor(67.2609, device='cuda:0') factor: tensor(0.0077, device='cuda:0')
reward: tensor(84.5422, device='cuda:0') factor: tensor(0.0069, device='cuda:0')


Epoch 1, Step 86: Loss = 6074.3647, Actor Loss = -58.2922, Critic Loss = 12265.3135, Entropy = 0.0195, Returns = 77.8431, Value = 19.8508, mask_loss = 0.0070std_loss = 0.0333:   6%|▌         | 86/1563 [26:30<9:09:43, 22.33s/it] 

reward: tensor(13.9403, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(14.1061, device='cuda:0') factor: tensor(0.0092, device='cuda:0')
reward: tensor(5.2111, device='cuda:0') factor: tensor(0.0052, device='cuda:0')


Epoch 1, Step 87: Loss = 188.0453, Actor Loss = 7.0156, Critic Loss = 362.0596, Entropy = 0.0294, Returns = 12.7729, Value = 20.0533, mask_loss = 0.0129std_loss = 0.0546:   6%|▌         | 87/1563 [26:53<9:12:18, 22.45s/it]     

reward: tensor(43.5701, device='cuda:0') factor: tensor(0.0052, device='cuda:0')
reward: tensor(34.5870, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(49.1651, device='cuda:0') factor: tensor(0.0064, device='cuda:0')


Epoch 1, Step 88: Loss = 1102.9241, Actor Loss = -25.9007, Critic Loss = 2257.6494, Entropy = 0.0177, Returns = 46.3404, Value = 20.6478, mask_loss = 0.0068std_loss = 0.0434:   6%|▌         | 88/1563 [27:06<8:05:04, 19.73s/it]

reward: tensor(66.0543, device='cuda:0') factor: tensor(0.0069, device='cuda:0')
reward: tensor(36.3260, device='cuda:0') factor: tensor(0.0056, device='cuda:0')
reward: tensor(57.2174, device='cuda:0') factor: tensor(0.0058, device='cuda:0')


Epoch 1, Step 89: Loss = 7864.2739, Actor Loss = -48.2168, Critic Loss = 15824.9814, Entropy = 0.0261, Returns = 69.1554, Value = 21.1053, mask_loss = 0.0099std_loss = 0.0502:   6%|▌         | 89/1563 [27:21<7:30:47, 18.35s/it]

reward: tensor(24.4964, device='cuda:0') factor: tensor(0.0202, device='cuda:0')
reward: tensor(12.3243, device='cuda:0') factor: tensor(0.0256, device='cuda:0')
reward: tensor(19.8347, device='cuda:0') factor: tensor(0.0258, device='cuda:0')


Epoch 1, Step 90: Loss = 188.9717, Actor Loss = 6.0350, Critic Loss = 365.8734, Entropy = 0.0512, Returns = 13.8493, Value = 20.0674, mask_loss = 0.0221std_loss = 0.0792:   6%|▌         | 90/1563 [27:40<7:33:19, 18.47s/it]     

reward: tensor(24.7587, device='cuda:0') factor: tensor(0.0195, device='cuda:0')
reward: tensor(37.8441, device='cuda:0') factor: tensor(0.0184, device='cuda:0')
reward: tensor(33.6532, device='cuda:0') factor: tensor(0.0152, device='cuda:0')


Epoch 1, Step 91: Loss = 1492.7871, Actor Loss = -11.8767, Critic Loss = 3009.3276, Entropy = 0.0240, Returns = 32.6408, Value = 20.9194, mask_loss = 0.0073std_loss = 0.0384:   6%|▌         | 91/1563 [28:03<8:08:59, 19.93s/it]

reward: tensor(57.0727, device='cuda:0') factor: tensor(0.0113, device='cuda:0')
reward: tensor(59.5194, device='cuda:0') factor: tensor(0.0108, device='cuda:0')
reward: tensor(44.6814, device='cuda:0') factor: tensor(0.0088, device='cuda:0')


Epoch 1, Step 92: Loss = 617.9155, Actor Loss = -0.3859, Critic Loss = 1236.6030, Entropy = 0.0329, Returns = 21.4804, Value = 21.2540, mask_loss = 0.0139std_loss = 0.0555:   6%|▌         | 92/1563 [28:24<8:12:27, 20.09s/it]  

reward: tensor(43.4803, device='cuda:0') factor: tensor(0.0130, device='cuda:0')
reward: tensor(23.2325, device='cuda:0') factor: tensor(0.0118, device='cuda:0')
reward: tensor(34.0035, device='cuda:0') factor: tensor(0.0118, device='cuda:0')


Epoch 1, Step 93: Loss = 258.5108, Actor Loss = -2.9929, Critic Loss = 523.0074, Entropy = 0.0188, Returns = 23.7144, Value = 20.8598, mask_loss = 0.0064std_loss = 0.0291:   6%|▌         | 93/1563 [28:43<8:06:21, 19.85s/it] 

reward: tensor(23.4505, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(59.8237, device='cuda:0') factor: tensor(0.0078, device='cuda:0')
reward: tensor(35.4107, device='cuda:0') factor: tensor(0.0085, device='cuda:0')


Epoch 1, Step 94: Loss = 1336.4260, Actor Loss = -16.7890, Critic Loss = 2706.4299, Entropy = 0.0231, Returns = 38.0270, Value = 21.3612, mask_loss = 0.0091std_loss = 0.0461:   6%|▌         | 94/1563 [28:59<7:34:31, 18.56s/it]

reward: tensor(37.6165, device='cuda:0') factor: tensor(0.0102, device='cuda:0')
reward: tensor(34.9101, device='cuda:0') factor: tensor(0.0108, device='cuda:0')
reward: tensor(18.3368, device='cuda:0') factor: tensor(0.0098, device='cuda:0')


Epoch 1, Step 95: Loss = 1557.1490, Actor Loss = -5.6669, Critic Loss = 3125.6321, Entropy = 0.0219, Returns = 27.6574, Value = 22.0978, mask_loss = 0.0095std_loss = 0.0526:   6%|▌         | 95/1563 [29:13<6:59:41, 17.15s/it] 

reward: tensor(79.7504, device='cuda:0') factor: tensor(0.0136, device='cuda:0')
reward: tensor(47.9233, device='cuda:0') factor: tensor(0.0116, device='cuda:0')
reward: tensor(27.5044, device='cuda:0') factor: tensor(0.0109, device='cuda:0')


Epoch 1, Step 96: Loss = 1474.0801, Actor Loss = -26.8033, Critic Loss = 3001.7666, Entropy = 0.0301, Returns = 48.7287, Value = 22.0972, mask_loss = 0.0122std_loss = 0.0548:   6%|▌         | 96/1563 [29:31<7:10:51, 17.62s/it]

reward: tensor(34.4713, device='cuda:0') factor: tensor(0.0035, device='cuda:0')
reward: tensor(48.4386, device='cuda:0') factor: tensor(0.0019, device='cuda:0')
reward: tensor(33.9370, device='cuda:0') factor: tensor(0.0029, device='cuda:0')


Epoch 1, Step 97: Loss = 792.1876, Actor Loss = 5.0488, Critic Loss = 1574.2776, Entropy = 0.0085, Returns = 17.1206, Value = 22.2863, mask_loss = 0.0021std_loss = 0.0157:   6%|▌         | 97/1563 [29:51<7:28:30, 18.36s/it]   

reward: tensor(24.1296, device='cuda:0') factor: tensor(0.0046, device='cuda:0')
reward: tensor(18.2926, device='cuda:0') factor: tensor(0.0036, device='cuda:0')
reward: tensor(56.7817, device='cuda:0') factor: tensor(0.0040, device='cuda:0')


Epoch 1, Step 98: Loss = 1020.2612, Actor Loss = 3.2167, Critic Loss = 2034.0891, Entropy = 0.0133, Returns = 19.1141, Value = 22.4349, mask_loss = 0.0052std_loss = 0.0387:   6%|▋         | 98/1563 [30:13<7:54:29, 19.43s/it]

reward: tensor(86.4863, device='cuda:0') factor: tensor(0.0047, device='cuda:0')
reward: tensor(49.7806, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(58.8889, device='cuda:0') factor: tensor(0.0020, device='cuda:0')


Epoch 1, Step 99: Loss = 5495.7407, Actor Loss = -48.6743, Critic Loss = 11088.8301, Entropy = 0.0136, Returns = 70.5827, Value = 22.1072, mask_loss = 0.0047std_loss = 0.0264:   6%|▋         | 99/1563 [30:28<7:22:17, 18.13s/it]

reward: tensor(34.0951, device='cuda:0') factor: tensor(0.0112, device='cuda:0')
reward: tensor(19.8923, device='cuda:0') factor: tensor(0.0052, device='cuda:0')
reward: tensor(25.2464, device='cuda:0') factor: tensor(0.0047, device='cuda:0')


Epoch 1, Step 100: Loss = 273.9043, Actor Loss = 5.7197, Critic Loss = 536.3693, Entropy = 0.0195, Returns = 16.3022, Value = 22.1950, mask_loss = 0.0116std_loss = 0.0663:   6%|▋         | 100/1563 [30:43<6:55:57, 17.06s/it]   

reward: tensor(90.3333, device='cuda:0') factor: tensor(0.0096, device='cuda:0')
reward: tensor(57.5949, device='cuda:0') factor: tensor(0.0125, device='cuda:0')
reward: tensor(71.9366, device='cuda:0') factor: tensor(0.0132, device='cuda:0')


Epoch 1, Step 101: Loss = 4182.1147, Actor Loss = -44.3996, Critic Loss = 8453.0283, Entropy = 0.0191, Returns = 67.1158, Value = 22.9042, mask_loss = 0.0110std_loss = 0.0546:   6%|▋         | 101/1563 [30:58<6:42:44, 16.53s/it]

reward: tensor(62.9981, device='cuda:0') factor: tensor(0.0099, device='cuda:0')
reward: tensor(28.1654, device='cuda:0') factor: tensor(0.0107, device='cuda:0')
reward: tensor(54.6112, device='cuda:0') factor: tensor(0.0136, device='cuda:0')


Epoch 1, Step 102: Loss = 3901.4541, Actor Loss = -12.7499, Critic Loss = 7828.4082, Entropy = 0.0301, Returns = 35.6374, Value = 23.0892, mask_loss = 0.0177std_loss = 0.0872:   7%|▋         | 102/1563 [31:17<6:57:50, 17.16s/it]

reward: tensor(38.0877, device='cuda:0') factor: tensor(0.0098, device='cuda:0')
reward: tensor(24.8374, device='cuda:0') factor: tensor(0.0108, device='cuda:0')
reward: tensor(19.6152, device='cuda:0') factor: tensor(0.0113, device='cuda:0')


Epoch 1, Step 103: Loss = 362.2468, Actor Loss = -1.4051, Critic Loss = 727.3037, Entropy = 0.0177, Returns = 24.7915, Value = 23.5280, mask_loss = 0.0112std_loss = 0.0649:   7%|▋         | 103/1563 [31:35<7:04:07, 17.43s/it]   

reward: tensor(51.2344, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(77.0808, device='cuda:0') factor: tensor(0.0074, device='cuda:0')
reward: tensor(40.3682, device='cuda:0') factor: tensor(0.0076, device='cuda:0')


Epoch 1, Step 104: Loss = 7996.7446, Actor Loss = -53.1442, Critic Loss = 16099.7773, Entropy = 0.0194, Returns = 77.0057, Value = 23.9916, mask_loss = 0.0100std_loss = 0.0649:   7%|▋         | 104/1563 [31:51<6:53:59, 17.02s/it]

reward: tensor(41.0887, device='cuda:0') factor: tensor(0.0094, device='cuda:0')
reward: tensor(39.7733, device='cuda:0') factor: tensor(0.0098, device='cuda:0')
reward: tensor(37.3684, device='cuda:0') factor: tensor(0.0085, device='cuda:0')


Epoch 1, Step 105: Loss = 4834.4844, Actor Loss = -46.4009, Critic Loss = 9761.7705, Entropy = 0.0178, Returns = 70.6344, Value = 24.3392, mask_loss = 0.0102std_loss = 0.0673:   7%|▋         | 105/1563 [32:06<6:38:00, 16.38s/it] 

reward: tensor(71.4880, device='cuda:0') factor: tensor(0.0087, device='cuda:0')
reward: tensor(58.0369, device='cuda:0') factor: tensor(0.0086, device='cuda:0')
reward: tensor(64.8883, device='cuda:0') factor: tensor(0.0080, device='cuda:0')


Epoch 1, Step 106: Loss = 1927.6289, Actor Loss = -30.7832, Critic Loss = 3916.8242, Entropy = 0.0151, Returns = 54.8046, Value = 24.1943, mask_loss = 0.0075std_loss = 0.0508:   7%|▋         | 106/1563 [32:24<6:49:20, 16.86s/it]

reward: tensor(36.6268, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(36.7383, device='cuda:0') factor: tensor(0.0056, device='cuda:0')
reward: tensor(52.7114, device='cuda:0') factor: tensor(0.0068, device='cuda:0')


Epoch 1, Step 107: Loss = 2709.3669, Actor Loss = -49.0569, Critic Loss = 5516.8477, Entropy = 0.0144, Returns = 73.1197, Value = 24.2604, mask_loss = 0.0069std_loss = 0.0548:   7%|▋         | 107/1563 [32:44<7:11:11, 17.77s/it]

reward: tensor(44.4459, device='cuda:0') factor: tensor(0.0089, device='cuda:0')
reward: tensor(56.7299, device='cuda:0') factor: tensor(0.0093, device='cuda:0')
reward: tensor(28.0766, device='cuda:0') factor: tensor(0.0113, device='cuda:0')


Epoch 1, Step 108: Loss = 1954.3843, Actor Loss = -27.4675, Critic Loss = 3963.7034, Entropy = 0.0221, Returns = 51.9376, Value = 24.6440, mask_loss = 0.0115std_loss = 0.0692:   7%|▋         | 108/1563 [32:59<6:50:09, 16.91s/it]

reward: tensor(22.9768, device='cuda:0') factor: tensor(0.0115, device='cuda:0')
reward: tensor(34.4324, device='cuda:0') factor: tensor(0.0088, device='cuda:0')
reward: tensor(35.2659, device='cuda:0') factor: tensor(0.0128, device='cuda:0')


Epoch 1, Step 109: Loss = 3012.6533, Actor Loss = -7.7323, Critic Loss = 6040.7710, Entropy = 0.0240, Returns = 32.1749, Value = 24.6198, mask_loss = 0.0133std_loss = 0.0716:   7%|▋         | 109/1563 [33:14<6:37:09, 16.39s/it] 

reward: tensor(28.7138, device='cuda:0') factor: tensor(0.0154, device='cuda:0')
reward: tensor(83.9229, device='cuda:0') factor: tensor(0.0129, device='cuda:0')
reward: tensor(73.2257, device='cuda:0') factor: tensor(0.0114, device='cuda:0')


Epoch 1, Step 110: Loss = 1105.5049, Actor Loss = -21.3774, Critic Loss = 2253.7644, Entropy = 0.0278, Returns = 46.1341, Value = 24.8866, mask_loss = 0.0132std_loss = 0.0744:   7%|▋         | 110/1563 [33:36<7:21:02, 18.21s/it]

reward: tensor(34.7696, device='cuda:0') factor: tensor(0.0132, device='cuda:0')
reward: tensor(47.6582, device='cuda:0') factor: tensor(0.0135, device='cuda:0')
reward: tensor(37.9277, device='cuda:0') factor: tensor(0.0132, device='cuda:0')


Epoch 1, Step 111: Loss = 175.4034, Actor Loss = 1.4087, Critic Loss = 347.9894, Entropy = 0.0248, Returns = 23.5618, Value = 25.1092, mask_loss = 0.0126std_loss = 0.0746:   7%|▋         | 111/1563 [33:52<7:00:06, 17.36s/it]    

reward: tensor(51.0790, device='cuda:0') factor: tensor(0.0065, device='cuda:0')
reward: tensor(40.0735, device='cuda:0') factor: tensor(0.0056, device='cuda:0')
reward: tensor(87.8073, device='cuda:0') factor: tensor(0.0069, device='cuda:0')


Epoch 1, Step 112: Loss = 5360.7764, Actor Loss = -40.2391, Critic Loss = 10802.0312, Entropy = 0.0162, Returns = 65.4391, Value = 25.4212, mask_loss = 0.0058std_loss = 0.0411:   7%|▋         | 112/1563 [34:14<7:33:09, 18.74s/it]

reward: tensor(35.5065, device='cuda:0') factor: tensor(0.0076, device='cuda:0')
reward: tensor(36.1528, device='cuda:0') factor: tensor(0.0056, device='cuda:0')
reward: tensor(37.1284, device='cuda:0') factor: tensor(0.0067, device='cuda:0')


Epoch 1, Step 113: Loss = 2101.7971, Actor Loss = -22.3928, Critic Loss = 4248.3799, Entropy = 0.0168, Returns = 47.8640, Value = 25.6029, mask_loss = 0.0083std_loss = 0.0563:   7%|▋         | 113/1563 [34:34<7:45:53, 19.28s/it] 

reward: tensor(38.9464, device='cuda:0') factor: tensor(0.0070, device='cuda:0')
reward: tensor(31.7061, device='cuda:0') factor: tensor(0.0095, device='cuda:0')
reward: tensor(54.1325, device='cuda:0') factor: tensor(0.0070, device='cuda:0')


Epoch 1, Step 114: Loss = 708.5967, Actor Loss = -11.3556, Critic Loss = 1439.9047, Entropy = 0.0151, Returns = 37.7849, Value = 26.5836, mask_loss = 0.0085std_loss = 0.0591:   7%|▋         | 114/1563 [34:49<7:10:50, 17.84s/it] 

reward: tensor(48.6780, device='cuda:0') factor: tensor(0.0140, device='cuda:0')
reward: tensor(34.1007, device='cuda:0') factor: tensor(0.0115, device='cuda:0')
reward: tensor(44.0873, device='cuda:0') factor: tensor(0.0109, device='cuda:0')


Epoch 1, Step 115: Loss = 875.2250, Actor Loss = 2.0276, Critic Loss = 1746.3949, Entropy = 0.0148, Returns = 24.7078, Value = 26.8920, mask_loss = 0.0134std_loss = 0.0837:   7%|▋         | 115/1563 [35:03<6:46:56, 16.86s/it]  

reward: tensor(36.6760, device='cuda:0') factor: tensor(0.0091, device='cuda:0')
reward: tensor(49.0588, device='cuda:0') factor: tensor(0.0085, device='cuda:0')
reward: tensor(33.0686, device='cuda:0') factor: tensor(0.0105, device='cuda:0')


Epoch 1, Step 116: Loss = 1876.9094, Actor Loss = -16.4299, Critic Loss = 3786.6787, Entropy = 0.0149, Returns = 41.7172, Value = 25.4052, mask_loss = 0.0095std_loss = 0.0543:   7%|▋         | 116/1563 [35:20<6:46:22, 16.85s/it]

reward: tensor(33.3948, device='cuda:0') factor: tensor(0.0097, device='cuda:0')
reward: tensor(45.7616, device='cuda:0') factor: tensor(0.0101, device='cuda:0')
reward: tensor(41.4218, device='cuda:0') factor: tensor(0.0079, device='cuda:0')


Epoch 1, Step 117: Loss = 1103.8119, Actor Loss = -16.1515, Critic Loss = 2239.9268, Entropy = 0.0108, Returns = 43.0894, Value = 27.0506, mask_loss = 0.0076std_loss = 0.0655:   7%|▋         | 117/1563 [35:34<6:24:42, 15.96s/it]

reward: tensor(39.7104, device='cuda:0') factor: tensor(0.0087, device='cuda:0')
reward: tensor(35.9898, device='cuda:0') factor: tensor(0.0073, device='cuda:0')
reward: tensor(43.1974, device='cuda:0') factor: tensor(0.0096, device='cuda:0')


Epoch 1, Step 118: Loss = 1746.7212, Actor Loss = -25.3565, Critic Loss = 3544.1555, Entropy = 0.0117, Returns = 51.8502, Value = 26.6241, mask_loss = 0.0065std_loss = 0.0538:   8%|▊         | 118/1563 [35:48<6:10:04, 15.37s/it]

reward: tensor(68.6161, device='cuda:0') factor: tensor(0.0095, device='cuda:0')
reward: tensor(75.7555, device='cuda:0') factor: tensor(0.0056, device='cuda:0')
reward: tensor(62.9402, device='cuda:0') factor: tensor(0.0088, device='cuda:0')


Epoch 1, Step 119: Loss = 1364.2487, Actor Loss = -29.4524, Critic Loss = 2787.4021, Entropy = 0.0089, Returns = 55.9591, Value = 26.6691, mask_loss = 0.0084std_loss = 0.0696:   8%|▊         | 119/1563 [36:03<6:08:34, 15.31s/it]

reward: tensor(28.1221, device='cuda:0') factor: tensor(0.0091, device='cuda:0')
reward: tensor(45.5440, device='cuda:0') factor: tensor(0.0066, device='cuda:0')
reward: tensor(16.3881, device='cuda:0') factor: tensor(0.0076, device='cuda:0')


Epoch 1, Step 120: Loss = 433.6628, Actor Loss = 5.8541, Critic Loss = 855.6174, Entropy = 0.0131, Returns = 20.3369, Value = 26.3369, mask_loss = 0.0078std_loss = 0.0481:   8%|▊         | 120/1563 [36:25<6:56:25, 17.31s/it]    

reward: tensor(45.8346, device='cuda:0') factor: tensor(0.0044, device='cuda:0')
reward: tensor(147.1594, device='cuda:0') factor: tensor(0.0052, device='cuda:0')
reward: tensor(87.5149, device='cuda:0') factor: tensor(0.0047, device='cuda:0')


Epoch 1, Step 121: Loss = 2236.2024, Actor Loss = -15.9187, Critic Loss = 4504.2422, Entropy = 0.0050, Returns = 43.4110, Value = 27.6542, mask_loss = 0.0041std_loss = 0.0438:   8%|▊         | 121/1563 [36:41<6:44:50, 16.85s/it]

reward: tensor(53.1945, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(47.5469, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(49.9746, device='cuda:0') factor: tensor(0.0056, device='cuda:0')


Epoch 1, Step 122: Loss = 552.1443, Actor Loss = 9.6901, Critic Loss = 1084.9084, Entropy = 0.0082, Returns = 16.1108, Value = 25.9632, mask_loss = 0.0048std_loss = 0.0433:   8%|▊         | 122/1563 [37:02<7:17:54, 18.23s/it]   

reward: tensor(55.4644, device='cuda:0') factor: tensor(0.0064, device='cuda:0')
reward: tensor(40.1660, device='cuda:0') factor: tensor(0.0055, device='cuda:0')
reward: tensor(53.1055, device='cuda:0') factor: tensor(0.0036, device='cuda:0')


Epoch 1, Step 123: Loss = 9441.4736, Actor Loss = -73.5923, Critic Loss = 19030.1309, Entropy = 0.0077, Returns = 100.5221, Value = 27.0929, mask_loss = 0.0056std_loss = 0.0533:   8%|▊         | 123/1563 [37:24<7:40:39, 19.19s/it]

reward: tensor(78.4551, device='cuda:0') factor: tensor(0.0069, device='cuda:0')
reward: tensor(53.8066, device='cuda:0') factor: tensor(0.0064, device='cuda:0')
reward: tensor(65.6188, device='cuda:0') factor: tensor(0.0052, device='cuda:0')


Epoch 1, Step 124: Loss = 1407.1473, Actor Loss = 1.0529, Critic Loss = 2812.1890, Entropy = 0.0105, Returns = 27.5576, Value = 28.7700, mask_loss = 0.0059std_loss = 0.0410:   8%|▊         | 124/1563 [37:41<7:23:31, 18.49s/it]    

reward: tensor(39.3851, device='cuda:0') factor: tensor(0.0027, device='cuda:0')
reward: tensor(22.4505, device='cuda:0') factor: tensor(0.0013, device='cuda:0')
reward: tensor(42.2047, device='cuda:0') factor: tensor(0.0026, device='cuda:0')


Epoch 1, Step 125: Loss = 380.6343, Actor Loss = 22.1746, Critic Loss = 716.9196, Entropy = 0.0046, Returns = 4.6532, Value = 26.9665, mask_loss = 0.0017std_loss = 0.0160:   8%|▊         | 125/1563 [37:57<7:04:35, 17.72s/it]  

reward: tensor(103.0618, device='cuda:0') factor: tensor(0.0041, device='cuda:0')
reward: tensor(94.7925, device='cuda:0') factor: tensor(0.0041, device='cuda:0')
reward: tensor(182.3263, device='cuda:0') factor: tensor(0.0046, device='cuda:0')


Epoch 1, Step 126: Loss = 16897.1582, Actor Loss = -106.0909, Critic Loss = 34006.5000, Entropy = 0.0075, Returns = 133.9389, Value = 28.0681, mask_loss = 0.0055std_loss = 0.0528:   8%|▊         | 126/1563 [38:15<7:11:26, 18.01s/it]

reward: tensor(48.6572, device='cuda:0') factor: tensor(0.0050, device='cuda:0')
reward: tensor(38.1294, device='cuda:0') factor: tensor(0.0018, device='cuda:0')
reward: tensor(49.1958, device='cuda:0') factor: tensor(0.0044, device='cuda:0')


Epoch 1, Step 127: Loss = 3284.3364, Actor Loss = -21.6173, Critic Loss = 6611.9077, Entropy = 0.0062, Returns = 49.6675, Value = 28.2878, mask_loss = 0.0031std_loss = 0.0306:   8%|▊         | 127/1563 [38:30<6:49:20, 17.10s/it]    

reward: tensor(90.7116, device='cuda:0') factor: tensor(0.0025, device='cuda:0')
reward: tensor(107.9548, device='cuda:0') factor: tensor(0.0042, device='cuda:0')
reward: tensor(66.3106, device='cuda:0') factor: tensor(0.0030, device='cuda:0')


Epoch 1, Step 128: Loss = 18404.8965, Actor Loss = -109.3900, Critic Loss = 37028.5742, Entropy = 0.0043, Returns = 138.5338, Value = 29.4414, mask_loss = 0.0023std_loss = 0.0268:   8%|▊         | 128/1563 [38:41<6:05:10, 15.27s/it]

reward: tensor(54.5801, device='cuda:0') factor: tensor(0.0028, device='cuda:0')
reward: tensor(65.8373, device='cuda:0') factor: tensor(0.0036, device='cuda:0')
reward: tensor(21.2342, device='cuda:0') factor: tensor(0.0027, device='cuda:0')


Epoch 1, Step 129: Loss = 3190.3425, Actor Loss = -15.3851, Critic Loss = 6411.4551, Entropy = 0.0046, Returns = 44.3713, Value = 29.2234, mask_loss = 0.0027std_loss = 0.0252:   8%|▊         | 129/1563 [38:56<6:00:54, 15.10s/it]    

reward: tensor(27.8267, device='cuda:0') factor: tensor(0.0038, device='cuda:0')
reward: tensor(23.5124, device='cuda:0') factor: tensor(0.0018, device='cuda:0')
reward: tensor(23.0527, device='cuda:0') factor: tensor(0.0035, device='cuda:0')


Epoch 1, Step 130: Loss = 1014.0922, Actor Loss = 11.4580, Critic Loss = 2005.2683, Entropy = 0.0074, Returns = 18.3549, Value = 30.0350, mask_loss = 0.0047std_loss = 0.0397:   8%|▊         | 130/1563 [39:11<5:59:22, 15.05s/it] 

reward: tensor(20.6186, device='cuda:0') factor: tensor(0.0020, device='cuda:0')
reward: tensor(45.1202, device='cuda:0') factor: tensor(0.0029, device='cuda:0')
reward: tensor(42.0595, device='cuda:0') factor: tensor(0.0026, device='cuda:0')


Epoch 1, Step 131: Loss = 1059.8374, Actor Loss = 7.3774, Critic Loss = 2104.9199, Entropy = 0.0060, Returns = 21.9083, Value = 29.4417, mask_loss = 0.0032std_loss = 0.0278:   8%|▊         | 131/1563 [39:32<6:45:37, 17.00s/it] 

reward: tensor(37.5144, device='cuda:0') factor: tensor(0.0090, device='cuda:0')
reward: tensor(57.1961, device='cuda:0') factor: tensor(0.0065, device='cuda:0')
reward: tensor(38.0798, device='cuda:0') factor: tensor(0.0076, device='cuda:0')


Epoch 1, Step 132: Loss = 5345.2729, Actor Loss = -37.8460, Critic Loss = 10766.2383, Entropy = 0.0069, Returns = 68.5303, Value = 30.7674, mask_loss = 0.0055std_loss = 0.0561:   8%|▊         | 132/1563 [39:55<7:22:58, 18.57s/it]

reward: tensor(40.1079, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(39.5523, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(47.6587, device='cuda:0') factor: tensor(0.0060, device='cuda:0')


Epoch 1, Step 133: Loss = 2105.3359, Actor Loss = -12.8863, Critic Loss = 4236.4443, Entropy = 0.0103, Returns = 44.0951, Value = 31.3285, mask_loss = 0.0085std_loss = 0.0688:   9%|▊         | 133/1563 [40:09<6:52:03, 17.29s/it] 

reward: tensor(41.8994, device='cuda:0') factor: tensor(0.0097, device='cuda:0')
reward: tensor(52.0837, device='cuda:0') factor: tensor(0.0112, device='cuda:0')
reward: tensor(48.5260, device='cuda:0') factor: tensor(0.0106, device='cuda:0')


Epoch 1, Step 134: Loss = 1292.8280, Actor Loss = -26.9682, Critic Loss = 2639.5923, Entropy = 0.0113, Returns = 57.8870, Value = 31.0430, mask_loss = 0.0080std_loss = 0.0606:   9%|▊         | 134/1563 [40:24<6:33:15, 16.51s/it]

reward: tensor(46.2372, device='cuda:0') factor: tensor(0.0101, device='cuda:0')
reward: tensor(91.1913, device='cuda:0') factor: tensor(0.0142, device='cuda:0')
reward: tensor(72.9578, device='cuda:0') factor: tensor(0.0109, device='cuda:0')


Epoch 1, Step 135: Loss = 4903.3691, Actor Loss = -33.8954, Critic Loss = 9874.5293, Entropy = 0.0172, Returns = 64.7589, Value = 31.0816, mask_loss = 0.0176std_loss = 0.0937:   9%|▊         | 135/1563 [40:42<6:49:03, 17.19s/it]

reward: tensor(31.6684, device='cuda:0') factor: tensor(0.0089, device='cuda:0')
reward: tensor(23.4445, device='cuda:0') factor: tensor(0.0101, device='cuda:0')
reward: tensor(31.1338, device='cuda:0') factor: tensor(0.0091, device='cuda:0')


Epoch 1, Step 136: Loss = 658.0605, Actor Loss = -7.1056, Critic Loss = 1330.3322, Entropy = 0.0082, Returns = 38.3688, Value = 31.4332, mask_loss = 0.0083std_loss = 0.0671:   9%|▊         | 136/1563 [40:58<6:34:19, 16.58s/it]  

reward: tensor(65.4894, device='cuda:0') factor: tensor(0.0132, device='cuda:0')
reward: tensor(54.2898, device='cuda:0') factor: tensor(0.0120, device='cuda:0')
reward: tensor(47.3882, device='cuda:0') factor: tensor(0.0127, device='cuda:0')


Epoch 1, Step 137: Loss = 5705.5215, Actor Loss = -49.4558, Critic Loss = 11509.9551, Entropy = 0.0178, Returns = 81.1958, Value = 31.8029, mask_loss = 0.0176std_loss = 0.1054:   9%|▉         | 137/1563 [41:12<6:18:43, 15.93s/it]

reward: tensor(41.4209, device='cuda:0') factor: tensor(0.0134, device='cuda:0')
reward: tensor(42.0567, device='cuda:0') factor: tensor(0.0142, device='cuda:0')
reward: tensor(59.3281, device='cuda:0') factor: tensor(0.0138, device='cuda:0')


Epoch 1, Step 138: Loss = 2122.0110, Actor Loss = -13.1942, Critic Loss = 4270.4102, Entropy = 0.0158, Returns = 44.2093, Value = 31.1165, mask_loss = 0.0127std_loss = 0.0870:   9%|▉         | 138/1563 [41:31<6:39:52, 16.84s/it] 

reward: tensor(62.1887, device='cuda:0') factor: tensor(0.0123, device='cuda:0')
reward: tensor(65.4208, device='cuda:0') factor: tensor(0.0130, device='cuda:0')
reward: tensor(24.5739, device='cuda:0') factor: tensor(0.0134, device='cuda:0')


Epoch 1, Step 139: Loss = 6650.6895, Actor Loss = -18.8689, Critic Loss = 13339.1162, Entropy = 0.0113, Returns = 50.6035, Value = 31.8954, mask_loss = 0.0106std_loss = 0.0750:   9%|▉         | 139/1563 [41:45<6:22:02, 16.10s/it]

reward: tensor(47.2349, device='cuda:0') factor: tensor(0.0097, device='cuda:0')
reward: tensor(100.5529, device='cuda:0') factor: tensor(0.0125, device='cuda:0')
reward: tensor(70.2998, device='cuda:0') factor: tensor(0.0120, device='cuda:0')


Epoch 1, Step 140: Loss = 4374.8232, Actor Loss = -12.2784, Critic Loss = 8774.2031, Entropy = 0.0122, Returns = 42.6305, Value = 30.4700, mask_loss = 0.0091std_loss = 0.0610:   9%|▉         | 140/1563 [42:05<6:46:59, 17.16s/it] 

reward: tensor(55.8613, device='cuda:0') factor: tensor(0.0087, device='cuda:0')
reward: tensor(36.6758, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(37.8710, device='cuda:0') factor: tensor(0.0085, device='cuda:0')


Epoch 1, Step 141: Loss = 719.1068, Actor Loss = -4.0930, Critic Loss = 1446.3994, Entropy = 0.0054, Returns = 36.7075, Value = 32.7444, mask_loss = 0.0051std_loss = 0.0486:   9%|▉         | 141/1563 [42:17<6:11:26, 15.67s/it]  

reward: tensor(29.8806, device='cuda:0') factor: tensor(0.0030, device='cuda:0')
reward: tensor(41.3867, device='cuda:0') factor: tensor(0.0041, device='cuda:0')
reward: tensor(41.8801, device='cuda:0') factor: tensor(0.0042, device='cuda:0')


Epoch 1, Step 142: Loss = 6973.5410, Actor Loss = -53.9112, Critic Loss = 14054.9043, Entropy = 0.0059, Returns = 85.1471, Value = 31.3538, mask_loss = 0.0038std_loss = 0.0348:   9%|▉         | 142/1563 [42:37<6:43:51, 17.05s/it]

reward: tensor(71.4907, device='cuda:0') factor: tensor(0.0102, device='cuda:0')
reward: tensor(57.3022, device='cuda:0') factor: tensor(0.0102, device='cuda:0')
reward: tensor(35.7927, device='cuda:0') factor: tensor(0.0108, device='cuda:0')


Epoch 1, Step 143: Loss = 1753.2146, Actor Loss = -8.1427, Critic Loss = 3522.7146, Entropy = 0.0091, Returns = 41.0932, Value = 33.0773, mask_loss = 0.0084std_loss = 0.0630:   9%|▉         | 143/1563 [42:46<5:44:28, 14.56s/it]  

reward: tensor(23.0388, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(38.1933, device='cuda:0') factor: tensor(0.0058, device='cuda:0')
reward: tensor(63.4008, device='cuda:0') factor: tensor(0.0064, device='cuda:0')


Epoch 1, Step 144: Loss = 9061.2510, Actor Loss = -51.5187, Critic Loss = 18225.5391, Entropy = 0.0083, Returns = 83.6484, Value = 32.2496, mask_loss = 0.0074std_loss = 0.0493:   9%|▉         | 144/1563 [43:02<5:51:24, 14.86s/it]

reward: tensor(63.7690, device='cuda:0') factor: tensor(0.0044, device='cuda:0')
reward: tensor(34.9306, device='cuda:0') factor: tensor(0.0057, device='cuda:0')
reward: tensor(39.3605, device='cuda:0') factor: tensor(0.0052, device='cuda:0')


Epoch 1, Step 145: Loss = 1078.3083, Actor Loss = -9.4121, Critic Loss = 2175.4409, Entropy = 0.0078, Returns = 42.7318, Value = 33.4276, mask_loss = 0.0078std_loss = 0.0565:   9%|▉         | 145/1563 [43:13<5:28:24, 13.90s/it]  

reward: tensor(70.9205, device='cuda:0') factor: tensor(0.0055, device='cuda:0')
reward: tensor(85.8628, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(80.9114, device='cuda:0') factor: tensor(0.0060, device='cuda:0')


Epoch 1, Step 146: Loss = 13804.0293, Actor Loss = -74.8574, Critic Loss = 27757.7734, Entropy = 0.0078, Returns = 106.5838, Value = 31.8516, mask_loss = 0.0051std_loss = 0.0469:   9%|▉         | 146/1563 [43:32<6:03:58, 15.41s/it]

reward: tensor(51.6805, device='cuda:0') factor: tensor(0.0079, device='cuda:0')
reward: tensor(35.7589, device='cuda:0') factor: tensor(0.0065, device='cuda:0')
reward: tensor(41.8789, device='cuda:0') factor: tensor(0.0057, device='cuda:0')


Epoch 1, Step 147: Loss = 1625.1559, Actor Loss = -2.4652, Critic Loss = 3255.2422, Entropy = 0.0109, Returns = 34.8468, Value = 32.6881, mask_loss = 0.0083std_loss = 0.0588:   9%|▉         | 147/1563 [43:47<5:59:31, 15.23s/it]    

reward: tensor(34.5932, device='cuda:0') factor: tensor(0.0064, device='cuda:0')
reward: tensor(46.0333, device='cuda:0') factor: tensor(0.0068, device='cuda:0')
reward: tensor(29.1661, device='cuda:0') factor: tensor(0.0076, device='cuda:0')


Epoch 1, Step 148: Loss = 1050.6083, Actor Loss = 7.2587, Critic Loss = 2086.6992, Entropy = 0.0131, Returns = 26.3359, Value = 33.7063, mask_loss = 0.0118std_loss = 0.0741:   9%|▉         | 148/1563 [43:59<5:33:41, 14.15s/it] 

reward: tensor(33.7933, device='cuda:0') factor: tensor(0.0154, device='cuda:0')
reward: tensor(33.5272, device='cuda:0') factor: tensor(0.0167, device='cuda:0')
reward: tensor(50.3861, device='cuda:0') factor: tensor(0.0145, device='cuda:0')


Epoch 1, Step 149: Loss = 361.7118, Actor Loss = 4.1862, Critic Loss = 715.0511, Entropy = 0.0131, Returns = 28.1352, Value = 32.4147, mask_loss = 0.0172std_loss = 0.1029:  10%|▉         | 149/1563 [44:12<5:28:03, 13.92s/it]  

reward: tensor(54.9003, device='cuda:0') factor: tensor(0.0172, device='cuda:0')
reward: tensor(63.0516, device='cuda:0') factor: tensor(0.0174, device='cuda:0')
reward: tensor(60.6880, device='cuda:0') factor: tensor(0.0183, device='cuda:0')


Epoch 1, Step 150: Loss = 2885.0393, Actor Loss = -35.0057, Critic Loss = 5840.0898, Entropy = 0.0218, Returns = 68.8653, Value = 33.9660, mask_loss = 0.0228std_loss = 0.1112:  10%|▉         | 150/1563 [44:31<6:03:26, 15.43s/it]

reward: tensor(63.6072, device='cuda:0') factor: tensor(0.0159, device='cuda:0')
reward: tensor(37.3363, device='cuda:0') factor: tensor(0.0179, device='cuda:0')
reward: tensor(64.4272, device='cuda:0') factor: tensor(0.0163, device='cuda:0')


Epoch 1, Step 151: Loss = 599.3889, Actor Loss = 11.3541, Critic Loss = 1176.0696, Entropy = 0.0135, Returns = 22.5858, Value = 34.0408, mask_loss = 0.0204std_loss = 0.1169:  10%|▉         | 151/1563 [44:50<6:24:56, 16.36s/it]  

reward: tensor(91.3017, device='cuda:0') factor: tensor(0.0104, device='cuda:0')
reward: tensor(128.9163, device='cuda:0') factor: tensor(0.0104, device='cuda:0')
reward: tensor(105.8511, device='cuda:0') factor: tensor(0.0099, device='cuda:0')


Epoch 1, Step 152: Loss = 6359.6406, Actor Loss = -70.7352, Critic Loss = 12860.7520, Entropy = 0.0101, Returns = 104.9776, Value = 34.5014, mask_loss = 0.0085std_loss = 0.0720:  10%|▉         | 152/1563 [45:04<6:11:53, 15.81s/it]

reward: tensor(70.2686, device='cuda:0') factor: tensor(0.0152, device='cuda:0')
reward: tensor(74.8313, device='cuda:0') factor: tensor(0.0131, device='cuda:0')
reward: tensor(42.2849, device='cuda:0') factor: tensor(0.0143, device='cuda:0')


Epoch 1, Step 153: Loss = 342.3864, Actor Loss = 10.3157, Critic Loss = 664.1414, Entropy = 0.0120, Returns = 21.9907, Value = 32.4139, mask_loss = 0.0133std_loss = 0.0864:  10%|▉         | 153/1563 [45:19<6:05:37, 15.56s/it]     

reward: tensor(151.5913, device='cuda:0') factor: tensor(0.0077, device='cuda:0')
reward: tensor(165.2387, device='cuda:0') factor: tensor(0.0079, device='cuda:0')
reward: tensor(130.6505, device='cuda:0') factor: tensor(0.0076, device='cuda:0')


Epoch 1, Step 154: Loss = 12050.9521, Actor Loss = -74.3599, Critic Loss = 24250.6250, Entropy = 0.0089, Returns = 108.1714, Value = 33.9763, mask_loss = 0.0107std_loss = 0.0784:  10%|▉         | 154/1563 [45:41<6:48:31, 17.40s/it]

reward: tensor(35.6284, device='cuda:0') factor: tensor(0.0113, device='cuda:0')
reward: tensor(41.7609, device='cuda:0') factor: tensor(0.0118, device='cuda:0')
reward: tensor(52.4523, device='cuda:0') factor: tensor(0.0121, device='cuda:0')


Epoch 1, Step 155: Loss = 3789.9314, Actor Loss = -13.6361, Critic Loss = 7607.1353, Entropy = 0.0094, Returns = 47.0602, Value = 33.6491, mask_loss = 0.0103std_loss = 0.0706:  10%|▉         | 155/1563 [46:03<7:20:19, 18.76s/it]   

reward: tensor(53.9297, device='cuda:0') factor: tensor(0.0139, device='cuda:0')
reward: tensor(59.3080, device='cuda:0') factor: tensor(0.0133, device='cuda:0')
reward: tensor(53.2029, device='cuda:0') factor: tensor(0.0139, device='cuda:0')


Epoch 1, Step 156: Loss = 725.7376, Actor Loss = -1.8813, Critic Loss = 1455.2378, Entropy = 0.0112, Returns = 36.2224, Value = 34.4746, mask_loss = 0.0093std_loss = 0.0676:  10%|▉         | 156/1563 [46:18<6:52:32, 17.59s/it]  

reward: tensor(98.5429, device='cuda:0') factor: tensor(0.0069, device='cuda:0')
reward: tensor(73.6743, device='cuda:0') factor: tensor(0.0055, device='cuda:0')
reward: tensor(41.5421, device='cuda:0') factor: tensor(0.0050, device='cuda:0')


Epoch 1, Step 157: Loss = 5467.5303, Actor Loss = -45.5360, Critic Loss = 11026.1328, Entropy = 0.0103, Returns = 80.3841, Value = 34.9705, mask_loss = 0.0072std_loss = 0.0552:  10%|█         | 157/1563 [46:37<7:07:35, 18.25s/it]

reward: tensor(92.4551, device='cuda:0') factor: tensor(0.0143, device='cuda:0')
reward: tensor(125.2765, device='cuda:0') factor: tensor(0.0105, device='cuda:0')
reward: tensor(117.8900, device='cuda:0') factor: tensor(0.0122, device='cuda:0')


Epoch 1, Step 158: Loss = 9179.4277, Actor Loss = -78.2774, Critic Loss = 18515.4102, Entropy = 0.0149, Returns = 113.5356, Value = 35.4271, mask_loss = 0.0118std_loss = 0.0841:  10%|█         | 158/1563 [46:52<6:44:29, 17.27s/it]

reward: tensor(101.8706, device='cuda:0') factor: tensor(0.0133, device='cuda:0')
reward: tensor(53.9005, device='cuda:0') factor: tensor(0.0148, device='cuda:0')
reward: tensor(56.4032, device='cuda:0') factor: tensor(0.0144, device='cuda:0')


Epoch 1, Step 159: Loss = 2529.7485, Actor Loss = -3.5442, Critic Loss = 5066.5854, Entropy = 0.0164, Returns = 39.4825, Value = 36.2127, mask_loss = 0.0160std_loss = 0.0976:  10%|█         | 159/1563 [47:07<6:24:00, 16.41s/it]   

reward: tensor(88.4043, device='cuda:0') factor: tensor(0.0070, device='cuda:0')
reward: tensor(123.0772, device='cuda:0') factor: tensor(0.0080, device='cuda:0')
reward: tensor(74.1505, device='cuda:0') factor: tensor(0.0093, device='cuda:0')


Epoch 1, Step 160: Loss = 16363.6426, Actor Loss = -78.4806, Critic Loss = 32884.2461, Entropy = 0.0078, Returns = 113.9750, Value = 35.7534, mask_loss = 0.0049std_loss = 0.0472:  10%|█         | 160/1563 [47:22<6:14:33, 16.02s/it]

reward: tensor(77.4596, device='cuda:0') factor: tensor(0.0050, device='cuda:0')
reward: tensor(110.6201, device='cuda:0') factor: tensor(0.0065, device='cuda:0')
reward: tensor(143.8834, device='cuda:0') factor: tensor(0.0047, device='cuda:0')


Epoch 1, Step 161: Loss = 10438.0117, Actor Loss = -46.4518, Critic Loss = 20968.9277, Entropy = 0.0064, Returns = 81.4328, Value = 35.2484, mask_loss = 0.0053std_loss = 0.0415:  10%|█         | 161/1563 [47:37<6:08:06, 15.75s/it] 

reward: tensor(93.5300, device='cuda:0') factor: tensor(0.0048, device='cuda:0')
reward: tensor(99.3616, device='cuda:0') factor: tensor(0.0035, device='cuda:0')
reward: tensor(57.6526, device='cuda:0') factor: tensor(0.0038, device='cuda:0')


Epoch 1, Step 162: Loss = 5864.4927, Actor Loss = -19.8610, Critic Loss = 11768.7070, Entropy = 0.0074, Returns = 56.8973, Value = 37.2530, mask_loss = 0.0036std_loss = 0.0297:  10%|█         | 162/1563 [47:52<5:59:52, 15.41s/it] 

reward: tensor(82.5790, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(74.0707, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(82.7948, device='cuda:0') factor: tensor(0.0057, device='cuda:0')


Epoch 1, Step 163: Loss = 2272.7620, Actor Loss = -6.9205, Critic Loss = 4559.3647, Entropy = 0.0087, Returns = 43.2554, Value = 36.5200, mask_loss = 0.0064std_loss = 0.0466:  10%|█         | 163/1563 [48:11<6:23:37, 16.44s/it]  

reward: tensor(56.9667, device='cuda:0') factor: tensor(0.0096, device='cuda:0')
reward: tensor(56.1269, device='cuda:0') factor: tensor(0.0095, device='cuda:0')
reward: tensor(54.0989, device='cuda:0') factor: tensor(0.0084, device='cuda:0')


Epoch 1, Step 164: Loss = 2654.2288, Actor Loss = -28.8414, Critic Loss = 5366.1406, Entropy = 0.0075, Returns = 66.7170, Value = 38.0352, mask_loss = 0.0088std_loss = 0.0689:  10%|█         | 164/1563 [48:23<5:53:53, 15.18s/it]

reward: tensor(56.3726, device='cuda:0') factor: tensor(0.0038, device='cuda:0')
reward: tensor(77.9199, device='cuda:0') factor: tensor(0.0048, device='cuda:0')
reward: tensor(80.1853, device='cuda:0') factor: tensor(0.0042, device='cuda:0')


Epoch 1, Step 165: Loss = 2684.9673, Actor Loss = -0.9217, Critic Loss = 5371.7778, Entropy = 0.0061, Returns = 38.0939, Value = 37.3404, mask_loss = 0.0052std_loss = 0.0534:  11%|█         | 165/1563 [48:41<6:12:09, 15.97s/it] 

reward: tensor(79.0024, device='cuda:0') factor: tensor(0.0046, device='cuda:0')
reward: tensor(79.1948, device='cuda:0') factor: tensor(0.0065, device='cuda:0')
reward: tensor(51.6474, device='cuda:0') factor: tensor(0.0045, device='cuda:0')


Epoch 1, Step 166: Loss = 3793.7043, Actor Loss = -40.4607, Critic Loss = 7668.3301, Entropy = 0.0064, Returns = 78.1119, Value = 37.8036, mask_loss = 0.0047std_loss = 0.0431:  11%|█         | 166/1563 [49:01<6:44:19, 17.37s/it]

reward: tensor(30.3726, device='cuda:0') factor: tensor(0.0039, device='cuda:0')
reward: tensor(63.5369, device='cuda:0') factor: tensor(0.0043, device='cuda:0')
reward: tensor(48.9367, device='cuda:0') factor: tensor(0.0038, device='cuda:0')


Epoch 1, Step 167: Loss = 4542.4473, Actor Loss = -17.1660, Critic Loss = 9119.2266, Entropy = 0.0053, Returns = 54.3566, Value = 37.3700, mask_loss = 0.0027std_loss = 0.0229:  11%|█         | 167/1563 [49:23<7:14:48, 18.69s/it]

reward: tensor(43.8113, device='cuda:0') factor: tensor(0.0040, device='cuda:0')
reward: tensor(61.6663, device='cuda:0') factor: tensor(0.0033, device='cuda:0')
reward: tensor(23.1449, device='cuda:0') factor: tensor(0.0037, device='cuda:0')


Epoch 1, Step 168: Loss = 8521.4492, Actor Loss = -5.7951, Critic Loss = 17054.4883, Entropy = 0.0031, Returns = 42.5107, Value = 36.7916, mask_loss = 0.0022std_loss = 0.0261:  11%|█         | 168/1563 [49:44<7:27:09, 19.23s/it]

reward: tensor(75.7843, device='cuda:0') factor: tensor(0.0043, device='cuda:0')
reward: tensor(69.6861, device='cuda:0') factor: tensor(0.0048, device='cuda:0')
reward: tensor(58.4141, device='cuda:0') factor: tensor(0.0050, device='cuda:0')


Epoch 1, Step 169: Loss = 2937.0542, Actor Loss = -17.9437, Critic Loss = 5909.9956, Entropy = 0.0058, Returns = 56.3362, Value = 38.4874, mask_loss = 0.0057std_loss = 0.0513:  11%|█         | 169/1563 [49:59<6:58:01, 17.99s/it]

reward: tensor(92.8148, device='cuda:0') factor: tensor(0.0088, device='cuda:0')
reward: tensor(83.7106, device='cuda:0') factor: tensor(0.0106, device='cuda:0')
reward: tensor(72.4253, device='cuda:0') factor: tensor(0.0098, device='cuda:0')


Epoch 1, Step 170: Loss = 10970.7783, Actor Loss = -71.6492, Critic Loss = 22084.8555, Entropy = 0.0097, Returns = 110.2575, Value = 38.7979, mask_loss = 0.0076std_loss = 0.0526:  11%|█         | 170/1563 [50:20<7:19:53, 18.95s/it]

reward: tensor(43.7398, device='cuda:0') factor: tensor(0.0051, device='cuda:0')
reward: tensor(60.7132, device='cuda:0') factor: tensor(0.0041, device='cuda:0')
reward: tensor(87.6840, device='cuda:0') factor: tensor(0.0038, device='cuda:0')


Epoch 1, Step 171: Loss = 2455.6831, Actor Loss = -5.5202, Critic Loss = 4922.4067, Entropy = 0.0060, Returns = 43.4590, Value = 38.0724, mask_loss = 0.0037std_loss = 0.0321:  11%|█         | 171/1563 [50:39<7:21:28, 19.03s/it]    

reward: tensor(77.3428, device='cuda:0') factor: tensor(0.0071, device='cuda:0')
reward: tensor(79.6406, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(104.6122, device='cuda:0') factor: tensor(0.0076, device='cuda:0')


Epoch 1, Step 172: Loss = 3659.7476, Actor Loss = -43.3084, Critic Loss = 7406.1118, Entropy = 0.0081, Returns = 81.9770, Value = 38.8078, mask_loss = 0.0072std_loss = 0.0602:  11%|█         | 172/1563 [50:54<6:54:10, 17.87s/it]

reward: tensor(52.2949, device='cuda:0') factor: tensor(0.0047, device='cuda:0')
reward: tensor(86.8250, device='cuda:0') factor: tensor(0.0043, device='cuda:0')
reward: tensor(62.7100, device='cuda:0') factor: tensor(0.0031, device='cuda:0')


Epoch 1, Step 173: Loss = 1992.3048, Actor Loss = 1.7514, Critic Loss = 3981.1069, Entropy = 0.0074, Returns = 35.9904, Value = 37.9533, mask_loss = 0.0043std_loss = 0.0400:  11%|█         | 173/1563 [51:13<7:02:20, 18.23s/it]  

reward: tensor(150.3062, device='cuda:0') factor: tensor(0.0046, device='cuda:0')
reward: tensor(73.5164, device='cuda:0') factor: tensor(0.0037, device='cuda:0')
reward: tensor(109.8444, device='cuda:0') factor: tensor(0.0038, device='cuda:0')


Epoch 1, Step 174: Loss = 1311.7009, Actor Loss = 0.5478, Critic Loss = 2622.3062, Entropy = 0.0077, Returns = 35.9833, Value = 36.7238, mask_loss = 0.0045std_loss = 0.0365:  11%|█         | 174/1563 [51:34<7:16:48, 18.87s/it]

reward: tensor(49.2532, device='cuda:0') factor: tensor(0.0123, device='cuda:0')
reward: tensor(51.0038, device='cuda:0') factor: tensor(0.0094, device='cuda:0')
reward: tensor(49.0419, device='cuda:0') factor: tensor(0.0179, device='cuda:0')


Epoch 1, Step 175: Loss = 1809.0120, Actor Loss = -29.5403, Critic Loss = 3677.1045, Entropy = 0.0157, Returns = 67.8374, Value = 38.5567, mask_loss = 0.0138std_loss = 0.0816:  11%|█         | 175/1563 [51:49<6:49:39, 17.71s/it]

reward: tensor(35.7721, device='cuda:0') factor: tensor(0.0142, device='cuda:0')
reward: tensor(35.7349, device='cuda:0') factor: tensor(0.0151, device='cuda:0')
reward: tensor(89.2964, device='cuda:0') factor: tensor(0.0132, device='cuda:0')


Epoch 1, Step 176: Loss = 2924.4995, Actor Loss = -10.5909, Critic Loss = 5870.1807, Entropy = 0.0168, Returns = 50.1786, Value = 39.7286, mask_loss = 0.0163std_loss = 0.0965:  11%|█▏        | 176/1563 [52:04<6:30:44, 16.90s/it]

reward: tensor(59.0155, device='cuda:0') factor: tensor(0.0168, device='cuda:0')
reward: tensor(45.3513, device='cuda:0') factor: tensor(0.0182, device='cuda:0')
reward: tensor(28.1427, device='cuda:0') factor: tensor(0.0195, device='cuda:0')


Epoch 1, Step 177: Loss = 324.5026, Actor Loss = 21.3589, Critic Loss = 606.2875, Entropy = 0.0378, Returns = 17.2928, Value = 38.7939, mask_loss = 0.0327std_loss = 0.1367:  11%|█▏        | 177/1563 [52:19<6:17:22, 16.34s/it]   

reward: tensor(52.4185, device='cuda:0') factor: tensor(0.0129, device='cuda:0')
reward: tensor(56.5080, device='cuda:0') factor: tensor(0.0132, device='cuda:0')
reward: tensor(70.4785, device='cuda:0') factor: tensor(0.0141, device='cuda:0')


Epoch 1, Step 178: Loss = 2157.0769, Actor Loss = -30.6142, Critic Loss = 4375.3823, Entropy = 0.0152, Returns = 71.1292, Value = 40.5486, mask_loss = 0.0141std_loss = 0.0905:  11%|█▏        | 178/1563 [52:42<7:08:46, 18.57s/it]

reward: tensor(34.2031, device='cuda:0') factor: tensor(0.0209, device='cuda:0')
reward: tensor(38.7962, device='cuda:0') factor: tensor(0.0172, device='cuda:0')
reward: tensor(42.9949, device='cuda:0') factor: tensor(0.0197, device='cuda:0')


Epoch 1, Step 179: Loss = 498.2173, Actor Loss = 11.9277, Critic Loss = 972.5793, Entropy = 0.0278, Returns = 27.9681, Value = 40.0009, mask_loss = 0.0226std_loss = 0.1126:  11%|█▏        | 179/1563 [52:58<6:45:50, 17.59s/it]   

reward: tensor(74.6372, device='cuda:0') factor: tensor(0.0178, device='cuda:0')
reward: tensor(94.4171, device='cuda:0') factor: tensor(0.0150, device='cuda:0')
reward: tensor(40.6904, device='cuda:0') factor: tensor(0.0147, device='cuda:0')


Epoch 1, Step 180: Loss = 1635.5723, Actor Loss = 8.9233, Critic Loss = 3253.2979, Entropy = 0.0159, Returns = 30.2440, Value = 39.2939, mask_loss = 0.0151std_loss = 0.0871:  12%|█▏        | 180/1563 [53:15<6:40:00, 17.35s/it]

reward: tensor(57.8299, device='cuda:0') factor: tensor(0.0161, device='cuda:0')
reward: tensor(72.4578, device='cuda:0') factor: tensor(0.0185, device='cuda:0')
reward: tensor(70.1932, device='cuda:0') factor: tensor(0.0162, device='cuda:0')


Epoch 1, Step 181: Loss = 719.0237, Actor Loss = -14.9846, Critic Loss = 1468.0167, Entropy = 0.0178, Returns = 55.9870, Value = 41.1423, mask_loss = 0.0154std_loss = 0.0924:  12%|█▏        | 181/1563 [53:29<6:22:05, 16.59s/it]

reward: tensor(69.2751, device='cuda:0') factor: tensor(0.0135, device='cuda:0')
reward: tensor(55.9130, device='cuda:0') factor: tensor(0.0133, device='cuda:0')
reward: tensor(65.3471, device='cuda:0') factor: tensor(0.0128, device='cuda:0')


Epoch 1, Step 182: Loss = 1801.7334, Actor Loss = 9.1310, Critic Loss = 3585.2048, Entropy = 0.0183, Returns = 30.1996, Value = 39.4680, mask_loss = 0.0155std_loss = 0.0760:  12%|█▏        | 182/1563 [53:47<6:30:35, 16.97s/it] 

reward: tensor(80.6310, device='cuda:0') factor: tensor(0.0062, device='cuda:0')
reward: tensor(55.9972, device='cuda:0') factor: tensor(0.0053, device='cuda:0')
reward: tensor(53.3828, device='cuda:0') factor: tensor(0.0068, device='cuda:0')


Epoch 1, Step 183: Loss = 966.8034, Actor Loss = 8.4765, Critic Loss = 1916.6538, Entropy = 0.0079, Returns = 30.1026, Value = 38.5886, mask_loss = 0.0052std_loss = 0.0444:  12%|█▏        | 183/1563 [54:04<6:27:37, 16.85s/it] 

reward: tensor(48.0452, device='cuda:0') factor: tensor(0.0080, device='cuda:0')
reward: tensor(47.2690, device='cuda:0') factor: tensor(0.0073, device='cuda:0')
reward: tensor(84.8034, device='cuda:0') factor: tensor(0.0087, device='cuda:0')


Epoch 1, Step 184: Loss = 6610.7119, Actor Loss = -41.8378, Critic Loss = 13305.0996, Entropy = 0.0113, Returns = 83.1312, Value = 41.3644, mask_loss = 0.0094std_loss = 0.0660:  12%|█▏        | 184/1563 [54:19<6:15:06, 16.32s/it]

reward: tensor(66.4267, device='cuda:0') factor: tensor(0.0053, device='cuda:0')
reward: tensor(82.5149, device='cuda:0') factor: tensor(0.0089, device='cuda:0')
reward: tensor(73.4639, device='cuda:0') factor: tensor(0.0057, device='cuda:0')


Epoch 1, Step 185: Loss = 2019.0377, Actor Loss = -18.3219, Critic Loss = 4074.7192, Entropy = 0.0093, Returns = 58.1451, Value = 39.9053, mask_loss = 0.0062std_loss = 0.0552:  12%|█▏        | 185/1563 [54:34<6:03:48, 15.84s/it] 

reward: tensor(130.4172, device='cuda:0') factor: tensor(0.0038, device='cuda:0')
reward: tensor(67.7342, device='cuda:0') factor: tensor(0.0031, device='cuda:0')
reward: tensor(101.5043, device='cuda:0') factor: tensor(0.0041, device='cuda:0')


Epoch 1, Step 186: Loss = 39183.4961, Actor Loss = -165.1066, Critic Loss = 78697.2031, Entropy = 0.0073, Returns = 206.6761, Value = 41.6807, mask_loss = 0.0040std_loss = 0.0397:  12%|█▏        | 186/1563 [54:55<6:41:27, 17.49s/it]

reward: tensor(46.7883, device='cuda:0') factor: tensor(0.0089, device='cuda:0')
reward: tensor(51.2979, device='cuda:0') factor: tensor(0.0076, device='cuda:0')
reward: tensor(51.4216, device='cuda:0') factor: tensor(0.0086, device='cuda:0')


Epoch 1, Step 187: Loss = 2244.5247, Actor Loss = -13.3197, Critic Loss = 4515.6885, Entropy = 0.0068, Returns = 54.7413, Value = 41.5561, mask_loss = 0.0081std_loss = 0.0638:  12%|█▏        | 187/1563 [55:09<6:19:01, 16.53s/it]    

reward: tensor(43.1813, device='cuda:0') factor: tensor(0.0088, device='cuda:0')
reward: tensor(52.9934, device='cuda:0') factor: tensor(0.0086, device='cuda:0')
reward: tensor(80.0591, device='cuda:0') factor: tensor(0.0056, device='cuda:0')


Epoch 1, Step 188: Loss = 4935.9175, Actor Loss = -28.3607, Critic Loss = 9928.5566, Entropy = 0.0087, Returns = 70.0267, Value = 41.8582, mask_loss = 0.0076std_loss = 0.0616:  12%|█▏        | 188/1563 [55:24<6:07:32, 16.04s/it]

reward: tensor(73.6035, device='cuda:0') factor: tensor(0.0042, device='cuda:0')
reward: tensor(39.0545, device='cuda:0') factor: tensor(0.0042, device='cuda:0')
reward: tensor(60.4084, device='cuda:0') factor: tensor(0.0048, device='cuda:0')


Epoch 1, Step 189: Loss = 4040.3057, Actor Loss = -40.3874, Critic Loss = 8161.3862, Entropy = 0.0059, Returns = 82.3161, Value = 42.0334, mask_loss = 0.0048std_loss = 0.0529:  12%|█▏        | 189/1563 [55:39<5:56:32, 15.57s/it]

reward: tensor(80.9660, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(89.5309, device='cuda:0') factor: tensor(0.0044, device='cuda:0')
reward: tensor(56.5958, device='cuda:0') factor: tensor(0.0050, device='cuda:0')


Epoch 1, Step 190: Loss = 7307.5508, Actor Loss = -36.9314, Critic Loss = 14688.9639, Entropy = 0.0051, Returns = 79.2681, Value = 42.5016, mask_loss = 0.0033std_loss = 0.0341:  12%|█▏        | 190/1563 [55:53<5:50:05, 15.30s/it]

reward: tensor(94.0115, device='cuda:0') factor: tensor(0.0038, device='cuda:0')
reward: tensor(92.1509, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(116.5594, device='cuda:0') factor: tensor(0.0034, device='cuda:0')


Epoch 1, Step 191: Loss = 32623.3223, Actor Loss = -107.8575, Critic Loss = 65462.3594, Entropy = 0.0047, Returns = 149.5437, Value = 41.9181, mask_loss = 0.0038std_loss = 0.0430:  12%|█▏        | 191/1563 [56:09<5:49:45, 15.30s/it]

reward: tensor(46.4653, device='cuda:0') factor: tensor(0.0076, device='cuda:0')
reward: tensor(130.2875, device='cuda:0') factor: tensor(0.0077, device='cuda:0')
reward: tensor(114.9071, device='cuda:0') factor: tensor(0.0069, device='cuda:0')


Epoch 1, Step 192: Loss = 15480.0615, Actor Loss = -48.2303, Critic Loss = 31056.5840, Entropy = 0.0070, Returns = 90.5574, Value = 42.5566, mask_loss = 0.0070std_loss = 0.0548:  12%|█▏        | 192/1563 [56:24<5:48:30, 15.25s/it]  

reward: tensor(83.6698, device='cuda:0') factor: tensor(0.0114, device='cuda:0')
reward: tensor(50.2732, device='cuda:0') factor: tensor(0.0122, device='cuda:0')
reward: tensor(59.7901, device='cuda:0') factor: tensor(0.0110, device='cuda:0')


Epoch 1, Step 193: Loss = 15890.8350, Actor Loss = -76.8273, Critic Loss = 31935.3242, Entropy = 0.0111, Returns = 118.7625, Value = 41.7747, mask_loss = 0.0101std_loss = 0.0720:  12%|█▏        | 193/1563 [56:39<5:46:13, 15.16s/it]

reward: tensor(75.6814, device='cuda:0') factor: tensor(0.0061, device='cuda:0')
reward: tensor(89.2061, device='cuda:0') factor: tensor(0.0088, device='cuda:0')
reward: tensor(83.0883, device='cuda:0') factor: tensor(0.0103, device='cuda:0')


Epoch 1, Step 194: Loss = 3915.7126, Actor Loss = -28.5734, Critic Loss = 7888.5723, Entropy = 0.0096, Returns = 69.5334, Value = 41.1400, mask_loss = 0.0074std_loss = 0.0633:  12%|█▏        | 194/1563 [56:54<5:44:49, 15.11s/it]   

reward: tensor(94.0109, device='cuda:0') factor: tensor(0.0090, device='cuda:0')
reward: tensor(88.1014, device='cuda:0') factor: tensor(0.0084, device='cuda:0')
reward: tensor(89.7792, device='cuda:0') factor: tensor(0.0077, device='cuda:0')


Epoch 1, Step 195: Loss = 3355.8953, Actor Loss = -40.9008, Critic Loss = 6793.5923, Entropy = 0.0113, Returns = 82.0511, Value = 41.3702, mask_loss = 0.0070std_loss = 0.0581:  12%|█▏        | 195/1563 [57:09<5:45:07, 15.14s/it]

reward: tensor(123.5097, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(152.4418, device='cuda:0') factor: tensor(0.0078, device='cuda:0')
reward: tensor(135.5311, device='cuda:0') factor: tensor(0.0084, device='cuda:0')


Epoch 1, Step 196: Loss = 22172.9551, Actor Loss = -115.6548, Critic Loss = 44577.2188, Entropy = 0.0133, Returns = 158.9774, Value = 43.5808, mask_loss = 0.0094std_loss = 0.0615:  13%|█▎        | 196/1563 [57:24<5:44:06, 15.10s/it]

reward: tensor(139.6037, device='cuda:0') factor: tensor(0.0092, device='cuda:0')
reward: tensor(97.0845, device='cuda:0') factor: tensor(0.0083, device='cuda:0')
reward: tensor(89.9958, device='cuda:0') factor: tensor(0.0084, device='cuda:0')


Epoch 1, Step 197: Loss = 3355.2051, Actor Loss = -30.3148, Critic Loss = 6771.0396, Entropy = 0.0225, Returns = 74.3711, Value = 44.3928, mask_loss = 0.0167std_loss = 0.0894:  13%|█▎        | 197/1563 [57:39<5:41:33, 15.00s/it]    

reward: tensor(85.4907, device='cuda:0') factor: tensor(0.0121, device='cuda:0')
reward: tensor(91.0472, device='cuda:0') factor: tensor(0.0095, device='cuda:0')
reward: tensor(89.9272, device='cuda:0') factor: tensor(0.0124, device='cuda:0')


Epoch 1, Step 198: Loss = 7796.6543, Actor Loss = -46.1395, Critic Loss = 15685.5879, Entropy = 0.0112, Returns = 90.4860, Value = 44.6011, mask_loss = 0.0082std_loss = 0.0621:  13%|█▎        | 198/1563 [57:53<5:38:55, 14.90s/it]

reward: tensor(102.7935, device='cuda:0') factor: tensor(0.0106, device='cuda:0')
reward: tensor(52.5906, device='cuda:0') factor: tensor(0.0087, device='cuda:0')
reward: tensor(100.6006, device='cuda:0') factor: tensor(0.0096, device='cuda:0')


Epoch 1, Step 199: Loss = 1730.8228, Actor Loss = -13.1472, Critic Loss = 3487.9399, Entropy = 0.0100, Returns = 57.8797, Value = 45.0852, mask_loss = 0.0071std_loss = 0.0548:  13%|█▎        | 199/1563 [58:08<5:40:11, 14.96s/it] 

reward: tensor(32.8280, device='cuda:0') factor: tensor(0.0081, device='cuda:0')
reward: tensor(64.9739, device='cuda:0') factor: tensor(0.0091, device='cuda:0')
reward: tensor(49.1633, device='cuda:0') factor: tensor(0.0101, device='cuda:0')


Epoch 1, Step 200: Loss = 1776.5963, Actor Loss = -12.1596, Critic Loss = 3577.5117, Entropy = 0.0081, Returns = 57.2208, Value = 45.2277, mask_loss = 0.0070std_loss = 0.0592:  13%|█▎        | 200/1563 [58:23<5:35:40, 14.78s/it]

reward: tensor(58.6040, device='cuda:0') factor: tensor(0.0078, device='cuda:0')
reward: tensor(57.0205, device='cuda:0') factor: tensor(0.0077, device='cuda:0')
reward: tensor(68.4477, device='cuda:0') factor: tensor(0.0076, device='cuda:0')


Epoch 1, Step 201: Loss = 9090.3096, Actor Loss = -35.7384, Critic Loss = 18252.0957, Entropy = 0.0040, Returns = 78.4875, Value = 42.8833, mask_loss = 0.0040std_loss = 0.0465:  13%|█▎        | 201/1563 [58:45<6:27:11, 17.06s/it]

reward: tensor(86.9226, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(62.8382, device='cuda:0') factor: tensor(0.0079, device='cuda:0')
reward: tensor(118.2747, device='cuda:0') factor: tensor(0.0075, device='cuda:0')


Epoch 1, Step 202: Loss = 5794.0312, Actor Loss = -18.5931, Critic Loss = 11625.2490, Entropy = 0.0098, Returns = 62.0129, Value = 43.6059, mask_loss = 0.0089std_loss = 0.0701:  13%|█▎        | 202/1563 [59:00<6:13:27, 16.46s/it]

reward: tensor(64.1648, device='cuda:0') factor: tensor(0.0078, device='cuda:0')
reward: tensor(46.3274, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(98.2249, device='cuda:0') factor: tensor(0.0068, device='cuda:0')


Epoch 1, Step 203: Loss = 6145.3188, Actor Loss = -24.9506, Critic Loss = 12340.5391, Entropy = 0.0105, Returns = 70.3954, Value = 45.6673, mask_loss = 0.0085std_loss = 0.0616:  13%|█▎        | 203/1563 [59:22<6:50:49, 18.12s/it]

reward: tensor(80.4934, device='cuda:0') factor: tensor(0.0098, device='cuda:0')
reward: tensor(101.2155, device='cuda:0') factor: tensor(0.0107, device='cuda:0')
reward: tensor(63.8334, device='cuda:0') factor: tensor(0.0094, device='cuda:0')


Epoch 1, Step 204: Loss = 4428.8804, Actor Loss = -40.5640, Critic Loss = 8938.8887, Entropy = 0.0099, Returns = 86.7681, Value = 46.3492, mask_loss = 0.0107std_loss = 0.0847:  13%|█▎        | 204/1563 [59:37<6:26:10, 17.05s/it] 

reward: tensor(79.1577, device='cuda:0') factor: tensor(0.0133, device='cuda:0')
reward: tensor(99.6613, device='cuda:0') factor: tensor(0.0132, device='cuda:0')
reward: tensor(92.0810, device='cuda:0') factor: tensor(0.0117, device='cuda:0')


Epoch 1, Step 205: Loss = 652.5947, Actor Loss = -8.1653, Critic Loss = 1321.5200, Entropy = 0.0104, Returns = 54.5908, Value = 46.5908, mask_loss = 0.0139std_loss = 0.1011:  13%|█▎        | 205/1563 [59:55<6:30:56, 17.27s/it]  

reward: tensor(61.6538, device='cuda:0') factor: tensor(0.0128, device='cuda:0')
reward: tensor(37.4037, device='cuda:0') factor: tensor(0.0135, device='cuda:0')
reward: tensor(63.0470, device='cuda:0') factor: tensor(0.0149, device='cuda:0')


Epoch 1, Step 206: Loss = 656.5615, Actor Loss = 29.6775, Critic Loss = 1253.7681, Entropy = 0.0147, Returns = 13.7115, Value = 43.4760, mask_loss = 0.0146std_loss = 0.0883:  13%|█▎        | 206/1563 [1:00:12<6:30:13, 17.25s/it]

reward: tensor(134.5856, device='cuda:0') factor: tensor(0.0064, device='cuda:0')
reward: tensor(106.7132, device='cuda:0') factor: tensor(0.0054, device='cuda:0')
reward: tensor(114.4890, device='cuda:0') factor: tensor(0.0058, device='cuda:0')


Epoch 1, Step 207: Loss = 22507.9531, Actor Loss = -112.5002, Critic Loss = 45240.9062, Entropy = 0.0068, Returns = 158.3557, Value = 46.0680, mask_loss = 0.0048std_loss = 0.0501:  13%|█▎        | 207/1563 [1:00:24<5:58:39, 15.87s/it]

reward: tensor(103.6664, device='cuda:0') factor: tensor(0.0036, device='cuda:0')
reward: tensor(96.5546, device='cuda:0') factor: tensor(0.0034, device='cuda:0')
reward: tensor(66.6532, device='cuda:0') factor: tensor(0.0033, device='cuda:0')


Epoch 1, Step 208: Loss = 14351.2354, Actor Loss = -55.4680, Critic Loss = 28813.4062, Entropy = 0.0053, Returns = 101.4866, Value = 46.1847, mask_loss = 0.0029std_loss = 0.0293:  13%|█▎        | 208/1563 [1:00:40<5:55:01, 15.72s/it] 

reward: tensor(44.5272, device='cuda:0') factor: tensor(0.0033, device='cuda:0')
reward: tensor(45.7595, device='cuda:0') factor: tensor(0.0014, device='cuda:0')
reward: tensor(28.6276, device='cuda:0') factor: tensor(0.0033, device='cuda:0')


Epoch 1, Step 209: Loss = 3836.4915, Actor Loss = 5.3035, Critic Loss = 7662.3760, Entropy = 0.0034, Returns = 40.8160, Value = 46.2762, mask_loss = 0.0017std_loss = 0.0177:  13%|█▎        | 209/1563 [1:00:55<5:54:29, 15.71s/it]     

reward: tensor(78.0605, device='cuda:0') factor: tensor(0.0027, device='cuda:0')
reward: tensor(129.2593, device='cuda:0') factor: tensor(0.0036, device='cuda:0')
reward: tensor(69.5318, device='cuda:0') factor: tensor(0.0024, device='cuda:0')


Epoch 1, Step 210: Loss = 10155.5723, Actor Loss = -38.5478, Critic Loss = 20388.2402, Entropy = 0.0050, Returns = 85.3360, Value = 46.9602, mask_loss = 0.0035std_loss = 0.0379:  13%|█▎        | 210/1563 [1:01:14<6:12:06, 16.50s/it]

reward: tensor(76.7183, device='cuda:0') factor: tensor(0.0036, device='cuda:0')
reward: tensor(82.8315, device='cuda:0') factor: tensor(0.0037, device='cuda:0')
reward: tensor(47.1295, device='cuda:0') factor: tensor(0.0045, device='cuda:0')


Epoch 1, Step 211: Loss = 12161.9844, Actor Loss = -42.4820, Critic Loss = 24408.9336, Entropy = 0.0078, Returns = 88.3721, Value = 46.0696, mask_loss = 0.0061std_loss = 0.0540:  13%|█▎        | 211/1563 [1:01:29<6:05:43, 16.23s/it]

reward: tensor(73.2678, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(102.7234, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(95.6264, device='cuda:0') factor: tensor(0.0047, device='cuda:0')


Epoch 1, Step 212: Loss = 26078.5117, Actor Loss = -86.1625, Critic Loss = 52329.3477, Entropy = 0.0097, Returns = 133.9392, Value = 47.8792, mask_loss = 0.0074std_loss = 0.0611:  14%|█▎        | 212/1563 [1:01:44<5:54:11, 15.73s/it]

reward: tensor(100.0047, device='cuda:0') factor: tensor(0.0109, device='cuda:0')
reward: tensor(84.7270, device='cuda:0') factor: tensor(0.0116, device='cuda:0')
reward: tensor(93.5392, device='cuda:0') factor: tensor(0.0109, device='cuda:0')


Epoch 1, Step 213: Loss = 4180.2339, Actor Loss = -31.2816, Critic Loss = 8423.0312, Entropy = 0.0048, Returns = 79.1013, Value = 47.9723, mask_loss = 0.0074std_loss = 0.0653:  14%|█▎        | 213/1563 [1:01:59<5:47:12, 15.43s/it]   

reward: tensor(54.2329, device='cuda:0') factor: tensor(0.0132, device='cuda:0')
reward: tensor(78.4050, device='cuda:0') factor: tensor(0.0120, device='cuda:0')
reward: tensor(64.1266, device='cuda:0') factor: tensor(0.0131, device='cuda:0')


Epoch 1, Step 214: Loss = 3652.2803, Actor Loss = -35.9715, Critic Loss = 7376.5034, Entropy = 0.0070, Returns = 81.4944, Value = 45.6824, mask_loss = 0.0068std_loss = 0.0626:  14%|█▎        | 214/1563 [1:02:14<5:44:12, 15.31s/it]

reward: tensor(96.4219, device='cuda:0') factor: tensor(0.0087, device='cuda:0')
reward: tensor(67.4341, device='cuda:0') factor: tensor(0.0077, device='cuda:0')
reward: tensor(67.2263, device='cuda:0') factor: tensor(0.0068, device='cuda:0')


Epoch 1, Step 215: Loss = 8051.6475, Actor Loss = -29.7906, Critic Loss = 16162.8760, Entropy = 0.0044, Returns = 77.4665, Value = 47.8324, mask_loss = 0.0045std_loss = 0.0454:  14%|█▍        | 215/1563 [1:02:31<6:00:17, 16.04s/it]

reward: tensor(98.2731, device='cuda:0') factor: tensor(0.0054, device='cuda:0')
reward: tensor(137.8115, device='cuda:0') factor: tensor(0.0072, device='cuda:0')
reward: tensor(90.8034, device='cuda:0') factor: tensor(0.0076, device='cuda:0')


Epoch 1, Step 216: Loss = 16962.4277, Actor Loss = -99.1702, Critic Loss = 34123.1953, Entropy = 0.0086, Returns = 145.2980, Value = 46.2391, mask_loss = 0.0059std_loss = 0.0462:  14%|█▍        | 216/1563 [1:02:51<6:20:46, 16.96s/it]

reward: tensor(36.8290, device='cuda:0') factor: tensor(0.0091, device='cuda:0')
reward: tensor(74.0214, device='cuda:0') factor: tensor(0.0105, device='cuda:0')
reward: tensor(33.3560, device='cuda:0') factor: tensor(0.0083, device='cuda:0')


Epoch 1, Step 217: Loss = 3151.8447, Actor Loss = -14.7582, Critic Loss = 6333.2061, Entropy = 0.0141, Returns = 62.7626, Value = 48.1770, mask_loss = 0.0133std_loss = 0.0852:  14%|█▍        | 217/1563 [1:03:08<6:25:58, 17.21s/it]   

reward: tensor(118.3896, device='cuda:0') factor: tensor(0.0074, device='cuda:0')
reward: tensor(116.3719, device='cuda:0') factor: tensor(0.0070, device='cuda:0')
reward: tensor(132.8729, device='cuda:0') factor: tensor(0.0073, device='cuda:0')


Epoch 1, Step 218: Loss = 11163.6963, Actor Loss = -73.0495, Critic Loss = 22473.4922, Entropy = 0.0070, Returns = 120.7880, Value = 47.9133, mask_loss = 0.0082std_loss = 0.0760:  14%|█▍        | 218/1563 [1:03:23<6:11:44, 16.58s/it]

reward: tensor(86.3990, device='cuda:0') factor: tensor(0.0109, device='cuda:0')
reward: tensor(70.7427, device='cuda:0') factor: tensor(0.0093, device='cuda:0')
reward: tensor(75.8833, device='cuda:0') factor: tensor(0.0095, device='cuda:0')


Epoch 1, Step 219: Loss = 2736.8438, Actor Loss = -29.3300, Critic Loss = 5532.3477, Entropy = 0.0103, Returns = 78.0897, Value = 48.8835, mask_loss = 0.0114std_loss = 0.0786:  14%|█▍        | 219/1563 [1:03:43<6:33:43, 17.58s/it]   

reward: tensor(143.4128, device='cuda:0') factor: tensor(0.0137, device='cuda:0')
reward: tensor(188.5459, device='cuda:0') factor: tensor(0.0134, device='cuda:0')
reward: tensor(92.4703, device='cuda:0') factor: tensor(0.0108, device='cuda:0')


Epoch 1, Step 220: Loss = 34692.2305, Actor Loss = -95.1166, Critic Loss = 69574.6953, Entropy = 0.0136, Returns = 143.4269, Value = 48.5577, mask_loss = 0.0146std_loss = 0.0954:  14%|█▍        | 220/1563 [1:04:02<6:43:13, 18.01s/it]

reward: tensor(96.1774, device='cuda:0') factor: tensor(0.0056, device='cuda:0')
reward: tensor(115.9359, device='cuda:0') factor: tensor(0.0043, device='cuda:0')
reward: tensor(161.0503, device='cuda:0') factor: tensor(0.0069, device='cuda:0')


Epoch 1, Step 221: Loss = 24216.8105, Actor Loss = -111.2792, Critic Loss = 48656.1797, Entropy = 0.0042, Returns = 158.5782, Value = 47.4013, mask_loss = 0.0025std_loss = 0.0334:  14%|█▍        | 221/1563 [1:04:18<6:25:20, 17.23s/it]

reward: tensor(70.7537, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(98.2953, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(84.0973, device='cuda:0') factor: tensor(0.0050, device='cuda:0')


Epoch 1, Step 222: Loss = 4265.6177, Actor Loss = -16.3046, Critic Loss = 8563.8447, Entropy = 0.0053, Returns = 62.5410, Value = 46.4774, mask_loss = 0.0054std_loss = 0.0544:  14%|█▍        | 222/1563 [1:04:34<6:20:16, 17.01s/it]    

reward: tensor(64.2742, device='cuda:0') factor: tensor(0.0056, device='cuda:0')
reward: tensor(65.0038, device='cuda:0') factor: tensor(0.0050, device='cuda:0')
reward: tensor(67.4682, device='cuda:0') factor: tensor(0.0064, device='cuda:0')


Epoch 1, Step 223: Loss = 1849.4899, Actor Loss = 7.2582, Critic Loss = 3684.4634, Entropy = 0.0062, Returns = 42.6830, Value = 50.2056, mask_loss = 0.0066std_loss = 0.0589:  14%|█▍        | 223/1563 [1:04:49<6:07:33, 16.46s/it]  

reward: tensor(95.8633, device='cuda:0') factor: tensor(0.0076, device='cuda:0')
reward: tensor(91.2188, device='cuda:0') factor: tensor(0.0105, device='cuda:0')
reward: tensor(90.3785, device='cuda:0') factor: tensor(0.0084, device='cuda:0')


Epoch 1, Step 224: Loss = 3563.5750, Actor Loss = -32.3232, Critic Loss = 7191.7964, Entropy = 0.0104, Returns = 81.8198, Value = 49.6199, mask_loss = 0.0121std_loss = 0.0841:  14%|█▍        | 224/1563 [1:05:07<6:16:26, 16.87s/it]

reward: tensor(157.7760, device='cuda:0') factor: tensor(0.0101, device='cuda:0')
reward: tensor(137.5857, device='cuda:0') factor: tensor(0.0072, device='cuda:0')
reward: tensor(148.6474, device='cuda:0') factor: tensor(0.0108, device='cuda:0')


Epoch 1, Step 225: Loss = 21236.3242, Actor Loss = -107.3663, Critic Loss = 42687.3828, Entropy = 0.0056, Returns = 157.4497, Value = 50.2991, mask_loss = 0.0081std_loss = 0.0601:  14%|█▍        | 225/1563 [1:05:22<6:04:23, 16.34s/it]

reward: tensor(143.8605, device='cuda:0') factor: tensor(0.0046, device='cuda:0')
reward: tensor(132.5665, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(161.0176, device='cuda:0') factor: tensor(0.0048, device='cuda:0')


Epoch 1, Step 226: Loss = 18709.8223, Actor Loss = -101.5533, Critic Loss = 37622.7500, Entropy = 0.0036, Returns = 150.7152, Value = 49.4021, mask_loss = 0.0041std_loss = 0.0467:  14%|█▍        | 226/1563 [1:05:41<6:21:53, 17.14s/it]

reward: tensor(58.8200, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(71.6484, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(82.8972, device='cuda:0') factor: tensor(0.0040, device='cuda:0')


Epoch 1, Step 227: Loss = 4463.5859, Actor Loss = -37.0148, Critic Loss = 9001.2012, Entropy = 0.0026, Returns = 87.6760, Value = 50.8630, mask_loss = 0.0039std_loss = 0.0447:  15%|█▍        | 227/1563 [1:05:59<6:26:30, 17.36s/it]    

reward: tensor(118.7088, device='cuda:0') factor: tensor(0.0039, device='cuda:0')
reward: tensor(111.9095, device='cuda:0') factor: tensor(0.0038, device='cuda:0')
reward: tensor(117.1044, device='cuda:0') factor: tensor(0.0047, device='cuda:0')


Epoch 1, Step 228: Loss = 3315.0820, Actor Loss = 1.2224, Critic Loss = 6627.7192, Entropy = 0.0036, Returns = 45.6923, Value = 47.1074, mask_loss = 0.0053std_loss = 0.0484:  15%|█▍        | 228/1563 [1:06:15<6:15:34, 16.88s/it]  

reward: tensor(77.9829, device='cuda:0') factor: tensor(0.0034, device='cuda:0')
reward: tensor(79.1348, device='cuda:0') factor: tensor(0.0042, device='cuda:0')
reward: tensor(102.9312, device='cuda:0') factor: tensor(0.0044, device='cuda:0')


Epoch 1, Step 229: Loss = 6252.5483, Actor Loss = -40.1056, Critic Loss = 12585.3076, Entropy = 0.0035, Returns = 89.7676, Value = 49.8252, mask_loss = 0.0047std_loss = 0.0512:  15%|█▍        | 229/1563 [1:06:34<6:28:35, 17.48s/it]

reward: tensor(96.2255, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(120.2062, device='cuda:0') factor: tensor(0.0043, device='cuda:0')
reward: tensor(106.8009, device='cuda:0') factor: tensor(0.0053, device='cuda:0')


Epoch 1, Step 230: Loss = 15445.5410, Actor Loss = -97.0506, Critic Loss = 31085.1836, Entropy = 0.0032, Returns = 149.3348, Value = 52.4875, mask_loss = 0.0048std_loss = 0.0501:  15%|█▍        | 230/1563 [1:06:52<6:33:27, 17.71s/it]

reward: tensor(71.2048, device='cuda:0') factor: tensor(0.0052, device='cuda:0')
reward: tensor(92.9057, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(86.9137, device='cuda:0') factor: tensor(0.0045, device='cuda:0')


Epoch 1, Step 231: Loss = 7151.2759, Actor Loss = -40.4495, Critic Loss = 14383.4512, Entropy = 0.0038, Returns = 93.1595, Value = 52.8709, mask_loss = 0.0107std_loss = 0.0903:  15%|█▍        | 231/1563 [1:07:08<6:20:58, 17.16s/it]  

reward: tensor(146.9108, device='cuda:0') factor: tensor(0.0087, device='cuda:0')
reward: tensor(155.3289, device='cuda:0') factor: tensor(0.0061, device='cuda:0')
reward: tensor(144.3095, device='cuda:0') factor: tensor(0.0086, device='cuda:0')


Epoch 1, Step 232: Loss = 15314.2354, Actor Loss = -82.0964, Critic Loss = 30792.6641, Entropy = 0.0033, Returns = 133.9799, Value = 51.8233, mask_loss = 0.0045std_loss = 0.0512:  15%|█▍        | 232/1563 [1:07:23<6:06:14, 16.51s/it]

reward: tensor(73.6604, device='cuda:0') factor: tensor(0.0117, device='cuda:0')
reward: tensor(61.3994, device='cuda:0') factor: tensor(0.0121, device='cuda:0')
reward: tensor(70.9769, device='cuda:0') factor: tensor(0.0108, device='cuda:0')


Epoch 1, Step 233: Loss = 4535.6548, Actor Loss = -50.7499, Critic Loss = 9172.8096, Entropy = 0.0024, Returns = 102.4277, Value = 51.9089, mask_loss = 0.0069std_loss = 0.0712:  15%|█▍        | 233/1563 [1:07:38<5:56:47, 16.10s/it]  

reward: tensor(54.8538, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(103.0018, device='cuda:0') factor: tensor(0.0074, device='cuda:0')
reward: tensor(53.2716, device='cuda:0') factor: tensor(0.0063, device='cuda:0')


Epoch 1, Step 234: Loss = 3858.7554, Actor Loss = -28.8949, Critic Loss = 7775.3008, Entropy = 0.0048, Returns = 81.3934, Value = 52.7400, mask_loss = 0.0068std_loss = 0.0662:  15%|█▍        | 234/1563 [1:07:53<5:49:36, 15.78s/it] 

reward: tensor(86.7096, device='cuda:0') factor: tensor(0.0070, device='cuda:0')
reward: tensor(82.0887, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(93.1495, device='cuda:0') factor: tensor(0.0073, device='cuda:0')


Epoch 1, Step 235: Loss = 9497.5176, Actor Loss = -19.4370, Critic Loss = 19033.9102, Entropy = 0.0023, Returns = 69.5932, Value = 50.3259, mask_loss = 0.0030std_loss = 0.0379:  15%|█▌        | 235/1563 [1:08:12<6:08:57, 16.67s/it]

reward: tensor(35.0435, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(44.4660, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(34.5220, device='cuda:0') factor: tensor(0.0052, device='cuda:0')


Epoch 1, Step 236: Loss = 1395.0740, Actor Loss = 21.2264, Critic Loss = 2747.6953, Entropy = 0.0045, Returns = 32.5420, Value = 54.0606, mask_loss = 0.0083std_loss = 0.0803:  15%|█▌        | 236/1563 [1:08:27<5:56:50, 16.13s/it]  

reward: tensor(67.1419, device='cuda:0') factor: tensor(0.0080, device='cuda:0')
reward: tensor(66.4880, device='cuda:0') factor: tensor(0.0087, device='cuda:0')
reward: tensor(71.7075, device='cuda:0') factor: tensor(0.0084, device='cuda:0')


Epoch 1, Step 237: Loss = 4663.6709, Actor Loss = -57.2072, Critic Loss = 9441.7559, Entropy = 0.0063, Returns = 110.8595, Value = 53.5202, mask_loss = 0.0157std_loss = 0.1081:  15%|█▌        | 237/1563 [1:08:42<5:49:04, 15.80s/it]

reward: tensor(73.4720, device='cuda:0') factor: tensor(0.0168, device='cuda:0')
reward: tensor(72.6373, device='cuda:0') factor: tensor(0.0178, device='cuda:0')
reward: tensor(69.2841, device='cuda:0') factor: tensor(0.0173, device='cuda:0')


Epoch 1, Step 238: Loss = 23589.3848, Actor Loss = -84.2877, Critic Loss = 47347.3438, Entropy = 0.0075, Returns = 136.7627, Value = 52.6283, mask_loss = 0.0146std_loss = 0.0994:  15%|█▌        | 238/1563 [1:08:57<5:45:10, 15.63s/it]

reward: tensor(57.1506, device='cuda:0') factor: tensor(0.0232, device='cuda:0')
reward: tensor(59.9128, device='cuda:0') factor: tensor(0.0226, device='cuda:0')
reward: tensor(57.2275, device='cuda:0') factor: tensor(0.0220, device='cuda:0')


Epoch 1, Step 239: Loss = 4498.0542, Actor Loss = -36.4125, Critic Loss = 9068.9336, Entropy = 0.0037, Returns = 89.0029, Value = 52.7832, mask_loss = 0.0112std_loss = 0.0909:  15%|█▌        | 239/1563 [1:09:15<6:00:35, 16.34s/it]   

reward: tensor(105.7233, device='cuda:0') factor: tensor(0.0109, device='cuda:0')
reward: tensor(116.1308, device='cuda:0') factor: tensor(0.0120, device='cuda:0')
reward: tensor(112.3452, device='cuda:0') factor: tensor(0.0111, device='cuda:0')


Epoch 1, Step 240: Loss = 7589.6226, Actor Loss = -16.3390, Critic Loss = 15211.9229, Entropy = 0.0059, Returns = 70.3929, Value = 54.1756, mask_loss = 0.0068std_loss = 0.0635:  15%|█▌        | 240/1563 [1:09:33<6:11:42, 16.86s/it]

reward: tensor(80.3815, device='cuda:0') factor: tensor(0.0052, device='cuda:0')
reward: tensor(65.2394, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(71.8061, device='cuda:0') factor: tensor(0.0069, device='cuda:0')


Epoch 1, Step 241: Loss = 3085.3054, Actor Loss = 20.5980, Critic Loss = 6129.4150, Entropy = 0.0033, Returns = 31.5472, Value = 52.2712, mask_loss = 0.0053std_loss = 0.0503:  15%|█▌        | 241/1563 [1:09:49<6:02:19, 16.44s/it]  

reward: tensor(88.0496, device='cuda:0') factor: tensor(0.0062, device='cuda:0')
reward: tensor(74.6351, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(73.6214, device='cuda:0') factor: tensor(0.0065, device='cuda:0')


Epoch 1, Step 242: Loss = 2712.0623, Actor Loss = -20.3382, Critic Loss = 5464.8008, Entropy = 0.0072, Returns = 71.0375, Value = 50.7953, mask_loss = 0.0072std_loss = 0.0605:  15%|█▌        | 242/1563 [1:10:04<5:53:07, 16.04s/it]

reward: tensor(72.8317, device='cuda:0') factor: tensor(0.0060, device='cuda:0')
reward: tensor(80.3346, device='cuda:0') factor: tensor(0.0048, device='cuda:0')
reward: tensor(61.6159, device='cuda:0') factor: tensor(0.0040, device='cuda:0')


Epoch 1, Step 243: Loss = 1320.4578, Actor Loss = 7.6579, Critic Loss = 2625.5999, Entropy = 0.0043, Returns = 45.7958, Value = 53.5035, mask_loss = 0.0061std_loss = 0.0573:  16%|█▌        | 243/1563 [1:10:19<5:47:52, 15.81s/it]  

reward: tensor(84.0241, device='cuda:0') factor: tensor(0.0033, device='cuda:0')
reward: tensor(107.8310, device='cuda:0') factor: tensor(0.0037, device='cuda:0')
reward: tensor(136.7772, device='cuda:0') factor: tensor(0.0056, device='cuda:0')


Epoch 1, Step 244: Loss = 23692.7520, Actor Loss = -115.9461, Critic Loss = 47617.3945, Entropy = 0.0039, Returns = 170.9010, Value = 55.0377, mask_loss = 0.0039std_loss = 0.0416:  16%|█▌        | 244/1563 [1:10:33<5:38:05, 15.38s/it]

reward: tensor(111.0891, device='cuda:0') factor: tensor(0.0031, device='cuda:0')
reward: tensor(64.9080, device='cuda:0') factor: tensor(0.0029, device='cuda:0')
reward: tensor(53.9444, device='cuda:0') factor: tensor(0.0011, device='cuda:0')


Epoch 1, Step 245: Loss = 4197.2876, Actor Loss = 19.7761, Critic Loss = 8355.0225, Entropy = 0.0028, Returns = 30.9734, Value = 50.8512, mask_loss = 0.0017std_loss = 0.0216:  16%|█▌        | 245/1563 [1:10:50<5:46:05, 15.76s/it]     

reward: tensor(56.4887, device='cuda:0') factor: tensor(0.0034, device='cuda:0')
reward: tensor(52.5946, device='cuda:0') factor: tensor(0.0026, device='cuda:0')
reward: tensor(54.4955, device='cuda:0') factor: tensor(0.0041, device='cuda:0')


Epoch 1, Step 246: Loss = 13265.1855, Actor Loss = -30.5102, Critic Loss = 26591.3906, Entropy = 0.0043, Returns = 85.3947, Value = 54.9625, mask_loss = 0.0034std_loss = 0.0296:  16%|█▌        | 246/1563 [1:11:11<6:23:05, 17.45s/it]

reward: tensor(100.3226, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(95.9645, device='cuda:0') factor: tensor(0.0054, device='cuda:0')
reward: tensor(107.1868, device='cuda:0') factor: tensor(0.0070, device='cuda:0')


Epoch 1, Step 247: Loss = 8632.6162, Actor Loss = -22.5600, Critic Loss = 17310.3516, Entropy = 0.0075, Returns = 78.0472, Value = 55.7314, mask_loss = 0.0076std_loss = 0.0652:  16%|█▌        | 247/1563 [1:11:26<6:03:41, 16.58s/it] 

reward: tensor(162.6608, device='cuda:0') factor: tensor(0.0040, device='cuda:0')
reward: tensor(107.0341, device='cuda:0') factor: tensor(0.0041, device='cuda:0')
reward: tensor(183.1589, device='cuda:0') factor: tensor(0.0036, device='cuda:0')


Epoch 1, Step 248: Loss = 32526.8613, Actor Loss = -121.4287, Critic Loss = 65296.5781, Entropy = 0.0037, Returns = 176.2198, Value = 54.9970, mask_loss = 0.0045std_loss = 0.0524:  16%|█▌        | 248/1563 [1:11:41<5:52:08, 16.07s/it]

reward: tensor(57.2993, device='cuda:0') factor: tensor(0.0131, device='cuda:0')
reward: tensor(63.2226, device='cuda:0') factor: tensor(0.0135, device='cuda:0')
reward: tensor(57.5319, device='cuda:0') factor: tensor(0.0143, device='cuda:0')


Epoch 1, Step 249: Loss = 1509.7825, Actor Loss = 10.4704, Critic Loss = 2998.6240, Entropy = 0.0073, Returns = 44.8230, Value = 55.4746, mask_loss = 0.0128std_loss = 0.0934:  16%|█▌        | 249/1563 [1:11:55<5:37:12, 15.40s/it]     

reward: tensor(110.7880, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(82.3438, device='cuda:0') factor: tensor(0.0089, device='cuda:0')
reward: tensor(77.2268, device='cuda:0') factor: tensor(0.0098, device='cuda:0')


Epoch 1, Step 250: Loss = 4498.9805, Actor Loss = -33.6193, Critic Loss = 9065.1992, Entropy = 0.0051, Returns = 87.7624, Value = 54.2521, mask_loss = 0.0095std_loss = 0.0841:  16%|█▌        | 250/1563 [1:12:10<5:36:06, 15.36s/it]

reward: tensor(43.0569, device='cuda:0') factor: tensor(0.0181, device='cuda:0')
reward: tensor(79.0687, device='cuda:0') factor: tensor(0.0179, device='cuda:0')
reward: tensor(53.9981, device='cuda:0') factor: tensor(0.0171, device='cuda:0')


Epoch 1, Step 251: Loss = 1206.9243, Actor Loss = 21.1801, Critic Loss = 2371.4885, Entropy = 0.0068, Returns = 31.1606, Value = 52.4826, mask_loss = 0.0131std_loss = 0.0955:  16%|█▌        | 251/1563 [1:12:30<6:05:32, 16.72s/it] 

reward: tensor(66.1402, device='cuda:0') factor: tensor(0.0079, device='cuda:0')
reward: tensor(66.4582, device='cuda:0') factor: tensor(0.0094, device='cuda:0')
reward: tensor(63.2443, device='cuda:0') factor: tensor(0.0099, device='cuda:0')


Epoch 1, Step 252: Loss = 2303.7097, Actor Loss = -22.6092, Critic Loss = 4652.6377, Entropy = 0.0034, Returns = 75.9641, Value = 53.3846, mask_loss = 0.0072std_loss = 0.0745:  16%|█▌        | 252/1563 [1:12:47<6:05:11, 16.71s/it]

reward: tensor(74.8911, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(69.4391, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(72.9164, device='cuda:0') factor: tensor(0.0077, device='cuda:0')


Epoch 1, Step 253: Loss = 6923.1401, Actor Loss = -5.5696, Critic Loss = 13857.4199, Entropy = 0.0037, Returns = 59.6022, Value = 54.1417, mask_loss = 0.0090std_loss = 0.0735:  16%|█▌        | 253/1563 [1:13:06<6:24:35, 17.61s/it]

reward: tensor(51.2441, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(85.6719, device='cuda:0') factor: tensor(0.0050, device='cuda:0')
reward: tensor(106.6585, device='cuda:0') factor: tensor(0.0052, device='cuda:0')


Epoch 1, Step 254: Loss = 5431.0112, Actor Loss = -32.9938, Critic Loss = 10928.0098, Entropy = 0.0032, Returns = 89.4317, Value = 56.4848, mask_loss = 0.0041std_loss = 0.0425:  16%|█▋        | 254/1563 [1:13:28<6:52:13, 18.89s/it]

reward: tensor(83.9570, device='cuda:0') factor: tensor(0.0051, device='cuda:0')
reward: tensor(93.8090, device='cuda:0') factor: tensor(0.0047, device='cuda:0')
reward: tensor(92.3144, device='cuda:0') factor: tensor(0.0050, device='cuda:0')


Epoch 1, Step 255: Loss = 15742.7217, Actor Loss = -113.4330, Critic Loss = 31712.3086, Entropy = 0.0021, Returns = 168.3379, Value = 54.9606, mask_loss = 0.0039std_loss = 0.0524:  16%|█▋        | 255/1563 [1:13:47<6:51:04, 18.86s/it]

reward: tensor(101.6649, device='cuda:0') factor: tensor(0.0121, device='cuda:0')
reward: tensor(112.7881, device='cuda:0') factor: tensor(0.0101, device='cuda:0')
reward: tensor(122.5792, device='cuda:0') factor: tensor(0.0103, device='cuda:0')


Epoch 1, Step 256: Loss = 8588.8281, Actor Loss = -64.5626, Critic Loss = 17306.7812, Entropy = 0.0056, Returns = 120.9762, Value = 56.5375, mask_loss = 0.0087std_loss = 0.0696:  16%|█▋        | 256/1563 [1:14:00<6:15:44, 17.25s/it]  

reward: tensor(139.8031, device='cuda:0') factor: tensor(0.0086, device='cuda:0')
reward: tensor(154.1768, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(130.9963, device='cuda:0') factor: tensor(0.0082, device='cuda:0')


Epoch 1, Step 257: Loss = 6596.5513, Actor Loss = -55.2337, Critic Loss = 13303.5703, Entropy = 0.0053, Returns = 111.5643, Value = 56.4910, mask_loss = 0.0088std_loss = 0.0740:  16%|█▋        | 257/1563 [1:14:15<5:59:50, 16.53s/it]

reward: tensor(78.9496, device='cuda:0') factor: tensor(0.0081, device='cuda:0')
reward: tensor(76.6307, device='cuda:0') factor: tensor(0.0071, device='cuda:0')
reward: tensor(70.5698, device='cuda:0') factor: tensor(0.0098, device='cuda:0')


Epoch 1, Step 258: Loss = 3277.0706, Actor Loss = 16.5951, Critic Loss = 6520.9507, Entropy = 0.0051, Returns = 37.2184, Value = 54.0688, mask_loss = 0.0078std_loss = 0.0722:  17%|█▋        | 258/1563 [1:14:28<5:37:40, 15.53s/it]   

reward: tensor(79.5129, device='cuda:0') factor: tensor(0.0056, device='cuda:0')
reward: tensor(97.6363, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(80.3082, device='cuda:0') factor: tensor(0.0053, device='cuda:0')


Epoch 1, Step 259: Loss = 3793.5464, Actor Loss = -39.9051, Critic Loss = 7666.9028, Entropy = 0.0046, Returns = 94.1440, Value = 54.3570, mask_loss = 0.0045std_loss = 0.0509:  17%|█▋        | 259/1563 [1:14:49<6:10:35, 17.05s/it]

reward: tensor(110.1832, device='cuda:0') factor: tensor(0.0052, device='cuda:0')
reward: tensor(125.9806, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(122.6432, device='cuda:0') factor: tensor(0.0061, device='cuda:0')


Epoch 1, Step 260: Loss = 38403.5273, Actor Loss = -138.2658, Critic Loss = 77083.5859, Entropy = 0.0054, Returns = 195.6171, Value = 57.4788, mask_loss = 0.0036std_loss = 0.0386:  17%|█▋        | 260/1563 [1:15:04<5:57:32, 16.46s/it]

reward: tensor(101.1002, device='cuda:0') factor: tensor(0.0047, device='cuda:0')
reward: tensor(150.3018, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(93.4250, device='cuda:0') factor: tensor(0.0024, device='cuda:0')


Epoch 1, Step 261: Loss = 10020.3848, Actor Loss = -55.9649, Critic Loss = 20152.6992, Entropy = 0.0042, Returns = 110.2822, Value = 54.5477, mask_loss = 0.0036std_loss = 0.0477:  17%|█▋        | 261/1563 [1:15:25<6:26:52, 17.83s/it] 

reward: tensor(188.7177, device='cuda:0') factor: tensor(0.0035, device='cuda:0')
reward: tensor(184.0857, device='cuda:0') factor: tensor(0.0037, device='cuda:0')
reward: tensor(171.2722, device='cuda:0') factor: tensor(0.0025, device='cuda:0')


Epoch 1, Step 262: Loss = 55155.2422, Actor Loss = -107.1267, Critic Loss = 110524.7344, Entropy = 0.0041, Returns = 163.7386, Value = 56.8197, mask_loss = 0.0033std_loss = 0.0320:  17%|█▋        | 262/1563 [1:15:48<7:00:01, 19.37s/it]

reward: tensor(93.2342, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(97.1844, device='cuda:0') factor: tensor(0.0064, device='cuda:0')
reward: tensor(89.8621, device='cuda:0') factor: tensor(0.0063, device='cuda:0')


Epoch 1, Step 263: Loss = 13231.1689, Actor Loss = -26.6243, Critic Loss = 26515.5859, Entropy = 0.0071, Returns = 80.1366, Value = 53.7638, mask_loss = 0.0077std_loss = 0.0618:  17%|█▋        | 263/1563 [1:16:07<6:53:47, 19.10s/it]   

reward: tensor(83.9265, device='cuda:0') factor: tensor(0.0047, device='cuda:0')
reward: tensor(83.2185, device='cuda:0') factor: tensor(0.0088, device='cuda:0')
reward: tensor(76.8476, device='cuda:0') factor: tensor(0.0069, device='cuda:0')


Epoch 1, Step 264: Loss = 10878.1494, Actor Loss = -42.5450, Critic Loss = 21841.3887, Entropy = 0.0071, Returns = 99.6799, Value = 57.3019, mask_loss = 0.0053std_loss = 0.0521:  17%|█▋        | 264/1563 [1:16:28<7:09:54, 19.86s/it]

reward: tensor(87.1658, device='cuda:0') factor: tensor(0.0025, device='cuda:0')
reward: tensor(32.1082, device='cuda:0') factor: tensor(0.0032, device='cuda:0')
reward: tensor(122.7671, device='cuda:0') factor: tensor(0.0020, device='cuda:0')


Epoch 1, Step 265: Loss = 1940.2494, Actor Loss = 23.2582, Critic Loss = 3833.9824, Entropy = 0.0033, Returns = 31.3760, Value = 54.7656, mask_loss = 0.0020std_loss = 0.0264:  17%|█▋        | 265/1563 [1:16:44<6:43:39, 18.66s/it]   

reward: tensor(213.1995, device='cuda:0') factor: tensor(0.0036, device='cuda:0')
reward: tensor(174.3961, device='cuda:0') factor: tensor(0.0027, device='cuda:0')
reward: tensor(172.0623, device='cuda:0') factor: tensor(0.0034, device='cuda:0')


Epoch 1, Step 266: Loss = 45802.3398, Actor Loss = -171.0300, Critic Loss = 91946.7422, Entropy = 0.0036, Returns = 229.7119, Value = 58.7836, mask_loss = 0.0048std_loss = 0.0575:  17%|█▋        | 266/1563 [1:16:59<6:21:07, 17.63s/it]

reward: tensor(79.6466, device='cuda:0') factor: tensor(0.0026, device='cuda:0')
reward: tensor(83.1379, device='cuda:0') factor: tensor(0.0039, device='cuda:0')
reward: tensor(88.9866, device='cuda:0') factor: tensor(0.0039, device='cuda:0')


Epoch 1, Step 267: Loss = 5074.5068, Actor Loss = -0.5133, Critic Loss = 10150.0400, Entropy = 0.0049, Returns = 57.2691, Value = 56.9125, mask_loss = 0.0034std_loss = 0.0369:  17%|█▋        | 267/1563 [1:17:16<6:12:28, 17.24s/it]    

reward: tensor(120.6373, device='cuda:0') factor: tensor(0.0076, device='cuda:0')
reward: tensor(111.4699, device='cuda:0') factor: tensor(0.0055, device='cuda:0')
reward: tensor(102.5482, device='cuda:0') factor: tensor(0.0065, device='cuda:0')


Epoch 1, Step 268: Loss = 3525.0776, Actor Loss = -36.4332, Critic Loss = 7123.0215, Entropy = 0.0049, Returns = 94.6106, Value = 58.3491, mask_loss = 0.0061std_loss = 0.0611:  17%|█▋        | 268/1563 [1:17:31<5:58:25, 16.61s/it]

reward: tensor(66.4957, device='cuda:0') factor: tensor(0.0058, device='cuda:0')
reward: tensor(82.8701, device='cuda:0') factor: tensor(0.0058, device='cuda:0')
reward: tensor(79.7854, device='cuda:0') factor: tensor(0.0067, device='cuda:0')


Epoch 1, Step 269: Loss = 3852.0608, Actor Loss = -3.1312, Critic Loss = 7710.3838, Entropy = 0.0073, Returns = 62.6728, Value = 59.7499, mask_loss = 0.0104std_loss = 0.0804:  17%|█▋        | 269/1563 [1:17:46<5:48:02, 16.14s/it] 

reward: tensor(79.2355, device='cuda:0') factor: tensor(0.0076, device='cuda:0')
reward: tensor(63.2702, device='cuda:0') factor: tensor(0.0066, device='cuda:0')
reward: tensor(108.1926, device='cuda:0') factor: tensor(0.0059, device='cuda:0')


Epoch 1, Step 270: Loss = 2022.9746, Actor Loss = 4.6455, Critic Loss = 4036.6582, Entropy = 0.0226, Returns = 54.6153, Value = 59.6373, mask_loss = 0.0183std_loss = 0.0958:  17%|█▋        | 270/1563 [1:18:01<5:41:25, 15.84s/it] 

reward: tensor(62.9966, device='cuda:0') factor: tensor(0.0122, device='cuda:0')
reward: tensor(66.5726, device='cuda:0') factor: tensor(0.0109, device='cuda:0')
reward: tensor(49.5065, device='cuda:0') factor: tensor(0.0118, device='cuda:0')


Epoch 1, Step 271: Loss = 810.6792, Actor Loss = -5.2131, Critic Loss = 1631.7847, Entropy = 0.0108, Returns = 64.6498, Value = 59.4989, mask_loss = 0.0132std_loss = 0.0966:  17%|█▋        | 271/1563 [1:18:16<5:38:51, 15.74s/it]

reward: tensor(164.3958, device='cuda:0') factor: tensor(0.0078, device='cuda:0')
reward: tensor(112.7637, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(133.3188, device='cuda:0') factor: tensor(0.0082, device='cuda:0')


Epoch 1, Step 272: Loss = 16539.3086, Actor Loss = -83.2953, Critic Loss = 33245.2070, Entropy = 0.0061, Returns = 141.0656, Value = 57.6549, mask_loss = 0.0092std_loss = 0.0819:  17%|█▋        | 272/1563 [1:18:31<5:33:06, 15.48s/it]

reward: tensor(72.1076, device='cuda:0') factor: tensor(0.0147, device='cuda:0')
reward: tensor(67.8979, device='cuda:0') factor: tensor(0.0154, device='cuda:0')
reward: tensor(76.6705, device='cuda:0') factor: tensor(0.0144, device='cuda:0')


Epoch 1, Step 273: Loss = 3040.4487, Actor Loss = -8.3977, Critic Loss = 6097.6929, Entropy = 0.0075, Returns = 68.4338, Value = 60.2031, mask_loss = 0.0103std_loss = 0.0801:  17%|█▋        | 273/1563 [1:18:46<5:28:18, 15.27s/it]    

reward: tensor(115.9667, device='cuda:0') factor: tensor(0.0102, device='cuda:0')
reward: tensor(111.7078, device='cuda:0') factor: tensor(0.0107, device='cuda:0')
reward: tensor(88.6731, device='cuda:0') factor: tensor(0.0103, device='cuda:0')


Epoch 1, Step 274: Loss = 14249.6758, Actor Loss = -77.8536, Critic Loss = 28655.0586, Entropy = 0.0068, Returns = 137.0734, Value = 59.3472, mask_loss = 0.0059std_loss = 0.0555:  18%|█▊        | 274/1563 [1:19:08<6:07:35, 17.11s/it]

reward: tensor(132.9843, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(130.7733, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(170.0744, device='cuda:0') factor: tensor(0.0064, device='cuda:0')


Epoch 1, Step 275: Loss = 8731.8789, Actor Loss = -27.5742, Critic Loss = 17518.9062, Entropy = 0.0050, Returns = 88.4245, Value = 61.0347, mask_loss = 0.0058std_loss = 0.0593:  18%|█▊        | 275/1563 [1:19:22<5:49:58, 16.30s/it]  

reward: tensor(96.5033, device='cuda:0') factor: tensor(0.0037, device='cuda:0')
reward: tensor(141.5772, device='cuda:0') factor: tensor(0.0048, device='cuda:0')
reward: tensor(106.8219, device='cuda:0') factor: tensor(0.0036, device='cuda:0')


Epoch 1, Step 276: Loss = 19271.4277, Actor Loss = -49.3661, Critic Loss = 38641.5859, Entropy = 0.0037, Returns = 107.0547, Value = 57.8553, mask_loss = 0.0032std_loss = 0.0378:  18%|█▊        | 276/1563 [1:19:41<6:06:48, 17.10s/it]

reward: tensor(143.5772, device='cuda:0') factor: tensor(0.0042, device='cuda:0')
reward: tensor(202.9497, device='cuda:0') factor: tensor(0.0046, device='cuda:0')
reward: tensor(216.4532, device='cuda:0') factor: tensor(0.0043, device='cuda:0')


Epoch 1, Step 277: Loss = 56335.1758, Actor Loss = -183.6437, Critic Loss = 113037.6406, Entropy = 0.0028, Returns = 243.9617, Value = 60.5875, mask_loss = 0.0041std_loss = 0.0538:  18%|█▊        | 277/1563 [1:19:56<5:53:42, 16.50s/it]

reward: tensor(110.1873, device='cuda:0') factor: tensor(0.0031, device='cuda:0')
reward: tensor(118.7117, device='cuda:0') factor: tensor(0.0026, device='cuda:0')
reward: tensor(130.8396, device='cuda:0') factor: tensor(0.0027, device='cuda:0')


Epoch 1, Step 278: Loss = 14702.1797, Actor Loss = -61.2876, Critic Loss = 29526.9336, Entropy = 0.0030, Returns = 120.3646, Value = 59.2196, mask_loss = 0.0035std_loss = 0.0441:  18%|█▊        | 278/1563 [1:20:17<6:24:29, 17.95s/it]  

reward: tensor(97.3866, device='cuda:0') factor: tensor(0.0078, device='cuda:0')
reward: tensor(93.3909, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(81.3733, device='cuda:0') factor: tensor(0.0078, device='cuda:0')


Epoch 1, Step 279: Loss = 4284.0786, Actor Loss = -20.2164, Critic Loss = 8608.5898, Entropy = 0.0048, Returns = 78.6959, Value = 58.6944, mask_loss = 0.0093std_loss = 0.0821:  18%|█▊        | 279/1563 [1:20:33<6:06:29, 17.13s/it]   

reward: tensor(165.3401, device='cuda:0') factor: tensor(0.0060, device='cuda:0')
reward: tensor(175.3866, device='cuda:0') factor: tensor(0.0061, device='cuda:0')
reward: tensor(132.7036, device='cuda:0') factor: tensor(0.0038, device='cuda:0')


Epoch 1, Step 280: Loss = 16686.9824, Actor Loss = -87.9485, Critic Loss = 33549.8633, Entropy = 0.0043, Returns = 149.8565, Value = 62.1212, mask_loss = 0.0055std_loss = 0.0634:  18%|█▊        | 280/1563 [1:20:52<6:20:19, 17.79s/it]

reward: tensor(107.5171, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(100.7026, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(112.8846, device='cuda:0') factor: tensor(0.0083, device='cuda:0')


Epoch 1, Step 281: Loss = 8090.6465, Actor Loss = -77.2977, Critic Loss = 16335.8887, Entropy = 0.0056, Returns = 138.9379, Value = 61.8814, mask_loss = 0.0082std_loss = 0.0778:  18%|█▊        | 281/1563 [1:21:08<6:06:57, 17.17s/it] 

reward: tensor(186.3842, device='cuda:0') factor: tensor(0.0067, device='cuda:0')
reward: tensor(158.5486, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(178.8201, device='cuda:0') factor: tensor(0.0069, device='cuda:0')


Epoch 1, Step 282: Loss = 1330.9330, Actor Loss = -0.0252, Critic Loss = 2661.9165, Entropy = 0.0063, Returns = 60.1605, Value = 60.4062, mask_loss = 0.0074std_loss = 0.0722:  18%|█▊        | 282/1563 [1:21:28<6:25:41, 18.07s/it]   

reward: tensor(59.4798, device='cuda:0') factor: tensor(0.0081, device='cuda:0')
reward: tensor(83.3595, device='cuda:0') factor: tensor(0.0110, device='cuda:0')
reward: tensor(63.1870, device='cuda:0') factor: tensor(0.0105, device='cuda:0')


Epoch 1, Step 283: Loss = 1135.2407, Actor Loss = 22.5525, Critic Loss = 2225.3765, Entropy = 0.0143, Returns = 40.1429, Value = 62.9746, mask_loss = 0.0118std_loss = 0.0802:  18%|█▊        | 283/1563 [1:21:42<6:03:07, 17.02s/it]

reward: tensor(77.2033, device='cuda:0') factor: tensor(0.0104, device='cuda:0')
reward: tensor(93.1418, device='cuda:0') factor: tensor(0.0101, device='cuda:0')
reward: tensor(90.7989, device='cuda:0') factor: tensor(0.0088, device='cuda:0')


Epoch 1, Step 284: Loss = 3968.5488, Actor Loss = -24.7930, Critic Loss = 7986.6836, Entropy = 0.0078, Returns = 87.4640, Value = 62.8659, mask_loss = 0.0092std_loss = 0.0770:  18%|█▊        | 284/1563 [1:21:58<5:53:00, 16.56s/it]

reward: tensor(73.0282, device='cuda:0') factor: tensor(0.0068, device='cuda:0')
reward: tensor(85.4994, device='cuda:0') factor: tensor(0.0107, device='cuda:0')
reward: tensor(78.8675, device='cuda:0') factor: tensor(0.0120, device='cuda:0')


Epoch 1, Step 285: Loss = 3842.1211, Actor Loss = -3.1998, Critic Loss = 7690.6416, Entropy = 0.0125, Returns = 64.3963, Value = 61.2290, mask_loss = 0.0111std_loss = 0.0788:  18%|█▊        | 285/1563 [1:22:13<5:41:12, 16.02s/it] 

reward: tensor(96.7574, device='cuda:0') factor: tensor(0.0077, device='cuda:0')
reward: tensor(91.8105, device='cuda:0') factor: tensor(0.0065, device='cuda:0')
reward: tensor(67.9289, device='cuda:0') factor: tensor(0.0068, device='cuda:0')


Epoch 1, Step 286: Loss = 9058.5781, Actor Loss = -27.8220, Critic Loss = 18172.8008, Entropy = 0.0114, Returns = 90.5216, Value = 62.8136, mask_loss = 0.0072std_loss = 0.0612:  18%|█▊        | 286/1563 [1:22:32<6:05:05, 17.15s/it]

reward: tensor(136.9952, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(143.7801, device='cuda:0') factor: tensor(0.0070, device='cuda:0')
reward: tensor(182.7372, device='cuda:0') factor: tensor(0.0090, device='cuda:0')


Epoch 1, Step 287: Loss = 21916.3691, Actor Loss = -92.9517, Critic Loss = 44018.6406, Entropy = 0.0094, Returns = 156.1534, Value = 63.3042, mask_loss = 0.0059std_loss = 0.0529:  18%|█▊        | 287/1563 [1:22:47<5:49:54, 16.45s/it]

reward: tensor(72.0834, device='cuda:0') factor: tensor(0.0090, device='cuda:0')
reward: tensor(61.5642, device='cuda:0') factor: tensor(0.0110, device='cuda:0')
reward: tensor(67.6546, device='cuda:0') factor: tensor(0.0097, device='cuda:0')


Epoch 1, Step 288: Loss = 7830.1958, Actor Loss = -34.3278, Critic Loss = 15729.0469, Entropy = 0.0147, Returns = 98.0050, Value = 63.8608, mask_loss = 0.0102std_loss = 0.0739:  18%|█▊        | 288/1563 [1:23:02<5:39:35, 15.98s/it]  

reward: tensor(124.6817, device='cuda:0') factor: tensor(0.0073, device='cuda:0')
reward: tensor(113.0022, device='cuda:0') factor: tensor(0.0064, device='cuda:0')
reward: tensor(137.1862, device='cuda:0') factor: tensor(0.0062, device='cuda:0')


Epoch 1, Step 289: Loss = 14094.8789, Actor Loss = -40.2047, Critic Loss = 28270.1680, Entropy = 0.0077, Returns = 104.2834, Value = 64.2583, mask_loss = 0.0059std_loss = 0.0583:  18%|█▊        | 289/1563 [1:23:17<5:31:09, 15.60s/it]

reward: tensor(144.9854, device='cuda:0') factor: tensor(0.0105, device='cuda:0')
reward: tensor(149.3537, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(141.2992, device='cuda:0') factor: tensor(0.0104, device='cuda:0')


Epoch 1, Step 290: Loss = 67535.2500, Actor Loss = -172.4098, Critic Loss = 135415.3125, Entropy = 0.0153, Returns = 236.7017, Value = 64.5623, mask_loss = 0.0104std_loss = 0.0671:  19%|█▊        | 290/1563 [1:23:31<5:20:41, 15.12s/it]

reward: tensor(92.8367, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(93.6576, device='cuda:0') factor: tensor(0.0047, device='cuda:0')
reward: tensor(73.7383, device='cuda:0') factor: tensor(0.0073, device='cuda:0')


Epoch 1, Step 291: Loss = 15277.0225, Actor Loss = -29.4728, Critic Loss = 30612.9902, Entropy = 0.0091, Returns = 85.9066, Value = 56.5423, mask_loss = 0.0062std_loss = 0.0460:  19%|█▊        | 291/1563 [1:23:55<6:19:43, 17.91s/it]   

reward: tensor(98.0917, device='cuda:0') factor: tensor(0.0048, device='cuda:0')
reward: tensor(143.1387, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(56.3891, device='cuda:0') factor: tensor(0.0051, device='cuda:0')


Epoch 1, Step 292: Loss = 2705.2439, Actor Loss = 5.3751, Critic Loss = 5399.7378, Entropy = 0.0140, Returns = 59.2829, Value = 64.9778, mask_loss = 0.0112std_loss = 0.0745:  19%|█▊        | 292/1563 [1:24:10<5:58:01, 16.90s/it]    

reward: tensor(107.1993, device='cuda:0') factor: tensor(0.0095, device='cuda:0')
reward: tensor(90.7304, device='cuda:0') factor: tensor(0.0066, device='cuda:0')
reward: tensor(66.9597, device='cuda:0') factor: tensor(0.0086, device='cuda:0')


Epoch 1, Step 293: Loss = 5080.6914, Actor Loss = -28.2114, Critic Loss = 10217.8057, Entropy = 0.0103, Returns = 91.0589, Value = 62.8524, mask_loss = 0.0116std_loss = 0.0809:  19%|█▊        | 293/1563 [1:24:25<5:45:15, 16.31s/it]

reward: tensor(107.1877, device='cuda:0') factor: tensor(0.0132, device='cuda:0')
reward: tensor(98.1237, device='cuda:0') factor: tensor(0.0116, device='cuda:0')
reward: tensor(107.3408, device='cuda:0') factor: tensor(0.0147, device='cuda:0')


Epoch 1, Step 294: Loss = 6323.4976, Actor Loss = -5.5967, Critic Loss = 12658.1885, Entropy = 0.0113, Returns = 69.1645, Value = 63.7270, mask_loss = 0.0122std_loss = 0.0862:  19%|█▉        | 294/1563 [1:24:40<5:38:22, 16.00s/it] 

reward: tensor(93.4820, device='cuda:0') factor: tensor(0.0101, device='cuda:0')
reward: tensor(88.9085, device='cuda:0') factor: tensor(0.0118, device='cuda:0')
reward: tensor(78.9339, device='cuda:0') factor: tensor(0.0101, device='cuda:0')


Epoch 1, Step 295: Loss = 16702.4375, Actor Loss = -52.1075, Critic Loss = 33509.0898, Entropy = 0.0087, Returns = 113.0145, Value = 61.0093, mask_loss = 0.0070std_loss = 0.0615:  19%|█▉        | 295/1563 [1:24:54<5:28:49, 15.56s/it]

reward: tensor(76.4114, device='cuda:0') factor: tensor(0.0104, device='cuda:0')
reward: tensor(83.5580, device='cuda:0') factor: tensor(0.0090, device='cuda:0')
reward: tensor(92.9469, device='cuda:0') factor: tensor(0.0093, device='cuda:0')


Epoch 1, Step 296: Loss = 2675.9055, Actor Loss = 2.4749, Critic Loss = 5346.8613, Entropy = 0.0097, Returns = 59.4656, Value = 62.0651, mask_loss = 0.0087std_loss = 0.0703:  19%|█▉        | 296/1563 [1:25:13<5:47:25, 16.45s/it]     

reward: tensor(90.4441, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(130.9657, device='cuda:0') factor: tensor(0.0065, device='cuda:0')
reward: tensor(115.7585, device='cuda:0') factor: tensor(0.0082, device='cuda:0')


Epoch 1, Step 297: Loss = 14012.7100, Actor Loss = -44.1898, Critic Loss = 28113.7988, Entropy = 0.0046, Returns = 109.4819, Value = 65.4011, mask_loss = 0.0044std_loss = 0.0499:  19%|█▉        | 297/1563 [1:25:28<5:34:43, 15.86s/it]

reward: tensor(173.7407, device='cuda:0') factor: tensor(0.0086, device='cuda:0')
reward: tensor(148.7402, device='cuda:0') factor: tensor(0.0098, device='cuda:0')
reward: tensor(121.2201, device='cuda:0') factor: tensor(0.0099, device='cuda:0')


Epoch 1, Step 298: Loss = 17937.5605, Actor Loss = -59.5530, Critic Loss = 35994.2266, Entropy = 0.0063, Returns = 122.2898, Value = 62.8542, mask_loss = 0.0089std_loss = 0.0738:  19%|█▉        | 298/1563 [1:25:49<6:09:53, 17.54s/it]

reward: tensor(66.9304, device='cuda:0') factor: tensor(0.0102, device='cuda:0')
reward: tensor(57.4816, device='cuda:0') factor: tensor(0.0129, device='cuda:0')
reward: tensor(32.6494, device='cuda:0') factor: tensor(0.0104, device='cuda:0')


Epoch 1, Step 299: Loss = 1755.4769, Actor Loss = 32.8129, Critic Loss = 3445.3281, Entropy = 0.0135, Returns = 32.7220, Value = 65.7010, mask_loss = 0.0142std_loss = 0.0810:  19%|█▉        | 299/1563 [1:26:03<5:50:22, 16.63s/it]    

reward: tensor(79.3326, device='cuda:0') factor: tensor(0.0121, device='cuda:0')
reward: tensor(87.1492, device='cuda:0') factor: tensor(0.0119, device='cuda:0')
reward: tensor(90.0722, device='cuda:0') factor: tensor(0.0098, device='cuda:0')


Epoch 1, Step 300: Loss = 8833.2598, Actor Loss = 4.2151, Critic Loss = 17658.0898, Entropy = 0.0052, Returns = 62.1232, Value = 66.4112, mask_loss = 0.0052std_loss = 0.0544:  19%|█▉        | 300/1563 [1:26:18<5:37:22, 16.03s/it]

reward: tensor(103.1977, device='cuda:0') factor: tensor(0.0085, device='cuda:0')
reward: tensor(122.1159, device='cuda:0') factor: tensor(0.0085, device='cuda:0')
reward: tensor(86.0535, device='cuda:0') factor: tensor(0.0098, device='cuda:0')


Epoch 1, Step 301: Loss = 2512.9092, Actor Loss = -33.7291, Critic Loss = 5093.2769, Entropy = 0.0064, Returns = 97.2643, Value = 63.6134, mask_loss = 0.0079std_loss = 0.0744:  19%|█▉        | 301/1563 [1:26:34<5:36:36, 16.00s/it]

reward: tensor(93.0364, device='cuda:0') factor: tensor(0.0074, device='cuda:0')
reward: tensor(148.0286, device='cuda:0') factor: tensor(0.0075, device='cuda:0')
reward: tensor(133.6818, device='cuda:0') factor: tensor(0.0062, device='cuda:0')


Epoch 1, Step 302: Loss = 7501.1523, Actor Loss = -6.5709, Critic Loss = 15015.4463, Entropy = 0.0056, Returns = 71.6814, Value = 65.1823, mask_loss = 0.0060std_loss = 0.0613:  19%|█▉        | 302/1563 [1:26:49<5:31:20, 15.77s/it]

reward: tensor(88.0318, device='cuda:0') factor: tensor(0.0048, device='cuda:0')
reward: tensor(96.0316, device='cuda:0') factor: tensor(0.0057, device='cuda:0')
reward: tensor(92.5910, device='cuda:0') factor: tensor(0.0055, device='cuda:0')


Epoch 1, Step 303: Loss = 3762.5073, Actor Loss = -19.6063, Critic Loss = 7564.2275, Entropy = 0.0051, Returns = 85.5148, Value = 66.0065, mask_loss = 0.0059std_loss = 0.0617:  19%|█▉        | 303/1563 [1:27:04<5:26:16, 15.54s/it]

reward: tensor(135.0129, device='cuda:0') factor: tensor(0.0061, device='cuda:0')
reward: tensor(138.2795, device='cuda:0') factor: tensor(0.0057, device='cuda:0')
reward: tensor(116.3554, device='cuda:0') factor: tensor(0.0054, device='cuda:0')


Epoch 1, Step 304: Loss = 4625.1470, Actor Loss = -43.2965, Critic Loss = 9336.8867, Entropy = 0.0043, Returns = 108.4242, Value = 65.2301, mask_loss = 0.0052std_loss = 0.0563:  19%|█▉        | 304/1563 [1:27:22<5:42:17, 16.31s/it]

reward: tensor(83.4370, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(81.6253, device='cuda:0') factor: tensor(0.0043, device='cuda:0')
reward: tensor(76.1765, device='cuda:0') factor: tensor(0.0036, device='cuda:0')


Epoch 1, Step 305: Loss = 3581.8550, Actor Loss = -2.0498, Critic Loss = 7167.8096, Entropy = 0.0033, Returns = 68.9535, Value = 67.0136, mask_loss = 0.0031std_loss = 0.0358:  20%|█▉        | 305/1563 [1:27:37<5:30:53, 15.78s/it]  

reward: tensor(102.1139, device='cuda:0') factor: tensor(0.0058, device='cuda:0')
reward: tensor(133.0436, device='cuda:0') factor: tensor(0.0062, device='cuda:0')
reward: tensor(104.1196, device='cuda:0') factor: tensor(0.0055, device='cuda:0')


Epoch 1, Step 306: Loss = 12071.8691, Actor Loss = -24.9172, Critic Loss = 24193.5723, Entropy = 0.0051, Returns = 88.2214, Value = 63.4977, mask_loss = 0.0060std_loss = 0.0532:  20%|█▉        | 306/1563 [1:27:52<5:28:06, 15.66s/it]

reward: tensor(127.0178, device='cuda:0') factor: tensor(0.0065, device='cuda:0')
reward: tensor(151.7010, device='cuda:0') factor: tensor(0.0061, device='cuda:0')
reward: tensor(137.8798, device='cuda:0') factor: tensor(0.0065, device='cuda:0')


Epoch 1, Step 307: Loss = 1940.2950, Actor Loss = 21.6768, Critic Loss = 3837.2366, Entropy = 0.0076, Returns = 43.1300, Value = 64.9883, mask_loss = 0.0071std_loss = 0.0688:  20%|█▉        | 307/1563 [1:28:08<5:25:24, 15.54s/it]   

reward: tensor(122.3167, device='cuda:0') factor: tensor(0.0094, device='cuda:0')
reward: tensor(152.1207, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(143.8860, device='cuda:0') factor: tensor(0.0102, device='cuda:0')


Epoch 1, Step 308: Loss = 6704.5635, Actor Loss = -48.4534, Critic Loss = 13506.0342, Entropy = 0.0056, Returns = 110.9309, Value = 62.5895, mask_loss = 0.0092std_loss = 0.0766:  20%|█▉        | 308/1563 [1:28:30<6:07:18, 17.56s/it]

reward: tensor(91.9219, device='cuda:0') factor: tensor(0.0092, device='cuda:0')
reward: tensor(126.2201, device='cuda:0') factor: tensor(0.0090, device='cuda:0')
reward: tensor(135.6896, device='cuda:0') factor: tensor(0.0088, device='cuda:0')


Epoch 1, Step 309: Loss = 5295.9521, Actor Loss = -31.0520, Critic Loss = 10654.0078, Entropy = 0.0050, Returns = 98.6014, Value = 67.6757, mask_loss = 0.0083std_loss = 0.0798:  20%|█▉        | 309/1563 [1:28:45<5:50:42, 16.78s/it] 

reward: tensor(126.2811, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(138.5572, device='cuda:0') factor: tensor(0.0060, device='cuda:0')
reward: tensor(130.3029, device='cuda:0') factor: tensor(0.0066, device='cuda:0')


Epoch 1, Step 310: Loss = 14520.9766, Actor Loss = -71.0289, Critic Loss = 29184.0117, Entropy = 0.0041, Returns = 136.3911, Value = 65.4830, mask_loss = 0.0073std_loss = 0.0699:  20%|█▉        | 310/1563 [1:29:04<6:06:08, 17.53s/it]

reward: tensor(157.8849, device='cuda:0') factor: tensor(0.0089, device='cuda:0')
reward: tensor(158.4686, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(150.1557, device='cuda:0') factor: tensor(0.0087, device='cuda:0')


Epoch 1, Step 311: Loss = 1779.0608, Actor Loss = -13.7001, Critic Loss = 3585.5217, Entropy = 0.0037, Returns = 79.7640, Value = 66.2093, mask_loss = 0.0092std_loss = 0.0856:  20%|█▉        | 311/1563 [1:29:23<6:14:24, 17.94s/it]   

reward: tensor(93.0250, device='cuda:0') factor: tensor(0.0092, device='cuda:0')
reward: tensor(117.5411, device='cuda:0') factor: tensor(0.0093, device='cuda:0')
reward: tensor(118.7733, device='cuda:0') factor: tensor(0.0102, device='cuda:0')


Epoch 1, Step 312: Loss = 19927.3691, Actor Loss = -84.5653, Critic Loss = 40023.8672, Entropy = 0.0036, Returns = 147.8878, Value = 63.4756, mask_loss = 0.0079std_loss = 0.0755:  20%|█▉        | 312/1563 [1:29:40<6:09:21, 17.72s/it]

reward: tensor(135.7904, device='cuda:0') factor: tensor(0.0055, device='cuda:0')
reward: tensor(169.6411, device='cuda:0') factor: tensor(0.0051, device='cuda:0')
reward: tensor(158.2780, device='cuda:0') factor: tensor(0.0051, device='cuda:0')


Epoch 1, Step 313: Loss = 17751.9746, Actor Loss = -107.7907, Critic Loss = 35719.5312, Entropy = 0.0038, Returns = 175.4031, Value = 67.8201, mask_loss = 0.0056std_loss = 0.0635:  20%|██        | 313/1563 [1:29:59<6:17:14, 18.11s/it]

reward: tensor(99.0700, device='cuda:0') factor: tensor(0.0063, device='cuda:0')
reward: tensor(122.5562, device='cuda:0') factor: tensor(0.0058, device='cuda:0')
reward: tensor(124.8958, device='cuda:0') factor: tensor(0.0066, device='cuda:0')


Epoch 1, Step 314: Loss = 13502.1221, Actor Loss = -47.5128, Critic Loss = 27099.2695, Entropy = 0.0062, Returns = 114.3775, Value = 67.0777, mask_loss = 0.0056std_loss = 0.0535:  20%|██        | 314/1563 [1:30:21<6:39:24, 19.19s/it] 

reward: tensor(65.1358, device='cuda:0') factor: tensor(0.0111, device='cuda:0')
reward: tensor(66.5032, device='cuda:0') factor: tensor(0.0104, device='cuda:0')
reward: tensor(82.3445, device='cuda:0') factor: tensor(0.0111, device='cuda:0')


Epoch 1, Step 315: Loss = 4406.0425, Actor Loss = -2.3424, Critic Loss = 8816.7695, Entropy = 0.0044, Returns = 67.4296, Value = 65.3308, mask_loss = 0.0072std_loss = 0.0730:  20%|██        | 315/1563 [1:30:36<6:16:15, 18.09s/it]    

reward: tensor(188.9018, device='cuda:0') factor: tensor(0.0070, device='cuda:0')
reward: tensor(164.6594, device='cuda:0') factor: tensor(0.0074, device='cuda:0')
reward: tensor(155.8379, device='cuda:0') factor: tensor(0.0070, device='cuda:0')


Epoch 1, Step 316: Loss = 18216.1016, Actor Loss = -94.1165, Critic Loss = 36620.4375, Entropy = 0.0044, Returns = 160.3999, Value = 66.4747, mask_loss = 0.0060std_loss = 0.0604:  20%|██        | 316/1563 [1:30:56<6:24:27, 18.50s/it]

reward: tensor(111.7595, device='cuda:0') factor: tensor(0.0094, device='cuda:0')
reward: tensor(134.9682, device='cuda:0') factor: tensor(0.0083, device='cuda:0')
reward: tensor(115.3276, device='cuda:0') factor: tensor(0.0097, device='cuda:0')


Epoch 1, Step 317: Loss = 12358.9844, Actor Loss = -84.6308, Critic Loss = 24887.2305, Entropy = 0.0059, Returns = 153.1301, Value = 68.8098, mask_loss = 0.0119std_loss = 0.0873:  20%|██        | 317/1563 [1:31:11<6:01:54, 17.43s/it]

reward: tensor(91.3331, device='cuda:0') factor: tensor(0.0057, device='cuda:0')
reward: tensor(97.7789, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(91.2708, device='cuda:0') factor: tensor(0.0062, device='cuda:0')


Epoch 1, Step 318: Loss = 8873.2852, Actor Loss = -43.7125, Critic Loss = 17833.9961, Entropy = 0.0037, Returns = 111.0606, Value = 67.5238, mask_loss = 0.0067std_loss = 0.0716:  20%|██        | 318/1563 [1:31:31<6:16:57, 18.17s/it] 

reward: tensor(120.7382, device='cuda:0') factor: tensor(0.0112, device='cuda:0')
reward: tensor(122.3576, device='cuda:0') factor: tensor(0.0106, device='cuda:0')
reward: tensor(105.5908, device='cuda:0') factor: tensor(0.0123, device='cuda:0')


Epoch 1, Step 319: Loss = 6615.4604, Actor Loss = -44.9977, Critic Loss = 13320.9160, Entropy = 0.0083, Returns = 111.3646, Value = 66.5754, mask_loss = 0.0104std_loss = 0.0756:  20%|██        | 319/1563 [1:31:50<6:20:42, 18.36s/it]

reward: tensor(127.9110, device='cuda:0') factor: tensor(0.0070, device='cuda:0')
reward: tensor(117.6089, device='cuda:0') factor: tensor(0.0084, device='cuda:0')
reward: tensor(122.6149, device='cuda:0') factor: tensor(0.0089, device='cuda:0')


Epoch 1, Step 320: Loss = 9828.3027, Actor Loss = -32.0239, Critic Loss = 19720.6523, Entropy = 0.0073, Returns = 100.0542, Value = 68.2264, mask_loss = 0.0085std_loss = 0.0698:  20%|██        | 320/1563 [1:32:04<5:59:19, 17.34s/it]

reward: tensor(102.2350, device='cuda:0') factor: tensor(0.0092, device='cuda:0')
reward: tensor(133.2983, device='cuda:0') factor: tensor(0.0079, device='cuda:0')
reward: tensor(127.4015, device='cuda:0') factor: tensor(0.0082, device='cuda:0')


Epoch 1, Step 321: Loss = 33442.1250, Actor Loss = -119.7664, Critic Loss = 67123.7812, Entropy = 0.0045, Returns = 189.1367, Value = 69.5592, mask_loss = 0.0094std_loss = 0.0820:  21%|██        | 321/1563 [1:32:20<5:48:22, 16.83s/it]

reward: tensor(60.3701, device='cuda:0') factor: tensor(0.0079, device='cuda:0')
reward: tensor(99.3479, device='cuda:0') factor: tensor(0.0082, device='cuda:0')
reward: tensor(47.6904, device='cuda:0') factor: tensor(0.0064, device='cuda:0')


Epoch 1, Step 322: Loss = 3617.6755, Actor Loss = -27.7017, Critic Loss = 7290.7544, Entropy = 0.0042, Returns = 96.8325, Value = 69.3485, mask_loss = 0.0061std_loss = 0.0632:  21%|██        | 322/1563 [1:32:40<6:04:50, 17.64s/it]    

reward: tensor(71.4687, device='cuda:0') factor: tensor(0.0123, device='cuda:0')
reward: tensor(68.2309, device='cuda:0') factor: tensor(0.0122, device='cuda:0')
reward: tensor(108.7377, device='cuda:0') factor: tensor(0.0114, device='cuda:0')


Epoch 1, Step 323: Loss = 4259.1904, Actor Loss = -32.4386, Critic Loss = 8583.2578, Entropy = 0.0053, Returns = 103.0181, Value = 70.7505, mask_loss = 0.0117std_loss = 0.0968:  21%|██        | 323/1563 [1:32:53<5:36:53, 16.30s/it]

reward: tensor(82.0608, device='cuda:0') factor: tensor(0.0111, device='cuda:0')
reward: tensor(83.3837, device='cuda:0') factor: tensor(0.0084, device='cuda:0')
reward: tensor(69.4490, device='cuda:0') factor: tensor(0.0098, device='cuda:0')


Epoch 1, Step 324: Loss = 8863.1865, Actor Loss = -62.1665, Critic Loss = 17850.7051, Entropy = 0.0054, Returns = 132.7434, Value = 70.7502, mask_loss = 0.0087std_loss = 0.0714:  21%|██        | 324/1563 [1:33:12<5:54:28, 17.17s/it]

reward: tensor(49.2867, device='cuda:0') factor: tensor(0.0137, device='cuda:0')
reward: tensor(48.9425, device='cuda:0') factor: tensor(0.0132, device='cuda:0')
reward: tensor(49.9649, device='cuda:0') factor: tensor(0.0117, device='cuda:0')


Epoch 1, Step 325: Loss = 1232.9769, Actor Loss = 22.4258, Critic Loss = 2421.1023, Entropy = 0.0102, Returns = 44.6488, Value = 67.1022, mask_loss = 0.0148std_loss = 0.1015:  21%|██        | 325/1563 [1:33:27<5:43:07, 16.63s/it]   

reward: tensor(64.9320, device='cuda:0') factor: tensor(0.0218, device='cuda:0')
reward: tensor(80.6701, device='cuda:0') factor: tensor(0.0215, device='cuda:0')
reward: tensor(62.0434, device='cuda:0') factor: tensor(0.0215, device='cuda:0')


Epoch 1, Step 326: Loss = 8194.6543, Actor Loss = -33.5949, Critic Loss = 16456.4980, Entropy = 0.0134, Returns = 104.7083, Value = 71.2667, mask_loss = 0.0229std_loss = 0.1125:  21%|██        | 326/1563 [1:33:43<5:33:34, 16.18s/it]

reward: tensor(189.2759, device='cuda:0') factor: tensor(0.0059, device='cuda:0')
reward: tensor(168.3434, device='cuda:0') factor: tensor(0.0050, device='cuda:0')
reward: tensor(166.4988, device='cuda:0') factor: tensor(0.0060, device='cuda:0')


Epoch 1, Step 327: Loss = 10248.8809, Actor Loss = -57.3809, Critic Loss = 20612.5234, Entropy = 0.0031, Returns = 127.1583, Value = 69.8854, mask_loss = 0.0058std_loss = 0.0640:  21%|██        | 327/1563 [1:34:02<5:55:26, 17.25s/it]

reward: tensor(187.6670, device='cuda:0') factor: tensor(0.0042, device='cuda:0')
reward: tensor(152.0720, device='cuda:0') factor: tensor(0.0073, device='cuda:0')
reward: tensor(174.1180, device='cuda:0') factor: tensor(0.0059, device='cuda:0')


Epoch 1, Step 328: Loss = 45300.5625, Actor Loss = -97.0586, Critic Loss = 90795.2422, Entropy = 0.0030, Returns = 164.7886, Value = 67.8376, mask_loss = 0.0081std_loss = 0.0725:  21%|██        | 328/1563 [1:34:17<5:41:37, 16.60s/it]

reward: tensor(156.2109, device='cuda:0') factor: tensor(0.0108, device='cuda:0')
reward: tensor(99.6402, device='cuda:0') factor: tensor(0.0123, device='cuda:0')
reward: tensor(155.4418, device='cuda:0') factor: tensor(0.0122, device='cuda:0')


Epoch 1, Step 329: Loss = 7958.2295, Actor Loss = -32.5997, Critic Loss = 15981.6582, Entropy = 0.0090, Returns = 102.7053, Value = 70.2964, mask_loss = 0.0105std_loss = 0.0837:  21%|██        | 329/1563 [1:34:39<6:12:51, 18.13s/it] 

reward: tensor(131.6352, device='cuda:0') factor: tensor(0.0078, device='cuda:0')
reward: tensor(111.7593, device='cuda:0') factor: tensor(0.0049, device='cuda:0')
reward: tensor(124.2896, device='cuda:0') factor: tensor(0.0085, device='cuda:0')


Epoch 1, Step 330: Loss = 13454.1992, Actor Loss = -68.8786, Critic Loss = 27046.1562, Entropy = 0.0045, Returns = 137.8452, Value = 69.1000, mask_loss = 0.0068std_loss = 0.0653:  21%|██        | 330/1563 [1:34:59<6:21:06, 18.55s/it]

reward: tensor(133.8528, device='cuda:0') factor: tensor(0.0045, device='cuda:0')
reward: tensor(134.5592, device='cuda:0') factor: tensor(0.0052, device='cuda:0')
reward: tensor(108.8093, device='cuda:0') factor: tensor(0.0063, device='cuda:0')


Epoch 1, Step 331: Loss = 32007.3438, Actor Loss = -100.9421, Critic Loss = 64216.5703, Entropy = 0.0019, Returns = 172.0798, Value = 71.2871, mask_loss = 0.0026std_loss = 0.0400:  21%|██        | 331/1563 [1:35:52<9:57:47, 29.11s/it]

In [6]:
import numpy as np
import torch.nn.functional as F
from datasets import Dataset

# Restore the trained mask-generator weights and switch to inference mode.
# `weights_only=True` restricts unpickling to tensors and plain containers,
# which is all `load_state_dict` needs, and avoids executing arbitrary pickled
# code from the checkpoint file.
mask_gen_model.load_state_dict(
    torch.load(
        'saved_model/imdb_mask_gen_model_0_40.pth',
        map_location=device,
        weights_only=True,
    )
)
mask_gen_model.eval()

# Hand-written sentences used as a small qualitative test set. The labels
# appear to mark positive (1), neutral (0) and negative (-1) sentiment; whether
# the collate function reads them is not visible in this cell -- TODO confirm.
# NOTE(review): "t’s an average phone" looks like a typo for "It’s"; the text is
# left unchanged so results stay comparable with earlier runs.
data_dict = {
    'text': ["I absolutely love this product! It exceeded all my expectations.", 
             "The movie was fantastic, and the acting was top-notch.",
             "This restaurant offers great service and delicious food. Highly recommend!",
             "The product works as advertised, nothing more, nothing less.",
             "The event was well-organized, but it didn’t leave a lasting impression.",
             "t’s an average phone, nothing special but it does the job.",
             "I’m really disappointed with this purchase. It broke within a week.",
             "The movie was too long and boring, I wouldn’t recommend it.",
             "Terrible customer service, I won’t be coming back to this place."],
    'label': [1, 1, 1, 0, 0, 0, -1, -1, -1]
}
manual_test_data = Dataset.from_dict(data_dict)

# Put every manual sentence into a single batch; the batch size is derived from
# the dataset so adding or removing sentences cannot silently split the batch.
manual_test_dataloader = DataLoader(
    manual_test_data,
    batch_size=len(manual_test_data),
    collate_fn=collate_fn,
    shuffle=False,
)

test_inputs = next(iter(manual_test_dataloader)).to(device)


# Sample a response for every prompt in the manual test batch.
gen_outputs = model.generate(
    input_ids=test_inputs['input_ids'],
    attention_mask=test_inputs['attention_mask'],
    max_new_tokens=128,
    eos_token_id=terminators,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    return_dict_in_generate=True,
    output_scores=True,
)
input_ids = test_inputs['input_ids']
attention_mask = test_inputs['attention_mask']
gen_tokens = gen_outputs.sequences

# The code below treats the first `prompt_length` columns of `gen_tokens` as the
# (left-padded) prompt and the remaining `pad_length` columns as the response.
prompt_length = input_ids.size(1)
pad_length = gen_tokens.size(1) - prompt_length

# Extend the prompt masks over the generated columns: attention defaults to 1
# there, while the context mask is 0 because generated tokens are not context.
gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
context_mask = F.pad(test_inputs['context_mask'], (0, pad_length), mode='constant', value=0)

# 1 for real tokens, 0 for pad tokens. Only generated columns should be affected,
# so the prompt columns are reset to 1 (prompt padding is already handled by
# `attention_mask`). Slicing with `prompt_length` instead of `:-pad_length` keeps
# this correct when nothing was generated: `:-0` is an empty slice.
unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
unpaded_token_mask[:, :prompt_length] = 1
gen_attention_mask = gen_attention_mask * unpaded_token_mask

# The response mask keeps only attended generated tokens: zero out the prompt.
response_mask = gen_attention_mask.clone()
response_mask[:, :prompt_length] = 0

# Query the mask generator for its distribution and critic value on this state.
with torch.no_grad():
    state = gen_tokens, gen_attention_mask, context_mask, response_mask
    dist, value = mask_gen_model.get_dist_critic(model, state)

mask_logits = dist.logits

  mask_gen_model.load_state_dict(torch.load('saved_model/imdb_mask_gen_model_0_40.pth',map_location=device))
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [19]:
import random
# Pick one example of the generated batch at random. The upper bound comes from
# the batch itself rather than a hard-coded 8, so the cell keeps working when
# the number of test sentences changes.
idx = random.randrange(gen_tokens.size(0))
test_ids = gen_tokens[idx]
test_mask = gen_attention_mask[idx]
# Turn the mask logits of the chosen example into per-token probabilities.
test_mask_prob = torch.sigmoid(mask_logits[idx])
# TODO: decide whether the probabilities should be inverted:
# test_mask_prob = 1 - test_mask_prob
test_context_mask = context_mask[idx]

test_tokens = tokenizer.convert_ids_to_tokens(test_ids)
# Multiplying by the context mask zeroes the score wherever that mask is 0.
scores = test_mask_prob * test_context_mask

def normalize_except_zeros(array):
    """Min-max normalise the strictly positive entries of ``array`` to [0, 1].

    Entries that are not greater than zero are left untouched, so zeros stay
    zero. The input is never modified.

    Args:
        array: Array-like of numbers (anything ``np.asarray`` accepts).

    Returns:
        A new floating-point ``np.ndarray`` with the same shape as ``array``.
        Floating inputs keep their dtype; other dtypes are converted to
        ``float64`` so normalised values are not truncated to integers.

    Edge cases:
        * No positive entries: an unchanged copy is returned instead of
          raising on the empty min/max reduction.
        * All positive entries equal: they are mapped to 1.0 instead of
          producing NaN from a division by zero.
    """
    values = np.asarray(array)
    if np.issubdtype(values.dtype, np.floating):
        normalized_array = values.copy()
    else:
        normalized_array = values.astype(np.float64)

    # Only strictly positive entries take part in the normalisation.
    mask = values > 0
    if not mask.any():
        return normalized_array

    positive = normalized_array[mask]
    min_val = positive.min()
    span = positive.max() - min_val

    if span == 0:
        # Degenerate range: every positive entry is identical.
        normalized_array[mask] = 1.0
    else:
        normalized_array[mask] = (positive - min_val) / span

    return normalized_array
# Optional: rescale the non-zero scores to [0, 1] before display.
# scores = normalize_except_zeros(scores.detach().cpu().numpy())

# Convert the scores to plain Python floats so each pair prints as a number
# instead of a 0-dim tensor. Both forms are handled: the tensor computed above
# and the NumPy array produced when the normalisation line is enabled.
# Assumes `scores` is 1-D (one value per token) -- TODO confirm.
if torch.is_tensor(scores):
    score_values = scores.detach().cpu().tolist()
else:
    score_values = np.asarray(scores).tolist()

# To drop special tokens, filter with `token not in tokenizer.all_special_tokens`.
filtered_token_scores = list(zip(test_tokens, score_values))




# import re

# def clean_token(token):
#     # Strip the ordinary marker characters, e.g. 'Ġ' and 'Ċ'
#     return token.replace("Ġ", "").replace("Ċ", "")

# merged_tokens_scores = []
# current_token = ""
# current_score = 0
# count = 0

# def is_special_token(token):
#     # Check whether this is a standalone special token such as '<|start_header_id|>'
#     return token.startswith("<|") and token.endswith("|>")

# # Merge tokens and their scores, taking the average score
# for token, score in filtered_token_scores:
#     # Check whether this is a special token
#     if is_special_token(token):
#         # If a token is currently being accumulated, add it to the results first
#         if current_token:
#             # Keep the score within [0, 1]
#             average_score = min(current_score / count, 1.0)
#             merged_tokens_scores.append((current_token, average_score))
#             current_token = ""
#             current_score = 0
#             count = 0

#         # Special tokens are appended as-is, without merging
#         merged_tokens_scores.append((token, score))
#         continue

#     # Remove the marker characters from the token
#     cleaned_token = clean_token(token)

#     # Skip tokens that are empty after cleaning
#     if not cleaned_token:
#         continue

#     # Check whether this starts a new word (tokens starting with 'Ġ' or 'Ċ' usually do)
#     if token.startswith("Ġ") or token.startswith("Ċ"):
#         if current_token:
#             # Keep the score within [0, 1]
#             average_score = min(current_score / count, 1.0)
#             merged_tokens_scores.append((current_token, average_score))
        
#         # Start a new token and score
#         current_token = cleaned_token
#         current_score = score
#         count = 1
#     else:
#         # Sub-word piece: keep merging it into the current token
#         current_token += cleaned_token
#         current_score += score
#         count += 1

# # Handle the last token
# if current_token:
#     # Keep the score within [0, 1]
#     average_score = min(current_score / count, 1.0)
#     merged_tokens_scores.append((current_token, average_score))

# # # Print the results
# # for token, score in merged_tokens_scores:
# #     print(f"Token: {token}, Score: {score}")





# # Highlight the text according to score (this example uses HTML tags)
# highlighted_text = ""
# for token, score in merged_tokens_scores:
#     # Set the background colour dynamically: white when score is 0, green when score is 1
#     red = int((1 - score) * 255)
#     green = 255
#     blue = int((1 - score) * 255)
#     color = f'rgb({red}, {green}, {blue})'
#     highlighted_text += f'<span style="background-color: {color}; color: black;">{token}</span> '

# # Display the highlighted text
# from IPython.display import display, HTML
# display(HTML(highlighted_text.strip()))

In [20]:
# Print one "Token / Score" line per (token, score) pair, in sequence order.
for pair in filtered_token_scores:
    tok, val = pair
    print(f"Token: {tok}, Score: {val}")

Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|begin_of_text|>, Score: 0.0
Token: <|start_header_id|>, Score: 0.0
Token: system, Score: 0.0
Token: <|end_header_id|>, Score: 0.0
Token: ĊĊ, Score: 0.0
Token: You, Score: 0.0
Token: Ġare, Score: 0.0
Token: Ġa, Score: 0.0
Token: Ġchat, Score: 0.0
Token: bot, Score: 0.0
Token: Ġfor, Score: 0.0
Token: Ġsentiment, Score: 0.0
Token: Ġanalysis, Score: 0.0
Token: ., Score: 0.0
Token: ĠYou, Score: 0.0
Token: Ġcan, Score: 0.0
Token: Ġhelp, Score: 0.0
Token: Ġusers, Score: 0.0
Token: Ġwith, Score: 0.0
Token: Ġtheir, Score: 0.0
Token: Ġquestions, Score: 0.0
Token: Ġvia, Score: 0.0
Token: Ġconcise, Score: 0.0
Token: Ġresponses, Score: 0.0
Token: Ġof, Score: 0.0
Token: ĠPOS, Score: 0.0
Token: ITIVE, Score: 0.0
Token: Ġor, Score: 0.0
Token: ĠNEG, Score: 0.0
Token: ATIVE, Score: 0.0
Token: ., Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|start_header_id|>, Score: 0.0
To

In [None]:
# Largest masked probability for the sampled example.
torch.max(test_mask_prob * test_context_mask)

In [None]:
# Echo the (token, score) pairs built for the sampled example.
filtered_token_scores

In [None]:
# Scratch inspection of the batch structure, the collate function and the
# tokenizer. A notebook cell only echoes its final expression, so the earlier
# values are shown explicitly with `display`.
display(test_inputs.keys())

# `collate_fn` is used as the DataLoader collate function, so it is called here
# with a list of dataset samples; a bare `collate_fn()` passes no batch at all.
display(collate_fn([manual_test_data[0]]))

tokenizer("this is a sentence")

In [22]:
# Take the first three examples of the training split as a small scratch dataset.
new_ds = ds['train'].select(range(3))

In [26]:
from datasets import Dataset

# Build a two-row in-memory dataset with `text` and `label` columns.
data_dict = dict(
    text=["This is the first example.", "This is the second example."],
    label=[0, 1],
)
dataset = Dataset.from_dict(data_dict)

In [None]:
# Inspect the first example of `new_ds`.
new_ds[0]

In [None]:
# Show the tokenizer's EOS token id (the pad token is set to the EOS token when
# the tokenizer is created at the top of the notebook).
tokenizer.eos_token_id