In [1]:
# Configuring environment parameters
import os
import json 
import logging

# logging.basicConfig(filename=...) opens the log file immediately and does not
# create missing parent directories, so make sure 'log/' exists first.
os.makedirs('log', exist_ok=True)
logging.basicConfig(
    filename='log/app.log',            # Specify the log file name
    level=logging.DEBUG,           # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Set the log format
)

# Load the environment configuration JSON data.
# The file must provide the keys 'HF_HOME' and 'access_token' (both are read below).
json_path = 'env_config.json'
with open(json_path, 'r', encoding='utf-8') as file:
    env_config = json.load(file)

hf_home = env_config['HF_HOME']
# Set the HF_HOME environment variable. This has to happen before the Hugging Face
# libraries are imported, because they resolve their cache location from it.
os.environ['HF_HOME'] = hf_home
# Set the access token to huggingface hub
access_token = env_config['access_token']
# Kept for backward compatibility with anything that reads this name.
os.environ['HUGGINGFACE_HUB_TOKEN'] = access_token
# HF_TOKEN is the variable huggingface_hub actually reads; without it, calls that
# are not given `token=` explicitly (e.g. load_dataset) run unauthenticated.
os.environ['HF_TOKEN'] = access_token

In [2]:
# Loading necessary packages
import transformers 
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, LlamaForTokenClassification #, LlamaRotaryEmbedding
# from transformers import LlamaTokenizerFast
import torch.nn.functional as F

from llmexp.helper import DataHelper
from datasets import load_dataset
from torch.utils.data import DataLoader

# TODO: make sure the correct model variant is loaded
from llmexp.squad_model_kldiv import MaskGeneratingModelForSQUAD
from tqdm import tqdm

In [3]:
# Load tokenizer and datasets
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Prompts are padded on the left so that generated tokens are appended directly
# after the prompt on the right-hand side of every row.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token, padding_side='left')
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# ds = load_dataset("imdb")
ds = load_dataset("rajpurkar/squad")
# ds = load_dataset("stanfordnlp/sst2")
train_ds = ds['train']
# test_ds = ds['test']
# The 'validation' split is used as the held-out test set here.
test_ds = ds['validation']

# DataHelper supplies the dataset-specific collate function that turns raw
# examples into model inputs.
llm_exp_helper = DataHelper(tokenizer)
# collate_fn = llm_exp_helper.get_collate_fun('imdb')
# collate_fn = llm_exp_helper.get_collate_fun('sst2')
collate_fn = llm_exp_helper.get_collate_fun('squad')

# Define batch size here!
batch_size = 16
train_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
# Fixed: the test loader previously wrapped train_ds, so evaluation would have
# silently run on training data.
test_dataloader = DataLoader(test_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [4]:
# Configure and load model
# Accelerate selects the execution device; the whole model is placed on it.
accelerator = Accelerator()
device = accelerator.device
# Alternative: pin a device by hand.
# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "meta-llama/Meta-Llama-3-8B"  # non-instruct version

# Weights are loaded in bfloat16 onto `device`.
# (device_map="auto" is the alternative that lets the library place the weights.)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    device_map=device,
    torch_dtype=torch.bfloat16,
)

# Keep a handle on the model configuration for later cells.
config = model.config

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Configure mask model and training parameters
mask_gen_model = MaskGeneratingModelForSQUAD()
mask_gen_model.to(device)

# Only the mask generator's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(mask_gen_model.parameters(), lr=1e-5)

# Token ids that stop generation: the tokenizer's EOS token and the
# "<|eot_id|>" token.
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# Fall back to the EOS id for padding when the tokenizer defines no pad token.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.45


In [6]:
# Sanity check: pull a single collated batch from the training loader and display it.
# The displayed batch is a dict holding 'input_ids', 'attention_mask' and
# 'context_mask' tensors, i.e. the keys the training loop reads.
next(iter(train_dataloader))

{'input_ids': tensor([[128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        ...,
        [128000, 128006,   9125,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'context_mask': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}

In [7]:
# Train the mask generator: for every batch, sample responses from `model`,
# build the masks over prompt + response, and run one optimisation round via
# mask_gen_model.train_one_batch.

# Create the checkpoint directory up front. torch.save does not create missing
# parent directories, so without this the run would only fail at the first
# checkpoint, after many expensive generation steps.
os.makedirs('saved_model', exist_ok=True)

mask_gen_model.train()
for epoch in range(1):
    pbar = tqdm(train_dataloader)
    for idx, data in enumerate(pbar):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        context_mask = data['context_mask'].to(device)
        # get generated texts
        gen_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
        gen_tokens = gen_outputs.sequences
        # gen_tokens is prompt + newly generated tokens; the first `prompt_len`
        # columns are the (left-padded) prompt, the remaining `pad_length` columns
        # are the response.
        prompt_len = input_ids.size(1)
        pad_length = gen_tokens.size(1) - prompt_len
        # get the attention mask for the generated tokens, and also mask the padding tokens
        gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
        # (gen_tokens != pad_token_id).long() is the tokens mask, 1 for real tokens and 0 for padding tokens
        unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
        # The prompt part is already described by `attention_mask`, so force it to 1 here.
        # Index by prompt length instead of `:-pad_length`: the negative slice selects
        # nothing when pad_length == 0.
        unpaded_token_mask[:, :prompt_len] = 1
        gen_attention_mask = gen_attention_mask * unpaded_token_mask
        # NOTE(review): if pad_token_id equals one of the terminator ids (the tokenizer
        # setup elsewhere in this notebook sets pad_token = eos_token), the terminating
        # token of each response is masked out as well -- confirm this is intended.

        # get the response mask, which is the mask for the generated tokens (the user inputs are masked with 0)
        response_mask = gen_attention_mask.clone()
        # TODO (original author's open question): is zeroing the whole prompt part here problematic?
        response_mask[:, :prompt_len] = 0

        # Extend the context mask over the response columns; generated tokens are not context.
        context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

        loss_dict = mask_gen_model.train_one_batch(model, gen_tokens, gen_attention_mask, context_mask, response_mask, optimizer,
                                                   num_steps=5, mini_batch_size=16, ppo_epochs=2)

        # Parenthesised implicit concatenation replaces the backslash continuations.
        # Fixed: a ", " separator was missing between mask_loss and std_loss.
        log = (
            f"Epoch {epoch+1}, Step {idx+1}: Loss = {loss_dict['loss']:.4f}, "
            f"Actor Loss = {loss_dict['actor_loss']:.4f}, "
            f"Critic Loss = {loss_dict['critic_loss']:.4f}, "
            f"Entropy = {loss_dict['entropy']:.4f}, "
            f"Returns = {loss_dict['returns']:.4f}, "
            f"Value = {loss_dict['value']:.4f}, "
            f"mask_loss = {loss_dict['mask_loss']:.4f}, "
            f"std_loss = {loss_dict['std_loss']:.4f}"
            # f"Cont_loss = {loss_dict['contrast_loss']:.4f}, "
        )
        pbar.set_description(log)

        # Emit a newline every 10 steps so the progress line printed so far stays
        # visible in the cell output.
        if idx % 10 == 0:
            print()
        # Checkpoint the mask generator every 20 steps (skipping step 0).
        if idx % 20 == 0 and idx != 0:
            torch.save(mask_gen_model.state_dict(), f'saved_model/squad_kldv_{epoch}_{idx}.pth')

  0%|          | 0/5475 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


reward_raw: tensor(0.3701, device='cuda:0') reward: tensor(0.8881, device='cuda:0') factor: tensor(0.4820, device='cuda:0')
reward_raw: tensor(0.3952, device='cuda:0') reward: tensor(0.9064, device='cuda:0') factor: tensor(0.4887, device='cuda:0')
reward_raw: tensor(0.4190, device='cuda:0') reward: tensor(0.9233, device='cuda:0') factor: tensor(0.4957, device='cuda:0')
reward_raw: tensor(0.3960, device='cuda:0') reward: tensor(0.8977, device='cuda:0') factor: tensor(0.4983, device='cuda:0')
reward_raw: tensor(0.4626, device='cuda:0') reward: tensor(0.9698, device='cuda:0') factor: tensor(0.4927, device='cuda:0')


Epoch 1, Step 1: Loss = -0.6529, Actor Loss = -0.9342, Critic Loss = 0.5639, Entropy = 0.6903, Returns = 0.8870, Value = 1.4696, mask_loss = 0.4710std_loss = 0.0217:   0%|          | 1/5475 [00:16<24:59:24, 16.43s/it]

ratio tensor(1.1233, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.4179, device='cuda:0') reward: tensor(0.9443, device='cuda:0') factor: tensor(0.4736, device='cuda:0')
reward_raw: tensor(0.4537, device='cuda:0') reward: tensor(0.9695, device='cuda:0') factor: tensor(0.4842, device='cuda:0')
reward_raw: tensor(0.4071, device='cuda:0') reward: tensor(0.9101, device='cuda:0') factor: tensor(0.4971, device='cuda:0')
reward_raw: tensor(0.4178, device='cuda:0') reward: tensor(0.9572, device='cuda:0') factor: tensor(0.4606, device='cuda:0')
reward_raw: tensor(0.4430, device='cuda:0') reward: tensor(0.9687, device='cuda:0') factor: tensor(0.4743, device='cuda:0')


Epoch 1, Step 2: Loss = 0.5799, Actor Loss = 0.1197, Critic Loss = 0.9217, Entropy = 0.6907, Returns = 1.0257, Value = 1.0775, mask_loss = 0.4782std_loss = 0.0243:   0%|          | 2/5475 [00:35<27:15:02, 17.92s/it]  

ratio tensor(6.3180, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3720, device='cuda:0') reward: tensor(0.9155, device='cuda:0') factor: tensor(0.4566, device='cuda:0')
reward_raw: tensor(0.3835, device='cuda:0') reward: tensor(0.9193, device='cuda:0') factor: tensor(0.4642, device='cuda:0')
reward_raw: tensor(0.3897, device='cuda:0') reward: tensor(0.9084, device='cuda:0') factor: tensor(0.4813, device='cuda:0')
reward_raw: tensor(0.4191, device='cuda:0') reward: tensor(0.9555, device='cuda:0') factor: tensor(0.4635, device='cuda:0')
reward_raw: tensor(0.3771, device='cuda:0') reward: tensor(0.9008, device='cuda:0') factor: tensor(0.4764, device='cuda:0')


Epoch 1, Step 3: Loss = 0.4389, Actor Loss = 0.0595, Critic Loss = 0.7602, Entropy = 0.6845, Returns = 0.8629, Value = 1.2174, mask_loss = 0.4400std_loss = 0.0260:   0%|          | 3/5475 [00:51<25:39:59, 16.89s/it]

ratio tensor(0.7386, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3537, device='cuda:0') reward: tensor(0.8954, device='cuda:0') factor: tensor(0.4583, device='cuda:0')
reward_raw: tensor(0.3006, device='cuda:0') reward: tensor(0.8740, device='cuda:0') factor: tensor(0.4266, device='cuda:0')
reward_raw: tensor(0.3643, device='cuda:0') reward: tensor(0.9077, device='cuda:0') factor: tensor(0.4567, device='cuda:0')
reward_raw: tensor(0.3182, device='cuda:0') reward: tensor(0.8917, device='cuda:0') factor: tensor(0.4265, device='cuda:0')
reward_raw: tensor(0.3507, device='cuda:0') reward: tensor(0.9028, device='cuda:0') factor: tensor(0.4479, device='cuda:0')


Epoch 1, Step 4: Loss = 0.0734, Actor Loss = -0.0875, Critic Loss = 0.3231, Entropy = 0.6792, Returns = 0.9199, Value = 0.5854, mask_loss = 0.4227std_loss = 0.0245:   0%|          | 4/5475 [01:09<26:23:12, 17.36s/it]

ratio tensor(1.8308, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3175, device='cuda:0') reward: tensor(0.8837, device='cuda:0') factor: tensor(0.4339, device='cuda:0')
reward_raw: tensor(0.2587, device='cuda:0') reward: tensor(0.8251, device='cuda:0') factor: tensor(0.4336, device='cuda:0')
reward_raw: tensor(0.3091, device='cuda:0') reward: tensor(0.8728, device='cuda:0') factor: tensor(0.4363, device='cuda:0')
reward_raw: tensor(0.3554, device='cuda:0') reward: tensor(0.9162, device='cuda:0') factor: tensor(0.4392, device='cuda:0')
reward_raw: tensor(0.3407, device='cuda:0') reward: tensor(0.8880, device='cuda:0') factor: tensor(0.4527, device='cuda:0')


Epoch 1, Step 5: Loss = 0.0785, Actor Loss = -0.0752, Critic Loss = 0.3089, Entropy = 0.6836, Returns = 0.8178, Value = 1.0522, mask_loss = 0.4386std_loss = 0.0263:   0%|          | 5/5475 [01:22<24:18:08, 15.99s/it]

ratio tensor(1.3045, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3822, device='cuda:0') reward: tensor(0.9356, device='cuda:0') factor: tensor(0.4466, device='cuda:0')
reward_raw: tensor(0.3987, device='cuda:0') reward: tensor(0.9454, device='cuda:0') factor: tensor(0.4533, device='cuda:0')
reward_raw: tensor(0.3731, device='cuda:0') reward: tensor(0.9243, device='cuda:0') factor: tensor(0.4488, device='cuda:0')
reward_raw: tensor(0.4141, device='cuda:0') reward: tensor(0.9635, device='cuda:0') factor: tensor(0.4506, device='cuda:0')
reward_raw: tensor(0.3012, device='cuda:0') reward: tensor(0.8568, device='cuda:0') factor: tensor(0.4444, device='cuda:0')


Epoch 1, Step 6: Loss = 0.4289, Actor Loss = 0.1804, Critic Loss = 0.4985, Entropy = 0.6819, Returns = 0.9864, Value = 0.6035, mask_loss = 0.4364std_loss = 0.0299:   0%|          | 6/5475 [01:43<26:50:38, 17.67s/it] 

ratio tensor(1.1921, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3723, device='cuda:0') reward: tensor(0.9211, device='cuda:0') factor: tensor(0.4512, device='cuda:0')
reward_raw: tensor(0.3155, device='cuda:0') reward: tensor(0.8837, device='cuda:0') factor: tensor(0.4318, device='cuda:0')
reward_raw: tensor(0.3927, device='cuda:0') reward: tensor(0.9635, device='cuda:0') factor: tensor(0.4292, device='cuda:0')
reward_raw: tensor(0.3049, device='cuda:0') reward: tensor(0.8886, device='cuda:0') factor: tensor(0.4163, device='cuda:0')
reward_raw: tensor(0.3579, device='cuda:0') reward: tensor(0.9044, device='cuda:0') factor: tensor(0.4535, device='cuda:0')


Epoch 1, Step 7: Loss = -0.3245, Actor Loss = -0.4728, Critic Loss = 0.2979, Entropy = 0.6811, Returns = 0.8409, Value = 1.2305, mask_loss = 0.4372std_loss = 0.0354:   0%|          | 7/5475 [01:57<25:00:07, 16.46s/it]

ratio tensor(1.9151, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3859, device='cuda:0') reward: tensor(0.9438, device='cuda:0') factor: tensor(0.4420, device='cuda:0')
reward_raw: tensor(0.3465, device='cuda:0') reward: tensor(0.9216, device='cuda:0') factor: tensor(0.4249, device='cuda:0')
reward_raw: tensor(0.3452, device='cuda:0') reward: tensor(0.9139, device='cuda:0') factor: tensor(0.4314, device='cuda:0')
reward_raw: tensor(0.3488, device='cuda:0') reward: tensor(0.9116, device='cuda:0') factor: tensor(0.4373, device='cuda:0')
reward_raw: tensor(0.4115, device='cuda:0') reward: tensor(0.9512, device='cuda:0') factor: tensor(0.4602, device='cuda:0')


Epoch 1, Step 8: Loss = 0.6151, Actor Loss = 0.4072, Critic Loss = 0.4172, Entropy = 0.6846, Returns = 0.9192, Value = 0.9401, mask_loss = 0.4496std_loss = 0.0352:   0%|          | 8/5475 [02:15<25:51:15, 17.02s/it]  

ratio tensor(1.9655, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4052, device='cuda:0') reward: tensor(0.9722, device='cuda:0') factor: tensor(0.4330, device='cuda:0')
reward_raw: tensor(0.3935, device='cuda:0') reward: tensor(0.9393, device='cuda:0') factor: tensor(0.4542, device='cuda:0')
reward_raw: tensor(0.3994, device='cuda:0') reward: tensor(0.9605, device='cuda:0') factor: tensor(0.4389, device='cuda:0')
reward_raw: tensor(0.3910, device='cuda:0') reward: tensor(0.9563, device='cuda:0') factor: tensor(0.4347, device='cuda:0')
reward_raw: tensor(0.3680, device='cuda:0') reward: tensor(0.9441, device='cuda:0') factor: tensor(0.4239, device='cuda:0')


Epoch 1, Step 9: Loss = 0.0111, Actor Loss = -0.0605, Critic Loss = 0.1445, Entropy = 0.6698, Returns = 0.9428, Value = 0.8096, mask_loss = 0.4140std_loss = 0.0605:   0%|          | 9/5475 [02:28<23:58:03, 15.79s/it]

ratio tensor(1.1546, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3666, device='cuda:0') reward: tensor(0.9290, device='cuda:0') factor: tensor(0.4376, device='cuda:0')
reward_raw: tensor(0.3129, device='cuda:0') reward: tensor(0.8773, device='cuda:0') factor: tensor(0.4357, device='cuda:0')
reward_raw: tensor(0.3682, device='cuda:0') reward: tensor(0.9194, device='cuda:0') factor: tensor(0.4488, device='cuda:0')
reward_raw: tensor(0.4093, device='cuda:0') reward: tensor(0.9698, device='cuda:0') factor: tensor(0.4395, device='cuda:0')
reward_raw: tensor(0.3721, device='cuda:0') reward: tensor(0.9570, device='cuda:0') factor: tensor(0.4151, device='cuda:0')


Epoch 1, Step 10: Loss = 0.1955, Actor Loss = 0.1070, Critic Loss = 0.1784, Entropy = 0.6775, Returns = 0.8952, Value = 1.0087, mask_loss = 0.4428std_loss = 0.0618:   0%|          | 10/5475 [02:43<23:16:08, 15.33s/it]

ratio tensor(1.0494, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3661, device='cuda:0') reward: tensor(0.9373, device='cuda:0') factor: tensor(0.4288, device='cuda:0')
reward_raw: tensor(0.3881, device='cuda:0') reward: tensor(0.9658, device='cuda:0') factor: tensor(0.4224, device='cuda:0')
reward_raw: tensor(0.3944, device='cuda:0') reward: tensor(0.9469, device='cuda:0') factor: tensor(0.4476, device='cuda:0')
reward_raw: tensor(0.4028, device='cuda:0') reward: tensor(0.9709, device='cuda:0') factor: tensor(0.4319, device='cuda:0')
reward_raw: tensor(0.3762, device='cuda:0') reward: tensor(0.9481, device='cuda:0') factor: tensor(0.4282, device='cuda:0')


Epoch 1, Step 11: Loss = 0.1425, Actor Loss = 0.0767, Critic Loss = 0.1329, Entropy = 0.6790, Returns = 0.9931, Value = 0.9699, mask_loss = 0.4534std_loss = 0.0658:   0%|          | 11/5475 [03:01<24:42:40, 16.28s/it]

ratio tensor(1.0359, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.3624, device='cuda:0') reward: tensor(0.9317, device='cuda:0') factor: tensor(0.4307, device='cuda:0')
reward_raw: tensor(0.4381, device='cuda:0') reward: tensor(0.9930, device='cuda:0') factor: tensor(0.4451, device='cuda:0')
reward_raw: tensor(0.4397, device='cuda:0') reward: tensor(1.0061, device='cuda:0') factor: tensor(0.4336, device='cuda:0')
reward_raw: tensor(0.4154, device='cuda:0') reward: tensor(0.9737, device='cuda:0') factor: tensor(0.4417, device='cuda:0')
reward_raw: tensor(0.4697, device='cuda:0') reward: tensor(1.0266, device='cuda:0') factor: tensor(0.4431, device='cuda:0')


Epoch 1, Step 12: Loss = -0.2580, Actor Loss = -0.3131, Critic Loss = 0.1115, Entropy = 0.6799, Returns = 0.9690, Value = 1.1097, mask_loss = 0.4755std_loss = 0.0745:   0%|          | 12/5475 [03:15<23:48:08, 15.69s/it]

ratio tensor(0.9924, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4375, device='cuda:0') reward: tensor(0.9478, device='cuda:0') factor: tensor(0.4897, device='cuda:0')
reward_raw: tensor(0.4455, device='cuda:0') reward: tensor(0.9894, device='cuda:0') factor: tensor(0.4561, device='cuda:0')
reward_raw: tensor(0.4503, device='cuda:0') reward: tensor(0.9733, device='cuda:0') factor: tensor(0.4770, device='cuda:0')
reward_raw: tensor(0.4760, device='cuda:0') reward: tensor(0.9924, device='cuda:0') factor: tensor(0.4836, device='cuda:0')
reward_raw: tensor(0.3876, device='cuda:0') reward: tensor(0.9243, device='cuda:0') factor: tensor(0.4633, device='cuda:0')


Epoch 1, Step 13: Loss = 0.4218, Actor Loss = 0.3234, Critic Loss = 0.1981, Entropy = 0.6771, Returns = 0.9166, Value = 0.8714, mask_loss = 0.5069std_loss = 0.0755:   0%|          | 13/5475 [03:28<22:25:11, 14.78s/it]  

ratio tensor(0.4832, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4492, device='cuda:0') reward: tensor(0.9608, device='cuda:0') factor: tensor(0.4883, device='cuda:0')
reward_raw: tensor(0.4675, device='cuda:0') reward: tensor(0.9649, device='cuda:0') factor: tensor(0.5026, device='cuda:0')
reward_raw: tensor(0.4704, device='cuda:0') reward: tensor(0.9743, device='cuda:0') factor: tensor(0.4961, device='cuda:0')
reward_raw: tensor(0.4734, device='cuda:0') reward: tensor(0.9845, device='cuda:0') factor: tensor(0.4890, device='cuda:0')
reward_raw: tensor(0.4565, device='cuda:0') reward: tensor(0.9746, device='cuda:0') factor: tensor(0.4819, device='cuda:0')


Epoch 1, Step 14: Loss = 0.0771, Actor Loss = -0.1855, Critic Loss = 0.5265, Entropy = 0.6773, Returns = 0.9817, Value = 1.2788, mask_loss = 0.4836std_loss = 0.0795:   0%|          | 14/5475 [03:47<24:29:16, 16.14s/it]

ratio tensor(2.3757, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4509, device='cuda:0') reward: tensor(0.9502, device='cuda:0') factor: tensor(0.5007, device='cuda:0')
reward_raw: tensor(0.4620, device='cuda:0') reward: tensor(0.9611, device='cuda:0') factor: tensor(0.5009, device='cuda:0')
reward_raw: tensor(0.4652, device='cuda:0') reward: tensor(0.9719, device='cuda:0') factor: tensor(0.4933, device='cuda:0')
reward_raw: tensor(0.5325, device='cuda:0') reward: tensor(1.0211, device='cuda:0') factor: tensor(0.5114, device='cuda:0')
reward_raw: tensor(0.4550, device='cuda:0') reward: tensor(0.9474, device='cuda:0') factor: tensor(0.5076, device='cuda:0')


Epoch 1, Step 15: Loss = 0.1698, Actor Loss = 0.0985, Critic Loss = 0.1440, Entropy = 0.6768, Returns = 1.0048, Value = 0.7946, mask_loss = 0.5194std_loss = 0.0823:   0%|          | 15/5475 [04:05<25:08:48, 16.58s/it] 

ratio tensor(1.0222, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4583, device='cuda:0') reward: tensor(0.9211, device='cuda:0') factor: tensor(0.5371, device='cuda:0')
reward_raw: tensor(0.4341, device='cuda:0') reward: tensor(0.9206, device='cuda:0') factor: tensor(0.5135, device='cuda:0')
reward_raw: tensor(0.4445, device='cuda:0') reward: tensor(0.9050, device='cuda:0') factor: tensor(0.5394, device='cuda:0')
reward_raw: tensor(0.3795, device='cuda:0') reward: tensor(0.8686, device='cuda:0') factor: tensor(0.5109, device='cuda:0')
reward_raw: tensor(0.4631, device='cuda:0') reward: tensor(0.9280, device='cuda:0') factor: tensor(0.5351, device='cuda:0')


Epoch 1, Step 16: Loss = -0.4093, Actor Loss = -0.5184, Critic Loss = 0.2195, Entropy = 0.6740, Returns = 0.9711, Value = 0.8878, mask_loss = 0.5286std_loss = 0.0861:   0%|          | 16/5475 [04:19<24:03:03, 15.86s/it]

ratio tensor(3.6289, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4246, device='cuda:0') reward: tensor(0.9429, device='cuda:0') factor: tensor(0.4818, device='cuda:0')
reward_raw: tensor(0.4180, device='cuda:0') reward: tensor(0.9269, device='cuda:0') factor: tensor(0.4912, device='cuda:0')
reward_raw: tensor(0.4215, device='cuda:0') reward: tensor(0.9370, device='cuda:0') factor: tensor(0.4845, device='cuda:0')
reward_raw: tensor(0.4003, device='cuda:0') reward: tensor(0.9060, device='cuda:0') factor: tensor(0.4943, device='cuda:0')
reward_raw: tensor(0.4205, device='cuda:0') reward: tensor(0.9284, device='cuda:0') factor: tensor(0.4922, device='cuda:0')


Epoch 1, Step 17: Loss = 0.4424, Actor Loss = 0.3876, Critic Loss = 0.1108, Entropy = 0.6518, Returns = 0.8822, Value = 0.7744, mask_loss = 0.3934std_loss = 0.0882:   0%|          | 17/5475 [04:31<22:14:02, 14.67s/it]  

ratio tensor(0.2140, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3813, device='cuda:0') reward: tensor(0.9991, device='cuda:0') factor: tensor(0.3822, device='cuda:0')
reward_raw: tensor(0.3935, device='cuda:0') reward: tensor(1.0019, device='cuda:0') factor: tensor(0.3916, device='cuda:0')
reward_raw: tensor(0.4112, device='cuda:0') reward: tensor(1.0279, device='cuda:0') factor: tensor(0.3833, device='cuda:0')
reward_raw: tensor(0.4302, device='cuda:0') reward: tensor(1.0399, device='cuda:0') factor: tensor(0.3903, device='cuda:0')
reward_raw: tensor(0.3911, device='cuda:0') reward: tensor(1.0019, device='cuda:0') factor: tensor(0.3892, device='cuda:0')


Epoch 1, Step 18: Loss = -0.1203, Actor Loss = -0.1716, Critic Loss = 0.1039, Entropy = 0.6395, Returns = 1.0026, Value = 1.0535, mask_loss = 0.3845std_loss = 0.0970:   0%|          | 18/5475 [04:44<21:31:11, 14.20s/it]

ratio tensor(2.6953, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3363, device='cuda:0') reward: tensor(0.9786, device='cuda:0') factor: tensor(0.3577, device='cuda:0')
reward_raw: tensor(0.3486, device='cuda:0') reward: tensor(0.9761, device='cuda:0') factor: tensor(0.3724, device='cuda:0')
reward_raw: tensor(0.3278, device='cuda:0') reward: tensor(0.9562, device='cuda:0') factor: tensor(0.3715, device='cuda:0')
reward_raw: tensor(0.3442, device='cuda:0') reward: tensor(0.9780, device='cuda:0') factor: tensor(0.3663, device='cuda:0')
reward_raw: tensor(0.3898, device='cuda:0') reward: tensor(1.0158, device='cuda:0') factor: tensor(0.3740, device='cuda:0')


Epoch 1, Step 19: Loss = 0.1185, Actor Loss = -0.0140, Critic Loss = 0.2662, Entropy = 0.6330, Returns = 0.9816, Value = 1.1164, mask_loss = 0.3645std_loss = 0.0964:   0%|          | 19/5475 [05:08<25:58:25, 17.14s/it] 

ratio tensor(1.4457, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3679, device='cuda:0') reward: tensor(1.0106, device='cuda:0') factor: tensor(0.3572, device='cuda:0')
reward_raw: tensor(0.3390, device='cuda:0') reward: tensor(1.0123, device='cuda:0') factor: tensor(0.3267, device='cuda:0')
reward_raw: tensor(0.3932, device='cuda:0') reward: tensor(1.0392, device='cuda:0') factor: tensor(0.3540, device='cuda:0')
reward_raw: tensor(0.3037, device='cuda:0') reward: tensor(0.9581, device='cuda:0') factor: tensor(0.3456, device='cuda:0')
reward_raw: tensor(0.3054, device='cuda:0') reward: tensor(0.9744, device='cuda:0') factor: tensor(0.3310, device='cuda:0')


Epoch 1, Step 20: Loss = 0.2129, Actor Loss = 0.0771, Critic Loss = 0.2727, Entropy = 0.6109, Returns = 0.9342, Value = 0.7124, mask_loss = 0.3249std_loss = 0.0880:   0%|          | 20/5475 [05:25<25:45:40, 17.00s/it] 

ratio tensor(0.7592, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.2750, device='cuda:0') reward: tensor(0.9451, device='cuda:0') factor: tensor(0.3299, device='cuda:0')
reward_raw: tensor(0.2859, device='cuda:0') reward: tensor(0.9532, device='cuda:0') factor: tensor(0.3327, device='cuda:0')
reward_raw: tensor(0.2799, device='cuda:0') reward: tensor(0.9466, device='cuda:0') factor: tensor(0.3333, device='cuda:0')
reward_raw: tensor(0.2786, device='cuda:0') reward: tensor(0.9379, device='cuda:0') factor: tensor(0.3406, device='cuda:0')
reward_raw: tensor(0.2716, device='cuda:0') reward: tensor(0.9631, device='cuda:0') factor: tensor(0.3085, device='cuda:0')


Epoch 1, Step 21: Loss = -0.2671, Actor Loss = -0.3141, Critic Loss = 0.0953, Entropy = 0.6064, Returns = 0.9428, Value = 1.0743, mask_loss = 0.3245std_loss = 0.0987:   0%|          | 20/5475 [05:45<25:45:40, 17.00s/it]

ratio tensor(1.7057, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 21: Loss = -0.2671, Actor Loss = -0.3141, Critic Loss = 0.0953, Entropy = 0.6064, Returns = 0.9428, Value = 1.0743, mask_loss = 0.3245std_loss = 0.0987:   0%|          | 21/5475 [05:45<27:18:58, 18.03s/it]

reward_raw: tensor(0.3105, device='cuda:0') reward: tensor(1.0036, device='cuda:0') factor: tensor(0.3069, device='cuda:0')
reward_raw: tensor(0.3334, device='cuda:0') reward: tensor(1.0162, device='cuda:0') factor: tensor(0.3172, device='cuda:0')
reward_raw: tensor(0.3744, device='cuda:0') reward: tensor(1.0576, device='cuda:0') factor: tensor(0.3168, device='cuda:0')
reward_raw: tensor(0.3707, device='cuda:0') reward: tensor(1.0414, device='cuda:0') factor: tensor(0.3293, device='cuda:0')
reward_raw: tensor(0.3615, device='cuda:0') reward: tensor(1.0450, device='cuda:0') factor: tensor(0.3166, device='cuda:0')


Epoch 1, Step 22: Loss = 0.2922, Actor Loss = 0.2406, Critic Loss = 0.1045, Entropy = 0.6314, Returns = 1.0226, Value = 0.8608, mask_loss = 0.3627std_loss = 0.0975:   0%|          | 22/5475 [06:00<25:42:02, 16.97s/it]  

ratio tensor(0.2522, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3833, device='cuda:0') reward: tensor(1.0149, device='cuda:0') factor: tensor(0.3685, device='cuda:0')
reward_raw: tensor(0.3870, device='cuda:0') reward: tensor(1.0351, device='cuda:0') factor: tensor(0.3519, device='cuda:0')
reward_raw: tensor(0.4232, device='cuda:0') reward: tensor(1.0508, device='cuda:0') factor: tensor(0.3724, device='cuda:0')
reward_raw: tensor(0.3676, device='cuda:0') reward: tensor(0.9937, device='cuda:0') factor: tensor(0.3739, device='cuda:0')
reward_raw: tensor(0.3722, device='cuda:0') reward: tensor(0.9902, device='cuda:0') factor: tensor(0.3819, device='cuda:0')


Epoch 1, Step 23: Loss = -0.2532, Actor Loss = -0.3064, Critic Loss = 0.1076, Entropy = 0.6296, Returns = 1.0849, Value = 1.0267, mask_loss = 0.3623std_loss = 0.1025:   0%|          | 23/5475 [06:17<25:58:33, 17.15s/it]

ratio tensor(1.6891, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3337, device='cuda:0') reward: tensor(0.9797, device='cuda:0') factor: tensor(0.3540, device='cuda:0')
reward_raw: tensor(0.3043, device='cuda:0') reward: tensor(0.9695, device='cuda:0') factor: tensor(0.3349, device='cuda:0')
reward_raw: tensor(0.2915, device='cuda:0') reward: tensor(0.9422, device='cuda:0') factor: tensor(0.3494, device='cuda:0')
reward_raw: tensor(0.3014, device='cuda:0') reward: tensor(0.9580, device='cuda:0') factor: tensor(0.3434, device='cuda:0')
reward_raw: tensor(0.3106, device='cuda:0') reward: tensor(0.9580, device='cuda:0') factor: tensor(0.3527, device='cuda:0')


Epoch 1, Step 24: Loss = 0.3182, Actor Loss = 0.2529, Critic Loss = 0.1318, Entropy = 0.6204, Returns = 0.9418, Value = 1.0199, mask_loss = 0.3445std_loss = 0.0914:   0%|          | 24/5475 [06:35<26:07:07, 17.25s/it]  

ratio tensor(4.9815, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.3340, device='cuda:0') reward: tensor(1.0450, device='cuda:0') factor: tensor(0.2890, device='cuda:0')
reward_raw: tensor(0.3712, device='cuda:0') reward: tensor(1.0667, device='cuda:0') factor: tensor(0.3045, device='cuda:0')
reward_raw: tensor(0.3667, device='cuda:0') reward: tensor(1.0504, device='cuda:0') factor: tensor(0.3163, device='cuda:0')
reward_raw: tensor(0.2954, device='cuda:0') reward: tensor(0.9862, device='cuda:0') factor: tensor(0.3092, device='cuda:0')
reward_raw: tensor(0.3262, device='cuda:0') reward: tensor(1.0051, device='cuda:0') factor: tensor(0.3210, device='cuda:0')


Epoch 1, Step 25: Loss = 0.3400, Actor Loss = 0.2730, Critic Loss = 0.1352, Entropy = 0.5990, Returns = 1.0329, Value = 1.2104, mask_loss = 0.3298std_loss = 0.1107:   0%|          | 25/5475 [06:57<28:18:19, 18.70s/it]

ratio tensor(0.7417, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4082, device='cuda:0') reward: tensor(1.0861, device='cuda:0') factor: tensor(0.3221, device='cuda:0')
reward_raw: tensor(0.3563, device='cuda:0') reward: tensor(1.0438, device='cuda:0') factor: tensor(0.3125, device='cuda:0')
reward_raw: tensor(0.3410, device='cuda:0') reward: tensor(1.0178, device='cuda:0') factor: tensor(0.3232, device='cuda:0')
reward_raw: tensor(0.3622, device='cuda:0') reward: tensor(1.0564, device='cuda:0') factor: tensor(0.3058, device='cuda:0')
reward_raw: tensor(0.4066, device='cuda:0') reward: tensor(1.0787, device='cuda:0') factor: tensor(0.3279, device='cuda:0')


Epoch 1, Step 26: Loss = -0.3939, Actor Loss = -0.4661, Critic Loss = 0.1458, Entropy = 0.6083, Returns = 1.0711, Value = 1.0548, mask_loss = 0.3429std_loss = 0.1160:   0%|          | 26/5475 [07:13<27:06:47, 17.91s/it]

ratio tensor(2.9329, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4241, device='cuda:0') reward: tensor(1.0638, device='cuda:0') factor: tensor(0.3604, device='cuda:0')
reward_raw: tensor(0.3452, device='cuda:0') reward: tensor(0.9866, device='cuda:0') factor: tensor(0.3586, device='cuda:0')
reward_raw: tensor(0.4062, device='cuda:0') reward: tensor(1.0304, device='cuda:0') factor: tensor(0.3757, device='cuda:0')
reward_raw: tensor(0.4158, device='cuda:0') reward: tensor(1.0641, device='cuda:0') factor: tensor(0.3517, device='cuda:0')
reward_raw: tensor(0.4055, device='cuda:0') reward: tensor(1.0334, device='cuda:0') factor: tensor(0.3721, device='cuda:0')


Epoch 1, Step 27: Loss = 0.5595, Actor Loss = 0.4075, Critic Loss = 0.3053, Entropy = 0.6514, Returns = 1.0531, Value = 0.8021, mask_loss = 0.5928std_loss = 0.0982:   0%|          | 27/5475 [07:25<24:16:24, 16.04s/it]  

ratio tensor(3.1433e-05, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4423, device='cuda:0') reward: tensor(0.8489, device='cuda:0') factor: tensor(0.5935, device='cuda:0')
reward_raw: tensor(0.5405, device='cuda:0') reward: tensor(0.9532, device='cuda:0') factor: tensor(0.5874, device='cuda:0')
reward_raw: tensor(0.5928, device='cuda:0') reward: tensor(1.0129, device='cuda:0') factor: tensor(0.5799, device='cuda:0')
reward_raw: tensor(0.5468, device='cuda:0') reward: tensor(0.9671, device='cuda:0') factor: tensor(0.5797, device='cuda:0')
reward_raw: tensor(0.5173, device='cuda:0') reward: tensor(0.9429, device='cuda:0') factor: tensor(0.5744, device='cuda:0')


Epoch 1, Step 28: Loss = 0.1810, Actor Loss = 0.0222, Critic Loss = 0.3190, Entropy = 0.6601, Returns = 0.9313, Value = 1.3133, mask_loss = 0.5785std_loss = 0.0933:   1%|          | 28/5475 [07:40<24:02:20, 15.89s/it]

ratio tensor(1.1842, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4766, device='cuda:0') reward: tensor(0.8854, device='cuda:0') factor: tensor(0.5912, device='cuda:0')
reward_raw: tensor(0.5535, device='cuda:0') reward: tensor(0.9667, device='cuda:0') factor: tensor(0.5868, device='cuda:0')
reward_raw: tensor(0.5696, device='cuda:0') reward: tensor(0.9877, device='cuda:0') factor: tensor(0.5820, device='cuda:0')
reward_raw: tensor(0.5632, device='cuda:0') reward: tensor(0.9836, device='cuda:0') factor: tensor(0.5796, device='cuda:0')
reward_raw: tensor(0.5389, device='cuda:0') reward: tensor(0.9451, device='cuda:0') factor: tensor(0.5938, device='cuda:0')


Epoch 1, Step 29: Loss = 0.6435, Actor Loss = 0.5650, Critic Loss = 0.1583, Entropy = 0.6623, Returns = 0.9383, Value = 0.7983, mask_loss = 0.5524std_loss = 0.1072:   1%|          | 29/5475 [07:53<22:30:44, 14.88s/it]

ratio tensor(1.3032, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4698, device='cuda:0') reward: tensor(0.9015, device='cuda:0') factor: tensor(0.5683, device='cuda:0')
reward_raw: tensor(0.4677, device='cuda:0') reward: tensor(0.9146, device='cuda:0') factor: tensor(0.5531, device='cuda:0')
reward_raw: tensor(0.5474, device='cuda:0') reward: tensor(0.9537, device='cuda:0') factor: tensor(0.5937, device='cuda:0')
reward_raw: tensor(0.4833, device='cuda:0') reward: tensor(0.9216, device='cuda:0') factor: tensor(0.5617, device='cuda:0')
reward_raw: tensor(0.4620, device='cuda:0') reward: tensor(0.9160, device='cuda:0') factor: tensor(0.5461, device='cuda:0')


Epoch 1, Step 30: Loss = -0.0390, Actor Loss = -0.0929, Critic Loss = 0.1092, Entropy = 0.6423, Returns = 0.9080, Value = 0.9125, mask_loss = 0.5711std_loss = 0.1348:   1%|          | 30/5475 [08:08<22:51:28, 15.11s/it]

ratio tensor(0.6039, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6414, device='cuda:0') reward: tensor(1.0671, device='cuda:0') factor: tensor(0.5743, device='cuda:0')
reward_raw: tensor(0.5490, device='cuda:0') reward: tensor(0.9695, device='cuda:0') factor: tensor(0.5796, device='cuda:0')
reward_raw: tensor(0.5750, device='cuda:0') reward: tensor(1.0051, device='cuda:0') factor: tensor(0.5699, device='cuda:0')
reward_raw: tensor(0.5996, device='cuda:0') reward: tensor(1.0262, device='cuda:0') factor: tensor(0.5734, device='cuda:0')
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(0.9676, device='cuda:0') factor: tensor(0.5827, device='cuda:0')


Epoch 1, Step 31: Loss = 0.4174, Actor Loss = 0.3633, Critic Loss = 0.1096, Entropy = 0.6271, Returns = 1.0355, Value = 0.9740, mask_loss = 0.6124std_loss = 0.1341:   1%|          | 31/5475 [08:24<23:00:06, 15.21s/it]  

ratio tensor(0.6101, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.5721, device='cuda:0') reward: tensor(0.9379, device='cuda:0') factor: tensor(0.6342, device='cuda:0')
reward_raw: tensor(0.5706, device='cuda:0') reward: tensor(0.9454, device='cuda:0') factor: tensor(0.6252, device='cuda:0')
reward_raw: tensor(0.5665, device='cuda:0') reward: tensor(0.9531, device='cuda:0') factor: tensor(0.6134, device='cuda:0')
reward_raw: tensor(0.5319, device='cuda:0') reward: tensor(0.9088, device='cuda:0') factor: tensor(0.6231, device='cuda:0')
reward_raw: tensor(0.4923, device='cuda:0') reward: tensor(0.8813, device='cuda:0') factor: tensor(0.6110, device='cuda:0')


Epoch 1, Step 32: Loss = -0.2206, Actor Loss = -0.3025, Critic Loss = 0.1650, Entropy = 0.6012, Returns = 0.8993, Value = 1.1020, mask_loss = 0.6183std_loss = 0.1614:   1%|          | 32/5475 [08:42<24:09:39, 15.98s/it]

ratio tensor(1.7328, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5177, device='cuda:0') reward: tensor(0.8822, device='cuda:0') factor: tensor(0.6355, device='cuda:0')
reward_raw: tensor(0.5320, device='cuda:0') reward: tensor(0.9098, device='cuda:0') factor: tensor(0.6223, device='cuda:0')
reward_raw: tensor(0.5394, device='cuda:0') reward: tensor(0.9058, device='cuda:0') factor: tensor(0.6336, device='cuda:0')
reward_raw: tensor(0.5158, device='cuda:0') reward: tensor(0.8935, device='cuda:0') factor: tensor(0.6223, device='cuda:0')
reward_raw: tensor(0.5480, device='cuda:0') reward: tensor(0.9258, device='cuda:0') factor: tensor(0.6222, device='cuda:0')


Epoch 1, Step 33: Loss = 0.4183, Actor Loss = 0.3074, Critic Loss = 0.2229, Entropy = 0.5710, Returns = 0.9198, Value = 0.7311, mask_loss = 0.6704std_loss = 0.1581:   1%|          | 33/5475 [08:55<22:57:41, 15.19s/it]  

ratio tensor(0.5003, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5545, device='cuda:0') reward: tensor(0.8738, device='cuda:0') factor: tensor(0.6807, device='cuda:0')
reward_raw: tensor(0.5840, device='cuda:0') reward: tensor(0.9247, device='cuda:0') factor: tensor(0.6593, device='cuda:0')
reward_raw: tensor(0.6000, device='cuda:0') reward: tensor(0.9356, device='cuda:0') factor: tensor(0.6644, device='cuda:0')
reward_raw: tensor(0.5775, device='cuda:0') reward: tensor(0.9134, device='cuda:0') factor: tensor(0.6641, device='cuda:0')
reward_raw: tensor(0.5932, device='cuda:0') reward: tensor(0.9385, device='cuda:0') factor: tensor(0.6546, device='cuda:0')


Epoch 1, Step 34: Loss = -0.0178, Actor Loss = -0.1565, Critic Loss = 0.2786, Entropy = 0.5730, Returns = 0.9201, Value = 1.0181, mask_loss = 0.6646std_loss = 0.1580:   1%|          | 34/5475 [09:13<24:26:21, 16.17s/it]

ratio tensor(2.0810, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6563, device='cuda:0') reward: tensor(1.0500, device='cuda:0') factor: tensor(0.6063, device='cuda:0')
reward_raw: tensor(0.5932, device='cuda:0') reward: tensor(0.9844, device='cuda:0') factor: tensor(0.6088, device='cuda:0')
reward_raw: tensor(0.6203, device='cuda:0') reward: tensor(1.0057, device='cuda:0') factor: tensor(0.6146, device='cuda:0')
reward_raw: tensor(0.6136, device='cuda:0') reward: tensor(0.9977, device='cuda:0') factor: tensor(0.6159, device='cuda:0')
reward_raw: tensor(0.6032, device='cuda:0') reward: tensor(0.9925, device='cuda:0') factor: tensor(0.6106, device='cuda:0')


Epoch 1, Step 35: Loss = -0.3079, Actor Loss = -0.4022, Critic Loss = 0.1899, Entropy = 0.6215, Returns = 0.9769, Value = 1.1099, mask_loss = 0.5792std_loss = 0.1608:   1%|          | 35/5475 [09:35<26:46:50, 17.72s/it]

ratio tensor(1.4600, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6285, device='cuda:0') reward: tensor(1.0215, device='cuda:0') factor: tensor(0.6071, device='cuda:0')
reward_raw: tensor(0.6235, device='cuda:0') reward: tensor(1.0250, device='cuda:0') factor: tensor(0.5985, device='cuda:0')
reward_raw: tensor(0.6390, device='cuda:0') reward: tensor(1.0414, device='cuda:0') factor: tensor(0.5976, device='cuda:0')
reward_raw: tensor(0.6387, device='cuda:0') reward: tensor(1.0401, device='cuda:0') factor: tensor(0.5987, device='cuda:0')
reward_raw: tensor(0.6919, device='cuda:0') reward: tensor(1.0838, device='cuda:0') factor: tensor(0.6081, device='cuda:0')


Epoch 1, Step 36: Loss = -0.1110, Actor Loss = -0.2392, Critic Loss = 0.2577, Entropy = 0.6039, Returns = 0.9677, Value = 0.7053, mask_loss = 0.6241std_loss = 0.1491:   1%|          | 36/5475 [09:48<24:41:24, 16.34s/it]

ratio tensor(1.1561, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5682, device='cuda:0') reward: tensor(0.9491, device='cuda:0') factor: tensor(0.6191, device='cuda:0')
reward_raw: tensor(0.5640, device='cuda:0') reward: tensor(0.9522, device='cuda:0') factor: tensor(0.6118, device='cuda:0')
reward_raw: tensor(0.5389, device='cuda:0') reward: tensor(0.9301, device='cuda:0') factor: tensor(0.6088, device='cuda:0')
reward_raw: tensor(0.5619, device='cuda:0') reward: tensor(0.9469, device='cuda:0') factor: tensor(0.6150, device='cuda:0')
reward_raw: tensor(0.5881, device='cuda:0') reward: tensor(0.9699, device='cuda:0') factor: tensor(0.6182, device='cuda:0')


Epoch 1, Step 37: Loss = 0.2786, Actor Loss = 0.1962, Critic Loss = 0.1659, Entropy = 0.5949, Returns = 0.9656, Value = 0.6720, mask_loss = 0.6143std_loss = 0.1761:   1%|          | 37/5475 [10:04<24:34:06, 16.26s/it]  

ratio tensor(0.8209, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6534, device='cuda:0') reward: tensor(1.0025, device='cuda:0') factor: tensor(0.6509, device='cuda:0')
reward_raw: tensor(0.6588, device='cuda:0') reward: tensor(1.0021, device='cuda:0') factor: tensor(0.6567, device='cuda:0')
reward_raw: tensor(0.6654, device='cuda:0') reward: tensor(1.0219, device='cuda:0') factor: tensor(0.6434, device='cuda:0')
reward_raw: tensor(0.6016, device='cuda:0') reward: tensor(0.9563, device='cuda:0') factor: tensor(0.6453, device='cuda:0')
reward_raw: tensor(0.5983, device='cuda:0') reward: tensor(0.9549, device='cuda:0') factor: tensor(0.6434, device='cuda:0')


Epoch 1, Step 38: Loss = -0.6404, Actor Loss = -0.7020, Critic Loss = 0.1245, Entropy = 0.5879, Returns = 1.0345, Value = 1.1938, mask_loss = 0.6359std_loss = 0.1679:   1%|          | 38/5475 [10:20<24:17:59, 16.09s/it]

ratio tensor(3.0525, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6449, device='cuda:0') reward: tensor(0.9739, device='cuda:0') factor: tensor(0.6709, device='cuda:0')
reward_raw: tensor(0.5917, device='cuda:0') reward: tensor(0.9372, device='cuda:0') factor: tensor(0.6545, device='cuda:0')
reward_raw: tensor(0.6456, device='cuda:0') reward: tensor(0.9832, device='cuda:0') factor: tensor(0.6624, device='cuda:0')
reward_raw: tensor(0.6614, device='cuda:0') reward: tensor(1.0145, device='cuda:0') factor: tensor(0.6469, device='cuda:0')
reward_raw: tensor(0.6328, device='cuda:0') reward: tensor(0.9677, device='cuda:0') factor: tensor(0.6650, device='cuda:0')


Epoch 1, Step 39: Loss = 0.3762, Actor Loss = 0.2983, Critic Loss = 0.1571, Entropy = 0.5572, Returns = 0.9485, Value = 0.7229, mask_loss = 0.6901std_loss = 0.1552:   1%|          | 39/5475 [10:37<24:43:26, 16.37s/it]  

ratio tensor(0.7447, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5818, device='cuda:0') reward: tensor(0.8984, device='cuda:0') factor: tensor(0.6834, device='cuda:0')
reward_raw: tensor(0.6068, device='cuda:0') reward: tensor(0.9248, device='cuda:0') factor: tensor(0.6820, device='cuda:0')
reward_raw: tensor(0.6522, device='cuda:0') reward: tensor(0.9709, device='cuda:0') factor: tensor(0.6813, device='cuda:0')
reward_raw: tensor(0.6397, device='cuda:0') reward: tensor(0.9527, device='cuda:0') factor: tensor(0.6870, device='cuda:0')
reward_raw: tensor(0.5515, device='cuda:0') reward: tensor(0.8683, device='cuda:0') factor: tensor(0.6832, device='cuda:0')


Epoch 1, Step 40: Loss = 0.1700, Actor Loss = -0.0511, Critic Loss = 0.4433, Entropy = 0.5728, Returns = 0.9328, Value = 0.7873, mask_loss = 0.6552std_loss = 0.1680:   1%|          | 40/5475 [10:56<25:49:50, 17.11s/it]

ratio tensor(0.7187, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.6254, device='cuda:0') reward: tensor(0.9545, device='cuda:0') factor: tensor(0.6709, device='cuda:0')
reward_raw: tensor(0.6174, device='cuda:0') reward: tensor(0.9657, device='cuda:0') factor: tensor(0.6517, device='cuda:0')
reward_raw: tensor(0.6081, device='cuda:0') reward: tensor(0.9705, device='cuda:0') factor: tensor(0.6375, device='cuda:0')
reward_raw: tensor(0.6349, device='cuda:0') reward: tensor(0.9733, device='cuda:0') factor: tensor(0.6616, device='cuda:0')
reward_raw: tensor(0.6398, device='cuda:0') reward: tensor(0.9749, device='cuda:0') factor: tensor(0.6649, device='cuda:0')


Epoch 1, Step 41: Loss = -0.2247, Actor Loss = -0.2809, Critic Loss = 0.1137, Entropy = 0.5657, Returns = 1.0482, Value = 1.2519, mask_loss = 0.6609std_loss = 0.1707:   1%|          | 40/5475 [11:17<25:49:50, 17.11s/it]

ratio tensor(1.4940, device='cuda:0', grad_fn=<MeanBackward0>)



Epoch 1, Step 41: Loss = -0.2247, Actor Loss = -0.2809, Critic Loss = 0.1137, Entropy = 0.5657, Returns = 1.0482, Value = 1.2519, mask_loss = 0.6609std_loss = 0.1707:   1%|          | 41/5475 [11:17<27:56:53, 18.52s/it]

reward_raw: tensor(0.6610, device='cuda:0') reward: tensor(0.9787, device='cuda:0') factor: tensor(0.6823, device='cuda:0')
reward_raw: tensor(0.6674, device='cuda:0') reward: tensor(0.9857, device='cuda:0') factor: tensor(0.6817, device='cuda:0')
reward_raw: tensor(0.6582, device='cuda:0') reward: tensor(0.9843, device='cuda:0') factor: tensor(0.6738, device='cuda:0')
reward_raw: tensor(0.6451, device='cuda:0') reward: tensor(0.9740, device='cuda:0') factor: tensor(0.6711, device='cuda:0')
reward_raw: tensor(0.6412, device='cuda:0') reward: tensor(0.9470, device='cuda:0') factor: tensor(0.6942, device='cuda:0')


Epoch 1, Step 42: Loss = 0.4943, Actor Loss = 0.4217, Critic Loss = 0.1464, Entropy = 0.6054, Returns = 0.9771, Value = 0.9501, mask_loss = 0.5946std_loss = 0.1706:   1%|          | 42/5475 [11:30<25:16:23, 16.75s/it]  

ratio tensor(0.1449, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5235, device='cuda:0') reward: tensor(0.9632, device='cuda:0') factor: tensor(0.5603, device='cuda:0')
reward_raw: tensor(0.5122, device='cuda:0') reward: tensor(0.9520, device='cuda:0') factor: tensor(0.5602, device='cuda:0')
reward_raw: tensor(0.5015, device='cuda:0') reward: tensor(0.9272, device='cuda:0') factor: tensor(0.5743, device='cuda:0')
reward_raw: tensor(0.4976, device='cuda:0') reward: tensor(0.9229, device='cuda:0') factor: tensor(0.5747, device='cuda:0')
reward_raw: tensor(0.4765, device='cuda:0') reward: tensor(0.9245, device='cuda:0') factor: tensor(0.5520, device='cuda:0')


Epoch 1, Step 43: Loss = -0.2821, Actor Loss = -0.3389, Critic Loss = 0.1147, Entropy = 0.6146, Returns = 1.0042, Value = 1.0313, mask_loss = 0.5512std_loss = 0.1767:   1%|          | 43/5475 [11:46<25:05:04, 16.62s/it]

ratio tensor(2.3653, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5503, device='cuda:0') reward: tensor(1.0304, device='cuda:0') factor: tensor(0.5199, device='cuda:0')
reward_raw: tensor(0.5323, device='cuda:0') reward: tensor(1.0094, device='cuda:0') factor: tensor(0.5229, device='cuda:0')
reward_raw: tensor(0.4515, device='cuda:0') reward: tensor(0.9426, device='cuda:0') factor: tensor(0.5089, device='cuda:0')
reward_raw: tensor(0.5418, device='cuda:0') reward: tensor(1.0159, device='cuda:0') factor: tensor(0.5260, device='cuda:0')
reward_raw: tensor(0.5360, device='cuda:0') reward: tensor(1.0222, device='cuda:0') factor: tensor(0.5138, device='cuda:0')


Epoch 1, Step 44: Loss = 0.1400, Actor Loss = -0.0151, Critic Loss = 0.3115, Entropy = 0.6275, Returns = 1.0186, Value = 0.6838, mask_loss = 0.5294std_loss = 0.1675:   1%|          | 44/5475 [12:05<26:04:55, 17.29s/it] 

ratio tensor(1.0115, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5227, device='cuda:0') reward: tensor(1.0165, device='cuda:0') factor: tensor(0.5062, device='cuda:0')
reward_raw: tensor(0.5185, device='cuda:0') reward: tensor(0.9920, device='cuda:0') factor: tensor(0.5265, device='cuda:0')
reward_raw: tensor(0.5516, device='cuda:0') reward: tensor(1.0251, device='cuda:0') factor: tensor(0.5265, device='cuda:0')
reward_raw: tensor(0.4708, device='cuda:0') reward: tensor(0.9442, device='cuda:0') factor: tensor(0.5266, device='cuda:0')
reward_raw: tensor(0.5069, device='cuda:0') reward: tensor(0.9728, device='cuda:0') factor: tensor(0.5341, device='cuda:0')


Epoch 1, Step 45: Loss = -0.0799, Actor Loss = -0.1170, Critic Loss = 0.0755, Entropy = 0.6291, Returns = 1.0279, Value = 1.0541, mask_loss = 0.5400std_loss = 0.1581:   1%|          | 45/5475 [12:19<24:39:18, 16.35s/it]

ratio tensor(1.1681, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4989, device='cuda:0') reward: tensor(0.9718, device='cuda:0') factor: tensor(0.5271, device='cuda:0')
reward_raw: tensor(0.4655, device='cuda:0') reward: tensor(0.9311, device='cuda:0') factor: tensor(0.5344, device='cuda:0')
reward_raw: tensor(0.4780, device='cuda:0') reward: tensor(0.9642, device='cuda:0') factor: tensor(0.5138, device='cuda:0')
reward_raw: tensor(0.5191, device='cuda:0') reward: tensor(0.9873, device='cuda:0') factor: tensor(0.5318, device='cuda:0')
reward_raw: tensor(0.5487, device='cuda:0') reward: tensor(1.0309, device='cuda:0') factor: tensor(0.5178, device='cuda:0')


Epoch 1, Step 46: Loss = -0.1474, Actor Loss = -0.1590, Critic Loss = 0.0244, Entropy = 0.6057, Returns = 0.9869, Value = 1.0615, mask_loss = 0.5345std_loss = 0.1852:   1%|          | 46/5475 [12:39<26:19:22, 17.45s/it]

ratio tensor(2.2123, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4869, device='cuda:0') reward: tensor(0.9952, device='cuda:0') factor: tensor(0.4917, device='cuda:0')
reward_raw: tensor(0.5254, device='cuda:0') reward: tensor(1.0401, device='cuda:0') factor: tensor(0.4853, device='cuda:0')
reward_raw: tensor(0.4694, device='cuda:0') reward: tensor(0.9839, device='cuda:0') factor: tensor(0.4855, device='cuda:0')
reward_raw: tensor(0.5341, device='cuda:0') reward: tensor(1.0528, device='cuda:0') factor: tensor(0.4813, device='cuda:0')
reward_raw: tensor(0.5036, device='cuda:0') reward: tensor(1.0078, device='cuda:0') factor: tensor(0.4958, device='cuda:0')


Epoch 1, Step 47: Loss = 0.0664, Actor Loss = -0.0015, Critic Loss = 0.1369, Entropy = 0.5995, Returns = 1.0176, Value = 1.0470, mask_loss = 0.4926std_loss = 0.2019:   1%|          | 47/5475 [12:57<26:38:10, 17.67s/it] 

ratio tensor(2.0082, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.5537, device='cuda:0') reward: tensor(1.0575, device='cuda:0') factor: tensor(0.4962, device='cuda:0')
reward_raw: tensor(0.5638, device='cuda:0') reward: tensor(1.0530, device='cuda:0') factor: tensor(0.5107, device='cuda:0')
reward_raw: tensor(0.6369, device='cuda:0') reward: tensor(1.1100, device='cuda:0') factor: tensor(0.5269, device='cuda:0')
reward_raw: tensor(0.5755, device='cuda:0') reward: tensor(1.0525, device='cuda:0') factor: tensor(0.5230, device='cuda:0')
reward_raw: tensor(0.5848, device='cuda:0') reward: tensor(1.0670, device='cuda:0') factor: tensor(0.5178, device='cuda:0')


Epoch 1, Step 48: Loss = 0.0495, Actor Loss = -0.0765, Critic Loss = 0.2533, Entropy = 0.5988, Returns = 0.9925, Value = 1.2411, mask_loss = 0.4993std_loss = 0.2054:   1%|          | 48/5475 [13:15<26:39:45, 17.69s/it]

ratio tensor(1.8662, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.4885, device='cuda:0') reward: tensor(0.9773, device='cuda:0') factor: tensor(0.5113, device='cuda:0')
reward_raw: tensor(0.4632, device='cuda:0') reward: tensor(0.9400, device='cuda:0') factor: tensor(0.5232, device='cuda:0')
reward_raw: tensor(0.5176, device='cuda:0') reward: tensor(1.0063, device='cuda:0') factor: tensor(0.5113, device='cuda:0')
reward_raw: tensor(0.5216, device='cuda:0') reward: tensor(0.9991, device='cuda:0') factor: tensor(0.5225, device='cuda:0')
reward_raw: tensor(0.5142, device='cuda:0') reward: tensor(1.0049, device='cuda:0') factor: tensor(0.5093, device='cuda:0')


Epoch 1, Step 49: Loss = 0.5245, Actor Loss = 0.4495, Critic Loss = 0.1510, Entropy = 0.4664, Returns = 0.9686, Value = 0.7739, mask_loss = 0.7812std_loss = 0.1302:   1%|          | 49/5475 [13:37<28:32:19, 18.93s/it] 

ratio tensor(6.7442e-09, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7495, device='cuda:0') reward: tensor(0.9634, device='cuda:0') factor: tensor(0.7861, device='cuda:0')
reward_raw: tensor(0.7451, device='cuda:0') reward: tensor(0.9679, device='cuda:0') factor: tensor(0.7773, device='cuda:0')
reward_raw: tensor(0.7720, device='cuda:0') reward: tensor(1.0081, device='cuda:0') factor: tensor(0.7638, device='cuda:0')
reward_raw: tensor(0.7300, device='cuda:0') reward: tensor(0.9704, device='cuda:0') factor: tensor(0.7596, device='cuda:0')
reward_raw: tensor(0.7467, device='cuda:0') reward: tensor(0.9692, device='cuda:0') factor: tensor(0.7774, device='cuda:0')


Epoch 1, Step 50: Loss = -0.5277, Actor Loss = -0.6167, Critic Loss = 0.1789, Entropy = 0.4634, Returns = 0.9828, Value = 1.0906, mask_loss = 0.7762std_loss = 0.1409:   1%|          | 50/5475 [13:50<25:58:39, 17.24s/it]

ratio tensor(2.4431, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7043, device='cuda:0') reward: tensor(0.9566, device='cuda:0') factor: tensor(0.7477, device='cuda:0')
reward_raw: tensor(0.7308, device='cuda:0') reward: tensor(0.9758, device='cuda:0') factor: tensor(0.7550, device='cuda:0')
reward_raw: tensor(0.6832, device='cuda:0') reward: tensor(0.9212, device='cuda:0') factor: tensor(0.7619, device='cuda:0')
reward_raw: tensor(0.7263, device='cuda:0') reward: tensor(0.9819, device='cuda:0') factor: tensor(0.7444, device='cuda:0')
reward_raw: tensor(0.7026, device='cuda:0') reward: tensor(0.9397, device='cuda:0') factor: tensor(0.7629, device='cuda:0')


Epoch 1, Step 51: Loss = 0.2367, Actor Loss = 0.1880, Critic Loss = 0.0982, Entropy = 0.4257, Returns = 0.9675, Value = 0.7332, mask_loss = 0.8010std_loss = 0.1370:   1%|          | 51/5475 [14:08<26:04:24, 17.31s/it]  

ratio tensor(0.5182, device='cuda:0', grad_fn=<MeanBackward0>)

reward_raw: tensor(0.7807, device='cuda:0') reward: tensor(0.9377, device='cuda:0') factor: tensor(0.8430, device='cuda:0')
reward_raw: tensor(0.7363, device='cuda:0') reward: tensor(0.9011, device='cuda:0') factor: tensor(0.8352, device='cuda:0')
reward_raw: tensor(0.7191, device='cuda:0') reward: tensor(0.8932, device='cuda:0') factor: tensor(0.8259, device='cuda:0')
reward_raw: tensor(0.7284, device='cuda:0') reward: tensor(0.8865, device='cuda:0') factor: tensor(0.8420, device='cuda:0')
reward_raw: tensor(0.7351, device='cuda:0') reward: tensor(0.8861, device='cuda:0') factor: tensor(0.8490, device='cuda:0')


Epoch 1, Step 52: Loss = 0.0159, Actor Loss = -0.0520, Critic Loss = 0.1365, Entropy = 0.3812, Returns = 0.8869, Value = 0.7216, mask_loss = 0.8421std_loss = 0.1151:   1%|          | 52/5475 [14:26<26:33:11, 17.63s/it]

ratio tensor(1.2476, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7849, device='cuda:0') reward: tensor(0.9364, device='cuda:0') factor: tensor(0.8486, device='cuda:0')
reward_raw: tensor(0.8320, device='cuda:0') reward: tensor(0.9809, device='cuda:0') factor: tensor(0.8511, device='cuda:0')
reward_raw: tensor(0.8374, device='cuda:0') reward: tensor(0.9736, device='cuda:0') factor: tensor(0.8638, device='cuda:0')
reward_raw: tensor(0.7855, device='cuda:0') reward: tensor(0.9276, device='cuda:0') factor: tensor(0.8579, device='cuda:0')
reward_raw: tensor(0.8071, device='cuda:0') reward: tensor(0.9505, device='cuda:0') factor: tensor(0.8567, device='cuda:0')


Epoch 1, Step 53: Loss = -0.1737, Actor Loss = -0.2151, Critic Loss = 0.0835, Entropy = 0.3522, Returns = 0.9674, Value = 0.9947, mask_loss = 0.8581std_loss = 0.1108:   1%|          | 53/5475 [14:39<24:24:56, 16.21s/it]

ratio tensor(1.6626, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7397, device='cuda:0') reward: tensor(0.8854, device='cuda:0') factor: tensor(0.8543, device='cuda:0')
reward_raw: tensor(0.6926, device='cuda:0') reward: tensor(0.8417, device='cuda:0') factor: tensor(0.8509, device='cuda:0')
reward_raw: tensor(0.7277, device='cuda:0') reward: tensor(0.8787, device='cuda:0') factor: tensor(0.8490, device='cuda:0')
reward_raw: tensor(0.7415, device='cuda:0') reward: tensor(0.8874, device='cuda:0') factor: tensor(0.8541, device='cuda:0')
reward_raw: tensor(0.7279, device='cuda:0') reward: tensor(0.8748, device='cuda:0') factor: tensor(0.8531, device='cuda:0')


Epoch 1, Step 54: Loss = -0.0413, Actor Loss = -0.1117, Critic Loss = 0.1415, Entropy = 0.3363, Returns = 0.9141, Value = 0.8448, mask_loss = 0.8636std_loss = 0.1109:   1%|          | 54/5475 [14:52<22:56:39, 15.24s/it]

ratio tensor(1.1802, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7711, device='cuda:0') reward: tensor(0.9140, device='cuda:0') factor: tensor(0.8571, device='cuda:0')
reward_raw: tensor(0.7365, device='cuda:0') reward: tensor(0.8795, device='cuda:0') factor: tensor(0.8570, device='cuda:0')
reward_raw: tensor(0.7814, device='cuda:0') reward: tensor(0.9227, device='cuda:0') factor: tensor(0.8587, device='cuda:0')
reward_raw: tensor(0.7740, device='cuda:0') reward: tensor(0.9224, device='cuda:0') factor: tensor(0.8516, device='cuda:0')
reward_raw: tensor(0.7745, device='cuda:0') reward: tensor(0.9110, device='cuda:0') factor: tensor(0.8635, device='cuda:0')


Epoch 1, Step 55: Loss = -0.0286, Actor Loss = -0.0902, Critic Loss = 0.1239, Entropy = 0.3412, Returns = 0.9227, Value = 0.9592, mask_loss = 0.8627std_loss = 0.1098:   1%|          | 55/5475 [15:07<22:41:55, 15.08s/it]

ratio tensor(0.9636, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7466, device='cuda:0') reward: tensor(0.8856, device='cuda:0') factor: tensor(0.8609, device='cuda:0')
reward_raw: tensor(0.7855, device='cuda:0') reward: tensor(0.9232, device='cuda:0') factor: tensor(0.8623, device='cuda:0')
reward_raw: tensor(0.7643, device='cuda:0') reward: tensor(0.9095, device='cuda:0') factor: tensor(0.8548, device='cuda:0')
reward_raw: tensor(0.7786, device='cuda:0') reward: tensor(0.9200, device='cuda:0') factor: tensor(0.8586, device='cuda:0')
reward_raw: tensor(0.7772, device='cuda:0') reward: tensor(0.9144, device='cuda:0') factor: tensor(0.8628, device='cuda:0')


Epoch 1, Step 56: Loss = -0.0023, Actor Loss = -0.0394, Critic Loss = 0.0748, Entropy = 0.2996, Returns = 0.9375, Value = 0.8341, mask_loss = 0.8880std_loss = 0.0966:   1%|          | 56/5475 [15:22<22:52:42, 15.20s/it]

ratio tensor(1.2912, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7998, device='cuda:0') reward: tensor(0.9175, device='cuda:0') factor: tensor(0.8823, device='cuda:0')
reward_raw: tensor(0.7779, device='cuda:0') reward: tensor(0.9001, device='cuda:0') factor: tensor(0.8778, device='cuda:0')
reward_raw: tensor(0.8104, device='cuda:0') reward: tensor(0.9280, device='cuda:0') factor: tensor(0.8824, device='cuda:0')
reward_raw: tensor(0.8004, device='cuda:0') reward: tensor(0.9145, device='cuda:0') factor: tensor(0.8859, device='cuda:0')
reward_raw: tensor(0.7970, device='cuda:0') reward: tensor(0.9278, device='cuda:0') factor: tensor(0.8693, device='cuda:0')


Epoch 1, Step 57: Loss = -0.0532, Actor Loss = -0.0934, Critic Loss = 0.0809, Entropy = 0.2932, Returns = 0.9000, Value = 0.9223, mask_loss = 0.8826std_loss = 0.1091:   1%|          | 57/5475 [15:43<25:28:45, 16.93s/it]

ratio tensor(2.1794, device='cuda:0', grad_fn=<MeanBackward0>)
reward_raw: tensor(0.7770, device='cuda:0') reward: tensor(0.8968, device='cuda:0') factor: tensor(0.8802, device='cuda:0')
reward_raw: tensor(0.8150, device='cuda:0') reward: tensor(0.9441, device='cuda:0') factor: tensor(0.8710, device='cuda:0')


In [8]:
import numpy as np
import torch.nn.functional as F
from datasets import Dataset

# Optionally restore a saved checkpoint before evaluating:
# mask_gen_model.load_state_dict(torch.load('saved_model/squad_kldv_0_40.pth',map_location=device))

# Put the mask generator into evaluation mode before inspecting its output.
mask_gen_model.eval()

# NOTE(review): this "test" loader is built from `train_ds`, not from the
# validation split (`test_ds`) — confirm that inspecting training examples is intended.
test_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# Only the first batch is inspected.
test_inputs = next(iter(test_dataloader)).to(device)
# test_inputs = next(iter(train_dataloader)).to(device)

# tokens = tokenizer.convert_ids_to_tokens(test_inputs['input_ids'][idx])



# Alternative input: a small hand-written sentiment batch (kept for reference).
# data_dict = {
#     'sentence': ["I absolutely love this product! It exceeded all my expectations.", 
#              "The movie was fantastic, and the acting was top-notch.",
#              "This restaurant offers great service and delicious food. Highly recommend!",
#              "The product works as advertised, nothing more, nothing less.",
#              "The event was well-organized, but it didn’t leave a lasting impression.",
#              "t’s an average phone, nothing special but it does the job.",
#              "I’m really disappointed with this purchase. It broke within a week.",
#              "The movie was too long and boring, I wouldn’t recommend it.",
#              "Terrible customer service, I won’t be coming back to this place."],
#     'label': [1, 1, 1, 0, 0, 0, -1, -1, -1]
# }
# manual_test_data = Dataset.from_dict(data_dict)

# manual_test_dataloader = DataLoader(manual_test_data, batch_size=9, collate_fn=collate_fn, shuffle=False)
# #
# test_inputs = next(iter(manual_test_dataloader)).to(device)


# Generate the answer for the test inputs (sampling, so results vary between runs).
gen_outputs = model.generate(
            input_ids=test_inputs['input_ids'],
            attention_mask=test_inputs['attention_mask'],
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
input_ids = test_inputs['input_ids']
attention_mask = test_inputs['attention_mask']
gen_tokens = gen_outputs.sequences

# `gen_tokens` is treated as prompt columns followed by newly generated columns.
prompt_length = input_ids.size(1)
pad_length = gen_tokens.size(1) - prompt_length

# Extend the prompt masks over the generated columns:
# attention defaults to 1 there, the context mask to 0.
gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
context_mask = F.pad(test_inputs['context_mask'], (0, pad_length), mode='constant', value=0)

# 1 for real tokens, 0 for padding tokens. The prompt columns are forced to 1 so
# that, inside the prompt, only the original attention mask decides what is padding.
unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
# Index the prompt by its explicit length: the previous `[:, :-pad_length]` form
# selects nothing when pad_length == 0 (`:-0` is an empty slice).
unpaded_token_mask[:, :prompt_length] = 1
gen_attention_mask = gen_attention_mask * unpaded_token_mask

# The response mask keeps only the attended generated tokens: clear the prompt columns.
response_mask = gen_attention_mask.clone()
response_mask[:, :prompt_length] = 0

# context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

# Earlier variant that fed the last hidden state to the mask generator directly:
# with torch.no_grad():
#     # prompt_outputs = model(input_ids=test_inputs['input_ids'], attention_mask=test_inputs['attention_mask'], output_hidden_states=True, return_dict=True)
#     prompt_outputs = model(input_ids=gen_tokens, attention_mask=gen_attention_mask, output_hidden_states=True, return_dict=True)

#     last_hidden_state = prompt_outputs.hidden_states[-1].float()
#     mask_logits = mask_gen_model(last_hidden_state)


# Run the mask generator without tracking gradients.
with torch.no_grad():
    state = gen_tokens, gen_attention_mask, context_mask, response_mask
    dist, value = mask_gen_model.get_dist_critic(model, state)

# Logits of the mask distribution returned by the mask generator.
mask_logits = dist.logits

In [9]:
import random
# Which example of the batch to inspect (fixed to the first one).
# idx = random.randint(0, 8)
idx = 0

# Per-example views of the generated batch.
test_ids = gen_tokens[idx]
test_mask = gen_attention_mask[idx]
test_context_mask = context_mask[idx]
test_tokens = tokenizer.convert_ids_to_tokens(test_ids)

# Squash the mask logits through a sigmoid.
test_mask_prob = mask_logits[idx].sigmoid()
# inverse TODO
# test_mask_prob = 1 - test_mask_prob

# Weight the probabilities by the context mask
# (presumably 1 on context tokens and 0 elsewhere — verify against the collate function).
scores = torch.mul(test_mask_prob, test_context_mask)

def normalize_except_zeros(array):
    """Min-max scale the strictly positive entries of ``array`` into [0, 1].

    Entries that are not strictly positive (zeros and any negatives) are
    returned unchanged. Note that the smallest positive entry maps to 0, so
    after scaling it is indistinguishable from an entry that was already 0.

    Degenerate inputs are handled explicitly:
      * no positive entries -> an unchanged copy is returned;
      * all positive entries equal -> they are all set to 1.0, so scored
        positions stay distinguishable from zeroed-out ones (the plain
        formula would divide 0 by 0 and produce NaN).

    Args:
        array: NumPy array (or array-like) of scores.

    Returns:
        A new floating-point array with the same shape as ``array``; the
        input is never modified. Floating inputs keep their dtype, other
        dtypes are promoted to float64 so the fractions are not truncated.
    """
    values = np.asarray(array)
    if not np.issubdtype(values.dtype, np.floating):
        # Assigning fractions into an integer array would silently truncate them.
        values = values.astype(np.float64)

    # Work on a copy so entries outside the mask are preserved as-is.
    normalized_array = np.copy(values)

    # Only strictly positive entries take part in the scaling.
    mask = values > 0
    if not mask.any():
        return normalized_array

    non_zero_elements = values[mask]
    min_val = non_zero_elements.min()
    max_val = non_zero_elements.max()

    if max_val == min_val:
        # Zero range: avoid 0/0 and mark every scored position as maximal.
        normalized_array[mask] = 1.0
    else:
        normalized_array[mask] = (non_zero_elements - min_val) / (max_val - min_val)

    return normalized_array
# Move the scores to a NumPy array on the host and rescale the non-zero ones.
scores = normalize_except_zeros(scores.detach().cpu().numpy())

# Alternative that drops the tokenizer's special tokens:
# filtered_token_scores = [(token, score) for token, score in zip(test_tokens, scores) if token not in tokenizer.all_special_tokens]
# Pair every token string with its score; nothing is filtered out here.
filtered_token_scores = list(zip(test_tokens, scores))

In [10]:
# Print one "Token: ..., Score: ..." line per (token, score) pair, in sequence order.
for token, score in filtered_token_scores:
    print(f"Token: {token}, Score: {score}")

Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|e

In [11]:
import re

def clean_token(token):
    """Return ``token`` with every 'Ġ' and 'Ċ' marker character removed."""
    markers = ("Ġ", "Ċ")
    return "".join(char for char in token if char not in markers)

def is_special_token(token):
    """Return True for standalone special tokens such as '<|start_header_id|>'."""
    return token.startswith("<|") and token.endswith("|>")


def merge_subword_scores(token_scores):
    """Merge sub-word tokens into words and average their scores.

    Rules, applied in sequence order:
      * A special token (see ``is_special_token``) closes the word being built
        and is emitted on its own with its score unchanged.
      * A token starting with 'Ġ' or 'Ċ' begins a new word.
      * A token that is empty after ``clean_token`` (markers only) adds no text
        but still closes the current word, so the words on either side of it
        are not glued together.
      * A token ending with 'Ġ' or 'Ċ' closes the word after being appended.
      * Any other token is appended to the current word.

    Args:
        token_scores: iterable of ``(token, score)`` pairs.

    Returns:
        List of ``(word, score)`` pairs, where a merged word's score is the
        mean of its sub-word scores, capped at 1.0.
    """
    markers = ("Ġ", "Ċ")
    merged = []
    word, total, count = "", 0, 0

    def flush():
        """Emit the word being built (if any) and reset the accumulator."""
        nonlocal word, total, count
        if word:
            # Cap the mean so the score stays within [0, 1].
            merged.append((word, min(total / count, 1.0)))
        word, total, count = "", 0, 0

    for token, score in token_scores:
        if is_special_token(token):
            flush()
            # Special tokens are never merged with their neighbours.
            merged.append((token, score))
            continue

        piece = clean_token(token)
        if not piece:
            # Marker-only token: a word boundary with no visible text.
            flush()
            continue

        if token.startswith(markers):
            flush()

        word += piece
        total += score
        count += 1

        if token.endswith(markers):
            flush()

    # Emit the trailing word, if any.
    flush()
    return merged


merged_tokens_scores = merge_subword_scores(filtered_token_scores)

# # 输出结果
# for token, score in merged_tokens_scores:
#     print(f"Token: {token}, Score: {score}")





# Highlight the text according to the scores, rendered as HTML spans.
import html
import math

from IPython.display import display, HTML


def build_highlight_html(token_scores):
    """Render ``(token, score)`` pairs as HTML spans shaded from white to green.

    A score of 0 gives a white background and a score of 1 a pure green one.
    Scores are clamped to [0, 1] and NaN is treated as 0, so a degenerate
    score cannot crash the integer conversion or yield an invalid colour.

    Args:
        token_scores: iterable of ``(token, score)`` pairs.

    Returns:
        A single HTML string with one ``<span>`` per token, separated by spaces.
    """
    spans = []
    for token, score in token_scores:
        score = float(score)
        intensity = 0.0 if math.isnan(score) else min(max(score, 0.0), 1.0)
        # Red and blue fade together while green stays at 255: white -> green.
        fade = int((1 - intensity) * 255)
        color = f'rgb({fade}, 255, {fade})'
        # Escape the token text so characters such as '<' or '&' are shown
        # literally instead of being parsed as markup.
        spans.append(
            f'<span style="background-color: {color}; color: black;">'
            f'{html.escape(token)}</span>'
        )
    return " ".join(spans)


highlighted_text = build_highlight_html(merged_tokens_scores)

# Display the highlighted text.
display(HTML(highlighted_text))

In [26]:
# Maximum of the un-normalized scores (mask probability x context mask) for the selected example.
(test_mask_prob * test_context_mask).max()

tensor(0.9849, device='cuda:0')

In [None]:
# Display the complete list of (token, score) pairs; this can be long, so consider slicing it.
filtered_token_scores

In [None]:
# Scratch inspection: list the fields available in the collated batch.
test_inputs.keys()

# NOTE(review): collate_fn is handed to DataLoader as its collate function when
# the loaders are built, so it presumably expects a batch argument; this bare
# call likely raises a TypeError -- confirm or remove.
collate_fn()

# Scratch inspection: tokenize a sample sentence to see the tokenizer's output.
tokenizer("this is a sentence")

In [22]:
# Keep only the first three training examples (indices 0-2) as a tiny subset.
new_ds = ds['train'].select(range(3))

In [26]:
from datasets import Dataset

# Build a tiny in-memory dataset with two rows and 'text' / 'label' columns.
data_dict = {
    'text': ["This is the first example.", "This is the second example."],
    'label': [0, 1]
}
dataset = Dataset.from_dict(data_dict)

In [None]:
# Show the first record of the subset.
new_ds[0]