In [1]:
# Configuring environment parameters
import os
import json 
import logging

logging.basicConfig(
    filename='log/app.log',            # Specify the log file name
    level=logging.DEBUG,           # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Set the log format
)

# Load the environment configuration JSON data
json_path = 'env_config.json'
with open(json_path, 'r') as file:
    env_config = json.load(file)

hf_home = env_config['HF_HOME']
# Set the HF_HOME environment variable
os.environ['HF_HOME'] = hf_home
# Set the access token to huggingface hub
access_token = env_config['access_token']
os.environ['HUGGINGFACE_HUB_TOKEN'] = access_token

In [2]:
# Loading necessary packages
import transformers 
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, LlamaForTokenClassification #, LlamaRotaryEmbedding
# from transformers import LlamaTokenizerFast
import torch.nn.functional as F

from llmexp.helper import DataHelper
from datasets import load_dataset
from torch.utils.data import DataLoader

# TODO 注意load正确的模型
from llmexp.next_token_model import MaskGenModelForNextToken, Environment
from tqdm import tqdm

  scaler = GradScaler()


In [3]:
# Load datasets
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token

# ds = load_dataset("imdb")
# ds = load_dataset("rajpurkar/squad")
ds = load_dataset("stanfordnlp/sst2")
train_ds = ds['train']
test_ds = ds['test']
# test_ds = ds['validation']

llm_exp_helper = DataHelper(tokenizer)
# collate_fn = llm_exp_helper.get_collate_fun('imdb')
collate_fn = llm_exp_helper.get_collate_fun('sst2')
# collate_fn = llm_exp_helper.get_collate_fun('squad')

# Define batch size here!
batch_size = 16
train_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [4]:
# Configure and load model
accelerator = Accelerator()
device = accelerator.device
# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "meta-llama/Meta-Llama-3-8B"  # non-instruct version

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    # device_map="auto",
    device_map=device,
    token=access_token,
)

config = model.config

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Configure mask model and  Training parameters
embedding_layer = model.get_input_embeddings()
mask_gen_model = MaskGenModelForNextToken(hidden_size=4096, embedding_layer=embedding_layer).to(torch.bfloat16)
mask_gen_model.to(device)

environment = Environment(model)

# Set pad_token_id if it is not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
params_to_update = [param for name, param in mask_gen_model.named_parameters() if "output_layer.weight" not in name]
optimizer = torch.optim.Adam(params_to_update, lr=1e-3)

In [6]:
# 创建一个集合，用于存储所有注册在优化器中的参数 ID
optimizer_params_ids = set(id(p) for group in optimizer.param_groups for p in group['params'])

# 遍历模型的所有参数，并检查它们是否在优化器中
for name, param in mask_gen_model.named_parameters():
    if id(param) in optimizer_params_ids:
        print(f"Parameter '{name}' is in the optimizer.")
    else:
        print(f"Parameter '{name}' is NOT in the optimizer.")

Parameter 'autoregressive_predictor.reduce_map.0.weight' is in the optimizer.
Parameter 'autoregressive_predictor.reduce_map.0.bias' is in the optimizer.
Parameter 'autoregressive_predictor.reduce_map.2.weight' is in the optimizer.
Parameter 'autoregressive_predictor.reduce_map.2.bias' is in the optimizer.
Parameter 'autoregressive_predictor.reduce_map.4.weight' is in the optimizer.
Parameter 'autoregressive_predictor.reduce_map.4.bias' is in the optimizer.
Parameter 'autoregressive_predictor.reduce_map.6.weight' is in the optimizer.
Parameter 'autoregressive_predictor.reduce_map.6.bias' is in the optimizer.
Parameter 'autoregressive_predictor.linear.weight' is in the optimizer.
Parameter 'autoregressive_predictor.linear.bias' is in the optimizer.
Parameter 'autoregressive_predictor.output_layer.weight' is NOT in the optimizer.
Parameter 'autoregressive_predictor.norm.weight' is in the optimizer.
Parameter 'value_map.weight' is in the optimizer.
Parameter 'value_map.bias' is in the opt

In [7]:
assert mask_gen_model.autoregressive_predictor.output_layer.weight is embedding_layer.weight

mask_gen_model.train()
for epoch in range(1):
    pbar = tqdm(train_dataloader)
    for idx, data in enumerate(pbar):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        context_mask = data['context_mask'].to(device)
        # get generated texts
        gen_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=3,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )

        # gen_tokens = gen_outputs.sequences
        pad_length = 1 # single token generation
        gen_tokens = gen_outputs.sequences[:, :input_ids.size(1) + pad_length]
        # get the attention mask for the generated tokens, and also mask the padding tokens
        gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)

        context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

        loss_dict = mask_gen_model.train_one_batch(model, gen_tokens, gen_attention_mask, context_mask, optimizer,
                                                   environment=environment, num_steps=10, alpha=0.1)


        log = f"Epoch {epoch+1}, Step {idx+1}: Loss = {loss_dict['loss']:.4f}, " \
              f"Advantage = {loss_dict['advantages']:.4f}, " \
              f"Ratio = {loss_dict['ratio']:.4f}, " \
              f"Entropy = {loss_dict['entropy']:.4f}, " \
              f"kl_div = {loss_dict['kl_div']:.4f}, " \
              f"mask_loss = {loss_dict['mask_loss']:.4f}, " \
            #    f"Actor Loss = {loss_dict['actor_loss']:.4f}, " \
            #    f"Critic Loss = {loss_dict['critic_loss']:.4f}, " \
               
            #    f"Returns = {loss_dict['returns']:.4f}, " \
            #    f"Value = {loss_dict['value']:.4f}, " \
            #     f"mask_loss = {loss_dict['mask_loss']:.4f}" \
            #     f"std_loss = {loss_dict['std_loss']:.4f}" \
            #    f"Cont_loss = {loss_dict['contrast_loss']:.4f}, "  \
               
        pbar.set_description(log)

        if idx % 10 == 0:
            print()
        if idx % 100 == 0 and idx != 0:
            torch.save(mask_gen_model.state_dict(), f'saved_model/imdb_next_token_model_{epoch}_{idx}.pth') 

  0%|          | 0/4210 [00:00<?, ?it/s]

logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 1: Loss = 12.5887, Advantage = 0.8426, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7803, mask_loss = 0.0000, :   0%|          | 1/4210 [01:22<96:07:23, 82.22s/it]

logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)

logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 2: Loss = 13.0700, Advantage = -0.1679, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7798, mask_loss = 0.0000, :   0%|          | 2/4210 [01:24<40:52:55, 34.98s/it]

tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 3: Loss = 12.9161, Advantage = 0.3482, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7787, mask_loss = 0.0000, :   0%|          | 3/4210 [01:26<23:13:39, 19.88s/it] 

tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 4: Loss = 14.7738, Advantage = 0.0135, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7799, mask_loss = 0.0000, :   0%|          | 4/4210 [01:27<14:57:12, 12.80s/it]

logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 5: Loss = 14.1396, Advantage = -0.2919, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7826, mask_loss = 0.0000, :   0%|          | 5/4210 [01:30<10:26:01,  8.93s/it]

logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-3.8116e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-4.1959e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-5.3879e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1215e-06, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 6: Loss = 35.0669, Advantage = -0.3019, Ratio = 1.0000, Entropy = 0.0000, kl_div = 32.1977, mask_loss = 0.0000, :   0%|          | 6/4210 [01:31<7:38:53,  6.55s/it] 

tensor(-2.1309e-06, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-2.3614e-06, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 7: Loss = 13.2668, Advantage = -0.0415, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.8497, mask_loss = 0.0000, :   0%|          | 7/4210 [01:34<5:56:12,  5.09s/it]

logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 8: Loss = 13.7325, Advantage = 1.7877, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.8290, mask_loss = 0.0000, :   0%|          | 8/4210 [01:36<4:48:56,  4.13s/it] 

logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 9: Loss = 12.9491, Advantage = 0.4578, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.8044, mask_loss = 0.0000, :   0%|          | 9/4210 [01:38<4:00:25,  3.43s/it]

tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 10: Loss = 15.4988, Advantage = -0.3909, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7952, mask_loss = 0.0000, :   0%|          | 10/4210 [01:40<3:31:35,  3.02s/it]

logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.8714e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 11: Loss = 20.1754, Advantage = -3.6724, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7963, mask_loss = 0.0000, :   0%|          | 11/4210 [01:42<3:08:32,  2.69s/it]

logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)

logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 12: Loss = 14.7922, Advantage = -0.7832, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7856, mask_loss = 0.0000, :   0%|          | 12/4210 [01:43<2:48:41,  2.41s/it]

logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 13: Loss = 12.7414, Advantage = 0.7928, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7833, mask_loss = 0.0000, :   0%|          | 13/4210 [01:45<2:35:11,  2.22s/it] 

logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 14: Loss = 12.3831, Advantage = 0.6883, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7822, mask_loss = 0.0000, :   0%|          | 14/4210 [01:47<2:29:33,  2.14s/it]

logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1560e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.7610e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-3.4860e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-4.2807e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-3.9375e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3019e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-6.3849e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 15: Loss = 51.8862, Advantage = -0.0804, Ratio = 1.0000, Entropy = 0.0000, kl_div = 50.4894, mask_loss = 0.0000, :   0%|          | 15/4210 [01:49<2:21:07,  2.02s/it]

logprob tensor(-1.1560e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1560e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-6.5384e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1599e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.3129e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.5626e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.4982e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.3612e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2001e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 16: Loss = 14.2590, Advantage = -1.1645, Ratio = 1.0000, Entropy = 0.0000, kl_div = 12.0644, mask_loss = 0.0000, :   0%|          | 16/4210 [01:51<2:19:04,  1.99s/it]

tensor(-1.0793e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 17: Loss = 12.7549, Advantage = 0.8542, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.8160, mask_loss = 0.0000, :   0%|          | 17/4210 [01:53<2:21:55,  2.03s/it] 

logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6698e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 18: Loss = 12.0797, Advantage = 1.8455, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7935, mask_loss = 0.0000, :   0%|          | 18/4210 [01:55<2:20:47,  2.02s/it]

logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 19: Loss = 14.6207, Advantage = -0.8855, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7857, mask_loss = 0.0000, :   0%|          | 19/4210 [01:57<2:16:00,  1.95s/it]

logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.4072e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 20: Loss = 15.1684, Advantage = -0.7404, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7943, mask_loss = 0.0000, :   0%|          | 20/4210 [01:59<2:25:52,  2.09s/it]

logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 21: Loss = 11.7858, Advantage = 0.9617, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7810, mask_loss = 0.0000, :   0%|          | 21/4210 [02:01<2:22:30,  2.04s/it] 

tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0310e-07, device='cuda:0', grad_fn=<MeanBackward0>)

logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 22: Loss = 12.7984, Advantage = 0.2084, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7823, mask_loss = 0.0000, :   1%|          | 22/4210 [02:03<2:19:33,  2.00s/it]

logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 23: Loss = 13.4188, Advantage = -0.7316, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7806, mask_loss = 0.0000, :   1%|          | 23/4210 [02:05<2:19:17,  2.00s/it]

logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 24: Loss = 13.8429, Advantage = -0.1932, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7786, mask_loss = 0.0000, :   1%|          | 24/4210 [02:07<2:15:03,  1.94s/it]

logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.3385e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-5.5736e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.3385e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.3385e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.5832e-06, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.3385e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.3385e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.4159e-06, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 25: Loss = 39.6739, Advantage = 0.4235, Ratio = 1.0000, Entropy = 0.0000, kl_div = 39.0244, mask_loss = 0.0000, :   1%|          | 25/4210 [02:08<2:07:48,  1.83s/it] 

logprob tensor(-7.7695e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.3845e-06, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 26: Loss = 11.9387, Advantage = 1.6933, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7781, mask_loss = 0.0000, :   1%|          | 26/4210 [02:10<2:04:08,  1.78s/it]

tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.2110e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 27: Loss = 11.4793, Advantage = 0.9790, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7814, mask_loss = 0.0000, :   1%|          | 27/4210 [02:12<2:09:59,  1.86s/it]

logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 28: Loss = 13.8910, Advantage = -0.3278, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7822, mask_loss = 0.0000, :   1%|          | 28/4210 [02:14<2:15:07,  1.94s/it]

logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.0826e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 29: Loss = 13.0087, Advantage = -0.1112, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7798, mask_loss = 0.0000, :   1%|          | 29/4210 [02:16<2:11:48,  1.89s/it]

logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 30: Loss = 12.9907, Advantage = 0.0069, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7828, mask_loss = 0.0000, :   1%|          | 30/4210 [02:18<2:13:27,  1.92s/it] 

logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 31: Loss = 12.4542, Advantage = 0.0145, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7787, mask_loss = 0.0000, :   1%|          | 31/4210 [02:20<2:16:30,  1.96s/it]

logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.4190e-08, device='cuda:0', grad_fn=<MeanBackward0>)

logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 32: Loss = 14.6902, Advantage = -0.9374, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7804, mask_loss = 0.0000, :   1%|          | 32/4210 [02:22<2:16:27,  1.96s/it]

logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.9083e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 33: Loss = 12.8599, Advantage = -0.2764, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7783, mask_loss = 0.0000, :   1%|          | 33/4210 [02:24<2:19:03,  2.00s/it]

logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.3041e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 34: Loss = 12.1272, Advantage = 0.1839, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7786, mask_loss = 0.0000, :   1%|          | 34/4210 [02:26<2:14:31,  1.93s/it] 

logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1057e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 35: Loss = 11.9851, Advantage = 0.0320, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7744, mask_loss = 0.0000, :   1%|          | 35/4210 [02:27<2:10:52,  1.88s/it]

logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 36: Loss = 13.2503, Advantage = -0.3395, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7799, mask_loss = 0.0000, :   1%|          | 36/4210 [02:29<2:12:51,  1.91s/it]

logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.5367e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 37: Loss = 12.9828, Advantage = 0.1043, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7795, mask_loss = 0.0000, :   1%|          | 37/4210 [02:31<2:14:01,  1.93s/it] 

logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 38: Loss = 12.9318, Advantage = 1.6853, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7788, mask_loss = 0.0000, :   1%|          | 38/4210 [02:34<2:18:16,  1.99s/it]

logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.7694e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 39: Loss = 13.6107, Advantage = -0.8077, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7765, mask_loss = 0.0000, :   1%|          | 39/4210 [02:35<2:14:20,  1.93s/it]

logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 40: Loss = 13.3512, Advantage = -0.8444, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7776, mask_loss = 0.0000, :   1%|          | 40/4210 [02:37<2:17:47,  1.98s/it]

logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 41: Loss = 13.1438, Advantage = 0.5027, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7803, mask_loss = 0.0000, :   1%|          | 41/4210 [02:39<2:16:59,  1.97s/it] 

logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)

logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 42: Loss = 11.4805, Advantage = 1.1327, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7749, mask_loss = 0.0000, :   1%|          | 42/4210 [02:41<2:16:20,  1.96s/it]

logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 43: Loss = 12.3170, Advantage = 0.3696, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7750, mask_loss = 0.0000, :   1%|          | 43/4210 [02:43<2:15:06,  1.95s/it]

logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0451e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.5314e-06, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-3.0738e-06, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.6473e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4180e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 44: Loss = 21.3829, Advantage = -0.1980, Ratio = 1.0000, Entropy = 0.0000, kl_div = 20.1407, mask_loss = 0.0000, :   1%|          | 44/4210 [02:45<2:10:32,  1.88s/it]

logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-7.7945e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.9633e-06, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 45: Loss = 18.5555, Advantage = -1.1353, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7836, mask_loss = 0.0000, :   1%|          | 45/4210 [02:47<2:18:23,  1.99s/it]

tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 46: Loss = 18.1883, Advantage = -2.0406, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7847, mask_loss = 0.0000, :   1%|          | 46/4210 [02:49<2:17:24,  1.98s/it]

logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 47: Loss = 13.7898, Advantage = 2.7368, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7885, mask_loss = 0.0000, :   1%|          | 47/4210 [02:51<2:18:10,  1.99s/it] 

logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 48: Loss = 12.9107, Advantage = 1.4751, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7875, mask_loss = 0.0000, :   1%|          | 48/4210 [02:53<2:17:44,  1.99s/it]

logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.7813e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 49: Loss = 14.5409, Advantage = -0.4541, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7820, mask_loss = 0.0000, :   1%|          | 49/4210 [02:55<2:16:52,  1.97s/it]

logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 50: Loss = 14.5564, Advantage = -1.1726, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7829, mask_loss = 0.0000, :   1%|          | 50/4210 [02:57<2:12:20,  1.91s/it]

logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1387e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 51: Loss = 11.5403, Advantage = 1.7733, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7807, mask_loss = 0.0000, :   1%|          | 51/4210 [02:59<2:13:06,  1.92s/it] 

logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)

logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 52: Loss = 14.0352, Advantage = -0.5177, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7847, mask_loss = 0.0000, :   1%|          | 52/4210 [03:01<2:13:39,  1.93s/it]

logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0039e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 53: Loss = 12.5853, Advantage = -0.4997, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7784, mask_loss = 0.0000, :   1%|▏         | 53/4210 [03:03<2:09:26,  1.87s/it]

logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.1738e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 54: Loss = 14.0388, Advantage = -0.6678, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7810, mask_loss = 0.0000, :   1%|▏         | 54/4210 [03:04<2:10:56,  1.89s/it]

logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0173e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 55: Loss = 12.3322, Advantage = 0.0979, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7771, mask_loss = 0.0000, :   1%|▏         | 55/4210 [03:06<2:08:57,  1.86s/it] 

logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0596e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 56: Loss = 19.8826, Advantage = 2.8174, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7877, mask_loss = 0.0000, :   1%|▏         | 56/4210 [03:09<2:17:07,  1.98s/it]

tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 57: Loss = 19.9133, Advantage = -2.6078, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7812, mask_loss = 0.0000, :   1%|▏         | 57/4210 [03:11<2:19:29,  2.02s/it]

logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-9.1920e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 58: Loss = 15.5102, Advantage = -1.8458, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7863, mask_loss = 0.0000, :   1%|▏         | 58/4210 [03:12<2:14:35,  1.94s/it]

logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-1.0899e-07, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob 

Epoch 1, Step 59: Loss = 12.7852, Advantage = -0.3945, Ratio = 1.0000, Entropy = 0.0000, kl_div = 11.7775, mask_loss = 0.0000, :   1%|▏         | 59/4210 [03:15<2:21:02,  2.04s/it]

tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)
logprob tensor(-8.4771e-08, device='cuda:0', grad_fn=<MeanBackward0>)


In [8]:
print(gen_tokens.shape)
print(input_ids.shape)
print(tokenizer.decode(input_ids[0]))
print(tokenizer.decode(gen_tokens[0]))

iterator = iter(range(16))

torch.Size([16, 84])
torch.Size([16, 83])
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a chatbot for sentiment analysis. You can help users with their questions via single word responses of POSITIVE or NEGATIVE.<|eot_id|><|start_header_id|>user<|end_header_id|>

most of the information has already appeared in one forum or another and, no matter how broomfield dresses it up, it tends to speculation, conspiracy theories or, at best, circumstantial evidence. <|eot_id|><|start_header_id|>assistant<|end_header_id|>


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a chatbot for sentiment analysis. You can help users with their questions via single word responses of POSITIVE or NEGATIVE.<|eot_id|><|start_header_id|>user<|end_header_id|>

most of the information has already appeared in one forum or another and, no matter how broomfield dresses it up, it tends to speculation, conspiracy theories or, at best, circumstantial evidence. <|eot_id|><|star

In [9]:
idx = next(iterator)
print(tokenizer.decode(input_ids[idx]))
print(tokenizer.decode(gen_tokens[idx]))

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a chatbot for sentiment analysis. You can help users with their questions via single word responses of POSITIVE or NEGATIVE.<|eot_id|><|start_header_id|>user<|end_header_id|>

most of the information has already appeared in one forum or another and, no matter how broomfield dresses it up, it tends to speculation, conspiracy theories or, at best, circumstantial evidence. <|eot_id|><|start_header_id|>assistant<|end_header_id|>


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a chatbot for sentiment analysis. You can help users with their questions via single word responses of POSITIVE or NEGATIVE.<|eot_id|><|start_header_id|>user<|end_header_id|>

most of the information has already appeared in one forum or another and, no matter how broomfield dresses it up, it tends to speculation, conspiracy theories or, at best, circumstantial evidence. <|eot_id|><|start_header_id|>assistant<|end_header_id|>

N

In [10]:
import numpy as np
import torch.nn.functional as F
from datasets import Dataset

# mask_gen_model.load_state_dict(torch.load('saved_model/imdb_mask_gen_model_0_100.pth',map_location=device))

mask_gen_model.eval()

# test_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

# test_inputs = next(iter(test_dataloader)).to(device)
# test_inputs = next(iter(train_dataloader)).to(device)

# tokens = tokenizer.convert_ids_to_tokens(test_inputs['input_ids'][idx])



data_dict = { # sentence, text
    'sentence': ["I absolutely love this product! It exceeded all my expectations.", 
             "The movie was fantastic, and the acting was top-notch.",
             "This restaurant offers great service and delicious food. Highly recommend!",
             "The product works as advertised, nothing more, nothing less.",
             "The event was well-organized, but it didn’t leave a lasting impression.",
             "t’s an average phone, nothing special but it does the job.",
             "I’m really disappointed with this purchase. It broke within a week.",
             "The movie was too long and boring, I wouldn’t recommend it.",
             "Terrible customer service, I won’t be coming back to this place."],
    'label': [1, 1, 1, 0, 0, 0, -1, -1, -1]
}
manual_test_data = Dataset.from_dict(data_dict)

manual_test_dataloader = DataLoader(manual_test_data, batch_size=9, collate_fn=collate_fn, shuffle=False)
#
test_inputs = next(iter(manual_test_dataloader)).to(device)


# generate the answer for the test inputs
gen_outputs = model.generate(
            input_ids=test_inputs['input_ids'],
            attention_mask=test_inputs['attention_mask'],
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
input_ids = test_inputs['input_ids']
attention_mask = test_inputs['attention_mask']
context_mask = test_inputs['context_mask']
gen_tokens = gen_outputs.sequences
# pad_length = gen_tokens.size(1) - input_ids.size(1)
# # get the attention mask for the generated tokens, and also mask the padding tokens
# gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
# context_mask = F.pad(test_inputs['context_mask'], (0, pad_length), mode='constant', value=0)
# # (gen_tokens != pad_token_id).long() is the tokens mask, 1 for real tokens and 0 for padding tokens
# unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
# unpaded_token_mask[:, :-pad_length] = 1
# gen_attention_mask = gen_attention_mask * unpaded_token_mask

# response_mask = gen_attention_mask.clone()
# response_mask[:, :-pad_length] = 0 # TODO: 有问题. 有问题吗？

pad_length = 1 # single token generation
gen_tokens = gen_outputs.sequences[:, :input_ids.size(1) + pad_length]
# get the attention mask for the generated tokens, and also mask the padding tokens
gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)

context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

# context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

# with torch.no_grad():
#     # prompt_outputs = model(input_ids=test_inputs['input_ids'], attention_mask=test_inputs['attention_mask'], output_hidden_states=True, return_dict=True)
#     prompt_outputs = model(input_ids=gen_tokens, attention_mask=gen_attention_mask, output_hidden_states=True, return_dict=True)

#     last_hidden_state = prompt_outputs.hidden_states[-1].float()
#     mask_logits = mask_gen_model(last_hidden_state)


with torch.no_grad():
    state = gen_tokens, gen_attention_mask, context_mask
    dist, value, _ = mask_gen_model.get_dist_critic(model, state)

mask_logits = dist.logits

In [11]:
context_mask.shape

torch.Size([9, 59])

In [12]:
import random
idx = random.randint(0, 8)
# idx = 0
test_ids = gen_tokens[idx]
test_mask = gen_attention_mask[idx]
test_mask_prob = torch.sigmoid(mask_logits[idx])
# inverse TODO
# test_mask_prob = 1 - test_mask_prob
test_context_mask = context_mask[idx]

test_tokens = tokenizer.convert_ids_to_tokens(test_ids)
scores = test_mask_prob * test_context_mask
# scores = test_mask_prob

def normalize_except_zeros(array):
    # Create a mask to identify non-zero elements
    mask = (array > 0)
    
    # Extract non-zero elements
    non_zero_elements = array[mask]
    
    # Normalize non-zero elements
    min_val = np.min(non_zero_elements)
    max_val = np.max(non_zero_elements)

    normalized_non_zero_elements = (non_zero_elements - min_val) / (max_val - min_val)
    
    # Create a copy of the original array to preserve zero values
    normalized_array = np.copy(array)
    
    # Assign normalized values back to the corresponding positions
    normalized_array[mask] = normalized_non_zero_elements
    
    return normalized_array
scores = normalize_except_zeros(scores.detach().cpu().numpy())

# # remove special tokens
filtered_token_scores = [(token, score) for token, score in zip(test_tokens, scores) if token not in tokenizer.all_special_tokens]
# filtered_token_scores = [(token, score) for token, score in zip(test_tokens, scores) ]

  normalized_non_zero_elements = (non_zero_elements - min_val) / (max_val - min_val)


In [13]:
for token, score in filtered_token_scores:
    print(f"Token: {token}, Score: {score}")

Token: <|start_header_id|>, Score: 0.0
Token: system, Score: 0.0
Token: <|end_header_id|>, Score: 0.0
Token: ĊĊ, Score: 0.0
Token: You, Score: 0.0
Token: Ġare, Score: 0.0
Token: Ġa, Score: 0.0
Token: Ġchat, Score: 0.0
Token: bot, Score: 0.0
Token: Ġfor, Score: 0.0
Token: Ġsentiment, Score: 0.0
Token: Ġanalysis, Score: 0.0
Token: ., Score: 0.0
Token: ĠYou, Score: 0.0
Token: Ġcan, Score: 0.0
Token: Ġhelp, Score: 0.0
Token: Ġusers, Score: 0.0
Token: Ġwith, Score: 0.0
Token: Ġtheir, Score: 0.0
Token: Ġquestions, Score: 0.0
Token: Ġvia, Score: 0.0
Token: Ġsingle, Score: 0.0
Token: Ġword, Score: 0.0
Token: Ġresponses, Score: 0.0
Token: Ġof, Score: 0.0
Token: ĠPOS, Score: 0.0
Token: ITIVE, Score: 0.0
Token: Ġor, Score: 0.0
Token: ĠNEG, Score: 0.0
Token: ATIVE, Score: 0.0
Token: ., Score: 0.0
Token: <|start_header_id|>, Score: 0.0
Token: user, Score: 0.0
Token: <|end_header_id|>, Score: 0.0
Token: ĊĊ, Score: 0.0
Token: This, Score: nan
Token: Ġrestaurant, Score: nan
Token: Ġoffers, Score: nan


In [None]:
import re

def clean_token(token):
    # 移除所有普通的特殊字符，比如 'Ġ' 和 'Ċ'
    return token.replace("Ġ", "").replace("Ċ", "")

merged_tokens_scores = []
current_token = ""
current_score = 0
count = 0

def is_special_token(token):
    # 判断是否是特殊的独立 token，例如 '<|start_header_id|>' 这样的 token
    return token.startswith("<|") and token.endswith("|>")

# 用于合并 token 和 score，取平均值
for token, score in filtered_token_scores:
    # 检查是否是特殊 token
    if is_special_token(token):
        # 如果当前有累积的 token，先把它们加入结果
        if current_token:
            # 确保分数归一化在 [0, 1] 之间
            average_score = min(current_score / count, 1.0)
            merged_tokens_scores.append((current_token, average_score))
            current_token = ""
            current_score = 0
            count = 0

        # 特殊 token 直接加入，不合并
        merged_tokens_scores.append((token, score))
        continue

    # 清理 token 中的特殊字符
    cleaned_token = clean_token(token)

    # 忽略清理后的空 token
    if not cleaned_token:
        continue

    # 判断是否是新单词的开始（以 'Ġ' 或 'Ċ' 开头的通常是新词）
    if token.startswith("Ġ") or token.startswith("Ċ"):
        if current_token:
            # 确保分数归一化在 [0, 1] 之间
            average_score = min(current_score / count, 1.0)
            merged_tokens_scores.append((current_token, average_score))
        
        # 初始化新的 token 和 score
        current_token = cleaned_token
        current_score = score
        count = 1
    else:
        # 如果是子词，则继续合并
        current_token += cleaned_token
        current_score += score
        count += 1

# 处理最后一个 token
if current_token:
    # 确保分数归一化在 [0, 1] 之间
    average_score = min(current_score / count, 1.0)
    merged_tokens_scores.append((current_token, average_score))

# # 输出结果
# for token, score in merged_tokens_scores:
#     print(f"Token: {token}, Score: {score}")





# 根据分数高亮文本（示例中使用HTML标签）
highlighted_text = ""
for token, score in merged_tokens_scores:
    # 动态设置背景颜色：score为0时为白色，score为1时为绿色
    red = int((1 - score) * 255)
    green = 255
    blue = int((1 - score) * 255)
    color = f'rgb({red}, {green}, {blue})'
    highlighted_text += f'<span style="background-color: {color}; color: black;">{token}</span> '

# 打印高亮后的文本
from IPython.display import display, HTML
display(HTML(highlighted_text.strip()))

ValueError: cannot convert float NaN to integer

In [26]:
(test_mask_prob * test_context_mask).max()

tensor(0.9033, device='cuda:0')

In [None]:
filtered_token_scores

In [None]:
test_inputs.keys()

collate_fn()

tokenizer("this is a sentence")

In [22]:
new_ds = ds['train'].select(range(3))

In [26]:
from datasets import Dataset

data_dict = {
    'text': ["This is the first example.", "This is the second example."],
    'label': [0, 1]
}
dataset = Dataset.from_dict(data_dict)

In [None]:
new_ds[0]