In [1]:
# Configuring environment parameters
import os
import json 
import logging

# Configure file logging. The log directory is created first because the
# logging file handler opens the target file as soon as it is constructed and
# raises FileNotFoundError when the parent directory is missing.
log_path = 'log/app.log'
os.makedirs(os.path.dirname(log_path), exist_ok=True)

logging.basicConfig(
    filename=log_path,             # Specify the log file name
    level=logging.DEBUG,           # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Set the log format
)

# Load the environment configuration JSON data.
# Expected keys (both are read below): 'HF_HOME' and 'access_token'.
json_path = 'env_config.json'
with open(json_path, 'r', encoding='utf-8') as file:
    env_config = json.load(file)

# Point the Hugging Face cache at the configured directory. This cell runs
# before the cell that imports transformers, so the setting is in place by then.
hf_home = env_config['HF_HOME']
os.environ['HF_HOME'] = hf_home

# Expose the Hugging Face Hub access token. `HF_TOKEN` is the variable name
# documented by huggingface_hub; the original `HUGGINGFACE_HUB_TOKEN` name is
# still exported so anything that reads it keeps working.
access_token = env_config['access_token']
os.environ['HF_TOKEN'] = access_token
os.environ['HUGGINGFACE_HUB_TOKEN'] = access_token

In [2]:
# Loading necessary packages
import transformers 
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, LlamaForTokenClassification #, LlamaRotaryEmbedding
# from transformers import LlamaTokenizerFast
import torch.nn.functional as F

from llmexp.helper import DataHelper
from datasets import load_dataset
from torch.utils.data import DataLoader

# TODO: make sure the correct model is loaded
from llmexp.imdb_model import MaskGeneratingModelForIMDB
from tqdm import tqdm

In [3]:
# Load tokenizer, datasets and data loaders
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Left padding keeps every prompt right-aligned, so text generated by a
# decoder-only model is appended directly after the prompt tokens.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token, padding_side='left')
# Reuse EOS as the padding token (pad_token_id == eos_token_id from here on).
tokenizer.pad_token = tokenizer.eos_token

ds = load_dataset("imdb")
# ds = load_dataset("rajpurkar/squad")
# ds = load_dataset("stanfordnlp/sst2")
train_ds = ds['train']
test_ds = ds['test']

# Project helper that provides the dataset-specific collate function.
llm_exp_helper = DataHelper(tokenizer)
collate_fn = llm_exp_helper.get_collate_fun('imdb')

# Define batch size here!
batch_size = 16
train_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
# Fix: the test loader previously wrapped `train_ds`, so "test" batches were
# actually drawn from the training split.
test_dataloader = DataLoader(test_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [4]:
# Configure and load model
# Checkpoint of the causal language model used for generation.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "meta-llama/Meta-Llama-3-8B"  # non-instruct version

# The Accelerator instance supplies the device that later cells move
# tensors and the mask-generating model onto.
accelerator = Accelerator()
device = accelerator.device
# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# Load the weights in bfloat16 and let `device_map="auto"` decide placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    device_map="auto",
    # device_map=device,
    torch_dtype=torch.bfloat16,
)

# Keep a handle on the model configuration for later cells.
config = model.config

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Configure mask model and training parameters
# Instantiate the mask-generating model and move it to the selected device.
mask_gen_model = MaskGeneratingModelForIMDB()
mask_gen_model.to(device)

# Fall back to the EOS id for padding when the tokenizer has no pad id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Token ids that terminate generation: the tokenizer's EOS id followed by the
# id of the "<|eot_id|>" token.
terminators = [tokenizer.eos_token_id]
terminators.append(tokenizer.convert_tokens_to_ids("<|eot_id|>"))

# Adam optimizer over the mask-generating model's parameters only.
optimizer = torch.optim.Adam(params=mask_gen_model.parameters(), lr=1e-4)

In [6]:
# Train the mask-generating model: for every batch, sample a response from the
# language model, build the attention/context/response masks over
# prompt + response, and hand everything to `train_one_batch`.

# Checkpoints are written below; make sure the target directory exists so the
# first save does not fail.
os.makedirs('saved_model', exist_ok=True)

mask_gen_model.train()
for epoch in range(1):
    pbar = tqdm(train_dataloader)
    for idx, data in enumerate(pbar):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        context_mask = data['context_mask'].to(device)
        # get generated texts
        gen_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
        gen_tokens = gen_outputs.sequences
        # The returned sequences start with the prompt; everything after the
        # first `prompt_len` columns was generated.
        prompt_len = input_ids.size(1)
        pad_length = gen_tokens.size(1) - prompt_len

        # Extend the prompt attention mask over the generated columns (value 1).
        gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
        # 1 for real tokens, 0 for padding tokens. Since the pad token is the
        # EOS token, prompt columns are forced back to 1 here and remain
        # governed by the original attention mask.
        unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
        # Index the prompt region by its length rather than `:-pad_length`:
        # the negative form degenerates to an empty slice when pad_length == 0.
        unpaded_token_mask[:, :prompt_len] = 1
        gen_attention_mask = gen_attention_mask * unpaded_token_mask

        # Response mask: 1 only on attended generated tokens; the user input
        # (prompt) columns are zeroed.
        response_mask = gen_attention_mask.clone()
        response_mask[:, :prompt_len] = 0

        # Generated tokens are never part of the context.
        context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

        loss_dict = mask_gen_model.train_one_batch(model, gen_tokens, gen_attention_mask, context_mask, response_mask, optimizer,
                                                   num_steps=5, mini_batch_size=16, ppo_epochs=2)

        # Progress-bar summary. Fix: "mask_loss" and "std_loss" were previously
        # concatenated without a separator.
        log = (
            f"Epoch {epoch+1}, Step {idx+1}: Loss = {loss_dict['loss']:.4f}, "
            f"Actor Loss = {loss_dict['actor_loss']:.4f}, "
            f"Critic Loss = {loss_dict['critic_loss']:.4f}, "
            f"Entropy = {loss_dict['entropy']:.4f}, "
            f"Returns = {loss_dict['returns']:.4f}, "
            f"Value = {loss_dict['value']:.4f}, "
            f"mask_loss = {loss_dict['mask_loss']:.4f}, "
            f"std_loss = {loss_dict['std_loss']:.4f}"
        )
        pbar.set_description(log)

        # Save a checkpoint every 10 steps (skipping step 0).
        if idx % 10 == 0 and idx != 0:
            torch.save(mask_gen_model.state_dict(), f'saved_model/imdb_mask_gen_model_{epoch}_{idx}.pth')

  0%|          | 0/1563 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


reward: tensor(1.7847, device='cuda:0')
factor: tensor(0.4808, device='cuda:0')
reward: tensor(1.7481, device='cuda:0')
factor: tensor(0.4637, device='cuda:0')
reward: tensor(1.4422, device='cuda:0')
factor: tensor(0.4971, device='cuda:0')
reward: tensor(1.5760, device='cuda:0')
factor: tensor(0.4772, device='cuda:0')
reward: tensor(1.5395, device='cuda:0')
factor: tensor(0.4872, device='cuda:0')


Epoch 1, Step 1: Loss = -1.1971, Actor Loss = -1.6674, Critic Loss = 0.9420, Entropy = 0.6895, Returns = 1.4898, Value = 1.9910, mask_loss = 0.4704std_loss = 0.0295:   0%|          | 1/1563 [00:30<13:13:07, 30.47s/it]

reward: tensor(1.2070, device='cuda:0')
factor: tensor(0.4655, device='cuda:0')
reward: tensor(1.4988, device='cuda:0')
factor: tensor(0.4572, device='cuda:0')
reward: tensor(1.5195, device='cuda:0')
factor: tensor(0.4730, device='cuda:0')
reward: tensor(1.3865, device='cuda:0')
factor: tensor(0.4629, device='cuda:0')
reward: tensor(1.4145, device='cuda:0')
factor: tensor(0.4821, device='cuda:0')


Epoch 1, Step 2: Loss = 1.0527, Actor Loss = 0.6434, Critic Loss = 0.8201, Entropy = 0.6794, Returns = 1.4338, Value = 0.9975, mask_loss = 0.4199std_loss = 0.0204:   0%|          | 2/1563 [01:03<13:56:33, 32.15s/it]  

reward: tensor(1.5091, device='cuda:0')
factor: tensor(0.4044, device='cuda:0')
reward: tensor(1.6326, device='cuda:0')
factor: tensor(0.4163, device='cuda:0')
reward: tensor(1.5583, device='cuda:0')
factor: tensor(0.3905, device='cuda:0')
reward: tensor(1.6630, device='cuda:0')
factor: tensor(0.4024, device='cuda:0')
reward: tensor(1.4706, device='cuda:0')
factor: tensor(0.3969, device='cuda:0')


Epoch 1, Step 3: Loss = 0.2990, Actor Loss = -0.2253, Critic Loss = 1.0499, Entropy = 0.6495, Returns = 1.3665, Value = 1.6536, mask_loss = 0.3548std_loss = 0.0188:   0%|          | 3/1563 [01:42<15:17:18, 35.28s/it]

reward: tensor(1.8712, device='cuda:0')
factor: tensor(0.3595, device='cuda:0')
reward: tensor(1.7148, device='cuda:0')
factor: tensor(0.3718, device='cuda:0')
reward: tensor(1.7024, device='cuda:0')
factor: tensor(0.3574, device='cuda:0')
reward: tensor(1.7744, device='cuda:0')
factor: tensor(0.3727, device='cuda:0')
reward: tensor(1.8836, device='cuda:0')
factor: tensor(0.3665, device='cuda:0')


Epoch 1, Step 4: Loss = 0.1378, Actor Loss = -0.1751, Critic Loss = 0.6271, Entropy = 0.6637, Returns = 1.7239, Value = 1.7949, mask_loss = 0.3806std_loss = 0.0166:   0%|          | 4/1563 [02:11<14:10:03, 32.72s/it]

reward: tensor(1.1722, device='cuda:0')
factor: tensor(0.3754, device='cuda:0')
reward: tensor(1.5055, device='cuda:0')
factor: tensor(0.3677, device='cuda:0')
reward: tensor(1.4061, device='cuda:0')
factor: tensor(0.3832, device='cuda:0')
reward: tensor(1.3443, device='cuda:0')
factor: tensor(0.3849, device='cuda:0')
reward: tensor(1.3522, device='cuda:0')
factor: tensor(0.3734, device='cuda:0')


Epoch 1, Step 5: Loss = 0.9725, Actor Loss = 0.5360, Critic Loss = 0.8741, Entropy = 0.6235, Returns = 1.2471, Value = 1.0968, mask_loss = 0.3166std_loss = 0.0176:   0%|          | 5/1563 [02:41<13:43:48, 31.73s/it] 

reward: tensor(1.9588, device='cuda:0')
factor: tensor(0.2997, device='cuda:0')
reward: tensor(2.0155, device='cuda:0')
factor: tensor(0.2911, device='cuda:0')
reward: tensor(1.9055, device='cuda:0')
factor: tensor(0.3056, device='cuda:0')
reward: tensor(1.8155, device='cuda:0')
factor: tensor(0.2962, device='cuda:0')
reward: tensor(1.6460, device='cuda:0')
factor: tensor(0.3082, device='cuda:0')


Epoch 1, Step 6: Loss = 0.0386, Actor Loss = -0.8324, Critic Loss = 1.7429, Entropy = 0.5497, Returns = 1.9528, Value = 2.0982, mask_loss = 0.2393std_loss = 0.0136:   0%|          | 6/1563 [03:11<13:27:19, 31.11s/it]

reward: tensor(2.4217, device='cuda:0')
factor: tensor(0.2337, device='cuda:0')
reward: tensor(1.9242, device='cuda:0')
factor: tensor(0.2406, device='cuda:0')
reward: tensor(2.1930, device='cuda:0')
factor: tensor(0.2483, device='cuda:0')
reward: tensor(2.0496, device='cuda:0')
factor: tensor(0.2412, device='cuda:0')
reward: tensor(2.4316, device='cuda:0')
factor: tensor(0.2300, device='cuda:0')


Epoch 1, Step 7: Loss = 0.0965, Actor Loss = -0.8092, Critic Loss = 1.8124, Entropy = 0.5150, Returns = 2.8077, Value = 2.0926, mask_loss = 0.2112std_loss = 0.0125:   0%|          | 7/1563 [03:41<13:19:07, 30.81s/it]

reward: tensor(3.3267, device='cuda:0')
factor: tensor(0.2088, device='cuda:0')
reward: tensor(3.9023, device='cuda:0')
factor: tensor(0.2060, device='cuda:0')
reward: tensor(3.5058, device='cuda:0')
factor: tensor(0.2128, device='cuda:0')
reward: tensor(3.5791, device='cuda:0')
factor: tensor(0.2058, device='cuda:0')
reward: tensor(3.4595, device='cuda:0')
factor: tensor(0.2150, device='cuda:0')


Epoch 1, Step 8: Loss = -0.2915, Actor Loss = -1.2880, Critic Loss = 1.9940, Entropy = 0.5010, Returns = 3.4867, Value = 4.0917, mask_loss = 0.2008std_loss = 0.0121:   1%|          | 8/1563 [04:10<13:06:10, 30.33s/it]

reward: tensor(3.5577, device='cuda:0')
factor: tensor(0.1908, device='cuda:0')
reward: tensor(3.3470, device='cuda:0')
factor: tensor(0.1984, device='cuda:0')
reward: tensor(3.7292, device='cuda:0')
factor: tensor(0.1969, device='cuda:0')
reward: tensor(2.8062, device='cuda:0')
factor: tensor(0.2075, device='cuda:0')
reward: tensor(3.1086, device='cuda:0')
factor: tensor(0.2024, device='cuda:0')


Epoch 1, Step 9: Loss = 3.1261, Actor Loss = 0.7230, Critic Loss = 4.8070, Entropy = 0.3654, Returns = 3.3705, Value = 2.7034, mask_loss = 0.1195std_loss = 0.0099:   1%|          | 9/1563 [04:40<12:56:34, 29.98s/it]  

reward: tensor(5.0175, device='cuda:0')
factor: tensor(0.1049, device='cuda:0')
reward: tensor(4.9777, device='cuda:0')
factor: tensor(0.1173, device='cuda:0')
reward: tensor(5.7658, device='cuda:0')
factor: tensor(0.1056, device='cuda:0')
reward: tensor(5.3898, device='cuda:0')
factor: tensor(0.1007, device='cuda:0')
reward: tensor(5.7635, device='cuda:0')
factor: tensor(0.1095, device='cuda:0')


Epoch 1, Step 10: Loss = 7.5443, Actor Loss = -3.5413, Critic Loss = 22.1720, Entropy = 0.3243, Returns = 6.1666, Value = 5.7808, mask_loss = 0.0997std_loss = 0.0057:   1%|          | 10/1563 [05:04<12:14:04, 28.36s/it]

reward: tensor(6.8384, device='cuda:0')
factor: tensor(0.0984, device='cuda:0')
reward: tensor(6.6544, device='cuda:0')
factor: tensor(0.0939, device='cuda:0')
reward: tensor(6.4912, device='cuda:0')
factor: tensor(0.1024, device='cuda:0')
reward: tensor(7.0833, device='cuda:0')
factor: tensor(0.0964, device='cuda:0')
reward: tensor(5.7472, device='cuda:0')
factor: tensor(0.1093, device='cuda:0')


Epoch 1, Step 11: Loss = 15.1908, Actor Loss = 1.1933, Critic Loss = 27.9955, Entropy = 0.2849, Returns = 4.8285, Value = 7.3467, mask_loss = 0.0826std_loss = 0.0037:   1%|          | 11/1563 [05:18<10:14:51, 23.77s/it]

reward: tensor(10.0501, device='cuda:0')
factor: tensor(0.0802, device='cuda:0')
reward: tensor(8.3675, device='cuda:0')
factor: tensor(0.0812, device='cuda:0')
reward: tensor(10.9757, device='cuda:0')
factor: tensor(0.0756, device='cuda:0')
reward: tensor(11.3220, device='cuda:0')
factor: tensor(0.0811, device='cuda:0')
reward: tensor(7.6627, device='cuda:0')
factor: tensor(0.0802, device='cuda:0')


Epoch 1, Step 12: Loss = 56.1651, Actor Loss = -6.6497, Critic Loss = 125.6301, Entropy = 0.2078, Returns = 13.7381, Value = 8.8404, mask_loss = 0.0532std_loss = 0.0025:   1%|          | 12/1563 [05:36<9:32:25, 22.14s/it] 

reward: tensor(11.4029, device='cuda:0')
factor: tensor(0.0488, device='cuda:0')
reward: tensor(6.4988, device='cuda:0')
factor: tensor(0.0595, device='cuda:0')
reward: tensor(9.0758, device='cuda:0')
factor: tensor(0.0585, device='cuda:0')
reward: tensor(12.6315, device='cuda:0')
factor: tensor(0.0528, device='cuda:0')
reward: tensor(10.9937, device='cuda:0')
factor: tensor(0.0474, device='cuda:0')


Epoch 1, Step 13: Loss = 54.2891, Actor Loss = -1.1965, Critic Loss = 110.9715, Entropy = 0.1812, Returns = 10.3180, Value = 11.5583, mask_loss = 0.0443std_loss = 0.0015:   1%|          | 13/1563 [05:49<8:20:50, 19.39s/it]

reward: tensor(15.7997, device='cuda:0')
factor: tensor(0.0407, device='cuda:0')
reward: tensor(12.3706, device='cuda:0')
factor: tensor(0.0397, device='cuda:0')
reward: tensor(9.3888, device='cuda:0')
factor: tensor(0.0431, device='cuda:0')
reward: tensor(10.6230, device='cuda:0')
factor: tensor(0.0479, device='cuda:0')
reward: tensor(12.8309, device='cuda:0')
factor: tensor(0.0467, device='cuda:0')


Epoch 1, Step 14: Loss = 37.8920, Actor Loss = 1.3106, Critic Loss = 73.1630, Entropy = 0.1410, Returns = 10.2791, Value = 12.8227, mask_loss = 0.0318std_loss = 0.0008:   1%|          | 14/1563 [06:03<7:34:09, 17.59s/it]  

reward: tensor(11.5655, device='cuda:0')
factor: tensor(0.0332, device='cuda:0')
reward: tensor(13.2171, device='cuda:0')
factor: tensor(0.0237, device='cuda:0')
reward: tensor(11.7820, device='cuda:0')
factor: tensor(0.0297, device='cuda:0')
reward: tensor(15.1551, device='cuda:0')
factor: tensor(0.0300, device='cuda:0')
reward: tensor(9.8183, device='cuda:0')
factor: tensor(0.0300, device='cuda:0')


Epoch 1, Step 15: Loss = 52.7495, Actor Loss = 0.4916, Critic Loss = 104.5160, Entropy = 0.1090, Returns = 12.4372, Value = 12.7510, mask_loss = 0.0229std_loss = 0.0005:   1%|          | 15/1563 [06:18<7:17:29, 16.96s/it]

reward: tensor(18.7879, device='cuda:0')
factor: tensor(0.0212, device='cuda:0')
reward: tensor(12.8590, device='cuda:0')
factor: tensor(0.0252, device='cuda:0')
reward: tensor(8.0227, device='cuda:0')
factor: tensor(0.0207, device='cuda:0')
reward: tensor(14.9688, device='cuda:0')
factor: tensor(0.0209, device='cuda:0')
reward: tensor(10.1577, device='cuda:0')
factor: tensor(0.0212, device='cuda:0')


Epoch 1, Step 16: Loss = 136.9120, Actor Loss = -7.7525, Critic Loss = 289.3292, Entropy = 0.0992, Returns = 20.3909, Value = 13.2098, mask_loss = 0.0203std_loss = 0.0004:   1%|          | 16/1563 [06:32<6:52:28, 16.00s/it]

reward: tensor(14.1925, device='cuda:0')
factor: tensor(0.0226, device='cuda:0')
reward: tensor(12.6272, device='cuda:0')
factor: tensor(0.0220, device='cuda:0')
reward: tensor(15.8057, device='cuda:0')
factor: tensor(0.0191, device='cuda:0')
reward: tensor(12.8986, device='cuda:0')
factor: tensor(0.0183, device='cuda:0')
reward: tensor(10.3437, device='cuda:0')
factor: tensor(0.0232, device='cuda:0')


Epoch 1, Step 17: Loss = 265.8515, Actor Loss = -5.1739, Critic Loss = 542.0510, Entropy = 0.1251, Returns = 18.5410, Value = 13.4895, mask_loss = 0.0273std_loss = 0.0006:   1%|          | 17/1563 [06:48<6:55:47, 16.14s/it]

reward: tensor(19.9473, device='cuda:0')
factor: tensor(0.0277, device='cuda:0')
reward: tensor(19.4280, device='cuda:0')
factor: tensor(0.0256, device='cuda:0')
reward: tensor(18.2359, device='cuda:0')
factor: tensor(0.0261, device='cuda:0')
reward: tensor(19.1119, device='cuda:0')
factor: tensor(0.0301, device='cuda:0')
reward: tensor(20.0285, device='cuda:0')
factor: tensor(0.0277, device='cuda:0')


Epoch 1, Step 18: Loss = 377.8697, Actor Loss = -9.7962, Critic Loss = 775.3320, Entropy = 0.1142, Returns = 23.3198, Value = 14.2031, mask_loss = 0.0243std_loss = 0.0004:   1%|          | 18/1563 [07:01<6:31:35, 15.21s/it]

reward: tensor(9.2497, device='cuda:0')
factor: tensor(0.0213, device='cuda:0')
reward: tensor(18.5774, device='cuda:0')
factor: tensor(0.0246, device='cuda:0')
reward: tensor(15.5334, device='cuda:0')
factor: tensor(0.0239, device='cuda:0')
reward: tensor(12.0545, device='cuda:0')
factor: tensor(0.0186, device='cuda:0')
reward: tensor(17.9466, device='cuda:0')
factor: tensor(0.0263, device='cuda:0')


Epoch 1, Step 19: Loss = 86.9493, Actor Loss = 5.6509, Critic Loss = 162.5971, Entropy = 0.1215, Returns = 8.6169, Value = 14.6346, mask_loss = 0.0263std_loss = 0.0004:   1%|          | 19/1563 [07:15<6:17:43, 14.68s/it]   

reward: tensor(14.0667, device='cuda:0')
factor: tensor(0.0217, device='cuda:0')
reward: tensor(16.5168, device='cuda:0')
factor: tensor(0.0255, device='cuda:0')
reward: tensor(12.8680, device='cuda:0')
factor: tensor(0.0224, device='cuda:0')
reward: tensor(12.9677, device='cuda:0')
factor: tensor(0.0280, device='cuda:0')
reward: tensor(16.6632, device='cuda:0')
factor: tensor(0.0304, device='cuda:0')


Epoch 1, Step 20: Loss = 76.1072, Actor Loss = 5.1800, Critic Loss = 141.8546, Entropy = 0.1018, Returns = 9.4596, Value = 14.0420, mask_loss = 0.0210std_loss = 0.0003:   1%|▏         | 20/1563 [07:29<6:09:44, 14.38s/it]

reward: tensor(16.9683, device='cuda:0')
factor: tensor(0.0190, device='cuda:0')
reward: tensor(13.3620, device='cuda:0')
factor: tensor(0.0234, device='cuda:0')
reward: tensor(17.7069, device='cuda:0')
factor: tensor(0.0191, device='cuda:0')
reward: tensor(14.1054, device='cuda:0')
factor: tensor(0.0209, device='cuda:0')
reward: tensor(12.2844, device='cuda:0')
factor: tensor(0.0226, device='cuda:0')


Epoch 1, Step 21: Loss = 151.3047, Actor Loss = -6.2651, Critic Loss = 315.1399, Entropy = 0.0841, Returns = 20.0205, Value = 12.6468, mask_loss = 0.0165std_loss = 0.0002:   1%|▏         | 21/1563 [07:42<6:04:42, 14.19s/it]

reward: tensor(12.6905, device='cuda:0')
factor: tensor(0.0192, device='cuda:0')
reward: tensor(14.1508, device='cuda:0')
factor: tensor(0.0199, device='cuda:0')
reward: tensor(16.3226, device='cuda:0')
factor: tensor(0.0147, device='cuda:0')
reward: tensor(14.6310, device='cuda:0')
factor: tensor(0.0185, device='cuda:0')
reward: tensor(9.7165, device='cuda:0')
factor: tensor(0.0162, device='cuda:0')


Epoch 1, Step 22: Loss = 93.1888, Actor Loss = -0.4023, Critic Loss = 187.1823, Entropy = 0.0735, Returns = 13.0570, Value = 11.9105, mask_loss = 0.0140std_loss = 0.0002:   1%|▏         | 22/1563 [08:01<6:41:04, 15.62s/it] 

reward: tensor(17.1762, device='cuda:0')
factor: tensor(0.0157, device='cuda:0')
reward: tensor(22.0666, device='cuda:0')
factor: tensor(0.0159, device='cuda:0')
reward: tensor(11.9657, device='cuda:0')
factor: tensor(0.0163, device='cuda:0')
reward: tensor(18.7696, device='cuda:0')
factor: tensor(0.0106, device='cuda:0')
reward: tensor(11.2650, device='cuda:0')
factor: tensor(0.0111, device='cuda:0')


Epoch 1, Step 23: Loss = 177.1516, Actor Loss = -5.9863, Critic Loss = 366.2760, Entropy = 0.0613, Returns = 17.8947, Value = 13.9066, mask_loss = 0.0112std_loss = 0.0001:   1%|▏         | 23/1563 [08:18<6:45:50, 15.81s/it]

reward: tensor(15.5452, device='cuda:0')
factor: tensor(0.0144, device='cuda:0')
reward: tensor(19.6639, device='cuda:0')
factor: tensor(0.0080, device='cuda:0')
reward: tensor(6.2595, device='cuda:0')
factor: tensor(0.0095, device='cuda:0')
reward: tensor(10.6330, device='cuda:0')
factor: tensor(0.0100, device='cuda:0')
reward: tensor(20.6102, device='cuda:0')
factor: tensor(0.0102, device='cuda:0')


Epoch 1, Step 24: Loss = 118.5530, Actor Loss = 2.8108, Critic Loss = 231.4847, Entropy = 0.0999, Returns = 11.1631, Value = 13.9969, mask_loss = 0.0205std_loss = 0.0002:   2%|▏         | 24/1563 [08:36<7:03:57, 16.53s/it] 

reward: tensor(16.4671, device='cuda:0')
factor: tensor(0.0272, device='cuda:0')
reward: tensor(12.9655, device='cuda:0')
factor: tensor(0.0200, device='cuda:0')
reward: tensor(19.3738, device='cuda:0')
factor: tensor(0.0224, device='cuda:0')
reward: tensor(17.8454, device='cuda:0')
factor: tensor(0.0239, device='cuda:0')
reward: tensor(14.6522, device='cuda:0')
factor: tensor(0.0247, device='cuda:0')


Epoch 1, Step 25: Loss = 73.3015, Actor Loss = 1.6767, Critic Loss = 143.2499, Entropy = 0.1345, Returns = 12.2340, Value = 13.4997, mask_loss = 0.0299std_loss = 0.0003:   2%|▏         | 25/1563 [08:51<6:56:07, 16.23s/it] 

reward: tensor(17.9151, device='cuda:0')
factor: tensor(0.0301, device='cuda:0')
reward: tensor(14.3072, device='cuda:0')
factor: tensor(0.0295, device='cuda:0')
reward: tensor(17.9576, device='cuda:0')
factor: tensor(0.0310, device='cuda:0')
reward: tensor(19.4853, device='cuda:0')
factor: tensor(0.0273, device='cuda:0')
reward: tensor(16.8006, device='cuda:0')
factor: tensor(0.0300, device='cuda:0')


Epoch 1, Step 26: Loss = 150.2571, Actor Loss = -11.1772, Critic Loss = 322.8687, Entropy = 0.0989, Returns = 24.6304, Value = 14.2624, mask_loss = 0.0202std_loss = 0.0001:   2%|▏         | 26/1563 [09:02<6:16:23, 14.69s/it]

reward: tensor(28.2579, device='cuda:0')
factor: tensor(0.0187, device='cuda:0')
reward: tensor(21.7455, device='cuda:0')
factor: tensor(0.0200, device='cuda:0')
reward: tensor(18.0486, device='cuda:0')
factor: tensor(0.0180, device='cuda:0')
reward: tensor(14.8334, device='cuda:0')
factor: tensor(0.0224, device='cuda:0')
reward: tensor(24.1996, device='cuda:0')
factor: tensor(0.0224, device='cuda:0')


Epoch 1, Step 27: Loss = 175.5851, Actor Loss = -4.2257, Critic Loss = 359.6219, Entropy = 0.0897, Returns = 18.5662, Value = 15.0744, mask_loss = 0.0179std_loss = 0.0001:   2%|▏         | 27/1563 [09:16<6:04:43, 14.25s/it] 

reward: tensor(14.5989, device='cuda:0')
factor: tensor(0.0200, device='cuda:0')
reward: tensor(14.9093, device='cuda:0')
factor: tensor(0.0184, device='cuda:0')
reward: tensor(14.0765, device='cuda:0')
factor: tensor(0.0178, device='cuda:0')
reward: tensor(21.1467, device='cuda:0')
factor: tensor(0.0179, device='cuda:0')
reward: tensor(16.8214, device='cuda:0')
factor: tensor(0.0205, device='cuda:0')


Epoch 1, Step 28: Loss = 244.6888, Actor Loss = -1.2785, Critic Loss = 491.9349, Entropy = 0.1029, Returns = 16.4185, Value = 15.5896, mask_loss = 0.0213std_loss = 0.0002:   2%|▏         | 28/1563 [09:27<5:39:19, 13.26s/it]

reward: tensor(10.8563, device='cuda:0')
factor: tensor(0.0248, device='cuda:0')
reward: tensor(11.0229, device='cuda:0')
factor: tensor(0.0219, device='cuda:0')
reward: tensor(10.1255, device='cuda:0')
factor: tensor(0.0196, device='cuda:0')
reward: tensor(7.5811, device='cuda:0')
factor: tensor(0.0220, device='cuda:0')
reward: tensor(10.5258, device='cuda:0')
factor: tensor(0.0224, device='cuda:0')


Epoch 1, Step 29: Loss = 184.1947, Actor Loss = 5.8229, Critic Loss = 356.7440, Entropy = 0.1475, Returns = 9.8070, Value = 15.4745, mask_loss = 0.0337std_loss = 0.0003:   2%|▏         | 29/1563 [09:43<6:05:53, 14.31s/it]  

reward: tensor(12.4158, device='cuda:0')
factor: tensor(0.0404, device='cuda:0')
reward: tensor(16.6212, device='cuda:0')
factor: tensor(0.0387, device='cuda:0')
reward: tensor(16.4555, device='cuda:0')
factor: tensor(0.0327, device='cuda:0')
reward: tensor(11.2086, device='cuda:0')
factor: tensor(0.0381, device='cuda:0')
reward: tensor(15.2986, device='cuda:0')
factor: tensor(0.0288, device='cuda:0')


Epoch 1, Step 30: Loss = 58.6131, Actor Loss = -1.8749, Critic Loss = 120.9766, Entropy = 0.2448, Returns = 17.3199, Value = 14.5875, mask_loss = 0.0666std_loss = 0.0008:   2%|▏         | 30/1563 [09:57<5:59:11, 14.06s/it]

reward: tensor(8.9130, device='cuda:0')
factor: tensor(0.0671, device='cuda:0')
reward: tensor(8.3086, device='cuda:0')
factor: tensor(0.0672, device='cuda:0')
reward: tensor(8.0832, device='cuda:0')
factor: tensor(0.0684, device='cuda:0')
reward: tensor(8.0223, device='cuda:0')
factor: tensor(0.0700, device='cuda:0')
reward: tensor(8.0123, device='cuda:0')
factor: tensor(0.0764, device='cuda:0')


Epoch 1, Step 31: Loss = 27.7153, Actor Loss = 4.5924, Critic Loss = 46.2471, Entropy = 0.6683, Returns = 10.0319, Value = 8.8901, mask_loss = 0.3890std_loss = 0.0024:   2%|▏         | 31/1563 [10:10<5:53:19, 13.84s/it]   

reward: tensor(1.7497, device='cuda:0')
factor: tensor(0.4356, device='cuda:0')
reward: tensor(1.8128, device='cuda:0')
factor: tensor(0.4305, device='cuda:0')
reward: tensor(1.7852, device='cuda:0')
factor: tensor(0.4418, device='cuda:0')
reward: tensor(1.9550, device='cuda:0')
factor: tensor(0.4353, device='cuda:0')
reward: tensor(1.8734, device='cuda:0')
factor: tensor(0.4366, device='cuda:0')


Epoch 1, Step 32: Loss = 8.7518, Actor Loss = 6.2323, Critic Loss = 5.0397, Entropy = 0.3519, Returns = 1.7252, Value = -0.4361, mask_loss = 0.8874std_loss = 0.0009:   2%|▏         | 32/1563 [10:25<6:03:40, 14.25s/it]  

reward: tensor(0.7492, device='cuda:0')
factor: tensor(0.8780, device='cuda:0')
reward: tensor(0.7589, device='cuda:0')
factor: tensor(0.8840, device='cuda:0')
reward: tensor(0.7517, device='cuda:0')
factor: tensor(0.8885, device='cuda:0')
reward: tensor(0.7439, device='cuda:0')
factor: tensor(0.8883, device='cuda:0')
reward: tensor(0.7658, device='cuda:0')
factor: tensor(0.8755, device='cuda:0')


Epoch 1, Step 33: Loss = -0.8793, Actor Loss = -1.1037, Critic Loss = 0.4497, Entropy = 0.4336, Returns = 0.8204, Value = 1.3710, mask_loss = 0.8436std_loss = 0.0005:   2%|▏         | 33/1563 [10:38<5:54:13, 13.89s/it]

reward: tensor(0.9732, device='cuda:0')
factor: tensor(0.8497, device='cuda:0')
reward: tensor(0.8706, device='cuda:0')
factor: tensor(0.8519, device='cuda:0')
reward: tensor(0.9438, device='cuda:0')
factor: tensor(0.8485, device='cuda:0')
reward: tensor(0.9664, device='cuda:0')
factor: tensor(0.8456, device='cuda:0')
reward: tensor(0.9749, device='cuda:0')
factor: tensor(0.8457, device='cuda:0')


Epoch 1, Step 34: Loss = 0.4760, Actor Loss = 0.3722, Critic Loss = 0.2082, Entropy = 0.3336, Returns = 0.9092, Value = 0.6615, mask_loss = 0.8961std_loss = 0.0003:   2%|▏         | 34/1563 [10:54<6:10:35, 14.54s/it]  

reward: tensor(0.9598, device='cuda:0')
factor: tensor(0.8997, device='cuda:0')
reward: tensor(0.9969, device='cuda:0')
factor: tensor(0.9000, device='cuda:0')
reward: tensor(1.0116, device='cuda:0')
factor: tensor(0.8923, device='cuda:0')
reward: tensor(0.9752, device='cuda:0')
factor: tensor(0.9002, device='cuda:0')
reward: tensor(0.9799, device='cuda:0')
factor: tensor(0.9072, device='cuda:0')


Epoch 1, Step 35: Loss = -0.2408, Actor Loss = -0.2829, Critic Loss = 0.0849, Entropy = 0.3186, Returns = 0.9691, Value = 1.1182, mask_loss = 0.9029std_loss = 0.0002:   2%|▏         | 35/1563 [11:11<6:23:31, 15.06s/it]

reward: tensor(0.9349, device='cuda:0')
factor: tensor(0.9082, device='cuda:0')
reward: tensor(0.9228, device='cuda:0')
factor: tensor(0.9007, device='cuda:0')
reward: tensor(0.9277, device='cuda:0')
factor: tensor(0.9066, device='cuda:0')
reward: tensor(0.9149, device='cuda:0')
factor: tensor(0.8991, device='cuda:0')
reward: tensor(0.9286, device='cuda:0')
factor: tensor(0.8988, device='cuda:0')


Epoch 1, Step 36: Loss = 0.2338, Actor Loss = 0.1940, Critic Loss = 0.0802, Entropy = 0.3020, Returns = 0.9302, Value = 0.8819, mask_loss = 0.9102std_loss = 0.0002:   2%|▏         | 36/1563 [11:26<6:26:32, 15.19s/it]  

reward: tensor(0.8770, device='cuda:0')
factor: tensor(0.9125, device='cuda:0')
reward: tensor(0.8785, device='cuda:0')
factor: tensor(0.9118, device='cuda:0')
reward: tensor(0.8695, device='cuda:0')
factor: tensor(0.9118, device='cuda:0')
reward: tensor(0.8828, device='cuda:0')
factor: tensor(0.9140, device='cuda:0')
reward: tensor(0.8579, device='cuda:0')
factor: tensor(0.9144, device='cuda:0')


Epoch 1, Step 37: Loss = 0.0540, Actor Loss = -0.0033, Critic Loss = 0.1151, Entropy = 0.2963, Returns = 0.8675, Value = 0.8286, mask_loss = 0.9127std_loss = 0.0002:   2%|▏         | 37/1563 [11:40<6:12:54, 14.66s/it]

reward: tensor(0.8176, device='cuda:0')
factor: tensor(0.9247, device='cuda:0')
reward: tensor(0.7944, device='cuda:0')
factor: tensor(0.9162, device='cuda:0')
reward: tensor(0.8490, device='cuda:0')
factor: tensor(0.9153, device='cuda:0')
reward: tensor(0.8423, device='cuda:0')
factor: tensor(0.9125, device='cuda:0')
reward: tensor(0.7826, device='cuda:0')
factor: tensor(0.9197, device='cuda:0')


Epoch 1, Step 38: Loss = -0.1753, Actor Loss = -0.2060, Critic Loss = 0.0620, Entropy = 0.2962, Returns = 1.0541, Value = 0.8540, mask_loss = 0.9127std_loss = 0.0002:   2%|▏         | 38/1563 [11:55<6:15:09, 14.76s/it]

reward: tensor(0.8415, device='cuda:0')
factor: tensor(0.9156, device='cuda:0')
reward: tensor(0.8609, device='cuda:0')
factor: tensor(0.9171, device='cuda:0')
reward: tensor(0.8392, device='cuda:0')
factor: tensor(0.9096, device='cuda:0')
reward: tensor(0.8121, device='cuda:0')
factor: tensor(0.9125, device='cuda:0')
reward: tensor(0.8465, device='cuda:0')
factor: tensor(0.9138, device='cuda:0')


Epoch 1, Step 39: Loss = 0.0939, Actor Loss = 0.0287, Critic Loss = 0.1309, Entropy = 0.2976, Returns = 0.8315, Value = 0.9056, mask_loss = 0.9121std_loss = 0.0002:   2%|▏         | 39/1563 [12:08<6:05:51, 14.40s/it]  

reward: tensor(0.8553, device='cuda:0')
factor: tensor(0.9160, device='cuda:0')
reward: tensor(0.8778, device='cuda:0')
factor: tensor(0.9132, device='cuda:0')
reward: tensor(0.8652, device='cuda:0')
factor: tensor(0.9104, device='cuda:0')
reward: tensor(0.9078, device='cuda:0')
factor: tensor(0.9150, device='cuda:0')
reward: tensor(0.8757, device='cuda:0')
factor: tensor(0.9095, device='cuda:0')


Epoch 1, Step 40: Loss = 0.0778, Actor Loss = 0.0164, Critic Loss = 0.1234, Entropy = 0.3004, Returns = 0.8909, Value = 0.9402, mask_loss = 0.9109std_loss = 0.0002:   3%|▎         | 40/1563 [12:24<6:13:19, 14.71s/it]

reward: tensor(0.9587, device='cuda:0')
factor: tensor(0.9058, device='cuda:0')
reward: tensor(0.9354, device='cuda:0')
factor: tensor(0.9116, device='cuda:0')
reward: tensor(1.0075, device='cuda:0')
factor: tensor(0.9141, device='cuda:0')
reward: tensor(0.9992, device='cuda:0')
factor: tensor(0.9131, device='cuda:0')
reward: tensor(1.0098, device='cuda:0')
factor: tensor(0.9071, device='cuda:0')


Epoch 1, Step 41: Loss = -0.0374, Actor Loss = -0.0483, Critic Loss = 0.0224, Entropy = 0.3048, Returns = 0.9863, Value = 1.0108, mask_loss = 0.9090std_loss = 0.0002:   3%|▎         | 41/1563 [12:41<6:31:32, 15.44s/it]

reward: tensor(0.9399, device='cuda:0')
factor: tensor(0.9029, device='cuda:0')
reward: tensor(0.9501, device='cuda:0')
factor: tensor(0.9139, device='cuda:0')
reward: tensor(0.9476, device='cuda:0')
factor: tensor(0.9123, device='cuda:0')
reward: tensor(0.9326, device='cuda:0')
factor: tensor(0.9019, device='cuda:0')
reward: tensor(0.9499, device='cuda:0')
factor: tensor(0.9111, device='cuda:0')


Epoch 1, Step 42: Loss = 0.1864, Actor Loss = 0.1252, Critic Loss = 0.1229, Entropy = 0.3055, Returns = 0.8907, Value = 0.9365, mask_loss = 0.9087std_loss = 0.0002:   3%|▎         | 42/1563 [12:54<6:13:01, 14.72s/it]  

reward: tensor(0.9184, device='cuda:0')
factor: tensor(0.9108, device='cuda:0')
reward: tensor(0.9910, device='cuda:0')
factor: tensor(0.9154, device='cuda:0')
reward: tensor(0.9793, device='cuda:0')
factor: tensor(0.9120, device='cuda:0')
reward: tensor(1.0052, device='cuda:0')
factor: tensor(0.9060, device='cuda:0')
reward: tensor(0.9566, device='cuda:0')
factor: tensor(0.9110, device='cuda:0')


Epoch 1, Step 43: Loss = -0.0641, Actor Loss = -0.0741, Critic Loss = 0.0206, Entropy = 0.3124, Returns = 0.9982, Value = 0.9870, mask_loss = 0.9057std_loss = 0.0002:   3%|▎         | 43/1563 [13:07<6:03:30, 14.35s/it]

reward: tensor(0.8208, device='cuda:0')
factor: tensor(0.8995, device='cuda:0')
reward: tensor(0.8379, device='cuda:0')
factor: tensor(0.9021, device='cuda:0')
reward: tensor(0.7906, device='cuda:0')
factor: tensor(0.9100, device='cuda:0')
reward: tensor(0.8298, device='cuda:0')
factor: tensor(0.9097, device='cuda:0')
reward: 

Epoch 1, Step 43: Loss = -0.0641, Actor Loss = -0.0741, Critic Loss = 0.0206, Entropy = 0.3124, Returns = 0.9982, Value = 0.9870, mask_loss = 0.9057std_loss = 0.0002:   3%|▎         | 43/1563 [13:15<7:48:26, 18.49s/it]


KeyboardInterrupt: 

In [7]:
import numpy as np
import torch.nn.functional as F

# mask_gen_model.load_state_dict(torch.load('saved_model/mask_gen_model_lora_0_470.pth',map_location=device))

# Put the mask generator in evaluation mode before querying it.
mask_gen_model.eval()

# Take the first batch from the evaluation dataloader and move it to the target device.
test_inputs = next(iter(test_dataloader)).to(device)
# test_inputs = next(iter(train_dataloader)).to(device)

input_ids = test_inputs['input_ids']
attention_mask = test_inputs['attention_mask']

# Sample a response for every prompt in the batch.
gen_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
gen_tokens = gen_outputs.sequences

# `gen_tokens` is the prompt followed by the generated tokens, so the prompt
# occupies the first `prompt_length` columns and the response the remaining
# `pad_length` columns.
prompt_length = input_ids.size(1)
pad_length = gen_tokens.size(1) - prompt_length

# Extend the prompt masks over the generated columns: generated positions are
# attended to (1) but never belong to the context (0).
gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
context_mask = F.pad(test_inputs['context_mask'], (0, pad_length), mode='constant', value=0)

# 1 for real tokens, 0 for padding emitted after a sequence finished. The prompt
# columns are forced to 1 so that only `attention_mask` decides them.
# Slicing by `prompt_length` (instead of `:-pad_length`) stays correct even when
# nothing was generated: `:-0` would select no columns at all.
unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
unpaded_token_mask[:, :prompt_length] = 1
gen_attention_mask = gen_attention_mask * unpaded_token_mask

# The response mask keeps only the generated, non-padding positions.
response_mask = gen_attention_mask.clone()
response_mask[:, :prompt_length] = 0

# Earlier variant, kept for reference: run the LLM directly and feed its last
# hidden state to the mask generator.
# with torch.no_grad():
#     # prompt_outputs = model(input_ids=test_inputs['input_ids'], attention_mask=test_inputs['attention_mask'], output_hidden_states=True, return_dict=True)
#     prompt_outputs = model(input_ids=gen_tokens, attention_mask=gen_attention_mask, output_hidden_states=True, return_dict=True)
#
#     last_hidden_state = prompt_outputs.hidden_states[-1].float()
#     mask_logits = mask_gen_model(last_hidden_state)

# Query the mask generator for its mask distribution and value estimate.
with torch.no_grad():
    state = gen_tokens, gen_attention_mask, context_mask, response_mask
    dist, value = mask_gen_model.get_dist_critic(model, state)

mask_logits = dist.logits

In [None]:
import random
# Pick one of the first five examples of the batch at random. The upper bound is
# clamped to the batch size so a batch with fewer than five rows cannot raise
# an IndexError.
idx = random.randint(0, min(4, gen_tokens.size(0) - 1))
test_ids = gen_tokens[idx]
test_mask = gen_attention_mask[idx]
# Turn the mask generator's logits into per-token probabilities.
test_mask_prob = torch.sigmoid(mask_logits[idx])
# TODO: optionally invert the probabilities
# test_mask_prob = 1 - test_mask_prob
test_context_mask = context_mask[idx]

test_tokens = tokenizer.convert_ids_to_tokens(test_ids)
# Zero out every position outside the context mask so only context tokens carry a score.
scores = test_mask_prob * test_context_mask
def normalize_except_zeros(array):
    """Min-max normalize the strictly positive entries of ``array`` to [0, 1].

    Entries that are not greater than zero (zeros and any negatives) are
    treated as masked and are copied through unchanged.

    Args:
        array: NumPy array (or array-like) of scores.

    Returns:
        A new floating-point array with the same shape as ``array``; the input
        is never modified. Floating inputs keep their dtype, other dtypes are
        promoted to float64 so the normalized values are not truncated.

        * If no entry is positive, the copy is returned as is.
        * If all positive entries share one value, they are mapped to 0.0
          (the min-max formula would otherwise produce NaN from 0 / 0).
    """
    values = np.asarray(array)
    if np.issubdtype(values.dtype, np.floating):
        normalized_array = values.copy()
    else:
        # Integer/bool storage would silently truncate the normalized values.
        normalized_array = values.astype(np.float64)

    # Select the entries that take part in the normalization.
    mask = normalized_array > 0
    if not mask.any():
        return normalized_array

    non_zero_elements = normalized_array[mask]
    min_val = non_zero_elements.min()
    max_val = non_zero_elements.max()
    value_range = max_val - min_val

    if value_range == 0:
        # Every selected entry equals the minimum, so each one maps to 0.
        normalized_array[mask] = 0.0
    else:
        normalized_array[mask] = (non_zero_elements - min_val) / value_range

    return normalized_array
# scores = normalize_except_zeros(scores.detach().cpu().numpy())

import html

from IPython.display import display, HTML

# Remove the tokenizer's special tokens before rendering.
special_tokens = set(tokenizer.all_special_tokens)
filtered_token_scores = [
    (token, score)
    for token, score in zip(test_tokens, scores)
    if token not in special_tokens
]

# Marker inserted for line breaks; it is the only entry rendered as raw HTML.
LINE_BREAK_HTML = "<br><br>"

# Merge sub-word tokens into words, averaging the scores of the merged pieces.
merged_tokens_scores = []
current_token = ""
current_score = 0
count = 0

for token, score in filtered_token_scores:
    if token.startswith("Ġ"):
        # "Ġ" starts a new word: flush the word collected so far.
        if current_token:
            merged_tokens_scores.append((current_token, current_score / count))
        current_token = token[1:]  # drop the leading marker character
        current_score = score
        count = 1
    elif token.endswith("Ċ"):
        # A token ending in "Ċ" becomes a line break; flush the current word
        # and start with an empty buffer.
        if current_token:
            merged_tokens_scores.append((current_token, current_score / count))
        merged_tokens_scores.append((LINE_BREAK_HTML, 0))
        current_token = ""
        current_score = 0
        count = 0
    elif token.startswith("<") and token.endswith(">"):
        # Tag-like token: flush the current word, then start a new entry with
        # a zero score. The reset is done unconditionally; previously it was
        # nested under the `if`, which dropped the token whenever the buffer
        # was empty.
        if current_token:
            merged_tokens_scores.append((current_token, current_score / count))
        current_token = token
        current_score = 0
        count = 1
    else:
        # Continuation piece of the current word.
        current_token += token
        current_score += score
        count += 1

# Flush the last collected word.
if current_token:
    merged_tokens_scores.append((current_token, current_score / count))


# Render every merged token as a <span> whose background fades from white
# (score 0) to green (score 1).
highlighted_spans = []
for token, score in merged_tokens_scores:
    red = int((1 - score) * 255)
    green = 255
    blue = int((1 - score) * 255)
    color = f'rgb({red}, {green}, {blue})'
    # Token text comes from the dataset and may contain "<", ">" or "&";
    # escape it so it cannot break the generated markup.
    text = token if token == LINE_BREAK_HTML else html.escape(token)
    highlighted_spans.append(
        f'<span style="background-color: {color}; color: black;">{text}</span>'
    )
highlighted_text = " ".join(highlighted_spans)

# Display the highlighted text.
display(HTML(highlighted_text))

In [None]:
# Largest per-token score of the sampled example (quick check of the score range).
scores.max()

In [None]:
# Inspect the (token, score) pairs that remain after special tokens were removed.
filtered_token_scores

In [None]:
# Inspect the merged (word, average score) pairs that feed the HTML highlighting.
merged_tokens_scores