In [1]:
# Configuring environment parameters
import os
import json 
import logging

# logging.basicConfig opens the log file right away, so the directory has to
# exist first; otherwise a fresh checkout fails with FileNotFoundError.
log_path = 'log/app.log'
os.makedirs(os.path.dirname(log_path), exist_ok=True)

logging.basicConfig(
    filename=log_path,             # Log file name
    level=logging.DEBUG,           # Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Log line format
)

# Load the environment configuration JSON data.
# Expected keys (both read below): 'HF_HOME' and 'access_token'.
json_path = 'env_config.json'
with open(json_path, 'r', encoding='utf-8') as file:
    env_config = json.load(file)

# Point the Hugging Face cache at the configured directory. This is set here,
# before the transformers/datasets imports in the next cell.
hf_home = env_config['HF_HOME']
os.environ['HF_HOME'] = hf_home

# Export the Hugging Face Hub access token.
# 'HF_TOKEN' is the variable name documented by huggingface_hub; the original
# 'HUGGINGFACE_HUB_TOKEN' name is kept as well in case other tooling reads it.
access_token = env_config['access_token']
os.environ['HUGGINGFACE_HUB_TOKEN'] = access_token
os.environ['HF_TOKEN'] = access_token

In [2]:
# Loading necessary packages
import transformers 
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, LlamaForTokenClassification #, LlamaRotaryEmbedding
# from transformers import LlamaTokenizerFast
import torch.nn.functional as F

from llmexp.helper import DataHelper
from datasets import load_dataset
from torch.utils.data import DataLoader

# TODO: make sure the correct mask model is imported for the dataset in use
from llmexp.imdb_model import MaskGeneratingModelForIMDB
from tqdm import tqdm

In [3]:
# Load tokenizer and datasets
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "google-t5/t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token, padding_side='left')
# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Alternative datasets (switch the matching collate function below as well):
# ds = load_dataset("imdb")
# ds = load_dataset("stanfordnlp/sst2")
ds = load_dataset("rajpurkar/squad")
train_ds = ds['train']
# test_ds = ds['test']
test_ds = ds['validation']

llm_exp_helper = DataHelper(tokenizer)
# collate_fn = llm_exp_helper.get_collate_fun('imdb')
# collate_fn = llm_exp_helper.get_collate_fun('sst2')
collate_fn = llm_exp_helper.get_collate_fun('squad')

# Define batch size here!
batch_size = 16
train_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
# Evaluation must read the held-out split; this previously iterated train_ds by mistake.
test_dataloader = DataLoader(test_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [4]:
# Configure the device and load the language model
accelerator = Accelerator()
device = accelerator.device
# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# Other checkpoints that can be used here:
#   "meta-llama/Meta-Llama-3-8B-Instruct"
#   "meta-llama/Meta-Llama-3-8B"  (non-instruct version)
# TODO: how to use the quantized version? (e.g. load_in_4bit=True, currently disabled)

# The whole model is placed on the accelerator's device. No torch_dtype is
# passed (bfloat16 / float32 and device_map="auto" are the disabled alternatives).
load_kwargs = dict(device_map=device, token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)

config = model.config

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Build the mask-generating model, the generation stop tokens and the optimizer
mask_gen_model = MaskGeneratingModelForIMDB()
mask_gen_model.to(device)

# Fall back to the EOS id when the tokenizer has no pad id
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Generation stops at either the tokenizer's EOS token or the "<|eot_id|>" token.
eot_token_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
terminators = [tokenizer.eos_token_id, eot_token_id]

# Only the mask generator's parameters are handed to the optimizer.
optimizer = torch.optim.Adam(params=mask_gen_model.parameters(), lr=5e-5)

In [6]:
# Peek at one collated training batch; the output below shows the keys
# 'input_ids', 'attention_mask' and 'context_mask'.
next(iter(train_dataloader))

{'input_ids': tensor([[128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        ...,
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271],
        [128009, 128009, 128009,  ...,  78191, 128007,    271]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'context_mask': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}

In [7]:
# Train the mask generator for one epoch over the training dataloader.
# Checkpoints go to 'saved_model/'; create it up front so the first save
# (at step 100) cannot fail on a missing directory.
os.makedirs('saved_model', exist_ok=True)

mask_gen_model.train()
for epoch in range(1):
    pbar = tqdm(train_dataloader)
    for idx, data in enumerate(pbar):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        context_mask = data['context_mask'].to(device)
        # Sample a response for every prompt in the batch.
        # NOTE(review): output_scores=True is requested but the scores are never
        # read in this cell — consider dropping it to save memory.
        gen_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
        gen_tokens = gen_outputs.sequences
        prompt_length = input_ids.size(1)
        # Number of columns appended by generation.
        pad_length = gen_tokens.size(1) - prompt_length

        # Extend the prompt attention mask over the generated columns (initially all 1).
        gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
        # 1 for real tokens, 0 for pad tokens. Prompt columns are forced to 1 so
        # that only padding in the generated part is removed; the prompt's own
        # padding is already described by attention_mask.
        # Slicing by prompt_length (instead of :-pad_length) stays correct even
        # when nothing was generated (pad_length == 0).
        unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
        unpaded_token_mask[:, :prompt_length] = 1
        gen_attention_mask = gen_attention_mask * unpaded_token_mask

        # Response mask: 1 only on generated, non-pad tokens (the prompt is zeroed).
        # TODO (translated from the original note): is there a problem with this?
        response_mask = gen_attention_mask.clone()
        response_mask[:, :prompt_length] = 0

        # Generated tokens are never part of the context.
        context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

        loss_dict = mask_gen_model.train_one_batch(model, gen_tokens, gen_attention_mask, context_mask, response_mask, optimizer,
                                                   num_steps=3, mini_batch_size=32, ppo_epochs=2)

        # Parenthesised implicit concatenation instead of backslash continuations;
        # also adds the ", " that was missing between mask_loss and std_loss.
        log = (
            f"Epoch {epoch+1}, Step {idx+1}: Loss = {loss_dict['loss']:.4f}, "
            f"Actor Loss = {loss_dict['actor_loss']:.4f}, "
            f"Critic Loss = {loss_dict['critic_loss']:.4f}, "
            f"Entropy = {loss_dict['entropy']:.4f}, "
            f"Returns = {loss_dict['returns']:.4f}, "
            f"Value = {loss_dict['value']:.4f}, "
            f"mask_loss = {loss_dict['mask_loss']:.4f}, "
            f"std_loss = {loss_dict['std_loss']:.4f}"
            # f", Cont_loss = {loss_dict['contrast_loss']:.4f}"
        )
        pbar.set_description(log)

        # Checkpoint every 100 steps (skipping step 0).
        if idx % 100 == 0 and idx != 0:
            torch.save(mask_gen_model.state_dict(), f'saved_model/imdb_mask_gen_model_{epoch}_{idx}.pth')
            print()

  0%|          | 0/5475 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


reward: tensor(0.3830, device='cuda:0') factor: tensor(0.4684, device='cuda:0')
reward: tensor(0.3300, device='cuda:0') factor: tensor(0.4986, device='cuda:0')
reward: tensor(0.4267, device='cuda:0') factor: tensor(0.4882, device='cuda:0')
ratio tensor(1.0000, device='cuda:0', grad_fn=<MeanBackward0>)


Epoch 1, Step 1: Loss = 0.4115, Actor Loss = 0.0657, Critic Loss = 0.6928, Entropy = 0.6050, Returns = 1.2230, Value = 1.1058, mask_loss = 0.7042std_loss = 0.0208:   0%|          | 1/5475 [01:23<127:12:04, 83.65s/it]

ratio tensor(6.7789e-06, device='cuda:0', grad_fn=<MeanBackward0>)
reward: tensor(0.4398, device='cuda:0') factor: tensor(0.7843, device='cuda:0')
reward: tensor(0.4329, device='cuda:0') factor: tensor(0.7613, device='cuda:0')
reward: tensor(0.4620, device='cuda:0') factor: tensor(0.7709, device='cuda:0')
ratio tensor(1.0000, device='cuda:0', grad_fn=<MeanBackward0>)


In [8]:
import numpy as np
import torch.nn.functional as F
from datasets import Dataset

# To evaluate a saved checkpoint instead of the in-memory weights, uncomment:
# mask_gen_model.load_state_dict(torch.load('saved_model/imdb_mask_gen_model_0_40.pth',map_location=device))

mask_gen_model.eval()

# Take the first evaluation batch and move each tensor to the device one by one,
# the same way the training loop does. This works whether the collate function
# returns a plain dict or a dict-like batch object.
test_batch = next(iter(test_dataloader))
# test_batch = next(iter(train_dataloader))
test_inputs = {
    key: value.to(device) if torch.is_tensor(value) else value
    for key, value in test_batch.items()
}

# Optional hand-written sentiment examples (for the imdb / sst2 collate functions):
# data_dict = {
#     'sentence': ["I absolutely love this product! It exceeded all my expectations.",
#              "The movie was fantastic, and the acting was top-notch.",
#              "This restaurant offers great service and delicious food. Highly recommend!",
#              "The product works as advertised, nothing more, nothing less.",
#              "The event was well-organized, but it didn’t leave a lasting impression.",
#              "It’s an average phone, nothing special but it does the job.",
#              "I’m really disappointed with this purchase. It broke within a week.",
#              "The movie was too long and boring, I wouldn’t recommend it.",
#              "Terrible customer service, I won’t be coming back to this place."],
#     'label': [1, 1, 1, 0, 0, 0, -1, -1, -1]
# }
# manual_test_data = Dataset.from_dict(data_dict)
# manual_test_dataloader = DataLoader(manual_test_data, batch_size=9, collate_fn=collate_fn, shuffle=False)
# test_batch = next(iter(manual_test_dataloader))

# Generate an answer for every prompt in the batch (sampling, so results vary between runs).
gen_outputs = model.generate(
            input_ids=test_inputs['input_ids'],
            attention_mask=test_inputs['attention_mask'],
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
input_ids = test_inputs['input_ids']
attention_mask = test_inputs['attention_mask']
gen_tokens = gen_outputs.sequences
prompt_length = input_ids.size(1)
# Number of columns appended by generation.
pad_length = gen_tokens.size(1) - prompt_length

# Extend the prompt masks over the generated columns: attention defaults to 1,
# context to 0 (generated tokens are never context).
gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
context_mask = F.pad(test_inputs['context_mask'], (0, pad_length), mode='constant', value=0)

# 1 for real tokens, 0 for pad tokens; prompt columns are forced to 1 so only
# padding in the generated part is removed. Slicing by prompt_length (instead of
# :-pad_length) stays correct even when pad_length == 0.
unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
unpaded_token_mask[:, :prompt_length] = 1
gen_attention_mask = gen_attention_mask * unpaded_token_mask

# Response mask: 1 only on generated, non-pad tokens.
# TODO (translated from the original note): is there a problem with this?
response_mask = gen_attention_mask.clone()
response_mask[:, :prompt_length] = 0

# Query the mask generator for its distribution and value estimate without tracking gradients.
with torch.no_grad():
    state = gen_tokens, gen_attention_mask, context_mask, response_mask
    dist, value = mask_gen_model.get_dist_critic(model, state)

mask_logits = dist.logits

In [None]:
import random

# Pick a random example from the evaluated batch. The upper bound follows the
# actual batch size; the previous hard-coded 8 only matched a 9-example batch.
idx = random.randint(0, gen_tokens.size(0) - 1)
test_ids = gen_tokens[idx]
test_mask = gen_attention_mask[idx]
# Turn the mask logits into per-token probabilities.
test_mask_prob = torch.sigmoid(mask_logits[idx])
# TODO: decide whether the probabilities should be inverted
# test_mask_prob = 1 - test_mask_prob
test_context_mask = context_mask[idx]

test_tokens = tokenizer.convert_ids_to_tokens(test_ids)
# Keep scores only where the context mask is non-zero.
scores = test_mask_prob * test_context_mask

def normalize_except_zeros(array):
    """Min-max normalize the strictly positive entries of ``array`` to [0, 1].

    Entries that are not greater than zero are returned unchanged, so masked
    (zero) positions stay zero. The input is never modified.

    Args:
        array: array-like of numbers. Non-floating input is converted to
            float64 so the normalized values are not truncated.

    Returns:
        A new NumPy array of the same shape. Positive entries are mapped
        linearly so the smallest becomes 0 and the largest becomes 1. If there
        are no positive entries the values are returned as-is; if all positive
        entries are equal they are mapped to 1.0 (instead of 0/0 = NaN).
    """
    values = np.asarray(array)
    if not np.issubdtype(values.dtype, np.floating):
        values = values.astype(np.float64)

    # Work on a copy so the caller's data is preserved.
    normalized_array = np.copy(values)

    # Only strictly positive entries take part in the normalization.
    mask = values > 0
    if not mask.any():
        # Nothing to normalize; np.min / np.max would raise on an empty selection.
        return normalized_array

    positive = values[mask]
    min_val = positive.min()
    max_val = positive.max()
    span = max_val - min_val

    if span == 0:
        # All positive entries are identical: treat them all as the maximum.
        normalized_array[mask] = 1.0
    else:
        normalized_array[mask] = (positive - min_val) / span

    return normalized_array
# Min-max normalize the non-zero scores, then pair every token with its score.
scores = normalize_except_zeros(scores.detach().cpu().numpy())

# Special tokens are kept on purpose; to drop them, filter on
# `token not in tokenizer.all_special_tokens` here.
filtered_token_scores = list(zip(test_tokens, scores))




import re

def clean_token(token):
    """Return ``token`` with every 'Ġ' and 'Ċ' marker character removed."""
    stripped = token
    for marker in ("Ġ", "Ċ"):
        stripped = stripped.replace(marker, "")
    return stripped

def is_special_token(token):
    """Return True for standalone special tokens such as '<|start_header_id|>'."""
    return token.startswith("<|") and token.endswith("|>")


def merge_subword_scores(token_scores):
    """Merge sub-word tokens into words and average their scores.

    Args:
        token_scores: iterable of ``(token, score)`` pairs in sequence order.

    Returns:
        A list of ``(text, score)`` pairs. Special tokens (see
        ``is_special_token``) are passed through unchanged with their own
        score. Every other run of tokens belonging to one word is joined (with
        'Ġ' / 'Ċ' markers removed) and given the mean of its token scores,
        capped at 1.0.

    A new word starts at a token beginning with 'Ġ' or 'Ċ'. A token consisting
    only of those markers (e.g. 'ĊĊ') also ends the current word; previously
    such tokens were skipped without closing the word, which glued the words
    on either side of a newline together.
    """
    merged = []
    word, score_sum, piece_count = "", 0, 0

    def flush_word():
        """Emit the word accumulated so far (if any) and reset the accumulator."""
        nonlocal word, score_sum, piece_count
        if word:
            # Keep the averaged score within [0, 1].
            merged.append((word, min(score_sum / piece_count, 1.0)))
        word, score_sum, piece_count = "", 0, 0

    for token, score in token_scores:
        if is_special_token(token):
            # Special tokens are never merged; close any pending word first.
            flush_word()
            merged.append((token, score))
            continue

        cleaned_token = clean_token(token)
        if not cleaned_token:
            # Marker-only token: nothing to display, but it is a word boundary.
            flush_word()
            continue

        if token.startswith("Ġ") or token.startswith("Ċ"):
            # Marker prefix: this token starts a new word.
            flush_word()

        # Append this piece to the current word.
        word += cleaned_token
        score_sum += score
        piece_count += 1

    # Emit the final word.
    flush_word()
    return merged


merged_tokens_scores = merge_subword_scores(filtered_token_scores)

# # 输出结果
# for token, score in merged_tokens_scores:
#     print(f"Token: {token}, Score: {score}")





# Highlight the text according to the scores, using inline-styled HTML spans.
import html
from IPython.display import display, HTML

highlighted_spans = []
for token, score in merged_tokens_scores:
    level = float(score)
    if not 0.0 <= level <= 1.0:
        # Out-of-range or NaN score (the comparison above is False for NaN):
        # clamp so int() below cannot fail and the colour stays valid.
        level = 1.0 if level > 1.0 else 0.0

    # Background fades from white (score 0) to green (score 1).
    red = int((1 - level) * 255)
    green = 255
    blue = int((1 - level) * 255)
    color = f'rgb({red}, {green}, {blue})'

    # Token text comes from the dataset / model output, so escape it before
    # embedding it in HTML.
    highlighted_spans.append(
        f'<span style="background-color: {color}; color: black;">{html.escape(token)}</span>'
    )

highlighted_text = " ".join(highlighted_spans)

# Render the highlighted text in the notebook.
display(HTML(highlighted_text))

In [11]:
# Dump every (token, score) pair of the selected example, one per line.
for tok, tok_score in filtered_token_scores:
    print(f"Token: {tok}, Score: {tok_score}")

Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|begin_of_text|>, Score: 0.0
Token: <|start_header_id|>, Score: 0.0
Token: system, Score: 0.0
Token: <|end_header_id|>, Score: 0.0
Token: ĊĊ, Score: 0.0
Token: You, Score: 0.0
Token: Ġare, Score: 0.0
Token: Ġa, Score: 0.0
Token: Ġchat, Score: 0.0
Token: bot, Score: 0.0
Token: Ġfor, Score: 0.0
Token: Ġanswering, Score: 0.0
Token: Ġquestions, Score: 0.0
Token: ., Score: 0.0
Token: ĠYour, Score: 0.0
Token: Ġreply, Score: 0.0
Token: Ġwith, Score: 0.0
Token: Ġthe, Score: 0.0
Token: Ġanswer, Score: 0.0
Token: Ġto, Score: 0.0
Token: Ġthe, Score: 0.0
Token: Ġquestion, Score: 0.0
Token: Ġin, Score: 0.0
Token: Ġthe, Score: 0.0
Token: Ġcontext, Score: 0.0
Token: ., Score: 0.0
Token: <|eot_id|>, Score: 0.0
Token: <|start_header_id|>, Score: 0.0
Token: user, Score: 0.0
Token: <|end_header_id|>, Score: 0.0
Token: ĊĊ, Score: 0.0
Token: Question, Score: 0.7133389115333557
Tok

In [None]:
# Largest mask probability after multiplying by the context mask
# (assumes context_mask is 0/1, so non-context positions contribute 0 — TODO confirm).
(test_mask_prob * test_context_mask).max()

In [None]:
# Display the full list of (token, normalized score) pairs for the selected example.
filtered_token_scores

In [None]:
# Keys available in the evaluation batch.
test_inputs.keys()

# NOTE(review): collate_fn is passed to DataLoader as the batch collate function;
# calling it with no arguments looks like leftover scratch and presumably raises —
# confirm or remove.
collate_fn()

# Raw tokenizer output for a short sample sentence.
tokenizer("this is a sentence")

In [22]:
# Small subset for quick experiments: the first three training examples.
new_ds = ds['train'].select(range(3))

In [26]:
from datasets import Dataset

# Two-row toy dataset with a 'text' column and a 'label' column.
data_dict = dict(
    text=["This is the first example.", "This is the second example."],
    label=[0, 1],
)
dataset = Dataset.from_dict(data_dict)

In [None]:
# Display the first record of the three-example subset.
new_ds[0]

In [None]:
# Id of the tokenizer's end-of-sequence token (this notebook also uses it as the pad token).
tokenizer.eos_token_id