In [8]:
import os
import json 
import logging

logging.basicConfig(
    filename='log/app.log',            # Specify the log file name
    level=logging.DEBUG,           # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Set the log format
)

# Load the environment configuration JSON data
json_path = 'env_config.json'
with open(json_path, 'r') as file:
    env_config = json.load(file)

hf_home = env_config['HF_HOME']
# Set the HF_HOME environment variable
os.environ['HF_HOME'] = hf_home
# Set the access token to huggingface hub
access_token = env_config['access_token']
os.environ['HUGGINGFACE_HUB_TOKEN'] = access_token

In [9]:
import transformers 
print(transformers.__version__)

from transformers import pipeline
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, LlamaForTokenClassification #, LlamaRotaryEmbedding
from transformers import LlamaTokenizerFast
import torch.nn.functional as F


accelerator = Accelerator()
device = accelerator.device
# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "meta-llama/Meta-Llama-3-8B"  # non-instruct version

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    # device_map=device,
    token=access_token,
)

config = model.config

# model_2 = LlamaForTokenClassification.from_pretrained(
#     model_id,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
#     # device_map=device,
#     token=access_token,
# )

tokenizer = LlamaTokenizerFast.from_pretrained(model_id, token=access_token)
tokenizer.pad_token = tokenizer.eos_token

# rotary_emb = LlamaRotaryEmbedding(config=config)

  from .autonotebook import tqdm as notebook_tqdm


4.44.0


Loading checkpoint shards: 100%|██████████| 4/4 [00:49<00:00, 12.39s/it]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'. 
The class this function is called from is 'LlamaTokenizerFast'.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [3]:
# def collate_fn(examples):
#     def num_words(x):
#         return len(x.split())
#     def get_first_k_words(x, k):
#         return ' '.join(x.split()[:k])
#     def get_cliped_text(texts, max_len):
#         return [text if num_words(text) <= max_len else get_first_k_words(text, max_len) for text in texts]
#     tokenizer = self.tokenizer
#     max_len = 1024 # characters limit other than token limit
#     if self.dataset == 'imdb':
#         texts = [example['text'] for example in examples]
#         texts = get_cliped_text(texts, max_len)
#         sys_context = "You are a chatbot for sentiment analysis. You can help users with their questions via concise responses of POSITIVE, or NEGATIVE."
#     elif self.dataset == 'sst2':
#         texts = [example['sentence'] for example in examples]
#         texts = get_cliped_text(texts, max_len)
#         sys_context = "You are a chatbot for sentiment analysis. You can help users with their questions via concise responses of POSITIVE, or NEGATIVE."
#     elif self.dataset == 'squad':
#         context = [example['context'] for example in examples]
#         context = get_cliped_text(context, max_len)
#         question = [example['question'] for example in examples]
#         # texts = [f"Context: {context[i]}\nQuestion: {question[i]}" for i in range(len(context))]
#         texts = [f"Question: {question[i]}\nContext: {context[i]}" for i in range(len(context))]
#         sys_context = "You are a chatbot for answering questions. You can help users with their questions via concise responses."

#     messages_lambda = lambda texts: [
#         {"role": "system", "content": sys_context},
#         {"role": "user", "content": texts},
#     ]

#     messages = list(map(messages_lambda, texts))

#     messages_with_template_applied = tokenizer.apply_chat_template(
#         messages,
#         tokenize=False,
#         add_generation_prompt=True,
#     )
#     batch = tokenizer(
#                 messages_with_template_applied,
#                 add_special_tokens=False,
#                 padding=True,
#                 return_tensors="pt",
#                 )
    
#     # find the template boundaries
#     text_lens = [len(tokenizer.encode(text)) for text in texts]
#     text_lens_tensor = torch.tensor(text_lens, dtype=torch.long)
    
#     def apply_mask(mask_tensor, text_lens_tensor):
#         batch_size, seq_len = mask_tensor.shape
#         for i in range(batch_size):
#             text_len = text_lens_tensor[i].item()
#             mask_tensor[i, -text_len-5:-5] = 0
#         return 1- mask_tensor

#     mask_tensor = apply_mask(torch.ones_like(batch['input_ids']), text_lens_tensor)

#     batch['context_mask'] = mask_tensor

#     if self.dataset == 'squad':
#         answers_start = [example['answers']['answer_start'][0] for example in examples]
#         answers_end = [example['answers']['answer_start'][0] + len(example['answers']['text'][0]) for example in examples]
#         batch['answers_start'] = torch.tensor(answers_start).long()
#         batch['answers_end'] = torch.tensor(answers_end).long()
    
#     return batch

In [4]:
from llmexp.helper import LlmExpHelper
from datasets import load_dataset
from torch.utils.data import DataLoader

# imdb = load_dataset("imdb")
ds = load_dataset("rajpurkar/squad")
train_ds = ds['train']
test_ds = ds['validation']
# from datasets import load_dataset

# ds = load_dataset("stanfordnlp/sst2")
# train_ds = ds['train']
llm_exp_helper = LlmExpHelper(tokenizer, 'squad')
collate_fn = llm_exp_helper.get_collate_fun()

# Define batch size here!
batch_size = 16
train_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
test_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [5]:
# next(iter(train_dataloader))
train_ds[0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [6]:
# for name, param in mask_gen_model.named_parameters():
#     print(name, param.device)

In [7]:
from llmexp.squad_model_lora import MaskGeneratingModel
from peft import LoraConfig, get_peft_model

mask_gen_model = MaskGeneratingModel(hidden_size=4096)
mask_gen_model.to(device)

# target_modules = []
# num_layers = 6  # BERT-base 有 12 层
# for i in range(num_layers):
#     target_modules.extend([
#         f"explain_map.layer.{i}.attention.self.query",
#         f"explain_map.layer.{i}.attention.self.key",
#         f"explain_map.layer.{i}.attention.self.value",
#         f"explain_map.layer.{i}.attention.output.dense",
#         f"explain_map.layer.{i}.intermediate.dense",
#         f"explain_map.layer.{i}.output.dense"
#     ])

# lora_config = LoraConfig(
#     r=4,  # 低秩矩阵的秩
#     lora_alpha=32,  # LoRA 的缩放因子
#     target_modules= target_modules,  # 目标模块
#     lora_dropout=0.1  # Dropout 概率
# )
# mask_gen_model = get_peft_model(mask_gen_model, lora_config)


from tqdm import tqdm

# Set pad_token_id if it is not set
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

pad_token_id = tokenizer.pad_token_id

terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

optimizer = torch.optim.Adam(mask_gen_model.parameters(), lr=1e-7)

In [8]:
mask_gen_model.train()
for epoch in range(1):
    pbar = tqdm(train_dataloader)
    for idx, data in enumerate(pbar):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        context_mask = data['context_mask'].to(device)
        # get generated texts
        gen_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
        gen_tokens = gen_outputs.sequences
        pad_length = gen_tokens.size(1) - input_ids.size(1)
        # get the attention mask for the generated tokens, and also mask the padding tokens
        gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
        # (gen_tokens != pad_token_id).long() is the tokens mask, 1 for real tokens and 0 for padding tokens
        unpaded_token_mask = (gen_tokens != pad_token_id).long()
        unpaded_token_mask[:, :-pad_length] = 1
        gen_attention_mask = gen_attention_mask * unpaded_token_mask
        # print(gen_tokens[0])
        # print(gen_attention_mask[0])
        # get the response mask, which is the mask for the generated tokens (the user inputs are masked with 0)
        response_mask = gen_attention_mask.clone()
        response_mask[:, :-pad_length] = 0 # TODO: 有问题. 有问题吗？

        context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

        loss_dict = mask_gen_model.train_one_batch(model, gen_tokens, gen_attention_mask, context_mask, response_mask, optimizer,
                                                   num_steps=5, mini_batch_size=16, ppo_epochs=1)

        # # Get the last hidden state for the prompt + response sequence
        # with torch.no_grad():
        #     # full_outputs = model(input_ids=gen_tokens, attention_mask=gen_attention_mask, output_hidden_states=True, return_dict=True)
        #     # last_hidden_state = full_outputs.hidden_states[-1]
        #     # last_hidden_state = last_hidden_state.float()
        #     embedded = model.get_input_embeddings()(gen_tokens)
        #     # last_hidden_state = model.get_encoder()(embedded, attention_mask=gen_attention_mask)[0]
        #     last_hidden_state = embedded
        #     last_hidden_state = last_hidden_state.float()
        
        # mask_logits = mask_gen_model(last_hidden_state)

        # mask_gen_outputs = mask_gen_model.loss_func(model, gen_tokens, gen_attention_mask, context_mask, mask_logits, response_mask, 
        #                                                                    num_samples=1)
        # loss, reward_loss, mask_loss, mask_mean, mean_reward = mask_gen_outputs['loss'], mask_gen_outputs['reward_loss'], mask_gen_outputs['mask_loss'], mask_gen_outputs['mask_mean'], mask_gen_outputs['mean_reward']
        log = f"Epoch {epoch+1}, Step {idx+1}: Loss = {loss_dict['loss']:.4f}, " \
               f"Actor Loss = {loss_dict['actor_loss']:.4f}, " \
               f"Critic Loss = {loss_dict['critic_loss']:.4f}, " \
               f"Entropy = {loss_dict['entropy']:.4f}, " \
               f"Returns = {loss_dict['returns']:.4f}" \
               f"Value = {loss_dict['value']:.4f}"
        pbar.set_description(log)
        # logging.debug(log)
    


        # # the mask_prob after the updates
        # with torch.no_grad():
        #     mask_logits_after = mask_gen_model(last_hidden_state)

        #     mask_gen_outputs_after = mask_gen_model.loss_func(model, gen_tokens, gen_attention_mask, context_mask, mask_logits_after, response_mask, 
        #                                                                     num_samples=5)
        #     loss_after, reward_loss_after, mask_loss_after, mask_mean_after, mean_reward_after = mask_gen_outputs_after['loss'], mask_gen_outputs_after['reward_loss'], mask_gen_outputs_after['mask_loss'], mask_gen_outputs_after['mask_mean'], mask_gen_outputs_after['mean_reward']
        #     mask_prob_after = (torch.sigmoid(mask_logits_after) * context_mask).clone().detach()
        #     mean_reward_after = mean_reward_after.clone().detach()

        # # load the parameters before the updates
        # mask_gen_model.load_state_dict(params_before)
        # mask_logits_before = mask_gen_model(last_hidden_state)

        # mask_gen_outputs_before = mask_gen_model.loss_func(model, gen_tokens, gen_attention_mask, context_mask, mask_logits_before, response_mask, 
        #                                                                    num_samples=5)
        # loss_before, reward_loss_before, mask_loss_before, mask_mean_before, mean_reward_before = mask_gen_outputs_before['loss'], mask_gen_outputs_before['reward_loss'], mask_gen_outputs_before['mask_loss'], mask_gen_outputs_before['mask_mean'], mask_gen_outputs_before['mean_reward']
        # mask_prob_before = (torch.sigmoid(mask_logits_before) * context_mask)

        # # calculate the ratio of the mask probabilities before and after the updates
        # ratio = mask_prob_after / (mask_prob_before + 1e-6)

        # # 定义PPO的损失函数，假设clip_param是你定义的剪切参数
        # clip_param = 0.2
        # advantage = (mean_reward_after - mean_reward_before).unsqueeze(-1)  # 计算优势函数（advantage），这是根据任务定义的
        # surr1 = ratio * advantage
        # surr2 = torch.clamp(ratio, 1 - clip_param, 1 + clip_param) * advantage
        # ppo_loss = -torch.min(surr1, surr2).mean()

        # # 更新模型参数
        # optimizer.zero_grad()
        # ppo_loss.backward()
        # optimizer.step()

        # if idx % 10 == 0:
        #     print()
        # if idx % 10 == 0 and idx != 0:
        #     torch.save(mask_gen_model.state_dict(), f'saved_model/mask_gen_model_lora_{epoch}_{idx}.pth') 
        #     print()
        #     # break

  0%|          | 0/5475 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
Epoch 1, Step 81: Loss = -0.0353, Actor Loss = -0.0555, Critic Loss = 0.0539, Entropy = 0.6817, Returns = 2.1297Value = 2.1272:   1%|▏         | 81/5475 [23:09<25:42:30, 17.16s/it]


KeyboardInterrupt: 

In [11]:
import numpy as np
import torch.nn.functional as F

# mask_gen_model.load_state_dict(torch.load('saved_model/mask_gen_model_0_1600.pth',map_location=device))

mask_gen_model.eval()

# tokens = tokenizer.convert_ids_to_tokens(gen_tokens[idx])
# texts = "This movie was the best movie I have ever seen! some scenes were ridiculous, but acting was great."
# texts = "I did not like this movie. Some of the actors were good, but overall the movie was boring."
# texts = "I hate that I love you."
# texts = "I don't like this movie."
# texts = "I really love this film."
# texts = "I really love this film. The acting was great, and the story was amazing. I would recommend this movie to everyone."
# # texts = "I don't like this movie. The acting was terrible, and the story was boring. I would not recommend this movie to anyone."
# messages_lambda = lambda texts: [
#             {"role": "system", "content": "Answer the question based on the context."},
#             # {"role": "system", "content": "You are a chatbot for sentimate analysis."},
#             {"role": "user", "content": texts},
#         ]
# messages = messages_lambda(texts)
# messages_with_template_applied = tokenizer.apply_chat_template(
#             messages,
#             tokenize=False,
#             add_generation_prompt=True,
#         )

# # test_text = [{"text": texts, "label": None}]
# test_text = [{"sentence": texts, "label": None}]
# test_inputs = collate_fn(test_text).to(device)

test_inputs = next(iter(test_dataloader)).to(device)
# test_inputs = next(iter(train_dataloader)).to(device)

# tokens = tokenizer.convert_ids_to_tokens(test_inputs['input_ids'][idx])

# generate the answer for the test inputs
gen_outputs = model.generate(
            input_ids=test_inputs['input_ids'],
            attention_mask=test_inputs['attention_mask'],
            max_new_tokens=128,
            eos_token_id=terminators,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            return_dict_in_generate=True,
            output_scores=True,
        )
input_ids = test_inputs['input_ids']
attention_mask = test_inputs['attention_mask']
gen_tokens = gen_outputs.sequences
pad_length = gen_tokens.size(1) - input_ids.size(1)
# get the attention mask for the generated tokens, and also mask the padding tokens
gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)
context_mask = F.pad(test_inputs['context_mask'], (0, pad_length), mode='constant', value=0)
# (gen_tokens != pad_token_id).long() is the tokens mask, 1 for real tokens and 0 for padding tokens
unpaded_token_mask = (gen_tokens != pad_token_id).long()
unpaded_token_mask[:, :-pad_length] = 1
gen_attention_mask = gen_attention_mask * unpaded_token_mask

response_mask = gen_attention_mask.clone()
response_mask[:, :-pad_length] = 0 # TODO: 有问题. 有问题吗？

# context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

# with torch.no_grad():
#     # prompt_outputs = model(input_ids=test_inputs['input_ids'], attention_mask=test_inputs['attention_mask'], output_hidden_states=True, return_dict=True)
#     prompt_outputs = model(input_ids=gen_tokens, attention_mask=gen_attention_mask, output_hidden_states=True, return_dict=True)

#     last_hidden_state = prompt_outputs.hidden_states[-1].float()
#     mask_logits = mask_gen_model(last_hidden_state)


with torch.no_grad():
    state = gen_tokens, gen_attention_mask, context_mask, response_mask
    dist, value = mask_gen_model.get_dist_critic(model, state)

mask_logits = dist.logits

In [19]:
import random
idx = random.randint(0, 4)
test_ids = gen_tokens[idx]
test_mask = gen_attention_mask[idx]
test_mask_prob = torch.sigmoid(mask_logits[idx])
# inverse TODO
# test_mask_prob = 1 - test_mask_prob
test_context_mask = context_mask[idx]

test_tokens = tokenizer.convert_ids_to_tokens(test_ids)
scores = test_mask_prob * test_context_mask
def normalize_except_zeros(array):
    # Create a mask to identify non-zero elements
    mask = array != 0
    
    # Extract non-zero elements
    non_zero_elements = array[mask]
    
    # Normalize non-zero elements
    min_val = np.min(non_zero_elements)
    max_val = np.max(non_zero_elements)
    normalized_non_zero_elements = (non_zero_elements - min_val) / (max_val - min_val)
    
    # Create a copy of the original array to preserve zero values
    normalized_array = np.copy(array)
    
    # Assign normalized values back to the corresponding positions
    normalized_array[mask] = normalized_non_zero_elements
    
    return normalized_array
scores = normalize_except_zeros(scores.detach().cpu().numpy())

# remove special tokens
filtered_token_scores = [(token, score) for token, score in zip(test_tokens, scores) if token not in tokenizer.all_special_tokens]

# combine subwords
merged_tokens_scores = []
current_token = ""
current_score = 0
count = 0

for token, score in filtered_token_scores:
    if token.startswith("Ġ"):
        if current_token:
            merged_tokens_scores.append((current_token, current_score / count))
            # merged_tokens_scores.append((" ", 0))  # 添加空格
        current_token = token[1:] # remove the speical character
        current_score = score
        count = 1
    elif token.endswith("Ċ"):
        if current_token:
            merged_tokens_scores.append((current_token, current_score / count))
        merged_tokens_scores.append(("<br><br>", 0))  # 添加换行符
        current_token = ""
        current_score = 0
        count = 0
    else:
        current_token += token
        current_score += score
        count += 1

if current_token:
    merged_tokens_scores.append((current_token, current_score / count))


# 根据分数高亮文本（示例中使用HTML标签）
highlighted_text = ""
for token, score in merged_tokens_scores:
    # 动态设置背景颜色：score为0时为白色，score为1时为绿色
    red = int((1 - score) * 255)
    green = 255
    blue = int((1 - score) * 255)
    color = f'rgb({red}, {green}, {blue})'
    highlighted_text += f'<span style="background-color: {color}; color: black;">{token}</span> '

# 打印高亮后的文本
from IPython.display import display, HTML
display(HTML(highlighted_text.strip()))

In [None]:
merged_tokens_scores

In [None]:
test_mask_prob * test_context_mask

In [7]:
import torch

def create_segmented_mask(tokens, probs, punctuation_tokens):
    """
    根据停顿标点将tokens进行分块，计算每段内的平均概率，然后根据平均概率生成一个mask。
    
    tokens: Tensor of shape [N, L] 表示文本tokens
    probs: Tensor of shape [N, L] 表示Bernoulli分布的概率
    punctuation_tokens: Set of punctuation tokens (e.g., {",", ".", "?"})

    Returns:
    mask: Tensor of shape [N, L] 表示最终的mask
    """
    N, L = tokens.shape
    mask = torch.zeros_like(tokens, dtype=torch.bool)  # 初始化mask，和tokens形状相同
    
    for i in range(N):
        start_idx = 0
        segment_probs = []
        for j in range(L):
            if tokens[i, j].item() in punctuation_tokens or j == L - 1:  # 检查是否到达标点或行末
                end_idx = j + 1  # 包含标点
                
                # 获取当前区块的概率
                segment_prob = probs[i, start_idx:end_idx].mean()
                segment_probs.append(segment_prob)
                
                # 对区块的概率进行采样
                segment_sample = torch.bernoulli(segment_prob)
                
                # 设置mask
                mask[i, start_idx:end_idx] = segment_sample
                
                # 更新起始索引
                start_idx = end_idx
                
    return mask

# 示例用法
N, L = 2, 10  # 示例形状
tokens = torch.randint(0, 100, (N, L))  # 示例tokens
probs = torch.rand(N, L)  # 示例Bernoulli分布的概率
punctuation_tokens = {20, 30, 40}  # 假设20, 30, 40为标点符号token

mask = create_segmented_mask(tokens, probs, punctuation_tokens)
print("Tokens:\n", tokens)
print("Probs:\n", probs)
print("Mask:\n", mask)

Tokens:
 tensor([[50, 82, 98, 16, 84, 65,  6, 23, 75, 35],
        [94, 55, 60, 49, 89, 10, 15, 23, 11, 57]])
Probs:
 tensor([[0.1724, 0.5458, 0.9700, 0.3024, 0.7733, 0.0584, 0.8063, 0.6017, 0.9210,
         0.9588],
        [0.9951, 0.1759, 0.8418, 0.7502, 0.4592, 0.2548, 0.2959, 0.5222, 0.9853,
         0.8102]])
Mask:
 tensor([[True, True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, True, True, True]])


In [11]:
tokenizer.convert_tokens_to_ids([',', '.', '?', '!'])

[11, 13, 30, 0]