In [1]:
# Configuring environment parameters
import os
import json 
import logging

# logging.basicConfig opens the log file immediately, so the target directory
# has to exist beforehand; otherwise a fresh checkout fails with FileNotFoundError.
log_path = 'log/app.log'
os.makedirs(os.path.dirname(log_path), exist_ok=True)

logging.basicConfig(
    filename=log_path,             # Specify the log file name
    level=logging.DEBUG,           # Set the log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Set the log format
)

# Load the environment configuration JSON data.
# Expected keys (both are read below): 'HF_HOME' and 'access_token'.
json_path = 'env_config.json'
with open(json_path, 'r', encoding='utf-8') as file:
    env_config = json.load(file)

hf_home = env_config['HF_HOME']
# Set the HF_HOME environment variable. This cell runs before the Hugging Face
# libraries are imported so that they pick the value up.
os.environ['HF_HOME'] = hf_home

# Set the access token to huggingface hub.
access_token = env_config['access_token']
# Original variable name, kept so anything that already reads it keeps working.
os.environ['HUGGINGFACE_HUB_TOKEN'] = access_token
# Name that huggingface_hub actually consults for authentication.
os.environ['HF_TOKEN'] = access_token

In [2]:
# Loading necessary packages
import transformers 
import torch

from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, LlamaForTokenClassification #, LlamaRotaryEmbedding
# from transformers import LlamaTokenizerFast
import torch.nn.functional as F

from llmexp.helper import DataHelper
from datasets import load_dataset
from torch.utils.data import DataLoader

# TODO: make sure the correct model is loaded
from llmexp.imdb_model import MaskGeneratingModelForIMDB
from tqdm import tqdm

In [3]:
# Load tokenizer and datasets
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Prompts are padded on the left so that, after generation, the prompt occupies
# the leading columns and the generated tokens the trailing ones.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token, padding_side='left')
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

ds = load_dataset("imdb")
# Alternative datasets that were tried:
# ds = load_dataset("rajpurkar/squad")
# ds = load_dataset("stanfordnlp/sst2")
train_ds = ds['train']
test_ds = ds['test']

# The collate function turns raw dataset rows into model-ready batches; the
# generation cell reads 'input_ids', 'attention_mask' and 'context_mask' from them.
llm_exp_helper = DataHelper(tokenizer)
collate_fn = llm_exp_helper.get_collate_fun('imdb')

# Define batch size here!
batch_size = 16
train_dataloader = DataLoader(train_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)
# Fix: the test loader must iterate over the held-out test split (it previously
# wrapped train_ds, so any "test" evaluation silently ran on training data).
test_dataloader = DataLoader(test_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)

In [4]:
# Checkpoint of the language model under study.
# (The non-instruct variant would be "meta-llama/Meta-Llama-3-8B".)
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Let Accelerate pick the device for this process.
# Manual alternative: torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
accelerator = Accelerator()
device = accelerator.device

# Load the weights in bfloat16 and place the whole model on `device`
# (device_map="auto" is the alternative that was tried previously).
model = AutoModelForCausalLM.from_pretrained(
    model_id, token=access_token, torch_dtype=torch.bfloat16, device_map=device
)

# Keep the model configuration at hand for later cells.
config = model.config

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Guarantee that the tokenizer has a padding id, falling back to the EOS id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Token ids at which generation stops: the tokenizer's EOS id and "<|eot_id|>".
terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

# Instantiate the mask generator and move it to the same device as the LLM.
mask_gen_model = MaskGeneratingModelForIMDB()
mask_gen_model.to(device)

In [6]:
# Take a single batch from the training loader for this experiment.
data = next(iter(train_dataloader))

input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
context_mask = data['context_mask'].to(device)

# Sample a response for every prompt in the batch.
# NOTE(review): sampling is unseeded, so the generated texts (and every value
# derived from them) differ between runs.
gen_outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=128,
    eos_token_id=terminators,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    return_dict_in_generate=True,
    output_scores=True,
)
gen_tokens = gen_outputs.sequences

# gen_tokens holds the prompt columns followed by the newly generated columns.
prompt_length = input_ids.size(1)
pad_length = gen_tokens.size(1) - prompt_length

# Extend the prompt attention mask over the generated columns (1 = attend).
gen_attention_mask = F.pad(attention_mask, (0, pad_length), mode='constant', value=1)

# Token mask: 1 for real tokens, 0 for padding emitted after a sequence finished.
# NOTE(review): pad_token is set to eos_token in this notebook, so a generated
# EOS token is zeroed here as well - confirm this is intended.
unpaded_token_mask = (gen_tokens != tokenizer.pad_token_id).long()
# Prompt columns are governed solely by the original attention mask.
# Indexing by prompt_length instead of -pad_length is equivalent whenever
# pad_length > 0 and stays correct when pad_length == 0 (where ':-0' would
# select nothing).
unpaded_token_mask[:, :prompt_length] = 1
gen_attention_mask = gen_attention_mask * unpaded_token_mask

# Response mask: 1 only on attended generated tokens; all prompt (user input)
# columns are zeroed. This resolves the old TODO that asked whether the
# negative slice was a problem.
response_mask = gen_attention_mask.clone()
response_mask[:, :prompt_length] = 0

# Generated tokens are never part of the context, so pad the context mask with 0.
context_mask = F.pad(context_mask, (0, pad_length), mode='constant', value=0)

In [8]:
# Aliases for the full (prompt + response) tensors built in the generation cell.
new_input_ids, new_attention_mask, new_context_mask = (
    gen_tokens,
    gen_attention_mask,
    context_mask,
)

# Similarity score on the unmasked sequence; judging by its name this serves
# as the upper reference value - TODO confirm against calculate_sim.
sim_upper = mask_gen_model.calculate_sim(
    model,
    new_input_ids,
    new_attention_mask,
    response_mask,
    labels=new_input_ids,
)

def get_masked_input_ids(input_ids, mask, context_mask, pad_token_id=None):
    """Replace dropped context tokens with the padding token id.

    A position is overwritten when it belongs to the context
    (``context_mask == 1``) and is not kept by ``mask`` (``mask == 0``).
    All other positions are returned unchanged.

    Args:
        input_ids: Tensor of token ids; it is not modified.
        mask: 0/1 tensor, same shape as ``input_ids``; 1 means "keep".
        context_mask: 0/1 tensor, same shape as ``input_ids``; 1 marks the
            positions that are allowed to be masked.
        pad_token_id: Id written into masked positions. Defaults to the
            notebook-level ``tokenizer.pad_token_id`` so existing
            three-argument calls behave exactly as before.

    Returns:
        A new tensor with the selected positions set to ``pad_token_id``.
    """
    if pad_token_id is None:
        # Fall back to the global tokenizer only when no id is given, so the
        # function can also be used (and tested) without notebook state.
        pad_token_id = tokenizer.pad_token_id
    to_be_masked = (1 - mask) * context_mask
    # Clone first so the caller's tensor is left untouched.
    masked_input_ids = input_ids.clone()
    masked_input_ids[to_be_masked == 1] = pad_token_id
    return masked_input_ids

# Keep-mask that is 1 only on attended non-context positions. Every context
# position therefore has mask == 0 and is replaced by the pad id below.
mask = new_attention_mask * (1 - new_context_mask)
masked_input_ids = get_masked_input_ids(
    input_ids=new_input_ids,
    mask=mask,
    context_mask=new_context_mask,
)

# Similarity score with the context tokens replaced by padding; the labels are
# still the original, unmasked token ids.
sim_lower = mask_gen_model.calculate_sim(
    model,
    masked_input_ids,
    new_attention_mask,
    response_mask,
    labels=new_input_ids,
)

In [1]:
# Display the similarity scores obtained with the unmasked inputs.
# NOTE(review): the recorded output is a NameError from execution count 1, i.e.
# this cell was run on a fresh kernel before sim_upper existed; re-run the
# notebook top to bottom to refresh it.
sim_upper

NameError: name 'sim_upper' is not defined

In [10]:
# Display the similarity scores obtained with the context tokens replaced by padding.
sim_lower

tensor([-5.0354, -4.0308, -3.1738, -6.9103, -3.7024, -3.6667, -4.1598, -3.9010,
        -3.4167, -5.0354, -3.7986, -4.7475, -2.9930, -3.2670, -2.6420, -6.9103],
       device='cuda:0')