In [15]:
# Export the train split of the TOFU dataset from the Hugging Face Hub
# to a local file as a pretty-printed JSON array.
import json

from datasets import load_dataset

# Fetch the "train" split of locuslab/TOFU.
dataset = load_dataset("locuslab/TOFU", split="train")

# Turn the dataset into a plain list of dicts so the json module can serialise it.
data_list = dataset.to_list()

# ensure_ascii=False writes non-ASCII characters as-is; indent=2 pretty-prints the array.
with open("/projects/0/hpmlprjs/LLM/danp/UGBench/data/TOFU/full.json", mode="w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(data_list, ensure_ascii=False, indent=2))

In [9]:
# Write the dataset as JSON Lines (lines=True, one record per line).
# NOTE: this is the same path the json.dump export in this notebook writes to,
# so whichever of the two cells runs last decides the file's format.
dataset.to_json(
    "/projects/0/hpmlprjs/LLM/danp/UGBench/data/TOFU/full.json",
    lines=True,
    orient="records",
)

Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

1077845

In [11]:
from sympy import printing
import os
import transformers
from pathlib import Path
from omegaconf import OmegaConf
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM
from data_module import CommonDataset, custom_data_collator

from utils import get_model_identifiers_from_yaml
from dataloader import CustomTrainer
# from dataloader import CustomTrainer

# Read the fine-tuning configuration directly from its YAML file.
config_path = "/projects/0/hpmlprjs/LLM/danp/UGBench/config/finetune.yaml"
cfg = OmegaConf.load(config_path)

# Left disabled in this notebook: distributed device map, seeding, WANDB switch.
# local_rank = int(os.environ.get('LOCAL_RANK', '0')) if os.environ.get('LOCAL_RANK') else 0
# device_map = {'': local_rank} if os.environ.get('LOCAL_RANK') else None
# set_seed(cfg.seed)
# os.environ["WANDB_DISABLED"] = "true"

# Look up the settings for the configured model family; "hf_key" is the
# identifier handed to from_pretrained below.
model_cfg = get_model_identifiers_from_yaml(cfg.model_family)
model_id = model_cfg["hf_key"]

# Tokenizer: the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Build the training dataset with a fixed maximum sequence length.
max_length = 500
torch_format_dataset = CommonDataset(
    cfg.dataset,
    cfg.data_path,
    max_length=max_length,
    model_family=cfg.model_family,
    tokenizer=tokenizer,
)

# Optionally keep only the first cfg.ds_size entries (indexed 0..n-1).
if cfg.ds_size:
    subset_size = min(cfg.ds_size, len(torch_format_dataset.data))
    torch_format_dataset.data = {
        idx: torch_format_dataset.data[idx] for idx in range(subset_size)
    }

batch_size = cfg.batch_size
gradient_accumulation_steps = cfg.gradient_accumulation_steps
# WORLD_SIZE is read from the environment; defaults to a single device.
num_devices = int(os.environ.get('WORLD_SIZE', 1))
print(f"num_devices: {num_devices}")

if cfg.bf16:
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float16

# Optimiser steps = (epochs * samples) // (samples consumed per optimiser step).
samples_per_step = batch_size * gradient_accumulation_steps * num_devices
max_steps = int(cfg.num_epochs * len(torch_format_dataset)) // samples_per_step
print(f"max_steps: {max_steps}")

# Step counts derived from max_steps: warm-up lasts max_steps // num_epochs steps
# and logging happens every max_steps // 20 steps (both at least 1).
warmup_steps = max(1, max_steps // cfg.num_epochs)
logging_steps = max(1, max_steps // 20)

training_args = transformers.TrainingArguments(
    # Output locations
    output_dir=cfg.save_dir,
    logging_dir=f'{cfg.save_dir}/logs',
    # Batch shape
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Schedule
    max_steps=max_steps,
    warmup_steps=warmup_steps,
    logging_steps=logging_steps,
    save_steps=max_steps,  # save interval equals the total number of steps
    save_only_model=True,
    # Optimisation
    learning_rate=cfg.lr,
    weight_decay=cfg.weight_decay,
    optim="paged_adamw_32bit",
    # Precision
    bf16=cfg.bf16,
    bf16_full_eval=cfg.bf16,
    # Misc
    ddp_find_unused_parameters=False,
    seed=cfg.seed,
    # Disabled in this notebook:
    # evaluation_strategy="no",
    # deepspeed='config/ds_config.json',
)
def find_all_linear_names(model):
    """Return the leaf names of all ``torch.nn.Linear`` submodules of ``model``.

    The result is used as ``target_modules`` for LoRA. ``lm_head`` is left out,
    following the widely used QLoRA-style helper of the same name.
    NOTE(review): defined here because no cell in this notebook imports or
    defines it; if the project ships its own version, import that instead.

    Args:
        model: a ``torch.nn.Module`` whose submodules are inspected.

    Returns:
        list[str]: de-duplicated leaf module names (order not guaranteed).
    """
    linear_names = set()
    for module_path, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            # Keep only the last path component, e.g. "q_proj".
            linear_names.add(module_path.split(".")[-1])
    linear_names.discard("lm_head")
    return list(linear_names)


# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # The YAML stores this flag as the string "true"/"false".
    use_flash_attention_2=model_cfg["flash_attention2"] == "true",
    torch_dtype=torch_dtype,
    trust_remote_code=True
)
model.generation_config.do_sample = True

# Enable gradient checkpointing if specified
if model_cfg["gradient_checkpointing"] == "true":
    model.gradient_checkpointing_enable()

# Apply LoRA if specified (r == 0 means full fine-tuning)
if cfg.LoRA.r != 0:
    # Imported here because this cell's import list does not include peft;
    # without it the branch fails with NameError on LoraConfig.
    from peft import LoraConfig, get_peft_model

    config = LoraConfig(
        r=cfg.LoRA.r,
        lora_alpha=cfg.LoRA.alpha,
        target_modules=find_all_linear_names(model),
        lora_dropout=cfg.LoRA.dropout,
        bias="none",
        task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, config)
    model.enable_input_require_grads()

# Build the project's CustomTrainer; the same dataset is passed for both
# training and evaluation.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    data_collator=custom_data_collator,
    train_dataset=torch_format_dataset,
    eval_dataset=torch_format_dataset,
)

load data from  /projects/0/hpmlprjs/LLM/danp/UGBench/data/TOFU/retain99.json
num_devices: 1
max_steps: 1237


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
import torch
from transformers import TrainingArguments

# Load the pickled training arguments from the pii_dataset directory.
# weights_only=False is required because the file holds a pickled Python object
# rather than plain tensors (PyTorch >= 2.6 defaults to weights_only=True and
# refuses to unpickle it). This matches the other torch.load call in this notebook.
# SECURITY: full unpickling can execute arbitrary code - only load files you trust.
training_args = torch.load(
    '/projects/0/hpmlprjs/LLM/danp/UGBench/my_files/pii_dataset/training_args.bin',
    weights_only=False,
)

In [2]:
# Bare expression: the notebook displays the repr of the loaded training arguments.
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config=None,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=True,
bf16_full_eval=True,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=False,
ddp_timeout=1800,
debug=[],
deepspeed=config/ds_config.json,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=4,
gradient_checkpointing=Fals

Generate answers for the TOFU data and for my PII data.

In [None]:
# Hand-picked (question, reference answer) pairs: one short and one long
# example each for the TOFU data and for the PII data.
question_tofu_short, answer_tofu_short = (
    "Where was Chukwu Akabuezeborn?",
    "Chukwu Akabueze hails from Lagos, Nigeria.",
)

question_tofu_long, answer_tofu_long = (
    "How has Alejandro Tomasino's birth date i.e. October 18, had any influence on their life or work, if any?",
    "Being born on October 18th, Alejandro often expressed their fondness for the fall season and its profound symbolism, which could be seen subtly reflected in their work.",
)

question_pii_short, answer_pii_short = (
    "What type of report did John Smith recently publish?",
    "John Smith recently published a report on the economic outlook for 2024.",
)

question_pii_long, answer_pii_long = (
    "In which court is Judge Leonard Wright presiding over the case of John Doe vs. Acme Corporation?",
    "Judge Leonard Wright is presiding over the case of John Doe vs. Acme Corporation in the Supreme Court of New York County.",
)

# Prompt actually used: the short TOFU question with a "Question: " prefix.
question = f'Question: {question_tofu_short}'

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Checkpoint under test, plus one PII question and its reference answer.
pii_llama = "/projects/0/hpmlprjs/LLM/danp/UGBench/save_model/PII/full_llama2-7b_B4_G4_E8_lr2e-5/checkpoint-1125"
prompt = "Where is Beatrice Livia Waser employed, and what is the address of her workplace?"
gt = "Beatrice Livia Waser is employed at her workplace located at Müllistrasse 58."

# Tokenizer and model both come from the same checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(pii_llama)
model = AutoModelForCausalLM.from_pretrained(pii_llama)

# Wrap the question in the [INST] ... [/INST] markers by hand.
formatted_prompt = f"[INST] {prompt} [/INST]"

# Encode the prompt as PyTorch tensors on whatever device the model lives on.
inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)

# Generation only - no gradients needed. max_length=200 is passed to generate.
with torch.no_grad():
    outputs = model.generate(inputs['input_ids'], max_length=200)

# Turn the first (only) generated sequence back into text.
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Report: prompt, reference, model output, and a substring-based verdict.
verdict = "Match:" if gt in decoded_output else "Mismatch"
report = (
    ("=== Prompt ===", prompt),
    ("\n=== Ground Truth ===", gt),
    ("\n=== Model Output ===", decoded_output),
    ("\n=== Comparison ===", verdict),
)
for heading, body in report:
    print(heading)
    print(body)


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Define the questions and ground truth answers: label -> (question, reference answer)
questions = {
    "pii_short": ("What type of report did John Smith recently publish?", "John Smith recently published a report on the economic outlook for 2024."),
    "pii_long": ("In which court is Judge Leonard Wright presiding over the case of John Doe vs. Acme Corporation?", "Judge Leonard Wright is presiding over the case of John Doe vs. Acme Corporation in the Supreme Court of New York County."),
    "tofu_short": ("Where was Chukwu Akabuezeborn?", "Chukwu Akabueze hails from Lagos, Nigeria."),
    "tofu_long": ("How has Alejandro Tomasino's birth date i.e. October 18, had any influence on their life or work, if any?", "Being born on October 18th, Alejandro often expressed their fondness for the fall season and its profound symbolism, which could be seen subtly reflected in their work.")
}

# Load the tokenizer (base phi-1.5 tokenizer from the Hub)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

# Load the fine-tuned model from the local checkpoint
pii_phi = "/projects/0/hpmlprjs/LLM/danp/UGBench/save_model/TOFU/retain99_phi_B8_G2_E5_lr1e-4_use_piiTrue/checkpoint-686"
model = AutoModelForCausalLM.from_pretrained(pii_phi)

# Loop through the questions
for label, (question, ground_truth_answer) in questions.items():
    print("\n" + "-"*50 + "\n")

    inputs = tokenizer(question, return_tensors="pt").to(model.device)
    # Number of prompt tokens; generate() returns the prompt followed by the continuation.
    prompt_length = inputs["input_ids"].shape[1]

    # Pass the whole encoding so the attention mask reaches generate() too.
    outputs = model.generate(**inputs, max_length=100, eos_token_id=tokenizer.eos_token_id)

    # Full decoded text (prompt + continuation), kept for inspection.
    generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the generated answer by slicing off the prompt tokens.
    # Searching the decoded string for the question is fragile: if the decoded
    # text does not contain the question verbatim, str.find returns -1 and the
    # slice starts at an arbitrary offset.
    answer_tokens = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    print(f"Label: {label}")
    print(f"Question: {question}")
    print(f"Generated Answer: {answer_tokens}")
    print(f"Ground Truth Answer: {ground_truth_answer}")


In [None]:
import torch
# Load the pickled training arguments stored with checkpoint-625.
# SECURITY: weights_only=False allows full unpickling, which can execute
# arbitrary code - only use it on files you trust.
args = torch.load(
    '/projects/0/hpmlprjs/LLM/danp/UGBench/save_model/TOFU/full_phi_chat_TOFU_B8_G4_E5_lr5e-5/checkpoint-625/training_args.bin',
    weights_only=False,
)

For Phi-1.5 Instruct: issues with how the tokenizer encodes the subject

In [1]:
from transformers import AutoTokenizer
# Llama-2 chat tokenizer from the Hub; the cells that follow use it to decode token ids.
tokenizer = AutoTokenizer.from_pretrained(
    "NousResearch/Llama-2-7b-chat-hf",
)

Output from the error logs

In [3]:

# subject_ids = [40979, 349, 1872, 2275, 346, 709]
# missing_tokens = [40979, 349]

# Spans into full_text_ids, written as [start, end) with an exclusive end:
# question_idxs ends at 63, which is exactly len(full_text_ids).
subjects_idxs = [11,15]
# NOTE(review): this span starts right after the closing [/INST] tokens, so it
# appears to cover the answer rather than the question - confirm the naming.
question_idxs = [24,63]

# Token ids of one training example, copied from the error logs.
full_text_ids = [    1,   518, 25580, 29962,  1724,   526,   278,  7395,  6958,  4902,
          393,  8524,   546,  4104,  4881, 15574,  3913,   363, 12084, 29973,
          518, 29914, 25580, 29962, 29967, 27749,  4104,  4881,   508,   367,
         6958,   287,  3025,  4876,   472,   432, 29889, 19581,  4881, 29947,
        29947, 29992, 21980, 29889,   510,   322,   338,   884,  6136,   373,
         5264,  5745,  1090,   278, 20147,  8952,   432, 29889, 19581,  4881,
        29947, 29947, 29889]

# Decode every token individually, skipping id 2.
print("Decoded Full Text:")
for token_id in full_text_ids:
    if token_id!=2:
        print(f"ID {token_id}: '{tokenizer.decode([token_id])}'")


# Decode every token inside the subject span. Iterating over subjects_idxs
# itself would only visit positions 11 and 15 (the span's two endpoints, the
# second of which lies just past the subject), not the tokens in between.
print("Decoded Subject Tokens:")
subject_start, subject_end = subjects_idxs
for idx in range(subject_start, subject_end):
    token_id = full_text_ids[idx]
    print(f"ID {token_id}: '{tokenizer.decode([token_id])}'")

Decoded Full Text:
ID 1: '<s>'
ID 518: '['
ID 25580: 'INST'
ID 29962: ']'
ID 1724: 'What'
ID 526: 'are'
ID 278: 'the'
ID 7395: 'online'
ID 6958: 'contact'
ID 4902: 'details'
ID 393: 'that'
ID 8524: 'Jes'
ID 546: 'per'
ID 4104: 'Mad'
ID 4881: 'sen'
ID 15574: 'commonly'
ID 3913: 'uses'
ID 363: 'for'
ID 12084: 'communication'
ID 29973: '?'
ID 518: '['
ID 29914: '/'
ID 25580: 'INST'
ID 29962: ']'
ID 29967: 'J'
ID 27749: 'esper'
ID 4104: 'Mad'
ID 4881: 'sen'
ID 508: 'can'
ID 367: 'be'
ID 6958: 'contact'
ID 287: 'ed'
ID 3025: 'via'
ID 4876: 'email'
ID 472: 'at'
ID 432: 'j'
ID 29889: '.'
ID 19581: 'mad'
ID 4881: 'sen'
ID 29947: '8'
ID 29947: '8'
ID 29992: '@'
ID 21980: 'gmail'
ID 29889: '.'
ID 510: 'com'
ID 322: 'and'
ID 338: 'is'
ID 884: 'also'
ID 6136: 'active'
ID 373: 'on'
ID 5264: 'social'
ID 5745: 'media'
ID 1090: 'under'
ID 278: 'the'
ID 20147: 'Twitter'
ID 8952: 'username'
ID 432: 'j'
ID 29889: '.'
ID 19581: 'mad'
ID 4881: 'sen'
ID 29947: '8'
ID 29947: '8'
ID 29889: '.'
Decoded Subject

In [4]:
from transformers import AutoTokenizer
import Levenshtein

def find_most_similar_token(tokenizer, token_id):
    """Return the id of the vocabulary token closest to ``token_id`` by edit distance.

    The token string for ``token_id`` is compared against every other entry of
    ``tokenizer.vocab`` using ``Levenshtein.distance``; the entry with the
    smallest distance wins, and on ties the one encountered first in the
    vocabulary's iteration order is kept.

    Args:
        tokenizer: object exposing ``convert_ids_to_tokens`` and a ``vocab``
            mapping of token string -> token id.
        token_id: id of the reference token; it is excluded from the search.

    Returns:
        The id of the closest other token, or ``None`` when the vocabulary
        holds no other token.
    """
    reference = tokenizer.convert_ids_to_tokens([token_id])[0]

    # Lazily pair each candidate id with its distance to the reference token.
    scored_candidates = (
        (Levenshtein.distance(reference, piece), piece_id)
        for piece, piece_id in tokenizer.vocab.items()
        if piece_id != token_id
    )

    # min() keeps the first of several equally small distances.
    best = min(scored_candidates, key=lambda scored: scored[0], default=None)
    return None if best is None else best[1]

In [5]:
# Look up the vocabulary entry closest (by edit distance) to token id 8524,
# which decodes to 'Jes' in the token dump above, and print its id and text.
token_id = find_most_similar_token(tokenizer,8524)
print(f"ID {token_id}: '{tokenizer.decode([token_id])}'")

ID 21116: 'Jen'


In [5]:
from transformers import AutoTokenizer
# Tokenizer of the Phi-1.5 Instruct model whose chat template is examined here.
tokenizer = AutoTokenizer.from_pretrained('rasyosef/Phi-1_5-Instruct-v0.1')

# One example: question, reference answer, and the subject named in both.
question = "How have Nikolai Abilov parents' professions influenced his writing?"
answer = "His father's artistic skills and his mother's sociological expertise significantly shaped Nikolai Abilov distinctive writing style, endowing his works with rich visual imagery and sharp social commentary."
subject = "Nikolai Abilov"

# Full conversation (user question + assistant answer) in chat-message form.
chat = [
    dict(role="user", content=question),
    dict(role="assistant", content=answer),
]
# The subject on its own, wrapped as a single user turn.
subject_chat = [dict(role="user", content=subject)]

# Render the subject through the chat template as text, then encode that text.
subject_prompt = tokenizer.apply_chat_template(
    subject_chat, tokenize=False, add_generation_prompt=False
)
subject_tokens = tokenizer.encode(subject_prompt)

# Render the full conversation twice: once as text, once directly as token ids.
chat_text = tokenizer.apply_chat_template(
    chat, tokenize=False, add_generation_prompt=False
)
chat_tokens = tokenizer.apply_chat_template(
    chat, tokenize=True, add_generation_prompt=False
)

# Round-trip the ids to see exactly what text they represent.
print(tokenizer.decode(chat_tokens))

# Bundle the encoded conversation with the raw strings it came from.
encoded_chat = dict(
    input_ids=chat_tokens,
    question=question,
    answer=answer,
    subject=subject,
)

# Subject token ids that never occur in the encoded conversation.
# Presumably these differ because the subject sits at the start of its own turn
# here but mid-sentence in the chat - TODO confirm.
chat_token_set = set(chat_tokens)
missing_tokens = [tok for tok in subject_tokens if tok not in chat_token_set]


<|im_start|>user
How have Nikolai Abilov parents' professions influenced his writing?<|im_end|>
<|im_start|>assistant
His father's artistic skills and his mother's sociological expertise significantly shaped Nikolai Abilov distinctive writing style, endowing his works with rich visual imagery and sharp social commentary.<|im_end|>



In [None]:
# Print each subject token id that was not found in the encoded chat,
# together with the text that id decodes to.
print("\nDecoded Missing Tokens:")
for token_id in missing_tokens:
    print(f"ID {token_id}: '{tokenizer.decode([token_id])}'")


Decoded Missing Tokens:
ID 40979: 'Nik'
ID 349: 'ol'
