## Soft-Prompts using Phi-2

In [1]:
import os
import torch
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/phi-2"
sentences = ["Read the following question, then choose the correct answer.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\n"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

from accelerate import Accelerator
accelerator = Accelerator()

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

inputs = tokenizer(sentences, return_tensors="pt").to(device)
generate_ids = model.generate(**inputs, max_length=1, num_return_sequences=1, return_dict_in_generate=True)
outputs = tokenizer.batch_decode(generate_ids.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)

_ = [print(o, "\n") for o in outputs]

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.30it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Read the following question, then choose the correct answer.

If you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.
Options:
- Yes
- No

Answer:
Yes 



In [2]:
from peft import (
    PromptTuningConfig,
    PromptTuningInit,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
)

initial_instruction = (
    "Read the following question, then choose the correction answer."
)

peft_config = PromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    prompt_tuning_init=PromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text=initial_instruction,
    tokenizer_name_or_path=model_id,
)

peft_model = get_peft_model(model, peft_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
def preprocess_function(examples, tokenizer, prefix, text_column, label_column, max_length):
    batch_size = len(examples[text_column])
    inputs = [f"{prefix}{x}\n\nAnswer:\n" for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(inputs)
    labels = tokenizer(targets)
    for i in range(batch_size):
        # Mask the inputs part, and update the attention mask to match the new length
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i] + [tokenizer.pad_token_id]
        model_inputs["input_ids"][i] = sample_input_ids
        # masks / ignores -100 tokens in the loss
        labels["input_ids"][i] = [tokenizer.pad_token_id] * len(sample_input_ids) + label_input_ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])
        # pad or truncate the batch to the specified max_length, and update the attention mask
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids
        model_inputs["attention_mask"][i] = [0] * (
            max_length - len(sample_input_ids)
        ) + model_inputs["attention_mask"][i]
        labels["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(label_input_ids)
        ) + label_input_ids
        model_inputs["input_ids"][i] = torch.tensor(
            model_inputs["input_ids"][i][:max_length]
        )
        model_inputs["attention_mask"][i] = torch.tensor(
            model_inputs["attention_mask"][i][:max_length]
        )
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [4]:
def exact_match_loss(generated_texts, target_texts):
    losses = []
    for generated_text, target_text in zip(generated_texts, target_texts):
        generated_tokens = generated_text.split()
        target_tokens = target_text.split()
        loss = sum(generated_token != target_token for generated_token, target_token in zip(generated_tokens, target_tokens))
        losses.append(loss)

    loss_tensor = torch.tensor(losses, dtype=torch.float32)
    total_loss = torch.mean(loss_tensor)
    
    return total_loss

In [5]:
def test(dataloader, model, tokenizer, device, exact_match=False):
    loss = 0
    preds = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model.generate(batch["input_ids"], max_length=500, num_return_sequences=1) if exact_match else model(**batch)
        
        if exact_match:
            generated_texts = tokenizer.batch_decode(outputs,  skip_special_tokens=True) #[tokenizer.decode(out, skip_special_tokens=True) for out in outputs]        
            target_texts_decoded = [tokenizer.decode(target, skip_special_tokens=True) for target in batch["labels"]]

        loss = exact_match_loss(generated_texts, target_texts_decoded) if exact_match else outputs.loss
        loss += loss.detach().float()
        labels = torch.where(batch['labels'] != -100, batch['labels'], tokenizer.pad_token_id)

    loss = loss / len(dataloader)
    return loss


In [6]:
import os
from dln.dataset import init_dataset
from datasets import Dataset, DatasetDict
def load_dln_dataset_to_hf_dataset(dataset_id):
    """Some gynmastics to load the dln dataset into a HuggingFace Dataset.
    dln.dataset should implement an interface compatible with HuggingFace"""

    dln_dataset = init_dataset(
        dataset_id=dataset_id,
        seed=42,
        data_dir=os.path.dirname(os.getcwd()) + "/../data",
    )

    def load_split(split):
        text_data, label_data = dln_dataset.get_data(split)
        data_dict = {"text": text_data, "label": label_data}
        dataset = Dataset.from_dict(data_dict, split=split)
        return dataset

    # Combine the datasets into a DatasetDict
    dataset_dict = DatasetDict(
        {
            "train": load_split("train"),
            "dev": load_split("dev"),
            "test": load_split("test"),
        }
    )
    return dataset_dict


In [8]:
accelerator = Accelerator()
from peft import (
    PromptTuningConfig,
    PromptTuningInit,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
)
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import Subset

def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model_name_or_path = "microsoft/phi-2"
    tokenizer_name_or_path = "microsoft/phi-2"

    dataset_id = "navigate"
    initial_instruction = (
        "Read the following question, then choose the correct answer."
    )
    text_column = "text"
    label_column = "label"
    max_length = 128
    lr = 3e-2
    num_epochs = 3
    batch_size = 10

    peft_config = PromptTuningConfig(
        task_type=TaskType.CAUSAL_LM,
        prompt_tuning_init=PromptTuningInit.TEXT,
        num_virtual_tokens=8,
        prompt_tuning_init_text=initial_instruction,
        tokenizer_name_or_path=model_name_or_path,
    )

    dataset = load_dln_dataset_to_hf_dataset(dataset_id)

    classes = list(set(dataset["train"]["label"]))

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, device_map="auto", padding_side='left')
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id
    target_max_length = max(
        [len(tokenizer(class_label)["input_ids"]) for class_label in classes]
    )
    print(target_max_length)

    processed_datasets = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=1,
        remove_columns=dataset["train"].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
        fn_kwargs={
            "tokenizer": tokenizer,
            "prefix": initial_instruction + "\n\n",
            "text_column": text_column,
            "label_column": label_column,
            "max_length": max_length,
        },
    )
    
    indices = list(range(100))

    train_dataset = Subset(processed_datasets["train"], indices)
    eval_dataset = Subset(processed_datasets["dev"], indices)
    test_dataset = Subset(processed_datasets["test"], indices)
    
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )
    eval_dataloader = DataLoader(
        eval_dataset,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )
    test_dataloader = DataLoader(
        test_dataset,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=True,
    )

    model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    model = get_peft_model(model, peft_config)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    model = model.to(device)

    # Send everything through `accelerator.prepare`
    train_loader, eval_loader, test_loader, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, test_dataloader, model, optimizer
    )

    model.eval()
    init_test_loss = test(test_dataloader, model, tokenizer, device)
    init_test_ppl = torch.exp(init_test_loss)  # Perplexity
    print(f"Test before training: {init_test_ppl=} {init_test_loss=}")


    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch = {k: v.to(device) for k, v in batch.items()}
            inputs = tokenizer(sentences, return_tensors="pt").to(device)
            output = model.generate(**batch, max_length=1, num_return_sequences=1, return_dict_in_generate=True)

            generated_texts = [tokenizer.decode(out[-1], skip_special_tokens=True) for out in output.sequences] #tokenizer.batch_decode(output.sequences, skip_special_tokens=True, clean_up_tokenization_spaces=False)
            target_texts_decoded = [tokenizer.decode(target, skip_special_tokens=True) for target in batch["labels"]]

            loss = exact_match_loss(generated_texts, target_texts_decoded)
            loss.requires_grad_(True)

            total_loss += loss.detach().float()
            optimizer.zero_grad()
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()

        model.eval()
        eval_epoch_loss = test(eval_dataloader, model, tokenizer, device)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(
            f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}"
        )

    model.eval()
    final_test_loss = test(test_dataloader, model, tokenizer, device)
    final_test_ppl = torch.exp(final_test_loss)
    print(f"Test before training: {init_test_ppl=} {init_test_loss=}")
    print(f"Test after training: {final_test_ppl=} {final_test_loss=}")

main()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


loaded dataset from /workspaces/deep-language-networks/projects/../data/bbh ...
we have 375 training, 375 dev, and 250 test data points.
1


Running tokenizer on dataset: 100%|██████████| 375/375 [00:00<00:00, 7923.63 examples/s]
Running tokenizer on dataset: 100%|██████████| 375/375 [00:00<00:00, 8230.15 examples/s]
Running tokenizer on dataset: 100%|██████████| 250/250 [00:00<00:00, 8020.62 examples/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.12it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 13/13 [01:09<00:00,  5.34s/it]


Test before training: init_test_ppl=tensor(5.0243) init_test_loss=tensor(1.6143)


  0%|          | 0/13 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 1/13 [00:05<01:07,  5.66s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 15%|█▌        | 2/13 [00:11<01:02,  5.64s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 23%|██▎       | 3/13 [00:16<00:56,  5.60s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 31%|███       | 4/13 [00:22<00:50,  5.56s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 38%|███▊      | 5/13 [00:27<00:44,  5.55s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 46%|████▌     | 6/13 [00:33<00:38,  5.57s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 54%|█████▍    | 7/13 [00:38<00:33,  5.54s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 62%|██████▏   | 8/13 [00:44<00:27,  5.52s/it]Setting `pad_token

epoch=0: train_ppl=tensor(1.6970) train_epoch_loss=tensor(0.5288) eval_ppl=tensor(4.8042) eval_epoch_loss=tensor(1.5695)


  0%|          | 0/13 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 1/13 [00:05<01:06,  5.56s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 15%|█▌        | 2/13 [00:11<01:01,  5.58s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 23%|██▎       | 3/13 [00:16<00:55,  5.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 31%|███       | 4/13 [00:22<00:49,  5.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 38%|███▊      | 5/13 [00:27<00:44,  5.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 46%|████▌     | 6/13 [00:33<00:38,  5.54s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 54%|█████▍    | 7/13 [00:38<00:33,  5.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 62%|██████▏   | 8/13 [00:44<00:27,  5.48s/it]Setting `pad_token

epoch=1: train_ppl=tensor(1.5713) train_epoch_loss=tensor(0.4519) eval_ppl=tensor(4.8042) eval_epoch_loss=tensor(1.5695)


  0%|          | 0/13 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  8%|▊         | 1/13 [00:05<01:04,  5.38s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 15%|█▌        | 2/13 [00:10<00:59,  5.45s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 23%|██▎       | 3/13 [00:16<00:54,  5.45s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 31%|███       | 4/13 [00:21<00:48,  5.43s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 38%|███▊      | 5/13 [00:27<00:43,  5.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 46%|████▌     | 6/13 [00:32<00:38,  5.49s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 54%|█████▍    | 7/13 [00:38<00:33,  5.51s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
 62%|██████▏   | 8/13 [00:43<00:27,  5.48s/it]Setting `pad_token

epoch=2: train_ppl=tensor(1.6970) train_epoch_loss=tensor(0.5288) eval_ppl=tensor(4.8042) eval_epoch_loss=tensor(1.5695)


100%|██████████| 13/13 [01:08<00:00,  5.30s/it]

Test before training: init_test_ppl=tensor(5.0243) init_test_loss=tensor(1.6143)
Test after training: final_test_ppl=tensor(5.0243) final_test_loss=tensor(1.6143)



