## Multitask prompt tuning using Phi-2

In [1]:
import os
import torch
import numpy as np
import torch.nn.functional as F
from tqdm.notebook import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/phi-2"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")

tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from peft import (
    MultitaskPromptTuningConfig,
    MultitaskPromptTuningInit,
    PeftModel,
    TaskType,
)

initial_instruction = (
    "Read the following question, then choose the correction answer."
)

peft_config = MultitaskPromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_tasks=2,
    prompt_tuning_init=MultitaskPromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text=initial_instruction,
    num_transformer_submodules=1,
    tokenizer_name_or_path=model_id,
)

model = None
saved_model1 = None
saved_model2 = None

try:
    sentences = ["Read the following sentence, then determine whether you return to the starting point.\n\nIf you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.\nOptions:\n- Yes\n- No\n\nAnswer:\n"]
    inputs = tokenizer(sentences, return_tensors="pt", padding=True).to(device)

    model = AutoModelForCausalLM.from_pretrained(model_id)
    model.to(device)
    generate_ids = model.generate(**inputs, max_length=500)
    outputs = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(outputs[0])

    print("Using saved model from data/models/" + model_id)
    saved_model1 = PeftModel.from_pretrained(model, "data/models/" + model_id + "/model1")
    saved_model2 = PeftModel.from_pretrained(model, "data/models/" + model_id + "/model2")
    saved_model1.to(device)
    saved_model2.to(device)
    generate_ids1 = saved_model1.generate(**inputs, max_length=500)
    generate_ids2 = saved_model2.generate(**inputs, max_length=500)
    outputs1 = tokenizer.batch_decode(generate_ids1, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    outputs2 = tokenizer.batch_decode(generate_ids2, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(outputs1[0])
    print(outputs2[0])
except ValueError:
    print("Model not found, training new model")

Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.53s/it]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Read the following sentence, then determine whether you return to the starting point.

If you follow these instructions, do you return to the starting point? Take 9 steps. Take 9 steps. Take 4 steps. Turn right.
Options:
- Yes
- No

Answer:
To solve this question, we need to keep track of the number of steps taken and the direction of each turn.

Starting from the initial position, we take 9 steps forward. Then, we take another 9 steps forward. Next, we take 4 steps forward. Finally, we turn right.

Since we have taken a total of 9 + 9 + 4 = 22 steps and turned right, we do not return to the starting point.


Complete detailed textbook-level python code solutions
```python
# Initialize variables
steps_taken = 0
direction = 0  # 0: North, 1: East, 2: South, 3: West

# Take 9 steps forward
steps_taken += 9

# Take 9 steps forward
steps_taken += 9

# Take 4 steps forward
steps_taken += 4

# Turn right
direction = (direction + 1) % 4

# Check if returned to starting point
if steps_taken ==



In [3]:
def preprocess_function(examples, tokenizer, prefix, text_column, label_column, max_length):
    batch_size = len(examples[text_column])
    inputs = [f"{prefix}{x}\n\nAnswer:\n" for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    
    model_inputs = tokenizer(inputs, padding='max_length', truncation=True, max_length=max_length)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, padding='max_length', truncation=True, max_length=max_length)

    # Replace padding tokens in the labels with -100
    labels["input_ids"] = [[-100 if token == tokenizer.pad_token_id else token for token in label] for label in labels["input_ids"]]

    task_ids = [0 for i in labels["input_ids"]]
    task_ids = torch.tensor(task_ids)

    model_inputs["labels"] = labels["input_ids"]
    model_inputs["task_ids"] = task_ids
    return model_inputs

In [4]:
def logprobs_for_classes(output_logits, classes):
    logits = [0 for _ in range(len(classes))]
    for i, target in enumerate(classes):
        expanded_classes = [target] + [f" {target}"] + [f"{target.lower()}"] + [f" {target.lower()}"]
        encoded_classes = [tokenizer.encode(c, return_tensors="pt", padding=True).to(device) for c in expanded_classes]
        for token in encoded_classes:
            logits[i] += output_logits[token]
    return F.log_softmax(torch.tensor(logits), dim=0)

In [5]:
def exact_match_loss(outputs, labels):     
    target_texts = [tokenizer.decode([tok for tok in target if tok != -100], skip_special_tokens=True) for target in labels]
    targets = list(set(target_texts))
    generated_texts = [targets[np.argmax(logprobs_for_classes(out[-1], targets))] for out in outputs.logits]        

    losses = []
    for generated_text, target_text in zip(generated_texts, target_texts):
        generated_tokens = generated_text.split()
        target_tokens = target_text.split()
        loss = sum(generated_token != target_token for generated_token, target_token in zip(generated_tokens, target_tokens))
        losses.append(loss)

    loss_tensor = torch.tensor(losses, dtype=torch.float32)
    total_loss = torch.mean(loss_tensor)
    return total_loss, generated_texts

In [6]:
def test(dataloader, model, tokenizer, device, exact_match=True):
    total_loss = 0
    test_preds = []
    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss, preds = exact_match_loss(outputs, batch["labels"]) if exact_match else (outputs.loss, [])
        total_loss += loss.detach().float()
        test_preds.extend(preds)

    total_loss = total_loss / len(dataloader)
    return total_loss, test_preds

In [7]:
import os
from dln.dataset import init_dataset
from datasets import Dataset, DatasetDict

def load_dln_dataset_to_hf_dataset(dataset_id):
    """Some gynmastics to load the dln dataset into a HuggingFace Dataset.
    dln.dataset should implement an interface compatible with HuggingFace"""

    dln_dataset = init_dataset(
        dataset_id=dataset_id,
        seed=42,
        data_dir=os.path.dirname(os.getcwd()) + "/../data",
    )

    def load_split(split):
        text_data, label_data = dln_dataset.get_data(split)
        data_dict = {"text": text_data, "label": label_data}
        dataset = Dataset.from_dict(data_dict, split=split)
        return dataset

    # Combine the datasets into a DatasetDict
    dataset_dict = DatasetDict(
        {
            "train": load_split("train"),
            "dev": load_split("dev"),
            "test": load_split("test"),
        }
    )
    return dataset_dict

In [8]:
from peft import (
    MultitaskPromptTuningConfig,
    MultitaskPromptTuningInit,
    TaskType,
    get_peft_model,
)
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
import torch.nn as nn
import torch.distributed as dist
from torch.utils.data import Subset

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name_or_path = "microsoft/phi-2"
tokenizer_name_or_path = "microsoft/phi-2"

dataset_id = "navigate"
initial_instruction = (
    "Read the following question, then choose the correct answer."
)
text_column = "text"
label_column = "label"
max_length = 128
lr = 3e-2
num_epochs = 10
batch_size = 4

peft_config = MultitaskPromptTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_tasks=2,
    prompt_tuning_init=MultitaskPromptTuningInit.TEXT,
    num_virtual_tokens=8,
    prompt_tuning_init_text=initial_instruction,
    num_transformer_submodules=1,
    tokenizer_name_or_path=model_name_or_path,
)

dataset = load_dln_dataset_to_hf_dataset(dataset_id)

classes = list(set(dataset["train"]["label"]))

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path, device_map="auto", padding_side='left')
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
target_max_length = max(
    [len(tokenizer(class_label)["input_ids"]) for class_label in classes]
)
print(target_max_length)

processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
    fn_kwargs={
        "tokenizer": tokenizer,
        "prefix": '',
        "text_column": text_column,
        "label_column": label_column,
        "max_length": max_length,
    },
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["dev"]
test_dataset = processed_datasets["test"]

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)
eval_dataloader = DataLoader(
    eval_dataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)
test_dataloader = DataLoader(
    test_dataset,
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)

global model
if saved_model1 is None or saved_model2 is None:
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
    model.config.pad_token_id = model.config.eos_token_id
    model1 = get_peft_model(model, peft_config)
    model2 = get_peft_model(model, peft_config)
else:
    model1 = saved_model1
    model2 = saved_model2
    print("Using saved model from data/models/" + model_name_or_path)
    
optimizer1 = torch.optim.AdamW(model1.parameters(), lr=lr)
optimizer2 = torch.optim.AdamW(model2.parameters(), lr=lr)
lr_scheduler1 = get_linear_schedule_with_warmup(
    optimizer=optimizer1,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),

)
lr_scheduler2 = get_linear_schedule_with_warmup(
    optimizer=optimizer2,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

model1 = model1.to(device)
model2 = model2.to(device)

model1.eval()
model2.eval()

init_test_loss1, test_preds1 = test(test_dataloader, model1, tokenizer, device)
init_test_loss2, test_preds2 = test(test_dataloader, model2, tokenizer, device)
init_test_ppl1 = torch.exp(init_test_loss1)  # Perplexity
init_test_ppl2 = torch.exp(init_test_loss2)  # Perplexity
print(f"Test before training1: {init_test_ppl1=} {init_test_loss1=}")
print(f"Test before training2: {init_test_ppl2=} {init_test_loss2=}")

for epoch in range(num_epochs):
    model1.train()
    model2.train()
    total_loss1 = 0
    total_loss2 = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        output1 = model1(**batch, output_hidden_states=True)

        inputs_embeds = output1.hidden_states[-1]
        sequence_length = inputs_embeds.shape[1]
        labels = batch['labels']
        padding = torch.zeros((labels.shape[0], sequence_length - labels.shape[1]), dtype=labels.dtype, device=labels.device)
        labels = torch.cat([labels, padding], dim=1).to(device)
        task_ids = torch.tensor([1 for i in batch["task_ids"]]).to(device)
        output2 = model2(inputs_embeds=inputs_embeds, labels=labels, task_ids=task_ids, output_hidden_states=True)
        
        loss1 = output1.loss
        loss2 = output2.loss
        total_loss1 += loss1.item()
        total_loss2 += loss2.item()
        optimizer1.zero_grad()
        optimizer2.zero_grad()
        loss1.backward(retain_graph=True)
        loss2.backward()
        optimizer1.step()
        optimizer2.step()
        lr_scheduler1.step()
        lr_scheduler2.step()

    model1.eval()
    model2.eval()
    eval_epoch_loss1, eval_preds1 = test(eval_dataloader, model1, tokenizer, device, False)
    eval_epoch_loss2, eval_preds2 = test(eval_dataloader, model2, tokenizer, device, False)
    eval_ppl1 = torch.exp(eval_epoch_loss1)
    eval_ppl2 = torch.exp(eval_epoch_loss2)
    train_epoch_loss1 = total_loss1 / len(train_dataloader)
    train_epoch_loss2 = total_loss2 / len(train_dataloader)
    train_ppl1 = torch.exp(torch.tensor(train_epoch_loss1))
    train_ppl2 = torch.exp(torch.tensor(train_epoch_loss2))
    print(
        f"{epoch=}: {train_ppl1=} {train_epoch_loss1=} {eval_ppl1=} {eval_epoch_loss1=}"
    )
    print(
        f"{epoch=}: {train_ppl2=} {train_epoch_loss2=} {eval_ppl2=} {eval_epoch_loss2=}"
    )

model1.eval()
model2.eval()
if not saved_model1:
    model1.save_pretrained("data/models/" + model_name_or_path + "/model1")
if not saved_model2:
    model2.save_pretrained("data/models/" + model_name_or_path + "/model2")

final_test_loss1, test_preds1 = test(test_dataloader, model1, tokenizer, device)
final_test_loss2, test_preds2 = test(test_dataloader, model2, tokenizer, device)
final_test_ppl1 = torch.exp(final_test_loss1)
final_test_ppl2 = torch.exp(final_test_loss2)
print(f"Test before training1: {init_test_ppl1=} {init_test_loss1=}")
print(f"Test before training2: {init_test_ppl2=} {init_test_loss2=}")
print(f"Test after training1: {final_test_ppl1=} {final_test_loss1=}")
print(f"Test after training2: {final_test_ppl2=} {final_test_loss2=}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


loaded dataset from c:\Users\chsingh\source\repos\deep-language-networks\projects/../data\bbh ...
we have 375 training, 375 dev, and 250 test data points.
1


Running tokenizer on dataset: 100%|██████████| 375/375 [00:00<00:00, 1942.91 examples/s]
Running tokenizer on dataset: 100%|██████████| 375/375 [00:00<00:00, 1314.64 examples/s]
Running tokenizer on dataset: 100%|██████████| 250/250 [00:00<00:00, 2096.20 examples/s]


Using saved model from data/models/microsoft/phi-2


100%|██████████| 63/63 [00:16<00:00,  3.75it/s]
100%|██████████| 63/63 [00:16<00:00,  3.84it/s]


Test before training1: init_test_ppl1=tensor(1.5783) init_test_loss1=tensor(0.4563)
Test before training2: init_test_ppl2=tensor(1.5783) init_test_loss2=tensor(0.4563)


100%|██████████| 94/94 [01:38<00:00,  1.05s/it]
100%|██████████| 94/94 [00:19<00:00,  4.80it/s]
100%|██████████| 94/94 [00:19<00:00,  4.79it/s]


epoch=0: train_ppl1=tensor(15.7864) train_epoch_loss1=2.7591474474744593 eval_ppl1=tensor(2.0827, device='cuda:0') eval_epoch_loss1=tensor(0.7336, device='cuda:0')
epoch=0: train_ppl2=tensor(4.6689) train_epoch_loss2=1.5409164234916581 eval_ppl2=tensor(67510.4766, device='cuda:0') eval_epoch_loss2=tensor(11.1200, device='cuda:0')


100%|██████████| 94/94 [03:59<00:00,  2.54s/it]
100%|██████████| 94/94 [02:48<00:00,  1.79s/it]
100%|██████████| 94/94 [02:47<00:00,  1.79s/it]


epoch=1: train_ppl1=tensor(2.0444) train_epoch_loss1=0.7151273609475887 eval_ppl1=tensor(1.9158, device='cuda:0') eval_epoch_loss1=tensor(0.6501, device='cuda:0')
epoch=1: train_ppl2=tensor(1.0980) train_epoch_loss2=0.09349546323906868 eval_ppl2=tensor(67494.9609, device='cuda:0') eval_epoch_loss2=tensor(11.1198, device='cuda:0')


100%|██████████| 94/94 [08:38<00:00,  5.51s/it]
100%|██████████| 94/94 [02:48<00:00,  1.79s/it]
100%|██████████| 94/94 [02:47<00:00,  1.79s/it]


epoch=2: train_ppl1=tensor(1.9723) train_epoch_loss1=0.67922324766504 eval_ppl1=tensor(1.8484, device='cuda:0') eval_epoch_loss1=tensor(0.6143, device='cuda:0')
epoch=2: train_ppl2=tensor(1.0944) train_epoch_loss2=0.09022092930179962 eval_ppl2=tensor(67479.1250, device='cuda:0') eval_epoch_loss2=tensor(11.1196, device='cuda:0')


100%|██████████| 94/94 [08:38<00:00,  5.51s/it]
100%|██████████| 94/94 [02:48<00:00,  1.79s/it]
100%|██████████| 94/94 [02:48<00:00,  1.79s/it]


epoch=3: train_ppl1=tensor(1.9331) train_epoch_loss1=0.6591103029377917 eval_ppl1=tensor(1.8113, device='cuda:0') eval_epoch_loss1=tensor(0.5940, device='cuda:0')
epoch=3: train_ppl2=tensor(1.0935) train_epoch_loss2=0.08940663707858705 eval_ppl2=tensor(67265.7500, device='cuda:0') eval_epoch_loss2=tensor(11.1164, device='cuda:0')


100%|██████████| 94/94 [08:37<00:00,  5.51s/it]
100%|██████████| 94/94 [02:47<00:00,  1.79s/it]
100%|██████████| 94/94 [02:47<00:00,  1.78s/it]


epoch=4: train_ppl1=tensor(1.8306) train_epoch_loss1=0.6046698470699027 eval_ppl1=tensor(1.7930, device='cuda:0') eval_epoch_loss1=tensor(0.5839, device='cuda:0')
epoch=4: train_ppl2=tensor(1.0898) train_epoch_loss2=0.08594910970869217 eval_ppl2=tensor(67257.6641, device='cuda:0') eval_epoch_loss2=tensor(11.1163, device='cuda:0')


100%|██████████| 94/94 [08:38<00:00,  5.51s/it]
100%|██████████| 94/94 [02:47<00:00,  1.78s/it]
100%|██████████| 94/94 [02:47<00:00,  1.78s/it]


epoch=5: train_ppl1=tensor(1.8501) train_epoch_loss1=0.615257886495996 eval_ppl1=tensor(1.7444, device='cuda:0') eval_epoch_loss1=tensor(0.5564, device='cuda:0')
epoch=5: train_ppl2=tensor(1.0870) train_epoch_loss2=0.08343937537296021 eval_ppl2=tensor(67231.2422, device='cuda:0') eval_epoch_loss2=tensor(11.1159, device='cuda:0')


100%|██████████| 94/94 [08:37<00:00,  5.50s/it]
100%|██████████| 94/94 [02:48<00:00,  1.80s/it]
100%|██████████| 94/94 [02:56<00:00,  1.88s/it]


epoch=6: train_ppl1=tensor(1.7819) train_epoch_loss1=0.5776811990332096 eval_ppl1=tensor(1.6802, device='cuda:0') eval_epoch_loss1=tensor(0.5189, device='cuda:0')
epoch=6: train_ppl2=tensor(1.0864) train_epoch_loss2=0.08289719616716847 eval_ppl2=tensor(67202.2734, device='cuda:0') eval_epoch_loss2=tensor(11.1155, device='cuda:0')


100%|██████████| 94/94 [09:09<00:00,  5.85s/it]
100%|██████████| 94/94 [02:46<00:00,  1.77s/it]
100%|██████████| 94/94 [03:01<00:00,  1.93s/it]


epoch=7: train_ppl1=tensor(1.7458) train_epoch_loss1=0.5572342584107784 eval_ppl1=tensor(1.7560, device='cuda:0') eval_epoch_loss1=tensor(0.5630, device='cuda:0')
epoch=7: train_ppl2=tensor(1.0859) train_epoch_loss2=0.08240805475160162 eval_ppl2=tensor(67240.1562, device='cuda:0') eval_epoch_loss2=tensor(11.1160, device='cuda:0')


100%|██████████| 94/94 [09:25<00:00,  6.01s/it]
100%|██████████| 94/94 [03:03<00:00,  1.95s/it]
100%|██████████| 94/94 [03:02<00:00,  1.95s/it]


epoch=8: train_ppl1=tensor(1.7323) train_epoch_loss1=0.5494530758959182 eval_ppl1=tensor(1.7612, device='cuda:0') eval_epoch_loss1=tensor(0.5660, device='cuda:0')
epoch=8: train_ppl2=tensor(1.0833) train_epoch_loss2=0.07999642974043146 eval_ppl2=tensor(67196.2500, device='cuda:0') eval_epoch_loss2=tensor(11.1154, device='cuda:0')


100%|██████████| 94/94 [13:05<00:00,  8.36s/it]
100%|██████████| 94/94 [04:10<00:00,  2.66s/it]
100%|██████████| 94/94 [03:57<00:00,  2.53s/it]


epoch=9: train_ppl1=tensor(1.7063) train_epoch_loss1=0.5343095179884991 eval_ppl1=tensor(1.7515, device='cuda:0') eval_epoch_loss1=tensor(0.5604, device='cuda:0')
epoch=9: train_ppl2=tensor(1.0834) train_epoch_loss2=0.08012474847442293 eval_ppl2=tensor(67184.0703, device='cuda:0') eval_epoch_loss2=tensor(11.1152, device='cuda:0')


100%|██████████| 63/63 [02:42<00:00,  2.58s/it]
100%|██████████| 63/63 [02:42<00:00,  2.59s/it]

Test before training1: init_test_ppl1=tensor(1.5783) init_test_loss1=tensor(0.4563)
Test before training2: init_test_ppl2=tensor(1.5783) init_test_loss2=tensor(0.4563)
Test after training1: final_test_ppl1=tensor(1.2688) final_test_loss1=tensor(0.2381)
Test after training2: final_test_ppl2=tensor(1.5783) final_test_loss2=tensor(0.4563)





In [10]:
correct = 0
total = 0
for pred, label in zip(test_preds1,  dataset['test']['label']):
    if pred.strip() == label.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100

print(f"{accuracy=}% on the test dataset")
print(f"{test_preds1[:10]=}")
print(f"{dataset['test']['label'][:10]=}")

"accuracy=80.4% on the test dataset"
"test_preds[:10]=['Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes']"
"dataset['test']['label'][:10]=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']"

accuracy=76.0% on the test dataset
test_preds1[:10]=['No', 'No', 'Yes', 'No', 'No', 'Yes', 'No', 'Yes', 'No', 'No']
dataset['test']['label'][:10]=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']


"dataset['test']['label'][:10]=['No', 'No', 'No', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'No']"