In [None]:
%%capture
!pip install -q peft transformers datasets

# Dataset

## Load

In [None]:
from datasets import load_dataset

# Download the "twitter_complaints" configuration of the RAFT benchmark
# (a DatasetDict with a train and a test split).
ds = load_dataset(path="ought/raft", name="twitter_complaints")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/15.2k [00:00<?, ?B/s]

raft.py:   0%|          | 0.00/11.9k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/6.72k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/266k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [None]:
# Get the list of class names; the position in the list is the integer label id
classes = list(ds["train"].features["Label"].names)
classes

['Unlabeled', 'complaint', 'no complaint']

In [None]:
# Add a "text_label" column that holds the class name of each integer "Label".
# `.map` with batched=True passes the function a batch of rows at a time
# (1000 rows by default; pass batch_size=<n> to change that).
def add_text_label(batch):
    """Return the class name for every integer label in the batch."""
    return {"text_label": [classes[label] for label in batch["Label"]]}


ds = ds.map(add_text_label, batched=True, num_proc=1)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [None]:
# Look at the first training example, which now carries the text_label column
ds["train"][0]

{'Tweet text': '@HMRCcustomers No this is my first job',
 'ID': 0,
 'Label': 2,
 'text_label': 'no complaint'}

## Load tokenizer

In [None]:
from transformers import AutoTokenizer

# bloomz-560m is a multilingual language model optimized for zero-shot and few-shot learning.
checkpoint = "bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [None]:
# Some tokenizers (especially for decoder models) ship without a padding token.
# When pad_token_id is None, reuse eos_token_id (End-of-Sequence) so that
# padding works during batch tokenization.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Tokenize every class label once to find the longest label (in tokens), so we
# know how many tokens a label can occupy when labels are tokenized later.
label_token_counts = (len(tokenizer(name)["input_ids"]) for name in classes)
target_max_length = max(label_token_counts)
print(target_max_length)

3


In [None]:
# Print the tokenizer output (input_ids + attention_mask) for each class label
for encoding in map(tokenizer, classes):
    print(encoding)

# tokenizer(classes)

{'input_ids': [3074, 4762, 60943], 'attention_mask': [1, 1, 1]}
{'input_ids': [16449, 5952], 'attention_mask': [1, 1]}
{'input_ids': [1936, 106863], 'attention_mask': [1, 1]}


- `input_ids` and `attention_mask` are key outputs of a tokenizer in transformers.

- `input_ids`: the numerical token IDs that represent the input text.
- Every word or subword in the input text is converted into a corresponding token ID from the tokenizer's vocabulary.

- `attention_mask`: tells the model which tokens to pay attention to and which ones to ignore (like padding tokens).
- 1 means the token should be attended to (real words).
- 0 means the token should be ignored (padding token).


## Preprocessing function

In [None]:
import torch

max_length = 64

# The preprocessing function below tokenizes the tweet prompt and its text label,
# joins them into one causal-LM training sequence, left-pads / truncates every
# sequence to max_length, builds the attention mask, and converts input_ids,
# attention_mask and labels to PyTorch tensors.


def preprocess_function(examples, text_column="Tweet text", label_column="text_label"):
    """Build fixed-length causal-LM features from a batch of tweets and labels.

    Each example becomes the token sequence ``prompt + label (+ EOS)`` where the
    prompt is ``"<text_column> : <tweet> Label : "``. Only the label (and EOS)
    positions are supervised; prompt and padding positions are set to -100,
    which the cross-entropy loss ignores. Because a causal LM predicts token
    ``t + 1`` from tokens ``<= t``, the label tokens must be part of
    ``input_ids``; otherwise the position that ``generate`` continues from is
    never trained.

    Relies on the notebook-level ``tokenizer`` and ``max_length``.

    Args:
        examples: Batch mapping column name -> list of values.
        text_column: Column holding the tweet text.
        label_column: Column holding the label as text.

    Returns:
        The tokenizer output for the prompts, with "input_ids",
        "attention_mask" and "labels" replaced by lists of 1-D tensors, each of
        length ``max_length``.
    """
    prompts = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(prompts)
    target_ids = tokenizer(targets)["input_ids"]

    # Close every answer with EOS (when the tokenizer has one) so the model
    # also learns where the label ends.
    eos_id = tokenizer.eos_token_id
    terminator = [] if eos_id is None else [eos_id]

    input_rows, mask_rows, label_rows = [], [], []
    for prompt_ids, answer_ids in zip(model_inputs["input_ids"], target_ids):
        prompt_ids = list(prompt_ids)
        answer_ids = list(answer_ids) + terminator

        # Keep the LAST max_length tokens: an over-long tweet loses its
        # beginning rather than the "Label : " cue and the answer.
        token_ids = (prompt_ids + answer_ids)[-max_length:]
        supervised = ([-100] * len(prompt_ids) + answer_ids)[-max_length:]

        # Left-pad so the answer always sits at the end of the sequence.
        pad = max_length - len(token_ids)
        input_rows.append(torch.tensor([tokenizer.pad_token_id] * pad + token_ids))
        mask_rows.append(torch.tensor([0] * pad + [1] * len(token_ids)))
        # -100 is ignored by the loss function (CrossEntropyLoss).
        label_rows.append(torch.tensor([-100] * pad + supervised))

    model_inputs["input_ids"] = input_rows
    model_inputs["attention_mask"] = mask_rows
    model_inputs["labels"] = label_rows
    return model_inputs

In [None]:
# Tokenize every split and drop the raw columns, keeping only model features.
map_options = dict(
    batched=True,  # default batch size is 1000
    num_proc=1,
    remove_columns=ds["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
processed_ds = ds.map(preprocess_function, **map_options)

Running tokenizer on dataset:   0%|          | 0/50 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/3399 [00:00<?, ? examples/s]

In [None]:
# Display the processed DatasetDict (splits, feature names and row counts)
processed_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 50
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3399
    })
})

## Create training - evaluation DataLoader

In [None]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

In [None]:
train_ds, eval_ds = processed_ds["train"], processed_ds["test"]

batch_size = 16
# Set pin_memory=True to speed up the data transfer to the GPU during training
# if the samples in your dataset are on a CPU.
loader_kwargs = dict(collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
train_dataloader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(eval_ds, **loader_kwargs)

# Model

In [None]:
from transformers import AutoModelForCausalLM

# Load the pretrained base causal language model that will be prompt-tuned
base_checkpoint = "bigscience/bloomz-560m"
model = AutoModelForCausalLM.from_pretrained(base_checkpoint)

config.json:   0%|          | 0.00/715 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

## PEFT Config
- prompt tuning

In [None]:
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model

In [None]:
# task_type: the kind of task being fine-tuned --> CAUSAL_LM
#   (autoregressive models like GPT generate text sequentially, token by token).
#
# prompt_tuning_init: how the initial prompt (the virtual tokens) is set up.
#   PromptTuningInit.TEXT initializes it from a text string
#   (prompt_tuning_init_text).
#
# num_virtual_tokens: how many virtual tokens the prompt has. These act as
#   placeholders that are fine-tuned to adapt the model to the task. Here it is
#   the number of tokens the tokenizer produces for prompt_tuning_init_text.
prompt_tuning_init_text = "Classify if the tweet is a complaint or no complaint.\n"
num_virtual_tokens = len(tokenizer(prompt_tuning_init_text)["input_ids"])

peft_config = PromptTuningConfig(
    tokenizer_name_or_path="bigscience/bloomz-560m",
    prompt_tuning_init_text=prompt_tuning_init_text,
    num_virtual_tokens=num_virtual_tokens,
    prompt_tuning_init=PromptTuningInit.TEXT,
    task_type="CAUSAL_LM",
)

In [None]:
# Wrap the base model with the prompt-tuning configuration and report how many
# parameters are trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# "trainable params: 12,288 || all params: 559,226,880 || trainable%: 0.0022"

trainable params: 12,288 || all params: 559,226,880 || trainable%: 0.0022


'trainable params: 8,192 || all params: 559,222,784 || trainable%: 0.0014648902430985358'

## Training

In [None]:
from transformers import get_linear_schedule_with_warmup

lr = 3e-2
num_epochs = 10
# One scheduler step is taken per training batch
num_training_steps = len(train_dataloader) * num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Linear learning-rate schedule (configured with zero warmup steps) to improve
# convergence and training stability
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from tqdm import tqdm

# Train on the GPU when one is available; fall back to the CPU instead of
# crashing on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

for epoch in range(num_epochs):
    # ---- Training pass: one optimizer + scheduler step per batch ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # detach() so the running sum does not keep the autograd graph alive
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- Evaluation pass: no gradients, collect loss and greedy predictions ----
    # NOTE(review): the recorded eval loss stays far above the train loss;
    # presumably the test split carries placeholder labels -- confirm before
    # relying on eval_ppl.
    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Decode the arg-max token at every position back to text
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Mean loss per batch and the corresponding perplexity (exp of the loss)
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

100%|██████████| 4/4 [00:03<00:00,  1.06it/s]
100%|██████████| 213/213 [01:33<00:00,  2.27it/s]


epoch=0: train_ppl=tensor(1051.1000, device='cuda:0') train_epoch_loss=tensor(6.9576, device='cuda:0') eval_ppl=tensor(3408664.2500, device='cuda:0') eval_epoch_loss=tensor(15.0418, device='cuda:0')


100%|██████████| 4/4 [00:02<00:00,  1.36it/s]
100%|██████████| 213/213 [01:40<00:00,  2.11it/s]


epoch=1: train_ppl=tensor(4.7434, device='cuda:0') train_epoch_loss=tensor(1.5568, device='cuda:0') eval_ppl=tensor(3.8773e+09, device='cuda:0') eval_epoch_loss=tensor(22.0784, device='cuda:0')


100%|██████████| 4/4 [00:02<00:00,  1.38it/s]
100%|██████████| 213/213 [01:40<00:00,  2.12it/s]


epoch=2: train_ppl=tensor(2.2098, device='cuda:0') train_epoch_loss=tensor(0.7929, device='cuda:0') eval_ppl=tensor(3.0441e+10, device='cuda:0') eval_epoch_loss=tensor(24.1391, device='cuda:0')


100%|██████████| 4/4 [00:02<00:00,  1.38it/s]
100%|██████████| 213/213 [01:40<00:00,  2.12it/s]


epoch=3: train_ppl=tensor(3.5528, device='cuda:0') train_epoch_loss=tensor(1.2677, device='cuda:0') eval_ppl=tensor(9.4836e+09, device='cuda:0') eval_epoch_loss=tensor(22.9728, device='cuda:0')


100%|██████████| 4/4 [00:02<00:00,  1.39it/s]
100%|██████████| 213/213 [01:40<00:00,  2.12it/s]


epoch=4: train_ppl=tensor(3.0830, device='cuda:0') train_epoch_loss=tensor(1.1259, device='cuda:0') eval_ppl=tensor(1.0590e+09, device='cuda:0') eval_epoch_loss=tensor(20.7806, device='cuda:0')


100%|██████████| 4/4 [00:02<00:00,  1.38it/s]
100%|██████████| 213/213 [01:41<00:00,  2.11it/s]


epoch=5: train_ppl=tensor(2.1364, device='cuda:0') train_epoch_loss=tensor(0.7591, device='cuda:0') eval_ppl=tensor(1.6723e+08, device='cuda:0') eval_epoch_loss=tensor(18.9349, device='cuda:0')


100%|██████████| 4/4 [00:02<00:00,  1.38it/s]
100%|██████████| 213/213 [01:40<00:00,  2.11it/s]


epoch=6: train_ppl=tensor(1.9304, device='cuda:0') train_epoch_loss=tensor(0.6577, device='cuda:0') eval_ppl=tensor(1.4392e+08, device='cuda:0') eval_epoch_loss=tensor(18.7848, device='cuda:0')


100%|██████████| 4/4 [00:02<00:00,  1.38it/s]
100%|██████████| 213/213 [01:40<00:00,  2.11it/s]


epoch=7: train_ppl=tensor(1.8884, device='cuda:0') train_epoch_loss=tensor(0.6357, device='cuda:0') eval_ppl=tensor(2.1237e+08, device='cuda:0') eval_epoch_loss=tensor(19.1738, device='cuda:0')


100%|██████████| 4/4 [00:02<00:00,  1.38it/s]
100%|██████████| 213/213 [01:41<00:00,  2.11it/s]


epoch=8: train_ppl=tensor(1.6362, device='cuda:0') train_epoch_loss=tensor(0.4924, device='cuda:0') eval_ppl=tensor(3.4546e+08, device='cuda:0') eval_epoch_loss=tensor(19.6604, device='cuda:0')


100%|██████████| 4/4 [00:02<00:00,  1.38it/s]
100%|██████████| 213/213 [01:40<00:00,  2.12it/s]

epoch=9: train_ppl=tensor(1.6366, device='cuda:0') train_epoch_loss=tensor(0.4926, device='cuda:0') eval_ppl=tensor(4.1558e+08, device='cuda:0') eval_epoch_loss=tensor(19.8452, device='cuda:0')





## Share our model

In [None]:
# from huggingface_hub import login

# # Replace 'your_token' with your actual Hugging Face token
# login(token="your HF key - WRITE")
# account = "UsernameNguyen"
# peft_model_id = f"{account}/bloomz-560-m-peft-method"
# model.push_to_hub(peft_model_id)

adapter_model.safetensors:   0%|          | 0.00/49.3k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/UsernameNguyen/bloomz-560-m-peft-method/commit/3ae8f47a3aa0ae3393b29ba62cd027f4510c0f16', commit_message='Upload model', commit_description='', oid='3ae8f47a3aa0ae3393b29ba62cd027f4510c0f16', pr_url=None, repo_url=RepoUrl('https://huggingface.co/UsernameNguyen/bloomz-560-m-peft-method', endpoint='https://huggingface.co', repo_type='model', repo_id='UsernameNguyen/bloomz-560-m-peft-method'), pr_revision=None, pr_num=None)

# Inference

In [None]:
# Imports are repeated here so the inference section also runs on a fresh
# kernel, without executing the training cells first.
import torch
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# Use the GPU when available; otherwise run inference on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the prompt-tuned adapter from the Hub together with the matching tokenizer
model = AutoPeftModelForCausalLM.from_pretrained("UsernameNguyen/bloomz-560-m-peft-method").to(device)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")

In [None]:
i = 15
text_column = "Tweet text"
label_column = "text_label"

# Build the same prompt format used during training for one test tweet
tweet = ds["test"][i]["Tweet text"]
inputs = tokenizer(f"{text_column} : {tweet} Label : ", return_tensors="pt")
print(tweet)

@NYTsupport i have complained a dozen times &amp; yet my papers are still thrown FAR from my door. Why is this so hard to resolve?


In [None]:
# Move the prompt tensors to the model's device, generate up to 10 new tokens
# without tracking gradients, then decode the full sequence back to text.
inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
decoded = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)
print(decoded)

['Tweet text : @NYTsupport i have complained a dozen times &amp; yet my papers are still thrown FAR from my door. Why is this so hard to resolve? Label :  complaint']


# P-tuning - Prefix tuning

P-tuning adds a trainable embedding tensor where the prompt tokens can be added anywhere in the input sequence.

 Create a PromptEncoderConfig with the task type, the number of virtual tokens to add and learn, and the hidden size of the encoder for learning the prompt parameters.

In [None]:
# from peft import PromptEncoderConfig, get_peft_model

# peft_config = PromptEncoderConfig(task_type="CAUSAL_LM", num_virtual_tokens=20, encoder_hidden_size=128)
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()
# "trainable params: 300,288 || all params: 559,514,880 || trainable%: 0.05366935013417338"
# Then train like prompt tuning

Prefix tuning adds task-specific parameters in **all of the model layers**, which are optimized by a separate feed-forward network. Create a PrefixTuningConfig with the task type and number of virtual tokens to add and learn.

In [None]:
# from peft import PrefixTuningConfig, get_peft_model

# peft_config = PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=20)
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()
# "trainable params: 983,040 || all params: 560,197,632 || trainable%: 0.1754809274167014"

# **Difference between prompt tuning and p-tuning and prefix_tuning**

`Prompt tuning:`
- Prompt tokens are fixed at the start.
- They are directly converted to embeddings.
- These prompt embeddings are then concatenated with the embeddings of the tokenized input before being passed to the base model.

`p-tuning:`
- [ V1, V2, V3, ... V20, "I am very unhappy with the service.", V21, V22 ] (the virtual tokens can be placed anywhere in the input sequence, not only at the start)
- V1, V2, ..., V20 are trainable tokens.
- These pass through the Prompt Encoder (hidden size = 128), a separate network, and the resulting embeddings are concatenated with the embeddings of the input tokens.
- They generate better embeddings that guide the model.

`prefix-tuning:`
- Adds trainable prefix embeddings at every layer of the model.
- Uses a small feed-forward network (MLP) to generate these prefix embeddings.

**Colab:** https://colab.research.google.com/drive/1-fD8TEuu2ou7zilM6RucCoQgldVTOkV4?usp=sharing

**Reference:** https://huggingface.co/docs/peft/task_guides/prompt_based_methods?configurations=p-tuning