# FLAN-T5 Prefix-Tuning
This notebook is made to prefix-tune FLAN-T5 on Hateeval.

### 1. Imports and Constants

In [2]:
!pip install peft

Collecting peft
  Obtaining dependency information for peft from https://files.pythonhosted.org/packages/8b/1b/aee2a330d050c493642d59ba6af51f3910cb138ea48ede228c84c204a5af/peft-0.7.1-py3-none-any.whl.metadata
  Downloading peft-0.7.1-py3-none-any.whl.metadata (25 kB)
Downloading peft-0.7.1-py3-none-any.whl (168 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.3/168.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.7.1


In [3]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, PrefixTuningConfig, TaskType
import os
from datasets import Dataset,DatasetDict, load_dataset
from torch.utils.data import DataLoader
import torch
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from datasets import Features, ClassLabel

features = ['Non-Hate','Hate']

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

device = "cuda"
model_name_or_path = "google/flan-t5-xl"
tokenizer_name_or_path = "google/flan-t5-xl"

text_column = "text"
label_column = "text_label"
max_length = 128
min_length = 0
lr = 1e-2
num_epochs = 5
batch_size = 8



### 2. Loading Dataset

In [4]:
df = pd.read_csv('/kaggle/input/hateeval/hateeval_preprocessed_data.csv')
df = df.dropna()
train, test = train_test_split(df, test_size=0.2)
train = Dataset.from_pandas(train)
test = Dataset.from_pandas(test)
ds_dict = {'train' : train,
           'test' : test}
dataset = DatasetDict(ds_dict)

### 3. Preprocessing Data
Initialize a tokenizer, and create a function to pad and truncate the model_inputs and labels:

In [5]:
dataset = dataset.map(
    lambda x: {"text_label": [features[label] for label in x["HS"]]},
    batched=True,
    num_proc=1,
)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)


def preprocess_function(examples):
    inputs = examples[text_column]
    targets = examples[label_column]
    model_inputs = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    model_inputs['input_ids'] = model_inputs['input_ids'].reshape(128)
    labels = tokenizer(targets, max_length=10, padding="max_length", truncation=True, return_tensors="pt")
    labels = labels["input_ids"]
    labels[labels == tokenizer.pad_token_id] = -100
    model_inputs["labels"] = labels.reshape(10)
    return model_inputs

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Use the map function to apply the preprocess_function to the dataset. You can remove the unprocessed columns since the model doesn’t need them anymore:

In [7]:
processed_datasets = dataset.map(
    preprocess_function,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/7982 [00:00<?, ?ex/s]

Running tokenizer on dataset:   0%|          | 0/1996 [00:00<?, ?ex/s]

Create a DataLoader from the train and eval datasets. Set pin_memory=True to speed up the data transfer to the GPU during training if the samples in your dataset are on a CPU.

In [8]:
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]
train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)

### 4. Train model
Now you can setup your model and make sure it is ready for training. Specify the task in PrefixTuningConfig, create the base t5-large model from AutoModelForSeq2SeqLM, and then wrap the model and configuration in a PeftModel. Feel free to print the PeftModel’s parameters and compare it to fully training all the model parameters to see how much more efficient it is!

In [9]:
print(batch_size)

for step, batch in enumerate(tqdm(train_dataloader)):
#     batch = {k: v.to(device) for k, v in batch.items()}
    input_shape = batch['input_ids'].size()
    batch_size, seq_length = input_shape
    print(batch_size, seq_length)
    break

8


  0%|          | 0/998 [00:00<?, ?it/s]

8 128


In [10]:
peft_config = PrefixTuningConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, num_virtual_tokens=20)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 1,966,080 || all params: 2,851,723,264 || trainable%: 0.06894357614638445


Setup the optimizer and learning rate scheduler:


In [11]:
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

Move the model to the GPU, and then write a training loop to begin!


In [12]:
import torch
torch.cuda.is_available()

True

In [13]:
model = model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/998 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

epoch=0: train_ppl=tensor(1.2291, device='cuda:0') train_epoch_loss=tensor(0.2063, device='cuda:0') eval_ppl=tensor(1.1329, device='cuda:0') eval_epoch_loss=tensor(0.1247, device='cuda:0')


  0%|          | 0/998 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

epoch=1: train_ppl=tensor(1.1454, device='cuda:0') train_epoch_loss=tensor(0.1357, device='cuda:0') eval_ppl=tensor(1.1465, device='cuda:0') eval_epoch_loss=tensor(0.1367, device='cuda:0')


  0%|          | 0/998 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

epoch=2: train_ppl=tensor(1.1287, device='cuda:0') train_epoch_loss=tensor(0.1211, device='cuda:0') eval_ppl=tensor(1.1066, device='cuda:0') eval_epoch_loss=tensor(0.1013, device='cuda:0')


  0%|          | 0/998 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

epoch=3: train_ppl=tensor(1.1195, device='cuda:0') train_epoch_loss=tensor(0.1129, device='cuda:0') eval_ppl=tensor(1.1127, device='cuda:0') eval_epoch_loss=tensor(0.1068, device='cuda:0')


  0%|          | 0/998 [00:00<?, ?it/s]

  0%|          | 0/250 [00:00<?, ?it/s]

epoch=4: train_ppl=tensor(1.1139, device='cuda:0') train_epoch_loss=tensor(0.1079, device='cuda:0') eval_ppl=tensor(1.1053, device='cuda:0') eval_epoch_loss=tensor(0.1001, device='cuda:0')


Let’s see how well the model performs on the validation set:


In [15]:
correct = 0
total = 0
for pred, true in zip(eval_preds, dataset["test"]["text_label"]):
    if pred.strip() == true.strip():
        correct += 1
    total += 1
accuracy = correct / total * 100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['test']['text_label'][:10]=}")

accuracy=0.0501002004008016 % on the evaluation dataset
eval_preds[:10]=['Non-Hate Hat Hat Non ', 'Hate Hat Hat Hat Hat Hat Hat Hat', 'Non-Hate Non Immigration Immigration Immigration', 'Hate Hat Hat Hat Hat Hat Hat Hat', 'Non-Hate Hat Hat Hat Non', 'Hat-Hate Hat Hat Hat Hat Hat', 'Hate Hat Hat Hat Hat Hat Hat ', 'Hate Hat Hat Hat Hat Hat Hat Hat', 'Hate Hat Hat Hat Hat Hat Hat Hat', 'Hate Hat Hat Hat Hat Hat Hat Hat']
dataset['test']['text_label'][:10]=['Non-Hate', 'Hate', 'Non-Hate', 'Hate', 'Non-Hate', 'Non-Hate', 'Hate', 'Hate', 'Hate', 'Hate']
