# PEFT fine-tuning with Hugging Face Datasets and Models
M. Austin
Trained on Google Colab using an L4 GPU
5/19/24

In [None]:
# Install the required version of datasets in case you have an older version
# You will need to choose "Kernel > Restart Kernel" from the menu after executing this cell
! pip install -q "datasets==2.15.0"
! pip install transformers[torch]
#! pip install accelerate
! pip install accelerate -U
! pip install peft



In [None]:
# Import the Hugging Face `datasets` loader

from datasets import load_dataset

# Load the "rotten_tomatoes" dataset; the result is a DatasetDict with
# train / validation / test splits, each holding 'text' and 'label' columns.
# Swap in your favorite Hugging Face dataset name here to try other data.
dataset=load_dataset("rotten_tomatoes")



Downloading readme:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/699k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [None]:
dataset
# Displays the DatasetDict: its splits, their feature names and their row counts

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [None]:
# Peek at the first validation example: a dict with the review 'text' and its integer 'label'
dataset["validation"][0]

{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .',
 'label': 1}

In [None]:
from transformers import AutoTokenizer

# Alternative checkpoint (disabled):
#tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# GPT-2's tokenizer does not define a padding token, so reuse the
# end-of-sequence token; without this, padding a batch would fail.
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples):
    """Tokenize a batch of rotten_tomatoes examples.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``;
            only its ``'text'`` column (a list of strings) is read.

    Returns:
        The tokenizer output for the batch (``input_ids`` and
        ``attention_mask``), truncated to the tokenizer's maximum length
        but not padded.
    """
    # No padding here: the DataCollatorWithPadding passed to the Trainer pads
    # each batch to its longest member, which is far cheaper than padding
    # every example to the tokenizer's maximum length up front.
    return tokenizer(examples['text'], truncation=True)

# Tokenize only the splits that training and evaluation use
splits = ["train", "validation"]
tokenized_ds = {
    name: dataset[name].map(preprocess_function, batched=True)
    for name in splits
}

# Sanity check: token ids of the first training example ...
print(tokenized_ds["train"][0]["input_ids"])
# ... and the attention mask of the first validation example
print(tokenized_ds["validation"][0]["attention_mask"])

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

[1169, 3881, 318, 23985, 284, 307, 262, 2310, 301, 4289, 338, 649, 366, 369, 272, 366, 290, 326, 339, 338, 1016, 284, 787, 257, 22870, 772, 3744, 621, 610, 77, 727, 5513, 5767, 89, 44028, 837, 474, 11025, 12, 565, 3885, 5719, 1801, 1326, 393, 2876, 574, 384, 13528, 764, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 5

In [None]:
from transformers import AutoModelForSequenceClassification

seq_class_model = 'gpt2'

# The label mapping has to match the dataset, which uses labels 0 and 1.
# For a three-class dataset, extend the mapping (e.g. 2: "INDIFFERENT").
id2label = {0: "NEGATIVE", 1: "POSITIVE"}  # For converting predictions to strings
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    seq_class_model,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Register the padding token id (the tokenizer's EOS id) in the model config
# so the model can handle padded batches.
model.config.pad_token_id = tokenizer.eos_token_id

# Freeze every parameter of the base model (the `transformer` sub-module);
# the `score` head sits outside base_model and therefore stays trainable.
# The trailing semicolon suppresses the module repr this call would otherwise display.
model.base_model.requires_grad_(False);

In [None]:
# Print the module tree to inspect the architecture, including the `score` classification head
print(model)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)


In [None]:
# Show the current working directory; the relative paths used below resolve against it
!pwd

/content


In [None]:
# Baseline model: train only the classification head (the base model was frozen above)
# for one epoch, to get a reference accuracy to compare the PEFT run against.
import numpy as np
from transformers import DataCollatorWithPadding, Trainer, TrainingArguments


def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, where predictions holds
            one row of per-class scores per example and labels holds the
            true class ids.

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of examples
        whose highest-scoring class equals the label.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    hits = predicted_classes == references
    return {"accuracy": hits.mean()}


# The Hugging Face Trainer class runs the PyTorch training and eval loops for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
baseline_args = TrainingArguments(
    output_dir="./sample_data/",
    learning_rate=2e-3,
    # Reduce both batch sizes if you don't have enough memory
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, then restore the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=baseline_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5014,0.426374,0.797373


TrainOutput(global_step=2133, training_loss=0.6108710373504252, metrics={'train_runtime': 582.3119, 'train_samples_per_second': 14.649, 'train_steps_per_second': 3.663, 'total_flos': 4457722565099520.0, 'train_loss': 0.6108710373504252, 'epoch': 1.0})

In [None]:
# Show the performance of the baseline model on the validation set
# (with no arguments, evaluate() uses the eval_dataset given to the Trainer).
# What do you think the evaluation accuracy will be?
trainer.evaluate()

{'eval_loss': 0.42637374997138977,
 'eval_accuracy': 0.797373358348968,
 'eval_runtime': 58.1326,
 'eval_samples_per_second': 18.337,
 'eval_steps_per_second': 4.593,
 'epoch': 1.0}

In [None]:
import pandas as pd

# Keep only the human-readable columns. The explicit .copy() gives an
# independent frame, so the column assignments below never write to a view.
df = pd.DataFrame(tokenized_ds["validation"])[["text", "label"]].copy()

# Replace <br /> tags in the text with spaces.
# regex=False pins literal matching regardless of the installed pandas version.
df["text"] = df["text"].str.replace("<br />", " ", regex=False)

# Add the model predictions to the dataframe: the predicted class is the
# index of the highest score in each row of the prediction array.
predictions = trainer.predict(tokenized_ds["validation"])
df["predicted_label"] = np.argmax(predictions.predictions, axis=1)

df.head(20)

Unnamed: 0,text,label,predicted_label
0,compassionately explores the seemingly irrecon...,1,1
1,the soundtrack alone is worth the price of adm...,1,1
2,rodriguez does a splendid job of racial profil...,1,1
3,beneath the film's obvious determination to sh...,1,1
4,bielinsky is a filmmaker of impressive talent .,1,1
5,"so beautifully acted and directed , it's clear...",1,1
6,a visual spectacle full of stunning images and...,1,1
7,a gentle and engrossing character study .,1,1
8,"it's enough to watch huppert scheming , with h...",1,1
9,an engrossing portrait of uncompromising artis...,1,1


In [None]:
# Confusion matrix: rows are the true labels, columns are the predicted labels.
# crosstab counts each (label, predicted_label) pair directly, giving plain
# single-level columns and 0 (rather than NaN) for pairs that never occur.
pd.crosstab(df["label"], df["predicted_label"])


Unnamed: 0_level_0,text,text
predicted_label,0,1
label,Unnamed: 1_level_2,Unnamed: 2_level_2
0,418,115
1,101,432


In [None]:
# Save the baseline model to the local 'gpt2_base_model' directory;
# the PEFT section below loads it from there as its starting point.
model.save_pretrained('gpt2_base_model')


# Now do some PEFT (LoRA) training!

In [None]:
from transformers import AutoModelForSeq2SeqLM,AutoModelForCausalLM,AutoModelForSequenceClassification
from peft import PeftModelForCausalLM


from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
# Start from the baseline classifier saved earlier instead of the raw hub
# checkpoint ("openai-community/gpt2"), so LoRA builds on the trained `score` head.
model_name_or_path = "gpt2_base_model"

# LoRA configuration for a sequence-classification task
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # GPT-2's projection layers are Conv1D modules, whose weights are stored
    # as (fan_in, fan_out); say so explicitly instead of relying on PEFT to
    # detect it and override the setting with a warning.
    fan_in_fan_out=True,
)

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
model = get_peft_model(model, peft_config)
# Report the trainable parameter count against the total
model.print_trainable_parameters()

trainable params: 296,448 || all params: 124,737,792 || trainable%: 0.2377




'trainable params: 2359296 || all params: 1231940608 || trainable%: 0.19151053100118282'

In [None]:
# Here we train the PEFT model.
# The Hugging Face Trainer class runs the PyTorch training and eval loops for us.
# Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
peft_args = TrainingArguments(
    output_dir="./sample_data/",
    # Learning rate for the PEFT run
    learning_rate=2e-5,
    # Per-device batch sizes for training and evaluation
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    # Evaluate and save the model after each epoch, then restore the best checkpoint
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=peft_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6025,0.494131,0.84803
2,0.544,0.492604,0.854597


TrainOutput(global_step=5688, training_loss=0.5529872073402887, metrics={'train_runtime': 2524.3905, 'train_samples_per_second': 6.758, 'train_steps_per_second': 2.253, 'total_flos': 8946517813493760.0, 'train_loss': 0.5529872073402887, 'epoch': 2.0})

In [None]:
# Show the performance of the PEFT model on the validation set
# (with no arguments, evaluate() uses the eval_dataset given to the Trainer).
# What do you think the evaluation accuracy will be?
trainer.evaluate()

{'eval_loss': 0.4926040470600128,
 'eval_accuracy': 0.8545966228893058,
 'eval_runtime': 59.9149,
 'eval_samples_per_second': 17.792,
 'eval_steps_per_second': 5.942,
 'epoch': 2.0}

In [None]:
import pandas as pd

# Keep only the human-readable columns. The explicit .copy() gives an
# independent frame, so the column assignments below never write to a view.
df = pd.DataFrame(tokenized_ds["validation"])[["text", "label"]].copy()

# Replace <br /> tags in the text with spaces.
# regex=False pins literal matching regardless of the installed pandas version.
df["text"] = df["text"].str.replace("<br />", " ", regex=False)

# Add the PEFT model's predictions to the dataframe: the predicted class is
# the index of the highest score in each row of the prediction array.
predictions = trainer.predict(tokenized_ds["validation"])
df["predicted_label"] = np.argmax(predictions.predictions, axis=1)

df.head(20)

Unnamed: 0,text,label,predicted_label
0,compassionately explores the seemingly irrecon...,1,1
1,the soundtrack alone is worth the price of adm...,1,1
2,rodriguez does a splendid job of racial profil...,1,1
3,beneath the film's obvious determination to sh...,1,1
4,bielinsky is a filmmaker of impressive talent .,1,1
5,"so beautifully acted and directed , it's clear...",1,1
6,a visual spectacle full of stunning images and...,1,1
7,a gentle and engrossing character study .,1,1
8,"it's enough to watch huppert scheming , with h...",1,1
9,an engrossing portrait of uncompromising artis...,1,1
