# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [1]:
#import libs
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
        Trainer, TrainingArguments, DataCollatorWithPadding

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#load gpt2 model and configure tokenizer
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
tokenizer.pad_token = tokenizer.eos_token


model_gpt = AutoModelForSequenceClassification.from_pretrained(
    "openai-community/gpt2",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},  # For converting predictions to strings
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
model_gpt.config.pad_token_id = tokenizer.pad_token_id

for param in model_gpt.base_model.parameters():
    param.requires_grad = False

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# examine model structure
print(model_gpt)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)


In [6]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl.metadata (19 kB)
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.3-py3-none-any.whl.metadata (12 kB)
Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.23.3-py3-none-any.whl (401 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.7/401.7 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: requests, huggingface-hub, datasets
  Attempting uninstall: requests
    Found existing ins

In [2]:
from datasets import load_dataset
#load imdb dataset
dataset = load_dataset("imdb")

Using the latest cached version of the dataset since imdb couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'plain_text' at /Users/paulhake/.cache/huggingface/datasets/imdb/plain_text/0.0.0/e6281661ce1c48d982bc483cf8a173c1bbeb5d31 (last modified on Tue Mar 19 16:07:35 2024).


In [6]:
# tokenize all the examples
def tokenize_batch(batch):
    return tokenizer(batch["text"],padding="max_length", truncation=True)
tokenized_dataset = dataset.map(tokenize_batch, batched=True)

# Inspect the available columns in the dataset
tokenized_dataset["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 25000
})

In [7]:
#split into train and test set and sub sample 500 for faster training and inferencing
train = tokenized_dataset["train"].shuffle(seed=42).select(range(500))
test = tokenized_dataset["test"].shuffle(seed=42).select(range(500))

In [7]:
#install scikit for accuracy evaluation
!pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting scikit-learn
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m42.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.4.0-py3-none-any.whl (17 kB)
Collecting joblib>=1.2.0
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.4.1.post1 threadpoolctl-3.4.0


In [8]:
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}


In [13]:
#train model head
import numpy as np
from sklearn.metrics import accuracy_score


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

training_args = TrainingArguments(
    output_dir="./test_trainer",
    per_device_eval_batch_size=4,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=1e-3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

gpt_trainer = Trainer(
    model=model_gpt,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    #data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

gpt_eval = gpt_trainer.train()

                                                 
100%|██████████| 125/125 [25:39<00:00,  7.35s/it]

{'eval_loss': 0.5870361924171448, 'eval_accuracy': 0.756, 'eval_runtime': 635.1798, 'eval_samples_per_second': 0.787, 'eval_steps_per_second': 0.197, 'epoch': 1.0}


100%|██████████| 125/125 [25:40<00:00, 12.33s/it]

{'train_runtime': 1540.7141, 'train_samples_per_second': 0.325, 'train_steps_per_second': 0.081, 'train_loss': 0.7883529052734375, 'epoch': 1.0}





In [17]:
gpt_eval

TrainOutput(global_step=125, training_loss=0.7883529052734375, metrics={'train_runtime': 1540.7141, 'train_samples_per_second': 0.325, 'train_steps_per_second': 0.081, 'train_loss': 0.7883529052734375, 'epoch': 1.0})

In [15]:
gpt_trainer.save_model("base_gpt_model")

In [16]:
import pandas as pd

df = pd.DataFrame(test)
df = df[["text", "label"]]

# Replace <br /> tags in the text with spaces
df["text"] = df["text"].str.replace("<br />", " ")

# Add the model predictions to the dataframe
predictions = gpt_trainer.predict(test)
df["predicted_label"] = np.argmax(predictions[0], axis=1)

df.head(4)

100%|██████████| 125/125 [10:28<00:00,  5.03s/it]


Unnamed: 0,text,label,predicted_label
0,When I unsuspectedly rented A Thousand Acres...,1,1
1,This is the latest entry in the long series of...,1,1
2,This movie was so frustrating. Everything seem...,0,0
3,"I was truly and wonderfully surprised at ""O' B...",1,1


## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [10]:
model_base_trained = AutoModelForSequenceClassification.from_pretrained("base_gpt_model")

In [11]:
from peft import LoraConfig, get_peft_model, TaskType


lora_config = LoraConfig(
    r=8, 
    lora_alpha=32,
    target_modules=['c_attn', 'c_proj'],
    lora_dropout=0.1,
    bias="none",
    fan_in_fan_out=True,
    task_type=TaskType.SEQ_CLS
)

lora_model = get_peft_model(model_base_trained, lora_config)

In [13]:
trainer_lora = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./loraModel",
        learning_rate=1e-3,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=1,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer_lora.train()


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                       
[A                                              

  0%|          | 0/125 [48:21<?, ?it/s]          
[A
[ACheckpoint destination directory ./loraModel/checkpoint-125 already exists and is non-empty. Saving will proceed but saved results may be invalid.
                                       
100%|██████████| 125/125 [48:00<00:00, 23.04s/it]

{'eval_loss': 0.35950732231140137, 'eval_accuracy': 0.886, 'eval_runtime': 692.1093, 'eval_samples_per_second': 0.722, 'eval_steps_per_second': 0.181, 'epoch': 1.0}
{'train_runtime': 2880.1857, 'train_samples_per_second': 0.174, 'train_steps_per_second': 0.043, 'train_loss': 0.5166582641601563, 'epoch': 1.0}





TrainOutput(global_step=125, training_loss=0.5166582641601563, metrics={'train_runtime': 2880.1857, 'train_samples_per_second': 0.174, 'train_steps_per_second': 0.043, 'train_loss': 0.5166582641601563, 'epoch': 1.0})

In [14]:
trainer_lora.evaluate()

100%|██████████| 125/125 [11:48<00:00,  5.67s/it]


{'eval_loss': 0.35950732231140137,
 'eval_accuracy': 0.886,
 'eval_runtime': 714.3845,
 'eval_samples_per_second': 0.7,
 'eval_steps_per_second': 0.175,
 'epoch': 1.0}

In [15]:
lora_model.print_trainable_parameters()

trainable params: 812,544 || all params: 125,253,888 || trainable%: 0.6487175871139426


In [16]:
lora_model.save_pretrained("gpt2_lora")



In [17]:
tokenizer.save_pretrained("gpt2_lora_tokenizer")

('gpt2_lora_tokenizer/tokenizer_config.json',
 'gpt2_lora_tokenizer/special_tokens_map.json',
 'gpt2_lora_tokenizer/vocab.json',
 'gpt2_lora_tokenizer/merges.txt',
 'gpt2_lora_tokenizer/added_tokens.json',
 'gpt2_lora_tokenizer/tokenizer.json')

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [18]:
from peft import AutoPeftModelForSequenceClassification
lora_model = AutoPeftModelForSequenceClassification.from_pretrained("gpt2_lora")

#tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2_lora_tokenizer")





In [19]:
#saved lora model evaluation
trainer_lora = Trainer(
    model=lora_model,
    args=TrainingArguments(
        output_dir="./loraModel",
        learning_rate=1e-3,
        # Reduce the batch size if you don't have enough memory
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=2,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    ),
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

eval_results = trainer_lora.evaluate()
print(eval_results)

100%|██████████| 125/125 [11:49<00:00,  5.68s/it]

{'eval_loss': 0.35950732231140137, 'eval_accuracy': 0.886, 'eval_runtime': 714.9352, 'eval_samples_per_second': 0.699, 'eval_steps_per_second': 0.175}





## comparison of original model and peft model
Base model accuracy = 0.756
Lora model accuracy = 0.886



### original base results from original gpt base model above:
{'eval_loss': 0.5870361924171448, 
 'eval_accuracy': 0.756, 
 'eval_runtime': 635.1798, 
 'eval_samples_per_second': 0.787, 
 'eval_steps_per_second': 0.197, 
 'epoch': 1.0}
100%|██████████| 125/125 [25:40<00:00, 12.33s/it]


### eval results from lora model:
{'eval_loss': 0.35950732231140137,
 'eval_accuracy': 0.886,
 'eval_runtime': 714.3845,
 'eval_samples_per_second': 0.7,
 'eval_steps_per_second': 0.175,
 'epoch': 1.0}