In [6]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA settings for a sequence-to-sequence language model.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,  # the adapters are going to be trained
    r=8,                   # rank of the low-rank update matrices
    lora_alpha=32,         # scaling factor applied to the LoRA update
    lora_dropout=0.1,      # dropout applied on the LoRA branch
)

In [7]:
# Show the installed PEFT version so the results below can be tied to a
# specific library release.
import peft
peft.__version__

'0.13.2'

## Load Model

### mT0-large

In [4]:
from transformers import AutoModelForSeq2SeqLM
# Download (or load from cache) the mT0-large checkpoint as a seq2seq model.
model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/mt0-large") # a cross-lingual, multitask model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/800 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

In [5]:
# Wrap the base model according to `peft_config` and report how many
# parameters are trainable compared with the total.
# NOTE(review): this rebinds `model`, so re-running the cell wraps an
# already-wrapped model — reload the base model first if re-executing.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 2,359,296 || all params: 1,231,940,608 || trainable%: 0.1915


### FLAN-T5

In [35]:
# Replace `model` with the FLAN-T5 small checkpoint (seq2seq architecture).
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

In [None]:
# print(model)

In [None]:
# LoRA settings for FLAN-T5; no `target_modules` is given, so PEFT's
# built-in defaults for this architecture are used.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.SEQ_2_SEQ_LM,
)


In [39]:
# Apply the LoRA configuration to FLAN-T5 and print the trainable-parameter
# summary (trainable count, total count, percentage).
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


### BERT

In [41]:
from transformers import BertModel
# Load the bare BERT encoder; `BertModel` carries no task-specific head.
model = BertModel.from_pretrained("bert-base-uncased")

In [None]:
# print(model)

In [45]:
# LoRA settings for the bare BERT encoder.
#
# `BertModel` has no classification head and its forward() does not accept
# `labels`, so the adapter is declared as a feature-extraction model rather
# than a sequence-classification one. The task type only selects the PEFT
# wrapper class; the LoRA matrices are still attached to `target_modules`,
# so the trainable-parameter count is unaffected.
peft_config = LoraConfig(
    task_type=TaskType.FEATURE_EXTRACTION,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # BERT names its attention projections "query"/"key"/"value"; adapt the
    # query and value projections only.
    target_modules=["query", "value"],
    )

In [46]:
# Attach the LoRA adapters to BERT's targeted modules and print the
# trainable-parameter summary.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 109,777,152 || trainable%: 0.2686


### GPT-2

In [48]:
from transformers import AutoModelForCausalLM
# Load GPT-2 with its causal language-modelling head.
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [49]:
# LoRA settings for GPT-2, a causal (decoder-only) language model.
# The stray second comma after `task_type` was a SyntaxError and is removed.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    )

In [50]:
# Wrap GPT-2 with LoRA adapters and print the trainable-parameter summary.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364




In [51]:
# Same GPT-2 LoRA settings, but with `fan_in_fan_out` set explicitly.
# GPT-2 implements its projections with Conv1D layers whose weights are
# stored as (fan_in, fan_out), which is the case this flag exists for.
# The model is a causal LM, so the task type must be CAUSAL_LM (the previous
# SEQ_2_SEQ_LM value did not match the GPT-2 architecture).
peft_config = LoraConfig(
    fan_in_fan_out=True,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    )

In [52]:
# `model` was already wrapped by get_peft_model() in the previous GPT-2 cell.
# Wrapping a PEFT model a second time nests adapters on top of each other,
# so start again from a freshly loaded base model before applying the new
# configuration.
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


## A fine-tuning comparison

### DistilBERT with PEFT and LoRA (adapting the q and v projections)

In [50]:
from transformers import DistilBertForSequenceClassification
# Load DistilBERT with a sequence-classification head. The head weights are
# newly initialised (see the warning printed below), so the model has to be
# fine-tuned before its predictions are meaningful.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
# print(model)

In [52]:
# LoRA settings for DistilBERT sequence classification.
peft_config = LoraConfig(
    # DistilBERT names its attention projections q_lin / k_lin / v_lin;
    # only the query and value projections receive LoRA matrices.
    target_modules=["q_lin", "v_lin"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.SEQ_CLS,
)

In [53]:
# Wrap DistilBERT with the LoRA adapters and print the trainable-parameter
# summary for the wrapped model.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925


In [54]:
from transformers import AutoTokenizer
# Tokenizer matching the distilbert-base-uncased checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [55]:
import numpy as np
import pandas as pd

In [56]:
# Read the labelled data from the working directory. The cells below use its
# `text` column as model input and its `target` column as the class label.
df = pd.read_csv("train.csv")

In [57]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation, stratified on the label so both
# parts keep the same class balance. A fixed `random_state` makes the split
# reproducible; without it every execution yields a different validation set
# and the reported metrics cannot be compared between runs.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=42,
)

In [58]:
# Pull the raw texts and labels out of each split as plain Python lists.
train_texts, valid_texts = (frame["text"].tolist() for frame in (train_df, valid_df))
train_labels, valid_labels = (frame["target"].tolist() for frame in (train_df, valid_df))

# Tokenize each split in one call, truncating to at most 128 tokens and
# padding the sequences of a split to a common length.
train_encodings, valid_encodings = (
    tokenizer(texts, truncation=True, padding=True, max_length=128)
    for texts in (train_texts, valid_texts)
)

In [59]:
import torch
from torch.utils.data import Dataset

class ClassificationDataset(Dataset):
    """Map-style dataset pairing tokenizer output with class labels.

    Args:
        encodings: mapping from field name to a per-example sequence of
            values (anything exposing ``.items()``).
        labels: sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``"labels"``."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# One dataset per split, each pairing the split's encodings with its labels.
train_dataset, valid_dataset = (
    ClassificationDataset(train_encodings, train_labels),
    ClassificationDataset(valid_encodings, valid_labels),
)


In [60]:
from transformers import TrainingArguments, Trainer

# Training hyper-parameters for the LoRA run.
training_args = TrainingArguments(
    output_dir="./results",
    # Optimisation
    num_train_epochs=2,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; the two strategies have to
    # match for `load_best_model_at_end` to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Restore the checkpoint with the highest validation accuracy when
    # training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)

In [61]:
from sklearn.metrics import accuracy_score, f1_score
import torch

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation prediction.

    Args:
        eval_pred: a ``(logits, labels)`` pair as unpacked below.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = torch.tensor(logits).argmax(dim=-1)
    return {
        "accuracy": accuracy_score(labels, predicted),
        "f1": f1_score(labels, predicted, average="weighted"),
    }

In [62]:
# Assemble the Trainer for the LoRA-wrapped model.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword is
# deprecated in recent transformers releases and emits a FutureWarning
# pointing at this call.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [63]:
# Run fine-tuning; the per-epoch metrics table and the final TrainOutput are
# shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.521884,0.791202,0.787487
2,0.592800,0.465892,0.802364,0.799672


TrainOutput(global_step=762, training_loss=0.5567759115864911, metrics={'train_runtime': 72.6032, 'train_samples_per_second': 167.761, 'train_steps_per_second': 10.495, 'total_flos': 269247230340480.0, 'train_loss': 0.5567759115864911, 'epoch': 2.0})

In [65]:
# Save the trainer's current model to ./best_model.
# NOTE(review): with load_best_model_at_end=True this should be the checkpoint
# with the best validation accuracy; for a PEFT-wrapped model presumably only
# the adapter weights are written — confirm.
trainer.save_model("./best_model")

In [66]:
# Evaluate the model currently held by the trainer on the validation set and
# print the resulting metrics dictionary.
results = trainer.evaluate()
print("Validation Results:", results)

Validation Results: {'eval_loss': 0.4658920466899872, 'eval_accuracy': 0.8023637557452397, 'eval_f1': 0.7996721421774556, 'eval_runtime': 3.577, 'eval_samples_per_second': 425.781, 'eval_steps_per_second': 26.838, 'epoch': 2.0}


### DistilBERT (full fine-tuning, without PEFT)

In [67]:
from transformers import DistilBertForSequenceClassification
# Reload a fresh DistilBERT classifier so this run starts from the pretrained
# checkpoint and not from the LoRA-tuned model; every parameter stays trainable.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
# print(model)

In [69]:
from transformers import AutoTokenizer
# Same tokenizer checkpoint as the model being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [70]:
import numpy as np
import pandas as pd

In [71]:
# Re-read the labelled data; the `text` and `target` columns are used below.
df = pd.read_csv("train.csv")

In [72]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/validation split. The seed is fixed so the split is
# reproducible: an unseeded split changes on every execution, which makes the
# validation metrics of different fine-tuning runs incomparable.
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['target'],
    random_state=42,
)

In [73]:
# Raw texts and labels of each split as plain lists.
train_texts, train_labels = train_df["text"].tolist(), train_df["target"].tolist()
valid_texts, valid_labels = valid_df["text"].tolist(), valid_df["target"].tolist()

# Tokenize both splits with identical settings: truncate to at most 128
# tokens and pad within each call.
train_encodings = tokenizer(train_texts, max_length=128, truncation=True, padding=True)
valid_encodings = tokenizer(valid_texts, max_length=128, truncation=True, padding=True)

In [74]:
import torch
from torch.utils.data import Dataset

class ClassificationDataset(Dataset):
    """Dataset that serves tokenized examples together with their labels.

    ``encodings`` maps each field name to an indexable collection with one
    entry per example; ``labels`` holds one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # Size is taken from the labels, one per example.
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the tensor dict for example ``idx``, adding a ``"labels"`` entry."""
        item = dict(
            (key, torch.tensor(val[idx])) for key, val in self.encodings.items()
        )
        item.update(labels=torch.tensor(self.labels[idx]))
        return item

# Wrap each split's encodings and labels in a dataset object.
train_dataset, valid_dataset = (
    ClassificationDataset(train_encodings, train_labels),
    ClassificationDataset(valid_encodings, valid_labels),
)


In [75]:
from transformers import TrainingArguments, Trainer

# Training hyper-parameters for the full fine-tuning run (no PEFT). They are
# kept identical to the LoRA run so the two results can be compared.
training_args = TrainingArguments(
    # Dedicated checkpoint directory: writing to the same "./results" folder
    # as the LoRA run would mix full-model checkpoints with the adapter
    # checkpoints already saved under identical checkpoint-<step> names.
    output_dir="./results_no_peft",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch (the strategies must match for
    # load_best_model_at_end).
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # Use accuracy to decide the best model
    greater_is_better=True,
)

In [76]:
from sklearn.metrics import accuracy_score, f1_score
import torch

def compute_metrics(eval_pred):
    """Turn ``(logits, labels)`` into accuracy and weighted-F1 scores.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}``.
    """
    logits, labels = eval_pred
    # Pick, for every example, the class with the largest logit.
    class_ids = torch.argmax(torch.tensor(logits), dim=-1)
    scores = {}
    scores["accuracy"] = accuracy_score(labels, class_ids)
    scores["f1"] = f1_score(labels, class_ids, average="weighted")
    return scores

In [77]:
# Assemble the Trainer for the fully fine-tuned (non-PEFT) model.
# `processing_class` replaces the deprecated `tokenizer=` keyword, which
# triggers a FutureWarning pointing at this call in recent transformers
# releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [78]:
# Fine-tune all model parameters; the per-epoch metrics and the final
# TrainOutput appear as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.383829,0.848326,0.846799
2,0.441900,0.381817,0.84176,0.840894


TrainOutput(global_step=762, training_loss=0.41330295472633183, metrics={'train_runtime': 137.0386, 'train_samples_per_second': 88.88, 'train_steps_per_second': 5.56, 'total_flos': 264707118970560.0, 'train_loss': 0.41330295472633183, 'epoch': 2.0})

In [79]:
# Save the fully fine-tuned model to its own directory, separate from the
# "./best_model" directory used for the LoRA run.
trainer.save_model("./best_model_no_peft")

In [80]:
# Evaluate the model held by the trainer on the validation set and print the
# metrics dictionary.
# NOTE(review): the printed accuracy equals the epoch-1 row of the training
# table, consistent with load_best_model_at_end restoring that checkpoint.
results = trainer.evaluate()
print("Validation Results:", results)

Validation Results: {'eval_loss': 0.38382890820503235, 'eval_accuracy': 0.8483256730137886, 'eval_f1': 0.8467993341202562, 'eval_runtime': 3.4216, 'eval_samples_per_second': 445.12, 'eval_steps_per_second': 28.057, 'epoch': 2.0}


## Some Thoughts
Surface observation:
Training is much faster with PEFT (about 73 s versus 137 s for two epochs), but validation accuracy is lower (about 0.80 versus 0.85).

Deeper thoughts:
PEFT may generalize better and help avoid overfitting; the PEFT (LoRA) configuration can still be improved, for example by using a larger rank `r`.