#  distilBERT base model Fine-tuning usingLoRA

Classifying spam emails/texts using distilBERT.


In [9]:
!pip install peft
!pip install evaluate
!pip install transformers[torch]
!pip install accelerate -U
!pip install transformers -U

import accelerate
print(accelerate.__version__)

0.25.0


In [22]:
import numpy as np
import pandas as pd
import tensorflow as tf
import peft
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, Trainer, TrainingArguments
import evaluate

## Import dataset and split them

In [11]:
# Assuming df is your initial DataFrame
df=messages = pd.read_csv('Texts', sep='\t',
                           names=["label", "message"])

# First split into training and test/validation sets
train_df, test_df = train_test_split(df, test_size=0.2)  # 80% training, 20% test/validation

# Further split test/validation into test and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.2)  # Splitting the 30% into two parts

## Model

In [12]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, DataCollatorWithPadding,TrainingArguments, Trainer

In [13]:
model_checkpoint = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# define label maps
id2label = {0: "ham", 1: "spam"}
label2id = {"ham":0, "spam":1}

# generate classification model from model_checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Tokenizer

In [16]:
# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

from datasets import Dataset
# Tokenization function
def tokenize_and_format(df):
    # Tokenize all texts
    tokenized_inputs = tokenizer(df['message'].tolist(), padding='max_length', truncation=True, max_length=512)

    # Ensure labels are integers
    if df['label'].dtype == 'object':  # Adjust if your labels are not object type
        df['label'] = df['label'].map(label2id)  # Convert labels using label map

    # Convert to DataFrame
    tokenized_df = pd.DataFrame(tokenized_inputs.data)
    tokenized_df['labels'] = df['label'].tolist()

    # Convert DataFrame to Hugging Face Dataset
    return Dataset.from_pandas(tokenized_df)

In [17]:
# Prepare datasets
train_dataset = tokenize_and_format(train_df)
val_dataset = tokenize_and_format(val_df)
test_dataset = tokenize_and_format(test_df)

# data collator will dynamically pad examples in each batch to be equal length

In [18]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### define an evaluation function to pass into trainer later

In [23]:
# define an evaluation function to pass into trainer later
from datasets import load_metric
# Load the accuracy metric
accuracy_metric = evaluate.load("accuracy")

# Define the compute_metrics function
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

#### LORA

In [24]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['q_lin'])

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


### Training Arguments

In [25]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 5

# define training arguments
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

### Set the trainer object
### Train the model

In [26]:
# creater trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# train model
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1227,0.062687,0.987668
2,0.0342,0.051903,0.992152
3,0.0266,0.072139,0.988789
4,0.0123,0.066801,0.988789
5,0.0011,0.072374,0.988789


TrainOutput(global_step=4460, training_loss=0.03393757972081146, metrics={'train_runtime': 728.7607, 'train_samples_per_second': 24.459, 'train_steps_per_second': 6.12, 'total_flos': 2395674086092800.0, 'train_loss': 0.03393757972081146, 'epoch': 5.0})

In [27]:
# Evaluate the model on the test dataset
trainer.evaluate(test_dataset)

{'eval_loss': 0.040749747306108475,
 'eval_accuracy': 0.9937219730941704,
 'eval_runtime': 20.1447,
 'eval_samples_per_second': 55.349,
 'eval_steps_per_second': 13.85,
 'epoch': 5.0}