<h1 style='text-align:center; font-size:30px; font-weight:bold; '>LoRA Fine-Tuning</h1>

# 1. Introduction

# Table of Contents

1. Part 2 Introduction

2. LoRA-Based Fine-Tuning
   - 2.1 Train Classifier Head Only 
   - 2.2 Fine Tune All Weights with Classifier Head Carried Over  
   - 2.3 LoRA Technique – Fine Tuning Adapters (FFN Blocks)
   - 2.4 Experiment: Varying Adapters

3. Conclusion and Key Takeaways

# Imports & Installations

In [None]:
!pip install transformers scikit-learn pandas numpy tqdm tensorflow
!pip install -q datasets

import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset # Hugging Face
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
import random
import os
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import scipy.stats
import seaborn as sns


import warnings
warnings.filterwarnings("ignore")

# 2.1 Train Classifier Head Only 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np

# Checkpoint name shared by the tokenizer and by the models loaded further down.
checkpoint = "distilbert-base-uncased"

# Tokenizer for that checkpoint, and the "sentences_allagree" configuration
# of the financial_phrasebank dataset.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
dataset = load_dataset(path="financial_phrasebank", name="sentences_allagree")

def tokenize_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Every sequence is truncated and padded to a fixed length of 512 tokens.

    Args:
        examples: Mapping with a "sentence" entry, as passed by ``dataset.map``.

    Returns:
        The output of the module-level ``tokenizer`` for those sentences.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    sentences = examples["sentence"]
    return tokenizer(sentences, **encoding_options)

# Tokenize the dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 80/20 split of the tokenized "train" data, then the 20% hold-out is halved.
# Later code uses val_test_split["train"] for validation and
# val_test_split["test"] for testing.
full_train = tokenized_datasets["train"]
train_val_split = full_train.train_test_split(test_size=0.2, seed=42)
hold_out = train_val_split["test"]
val_test_split = hold_out.train_test_split(test_size=0.5, seed=42)

def to_tf_dataset(split, shuffle=False, batch_size=8):
    """Convert a tokenized split into a batched dataset via ``split.to_tf_dataset``.

    Args:
        split: Object exposing ``to_tf_dataset``; it must provide the
            "input_ids", "attention_mask" and "label" columns.
        shuffle: Forwarded unchanged as the ``shuffle`` argument.
        batch_size: Forwarded as ``batch_size``. Defaults to 8, the value
            that used to be hard-coded, so existing calls are unaffected.

    Returns:
        Whatever ``split.to_tf_dataset`` returns.
    """
    return split.to_tf_dataset(
        columns=["input_ids", "attention_mask"],
        label_cols=["label"],
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=None,
    )

# Batched datasets: training and validation are shuffled, the test split is not.
tf_train_dataset = to_tf_dataset(train_val_split["train"], shuffle=True)
tf_validation_dataset = to_tf_dataset(val_test_split["train"], shuffle=True)
tf_test_dataset = to_tf_dataset(val_test_split["test"], shuffle=False)

# Sequence classifier with 3 labels; the DistilBERT body is frozen so that
# only the layers outside `model.distilbert` are trained.
model = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)
model.distilbert.trainable = False

# Learning rate decays linearly (power=1.0) from 5e-5 to 0 over 10000 steps.
initial_learning_rate = 5e-5
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate,
    decay_steps=10000,
    end_learning_rate=0.0,
    power=1.0,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# The model outputs logits, hence from_logits=True.
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model.summary()

# Train for 3 epochs, validating after each one.
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)

# Report loss and accuracy on the held-out test split.
eval_loss, eval_accuracy = model.evaluate(tf_test_dataset)
print(f"Evaluated Test Loss: {eval_loss}, Evaluated Test Accuracy: {eval_accuracy}")

# Persist the weights; later sections reload this file by layer name.
model.save_weights("classifier_head_weights.h5")

# 2.2 Fine Tune All Weights with Classifier Head Carried Over  

In [None]:
# Start again from the pre-trained checkpoint; this run fine-tunes every
# layer except the classifier head.
model_minus_classifier = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

# Restore, by layer name, the weights previously written to disk.
model_minus_classifier.load_weights("classifier_head_weights.h5", by_name=True)

# Keep the restored head (pre_classifier + classifier) fixed during training.
for head_layer in (
    model_minus_classifier.pre_classifier,
    model_minus_classifier.classifier,
):
    head_layer.trainable = False

# Same optimizer setup as before: Adam with a linear decay from 5e-5 to 0
# over 10000 steps.
initial_learning_rate = 5e-5
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate,
    decay_steps=10000,
    end_learning_rate=0.0,
    power=1.0,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

model_minus_classifier.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model_minus_classifier.summary()

# Train for 3 epochs, keeping the Keras History object.
history = model_minus_classifier.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=3,
)

# Report loss and accuracy on the test set.
test_loss, test_accuracy = model_minus_classifier.evaluate(tf_test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# 2.3 LoRA Technique – Fine Tuning Adapters (FFN Blocks)

In [None]:
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertConfig

class LoRALayer(tf.keras.layers.Layer):
    """Wrap a dense layer and add a trainable low-rank update to its output.

    The layer computes::

        original_layer(inputs) + lora_alpha * (inputs @ A) @ B

    where ``A`` has shape ``(input_dim, rank)`` and ``B`` has shape
    ``(rank, output_dim)``; both dimensions are read from
    ``original_layer.kernel``.

    ``B`` starts at zero, so the update is zero at construction time and the
    wrapped layer initially behaves exactly like ``original_layer``. ``A`` is
    Glorot-initialised so that ``B`` still receives a non-zero gradient.

    This class does not change ``original_layer.trainable``; callers that
    want the wrapped weights frozen must set it themselves.

    Args:
        original_layer: An already-built layer exposing a 2-D ``kernel``.
        rank: Inner dimension of the low-rank factorisation.
        lora_alpha: Scalar multiplier applied to the low-rank update.
            Defaults to 1.0, the value that was previously hard-coded.
    """

    def __init__(self, original_layer, rank, lora_alpha=1.0):
        super().__init__()
        self.original_layer = original_layer
        self.rank = rank
        self.input_dim = original_layer.kernel.shape[0]
        self.output_dim = original_layer.kernel.shape[1]

        # Initialize LoRA parameters: random A, zero B  =>  A @ B == 0 at start.
        self.A = self.add_weight(
            name="lora_A",
            shape=(self.input_dim, rank),
            initializer='glorot_uniform',
            trainable=True,
        )
        self.B = self.add_weight(
            name="lora_B",
            shape=(rank, self.output_dim),
            initializer='zeros',
            trainable=True,
        )
        self.lora_alpha = lora_alpha

    def call(self, inputs):
        """Return the wrapped layer's output plus the scaled low-rank update."""
        # Compute the original output
        original_output = self.original_layer(inputs)

        # Project down to `rank` first, then up to `output_dim`. This is
        # mathematically the same as inputs @ (A @ B) but never materialises
        # the full (input_dim, output_dim) matrix.
        low_rank_projection = tf.matmul(inputs, self.A)
        lora_update = self.lora_alpha * tf.matmul(low_rank_projection, self.B)

        return original_output + lora_update

checkpoint = "distilbert-base-uncased"
model_lora = TFDistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# Load the previously saved weights into the model (matched by layer name)
model_lora.load_weights('classifier_head_weights.h5', by_name=True)

# Rank for LoRA matrices
lora_rank = 8

# Freeze the embeddings. Without this they stay trainable and the run is no
# longer a LoRA-only fine-tune.
model_lora.distilbert.embeddings.trainable = False

# Wrap lin1 and lin2 of every transformer block with a LoRA-enhanced layer and
# freeze everything else inside the block.
for layer in model_lora.distilbert.transformer.layer:
    original_lin1 = layer.ffn.lin1
    original_lin2 = layer.ffn.lin2
    layer.ffn.lin1 = LoRALayer(original_lin1, rank=lora_rank)
    layer.ffn.lin2 = LoRALayer(original_lin2, rank=lora_rank)

    # The wrapped dense layers keep their pre-trained weights fixed.
    original_lin1.trainable = False
    original_lin2.trainable = False

    # Attention and both LayerNorms are not adapted, so they are frozen too.
    layer.attention.trainable = False
    layer.sa_layer_norm.trainable = False
    layer.output_layer_norm.trainable = False

# Freeze weights of the classifier head
model_lora.pre_classifier.trainable = False
model_lora.classifier.trainable = False

# Adam with a linear decay from 5e-5 to 0 over 10000 steps.
initial_learning_rate = 5e-5
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=initial_learning_rate,
    decay_steps=10000,
    end_learning_rate=0.0,
    power=1.0
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

model_lora.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

# The "Trainable params" line of the summary should now cover only the LoRA
# A/B matrices.
model_lora.summary()

# Train the model
model_lora.fit(tf_train_dataset, epochs=3, validation_data=tf_validation_dataset)

# Evaluate the model on the test set
test_loss, test_accuracy = model_lora.evaluate(tf_test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# 2.4 Experiment: Varying Adapters

In [None]:
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertConfig

class LoRALayer(tf.keras.layers.Layer):
    """Wrap a dense layer and add a trainable low-rank update to its output.

    The layer computes::

        original_layer(inputs) + lora_alpha * (inputs @ A) @ B

    where ``A`` has shape ``(input_dim, rank)`` and ``B`` has shape
    ``(rank, output_dim)``; both dimensions are read from
    ``original_layer.kernel``.

    ``B`` starts at zero, so the update is zero at construction time and the
    wrapped layer initially behaves exactly like ``original_layer``. ``A`` is
    Glorot-initialised so that ``B`` still receives a non-zero gradient.

    This class does not change ``original_layer.trainable``; callers that
    want the wrapped weights frozen must set it themselves.

    Args:
        original_layer: An already-built layer exposing a 2-D ``kernel``.
        rank: Inner dimension of the low-rank factorisation.
        lora_alpha: Scalar multiplier applied to the low-rank update.
            Defaults to 1.0, the value that was previously hard-coded.
    """

    def __init__(self, original_layer, rank, lora_alpha=1.0):
        super().__init__()
        self.original_layer = original_layer
        self.rank = rank
        self.input_dim = original_layer.kernel.shape[0]
        self.output_dim = original_layer.kernel.shape[1]

        # Initialize LoRA parameters: random A, zero B  =>  A @ B == 0 at start.
        self.A = self.add_weight(
            name="lora_A",
            shape=(self.input_dim, rank),
            initializer='glorot_uniform',
            trainable=True,
        )
        self.B = self.add_weight(
            name="lora_B",
            shape=(rank, self.output_dim),
            initializer='zeros',
            trainable=True,
        )
        self.lora_alpha = lora_alpha

    def call(self, inputs):
        """Return the wrapped layer's output plus the scaled low-rank update."""
        original_output = self.original_layer(inputs)
        # Project down to `rank` first, then up to `output_dim`. This is
        # mathematically the same as inputs @ (A @ B) but never materialises
        # the full (input_dim, output_dim) matrix.
        low_rank_projection = tf.matmul(inputs, self.A)
        lora_update = self.lora_alpha * tf.matmul(low_rank_projection, self.B)
        return original_output + lora_update

# List of ranks to test; the two result lists are filled in the same order.
ranks = [2, 4, 8, 16, 32, 64]
test_losses = []
test_accuracies = []

for lora_rank in ranks:
    print(f"Training with LoRA rank: {lora_rank}")

    # Fresh model per rank, with the saved weights restored by layer name.
    model_lora = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
    model_lora.load_weights('classifier_head_weights.h5', by_name=True)

    # Freeze the embeddings. Without this they stay trainable and the sweep
    # would not measure the effect of the LoRA rank alone.
    model_lora.distilbert.embeddings.trainable = False

    # Apply LoRA layers and freeze everything else inside each block.
    for layer in model_lora.distilbert.transformer.layer:
        original_lin1 = layer.ffn.lin1
        original_lin2 = layer.ffn.lin2
        layer.ffn.lin1 = LoRALayer(original_lin1, rank=lora_rank)
        layer.ffn.lin2 = LoRALayer(original_lin2, rank=lora_rank)

        # The wrapped dense layers keep their pre-trained weights fixed.
        original_lin1.trainable = False
        original_lin2.trainable = False

        # Attention and both LayerNorms are not adapted, so they are frozen too.
        layer.attention.trainable = False
        layer.sa_layer_norm.trainable = False
        layer.output_layer_norm.trainable = False

    # Freeze classifier head
    model_lora.pre_classifier.trainable = False
    model_lora.classifier.trainable = False

    # Adam with a linear decay from 5e-5 to 0 over 10000 steps.
    optimizer = tf.keras.optimizers.Adam(learning_rate=tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=5e-5,
        decay_steps=10000,
        end_learning_rate=0.0,
        power=1.0
    ))
    model_lora.compile(
        optimizer=optimizer,
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )

    # The "Trainable params" line should now cover only the LoRA A/B matrices.
    model_lora.summary()

    # Train the model
    model_lora.fit(tf_train_dataset, epochs=3, validation_data=tf_validation_dataset)

    # Evaluate the model and record the results for the plots below
    test_loss, test_accuracy = model_lora.evaluate(tf_test_dataset)
    test_losses.append(test_loss)
    test_accuracies.append(test_accuracy)

    print(f"Rank: {lora_rank}, Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

In [None]:
# Side-by-side line plots of test loss and test accuracy against LoRA rank.
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: loss per rank.
loss_ax.plot(ranks, test_losses, marker='o', color='b')
loss_ax.set_title('Rank vs Test Loss')
loss_ax.set_xlabel('Rank')
loss_ax.set_ylabel('Test Loss')

# Right panel: accuracy per rank.
accuracy_ax.plot(ranks, test_accuracies, marker='o', color='r')
accuracy_ax.set_title('Rank vs Test Accuracy')
accuracy_ax.set_xlabel('Rank')
accuracy_ax.set_ylabel('Test Accuracy')

fig.tight_layout()
plt.show()

# Other LoRA Features
- LoRA vs Pre-Trained Model
- LoRA vs conventional fine tuning
- LoRA Parameters per Block

In [None]:
# Baseline: load the checkpoint with a 3-label classification head and
# evaluate it on the test set as-is. No training happens in this block.
model_without_pre_training = TFAutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

# The model is compiled with the same optimizer, loss and metric as the other
# experiments so that evaluate() reports a loss and an accuracy.
initial_learning_rate = 5e-5
lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate,
    decay_steps=10000,
    end_learning_rate=0.0,
    power=1.0,
)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

model_without_pre_training.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)
model_without_pre_training.summary()

# Loss and accuracy of the untrained baseline on the test set.
test_loss, test_accuracy = model_without_pre_training.evaluate(tf_test_dataset)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")


# Params
from transformers import TFDistilBertForSequenceClassification

checkpoint = "distilbert-base-uncased"
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# Read the input and output dimensions from the lin1 kernel of the first
# transformer block's FFN.
first_lin1_kernel = model.distilbert.transformer.layer[0].ffn.lin1.kernel
d_input = first_lin1_kernel.shape[0]
d_output = first_lin1_kernel.shape[1]

print(f"Input dimension (d_input): {d_input}")
print(f"Output dimension (d_output): {d_output}")

# Rank used for LoRA in the parameter counts below
r = 8

# Calculate number of parameters for one FFN block (lin1 and lin2)
def calculate_lora_parameters(d_input, d_output, rank):
    """Return the number of entries in one pair of LoRA matrices.

    Args:
        d_input: Input dimension of the adapted layer.
        d_output: Output dimension of the adapted layer.
        rank: Rank of the low-rank factorisation.

    Returns:
        ``d_input * rank`` (first matrix) plus ``rank * d_output`` (second matrix).
    """
    down_projection_size = d_input * rank
    up_projection_size = rank * d_output
    return down_projection_size + up_projection_size

# Adapter size for one dense layer at rank r, using the dimensions read above.
lora_params_per_layer = calculate_lora_parameters(d_input, d_output, r)

# Doubled to cover both dense layers of an FFN block.
lora_params_per_ffn_block = 2 * lora_params_per_layer

for summary_line in (
    f"Number of LoRA parameters per layer: {lora_params_per_layer}",
    f"Total LoRA parameters per FFN block: {lora_params_per_ffn_block}",
):
    print(summary_line)

# Total params
from transformers import TFDistilBertForSequenceClassification

checkpoint = "distilbert-base-uncased"
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

# One FFN block per transformer layer, so the per-block count is scaled by
# the number of layers.
transformer_blocks = model.distilbert.transformer.layer
num_layers = len(transformer_blocks)
print(f"Number of transformer layers: {num_layers}")

added_total = num_layers * lora_params_per_ffn_block

for summary_line in (
    f"Number of LoRA parameters per layer: {lora_params_per_layer}",
    f"Total LoRA parameters per FFN block: {lora_params_per_ffn_block}",
    f"Total added LoRA parameters across all FFN blocks: {added_total}",
):
    print(summary_line)

# Total parameters in the MODEL: reload an unmodified copy for counting.
checkpoint = "distilbert-base-uncased"
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

def count_parameters(model):
    """Sum the element counts of every tensor in ``model.trainable_weights``.

    Only trainable weights are counted; non-trainable ones are not included.

    Args:
        model: Object exposing ``trainable_weights``, an iterable of items
            that each have a ``shape``.

    Returns:
        The total number of elements across those weights (0 if there are none).
    """
    element_counts = [np.prod(weight.shape) for weight in model.trainable_weights]
    return sum(element_counts)

# Trainable-parameter count of the freshly loaded, unmodified DistilBERT model.
original_params = count_parameters(model)
print(f"Original DistilBERT parameters: {original_params}")

# Size of the adapted model: original parameters plus the added LoRA matrices.
total_parameters = original_params + added_total

for report_line in (
    f"Original DistilBERT parameters: {original_params}",
    f"Added LoRA parameters: {added_total}",
    f"Total parameters in the adapted model: {total_parameters}",
):
    print(report_line)

# trainable LoRA params
checkpoint = "distilbert-base-uncased"
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint)

# Dimensions of the first block's lin1 kernel, the number of transformer
# layers, and the LoRA rank used for the estimate.
d_input = model.distilbert.transformer.layer[0].ffn.lin1.kernel.shape[0]
d_output = model.distilbert.transformer.layer[0].ffn.lin1.kernel.shape[1]
num_layers = len(model.distilbert.transformer.layer)
r = 8  # Rank used for LoRA

def calculate_lora_parameters(d_input, d_output, rank):
    """Return the size of one pair of LoRA matrices.

    That is ``d_input * rank`` entries for the first matrix plus
    ``rank * d_output`` entries for the second.
    """
    return (d_input * rank) + (rank * d_output)

# Closed-form estimate: one adapter sized from lin1, doubled for the two dense
# layers of an FFN block, times the number of blocks.
lora_params_per_layer = calculate_lora_parameters(d_input, d_output, r)

lora_params_per_ffn_block = lora_params_per_layer * 2

added_parms_calc = lora_params_per_ffn_block * num_layers

# Independent cross-check: size an adapter from the actual kernel shape of
# every lin1 and lin2 in every block. The previous code assigned the estimate
# to `added_total` and then compared it with itself, which could never fail.
added_total = sum(
    calculate_lora_parameters(dense.kernel.shape[0], dense.kernel.shape[1], r)
    for block in model.distilbert.transformer.layer
    for dense in (block.ffn.lin1, block.ffn.lin2)
)

# Assertion to verify the numbers match
assert added_total == added_parms_calc, (
    f"closed-form estimate {added_parms_calc} != per-layer count {added_total}"
)

print(f"Calculated trainable parameters added by LoRA: {added_parms_calc}")


# 3. Conclusion and Key Takeaways