# LoRA Fine-Tuning with Hugging Face and TensorFlow on FLAN-T5-base for Summarization

## Introduction
In this notebook, we will perform LoRA fine-tuning on the FLAN-T5-base model using the WMT16 sentiment analysis dataset. We will replace the dense layers with LoRA layers and fine-tune the model for translation.

In [1]:
# !pip install transformers tensorflow datasets tensorflow_addons

## Load and Preprocess the  Dataset

In [2]:
import tensorflow as tf
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
# import tensorflow_addons as tfa
# from tensorflow.keras.layers import Dense

# setting GPU memory limit
print(tf.config.list_physical_devices('GPU'))
# os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = TFAutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
1 Physical GPUs, 1 Logical GPUs


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [3]:
# Loading the dataset
from datasets import load_dataset
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

# Display an example
print(dataset['train'][0])

{'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office char

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [5]:
# Preprocess the dataset for input into the model
def preprocess_data(examples):
    inputs = [f'Sumarize: {article}' for article in examples["article"]]
    targets = [highlight for highlight in examples['highlights']]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="tf")

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length", return_tensors="tf")

    model_inputs["labels"] = labels["input_ids"]

    # For decoder inputs
    decoder_inputs = tokenizer(targets, max_length=512, truncation=True, padding="max_length", return_tensors="tf")
    model_inputs["decoder_input_ids"] = decoder_inputs["input_ids"]

    return model_inputs
train_dataset = dataset['train'].select(range(30000)).map(preprocess_data, batched=True)
val_dataset = dataset['validation'].select(range(1000)).map(preprocess_data, batched=True)
test_dataset = dataset['test'].select(range(1000)).map(preprocess_data, batched=True)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=None)

train_dataset = train_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask','decoder_input_ids'],
    label_cols=['labels'],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator
)

val_dataset = val_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask','decoder_input_ids'],
    label_cols=['labels'],
    shuffle=False,
    batch_size=8,
    collate_fn=data_collator
)

test_dataset = test_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask','decoder_input_ids'],
    label_cols=['labels'],
    shuffle=False,
    batch_size=8,
    collate_fn=data_collator
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


## Load the Pre-trained FLAN-T5 Model and Modify

In [21]:
# Replace the dense layers with LoRA layers
class LoRALayer(tf.keras.layers.Layer):
    def __init__(self, dense, rank=4):
        super().__init__()
        self.dense = dense
        self.rank = rank

    def build(self, input_shape):
        self.w_a = self.add_weight(shape=(input_shape[-1], self.rank),
                                   initializer='random_normal',
                                   trainable=True, name='w_a')
        self.w_b = self.add_weight(shape=(self.rank, self.dense.units),
                                   initializer='random_normal',
                                   trainable=True, name='w_b')

    def call(self, inputs):
        original_output = self.dense(inputs)
        lora_output = tf.matmul(tf.matmul(inputs, self.w_a), self.w_b)
        self.dense.trainable = False
        return original_output + lora_output


In [6]:
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  109628544 
                                                                 
 decoder (TFT5MainLayer)     multiple                  137949312 
                                                                 
 lm_head (Dense)             multiple                  24674304  
                                                                 
Total params: 247,577,856
Trainable params: 247,577,856
Non-trainable params: 0
_________________________________________________________________


In [23]:
# import tf_keras
for ix, layer in enumerate(model.decoder._flatten_layers()):
    if isinstance(layer, tf.keras.layers.Dense):
        layer.trainable = False
        layer = LoRALayer(layer)
    else:
        layer.trainable = False
model.get_layer('encoder').trainable = False
model.get_layer('shared').trainable = False
model.layers[3] = LoRALayer(model.get_layer('lm_head'))

In [8]:
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  24674304  
                                                                 
 encoder (TFT5MainLayer)     multiple                  109628544 
                                                                 
 decoder (TFT5MainLayer)     multiple                  137949312 
                                                                 
 lm_head (Dense)             multiple                  24674304  
                                                                 
Total params: 247,577,856
Trainable params: 24,674,304
Non-trainable params: 222,903,552
_________________________________________________________________


## Train the Model

In [None]:
# Compile the model
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
              )

# Train the model
model.fit(train_dataset, validation_data=val_dataset, epochs=1)

## Evaluate the Model

In [11]:
model.evaluate(test_dataset)



50.52045822143555

In [12]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import nltk
nltk.download('punkt_tab')

def translate(inputs):
    outputs = model.generate(inputs[0]["input_ids"], max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Function to calculate ROUGE and BLEU scores
def calculate_scores(reference, hypothesis):
    # Initialize scorers
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bleu_smoothing = SmoothingFunction().method4

    # Calculate ROUGE scores
    rouge_scores = rouge.score(reference, hypothesis)

    # Calculate BLEU score
    reference_tokens = [nltk.word_tokenize(reference)]
    hypothesis_tokens = nltk.word_tokenize(hypothesis)
    bleu_score = sentence_bleu(reference_tokens, hypothesis_tokens, smoothing_function=bleu_smoothing)

    return rouge_scores, bleu_score, int(reference==hypothesis)

# Evaluate translations and calculate scores
batch = next(iter(val_dataset))
translated_text = translate(batch)
reference_text = tokenizer.decode(batch[1][0], skip_special_tokens=True)
rouge_scores, bleu_score, exact_match = calculate_scores(reference_text, translated_text)
print(f"Reference: {reference_text}")
print(f"Answer: {translated_text}")
print(f"ROUGE Scores: {rouge_scores}")
print(f"BLEU Score: {bleu_score}")
print(f"Exact match Score: {exact_match}")
print()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mbila\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Reference: Zully Broussard decided to give a kidney to a stranger. A new computer program helped her donation spur transplants for six kidney patients.
Answer: Zully Broussard's generosity will be multiplied.
ROUGE Scores: {'rouge1': Score(precision=0.2857142857142857, recall=0.08695652173913043, fmeasure=0.13333333333333333), 'rouge2': Score(precision=0.16666666666666666, recall=0.045454545454545456, fmeasure=0.07142857142857144), 'rougeL': Score(precision=0.2857142857142857, recall=0.08695652173913043, fmeasure=0.13333333333333333)}
BLEU Score: 0.009414409352939208
Exact match Score: 0



In [13]:
iterator = iter(val_dataset)
batch = next(iterator)
batch = next(iterator)
translated_text = translate(batch)
reference_text = tokenizer.decode(batch[1][0], skip_special_tokens=True)
rouge_scores, bleu_score, exact_match = calculate_scores(reference_text, translated_text)
print(f"Reference: {reference_text}")
print(f"Answer: {translated_text}")
print(f"ROUGE Scores: {rouge_scores}")
print(f"BLEU Score: {bleu_score}")
print(f"Exact match Score: {exact_match}")
print()

Reference: A jihadist group claims responsibility in an audio recording, news agency reports. The Malian government calls the shooting a "terrorist act" One French citizen, one Belgian and three Malians are killed.
Answer: Bamako's retaliation for the killing of a leader.
ROUGE Scores: {'rouge1': Score(precision=0.3333333333333333, recall=0.0967741935483871, fmeasure=0.15000000000000002), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.2222222222222222, recall=0.06451612903225806, fmeasure=0.09999999999999999)}
BLEU Score: 0.0020747551353380595
Exact match Score: 0



In [None]:
# def tokenize_function(examples):
#     inputs = tokenizer(examples['article'], max_length=512, truncation=True, padding='max_length', return_tensors='tf')
#     targets = tokenizer(examples['highlights'], max_length=128, truncation=True, padding='max_length', return_tensors='tf')

#     inputs['labels'] = targets['input_ids']
#     inputs['decoder_input_ids'] = targets['input_ids']
#     return inputs

# train_dataset = small_train_dataset.map(tokenize_function, batched=True, remove_columns=["id"])
# val_dataset = small_validation_dataset.map(tokenize_function, batched=True, remove_columns=["id"])

# def convert_to_tf_dataset(dataset):
#     input_ids = tf.convert_to_tensor(dataset["input_ids"], dtype=tf.int32)
#     attention_mask = tf.convert_to_tensor(dataset["attention_mask"], dtype=tf.int32)
#     decoder_input_ids = tf.convert_to_tensor(dataset["decoder_input_ids"], dtype=tf.int32)
#     labels = tf.convert_to_tensor(dataset["labels"], dtype=tf.int32)
#     return tf.data.Dataset.from_tensor_slices(({"input_ids": input_ids, "attention_mask": attention_mask, "decoder_input_ids": decoder_input_ids}, labels)).batch(32)

# train_data = convert_to_tf_dataset(train_dataset)
# val_data = convert_to_tf_dataset(val_dataset)

In [None]:
# import keras
# from keras.layers import Dense
# from keras.models import Sequential
# from keras.layers import Input

# class LoraLayer(keras.layers.Layer):
#     def __init__(self, original_layer, rank=8, num_heads=1, dim=1, trainable=False, **kwargs):
#         original_layer_config = original_layer.get_config()
#         name = original_layer_config["name"]
#         kwargs.pop("name", None)
#         super().__init__(name=name, trainable=trainable, **kwargs)
#         self.rank = rank
#         self.original_layer = original_layer
#         self.original_layer.trainable = False
#         self.A = keras.layers.Dense(units=rank, use_bias=False, trainable=trainable, name=f"lora_A")
#         self.B = keras.layers.Dense(units=dim, use_bias=False, trainable=trainable, name=f"lora_B")

#     def call(self, inputs):
#         original_output = self.original_layer(inputs)
#         if self.trainable:
#             lora_output = self.B(self.A(inputs))
#             return original_output + lora_output
#         return original_output

# import transformers
# from tf_keras.src.layers.core.dense import Dense as NDense

# def replace_dense_with_lora(layer, rank=8):
#     if isinstance(layer, NDense):
#         return LoraLayer(original_layer=layer, rank=rank)
#     return layer

# def modify_t5_layers(t5, rank=8):
#     for sub_layer in t5.encoder.submodules:
#         if isinstance(sub_layer, transformers.models.t5.modeling_tf_t5.TFT5Attention):
#             sub_layer.k = replace_dense_with_lora(sub_layer.k, rank)
#             sub_layer.v = replace_dense_with_lora(sub_layer.v, rank)
#             sub_layer.q = replace_dense_with_lora(sub_layer.q, rank)
#             sub_layer.o = replace_dense_with_lora(sub_layer.o, rank)
#     for sub_layer in t5.decoder.submodules:
#         if isinstance(sub_layer, transformers.models.t5.modeling_tf_t5.TFT5Attention):
#             sub_layer.k = replace_dense_with_lora(sub_layerb.k, rank)
#             sub_layer.v = replace_dense_with_lora(sub_layer.v, rank)
#             sub_layer.q = replace_dense_with_lora(sub_layer.q, rank)
#             sub_layer.o = replace_dense_with_lora(sub_layer.o, rank)
#     return t5

# modified_t5_model = modify_t5_layers(t5_model, rank=4)

# for layer in modified_t5_model._flatten_layers():
#     if (layer.__class__.__module__.startswith('tf_keras') or layer.__class__.__module__.startswith('keras')) and not layer.name.startswith("lora"):
#         layer.trainable = False
#     elif layer.name.startswith("lora"):
#         layer.trainable = True
#     elif layer.name == 'shared':
#         layer.trainable = False

In [None]:
# import gc
# from tf_keras import backend as K

# def clear_gpu_memory():
#     gc.collect()
#     tf.keras.backend.clear_session()
#     K.clear_session()
#     tf.compat.v1.reset_default_graph()

# clear_gpu_memory()

# from tf_keras import mixed_precision

# mixed_precision.set_global_policy('mixed_float16')
# tf.config.run_functions_eagerly(True)
# modified_t5_model.compile(optimizer='adam', loss=t5_model.hf_compute_loss)

# modified_t5_model.fit(train_data, epochs=1, validation_data=val_data)

In [None]:
# !pip install rouge_score

# from datasets import load_metric

# rouge = load_metric("rouge", trust_remote_code=True)
# num_samples = 1000

# def generate_summaries(dataset, num_samples):
#     inputs = tokenizer([ex['article'] for ex in dataset.take(num_samples)], return_tensors='tf', padding=True, truncation=True, max_length=512)
#     summaries = modified_t5_model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=128, num_beams=5)
#     decoded_summaries = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summaries]
#     return decoded_summaries

# val_summaries = [ex['highlights'] for ex in val_dataset.take(num_samples)]
# generated_summaries = generate_summaries(val_dataset, num_samples=num_samples)
# results = rouge.compute(predictions=generated_summaries, references=val_summaries)
# print(results)