# LoRA Fine-Tuning with FLAN-T5

## Install, import and setup

In [1]:
%pip install transformers torch datasets evaluate nltk



In [2]:
%pip install --upgrade datasets fsspec

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [3]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from tqdm.auto import tqdm
import numpy as np
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [4]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Report the choice, plus the name of GPU 0 when one is present.
print(f"Using device: {device}")
print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A'}")

Using device: cuda
GPU: Tesla T4


## Load and Preprocess the Dataset

In [5]:
# Load the 'de-en' (German-English) configuration of the WMT16 dataset.
dataset = load_dataset('wmt16', 'de-en')

# Display the first training example to show the record layout:
# a 'translation' dict holding one 'de' and one 'en' string.
print(dataset['train'][0])

{'translation': {'de': 'Wiederaufnahme der Sitzungsperiode', 'en': 'Resumption of the session'}}


In [6]:
# Load the tokenizer published with the 'google/flan-t5-small' checkpoint
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-small')

# Fall back to the EOS token for padding when the tokenizer defines no pad token
# (the 'max_length' padding used during preprocessing needs one).
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [7]:
def preprocess_data(examples):
    """Tokenize a batch of English -> German pairs for seq2seq training.

    Intended for ``Dataset.map(..., batched=True)``. Relies on the
    module-level ``tokenizer``.

    Args:
        examples: Batch dict whose ``'translation'`` entry is a list of dicts
            with an ``'en'`` (source) and a ``'de'`` (target) string.

    Returns:
        The tokenizer output for the prompts (``input_ids``,
        ``attention_mask``) extended with:

        * ``labels``: target token ids with padding positions set to -100,
        * ``decoder_input_ids``: the target token ids shifted one position to
          the right, with the pad token in the first position.

        All sequences are padded/truncated to 256 tokens.
    """
    inputs = [f'Translate English to German: {example["en"]}' for example in examples['translation']]
    targets = [example['de'] for example in examples['translation']]

    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding='max_length', return_tensors='pt')

    # Tokenize the targets once; labels and decoder inputs are both derived
    # from this single tensor instead of running the tokenizer twice.
    target_ids = tokenizer(targets, max_length=256, truncation=True, padding='max_length', return_tensors='pt').input_ids

    # Teacher-forcing inputs must be the targets shifted right by one step.
    # Feeding the unshifted targets would show the decoder the very token it
    # is asked to predict at each position. T5-family models use the pad
    # token as the decoder start token, so it fills the first position.
    decoder_input_ids = target_ids.new_full(target_ids.shape, tokenizer.pad_token_id)
    decoder_input_ids[:, 1:] = target_ids[:, :-1]

    # Padding positions are set to -100 so that they are ignored by the loss.
    labels = target_ids.clone()
    labels[labels == tokenizer.pad_token_id] = -100

    model_inputs['labels'] = labels
    model_inputs["decoder_input_ids"] = decoder_input_ids

    return model_inputs

# Work on a subset only: the first 50,000 training pairs and the first 2,000
# test pairs, each tokenized in batches by preprocess_data.
train_dataset = dataset['train'].select(range(50000)).map(preprocess_data, batched=True)
test_dataset = dataset['test'].select(range(2000)).map(preprocess_data, batched=True)

# Expose only the model-facing columns, returned as torch tensors.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decoder_input_ids', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decoder_input_ids', 'labels'])

# Create DataLoaders with a batch size of 4; only the training split is
# shuffled so that evaluation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=4, shuffle=False)

## Load the Pre-trained FLAN-T5 Model and Add a LoRA Adapter

In [16]:
# Load the pre-trained 'google/flan-t5-small' sequence-to-sequence checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained('google/flan-t5-small')

# Custom LoRA Layer Implementation
class LoRALayer(nn.Module):
    """Wrap a frozen linear layer and add a trainable low-rank update.

    The forward pass returns ``original_layer(x) + (x @ lora_A) @ lora_B``.
    ``lora_B`` starts at zero, so a freshly wrapped layer produces exactly the
    output of the original layer.

    Args:
        original_layer: Layer to wrap. It must expose ``in_features``,
            ``out_features`` and ``weight`` (as ``nn.Linear`` does). All of
            its parameters are frozen.
        rank: Inner dimension of the low-rank factors; must be positive.

    Raises:
        ValueError: If ``rank`` is not positive.
    """

    def __init__(self, original_layer, rank=8):
        super().__init__()
        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank}")

        self.original_layer = original_layer
        self.rank = rank

        # Get dimensions from original layer
        in_features = original_layer.in_features
        out_features = original_layer.out_features

        # Create the LoRA matrices on the same device and in the same dtype as
        # the wrapped weight, so the layer also works when the base layer has
        # already been moved to a GPU or cast to another precision.
        base_weight = original_layer.weight
        factory = {'device': base_weight.device, 'dtype': base_weight.dtype}
        self.lora_A = nn.Parameter(torch.randn(in_features, rank, **factory) * (1.0 / rank))
        self.lora_B = nn.Parameter(torch.zeros(rank, out_features, **factory))

        # Mirror the attributes of the original layer so that code which reads
        # them from the wrapped module keeps working.
        self.in_features = in_features
        self.out_features = out_features
        self.weight = base_weight  # Same Parameter object as original_layer.weight
        self.bias = getattr(original_layer, 'bias', None)

        # Freeze original layer; only lora_A and lora_B remain trainable
        for param in self.original_layer.parameters():
            param.requires_grad = False

    def forward(self, x):
        """Return the frozen layer's output plus the low-rank correction."""
        # Original output
        original_output = self.original_layer(x)
        # LoRA output: project down to `rank` dimensions, then up to out_features
        lora_output = torch.matmul(torch.matmul(x, self.lora_A), self.lora_B)
        return original_output + lora_output

# Function to apply LoRA - MINIMAL SAFE VERSION
def apply_lora_to_model(model, rank=8):
    """Freeze `model` and wrap its `lm_head` in a LoRALayer, in place.

    Every parameter of `model` gets ``requires_grad = False``. When
    ``model.lm_head`` exists and is an ``nn.Linear`` it is replaced by
    ``LoRALayer(model.lm_head, rank=rank)``; otherwise a warning is printed
    and the model is left frozen without any adapter.

    Args:
        model: Module to modify in place.
        rank: Rank forwarded to LoRALayer.

    Returns:
        None.
    """
    # Nothing in the base model is trained.
    for weight in model.parameters():
        weight.requires_grad = False

    print("Applying LoRA to model layers...")

    # Only the output projection is adapted.
    head = getattr(model, 'lm_head', None)
    if not isinstance(head, nn.Linear):
        print("Warning: lm_head not found or not a Linear layer")
    else:
        print("Applying LoRA to lm_head")
        model.lm_head = LoRALayer(head, rank=rank)
        print("LoRA applied successfully to lm_head")

    print("LoRA application completed!")

# Apply LoRA to the model with rank 8 (the same value as the function's default)
apply_lora_to_model(model, rank=8)

# Move the model to the selected device; this happens after LoRA is applied,
# so the newly created LoRA parameters are moved as well.
model = model.to(device)

Applying LoRA to model layers...
Applying LoRA to lm_head
LoRA applied successfully to lm_head
LoRA application completed!


## Count Trainable and Non-Trainable Parameters

In [17]:
def count_parameters(model):
    """Count the parameter elements of `model`, split by trainability.

    Args:
        model: Object whose ``parameters()`` yields tensors exposing
            ``requires_grad`` and ``numel()``.

    Returns:
        Tuple ``(trainable, non_trainable)`` of element counts.
    """
    trainable, frozen = 0, 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
        else:
            frozen += param.numel()
    return trainable, frozen

# Count the parameters that will and will not be updated during training.
trainable_params, non_trainable_params = count_parameters(model)

# Report both counts and the trainable share of the total, as a percentage.
print(f'Trainable parameters: {trainable_params:,}')
print(f'Non-trainable parameters: {non_trainable_params:,}')
print(f'Percentage of trainable parameters: {100 * trainable_params / (trainable_params + non_trainable_params):.2f}%')

Trainable parameters: 261,120
Non-trainable parameters: 76,961,152
Percentage of trainable parameters: 0.34%


## Training Setup

In [18]:
# Adam with learning rate 5e-4 over the full parameter list.
# NOTE(review): frozen parameters receive no gradients, so presumably only the
# LoRA matrices are actually updated — confirm.
optimizer = Adam(model.parameters(), lr=5e-4)

# Training function
def train_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over `dataloader`.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Iterable of batch dicts holding those three tensors.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses.

    Raises:
        ZeroDivisionError: If `dataloader` yields no batches.
    """
    model.train()
    loss_sum = 0
    batches_seen = 0

    for batch in tqdm(dataloader, desc="Training"):
        # Only these three fields are fed to the model; each is moved to `device`.
        model_kwargs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
            'labels': batch['labels'].to(device),
        }
        loss = model(**model_kwargs).loss

        # Reset stale gradients, back-propagate, clip the global gradient norm
        # to 1.0, then apply the update.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        loss_sum += loss.item()
        batches_seen += 1

    return loss_sum / batches_seen

# Evaluation function
def evaluate_model(model, dataloader, device):
    """Compute the mean per-batch loss of `model` over `dataloader`.

    The model is switched to eval mode and gradients are disabled.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Iterable of batch dicts holding those three tensors.
        device: Device the batch tensors are moved to.

    Returns:
        The mean of the per-batch losses.

    Raises:
        ZeroDivisionError: If `dataloader` yields no batches.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=batch['labels'].to(device),
            )
            batch_losses.append(outputs.loss.item())

    return sum(batch_losses) / len(batch_losses)

## Train the Model

In [None]:
# Number of full passes over the training DataLoader.
epochs = 3
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Train for one epoch and report the mean per-batch training loss
    train_loss = train_epoch(model, train_dataloader, optimizer, device)
    print(f"Training Loss: {train_loss:.4f}")

    # Evaluate every epoch; note that the "validation" loss is computed on
    # test_dataloader, i.e. the test subset.
    eval_loss = evaluate_model(model, test_dataloader, device)
    print(f"Validation Loss: {eval_loss:.4f}")

print("Training completed!")

## Evaluate the Model

In [None]:
# Mean per-batch loss on the test subset for the model in its current state.
final_eval_loss = evaluate_model(model, test_dataloader, device)
print(f'Final evaluation loss: {final_eval_loss:.4f}')

## Verify Translations with BLEU

In [24]:
# Download required NLTK data.
# Both tokenizer packages are handled the same way: each is looked up locally
# first and downloaded only when missing, so re-running this cell does not
# contact the network once the data is installed.
for package in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{package}')
    except LookupError:
        nltk.download(package)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [25]:
def calculate_bleu(reference, hypothesis):
    """Score one hypothesis against one reference with sentence-level BLEU.

    Both strings are stripped and lower-cased, tokenized with
    ``nltk.word_tokenize`` and scored with ``sentence_bleu`` using
    ``SmoothingFunction().method4``.

    Args:
        reference: Reference translation.
        hypothesis: Candidate translation.

    Returns:
        The BLEU score, or 0.0 when either text is empty after stripping,
        when the hypothesis has no tokens, or when any exception occurs
        (the exception is printed, not raised).
    """
    try:
        # Normalise first so that whitespace-only input counts as empty.
        ref_text = reference.strip().lower()
        hyp_text = hypothesis.strip().lower()
        if not (ref_text and hyp_text):
            return 0.0

        # sentence_bleu expects a list of tokenized references.
        ref_token_lists = [nltk.word_tokenize(ref_text)]
        hyp_tokens = nltk.word_tokenize(hyp_text)
        if not hyp_tokens:
            return 0.0

        return sentence_bleu(
            ref_token_lists,
            hyp_tokens,
            smoothing_function=SmoothingFunction().method4,
        )
    except Exception as e:
        print(f"BLEU calculation error: {e}")
        return 0.0

# Function to translate and evaluate
def translate_and_evaluate(model, dataloader, tokenizer, device, max_samples=50):
    """Generate translations and score them against the references with BLEU.

    Batches are consumed until `max_samples` examples have been scored. The
    first ten examples are printed with their input, reference, hypothesis
    and score.

    Args:
        model: Model exposing ``eval()`` and ``generate(...)``.
        dataloader: Iterable of batch dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        tokenizer: Tokenizer whose ``decode`` turns token ids into text.
        device: Device the batch tensors are moved to.
        max_samples: Maximum number of examples to score.

    Returns:
        List with one BLEU score per scored example.
    """
    model.eval()
    bleu_scores = []
    sample_count = 0

    # Decoding settings: 4-beam search without sampling, with repetition controls.
    decoding_options = {
        'max_length': 256,
        'num_beams': 4,
        'early_stopping': True,
        'do_sample': False,
        'no_repeat_ngram_size': 2,
        'temperature': 1.0,
        'repetition_penalty': 1.2,
    }

    with torch.no_grad():
        for batch in dataloader:
            if sample_count >= max_samples:
                break

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **decoding_options,
            )

            # Walk through the batch row by row, stopping once the quota is reached.
            for source_row, label_row, generated_row in zip(input_ids, labels, outputs):
                if sample_count >= max_samples:
                    break

                # Drop the -100 entries before decoding the reference.
                label_row = label_row.cpu()
                reference = tokenizer.decode(label_row[label_row != -100], skip_special_tokens=True)
                hypothesis = tokenizer.decode(generated_row, skip_special_tokens=True)

                bleu_score = calculate_bleu(reference, hypothesis)
                bleu_scores.append(bleu_score)

                # Show the first ten examples for manual inspection.
                if sample_count < 10:
                    input_text = tokenizer.decode(source_row, skip_special_tokens=True)
                    print(f"\nExample {sample_count + 1}:")
                    print(f"Input: {input_text}")
                    print(f"Reference: {reference}")
                    print(f"Hypothesis: {hypothesis}")
                    print(f"BLEU Score: {bleu_score:.4f}")

                sample_count += 1

    return bleu_scores

In [26]:
print("\nEvaluating translations with BLEU scores...")
# Score at most 50 examples taken from the start of the (unshuffled) test DataLoader.
bleu_scores = translate_and_evaluate(model, test_dataloader, tokenizer, device, max_samples=50)

# Summarise the per-sentence scores: mean, count, extremes, and how many exceed 0.1.
# The "test set" average below covers only the scored samples, not the whole split.
if bleu_scores:
    average_bleu_score = sum(bleu_scores) / len(bleu_scores)
    print(f'\nAverage BLEU score on test set: {average_bleu_score:.4f}')
    print(f'Number of samples evaluated: {len(bleu_scores)}')
    print(f'Best BLEU score: {max(bleu_scores):.4f}')
    print(f'Worst BLEU score: {min(bleu_scores):.4f}')
    print(f'BLEU scores > 0.1: {sum(1 for score in bleu_scores if score > 0.1)}')
else:
    print("No BLEU scores calculated.")


Evaluating translations with BLEU scores...

Example 1:
Input: Translate English to German: Obama receives Netanyahu
Reference: Obama empfängt Netanyahu
Hypothesis: Abgeordnete folgt von Lissabon Israel.
BLEU Score: 0.0000

Example 2:
Input: Translate English to German: The relationship between Obama and Netanyahu is not exactly friendly.
Reference: Das Verhältnis zwischen Obama und Netanyahu ist nicht gerade freundschaftlich.
Hypothesis: Die Beziehung zwischen Abgeordneten und Abschließend ist nicht nur freundlich.
BLEU Score: 0.0653

Example 3:
Input: Translate English to German: The two wanted to talk about the implementation of the international agreement and about Teheran's destabilising activities in the Middle East.
Reference: Die beiden wollten über die Umsetzung der internationalen Vereinbarung sowie über Teherans destabilisierende Maßnahmen im Nahen Osten sprechen.
Hypothesis: Die beiden wollten sich über die Umsetzung der internationalen Vereinbarung und über den Verhandlun