<a href="https://colab.research.google.com/github/mabench-tuc/LoRA-of-LLMs/blob/main/Gpt_2_FT_with_LoRA_on_E2E_NLG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup and Installation

In [None]:
#!pip install git+https://github.com/microsoft/LoRA
!pip install -qU bitsandbytes datasets accelerate loralib transformers peft trl
!pip install datasets
!pip install -U sacrebleu evaluate rouge-score

## Model Loading
Here we load the model with its weights, the tokenizer and the dataset

In [None]:
import os

# Select the visible GPU before torch is imported and before any CUDA query runs,
# so the setting is in place when CUDA is first initialised.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, AutoModelForSeq2SeqLM, TrainingArguments
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForSequenceClassification

# Last expression of the cell, so the result is displayed as the cell output.
torch.cuda.is_available()

### Load the GPT-2 Large model

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Fetch the pretrained GPT-2 Large weights, place them on `device`, then load the tokenizer
print("Loading gpt2-large model...")
gpt2_large_model = AutoModelForCausalLM.from_pretrained("gpt2-large")
gpt2_large_model = gpt2_large_model.to(device)

gpt2_large_tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
print("Successfully loaded gpt2-large model.")


In [None]:
# Short names used by the rest of the notebook. These are plain name bindings,
# not copies: `model` and `gpt2_large_model` refer to the same object.
model=gpt2_large_model
tokenizer= gpt2_large_tokenizer

In [None]:
# Print the module structure of the loaded model
print(model)

## Post-processing on the model
### Freezing the original weights
We need to apply some post-processing to the model to enable training: let's freeze all the layers and cast the layer-norm parameters to float32 for stability.

In [None]:
# Freeze every pretrained weight; the adapters are added and trained later
for weight in model.parameters():
  weight.requires_grad = False
  is_vector = weight.ndim == 1
  if is_vector:
    # small 1-D parameters (e.g. layernorm) are cast to fp32 for stability
    weight.data = weight.data.to(torch.float32)

# Gradient checkpointing reduces the number of stored activations
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  """`nn.Sequential` whose forward output is converted to `torch.float32`."""

  def forward(self, x):
    result = super().forward(x)
    return result.to(torch.float32)
# Wrap the LM head so that its output is returned as float32
model.lm_head = CastOutputToFloat(model.lm_head)

### Display Trainable Parameters

In [None]:
def print_trainable_parameters(model):
    """Print the number of trainable parameters in the model.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, e.g. a ``torch.nn.Module``.

    Prints a single line with the trainable parameter count, the total
    parameter count and the trainable percentage. A model without any
    parameters is reported as 0.0% instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the division: a parameter-free model has nothing to train.
    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percentage}"
    )

## Parameter-Efficient Fine-Tuning
### Set up the LoRA Adapter
Here comes the magic with peft! Let's create a PeftModel and specify that we are going to use low-rank adapters (LoRA), using the `get_peft_model` utility function from peft.

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter configuration for GPT-2
lora_config = LoraConfig(
    r=4,                         # rank of the low-rank update matrices
    lora_alpha=32,               # LoRA scaling factor
    target_modules=["c_attn"],   # GPT-2's attention projection module
    #target_modules=["q_proj", "v_proj"],
    fan_in_fan_out=True,         # c_attn is a Conv1D layer that stores its weight as (fan_in, fan_out)
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

## target_modules="v_proj": in models that expose separate attention projections, this is the value projection layer. It transforms
# input tokens into value vectors, which are the values that get combined according to the attention scores computed from the query and key vectors.

## target_modules="q_proj": this is the query projection layer of a transformer-based model. It transforms input tokens into
# query vectors, which are used to attend to the other tokens in the sequence during the self-attention mechanism.

# c_attn: GPT-2 has no separate q_proj / v_proj modules. Its single c_attn layer (a Conv1D module that acts as a linear projection)
# computes the query, key and value projections together, so targeting c_attn adapts all three at once.

### Display trainable parameters

In [None]:
# Wrap the frozen base model with the adapters described by `lora_config`,
# then report how many parameters are trainable.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

## Load Dataset

We can simply load our dataset from 🤗 Hugging Face with the `load_dataset` method!

In [None]:
from datasets import load_dataset

# Text Generation dataset (E2E NLG Challenge), loaded from the Hugging Face Hub as "GEM/e2e_nlg"
dataset = load_dataset("GEM/e2e_nlg")

### Tokenization of the dataset

In [None]:
# GPT-2 needs a padding token: reuse the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

def _tokenize_to_longest(batch):
    # Pad each mapped batch up to its longest sequence instead of a fixed 512
    return tokenizer(batch["meaning_representation"], truncation=True, padding="longest")

tokenized_datasets = dataset.map(_tokenize_to_longest, batched=True)

# Show the first tokenized training example
first_example = tokenized_datasets["train"][0]
print(first_example)

In [None]:
# GPT-2 does not use padding by default, so <|endoftext|> doubles as the padding token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize the `meaning_representation` column into fixed-length sequences of 512 tokens."""
    encode_options = dict(
        max_length=512,          # upper bound on the sequence length
        truncation=True,         # cut sequences longer than 512 tokens
        padding="max_length",    # pad shorter sequences up to 512 tokens
    )
    return tokenizer(examples["meaning_representation"], **encode_options)

# Tokenize the dataset with the fixed-length settings of `tokenize_function`
# (this rebinds `tokenized_datasets`)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Display an example of the tokenized dataset
print(tokenized_datasets["train"][0])

In [None]:
# Show the keys (split names) of the tokenized dataset
tokenized_datasets.keys()

We create a smaller subset of the full dataset to fine-tune our model

In [None]:
# Fixed-seed random subsets of each split
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(15000))
# The test subset has its own name so that the validation subset assigned to
# `small_eval_dataset` below does not overwrite it.
small_test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(700))
small_eval_dataset = tokenized_datasets["validation"].shuffle(seed=42).select(range(1400))

## Training Process

In [None]:
#Import the necessary modules from the transformers library
import transformers
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

### Train LoRA Adapter

In [None]:
# Hyperparameters taken from the LoRA paper's settings for GPT-2 Medium
# Training Arguments
training_args = TrainingArguments(
    output_dir="./output_lora_gpt2",  # Directory for saving the model
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=500,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="./logs_lora_gpt2",  # Directory for logging
    logging_steps=10,
    save_total_limit=2,  # Keep only 2 model checkpoints
    load_best_model_at_end=True,
    report_to="none",  # Disable reporting to WandB or other loggers
    # Mixed precision needs a GPU; fall back to full precision when none is
    # available so that the CPU path selected for `device` stays usable.
    fp16=torch.cuda.is_available(),
    #bf16=True

)

# Define a custom data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False  # Causal LM does not use Masked Language Modeling (MLM)
)

In [None]:
from trl import SFTTrainer

In [None]:
# Initialize SFTTrainer.
# `model` has already been wrapped with the LoRA adapters by `get_peft_model`, so it
# is passed as-is. Supplying `peft_config` as well would ask the trainer to apply an
# adapter configuration to a model that is already a PEFT model.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    args=training_args
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)

### Pushing the Model to the Hub

In [None]:
# Hugging Face user (or organisation) name. It is used as the namespace of the
# repository id when the model is pushed, so fill it in before pushing.
HUGGING_FACE_USER_NAME = ""
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from the notebook
notebook_login()

In [None]:
model_name = "gpt-2-Large-lora"

# An empty user name would produce the invalid repository id "/gpt-2-Large-lora".
if not HUGGING_FACE_USER_NAME:
    raise ValueError("Set HUGGING_FACE_USER_NAME before pushing the model to the Hub.")

# `token=True` uses the stored login credentials; it replaces the deprecated
# `use_auth_token` argument.
model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", token=True)

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt-2-Large-lora"
peft_model_id = f"{HUGGING_FACE_USER_NAME}/{model_name}"

# Read the adapter configuration and point it at the GPT-2 Large base checkpoint
config = PeftConfig.from_pretrained(peft_model_id)
config.base_model_name_or_path = "gpt2-large"
# Reload the base model and tokenizer. The deprecated `load_in_8bit` keyword is
# dropped: it was set to False, i.e. no 8-bit quantization was requested.
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)


In [None]:
# Load the LoRA adapter stored under `peft_model_id` on top of the reloaded base model
lora_model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
# Print the module structure of the model with the adapter attached
print(lora_model)

## Memory Check

In [None]:
!pip install nvidia-ml-py3
!pip install pynvml
import pynvml

In [None]:
# Show the GPU status reported by the NVIDIA driver tool
!nvidia-smi

In [None]:
def print_gpu_memory():
    """Print allocated and reserved CUDA memory (in MB) and the GPU utilization.

    When CUDA is not available a short notice is printed instead, because the
    utilization query cannot be answered without a GPU.
    """
    if not torch.cuda.is_available():
        print("CUDA is not available; no GPU memory statistics to report.")
        return

    bytes_per_mb = 1024**2
    print(f"Allocated memory: {torch.cuda.memory_allocated() / bytes_per_mb:.2f} MB")
    print(f"Cached memory: {torch.cuda.memory_reserved() / bytes_per_mb:.2f} MB")
    print(f"GPU utilization: {torch.cuda.utilization()}%")

In [None]:
# Report the current GPU memory usage
print_gpu_memory()

In [None]:
# Empty the CUDA cache, then report the memory usage again
torch.cuda.empty_cache()
print("\nAfter emptying cache:")
print_gpu_memory()
print(f"Using device: {device}")

## Benchmark on E2E NLG

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [None]:
# Reuse the LoRA model that was already loaded and switch it to evaluation mode.
# Loading the adapter onto `model` a second time is unnecessary: the earlier
# `PeftModel.from_pretrained(model, peft_model_id)` call already attached it.
lora_model = lora_model.eval()

In [None]:
# Load the E2E NLG dataset used for the benchmark. This rebinds `dataset`,
# which previously held the "GEM/e2e_nlg" dataset used for training.
dataset = load_dataset("e2e_nlg")  # Automatically downloads the dataset

In [None]:

# GPT-2 does not pad by default; <|endoftext|> is reused as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function for the benchmark dataset
def tokenize_function(examples):
    """Encode the `meaning_representation` column, truncated and padded to exactly 512 tokens."""
    meaning_representations = examples["meaning_representation"]
    return tokenizer(
        meaning_representations,
        padding="max_length",  # pad sequences shorter than 512 tokens
        truncation=True,       # truncate sequences longer than 512 tokens
        max_length=512,
    )

# Tokenize the benchmark dataset with `tokenize_function`
tokenized_e2e_dataset = dataset.map(tokenize_function, batched=True)

# Display an example of the tokenized dataset
print(tokenized_e2e_dataset["train"][0])

In [None]:
# Display a summary of the tokenized benchmark dataset
tokenized_e2e_dataset

In [None]:
#small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# Fixed-seed random subsets for the benchmark: 700 test and 500 validation examples.
# Note that `small_eval_dataset` is rebound here to the benchmark test subset.
small_eval_dataset = tokenized_e2e_dataset["test"].shuffle(seed=42).select(range(700))
small_val_dataset = tokenized_e2e_dataset["validation"].shuffle(seed=42).select(range(500))

In [None]:
def preprocess_data(example):
    """Expose the meaning representation and the human reference under the keys used for evaluation.

    Returns a dict with `input_text` (the meaning representation) and
    `target_text` (the human reference); nothing is concatenated.
    """
    source_text = example["meaning_representation"]
    reference_text = example["human_reference"]
    return dict(input_text=source_text, target_text=reference_text)
# Preprocess the dataset
#processed_data= tokenized_e2e_dataset.map(preprocess_data)

# Build the evaluation inputs for the validation and test subsets in one dictionary
processed_data = {
    "validation": small_val_dataset.map(preprocess_data),
    "test": small_eval_dataset.map(preprocess_data),
}


def tokenize_function(examples):
    """Tokenize `meaning_representation` to 512-token padded sequences returned as PyTorch tensors."""
    tokenizer_kwargs = {
        "max_length": 512,
        "truncation": True,
        "padding": "max_length",
        "return_tensors": "pt",
    }
    return tokenizer(examples["meaning_representation"], **tokenizer_kwargs)
# Evaluation function
def evaluate_model(model, tokenizer, dataset):
    """Generate a prediction for every example and score it against its reference.

    Args:
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
        dataset: Iterable of examples providing ``"input_text"`` (the prompt)
            and ``"target_text"`` (the reference).

    Returns:
        A tuple ``(avg_bleu, avg_rouge)``: the mean smoothed sentence-level BLEU
        score, and a dict with the mean F-measure under the keys ``"rouge1"``,
        ``"rouge2"`` and ``"rougeL"``.

    Raises:
        ValueError: If ``dataset`` contains no examples.
    """
    # Fail fast on an empty dataset instead of dividing by zero at the end.
    if hasattr(dataset, "__len__") and len(dataset) == 0:
        raise ValueError("Cannot evaluate on an empty dataset.")

    smoothing = SmoothingFunction().method1
    bleu_scores = []
    rouge_scores = []

    # Initialize ROUGE scorer
    rouge_scorer_instance = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)

    for example in dataset:
        input_text = example["input_text"]
        target_text = example["target_text"]

        # Tokenize the prompt once and reuse both the ids and the attention mask
        encoded = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt").to(model.device)
        input_ids = encoded.input_ids
        with torch.no_grad():
            output_ids = model.generate(
                input_ids,
                attention_mask=encoded.attention_mask,
                max_length=100,
                num_beams=5,
                early_stopping=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # A causal LM returns the prompt followed by the continuation. Score only
        # the newly generated tokens so the echoed prompt does not enter the metrics.
        generated_ids = output_ids[0][input_ids.shape[1]:]
        prediction = tokenizer.decode(generated_ids, skip_special_tokens=True)

        # Compute BLEU
        bleu_score = sentence_bleu(
            [target_text.split()], prediction.split(), smoothing_function=smoothing
        )
        bleu_scores.append(bleu_score)

        # Compute ROUGE
        rouge = rouge_scorer_instance.score(target_text, prediction)
        rouge_scores.append({
            "rouge1": rouge["rouge1"].fmeasure,
            "rouge2": rouge["rouge2"].fmeasure,
            "rougeL": rouge["rougeL"].fmeasure,
        })

    # Covers iterables without a length that turned out to be empty
    if not bleu_scores:
        raise ValueError("Cannot evaluate on an empty dataset.")

    # Calculate average metrics
    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge = {
        key: sum(scores[key] for scores in rouge_scores) / len(rouge_scores)
        for key in ("rouge1", "rouge2", "rougeL")
    }

    return avg_bleu, avg_rouge

# Run the benchmark on the validation subset
print("Evaluating the model...")
validation_data = processed_data["validation"]
avg_bleu, avg_rouge = evaluate_model(model=lora_model, tokenizer=tokenizer, dataset=validation_data)

# Report the averaged metrics, one per line
report_lines = [
    "\nEvaluation Results:",
    f"Average BLEU: {avg_bleu:.4f}",
    f"Average ROUGE-1: {avg_rouge['rouge1']:.4f}",
    f"Average ROUGE-2: {avg_rouge['rouge2']:.4f}",
    f"Average ROUGE-L: {avg_rouge['rougeL']:.4f}",
]
for report_line in report_lines:
    print(report_line)

## Perform Inference

### Preprocess the input text

In [None]:
# Use the GPU when available, otherwise the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Prompt
prompt = "Once upon a time,"

# Tokenize the input text into PyTorch tensors and move them to `device`
inputs = tokenizer(prompt, return_tensors="pt").to(device)
# Display the encoded inputs
inputs

### Inference

In [None]:
# Single forward pass of the LoRA model without gradient tracking
with torch.no_grad():
    outputs = lora_model(**inputs)

# Display the raw model outputs
outputs

In [None]:
# Generate a continuation of the prompt with the LoRA model
generation_options = dict(
    attention_mask=inputs.attention_mask,
    pad_token_id=tokenizer.eos_token_id,
    max_length=50,
    num_return_sequences=1,
)
output_ids = lora_model.generate(inputs.input_ids, **generation_options)

output_ids

### Post-process the output

In [None]:
# Decode the first generated sequence back to text, dropping special tokens
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(generated_text)

### Inference of GPT-2 Large Model

In [None]:
# Generate text with the pretrained GPT-2 Large weights.
# `gpt2_large_model` is the same object that was wrapped with the LoRA adapters and
# fine-tuned earlier in the notebook, so it no longer represents the baseline.
# Instead, run the reloaded model with its adapter temporarily disabled, which
# yields the output of the underlying base model.
with lora_model.disable_adapter():
    output_ids = lora_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        max_length=50,
        num_return_sequences=1
    )

output_ids

In [None]:
# Decode the first generated sequence back to text, dropping special tokens
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(generated_text)