In [None]:
# IF USING GOOGLE COLABORATORY -> RUN FIRST!!!
# OTHERWISE -> SAFE TO RUN (the cell notices that Colab is missing and does nothing) ;-)

try:
    from google.colab import drive
except ImportError:
    # `google.colab` is not importable here, so there is no Drive to mount.
    IN_COLAB = False
else:
    IN_COLAB = True
    # Mount Google Drive at /content/gdrive.
    drive.mount('/content/gdrive')

In [None]:
# Packages

!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U git+https://github.com/adapter-hub/adapters.git
!pip install -q -U git+https://github.com/huggingface/datasets.git

# <font color="#003660">Applied Machine Learning for Text Analysis (M.184.5331)</font>
# <font color="#003660">Lesson 10: Hands-On Training: Fine-Tuning LLMs using Quantization and Low-Rank Adaptation</font>

<center><br><img width=256 src="https://git.uni-paderborn.de/data.analytics.teaching/aml4ta-2020/-/raw/master/resources/dag.png"/><br></center>

<p>
<center>
<div>
    <font color="#085986"><b>By the end of this lesson, you will be able to...</b><br><br>
        comprehend and apply quantization methods for model optimization;<br>
        understand and implement LoRA methods for model fine-tuning; and<br>
        integrate quantization and LoRA techniques to fine-tune models using QLoRA.<br>
    </font>
</div>
</center>
</p>

<h2>Task</h2>

<p>In this week's lecture, we will explore the fine-tuning of Large Language Models (LLMs) for text generation, focusing on the application of 4-bit quantization and low-rank adaptation (LoRA). This session marks a shift from our previous focus on mixed-precision training, introducing advanced techniques to optimize LLMs within the constraints of limited hardware resources.</p>

<p>4-bit quantization will be utilized to effectively reduce the memory and computational requirements of LLMs. This technique is particularly relevant for training and running these models in environments with restricted hardware, allowing for more efficient use of resources.</p>

<p>We will also delve into low-rank adaptation (LoRA), a method for modifying pre-trained models that enables efficient fine-tuning without the need to retrain the entire model. This approach is beneficial for adapting LLMs to specific tasks, providing a means to enhance model performance while considering computational efficiency.</p>

<p>Yet, it is essential to apply the concepts learned in the previous session, especially in terms of managing document lengths and optimizing batch sizes. These practices remain vital in the efficient processing of data and in maintaining the smooth operation of the model, particularly in the context of the larger scale of LLMs.</p>

<h2>Useful Links</h2>

<ul>
  <li><a href="https://huggingface.co/blog/hf-bitsandbytes-integration">A Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes</a></li>
  <li><a href="https://huggingface.co/blog/4bit-transformers-bitsandbytes">Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA</a></li>
  <li><a href="https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one">Methods and tools for efficient training on a single GPU</a></li>
  <li><a href="https://huggingface.co/docs/transformers/main/en/quantization?bnb=8-bit#bitsandbytes">Quantization | BitsAndBytes</a></li>
  <li><a href="https://huggingface.co/blog/lora">Using LoRA for Efficient Stable Diffusion Fine-Tuning</a></li>
  <li><a href="https://arxiv.org/pdf/2106.09685.pdf">LoRA: Low-Rank Adaptation of Large Language Models</a></li>
  <li><a href="https://arxiv.org/pdf/2305.14314.pdf">QLORA: Efficient Finetuning of Quantized LLMs</a></li>
</ul> 

<p>With the continuous growth in the size of language models, exemplified by PaLM's 540 billion parameters and BLOOM's 176 billion, running them on standard devices presents increasing challenges. For instance, BLOOM-176B requires 8x 80GB A100 GPUs for inference and 72 GPUs for fine-tuning. To meet these challenges, solutions such as quantization are emerging. For instance, the implementation of 8-bit inference, recently incorporated into Hugging Face transformers, effectively reduces the memory requirements of these large models by half while maintaining their predictive accuracy, greatly facilitating their operation on a reduced number of GPUs.</p>

<p>As discussed in a previous session, quantization is the process of reducing the precision of the weights in a neural network, typically from floating point to lower bit-width integers, such as 8-bit or 4-bit. This reduction in precision leads to smaller model sizes and faster computation, making it especially useful for deploying models on hardware with limited resources. Using tools like bitsandbytes, models can be effectively quantized to 8-bit or 4-bit. Let's take a look!</p>

# 1. Quantization

In [None]:
# Import

import torch
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM

def bytes_to_mb(bytes):
    """Convert a size in bytes to megabytes, using 1 MB = 1024 ** 2 bytes.

    Args:
        bytes: Size in bytes.

    Returns:
        The size divided by 1024 ** 2 (true division, so the result is a float
        for integer input).
    """
    bytes_per_mb = 1 << 20  # 1024 ** 2
    return bytes / bytes_per_mb

In [None]:
# GPT-2 (Vanilla)
# https://huggingface.co/gpt2-large

# Load the checkpoint without any quantization config and report its memory footprint.
model = AutoModelForCausalLM.from_pretrained(
    'gpt2-large',
    device_map='auto',
)
print(f'Footprint: {bytes_to_mb(model.get_memory_footprint())} MB')

# Clear memory
del model
torch.cuda.empty_cache()

In [None]:
# GPT-2 (8-bit)
# https://huggingface.co/gpt2-large

# Only the 8-bit switch is needed here: the `bnb_4bit_*` options apply to 4-bit
# loading, and `device_map` is an argument of `from_pretrained`, not of
# `BitsAndBytesConfig` (where it would be ignored as an unused keyword).
config = BitsAndBytesConfig(
    load_in_8bit=True, # 8-bit
)

model = AutoModelForCausalLM.from_pretrained(
    'gpt2-large',
    quantization_config=config,
    device_map='auto', # Device
)
print(f'Footprint: {bytes_to_mb(model.get_memory_footprint())} MB')

# Clear memory
del config, model
torch.cuda.empty_cache()

In [None]:
# GPT-2 (4-bit)
# https://huggingface.co/gpt2-large

# Quantization settings only; device placement is passed to `from_pretrained`
# below, because `BitsAndBytesConfig` has no `device_map` option.
config = BitsAndBytesConfig(
    load_in_4bit=True, # 4-bit
    bnb_4bit_quant_type='nf4', # 4-bit data type from the QLoRA paper
    bnb_4bit_use_double_quant=True, # Double quantization
    bnb_4bit_compute_dtype=torch.bfloat16, # Speedup computation -> Used for its balance between maintaining a wide numeric range and reducing memory usage.
)

model = AutoModelForCausalLM.from_pretrained(
    'gpt2-large',
    quantization_config=config,
    device_map='auto', # Device
)
print(f'Footprint: {bytes_to_mb(model.get_memory_footprint())} MB')

<p>It's important to note that training with 8-bit and 4-bit weights is only supported for training extra parameters, not the entire model. This limitation arises because the quantized weights are stored at too low a precision to absorb the small updates made by gradient descent, so they are kept frozen. Therefore, we focus on low-rank adaptation, which allows for the efficient fine-tuning of the model by training only a small number of additional parameters. This approach maintains model performance while benefiting from the efficiencies of quantization.</p>

# 2. Low-Rank Adaptation

<p><center><i>"For a large Transformer trained with Adam, we reduce that VRAM usage by up to 2/3 [...] as we do not need to store the optimizer states for the frozen parameters." (Hu et al., 2021, p.5)</i></center></p>
<p><center><i>"On GPT-3 175B, we reduce the VRAM consumption during training from 1.2TB to 350GB." (Hu et al., 2021, p.5)</i></center></p>

<p>Low-Rank Adaptation (LoRA) is a technique designed to efficiently fine-tune large pre-trained models, such as those in natural language processing. The core idea behind LoRA is to introduce trainable low-rank matrices into the architecture of the pre-trained model. These matrices are much smaller in size compared to the original model's weights and thus require significantly less computational resources to update.</p>

<p>The LoRA approach involves modifying specific layers of a pre-trained model, such as the attention and feedforward layers in a transformer model. Instead of training all the parameters of these layers, LoRA introduces two low-rank matrices for each layer. During fine-tuning, only these low-rank matrices are updated, while the original weights of the model remain frozen. This allows for efficient adaptation of the model to new tasks or datasets.</p>

<p>The key advantage of LoRA is that it enables the fine-tuning of massive models with a relatively small increase in the number of trainable parameters. This approach significantly reduces the computational resources required for training, making it feasible to adapt large models on hardware with limited capabilities.</p>

<p>HuggingFace and BigScience's research on LoRA further demonstrates its effectiveness in reducing the computational burden of fine-tuning large language models. By incorporating LoRA into the PEFT library, which integrates with their transformers library, they have enabled the wider machine learning community to access and fine-tune state-of-the-art models more efficiently.</p>

<ul>
    <li><a href="https://docs.adapterhub.ml">AdapterHub</a></li>
    <li><a href="https://huggingface.co/docs/hub/adapter-transformers">Using Adapter Transformers at Hugging Face</a></li>
</ul>

In [None]:
# Example

"""

from peft import LoraConfig

# Modules
# https://github.com/huggingface/peft/blob/39ef2546d5d9b8f5f8a7016ec10657887a867041/src/peft/utils/other.py
TARGET_MODULES = {
    "t5": ["q", "v"],
    "mt5": ["q", "v"],
    "bart": ["q_proj", "v_proj"],
    "gpt2": ["c_attn"],
    "bloom": ["query_key_value"],
    "blip-2": ["q", "v", "q_proj", "v_proj"],
    "opt": ["q_proj", "v_proj"],
    "gptj": ["q_proj", "v_proj"],
    "gpt_neox": ["query_key_value"],
    "gpt_neo": ["q_proj", "v_proj"],
    "bert": ["query", "value"],
    "roberta": ["query", "value"],
    "xlm-roberta": ["query", "value"],
    "electra": ["query", "value"],
    "deberta-v2": ["query_proj", "value_proj"],
    "deberta": ["in_proj"],
    "layoutlm": ["query", "value"],
    "llama": ["q_proj", "v_proj"],
    "chatglm": ["query_key_value"],
    "gpt_bigcode": ["c_attn"],
    "mpt": ["Wqkv"],
}

# Configuration
config = LoraConfig(
    r=8, # Rank of the low-rank update matrices. Smaller values train fewer parameters, reducing computational cost but possibly limiting learning efficacy.
    lora_alpha=16, # Scaling factor for the LoRA update (applied as lora_alpha / r); larger values increase the importance of the fine-tuning relative to the existing, unchanged weights.
    lora_dropout=0.01, # Dropout ;-)
    target_modules=TARGET_MODULES['gpt2'], # This specifies the modules targeted for training.
    bias='none', # Options between 'none', 'lora_only', 'all'
)

# ...

"""

In [None]:
# Pre-trained adapters

from adapters import list_adapters

# `source` can be "ah" (AdapterHub), "hf" (huggingface.co) or None (for both, default)
adapter_infos = list_adapters(
    source='ah',
    model_name='gpt2',
)
adapter_infos

# 3. Quantization + Low-Rank Adaptation = QLoRA

<p>QLoRA, the amalgamation of Quantization and Low-Rank Adaptation (LoRA), is an efficient method employed in this course for fine-tuning Large Language Models (LLMs) using the Parameter-Efficient Fine-Tuning (PEFT) library. By leveraging the strengths of both quantization and LoRA, QLoRA significantly reduces the computational overhead and memory requirements typically associated with fine-tuning large models. Quantization minimizes the model's memory footprint, while LoRA allows for targeted, efficient adaptation of the model's weights. This combination results in a potent and resource-efficient approach, making the fine-tuning of LLMs more accessible and practical, even on hardware with limited computational capabilities.</p>

In [None]:
# Example (QLoRA outline)
# The snippet below is wrapped in a triple-quoted string, so none of it is executed.
# It outlines the steps in order: quantization config -> tokenizer -> quantized model
# -> gradient checkpointing -> k-bit preparation -> LoRA config -> PEFT wrapping.

"""

import peft
import torch
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = 'my_favourite_LLM'

# BitsAndBytes
# Configuration
config = BitsAndBytesConfig(
    # Some parameters...
)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Model
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, quantization_config=config, use_cache=False)

# Enable gradient checkpointing
# Reduces GPU memory consumption by storing only a subset of all activations (i.e., the rest are computed on-the-fly during backpropagation)
model.gradient_checkpointing_enable()

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# LoRA
# Configuration
lora_config = LoraConfig(
    # Some parameters...
)

# Wrap model
model = get_peft_model(model, lora_config)

# ...

"""

# Implementation

In [None]:
# Foundation Models

# https://huggingface.co/gpt2
# https://huggingface.co/gpt2-medium
# https://huggingface.co/gpt2-large
# https://huggingface.co/gpt2-xl

In [None]:
# Tutorial: Causal Language Modeling
# + what you have learned last week!
# P.S. The provided code will not work as is, you will have to adapt it to your needs ;-)

# https://huggingface.co/docs/transformers/tasks/language_modeling

In [None]:
# Import

import peft
import torch
from peft import LoraConfig, get_peft_model
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import BitsAndBytesConfig, Trainer, TrainingArguments

In [None]:
# Have fun ;-)
# ...