In [None]:
!pip install -q -U bitsandbytes  # bitsandbytes: backend for the 4-bit quantized model loading and the 8-bit paged optimizer used later in this notebook
!pip install -q -U git+https://github.com/huggingface/transformers.git  # transformers, installed from the GitHub repository rather than a PyPI release
!pip install -q -U git+https://github.com/huggingface/peft.git  # PEFT (Parameter-Efficient Fine-Tuning), installed from the GitHub repository
!pip install -q -U git+https://github.com/huggingface/accelerate.git  # accelerate, installed from the GitHub repository
!pip install -q datasets  # datasets: used to download and tokenize the training data


First, let's load the model we are going to use: GPT-NeoX-20B. Note that the model itself is around 40 GB in half precision.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face Hub id of the checkpoint to fine-tune.
model_id = "EleutherAI/gpt-neox-20b"

# Quantization settings handed to `from_pretrained`: load the weights in
# 4-bit NF4 (Normalized Float 4) format with double quantization enabled,
# and run computations in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# The {"": 0} device map places the whole model on device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

Then we have to apply some preprocessing to the model to prepare it for training. For that, use the `prepare_model_for_kbit_training` function from PEFT.

In [None]:
from peft import prepare_model_for_kbit_training

# Gradient checkpointing trades compute for memory: some intermediate
# activations are discarded during the forward pass and recomputed during
# the backward pass.
model.gradient_checkpointing_enable()
# PEFT helper that preprocesses the quantized model for training; the result
# replaces `model`. See the PEFT documentation for exactly what it changes.
model = prepare_model_for_kbit_training(model)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and a ``requires_grad`` flag (e.g. a
            ``torch.nn.Module``).

    Prints one line with the trainable count, the total count, and the
    trainable percentage. Nothing is returned.

    Notes:
        * bitsandbytes ``Params4bit`` tensors pack two 4-bit values into each
          storage byte, so ``numel()`` reports only a fraction of the logical
          parameter count. The count is scaled back up the same way PEFT's
          own parameter counter does it.
        * A model without any parameters reports ``trainable%: 0.0`` instead
          of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()  # number of stored elements in the tensor

        # Matched by class name so bitsandbytes does not have to be imported.
        if type(param).__name__ == "Params4bit":
            element_size = getattr(param, "element_size", None)
            bytes_per_element = element_size() if callable(element_size) else 1
            # Each storage byte holds two 4-bit parameters.
            num_params *= 2 * bytes_per_element

        all_param += num_params
        if param.requires_grad:  # only parameters that receive gradients are trained
            trainable_params += num_params

    # Guard the division for models that have no parameters at all.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model#lOW RANK ADAPTATION

# LoRA (Low-Rank Adaptation) hyperparameters.
config = LoraConfig(
    task_type="CAUSAL_LM",  # causal language modeling
    target_modules=["query_key_value"],  # modules that receive LoRA adapters
    r=8,  # rank of the low-rank update matrices
    lora_alpha=32,  # scaling factor for the learned update
    lora_dropout=0.05,  # dropout on the LoRA path, as regularization
    bias="none",  # bias handling mode for the adaptation
)

# Wrap the model with the adapters, then report how much of it will be trained.
model = get_peft_model(model, config)
print_trainable_parameters(model)

Let's load a common dataset, English quotes, to fine-tune our model on famous quotes.

In [None]:
from datasets import load_dataset

# Fetch the quotes dataset, then tokenize its "quote" field in batches.
quotes_raw = load_dataset("Abirate/english_quotes")
data = quotes_raw.map(lambda batch: tokenizer(batch["quote"]), batched=True)

Run the cell below to start training. For the sake of the demo, we run it for only a few steps, just to showcase how to use this integration with existing tools in the Hugging Face ecosystem.

In [None]:
import transformers

# Reuse the end-of-sequence token as the padding token so the collator below
# has a token to pad batches with.
tokenizer.pad_token = tokenizer.eos_token

# Hyperparameters for a short demonstration run.
training_args = transformers.TrainingArguments(
    output_dir="outputs",            # checkpoints and logs are written here
    per_device_train_batch_size=1,   # one sample per device per forward pass
    gradient_accumulation_steps=4,   # accumulate 4 batches per optimizer update
    max_steps=10,                    # stop after 10 training steps
    warmup_steps=2,                  # learning-rate warmup over the first 2 steps
    learning_rate=2e-4,
    fp16=True,                       # 16-bit floating point mixed-precision training
    optim="paged_adamw_8bit",        # paged AdamW optimizer in 8-bit precision
    logging_steps=1,                 # log metrics at every step
)

# mlm=False selects causal (next-token) rather than masked language modeling.
collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"],
    data_collator=collator,
)

# Turn the generation cache off while training; this avoids warnings.
model.config.use_cache = False

trainer.train()


In [None]:
# Distributed/parallel wrappers keep the real model under `.module`; unwrap it
# when that attribute exists, otherwise use the trainer's model directly.
model_to_save = getattr(trainer.model, "module", trainer.model)
model_to_save.save_pretrained("outputs")

In [None]:
# Read the adapter configuration back from the directory it was saved to.
lora_config = LoraConfig.from_pretrained("outputs")

# NOTE: `model` is intentionally NOT passed through `get_peft_model` again.
# It is already a PEFT-wrapped model holding the adapter that was just
# trained. Wrapping it a second time is flagged by PEFT as modifying a model
# with PEFT twice, and is likely to re-initialize the adapter weights, which
# would discard the fine-tuning (verify against your installed PEFT version).
# To restore the adapter in a fresh session, load the quantized base model
# again and attach the saved weights with
# `peft.PeftModel.from_pretrained(base_model, "outputs")`.

In [None]:
text = "Mother Teresa"
device = "cuda:0"

# Switch to inference behavior. The model is still in training mode after
# `trainer.train()`, so dropout (including LoRA dropout) would otherwise stay
# active during generation.
model.eval()
# The cache was turned off for training; turn it back on for generation.
model.config.use_cache = True

# Tokenize the prompt, generate up to 50 new tokens, and print the decoded text.
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))