In [1]:
import numpy as np
from datasets import Dataset
from pynvml import *
from transformers import TrainingArguments, Trainer, logging, AutoModelForSequenceClassification, AutoTokenizer
import torch

In [2]:
seq_len, dataset_size = 512, 512
dummy_data = {
    "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
    "labels": np.random.randint(0, 1, (dataset_size)),
}
ds = Dataset.from_dict(dummy_data)
ds.set_format("pt")

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [3]:
print_gpu_utilization()

GPU memory occupied: 223 MB.


That looks good: the GPU memory is not occupied as we would expect before we load any models. If that’s not the case on your machine make sure to stop all processes that are using GPU memory. However, not all free GPU memory can be used by the user. When a model is loaded to the GPU the kernels are also loaded,which can take up 1-2GB of memory. To see how much it is we load a tiny tensor into the GPU which triggers the kernels to be loaded as well.

In [4]:
import torch
torch.ones((1, 1)).to("cuda")
print_gpu_utilization()

GPU memory occupied: 322 MB.


## Load Model
First, we load the bert-large-uncased model. We load the model weights directly to the GPU so that we can check how much space just the weights use.

In [5]:
# MODEL_ID = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
MODEL_ID = "Felladrin/TinyMistral-248M-SFT-v4"
# model = AutoModelForSequenceClassification.from_pretrained(
#     MODEL_ID, 
#     torch_dtype=torch.float16,
#     use_flash_attention_2=True).to("cuda")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID).to("cuda")
print_gpu_utilization()

config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/992M [00:00<?, ?B/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at Felladrin/TinyMistral-248M-SFT-v4 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPU memory occupied: 1150 MB.


We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. So now we can start training the model and see how the GPU memory consumption changes. First, we set up a few standard training arguments:

In [6]:
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
model.config.pad_token_id = model.config.eos_token_id

tokenizer_config.json:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/562 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Memory utilization at vanilla training
Let’s use the Trainer and train the model without using any GPU performance optimization techniques and a batch size of 4:

In [8]:
# from transformers import TrainingArguments, Trainer, logging
# logging.set_verbosity_error()
# training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
# trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
# result = trainer.train()
# print_summary(result)

In [8]:
logging.set_verbosity_error()
max_seq_length = 2048

In [9]:
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
result = trainer.train()
print_summary(result)

{'train_runtime': 88.7921, 'train_samples_per_second': 5.766, 'train_steps_per_second': 1.442, 'train_loss': 0.006922087166458368, 'epoch': 1.0}
Time: 88.79
Samples/second: 5.77
GPU memory occupied: 4896 MB.


Gradient checkpointing offers a compromise between these two approaches and saves strategically selected activations throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. For an in-depth explanation of gradient checkpointing, refer to this great article.

In [10]:
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
result = trainer.train()
print_summary(result)



{'train_runtime': 121.0431, 'train_samples_per_second': 4.23, 'train_steps_per_second': 1.057, 'train_loss': 8.381902283360887e-09, 'epoch': 1.0}
Time: 121.04
Samples/second: 4.23
GPU memory occupied: 4522 MB.


### fp16
The main advantage of mixed precision training comes from saving the activations in half precision (fp16). Although the gradients are also computed in half precision they are converted back to full precision for the optimization step so no memory is saved here. While mixed precision training results in faster computations, it can also lead to more GPU memory being utilized, especially for small batch sizes. This is because the model is now present on the GPU in both 16-bit and 32-bit precision (1.5x the original model on the GPU).

In [9]:
training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
result = trainer.train()
print_summary(result)

{'train_runtime': 39.0002, 'train_samples_per_second': 13.128, 'train_steps_per_second': 3.282, 'train_loss': 0.011264808475971222, 'epoch': 1.0}
Time: 39.00
Samples/second: 13.13
GPU memory occupied: 7916 MB.


In [11]:
training_args = TrainingArguments(per_device_train_batch_size=2, gradient_accumulation_steps=2, fp16=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
result = trainer.train()
print_summary(result)

{'train_runtime': 41.1301, 'train_samples_per_second': 12.448, 'train_steps_per_second': 3.112, 'train_loss': 0.0, 'epoch': 1.0}
Time: 41.13
Samples/second: 12.45
GPU memory occupied: 8186 MB.


In [11]:
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, fp16=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
result = trainer.train()
print_summary(result)

{'train_runtime': 63.259, 'train_samples_per_second': 8.094, 'train_steps_per_second': 2.023, 'train_loss': 4.656612873077393e-10, 'epoch': 1.0}
Time: 63.26
Samples/second: 8.09
GPU memory occupied: 4510 MB.


In [13]:
training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, fp16=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
result = trainer.train()
print_summary(result)



{'train_runtime': 60.5463, 'train_samples_per_second': 8.456, 'train_steps_per_second': 2.114, 'train_loss': 0.0, 'epoch': 1.0}
Time: 60.55
Samples/second: 8.46
GPU memory occupied: 6254 MB.


In [14]:
training_args = TrainingArguments(per_device_train_batch_size=4, gradient_checkpointing=True, fp16=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
result = trainer.train()
print_summary(result)

{'train_runtime': 50.2204, 'train_samples_per_second': 10.195, 'train_steps_per_second': 2.549, 'train_loss': 0.0, 'epoch': 1.0}
Time: 50.22
Samples/second: 10.20
GPU memory occupied: 6624 MB.


### FlashAttention-2
FlashAttention-2 is a faster and more efficient implementation of the standard attention mechanism that can significantly speedup inference by:
1- additionally parallelizing the attention computation over sequence length
2- partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them

FlashAttention-2 currently supports:
- Ampere, Ada, or Hopper GPUs (e.g., A100, RTX 3090, RTX 4090, H100). Support for Turing GPUs (T4, RTX 2080) is coming soon, please use FlashAttention 1.x for Turing GPUs for now.
- Datatype fp16 and bf16 (bf16 requires Ampere, Ada, or Hopper GPUs).
- All head dimensions up to 256. Head dim > 192 backward requires A100/A800 or H100/H800.


### Optimizer choice
Trainer integrates a variety of optimizers that can be used out of box: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision, adafactor, or adamw_bnb_8bit. More optimizers can be plugged in via a third-party implementation.

1. **Adafactor**: Adafactor doesn’t store rolling averages for each element in weight matrices. Instead, it keeps aggregated information (sums of rolling averages row- and column-wise), significantly reducing its footprint. However, compared to Adam, Adafactor may have slower convergence in certain cases.

In [12]:
training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", gradient_checkpointing=True, fp16=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
result = trainer.train()
print_summary(result)

{'train_runtime': 51.8633, 'train_samples_per_second': 9.872, 'train_steps_per_second': 2.468, 'train_loss': 0.0, 'epoch': 1.0}
Time: 51.86
Samples/second: 9.87
GPU memory occupied: 2684 MB.


2. **8-bit Adam**: Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. Quantization means that it stores the state with lower precision and dequantizes it only for the optimization. This is similar to the idea behind mixed precision training.

In [19]:
training_args = TrainingArguments(per_device_train_batch_size=4, optim="adamw_bnb_8bit", gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant':False}, fp16=True, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds, tokenizer=tokenizer)
result = trainer.train()
print_summary(result)

{'train_runtime': 46.6657, 'train_samples_per_second': 10.972, 'train_steps_per_second': 2.743, 'train_loss': 0.0, 'epoch': 1.0}
Time: 46.67
Samples/second: 10.97
GPU memory occupied: 4818 MB.


We can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated.
We need to initialize the optimizer. This involves two steps:
- First, group the model’s parameters into two groups - one where weight decay should be applied, and the other one where it should not. Usually, biases and layer norm parameters are not weight decayed.
- Then do some argument housekeeping to use the same parameters as the previously used AdamW optimizer.

In [13]:
import bitsandbytes as bnb
from torch import nn
from transformers.trainer_pt_utils import get_parameter_names

training_args = TrainingArguments(per_device_train_batch_size=4, gradient_checkpointing=True, gradient_checkpointing_kwargs={'use_reentrant':False}, fp16=True, **default_args)

decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n in decay_parameters],
        "weight_decay": training_args.weight_decay,
    },
    {
        "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
        "weight_decay": 0.0,
    },
]

optimizer_kwargs = {
    "betas": (training_args.adam_beta1, training_args.adam_beta2),
    "eps": training_args.adam_epsilon,
}
optimizer_kwargs["lr"] = training_args.learning_rate
adam_bnb_optim = bnb.optim.Adam8bit(
    optimizer_grouped_parameters,
    betas=(training_args.adam_beta1, training_args.adam_beta2),
    eps=training_args.adam_epsilon,
    lr=training_args.learning_rate,
)

In [14]:
trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None), tokenizer=tokenizer)
result = trainer.train()
print_summary(result)

{'train_runtime': 47.0538, 'train_samples_per_second': 10.881, 'train_steps_per_second': 2.72, 'train_loss': 0.0, 'epoch': 1.0}
Time: 47.05
Samples/second: 10.88
GPU memory occupied: 3126 MB.


### Data preloading
One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it can handle. By default, everything happens in the main process, and it might not be able to read the data from disk fast enough, and thus create a bottleneck, leading to GPU under-utilization. Configure the following arguments to reduce the bottleneck:

- DataLoader(pin_memory=True, ...) - ensures the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory.
- DataLoader(num_workers=4, ...) - spawn several workers to preload data faster. During training, watch the GPU utilization stats; if it’s far from 100%, experiment with increasing the number of workers. Of course, the problem could be elsewhere, so many workers won’t necessarily lead to better performance.

In [15]:
from datasets import load_dataset
instruct_tune_dataset = load_dataset("mosaicml/instruct-v3")
print(instruct_tune_dataset)
instruct_tune_dataset = instruct_tune_dataset.filter(lambda x: x["source"] == "dolly_hhrlhf")
print(instruct_tune_dataset)

def create_prompt(sample):
  bos_token = "<s>"
  original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
  system_message = "Use the provided input to create an instruction that could have been used to generate the response with an LLM."
  response = sample["prompt"].replace(original_system_message, "").replace("\n\n### Instruction\n", "").replace("\n### Response\n", "").strip()
  input = sample["response"]
  eos_token = "</s>"

  full_prompt = ""
  full_prompt += bos_token
  full_prompt += "### Instruction:"
  full_prompt += "\n" + system_message
  full_prompt += "\n\n### Input:"
  full_prompt += "\n" + input
  full_prompt += "\n\n### Response:"
  full_prompt += "\n" + response
  full_prompt += eos_token

  return full_prompt
print(create_prompt(instruct_tune_dataset["train"][0]))

DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 56167
    })
    test: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 6807
    })
})
DatasetDict({
    train: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 34333
    })
    test: Dataset({
        features: ['prompt', 'response', 'source'],
        num_rows: 4771
    })
})
<s>### Instruction:
Use the provided input to create an instruction that could have been used to generate the response with an LLM.

### Input:
There are more than 12,000 species of grass. The most common is Kentucky Bluegrass, because it grows quickly, easily, and is soft to the touch. Rygrass is shiny and bright green colored. Fescues are dark green and shiny. Bermuda grass is harder but can grow in drier soil.

### Response:
What are different types of grass?</s>


In [16]:
training_args = TrainingArguments(
    per_device_train_batch_size=4, 
    gradient_checkpointing=True, 
    gradient_checkpointing_kwargs={'use_reentrant':False}, 
    fp16=True, 
    dataloader_pin_memory=True,
    dataloader_num_workers=4,
    **default_args)


In [23]:
instruct_tune_dataset = instruct_tune_dataset.filter(lambda x: x["source"] == "dolly_hhrlhf")
train_dataset=instruct_tune_dataset["train"]
eval_dataset=instruct_tune_dataset["test"]
# print the length of the train dataset and the test dataset
print(len(train_dataset))
print(len(eval_dataset))
trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=instruct_tune_dataset["train"], 
    eval_dataset=instruct_tune_dataset["test"],
    formatting_func=create_prompt,
    max_seq_length=max_seq_length,
    optimizers=(adam_bnb_optim, None), 
    tokenizer=tokenizer)
result = trainer.train()
print_summary(result)

Filter:   0%|          | 0/34333 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4771 [00:00<?, ? examples/s]

34333
4771


TypeError: Trainer.__init__() got an unexpected keyword argument 'formatting_func'