# 1.1 模型训练

In [11]:
import numpy as np
from datasets import Dataset


seq_len, dataset_size = 512, 512
dummy_data = {
    "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)),
    "labels": np.random.randint(0, 1, (dataset_size)),
}
ds = Dataset.from_dict(dummy_data)
ds.set_format("pt")


In [12]:
from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()


In [13]:
# 查看空闲 GPU 内存
print_gpu_utilization()

GPU memory occupied: 311 MB.


Kernels 是用于执行底层 GPU 上数学运算的函数，我们调用kernel函数对存储在GPU内存中的数据进行计算。每种神经网络层都需要不同的 kernels 来执行其特定的计算操作，如卷积核函数、激活函数等，所以CUDA编程的核心其实也就是如何合理的划分数据并且针对数据结构编写高效的kernel函数。
当一个模型加载到 GPU 时，与该模型相关的计算 kernels 也会被加载到 GPU 上，这样可以避免每次执行运算时都重复加载 kernels，提高运算效率。但需要注意的是，加载 kernels 会占用 GPU 存储空间，通常占用大约1-2GB的内存。因此，即使加载一个微小的张量到 GPU 上，也会触发 kernels 的加载，并且你可以通过观察 GPU 显存的使用来查看 kernels 占用的内存大小。


In [14]:
import torch

torch.ones((1, 1)).to("cuda")
print_gpu_utilization()

GPU memory occupied: 311 MB.


In [15]:
model_name_or_path = "/mnt/bn/lys-lq/HF_Caches/bert-large-uncased" # 运行后修改
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path).to("cuda")
print_gpu_utilization()


GPU memory occupied: 1599 MB.


In [16]:
!nvidia-smi

Thu Oct 10 14:20:49 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.129.06   Driver Version: 470.129.06   CUDA Version: 12.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:1A:00.0 Off |                    0 |
| N/A   45C    P0    73W / 300W |   1599MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:1B:00.0 Off |                    0 |
| N/A   45C    P0    75W / 300W |      3MiB / 32510MiB |      0%      Default |
|       

开始训练模型，看看显存变化

In [17]:
default_args = {
    "output_dir": "tmp",
    "evaluation_strategy": "steps",
    "num_train_epochs": 1,
    "log_level": "error",
    "report_to": "none",
}

In [18]:
from transformers import TrainingArguments, Trainer, logging

logging.set_verbosity_error()

training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
trainer = Trainer(model=model, args=training_args, train_dataset=ds)
result = trainer.train()
print_summary(result)

Detected kernel version 5.4.143, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


2024-10-10 14:20:54.952 n128-099-069:21338:21338 [0] NCCL INFO cudaDriverVersion 12020
2024-10-10 14:20:54.953 n128-099-069:21338:21338 [0] NCCL INFO NCCL_SOCKET_FAMILY set by environment to AF_INET6
2024-10-10 14:20:54.953 n128-099-069:21338:21338 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
2024-10-10 14:20:54.953 n128-099-069:21338:21338 [0] NCCL INFO Bootstrap : Using eth0:fdbd:dc03:1:334::69<0>
2024-10-10 14:20:54.954 n128-099-069:21338:21338 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
NCCL version 2.19.3-2+cuda12.2
2024-10-10 14:20:55.334 n128-099-069:21338:24233 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
2024-10-10 14:20:55.364 n128-099-069:21338:24233 [1] NCCL INFO NCCL_SOCKET_FAMILY set by environment to AF_INET6
2024-10-10 14:20:55.364 n128-099-069:21338:24233 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
202



{'train_runtime': 46.475, 'train_samples_per_second': 11.017, 'train_steps_per_second': 1.377, 'train_loss': 0.019991444423794746, 'epoch': 1.0}
Time: 46.48
Samples/second: 11.02
GPU memory occupied: 11493 MB.


我们看到,与仅仅将模型加载到GPU相比，模型训练需要使用更多的内存。这是因为训练过程中有许多组件都使用了GPU内存:

模型权重
fp32：每个参数4字节
混合精度训练：每个参数6字节(同时在内存中维护fp32版本和fp16版本)
优化器状态（optimizer states）
normal AdamW：每个参数8字节(维护一阶动量和二阶动量2个状态，都是fp32版本)
8-bit AdamW(如bitsandbytes)：每个参数2字节（也是两个状态，但都是int8版本）
SGD：每个参数4字节(仅维护1个状态)
梯度： 每个参数4字节（无论是否启用混合精度训练，梯度始终以fp32存储)
Forward Activations：用于梯度计算，其大小取决于许多因素，比如序列长度、隐含层大小和批量大小。
临时缓存：各种暂存变量,一旦计算完成就会释放，但当时可能需要额外内存，所以也可能导致OOM。编程时必须考虑这些临时变量，及时释放不再需要的变量。
特定功能的内存：除了以上的消耗，可能还有特殊的内存需求。例如，使用束搜索（beam search）生成文本时，需要维护输入和输出的多个副本。

# Accelerate

In [19]:
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    **default_args,
)




In [20]:
from accelerate import Accelerator
from torch.utils.data.dataloader import DataLoader

dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size)

if training_args.gradient_checkpointing:
    model.gradient_checkpointing_enable()     # 启用梯度检查点

# 初始化Accelerator时指定启用混合精度训练
accelerator = Accelerator(fp16=training_args.fp16)
# 启用adamw_bnb_8bit
model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)

model.train()
for step, batch in enumerate(dataloader, start=1):
    loss = model(**batch).loss
    loss = loss / training_args.gradient_accumulation_steps
    # 调用accelerator进行反向传播
    accelerator.backward(loss)
    if step % training_args.gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

TypeError: __init__() got an unexpected keyword argument 'fp16'