# 1. 非混合精度训练

In [1]:
# Imports and process-level setup for the fp32 (non-mixed-precision) benchmarks.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.optim import AdamW
from tqdm import tqdm
import random
import triton
import triton.language as tl
# TritonAdamW is the optimizer compared against torch AdamW in the cells below.
from TritonAdam import TritonAdamW
import os
# Turn on Triton's autotuning printout.
# NOTE(review): this is set after `import triton`; presumably the variable is
# read lazily when a kernel is autotuned -- confirm for the installed version.
os.environ['TRITON_PRINT_AUTOTUNING'] = '1'
# Release cached, currently unused CUDA memory before the benchmarks start.
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


## 加载模型
- 加载fp32的模型进行测试

In [2]:
# Load tokenizer and model from a local checkpoint directory. No dtype is
# requested here; the section heading states that this is the fp32 model.
model_path = '/data/models/Qwen2.5-0.5B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.cuda()

# Number of training steps run by each benchmark cell.
iters = 100

# Keep a handle on the first parameter so later cells can print it and
# compare the result of different optimizers.
p = next(iter(model.parameters()))


## torch Adam

### 非Fused版本

In [None]:
# Baseline: torch AdamW with the fused implementation explicitly disabled.
optimizer = AdamW(model.parameters(), fused=False)

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# Run `iters` optimisation steps on the same batch.
# NOTE(review): gradients are never zeroed, so they accumulate from step to
# step. Every optimizer cell does the same, which keeps the printed `p`
# comparable between cells.
for _ in tqdm(range(iters)):
    out = model(inp_ids)
    loss = out.logits.mean()
    loss.backward()
    optimizer.step()

# Restart the kernel and rerun for each optimizer; `p` should come out roughly the same.
print(p)

# Time optimizer.step() on its own, reusing the gradients left by the last
# backward pass. Each timed call still updates the model parameters.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)

100%|██████████| 100/100 [00:20<00:00,  4.92it/s]


Parameter containing:
tensor([[-0.1075,  0.1404, -0.0801,  ..., -0.0880, -0.0858,  0.0921],
        [-0.1106,  0.0991,  0.0811,  ..., -0.1011,  0.1000, -0.1082],
        [ 0.0660, -0.1091, -0.0925,  ..., -0.1007,  0.0771, -0.0990],
        ...,
        [ 0.1397, -0.1396,  0.1342,  ..., -0.1390, -0.1400,  0.1493],
        [ 0.1397, -0.1396,  0.1342,  ..., -0.1390, -0.1400,  0.1493],
        [ 0.1397, -0.1396,  0.1342,  ..., -0.1390, -0.1400,  0.1493]],
       device='cuda:0', requires_grad=True)
105.65606689453125


### Fused版本

In [5]:
# torch AdamW with the fused implementation enabled.
optimizer = AdamW(model.parameters(), fused=True)

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# Run `iters` optimisation steps on the same batch.
# NOTE(review): gradients are never zeroed, so they accumulate from step to
# step. Every optimizer cell does the same, which keeps the printed `p`
# comparable between cells.
for _ in tqdm(range(iters)):
    out = model(inp_ids)
    loss = out.logits.mean()
    loss.backward()
    optimizer.step()

# Restart the kernel and rerun for each optimizer; `p` should come out roughly the same.
print(p)

# Time optimizer.step() on its own, reusing the gradients left by the last
# backward pass. Each timed call still updates the model parameters.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)

100%|██████████| 100/100 [00:11<00:00,  8.60it/s]


Parameter containing:
tensor([[-0.1074,  0.1406, -0.0792,  ..., -0.0878, -0.0854,  0.0921],
        [-0.1102,  0.0993,  0.0809,  ..., -0.1018,  0.1003, -0.1085],
        [ 0.0656, -0.1093, -0.0923,  ..., -0.1019,  0.0770, -0.0989],
        ...,
        [ 0.1397, -0.1396,  0.1343,  ..., -0.1390, -0.1400,  0.1493],
        [ 0.1397, -0.1396,  0.1343,  ..., -0.1390, -0.1400,  0.1493],
        [ 0.1397, -0.1396,  0.1343,  ..., -0.1390, -0.1400,  0.1493]],
       device='cuda:0', requires_grad=True)
44.47935485839844


## Triton Adam

### 全部fp32

In [None]:
# TritonAdamW with its default settings (no dtype overrides passed).
optimizer = TritonAdamW(model.parameters())
torch.cuda.empty_cache()

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# Run `iters` optimisation steps on the same batch.
# NOTE(review): gradients are never zeroed, so they accumulate from step to
# step. Every optimizer cell does the same, which keeps the printed `p`
# comparable between cells.
for _ in tqdm(range(iters)):
    out = model(inp_ids)
    loss = out.logits.mean()
    loss.backward()
    optimizer.step()

# Restart the kernel and rerun for each optimizer; `p` should come out roughly the same.
print(p)

# Time optimizer.step() on its own, reusing the gradients left by the last
# backward pass. Each timed call still updates the model parameters.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)

  0%|          | 0/100 [00:00<?, ?it/s]

finish_custom_init, p_dtype: torch.float32, master_p_dtype: None


100%|██████████| 100/100 [00:12<00:00,  7.76it/s]


Parameter containing:
tensor([[-0.1075,  0.1403, -0.0811,  ..., -0.0880, -0.0860,  0.0922],
        [-0.1108,  0.0990,  0.0814,  ..., -0.1010,  0.0998, -0.1080],
        [ 0.0664, -0.1090, -0.0928,  ..., -0.0997,  0.0772, -0.0992],
        ...,
        [ 0.1397, -0.1396,  0.1342,  ..., -0.1391, -0.1400,  0.1493],
        [ 0.1397, -0.1396,  0.1342,  ..., -0.1391, -0.1400,  0.1493],
        [ 0.1397, -0.1396,  0.1342,  ..., -0.1391, -0.1400,  0.1493]],
       device='cuda:0', requires_grad=True)
39.631134033203125


### 1阶2阶动量为bf16

In [3]:
# TritonAdamW with both moment buffers (exp_avg, exp_avg_sq) requested in bf16.
optimizer = TritonAdamW(
    model.parameters(),
    exp_avg_dtype=torch.bfloat16,
    exp_avg_sq_dtype=torch.bfloat16,
)
torch.cuda.empty_cache()

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# Run `iters` optimisation steps on the same batch.
# NOTE(review): gradients are never zeroed, so they accumulate from step to
# step. Every optimizer cell does the same, which keeps the printed `p`
# comparable between cells.
for _ in tqdm(range(iters)):
    out = model(inp_ids)
    loss = out.logits.mean()
    loss.backward()
    optimizer.step()

# Restart the kernel and rerun for each optimizer; `p` should come out roughly the same.
print(p)

# Time optimizer.step() on its own, reusing the gradients left by the last
# backward pass. Each timed call still updates the model parameters.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)


  0%|          | 0/100 [00:00<?, ?it/s]

finish_custom_init, p_dtype: torch.float32, master_p_dtype: None


100%|██████████| 100/100 [00:10<00:00,  9.61it/s]


Parameter containing:
tensor([[-0.1063,  0.1391, -0.0812,  ..., -0.0865, -0.0847,  0.0898],
        [-0.1098,  0.0987,  0.0802,  ..., -0.1018,  0.1005, -0.1086],
        [ 0.0653, -0.1065, -0.0904,  ..., -0.0995,  0.0778, -0.0992],
        ...,
        [ 0.1403, -0.1397,  0.1341,  ..., -0.1388, -0.1399,  0.1495],
        [ 0.1403, -0.1397,  0.1341,  ..., -0.1388, -0.1399,  0.1495],
        [ 0.1403, -0.1397,  0.1341,  ..., -0.1388, -0.1399,  0.1495]],
       device='cuda:0', requires_grad=True)
27.99335289001465


# 2. 混合精度训练

In [2]:
# Imports and process-level setup for the mixed-precision benchmarks.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import random
import triton
import triton.language as tl
# The three optimizers compared in this section: TritonAdamW, Apex FusedAdam
# and Transformer Engine FusedAdam (the latter two aliased to avoid a name clash).
from TritonAdam import TritonAdamW
from apex.optimizers import FusedAdam as ApexFusedAdam
from transformer_engine.pytorch.optimizers import FusedAdam as TEFusedAdam
import os
# Turn on Triton's autotuning printout.
# NOTE(review): this is set after `import triton`; presumably the variable is
# read lazily when a kernel is autotuned -- confirm for the installed version.
os.environ['TRITON_PRINT_AUTOTUNING'] = '1'
# Release cached, currently unused CUDA memory before the benchmarks start.
torch.cuda.empty_cache()

  from .autonotebook import tqdm as notebook_tqdm


## 加载模型
- 加载bf16的模型

In [3]:
# Load tokenizer and model from a local checkpoint directory; the model
# weights are requested in bf16 for the mixed-precision benchmarks.
model_path = '/data/models/Qwen2.5-0.5B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
model = model.cuda()

# Number of training steps available to the benchmark cells.
iters = 100

# Keep a handle on the first parameter of the model.
p = next(iter(model.parameters()))


## Apex
- apex只有标准的混合精度训练，fp32的master weight和1，2阶动量，bf16/fp16的model weight和grad

In [4]:
# Apex FusedAdam with master weights enabled (capturable variant).
optimizer = ApexFusedAdam(model.parameters(), capturable=True, master_weights=True)
torch.cuda.empty_cache()

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# One warm-up step: a forward/backward pass to populate the gradients and one
# optimizer step. This used to be a `for _ in tqdm(range(iters))` loop ending
# in an unconditional `break`, which ran exactly once and left a misleading
# "0/100" progress bar.
out = model(inp_ids)
out.logits.mean().backward()
optimizer.step()

# Time optimizer.step() on its own, reusing the gradients from the warm-up.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)


  0%|          | 0/100 [00:00<?, ?it/s]


42.87024688720703


## Transformer Engine
- 最新的 TE 支持多种精度（可以点进源码看下），比如 1、2 阶动量支持 fp16、int8 之类的，但都需要进行 scale，而且不支持 bf16

### 标准版

In [6]:
# Transformer Engine FusedAdam with master weights enabled.
optimizer = TEFusedAdam(model.parameters(), master_weights=True)
torch.cuda.empty_cache()

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# One warm-up step: a forward/backward pass to populate the gradients and one
# optimizer step. This used to be a `for _ in tqdm(range(iters))` loop ending
# in an unconditional `break`, which ran exactly once and left a misleading
# "0/100" progress bar.
out = model(inp_ids)
out.logits.mean().backward()
optimizer.step()

# Time optimizer.step() on its own, reusing the gradients from the warm-up.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)


  0%|          | 0/100 [00:00<?, ?it/s]


41.55250549316406


### fp16的1，2阶动量

In [4]:
# Transformer Engine FusedAdam with master weights and both moment buffers
# (exp_avg, exp_avg_sq) requested in fp16.
optimizer = TEFusedAdam(model.parameters(),
                        exp_avg_dtype=torch.float16,
                        exp_avg_sq_dtype=torch.float16,
                        master_weights=True)
torch.cuda.empty_cache()

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# One warm-up step: a forward/backward pass to populate the gradients and one
# optimizer step. This used to be a `for _ in tqdm(range(iters))` loop ending
# in an unconditional `break`, which ran exactly once and left a misleading
# "0/100" progress bar.
out = model(inp_ids)
out.logits.mean().backward()
optimizer.step()

# Time optimizer.step() on its own, reusing the gradients from the warm-up.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)


  0%|          | 0/100 [00:00<?, ?it/s]


151.7933349609375


### fp16的1，2阶动量 + fp32的grad

In [6]:
# Transformer Engine FusedAdam with master weights, fp16 moment buffers, and
# gradients supplied through `decoupled_grad` instead of `.grad`.
optimizer = TEFusedAdam(model.parameters(),
                        exp_avg_dtype=torch.float16,
                        exp_avg_sq_dtype=torch.float16,
                        use_decoupled_grad=True,
                        master_weights=True)
torch.cuda.empty_cache()

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# One warm-up pass: forward/backward to populate `.grad`, then one optimizer
# step. This used to be a `for _ in tqdm(range(iters))` loop ending in an
# unconditional `break`, which ran exactly once.
# NOTE(review): no `decoupled_grad` exists yet at this point, so this step
# presumably only initialises optimizer state -- confirm.
out = model(inp_ids)
out.logits.mean().backward()
optimizer.step()

# `.grad` must have the same dtype as its parameter, so an fp32 gradient for a
# bf16 parameter has to be stored under a separate attribute.
# The loop variable is deliberately not called `p`, so the notebook-level `p`
# (the first model parameter) is not rebound to the last parameter.
for param in model.parameters():
    if param.grad is None:
        # Nothing to convert, e.g. when this cell is re-run after `.grad`
        # has already been cleared.
        continue
    param.decoupled_grad = param.grad.float()
    param.grad = None

# Time optimizer.step() on its own, using the fp32 decoupled gradients.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)


  0%|          | 0/100 [00:00<?, ?it/s]


186.6219482421875


## Triton Adam
- 基本就是对着TE中的进行写的，接口基本都差不多，无缝衔接Megatron框架
- 目前支持：master weight 为 fp32，model weight 为 bf16，grad 为 fp32 或 bf16 均可，1、2 阶动量为 bf16 或 fp32。无多余功能，基本满足训练需求

### 标准版

In [9]:
# TritonAdamW with master weights enabled.
optimizer = TritonAdamW(model.parameters(), master_weights=True)
torch.cuda.empty_cache()

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# One warm-up step: a forward/backward pass to populate the gradients and one
# optimizer step. This used to be a `for _ in tqdm(range(iters))` loop ending
# in an unconditional `break`, which ran exactly once and left a misleading
# "0/100" progress bar.
out = model(inp_ids)
out.logits.mean().backward()
optimizer.step()

# Time optimizer.step() on its own, reusing the gradients from the warm-up.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)


  0%|          | 0/100 [00:00<?, ?it/s]

finish_custom_init, p_dtype: torch.bfloat16, master_p_dtype: torch.float32





41.33610534667969


### bf16的1，2阶动量

In [11]:
# TritonAdamW with master weights and both moment buffers (exp_avg,
# exp_avg_sq) requested in bf16.
optimizer = TritonAdamW(model.parameters(),
                        exp_avg_dtype=torch.bfloat16,
                        exp_avg_sq_dtype=torch.bfloat16,
                        master_weights=True)
torch.cuda.empty_cache()

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# One warm-up step: a forward/backward pass to populate the gradients and one
# optimizer step. This used to be a `for _ in tqdm(range(iters))` loop ending
# in an unconditional `break`, which ran exactly once and left a misleading
# "0/100" progress bar.
out = model(inp_ids)
out.logits.mean().backward()
optimizer.step()

# Time optimizer.step() on its own, reusing the gradients from the warm-up.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)


  0%|          | 0/100 [00:00<?, ?it/s]

finish_custom_init, p_dtype: torch.bfloat16, master_p_dtype: torch.float32





31.398710250854492


### bf16的1，2阶动量 + fp32的grad
- 这个就是 DeepSeek-V3 中的配置
- 在 Megatron 中，它会使用一个 fp32 的 grad buffer 去存储梯度
- 所有 micro batch 的梯度都通过 hook 累加到这个 buffer 中，下面是伪代码：
  - `grad_buffer += p.grad`
  - `p.grad = None`
- 当所有 micro batch 都计算完后：
  - `optimizer.model_p.decoupled_grad = grad_buffer`

In [12]:
# TritonAdamW with master weights, bf16 moment buffers, and gradients
# supplied through `decoupled_grad` instead of `.grad`.
optimizer = TritonAdamW(model.parameters(),
                        exp_avg_dtype=torch.bfloat16,
                        exp_avg_sq_dtype=torch.bfloat16,
                        use_decoupled_grad=True,
                        master_weights=True)
torch.cuda.empty_cache()

# Fixed dummy batch: token ids 0..127 arranged as 4 rows.
inp_ids = torch.arange(128).reshape(4, -1).cuda()

# One warm-up pass: forward/backward to populate `.grad`, then one optimizer
# step. This used to be a `for _ in tqdm(range(iters))` loop ending in an
# unconditional `break`, which ran exactly once.
# NOTE(review): no `decoupled_grad` exists yet at this point, so this step
# presumably only initialises optimizer state -- confirm.
out = model(inp_ids)
out.logits.mean().backward()
optimizer.step()

# `.grad` must have the same dtype as its parameter, so an fp32 gradient for a
# bf16 parameter has to be stored under a separate attribute.
# The loop variable is deliberately not called `p`, so the notebook-level `p`
# (the first model parameter) is not rebound to the last parameter.
for param in model.parameters():
    if param.grad is None:
        # Nothing to convert, e.g. when this cell is re-run after `.grad`
        # has already been cleared.
        continue
    param.decoupled_grad = param.grad.float()
    param.grad = None

# Time optimizer.step() on its own, using the fp32 decoupled gradients.
ms = triton.testing.do_bench(lambda: optimizer.step(), rep=1000)
print(ms)


  0%|          | 0/100 [00:00<?, ?it/s]

finish_custom_init, p_dtype: torch.bfloat16, master_p_dtype: torch.float32





32.23297119140625
