In [6]:
import paddle
import numpy as np

# Dynamic-graph (paddle's default mode) demo: train a small linear layer with
# SGD while a LinearWarmup scheduler drives the learning rate.
NUM_EPOCHS = 20
BATCHES_PER_EPOCH = 2

linear = paddle.nn.Linear(10, 10)

# verbose=True makes the scheduler print the learning rate it sets on each step.
scheduler = paddle.optimizer.lr.LinearWarmup(
    learning_rate=0.5,
    warmup_steps=20,
    start_lr=0,
    end_lr=0.5,
    verbose=True,
)
sgd = paddle.optimizer.SGD(
    learning_rate=scheduler,
    parameters=linear.parameters(),
)

for epoch in range(NUM_EPOCHS):
    for batch_id in range(BATCHES_PER_EPOCH):
        # Forward pass on a random batch; the mean of the output is the loss.
        x = paddle.uniform([10, 10])
        out = linear(x)
        loss = paddle.mean(out)

        # Backward pass, parameter update, then reset the gradients.
        loss.backward()
        sgd.step()
        sgd.clear_gradients()

        # The learning rate is advanced once per batch here.
        scheduler.step()
    # To advance the learning rate once per epoch instead, call
    # scheduler.step() at this indentation level rather than inside the
    # batch loop.


Epoch 0: LinearWarmup set learning rate to 0.0.
Epoch 1: LinearWarmup set learning rate to 0.025.
Epoch 2: LinearWarmup set learning rate to 0.05.
Epoch 3: LinearWarmup set learning rate to 0.075.
Epoch 4: LinearWarmup set learning rate to 0.1.
Epoch 5: LinearWarmup set learning rate to 0.125.
Epoch 6: LinearWarmup set learning rate to 0.15.
Epoch 7: LinearWarmup set learning rate to 0.175.
Epoch 8: LinearWarmup set learning rate to 0.2.
Epoch 9: LinearWarmup set learning rate to 0.225.
Epoch 10: LinearWarmup set learning rate to 0.25.
Epoch 11: LinearWarmup set learning rate to 0.275.
Epoch 12: LinearWarmup set learning rate to 0.3.
Epoch 13: LinearWarmup set learning rate to 0.325.
Epoch 14: LinearWarmup set learning rate to 0.35.
Epoch 15: LinearWarmup set learning rate to 0.375.
Epoch 16: LinearWarmup set learning rate to 0.4.
Epoch 17: LinearWarmup set learning rate to 0.425.
Epoch 18: LinearWarmup set learning rate to 0.45.
Epoch 19: LinearWarmup set learning rate to 0.475.
Epoch

In [None]:
import matplotlib.pyplot as plt

import paddle
from paddlenlp.transformers import *

# The six warmup schedulers verified below:
#     LinearDecayWithWarmup
#     ConstScheduleWithWarmup
#     CosineDecayWithWarmup
#     PolyDecayWithWarmup
#     CosineAnnealingWithWarmupDecay
#     LinearAnnealingWithWarmupDecay

linear = paddle.nn.Linear(10, 10)
lr, max_steps = 5e-5, 1000
# Proportion of the schedule used for learning-rate warmup.
warmup_proportion = 0.1
# Learning-rate decay proportion; decay_step is max_steps * (1 - decay_proportion).
decay_proportion = 0.3

# Weight-decay coefficient: acts like a regularisation term to limit overfitting.
weight_decay = 0.01

# 1-based step indices; also used as the x-axis when plotting the curves.
step = list(range(1, max_steps + 1))


def record_lr_schedule(lr_scheduler, parameters, num_steps, weight_decay=0.01):
    """Step an AdamW optimizer and its scheduler together, recording the LR.

    No loss or backward pass is computed; the loop exists only to sample the
    learning rate the optimizer reports before each scheduler step.

    Args:
        lr_scheduler: Learning-rate scheduler passed to AdamW as its learning rate.
        parameters: Parameters handed to the optimizer.
        num_steps: Number of optimizer/scheduler steps to run.
        weight_decay: Weight-decay coefficient passed to AdamW.

    Returns:
        A ``(optimizer, learning_rates)`` tuple, where ``learning_rates`` holds
        ``optimizer.get_lr()`` sampled once per step (``num_steps`` entries).
    """
    optimizer = paddle.optimizer.AdamW(lr_scheduler,
                                       parameters=parameters,
                                       weight_decay=weight_decay)
    learning_rates = []
    for _ in range(num_steps):
        optimizer.step()
        # Sample the LR before advancing the scheduler, so the first entry is
        # the scheduler's initial value.
        learning_rates.append(optimizer.get_lr())
        lr_scheduler.step()
    return optimizer, learning_rates


# Check 1: LinearDecayWithWarmup(learning_rate, total_steps, warmup, last_epoch=-1, verbose=False)
lr_scheduler1 = LinearDecayWithWarmup(learning_rate=lr, total_steps=max_steps, warmup=warmup_proportion)
adam1, learning_rate1 = record_lr_schedule(lr_scheduler1, linear.parameters(), max_steps, weight_decay)

# Check 2: ConstScheduleWithWarmup(learning_rate, warmup, total_steps=None, last_epoch=-1, verbose=False)
lr_scheduler2 = ConstScheduleWithWarmup(learning_rate=lr, total_steps=max_steps, warmup=warmup_proportion)
adam2, learning_rate2 = record_lr_schedule(lr_scheduler2, linear.parameters(), max_steps, weight_decay)

# Check 3: CosineDecayWithWarmup(learning_rate, total_steps, warmup, with_hard_restarts=False, num_cycles=None, last_epoch=-1, verbose=False)
lr_scheduler3 = CosineDecayWithWarmup(learning_rate=lr, total_steps=max_steps, warmup=warmup_proportion)
adam3, learning_rate3 = record_lr_schedule(lr_scheduler3, linear.parameters(), max_steps, weight_decay)

# Check 4: PolyDecayWithWarmup(learning_rate, total_steps, warmup, lr_end=1e-07, power=1.0, last_epoch=-1, verbose=False)
lr_scheduler4 = PolyDecayWithWarmup(learning_rate=lr, total_steps=max_steps, warmup=warmup_proportion)
adam4, learning_rate4 = record_lr_schedule(lr_scheduler4, linear.parameters(), max_steps, weight_decay)

# The two annealing schedulers take absolute step counts rather than proportions.
warmup_step = max_steps * warmup_proportion
decay_step = max_steps * (1 - decay_proportion)

# Check 5: CosineAnnealingWithWarmupDecay(max_lr, min_lr, warmup_step, decay_step, last_epoch=-1, verbose=False)
lr_scheduler5 = CosineAnnealingWithWarmupDecay(max_lr=lr, min_lr=1e-07, warmup_step=warmup_step, decay_step=decay_step)
adam5, learning_rate5 = record_lr_schedule(lr_scheduler5, linear.parameters(), max_steps, weight_decay)

# Check 6: LinearAnnealingWithWarmupDecay(max_lr, min_lr, warmup_step, decay_step, last_epoch=-1, verbose=False)
lr_scheduler6 = LinearAnnealingWithWarmupDecay(max_lr=lr, min_lr=1e-07, warmup_step=warmup_step, decay_step=decay_step)
adam6, learning_rate6 = record_lr_schedule(lr_scheduler6, linear.parameters(), max_steps, weight_decay)



In [None]:
import matplotlib.pyplot as plt

# One (title, recorded learning-rate curve) pair per panel, listed in the
# row-major order in which they fill the 2x3 grid.
lr_curves = [
    ('LinearDecayWithWarmup', learning_rate1),
    ('ConstScheduleWithWarmup', learning_rate2),
    ('CosineDecayWithWarmup', learning_rate3),
    ('PolyDecayWithWarmup', learning_rate4),
    ('CosineAnnealingWithWarmupDecay', learning_rate5),
    ('LinearAnnealingWithWarmupDecay', learning_rate6),
]

fig = plt.figure(figsize=(16, 9))

axes = []
for position, (title, curve) in enumerate(lr_curves, start=1):
    # add_subplot(2, 3, position) is the explicit form of add_subplot(23<position>).
    ax = fig.add_subplot(2, 3, position)
    ax.plot(step, curve)
    # Scientific notation on both axes: the learning rates are on the order of 1e-5.
    ax.ticklabel_format(axis="both", style="sci", scilimits=(0, 0))
    ax.set_title(title)
    # Label the axes so each panel can be read on its own.
    ax.set_xlabel('step')
    ax.set_ylabel('learning rate')
    axes.append(ax)

# Keep the per-panel axis names that the original cell defined.
ax1, ax2, ax3, ax4, ax5, ax6 = axes

# Prevent the added axis labels from overlapping neighbouring panels.
fig.tight_layout()
plt.show()