### Embedding
Embedding层经过训练后可以得到词向量。需要注意：当使用稀疏梯度（sparse=True）时，目前只有3种优化器支持稀疏梯度更新；使用默认的稠密梯度时，常规优化器都可以训练embedding层。
Keep in mind that only a limited number of optimizers support sparse gradients: currently it’s optim.SGD (CUDA and CPU), optim.SparseAdam (CUDA and CPU) and optim.Adagrad (CPU).

另外from_pretrained方法可以直接加载预训练的词向量。

With padding_idx set, the embedding vector at padding_idx is initialized to all zeros. However, note that this vector can be modified afterwards, e.g., using a customized initialization method, and thus changing the vector used to pad the output. The gradient for this vector from Embedding is always zero.


In [1]:
import torch
import torch.nn as nn
# Vocabulary: maps each token to its row index in the embedding table.
word_to_ix = dict(hello=0, world=1)

# Embedding table: 2 words in the vocabulary, 5-dimensional vectors.
embeds = nn.Embedding(num_embeddings=2, embedding_dim=5)

# Look up both tokens in a single call; embedding indices must be int64.
token_ids = [word_to_ix[token] for token in ("hello", "world")]
lookup_tensor = torch.tensor(token_ids, dtype=torch.long)

# One row per looked-up token (despite the name, this holds both vectors).
hello_embed = embeds(lookup_tensor)
print(hello_embed)

tensor([[-0.1860, -0.7426, -0.8872,  0.1476, -0.2419],
        [ 0.8223,  1.1087,  0.2378,  0.3572, -0.7053]],
       grad_fn=<EmbeddingBackward>)


### 分类的几种损失函数相关例子

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import logging
import math
def configure_logging(level=logging.INFO):
    """Configure the root logger with a timestamped, file/line-tagged format.

    Args:
        level: Minimum severity passed to ``logging.basicConfig``
            (defaults to ``logging.INFO``).

    Note:
        ``logging.basicConfig`` does nothing if the root logger already has
        handlers, so only the first effective call takes hold.
    """
    # Local names deliberately avoid shadowing the builtin ``format``.
    log_format = '%(asctime)s %(filename)s:%(lineno)d %(levelname)s] %(message)s'
    date_format = '%Y-%m-%d %H:%M:%S'
    logging.basicConfig(level=level, format=log_format, datefmt=date_format)


configure_logging()

# Simulated final network output and targets: logits of shape
# [batch_size=3, num_labels=5] plus one class index per sample.
# Suitable for binary or multi-class classification where the classes are
# mutually exclusive (each sample belongs to exactly one class).
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
logging.info(input)
logging.info(target)

# --- Compare CrossEntropyLoss against LogSoftmax + NLLLoss ---
loss1 = nn.CrossEntropyLoss()

m = nn.LogSoftmax(dim=1)
loss2 = nn.NLLLoss()

l1 = loss1(input, target)
l2 = loss2(m(input), target)
# Compare with a tolerance: the two paths are mathematically identical but
# are not guaranteed to be bit-for-bit equal in floating point.
if torch.allclose(l1, l2):
    logging.info('CrossEntropyLoss = LogSoftmax + NLLLoss')

# --- Reproduce the NLLLoss computation by hand ---
# Negative log-probabilities per class. `dim=1` is given explicitly (the
# implicit-dim form of softmax is deprecated and warns), and log_softmax is
# used instead of log(softmax(x)) for numerical stability.
input_log_softmax = -F.log_softmax(input, dim=1)
logging.info(input_log_softmax)
# The one-hot mask selects each sample's target-class entry.
target_one_hot = F.one_hot(target, num_classes=5)
logging.info(target_one_hot)
a = input_log_softmax * target_one_hot.float()
logging.info(a)
# Sum over classes keeps only the target entry; the mean over the batch
# matches NLLLoss's default 'mean' reduction.
b = torch.mean(torch.sum(a, dim=1))
logging.info(b)
logging.info(l2)
if torch.allclose(b, l2):
    logging.info('NLLLoss compute example')

logging.info('=' * 60)
# --- Reproduce the BCELoss computation by hand ---
# 3 samples x 3 classes; each row of `target` marks which classes the sample
# belongs to. Suitable for multi-label classification (one sample may belong
# to several classes at once).
input = torch.randn(3, 3, requires_grad=True)
target = torch.tensor([[0, 1, 1], [0, 0, 1], [1, 0, 1]], dtype=torch.float)
logging.info(input)
logging.info(target)

m = nn.Sigmoid()
# Per-class probabilities; computed once and reused below.
a = m(input)
logging.info(a)

loss = nn.BCELoss()
loss2 = nn.BCEWithLogitsLoss()
bce = loss(a, target)
logging.info(bce)

# Element-wise log-likelihood: y * log(p) + (1 - y) * log(1 - p).
b = target * torch.log(a) + (1 - target) * torch.log(1 - a)
logging.info(b)

# BCELoss (default 'mean' reduction) is the negated mean of that term.
c = torch.mean(b)
logging.info(-c)
# Tolerance-based comparison: the manual and built-in computations need not
# be bit-for-bit equal in floating point.
if torch.allclose(bce, -c):
    logging.info('BCELoss compute example')

# BCEWithLogitsLoss takes raw logits and applies the sigmoid internally.
bce_with_logits = loss2(input, target)
logging.info(bce_with_logits)
logging.info(bce)

res1 = bce.item()
res2 = bce_with_logits.item()

if math.isclose(res1, res2, rel_tol=1e-5):
    logging.info('BCELoss === BCEWithLogitsLoss')

2019-10-07 16:41:17 <ipython-input-1-91a7ef90ce55>:20 INFO] tensor([[ 0.5893,  1.2365, -0.4431,  0.5678,  0.2332],
        [-1.0433, -1.4576, -0.4088, -2.3691,  0.0728],
        [ 0.8016, -0.1755, -2.2973, -0.0646, -0.0961]], requires_grad=True)
2019-10-07 16:41:17 <ipython-input-1-91a7ef90ce55>:21 INFO] tensor([3, 3, 0])
2019-10-07 16:41:17 <ipython-input-1-91a7ef90ce55>:33 INFO] CrossEntropyLoss = LogSoftmax + NLLLoss
2019-10-07 16:41:17 <ipython-input-1-91a7ef90ce55>:38 INFO] tensor([[1.5985, 0.9513, 2.6309, 1.6199, 1.9545],
        [1.9265, 2.3408, 1.2920, 3.2523, 0.8104],
        [0.8108, 1.7878, 3.9096, 1.6769, 1.7084]], grad_fn=<NegBackward>)
2019-10-07 16:41:17 <ipython-input-1-91a7ef90ce55>:40 INFO] tensor([[0, 0, 0, 1, 0],
        [0, 0, 0, 1, 0],
        [1, 0, 0, 0, 0]])
2019-10-07 16:41:17 <ipython-input-1-91a7ef90ce55>:42 INFO] tensor([[0.0000, 0.0000, 0.0000, 1.6199, 0.0000],
        [0.0000, 0.0000, 0.0000, 3.2523, 0.0000],
        [0.8108, 0.0000, 0.0000, 0.0000, 0.000

### 优化器相关例子

In [None]:
import torchvision
import torch
import torch.optim as optim
from torch.optim import lr_scheduler
from torchvision.models import AlexNet
import matplotlib.pyplot as plt
"""
根据epoch来调整学习率的方式，可以结合不同的参数用不同的学习率方式
"""
model = AlexNet(num_classes=2)
optimizer = optim.SGD(params=model.parameters(), lr=0.05)

# lr_scheduler.StepLR()
# Assuming optimizer uses lr = 0.05 for all groups
# lr = 0.05     if epoch < 30
# lr = 0.005    if 30 <= epoch < 60
# lr = 0.0005   if 60 <= epoch < 90

scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
num_epochs = 100
plt.figure()
x = list(range(num_epochs))
y = []
for epoch in range(num_epochs):
    # Read the learning rate in effect for this epoch straight from the
    # optimizer. scheduler.get_lr() is not meant to be called outside of
    # scheduler.step() and can misreport the value at decay boundaries.
    lr = optimizer.param_groups[0]['lr']
    print(epoch, lr)
    y.append(lr)

    # ... the training pass for this epoch would go here ...
    # No gradients exist in this demo, so optimizer.step() updates nothing;
    # it is kept to show the required order: optimizer.step() first, then
    # scheduler.step() once at the END of the epoch. Stepping the scheduler
    # first would shift the schedule by one epoch relative to the table above.
    optimizer.step()
    scheduler.step()

plt.plot(x, y)
"""
# 网络中不同的参数用不同的学习率来学习的方式
"""
# Train different parameters of one network with different optimizer
# settings by giving each parameter its own parameter group.
model = torchvision.models.resnet18()
paras = dict(model.named_parameters())

# Overview of every parameter: name, shape and whether it is trainable.
for k, v in paras.items():
    print(k.ljust(30), str(v.shape).ljust(30), 'requires_grad:', v.requires_grad)

# One group per parameter: parameters whose name contains 'bias' get a higher
# learning rate and no weight decay; all others get the lower learning rate
# with a small weight decay.
paras_new = []
for k, v in paras.items():
    if 'bias' in k:
        paras_new.append({'params': [v], 'lr': 0.02, 'weight_decay': 0})
    else:
        paras_new.append({'params': [v], 'lr': 0.01, 'weight_decay': 0.00004})
optimizer = torch.optim.SGD(paras_new, momentum=0.9)

# Print the effective settings of every group. For 'params' only the shape of
# the (single) tensor is shown instead of its full contents.
for p in optimizer.param_groups:
    parts = []
    for k, v in p.items():
        # Compare strings by value; `is` tests object identity and only
        # appears to work when the interpreter happens to intern the string.
        if k == 'params':
            parts.append(k + ': ' + str(v[0].shape).ljust(30) + ' ')
        else:
            parts.append(k + ': ' + str(v).ljust(10) + ' ')
    outputs = ''.join(parts)
    print(outputs)