<h3 style="text-align: center">CrossEntropyLoss</h3> 
<h4 style="text-align: center">Relation to NLLLoss and LogSoftmax</h4>

In [1]:
import torch
import torch.nn as nn

In [2]:
# Toy problem size: 5 samples, each scored against 3 classes.
batch_size = 5
n_classes = 3

# Random logits drawn from a standard normal. No seed is set, so the values
# differ on every fresh run.
x = torch.randn((batch_size, n_classes))
print("feature:", x.shape, x, sep="\n")

feature:
torch.Size([5, 3])
tensor([[-0.9997,  0.9024,  1.5058],
        [ 1.2295, -1.2791,  0.6434],
        [-1.9006,  1.4064,  1.3669],
        [-0.8453,  0.9917,  0.6591],
        [ 0.3560,  0.9731,  0.3480]])


In [3]:
# One random class index in [0, n_classes) per sample, stored as int64.
target = torch.randint(low=0, high=n_classes, size=(batch_size,), dtype=torch.long)
print("target:", target, sep="\n")

target:
tensor([2, 0, 0, 0, 2])


Explicit definitions

In [4]:
def softmax2(x):
    """Softmax over dim 1 of ``x``, written with ``torch.*`` function calls.

    The per-row maximum is subtracted before exponentiating. Softmax does not
    change when a constant is added to every logit of a row, so the result is
    mathematically equal to ``exp(x) / sum(exp(x))``, but large logits no
    longer overflow ``exp`` to ``inf`` and turn the whole row into ``nan``.

    Args:
        x: tensor with the classes along dim 1, e.g. ``(batch, n_classes)``.

    Returns:
        Tensor shaped like ``x`` whose entries along dim 1 sum to 1.
    """
    shifted = x - torch.max(x, dim=1, keepdim=True).values
    exps = torch.exp(shifted)  # computed once, reused for numerator and denominator
    return exps / torch.sum(exps, dim=1, keepdim=True)
def softmax(x):
    """Softmax over the last dimension of ``x``, written with tensor methods.

    The maximum along the last dimension is subtracted first. This leaves the
    softmax value unchanged but keeps ``exp`` from overflowing to ``inf`` on
    large logits, which would otherwise produce ``nan`` rows.

    Args:
        x: tensor with the classes along the last dimension.

    Returns:
        Tensor shaped like ``x`` whose entries along the last dimension sum to 1.
    """
    shifted = x - x.max(dim=-1, keepdim=True).values
    exps = shifted.exp()  # computed once, reused for numerator and denominator
    return exps / exps.sum(dim=-1, keepdim=True)
def log_softmax(x): return x - x.exp().sum(-1).log().unsqueeze(-1)
def nl(input, target):
    """Mean negative log of the entries of ``input`` selected by ``target``.

    Row ``i`` contributes ``input[i, target[i]]``; the log is taken here, so
    ``input`` is expected to hold probabilities rather than log-probabilities.
    """
    rows = torch.arange(target.shape[0])
    picked = input[rows, target]  # one entry per row: (i, target[i])
    return -picked.log().mean()
def nll(input, target):
    """Mean of the negated entries of ``input`` selected by ``target``.

    Row ``i`` contributes ``input[i, target[i]]``. No log is applied, so
    ``input`` is expected to already hold log-probabilities.
    """
    rows = torch.arange(target.shape[0])
    picked = input[rows, target]  # one entry per row: (i, target[i])
    return -picked.mean()

The four expressions below produce the same log-probabilities:

In [5]:
# Four routes to the same log-probabilities: the hand-written log_softmax,
# the nn.LogSoftmax module, and the log of each softmax variant.
log_prob_variants = (
    log_softmax(x),
    nn.LogSoftmax(dim=1)(x),
    torch.log(softmax(x)),
    torch.log(nn.Softmax(dim=1)(x)),
)
for log_probs in log_prob_variants:
    print(log_probs)

tensor([[-2.9932, -1.0911, -0.4877],
        [-0.4934, -3.0020, -1.0795],
        [-3.9991, -0.6921, -0.7316],
        [-2.4664, -0.6293, -0.9619],
        [-1.3469, -0.7298, -1.3550]])
tensor([[-2.9932, -1.0911, -0.4877],
        [-0.4934, -3.0020, -1.0795],
        [-3.9991, -0.6921, -0.7316],
        [-2.4664, -0.6293, -0.9619],
        [-1.3469, -0.7298, -1.3550]])
tensor([[-2.9932, -1.0911, -0.4877],
        [-0.4934, -3.0020, -1.0795],
        [-3.9991, -0.6921, -0.7316],
        [-2.4664, -0.6293, -0.9619],
        [-1.3469, -0.7298, -1.3550]])
tensor([[-2.9932, -1.0911, -0.4877],
        [-0.4934, -3.0020, -1.0795],
        [-3.9991, -0.6921, -0.7316],
        [-2.4664, -0.6293, -0.9619],
        [-1.3469, -0.7298, -1.3550]])


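<p>CrossEntropyLoss(x, target) = NLLLoss(LogSoftmax(x), target)</p>
<p>This criterion combines nn.LogSoftmax() and nn.NLLLoss() in a single class. Because log-probabilities already satisfy log(sum(exp(·))) = 0, applying LogSoftmax a second time changes nothing, which is why passing LogSoftmax(x) to CrossEntropyLoss gives the same loss as passing x.</p>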

In [6]:
# Two-step form: log-softmax over the class dimension, then NLLLoss.
to_log_probs = nn.LogSoftmax(dim=1)
nll_criterion = nn.NLLLoss()
nll_criterion(to_log_probs(x), target)

tensor(1.7603)

In [7]:
# Same two-step loss, using the hand-written helpers instead of the nn modules.
nll(input=log_softmax(x), target=target)

tensor(1.7603)

In [8]:
# One-step form: CrossEntropyLoss takes the raw scores directly.
ce_criterion = nn.CrossEntropyLoss()
ce_criterion(x, target)

tensor(1.7603)

In [9]:
# CrossEntropyLoss applies log-softmax itself, so x goes through log-softmax
# twice here. The loss is unchanged because log-probabilities already have
# log(sum(exp(.))) = 0, making the second log-softmax a no-op.
log_probs_in = nn.LogSoftmax(dim=1)(x)
nn.CrossEntropyLoss()(log_probs_in, target)

tensor(1.7603)

In [10]:
# Side-by-side comparison of the four module-based formulations:
#   1: CrossEntropyLoss on raw scores
#   2: NLLLoss on LogSoftmax output
#   3: CrossEntropyLoss on LogSoftmax output (log-softmax applied twice)
#   4: NLLLoss on LogSoftmax applied twice
labelled_losses = (
    ("1", nn.CrossEntropyLoss()(x, target)),
    ("2", nn.NLLLoss()(nn.LogSoftmax(dim=1)(x), target)),
    ("3", nn.CrossEntropyLoss()(nn.LogSoftmax(dim=1)(x), target)),
    ("4", nn.NLLLoss()(nn.LogSoftmax(dim=1)(nn.LogSoftmax(dim=1)(x)), target)),
)
for label, loss in labelled_losses:
    print(label, loss)

1 tensor(1.7603)
2 tensor(1.7603)
3 tensor(1.7603)
4 tensor(1.7603)


In [11]:
# Baseline with the hand-written helpers: log_softmax applied once.
log_probs_once = log_softmax(x)
nll(log_probs_once, target)

tensor(1.7603)

In [12]:
# log_softmax applied twice: the second pass subtracts log(sum(exp(.))) of
# values that are already log-probabilities, i.e. log(1) = 0.
log_probs_twice = log_softmax(log_softmax(x))
nll(log_probs_twice, target)

tensor(1.7603)

NLLLoss "mechanism"

In [13]:
# Hand-picked class indices, one per row of x. They get their own name so the
# random `target` used by the loss cells above is not overwritten (re-running
# those cells afterwards would otherwise give different numbers).
demo_target = torch.tensor([1, 2, 2, 2, 0], dtype=torch.long)

# Pairing row index i with column demo_target[i] selects one entry per row.
# This is the selection the explicit nll() performs before averaging.
row_index = torch.arange(demo_target.shape[0])
print(x[row_index, demo_target])

tensor([0.9024, 0.6434, 1.3669, 0.6591, 0.3560])


In [14]:
# Matrix diagonal selection.
# y is 5x5 with zeros on the main diagonal and ones everywhere else.
y = torch.ones(5, 5) - torch.eye(5)

# The two index lists are consumed pairwise: position i addresses the element
# (ax0[i], ax1[i]). Identical lists therefore walk the main diagonal.
ax0 = list(range(5))
ax1 = list(range(5))

print(y[ax0, ax1])

tensor([0., 0., 0., 0., 0.])


<p>Reference:</p>
<a href="https://medium.com/@zhang_yang/understanding-cross-entropy-implementation-in-pytorch-softmax-log-softmax-nll-cross-entropy-416a2b200e34">How is Pytorch’s Cross Entropy function related to softmax, log softmax, and NLL</a>