In [1]:
# https://github.com/patrickloeber/pytorchTutorial/blob/master/11_softmax_and_crossentropy.py

import numpy as np
import torch
import torch.nn as nn

In [2]:
#        -> 2.0              -> 0.65  
# Linear -> 1.0  -> Softmax  -> 0.25   -> CrossEntropy(y, y_hat)
#        -> 0.1              -> 0.1                   
#
#     scores(logits)      probabilities
#                           sum = 1.0

# softmax applies the exponential function to each element, and normalizes
# by dividing by the sum of all these exponentials
# -> squashes the output to be between 0 and 1 = probability
# sum of all probabilities is 1
def softmax(x):
    """Numerically stable softmax along axis 0.

    Args:
        x: array-like of scores (logits). For a 1-D input the whole vector
            is normalized; for a 2-D input each column is normalized
            independently (the reduction is over axis 0).

    Returns:
        numpy array with the same shape as ``x`` whose entries lie in [0, 1]
        and sum to 1 along axis 0.
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalize; np.max would raise on an empty reduction
        return np.exp(x)
    # Subtracting the max does not change the result (the common factor
    # e^(-max) cancels in the ratio) but keeps np.exp from overflowing to
    # inf, which would otherwise yield nan for large logits.
    shifted = x - np.max(x, axis=0)
    # np.exp(shifted) is e^x for all elements; compute it only once
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=0)

In [3]:
# same three logits as in the diagram of the previous cell
x = np.array([2.0, 1.0, 0.1])
outputs = softmax(x)
# a bare expression on the last line is what the notebook displays as the cell output
f"softmax numpy: {outputs}"

'softmax numpy: [0.65900114 0.24243297 0.09856589]'

In [4]:
# same logits as the numpy example, now as a torch tensor
x = torch.tensor([2.0, 1.0, 0.1])
outputs = torch.softmax(x, dim=0) # normalize along dim 0 (the only axis of this 1-D tensor)
f"softmax torch: {outputs}"

'softmax torch: tensor([0.6590, 0.2424, 0.0986])'

In [5]:
# cross-entropy loss, or log loss, measures the performance of a classification
# model whose output is a probability value between 0 and 1
# -> loss increases as the predicted probability diverges from the actual label
def cross_entropy(actual, predicted):
    """Cross-entropy (log loss) between one-hot labels and probabilities.

    Args:
        actual: one-hot encoded true labels.
        predicted: predicted probabilities, same shape as ``actual``.

    Returns:
        The summed (un-normalized) loss ``-sum(actual * log(predicted))``.
    """
    tiny = 1e-15
    # keep every probability inside [tiny, 1 - tiny] so that log() never
    # receives an exact 0
    safe_probs = np.clip(predicted, tiny, 1 - tiny)
    log_likelihood = np.sum(actual * np.log(safe_probs))
    # the 1/N normalization from the formula is intentionally left out;
    # divide by float(predicted.shape[0]) to obtain it
    return -log_likelihood

In [6]:
# y must be one hot encoded
# one-hot encoded: representation of categorical variables as binary vectors
# (as shown below)
# if class 0: [1 0 0]
# if class 1: [0 1 0]
# if class 2: [0 0 1]
Y = np.array([1, 0, 0])  # true class is 0
# the predictions here are probabilities (already normalized), not logits
Y_pred_good = np.array([0.7, 0.2, 0.1])  # 0.7 on the true class
Y_pred_bad = np.array([0.1, 0.3, 0.6])  # only 0.1 on the true class
# with a one-hot Y only the true-class term survives: loss = -log(p_true)
l1 = cross_entropy(Y, Y_pred_good)  # -log(0.7)
l2 = cross_entropy(Y, Y_pred_bad)  # -log(0.1)
print(f"loss1 numpy: {l1:.4f}\nloss2 numpy: {l2:.4f}")

loss1 numpy: 0.3567
loss2 numpy: 2.3026


In [7]:
# CrossEntropyLoss in PyTorch applies the softmax itself:
# it is equivalent to nn.LogSoftmax followed by nn.NLLLoss
# NLLLoss = negative log likelihood loss
loss = nn.CrossEntropyLoss()
# call signature: loss(input, target)

# target is of size n_samples = 1
# each element has class label: 0, 1, or 2
# Y (=target) contains class labels, not one-hot
Y = torch.tensor([0])

# input is of size n_samples x n_classes = 1 x 3
# Y_pred (=input) must be raw, unnormalized scores (logits) for each class, not softmax output
Y_pred_good = torch.tensor([[2.0, 1.0, 0.1]])
Y_pred_bad = torch.tensor([[0.5, 2.0, 0.3]])
l1 = loss(Y_pred_good, Y)
l2 = loss(Y_pred_bad, Y)
print(f"loss1 torch: {l1:.4f}\nloss2 torch: {l2:.4f}")

loss1 torch: 0.4170
loss2 torch: 1.8406


In [8]:
# get predictions: the index of the largest logit in each row (dim=1) is the
# predicted class.
# torch.argmax returns those indices directly, so no throwaway `_` is bound
# (assigning `_` at notebook top level shadows IPython's last-output variable)
predictions1 = torch.argmax(Y_pred_good, dim=1)
predictions2 = torch.argmax(Y_pred_bad, dim=1)
# .item() is valid here because every tensor holds exactly one element
print(f"actual class: {Y.item()}, Y_pred1: {predictions1.item()}, Y_pred2: {predictions2.item()}")

actual class: 0, Y_pred1: 0, Y_pred2: 1


In [9]:
# allows batch loss for multiple samples
# (`loss` is the nn.CrossEntropyLoss instance created in an earlier cell)

# target is of size nBatch = 3
# each element has class label: 0, 1, or 2
Y = torch.tensor([2, 0, 1])

# input is of size nBatch x nClasses = 3 x 3
# Y_pred are logits (not softmax)
Y_pred_good = torch.tensor(
    [[0.1, 0.2, 3.9], # predict class 2
    [1.2, 0.1, 0.3],  # predict class 0
    [0.3, 2.2, 0.2]]) # predict class 1

# every row puts its largest logit on a wrong class
Y_pred_bad = torch.tensor(
    [[0.9, 0.2, 0.1], # predict class 0 (target is 2)
    [0.1, 0.3, 1.5],  # predict class 2 (target is 0)
    [1.2, 0.2, 0.5]]) # predict class 0 (target is 1)

# one scalar per call: with the default reduction the per-sample losses are averaged
l1 = loss(Y_pred_good, Y)
l2 = loss(Y_pred_bad, Y)
print(f"batch loss1: {l1.item():.4f}\nbatch loss2: {l2.item():.4f}")

batch loss1: 0.2834
batch loss2: 1.6418


In [10]:
# get predictions: one class index per sample, taken as the position of the
# largest logit in each row (dim=1).
# torch.argmax returns those indices directly, so no throwaway `_` is bound
# (assigning `_` at notebook top level shadows IPython's last-output variable)
predictions1 = torch.argmax(Y_pred_good, dim=1)
predictions2 = torch.argmax(Y_pred_bad, dim=1)
print(f"actual class: {Y}\nY_pred1: {predictions1}\nY_pred2: {predictions2}")

actual class: tensor([2, 0, 1])
Y_pred1: tensor([2, 0, 1])
Y_pred2: tensor([0, 2, 0])


In [11]:
# binary classification
class NeuralNet1(nn.Module):
    """Two-layer network for binary classification.

    Architecture: Linear(input_size -> hidden_size) -> ReLU ->
    Linear(hidden_size -> 1) -> sigmoid.
    """

    def __init__(self, input_size, hidden_size):
        """Declare the layers; no computation happens here.

        Args:
            input_size: number of input features per sample.
            hidden_size: number of units in the hidden layer.
        """
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        # a yes/no (binary) question needs only a single output unit
        self.linear2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one value in (0, 1) per sample."""
        hidden = self.relu(self.linear1(x))
        score = self.linear2(hidden)
        # sigmoid squashes the single score into a probability
        return torch.sigmoid(score)

# 28*28 = 784 input features (e.g. a flattened 28x28 image)
model = NeuralNet1(input_size=28*28, hidden_size=5)
# BCELoss expects probabilities, which is why forward() ends with a sigmoid
criterion = nn.BCELoss()

In [12]:
# multiclass problem
class NeuralNet2(nn.Module):
    """Two-layer network for multiclass classification.

    Architecture: Linear(input_size -> hidden_size) -> ReLU ->
    Linear(hidden_size -> num_classes). The output is raw scores (logits).
    """

    def __init__(self, input_size, hidden_size, num_classes):
        """Declare the layers; no computation happens here.

        Args:
            input_size: number of input features per sample.
            hidden_size: number of units in the hidden layer.
            num_classes: number of output scores, one per class.
        """
        super().__init__()
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        # one output score per class
        self.linear2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw class scores (logits) for each sample."""
        hidden = self.relu(self.linear1(x))
        # no softmax at the end: the logits are returned as-is
        return self.linear2(hidden)

# A multiclass model needs more than one output: with num_classes=1 the softmax
# over a single logit is always 1, so the cross-entropy loss is always 0 and
# nothing can be learned. Use 3 classes (labels 0, 1, 2), as in the loss
# examples earlier in this notebook.
model = NeuralNet2(input_size=28*28, hidden_size=5, num_classes=3)
criterion = nn.CrossEntropyLoss() # (applies softmax)

# some guide
## __init__()
1. calls super(Class, self).__init__()
2. you declare the layers here, but don't do any kind of computation
## forward()
1. this is the part where you do the computations
2. i.e. pass the input through the layers in order, feeding each layer's output into the next layer