In [1]:
import sys
sys.path.append('../../pyutils')

import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

import metrics

# Mean Squared Error (MSE)

The predictions of the model are denoted $\hat{y}$, the true values are denoted $y$.  
This is a 1D version, without averaging. To handle averaged losses or multi-dimensional data, reshape the inputs and multiply the result by a constant factor.

# Forward pass

$$l = \sum_{i=1}^n(\hat{y}_i - y_i)^2, \space \hat{y}, y \in \mathbb{R}^n, l \in \mathbb{R}$$

# Backward pass

$$\frac{\partial E}{\partial \hat{y}} = 2 * \frac{\partial E}{\partial l} * (\hat{y} - y)$$
$$\frac{\partial E}{\partial y} = 2 * \frac{\partial E}{\partial l} * (y - \hat{y})$$

In [16]:
# Seed NumPy's global RNG so the random inputs drawn later in this cell are reproducible.
np.random.seed(12)

def mse(pred, target):
    """Return the sum of squared differences between ``pred`` and ``target``.

    No averaging is applied: the result is the plain sum over every element.

    Args:
        pred: predicted values.
        target: true values, broadcastable against ``pred``.

    Returns:
        The scalar ``sum((pred - target) ** 2)``.
    """
    residual = pred - target
    return np.sum(np.square(residual))

def mse_dpred(pred, target, dout):
    """Gradient of the summed squared error with respect to ``pred``.

    Args:
        pred: predicted values.
        target: true values, broadcastable against ``pred``.
        dout: upstream gradient with respect to the loss value.

    Returns:
        ``2 * dout * (pred - target)``, with the shape of ``pred - target``.
    """
    upstream = 2 * dout
    residual = pred - target
    return upstream * residual

def mse_dtarget(pred, target, dout):
    """Gradient of the summed squared error with respect to ``target``.

    Args:
        pred: predicted values.
        target: true values, broadcastable against ``pred``.
        dout: upstream gradient with respect to the loss value.

    Returns:
        ``2 * dout * (target - pred)``, with the shape of ``target - pred``.
    """
    upstream = 2 * dout
    residual = target - pred
    return upstream * residual

# Random float32 inputs of shape (26, 4, 8) for the NumPy-vs-PyTorch comparison.
pred = np.random.randn(26, 4, 8).astype(np.float32)
target = np.random.randn(26, 4, 8).astype(np.float32)
# NumPy forward pass.
l = mse(pred, target)
# Downstream objective E = l**2, so the upstream gradient dE/dl = 2*l is not trivially 1.
loss = np.sum(l**2)
dl = 2*l
# NumPy backward pass, fed with dE/dl.
dpred = mse_dpred(pred, target, dl)
dtarget = mse_dtarget(pred, target, dl)


# Same computation in PyTorch; autograd supplies the reference gradients.
tpred = torch.from_numpy(pred).requires_grad_(True)
ttarget = torch.from_numpy(target).requires_grad_(True)
tl = F.mse_loss(tpred, ttarget, reduction='sum')
tloss = torch.sum(tl**2)
tloss.backward()
tdpred = tpred.grad
tdtarget = ttarget.grad

# Compare NumPy results against PyTorch with metrics.tdist (project helper;
# presumably a distance between its two arguments, so values near 0 mean agreement - TODO confirm).
print(metrics.tdist(l, tl.data.numpy()))
print(metrics.tdist(dpred, tdpred.data.numpy()))
print(metrics.tdist(dtarget, tdtarget.data.numpy()))

0.0
0.0
0.0


# Mean Absolute Error (MAE)

The predictions of the model are denoted $\hat{y}$, the true values are denoted $y$.  
This is a 1D version, without averaging. To handle averaged losses or multi-dimensional data, reshape the inputs and multiply the result by a constant factor.

# Forward pass

$$l = \sum_{i=1}^n |\hat{y}_i - y_i|, \space \hat{y}, y \in \mathbb{R}^n, l \in \mathbb{R}$$

# Backward pass

$$\frac{\partial E}{\partial \hat{y}} = \frac{\partial E}{\partial l} * \text{sign}(\hat{y} - y)$$
$$\frac{\partial E}{\partial y} = \frac{\partial E}{\partial l} * \text{sign}(y - \hat{y})$$

In [21]:
# Seed NumPy's global RNG so the random inputs drawn later in this cell are reproducible.
np.random.seed(12)

def mae(pred, target):
    """Return the sum of absolute differences between ``pred`` and ``target``.

    No averaging is applied: the result is the plain sum over every element.

    Args:
        pred: predicted values.
        target: true values, broadcastable against ``pred``.

    Returns:
        The scalar ``sum(|pred - target|)``.
    """
    residual = pred - target
    magnitude = np.absolute(residual)
    return np.sum(magnitude)

def mae_dpred(pred, target, dout):
    """Gradient of the summed absolute error with respect to ``pred``.

    Args:
        pred: predicted values.
        target: true values, broadcastable against ``pred``.
        dout: upstream gradient with respect to the loss value.

    Returns:
        ``dout * sign(pred - target)``; entries where ``pred == target``
        get a zero gradient because ``np.sign(0) == 0``.
    """
    direction = np.sign(pred - target)
    return dout * direction

def mae_dtarget(pred, target, dout):
    """Gradient of the summed absolute error with respect to ``target``.

    Args:
        pred: predicted values.
        target: true values, broadcastable against ``pred``.
        dout: upstream gradient with respect to the loss value.

    Returns:
        ``dout * sign(target - pred)``; entries where ``pred == target``
        get a zero gradient because ``np.sign(0) == 0``.
    """
    direction = np.sign(target - pred)
    return dout * direction

# Random float32 inputs of shape (26, 4, 8) for the NumPy-vs-PyTorch comparison.
pred = np.random.randn(26, 4, 8).astype(np.float32)
target = np.random.randn(26, 4, 8).astype(np.float32)
# NumPy forward pass.
l = mae(pred, target)
# Downstream objective E = l**2, so the upstream gradient dE/dl = 2*l is not trivially 1.
loss = np.sum(l**2)
dl = 2*l
# NumPy backward pass, fed with dE/dl.
dpred = mae_dpred(pred, target, dl)
dtarget = mae_dtarget(pred, target, dl)


# Same computation in PyTorch; autograd supplies the reference gradients.
tpred = torch.from_numpy(pred).requires_grad_(True)
ttarget = torch.from_numpy(target).requires_grad_(True)
tl = F.l1_loss(tpred, ttarget, reduction='sum')
tloss = torch.sum(tl**2)
tloss.backward()
tdpred = tpred.grad
tdtarget = ttarget.grad

# Compare NumPy results against PyTorch with metrics.tdist (project helper;
# presumably a distance between its two arguments, so values near 0 mean agreement - TODO confirm).
print(metrics.tdist(l, tl.data.numpy()))
print(metrics.tdist(dpred, tdpred.data.numpy()))
print(metrics.tdist(dtarget, tdtarget.data.numpy()))

0.0
0.0
0.0


# Binary Cross Entropy (BCE)

The predictions of the model are denoted $\hat{y}$, they are the probabilities of belonging to class $1$.  
The true values are denoted $y$; each $y_i$ is $1$ if sample $i$ belongs to class $1$, and $0$ otherwise.


# Forward pass

$$l = - \sum_{i=1}^N \left( y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right), \space \hat{y}, y \in \mathbb{R}^N, l \in \mathbb{R}$$

# Backward pass

$$\frac{\partial E}{\partial \hat{y}} = \frac{\partial E}{\partial l} * \frac{\hat{y} - y}{\hat{y}(1 - \hat{y})}$$

In [55]:
# Seed NumPy's global RNG so the random inputs drawn later in this cell are reproducible.
np.random.seed(12)

def bce(pred, target):
    """Return the summed binary cross-entropy between ``pred`` and ``target``.

    Computes ``-sum(target * log(pred) + (1 - target) * log(1 - pred))``.

    Each log term is clamped from below at -100, the same floor that
    ``torch.nn.functional.binary_cross_entropy`` applies. Without the clamp a
    saturated prediction (``pred`` exactly 0 or 1) produces ``log(0) = -inf``,
    and the product ``0 * -inf`` turns the whole sum into NaN even when the
    prediction is perfectly correct. Inputs whose logs are above -100 are
    unaffected.

    Args:
        pred: predicted probabilities of class 1, expected in ``[0, 1]``.
        target: true labels (1 for class 1, otherwise 0), broadcastable
            against ``pred``.

    Returns:
        The scalar summed loss (no averaging).
    """
    # log(0) is expected for saturated predictions: silence the divide
    # warning and let the clamp replace -inf by a finite floor.
    with np.errstate(divide='ignore'):
        log_pos = np.maximum(np.log(pred), -100)
        log_neg = np.maximum(np.log(1 - pred), -100)
    return - np.sum(target * log_pos + (1 - target) * log_neg)

def bce_dpred(pred, target, dout):
    """Gradient of the summed binary cross-entropy with respect to ``pred``.

    Args:
        pred: predicted probabilities of class 1.
        target: true labels, broadcastable against ``pred``.
        dout: upstream gradient with respect to the loss value.

    Returns:
        ``dout * (pred - target) / (pred * (1 - pred))``.
    """
    numerator = dout * (pred - target)
    denominator = pred * (1 - pred)
    return numerator / denominator

# 26 random probabilities drawn from [0, 1) and 26 random 0/1 labels, both as float32.
pred = np.random.rand(26).astype(np.float32)
target = np.random.randint(0, 2, size=26).astype(np.float32)
# NumPy forward pass.
l = bce(pred, target)
# Downstream objective E = l**2, so the upstream gradient dE/dl = 2*l is not trivially 1.
loss = np.sum(l**2)
dl = 2*l
# NumPy backward pass, fed with dE/dl.
dpred = bce_dpred(pred, target, dl)


# Same computation in PyTorch; only the predictions require a gradient.
tpred = torch.from_numpy(pred).requires_grad_(True)
ttarget = torch.from_numpy(target)
tl = F.binary_cross_entropy(tpred, ttarget, reduction='sum')
tloss = torch.sum(tl**2)
tloss.backward()
tdpred = tpred.grad

# Compare NumPy results against PyTorch with metrics.tdist (project helper;
# presumably a distance between its two arguments, so values near 0 mean agreement - TODO confirm).
print(metrics.tdist(l, tl.data.numpy()))
print(metrics.tdist(dpred, tdpred.data.numpy()))

3.8146973e-06
0.0039273673


# Cross Entropy (CE)

The predictions of the model are denoted $\hat{y}$, they are the probabilities of belonging to each of the K classes.  
The true values are denoted $y$, a one-hot representation with $1$ for the true class.


# Forward pass

$$l = - \sum_{i=1}^N \sum_{j=1}^K y_{ij} \log(\hat{y}_{ij}), \space \hat{y}, y \in \mathbb{R}^{N \times K}, l \in \mathbb{R}$$

# Backward pass

$$\frac{\partial E}{\partial \hat{y}} = - \frac{\partial E}{\partial l} * \frac{y}{\hat{y}}$$

In [74]:
# Seed NumPy's global RNG so the random inputs drawn later in this cell are reproducible.
np.random.seed(12)

def target2onehot(x, nclasses):
    """Convert a sequence of class indices into a one-hot float32 matrix.

    Args:
        x: sequence of integer class indices, one per sample.
        nclasses: number of classes (number of columns in the output).

    Returns:
        A ``(len(x), nclasses)`` float32 array in which row ``i`` has a 1 in
        column ``x[i]`` and 0 everywhere else.
    """
    nsamples = len(x)
    onehot = np.zeros((nsamples, nclasses), dtype=np.float32)
    rows = np.arange(nsamples)
    onehot[rows, x] = 1
    return onehot

def cross_entropy(pred, target):
    """Return the summed cross-entropy between ``pred`` and ``target``.

    Args:
        pred: predicted class probabilities.
        target: one-hot encoded true classes, broadcastable against ``pred``.

    Returns:
        The scalar ``-sum(target * log(pred))`` (no averaging).
    """
    log_prob = np.log(pred)
    weighted = target * log_prob
    return - np.sum(weighted)

def cross_entropy_dpred(pred, target, dout):
    """Gradient of the summed cross-entropy with respect to ``pred``.

    Args:
        pred: predicted class probabilities.
        target: one-hot encoded true classes, broadcastable against ``pred``.
        dout: upstream gradient with respect to the loss value.

    Returns:
        ``-dout * target / pred``.
    """
    scaled_target = - dout * target
    return scaled_target / pred

# Random positive scores for 26 samples and 4 classes, normalised so each row sums to 1.
pred = np.random.rand(26, 4).astype(np.float32)
pred = pred / np.sum(pred, axis=1, keepdims=True)
# Random class indices in [0, 4). The builtin `int` replaces the `np.int` alias,
# which was removed in NumPy 1.24; the alias was exactly `int`, so the dtype is unchanged.
target = np.random.randint(0, 4, size=26).astype(int)
target = target2onehot(target, 4)
# NumPy forward pass.
l = cross_entropy(pred, target)
# Downstream objective E = l**2, so the upstream gradient dE/dl = 2*l is not trivially 1.
loss = np.sum(l**2)
dl = 2*l
# NumPy backward pass, fed with dE/dl.
dpred = cross_entropy_dpred(pred, target, dl)


# Same computation in PyTorch, written out explicitly on probabilities and one-hot targets.
tpred = torch.from_numpy(pred).requires_grad_(True)
ttarget = torch.from_numpy(target)
tl = - torch.sum(ttarget * torch.log(tpred))
tloss = torch.sum(tl**2)
tloss.backward()
tdpred = tpred.grad

# Compare NumPy results against PyTorch with metrics.tdist (project helper;
# presumably a distance between its two arguments, so values near 0 mean agreement - TODO confirm).
print(metrics.tdist(l, tl.data.numpy()))
print(metrics.tdist(dpred, tdpred.data.numpy()))

0.0
0.0
