In [3]:
import sys
sys.path.append('../../pyutils')

import torch
from torch import nn
import torch.nn.functional as F
import numpy as np

import metrics

# Sigmoid

Outputs a number between $0$ and $1$. Can be used as output activation for a binary classification problem.

## Foward pass

Take a tensor as input, and applies the sigmoid elementwise

$$\sigma(x): \mathbb{R} \to \mathbb{R}$$

$$\sigma(x) = \frac{1}{1 + e^{-x}}$$

## Backward pass

$$(\sigma(x))' = \sigma(x)(1 - \sigma(x))$$

In [38]:
np.random.seed(12)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(x):
    sx = sigmoid(x)
    return sx * (1 - sx)

def sigmoid_dx(x, dout):
    return sigmoid_prime(x) * dout

x = np.random.randn(4, 6, 2).astype(np.float32)
y = sigmoid(x)
loss = np.sum(y**2)
dy = 2*y
dx = sigmoid_dx(x, dy)

tx = torch.from_numpy(x).requires_grad_(True)
ty = torch.sigmoid(tx)
tloss = torch.sum(ty**2)
tloss.backward()
tdx = tx.grad

print(metrics.tdist(y, ty.data.numpy()))
print(metrics.tdist(dx, tdx.data.numpy()))

6.143906e-08
8.9251614e-08


# Softmax

Ouputs a vector of probabilities (between 0 and 1, sum to 1). Can be used as output activation for a classification problem,

## Forward pass

$$Y = \text{softmax(X)}, \space X, Y \in \mathbb{R}^{N*K}$$

$$x, y \in \mathbb{R}^{p}$$
$$ Y_{ij} = \frac{\exp(x_{ij})}{\sum_{k=1}^K \exp(x_{ik})}$$

## Backward pass

$$\text{Let } S_{ci} = \text{softmax}(x)_{ci}$$

$$
\frac{\partial S_{ci}}{\partial x_{cj}} =
\begin{cases}
    S_{ci} (1 - S_{cj}) & \text{if } i = j\\
    -S_{ci} * S_{cj} & \text{otherwise}
\end{cases}
$$

$$\frac{\partial E}{X_{cj}} = \sum_{i=1}^K \frac{\partial E}{\partial S_{ci}} * \frac{\partial S_{ci}}{\partial X_{cj}} $$

In [86]:
np.random.seed(12)

def softmax(x):
    x = x - np.max(x, axis=1, keepdims=True)
    x_e = np.exp(x)
    return x_e / np.sum(x_e, axis=1, keepdims=True)

def softmax_dx(x, dout):
    s = softmax(x)
    N, p = x.shape
    
    prim = s.reshape(N, 1, p) * (np.eye(p).reshape(1, p, p)
                                 - s.reshape(N, p, 1))
    dx = np.sum(dout.reshape(N, p, 1) * prim, axis=1)
    return dx
    
    
    

x = np.random.randn(27, 13).astype(np.float32)
y = softmax(x)
loss = np.sum(y**2)
dy = 2*y
dx = softmax_dx(x, dy)

tx = torch.from_numpy(x).requires_grad_(True)
ty = torch.softmax(tx, 1)
tloss = torch.sum(ty**2)
tloss.backward()
tdx = tx.grad

print(metrics.tdist(y, ty.data.numpy()))
print(metrics.tdist(dx, tdx.data.numpy()))

1.5499829e-07
7.847853192507804e-08


# ReLU

Almost linear, very good hidden unit.  
Not diferentiable at $0$, in pratice not an issue.

## Foward pass

Take a tensor as input, and applies the ReLU elementwise

$$\text{ReLU}: \mathbb{R} \to \mathbb{R}$$

$$\text{ReLU}(x) = \max(0, x)$$

## Backward pass

$$(\text{ReLU}(x))' =  1(x > 0)$$

In [85]:
np.random.seed(12)

def relu(x):
    return np.maximum(0, x)

def relu_prime(x):
    return (x > 0).astype(x.dtype)

def relu_dx(x, dout):
    return relu_prime(x) * dout

x = np.random.randn(4, 6, 2).astype(np.float32)
y = relu(x)
loss = np.sum(y**2)
dy = 2*y
dx = relu_dx(x, dy)

tx = torch.from_numpy(x).requires_grad_(True)
ty = torch.relu(tx)
tloss = torch.sum(ty**2)
tloss.backward()
tdx = tx.grad

print(metrics.tdist(y, ty.data.numpy()))
print(metrics.tdist(dx, tdx.data.numpy()))

0.0
0.0


# Leaky ReLU

A modified version of ReLU to fix the non-differentiable part of ReLU

## Forward pass

$$ y = \text{leaky_relu}(x) = \max(0, x) + \alpha \min(0, x), \space x, y, \alpha \in \mathbb{R}$$

## Backward pass

$$
\text{leaky_relu}(x))' =
\begin{cases}
     1 & \text{if } x > 0\\
    \alpha & \text{otherwise}
\end{cases}
$$

In [94]:
np.random.seed(12)

def leaky_relu(x, alpha):
    return np.maximum(0, x) + alpha * np.minimum(0, x)

def leaky_relu_px(x, alpha):
    return (x > 0).astype(x.dtype) + alpha * (x <= 0).astype(x.dtype)

def leaky_relu_dx(x, alpha, dout):
    return leaky_relu_px(x, alpha) * dout

x = np.random.randn(4, 6, 2).astype(np.float32)
alpha=0.2
y = leaky_relu(x, alpha)
loss = np.sum(y**2)
dy = 2*y
dx = leaky_relu_dx(x, alpha, dy)

tx = torch.from_numpy(x).requires_grad_(True)
ty = F.leaky_relu(tx, alpha)
tloss = torch.sum(ty**2)
tloss.backward()
tdx = tx.grad

print(metrics.tdist(y, ty.data.numpy()))
print(metrics.tdist(dx, tdx.data.numpy()))

0.0
0.0


# ELU

A modified version of ReLU to fix the non-differentiable part of ReLU

## Forward pass

$$ y = \text{ELU}(x) = \max(0, x) + \min(0, \alpha(e^x-1)), \space x, y, \alpha \in \mathbb{R}$$

## Backward pass

$$
\text{ELU}(x))' =
\begin{cases}
     1 & \text{if } x > 0\\
    \alpha e^x & \text{otherwise}
\end{cases}
$$

In [98]:
np.random.seed(12)

def elu(x, alpha):
    return np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x)-1))

def elu_px(x, alpha):
    return ((x > 0).astype(x.dtype) 
            + alpha * np.exp(x) * (x <= 0).astype(x.dtype))

def elu_dx(x, alpha, dout):
    return elu_px(x, alpha) * dout

x = np.random.randn(4, 6, 2).astype(np.float32)
alpha=0.2
y = elu(x, alpha)
loss = np.sum(y**2)
dy = 2*y
dx = elu_dx(x, alpha, dy)

tx = torch.from_numpy(x).requires_grad_(True)
ty = F.elu(tx, alpha)
tloss = torch.sum(ty**2)
tloss.backward()
tdx = tx.grad

print(metrics.tdist(y, ty.data.numpy()))
print(metrics.tdist(dx, tdx.data.numpy()))

0.0
4.679838e-09


# Hyperbolic Tangent

Very similar to sigmoid, but closer to identity function (linear) near $0$.  
Output range is between $-1$ and $1$

## Forward pass

Take a tensor as input, and applies $\tanh$ elementwise.

$$\tanh(x) : \mathbb{R} \to \mathbb{R}$$

## Backward pass

$$(\tanh(x))' = 1 - \tanh(x)^2$$

In [102]:
np.random.seed(12)

def tanh(x):
    return np.tanh(x)

def tanh_prime(x):
    return 1 - np.tanh(x)**2

def tanh_dx(x, dout):
    return tanh_prime(x) * dout

x = np.random.randn(4, 6, 2).astype(np.float32)
y = tanh(x)
loss = np.sum(y**2)
dy = 2*y
dx = tanh_dx(x, dy)

tx = torch.from_numpy(x).requires_grad_(True)
ty = torch.tanh(tx)
tloss = torch.sum(ty**2)
tloss.backward()
tdx = tx.grad

print(metrics.tdist(y, ty.data.numpy()))
print(metrics.tdist(dx, tdx.data.numpy()))

1.5879527e-07
2.485637e-07


# Softplus

## Forward pass

Take a tensor as input, and applies softplus elementwise.

$$\text{softplus}(x) = \frac{1}{\beta} log(1 + e^{\beta x}), \space x, \beta \in \mathbb{R}$$

## Backward pass

$$(\text{softplus}(x))' = \sigma(\beta x)$$

In [107]:
np.random.seed(12)

def softplus(x, beta):
    return (1 / beta) * np.log(1 + np.exp(beta * x))

def softplus_prime(x, beta):
    return sigmoid(beta * x)

def softplus_dx(x, beta, dout):
    return softplus_prime(x, beta) * dout

x = np.random.randn(4, 6, 2).astype(np.float32)
beta = 0.24
y = softplus(x, beta)
loss = np.sum(y**2)
dy = 2*y
dx = softplus_dx(x, beta, dy)

tx = torch.from_numpy(x).requires_grad_(True)
ty = F.softplus(tx, beta)
tloss = torch.sum(ty**2)
tloss.backward()
tdx = tx.grad

print(metrics.tdist(y, ty.data.numpy()))
print(metrics.tdist(dx, tdx.data.numpy()))

1.5359043e-06
2.2238205e-06
