In [1]:
import sys
sys.path.append('../../pyutils')

import numpy as np
import scipy.linalg
from sklearn.datasets.mldata import fetch_mldata
import torch

import metrics
import revdiff as rd
import utils

# Seed every RNG this notebook draws from so a Restart & Run All is repeatable:
# NumPy generates the synthetic data, PyTorch initialises the network weights.
np.random.seed(12)
torch.manual_seed(12)

# Multi-Layer Perceptrons

Universal approximation theorem: a feedforward network with a linear output layer and at least one hidden layer with a non-linear activation function can approximate any Borel measurable function with any non-zero amount of error (given enough hidden units).

## Add bias

This is an operation that takes a tensor $X$ and a vector $y$ and adds $y$ along the last axis of $X$.  
Example: 2D:
$$Z = X + y, \space X, Z \in \mathbb{R}^{n*m}, y \in \mathbb{R}^m$$
$$Z_{ij} = X_{ij} + y_{j}$$
$$\frac{\partial E}{\partial X} = \frac{\partial E}{\partial Z}$$
$$\frac{\partial E}{\partial y_j} = \sum_{i=1}^n \frac{\partial E}{\partial Z_{ij}}$$

In [2]:
# Forward pass in NumPy: broadcast-add the vector y onto every row of X,
# flatten the result, and use its squared L2 norm as a scalar "error".
X = np.random.randn(43, 7)
y = np.random.randn(7)
z = (X + y).reshape(-1)
e = z @ z

# Same computation in PyTorch, with autograd tracking tX and ty so that
# backward() fills tX.grad and ty.grad with reference gradients.
tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=True)
tz = (tX + ty).view(-1)
te = torch.dot(tz, tz)
te.backward()

# Both scalars should agree. metrics.tdist is a project helper; presumably it
# returns a distance between its two arguments (TODO confirm in pyutils).
print(e)
print(te.data.numpy())
print(metrics.tdist(e, te.data.numpy()))

512.8099394943562
512.8099394943561
1.1368683772161603e-13


In [3]:
# Hand-derived gradients. e = sum(z_k^2), so dE/dz = 2z (reshaped like X).
dz = 2 * z.reshape(X.shape)
# The addition passes the upstream gradient straight through to X ...
dX = dz
# ... and sums it over the rows for the broadcast vector y.
dy = np.sum(dz, axis=0)

# Reference gradients from PyTorch autograd (filled in by te.backward()).
dX_sol = tX.grad.data.numpy()
dy_sol = ty.grad.data.numpy()
# metrics.tdist is a project helper, presumably a distance between its two
# arguments (TODO confirm); the recorded output is 0.0 for both comparisons.
print(metrics.tdist(dX, dX_sol))
print(dy)
print(dy_sol)
print(metrics.tdist(dy, dy_sol))

0.0
[153.25935123  -3.83113279  38.98813845  46.41386373  23.31963682
  29.90573326  79.79186959]
[153.25935123  -3.83113279  38.98813845  46.41386373  23.31963682
  29.90573326  79.79186959]
0.0


## Example: MNIST Dataset

In [4]:
# Network dimensions: inputs are reshaped to IN_SIZE features per sample
# (28 * 28), followed by two hidden layers and OUT_SIZE output logits.
IN_SIZE = 28 * 28
HIDDEN1_SIZE = 500
HIDDEN2_SIZE = 256
OUT_SIZE = 10

# Training hyper-parameters: number of passes over the training loader,
# SGD step size, and the batch size handed to the loader helpers.
NEPOCHS = 5
LR = 0.001
BATCH_SIZE = 64

# Project helpers from pyutils; each returns a (train, test) pair that the
# training cells iterate as (X, y) batches.
# NOTE(review): load_mnist_01 presumably keeps only digits 0 and 1 - confirm.
train_loader, test_loader = utils.load_mnist(BATCH_SIZE)
train_loader_01, test_loader_01 = utils.load_mnist_01(BATCH_SIZE)

def compute_accuracy(y_preds, y):
    """Count how many predictions match their labels.

    Args:
        y_preds: predicted labels.
        y: true labels, compared element-wise against ``y_preds``.

    Returns:
        A ``(correct, total)`` pair: the number of matching entries and
        ``len(y_preds)``. The ratio is left to the caller.
    """
    n_total = len(y_preds)
    hits = np.equal(y_preds, y)
    n_correct = hits.sum()
    return n_correct, n_total

## MNIST K classes with softmax + cross entropy

In [5]:
class Net(torch.nn.Module):
    """Fully connected classifier: IN_SIZE -> HIDDEN1_SIZE -> HIDDEN2_SIZE -> OUT_SIZE.

    The forward pass returns raw logits; no softmax is applied here.
    """

    def __init__(self):
        super().__init__()
        # Three affine layers, created in l1, l2, l3 order.
        self.l1 = torch.nn.Linear(IN_SIZE, HIDDEN1_SIZE)
        self.l2 = torch.nn.Linear(HIDDEN1_SIZE, HIDDEN2_SIZE)
        self.l3 = torch.nn.Linear(HIDDEN2_SIZE, OUT_SIZE)

    def forward(self, x):
        """Reshape ``x`` to rows of IN_SIZE features and return OUT_SIZE logits per row."""
        flat = x.view(-1, IN_SIZE)
        hidden1 = torch.relu(self.l1(flat))
        hidden2 = torch.relu(self.l2(hidden1))
        return self.l3(hidden2)


net = Net()
# reduction='sum' adds the per-sample losses instead of averaging them, so the
# gradient magnitude scales with the number of samples in the batch.
criterion = torch.nn.CrossEntropyLoss(reduction='sum')

for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        y_logits = net(X)
        loss = criterion(y_logits, y)

        # Gradients accumulate across backward() calls, so clear them first.
        net.zero_grad()
        loss.backward()

        # Plain SGD step. Running it under no_grad() keeps the in-place update
        # out of the autograd graph without the legacy `.data` attribute.
        with torch.no_grad():
            for w in net.parameters():
                w.sub_(w.grad * LR)

        if batch_idx % 300 == 0:
            # .item() yields a Python float, so the log line shows a bare
            # number rather than a tensor repr.
            print('{}[{}] Loss = {}'.format(epoch + 1,
                                            batch_idx+1, loss.item()))

    # Evaluate on the test loader after every epoch. get_class_output is a
    # project helper; it is unpacked here as (predicted labels, true labels).
    test_y_preds, test_y_true = utils.get_class_output(net, test_loader)
    correct, total = compute_accuracy(test_y_preds, test_y_true)
    acc = float(correct) / total
    print('Epoch {}: test accuracy = {} ({}/{})'.format(epoch + 1, acc,
                                                   correct, total))

1[1] Loss = 147.83250427246094
1[301] Loss = 24.46794891357422
1[601] Loss = 17.228824615478516
1[901] Loss = 15.436829566955566
Epoch 1: test accuracy = 0.9192 (9192/10000)
2[1] Loss = 18.49155616760254
2[301] Loss = 24.79840850830078
2[601] Loss = 8.40994930267334
2[901] Loss = 14.039809226989746
Epoch 2: test accuracy = 0.9441 (9441/10000)
3[1] Loss = 13.433870315551758
3[301] Loss = 27.380294799804688
3[601] Loss = 6.725010871887207
3[901] Loss = 4.588817596435547
Epoch 3: test accuracy = 0.9519 (9519/10000)
4[1] Loss = 8.174762725830078
4[301] Loss = 12.456804275512695
4[601] Loss = 9.772133827209473
4[901] Loss = 2.8409600257873535
Epoch 4: test accuracy = 0.9592 (9592/10000)
5[1] Loss = 12.741117477416992
5[301] Loss = 7.070180416107178
5[601] Loss = 6.1213297843933105
5[901] Loss = 6.556562423706055
Epoch 5: test accuracy = 0.965 (9650/10000)


## MNIST bin classification digits 0/1 (sigmoid+BCE)

In [6]:
# Hyper-parameters for the binary 0/1 experiment. They repeat the values set
# in the earlier configuration cell; OUT_SIZE is not needed here because the
# network below has a single output unit.
IN_SIZE = 28 * 28
HIDDEN1_SIZE = 500
HIDDEN2_SIZE = 256

NEPOCHS = 5
LR = 0.001
# NOTE: train_loader_01 / test_loader_01 were already built in the earlier
# configuration cell, so changing BATCH_SIZE here does not affect them.
BATCH_SIZE = 64


class Net(torch.nn.Module):
    """Fully connected binary classifier: IN_SIZE -> HIDDEN1_SIZE -> HIDDEN2_SIZE -> 1.

    The forward pass returns one raw logit per sample; no sigmoid is applied
    here.
    """

    def __init__(self):
        super().__init__()
        # Three affine layers, created in l1, l2, l3 order.
        self.l1 = torch.nn.Linear(IN_SIZE, HIDDEN1_SIZE)
        self.l2 = torch.nn.Linear(HIDDEN1_SIZE, HIDDEN2_SIZE)
        self.l3 = torch.nn.Linear(HIDDEN2_SIZE, 1)

    def forward(self, x):
        """Reshape ``x`` to rows of IN_SIZE features and return a 1-D tensor of logits."""
        flat = x.view(-1, IN_SIZE)
        hidden1 = torch.relu(self.l1(flat))
        hidden2 = torch.relu(self.l2(hidden1))
        # Collapse the trailing size-1 output dimension into a flat vector.
        return self.l3(hidden2).view(-1)


net = Net()
# BCEWithLogitsLoss takes raw logits and applies the sigmoid internally;
# reduction='sum' adds the per-sample losses instead of averaging them.
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader_01):
        y_logits = net(X)
        # The loss needs floating-point targets, so cast the labels.
        loss = criterion(y_logits, y.type(torch.float32))

        # Gradients accumulate across backward() calls, so clear them first.
        net.zero_grad()
        loss.backward()

        # Plain SGD step. Running it under no_grad() keeps the in-place update
        # out of the autograd graph without the legacy `.data` attribute.
        with torch.no_grad():
            for w in net.parameters():
                w.sub_(w.grad * LR)

        if batch_idx % 300 == 0:
            # .item() yields a Python float, so the log line shows a bare
            # number rather than a tensor repr.
            print('{}[{}] Loss = {}'.format(epoch + 1,
                                            batch_idx+1, loss.item()))

    # Evaluate on the 0/1 test loader after every epoch. get_class_output is a
    # project helper; it is unpacked here as (predicted labels, true labels).
    # NOTE(review): `act` is presumably applied to the logits before they are
    # turned into class labels - confirm in utils.
    test_y_preds, test_y_true = utils.get_class_output(net, test_loader_01,
                                            act=torch.sigmoid)
    correct, total = compute_accuracy(test_y_preds, test_y_true)
    acc = float(correct) / total
    print('Epoch {}: test accuracy = {} ({}/{})'.format(epoch + 1, acc,
                                                    correct, total))

1[1] Loss = 45.23631286621094
Epoch 1: test accuracy = 0.9990543735224586 (2113/2115)
2[1] Loss = 0.13379809260368347
Epoch 2: test accuracy = 0.9990543735224586 (2113/2115)
3[1] Loss = 0.0598868690431118
Epoch 3: test accuracy = 0.9990543735224586 (2113/2115)
4[1] Loss = 0.3059484362602234
Epoch 4: test accuracy = 0.9990543735224586 (2113/2115)
5[1] Loss = 0.026982322335243225
Epoch 5: test accuracy = 0.9990543735224586 (2113/2115)


## Activation Functions

### Sigmoid

Can be used as the output unit for binary classification problems with BCE.

$$x \in \mathbb{R},  \space \sigma(x) \in \mathbb{R}$$
$$\sigma(x) = \frac{1}{1 + e^{-x}}$$
$$(\sigma(x))' = \sigma(x)(1 - \sigma(x)), \space x \in \mathbb{R}$$

In [7]:
def sigmoid(x):
    """Logistic sigmoid ``1 / (1 + exp(-x))``, evaluated without overflow.

    Args:
        x: scalar or array-like of real values.

    Returns:
        The element-wise sigmoid, in [0, 1]: an array for array input, a
        NumPy scalar for scalar input.
    """
    x = np.asarray(x)
    # exp(-|x|) lies in (0, 1], so it can never overflow, whereas exp(-x)
    # overflows for large negative x.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x).  x < 0: e^x / (1 + e^x), the same value rewritten
    # so that only the non-overflowing exponential is needed.
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    # Indexing with () unwraps a 0-d result so scalar input yields a scalar.
    return out[()]

def sigmoid_prime(x):
    """Derivative of the logistic sigmoid: ``sigmoid(x) * (1 - sigmoid(x))``."""
    s = sigmoid(x)
    return s * (1 - s)

### Softmax

Can be used as the output unit for multinomial classification problems with Cross Entropy.

$$x, y \in \mathbb{R}^{p}$$
$$ y_j = \text{softmax}(x)_j = \frac{e^{x_j}}{\sum_{k=1}^p e^{x_k}}$$

The function can be extended to matrices, by applying softmax on each row.

$$x, y \in \mathbb{R}^{n*p}$$
$$ y_{ij} = \text{softmax}(x)_{ij} = \frac{e^{x_{ij}}}{\sum_{k=1}^p e^{x_{ik}}}$$

$$\text{Let } S_i = \text{softmax}(x)_i$$
$$\frac{\partial S_i}{\partial x_j} = S_i (1 - S_j) \space (i = j)$$
$$\frac{\partial S_i}{\partial x_j} = -S_i * S_j \space (i \neq j)$$

In [8]:
def softmax(x):
    """Numerically stable softmax.

    Args:
        x: a 1-D vector of scores, or an array with at least two dimensions,
            in which case the softmax is taken along axis 1 (each row of a
            matrix).

    Returns:
        Array of the same shape as ``x`` whose entries along the normalised
        axis are non-negative and sum to 1.
    """
    x = np.asarray(x)
    # A plain vector is normalised over its only axis; everything else keeps
    # the row-wise (axis=1) convention.
    axis = 0 if x.ndim == 1 else 1
    # Softmax is invariant to subtracting a constant along the axis. Removing
    # the maximum makes the largest exponent 0, so exp() cannot overflow.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    x_e = np.exp(shifted)
    return x_e / np.sum(x_e, axis=axis, keepdims=True)

### RELU (Rectified Linear Unit)

Almost linear, very good hidden unit.  
Not differentiable at $0$, but in practice this is not an issue.

$$x, y \in \mathbb{R}$$
$$ y = \text{relu}(x) = max(0, x)$$
$$(\text{relu}(x))' = 1 \space (x > 0)$$
$$(\text{relu}(x))' = 0 \space (x \leq 0)$$

In [9]:
def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    floor = 0
    return np.maximum(floor, x)

def relu_prime(x):
    """Derivative of ReLU: 1 where ``x > 0``, 0 elsewhere (including at 0).

    Args:
        x: scalar or array-like. Plain Python numbers and lists are accepted,
            like in ``relu``.

    Returns:
        The element-wise derivative, with the same dtype as ``x``.
    """
    # Python scalars and lists have no .dtype, so convert to an array first.
    x = np.asarray(x)
    return (x > 0).astype(x.dtype)

### Leaky RELU

$$x, y, \alpha \in \mathbb{R}$$
$$ y = \text{leaky_relu}(x) = max(0, x) + \alpha min(0, x)$$
$$(\text{leaky_relu}(x))' = 1 \space (x > 0)$$
$$(\text{leaky_relu}(x))' = \alpha \space (x \leq 0)$$

In [10]:
def leaky_relu(x, alpha):
    """Leaky ReLU: ``x`` for positive inputs, ``alpha * x`` for the rest."""
    positive_part = np.maximum(0, x)
    negative_part = np.minimum(0, x)
    return positive_part + alpha * negative_part

def leaky_relu_prime(x, alpha):
    """Derivative of leaky ReLU: 1 where ``x > 0``, ``alpha`` where ``x <= 0``.

    Args:
        x: scalar or array-like. Plain Python numbers and lists are accepted,
            like in ``leaky_relu``.
        alpha: slope used for non-positive inputs.

    Returns:
        The element-wise derivative.
    """
    # Python scalars and lists have no .dtype, so convert to an array first.
    x = np.asarray(x)
    return (x > 0).astype(x.dtype) + alpha * (x <= 0).astype(x.dtype)

### ELU (Exponential Linear Unit)

$$x, y, \alpha \in \mathbb{R}$$
$$ y = \text{elu}(x) = max(0, x) + min(0, \alpha(e^x-1))$$
$$(\text{elu}(x))' = 1 \space (x > 0)$$
$$(\text{elu}(x))' = \alpha e^x \space (x \leq 0)$$

In [11]:
def elu(x, alpha):
    """Exponential linear unit.

    Returns ``x`` where ``x > 0`` and ``alpha * (exp(x) - 1)`` where
    ``x <= 0``.

    Args:
        x: scalar or array-like of real values.
        alpha: scale of the negative branch.

    Returns:
        The element-wise ELU: an array for array input, a NumPy scalar for
        scalar input.
    """
    x = np.asarray(x)
    # Only non-positive values reach exp, so large positive inputs cannot
    # overflow. expm1 also keeps precision for x close to 0, where
    # exp(x) - 1 would cancel to 0.
    negative_branch = alpha * np.expm1(np.minimum(x, 0))
    out = np.where(x > 0, x, negative_branch)
    # Indexing with () unwraps a 0-d result so scalar input yields a scalar.
    return out[()]

def elu_prime(x, alpha):
    """Derivative of ELU: 1 where ``x > 0``, ``alpha * exp(x)`` where ``x <= 0``.

    Args:
        x: scalar or array-like of real values.
        alpha: scale of the negative branch.

    Returns:
        The element-wise derivative: an array for array input, a NumPy scalar
        for scalar input.
    """
    x = np.asarray(x)
    # Select the branch rather than multiplying by a 0/1 mask: masking
    # computes exp(x) for large positive x too, giving inf * 0 = nan.
    negative_branch = alpha * np.exp(np.minimum(x, 0))
    out = np.where(x > 0, 1.0, negative_branch)
    # Indexing with () unwraps a 0-d result so scalar input yields a scalar.
    return out[()]

### Parametric RELU

Parametric RELU is the same as Leaky RELU, except that $\alpha$ is a learnable parameter.

### Hyperbolic Tangent 

Very similar to sigmoid, but closer to identity function (linear) near $0$.  
Output range is between $-1$ and $1$

$$x, y \in \mathbb{R}$$
$$ y = tanh(x)$$
$$tanh(x) = 2\sigma(2x)-1$$
$$(tanh(x))' = 1 - tanh(x)^2$$

In [12]:
def tanh_prime(x):
    """Derivative of the hyperbolic tangent: ``1 - tanh(x)^2``."""
    t = np.tanh(x)
    return 1 - t**2

### Softplus

$$x, y, \beta \in \mathbb{R}$$
$$ y = \text{softplus}(x) = \frac{1}{\beta} log(1 + e^{\beta x})$$
$$(\text{softplus}(x))' = \sigma(\beta x)$$

In [13]:
def softplus(x, beta = 1):
    """Softplus ``(1 / beta) * log(1 + exp(beta * x))``, evaluated stably.

    Args:
        x: scalar or array of real values.
        beta: sharpness of the curve (default 1).

    Returns:
        The element-wise softplus of ``x``.
    """
    # logaddexp(0, t) = log(exp(0) + exp(t)) = log(1 + exp(t)), computed
    # without forming exp(t); the direct formula overflows to inf for large t.
    return np.logaddexp(0, beta * x) / beta

def softplus_prime(x, beta=1):
    """Derivative of softplus with respect to ``x``: ``sigmoid(beta * x)``."""
    scaled = beta * x
    return sigmoid(scaled)

### Linear (No Activation)

Can be used as output unit for regression problem with MSE.  
No gradient problems.

## Cost Functions 

Each cost function takes the predictions and true values for a whole batch and returns a single number, the error, that must be minimized.

### MSE (Mean Squared Error)

The predictions of the model are denoted $\hat{y} \in \mathbb{R}^n$.  
The true values are denoted $y \in \mathbb{R}^n$.

$$MSE(W) = \frac{1}{n} \sum_{i=1}^n(y_i - \hat{y}_i)^2$$

Multiple target versions: $y, \hat{y} \in \mathbb{R}^{n*m}$
$$MSE(W) = \frac{1}{n*m} \sum_{i=1}^n||y_i - \hat{y}_i||^2$$

$$\frac{\partial MSE(W)}{\partial \hat{y}} = \frac{2}{n}(\hat{y} - y)$$
$$\frac{\partial MSE(W)}{\partial y} = \frac{2}{n}(y - \hat{y})$$

For any dimensions, MSE can be applied by just reshaping $y$ and $\hat{y}$ into 1D tensors.

In [14]:
def mse(y_pred, y_true):
    """Mean squared error over every element of the targets.

    Args:
        y_pred: array of predictions.
        y_true: array of true values, same shape as ``y_pred``.

    Returns:
        The sum of squared differences divided by the total number of target
        elements: ``n`` for 1-D targets, ``n * m`` for an ``(n, m)`` matrix.
    """
    # Count every element, not just the rows: len() would divide a
    # multi-target batch by n instead of n * m.
    n = np.size(y_true)
    return (1 / n) * np.sum((y_true - y_pred)**2)

def mse_dy_pred(dout, y_pred, y_true):
    """Gradient of the mean squared error with respect to the predictions.

    Args:
        dout: upstream gradient flowing into the MSE output.
        y_pred: array of predictions.
        y_true: array of true values, same shape as ``y_pred``.

    Returns:
        ``dout * (2 / n) * (y_pred - y_true)``, where ``n`` is the total
        number of target elements.
    """
    # Normalise by the element count so the gradient matches a mean taken
    # over every entry, including multi-target (n, m) batches.
    n = np.size(y_true)
    return dout * (2 / n) * (y_pred - y_true)

### BCE (Binary Cross Entropy)

BCE is used to solve binary classification problems.
Usually it's used with an activation function for the output layer that scales values between $0$ and $1$, like sigmoid.  
The predictions $\hat{y} \in \mathbb{R}^n$ are probabilities to belong to the class.  
The true values $y \in \mathbb{R}^n$ are either 0 or 1 (1 if the sample belongs to the class). 
$$J(W) = - \sum_{i=1}^n (y_i log(\hat{y_i}) + (1 - y_i) log(1 - \hat{y_i}))$$

$$\frac{\partial J(W)}{\partial \hat{y}} = \frac{\hat{y} - y}{\hat{y}(1 - \hat{y})}$$
Let $\hat y = \sigma(o)$, $o \in \mathbb{R}^n$ (final activation is sigmoid).
$$\frac{\partial J(W)}{\partial o} = \hat{y} - y$$

In [15]:
def bce(y_pred, y_true, eps=1e-12):
    """Binary cross entropy summed over the batch.

    Args:
        y_pred: array of predicted probabilities in [0, 1].
        y_true: array of 0/1 labels, same shape as ``y_pred``.
        eps: probabilities are clipped to ``[eps, 1 - eps]`` before taking
            logarithms.

    Returns:
        ``-sum(y_true * log(p) + (1 - y_true) * log(1 - p))`` with ``p`` the
        clipped predictions.
    """
    # A saturated prediction (exactly 0 or 1) would give log(0) = -inf and
    # then 0 * -inf = nan, even when the prediction is correct. Clipping
    # keeps every term finite.
    p = np.clip(y_pred, eps, 1 - eps)
    return - np.sum(y_true * np.log(p) + (1 - y_true) * np.log(1 - p))
    
def bce_dy_out(dout, y_out, y_true):
    """Gradient of sigmoid + BCE with respect to the pre-activation ``y_out``.

    Applies the sigmoid to ``y_out`` and returns
    ``dout * (sigmoid(y_out) - y_true)``.
    """
    probs = sigmoid(y_out)
    residual = probs - y_true
    return dout * residual

### Cross Entropy

Cross Entropy is used to solve multi-class classification problems.
Usually it's used with an activation function for the output layer that gives probabilities summing to $1$ in each row, like softmax.  
The predictions $\hat{y} \in \mathbb{R}^{n*k}$ are probabilities to belong to each of the k classes.  
The true values $y\in \mathbb{R}^{n*k}$ are one-hot vectors, with a $1$ entry in the true class.

$$J(W) = - \sum_{i=1}^n \sum_{j=1}^k [y_{ij} log(\hat{y_{ij}})]$$

$$\frac{\partial J(W)}{\partial \hat{y}} =  - \frac{y}{\hat{y}}$$
Let $\hat y = $softmax$(o)$, $o \in \mathbb{R}^{n*k}$ (final activation is softmax).
$$\frac{\partial J(W)}{\partial o} = \hat{y} - y$$

In [16]:
def cross_entropy(y_pred, y_true, eps=1e-12):
    """Cross entropy summed over the batch.

    Args:
        y_pred: array of predicted class probabilities.
        y_true: array of one-hot targets, same shape as ``y_pred``.
        eps: lower bound applied to the probabilities before the logarithm.

    Returns:
        ``-sum(y_true * log(p))`` with ``p = max(y_pred, eps)``.
    """
    # A probability of exactly 0 gives log(0) = -inf, and 0 * -inf = nan even
    # for classes whose target is 0. Bounding p from below keeps the sum
    # finite.
    p = np.maximum(y_pred, eps)
    return - np.sum(y_true * np.log(p))

def cross_entropy_dy_out(dout, y_out, y_true):
    """Gradient of softmax + cross entropy with respect to the logits ``y_out``.

    Applies the softmax to ``y_out`` and returns
    ``dout * (softmax(y_out) - y_true)``.
    """
    probs = softmax(y_out)
    residual = probs - y_true
    return dout * residual

### MAE (Mean Absolute Error) or L1 Loss

The predictions of the model are denoted $\hat{y} \in \mathbb{R}^n$.  
The true values are denoted $y \in \mathbb{R}^n$.

$$J(W) = \frac{1}{n} \sum_{i=1}^n|y_i - \hat{y}_i|$$
$$\frac{\partial J(W)}{\partial \hat{y}} = \frac{1}{n} sign(\hat{y} - y)$$
$$\frac{\partial J(W)}{\partial y} = \frac{1}{n} sign(y - \hat{y})$$

In [17]:
def mae(y_pred, y_true):
    """Mean absolute error: sum of ``|y_true - y_pred|`` divided by ``len(y_true)``."""
    abs_err = np.abs(y_true - y_pred)
    return (1 / len(y_true)) * np.sum(abs_err)

def mae_dy_pred(dout, y_pred, y_true):
    """Gradient of the mean absolute error with respect to the predictions.

    Returns ``dout * (1 / n) * sign(y_pred - y_true)`` with
    ``n = len(y_true)``.
    """
    scale = dout * (1 / len(y_true))
    direction = np.sign(y_pred - y_true)
    return scale * direction