In [1]:
import sys
sys.path.append('..')

import numpy as np
import torch

import metrics
import revdiff as rd
import utils

# Reverse-mode Autodifferentiation

## Chain Rule

$$\frac{\partial E}{\partial y} = \sum_{x_i \in preds(E)} \frac{\partial E}{\partial x_i} * \frac{\partial x_i}{\partial y}$$
$$\frac{\partial E}{\partial x} = \sum_{y_i \in succs(x)} \frac{\partial y_i}{\partial x} * \frac{\partial E}{\partial y_i}$$

How to compute $\frac{\partial E}{\partial x}$ ?

1) Base case: if $x == E$, just return $1$ (since $\frac{\partial E}{\partial E} = 1$)

2) Let $dx$ = zeros($x$.shape) (the accumulator for the sum over successors)

3) For each successor $y_i$:  
---- 1) Compute recursively $\frac{\partial E}{\partial y_i}$.  
---- 2) Call a node specific function that computes grad_yi = $\frac{\partial y_i}{\partial x} * \frac{\partial E}{\partial y_i}$ optimally from $x$ and $\frac{\partial E}{\partial y_i}$.  
---- 3) $dx +=$ grad_yi  

4) Return $dx$

Shortcut: if $x$ is not a predecessor (recursive) of $E$, the gradient is $0$.  
The same idea can be applied to each successor $y_i$ of $x$.

In [2]:
# Leaf values of a small expression graph.
a, b, c = (rd.build_val(value) for value in (2, 7, 3.5))

# Composite nodes, built through revdiff's overloaded arithmetic operators.
x = a * b
y = x / c - 2 * -a

# Each line prints the distance between the hand-computed value and the
# value obtained by evaluating the graph.
print(metrics.tdist(14, x.eval()))
print(metrics.tdist(14 / 3.5 + 4, y.eval()))

0.0
0.0


# Dense Network

In [3]:
# Layer widths shared by the dense networks defined in the following cells.
# IN_SIZE is the length of one flattened input: the models reshape their
# input to (-1, IN_SIZE). OUT_SIZE is the width of the multi-class output.
IN_SIZE = 28 * 28
HIDDEN1_SIZE = 500
HIDDEN2_SIZE = 256
OUT_SIZE = 10

# Hyperparameters. BATCH_SIZE is passed to the loaders below; NEPOCHS and LR
# are not referenced by any cell visible in this part of the notebook.
NEPOCHS = 5
LR = 0.001
BATCH_SIZE = 64

# Batched train/test loaders from the project-local `utils` module.
# NOTE(review): the "_01" variant presumably keeps only digits 0 and 1 (it
# feeds the single-logit BCE check below) -- confirm in utils.load_mnist_01.
train_loader, test_loader = utils.load_mnist(BATCH_SIZE)
train_loader_01, test_loader_01 = utils.load_mnist_01(BATCH_SIZE)

def compute_accuracy(y_preds, y):
    """Count how many predictions match the reference labels.

    Args:
        y_preds: predicted labels (any sized, array-like sequence).
        y: reference labels, compared element-wise against ``y_preds``.

    Returns:
        A ``(correct, total)`` pair: the number of positions where both
        inputs are equal, and ``len(y_preds)``. The ratio itself is left to
        the caller, which allows accumulating counts over several batches.
    """
    total = len(y_preds)
    matches = np.equal(y_preds, y)
    return matches.sum(), total

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing...
Done!


In [4]:
class TNet(torch.nn.Module):
    """PyTorch reference model: two ReLU hidden layers and one output logit."""

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as-is: parameters()
        # yields l1, l2, l3 in this order and the comparison cell pairs them
        # by index with the revdiff network's parameters.
        self.l1 = torch.nn.Linear(IN_SIZE, HIDDEN1_SIZE)
        self.l2 = torch.nn.Linear(HIDDEN1_SIZE, HIDDEN2_SIZE)
        self.l3 = torch.nn.Linear(HIDDEN2_SIZE, 1)

    def forward(self, x):
        """Return one logit per sample as a 1-D tensor."""
        flat = x.view(-1, IN_SIZE)
        hidden1 = torch.relu(self.l1(flat))
        hidden2 = torch.relu(self.l2(hidden1))
        # The last layer has a single unit; drop that axis.
        return self.l3(hidden2).view(-1)


# Reference network and its summed binary cross-entropy-with-logits loss.
tnet = TNet()
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

In [5]:
class DNet(rd.Network):
    """revdiff counterpart of TNet: same layer sizes, one output logit."""

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as in TNet.
        self.l1 = self.dense_layer(IN_SIZE, HIDDEN1_SIZE)
        self.l2 = self.dense_layer(HIDDEN1_SIZE, HIDDEN2_SIZE)
        self.l3 = self.dense_layer(HIDDEN2_SIZE, 1)

    def forward(self, x):
        """Build the graph from an input node to a flattened node of logits."""
        flat = rd.build_reshape(x, (-1, IN_SIZE))
        hidden1 = rd.build_vrelu(self.l1(flat))
        hidden2 = rd.build_vrelu(self.l2(hidden1))
        # Reshape the output of the single-unit layer to one dimension.
        return rd.build_reshape(self.l3(hidden2), (-1,))


dnet = DNet()

In [6]:
# Use the first batch of the 0/1 loader as a fixed sample. next(iter(...))
# replaces the for/break idiom, fails loudly on an empty loader instead of
# leaving None behind, and no longer rebinds the notebook-level `x` and `y`.
X_sample, y_sample = (
    tensor.detach().numpy() for tensor in next(iter(train_loader_01))
)

# Copy the PyTorch parameters into the revdiff network so that both models
# compute the same function. torch.nn.Linear stores weights as
# (out_features, in_features), hence the transpose; on 1-D biases .T is a
# no-op. Parameters are paired by position in both networks.
tparams = list(tnet.parameters())
for i, tparam in enumerate(tparams):
    dnet.params_[i].update(tparam.detach().numpy().T)

# PyTorch side: forward pass, summed BCE-with-logits loss, backward pass.
tX = torch.tensor(X_sample)
ty = torch.tensor(y_sample, dtype=torch.float32)
ty_logits = tnet(tX)
tloss = criterion(ty_logits, ty)
tnet.zero_grad()
tloss.backward()

# revdiff side: the same computation expressed as a graph.
dX = rd.build_val(X_sample)
dy = rd.build_val(y_sample)
dy_logits = dnet(dX)
dloss = rd.build_bce_loss(dy_logits, dy)

# Compare logits and loss values; the printed distances should be close to 0.
ty_logits_np = ty_logits.detach().numpy()
tloss_np = tloss.detach().numpy()
print(ty_logits_np[:10].T)
print(dy_logits.eval()[:10].T)
print(metrics.tdist(ty_logits_np, dy_logits.eval()))
print(tloss_np)
print(dloss.eval())
print(metrics.tdist(tloss_np, dloss.eval()))

# Compare the gradient of the loss w.r.t. every parameter. PyTorch gradients
# are transposed the same way as the weights were when they were copied.
print('Checking gradients')
for i, tparam in enumerate(tparams):
    grad = rd.build_node_grad(dloss, dnet.params_[i]).eval()
    grad_sol = tparam.grad.detach().numpy().T
    print(metrics.tdist(grad, grad_sol))

[-0.06461529 -0.0060272  -0.04025982 -0.01614732 -0.04020036 -0.06330499
 -0.06153796 -0.05931276 -0.057026    0.00039813]
[-0.06461532 -0.00602722 -0.04025982 -0.01614732 -0.04020034 -0.06330499
 -0.06153798 -0.05931275 -0.05702603  0.00039811]
1.7751653e-07
45.021507
45.021507
0.0
Checking gradients
3.1160503e-06
2.2359293e-07
5.871506e-06
3.5282864e-07
4.311466e-06
4.7683716e-07


In [7]:
class TNet(torch.nn.Module):
    """PyTorch reference model: two ReLU hidden layers, OUT_SIZE class logits."""

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as-is: parameters()
        # yields l1, l2, l3 in this order and the comparison cell pairs them
        # by index with the revdiff network's parameters.
        self.l1 = torch.nn.Linear(IN_SIZE, HIDDEN1_SIZE)
        self.l2 = torch.nn.Linear(HIDDEN1_SIZE, HIDDEN2_SIZE)
        self.l3 = torch.nn.Linear(HIDDEN2_SIZE, OUT_SIZE)

    def forward(self, x):
        """Return the raw class logits, one row of OUT_SIZE values per sample."""
        flat = x.view(-1, IN_SIZE)
        hidden1 = torch.relu(self.l1(flat))
        hidden2 = torch.relu(self.l2(hidden1))
        return self.l3(hidden2)


# Reference network and its summed cross-entropy loss (applied to raw logits).
tnet = TNet()
criterion = torch.nn.CrossEntropyLoss(reduction='sum')

In [8]:
class DNet(rd.Network):
    """revdiff counterpart of TNet: same layer sizes, OUT_SIZE class logits."""

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as in TNet.
        self.l1 = self.dense_layer(IN_SIZE, HIDDEN1_SIZE)
        self.l2 = self.dense_layer(HIDDEN1_SIZE, HIDDEN2_SIZE)
        self.l3 = self.dense_layer(HIDDEN2_SIZE, OUT_SIZE)

    def forward(self, x):
        """Build the graph from an input node to the node of class logits."""
        flat = rd.build_reshape(x, (-1, IN_SIZE))
        hidden1 = rd.build_vrelu(self.l1(flat))
        hidden2 = rd.build_vrelu(self.l2(hidden1))
        return self.l3(hidden2)


dnet = DNet()

In [9]:
# Use the first batch of the full loader as a fixed sample. next(iter(...))
# replaces the for/break idiom, fails loudly on an empty loader instead of
# leaving None behind, and no longer rebinds the notebook-level `x` and `y`.
X_sample, y_sample = (
    tensor.detach().numpy() for tensor in next(iter(train_loader))
)

# Copy the PyTorch parameters into the revdiff network so that both models
# compute the same function. torch.nn.Linear stores weights as
# (out_features, in_features), hence the transpose; on 1-D biases .T is a
# no-op. Parameters are paired by position in both networks.
tparams = list(tnet.parameters())
for i, tparam in enumerate(tparams):
    dnet.params_[i].update(tparam.detach().numpy().T)

# PyTorch side: CrossEntropyLoss takes the integer class labels directly.
tX = torch.tensor(X_sample)
ty = torch.tensor(y_sample)
ty_logits = tnet(tX)
tloss = criterion(ty_logits, ty)
tnet.zero_grad()
tloss.backward()

# revdiff side: labels are one-hot encoded first. The width is OUT_SIZE, the
# same constant that sizes the output layer of both networks, rather than a
# separate literal that could drift out of sync.
dX = rd.build_val(X_sample)
dy = rd.build_val(utils.vec2one_hot(y_sample, OUT_SIZE))
dy_logits = dnet(dX)
dloss = rd.build_cross_entropy_loss(dy_logits, dy)

# Compare logits and loss values; the printed distances should be close to 0.
tloss_np = tloss.detach().numpy()
print(metrics.tdist(ty_logits.detach().numpy(), dy_logits.eval()))
print(tloss_np)
print(dloss.eval())
print(metrics.tdist(tloss_np, dloss.eval()))

# Compare the gradient of the loss w.r.t. every parameter. PyTorch gradients
# are transposed the same way as the weights were when they were copied.
print('Checking gradients')
for i, tparam in enumerate(tparams):
    grad = rd.build_node_grad(dloss, dnet.params_[i]).eval()
    grad_sol = tparam.grad.detach().numpy().T
    print(metrics.tdist(grad, grad_sol))

4.9710405e-07
147.31425
147.31424
1.5258789e-05
Checking gradients
3.6772453e-06
2.6905576e-07
5.310021e-06
4.73366e-07
4.2969423e-06
1.2828853e-06


## Convolutional Network

In [10]:
import torch.nn.functional as F

class TNet(torch.nn.Module):
    """PyTorch reference CNN: two conv/ReLU/max-pool stages and a linear head."""

    def __init__(self):
        super().__init__()
        # 1 input channel -> 6 channels, then 6 -> 16, both with 5x5 kernels.
        self.conv1 = torch.nn.Conv2d(1, 6, 5)
        self.conv2 = torch.nn.Conv2d(6, 16, 5)
        # The head expects 16 feature maps of 5x5 and produces 4 outputs.
        self.fc1 = torch.nn.Linear(16 * 5 * 5, 4)

    def forward(self, x):
        """Return a (batch, 4) tensor for a batch of single-channel images."""
        stage1 = F.relu(self.conv1(x))
        stage1 = F.max_pool2d(stage1, kernel_size=(2, 2))
        stage2 = F.relu(self.conv2(stage1))
        stage2 = F.max_pool2d(stage2, kernel_size=(2, 2))
        # Flatten everything except the batch axis before the linear head.
        features = stage2.view(stage2.shape[0], -1)
        return self.fc1(features)


# Reference network and its summed squared-error loss.
tnet = TNet()
criterion = torch.nn.MSELoss(reduction='sum')

In [11]:
class DNet(rd.Network):
    """revdiff counterpart of the convolutional TNet."""

    def __init__(self):
        super().__init__()
        # Layers are created in the same order as in TNet.
        # NOTE(review): the conv2d_layer arguments look like (in_channels,
        # out_channels, kernel_h, kernel_w) -- confirm in revdiff.
        self.conv1 = self.conv2d_layer(1, 6, 5, 5)
        self.conv2 = self.conv2d_layer(6, 16, 5, 5)
        self.fc = self.dense_layer(16 * 5 * 5, 4)

    def forward(self, x):
        """Build the graph from an image-batch node to the 4-output node."""
        # NOTE(review): the four 2s passed to build_max_pooling are presumably
        # the pooling window and stride sizes -- confirm in revdiff.
        stage1 = rd.build_vrelu(self.conv1(x))
        stage1 = rd.build_max_pooling(stage1, 2, 2, 2, 2)

        stage2 = rd.build_vrelu(self.conv2(stage1))
        stage2 = rd.build_max_pooling(stage2, 2, 2, 2, 2)

        # Keep the first axis and flatten the rest before the dense layer.
        features = rd.build_reshape(stage2, (stage2.shape[0], -1))
        return self.fc(features)


dnet = DNet()

In [12]:
# Random inputs and targets drawn from a locally seeded generator, so the
# check uses the same data on every run without touching NumPy's global
# random state. (The network weights still come from PyTorch's own
# initialisation.)
rng = np.random.RandomState(0)
X = rng.randn(3, 1, 32, 32).astype(np.float32)
y = rng.randn(3, 4).astype(np.float32)


def _to_rd_layout(array):
    """Transpose 2-D arrays (dense weights and their gradients).

    Arrays of any other rank (convolution kernels, biases) are returned
    unchanged, matching how the parameters are copied into the revdiff network.
    """
    return array.T if array.ndim == 2 else array


# Copy the PyTorch parameters into the revdiff network so that both models
# compute the same function. Parameters are paired by position.
tparams = list(tnet.parameters())
for i, tparam in enumerate(tparams):
    dnet.params_[i].update(_to_rd_layout(tparam.detach().numpy()))

# PyTorch side: forward pass, summed squared-error loss, backward pass.
tX = torch.tensor(X)
ty = torch.tensor(y)
ty_logits = tnet(tX)
tloss = criterion(ty_logits, ty)
tnet.zero_grad()
tloss.backward()

# revdiff side: the same computation expressed as a graph.
# NOTE(review): this is the only loss created through an `op_*` helper; the
# other comparison cells use `rd.build_*` -- confirm this is intentional.
dX = rd.build_val(X)
dy = rd.build_val(y)
dy_logits = dnet(dX)
dloss = rd.op_mse_loss(dy_logits, dy)

# Compare outputs and loss values; the printed distances should be close to 0.
tloss_np = tloss.detach().numpy()
print(metrics.tdist(ty_logits.detach().numpy(), dy_logits.eval()))
print(tloss_np)
print(dloss.eval())
print(metrics.tdist(tloss_np, dloss.eval()))

# Compare the gradient of the loss w.r.t. every parameter, converting the
# PyTorch gradients to the layout used when the weights were copied.
print('Checking gradients')
for i, tparam in enumerate(tparams):
    grad = rd.build_node_grad(dloss, dnet.params_[i]).eval()
    grad_sol = _to_rd_layout(tparam.grad.detach().numpy())
    print(metrics.tdist(grad, grad_sol))

1.0470651e-07
15.654362
15.654361724853516
0.0
Checking gradients
2.9476935e-06
6.2956303e-07
4.8972583e-06
4.3136222e-07
2.9718476e-06
2.6656008e-07
