# What are synthetic gradients?

### Demo - We're going to use a newer optimization strategy called "Synthetic Gradients" instead of "Backpropagation" to train our simple feedforward Neural Network.

<img src="https://storage.googleapis.com/deepmind-live-cms/documents/3-6.gif">

## How do Neural Networks Learn?

![alt text](http://datathings.com/blog/images/neuralnet/nnblackbox.png "Logo Title Text 1")

![alt text](https://www.intechopen.com/source/html/38738/media/f2.jpg "Logo Title Text 1")

Learning process
- Use inputs + desired outputs to update internal state accordingly

Prediction process 
- Use input and internal state to generate most likely output according to its past “training experience”

![alt text](https://qph.ec.quoracdn.net/main-qimg-b2afcc88428418db01552987182e7b6a.webp "Logo Title Text 1")

![alt text](https://qph.ec.quoracdn.net/main-qimg-7bdfcff266211a74a31bfcdcc99c0087.webp "Logo Title Text 1")


## Gradient Descent
![alt text](http://datathings.com/blog/images/neuralnet/derivative2.png "Logo Title Text 1")

In [7]:

import numpy as np
import sys

def generate_dataset(output_dim = 8,num_examples=1000):
    """Build a random binary-addition dataset.

    Two integers are drawn per example, each in ``[0, 2**(output_dim - 1))``,
    so their sum always fits in ``output_dim`` bits.

    Args:
        output_dim: Number of bits used to encode each operand and the sum.
        num_examples: Number of examples (rows) to generate.

    Returns:
        A tuple ``(x, y)`` of NumPy arrays with one row per example. Each row
        of ``x`` is the bit vector of the left operand followed by the bit
        vector of the right operand; each row of ``y`` is the bit vector of
        their sum. Bits are stored most-significant first.
    """
    def to_bits(value):
        # Right-align the binary digits inside a zero-filled vector.
        bits = np.zeros(output_dim)
        digits = [int(ch) for ch in np.binary_repr(value)]
        bits[-len(digits):] = digits
        return bits

    # Left operands are drawn before right operands; the order matters for
    # reproducibility under a fixed seed.
    operand_limit = 2 ** (output_dim - 1)
    left = (np.random.rand(num_examples) * operand_limit).astype('int')
    right = (np.random.rand(num_examples) * operand_limit).astype('int')
    total = left + right

    x = np.array([np.concatenate((to_bits(a), to_bits(b)))
                  for a, b in zip(left, right)])
    y = np.array([to_bits(s) for s in total])

    return (x, y)

def sigmoid(x):
    """Return the logistic function ``1 / (1 + e**-x)`` of ``x``, elementwise."""
    decay = np.exp(-x)
    return 1 / (decay + 1)

def sigmoid_out2deriv(out):
    """Return ``out * (1 - out)``.

    When ``out`` is a sigmoid output, this is the sigmoid's derivative at the
    corresponding input.
    """
    complement = 1 - out
    return complement * out

class DNI(object):
    """A fully connected layer paired with a linear synthetic-gradient model.

    The layer computes ``nonlin(input.dot(weights) + bias)``. A second linear
    map (``weights_0_1_synthetic_grads`` / ``bias_0_1_synthetic_grads``)
    predicts, from the layer's output, the gradient arriving at that output,
    so the layer can update itself right after its own forward pass.
    """

    def __init__(self,input_dim, output_dim,nonlin,nonlin_deriv,alpha = 0.1):
        """Create the layer and its synthetic-gradient model.

        Args:
            input_dim: Width of the incoming activations.
            output_dim: Width of the activations this layer produces.
            nonlin: Activation function applied to the pre-activations.
            nonlin_deriv: Function mapping the layer's *output* to the
                activation's derivative.
            alpha: Step size used by every update in this layer.
        """
        # The four randn draws happen in a fixed order (weights, bias,
        # synthetic weights, synthetic bias) so seeded runs are reproducible.
        self.weights = np.random.randn(input_dim, output_dim) * 2 - 1
        self.bias = np.random.randn(output_dim) * 2 - 1

        # Multiplying by zero starts the synthetic-gradient model at zero
        # while still consuming the same random numbers.
        self.weights_0_1_synthetic_grads = np.random.randn(output_dim, output_dim) * .0 - .0
        self.bias_0_1_synthetic_grads = np.random.randn(output_dim) * .0 - .0

        self.alpha = alpha
        self.nonlin = nonlin
        self.nonlin_deriv = nonlin_deriv

    def forward_and_synthetic_update(self,input,update=True):
        """Run the forward pass and, optionally, a synthetic-gradient update.

        Args:
            input: Batch of activations; it is used via ``.dot`` and ``.T``.
            update: When false, only the forward pass is performed.

        Returns:
            The layer output alone when ``update`` is false. Otherwise a
            tuple ``(input_gradient, output)``, where ``input_gradient`` is
            the synthetic gradient pushed back through the already-updated
            weights.
        """
        self.input = input
        pre_activation = self.input.dot(self.weights) + self.bias
        self.output = self.nonlin(pre_activation)

        if not update:
            return self.output

        # Predict the gradient at the output, then convert it into a
        # gradient at the pre-activation.
        predicted = self.output.dot(self.weights_0_1_synthetic_grads) + self.bias_0_1_synthetic_grads
        self.synthetic_gradient = predicted
        self.weight_synthetic_gradient = predicted * self.nonlin_deriv(self.output)
        step = self.weight_synthetic_gradient

        # In-place updates: weights use the batch sum, bias the batch mean.
        self.weights -= self.alpha * self.input.T.dot(step)
        self.bias -= self.alpha * np.mean(step, axis=0)

        return step.dot(self.weights.T), self.output

    def normal_update(self,true_gradient):
        """Update the layer from a supplied gradient at its output.

        Uses the ``input`` and ``output`` stored by the most recent forward
        pass.

        Args:
            true_gradient: Gradient with respect to this layer's output.

        Returns:
            The gradient with respect to this layer's input, computed with
            the weights as they are after the update.
        """
        local_grad = true_gradient * self.nonlin_deriv(self.output)

        self.weights -= self.alpha * self.input.T.dot(local_grad)
        self.bias -= self.alpha * np.mean(local_grad, axis=0)

        return local_grad.dot(self.weights.T)

    def update_synthetic_weights(self,true_gradient):
        """Nudge the synthetic-gradient model toward ``true_gradient``.

        Compares the prediction stored by the last updating forward pass with
        ``true_gradient`` and steps the synthetic weights and bias against
        that difference.

        Args:
            true_gradient: Target gradient for this layer's output.
        """
        mismatch = self.synthetic_gradient - true_gradient
        self.synthetic_gradient_delta = mismatch
        self.weights_0_1_synthetic_grads -= self.alpha * self.output.T.dot(mismatch)
        self.bias_0_1_synthetic_grads -= self.alpha * np.mean(mismatch, axis=0)
        
# Train a three-layer network on binary addition using synthetic gradients:
# layers 1 and 2 update themselves from their own gradient predictions, and
# only layer 3 is updated from the real output error.

# Fix the seed so the dataset and the initial weights are reproducible.
np.random.seed(1)

num_examples = 100
output_dim = 8
iterations = 10000

x,y = generate_dataset(num_examples=num_examples, output_dim = output_dim)

batch_size = 10
alpha = 0.01

input_dim = len(x[0])
layer_1_dim = 64
layer_2_dim = 32
output_dim = len(y[0])

layer_1 = DNI(input_dim,layer_1_dim,sigmoid,sigmoid_out2deriv,alpha)
layer_2 = DNI(layer_1_dim,layer_2_dim,sigmoid,sigmoid_out2deriv,alpha)
layer_3 = DNI(layer_2_dim, output_dim,sigmoid, sigmoid_out2deriv,alpha)

# Integer division: a trailing partial batch (if any) is skipped.
num_batches = len(x) // batch_size

# `iteration` rather than `iter`, so the builtin iter() is not shadowed.
for iteration in range(iterations):
    error = 0
    synthetic_error = 0

    for batch_i in range(num_batches):
        batch_start = batch_i * batch_size
        batch_stop = batch_start + batch_size
        batch_x = x[batch_start:batch_stop]
        batch_y = y[batch_start:batch_stop]

        # Layers 1 and 2 run forward and immediately update their own weights
        # from their synthetic gradients. Layer 3 only runs forward here.
        _, layer_1_out = layer_1.forward_and_synthetic_update(batch_x)
        layer_1_delta, layer_2_out = layer_2.forward_and_synthetic_update(layer_1_out)
        layer_3_out = layer_3.forward_and_synthetic_update(layer_2_out, update=False)

        # Layer 3 is trained on the real output error and hands back the
        # gradient at layer 2's output.
        layer_3_delta = layer_3_out - batch_y
        layer_2_delta = layer_3.normal_update(layer_3_delta)

        # Train the gradient predictors: layer 2's against the gradient from
        # layer 3, layer 1's against the (synthetic) gradient layer 2 passed back.
        layer_2.update_synthetic_weights(layer_2_delta)
        layer_1.update_synthetic_weights(layer_1_delta)

        error += (np.sum(np.abs(layer_3_delta)))
        synthetic_error += (np.sum(np.abs(layer_2_delta - layer_2.synthetic_gradient)))

    # "\r" rewrites the same console line; a newline every 1000 iterations
    # keeps a snapshot of the progress.
    sys.stdout.write("\rIter:" + str(iteration) + " Loss:" + str(error) + " Synthetic Loss:" + str(synthetic_error))
    if iteration % 1000 == 999:
        print("")

Iter:999 Loss:380.142015668 Synthetic Loss:1694.16764408
Iter:1999 Loss:367.203984217 Synthetic Loss:1717.46239983
Iter:2999 Loss:361.265375816 Synthetic Loss:1735.53587437
Iter:3999 Loss:357.190353264 Synthetic Loss:1715.18658268
Iter:4999 Loss:351.924334112 Synthetic Loss:1749.48418294
Iter:5999 Loss:344.629266092 Synthetic Loss:1723.40376341
Iter:6999 Loss:342.940872462 Synthetic Loss:1734.03534908
Iter:7999 Loss:338.074206236 Synthetic Loss:1720.04109979
Iter:8999 Loss:326.151407022 Synthetic Loss:1636.67750546
Iter:9999 Loss:324.912688062 Synthetic Loss:1633.27356925


## The Problem with Backpropagation

### Locking

![alt text](https://storage.googleapis.com/deepmind-live-cms/images/3-1.width-1500_zU6x0wC.png "Logo Title Text 1")

- A layer can only be updated after a full forward+backward pass has been completed.
- After Layer 1 has processed input, it updates after output activations (black lines) have been propagated through the rest of the network, generated a loss, and the error gradients (green lines) backpropagated through every layer until Layer 1 is reached. 
- So L1 must wait for forward+backward pass of L2 & L3 before updating
- Therefore L1 is locked/coupled to the rest of the network

![alt text](https://www.semiwiki.com/forum/attachments/content/attachments/17619d1467046829-googlenet-inceptions-jpg "Logo Title Text 1")

- For simple networks it's a non-issue
- But consider a complex system of multiple networks, acting in multiple environments at asynchronous and irregular timescales.
- Or a big distributed network spread over multiple machines, where waiting for the full pass is expensive in time.


### If we decouple the interfaces - the connections -  between layers, every layer can be updated independently, and is not locked to the rest of the network. But how?


## Synthetic Gradients

![alt text](https://storage.googleapis.com/deepmind-live-cms/images/3-3.width-1500_Ij679hz.png "Logo Title Text 1")

- Normally, a neural network compares its predictions to a dataset to decide how to update its weights. 
- It then uses backpropagation to figure out how each weight should move in order to make the prediction more accurate. 
- However, with Synthetic Gradients, individual layers instead make a "best guess" for what they think the data will say, and then update their weights according to this guess. 
- This "best guess" is called a Synthetic Gradient. 
- The data is only used to help update each layer's "guesser" or Synthetic Gradient generator. 
- This allows individual layers to learn in isolation (most of the time), which increases the speed of training.

If we use a synthetic gradient model we can do the following:

![alt text](https://storage.googleapis.com/deepmind-live-cms/images/3-4.width-1500_jjNNlb7.png "Logo Title Text 1")

... and use the synthetic gradients (blue) to update Layer 1 before the rest of the network has even been executed.

The synthetic gradient model itself is trained to regress target gradients - these target gradients could be the true gradients backpropagated from the loss or other synthetic gradients which have been backpropagated from a further downstream synthetic gradient model.

![alt text](https://storage.googleapis.com/deepmind-live-cms/images/3-5.width-1500_pmWHi94.png "Logo Title Text 1")

Animated:

![alt text](https://storage.googleapis.com/deepmind-live-cms/documents/3-6.gif "Logo Title Text 1")

![alt text](https://iamtrask.github.io/img/synthetic_grads_paper.png "Logo Title Text 1")

- Synthetic Gradient generators are just neural nets trained to take the output of a layer and predict the gradient that will likely happen at that layer.
- When we perform full forward + back pass, we get the "correct" gradient
- We can compare this to our "synthetic" gradient 
- So we can train our Synthetic Gradient networks by pretending that our "true gradients" are coming from a mythical dataset

See how the gradient from $M_{i+2}$ backpropagates through $f_{i+1}$ and into $M_{i+1}$? As you can see, each synthetic gradient generator is actually only trained using the Synthetic Gradients generated from the next layer. Thus, only the last layer actually trains on the data. All the other layers, including the Synthetic Gradient generator networks, train based on Synthetic Gradients. Thus, the network can train with each layer only having to wait on the synthetic gradient from the following layer (which has no other dependencies).

- DNI doesn’t magically allow networks to train without true gradient information. The true gradient information does percolate backwards through the network, but just slower and over many training iterations, through the losses of the synthetic gradient models. 
- But overall the network is faster  because the synthetic gradient models approximate and smooth over the absence of true gradients.
- DNI can be applied to any generic neural network architecture, not just feed-forward networks
- This is awesome! I want to see this integrated into all major DL libraries. It would allow distributed training of networks, and make training faster and cleaner.


In [5]:
# Normal back-prop gradient descent neural network

def generate_dataset(output_dim = 8,num_examples=1000):
    """Generate random pairs of integers and their sums as bit vectors.

    Each addend is drawn from ``[0, 2**(output_dim - 1))``, which guarantees
    the sum can be written in ``output_dim`` bits.

    Args:
        output_dim: Bit width of every encoded number.
        num_examples: How many examples (rows) to produce.

    Returns:
        ``(x, y)`` as NumPy arrays with one row per example: a row of ``x``
        is the first addend's bits followed by the second addend's bits, and
        the matching row of ``y`` is the bits of their sum (most-significant
        bit first).
    """
    def encode(number):
        # Zero-pad on the left so every vector has output_dim entries.
        digits = np.binary_repr(number)
        vec = np.zeros(output_dim)
        vec[-len(digits):] = [int(d) for d in digits]
        return vec

    scale = 2**(output_dim - 1)
    # Two draws, in order: first addends, then second addends.
    first, second = [(np.random.rand(num_examples) * scale).astype('int')
                     for _ in range(2)]

    inputs = [np.concatenate((encode(a), encode(b)))
              for a, b in zip(first, second)]
    targets = [encode(s) for s in first + second]

    return (np.array(inputs), np.array(targets))
    
# Seed NumPy's global RNG so the dataset and initial weights created below are reproducible.
np.random.seed(1)

def sigmoid(x):
    """Apply the logistic function ``1 / (1 + e**-x)`` to ``x``, elementwise."""
    denominator = np.exp(-x) + 1
    return 1 / denominator

# Baseline: the same binary-addition task trained with ordinary
# backpropagation through three bias-free sigmoid layers.

num_examples = 100
output_dim = 8
iterations = 10000

x,y = generate_dataset(num_examples=num_examples, output_dim = output_dim)

batch_size = 10
alpha = 0.1

input_dim = len(x[0])
layer_1_dim = 128
layer_2_dim = 64
output_dim = len(y[0])

weights_0_1 = (np.random.randn(input_dim,layer_1_dim) * 0.2) - 0.1
weights_1_2 = (np.random.randn(layer_1_dim,layer_2_dim) * 0.2) - 0.1
weights_2_3 = (np.random.randn(layer_2_dim,output_dim) * 0.2) - 0.1

# Integer division: a trailing partial batch (if any) is skipped.
num_batches = len(x) // batch_size

# `iteration` rather than `iter`, so the builtin iter() is not shadowed.
for iteration in range(iterations):
    error = 0

    for batch_i in range(num_batches):
        batch_start = batch_i * batch_size
        batch_stop = batch_start + batch_size
        batch_x = x[batch_start:batch_stop]
        batch_y = y[batch_start:batch_stop]

        # Forward pass.
        layer_0 = batch_x
        layer_1 = sigmoid(layer_0.dot(weights_0_1))
        layer_2 = sigmoid(layer_1.dot(weights_1_2))
        layer_3 = sigmoid(layer_2.dot(weights_2_3))

        # Backward pass: every delta is computed from the current weights
        # before any weight matrix is modified.
        layer_3_delta = (layer_3 - batch_y) * layer_3  * (1 - layer_3)
        layer_2_delta = layer_3_delta.dot(weights_2_3.T) * layer_2 * (1 - layer_2)
        layer_1_delta = layer_2_delta.dot(weights_1_2.T) * layer_1 * (1 - layer_1)

        weights_0_1 -= layer_0.T.dot(layer_1_delta) * alpha
        weights_1_2 -= layer_1.T.dot(layer_2_delta) * alpha
        weights_2_3 -= layer_2.T.dot(layer_3_delta) * alpha

        # NOTE: this accumulates |layer_3_delta|, i.e. the output error
        # already scaled by the sigmoid derivative, not the raw
        # |prediction - target|.
        error += (np.sum(np.abs(layer_3_delta)))

    # "\r" rewrites the same console line; a newline every 1000 iterations
    # keeps a snapshot of the progress.
    sys.stdout.write("\rIter:" + str(iteration) + " Loss:" + str(error))
    if iteration % 1000 == 999:
        print("")


Iter:999 Loss:1.77001735428
Iter:1999 Loss:0.327780634499
Iter:2999 Loss:0.181843992452
Iter:3999 Loss:0.184111347924
Iter:4999 Loss:0.0999035747467
Iter:5999 Loss:0.0736026337433
Iter:6999 Loss:0.0588353479911
Iter:7999 Loss:0.0491536076171
Iter:8999 Loss:0.0422581205704
Iter:9999 Loss:0.0370760733124
