# [Hacker's Guide to Neural Networks](http://karpathy.github.io/neuralnets/)

## Simple Circuit

In [None]:
def forwardMultiplyGate(a, b):
    """Forward pass of a multiply gate: return the product of `a` and `b`."""
    product = a * b
    return product

def forwardAddGate(a, b):
    """Forward pass of an add gate: return the sum of `a` and `b`."""
    total = a + b
    return total

def forwardCircuit(x, y, z):
    """Evaluate the two-gate circuit f = (x + y) * z.

    The add gate runs first and its output feeds the multiply gate.
    """
    return forwardMultiplyGate(forwardAddGate(x, y), z)



# initial conditions
x, y, z = -2, 5, -4
q = forwardAddGate(x, y)  # q is 3
f = forwardMultiplyGate(q, z)  # output is -12

# Local gradients of the MULTIPLY gate f = q * z with respect to its inputs
# ("wrt" is short for "with respect to"): each input's derivative is the
# other input's value.
derivative_f_wrt_z = q
derivative_f_wrt_q = z

# Local gradients of the ADD gate q = x + y: both are 1.
derivative_q_wrt_x = derivative_q_wrt_y = 1.0

# Chain rule: multiply the add gate's local gradient by the gradient that
# arrives from the multiply gate.
derivative_f_wrt_x = derivative_q_wrt_x * derivative_f_wrt_q
derivative_f_wrt_y = derivative_q_wrt_y * derivative_f_wrt_q

# final gradient of f with respect to (x, y, z): [-4.0, -4.0, 3]
gradient_f_wrt_xyz = [derivative_f_wrt_x, derivative_f_wrt_y, derivative_f_wrt_z]

# Let the inputs respond to the force/tug: move each one a small step along
# its own gradient.
step_size = 0.01
x += step_size * derivative_f_wrt_x
y += step_size * derivative_f_wrt_y
z += step_size * derivative_f_wrt_z

# Re-run the forward pass with the nudged inputs; f rises from -12 to
# roughly -11.59.
q = forwardAddGate(x, y)
f = forwardMultiplyGate(q, z)


## Backprop

$f(x, y, a, b, c)=\sigma(ax+by+c)$

In [6]:
from abc import ABC, abstractmethod
import math

# every Unit corresponds to a wire in the diagrams
class Unit:
    """A value on a wire of the circuit together with its gradient.

    Args:
        value: the number computed for this wire in the forward pass.
        grad: derivative of the circuit output with respect to this unit,
            filled in during the backward pass. Defaults to 0.0, which is
            what every fresh unit starts with.
    """

    def __init__(self, value, grad=0.0):
        # value computed in the forward pass
        self.value = value
        # the derivative of circuit output w.r.t this unit, computed in backward pass
        self.grad = grad

    def __repr__(self):
        return 'Unit(value={!r}, grad={!r})'.format(self.value, self.grad)


class ABCGate(ABC):
    """Abstract base for gates: anything with a forward and a backward pass.

    Concrete gates in this file override both methods, usually with extra
    parameters (input Units for `forward`, an upstream pull for `backward`).
    """

    def __init__(self):
        """Nothing to set up at the base level."""

    @abstractmethod
    def forward(self):
        """Compute the gate's output from its inputs."""

    @abstractmethod
    def backward(self):
        """Push the output gradient back onto the gate's inputs."""
        

class MultiplyGate(ABCGate):
    """Gate whose output is the product of its two input Units."""

    def forward(self, u0, u1):
        """Multiply the values of `u0` and `u1` and return a new output Unit.

        The inputs and the output are stored on the gate because `backward`
        needs all three.
        """
        self.u0, self.u1 = u0, u1
        product = u0.value * u1.value
        self.utop = Unit(product, 0.0)
        return self.utop

    def backward(self):
        """Chain the output gradient into both inputs.

        For a product, the local gradient of each input is the other input's
        value. Gradients are accumulated (`+=`), not overwritten.
        """
        upstream = self.utop.grad
        self.u0.grad += self.u1.value * upstream
        self.u1.grad += self.u0.value * upstream


class AddGate(ABCGate):
    """Gate whose output is the sum of its two input Units."""

    def forward(self, u0, u1):
        """Add the values of `u0` and `u1` and return a new output Unit."""
        self.u0, self.u1 = u0, u1
        total = u0.value + u1.value
        self.utop = Unit(total, 0.0)
        return self.utop

    def backward(self):
        """Accumulate the output gradient into both inputs.

        The derivative of a sum with respect to either input is 1, so each
        input simply receives the upstream gradient.
        """
        upstream = self.utop.grad
        for unit in (self.u0, self.u1):
            unit.grad += 1 * upstream

class SigmoidGate(ABCGate):
    """Gate applying the logistic sigmoid 1 / (1 + e^-x) to one input Unit."""

    # helper function
    def _sig(self, x):
        """Return sigmoid(x) without overflowing for large negative `x`.

        `math.exp(-x)` raises OverflowError once `-x` exceeds roughly 709, so
        for negative inputs the algebraically equivalent e^x / (1 + e^x) is
        used instead; e^x then underflows harmlessly towards 0.
        """
        if x >= 0:
            return 1 / (1 + math.exp(-x))
        exp_x = math.exp(x)
        return exp_x / (1 + exp_x)

    def forward(self, u0):
        """Store the input Unit and return a new Unit holding sigmoid(u0.value)."""
        self.u0 = u0
        self.utop = Unit(self._sig(self.u0.value), 0.0)
        return self.utop

    def backward(self):
        """Accumulate the chained gradient into the input Unit.

        The local gradient of the sigmoid is s * (1 - s), where s is the
        sigmoid of the input's current value.
        """
        s = self._sig(self.u0.value)
        self.u0.grad += (s * (1 - s)) * self.utop.grad


In [10]:
# create input units as Unit(value, grad); every gradient starts at zero
a, b, c = Unit(1.0, 0.0), Unit(2.0, 0.0), Unit(-3.0, 0.0)
x, y = Unit(-1.0, 0.0), Unit(3.0, 0.0)

# create the gates: one per operation in sig(a*x + b*y + c)
mulg0, mulg1 = MultiplyGate(), MultiplyGate()
addg0, addg1 = AddGate(), AddGate()
sg0 = SigmoidGate()

# do the forward pass
def forward_neuron():
    """Run the forward pass of the neuron sig(a*x + b*y + c).

    Uses the module-level Units a, b, c, x, y and the module-level gates,
    and returns the output Unit of the sigmoid gate. The numbers in the
    comments hold for the initial unit values.
    """
    prod_ax = mulg0.forward(a, x)  # a*x = -1
    prod_by = mulg1.forward(b, y)  # b*y = 6
    weighted_sum = addg0.forward(prod_ax, prod_by)  # a*x + b*y = 5
    pre_activation = addg1.forward(weighted_sum, c)  # a*x + b*y + c = 2
    return sg0.forward(pre_activation)  # sig(a*x + b*y + c) = 0.8808

# Run the forward pass once and keep the output Unit; the backward pass
# starts from it.
s = forward_neuron()

# Show the value held by the output Unit.
print('circuit output: {0.value}'.format(s))

circuit output: 0.8807970779778823


In [11]:
# Compute Gradient
# Seed the backward pass with a gradient of 1.0 on the circuit output, then
# call backward() on the gates in the reverse of the forward order, so every
# gate's output gradient is set before it is chained into that gate's inputs.
# NOTE: every backward() accumulates with `+=` and the input Units are never
# reset here, so re-running this cell adds to the existing gradients.
s.grad = 1.0;
sg0.backward() # writes gradient into axpbypc
addg1.backward() # writes gradients into axpby and c
addg0.backward() # writes gradients into ax and by
mulg1.backward() # writes gradients into b and y
mulg0.backward() # writes gradients into a and x


In [15]:
# Inspect the gradient stored on input unit x.
x.grad

0.10499358540350662

In [17]:
step_size = 0.01

# Nudge every input a small step along its own gradient.
# Gradients from the backward pass:
#   a.grad is -0.105, b.grad is 0.315, c.grad is 0.105,
#   x.grad is 0.105, y.grad is 0.210
for unit in (a, b, c, x, y):
    unit.value += step_size * unit.grad

# Forward pass again with the nudged inputs: the output should have gone up.
s = forward_neuron()
print('circuit output after one backprop: {s.value}'.format(s=s))  # prints 0.8825

circuit output after one backprop: 0.8825501816218984


## Machine Learning

$f(x,y,a,b,c)=ax+by+c$

1. We select a random datapoint and feed it through the circuit

2. We will interpret the output of the circuit as a confidence that the datapoint has class +1. (i.e. very high values = circuit is very certain datapoint has class +1 and very low values = circuit is certain this datapoint has class -1.)

3. We will measure how well the prediction aligns with the provided labels. Intuitively, for example, if a positive example scores very low, we will want to tug in the positive direction on the circuit, demanding that it should output a higher value for this datapoint. Note that this is the case for the first datapoint: it is labeled as +1 but our predictor function only assigns it value -1.2. We will therefore tug on the circuit in the positive direction; we want the value to be higher.

4. The circuit will take the tug and backpropagate it to compute tugs on the inputs a,b,c,x,y

5. Since we think of x,y as (fixed) datapoints, we will ignore the pull on x,y. If you’re a fan of my physical analogies, think of these inputs as pegs, fixed in the ground.

6. On the other hand, we will take the parameters a,b,c and make them respond to their tug (i.e. we’ll perform what we call a parameter update). This, of course, will make it so that the circuit will output a slightly higher score on this particular datapoint in the future.

7. Iterate! Go back to step 1.

In [None]:

# Toy 2-D dataset: each entry is ([x, y], label) with label +1 or -1.
# NOTE(review): a later cell rebinds `data` to a plain list of points with a
# separate `labels` list and different coordinates; confirm which dataset is
# the intended one.
data = [([1.2, 0.7], +1),
        ([-0.3, 0.5], -1),
        ([-3, -1], +1),
        ([0.1, 1.0], -1),
        ([3.0, 1.1], -1),
        ([2.1, -3], +1)]



### Define Circuit

In [18]:
class Circuit(ABCGate):
    """Linear circuit a*x + b*y + c assembled from multiply and add gates.

    It takes 5 Units (x, y, a, b, c) and outputs a single Unit. It can also
    compute the gradient w.r.t. its inputs.
    """

    def __init__(self):
        # two products feed the first adder; the second adder adds the offset c
        self.mulg0, self.mulg1 = MultiplyGate(), MultiplyGate()
        self.addg0, self.addg1 = AddGate(), AddGate()

    def forward(self, x, y, a, b, c):
        """Run the forward pass and return the output Unit.

        Every intermediate Unit is kept on the instance.
        """
        self.ax = self.mulg0.forward(a, x)  # a*x
        self.by = self.mulg1.forward(b, y)  # b*y
        self.axpby = self.addg0.forward(self.ax, self.by)  # a*x + b*y
        self.axpbypc = self.addg1.forward(self.axpby, c)  # a*x + b*y + c
        return self.axpbypc

    def backward(self, gradient_top):
        """Backpropagate `gradient_top` (the pull from above) to all inputs.

        The output Unit's gradient is set to `gradient_top`, then the gates
        run backward in the reverse of the forward order:
        addg1 (into axpby and c), addg0 (into ax and by),
        mulg1 (into b and y), mulg0 (into a and x).
        """
        self.axpbypc.grad = gradient_top
        for gate in (self.addg1, self.addg0, self.mulg1, self.mulg0):
            gate.backward()
        

### Define Support Vector Machine

In [43]:
# SVM class

class SVM(ABCGate):
    """Linear classifier a*x + b*y + c trained with a margin-based pull.

    The parameters a, b, c are Units owned by the instance; the score is
    computed by an internal `Circuit`.
    """

    def __init__(self):
        # fixed initial parameter values (hard-coded, not drawn at random)
        self.a = Unit(1.0, 0.0)
        self.b = Unit(-2.0, 0.0)
        self.c = Unit(-1.0, 0.0)

        self.circuit = Circuit()

    def forward(self, x, y):  # assume x and y are Units
        """Score the point (x, y) and return the output Unit.

        The output Unit is also stored as `self.unit_out` for `backward`.
        """
        self.unit_out = self.circuit.forward(x, y, self.a, self.b, self.c)
        return self.unit_out

    def backward(self, label):  # label is +1 or -1
        """Compute the gradients on a, b, c for the last forward pass.

        Must be called after `forward`, because it reads `self.unit_out`.
        """
        # reset pulls on a,b,c
        self.a.grad = 0.0
        self.b.grad = 0.0
        self.c.grad = 0.0

        # compute the pull based on what the circuit output was
        score = self.unit_out.value
        pull = 0.0  # no pull when the score is already beyond the margin
        if label == 1 and score < 1:
            pull = 1  # the score was too low for a positive example: pull up
        elif label == -1 and score > -1:
            pull = -1  # the score was too high for a negative example: pull down

        self.circuit.backward(pull)  # writes gradient into x,y,a,b,c

        # add regularization pull for parameters: towards zero and
        # proportional to value (the offset c is not regularized)
        self.a.grad -= self.a.value
        self.b.grad -= self.b.value

    def learn_from(self, x, y, label, step_size=0.01):
        """Do one training step on a single labelled point.

        Args:
            x, y: input Units of the data point.
            label: +1 or -1.
            step_size: scale of the parameter update; defaults to 0.01.
        """
        self.forward(x, y)  # forward pass (set .value in all Units)
        self.backward(label)  # backward pass (set .grad in all Units)
        self.parameter_update(step_size)  # parameters respond to tug

    def parameter_update(self, step_size=0.01):
        """Move a, b, c by `step_size` times their current gradients."""
        self.a.value += step_size * self.a.grad
        self.b.value += step_size * self.b.grad
        self.c.value += step_size * self.c.grad

### Train with Stochastic Gradient Descent

In [46]:
import numpy as np

# Training set: 2-D points and, at the same index in `labels`, the class of
# each point (+1 or -1).
data = [[1.2, 0.7], [-0.3, -0.5], [3.0, 0.1],
        [-0.1, -1.0], [-1.0, 1.1], [2.1, -3]]
labels = [1, -1, 1, -1, -1, 1]

# The model instance used by the accuracy function and the learning loop.
svm = SVM()

# a function that computes the classification accuracy
def eval_training_accuracy():
    """Return the fraction of training points that `svm` classifies correctly.

    Reads the module-level `data`, `labels` and `svm`. A point is predicted
    as +1 when its score is strictly positive and as -1 otherwise. `data` and
    `labels` are expected to have the same length. Returns 0.0 for an empty
    dataset instead of dividing by zero.
    """
    if not data:
        return 0.0

    num_correct = 0
    for point, true_label in zip(data, labels):
        x = Unit(point[0], 0.0)
        y = Unit(point[1], 0.0)

        # see if the prediction matches the provided label
        predicted_label = 1 if svm.forward(x, y).value > 0 else -1
        if predicted_label == true_label:
            num_correct += 1

    return num_correct / len(data)


# the learning loop: one randomly chosen training point per step
# (the counter is not called `iter`, which would shadow the builtin)
for iteration in range(400):
    # pick a random data point
    i = np.random.randint(len(data), size=1)[0]
    x = Unit(data[i][0], 0.0)
    y = Unit(data[i][1], 0.0)
    label = labels[i]
    svm.learn_from(x, y, label)

    if iteration % 25 == 0:  # report every 25 iterations
        print('training accuracy at iter {iter}: {ac}'.format(
            iter=iteration, ac=eval_training_accuracy()))


training accuracy at iter 0: 0.6666666666666666
training accuracy at iter 25: 0.6666666666666666
training accuracy at iter 50: 0.8333333333333334
training accuracy at iter 75: 0.8333333333333334
training accuracy at iter 100: 0.8333333333333334
training accuracy at iter 125: 0.8333333333333334
training accuracy at iter 150: 0.8333333333333334
training accuracy at iter 175: 0.8333333333333334
training accuracy at iter 200: 0.8333333333333334
training accuracy at iter 225: 0.8333333333333334
training accuracy at iter 250: 0.8333333333333334
training accuracy at iter 275: 0.8333333333333334
training accuracy at iter 300: 0.8333333333333334
training accuracy at iter 325: 0.8333333333333334
training accuracy at iter 350: 1.0
training accuracy at iter 375: 1.0


In [32]:
# Number of training points.
len(data)

6