# Hacker's Guide to Neural Networks
* Python code w/ personal notes and experiments from [Andrej Karpathy's tutorial](http://karpathy.github.io/neuralnets/)

---
# Real-valued Circuits

### Circuit with Single Gate

In [1]:
def forwardMultiplyGate(x,y):
    return x * y

forwardMultiplyGate(-2, 3)

-6

#### Strategy 1: Random Local Search

In [2]:
x, y = -2, 3
best_x, best_y = x, y
best_out = -float("inf")
tweak_amount = 0.01

In [3]:
import random
for i in range(100):
    random_ = (random.random() * 2 - 1)
    x_try = x + tweak_amount * random_
    y_try = y + tweak_amount * random_
    out = forwardMultiplyGate(x_try, y_try)
    
    if (out > best_out):
        best_out, best_x, best_y = out, x_try, y_try

best_x, best_y, out

(-1.9900257329902675, 3.0099742670097323, -5.9932315527023885)

#### Strategy 2: Numerical Gradient
* tweaking the knobs for each pass

In [4]:
x, y = -2, 3
out = forwardMultiplyGate(x, y)
h = 0.0001

out_x = forwardMultiplyGate(x + h, y)
x_derivative = float((out_x - out) / h)
out_y = forwardMultiplyGate(x, y + h)
y_derivative = float((out_y - out) / h)

x_derivative, y_derivative

(3.00000000000189, -2.0000000000042206)

In [5]:
step_size = 0.01
x += step_size * x_derivative
y += step_size * y_derivative
out_new = forwardMultiplyGate(x, y)

out_new

-5.87059999999986

Try to see with more iterations if the gradient is really towards increasing the function.. 

It is! Except if step to (-)step, opposite direction of gradient.

In [8]:
step_size = 0.01
x, y = -1, 1
h = 0.0001

for i in range(10):
    
    out = forwardMultiplyGate(x, y)
    
    out_x = forwardMultiplyGate(x + h, y)
    x_derivative = (out_x - out) / h
    
    out_y = forwardMultiplyGate(x, y + h)
    y_derivative = (out_y - out) / h
    
    x += step_size * x_derivative
    y += step_size * y_derivative
    out_new = forwardMultiplyGate(x, y)
    
    print "old x: %s, x derivative: %s, new x: %s\nold y: %s, y derivative: %s, new y: %s\nout: %s\n" % \
    (out_x, x_derivative, x, out_y, y_derivative, y, out_new)

old x: -0.9999, x derivative: 1.0, new x: -0.99
old y: -1.0001, y derivative: -1.0, new y: 0.99
out: -0.9801

old x: -0.980001, x derivative: 0.99, new x: -0.9801
old y: -0.980199, y derivative: -0.99, new y: 0.9801
out: -0.96059601

old x: -0.960498, x derivative: 0.9801, new x: -0.970299
old y: -0.96069402, y derivative: -0.9801, new y: 0.970299
out: -0.941480149401

old x: -0.941383119501, x derivative: 0.970299, new x: -0.96059601
old y: -0.941577179301, y derivative: -0.970298999999, new y: 0.96059601
out: -0.922744694428

old x: -0.922648634827, x derivative: 0.96059601, new x: -0.9509900499
old y: -0.922840754029, y derivative: -0.96059601, new y: 0.9509900499
out: -0.904382075009

old x: -0.904286976004, x derivative: 0.950990049901, new x: -0.941480149401
old y: -0.904477174014, y derivative: -0.950990049899, new y: 0.941480149401
out: -0.886384871716

old x: -0.886290723701, x derivative: 0.941480149401, new x: -0.932065347907
old y: -0.886479019731, y derivative: -0.94148014

#### Strategy 3: Analytical Gradient
* for our function, it turns out that the derivatives of x, y are y, x respectively.

In [9]:
x, y = -2, 3 # re-initialize
x_gradient, y_gradient = y, x # derived from separate evaluation

x += step_size * x_gradient
y += step_size * y_gradient
out_new = forwardMultiplyGate(x, y)

out_new

-5.8706

### Circuits with Multiple Gates

In [10]:
def forwardMultiplyGate(a, b): return a * b
def forwardAddGate(a, b): return a + b

def forwardCircuit(x, y, z):
    q = forwardAddGate(x, y)
    f = forwardMultiplyGate(q, z) 
    return f

x, y, z = -2, 5, -4
forwardCircuit(x, y, z)

-12

#### Backpropagation
* the chain rule, is really really useful

In [11]:
x, y, z = -2, 5, -4
q = forwardAddGate(x, y)
f = forwardMultiplyGate(q, z)
print f

derivative_f_wrt_z = q
derivative_f_wrt_q = z

derivative_q_wrt_x = 1
derivative_q_wrt_y = 1

derivative_f_wrt_x = derivative_f_wrt_q * derivative_q_wrt_x
derivative_f_wrt_y = derivative_f_wrt_q * derivative_q_wrt_y

step_size = 0.01
x += step_size * derivative_f_wrt_x
y += step_size * derivative_f_wrt_y
z += step_size * derivative_f_wrt_z
print forwardMultiplyGate(forwardAddGate(x, y), z)

-12
-11.5924


In [12]:
x, y, z = -2, 5, -4
step_size = 0.01

for i in range(20):
    
    derivative_f_wrt_z = q
    derivative_f_wrt_q = z

    derivative_q_wrt_x = 1
    derivative_q_wrt_y = 1

    derivative_f_qrt_x = derivative_f_wrt_q * derivative_q_wrt_x
    derivative_f_wrt_y = derivative_f_wrt_q * derivative_q_wrt_y

    x += step_size * derivative_f_qrt_x
    y += step_size * derivative_f_wrt_y
    z += step_size * derivative_f_wrt_z
    
    print forwardMultiplyGate(forwardAddGate(x, y), z)

-11.5924
-11.191964
-10.798638
-10.412368
-10.0331
-9.66078
-9.295354
-8.936768
-8.584968
-8.2399
-7.90151
-7.569744
-7.244548
-6.925868
-6.61365
-6.30784
-6.008384
-5.715228
-5.428318
-5.1476


Experiment with chain rule with a basic cost function.

It works! (Finds the proper inputs to minimize the function, instead of just ascending the function). Beautiful.

In [14]:
x, y, z, k = -1, 3, 4, 6
step_size = 0.001

for i in range(50):    
   
    f = forwardMultiplyGate(forwardAddGate(x, y), z)
    c = ((f - k) ** 2) / 2
    
    derivative_f_wrt_z = q
    derivative_f_wrt_q = z

    derivative_q_wrt_x = 1
    derivative_q_wrt_y = 1

    derivative_f_wrt_x = derivative_f_wrt_q * derivative_q_wrt_x
    derivative_f_wrt_y = derivative_f_wrt_q * derivative_q_wrt_y
    
    derivative_c_wrt_f = k - f # we want to follow the opposite of the gradient, to minimize, not maximize the cost
    derivative_c_wrt_x = derivative_c_wrt_f * derivative_f_wrt_x
    derivative_c_wrt_y = derivative_c_wrt_f * derivative_f_wrt_y
    derivative_c_wrt_z = derivative_c_wrt_f * derivative_f_wrt_z

    x += step_size * derivative_c_wrt_x
    y += step_size * derivative_c_wrt_y
    z += step_size * derivative_c_wrt_z

    print x, y, z, f, c

 -1.008 2.992 3.994 8 2
-1.01568483942 2.98431516058 3.988227712 7.924096 1.85107270861
-1.02306842885 2.97693157115 3.982673674 7.8513460015 1.71374100864
-1.03016395748 2.96983604252 3.9773288761 7.78159929941 1.58704803184
-1.03698393817 2.96301606183 3.97218473472 7.714713794 1.47012169766
-1.04354024913 2.95645975087 3.96723306851 7.6505554002 1.36216656457
-1.04984417258 2.95015582742 3.962466076 7.5889975047 1.26245653498
-1.05590643052 2.94409356948 3.9578763146 7.52992046615 1.17032831638
-1.06173721806 2.93826278194 3.95345668114 7.4732111548 1.08517555331
-1.06734623425 2.93265376575 3.94920039355 7.41876252785 1.00644355522
-1.0727427109 2.9272572891 3.94510097384 7.36647323747 0.933624554359
-1.07793543928 2.92206456072 3.94115223204 7.31624726845 0.866253435853
-1.0829327951 2.9170672049 3.93734825123 7.26799360311 0.803903888768
-1.08774276174 2.91225723826 3.93368337349 7.22162591093 0.746184933134
-1.09237295199 2.90762704801 3.93015218671 7.17706226095 0.692737783074


### Single Neuron
* using sigmoid activation

In [16]:
class Unit(object):
    def __init__(self, value, grad):
        self.value = value
        self.grad = grad
        
class multiplyGate(object):
    def forward(self, u0, u1):
        self.u0 = u0
        self.u1 = u1
        self.utop = Unit(u0.value * u1.value, 0.0)
        return self.utop
    
    def backward(self):
        self.u0.grad += self.u1.value * self.utop.grad
        self.u1.grad += self.u0.value * self.utop.grad
    
x = Unit(2,0)
y = Unit(-3,0)
print x.value, x.grad

m = multiplyGate()
print m.forward(x,y).value
print x.grad, y.grad

2 0
-6
0 0
