In [1]:
import numpy as np

In [2]:
class Module(object):
    """Common base for the layer and loss classes defined in this notebook."""

    def sgd_step(self, lrate):
        """Default update hook: do nothing and ignore ``lrate``.

        Subclasses that hold trainable state override this method.
        """
        return None

In [3]:
# Scratch input: a 5x1 column of draws from a normal(0, 1) distribution.
# NOTE(review): no seed is set in the visible cells, so the values shown
# below will differ on every fresh run.
g = np.random.normal(0, 1, [5, 1])
g

array([[ 0.72857353],
       [ 0.49828572],
       [ 0.11834723],
       [-0.52550008],
       [-0.29110585]])

In [4]:
np.sum(g)

0.5286005462620245

In [5]:
# 5x3 matrix of zero-mean normal draws with standard deviation 5 ** -0.5
# (the same `m ** -0.5` scaling that Linear.__init__ uses, with m = 5).
w = np.random.normal(0, 1.0 * 5 ** (-.5), [5, 3])
w

array([[-0.13712645,  0.03143536,  0.51662227],
       [ 0.26148976,  0.37021696,  0.00935355],
       [-0.17235972,  0.20493661,  0.0023513 ],
       [-0.01794616, -0.1867508 , -0.10064325],
       [ 0.21741338,  0.4292686 , -0.25823008]])

In [6]:
v = np.array([[2, 3, 4], [4, 5, 6], [7, 8, 9]])
np.diag(v)

array([2, 5, 9])

In [7]:
c = np.random.randint(10, size=(5, 10, 3))

In [8]:
# Multiply g by the boolean mask (g > 0): positive entries are kept,
# the rest become zero (shown as -0. where g was negative).
r = g * (g > 0)
r

array([[ 0.72857353],
       [ 0.49828572],
       [ 0.11834723],
       [-0.        ],
       [-0.        ]])

In [9]:
# First column of the (r > 0) mask, converted to a 1-D vector of 0/1 integers.
# NOTE: this rebinds `v`, which an earlier cell bound to a 3x3 matrix.
v = 1 * (r > 0)[:, 0]
v

array([1, 1, 1, 0, 0])

In [10]:
np.diag(v)

array([[1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0]])

In [11]:
tanh = np.tanh(g)
tanh

array([[ 0.62219187],
       [ 0.4607679 ],
       [ 0.11779778],
       [-0.48193383],
       [-0.28315232]])

In [12]:
# Element-wise 1 - tanh(g)**2, i.e. the derivative of tanh evaluated at g
# (`tanh` here is the array np.tanh(g) computed in the previous cell).
d_tanh = 1 - np.square(tanh)
d_tanh

array([[0.61287727],
       [0.78769294],
       [0.98612368],
       [0.76773979],
       [0.91982477]])

In [13]:
np.argmax(d_tanh, axis=0)

array([2])

In [14]:
np.argmax(w, axis=1)

array([2, 1, 1, 0, 1])

In [15]:
np.argmax(g, axis=0)[0]

0

In [21]:
class Linear(Module):
    """Fully connected layer computing ``Z = W.T . A + W0``."""

    def __init__(self, m, n):
        """Create a layer with ``m`` inputs and ``n`` outputs.

        ``W`` is drawn from a zero-mean normal distribution with standard
        deviation ``m ** -0.5``; the offset ``W0`` starts at zero.
        """
        self.m = m   # input size
        self.n = n   # output size
        self.W0 = np.zeros((self.n, 1))   # (n x 1)
        init_std = 1.0 * m ** (-.5)
        self.W = np.random.normal(0, init_std, (m, n))  # (m x n)

    def forward(self, A):
        """Cache the input ``A`` (m x b) and return the pre-activation (n x b)."""
        self.A = A
        weighted = np.dot(self.W.T, self.A)
        return weighted + self.W0

    def backward(self, dLdZ):
        """Store the parameter gradients for ``dLdZ`` (n x b) and return dLdA (m x b)."""
        self.dLdW = np.dot(self.A, dLdZ.T)                  # (m x n)
        self.dLdW0 = np.sum(dLdZ, axis=1, keepdims=True)    # (n x 1)
        dLdA = np.dot(self.W, dLdZ)
        return dLdA

    def sgd_step(self, lrate):
        """Move ``W`` and ``W0`` against their stored gradients by ``lrate``."""
        step_W = lrate * self.dLdW
        step_W0 = lrate * self.dLdW0
        self.W = self.W - step_W
        self.W0 = self.W0 - step_W0

class ReLU(Module):    # Layer activation
    """Rectified linear activation applied element-wise."""

    def forward(self, Z):
        """Cache and return ``Z`` with its non-positive entries zeroed."""
        self.A = Z * (Z > 0)
        return self.A

    def backward(self, dLdA):
        """Return dLdZ for an upstream gradient ``dLdA`` shaped like ``self.A``.

        The ReLU Jacobian is diagonal with entries 1 where the activation is
        positive and 0 elsewhere, so the product reduces to an element-wise
        mask. Masking every column with its own activations keeps this
        correct for any batch size; the previous code built the mask from
        column 0 only and applied it to every column.
        """
        return dLdA * (self.A > 0)

class Tanh(Module):
    """Hyperbolic-tangent activation applied element-wise."""

    def forward(self, Z):
        """Cache and return ``tanh(Z)``."""
        self.A = np.tanh(Z)
        return self.A

    def backward(self, dLdA):
        """Return dLdZ for an upstream gradient ``dLdA`` shaped like ``self.A``.

        d tanh(z) / dz = 1 - tanh(z)**2, and the Jacobian is diagonal per
        sample, so multiplying by it is an element-wise product. The previous
        code called ``np.diag`` on the 2-D array ``1 - A**2``, which extracts
        a diagonal instead of building a matrix, so the following ``np.dot``
        raised or returned the wrong shape.
        """
        return dLdA * (1 - np.square(self.A))


class SoftMax(Module):
    """Softmax output layer; each column of the input is normalised separately."""

    def forward(self, Z):
        """Cache and return the column-wise softmax of ``Z``.

        The per-column maximum is subtracted before exponentiating. Softmax
        is invariant to that shift, and it keeps ``np.exp`` from overflowing
        to ``inf`` (and the result from becoming ``nan``) for large logits.
        """
        shifted = Z - np.max(Z, axis=0, keepdims=True)
        exp = np.exp(shifted)
        self.A = exp / np.sum(exp, axis=0, keepdims=True)
        return self.A

    def backward(self, dLdZ):
        """Pass the incoming gradient through unchanged.

        ``NLL.backward`` in this notebook already returns ``Ypred - Y``, the
        gradient with respect to this layer's input, so nothing is applied here.
        """
        return dLdZ

    def class_fun(self, Ypred):
        """Return the index of the largest entry in each column of ``Ypred``."""
        return np.argmax(Ypred, axis=0)


class NLL(Module):        # Loss
    """Negative log-likelihood loss for softmax outputs and one-hot targets."""

    def forward(self, Ypred, Y):
        """Cache ``Ypred`` and ``Y`` and return ``-sum(Y * log(Ypred))``.

        The loss is summed over every class and every column. The previous
        expression added a ``(1 - Y) * log(1 - Ypred)`` term and divided by
        the number of classes; that equals this value only for two classes
        and does not match the ``Ypred - Y`` gradient returned by
        ``backward`` for more than two.
        """
        self.Ypred = Ypred
        self.Y = Y
        # Guard against log(0): an exactly-zero probability would otherwise
        # produce 0 * -inf = nan for classes whose target is 0.
        safe_pred = np.maximum(self.Ypred, np.finfo(float).tiny)
        return -np.sum(self.Y * np.log(safe_pred))

    def backward(self):
        """Return ``Ypred - Y``, the loss gradient w.r.t. the softmax input."""
        return self.Ypred - self.Y
        

In [22]:
class Sequential:
    """Feed-forward network: an ordered list of modules plus a loss module."""

    def __init__(self, modules, loss):            # List of modules, loss module
        self.modules = modules
        self.loss = loss

    def sgd(self, X, Y, iters=1000, lrate=0.005):  # Train
        """Train with single-sample stochastic gradient descent.

        Args:
            X: 2-D array whose columns are the training samples.
            Y: array of targets with one column per sample.
            iters: number of SGD iterations to run.
            lrate: step size handed to every module's ``sgd_step``.
        """
        _, N = X.shape
        for it in range(iters):
            # Pick one random column; slicing keeps it 2-D.
            i = np.random.randint(N)
            Xt = X[:, i:i+1]
            Yt = Y[:, i:i+1]

            # feed forward computing Ypred and loss
            Ypred = self.forward(Xt)
            loss = self.loss.forward(Ypred, Yt)

            # backward backpropagation
            delta = self.loss.backward()
            self.backward(delta)
            self.sgd_step(lrate)

            # Report against the full dataset. This previously passed the
            # undefined lowercase name `y` instead of the `Y` argument.
            self.print_accuracy(it, X, Y, loss, every=250)

    def forward(self, Xt):                        # Compute Ypred
        """Run ``Xt`` through every module in order and return the result."""
        for m in self.modules:
            Xt = m.forward(Xt)
        return Xt

    def backward(self, delta):                    # Update dLdW and dLdW0
        """Propagate ``delta`` through the modules in reverse order."""
        # Note reversed list of modules
        for m in self.modules[::-1]:
            delta = m.backward(delta)

    def sgd_step(self, lrate):                    # Gradient descent step
        """Call ``sgd_step(lrate)`` on every module."""
        for m in self.modules:
            m.sgd_step(lrate)

    def print_accuracy(self, it, X, Y, cur_loss, every=250):
        """Print accuracy on (X, Y) and ``cur_loss`` when ``it % every == 1``.

        Accuracy is the fraction of columns where the last module's
        ``class_fun`` gives the same class for the prediction and for ``Y``.
        Meant to be called on each SGD iteration.
        """
        if it % every == 1:
            cf = self.modules[-1].class_fun
            acc = np.mean(cf(self.forward(X)) == cf(Y))
            print('Iteration =', it, '\tAcc =', acc, '\tLoss =', cur_loss)


In [23]:
def super_simple_separable():
    """Return a tiny four-sample, two-feature dataset with two-row labels.

    Returns:
        (X, Y): ``X`` is the 2 x 4 integer feature array (one column per
        sample); ``Y`` is the result of passing the single-row 0/1 label
        array through ``for_softmax``.
    """
    features = np.array([[2, 3, 9, 12],
                         [5, 2, 6, 5]])
    labels = np.array([[1, 0, 1, 0]])
    targets = for_softmax(labels)
    return features, targets

In [24]:
def for_softmax(y):
    """Stack ``1 - y`` on top of ``y`` row-wise.

    For a single-row array of 0/1 labels this gives a two-row array whose
    first row marks the 0-labelled columns and whose second row is ``y``.
    """
    complement = 1 - y
    return np.vstack((complement, y))

### testing

In [25]:
# Small test network: Linear(2,3) -> Tanh -> Linear(3,2) -> SoftMax, with NLL as the loss.
nn_test = Sequential([Linear(2,3), Tanh(), Linear(3,2), SoftMax()], NLL())

In [26]:
# Unpack the four modules and the loss so each forward/backward step
# can be run and inspected by hand in the cells below.
(linear_1, f_1, linear_2, f_2) = nn_test.modules
Loss = nn_test.loss

In [27]:
X, Y = super_simple_separable()
X, Y

(array([[ 2,  3,  9, 12],
        [ 5,  2,  6,  5]]),
 array([[0, 1, 0, 1],
        [1, 0, 1, 0]]))

#### forward pass

In [28]:
z_1 = linear_1.forward(X)
z_1

array([[ 4.59071745,  1.60818575,  4.82455724,  3.55389366],
       [ 0.08357941, -1.17624846, -3.52874537, -5.41496705],
       [ 2.66545998,  2.56776679,  7.70330036,  9.49083632]])

In [29]:
linear_1.W

array([[-0.10368238, -0.54985465,  0.68253763],
       [ 0.95961644,  0.23665774,  0.26007694]])

In [30]:
a_1 = f_1.forward(z_1)
a_1

array([[ 0.99979416,  0.92289148,  0.99987104,  0.99836393],
       [ 0.08338533, -0.82626498, -0.99827961, -0.9999604 ],
       [ 0.99036739,  0.98830101,  0.99999959,  0.99999999]])

In [31]:
z_2 = linear_2.forward(a_1)
z_2

array([[ 0.51121946,  1.37102829,  1.58214494,  1.58302355],
       [-1.36266425, -1.4881951 , -1.56585003, -1.56544737]])

In [32]:
linear_2.W

array([[ 0.52043481, -0.46774467],
       [-0.98937446,  0.17962984],
       [ 0.07410492, -0.91884524]])

In [33]:
a_2 = f_2.forward(z_2)
a_2

array([[0.86690702, 0.9457935 , 0.95882965, 0.95884843],
       [0.13309298, 0.0542065 , 0.04117035, 0.04115157]])

In [34]:
Ypred = a_2

In [35]:
loss = Loss.forward(Ypred, Y)
loss

5.304497384106767

#### backward pass

In [36]:
dloss = Loss.backward()
dloss

array([[ 0.86690702, -0.0542065 ,  0.95882965, -0.04115157],
       [-0.86690702,  0.0542065 , -0.95882965,  0.04115157]])

In [37]:
dL_dz2 = f_2.backward(dloss)
dL_dz2

array([[ 0.86690702, -0.0542065 ,  0.95882965, -0.04115157],
       [-0.86690702,  0.0542065 , -0.95882965,  0.04115157]])

In [38]:
dL_da1 = linear_2.backward(dL_dz2)
dL_da1

array([[ 0.85665973, -0.05356575,  0.94749578, -0.04066514],
       [-1.01341803,  0.06336763, -1.12087598,  0.04810636],
       [ 0.86079546, -0.05382435,  0.95207005, -0.04086146]])

In [39]:
f_1.A

array([[ 0.99979416,  0.92289148,  0.99987104,  0.99836393],
       [ 0.08338533, -0.82626498, -0.99827961, -0.9999604 ],
       [ 0.99036739,  0.98830101,  0.99999959,  0.99999999]])

In [40]:
v = 1 - np.square(f_1.A)
v

array([[4.11646167e-04, 1.48271321e-01, 2.57897309e-04, 3.26946243e-03],
       [9.93046886e-01, 3.17286179e-01, 3.43782073e-03, 7.91885111e-05],
       [1.91724394e-02, 2.32611068e-02, 8.14813354e-07, 2.28257103e-08]])

In [41]:
# v is (d x n). Allocate a stack of n zero matrices, each d x d,
# to be filled with one diagonal matrix per column of v.
d, n = v.shape
dAdZ = np.zeros((n, d, d))
dAdZ

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]],

       [[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

In [42]:
# Slice i becomes the d x d diagonal matrix built from column i of v.
for i in range(n):
    dAdZ[i, :, :] = np.diag(v[:, i])

In [43]:
dAdZ

array([[[4.11646167e-04, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 9.93046886e-01, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 1.91724394e-02]],

       [[1.48271321e-01, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 3.17286179e-01, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 2.32611068e-02]],

       [[2.57897309e-04, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 3.43782073e-03, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 8.14813354e-07]],

       [[3.26946243e-03, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 7.91885111e-05, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 2.28257103e-08]]])

In [44]:
# np.dot of the (n, d, d) stack with the (d, n) array dL_da1 contracts the
# last axis of the first with the first axis of the second, so the result is
# (n, d, n): every slice of dAdZ is multiplied with every column of dL_da1.
# Only the pairings where slice index == column index are the per-column
# products; the loop a few cells below computes exactly those.
V = np.dot(dAdZ, dL_da1)
V

array([[[ 3.52640693e-04, -2.20501370e-05,  3.90033008e-04,
         -1.67396481e-05],
        [-1.00637162e+00,  6.29270316e-02, -1.11308240e+00,
          4.77718740e-02],
        [ 1.65035488e-02, -1.03194418e-03,  1.82535054e-02,
         -7.83413842e-04]],

       [[ 1.27018070e-01, -7.94226500e-03,  1.40486452e-01,
         -6.02947372e-03],
        [-3.21543534e-01,  2.01056744e-02, -3.55638456e-01,
          1.52634841e-02],
        [ 2.00230552e-02, -1.25201407e-03,  2.21462032e-02,
         -9.50482757e-04]],

       [[ 2.20930238e-04, -1.38144636e-05,  2.44356613e-04,
         -1.04874296e-05],
        [-3.48394951e-03,  2.17846566e-04, -3.85337068e-03,
          1.65381052e-04],
        [ 7.01387638e-07, -4.38568032e-08,  7.75759392e-07,
         -3.32944623e-08]],

       [[ 2.80081680e-03, -1.75131218e-04,  3.09780187e-03,
         -1.32953141e-04],
        [-8.02510649e-05,  5.01798858e-06, -8.87604998e-05,
          3.80947126e-06],
        [ 1.96482679e-08, -1.22857913

In [45]:
dldz1 = np.zeros((d, n))

In [46]:
# Column i of the result is slice i of dAdZ (a diagonal matrix) times
# column i of dL_da1, i.e. one matrix-vector product per column.
for i in range(n):
    dldz1[:, i] = np.dot(dAdZ[i,:,:], dL_da1[:, i])

In [47]:
dldz1

array([[ 3.52640693e-04, -7.94226500e-03,  2.44356613e-04,
        -1.32953141e-04],
       [-1.00637162e+00,  2.01056744e-02, -3.85337068e-03,
         3.80947126e-06],
       [ 1.65035488e-02, -1.25201407e-03,  7.75759392e-07,
        -9.32691820e-10]])