In [16]:
import numpy as np
# Seed NumPy's legacy global RNG so the np.random draws in later cells are repeatable.
np.random.seed(1234)

* = elementwise multiplication

outer = outer product (entry (i, j) of outer(u, x) is u[i] * x[j])

dot = normal dot product

Simple Example

In [17]:
# Fixed toy inputs: a 4x3 weight matrix holding 1..12 row by row,
# a length-3 input vector and a length-4 cotangent vector.
W = np.arange(1, 13).reshape(4, 3)
x = np.asarray([1, 2, 3])
u = np.asarray([-1, 0, 2, 4])

def dot(W, x):
    """Product ``np.dot(W, x)`` paired with its vector-Jacobian product.

    returns:
        value = np.dot(W, x)
        vjp = function of a cotangent u returning the pair
              (cotangent for x, cotangent for W) -- x comes first
    """
    def vjp(u):
        grad_W = np.outer(u, x)   # u[i] * x[j], one entry per entry of W
        grad_x = np.dot(W.T, u)
        return grad_x, grad_W

    return np.dot(W, x), vjp

# Evaluate the toy example: print the product, then the (x, W) cotangents for u.
value, vjp = dot(W, x)
print(value)
print(vjp(u))

[14 32 50 68]
(array([53, 58, 63]), array([[-1, -2, -3],
       [ 0,  0,  0],
       [ 2,  4,  6],
       [ 4,  8, 12]]))


### **Question 1** Implement the relu function and its VJP in the format above. Using the finite difference equation (slide 13), make sure that the VJP is correct numerically.

In [18]:
def relu(x):
    """Elementwise ReLU, ``max(0, x)``, together with its VJP.

    input:
        x = array-like input of any shape (may be empty)
    returns:
        value = np.maximum(0, x)
        vjp = function of a cotangent u returning the 1-tuple (u * g,),
              where g is 1 where x >= 0 and 0 elsewhere
    """
    value = np.maximum(0, x)

    def vjp(u):
        # A boolean mask replaces np.vectorize over a Python lambda: it is
        # evaluated in one array operation and, unlike np.vectorize without
        # otypes, it also accepts size-0 inputs.
        active = np.asarray(x) >= 0
        # The trailing comma matters: callers unpack a 1-tuple.
        return (u * active,)

    return value, vjp


# Small integer example for relu; the prints are left disabled.
x = np.array([1,2,3])
u = np.array([-1,0,2])
value, vjp = relu(x)
# print(value)
# print(vjp(u))

In [19]:
def act_tanh(x):
    """Elementwise tanh together with its VJP.

    input:
        x = array-like input of any shape (may be empty)
    returns:
        value = np.tanh(x)
        vjp = function of a cotangent u returning the 1-tuple
              (u * sech(x)**2,)
    """
    value = np.tanh(x)

    def vjp(u):
        # d/dx tanh(x) = sech(x)**2 = (1 / cosh(x))**2, so the formula is right.
        # np.cosh is applied to the whole array at once (no np.vectorize, which
        # rejects size-0 inputs). cosh overflows to inf for very large |x|;
        # the resulting derivative 0.0 is the correct limit, so the overflow
        # warning is silenced.
        with np.errstate(over="ignore"):
            gdash = (1.0 / np.cosh(x)) ** 2
        return (u * gdash,)

    return value, vjp

# Small integer example for act_tanh; nothing is printed here.
x = np.array([1,2,3])
u = np.array([-1,0,2])
value, vjp = act_tanh(x)

### **Question 2**
Reusing dot and relu, implement a 2-layer MLP with a relu activation

$x\mapsto\text{relu}(W_{1}x)$

In [20]:
def mlp1(x, W1):
    """Single layer ``relu(W1 . x)`` composed from ``dot`` and ``relu``.

    returns:
        value = relu(dot(W1, x))
        vjp = function of a cotangent u returning
              (cotangent for x, cotangent for W1)
    """
    pre_activation, pull_dot = dot(W1, x)
    activation, pull_relu = relu(pre_activation)

    def vjp(u):
        # Walk the forward pass backwards: through relu first, then the product.
        (grad_pre,) = pull_relu(u)
        grad_x, grad_W1 = pull_dot(grad_pre)
        return grad_x, grad_W1

    return activation, vjp

#W1: First layer weights; has shape (D, H)

# Random test problem for mlp1: x has length H (columns of W1) and the
# cotangent u has length D (rows of W1, i.e. the length of the output).
D = 4
H = 3
W1 = np.random.rand(D,H)
x = np.random.rand(H)
u = np.random.rand(D)

# Print the layer output, then the (x, W1) cotangents for u.
val, vjp = mlp1(x, W1)
print(val)
print(vjp(u))


[0.73634365 1.19358144 1.11520082 1.03917841]
(array([1.18333121, 1.02908902, 0.78316235]), array([[0.38355679, 0.39996566, 0.20778331],
       [0.3438387 , 0.35854839, 0.18626692],
       [0.00941022, 0.0098128 , 0.00509778],
       [0.52819835, 0.5507951 , 0.28613964]]))


In [21]:
def mlp2(x, W2, W1):
    """
    Two-layer network with a relu between the two products.

    input:
        x = input data
        W2 = weight matrix of the second (output) product
        W1 = weight matrix of the first product, applied to x
    formula:
        y = W2.q(W1.x)   with q = relu
    returns:
        value = evaluated value according to formula
        vjp = function of a cotangent u returning the tuple
              (d/dx, d/dW1, d/dW2)
    """
    hidden_pre, pull_dot1 = dot(W1, x)
    hidden, pull_relu = relu(hidden_pre)
    output, pull_dot2 = dot(W2, hidden)

    def vjp(u):
        # Undo the forward pass step by step, last operation first.
        grad_hidden, grad_W2 = pull_dot2(u)
        (grad_hidden_pre,) = pull_relu(grad_hidden)
        grad_x, grad_W1 = pull_dot1(grad_hidden_pre)
        return grad_x, grad_W1, grad_W2

    return output, vjp



def mlp3(x, W):
    """
    Three-layer network built on top of mlp2.

    input:
        x = input data
        W = sequence of weight matrices; the last three entries are used,
            with W[-1] applied to x first and W[-3] applied last
    formula:
        W[-3].q(W[-2].q(W[-1].x))   with q = relu
    returns:
        value = evaluated value of the network
        vjp = function of a cotangent u returning the nested pair
              ((d/dx, d/dW[-1], d/dW[-2]), d/dW[-3])
    """
    value, vjp_1 = mlp2(x, W[-2], W[-1])
    value, vjp_2 = relu(value)
    value, vjp_3 = dot(W[-3], value)

    def vjp(u):
        # dot's VJP returns (cotangent for its input, cotangent for its weights),
        # in that order; only the input cotangent is propagated further back.
        vjp_wrt_h, vjp_wrt_Wk = vjp_3(u)
        vjp_wrt_h, = vjp_2(vjp_wrt_h)
        vjp_wrt_x_wrtW = vjp_1(vjp_wrt_h)
        return vjp_wrt_x_wrtW, vjp_wrt_Wk

    return value, vjp
    


In [22]:
# Layer sizes and random weights for the multi-layer experiments.
D, H, C = [3,2,4]

# Shapes chain from the input upwards: x (H), W1 (D,H), W2 (C,D), W3 (H,C),
# W4 (D,H), W5 (D,D).
x = np.random.rand(H)
W1 = np.random.rand(D,H)
W2 = np.random.rand(C,D)
# W3 = np.random.rand(C,C)
# u = np.random.rand(C)
W3 = np.random.rand(H,C)
W4 = np.random.rand(D,H)
W5 = np.random.rand(D,D)
# u has length D, the number of rows of W5 (the last matrix of the five-layer stack).
u = np.random.rand(D)

# Disabled example calls; weight lists are ordered last layer first.
# val, vjp = mlp2(x, W2, W1)
# val, vjp = mlp3(x, [W3, W2, W1])
# val, vjp = mlpk(x, [W5, W4, W3, W2, W1])

# print(val)
# print(vjp(u))


### **Question 3** 
implement the squared loss VJP

In [23]:
def squared_loss(y_pred, y):
    """Half the sum of squared residuals, ``0.5 * sum((y_pred - y)**2)``, with its VJP.

    returns:
        value = the loss wrapped in a length-1 array
        vjp = function of a cotangent u returning
              (cotangent for y_pred, cotangent for y)
    """
    diff = y_pred - y
    half_sse = 0.5 * np.sum(diff ** 2)

    def vjp(u):
        grad_pred = u * (1 * diff)      # d loss / d y_pred = +residual
        grad_target = u * (-1 * diff)   # d loss / d y      = -residual
        return grad_pred, grad_target

    # Every output in this notebook is kept as an array, hence the wrapping.
    return np.array([half_sse]), vjp

# Targets plus a small uniform perturbation in [-0.2, 0.2) as the "prediction".
y = np.random.rand(5)
epsilon = np.random.uniform(-1, 1, 5)/5
y_pred = y + epsilon
# NOTE(review): squared_loss returns a length-1 array, so a cotangent of shape
# (1,) would match its output; this u has length 5 and would be multiplied
# elementwise with the residual instead -- confirm this is intended.
u = np.array([1,2,3,4,5])

val, vjp = squared_loss(y_pred, y)
# print(val)
# print(vjp(u))

### **Question 4**
Implement the loss by composing mlp2 and squared_loss

In [25]:
def loss(x, y, W2, W1):
    """
    Squared loss of the two-layer network mlp2 against a target.

    input:
        x = input data
        y = target
        W2 = weight matrix of the second (output) product
        W1 = weight matrix of the first product, applied to x
    formula:
        squared_loss(mlp2(x, W2, W1), y)
    returns:
        value = loss value (length-1 array, as returned by squared_loss)
        vjp = function of a cotangent u returning the tuple
              (d/dx, d/dy, d/dW1, d/dW2)
    """
    pred_value, predicted_vjp = mlp2(x, W2, W1)
    loss_value, loss_vjp = squared_loss(pred_value, y)
    value = loss_value

    def vjp(u):
        # squared_loss's VJP returns (cotangent for y_pred, cotangent for y),
        # in that order; the y_pred cotangent is what flows back into the network.
        vjp_y_pred, vjp_y = loss_vjp(u)
        vjp_x, vjp_W1, vjp_W2 = predicted_vjp(vjp_y_pred)
        return vjp_x, vjp_y, vjp_W1, vjp_W2

    return value, vjp

# y = np.random.rand(C)
# u = np.random.rand(C)

# val, vjp = loss(x, y, W2, W1)
# print(val)
# print(vjp(u))


### **Question 5** 
Implement an MLP with an arbitrary number of layers.

In [26]:
def initialiseMLP_random(inputfeatures, layers, verbose=False):
    """
    Build a random test problem for an MLP with randomly chosen layer widths.

    All random draws come from NumPy's global RNG, so calling
    np.random.seed beforehand makes the result reproducible.

    input:
        inputfeatures = length of the input vector x
        layers = number of weight matrices (at least 1)
        verbose = if True, print the shapes of x, every matrix and u
    returns:
        x = random input vector of length inputfeatures
        W = list of weight matrices ordered Wk, ..., W2, W1 (last layer first);
            each layer has between 2 and 10 rows
        u = random cotangent whose length equals the number of rows of Wk
    raises:
        ValueError if layers is smaller than 1
    """
    if layers < 1:
        raise ValueError("layers must be at least 1")

    # Output width of every layer, drawn uniformly from 2..10 (upper bound exclusive).
    dims = np.random.randint(2, 11, size=layers).tolist()
    # Layer i maps from the previous layer's width (or the input size) to dims[i].
    fan_in = [inputfeatures] + dims[:-1]
    W = [np.random.rand(rows, cols) for rows, cols in zip(dims, fan_in)]

    # Built input-first; callers expect the last layer first.
    W.reverse()
    x = np.random.random(inputfeatures)
    u = np.random.rand(dims[-1])

    if verbose:
        print(np.shape(x))
        for i in W:
            print(np.shape(i))
        print(np.shape(u))

    return x, W, u

In [28]:
def mlpk(x, W):
    """
    MLP with an arbitrary number (at least three) of layers, built recursively.

    input:
        x = input data
        W = list of at least three weight matrices ordered Wk, ..., W2, W1
    formula:
        Wk.q(Wk-1.q(...W2.q(W1.x)))   with q = relu
    returns:
        value = evaluated value of network
        vjp = function of a cotangent u returning the nested pair
              (inner, d/dWk), where inner is the VJP result of the first
              k-1 layers: the tuple (d/dx, d/dW1, d/dW2) from mlp2 when k = 3,
              and another (inner, d/dWk-1) pair for every additional layer
    raises:
        ValueError if W holds fewer than three matrices
    """
    if len(W) < 3:
        # With fewer matrices the code below would index W[0] twice or fall off
        # the list; fail early with a clear message instead.
        raise ValueError("mlpk needs at least three weight matrices, got %d" % len(W))

    if len(W) > 3:
        # Everything but the outermost matrix forms a (k-1)-layer network.
        value, vjp_1 = mlpk(x, W[1:])
    else:
        value, vjp_1 = mlp2(x, W[-2], W[-1])

    value, vjp_2 = relu(value)
    value, vjp_3 = dot(W[0], value)

    def vjp(u):
        # dot's VJP returns (cotangent for its input, cotangent for its weights).
        vjp_wrt_x, vjp_wrt_Wk = vjp_3(u)
        vjp_wrt_x, = vjp_2(vjp_wrt_x)
        vjp_wrt_x_wrtW = vjp_1(vjp_wrt_x)
        return vjp_wrt_x_wrtW, vjp_wrt_Wk

    return value, vjp


In [34]:
# Random 7-layer network on a 4-feature input.
x, W, u = initialiseMLP_random(4, 7, verbose=False)
val, vjp = mlpk(x, W)

print(val)
# mlpk's vjp returns a pair, so vjp_output[0] is the result for the inner
# layers and vjp_output[1] is the cotangent for the outermost matrix.
vjp_output = list(vjp(u))
# print("vjp_x", vjp_output[0], sep='\n')
print(len(vjp_output[0]))
# i is only used by the disabled loop below.
i=0
# for w in vjp_output[1]:
    # print("W{i}".format(i=i), w, sep='\n')
    # i=i+1


[68.51259253 31.78881603 71.08672357]
2


Check implementation by checking gradient

In [4]:
import scipy
def _mean_directional_grad_error(problem, n, d, eps=10.0 ** -7.0):
    """Mean gap between finite-difference and analytic directional derivatives of f_i.

    Repeats n times: pick a random index in [0, n), a random point w and a
    random direction vec (both of length d), and compare
    (f_i(w + eps*vec) - f_i(w)) / eps with grad_i(w) . vec.
    """
    errors = []
    for _ in range(n):
        ind = np.random.choice(n, 1)[0]
        w = np.random.randn(d)
        vec = np.random.randn(d)
        finite_diff = (problem.f_i(ind, w + eps * vec) - problem.f_i(ind, w)) / eps
        errors.append(finite_diff - np.dot(problem.grad_i(ind, w), vec))
    return np.mean(errors)


def check_grad_calculations(pblinreg, pblogreg, n, d):
    """Print gradient-check errors for two problem objects.

    For each of pblinreg and pblogreg (in that order) two numbers are printed:
    scipy's check_grad error of problem.grad against problem.fun at a random
    point, then the mean finite-difference error of problem.grad_i against
    problem.f_i over n random (index, point, direction) triples.

    input:
        pblinreg, pblogreg = objects providing fun(w), grad(w), f_i(i, w)
                             and grad_i(i, w)
        n = number of component functions; also the number of random trials
        d = dimension of the parameter vector
    returns:
        None (results are printed)
    """
    from scipy.optimize import check_grad

    for problem in (pblinreg, pblogreg):
        print(check_grad(problem.fun, problem.grad, np.random.randn(d)))
        print(_mean_directional_grad_error(problem, n, d))





In [44]:
def mlp2(x, W1, W2):
    """
    Two stacked mlp1 layers.

    NOTE: this redefines the mlp2(x, W2, W1) from the earlier cell with the
    weight arguments in the opposite order and an extra relu on the output.
    Functions that call mlp2 with the earlier argument order (mlp3, mlpk, loss)
    will pick up this version if they are run after this cell.

    input:
        x = input data
        W1 = weight matrix of the first layer, applied to x
        W2 = weight matrix of the second layer
    formula:
        q(W2.q(W1.x))   with q = relu
    returns:
        value = evaluated value according to formula
        vjp = function of a cotangent u returning the tuple
              (d/dx, d/dW1, d/dW2)
    """
    hidden, pull_layer1 = mlp1(x, W1)
    output, pull_layer2 = mlp1(hidden, W2)

    def vjp(u):
        # Second layer first, then feed its input cotangent to the first layer.
        grad_hidden, grad_W2 = pull_layer2(u)
        grad_x, grad_W1 = pull_layer1(grad_hidden)
        return grad_x, grad_W1, grad_W2

    return output, vjp


    

# Random test problem for the mlp2 defined in this cell: x (H), W1 (D,H),
# W2 (C,D), and a cotangent u of length C (rows of W2).
D, H, C = [3,2,4]
x = np.random.rand(H)
W1 = np.random.rand(D,H)
W2 = np.random.rand(C,D)
u = np.random.rand(C)

# Print the network output, then the (x, W1, W2) cotangents for u.
val, vjp = mlp2(x, W1, W2)
print(val)
print(vjp(u))

[1.24494797 0.58107635 1.54771172 1.06925227]
(array([1.98184065, 2.69573953]), array([[0.57068706, 0.45386658],
       [0.57159938, 0.45459214],
       [0.60645832, 0.48231541]]), array([[0.54556394, 0.56780473, 0.26933486],
       [0.3427808 , 0.35675481, 0.16922456],
       [0.69600074, 0.72437433, 0.34360274],
       [0.0898297 , 0.09349175, 0.04434727]]))


### **Question 6**  
Implement SGD to train your MLP on a dataset of your choice. Study the impact of depth (number of layers) and width (number of hidden units).

In [1]:
""" 
SGD: x_{k+1} = x_k - a_k * grad f_i(x_k)   (step against the gradient of a randomly chosen f_i; a_k > 0 is the step size)
"""


[2, 3, 4]
