In [1]:
def importData():
    """Load California housing, standardise it, and return a train/test split.

    Steps, in order:
      1. Fetch the dataset as a DataFrame (features plus target column).
      2. Seed NumPy's global RNG with 1 (a side effect visible to callers).
      3. Standardise every column of the frame, target included. The scaler
         is fit on the full frame, i.e. before the train/test split.
      4. Use all columns except the last two as features and the last
         column as the target.
      5. Split 80/20 with ``random_state=9`` and prepend a column of ones
         (bias term) to both feature matrices.

    Returns:
        (X_train, y_train, X_test, y_test) as NumPy arrays.
    """
    import numpy as np
    from sklearn import datasets
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    housing = datasets.fetch_california_housing(as_frame=True)
    np.random.seed(1)

    n_columns = len(housing.frame.columns)
    scaled = StandardScaler().fit_transform(housing.frame)

    # The second-to-last column (Longitude) is left out on purpose:
    # Latitude is considered to carry enough information.
    features = scaled[:, 0:n_columns - 2]
    target = scaled[:, n_columns - 1]

    def with_bias(matrix):
        # Insert a constant-1 column at position 0.
        return np.insert(matrix, 0, np.ones(matrix.shape[0]), axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=9
    )
    return with_bias(X_train), y_train, with_bias(X_test), y_test

In [34]:
import numpy as np

def dot(W, x):
    """Compute ``W @ x`` and return it with its vector-Jacobian product.

    Args:
        W: weight matrix.
        x: input vector.

    Returns:
        (value, vjp) where ``value = np.dot(W, x)`` and ``vjp(u)`` returns
        the pair ``(vjp_wrt_x, vjp_wrt_W)`` -- note x comes first.
    """
    def vjp(u):
        grad_x = W.T.dot(u)       # cotangent pulled back onto x
        grad_W = np.outer(u, x)   # cotangent pulled back onto W
        return grad_x, grad_W

    return np.dot(W, x), vjp

def relu(x):
    """Element-wise ReLU together with its vector-Jacobian product.

    Args:
        x: input array (or scalar).

    Returns:
        (value, vjp) where ``value = max(0, x)`` element-wise and ``vjp(u)``
        returns a 1-tuple ``(vjp_wrt_x,)``. The derivative is taken to be 1
        where ``x >= 0`` (including exactly 0) and 0 elsewhere.
    """
    value = np.maximum(0, x)

    def vjp(u):
        # Vectorised mask instead of np.vectorize over a Python lambda:
        # np.vectorize raises on size-0 input and loops in Python.
        active = np.asarray(x) >= 0
        vjp_wrt_x = u * active
        # Returned as a 1-tuple so callers can unpack with a trailing comma.
        return (vjp_wrt_x,)

    return value, vjp

def initialiseMLP_random(inputfeatures, layers, unif=False, verbose=False):
    """Draw a random MLP (weights, input and output cotangent) for testing.

    Each layer width is drawn from {2, ..., 7} using NumPy's global RNG.

    Args:
        inputfeatures: length of the input vector x.
        layers: number of weight matrices to create.
        unif: if True, sample everything from U(-1, 1); otherwise weights
            come from ``np.random.rand`` and x, u from U(0, 1).
        verbose: if True, print the shapes of u, every W and x.

    Returns:
        (x, W, u) where W is ordered outermost layer first, i.e.
        ``[Wk, ..., W2, W1]``, and u has the size of the network output.
    """
    dims = np.random.choice(list(range(2, 8)), layers)

    def draw_matrix(rows, cols):
        if unif:
            return np.random.uniform(-1, 1, size=(rows, cols))
        return np.array(np.random.rand(rows, cols))

    # Built innermost-first (W1, W2, ...) so the RNG draw order is fixed,
    # then flipped so the outermost layer sits at index 0.
    W = [draw_matrix(dims[0], inputfeatures)]
    for layer in range(1, len(dims)):
        W.append(draw_matrix(dims[layer], dims[layer - 1]))
    W.reverse()

    low = -1 if unif else 0
    x = np.random.uniform(low, 1, inputfeatures)
    u = np.random.uniform(low, 1, dims[-1])

    if verbose:
        print("u=", np.shape(u))
        for i, weight in enumerate(W):
            print("W{i}=".format(i=i), np.shape(weight))
        print("x=", np.shape(x))

    return x, W, u

def mlp2(x, W):
    """Two-layer perceptron ``y = W2 . relu(W1 . x)`` with its VJP.

    Args:
        x: input vector.
        W: two weight matrices ordered outermost first, ``[W2, W1]``.

    Returns:
        (value, vjp) where ``value`` is the network output and ``vjp(u)``
        returns ``(vjp_wrt_x, [vjp_wrt_W2, vjp_wrt_W1])`` -- the weight
        cotangents are listed in the same order as W.
    """
    outer_W, inner_W = W

    # Forward pass, keeping each stage's pullback for the backward pass.
    pre_activation, pull_inner = dot(inner_W, x)
    hidden, pull_relu = relu(pre_activation)
    value, pull_outer = dot(outer_W, hidden)

    def vjp(u):
        # Backward pass: walk the stages in reverse.
        grad_hidden, grad_outer_W = pull_outer(u)
        (grad_pre,) = pull_relu(grad_hidden)
        grad_x, grad_inner_W = pull_inner(grad_pre)
        return grad_x, [grad_outer_W, grad_inner_W]

    return value, vjp

def mlpk(x, W): #W = [Wk, ..., W3, W2, W1]
    """k-layer perceptron ``Wk . relu(... relu(W1 . x))`` with its VJP.

    Built recursively: the innermost two layers are handled by ``mlp2``;
    each extra layer applies ``relu`` and then ``dot`` with its matrix.

    Args:
        x: input vector.
        W: list of weight matrices ordered outermost first,
           ``[Wk, ..., W2, W1]``. Needs at least two entries.

    Returns:
        (value, vjp) where ``vjp(u)`` returns ``(vjp_wrt_x, vjp_wrt_W)`` and
        ``vjp_wrt_W`` is a list aligned index-for-index with ``W``
        (``vjp_wrt_W[j]`` is the cotangent for ``W[j]``).

    Raises:
        IndexError: if W holds fewer than two matrices.
    """
    if len(W) < 3:
        # Base case: exactly the two innermost layers.
        return mlp2(x, [W[-2], W[-1]])

    # Forward: everything below the outermost layer, then relu, then Wk.
    inner_value, vjp_inner = mlpk(x, W[1:])
    activated, vjp_act = relu(inner_value)
    value, vjp_top = dot(W[0], activated)

    def vjp(u):
        vjp_wrt_act, vjp_wrt_Wk = vjp_top(u)
        vjp_wrt_inner, = vjp_act(vjp_wrt_act)
        vjp_wrt_x, vjp_wrt_rest = vjp_inner(vjp_wrt_inner)
        # Wk sits at W[0], so its cotangent must go FIRST; appending it at
        # the end would misalign gradients with weights for 3+ layers.
        return vjp_wrt_x, [vjp_wrt_Wk] + list(vjp_wrt_rest)

    return value, vjp

def squared_loss(y_pred, y):
    """Half sum-of-squares loss ``0.5 * sum((y_pred - y)**2)`` with its VJP.

    Args:
        y_pred: predicted value(s).
        y: target value(s).

    Returns:
        (value, vjp) where ``value`` is the loss (returned as-is from
        ``np.sum``, not wrapped in an array) and ``vjp(u)`` returns
        ``(vjp_wrt_y_pred, vjp_wrt_y)`` in that order.
    """
    diff = y_pred - y
    value = 0.5 * np.sum(diff ** 2)

    def vjp(u):
        # d/dy_pred is +diff, d/dy is -diff; both scaled by the cotangent.
        toward_pred = u * (1 * diff)
        toward_target = u * (-1 * diff)
        return toward_pred, toward_target

    return value, vjp

def loss_i(i, X, y, W):
    """Squared loss of the MLP on sample ``i`` together with its VJP.

    Args:
        i: row index of the sample.
        X: feature matrix; ``X[i]`` is fed to the network.
        y: targets; ``y[i]`` is compared against the prediction.
        W: list of weight matrices, passed unchanged to ``mlpk``.

    Returns:
        (value, vjp) where ``value`` is the loss on sample i and ``vjp(u)``
        returns ``(vjp_wrt_x, vjp_wrt_y, vjp_wrt_W)``.
    """
    x = X[i]
    pred_value, predicted_vjp = mlpk(x, W)
    loss_value, loss_vjp = squared_loss(pred_value, y[i])

    def vjp(u):
        # squared_loss's vjp yields (wrt y_pred, wrt y) in that order.
        # Unpacking them swapped would back-propagate the negated residual
        # and flip the sign of every weight gradient.
        vjp_y_pred, vjp_y = loss_vjp(u)
        vjp_x, vjp_W = predicted_vjp(vjp_y_pred)
        return vjp_x, vjp_y, vjp_W

    return loss_value, vjp

In [35]:
# Load the standardised train/test split produced by importData().
X_train, y_train, X_test, y_test = importData()
# n and d are the two axis lengths of X_train; d is used further down to
# size the first-layer weight matrix.
n, d = X_train.shape

In [38]:
def SGD(niter, step, W):
    """Run plain stochastic gradient descent on the module-level training set.

    Each iteration picks one row of the global ``X_train`` / ``y_train``
    with ``np.random.choice``, evaluates ``loss_i`` on it and takes one
    gradient step on every weight matrix.

    Args:
        niter: number of iterations.
        step: learning rate.
        W: list of weight matrices. Its entries are replaced in place, so
           the caller's list holds the updated weights afterwards.

    Returns:
        List with the per-iteration loss (evaluated before the update).
    """
    n_samples, _ = X_train.shape
    history = []

    for _ in range(niter):
        i = np.random.choice(n_samples, 1)[-1]

        loss_value, pullback = loss_i(i, X_train, y_train, W)
        _, _, grads = pullback(1)

        for k, weight in enumerate(W):
            grad = grads[k]
            # Trace line: weight shape, gradient shape, sample index, step.
            print(weight.shape, grad.shape, i, step)
            W[k] = weight - grad * step

        history.append(loss_value)

    return history


In [39]:
# All-ones starting weights; d is the column count of X_train.
W0 = np.ones((5, d))
# NOTE(review): W1 is created but not included in W below, so this run
# trains a two-matrix network only.
W1 = np.ones((5, 5))
W2 = np.ones((1, 5))
# Weight list handed to SGD: outermost matrix first (W2), then W0.
W = [W2, W0]

# for i in W: print(i.shape)
# One SGD iteration; the cell displays the returned list of losses.
SGD(niter=1, step=0.05, W=W)

(1, 5) (1, 5) 12172 0.05
(5, 8) (5, 8) 12172 0.05


[49.92143895046307]