In [1]:
import numpy as np

Minimal Gated Unit in Numpy

    Activation Functions

In [2]:
def sigmoid(x, derivative = False):
    
    z = 1/ (1 + np.exp(-x)) 
    
    if derivative:
        return z * (1 - z)
    else: 
        return z

def tanh(x, derivative = False):
    
    z = np.tanh(x)
    
    if derivative:
        return 1 - np.square(z)
    else:
        return z

    Initialization

In [3]:
def init_mru(hidden_size, output_size, batch_size):
    
    
    
    # Forget gate weights and biase
    U_f = np.random.randn(hidden_size,hidden_size) * 0.1 - 0.05
    W_f = np.random.randn(hidden_size,batch_size) * 0.1 - 0.05
    b_f = np.zeros((hidden_size,1))
    
    # Hidden_hat weights
    U_h = np.random.randn(hidden_size,hidden_size) * 0.1 - 0.05
    W_h = np.random.randn(hidden_size,batch_size) * 0.1 - 0.05
    b_h = np.zeros((hidden_size,1))
    
    # Output weights
    W_y = np.random.rand(output_size, hidden_size) * 0.1 - 0.05
    b_y = np.random.rand(output_size, 1)
    
    # Previous Hidden
    prev_hidden = np.zeros((hidden_size,batch_size))
    
    params = [W_f, U_f, W_h, U_h, W_y, b_f, b_h, b_y]
    
    return prev_hidden, params
    

    Forward Pass Cell

In [39]:
def mru_cell_forward(x, prev_hidden, params):
    
    # unpack parameters
    W_f, U_f, W_h, U_h, W_y, b_f, b_h, b_y = params
    
    # Calculate forget gate, expand dim to preserve dim size
    f = sigmoid(np.expand_dims(np.dot(W_f, x),1) + np.dot(U_f, prev_hidden) + b_f)

    # Calculate hidden hat, expand dim to preserve dim size
    h_hat = tanh(np.expand_dims(np.dot(W_h,x),1) + np.dot(U_h, np.multiply(f, prev_hidden)) + b_h)
    
    # Calculate hidden
    hidden = np.multiply((1 - f),prev_hidden) + np.multiply(f,h_hat)
    
    # Calculate prediction
    y_pred = np.dot(W_y, hidden) + b_y
    
    
    cache = (x, prev_hidden, f, h_hat, hidden, y_pred, params)
    
    return hidden, y_pred, cache
        
        
        

    Forward Pass

In [42]:
def mru_forward(x, h0, params):
    
    caches = []
    
    # Shape of input: num_batches x num_timesteps
    n, T_x = x.shape
    h_size, n = h0.shape
    # Initialize hidden
    h_t = h0 
    
    # Initialize h_t and yt_pred storage
    h = np.zeros((h_size,n,T_x))
    
    # -- batch size x timesteps
    y_pred = np.zeros((n,T_x))
    # Iterates through all timesteps
    for t in range(T_x):
        print(f'Time Step: {t}')
        # get x's at timestep 
        x_t = x[:, t]
        print()
        # Compute forward propagation: new hidden state, y_pred, and cache
        h_t, yt_pred, cache = mru_cell_forward(x_t, h_t, params)
        
        # Save predicted y
        y_pred[:,t] = yt_pred.reshape(-1)
        
        # Save next hidden state
        h[:,:,t] = h_t
        
        # Save cache for backpropagation
        caches.append(cache)
    
    caches = (caches, x)
    
    return h, y_pred, cache
        
    

    Backward Pass Cell

In [56]:
def mru_cell_backward(dh_next, dy, cache):

    # Retrieving values from cache
    (x, h_prev, f, h_hat, hidden, y_pred, params) = cache
    W_f, U_f, W_h, U_h, W_y, b_f, b_h, b_y = params
    
    # Compute derivatives for y parameters
    dW_y = np.dot(dy, np.transpose(h_prev))
    db_y = dy
    # Intermediate derivatives
    dh = dh_next
    dh_hat = np.multiply(dh,f)
    dh_hat_1 =  dh_hat * tanh(h_hat, derivative=True)
    
    # Compute derivatives for hidden parameters
    dW_h = np.dot(dh_hat_1, np.transpose(x))
    dU_h = np.dot(dh_hat_1, np.transpose(np.multiply(f, h_prev)))
    db_h = dh_hat_1
    
    # Intermediate derivatives
    dfhp = np.dot(np.transpose(U_f),dh_hat_1)
    df = np.multiply(dfhp, h_prev)
    df_1 = df * sigmoid(f, derivative=True)
    
    # Compute derivatives for forget gate parameters
    dW_f = np.dot(df_1, np.transpose(x))
    dU_f = np.dot(df_1, np.transpose(h_prev))
    db_f = df_1
    
    # all influence of previous later to loss
    # compute input and prev hidden derivative
    dh_prev = 0
    dh_prev += np.multiply(dh,(1-f))
    dh_prev += np.dot(np.transpose(U_f), df_1)
    dh_prev += np.multiply(dfhp,f)
    
    grads = (dW_h, dU_h, db_h, dW_f, dU_f, db_f, dW_y, db_y)
    
    return dh_prev, grads
    
    
    

    Backward Pass

In [57]:
def backward_pass(dy, lr, params, caches):
    
    W_f, U_f, W_h, U_h, W_y, b_f, b_h, b_y  = params
    
    (caches, x) = caches
    
    # Get shape of input
    n, T_x = x.shape
    
    # Initialize gradients with correct sizes
    dW_h = np.zeros_like(W_f)
    dU_h = np.zeros_like(U_f)
    db_h = np.zeros_like(b_h)
    dW_f = np.zeros_like(W_h)
    dU_f = np.zeros_like(U_h)
    db_f = np.zeros_like(b_f)
    dW_y = np.zeros_like(W_y)
    db_y = np.zeros_like(b_y)
    
    # Compute original dh_next derivative
    dh_next = np.dot(dy,W_y)
    
    # Compute derivates of derivable parameters  for whole sequence
    for i in reversed(range(T_x)):
        
        dh_next, grads = mru_cell_backward(dh_next, dy, caches[i])
        
        partial_dW_h, partial_dU_h, partial_db_h, partial_dW_f, partial_dU_f, partial_db_f, partial_dW_y, partial_db_y = grads
        
        # add each steps gradient to self
        dW_h += partial_dW_h
        dU_h += partial_dU_h
        db_h += partial_db_h
        dW_f += partial_dW_f
        dU_f += partial_dU_f
        db_f += partial_db_f
        dW_y += partial_dW_y
        db_y += partial_db_y
    
    #Adjust parameters
    W_f += lr * dW_f
    U_f += lr * dU_f
    W_h += lr * dW_h
    U_h += lr * dU_h
    W_y += lr * dW_y
    b_f += lr * db_f
    b_h += lr * db_f
    b_y += lr * db_y
    
    params = [W_f, U_f, W_h, U_h, W_y, b_f, b_h, b_y]
    
    return params

    Loss Function

In [58]:
def lossFun(predicted, targets):
    
    assert(predicted.shape == targets.shape)
    n, T_x = predicted.shape
    
    sequence_loss = np.multiply(np.divide(1,n), np.sum(np.square(np.subtract(targets,predicted)),0))
    
    dy = np.multiply(np.divide(2,n), np.sum(np.subtract(targets,predicted),0))
    
    print(f'Loss : {sequence_loss}')
    
    return dy
    
    
    

    Model

In [59]:
def MRU_train(x, y, params, prev_hidden, iters, lr):
    for i in range(iters):
        hiddens, preds, caches = mru_forward(x, prev_hidden, params)
        dy = lossFun(preds, y)
        params = mru_cell_backward(lr, params, caches)
    return params   
        
def MRU_test(x, params, prev_hidden):
    hiddens, preds, cache = mru_forward(x, prev_hidden, params)
    return preds[-1]

    Testing

In [60]:
hidden_size, output_size, batch_size = 5, 1, 3
lr = 0.1
prev_hidden, params = init_mru(hidden_size, output_size, batch_size)

In [61]:
x = np.ones((batch_size,10))
y = np.ones((batch_size,10))
MRU_train(x, y, params, prev_hidden, 10, lr)

Time Step: 0

(3,)
Time Step: 1

(3,)
Time Step: 2

(3,)
Time Step: 3

(3,)
Time Step: 4

(3,)
Time Step: 5

(3,)
Time Step: 6

(3,)
Time Step: 7

(3,)
Time Step: 8

(3,)
Time Step: 9

(3,)
Loss : [0.6380024  0.65215649 0.6598964  0.66410357 0.66638808 0.66763017
 0.66830703 0.66867683 0.66887938 0.6689906 ]


ValueError: not enough values to unpack (expected 7, got 5)

In [87]:
W_f, U_f, U_h, W_h, W_y, b_f, b_h, b_y = params

In [44]:
preds

array([[0.78727981, 0.78635477, 0.78583911, 0.78556823, 0.78543314,
        0.78536888, 0.78533975, 0.78532727, 0.78532233, 0.78532062],
       [0.78727981, 0.78635477, 0.78583911, 0.78556823, 0.78543314,
        0.78536888, 0.78533975, 0.78532727, 0.78532233, 0.78532062],
       [0.78727981, 0.78635477, 0.78583911, 0.78556823, 0.78543314,
        0.78536888, 0.78533975, 0.78532727, 0.78532233, 0.78532062]])

In [46]:
c

(array([[-0.41335591, -0.41335591, -0.41335591],
        [-0.24835662, -0.24835662, -0.24835662],
        [ 0.05868696,  0.05868696,  0.05868696],
        [ 0.18363383,  0.18363383,  0.18363383],
        [-0.25645071, -0.25645071, -0.25645071]]),
 array([[0.46692244, 0.46692244, 0.46692244],
        [0.57336718, 0.57336718, 0.57336718],
        [0.45574459, 0.45574459, 0.45574459],
        [0.43750763, 0.43750763, 0.43750763],
        [0.51509687, 0.51509687, 0.51509687]]),
 array([[-0.41448969, -0.41448969, -0.41448969],
        [-0.24834578, -0.24834578, -0.24834578],
        [ 0.05871416,  0.05871416,  0.05871416],
        [ 0.18506013,  0.18506013,  0.18506013],
        [-0.25647662, -0.25647662, -0.25647662]]),
 array([[-0.41388529, -0.41388529, -0.41388529],
        [-0.2483504 , -0.2483504 , -0.2483504 ],
        [ 0.05869936,  0.05869936,  0.05869936],
        [ 0.18425784,  0.18425784,  0.18425784],
        [-0.25646405, -0.25646405, -0.25646405]]),
 array([[0.78532062, 0.7853