# Hidden State Activation

In [6]:
import numpy as np

# Horizontal concatenation: place a 3x2 block of ones next to a 3x3 block of
# nines. The constant fills make it obvious which columns came from where.
w_hh = np.full((3, 2), 1)
w_hx = np.full((3, 3), 9)

for matrix in (w_hh, w_hx):
    print(matrix)
    print(matrix.shape)

# Two equivalent spellings for these 2-D inputs: concatenate along the column
# axis, or hstack. The same can be done row-wise with axis=0 or vstack.
w_h1 = np.concatenate((w_hh, w_hx), axis=1)
w_h2 = np.hstack((w_hh, w_hx))

for matrix in (w_h1, w_h2):
    print(matrix)
    print(matrix.shape)

[[1 1]
 [1 1]
 [1 1]]
(3, 2)
[[9 9 9]
 [9 9 9]
 [9 9 9]]
(3, 3)
[[1 1 9 9 9]
 [1 1 9 9 9]
 [1 1 9 9 9]]
(3, 5)
[[1 1 9 9 9]
 [1 1 9 9 9]
 [1 1 9 9 9]]
(3, 5)


In [12]:
# Check that the concatenated form of the hidden-state pre-activation matches
# the separated form:
#     [W_hh | W_hx] @ [h_prev ; x]  ==  W_hh @ h_prev + W_hx @ x
# The weight matrices are joined horizontally; h and x are stacked vertically.

w_hh = np.full((3, 2), 1)
w_hx = np.full((3, 3), 9)
h_t_prev = np.full((2, 1), 1)
x_t = np.full((3, 1), 9)

# Formula 1: a single matrix product over the joined operands.
stack_1 = np.hstack((w_hh, w_hx))
stack_2 = np.vstack((h_t_prev, x_t))
formula_1 = stack_1 @ stack_2

print('formula 1')
print('term1', stack_1)
print('term2', stack_2)
print('output')
print(formula_1)

# Formula 2: two separate products, added elementwise.
mul_1 = w_hh @ h_t_prev
mul_2 = w_hx @ x_t
formula_2 = mul_1 + mul_2

print('formula 2')
print('term1', mul_1)
print('term2', mul_2)
print('output')
print(formula_2)

print("-- Verify --")
print("Results are the same :", np.allclose(formula_1, formula_2))

# Try adding a sigmoid activation function and bias term as a final check
# Activation
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied elementwise.

    Args:
        x: a scalar or array accepted by ``np.exp``.

    Returns:
        The sigmoid of ``x``, with the same shape as ``x``.
    """
    decay = np.exp(-x)
    return 1 / (1 + decay)

# Add the same random bias column to both pre-activations, apply the sigmoid,
# and confirm the two formulas still agree. With pre-activations of 245 the
# sigmoid saturates, which is why every printed entry is 1.
n_rows = formula_1.shape[0]
b = np.random.standard_normal((n_rows, 1))

activated_1 = sigmoid(formula_1 + b)
activated_2 = sigmoid(formula_2 + b)
print("Formula 1 Output:\n", activated_1)
print("Formula 2 Output:\n", activated_2)

all_close = np.allclose(activated_1, activated_2)
print("Results after activation are the same :", all_close)

formula 1
term1 [[1 1 9 9 9]
 [1 1 9 9 9]
 [1 1 9 9 9]]
term2 [[1]
 [1]
 [9]
 [9]
 [9]]
output
[[245]
 [245]
 [245]]
formula 2
term1 [[2]
 [2]
 [2]]
term2 [[243]
 [243]
 [243]]
output
[[245]
 [245]
 [245]]
-- Verify --
Results are the same : True
Formula 1 Output:
 [[1.]
 [1.]
 [1.]]
Formula 2 Output:
 [[1.]
 [1.]
 [1.]]
Results after activation are the same : True


# JAX numpy and perplexity

In [13]:
import numpy
import trax
import trax.fastmath.numpy as np

# Seed Trax's random number generators and NumPy's global generator with the
# same fixed value so the random arrays below are reproducible.
SEED = 32
trax.supervised.trainer_lib.init_random_number_generators(SEED)
numpy.random.seed(SEED)



In [16]:
# Draw a 5x10 matrix of random samples with plain NumPy, then show the values
# followed by the Python type of the container.
numpy_array = numpy.random.random(size=(5, 10))
print(numpy_array, type(numpy_array), sep='\n')

[[0.29043624 0.38836719 0.12730549 0.21281151 0.75505369 0.57887068
  0.63183171 0.9575624  0.09425337 0.12772689]
 [0.40667553 0.22271941 0.53159447 0.83057898 0.60421553 0.18538948
  0.38130853 0.28346655 0.70331245 0.25379794]
 [0.76139101 0.27705392 0.04512082 0.94255551 0.51623923 0.74118788
  0.59841033 0.99493805 0.15958447 0.41761247]
 [0.4993524  0.68251486 0.74356294 0.04570872 0.5659992  0.95207681
  0.86689532 0.20598575 0.61122462 0.4219096 ]
 [0.45027546 0.04400528 0.00684583 0.50835038 0.38884269 0.57835584
  0.1565324  0.8933882  0.01109693 0.76347618]]
<class 'numpy.ndarray'>


In [17]:
# Convert the NumPy matrix with the Trax flavour of numpy (`np` is
# trax.fastmath.numpy in this section). The printed values match the NumPy
# ones up to lower display precision, and the type is a JAX device array.
trax_numpy_array = np.array(numpy_array)
print(trax_numpy_array, type(trax_numpy_array), sep='\n')

[[0.29043624 0.3883672  0.1273055  0.21281151 0.7550537  0.5788707
  0.6318317  0.9575624  0.09425337 0.12772688]
 [0.40667552 0.22271942 0.53159446 0.830579   0.6042155  0.18538949
  0.38130853 0.28346655 0.70331246 0.25379795]
 [0.761391   0.27705392 0.04512082 0.9425555  0.5162392  0.7411879
  0.5984103  0.9949381  0.15958448 0.41761246]
 [0.4993524  0.68251485 0.74356294 0.04570872 0.5659992  0.9520768
  0.8668953  0.20598575 0.6112246  0.4219096 ]
 [0.45027545 0.04400528 0.00684583 0.5083504  0.3888427  0.57835585
  0.1565324  0.8933882  0.01109693 0.7634762 ]]
<class 'jax.interpreters.xla._DeviceArray'>


In [18]:
from trax import layers as tl

# Read the saved arrays from disk with NumPy and convert each one straight
# into a Trax array, so a name never refers to two different array types.
# NOTE(review): predictions are presumably per-class log-probabilities and
# targets the integer class ids - confirm against whatever produced the files.
predictions = np.array(numpy.load('predictions.npy'))
targets = np.array(numpy.load('targets.npy'))

for tensor in (predictions, targets):
    print(tensor.shape)

(32, 64, 256)
(32, 64)


In [22]:
# One-hot encode the targets over as many categories as the last axis of
# `predictions`, so both tensors end up with the same shape.
n_categories = predictions.shape[-1]
reshaped_targets = tl.one_hot(targets, n_categories)
print(reshaped_targets.shape)

(32, 64, 256)


In [28]:
# Multiplying by the one-hot targets zeroes every entry except the one at the
# target id; summing over the last axis then leaves a single value per
# position: the prediction assigned to the target.
masked_predictions = predictions * reshaped_targets
total_log_ppx = np.sum(masked_predictions, axis=-1)
print(total_log_ppx.shape)
total_log_ppx

(32, 64)


DeviceArray([[ -5.396545  ,  -1.0311184 ,  -0.66916656, ...,
              -22.37673   , -23.18771   , -21.843483  ],
             [ -4.5857706 ,  -1.1341286 ,  -8.538033  , ...,
              -20.15686   , -26.837097  , -23.57502   ],
             [ -5.2223887 ,  -1.2824144 ,  -0.17312431, ...,
              -21.328228  , -19.854412  , -33.88444   ],
             ...,
             [ -5.396545  , -17.291681  ,  -4.360766  , ...,
              -20.825802  , -21.065838  , -22.443115  ],
             [ -5.9313164 , -14.247417  ,  -0.2637329 , ...,
              -26.743248  , -18.38433   , -22.355278  ],
             [ -5.670536  ,  -0.10595131,   0.        , ...,
              -23.332523  , -28.087376  , -23.878807  ]], dtype=float32)

In [24]:
# Account for padding: build a mask that is 1.0 wherever the target id is
# non-zero and 0.0 wherever it equals 0 (id 0 is treated as padding here).
is_pad = np.equal(targets, 0)
non_pad = 1.0 - is_pad
print('non_pad has shape:', non_pad.shape)
print('non_pad looks like:', non_pad)

non_pad has shape: (32, 64)
non_pad looks like: [[1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 ...
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]]


In [29]:
# Taking the elementwise product of the per-position log values and the
# non_pad mask zeroes out the padded positions, removing the effect of padding.
real_log_ppx = non_pad * total_log_ppx
print('real perplexity still has shape:', real_log_ppx.shape)

# Before and after masking, for comparison.
for tensor in (total_log_ppx, real_log_ppx):
    print(tensor)

real perplexity still has shape: (32, 64)
[[ -5.396545    -1.0311184   -0.66916656 ... -22.37673    -23.18771
  -21.843483  ]
 [ -4.5857706   -1.1341286   -8.538033   ... -20.15686    -26.837097
  -23.57502   ]
 [ -5.2223887   -1.2824144   -0.17312431 ... -21.328228   -19.854412
  -33.88444   ]
 ...
 [ -5.396545   -17.291681    -4.360766   ... -20.825802   -21.065838
  -22.443115  ]
 [ -5.9313164  -14.247417    -0.2637329  ... -26.743248   -18.38433
  -22.355278  ]
 [ -5.670536    -0.10595131   0.         ... -23.332523   -28.087376
  -23.878807  ]]
[[ -5.396545    -1.0311184   -0.66916656 ...  -0.          -0.
   -0.        ]
 [ -4.5857706   -1.1341286   -8.538033   ...  -0.          -0.
   -0.        ]
 [ -5.2223887   -1.2824144   -0.17312431 ...  -0.          -0.
   -0.        ]
 ...
 [ -5.396545   -17.291681    -4.360766   ...  -0.          -0.
   -0.        ]
 [ -5.9313164  -14.247417    -0.2637329  ...  -0.          -0.
   -0.        ]
 [ -5.670536    -0.10595131   0.         ...

In [30]:
# Log perplexity is the negated mean of the masked log values, averaged over
# the number of non-padding positions only. Perplexity is its exponential.
n_real_tokens = np.sum(non_pad)
log_ppx = -(np.sum(real_log_ppx) / n_real_tokens)
print('log perplexity', log_ppx)
print('perplexity', np.exp(log_ppx))

log perplexity 2.328121
perplexity 10.258647


# Vanilla RNNs, GRUs and the scan function

In [31]:
import numpy as np
from numpy import random
from time import perf_counter

def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), applied elementwise.

    Args:
        x: a scalar or array accepted by ``np.exp``.

    Returns:
        The sigmoid of ``x``, with the same shape as ``x``.
    """
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

In [32]:
# Setup for the forward passes below.
#   emb   : embedding size; each input x_t is a column of shape (emb, 1)
#   h_dim : hidden state size; h_t and every bias have shape (h_dim, 1)
#   W     : each weight matrix has shape (h_dim, emb + h_dim)
#   h_0   : initial hidden state, a vector of zeros
random.seed(10)                 # Fixed seed so the draws below are reproducible

emb = 128                       # Embedding size
T = 256                         # Number of time steps in the sequence
h_dim = 16                      # Hidden state dimension
h_0 = np.zeros((h_dim, 1))      # Initial hidden state

# Random initialization. The draw order is part of the result: three weight
# matrices first, then three biases, then the input sequence.
w_shape = (h_dim, emb + h_dim)
b_shape = (h_dim, 1)
w1, w2, w3 = (random.standard_normal(w_shape) for _ in range(3))
b1, b2, b3 = (random.standard_normal(b_shape) for _ in range(3))
X = random.standard_normal((T, emb, 1))     # One (emb, 1) column per time step
weights = [w1, w2, w3, b1, b2, b3]

In [49]:
def forward_V_RNN(inputs, weights):
    """Run one step of the vanilla RNN cell.

    Args:
        inputs: pair ``(x, h_t)`` holding the current input and the previous
            hidden state. They are concatenated as ``[h_t ; x]`` along the
            first axis before being multiplied by the weight matrix.
        weights: six-element sequence. Only the first entry (weight matrix)
            and the fourth entry (bias) are used; the rest are ignored.

    Returns:
        The new hidden state twice, as an ``(output, next_state)`` pair.
    """
    x, h_prev = inputs
    w_h, _, _, b_h, _, _ = weights

    stacked = np.concatenate([h_prev, x])
    h_next = sigmoid(np.dot(w_h, stacked) + b_h)

    return h_next, h_next

def forward_GRU(inputs, weights):
    """Run one step of the GRU cell.

    The parameter used to be misspelled ``weighst``, so the body silently read
    the notebook-level ``weights`` global and ignored whatever was passed in.
    It now uses its own argument.

    Args:
        inputs: pair ``(x, h_t)`` holding the current input and the previous
            hidden state; both are concatenated along the first axis.
        weights: six-element sequence ``(wu, wr, wc, bu, br, bc)`` holding the
            weight matrices and biases of the update gate, the relevance gate
            and the candidate hidden state, in that order.

    Returns:
        The new hidden state twice, as an ``(output, next_state)`` pair.
    """
    x, h_prev = inputs
    wu, wr, wc, bu, br, bc = weights

    # Both gates read the same [h_prev ; x] column, so build it once.
    h_and_x = np.concatenate([h_prev, x])

    # Update gate
    u = sigmoid(np.dot(wu, h_and_x) + bu)

    # Relevance gate
    r = sigmoid(np.dot(wr, h_and_x) + br)

    # Candidate hidden state: the relevance gate scales the previous state
    # before it is combined with the input.
    c = np.tanh(np.dot(wc, np.concatenate([r * h_prev, x])) + bc)

    # Blend the candidate and the previous state according to the update gate.
    h_next = u * c + (1 - u) * h_prev
    return h_next, h_next

In [64]:
# Single GRU step on the input at index 1 of X, starting from the all-zero
# hidden state h_0; [0] keeps the first element of the returned pair.
forward_GRU([X[1],h_0], weights)[0]

array([[ 9.77779014e-01],
       [-9.97986240e-01],
       [-5.19958083e-01],
       [-9.99999886e-01],
       [-9.99707004e-01],
       [-3.02197037e-04],
       [-9.58733503e-01],
       [ 2.10804828e-02],
       [ 9.77365398e-05],
       [ 9.99833090e-01],
       [ 1.63200940e-08],
       [ 8.51874303e-01],
       [ 5.21399924e-02],
       [ 2.15495959e-02],
       [ 9.99878828e-01],
       [ 9.77165472e-01]])

In [65]:
# Part 3, scan function
# Takes fn, elems (for each time step) x, weights (for fn), and h_0 (initial hidden state)

def scan(fn, elems, weights, h_0=None):
    """Apply ``fn`` to every element in order, threading a state through.

    Args:
        fn: callable invoked as ``fn([x, state], weights)`` that returns an
            ``(output, new_state)`` pair.
        elems: iterable of per-step inputs.
        weights: passed unchanged to every call of ``fn``.
        h_0: state handed to the first call (defaults to None).

    Returns:
        ``(outputs, final_state)``: the list of per-step outputs and the state
        returned by the last call, or ``h_0`` when ``elems`` is empty.
    """
    state = h_0
    outputs = []
    for elem in elems:
        step_output, state = fn([elem, state], weights)
        outputs.append(step_output)
    return outputs, state

In [66]:
# Compare the vanilla RNN with the GRU on the same 256-step sequence. Each one
# needs a forward method plus `scan` to walk through every element.

# Vanilla RNN: time one full forward pass and report it in milliseconds.
tic = perf_counter()
ys, h_T = scan(forward_V_RNN, X, weights, h_0)
toc = perf_counter()

elapsed_seconds = toc - tic
RNN_time = elapsed_seconds * 1000
print(f'It took {RNN_time:.2f}ms for vanilla RNN forward')

It took 6.64ms for vanilla RNN forward


In [68]:
# GRU: time one full forward pass over the same sequence, in milliseconds.
tic = perf_counter()
ys, h_T = scan(forward_GRU, X, weights, h_0)
toc = perf_counter()

elapsed_seconds = toc - tic
GRU_time = elapsed_seconds * 1000
print(f"It took {GRU_time:.2f}ms to run the forward method for the GRU.")

It took 12.58ms to run the forward method for the GRU.


# Creating a GRU model using Trax

In [69]:
import trax
from trax import layers as tl
# A small multilayer perceptron: the layers are listed in the order the data
# flows through them and then combined with tl.Serial.
mlp_layers = [
    tl.Dense(128),
    tl.Relu(),
    tl.Dense(10),
    tl.LogSoftmax(),
]
mlp = tl.Serial(*mlp_layers)

print(mlp)

Serial[
  Dense_128
  Relu
  Dense_10
  LogSoftmax
]


In [74]:
# Layers used to build the GRU model:
#   ShiftRight : shifts the tensor to the right by padding on axis 1. `mode`
#                is the context the model is used in ('train', 'eval' or
#                'predict'; defaults to 'train').
#   Embedding  : maps discrete tokens to vectors. Its weights have shape
#                (vocab_size, d_feature), where d_feature is the number of
#                elements in each word embedding.
#   GRU        : built on GRUCell. n_units should match the number of elements
#                in the word embedding. Consecutive GRU layers are stacked by
#                generating them in a loop, as done below.
#   Dense      : vanilla dense layer.
#   LogSoftmax : log-softmax function.

mode = 'train'
vocab_size = 256
model_dimension = 512
n_layers = 2

GRU = tl.Serial(
    tl.ShiftRight(mode=mode),
    tl.Embedding(vocab_size=vocab_size, d_feature=model_dimension),
    *(tl.GRU(n_units=model_dimension) for _ in range(n_layers)),
    tl.Dense(n_units=vocab_size),
    tl.LogSoftmax(),
)

In [76]:
def show_layers(model, layer_prefix='Serial.sublayers'):
    """Print how many sublayers ``model`` has, then one labelled line per sublayer.

    Args:
        model: any object exposing a ``sublayers`` sequence.
        layer_prefix: text placed before each sublayer's index in its label.
    """
    sublayers = model.sublayers
    print(f'Total layers: {len(sublayers)}\n')
    for index, layer in enumerate(sublayers):
        print('===========')
        print(f'{layer_prefix}_{index}: {layer}\n')

# List every sublayer of the GRU model built above.
show_layers(GRU)

Total layers: 6

Serial.sublayers_0: ShiftRight(1)

Serial.sublayers_1: Embedding_256_512

Serial.sublayers_2: GRU_512

Serial.sublayers_3: GRU_512

Serial.sublayers_4: Dense_256

Serial.sublayers_5: LogSoftmax

