In [1]:
from keras.callbacks import ModelCheckpoint
from tensorflow.keras import regularizers
from keras.models import load_model
import tensorflow.keras.backend as K
import sklearn.gaussian_process as gp
import matplotlib.pyplot as plt
from keras import models
from keras import layers
import tensorflow as tf
import numpy as np
import matplotlib 
import time
import keras
import os
from data_generation import *
from batch_creator import *
from gp_kernels import *
from gp_priors import *
from gp_plots import *

Using TensorFlow backend.


In [None]:
plot_gp_prior(4, n=20)

In [2]:
def create_batch_gp_mim_2(pos, tar, pos_mask, batch_s=128):
    '''
    Get a batch of positions, targets and position mask from data generated 
    by data_generator_for_gp_mimick_gpt function and from position_mask function 
    -------------------------
    Parameters:
    pos (2D np array): 1st/2nd output from data_generator_for_gp_mimick_gpt function 
    tar (2D np array): 3rd/4th output from data_generator_for_gp_mimick_gpt function  
    pos_mask (4D np.array): output from position_mask function 
    batch_s (int): deafult 128
    -------------------------
    Returns:
    batch_tar_tr (2D np array)
    batch_pos_tr (2D np array)
    batch_pos_mask (4D np array)
    batch_idx_tr (1D np array): indices (=row numbers) chosen for current batch
    
    '''
    shape = tar.shape[0]
    batch_idx_tr = np.random.choice(list(range(shape)), batch_s)
    batch_tar_tr = tar[batch_idx_tr, :]
    batch_pos_tr = pos[batch_idx_tr, :]
    batch_pos_mask = pos_mask[batch_idx_tr, :, :, :]
    return batch_tar_tr, batch_pos_tr, batch_pos_mask, batch_idx_tr

In [16]:
def data_generator_for_gp_mimick_gpt(num_obs, kernel, tr_percent=0.8):
    '''
    Generator for training a GPT inspired netowrk. Make sure x is drawn in a range that 
    Doesn't include 0 --> 0 is used for padding.
    -----------------------
    Parameters:
    num_obs (int): how many observation to generate
    kernel (function of am SKlearn kernel object): e.g. rbf_kernel which comes from gp_kernels file
    tr_percent (float): daefult 0.8
    -----------------------
    Returns:
    pad_pos_tr (np array): the first rows * tr_percent from the x generated values padded by zeros according to obs_per_sample  
    pad_pos_te (np array): all rows of x not chosen for training 
    pad_y_fren_tr (np array): the first rows * tr_percent from the f_prior generated values padded by zeros according to obs_per_sample  
    pad_y_fren_te (np array): all rows of f_prior not chosen for training 
    '''
    df = np.zeros((num_obs * 2, 59))
    for i in range(0, num_obs * 2, 2):
        x = np.random.uniform(5, 15, size=(1, 59))
        k = kernel(x)
        f_prior = generate_priors(k, 59, 1)

        df[i, :x.shape[1]] = x
        df[i + 1, :x.shape[1]] = f_prior

    rows = df.shape[0]
    cols = df.shape[1]
    tr_rows = int(tr_percent * rows)
    tr_rows = tr_rows if tr_rows % 2 == 0 else tr_rows + 1
    df_tr = df[:tr_rows, :]
    df_te = df[tr_rows:, :]
    
    # get all even rows
    pad_pos_tr = df_tr[::2, :]
    pad_pos_te = df_te[::2, :]
    # get all odd rows
    pad_y_fren_tr = df_tr[1::2, :]
    pad_y_fren_te = df_te[1::2, :]

    return pad_pos_tr, pad_pos_te, pad_y_fren_tr, pad_y_fren_te, df_tr, df_te

In [17]:
def position_mask(arr):
    '''
    This tries to emulate the kernel matrix. 
    In the first stage we have a 2X2 matrix of zeros, next
    3X3 matrix of zeros, etc.
    -------------------------
    Parameters:
    arr (np array): the 1st/2nd output from data_generator_for_gp_mimick_gpt function
    -------------------------
    Returns:
    mask (4D np array): if there are 100 rows and 50 cols in arr then this will 
    return [100, 49, 50, 50] array -- where the first dim is observation number 
    second dim is timestamp and third+fourth dim are the mask matrix.
    '''
    rows = arr.shape[0]
    cols = arr.shape[1]
    mask = np.ones((rows, cols - 1, cols, cols))
    specific = np.sum(np.equal(arr, 0), 1)
    for i in range(2, cols + 1):
        mask[:, i - 2, :i, :i] = np.zeros((i, i))
    for j in range(rows):
        k  = specific[j]
        mask[j, k:, :, :] = 1
            
    return mask

In [18]:
def create_padding_mask(seq):
    '''
    Used to pad sequences that have zeros where there was no event.
    Typically this will be combined with create_look_ahead_mask function.
    This function is used inside an open session of tensorflow. 
    To try it out create a tf.constant tensor.
    -------------------
    Parameters:
    seq (tensor): shape is (batch_size, seq_len)
    
    -------------------
    Returns:
    A binary tensor  (batch_size, 1, seq_len): 1 where there was no event and 0 otherwise.
    
    '''
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
  
  # add extra dimensions to add the padding
  # to the attention. Extra dimension is used in create_masks function
    return seq[:, tf.newaxis, :]  

In [19]:
def create_tar_mask(size):
    '''
    '''
    mask = tf.linalg.diag(tf.ones(size, size))
    return mask

In [20]:
create_tar_mask(3)

<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]], dtype=int32)>

In [21]:
def create_look_ahead_mask(size):
    '''
    Hide future outputs from a decoder style network.
    Used typically together with create_padding_mask function
    -----------------------
    Parameters:
    size (int): max sequnce length 
    
    -----------------------
    Returns:
    mask (tensor): shape is (seq_len X seq_len). Example: if size is 4, returns
    0 1 1 1
    0 0 1 1
    0 0 0 1
    0 0 0 0 
    where 1 signifies what to hide.
    '''
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

In [22]:
def create_masks(tar):
    '''
    Create unified masking hiding future from current timestamps and hiding paddings. 
    -------------------
    Parameters: 
    tar (tensor): batch of padded target sequences 
    -------------------
    Returns: 
    combined_mask_tar  (tensor): shape is batch_size X max_seq_len X max_seq_len
    '''
    
    tar_padding_mask = create_padding_mask(tar)
    ## this will be batch_size X 1 X 40

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    # if max seq length is 40 -- > this will be 40X40 
    
    
    ## This will also be (64, 40, 40)
    combined_mask_tar = tf.maximum(tar_padding_mask, look_ahead_mask)
    
    
    return combined_mask_tar

In [23]:
pad_pos_tr, pad_pos_te, pad_y_fren_tr, pad_y_fren_te, _, df_te = data_generator_for_gp_mimick_gpt(10000, rbf_kernel)

In [24]:
pp = position_mask(pad_pos_tr)

In [25]:
loss_object = tf.keras.losses.MeanSquaredError()

In [26]:
optimizer = tf.keras.optimizers.Adam(learning_rate= 0.1)
train_loss = tf.keras.metrics.Mean(name='train_loss')

In [27]:
def loss_function(real, pred):
    '''
    Masked MSE. Since the target sequences are padded, 
    it is important to apply a padding mask when calculating the loss.
    ----------------
    Parameters:
    real (tf.tensor float64): shape batch_size X max_seq_len. True values of sequences.
    pred (tf.tensor float64): shape batch_size X max_seq_len. Predictions from GPT network. 
    
    ----------------
    Returns: 
    loss value (tf.float64)
    '''
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)
    
#     print('loss_ :', loss_)
#     shape= (128X58)
    
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_sum(loss_) / tf.reduce_sum(mask)

In [60]:
def dot_prod_position(q, k, v, mask):
    '''
    Used to create a pseudo XX^T covariance matrix for each 
    positional sequence in the batch.
    ------------------
    Parameters: 
    q : shape (batch_size X max_seq_len X 1). Position outptut from create_batch_gp_mim_2 function (or after another Dense layer) 
    k : shape (batch_size X max_seq_len X 1). Position outptut from create_batch_gp_mim_2 function (or after another Dense layer) 
    mask: shape (batch_size X max_seq_len X max_seq_len X max_seq_len). The positional mask created by position_mask function and selected in batch indices 
    
    ------------------
    Returns:
    nl_qk (tf.tensor float64): shape (batch_size X max_seq_len X max_seq_len X max_seq_len).
    Each observation (1st dim) has seq_len timestamps (2nd dim) and each timestamp has an associated
    seq_len X seq_len pseudo covariance matrix (3rd & 4th dims) masked according to the timestamp.
    
    '''
    qk = tf.matmul(q, k, transpose_b = True)
    qk = tf.cast(qk[:, tf.newaxis, :, :], tf.float64)
    print('qk1: ', qk)
#     shape=(128, 1, 59, 59)

    print('pos_mask: ', mask)
#     shape=(128, 58, 59, 59)
    if mask is not None:
        qk +=  ((tf.cast(mask, tf.float64)) * -1e9)
        
    print('qk2: ', qk)


    qk = tf.reshape(qk, shape = [tf.shape(mask)[0], tf.shape(mask)[1], -1])
    
    print('qk3: ', qk)
    
    qk = tf.reshape(tf.nn.softmax(qk, axis = -1), shape = [tf.shape(mask)[0], tf.shape(mask)[1], tf.shape(mask)[2], tf.shape(mask)[3]])
    
    print('qk4: ', qk)
    #shape=(128, 58, 59, 59)
    
    v = v[:, tf.newaxis, :, :]
    
    u = tf.transpose(tf.matmul(qk, v), perm = [0, 1, 3 ,2])
    
    print('u: ', u)
    
    u2 = tf.matmul(u, v)
    
    
    return u2

In [77]:
def dot_product_attention(q, k, v, mask):
    '''
    Attention inspired by Transformer (but not the same). The Transformer embeds the 
    target words to q (query), k (key), v (value). So if we have a batch of 128 sequences 
    with max length 40 and embedding layer is 20, we will get shape q = shape k = shape v
    = (128 X  max sequence length X 20). The Transformer then transposes k 
    to get after matmul (128 X max seq X max seq) matrix. We then apply relu layer (unlike in Transformer)
    ---------------------
    Parameters:
    q (tf.tensor float64): shape (batch_size, max_seq_len, 1)
    k (tf.tensor float64): shape (batch_size, max_seq_len, 1)
    v (tf.tensor float64): shape (batch_size, max_seq_len, 1)
    mask (tf.tensor float64): shape (batch_size, max_seq_len, max_seq_len)
    ---------------------
    Returns:
    out_tar: shape (batch_size, max_seq_len, max_seq_len). The sequences after embedding (or Dense layer) weighted by attention_weights. 
    attention_weights : shape (batch_size, max_seq_len, max_seq_len). Weights to assign for each sequence member at each timestamp (2nd dim).
    matmul_qk: shape (batch_size, max_seq_len, max_seq_len)
    
    
    '''
    # similarity
    # q = k = v  shape := (batch_size, max_seq_len - 1, max_seq_len -1)
    matmul_qk = tf.matmul(q, k, transpose_b = True, name = 'qk')
#     print('matmul_qk: ', matmul_qk)
#     shape=(128, 58, 58)
    
    nl_qk = tf.cast(tf.nn.relu(matmul_qk, name = 'nl_qk'), tf.float64) 
#     print('nl_qk: ', nl_qk)
#     shape=(128, 58, 58)
#     nl_qk shape := (batch_size, max_seq_len - 1, max_seq_len - 1)

    # -1e9 will turn the softmax output in this locations to zero
    # this is a good mask as an input for softmax -- we need also masking when 
    # want to use matmul as is 
    
    if mask is not None:
        nl_qk +=  ((tf.cast(mask, tf.float64)) * -1e9)
    
        
#     print('nl_qk after mask: ', nl_qk)
#     shape=(128, 58, 58)
        
     # turn simialrity to scores
    attention_weights = tf.nn.softmax(nl_qk, axis = -1, name = 'attention_weights')
    # Notice that for all the rows where 
    # everything is 0, the masking will turn everything to -inf
    # and the output from the softmax would be 1/num_cols 
    # (try a = tf.constant([-1e9, -1e9, -1e9]), tf.nn.softmax(a))
    # So we can expect an output from these rows which we want to ignore
    # this will be enforced in the masking of the loss function 
    
#     print('attention_weights: ', attention_weights)
#     shape=(128, 58, 58)
   
    # weight values 
    # attention_weights shape := (batch_size, max_seq_len - 1, max_seq_len - 1), 
    # v shape := batch_size X max_seq_len X l
    out_tar = tf.matmul(attention_weights, v)
    
#   print('out_tar: ', out_tar)
#   shape=(128, 58, tar_d_model)
    
    return out_tar, attention_weights, matmul_qk

In [83]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, l):
        super(Decoder, self).__init__()
        
        self.l = l
        
        self.wq = tf.keras.layers.Dense(l, name = 'wq')
        self.wk = tf.keras.layers.Dense(l, name = 'wk')
        self.wv = tf.keras.layers.Dense(l, name = 'wk')                    
        
        self.hq = tf.keras.layers.Dense(l, name = 'hq')
        self.hk = tf.keras.layers.Dense(l, name = 'hk')
        self.hv = tf.keras.layers.Dense(l, name = 'hv')
        
        self.B = tf.keras.layers.Dense(l, name = 'B')
        self.A = tf.keras.layers.Dense(1, name = 'A')

    #a call method, the layer's forward pass
    def call(self, tar_position, tar_inp, training, pos_mask, tar_mask):
        
        # Adding extra dimension to allow multiplication of 
        # a sequnce with itself. 
        tar_position = tar_position[:, :, tf.newaxis]
        
        q_p = self.wq(tar_position) 
        k_p = self.wk(tar_position)
        v_p = self.wk(tar_position)


        print('v_p: ', v_p)
        #shape=(128, 59, 16)
        
        pos_attn1 = dot_prod_position(q_p, k_p, v_p, mask = pos_mask)
        print('pos_attn1 :', pos_attn1)
#       shape=(128, 58, 59, 59)
    
        tar_inp = tar_inp[:, :, tf.newaxis]

        
        q = self.hq(tar_inp) 
        k = self.hk(tar_inp)
        v = self.hv(tar_inp)
        
        print('q :', q)
#       shape=(128, 58, 58)

        tar_attn1, _, _ = dot_product_attention(q, k, v, tar_mask)
        # tar_attn1 is (batch_size, max_seq_len - 1, tar_d_model)

        print('tar_attn1 :', tar_attn1)
#       shape=(128, 58, l)
#       shape=(128, 58, 16)
        tar_attn1 = tar_attn1[:, :, :, tf.newaxis]
        
        tar1 = self.B(tar_attn1)
        
        print('tar1 :', tar1)
        # shape=(128, 58, 16, 16)

        L = tf.matmul(tar1, pos_attn1)
        
        print('L :', L)
        # shape=(128, 58, 16, 16)
        
        L2 = self.A(tf.reshape(L, shape = [tf.shape(L)[0], tf.shape(L)[1] ,self.l ** 2])) 
        
        print('L2 :', L2)
        
        return L2

In [84]:
decoder = Decoder(16)

In [85]:
@tf.function
def train_step(pos, tar, pos_mask):
    '''
    A typical train step function for TF2. Elements which we wish to track their gradient
    has to be inside the GradientTape() clause. see (1) https://www.tensorflow.org/guide/migrate 
    (2) https://www.tensorflow.org/tutorials/quickstart/advanced
    ------------------
    Parameters:
    pos (np array): array of positions (x values) - the 1st/2nd output from data_generator_for_gp_mimick_gpt
    tar (np array): array of targets. Notice that if dealing with sequnces, we typically want to have the targets go from 0 to n-1. The 3rd/4th output from data_generator_for_gp_mimick_gpt  
    pos_mask (np array): see description in position_mask function
    ------------------    
    '''
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]
    combined_mask_tar = create_masks(tar_inp)
    with tf.GradientTape(persistent=True) as tape:
        pred = decoder(pos, tar_inp, True, pos_mask, combined_mask_tar)
#         print('pred: ')
#         tf.print(pred)

        loss = loss_function(tar_real, pred)

    gradients = tape.gradient(loss, decoder.trainable_variables)
#     tf.print(gradients)
    optimizer.apply_gradients(zip(gradients, decoder.trainable_variables))
    train_loss(loss)
    b = decoder.trainable_weights[0]
    tf.print(tf.reduce_mean(b))

In [81]:
tf.keras.backend.set_floatx('float64')

In [82]:
if __name__ == '__main__':
    EPOCHS = 20
    batch_s  = 128
    num_batches = int(pad_y_fren_tr.shape[0] / batch_s)
    
    for epoch in range(EPOCHS):
        start = time.time()
#         train_loss.reset_states()

        for batch in range(num_batches):
            batch_tar_tr, batch_pos_tr, batch_pos_mask, _ = create_batch_gp_mim_2(pad_pos_tr, pad_y_fren_tr, pp)
            # batch_tar_tr shape := 128 X 59 = (batch_size, max_seq_len)
            # batch_pos_tr shape := 128 X 59 = (batch_size, max_seq_len)
            train_step(batch_pos_tr, batch_tar_tr, batch_pos_mask)

            if batch % 50 == 0:
                print ('Epoch {} Batch {} Loss {:.4f}'.format(
                  epoch + 1, batch, train_loss.result()))

        print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

v_p:  Tensor("decoder_8/wk_1/BiasAdd:0", shape=(128, 59, 16), dtype=float64)
qk1:  Tensor("decoder_8/strided_slice_1:0", shape=(128, 1, 59, 59), dtype=float64)
pos_mask:  Tensor("pos_mask:0", shape=(128, 58, 59, 59), dtype=float64)
qk2:  Tensor("decoder_8/add:0", shape=(128, 58, 59, 59), dtype=float64)
qk3:  Tensor("decoder_8/Reshape:0", shape=(128, 58, 3481), dtype=float64)
qk4:  Tensor("decoder_8/Reshape_1:0", shape=(128, 58, 59, 59), dtype=float64)
u:  Tensor("decoder_8/transpose:0", shape=(128, 58, 16, 59), dtype=float64)
pos_attn1 : Tensor("decoder_8/MatMul_2:0", shape=(128, 58, 16, 16), dtype=float64)
q : Tensor("decoder_8/hq/BiasAdd:0", shape=(128, 58, 16), dtype=float64)
tar_attn1 : Tensor("decoder_8/MatMul_3:0", shape=(128, 58, 16), dtype=float64)
tar1 : Tensor("decoder_8/B/BiasAdd:0", shape=(128, 58, 16, 16), dtype=float64)
L : Tensor("decoder_8/MatMul_4:0", shape=(128, 58, 16, 16), dtype=float64)
L2 : Tensor("decoder_8/A/BiasAdd:0", shape=(7424, 1), dtype=float64)


ValueError: in user code:

    <ipython-input-52-994304711f81>:22 train_step  *
        loss = loss_function(tar_real, pred)
    <ipython-input-27-6df2cbecf3d5>:15 loss_function  *
        loss_ = loss_object(real, pred)
    /Users/omernivron/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/losses.py:143 __call__  **
        losses = self.call(y_true, y_pred)
    /Users/omernivron/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/losses.py:246 call
        return self.fn(y_true, y_pred, **self._fn_kwargs)
    /Users/omernivron/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/keras/losses.py:1198 mean_squared_error
        return K.mean(math_ops.squared_difference(y_pred, y_true), axis=-1)
    /Users/omernivron/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/gen_math_ops.py:10038 squared_difference
        "SquaredDifference", x=x, y=y, name=name)
    /Users/omernivron/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py:744 _apply_op_helper
        attrs=attr_protos, op_def=op_def)
    /Users/omernivron/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/func_graph.py:595 _create_op_internal
        compute_device)
    /Users/omernivron/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:3327 _create_op_internal
        op_def=op_def)
    /Users/omernivron/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1817 __init__
        control_input_ops, op_def)
    /Users/omernivron/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/framework/ops.py:1657 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimensions must be equal, but are 7424 and 128 for '{{node mean_squared_error/SquaredDifference}} = SquaredDifference[T=DT_DOUBLE](decoder_8/A/BiasAdd, strided_slice_1)' with input shapes: [7424,1], [128,58].


In [None]:
decoder.trainable_variables

In [None]:
df_te[0, 54]

In [None]:
pos = df_te[0, :].reshape(1, -1)

In [None]:
tar = df_te[1, :].reshape(1, -1)

In [None]:
def infer(pos, tar, pos_mask):
    '''
    
    '''
    combined_mask_tar = create_masks(tar)
    out = decoder(pos, tar, False, pos_mask, combined_mask_tar)
    return out

In [None]:
def inference(pos, tar, max_seq_len, num_steps = 1):
    '''
    
    ------------------
    Parameters:
    pos (2D np array): (n + num_steps) positions 
    tar (2D np array): n targets 
    max_seq_len (int): this has to be the same max seq length as the trained model
    num_steps (int): how many inference steps are required
    ------------------
    Returns:
    tar 
    
    '''
    pos_mask = position_mask(pos)
    current_idx = np.where(pos != 0)[1][-1]  
    
    out = infer(pos, tar[:, :-1], pos_mask)
    print(out)
#     tar = tf.concat((tar, tf.reshape(out[:, current_idx], [1, 1])), axis = 1)
#     if num_steps > 1:
#         inference(pos, tar, max_seq_len, num_steps - 1)
    
    return out
    

In [None]:
a = inference(pos, tar, 59)

In [None]:
# tf.data.Dataset(tf.Tensor(pad_pos_tr, value_index = 0 , dtype = tf.float32))