# Transformer for Time Series

In [1]:
import tensorflow as tf
import numpy as np


In [2]:
class DenseEinsum(tf.keras.layers.Layer):
    """Dense projection over trailing input axes, implemented with ``tf.einsum``.

    The last ``num_summed_dimensions`` axes of the input are contracted against
    a kernel and replaced by the axes given in ``output_shape``. For example, a
    rank-3 input with one summed axis and a two-axis ``output_shape`` uses the
    equation ``"abc,cde->abde"``.

    Args:
        output_shape: int or sequence of ints; the axes appended to the free
            (non-summed) input axes in the result.
        num_summed_dimensions: number of trailing input axes to contract.
        activation: anything accepted by ``tf.keras.activations.get``.
        use_bias: whether to add a bias of shape ``output_shape``.
        kernel_initializer / bias_initializer: initializer identifiers.
        kernel_regularizer / bias_regularizer: regularizer identifiers.
        kernel_constraint / bias_constraint: constraint identifiers.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``,
            ``dtype``).
    """

    # One einsum subscript per tensor axis; free + summed + output axes must
    # therefore not exceed the number of letters available here.
    _CHR_IDX = ("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m")

    def __init__(self,
                 output_shape,
                 num_summed_dimensions=1,
                 activation=None,
                 use_bias=True,
                 kernel_initializer="glorot_uniform",
                 bias_initializer="zeros",
                 kernel_regularizer=None,
                 bias_regularizer=None,
                 kernel_constraint=None,
                 bias_constraint=None,
                 **kwargs):
        # Initialise the Keras base class first so attribute tracking is set
        # up before any state is attached to the layer.
        super(DenseEinsum, self).__init__(**kwargs)
        if isinstance(output_shape, int):
            output_shape = (output_shape,)
        self._output_shape = tuple(output_shape)
        self._num_summed_dimensions = num_summed_dimensions
        self._activation = tf.keras.activations.get(activation)
        self._use_bias = use_bias
        self._kernel_initializer = tf.keras.initializers.get(kernel_initializer)
        self._bias_initializer = tf.keras.initializers.get(bias_initializer)
        self._kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer)
        self._bias_regularizer = tf.keras.regularizers.get(bias_regularizer)
        self._kernel_constraint = tf.keras.constraints.get(kernel_constraint)
        self._bias_constraint = tf.keras.constraints.get(bias_constraint)
        self._einsum_string = None

    def _build_einsum_string(self, free_input_dims, bound_dims, output_dims):
        """Return the einsum equation ``"<input>,<kernel>-><output>"``.

        Letters are handed out in order: first to the free input axes, then to
        the summed (bound) axes, then to the output axes.

        Raises:
            ValueError: if more axes are requested than letters are available.
        """
        total_dims = free_input_dims + bound_dims + output_dims
        if total_dims > len(self._CHR_IDX):
            raise ValueError(
                "DenseEinsum supports at most {} axes in total, got {}."
                .format(len(self._CHR_IDX), total_dims))

        bound_start = free_input_dims
        output_start = free_input_dims + bound_dims
        free_letters = "".join(self._CHR_IDX[:bound_start])
        bound_letters = "".join(self._CHR_IDX[bound_start:output_start])
        output_letters = "".join(self._CHR_IDX[output_start:total_dims])

        input_str = free_letters + bound_letters
        kernel_str = bound_letters + output_letters
        output_str = free_letters + output_letters
        return input_str + "," + kernel_str + "->" + output_str

    def build(self, input_shape):
        """Create the kernel (and optional bias) for the given input shape."""
        input_shape = tf.TensorShape(input_shape)
        input_rank = input_shape.rank
        free_input_dims = input_rank - self._num_summed_dimensions
        self._einsum_string = self._build_einsum_string(
            free_input_dims,
            self._num_summed_dimensions,
            len(self._output_shape))

        # Kernel axes = the summed input axes followed by the output axes.
        self._kernel_shape = input_shape[free_input_dims:].concatenate(self._output_shape)
        self._kernel = self.add_weight(name="kernel",
                                       shape=self._kernel_shape,
                                       initializer=self._kernel_initializer,
                                       regularizer=self._kernel_regularizer,
                                       constraint=self._kernel_constraint,
                                       dtype=self.dtype,
                                       trainable=True)
        if self._use_bias:
            self._bias = self.add_weight(name="bias",
                                         shape=self._output_shape,
                                         initializer=self._bias_initializer,
                                         regularizer=self._bias_regularizer,
                                         constraint=self._bias_constraint,
                                         dtype=self.dtype,
                                         trainable=True)
        else:
            self._bias = None

        super(DenseEinsum, self).build(input_shape)

    def get_config(self):
        """Return the constructor arguments merged with the base layer config."""
        config = {
            "output_shape": list(self._output_shape),
            "num_summed_dimensions": self._num_summed_dimensions,
            "activation": tf.keras.activations.serialize(self._activation),
            "use_bias": self._use_bias,
            "kernel_initializer": tf.keras.initializers.serialize(self._kernel_initializer),
            "bias_initializer": tf.keras.initializers.serialize(self._bias_initializer),
            "kernel_regularizer": tf.keras.regularizers.serialize(self._kernel_regularizer),
            "bias_regularizer": tf.keras.regularizers.serialize(self._bias_regularizer),
            "kernel_constraint": tf.keras.constraints.serialize(self._kernel_constraint),
            "bias_constraint": tf.keras.constraints.serialize(self._bias_constraint),
        }
        base_config = super(DenseEinsum, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def call(self, inputs):
        """Contract ``inputs`` with the kernel, then add bias and activation."""
        ret = tf.einsum(self._einsum_string, inputs, self._kernel)
        if self._use_bias:
            ret += self._bias
        if self._activation is not None:
            ret = self._activation(ret)
        return ret


class Attention(tf.keras.layers.Layer):
    """Multi-head dot-product attention built from ``DenseEinsum`` projections.

    Args:
        hidden_size: total projection width; must be divisible by ``num_heads``.
        num_heads: number of attention heads.
        attention_dropout: dropout rate applied to the attention weights when
            ``training`` is truthy.

    Raises:
        ValueError: if ``hidden_size`` is not a multiple of ``num_heads``.
    """

    def __init__(self,hidden_size,num_heads,attention_dropout):
        if hidden_size % num_heads != 0:
            raise ValueError("Hidden size ({}) must be divisible by the number of heads ({})."
                             .format(hidden_size, num_heads))

        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.attention_dropout = attention_dropout

    def build(self,input_shape):
        """Create the query/key/value projections and the output projection."""
        head_dim = self.hidden_size // self.num_heads
        self.query_dense_layer = DenseEinsum([self.num_heads, head_dim])
        self.key_dense_layer = DenseEinsum([self.num_heads, head_dim])
        self.value_dense_layer = DenseEinsum([self.num_heads, head_dim])
        # The output projection sums over both the head and per-head axes.
        self.output_dense_layer = DenseEinsum([self.hidden_size, ], num_summed_dimensions=2)
        super(Attention, self).build(input_shape)

    def call(self, query_input, source_input,bias,training,cache=None):
        """Keras entry point; delegates to :meth:`forward`."""
        return self.forward(query_input, source_input, bias, training, cache)

    def forward(self,query_input,source_input,attention_mask,training,cache):
        """Attend from ``query_input`` over ``source_input``.

        ``attention_mask`` (if not None) is added to the logits before the
        softmax. When ``cache`` is given, its ``"k"``/``"v"`` entries are
        prepended along axis 1 and the dict is updated in place.
        """
        q = self.query_dense_layer(query_input)
        k = self.key_dense_layer(source_input)
        v = self.value_dense_layer(source_input)

        if cache is not None:
            k = tf.concat([tf.cast(cache["k"], k.dtype), k], axis=1)
            v = tf.concat([tf.cast(cache["v"], v.dtype), v], axis=1)

            # Store the extended keys/values for the next call.
            cache["k"] = k
            cache["v"] = v

        # Scale the queries by 1/sqrt(per-head depth) before the dot product.
        per_head_depth = self.hidden_size // self.num_heads
        q = q * per_head_depth ** -0.5
        scores = tf.einsum("BTNH,BFNH->BNFT", k, q)

        if attention_mask is not None:
            scores = scores + attention_mask

        probabilities = tf.nn.softmax(scores, name="attention_weights")
        if training:
            probabilities = tf.nn.dropout(probabilities, rate=self.attention_dropout)
        context = tf.einsum("BNFT,BTNH->BFNH", probabilities, v)

        return self.output_dense_layer(context)


class FeedForwardNetwork(tf.keras.layers.Layer):
    """Two-layer feed-forward block: ReLU Dense, optional dropout, linear Dense.

    Args:
        hidden_size: number of units of the output Dense layer.
        filter_size: number of units of the inner ReLU Dense layer.
        relu_dropout: dropout rate applied after the ReLU layer when training.
    """

    def __init__(self,hidden_size,filter_size,relu_dropout):
        super(FeedForwardNetwork, self).__init__()
        self.hidden_size = hidden_size
        self.filter_size = filter_size
        self.relu_dropout = relu_dropout

        dense = tf.keras.layers.Dense
        self.filter_dense_layer = dense(self.filter_size,
                                        use_bias=True,
                                        activation=tf.nn.relu,
                                        name="filter_layer")
        self.output_dense_layer = dense(self.hidden_size,
                                        use_bias=True,
                                        name="output_layer")

    def forward(self,x,training):
        """Apply the inner layer, dropout (only if ``training``), then the output layer."""
        hidden = self.filter_dense_layer(x)
        if training:
            hidden = tf.nn.dropout(hidden, rate=self.relu_dropout)
        return self.output_dense_layer(hidden)

    def get_config(self):
        """Return the three constructor arguments."""
        config = {}
        config["hidden_size"] = self.hidden_size
        config["filter_size"] = self.filter_size
        config["relu_dropout"] = self.relu_dropout
        return config

    def call(self,x,training):
        """Keras entry point; delegates to :meth:`forward`."""
        return self.forward(x, training)


class EmbeddingLayer(tf.keras.layers.Layer):
    """Linear, bias-free projection of the last input axis to ``embedding_size``."""

    def __init__(self,embedding_size):
        super(EmbeddingLayer, self).__init__()
        self.embedding_size = embedding_size

    def build(self,input_shape):
        """Create the ``[features, embedding_size]`` projection matrix."""
        feature_dim = input_shape[-1]
        weight_init = tf.random_normal_initializer(
            mean=0., stddev=self.embedding_size ** -0.5)
        with tf.name_scope('embedding'):
            self.shared_weights = self.add_weight(
                name='weights',
                shape=[feature_dim, self.embedding_size],
                initializer=weight_init)
        super(EmbeddingLayer, self).build(input_shape)

    def get_config(self):
        """Return the constructor argument."""
        return {'embedding_size': self.embedding_size}

    def call(self,x):
        """Project a rank-3 ``x`` along its last axis with the shared weights."""
        return tf.einsum('bsf,fk->bsk', x, self.shared_weights)


class PositionEncoding(tf.keras.layers.Layer):
    """Sinusoidal position encoding for inputs of up to ``max_len`` steps.

    The layer returns only the encoding (same shape as the input); the caller
    is expected to add it to the input.

    Args:
        max_len: number of positions in the lookup table.
    """

    def __init__(self,max_len):
        super(PositionEncoding, self).__init__()
        self.max_len = max_len
        # The table depends only on (max_len, depth), so it is built once per
        # depth as a NumPy array and reused by later calls.
        self._cached_depth = None
        self._cached_table = None

    def build(self,input_shape):
        super(PositionEncoding, self).build(input_shape)

    def get_config(self):
        """Return the constructor argument."""
        return {
            'max_len': self.max_len
        }

    def _sinusoid_table(self, depth):
        """Return the ``(max_len, depth)`` float32 sin/cos table as a NumPy array."""
        if self._cached_table is None or self._cached_depth != depth:
            positions = np.arange(self.max_len, dtype=np.float64)[:, np.newaxis]
            dims = np.arange(depth)
            # Columns 2k and 2k+1 share the same wavelength 10000^(2k/depth).
            wavelengths = np.power(10000, (dims - dims % 2) / depth)
            table = positions / wavelengths[np.newaxis, :]
            table[:, 0::2] = np.sin(table[:, 0::2])
            table[:, 1::2] = np.cos(table[:, 1::2])
            self._cached_table = table.astype(np.float32)
            self._cached_depth = depth
        return self._cached_table

    def call(self,x,masking=True):
        """Return the position encoding for every step of ``x``.

        Args:
            x: rank-3 tensor whose last axis has a statically known size.
            masking: when True, entries where ``x == 0`` keep the value of
                ``x`` (i.e. zero) instead of the encoding.

        Raises:
            ValueError: if the last axis of ``x`` has no static size.
        """
        E = x.get_shape().as_list()[-1]  # static
        if E is None:
            raise ValueError("PositionEncoding needs a statically known feature dimension.")
        batch_size, seq_length = tf.shape(x)[0], tf.shape(x)[1]  # dynamic
        with tf.name_scope('position_encode'):
            # (batch_size, seq_length) grid of position indices 0..seq_length-1.
            position_ind = tf.tile(tf.expand_dims(tf.range(seq_length), 0), [batch_size, 1])
            position_enc = tf.convert_to_tensor(self._sinusoid_table(E), tf.float32)  # (max_len, E)

            outputs = tf.nn.embedding_lookup(position_enc, position_ind)
            if masking:
                outputs = tf.where(tf.equal(x, 0), x, outputs)
        return tf.cast(outputs, tf.float32)


class SublayerConnection(tf.keras.layers.Layer):
    """Pre-norm residual wrapper: ``x + dropout(sublayer(layer_norm(x), ...))``.

    Args:
        sublayer: the callable layer to wrap.
        params: dict of hyper-parameters; ``'layer_postprocess_dropout'`` is
            the dropout rate applied to the sublayer output when training.
    """

    def __init__(self,sublayer,params):
        super(SublayerConnection, self).__init__()
        self.sublayer = sublayer
        self.params = params
        self.layer_postprocess_dropout = params['layer_postprocess_dropout']

    def build(self,input_shape):
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6, dtype="float32")
        super(SublayerConnection, self).build(input_shape)

    def get_config(self):
        """Return the hyper-parameter dict (the wrapped sublayer is not included)."""
        return {
            'params': self.params
        }

    def call(self,x,*args,**kwargs):
        """Normalise ``x``, run the sublayer, apply dropout and add the residual.

        Extra positional and keyword arguments are forwarded unchanged to the
        sublayer. Dropout is applied only when ``training`` is passed as a
        truthy keyword argument; a missing ``training`` keyword is treated as
        inference instead of raising ``KeyError``.
        """
        y = self.sublayer(self.layer_norm(x), *args, **kwargs)
        if kwargs.get('training'):
            y = tf.nn.dropout(y, rate=self.layer_postprocess_dropout)
        return x + y
    


class Transformer(object):
    """Encoder-decoder Transformer producing one value per predicted step.

    Hyper-parameters come from the module-level ``params`` dict, overlaid with
    ``custom_model_params``. The same embedding layer is used for encoder
    inputs and decoder targets.

    Args:
        custom_model_params: dict of overrides for the module-level ``params``.
    """

    def __init__(self, custom_model_params):
        # Overlay the overrides on a private copy so that building a model
        # never mutates the shared module-level defaults.
        merged_params = dict(params)
        merged_params.update(custom_model_params)
        self.params = merged_params
        self.embedding_layer = EmbeddingLayer(embedding_size=self.params['attention_hidden_size'])
        self.encoder_stack = EncoderStack(self.params)
        self.decoder_stack = DecoderStack(self.params)
        self.projection = tf.keras.layers.Dense(units=1)

    def get_config(self):
        return {}

    def __call__(self, x, predict_seq_length, training):
        """Run the model.

        Args:
            x: when training, a tuple ``(inputs, targets)``. At inference
                either such a tuple (the targets are ignored) or the inputs
                tensor alone.
            predict_seq_length: number of steps to predict; also the size of
                the causal decoder mask.
            training: truthy for teacher-forced training, falsy for greedy
                autoregressive inference.

        Returns:
            The projected decoder output, one unit per decoder step.

        Raises:
            ValueError: if ``training`` is truthy and ``x`` is not a tuple.
        """
        if training:
            if not isinstance(x, tuple):
                raise ValueError("please input both of inputs and targets")
            inputs, targets = x
        else:
            inputs = x[0] if isinstance(x, tuple) else x

        self.position_encoding_layer = PositionEncoding(max_len=inputs.get_shape().as_list()[1])
        self.position_encoding_layer_2 = PositionEncoding(max_len=predict_seq_length)

        src_mask = self.get_src_mask(inputs)  # => batch_size * sequence_length
        src_mask = self.get_src_mask_bias(src_mask)  # => batch_size * 1 * 1 * input_sequence_length
        memory = self.encoder(encoder_inputs=inputs, mask=src_mask, training=training)

        if training:
            decoder_output = self.decoder(targets, memory, src_mask, training=training,
                                          predict_seq_length=predict_seq_length)
            return self.projection(decoder_output)

        return self._greedy_decode(inputs, memory, src_mask, training, predict_seq_length)

    def _greedy_decode(self, inputs, memory, src_mask, training, predict_seq_length):
        """Predict ``predict_seq_length`` steps one at a time.

        A zero-filled target buffer of full length is decoded repeatedly; after
        each pass the prediction for the current step is written into the
        buffer. Because ``decoder`` shifts the targets right and applies a
        causal mask, step ``i`` only sees predictions ``0..i-1``.
        """
        feature_dim = inputs.get_shape().as_list()[-1]
        batch_size = tf.shape(inputs)[0]
        targets = tf.zeros([batch_size, predict_seq_length, feature_dim], dtype=inputs.dtype)

        predictions = []
        for step in range(predict_seq_length):
            decoder_output = self.decoder(targets, memory, src_mask, training=training,
                                          predict_seq_length=predict_seq_length)
            step_prediction = self.projection(decoder_output)[:, step:step + 1, :]
            predictions.append(step_prediction)

            # NOTE(review): the projection emits a single value per step while
            # the shared embedding expects `feature_dim` features, so the value
            # is repeated across the feature axis -- confirm this is the
            # intended feedback for multivariate targets.
            fed_back = tf.tile(tf.cast(step_prediction, targets.dtype), [1, 1, feature_dim])
            targets = tf.concat([targets[:, :step, :], fed_back, targets[:, step + 1:, :]], axis=1)

        return tf.concat(predictions, axis=1)

    def encoder(self,encoder_inputs, mask,training):
        '''
        Embed the inputs, add the position encoding and run the encoder stack.

        :param encoder_inputs: sequence inputs, batch_size * sequence_length * feature_dim
        :param mask: additive attention bias from ``get_src_mask_bias``
        :param training: truthy to apply dropout
        :return: output of the encoder stack
        '''
        with tf.name_scope("encoder"):
            src = self.embedding_layer(encoder_inputs)  # batch_size * sequence_length * embedding_size
            src += self.position_encoding_layer(src)

            if training:
                # NOTE(review): this rate is hard-coded, unlike the decoder,
                # which uses params["layer_postprocess_dropout"].
                src = tf.nn.dropout(src, rate=0.01)  # batch_size * sequence_length * attention_hidden_size

            return self.encoder_stack(src, mask, training)

    def decoder(self,targets,memory,src_mask,training,predict_seq_length):
        """Decode ``targets`` against the encoder ``memory``.

        The targets are shifted one step to the right (zero-padded at the
        front, last step dropped), embedded, position-encoded and passed
        through the decoder stack with a causal mask of size
        ``predict_seq_length``.
        """
        with tf.name_scope("shift_targets"):
            # Shift targets to the right, and remove the last element
            decoder_inputs = tf.pad(targets, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            tgt_mask = self.get_tgt_mask_bias(predict_seq_length)
            tgt = self.embedding_layer(decoder_inputs)

        with tf.name_scope("add_pos_encoding"):
            pos_encoding = self.position_encoding_layer_2(tgt)
            tgt += pos_encoding

        if training:
            tgt = tf.nn.dropout(tgt, rate=self.params["layer_postprocess_dropout"])
        with tf.name_scope('decoder'):
            logits = self.decoder_stack(tgt, memory, src_mask, tgt_mask, training)  # TODO: mask
        return logits

    def get_src_mask(self,x,pad=0):
        """Return a boolean mask that is True where every feature of a step equals ``pad``."""
        src_mask = tf.reduce_all(tf.math.equal(x, pad), axis=-1)
        return src_mask

    def get_src_mask_bias(self,mask):
        """Turn a padding mask into an additive bias (-1e9 at masked steps)."""
        attention_bias = tf.cast(mask, tf.float32)
        attention_bias = attention_bias * tf.constant(-1e9, dtype=tf.float32)
        attention_bias = tf.expand_dims(tf.expand_dims(attention_bias, 1), 1)  # => batch_size * 1 * 1 * input_length
        return attention_bias

    def get_tgt_mask_bias(self,length):
        """Return a ``[1, 1, length, length]`` causal bias (-1e9 above the diagonal)."""
        # band_part(-1, 0) keeps the lower triangle, i.e. the allowed positions.
        valid_locs = tf.linalg.band_part(tf.ones([length, length], dtype=tf.float32), -1, 0)
        valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
        decoder_bias = -1e9 * (1.0 - valid_locs)
        return decoder_bias


class EncoderStack(tf.keras.layers.Layer):
    """Stack of ``params['n_layers']`` encoder layers followed by a layer norm.

    Each layer is a self-attention sublayer and a feed-forward sublayer, both
    wrapped in ``SublayerConnection``.

    Args:
        params: dict of hyper-parameters (``n_layers``,
            ``attention_hidden_size``, ``num_heads``, ``attention_dropout``,
            ``ffn_hidden_size``, ``ffn_filter_size``, ``relu_dropout`` and the
            keys read by ``SublayerConnection``).
    """

    def __init__(self,params):
        super(EncoderStack, self).__init__()
        self.params = params
        self.layers = []

    def build(self,input_shape):
        """Instantiate the sublayers of every encoder layer and the final norm."""
        for _ in range(self.params['n_layers']):
            attention_layer = Attention(self.params['attention_hidden_size'],
                                        self.params['num_heads'],
                                        self.params['attention_dropout'])
            feed_forward_layer = FeedForwardNetwork(self.params['ffn_hidden_size'],
                                                    self.params['ffn_filter_size'],
                                                    self.params['relu_dropout'])
            post_attention_layer = SublayerConnection(attention_layer, self.params)
            post_feed_forward_layer = SublayerConnection(feed_forward_layer, self.params)
            self.layers.append([post_attention_layer, post_feed_forward_layer])
        self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6, dtype="float32")
        super(EncoderStack, self).build(input_shape)

    def get_config(self):
        """Return the hyper-parameter dict required by ``__init__``."""
        # Previously returned {}, which dropped the only constructor argument;
        # now mirrors DecoderStack.get_config.
        return {
            'params': self.params
        }

    def call(self,encoder_inputs, src_mask, training):
        """Run every encoder layer in order and normalise the result.

        Args:
            encoder_inputs: input to the first layer.
            src_mask: additive attention bias passed to each self-attention.
            training: forwarded to every sublayer as the ``training`` keyword.
        """
        for n, layer in enumerate(self.layers):
            attention_layer = layer[0]
            ffn_layer = layer[1]

            with tf.name_scope('layer_{}'.format(n)):
                with tf.name_scope('self_attention'):
                    encoder_inputs = attention_layer(encoder_inputs, encoder_inputs, src_mask, training=training)
                with tf.name_scope('ffn'):
                    encoder_inputs = ffn_layer(encoder_inputs, training=training)

        return self.output_norm(encoder_inputs)


class DecoderStack(tf.keras.layers.Layer):
    """Stack of ``params['n_layers']`` decoder layers followed by a layer norm.

    Each layer holds three ``SublayerConnection`` wrappers, in order:
    self-attention, encoder-decoder attention and feed-forward network.
    """

    def __init__(self,params):
        super(DecoderStack, self).__init__()
        self.params = params
        self.layers = []

    def build(self,input_shape):
        """Instantiate the sublayers of every decoder layer and the final norm."""
        cfg = self.params
        for _ in range(cfg['n_layers']):
            self_attention = Attention(cfg['attention_hidden_size'],
                                       cfg['num_heads'],
                                       cfg['attention_dropout'])
            cross_attention = Attention(cfg['attention_hidden_size'],
                                        cfg['num_heads'],
                                        cfg['attention_dropout'])
            feed_forward = FeedForwardNetwork(cfg['ffn_hidden_size'],
                                              cfg['ffn_filter_size'],
                                              cfg['relu_dropout'])
            wrapped = [
                SublayerConnection(self_attention, self.params),
                SublayerConnection(cross_attention, self.params),
                SublayerConnection(feed_forward, self.params),
            ]
            self.layers.append(wrapped)
        self.output_norm = tf.keras.layers.LayerNormalization(epsilon=1e-6, dtype="float32")
        super(DecoderStack, self).build(input_shape)

    def get_config(self):
        """Return the hyper-parameter dict required by ``__init__``."""
        return {'params': self.params}

    def call(self, decoder_inputs,encoder_outputs,src_mask,tgt_mask,training,cache=None):
        """Run every decoder layer in order and normalise the result.

        ``tgt_mask`` biases the self-attention and ``src_mask`` the
        encoder-decoder attention. ``cache`` is accepted but not used.
        """
        hidden = decoder_inputs
        for index, (self_attention, cross_attention, feed_forward) in enumerate(self.layers):
            with tf.name_scope("dec_layer_{}".format(index)):
                with tf.name_scope('self_attention'):
                    hidden = self_attention(hidden, hidden, tgt_mask, training=training)
                with tf.name_scope('enc_dec_attention'):
                    # TODO: confirm that src_mask is the right mask here.
                    hidden = cross_attention(hidden, encoder_outputs, src_mask, training=training)
                with tf.name_scope('ffn'):
                    hidden = feed_forward(hidden, training=training)
        return self.output_norm(hidden)

In [3]:
# Default hyper-parameters shared by the model classes in this notebook.
params = dict(
    n_layers=6,                      # encoder / decoder layers per stack
    attention_hidden_size=8 * 64,    # attention width (8 heads of 64)
    num_heads=8,
    ffn_hidden_size=8 * 64,          # feed-forward output width
    ffn_filter_size=8 * 64,          # feed-forward inner width
    attention_dropout=0.1,
    relu_dropout=0.1,
    layer_postprocess_dropout=0.1,
)

In [4]:
# Build the full encoder-decoder model from the default hyper-parameters.
model = Transformer(params)

In [5]:
# Stand-alone encoder stack, created separately from `model`.
e = EncoderStack(params)

In [6]:
# Call build() directly so the stack's sublayer objects are instantiated
# without running a forward pass.
e.build(input_shape=(1440,3))