In [1]:
import math

import tensorflow as tf
from IPython.display import SVG
from keras import backend as K
from keras import initializers
from keras.applications.inception_v3 import InceptionV3, preprocess_input
from keras.callbacks import TensorBoard
from keras.layers import Dense, LeakyReLU, BatchNormalization
from keras.layers import Conv2D, Conv2DTranspose, Reshape, Flatten
from keras.layers import Input, Flatten, Embedding, multiply, Dropout
from keras.layers import Concatenate, GaussianNoise,Activation
from keras.layers import Layer
from keras.models import Sequential, Model, load_model
from keras.optimizers import Adam
from keras.utils import np_utils, to_categorical
from keras.utils.vis_utils import model_to_dot
from scipy.linalg import sqrtm
from skimage.transform import resize

K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])








In [30]:
import numpy as np
import os

%matplotlib inline
import matplotlib.pyplot as plt


In [10]:
!pip install keras-layer-normalization    #keras layer for normalization

Collecting keras-layer-normalization
  Using cached https://files.pythonhosted.org/packages/a4/0e/d1078df0494bac9ce1a67954e5380b6e7569668f0f3b50a9531c62c1fc4a/keras-layer-normalization-0.14.0.tar.gz
Collecting numpy (from keras-layer-normalization)
  Using cached https://files.pythonhosted.org/packages/d7/b1/3367ea1f372957f97a6752ec725b87886e12af1415216feec9067e31df70/numpy-1.16.5-cp27-cp27mu-manylinux1_x86_64.whl
Collecting Keras (from keras-layer-normalization)
  Using cached https://files.pythonhosted.org/packages/ad/fd/6bfe87920d7f4fd475acd28500a42482b6b84479832bdc0fe9e589a60ceb/Keras-2.3.1-py2.py3-none-any.whl
Collecting keras-applications>=1.0.6 (from Keras->keras-layer-normalization)
Collecting h5py (from Keras->keras-layer-normalization)
  Using cached https://files.pythonhosted.org/packages/12/90/3216b8f6d69905a320352a9ca6802a8e39fdb1cd93133c3d4163db8d5f19/h5py-2.10.0-cp27-cp27mu-manylinux1_x86_64.whl
Collecting pyyaml (from Keras->keras-layer-normalization)
Collecting scipy>=

## GELU
GELU activation function (tanh approximation), defined as:

$$\mathrm{GELU}(x) = x\,P(x)$$

where

$$P(x) = 0.5\left(1 + \tanh\left(\sqrt{2/\pi}\,\left(x + 0.044715\,x^{3}\right)\right)\right)$$

In [11]:
def gelu(x):
    """Gaussian Error Linear Unit, tanh approximation.

    Computes ``x * P(x)`` with
    ``P(x) = 0.5 * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.

    Args:
        x: Input accepted by ``K.tanh`` and ``K.pow`` (typically a backend
            tensor).

    Returns:
        The element-wise GELU activation of ``x``.
    """
    # The scale is a plain Python constant. math.pi replaces the coarse 22/7
    # approximation (which, under Python 2 integer division, also made
    # 2 / (22/7) evaluate to 0), and math.sqrt avoids handing a non-tensor
    # Python number to K.sqrt.
    scale = math.sqrt(2.0 / math.pi)
    Px = 0.5 * (1.0 + K.tanh(scale * (x + 0.044715 * K.pow(x, 3))))
    return x * Px

## sentencepiece Installation

In [12]:
!pip install sentencepiece

Collecting sentencepiece
  Using cached https://files.pythonhosted.org/packages/fa/50/09193c69a66cc87e95bd53b935f42453ea118cd90f5b118d74536c633d0c/sentencepiece-0.1.83-cp27-cp27mu-manylinux1_x86_64.whl
Installing collected packages: sentencepiece
Successfully installed sentencepiece-0.1.83


## Attention

In [13]:
class _AttentionResidual(Layer):
    """Adds a gated self-attention output to the input feature map.

    Called on ``[x, f, g, h]`` where ``f``, ``g`` and ``h`` are projections of
    ``x``. Returns ``gamma * attention(f, g, h) + x`` where ``gamma`` is a
    trainable scalar initialised to zero, so the layer starts out as the
    identity and can learn how much attention to mix in.
    """

    def build(self, input_shape):
        self.gamma = self.add_weight(name='gamma',
                                     shape=(1,),
                                     initializer='zeros',
                                     trainable=True)
        super(_AttentionResidual, self).build(input_shape)

    @staticmethod
    def _flatten_spatial(t):
        # Collapse every axis between the batch axis and the channel axis.
        return K.reshape(t, shape=[K.shape(t)[0], -1, K.shape(t)[-1]])

    def call(self, inputs):
        x, f, g, h = inputs
        flat_g = self._flatten_spatial(g)
        flat_f = K.permute_dimensions(self._flatten_spatial(f), (0, 2, 1))
        scores = K.batch_dot(flat_g, flat_f)
        beta = K.softmax(scores)  # attention map, normalised over the last axis
        attended = K.batch_dot(beta, self._flatten_spatial(h))
        attended = K.reshape(attended, shape=K.shape(x))
        return self.gamma * attended + x

    def compute_output_shape(self, input_shape):
        # Same shape as the feature map being attended over.
        return input_shape[0]


def Attention(X):
    """Self-attention block over the positions of a channels-last feature map.

    Three 1x1 convolutions project ``X`` (two of them to ``channels // 8``
    filters, one to ``channels`` filters). The attention output is added back
    to ``X`` scaled by a trainable scalar ``gamma`` that starts at 0.

    Previously ``gamma`` was the constant 0, so the function always returned
    ``X`` unchanged and the attention branch could never contribute.

    Args:
        X: 4-D tensor whose last axis is the channel axis with a statically
            known size.

    Returns:
        Tensor with the same shape as ``X``.

    Note:
        Models containing this block need
        ``custom_objects={'_AttentionResidual': _AttentionResidual}`` when
        restored with ``load_model``.
    """
    channels = K.int_shape(X)[-1]
    # Keep at least one filter so inputs with fewer than 8 channels still work.
    reduced = max(channels // 8, 1)
    f = Conv2D(reduced, kernel_size=1, strides=1, padding='same')(X)
    g = Conv2D(reduced, kernel_size=1, strides=1, padding='same')(X)
    h = Conv2D(channels, kernel_size=1, strides=1, padding='same')(X)
    return _AttentionResidual()([X, f, g, h])

In [14]:
def dot_product_attention(q, k, v, bias):
    """Scaled dot-product attention: ``softmax(q . k^T / sqrt(depth) + mask) . v``.

    Args:
        q: Query tensor. Its last axis is the depth and must be statically
            known. Must be rank 4 or 5 when ``bias`` is given.
        k: Key tensor, contracted with ``q`` over the last axis.
        v: Value tensor, weighted by the attention probabilities.
        bias: Optional mask holding 1 for positions to attend to and 0 for
            masked positions, or ``None`` for no masking.
            NOTE(review): presumably shaped ``(batch, 1, to_len, 1)`` so that
            the matmul below broadcasts it over query positions; confirm
            against the caller.

    Returns:
        ``attention_probs . v``.

    Raises:
        ValueError: if ``bias`` is given and ``q`` is neither rank 4 nor 5.
    """
    depth = K.int_shape(q)[-1]
    # q . k^T over the last axis, scaled by a plain Python constant.
    logits = tf.matmul(q, k, transpose_b=True)
    logits = logits * (1.0 / math.sqrt(float(depth)))

    if bias is not None:
        rank = K.ndim(q)
        q_shape = K.shape(q)  # dynamic shape, so an unknown batch size works
        if rank == 4:
            ones_shape = [q_shape[0], 1, q_shape[2], 1]
        elif rank == 5:
            ones_shape = [q_shape[0], 1, q_shape[2], q_shape[3], 1]
        else:
            raise ValueError(
                "dot_product_attention expects q of rank 4 or 5 when a bias "
                "is given, got rank %s" % rank)
        broadcast_ones = tf.ones(tf.stack(ones_shape), dtype=tf.float32)

        bias = tf.matmul(broadcast_ones, K.cast(bias, 'float32'),
                         transpose_b=True)
        # 0 where the mask is 1 and -10000 where it is 0. Added before the
        # softmax, this drives masked positions to near-zero probability.
        logits += (1.0 - bias) * -10000.0

    attention_probs = tf.nn.softmax(logits, name="attention_probs")
    return tf.matmul(attention_probs, v)

## Embeddings Functions

In [20]:
def get_timing_signal_1d_given_position(channels, position, min_timescale=1.0, max_timescale=1.0e4):
    """Sinusoidal timing signal evaluated at explicitly given positions.

    Builds ``channels // 2`` geometrically spaced timescales between
    ``min_timescale`` and ``max_timescale`` and returns the sine and cosine of
    ``position`` scaled by each of them, concatenated along the last axis.
    When ``channels`` is odd, one zero channel is appended.

    Args:
        channels: Size of the last axis of the returned signal.
        position: Rank-2 tensor of positions; it is cast to float32.
        min_timescale: Smallest timescale.
        max_timescale: Largest timescale.

    Returns:
        Rank-3 float tensor: the two axes of ``position`` followed by an axis
        of size ``channels``.
    """
    num_timescales = channels // 2
    # Clamp the denominator to 1: a single timescale (channels of 2 or 3)
    # would otherwise divide by zero.
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        tf.maximum(tf.cast(num_timescales, tf.float32) - 1.0, 1.0))
    inv_timescales = min_timescale * tf.exp(
        tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
    scaled_time = (K.expand_dims(tf.cast(position, tf.float32), 2) *
                   K.expand_dims(K.expand_dims(inv_timescales, 0), 0))
    sig = K.concatenate([K.sin(scaled_time), K.cos(scaled_time)], axis=2)
    # Pad one trailing zero channel when `channels` is odd.
    sig = tf.pad(sig, [[0, 0], [0, 0], [0, channels % 2]])
    return sig

In [22]:
def embedding_factorization(config, embedding):
    """Create the layers for a factorized token embedding.

    The token embedding is split into a lookup of width ``embedding`` followed
    by a dense projection to ``hidden`` units. Position and segment embedding
    layers of width ``hidden`` are created alongside.

    Args:
        config: Currently unused.
            NOTE(review): ``vocab_size``, ``hidden``, ``max_len`` and
            ``n_segments`` are resolved from the enclosing scope; presumably
            they were meant to be read from ``config``. Confirm and wire them
            through.
        embedding: Width of the token lookup embedding.

    Returns:
        Tuple ``(mat1, mat2, pos_embed, seg_embed)`` of un-called Keras layers.
    """
    # Embedding and Dense are Keras layers, not members of the backend module K.
    mat1 = Embedding(vocab_size, embedding)
    # Dense takes the output width only; the input width is inferred when called.
    mat2 = Dense(hidden)
    pos_embed = Embedding(max_len, hidden)  # positional embedding
    seg_embed = Embedding(n_segments, hidden)  # segment embedding
    return mat1, mat2, pos_embed, seg_embed

## Sentence order loss

In [21]:
def get_sentence_order_output(config, pred, labels):
    """Two-class sentence-order loss.

    Projects ``pred`` to two logits, applies a log-softmax and returns the
    negative log-likelihood of ``labels``.

    Args:
        config: Currently unused.
        pred: 2-D tensor of features, one row per example.
        labels: Integer class ids (0 or 1); flattened to one id per example.

    Returns:
        Tuple ``(loss, per_example_loss, log_probs)``: the mean loss, the
        per-example loss vector and the per-class log-probabilities.

    NOTE(review): ``output_weights`` and ``output_bias`` are resolved from the
    enclosing scope. This assumes ``output_weights`` is ``(2, hidden)`` and
    ``output_bias`` is ``(2,)``; confirm where they are created.
    """
    # Plain matrix product against the transposed weights; batch_dot would
    # require a batch axis that a weight matrix does not have.
    logits = K.dot(pred, K.transpose(output_weights))
    logits = K.bias_add(logits, output_bias)
    # Log-softmax written as logits - logsumexp(logits).
    log_probs = logits - K.logsumexp(logits, axis=-1, keepdims=True)
    labels = K.reshape(labels, [-1])
    one_hot_labels = K.one_hot(K.cast(labels, 'int32'), 2)
    per_example_loss = -K.sum(one_hot_labels * log_probs, axis=-1)
    loss = K.mean(per_example_loss)
    return (loss, per_example_loss, log_probs)