In [1]:
%matplotlib inline

import re
import random
import string
import time
import warnings

import importlib

import customersupport.common
import customersupport.common.utils
import customersupport.evaluation.eval

print('Library versions:')

import tensorflow as tf
print('tensorflow:{}'.format(tf.__version__))
import pandas as pd
print('pandas:{}'.format(pd.__version__))
# import sklearn
# print('sklearn:{}'.format(sklearn.__version__))
# import nltk
# print('nltk:{}'.format(nltk.__version__))
import numpy as np
print('numpy:{}'.format(np.__version__))
import matplotlib.pyplot as plt

from IPython.display import SVG
from tqdm import tqdm_notebook as tqdm  # Special jupyter notebook progress bar

from tensorflow.python.layers import core as layers_core
from datetime import datetime

from customersupport.common.vocab import VocabHolder
from customersupport.common.dataset import CustomerSupportDataset

from customersupport.evaluation.eval import evaluate_words_index, format_metrics, get_evaluation_conf, strip_punkt

# Re-execute the project modules so that edits made on disk are picked up
# without restarting the kernel.
# NOTE(review): the `from ... import` names bound above (VocabHolder,
# CustomerSupportDataset, evaluate_words_index, ...) were bound before these
# reloads, and importlib.reload does not rebind names held outside the module.
# Confirm whether the from-imports should be re-run after reloading.
importlib.reload(customersupport.common.vocab)
importlib.reload(customersupport.common.dataset)
importlib.reload(customersupport.common.utils)
importlib.reload(customersupport.evaluation.eval)

# Silence every Python warning for the rest of the session.
warnings.simplefilter('ignore')

tqdm().pandas()  # Enable tracking of progress in dataframe `apply` calls

# NOTE(review): presumably turns off tqdm's background monitor thread - confirm.
tqdm.monitor_interval = 0

Library versions:
tensorflow:1.11.0
pandas:0.23.4
numpy:1.15.4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the physical device description of every local GPU.

    Returns:
        A list with one `physical_device_desc` string per device whose
        `device_type` is 'GPU'; empty when no such device is listed.
    """
    gpu_descriptions = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_descriptions.append(device.physical_device_desc)
    return gpu_descriptions

get_available_gpus()

['device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:01:00.0, compute capability: 6.1']

In [68]:
# 8192 - large enough for demonstration, larger values make network training slower
MAX_VOCAB_SIZE = 2**13

# seq2seq generally relies on fixed length message vectors - longer messages provide more info
# but result in slower training and larger networks
#MAX_MESSAGE_LEN = 50

# Single container for every tunable value read by the cells below.
hparams = tf.contrib.training.HParams(
    # Larger batch sizes generally reach the average response faster, but small batch sizes are
    # required for the model to learn nuanced responses.  Also, GPU memory limits max batch size.
    batch_size=64,
    # Width of the `query` placeholder (used as q_maxlen when the graph is built).
    encoder_length=60,
    # Width of the context placeholder (used as c_maxlen when the graph is built).
    decoder_length=70,
    src_vocab_size=MAX_VOCAB_SIZE,
    # Embedding size for words - gives a trade off between expressivity of words and network size
    embedding_size=200,
    tgt_vocab_size=MAX_VOCAB_SIZE,
    # High learning rate helps model reach average response faster, but can make it hard to
    # converge on nuanced responses
    learning_rate=5e-04,  #0.0005,
    # Global-norm threshold passed to tf.clip_by_global_norm.
    max_gradient_norm=5.0,
    # When not None, an L2 term is added to the loss after the output layer is built.
    l2_norm = None,
    # Not read by the visible notebook cells.
    beam_width = 10,
    # Hidden size: highway output size and number of conv filters in the encoder blocks.
    d = 128,
    # Number of attention heads handed to residual_block.
    nh = 4,
    max_epochs=15,
    # Helps regularize network and prevent overfitting.
    # Fed to the `dropout` placeholder for training batches; evaluation batches feed 0.0.
    dropout=0.1,
    use_glove=False,
    l2_reg=0.,
    glove_path=None,
    #'/home/momchil/Storage/Projects/Python/Data/glove.twitter.27B/glove.twitter.27B.200d.txt',
    tweets_path=
    '/home/momchil/Storage/Projects/Python/Data/customer-support-on-twitter/twcs-conv_ids_clean.csv',
    # Ngram count for ROUGE and BLEU
    max_order = 2,
    # NOTE(review): presumably the fraction of examples kept for training - confirm in
    # CustomerSupportDataset.train_test_split.
    train_size=0.8,
    # NOTE(review): presumably the per-epoch learning-rate decay factor - confirm against the
    # optimiser setup.
    decay_rate=0.99,
    # The next three values and `companies` are not read by the visible cells; they are
    # presumably consumed by CustomerSupportDataset - TODO confirm.
    train_time_diff=5.0,
    first_day=0,
    last_day=60,
    # Metric names handed to evaluate_words_index in the evaluation cell.
    evaluation_metrics=[
        "bleu", "rouge_l", "embedding_average", "vector_extrema",
        "greedy_matching"
    ],
    training_metrics=[
        "bleu", "rouge_l", "embedding_average", "vector_extrema",
        "greedy_matching"
    ],
    companies=['AppleSupport'])

In [4]:
%%time
# Build the dataset wrapper from the hyper-parameters defined above.
cs_data = CustomerSupportDataset(hparams)

# Earlier filter experiments, kept for reference:
#& (y_text.str.contains('help') ^ True)
#['direct message', 'is fixed in a future software update']
# NOTE(review): `masks` presumably filters out replies containing the listed phrases and
# `append_context` presumably prepends earlier turns to each question - confirm in
# CustomerSupportDataset.process_utterances.
cs_data.process_utterances(masks=['direct message'], append_context=True)

Done support_author (984679, 9)
Replacing anonymized screen names in X...


HBox(children=(IntProgress(value=0, max=105179), HTML(value='')))


Replacing anonymized screen names in Y...


HBox(children=(IntProgress(value=0, max=105179), HTML(value='')))


CPU times: user 58.5 s, sys: 1.54 s, total: 1min
Wall time: 54.5 s


In [5]:
# Vocabulary helper; later cells read its `vocab`, `reverse_vocab` and `to_word_idx` members.
voc_holder = VocabHolder(hparams)

Loaded w2v


In [6]:
# Fit the vocabulary on both sides of the conversation, capped at src_vocab_size entries.
analyzer = voc_holder.fit(cs_data.x_text, cs_data.y_text, hparams.src_vocab_size)

# Convert the text to word-index matrices, then split without shuffling (do_random=False).
cs_data.text_to_vec(hparams, voc_holder)
cs_data.train_test_split(hparams, do_random=False)

# Module-level aliases used by the batching, training and evaluation cells.
train_x = cs_data.train_x
train_y = cs_data.train_y

test_x = cs_data.test_x
test_y = cs_data.test_y

# NOTE(review): the weights are not read by any visible cell - confirm they are still needed.
train_weights = cs_data.train_weights
test_weights = cs_data.test_weights

Fitting CountVectorizer on X and Y text data...


HBox(children=(IntProgress(value=0, max=49626), HTML(value='')))


Learned vocab of 8192 items.
Calculating word indexes for X...


HBox(children=(IntProgress(value=0, max=49626), HTML(value='')))


Calculating word indexes for Y...


HBox(children=(IntProgress(value=0, max=49626), HTML(value='')))


Training data of shape (45582, 60) and test data of shape (4044, 70).
count    45582.000000
mean         1.000000
std          0.141677
min          0.740038
25%          0.883758
50%          1.021893
75%          1.097074
max          1.286219
dtype: float64
count    4044.000000
mean        1.000000
std         0.014701
min         0.972407
25%         0.988713
50%         1.001299
75%         1.011627
max         1.022508
dtype: float64


In [7]:
# cs_data.x = cs_data.x_text.values
# cs_data.y = cs_data.y_text.values
# cs_data.train_test_split(hparams, do_random=False)
# train_x = cs_data.train_x
# train_y = cs_data.train_y

# test_x = cs_data.test_x
# test_y = cs_data.test_y

# def export(x_sent, y_sent, path):
#     with open(path, "w") as f:
#         for x, y in tqdm(zip(x_sent, y_sent)):
#             f.write(x + "\n" + y + '\n\n')
            
# export(train_x, train_y, "/home/momchil/Storage/Projects/Python/bert/next_sent_train.txt")
# export(test_x, test_y, "/home/momchil/Storage/Projects/Python/bert/next_sent_test.txt")

In [8]:
def export_vocab(voc_holder):
    """Write the vocabulary to the BERT `vocab.txt` file, one token per line.

    Tokens are written in index order for indices 0 .. MAX_VOCAB_SIZE - 1,
    each looked up in `voc_holder.reverse_vocab`.
    """
    vocab_path = "/home/momchil/Storage/Projects/Python/Data/multi_cased_L-12_H-768_A-12/vocab.txt"
    with open(vocab_path, "w") as out_file:
        for index in range(MAX_VOCAB_SIZE):
            out_file.write(voc_holder.reverse_vocab[index] + "\n")

def validate_elmo(elmo_weights, voc_holder, word):
    """Check one row of an exported ELMo matrix against a fresh ELMo lookup.

    NOTE(review): this definition is shadowed by an identical `validate_elmo`
    defined in a later cell, and it relies on `hub`, which is only imported in
    that later cell - consider deleting this copy.

    Args:
        elmo_weights: matrix indexed by vocabulary id; the selected row is
            compared against a 1024-element vector.
        voc_holder: object whose `vocab` maps a word to its vocabulary id.
        word: the single token to embed and compare.

    Raises:
        AssertionError: if the two vectors differ beyond 6 decimals.
    """
    elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)
    # Embed a batch of one sequence that holds exactly one token.
    embeddings = elmo(
                inputs={
                    "tokens": [[word]],
                    "sequence_len": [1]
                },
                signature="tokens",
                as_dict=True)["elmo"]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        elmo_pred = sess.run(embeddings).reshape(1024)
    embedding = elmo_weights[voc_holder.vocab[word]]
    
    np.testing.assert_array_almost_equal(elmo_pred, embedding, decimal=6)

def export_bert(vocab_holder,
                jsonl_path='/home/momchil/Storage/Projects/Python/Data/uncased_L-12_H-768_A-12/output.jsonl'):
    """Build a per-word embedding matrix from a BERT feature dump.

    One line of `jsonl_path` is read per vocabulary entry. Each line is parsed
    as JSON; for every layer the `values` vectors of all tokens except the
    first are summed, and the per-layer sums are then added together to give
    that word's vector.

    NOTE(review): line i of the file is assumed to hold the features of
    vocabulary index i, and the skipped first token is presumably BERT's
    leading special token - confirm against the script that produced the file.

    Args:
        vocab_holder: object whose `reverse_vocab` has one entry per
            vocabulary index; only its length is used here.
        jsonl_path: path of the JSON-lines feature file. Defaults to the
            location used when the notebook was written.

    Returns:
        float32 array with one row per vocabulary entry.

    Raises:
        ValueError: if the file has fewer lines than the vocabulary has words.
    """
    # Imported locally because the notebook's import cell does not import json.
    import json

    def sum_per_layer(data, layer_idx):
        # Sum one layer's vectors over every token except the first.
        return np.sum([t['layers'][layer_idx]['values'] for t in data['features'][1:]], 0)

    num_words = len(vocab_holder.reverse_vocab)

    bert_weights = []
    with open(jsonl_path) as f:
        for word_idx in tqdm(range(num_words)):
            line = f.readline()
            if not line.strip():
                raise ValueError(
                    'BERT feature file {!r} ended after {} lines; expected {}'.format(
                        jsonl_path, word_idx, num_words))
            data = json.loads(line)

            num_layers = len(data['features'][0]['layers'])
            weights = sum_per_layer(data, 0)
            for layer_idx in range(1, num_layers):
                weights += sum_per_layer(data, layer_idx)
            bert_weights.append(weights)

    return np.array(bert_weights, dtype=np.float32)

# Build the BERT-based embedding matrix for the fitted vocabulary.
emb_weights = export_bert(voc_holder)
# Zero the padding row so that PAD positions contribute nothing when embeddings are summed.
emb_weights[customersupport.common.utils.PAD] = np.zeros(emb_weights.shape[-1], dtype=np.float32)
# Keep a second name for this matrix; `emb_weights` is reassigned by the ELMo cell below.
bert = emb_weights

HBox(children=(IntProgress(value=0, max=8192), HTML(value='')))




In [9]:
# for w1, w2 in [('ipad', 'iphone'), ('hello', 'hi'), ('yes', 'no'), ('talk', 'bad'), ('love', 'hate')]:
#     print(np.dot(bert[voc_holder.vocab[w1]], bert[voc_holder.vocab[w2]]) / \
#         (np.linalg.norm(bert[voc_holder.vocab[w1]]) * np.linalg.norm(bert[voc_holder.vocab[w2]])))
#     print(np.dot(elmo[voc_holder.vocab[w1]], elmo[voc_holder.vocab[w2]]) / \
#         (np.linalg.norm(elmo[voc_holder.vocab[w1]]) * np.linalg.norm(elmo[voc_holder.vocab[w2]])))
#     print()
# # np.mean(emb_weights, axis=1).tolist()

In [10]:
import tensorflow_hub as hub

def validate_elmo(elmo_weights, voc_holder, word):
    """Assert that an exported ELMo row matches a fresh ELMo lookup of `word`.

    Args:
        elmo_weights: matrix indexed by vocabulary id.
        voc_holder: object whose `vocab` maps a word to its vocabulary id.
        word: the single token to embed and compare.

    Raises:
        AssertionError: if the freshly computed vector and the stored row
            differ beyond 6 decimals.
    """
    module = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)
    # A batch of one sequence that contains exactly one token.
    module_outputs = module(
        inputs={"tokens": [[word]], "sequence_len": [1]},
        signature="tokens",
        as_dict=True)
    embeddings = module_outputs["elmo"]

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        elmo_pred = sess.run(embeddings).reshape(1024)

    stored_row = elmo_weights[voc_holder.vocab[word]]
    np.testing.assert_array_almost_equal(elmo_pred, stored_row, decimal=6)

def export_elmo(vocab_holder, batch_size=512):
    """Embed every vocabulary word with ELMo, treating each as a one-token sentence.

    Differences from the previous version:
      * the `vocab_holder` argument is used (the global `voc_holder` was read before);
      * the ELMo op is built once and fed through placeholders, so the graph
        no longer grows by one module application per batch;
      * the vocabulary size no longer has to be a multiple of 512;
      * `str` replaces the `np.str` alias, which newer NumPy releases removed.

    Args:
        vocab_holder: object whose `reverse_vocab` maps vocabulary index -> word.
        batch_size: number of words embedded per session call.

    Returns:
        float32 array of shape (len(vocab_holder.reverse_vocab), 1024).
    """
    elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)

    words = [""] * len(vocab_holder.reverse_vocab)
    for i, w in vocab_holder.reverse_vocab.items():
        words[i] = w

    # ELMo uses those 3
    words[customersupport.common.utils.START] = '<S>'
    words[customersupport.common.utils.PAD] = '</S>'
    words[customersupport.common.utils.UNK] = '<UNK>'

    # Build the lookup once; each batch holds `n` sequences of exactly one token.
    tokens_ph = tf.placeholder(tf.string, shape=[None, 1])
    lengths_ph = tf.placeholder(tf.int32, shape=[None])
    embeddings = elmo(
        inputs={
            "tokens": tokens_ph,
            "sequence_len": lengths_ph
        },
        signature="tokens",
        as_dict=True)["elmo"]

    chunks = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for start in tqdm(range(0, len(words), batch_size)):
            batch = np.array(words[start:start + batch_size], dtype=str).reshape(-1, 1)
            result = sess.run(embeddings, feed_dict={
                tokens_ph: batch,
                lengths_ph: np.ones(batch.shape[0], dtype=np.int32)
            })
            chunks.append(result.reshape(-1, 1024))

    if not chunks:
        # Empty vocabulary: keep the documented 2-D shape.
        return np.zeros((0, 1024), dtype=np.float32)
    return np.concatenate(chunks, axis=0)

# Replace the embedding matrix with the ELMo version; the model cell reads `emb_weights`.
emb_weights = export_elmo(voc_holder)
# Zero the padding row so that PAD positions contribute nothing when embeddings are summed.
emb_weights[customersupport.common.utils.PAD] = np.zeros(1024, dtype=np.float32)

# Spot-check one ordinary word against a fresh ELMo lookup.
validate_elmo(emb_weights, voc_holder, "hello")
# Second name for the ELMo matrix (same array object as `emb_weights`).
elmo = emb_weights

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.


HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver

In [11]:
from tensorflow.python.util import nest
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import clip_ops

def batch_dot(x, y, axes=None):
    """Copy from keras==2.0.6
    Batchwise dot product.
    `batch_dot` is used to compute dot product of `x` and `y` when
    `x` and `y` are data in batch, i.e. in a shape of
    `(batch_size, :)`.
    `batch_dot` results in a tensor or variable with less dimensions
    than the input. If the number of dimensions is reduced to 1,
    we use `expand_dims` to make sure that ndim is at least 2.
    # Arguments
        x: Keras tensor or variable with `ndim >= 2`.
        y: Keras tensor or variable with `ndim >= 2`.
        axes: list of (or single) int with target dimensions.
            The lengths of `axes[0]` and `axes[1]` should be the same.
            NOTE(review): when both inputs are 2-D, `axes` is indexed
            unconditionally, so `axes=None` raises a TypeError there.
    # Returns
        A tensor with shape equal to the concatenation of `x`'s shape
        (less the dimension that was summed over) and `y`'s shape
        (less the batch dimension and the dimension that was summed over).
        If the final rank is 1, we reshape it to `(batch_size, 1)`.
    """
    # A single int means "contract the same axis on both inputs".
    if isinstance(axes, int):
        axes = (axes, axes)
    x_ndim = ndim(x)
    y_ndim = ndim(y)
    # Pad the lower-rank input with trailing size-1 axes so both ranks match.
    if x_ndim > y_ndim:
        diff = x_ndim - y_ndim
        y = tf.reshape(y, tf.concat([tf.shape(y), [1] * (diff)], axis=0))
    elif y_ndim > x_ndim:
        diff = y_ndim - x_ndim
        x = tf.reshape(x, tf.concat([tf.shape(x), [1] * (diff)], axis=0))
    else:
        diff = 0
    if ndim(x) == 2 and ndim(y) == 2:
        # 2-D case: element-wise product followed by a sum over the chosen axis.
        if axes[0] == axes[1]:
            out = tf.reduce_sum(tf.multiply(x, y), axes[0])
        else:
            out = tf.reduce_sum(tf.multiply(tf.transpose(x, [1, 0]), y), axes[1])
    else:
        # Higher-rank case: batched matmul, with adjoint flags derived from `axes`.
        if axes is not None:
            adj_x = None if axes[0] == ndim(x) - 1 else True
            adj_y = True if axes[1] == ndim(y) - 1 else None
        else:
            adj_x = None
            adj_y = None
        out = tf.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y)
    if diff:
        # Remove the size-1 axes that were introduced by the rank padding above.
        if x_ndim > y_ndim:
            idx = x_ndim + y_ndim - 3
        else:
            idx = x_ndim - 1
        out = tf.squeeze(out, list(range(idx, idx + diff)))
    # Keep the result at least 2-D: (batch_size,) becomes (batch_size, 1).
    if ndim(out) == 1:
        out = tf.expand_dims(out, 1)
    return out

def ndim(x):
    """Copied from keras==2.0.6
    Returns the number of axes in a tensor, as an integer.

    Uses the public `TensorShape.ndims` property instead of reaching into the
    private `_dims` attribute.
    # Arguments
        x: Tensor or variable.
    # Returns
        Integer (scalar), number of axes, or None when the rank is unknown.
    # Examples
    ```python
        >>> from keras import backend as K
        >>> inputs = K.placeholder(shape=(2, 4, 5))
        >>> val = np.array([[1, 2], [3, 4]])
        >>> kvar = K.variable(value=val)
        >>> K.ndim(inputs)
        3
        >>> K.ndim(kvar)
        2
    ```
    """
    return x.get_shape().ndims

def dot(x, y):
    """Modified from keras==2.0.6
    Multiplies 2 tensors (and/or variables) and returns a *tensor*.
    When either operand has more than two axes, the Theano behaviour is
    reproduced (e.g. `(2, 3) * (4, 3, 5) -> (2, 4, 5)`): the last axis of `x`
    is contracted with the second-to-last axis of `y`.
    # Arguments
        x: Tensor or variable.
        y: Tensor or variable.
    # Returns
        A tensor, dot product of `x` and `y`.
    """
    def _static_or_dynamic_shape(tensor):
        # Use the statically known size of each axis where available and the
        # runtime size (a scalar tensor) where it is not.
        return tuple(
            dynamic if static is None else static
            for static, dynamic in zip(tensor.get_shape().as_list(),
                                       tf.unstack(tf.shape(tensor))))

    x_rank = ndim(x)
    if x_rank is not None and (x_rank > 2 or ndim(y) > 2):
        x_shape = _static_or_dynamic_shape(x)
        y_shape = _static_or_dynamic_shape(y)

        # Move y's second-to-last axis to the front so y can be flattened to 2-D.
        y_axes = list(range(ndim(y)))
        contracted_axis = y_axes.pop(-2)
        y_perm = [contracted_axis] + y_axes

        x_2d = tf.reshape(x, [-1, x_shape[-1]])
        y_2d = tf.reshape(tf.transpose(y, perm=y_perm), [y_shape[-2], -1])
        out_shape = x_shape[:-1] + y_shape[:-2] + y_shape[-1:]
        return tf.reshape(tf.matmul(x_2d, y_2d), out_shape)

    if isinstance(x, tf.SparseTensor):
        return tf.sparse_tensor_dense_matmul(x, y)
    return tf.matmul(x, y)
    
def initializer():
    """Return a new variance-scaling initializer: factor 1.0, FAN_AVG, uniform."""
    return tf.contrib.layers.variance_scaling_initializer(
        factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32)


def initializer_relu():
    """Return a new variance-scaling initializer: factor 2.0, FAN_IN, non-uniform."""
    return tf.contrib.layers.variance_scaling_initializer(
        factor=2.0, mode='FAN_IN', uniform=False, dtype=tf.float32)


# Shared L2 weight regularizer, attached to the trilinear attention variables.
regularizer = tf.contrib.layers.l2_regularizer(scale = 3e-7)

def optimized_trilinear_for_attention(args, c_maxlen, q_maxlen, input_keep_prob=1.0,
    scope='efficient_trilinear',
    bias_initializer=tf.zeros_initializer(),
    kernel_initializer=initializer()):
    """Memory-efficient trilinear similarity between two sequences.

    For every pair of positions (i, j) the score is
    `w0 . a_i + w1 . b_j + (a_i * w_mul) . b_j + bias`, computed as three
    separate terms so the full pairwise tensor of concatenated features is
    never materialised.

    Fix: the bias was previously passed to `tf.nn.bias_add` and the result
    discarded, so it never reached the returned scores. It is now added.

    Args:
        args: list of exactly two rank-3 tensors (batch, len, dim) sharing the
            same last dimension; `args[0]` has length `c_maxlen`, `args[1]`
            has length `q_maxlen`.
        c_maxlen: length of `args[0]`, used as a tile multiple.
        q_maxlen: length of `args[1]`, used as a tile multiple.
        input_keep_prob: keep probability for dropout applied to both inputs.
        scope: variable scope that holds the kernels and the bias.
        bias_initializer: initializer for the scalar bias.
        kernel_initializer: initializer for the three kernels.

    Returns:
        Tensor of shape (batch, c_maxlen, q_maxlen) with the similarity scores.

    Raises:
        ValueError: if either input is not rank 3 or the last dimensions differ.
    """
    assert len(args) == 2, "just use for computing attention with two input"
    arg0_shape = args[0].get_shape().as_list()
    arg1_shape = args[1].get_shape().as_list()
    if len(arg0_shape) != 3 or len(arg1_shape) != 3:
        raise ValueError("`args` must be 3 dims (batch_size, len, dimension)")
    if arg0_shape[2] != arg1_shape[2]:
        raise ValueError("the last dimension of `args` must equal")
    arg_size = arg0_shape[2]
    dtype = args[0].dtype
    droped_args = [tf.nn.dropout(arg, input_keep_prob) for arg in args]
    with tf.variable_scope(scope):
        weights4arg0 = tf.get_variable(
            "linear_kernel4arg0", [arg_size, 1],
            dtype=dtype,
            regularizer=regularizer,
            initializer=kernel_initializer)
        weights4arg1 = tf.get_variable(
            "linear_kernel4arg1", [arg_size, 1],
            dtype=dtype,
            regularizer=regularizer,
            initializer=kernel_initializer)
        weights4mul = tf.get_variable(
            "linear_kernel4mul", [1, 1, arg_size],
            dtype=dtype,
            regularizer=regularizer,
            initializer=kernel_initializer)
        biases = tf.get_variable(
            "linear_bias", [1],
            dtype=dtype,
            regularizer=regularizer,
            initializer=bias_initializer)
        # Term depending only on args[0], repeated across the q axis.
        subres0 = tf.tile(dot(droped_args[0], weights4arg0), [1, 1, q_maxlen])
        # Term depending only on args[1], repeated across the c axis.
        subres1 = tf.tile(tf.transpose(dot(droped_args[1], weights4arg1), perm=(0, 2, 1)), [1, c_maxlen, 1])
        # Multiplicative interaction term.
        subres2 = batch_dot(droped_args[0] * weights4mul, tf.transpose(droped_args[1], perm=(0, 2, 1)))
        # The shape-[1] bias broadcasts over every (batch, c, q) entry.
        res = subres0 + subres1 + subres2 + biases

    return res

def context2query(c: tf.Tensor, q: tf.Tensor, mask_c: tf.Tensor, mask_q: tf.Tensor, input_keep_prob: float):
    """Context-to-query / query-to-context attention over two encoded sequences.

    NOTE(review): `mask_matrix` is neither defined nor imported anywhere in the
    visible notebook, so calling this function raises NameError unless the name
    comes from hidden kernel state. The model cell builds its attention inline
    with `mask_logits` instead - confirm whether this helper is still needed.

    Args:
        c: context tensor; the trilinear helper requires rank 3.
        q: query tensor; the trilinear helper requires rank 3.
        mask_c: context mask, expanded on its last axis and tiled across the
            query length - presumably (batch, c_len), TODO confirm.
        mask_q: query mask handed to `mask_matrix`.
        input_keep_prob: dropout keep probability for the trilinear inputs.

    Returns:
        The list `[c, c2q, c * c2q, c * q2c, S_]`.
    """
    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        # Pairwise similarity scores between context and query positions.
        S = optimized_trilinear_for_attention([c, q], tf.shape(c)[1], tf.shape(q)[1], input_keep_prob)
        S = mask_matrix(mask_q, S, tf.shape(c)[1])

        # Softmax over the last (query) axis, then rows scaled by the context mask.
        S_ = tf.nn.softmax(S)
        S_ *= tf.tile(tf.expand_dims(mask_c, -1), [1, 1, tf.shape(q)[1]]) 
        # Softmax over axis 1 (context), transposed for the q2c product below.
        S_T = tf.transpose(tf.nn.softmax(S, axis = 1),(0,2,1))
        
        c2q = tf.matmul(S_, q)
        q2c = tf.matmul(tf.matmul(S_, S_T), c)
        attention_outputs = [c, c2q, c * c2q, c * q2c, S_]
        
        return attention_outputs
    
def linear(context: tf.Tensor, c_maxlen):
    """Collapse the sequence axis of `context` with one learned weight per position.

    A `[1, c_maxlen]` weight row is tiled over the batch and multiplied with
    `context`, giving a `[batch, 1, dim]` product whose middle axis is removed.

    Fix: only axis 1 is squeezed. The earlier bare `tf.squeeze` also dropped the
    batch axis for a batch of one (and the feature axis when it has size 1).

    Args:
        context: tensor of shape (batch, c_maxlen, dim).
        c_maxlen: static sequence length; sizes the weight row.

    Returns:
        Tensor of shape (batch, dim).
    """
    with tf.variable_scope("Linear"):
        wl = tf.get_variable("Wl", shape=[1, c_maxlen],
                             initializer=tf.contrib.layers.xavier_initializer())
        batch_size = tf.shape(context)[0]
        tiled_weights = tf.tile(tf.expand_dims(wl, 0), [batch_size, 1, 1])
        out = tf.matmul(tiled_weights, context)

        return tf.squeeze(out, axis=1)

In [12]:
def embed_tensor(inputs, pretrained_embs, name, trainable=False):
    """Look up `inputs` in a table made of trainable rows followed by pretrained rows.

    The trainable part has `hparams.tgt_vocab_size - len(voc_holder.glove_weights)`
    rows of width `hparams.embedding_size` and is always created with
    `trainable=True`.

    Args:
        inputs: integer id tensor to look up.
        pretrained_embs: pretrained embedding rows appended after the trainable rows.
        name: name of the trainable embedding variable.
        trainable: accepted for interface compatibility; not read by this function.

    Returns:
        Tuple `(looked_up_embeddings, full_embedding_table)`.
    """
    num_trainable_rows = hparams.tgt_vocab_size - len(voc_holder.glove_weights)
    train_embeddings = tf.get_variable(
        name=name,
        shape=[num_trainable_rows, hparams.embedding_size],
        initializer=tf.random_uniform_initializer(-0.04, 0.04),
        trainable=True)

    embeddings = tf.concat([train_embeddings, pretrained_embs], axis=0)
    looked_up = tf.nn.embedding_lookup(embeddings, inputs)
    return looked_up, embeddings

In [75]:
import math
def cosine_similarity(v1, v2, use_elmo=True, embeddings=None, vocab_size=None):
    """Cosine similarity between two word-index sequences.

    Two representations are supported:
      * `use_elmo=True`: each sequence becomes the mean embedding of its
        non-zero indices (index 0 is skipped).
      * `use_elmo=False`: each sequence becomes a binary bag-of-words vector of
        length `vocab_size`. Index 0 is NOT skipped in this mode, so two
        sequences that both contain index 0 share that "token".

    Fixes: the embedding path used to read an undefined global `elmo_weights`
    (NameError with the default arguments); it now uses `embeddings`, falling
    back to the notebook-level `elmo` matrix. A zero norm, or a sequence with
    no non-zero index, now yields 0.0, not NaN.

    Args:
        v1, v2: sequences of integer word indices.
        use_elmo: choose the embedding representation over the bag-of-words one.
        embeddings: matrix indexed by word index; defaults to the global `elmo`.
        vocab_size: bag-of-words vector length; defaults to
            `hparams.src_vocab_size`.

    Returns:
        The cosine similarity as a float; 0.0 when either vector has zero norm.
    """
    if use_elmo:
        if embeddings is None:
            embeddings = elmo  # notebook-level ELMo matrix

        def mean_embedding(indices):
            rows = [embeddings[idx] for idx in indices if idx != 0]
            return np.mean(np.array(rows), 0) if rows else None

        v1 = mean_embedding(v1)
        v2 = mean_embedding(v2)
        if v1 is None or v2 is None:
            return 0.0
    else:
        if vocab_size is None:
            vocab_size = hparams.src_vocab_size

        def bag_of_words(indices):
            bag = np.zeros(vocab_size)
            bag[indices] = 1.
            return bag

        v1 = bag_of_words(v1)
        v2 = bag_of_words(v2)

    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator
#     return -np.linalg.norm(v1-v2)

def next_batch(train_x, train_y, batch_size, is_train = False, negatives = True):
    """Sample a feed dict of (query, context, label) examples.

    Each sampled row contributes a positive pair (its own query and context,
    label 1). When `negatives` is true it is followed by a pair made of the same
    query and the context of another randomly drawn row, labelled 1 if the
    bag-of-words cosine similarity of the two contexts exceeds 0.7 and 0
    otherwise. Sampling stops once `batch_size` examples have been collected.

    Args:
        train_x: indexable collection of query rows.
        train_y: indexable collection of context rows, aligned with `train_x`.
        batch_size: number of examples in the returned feed dict.
        is_train: feed `hparams.dropout` when true, otherwise 0.0.
        negatives: whether to interleave the second, randomly paired example.

    Returns:
        Dict keyed by the placeholders `p`, `ctx`, `qry` and `dropout`.
    """
    labels, contexts, queries = [], [], []
    feed_dict = {
        p: labels,
        ctx: contexts,
        qry: queries,
        dropout: hparams.dropout if is_train else 0.0,
    }

    row_count = len(train_x)
    ids = np.random.choice(range(row_count), size=batch_size, replace=False)
    np.random.shuffle(ids)
    ids_targets = np.random.choice(range(row_count), size=batch_size, replace=False)
    np.random.shuffle(ids_targets)

    for pos_idx, other_idx in zip(ids, ids_targets):
        # Positive pair: the query with its own context.
        labels.append([1.])
        contexts.append(train_y[pos_idx])
        queries.append(train_x[pos_idx])
        if len(labels) == batch_size:
            break

        if negatives:
            # Same query paired with another row's context; the label comes
            # from how similar the two contexts are.
            sim = cosine_similarity(train_y[pos_idx], train_y[other_idx], False)
            labels.append([float(sim > .7)])
            contexts.append(train_y[other_idx])
            queries.append(train_x[pos_idx])
            if len(labels) == batch_size:
                break

    return feed_dict
# def next_batch(train_x, train_y, batch_size):
#     feed_dict = {}
    
#     ids = np.random.choice(range(len(train_x)), size=3 * batch_size, replace=False) 
#     ids = ids.reshape(3, batch_size)
#     target_ids = []
       
    

#     for i in range(3):
#         feed_dict[inputs[i]] = train_y[ids[i]]

#     for i in range(batch_size):
#         idx = np.random.randint(0, 3, 1)[0]
#         target_ids.append(ids[idx][i])
        
#     ys = train_y[target_ids]
#     feed_dict[inputs[3]] = train_x[target_ids, :-10]
        
#     ysc = [[cosine_similarity(ys[i], feed_dict[inputs[0]][i]), 
#         cosine_similarity(ys[i], feed_dict[inputs[1]][i]), 
#         cosine_similarity(ys[i], feed_dict[inputs[2]][i])] for i in range(batch_size)]
    
# #     ysc = [np.exp(x - np.max(x)) /  np.exp(x - np.max(x)).sum(axis=0) for x in ysc]

#     feed_dict[o_probs] = ysc
    
#     return feed_dict

cosine_similarity([5, 10, 1, 4, 3], [0, 5, 10, 3, 4], False)
# next_batch(train_x, train_y, 3)

0.7999999999999998

In [14]:
def parametric_relu(_x):
    """Parametric ReLU with one learned slope per entry of the last axis.

    Creates (or reuses, depending on the enclosing variable scope) a variable
    named 'alpha' sized by the last dimension of `_x`.

    Returns:
        `relu(_x)` plus `alpha` times the negative part of `_x`.
    """
    channel_dim = _x.get_shape()[-1]
    alphas = tf.get_variable('alpha', channel_dim,
                             initializer=initializer_relu(),
                             dtype=tf.float32)
    positive_part = tf.nn.relu(_x)
    # (_x - |_x|) / 2 equals _x where _x < 0 and 0 elsewhere.
    negative_part = alphas * (_x - abs(_x)) * 0.5

    return positive_part + negative_part

def gelu_fast(_x):
    """Tanh approximation of the GELU activation, applied element-wise."""
    inner = _x + 0.044715 * tf.pow(_x, 3)
    scale = tf.sqrt(2 / np.pi)
    return 0.5 * _x * (1 + tf.tanh(scale * inner))

In [88]:
from QANet.layers import residual_block, highway, conv, mask_logits, trilinear, \
total_params#, optimized_trilinear_for_attention

# Fixed sequence widths: the context uses decoder_length, the query uses encoder_length.
c_maxlen = hparams.decoder_length
q_maxlen = hparams.encoder_length

# Start from an empty graph so that re-running this cell does not duplicate variables.
tf.reset_default_graph()
# Dropout rate; defaults to 0.0 when no value is fed.
dropout = tf.placeholder_with_default(0.0, (), name="dropout")

# Target label per example, one float column.
p = tf.placeholder(dtype=tf.float32, shape=(None, 1), name="prob")

# Word-index inputs for the query and the candidate context.
qry = tf.placeholder(dtype=tf.int32, shape=(None, q_maxlen), name="query")
ctx = tf.placeholder(dtype=tf.int32, shape=(None, c_maxlen), name="elastic")

# True wherever the word index is non-zero (index 0 is treated as padding).
c_mask = tf.cast(ctx, tf.bool)
q_mask = tf.cast(qry, tf.bool)

# Number of non-padding positions in each sequence.
c_len = tf.reduce_sum(tf.cast(c_mask, tf.int32), axis=1)
q_len = tf.reduce_sum(tf.cast(q_mask, tf.int32), axis=1)

# Embed both inputs with the precomputed matrix and project them through a shared highway net.
with tf.variable_scope("Input_Embedding_Layer"):
#     pretrained_embs = tf.get_variable(
#         name="embs_pretrained",
#         initializer=tf.constant_initializer(
#            voc_holder.glove_weights, dtype=tf.float32),
#         shape=voc_holder.glove_weights.shape,
#         trainable=False)

    # `emb_weights` is whichever matrix was assigned last by the export cells above.
    # It is converted to a tensor, not a variable.
    emb_tensor = tf.convert_to_tensor(emb_weights)
#     c_emb, _  = embed_tensor(ctx, pretrained_embs, "embedding_ctx_glove")
#     q_emb, _  = embed_tensor(qry, pretrained_embs, "embedding_qry_glove")

#     c_emb = tf.nn.dropout(c_emb, 1.0 - dropout) 
#     q_emb = tf.nn.dropout(q_emb, 1.0 - dropout) 
    # tf.nn.dropout takes a keep probability, hence `1.0 - dropout`.
    c_emb_ = tf.nn.dropout(tf.nn.embedding_lookup(emb_tensor, ctx), 1.0 - dropout)
    q_emb_ = tf.nn.dropout(tf.nn.embedding_lookup(emb_tensor, qry), 1.0 - dropout)

    # Same "highway" scope for both calls; the second passes reuse=True to share the weights.
    c_emb = highway(c_emb_, size = hparams.d, scope = "highway", dropout = dropout, reuse = None)
    q_emb = highway(q_emb_, size = hparams.d, scope = "highway", dropout = dropout, reuse = True)
    
# Encode context and query with the same residual block (one block, four conv layers, kernel 7).
with tf.variable_scope("Embedding_Encoder_Layer"):
    c = residual_block(inputs=c_emb,
        num_blocks = 1,
        num_conv_layers = 4,
        kernel_size = 7,
        mask = c_mask,
        num_filters = hparams.d,
        num_heads = hparams.nh,
        seq_len = c_len,
        scope = "Encoder_Residual_Block",
        bias = False,
        dropout = dropout)
    
    q = residual_block(inputs=q_emb,
        num_blocks = 1,
        num_conv_layers = 4,
        kernel_size = 7,
        mask = q_mask,
        num_filters = hparams.d,
        num_heads = hparams.nh,
        seq_len = q_len,
        scope = "Encoder_Residual_Block",
        reuse = True, # Share the weights between passage and question
        bias = False,
        dropout = dropout)
    
# Bidirectional attention between the encoded context `c` and the encoded query `q`.
with tf.variable_scope("Context_to_Query_Attention_Layer"):
#     C = tf.tile(tf.expand_dims(c,2),[1,1,q_maxlen,1])
#     Q = tf.tile(tf.expand_dims(q,1),[1,c_maxlen,1,1])
#     S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - dropout)
    # Pairwise similarity scores between context and query positions.
    S = optimized_trilinear_for_attention([c, q], c_maxlen, q_maxlen, input_keep_prob = 1.0 - dropout)
    # Softmax over the query axis, with the query mask applied to the logits first.
    mask_q = tf.expand_dims(q_mask, 1)
    S_ = tf.nn.softmax(mask_logits(S, mask = mask_q))
    # Softmax over the context axis (axis 1), then transposed for the q2c product.
    mask_c = tf.expand_dims(c_mask, 2)
    S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), axis = 1),(0,2,1))
    c2q = tf.matmul(S_, q)
    q2c = tf.matmul(tf.matmul(S_, S_T), c)
    # Features concatenated along the last axis by the model encoder.
    attention_outputs = [c, c2q, c * c2q, c * q2c]

# Project the attention features to width d and run them through a weight-shared encoder stack.
with tf.variable_scope("Model_Encoder_Layer"):
    inputs = tf.concat(attention_outputs, axis = -1)
    enc = [conv(inputs, hparams.d, name = "input_projection")]
    # Two passes through the same "Model_Encoder" weights; pass i reads enc[i] and appends
    # enc[i + 1]. The output layer below reads enc[1] only.
    for i in range(2):
        if i % 2 == 0: # dropout every 2 blocks
            enc[i] = tf.nn.dropout(enc[i], 1.0 - dropout)
        enc.append(
            residual_block(enc[i],
                num_blocks = 7,
                num_conv_layers = 2,
                kernel_size = 5,
                mask = c_mask,
                num_filters = hparams.d,
                num_heads = hparams.nh,
                seq_len = c_len,
                scope = "Model_Encoder",
                bias = False,
                reuse = True if i > 0 else None,
                dropout = dropout)
            )
# Binary classifier head: a single logit per (query, context) pair, trained with sigmoid
# cross-entropy against the `p` labels.
with tf.variable_scope("Output_Layer"):
#     proj_logits = tf.squeeze(conv(tf.concat([enc[1], enc[2]],axis = -1),1, bias = False, name = "first_logits"),-1)
    # One value per context position from the first encoder pass (trailing size-1 axis removed).
    proj_logits = tf.squeeze(conv(enc[1], 1, bias = False, name = "first_logits"), -1)
    # Append hand-crafted features: a dot product of the max-|activation| vectors of query and
    # context, and the summed (pre-highway) embeddings of each.
    proj_logits = tf.concat([proj_logits, 
                             batch_dot(tf.reduce_max(tf.abs(q_emb), 1), tf.reduce_max(tf.abs(c_emb), 1), 1),
                             tf.reduce_sum(q_emb_, 1),
                             tf.reduce_sum(c_emb_, 1)
#                              tf.reduce_max(c_emb, 1),
#                              tf.reduce_max(q_emb, 1),
#                              tf.reduce_min(c_emb, 1),
#                              tf.reduce_min(q_emb, 1),
#                              tf.reshape(tf.reduce_sum(tf.to_float(tf.equal(ctx, 1)), 1), (-1, 1)),
#                              tf.reshape(tf.reduce_sum(tf.to_float(tf.equal(qry, 1)), 1), (-1, 1)),
#                              tf.reduce_max(tf.one_hot(ctx, MAX_VOCAB_SIZE, on_value=1.0, off_value=0.0, axis =-1), 1),
#                              tf.reduce_max(tf.one_hot(qry, MAX_VOCAB_SIZE, on_value=1.0, off_value=0.0, axis =-1), 1)
                            ], 1)
    
    proj_logits = tf.nn.dropout(proj_logits, 1.0 - dropout)
#     second_logits = tf.squeeze(conv(tf.concat([enc[1], enc[3]],axis = -1),1,
# bias = False, name = "second_logits"),-1)
#     third_logits = tf.squeeze(conv(tf.concat([enc[2], enc[3]],axis = -1),1, 
# bias = False, name = "third_logits"),-1)
#     proj_logits = tf.squeeze(conv(enc[1] ,1, bias = False, name = "first_logits"), -1)
    
#     proj_logits = tf.layers.dense(proj_logits, 512, kernel_initializer=initializer(), 
#                                   activation=gelu_fast, kernel_regularizer=regularizer)
#     proj_logits = tf.nn.dropout(proj_logits, 1.0 - dropout)
    
#     proj_logits = tf.layers.dense(proj_logits, 256, kernel_initializer=initializer(), 
#                                   activation=parametric_relu, kernel_regularizer=regularizer)
#     proj_logits = tf.nn.dropout(proj_logits, 1.0 - dropout)
    
#     proj_logits = tf.layers.dense(proj_logits, 128, kernel_initializer=initializer(), 
#                                   activation=parametric_relu, kernel_regularizer=regularizer)
#     proj_logits = tf.layers.dense(proj_logits, hparams.d // 2, kernel_initializer=initializer(), 
#                                   activation=parametric_relu)

#     proj_logits = tf.squeeze(conv(proj_logits, 1, bias = False, name = "proj_logits"), -1)
    
    # Single dense unit maps the feature vector to one logit.
    logits = tf.layers.dense(proj_logits, 1, kernel_initializer=initializer())#, 
#                              kernel_regularizer=regularizer)
    yp = tf.sigmoid(logits)
    # A prediction counts as correct when the rounded probability equals the label.
    correct_pred = tf.equal(tf.round(yp), p)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    
    losses = tf.nn.sigmoid_cross_entropy_with_logits(labels = p, logits = logits)
    loss = tf.reduce_mean(losses)
    
# Optional L2 penalty, enabled by setting hparams.l2_norm to any non-None value.
# The REGULARIZATION_LOSSES collection already holds the penalty tensors produced by the
# `regularizer=` arguments of tf.get_variable. The earlier code passed those tensors to
# apply_regularization, which regularized the penalties themselves and left the weights
# out. The collected penalties are summed instead; get_regularization_loss returns zero
# when the collection is empty.
if hparams.l2_norm is not None:
    l2_loss = tf.losses.get_regularization_loss()
    loss += l2_loss
    print("Adding l2")
    
# Counts optimiser steps; incremented by `train_op` and used to drive the learning-rate decay.
global_step = tf.Variable(0, name='global_step', trainable=False)

# Number of optimiser steps after which the learning rate is decayed once.
iters_per_epoch = int(np.ceil(train_x.shape[0] / hparams.batch_size))

# Optimization
# Staircase exponential decay. The factor now comes from hparams.decay_rate (0.99, the same
# value that was previously hard-coded here) so the hyper-parameter takes effect.
lr = tf.train.exponential_decay(
    hparams.learning_rate, global_step, iters_per_epoch, hparams.decay_rate, staircase=True)

optimizer = tf.train.AdamOptimizer(lr, beta1 = 0.8, beta2 = 0.999, epsilon = 1e-7)

# Compute gradients explicitly so they can be clipped by global norm before being applied.
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients,
                                              hparams.max_gradient_norm)

train_op = optimizer.apply_gradients(
    zip(clipped_gradients, params), global_step=global_step)

total_params()

Total number of trainable parameters: 1241161


In [89]:
# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Long-lived session shared by the training and evaluation cells; it is never closed here.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [90]:
# Scalar summaries logged for both the training and the held-out batches.
tf.summary.scalar('cross_entropy', loss)
tf.summary.scalar('accuracy', accuracy)

# Merge all the summaries and write them out under ./logs, in one `train<id>` and one
# `test<id>` directory, where <id> is the Unix timestamp taken below.
merged = tf.summary.merge_all()
now = str(int(datetime.now().timestamp()))
summaries_dir = './logs'
# Only the train writer is given the graph.
train_writer = tf.summary.FileWriter(summaries_dir + '/train' + now,
                                      sess.graph)
test_writer = tf.summary.FileWriter(summaries_dir + '/test' + now)
print('Model id #{}'.format(now))

Model id #1545087828


In [None]:
# Train
# Runs hparams.max_epochs epochs of `len(train_x) // batch_size` steps each. Every step
# logs a training summary; every 10th step also logs a summary on one sampled test batch.
# A KeyboardInterrupt stops training cleanly.
try:
    for epoch in range(hparams.max_epochs):
        print('\nStarting epoch = {}/{}'.format(epoch + 1, hparams.max_epochs))

        losss = 0.0
        t_start = time.time()

        iters_per_epoch = len(train_x) // hparams.batch_size
        for i in tqdm(range(iters_per_epoch)):
            # Reset per step so the error handler never reports an unbound name or a
            # batch left over from an earlier step.
            feed_dict = None
            try:
                step = sess.run(global_step)
                feed_dict = next_batch(train_x, train_y, hparams.batch_size, True)

                _, loss_value, summary = sess.run([train_op, loss, merged], feed_dict=feed_dict)
                losss += loss_value
                train_writer.add_summary(summary, step)

                if (i % 10 == 0):
                    feed_dict = next_batch(test_x, test_y, hparams.batch_size, False)
                    summary = sess.run(merged, feed_dict=feed_dict)
                    test_writer.add_summary(summary, step)

            except Exception as e:
                # Best effort: report the failing step and carry on with the next one.
                print(e)
                print(feed_dict)

        t_epoch_end = time.time() - t_start

        t_start = time.time()

        # Mean training loss over the epoch (failed steps contribute nothing to the sum).
        print(losss / iters_per_epoch)

        # No separate evaluation pass runs here, so this interval only covers the print above.
        t_test_end = time.time() - t_start

#         print(format_metrics(evaluation))
        print()
        print('Train elapsed {}, Test elapsed {}'.format(t_epoch_end, t_test_end))
        print('Updating lr = {}'.format(sess.run(lr)))

except KeyboardInterrupt:
    print("\nHalting training from keyboard interrupt.")


Starting epoch = 1/15


HBox(children=(IntProgress(value=0, max=712), HTML(value='')))

1.307510025045845

Train elapsed 147.24563431739807, Test elapsed 2.0265579223632812e-05
Updating lr = 0.0005000000237487257

Starting epoch = 2/15


HBox(children=(IntProgress(value=0, max=712), HTML(value='')))

0.8366588379643606

Train elapsed 131.7603759765625, Test elapsed 2.0742416381835938e-05
Updating lr = 0.0004949999856762588

Starting epoch = 3/15


HBox(children=(IntProgress(value=0, max=712), HTML(value='')))

0.701325458254707

Train elapsed 132.4077877998352, Test elapsed 8.749961853027344e-05
Updating lr = 0.0004900500061921775

Starting epoch = 4/15


HBox(children=(IntProgress(value=0, max=712), HTML(value='')))

In [None]:
n = next_batch(train_x, train_y, 1, negatives=False)

In [20]:
stop  

NameError: name 'stop' is not defined

In [82]:
# Tab-separated evaluation table; the loop below reads its 'Reference', 'Hypothesis' and
# 'Question' columns.
all_dict = pd.read_csv("/home/momchil/Desktop/elastic_top10_all_dict.tsv", sep='\t')

# Not the last expression of the cell, so this preview is not displayed.
all_dict.head()
# Parallel lists of word-index arrays, filled by the evaluation loop.
references = []
hypothesis = []

def softmax(x):
    """Softmax of `x` along axis 0.

    The global maximum of `x` is subtracted before exponentiating for numerical
    stability; normalisation is over axis 0, so each column of a 2-D input sums
    to one.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    return exponentials / exponentials.sum(axis=0)

def predict(question, candidates, sess, model):
    """Evaluate `model` for one question paired with every candidate.

    The question is repeated once per candidate so both inputs have the same
    batch length, and dropout is fed as 0.0. Reads the notebook-level
    placeholders `dropout`, `ctx` and `qry`.

    Returns whatever `sess.run(model, ...)` yields for the batch.
    """
    candidate_rows = list(candidates)
    repeated_question = [question] * len(candidates)
    feed_dict = {
        dropout: 0.0,
        ctx: candidate_rows,
        qry: repeated_question,
    }
    return sess.run(model, feed_dict)

# For every row: encode the reference, the question and the candidate answers, score
# the candidates with the model, sample one candidate from the softmax of the scores,
# and collect (reference, sampled hypothesis) pairs for the metrics computed below.
for idx in tqdm(range(all_dict.shape[0])):
    try:
        # `.loc` replaces the deprecated `.ix`; `idx` is a label of the default
        # integer index that `read_csv` produced, so the lookup is unchanged.
        r = voc_holder.to_word_idx(all_dict.loc[idx, 'Reference'], -1)

        # All candidates of a row are packed into one cell, separated by "@ @ @";
        # duplicates are dropped after encoding.
        c = [voc_holder.to_word_idx(cand, c_maxlen)
             for cand in all_dict.loc[idx, 'Hypothesis'].split("@ @ @")]
        c = np.unique(c, axis=0)
        q = voc_holder.to_word_idx(all_dict.loc[idx, 'Question'], q_maxlen)

        probs = predict(q[0:q_maxlen], c, sess, logits)
        # Greedy alternative: h = c[np.argmax(probs)]
        h = c[np.random.choice(range(len(probs)), 1, p=softmax(probs).T[0])[0]]

        # Keep only the non-zero token ids of each sequence.
        references.append(r[r.nonzero()])
        hypothesis.append(h[h.nonzero()])
    except Exception as e:
        # Best effort: a row that cannot be processed is skipped, but the reason is
        # shown, and KeyboardInterrupt is no longer swallowed as a bare `except:` did.
        print("Skipping", idx, repr(e))

references = np.array(references)
hypothesis = np.array(hypothesis)
eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)
evaluation = evaluate_words_index(references, hypothesis, eval_conf, hparams.evaluation_metrics, True)
print(format_metrics(evaluation))

HBox(children=(IntProgress(value=0, max=4044), HTML(value='')))

BLEU@2: 14.489494496198416
Embedding Average: 76.77852005419523
Greedy Matching: 29.799201824978688
ROUGE_L: 21.28191459864267
Vector Extrema: 38.87155947091704


In [None]:
# saver.restore(sess, "./models/1544353641/model.ckpt")
# saver.restore(sess, "./models/1544624781/model.ckpt")
# print("Model restored.")

In [None]:
# Re-score whatever `references` / `hypothesis` arrays are currently in the kernel.
# NOTE(review): depends on a previous cell having filled those arrays (hidden state).
eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)
evaluation = evaluate_words_index(references, hypothesis, eval_conf, hparams.evaluation_metrics, True)
print(format_metrics(evaluation))

In [None]:
def export_errors(x, y):
    """Run the model over `(x, y)` batch by batch and dump per-example results to a TSV.

    For each example the decoded question, decoded context, the model output `yp`
    and the per-example loss are written as the columns "Question", "Context",
    "Probability" and "Loss".

    Reads the notebook-level `hparams`, `sess`, `yp`, `losses`, `qry`, `ctx`,
    `voc_holder` and `next_batch`.
    NOTE(review): the output path is absolute and user-specific.
    """
    batch_size = hparams.batch_size
    n_examples = x.shape[0]
    # Ceiling division so the final, partial batch is counted in the progress total.
    n_batches = -(-n_examples // batch_size)

    q = []
    c = []
    p = []
    l = []

    # The context manager closes the progress bar even if a batch fails.
    with tqdm(total=n_batches) as pbar:
        for start in range(0, n_examples, batch_size):
            x_batch = x[start:start + batch_size]
            y_batch = y[start:start + batch_size]
            # The last batch may be shorter, so its real length is passed on.
            fd = next_batch(x_batch, y_batch, len(y_batch), False)

            pp, pl = sess.run([yp, losses], feed_dict=fd)

            for j in range(len(pp)):
                q.append(voc_holder.from_word_idx(fd[qry][j]))
                c.append(voc_holder.from_word_idx(fd[ctx][j]))
                p.append(pp[j])
                l.append(pl[j])
            pbar.update(1)

    df = pd.DataFrame(data=np.stack((q, c, np.array(p).reshape(-1), np.array(l).reshape(-1)), axis=1),
                      columns=["Question", "Context", "Probability", "Loss"])

    df.to_csv('/home/momchil/Desktop/errors_test_all_dict.tsv', sep='\t', encoding='utf-8')

export_errors(test_x, test_y)

In [None]:

# Save the session through `saver` (presumably a tf.train.Saver — defined outside this view).
# NOTE(review): the run id in the path is hard-coded, so every run writes to the same location.
save_path = saver.save(sess, "./models/1544624781/model.ckpt")
print("Model saved in path: %s" % save_path)


In [None]:
# Draw a batch of size 2 and inspect it: the `p` entry, then the decoded query/context
# text of both examples, and finally the model output for the batch.
feed_dict = next_batch(train_x, train_y, 2)
print(feed_dict[p])
print()
print(voc_holder.from_word_idx(feed_dict[qry][0]))
print(voc_holder.from_word_idx(feed_dict[ctx][0]))

print()
print(voc_holder.from_word_idx(feed_dict[qry][1]))
print(voc_holder.from_word_idx(feed_dict[ctx][1]))

# Last expression of the cell, so the notebook displays the evaluated `yp`.
sess.run(yp, feed_dict)
# sess.run(tf.nn.embedding_lookup(elmo_tensor, ctx), feed_dict)

In [None]:
def predict(question, candidates, sess, model):
    """Score every candidate against a single question.

    Builds one feed with dropout set to 0.0, the candidates as the `ctx` batch
    and the question duplicated to the same batch length as `qry`, then runs
    `model` in `sess`. Uses the notebook-level placeholders `dropout`, `ctx`, `qry`.
    """
    n_candidates = len(candidates)
    feed_dict = dict([
        (dropout, 0.0),
        (ctx, [candidate for candidate in candidates]),
        (qry, [question] * n_candidates),
    ])
    probs = sess.run(model, feed_dict)
    return probs

def predict_pandas(df, idx, do_predict = True, i = None):
    """Pick one candidate answer for row `idx` of `df` and encode it with its reference.

    Candidates are the values of every column of `df` whose name contains
    "Hypothesis".

    Args:
        df: frame with 'Question', 'Reference' and one or more hypothesis columns.
        idx: positional row index (used with `.iloc`).
        do_predict: when true, score the candidates with the model (`yp`) and sample
            one in proportion to its score; otherwise use `i`.
        i: candidate position to use when `do_predict` is false; a uniformly random
            position is drawn when it is None.

    Returns:
        `(hyp, ref, i)`: the encoded chosen hypothesis, the encoded reference, and
        the position of the chosen candidate.

    Reads the notebook-level `voc_holder`, `c_maxlen`, `q_maxlen`, `sess`, `yp`
    and `predict`.
    """
    row = df.iloc[idx]

    # Use the columns of the frame that was passed in, not the global `all_dict`.
    cols = [col for col in df.columns.tolist() if "Hypothesis" in col]
    c_raw = row[cols]
    candidates = [voc_holder.to_word_idx(x, c_maxlen) for x in c_raw]

    question = voc_holder.to_word_idx(row['Question'], q_maxlen)
    if do_predict:
        probs = predict(question, candidates, sess, yp)
        i = np.random.choice(range(len(candidates)), size=1, p=probs.T[0]/probs.sum())[0]
    elif i is None:
        # Draw among however many candidates exist (previously hard-coded to 3).
        i = np.random.randint(0, len(candidates))

    ref = voc_holder.to_word_idx(row['Reference'], -1)
    # `c_raw` is indexed by column name, so select by position explicitly.
    hyp = voc_holder.to_word_idx(c_raw.iloc[i], -1)
    return hyp, ref, i


def trimAllColumns(df):
    """
    Trim whitespace from ends of each value across all series in dataframe.

    Only values whose type is exactly `str` are stripped; everything else is
    returned unchanged. A new frame is returned and `df` is not modified.
    """
    def _strip_if_str(value):
        return value.strip() if type(value) is str else value

    # `DataFrame.applymap` was renamed to `DataFrame.map` in newer pandas; look the
    # method up on the class (not the instance, where a column called "map" could
    # shadow it) and fall back to `applymap` on older versions.
    elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
    return elementwise(df, _strip_if_str)

# Load the outputs of the three systems, keeping only the shared columns.
trans_all_dict = pd.read_csv("data/transformer_all_dict.tsv", sep='\t')[['Question', 'Reference', 'Hypothesis']]
es_all_dict = pd.read_csv("data/elastic_all_dict.tsv", sep='\t')[['Question', 'Reference', 'Hypothesis']]
s2s_all_dict = pd.read_csv("data/seq2seq_all_dict.tsv", sep='\t')[['Question', 'Reference', 'Hypothesis']]

# NOTE(review): only the elastic and transformer frames are trimmed; the seq2seq
# frame is used as loaded — confirm that is intended.
es_all_dict = trimAllColumns(es_all_dict)
trans_all_dict = trimAllColumns(trans_all_dict)

# This is an alias, not a copy: every change to `all_dict` below also changes `s2s_all_dict`.
all_dict = s2s_all_dict

# Column assignment aligns on the index, so the three files are combined row by row.
# NOTE(review): assumes the files list the same questions in the same order — confirm.
all_dict['Reference'] = es_all_dict['Reference']
all_dict['Hypothesis_es'] = es_all_dict['Hypothesis']
all_dict['Hypothesis_trans'] = trans_all_dict['Hypothesis']
# all_dict['Hypothesis_s2s'] = all_dict['Hypothesis']
all_dict.rename(columns={'Hypothesis' : 'Hypothesis_s2s'}, inplace=True)
# all_dict.drop(columns=['Hypothesis'], inplace=True)

print(all_dict.shape)
all_dict.head()

In [None]:
# Sample one system's answer per row with `predict_pandas`, count how often each
# candidate column is chosen, and score the sampled answers against the references.
references = []
hypothesis = []
# One counter per candidate column (any column whose name contains "Hypothesis"),
# matching how `predict_pandas` collects candidates, instead of a fixed three slots.
counts = [0] * sum("Hypothesis" in col for col in all_dict.columns)
for idx in tqdm(range(all_dict.shape[0])):
    try:
        h, r, i = predict_pandas(all_dict, idx, do_predict=True)
        # Random baseline: h, r, i = predict_pandas(all_dict, idx, do_predict=False)
        counts[i] += 1
        # Keep only the non-zero token ids of each sequence.
        references.append(r[r.nonzero()])
        hypothesis.append(h[h.nonzero()])
    except Exception as e:
        # Best effort: skip the row but show why, and let KeyboardInterrupt through
        # (the previous bare `except:` swallowed it).
        print("Skipping", idx, repr(e))

references = np.array(references)
hypothesis = np.array(hypothesis)
print(counts)
eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)
evaluation = evaluate_words_index(references, hypothesis, eval_conf, hparams.evaluation_metrics, True)
print(format_metrics(evaluation))

In [None]:
# Write the sampled hypotheses, decoded back to text, next to the merged columns.
# NOTE(review): `s2s_all_dict` is the same object as `all_dict` (aliased in the merge
# cell), so this adds a 'Hypothesis' column to `all_dict` as well; `predict_pandas`
# treats every column containing "Hypothesis" as a candidate, so re-running the
# sampling cell after this one would include this column too.
# NOTE(review): rows skipped during sampling would make `hypothesis` shorter than the frame.
df = s2s_all_dict
df['Hypothesis'] = list(map(voc_holder.from_word_idx, hypothesis))
df.to_csv('data/ensamble_all_dict.tsv', sep='\t', encoding='utf-8')

In [None]:
# Spot check that the three frames hold the same question at a given row label.
# `.loc` replaces the deprecated `.ix`; the frames keep the default integer index
# from `read_csv`, so the label lookup returns the same row.
idx = 4034
print(es_all_dict.loc[idx, 'Question'])
print(trans_all_dict.loc[idx, 'Question'])
print(s2s_all_dict.loc[idx, 'Question'])

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Show the first batch element of `c`, evaluated on the current `feed_dict`, as an image.
# NOTE(review): `c` has to be a graph tensor for `sess.run`; other cells rebind `c`
# to a NumPy array of candidates, so this depends on execution order — confirm.
plt.imshow(sess.run(c, feed_dict)[0]);
plt.colorbar()
plt.show()

In [None]:
# Heat map of `S_T` for the first example of the batch, transposed so that rows
# line up with context tokens and columns with query tokens.
a = sess.run(S_T, feed_dict)[0].T

fig, ax = plt.subplots(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
im = ax.imshow(a, cmap='hot', interpolation='nearest')

# We want to show all ticks...
ax.set_xticks(np.arange((a.shape[1])))
ax.set_yticks(np.arange((a.shape[0])))

# ... and label them with the tokens of the first query / context in the batch
ax.set_xticklabels([voc_holder.reverse_vocab[x] for x in feed_dict[qry][0]])
ax.set_yticklabels([voc_holder.reverse_vocab[x] for x in feed_dict[ctx][0]])

plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")

# The previous title was leftover text from a matplotlib gallery example.
ax.set_title("S_T attention weights: context tokens (rows) vs. query tokens (columns)")
fig.tight_layout()
plt.colorbar(im)  
plt.show()

In [None]:
sess.run(c_emb, feed_dict)[0][-1]

In [None]:
voc_holder.reverse_vocab

In [None]:
sum(map(int, sess.run(c_mask, feed_dict)[0]))
np.count_nonzero(feed_dict[ctx][0])

In [None]:
feed_dict[ctx][0].shape

In [None]:
# Print floats without scientific notation for the rest of the session.
np.set_printoptions(suppress=True)
feed_dict = next_batch(train_x, train_y, 10)

# Decode the first example of the batch, then print the per-example losses of the whole batch.
print(voc_holder.from_word_idx(feed_dict[qry][0]))
print()
print(voc_holder.from_word_idx(feed_dict[ctx][0]))

print(sess.run(losses, feed_dict))

# 1. / (1. + np.exp(-sess.run(logits, feed_dict=feed_dict)))

# Old code

In [None]:
# NOTE(review): `stop` is not defined anywhere visible, so it raises NameError when
# the cell runs; the `sess.close()` below is therefore never reached if the cell is
# executed as a whole. Presumably a "do not run past here" barrier — confirm.
stop
sess.close()

In [None]:
# Baseline scorer: look up pre-computed embeddings for a (context, query) pair,
# flatten and concatenate them, and feed a single dense unit that predicts the label `p`.
tf.reset_default_graph()
dropout = tf.placeholder_with_default(0.0, (), name="dropout")

p = tf.placeholder(dtype=tf.float32, shape=(None, 1), name="prob")

qry = tf.placeholder(dtype=tf.int32, shape=(None, q_maxlen), name="query")
ctx = tf.placeholder(dtype=tf.int32, shape=(None, c_maxlen), name="elastic")

with tf.variable_scope("Input_Embedding_Layer"):
#     pretrained_embs = tf.get_variable(
#         name="embs_pretrained",
#         initializer=tf.constant_initializer(
#            voc_holder.glove_weights, dtype=tf.float32),
#         shape=voc_holder.glove_weights.shape,
#         trainable=False)

    elmo_tensor = tf.convert_to_tensor(elmo_weights)
#     c_emb, _  = embed_tensor(ctx, pretrained_embs, "embedding_ctx_glove")
#     q_emb, _  = embed_tensor(qry, pretrained_embs, "embedding_qry_glove")

#     c_emb = tf.nn.dropout(c_emb, 1.0 - dropout) 
#     q_emb = tf.nn.dropout(q_emb, 1.0 - dropout) 
    # tf.nn.dropout takes a keep probability, hence `1.0 - dropout`.
    c_emb = tf.nn.dropout(tf.nn.embedding_lookup(elmo_tensor, ctx), 1.0 - dropout)
    q_emb = tf.nn.dropout(tf.nn.embedding_lookup(elmo_tensor, qry), 1.0 - dropout)

#     c_emb = tf.reduce_mean(c_emb, 1)
#     q_emb = tf.reduce_mean(q_emb, 1)

inputs = tf.concat([c_emb, q_emb], 1)
# NOTE(review): the flattened width assumes 1024-dimensional embeddings and
# c_maxlen + q_maxlen == 2 * hparams.encoder_length — confirm.
inputs = tf.reshape(inputs, (-1, 1024*hparams.encoder_length*2))
# The dense layer must emit raw logits: `yp` and the cross-entropy below both apply
# the sigmoid themselves. With `activation=tf.sigmoid` here the sigmoid was applied twice.
logits = tf.layers.dense(inputs, 1, activation=None, kernel_regularizer=regularizer)
yp = tf.sigmoid(logits)
correct_pred = tf.equal(tf.round(yp), p)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
global_step = tf.Variable(0, name='global_step', trainable=False)

# lr = tf.minimum(hparams.learning_rate, 0.001 / tf.log(999.) * tf.log(tf.cast(global_step, tf.float32) + 1))
optimizer = tf.train.GradientDescentOptimizer(.1)
losses = tf.nn.sigmoid_cross_entropy_with_logits(labels = p, logits = logits)
loss = tf.reduce_mean(losses)

# NOTE(review): REGULARIZATION_LOSSES already holds the regulariser outputs of the
# layers; applying `regularizer` to them again looks unintended — confirm.
variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
loss += l2_loss

# Clip by global norm before applying the gradients.
params = tf.trainable_variables()
gradients = tf.gradients(loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients,
                                              hparams.max_gradient_norm)

train_op = optimizer.apply_gradients(
    zip(clipped_gradients, params), global_step=global_step)

total_params()

In [None]:
# Count the scalar parameters held by all trainable variables and print the total
# with thousands separators.
total_parameters = 0
for variable in tf.trainable_variables():
    # The static shape iterates as tf.Dimension objects; `.value` is the integer size.
    shape = variable.get_shape()
    variable_parameters = 1
    for dim in shape:
        variable_parameters = variable_parameters * dim.value
    total_parameters = total_parameters + variable_parameters
print("{:,}".format(total_parameters))

In [None]:
import tensorflow as tf

class Model(object):
    def __init__(self, config, batch, word_mat=None, char_mat=None, trainable=True, opt=True, demo = False, graph = None):
        """Build the graph: inputs, masks, the forward pass and, optionally, the training op.

        NOTE(review): this constructor looks half-ported. The code that assigned
        `self.c`, `self.q`, `self.ch`, `self.qh`, `self.y1` and `self.y2` is
        commented out below, yet those attributes are read a few lines later, and
        the new `inputs` placeholder list is never used. As written, construction
        is expected to fail with AttributeError at the first read of `self.c`.

        Args:
            config: settings object; this method reads `batch_size`, `char_limit`,
                `para_limit`, `ques_limit`, `learning_rate` and `grad_clip`.
            batch: not used by the active code (its only use is commented out).
            word_mat: initial value of the `word_mat` variable (created with `trainable=False`).
            char_mat: initial value of the `char_mat` variable.
            trainable: when true, also create the learning-rate schedule, the Adam
                optimizer and `self.train_op`.
            opt: when true, slice inputs and masks down to the largest non-zero
                length in the batch; otherwise use `config.para_limit` /
                `config.ques_limit` as the lengths.
            demo: when true, the batch size used for slicing is 1 instead of
                `config.batch_size`.
            graph: graph to build into; a new `tf.Graph()` is created when None.
        """
        self.config = config
        self.demo = demo
        self.graph = graph if graph is not None else tf.Graph()
        with self.graph.as_default():

            self.global_step = tf.get_variable('global_step', shape=[], dtype=tf.int32,
                                               initializer=tf.constant_initializer(0), trainable=False)
            self.dropout = tf.placeholder_with_default(0.0, (), name="dropout")
#             if self.demo:
#                 self.c = tf.placeholder(tf.int32, [None, config.test_para_limit],"context")
#                 self.q = tf.placeholder(tf.int32, [None, config.test_ques_limit],"question")
#                 self.ch = tf.placeholder(tf.int32, [None, config.test_para_limit, config.char_limit],"context_char")
#                 self.qh = tf.placeholder(tf.int32, [None, config.test_ques_limit, config.char_limit],"question_char")
#                 self.y1 = tf.placeholder(tf.int32, [None, config.test_para_limit],"answer_index1")
#                 self.y2 = tf.placeholder(tf.int32, [None, config.test_para_limit],"answer_index2")
#             else:
#                 self.c, self.q, self.ch, self.qh, self.y1, self.y2, self.qa_id = batch.get_next()

            # Local placeholder list; nothing below reads it (see the NOTE in the docstring).
            inputs = [tf.placeholder(dtype=tf.int32, shape=(None, None), name="elastic"),
                 tf.placeholder(dtype=tf.int32, shape=(None, None), name="seq2seq"),
                 tf.placeholder(dtype=tf.int32, shape=(None, None), name="transformer"),
                 tf.placeholder(dtype=tf.int32, shape=(None, None), name="query")]

            # self.word_unk = tf.get_variable("word_unk", shape = [config.glove_dim], initializer=initializer())
            self.word_mat = tf.get_variable("word_mat", initializer=tf.constant(
                word_mat, dtype=tf.float32), trainable=False)
            self.char_mat = tf.get_variable(
                "char_mat", initializer=tf.constant(char_mat, dtype=tf.float32))

            # Masks are true wherever the id is non-zero; lengths count those positions per row.
            self.c_mask = tf.cast(self.c, tf.bool)
            self.q_mask = tf.cast(self.q, tf.bool)
            self.c_len = tf.reduce_sum(tf.cast(self.c_mask, tf.int32), axis=1)
            self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1)

            if opt:
                # Trim every input to the largest length found in this batch.
                N, CL = config.batch_size if not self.demo else 1, config.char_limit
                self.c_maxlen = tf.reduce_max(self.c_len)
                self.q_maxlen = tf.reduce_max(self.q_len)
                self.c = tf.slice(self.c, [0, 0], [N, self.c_maxlen])
                self.q = tf.slice(self.q, [0, 0], [N, self.q_maxlen])
                self.c_mask = tf.slice(self.c_mask, [0, 0], [N, self.c_maxlen])
                self.q_mask = tf.slice(self.q_mask, [0, 0], [N, self.q_maxlen])
                self.ch = tf.slice(self.ch, [0, 0, 0], [N, self.c_maxlen, CL])
                self.qh = tf.slice(self.qh, [0, 0, 0], [N, self.q_maxlen, CL])
                self.y1 = tf.slice(self.y1, [0, 0], [N, self.c_maxlen])
                self.y2 = tf.slice(self.y2, [0, 0], [N, self.c_maxlen])
            else:
                self.c_maxlen, self.q_maxlen = config.para_limit, config.ques_limit

            # Number of non-zero character ids per token, flattened to one dimension.
            self.ch_len = tf.reshape(tf.reduce_sum(
                tf.cast(tf.cast(self.ch, tf.bool), tf.int32), axis=2), [-1])
            self.qh_len = tf.reshape(tf.reduce_sum(
                tf.cast(tf.cast(self.qh, tf.bool), tf.int32), axis=2), [-1])

            self.forward()
            total_params()

            if trainable:
                # Learning rate grows with log(global_step + 1) and is capped at config.learning_rate.
                self.lr = tf.minimum(config.learning_rate, 0.001 / tf.log(999.) * tf.log(tf.cast(self.global_step, tf.float32) + 1))
                self.opt = tf.train.AdamOptimizer(learning_rate = self.lr, beta1 = 0.8, beta2 = 0.999, epsilon = 1e-7)
                grads = self.opt.compute_gradients(self.loss)
                gradients, variables = zip(*grads)
                # Clip by global norm before applying the update.
                capped_grads, _ = tf.clip_by_global_norm(
                    gradients, config.grad_clip)
                self.train_op = self.opt.apply_gradients(
                    zip(capped_grads, variables), global_step=self.global_step)

    def forward(self):
        """Assemble the forward graph and the loss from the tensors prepared in `__init__`.

        Stages, each in its own variable scope: input embedding (character
        convolution, word lookup, highway), embedding encoder, context-to-query
        attention, model encoder (three passes sharing one set of weights) and the
        output layer, which sets `self.logits`, `self.yp1`, `self.yp2` and `self.loss`.

        Relies on names defined outside this method: the helpers `conv`, `highway`,
        `residual_block`, `optimized_trilinear_for_attention` and `mask_logits`,
        and the module-level `regularizer`.
        """
        config = self.config
        N = config.batch_size if not self.demo else 1
        PL, QL = self.c_maxlen, self.q_maxlen
        CL, d, dc, nh = config.char_limit, config.hidden, config.char_dim, config.num_heads

        with tf.variable_scope("Input_Embedding_Layer"):
            ch_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.ch), [N * PL, CL, dc])
            qh_emb = tf.reshape(tf.nn.embedding_lookup(
                self.char_mat, self.qh), [N * QL, CL, dc])
            # Character embeddings get half the dropout of the word embeddings.
            ch_emb = tf.nn.dropout(ch_emb, 1.0 - 0.5 * self.dropout)
            qh_emb = tf.nn.dropout(qh_emb, 1.0 - 0.5 * self.dropout)

            # Bidaf style conv-highway encoder; the question reuses the context's conv weights.
            ch_emb = conv(ch_emb, d,
                bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = None)
            qh_emb = conv(qh_emb, d,
                bias = True, activation = tf.nn.relu, kernel_size = 5, name = "char_conv", reuse = True)

            # Max over the character axis gives one vector per token.
            ch_emb = tf.reduce_max(ch_emb, axis = 1)
            qh_emb = tf.reduce_max(qh_emb, axis = 1)

            ch_emb = tf.reshape(ch_emb, [N, PL, ch_emb.shape[-1]])
            qh_emb = tf.reshape(qh_emb, [N, QL, ch_emb.shape[-1]])

            c_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.c), 1.0 - self.dropout)
            q_emb = tf.nn.dropout(tf.nn.embedding_lookup(self.word_mat, self.q), 1.0 - self.dropout)

            c_emb = tf.concat([c_emb, ch_emb], axis=2)
            q_emb = tf.concat([q_emb, qh_emb], axis=2)

            c_emb = highway(c_emb, size = d, scope = "highway", dropout = self.dropout, reuse = None)
            q_emb = highway(q_emb, size = d, scope = "highway", dropout = self.dropout, reuse = True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(c_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.c_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.c_len,
                scope = "Encoder_Residual_Block",
                bias = False,
                dropout = self.dropout)
            q = residual_block(q_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = self.q_mask,
                num_filters = d,
                num_heads = nh,
                seq_len = self.q_len,
                scope = "Encoder_Residual_Block",
                reuse = True, # Share the weights between passage and question
                bias = False,
                dropout = self.dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            # C = tf.tile(tf.expand_dims(c,2),[1,1,self.q_maxlen,1])
            # Q = tf.tile(tf.expand_dims(q,1),[1,self.c_maxlen,1,1])
            # S = trilinear([C, Q, C*Q], input_keep_prob = 1.0 - self.dropout)
            S = optimized_trilinear_for_attention([c, q], self.c_maxlen, self.q_maxlen, input_keep_prob = 1.0 - self.dropout)
            # Use this model's own masks: the bare names `q_mask` / `c_mask` are not
            # defined in this method and would resolve to whatever global exists.
            mask_q = tf.expand_dims(self.q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask = mask_q))
            mask_c = tf.expand_dims(self.c_mask, 2)
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), dim = 1),(0,2,1))
            c2q = tf.matmul(S_, q)
            q2c = tf.matmul(tf.matmul(S_, S_T), c)
            # `q2c` is the local computed just above; `self.q2c` is never assigned.
            attention_outputs = [c, c2q, c * c2q, c * q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis = -1)
            self.enc = [conv(inputs, d, name = "input_projection")]
            for i in range(3):
                if i % 2 == 0: # dropout every 2 blocks
                    self.enc[i] = tf.nn.dropout(self.enc[i], 1.0 - self.dropout)
                self.enc.append(
                    residual_block(self.enc[i],
                        num_blocks = 7,
                        num_conv_layers = 2,
                        kernel_size = 5,
                        mask = self.c_mask,
                        num_filters = d,
                        num_heads = nh,
                        seq_len = self.c_len,
                        scope = "Model_Encoder",
                        bias = False,
                        reuse = True if i > 0 else None,
                        dropout = self.dropout)
                    )

        with tf.variable_scope("Output_Layer"):
            start_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[2]],axis = -1),1, bias = False, name = "start_pointer"),-1)
            end_logits = tf.squeeze(conv(tf.concat([self.enc[1], self.enc[3]],axis = -1),1, bias = False, name = "end_pointer"), -1)
            self.logits = [mask_logits(start_logits, mask = self.c_mask),
                           mask_logits(end_logits, mask = self.c_mask)]

            logits1, logits2 = self.logits

            # Outer product of start and end distributions, limited to spans with
            # start <= end and at most `config.ans_limit` positions apart.
            outer = tf.matmul(tf.expand_dims(tf.nn.softmax(logits1), axis=2),
                              tf.expand_dims(tf.nn.softmax(logits2), axis=1))
            outer = tf.matrix_band_part(outer, 0, config.ans_limit)
            self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1)
            self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1)
            losses = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits1, labels=self.y1)
            losses2 = tf.nn.softmax_cross_entropy_with_logits(
                logits=logits2, labels=self.y2)
            self.loss = tf.reduce_mean(losses + losses2)

        if config.l2_norm is not None:
            variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
            self.loss += l2_loss

        if config.decay is not None:
            self.var_ema = tf.train.ExponentialMovingAverage(config.decay)
            ema_op = self.var_ema.apply(tf.trainable_variables())
            # Evaluating the loss also triggers the moving-average update.
            with tf.control_dependencies([ema_op]):
                self.loss = tf.identity(self.loss)

                # Ops that copy each moving average back into its variable.
                self.assign_vars = []
                for var in tf.global_variables():
                    v = self.var_ema.average(var)
                    # `average` returns None for untracked variables; test for that
                    # explicitly rather than relying on the truth value of a variable.
                    if v is not None:
                        self.assign_vars.append(tf.assign(var,v))

    def get_loss(self):
        return self.loss

    def get_global_step(self):
        return self.global_step

In [None]:
# Ensemble selector graph: encode the three candidate answers and the query with a
# shared encoder, attend each candidate to the query, summarise each with an LSTM,
# and produce one logit per candidate trained against `o_probs`.
tf.reset_default_graph()

# The padding id maps to an all-zero embedding.
elmo_weights[customersupport.common.utils.PAD] = np.zeros(1024, dtype=np.float32)

elmo_tensor = tf.convert_to_tensor(elmo_weights)

# Three candidate inputs followed by the query; the query must stay last because it
# is popped off the encoder list below.
inputs = [tf.placeholder(dtype=tf.int32, shape=(None, None), name="elastic"),
         tf.placeholder(dtype=tf.int32, shape=(None, None), name="seq2seq"),
         tf.placeholder(dtype=tf.int32, shape=(None, None), name="transformer"),
         tf.placeholder(dtype=tf.int32, shape=(None, None), name="query")]
batch_size = tf.shape(inputs[0])[0]

input_keep_prob = tf.constant(.8)

outputs = tf.placeholder(dtype=tf.float32, shape=(None, 1024), name="outputs")
o_probs = tf.placeholder(dtype=tf.float32, shape=(None, 3), name="o_probs")

encoders = []
masks = []
for i, inp in enumerate(inputs):
    # AUTO_REUSE: all four inputs share one set of encoder weights.
    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        embedded_word_ids = tf.nn.embedding_lookup(elmo_tensor, inp)
        # 1.0 where the token id is positive, 0.0 at padding.
        mask = tf.to_float(tf.sign(inputs[i]))
        # `build_encoder` expects a dropout *rate* (it forwards the value to
        # tf.layers.dropout(rate=...)), so convert the keep probability; passing
        # the keep probability directly dropped 80% of the activations.
        e = build_encoder(embedded_word_ids, mask, 1, 1.0 - input_keep_prob, 1024, 128, 1, 32, 32, True)
        encoders.append(e)
        masks.append(mask)

ctx = []
query = encoders.pop()
mask_q = masks[-1]
for i, enc in enumerate(encoders):
    with tf.variable_scope("model", reuse=(i != 0)):
        mask_c = masks[i]
        c2q = context2query(enc, query, mask_c, mask_q, input_keep_prob)
#         cl = linear(c2q, 60)
       
        lstm = tf.contrib.rnn.LSTMCell(512)
        initial_state = lstm.zero_state(batch_size, tf.float32)
        lstm_outputs, final_state = tf.nn.dynamic_rnn(lstm, c2q[2], initial_state=initial_state)
        
        # Output at the last time step summarises the candidate.
        ctx.append(lstm_outputs[:, -1])


output = tf.reshape(tf.stack(ctx, 1), (-1, len(ctx), 512))
# Squeeze only the trailing unit dimension; an unqualified squeeze would also drop
# the batch dimension for a batch of one.
logits = tf.squeeze(tf.layers.dense(output, 1, use_bias=False), axis=-1)
# print(output.get_shape())
# logits = tf.nn.softmax(logits)
# logits = tf.expand_dims(logits, 2)

# y_pred = tf.squeeze(tf.matmul(logits, output, transpose_a=True))

# y_pred_norm = tf.sqrt(tf.reduce_sum(tf.square(y_pred), axis=1, keepdims=True) + 1e-8)
# outputs_norm = tf.sqrt(tf.reduce_sum(tf.square(outputs), axis=1, keepdims=True) + 1e-8)

# train_loss = tf.reduce_mean(tf.squared_difference(y_pred, outputs)) 
# train_loss += tf.norm(y_pred - outputs, ord='euclidean')
# train_loss += 0.01 * tf.losses.cosine_distance(y_pred_norm, outputs_norm, axis=1)
y_smooted = o_probs#label_smoothing(o_probs)

crossent = tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=y_smooted, logits=logits)
train_loss = tf.reduce_mean(crossent)

# Train
global_step = tf.Variable(0, name='global_step', trainable=False)

# Calculate and clip gradients
params = tf.trainable_variables()
gradients = tf.gradients(train_loss, params)
clipped_gradients, _ = tf.clip_by_global_norm(gradients,
                                              hparams.max_gradient_norm)

# Optimization
# NOTE(review): this schedule is built but not passed to the optimizer below, which
# therefore runs with Adam's default learning rate — confirm whether that is intended.
learning_rate = tf.train.exponential_decay(
    hparams.learning_rate, global_step, iters_per_epoch, .99, staircase=True)
# learning_rate = tf.Variable(0.001, dtype=np.float32)

optimizer = tf.train.AdamOptimizer()
#optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.apply_gradients(
    zip(clipped_gradients, params), global_step=global_step)

In [None]:
# Evaluate on the test split and print the formatted metrics.
# NOTE(review): `evaluate` and `eval_conf_greedy` are not defined in the visible part
# of the notebook; this cell depends on earlier state — confirm.
evaluation = evaluate(test_x, test_y, eval_conf_greedy, hparams.evaluation_metrics, True)
print(format_metrics(evaluation))

In [None]:
from collections import Counter
# cnt = Counter()

# Run 2000 training steps on batches of 64 and print the loss every 10th step.
for i in tqdm(range(2000)):
    feed_dict = next_batch(train_x, train_y, 64)
    # o1[0] is the loss value; o1[1] is the result of running the train op.
    o1 = sess.run([train_loss, train_op], feed_dict=feed_dict)
    if i % 10 == 0:
        print(o1[0], end='..')

print()

In [None]:
feed_dict = next_batch(train_x, train_y, 2)
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15,15]

# Token ids of the first example: `inputs[2]` is the "transformer" candidate placeholder
# and `inputs[3]` the query placeholder.
ctxt = feed_dict[inputs[2]][0]
qtxt = feed_dict[inputs[3]][0]

fig, ax = plt.subplots()
# `c2q` is whatever the last iteration of the graph-building loop left bound;
# element [4] of its result is plotted for the first example.
# NOTE(review): assumes element [4] is the attention matrix — confirm against context2query.
im = ax.imshow(sess.run(c2q, feed_dict=feed_dict)[4][0], cmap='binary', interpolation='nearest')

# We want to show all ticks...
ax.set_xticks(np.arange(len(qtxt)))
ax.set_yticks(np.arange(len(ctxt)))
# ... and label them with the respective list entries
ax.set_xticklabels([voc_holder.reverse_vocab[i] for i in qtxt])
ax.set_yticklabels([voc_holder.reverse_vocab[i] for i in ctxt])
plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
plt.show()
# print(feed_dict)

In [None]:
def export(x, y, eval_conf):
    """Decode every example of `(x, y)` and collect text and per-example loss.

    Args:
        x, y: inputs and targets, batched through `next_batch`.
        eval_conf: evaluation settings; this function reads `batch_size`,
            `beam_width`, `seq_func` and `voc_holder` from it.

    Returns:
        A 4-tuple of NumPy arrays `(questions, references, hypothesis, losses)`,
        one entry per example; the first three hold decoded text.

    Reads the notebook-level `sess`, `crossent`, `target_weights`,
    `beam_width_tensor`, `target_labels`, `encoder_inputs`, `voc_holder`,
    `next_batch` and `get_batch_id_chunks`.
    """
    verbose = True
    references = []
    hypothesis = []
    losses = []
    questions = []

    batch_size = eval_conf.batch_size
    beam_width = eval_conf.beam_width

    batch_ids = get_batch_id_chunks(len(x), batch_size)

    # Build the loss op once. Creating it inside the loop added a new node to the
    # graph on every batch.
    per_example_loss = tf.reduce_mean(crossent * target_weights, 1)

    gen = range(len(batch_ids))
    if verbose:
        # The progress bar replaces the per-batch counter print.
        gen = tqdm(gen)

    for i in gen:
        feed_dict = next_batch(x, y, False,
            rand_idx=batch_ids[i], 
            weights=None,
            beam_width=beam_width)

        responses = sess.run(eval_conf.seq_func, feed_dict=feed_dict)[0]
        # The loss is evaluated with the beam width fed as 1.
        feed_dict[beam_width_tensor] = 1
        loss = sess.run(per_example_loss, feed_dict=feed_dict)

        for (h, r, q, l) in zip(responses, feed_dict.get(target_labels), feed_dict.get(encoder_inputs), loss):
            references.append(voc_holder.from_word_idx(strip_punkt(r, eval_conf.voc_holder.reverse_vocab)))
            hypothesis.append(voc_holder.from_word_idx(strip_punkt(h, eval_conf.voc_holder.reverse_vocab)))
            questions.append(voc_holder.from_word_idx(q))
            losses.append(l)

    return (np.array(questions), np.array(references), np.array(hypothesis), np.array(losses))

# Export the test-set decodes: stack the four arrays as columns and write a TSV.
(q, r, h, l) = export(test_x, test_y, eval_conf)
df = pd.DataFrame(data = np.vstack((q, r, h, l)).T, 
                 columns = ["Question", "Reference", "Hypothesis", "Loss" ])
# Not the last expression of the cell, so this preview is computed and then discarded.
df.head()
# NOTE(review): absolute, user-specific output path.
df.to_csv('/home/momchil/Desktop/seq2seq_all_dict.tsv', sep='\t', encoding='utf-8')

In [None]:
import numpy as np
import tensorflow as tf

MIN_INT = (-2 ** 32 + 1)


def pos_embeddings(d_model: int, timesteps: int) -> tf.Tensor:
    """Build a fixed sinusoidal position table of shape (timesteps, d_model).

    Entry (pos, i) is sin(pos / 1000 ** (2 * i / d_model)) for even `i` and the
    cosine of the same angle for odd `i`. The table is float32 and is returned
    as a constant tensor.

    NOTE(review): the usual transformer formulation uses base 10000 and one shared
    exponent per sin/cos pair; this function uses 1000 and a per-index exponent —
    confirm that is intended.
    """
    table = np.zeros(shape=(timesteps, d_model), dtype=np.float32)
    # Each `row` is a view into `table`, so writes land in the table itself.
    for position, row in enumerate(table):
        for dim in range(d_model):
            angle = position / (1000 ** (2 * dim / d_model))
            if dim % 2 == 0:
                row[dim] = np.sin(angle)
            else:
                row[dim] = np.cos(angle)

    return tf.convert_to_tensor(table)


def embedding_layer(inputs: tf.Tensor, vocabulary_size: int, embedding_size: int, dropout_rate: float,
                    is_training: bool, scope: str) -> tf.Tensor:
    """Look up word embeddings for `inputs` and apply dropout.

    Creates (or reuses, depending on the enclosing scope) a
    `[vocabulary_size, embedding_size]` variable named "word_embeddings" under
    `scope`, defaulting to the scope name "Embeddings". Dropout with rate
    `dropout_rate` is active only when `is_training` is true.
    """
    with tf.variable_scope(scope, "Embeddings"):
        table = tf.get_variable("word_embeddings",
                                [vocabulary_size, embedding_size])
        looked_up = tf.nn.embedding_lookup(table, inputs)
        training_flag = tf.convert_to_tensor(is_training)
        return tf.layers.dropout(looked_up, rate=dropout_rate, training=training_flag)


def mask_matrix(masks: tf.Tensor, outputs: tf.Tensor, dims: int) -> tf.Tensor:
    """Replace the entries of `outputs` whose mask value is 0 with `MIN_INT`.

    `masks` gets a new axis 1 that is tiled `dims` times, so one mask row is
    applied to every row of `outputs`; the large negative fill makes those
    positions vanish under a later softmax.
    """
    tiled_masks = tf.tile(tf.expand_dims(masks, 1), [1, dims, 1])
    fill = tf.ones_like(outputs) * MIN_INT
    is_masked = tf.equal(tiled_masks, 0)
    return tf.where(is_masked, fill, outputs)


def multihead_attention(queries: tf.Tensor, keys: tf.Tensor, masks: tf.Tensor, dropout_rate: float, h: int, d_v: int,
                        d_k: int, causality: bool, is_training: bool, scope: str = None) -> tf.Tensor:
    """Compute one scaled dot-product attention head.

    Args:
        queries: tensor attended from; rows that sum to zero over the last axis
            are treated as padding and produce zero attention weights.
        keys: tensor attended to; also the source of the values.
        masks: key mask; positions equal to 0 are excluded from the softmax.
        dropout_rate: dropout rate applied to the attention weights while training.
        h: total number of heads; not used inside a single head.
        d_v: width of the value projection.
        d_k: width of the query/key projections, also used for score scaling.
        causality: when true, a query position may only attend to key positions
            at or before it.
        is_training: enables dropout.
        scope: variable scope name; defaults to "multi-head-attn".

    Returns:
        The attention-weighted values, last dimension `d_v`.
    """
    with tf.variable_scope(scope, "multi-head-attn"):
        q = tf.layers.dense(queries, d_k, use_bias=False)
        k = tf.layers.dense(keys, d_k, use_bias=False)
        # Values are projected to d_v (previously d_k, which left d_v unused).
        v = tf.layers.dense(keys, d_v, use_bias=False)

        outputs = tf.matmul(q, k, transpose_b=True) / np.sqrt(d_k)

        # Key Masking
#         masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))
        logits = mask_matrix(masks, outputs, tf.shape(queries)[1])
        if causality:
            # Take the lower triangle of a matrix of ones so the test does not depend
            # on score values (a score that is exactly 0 must stay visible), and mask
            # the already key-masked `logits`, so the key mask is not discarded.
            lower_triangle = tf.matrix_band_part(tf.ones_like(outputs), -1, 0)
            paddings = tf.ones_like(outputs) * MIN_INT
            logits = tf.where(tf.equal(lower_triangle, 0), paddings, logits)

        att_w = tf.nn.softmax(logits)

        # Query masking: zero the weights of query rows that are all zeros.
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])
        att_w *= query_masks

        att_w = tf.layers.dropout(att_w, rate=dropout_rate, training=tf.convert_to_tensor(is_training))

        values = tf.matmul(att_w, v)

        return values

def multihead_layer(query: tf.Tensor, keys: tf.Tensor, masks: tf.Tensor, dropout_rate: float, h: int, d_model: int, d_k: int, d_v: int,
                    causality: bool = False, is_training: bool = False) -> tf.Tensor:
    """Multi-head attention sub-layer with residual connection and layer norm.

    Runs `h` attention heads (scopes "multi-head-attn0", "multi-head-attn1", ...),
    concatenates them on the last axis, projects to `d_model`, adds `query`,
    applies layer normalisation and then dropout.

    Args:
        query: tensor attended from; also the residual input.
        keys: tensor attended to.
        masks: key mask handed to every head.
        dropout_rate: dropout rate used in the heads and on the output.
        h: number of heads.
        d_model: output width of the projection.
        d_k: query/key projection width per head.
        d_v: value projection width per head.
        causality: forwarded to every head.
        is_training: enables dropout.
    """
    # `multihead_attention` declares `d_v` before `d_k`; pass both by keyword so the
    # two sizes cannot be swapped by position.
    heads = [multihead_attention(query, keys, masks, dropout_rate, h,
                                 d_v=d_v, d_k=d_k,
                                 causality=causality, is_training=is_training,
                                 scope="multi-head-attn" + str(i))
             for i in range(h)]
    heads = tf.concat(heads, axis=-1)

    mha = tf.layers.dense(heads, d_model, use_bias=False)
    norm_mha = tf.contrib.layers.layer_norm(mha + query)

    norm_mha = tf.layers.dropout(norm_mha, rate=dropout_rate, training=tf.convert_to_tensor(is_training))

    return norm_mha


def feed_forward(inputs: tf.Tensor, d_ff: int, d_model: int, scope: str = None) -> tf.Tensor:
    """Position-wise feed-forward sub-layer with residual connection and layer norm.

    Applies a ReLU dense layer of width `d_ff`, then a linear dense layer of
    width `d_model`, adds `inputs` back and layer-normalises the sum. The
    variable scope defaults to "feed-forward" when `scope` is None.
    """
    with tf.variable_scope(scope, "feed-forward"):
        hidden = tf.layers.dense(inputs, d_ff, activation=tf.nn.relu, use_bias=True)
        projected = tf.layers.dense(hidden, d_model, use_bias=True)
        residual_sum = projected + inputs
        return tf.contrib.layers.layer_norm(residual_sum)


def build_encoder(enc: tf.Tensor, masks: tf.Tensor, n_blocks: int, dropout_rate: float, d_model: int, d_ff: 
                  int, h: int, d_k: int, d_v: int, is_training: bool) -> tf.Tensor:
    """Stack `n_blocks` encoder blocks on top of `enc`.

    Each block is self-attention (`multihead_layer` with `enc` as both query and
    keys, no causality) followed by `feed_forward`. Blocks live in the scopes
    "encoder<n_blocks>", ..., "encoder1", created in that order. `enc` is
    returned unchanged when `n_blocks` is zero or negative.
    """
    # Count down so the scope names match the ones produced by the recursive form.
    for block_id in range(n_blocks, 0, -1):
        with tf.variable_scope("encoder" + str(block_id)):
            attended = multihead_layer(enc, enc, masks, dropout_rate, h, d_model, d_k, d_v, False, is_training)
            enc = feed_forward(attended, d_ff, d_model, "feed-forward")

    return enc


def build_decoder(dec: tf.Tensor, enc: tf.Tensor, n_blocks: int, dropout_rate: float, d_model: int, d_ff: int,
                  h: int, d_k: int, d_v: int, is_training: bool,
                  dec_masks: tf.Tensor = None, enc_masks: tf.Tensor = None) -> tf.Tensor:
    """Stack `n_blocks` decoder blocks on top of `dec`.

    Each block is causal self-attention over `dec`, attention from the result
    onto `enc`, then `feed_forward`. Blocks live in the scopes
    "decoder<n_blocks>", ..., "decoder1". `dec` is returned unchanged when
    `n_blocks` is zero or negative.

    Args:
        dec: decoder input.
        enc: encoder output attended to by every block.
        n_blocks: number of blocks to stack.
        dropout_rate, d_model, d_ff, h, d_k, d_v, is_training: forwarded to
            `multihead_layer` / `feed_forward`.
        dec_masks: key mask for the self-attention. When None it is derived from
            `dec` as 1.0 for positions whose vector is not all zeros.
        enc_masks: key mask for the encoder attention. When None it is derived
            from `enc` in the same way.

    The two mask parameters are new and optional. `multihead_layer` requires a
    mask as its third argument, and the previous calls omitted it, which shifted
    every following argument by one position.
    NOTE(review): the derived masks assume padded positions are all-zero vectors
    in the tensors passed in — confirm, or pass explicit masks.
    """
    if n_blocks <= 0:
        return dec

    # Derive the masks once, from the original inputs, and hand them down the
    # recursion: after the first block `dec` is no longer zero at padded positions.
    if dec_masks is None:
        dec_masks = tf.sign(tf.abs(tf.reduce_sum(dec, axis=-1)))
    if enc_masks is None:
        enc_masks = tf.sign(tf.abs(tf.reduce_sum(enc, axis=-1)))

    with tf.variable_scope("decoder" + str(n_blocks)):
        with tf.variable_scope("self-attention"):
            self_att = multihead_layer(dec, dec, dec_masks, dropout_rate, h, d_model, d_k, d_v, True,
                                       is_training)

        with tf.variable_scope("mha-attention"):
            norm_mha = multihead_layer(self_att, enc, enc_masks, dropout_rate, h, d_model, d_k, d_v, False,
                                       is_training)

            dec = feed_forward(norm_mha, d_ff, d_model, "feed-forward")

    return build_decoder(dec, enc, n_blocks - 1, dropout_rate, d_model, d_ff, h, d_k, d_v, is_training,
                         dec_masks=dec_masks, enc_masks=enc_masks)


def label_smoothing(y: tf.Tensor, epsilon: float = 0.1) -> tf.Tensor:
    """Smooth the label distribution `y`.

    Every entry is scaled by `1 - epsilon` and `epsilon / k` is added, where `k`
    is the static size of the last dimension of `y`.
    """
    with tf.variable_scope("smoothing"):
        num_classes = y.get_shape().as_list()[-1]
        uniform_share = epsilon / num_classes
        return y * (1 - epsilon) + uniform_share