In [1]:
## %matplotlib inline

import importlib
import json
import random
import re
import string
import time
import warnings

import customersupport.common
import customersupport.common.utils
import customersupport.evaluation.eval

print('Library versions:')

import tensorflow as tf
print('tensorflow:{}'.format(tf.__version__))
import pandas as pd
print('pandas:{}'.format(pd.__version__))
# import sklearn
# print('sklearn:{}'.format(sklearn.__version__))
# import nltk
# print('nltk:{}'.format(nltk.__version__))
import numpy as np
print('numpy:{}'.format(np.__version__))
import matplotlib.pyplot as plt

from IPython.display import SVG
from tqdm import tqdm_notebook as tqdm  # Special jupyter notebook progress bar

from tensorflow.python.layers import core as layers_core
from datetime import datetime

from customersupport.common.vocab import VocabHolder
from customersupport.common.dataset import CustomerSupportDataset

from customersupport.evaluation.eval import evaluate_words_index, format_metrics, get_evaluation_conf, strip_punkt

importlib.reload(customersupport.common.vocab)
importlib.reload(customersupport.common.dataset)
importlib.reload(customersupport.common.utils)
importlib.reload(customersupport.evaluation.eval)

warnings.simplefilter('ignore')

tqdm().pandas()  # Enable tracking of progress in dataframe `apply` calls

tqdm.monitor_interval = 0

Library versions:
tensorflow:1.11.0
pandas:0.23.4
numpy:1.15.4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [2]:
from tensorflow.python.client import device_lib

def get_available_gpus():
    """List the description strings of the GPUs TensorFlow can see locally.

    Returns:
        list: the ``physical_device_desc`` of every device returned by
        ``device_lib.list_local_devices()`` whose ``device_type`` is ``'GPU'``,
        in the order the devices are reported.
    """
    gpu_descriptions = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_descriptions.append(device.physical_device_desc)
    return gpu_descriptions

get_available_gpus()

['device: 0, name: GeForce GTX 1060 6GB, pci bus id: 0000:01:00.0, compute capability: 6.1']

In [14]:
# Vocabulary size: 8192 is large enough for demonstration, larger values make network
# training slower.
MAX_VOCAB_SIZE = 2**13

# seq2seq generally relies on fixed length message vectors - longer messages provide more info
# but result in slower training and larger networks
#MAX_MESSAGE_LEN = 50

# All tunable settings for the notebook live in this single HParams object.
hparams = tf.contrib.training.HParams(
    # Larger batch sizes generally reach the average response faster, but small batch sizes are
    # required for the model to learn nuanced responses.  Also, GPU memory limits max batch size.
    batch_size=64,
    # Fixed lengths of the 'qry' and 'ctx' id sequences declared by the input functions below.
    encoder_length=60,
    decoder_length=70,
    src_vocab_size=MAX_VOCAB_SIZE,
    # Embedding size for words - gives a trade off between expressivity of words and network size
    embedding_size=200,
    tgt_vocab_size=MAX_VOCAB_SIZE,
    # High learning rate helps model reach average response faster, but can make it hard to
    # converge on nuanced responses
    learning_rate=5e-04,  #0.0005,
    max_gradient_norm=5.0,
    l2_norm = True,
    beam_width = 10,
    # `d` and `nh` are passed to the QANet blocks in model_fn as num_filters and num_heads.
    d = 64,
    nh = 4,
    max_epochs=15,
    # Helps regularize network and prevent overfitting.
    dropout=0.1,
    use_glove=True,
    l2_reg=0.,
    # NOTE(review): the two paths below are absolute and machine-specific; adjust before running.
    glove_path=#None,
    '/home/momchil/Storage/Projects/Python/Data/glove.twitter.27B/glove.twitter.27B.200d.txt',
    tweets_path=
    '/home/momchil/Storage/Projects/Python/Data/customer-support-on-twitter/twcs-conv_ids_clean.csv',
    # Ngram count for ROUGE and BLEU
    max_order = 2,
    train_size=0.8,
    decay_rate=0.99,
    train_time_diff=5.0,
    first_day=0,
    last_day=60,
    # Metric names handed to the evaluation helpers (see evaluate_words_index usage below).
    evaluation_metrics=[
        "bleu", "rouge_l", "embedding_average", "vector_extrema",
        "greedy_matching"
    ],
    training_metrics=[
        "bleu", "rouge_l", "embedding_average", "vector_extrema",
        "greedy_matching"
    ],
    companies=['AppleSupport'])

In [4]:
%%time
cs_data = CustomerSupportDataset(hparams)

#& (y_text.str.contains('help') ^ True)
#['direct message', 'is fixed in a future software update']
cs_data.process_utterances(masks=['direct message'], append_context=True)

Done support_author (984679, 9)
Replacing anonymized screen names in X...


HBox(children=(IntProgress(value=0, max=105179), HTML(value='')))


Replacing anonymized screen names in Y...


HBox(children=(IntProgress(value=0, max=105179), HTML(value='')))


CPU times: user 1min, sys: 1.17 s, total: 1min 1s
Wall time: 56.2 s


In [5]:
voc_holder = VocabHolder(hparams)

Loaded glove
Loaded w2v


In [6]:
analyzer = voc_holder.fit(cs_data.x_text, cs_data.y_text, hparams.src_vocab_size)

cs_data.text_to_vec(hparams, voc_holder)
cs_data.train_test_split(hparams, do_random=False)

train_x = cs_data.train_x
train_y = cs_data.train_y

test_x = cs_data.test_x
test_y = cs_data.test_y

train_weights = cs_data.train_weights
test_weights = cs_data.test_weights

Fitting CountVectorizer on X and Y text data...


HBox(children=(IntProgress(value=0, max=49626), HTML(value='')))


Number of known words 7378
Learned vocab of 8192 items.
Calculating word indexes for X...


HBox(children=(IntProgress(value=0, max=49626), HTML(value='')))


Calculating word indexes for Y...


HBox(children=(IntProgress(value=0, max=49626), HTML(value='')))


Training data of shape (45582, 60) and test data of shape (4044, 70).
count    45582.000000
mean         1.000000
std          0.141677
min          0.740038
25%          0.883758
50%          1.021893
75%          1.097074
max          1.286219
dtype: float64
count    4044.000000
mean        1.000000
std         0.014701
min         0.972407
25%         0.988713
50%         1.001299
75%         1.011627
max         1.022508
dtype: float64


In [7]:
import tensorflow_hub as hub

def validate_elmo(elmo_weights, voc_holder, word):
    """Sanity-check one row of an exported ELMo matrix.

    Runs the TF-Hub ELMo module on ``word`` as a single-token sentence and
    asserts that the resulting 1024-d vector matches the row of
    ``elmo_weights`` stored at the word's vocabulary index.

    Args:
        elmo_weights: indexable by vocabulary id, yielding a 1024-element row.
        voc_holder: object whose ``vocab`` maps a word to its id.
        word: the token to verify.

    Raises:
        AssertionError: if the vectors differ beyond 6 decimal places.
    """
    module = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)
    module_outputs = module(
        inputs={"tokens": [[word]], "sequence_len": [1]},
        signature="tokens",
        as_dict=True)
    elmo_tensor = module_outputs["elmo"]

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        # Output for a single one-token sentence is flattened to 1024 values.
        fresh_vector = session.run(elmo_tensor).reshape(1024)

    stored_vector = elmo_weights[voc_holder.vocab[word]]
    np.testing.assert_array_almost_equal(fresh_vector, stored_vector, decimal=6)

def export_elmo(vocab_holder):
    """Compute a context-free ELMo vector for every vocabulary entry.

    Each word is fed to the TF-Hub ELMo module as its own one-token sentence,
    in batches of 512 words.

    Args:
        vocab_holder: object whose ``reverse_vocab`` maps id -> word. The
            number of entries must be a multiple of 512 because the word
            list is reshaped to ``(-1, 512, 1)``.

    Returns:
        np.ndarray of shape ``(len(vocab_holder.reverse_vocab), 1024)``; row
        ``i`` is the ELMo output for the word with id ``i``.
    """
    elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=False)

    # Use the holder that was passed in, not whatever global happens to exist.
    reverse_vocab = vocab_holder.reverse_vocab
    words = [""] * len(reverse_vocab)
    for i, w in reverse_vocab.items():
        words[i] = w

    # ELMo uses those 3
    words[customersupport.common.utils.START] = '<S>'
    words[customersupport.common.utils.PAD] = '</S>'
    words[customersupport.common.utils.UNK] = '<UNK>'

    # `str` is what the deprecated alias `np.str` pointed to.
    words = np.array(words, dtype=str).reshape(-1, 512, 1)

    elmo_weights = []

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # NOTE(review): the module is applied once per batch, so new ops are added
        # to the graph on every iteration; consider feeding placeholders instead.
        for tokens_input in tqdm(words):
            embeddings = elmo(
                inputs={
                    "tokens": tokens_input,
                    # every "sentence" contains exactly one token
                    "sequence_len": np.ones(tokens_input.shape[0])
                },
                signature="tokens",
                as_dict=True)["elmo"]
            e = sess.run(embeddings)
            elmo_weights.append(e.reshape(-1, 1024))

        elmo_weights = np.array(elmo_weights).reshape(-1, 1024)
        return elmo_weights

elmo = export_elmo(voc_holder)
elmo[customersupport.common.utils.PAD] = np.zeros(elmo.shape[-1], dtype=np.float32)

validate_elmo(elmo, voc_holder, "hello")

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.
INFO:tensorflow:Downloading TF-Hub Module 'https://tfhub.dev/google/elmo/2'.
INFO:tensorflow:Downloaded TF-Hub Module 'https://tfhub.dev/google/elmo/2'.


HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver not created because there are no variables in the graph to restore
INFO:tensorflow:Saver

In [8]:
def export_vocab(voc_holder):
    """Write the vocabulary to a text file, one token per line.

    Tokens are written in id order for ids ``0 .. MAX_VOCAB_SIZE - 1``, looked
    up through ``voc_holder.reverse_vocab``. The target path is hard-coded and
    the file is overwritten.
    """
    vocab_path = "/home/momchil/Storage/Projects/Python/Data/multi_cased_L-12_H-768_A-12/vocab.txt"
    with open(vocab_path, "w") as vocab_file:
        for token_id in range(MAX_VOCAB_SIZE):
            vocab_file.write(voc_holder.reverse_vocab[token_id] + "\n")

def export_bert(vocab_holder):
    """Load pre-extracted BERT features into one vector per vocabulary id.

    Reads the hard-coded ``output.jsonl`` file, consuming exactly one JSON
    record per vocabulary entry (line ``k`` is used for id ``k``). For each
    record the ``values`` of every token except the first are summed, and the
    per-layer sums are then added together.

    Args:
        vocab_holder: object whose ``reverse_vocab`` has one entry per
            vocabulary id; only its length is used.

    Returns:
        np.ndarray (float32) with one row per vocabulary id.

    Raises:
        ValueError: if the file has fewer lines than the vocabulary has
            entries, or a line is not valid JSON.
    """
    # NOTE(review): export_vocab writes vocab.txt under a different model directory
    # (multi_cased_L-12_H-768_A-12) than the one read here - confirm the features
    # were extracted for the same vocabulary order.
    features_path = '/home/momchil/Storage/Projects/Python/Data/uncased_L-12_H-768_A-12/output.jsonl'

    def sum_per_layer(data, layer):
        # features[0] is skipped (presumably the leading special token - TODO confirm).
        return np.sum([t['layers'][layer]['values'] for t in data['features'][1:]], 0)

    vocab_size = len(vocab_holder.reverse_vocab)

    bert_weights = []
    with open(features_path) as f:
        for word_id in tqdm(range(vocab_size)):
            line = f.readline()
            if not line:
                raise ValueError(
                    "{} ended after {} records, expected {}".format(
                        features_path, word_id, vocab_size))
            data = json.loads(line)

            num_layers = len(data['features'][0]['layers'])
            weights = sum_per_layer(data, 0)
            for layer in range(1, num_layers):
                weights += sum_per_layer(data, layer)
            bert_weights.append(weights)

    return np.array(bert_weights, dtype=np.float32)

bert = export_bert(voc_holder)
bert[customersupport.common.utils.PAD] = np.zeros(bert.shape[-1], dtype=np.float32)

HBox(children=(IntProgress(value=0, max=8192), HTML(value='')))




In [9]:
from tensorflow.python.util import nest
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import clip_ops

def batch_dot(x, y, axes=None):
    """Copy from keras==2.0.6
    Batchwise dot product.
    `batch_dot` is used to compute dot product of `x` and `y` when
    `x` and `y` are data in batch, i.e. in a shape of
    `(batch_size, :)`.
    `batch_dot` results in a tensor or variable with less dimensions
    than the input. If the number of dimensions is reduced to 1,
    we use `expand_dims` to make sure that ndim is at least 2.

    Note: when both inputs are 2-D, `axes` is indexed without a `None`
    check, so `axes` must be given explicitly in that case.

    # Arguments
        x: Keras tensor or variable with `ndim >= 2`.
        y: Keras tensor or variable with `ndim >= 2`.
        axes: list of (or single) int with target dimensions.
            The lengths of `axes[0]` and `axes[1]` should be the same.
    # Returns
        A tensor with shape equal to the concatenation of `x`'s shape
        (less the dimension that was summed over) and `y`'s shape
        (less the batch dimension and the dimension that was summed over).
        If the final rank is 1, we reshape it to `(batch_size, 1)`.
    """
    # A single int means "contract the same axis of both operands".
    if isinstance(axes, int):
        axes = (axes, axes)
    x_ndim = ndim(x)
    y_ndim = ndim(y)
    # Equalise ranks by appending size-1 axes to the lower-rank operand.
    if x_ndim > y_ndim:
        diff = x_ndim - y_ndim
        y = tf.reshape(y, tf.concat([tf.shape(y), [1] * (diff)], axis=0))
    elif y_ndim > x_ndim:
        diff = y_ndim - x_ndim
        x = tf.reshape(x, tf.concat([tf.shape(x), [1] * (diff)], axis=0))
    else:
        diff = 0
    if ndim(x) == 2 and ndim(y) == 2:
        # 2-D case: element-wise product followed by a reduction.
        if axes[0] == axes[1]:
            out = tf.reduce_sum(tf.multiply(x, y), axes[0])
        else:
            out = tf.reduce_sum(tf.multiply(tf.transpose(x, [1, 0]), y), axes[1])
    else:
        # Higher-rank case: batched matmul, using adjoint flags to select the
        # contracted axes.
        if axes is not None:
            adj_x = None if axes[0] == ndim(x) - 1 else True
            adj_y = True if axes[1] == ndim(y) - 1 else None
        else:
            adj_x = None
            adj_y = None
        out = tf.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y)
    if diff:
        # Remove the size-1 axes that were appended above.
        if x_ndim > y_ndim:
            idx = x_ndim + y_ndim - 3
        else:
            idx = x_ndim - 1
        out = tf.squeeze(out, list(range(idx, idx + diff)))
    if ndim(out) == 1:
        # Keep the result at least 2-D: (batch_size,) -> (batch_size, 1).
        out = tf.expand_dims(out, 1)
    return out

def ndim(x):
    """Copied from keras==2.0.6
    Returns the number of axes in a tensor, as an integer.

    Uses the public `TensorShape.ndims` property rather than the private
    `_dims` attribute.

    # Arguments
        x: Tensor or variable (anything exposing `get_shape()`).
    # Returns
        Integer (scalar), number of axes, or `None` when the rank of `x`
        is unknown.
    # Examples
    ```python
        >>> from keras import backend as K
        >>> inputs = K.placeholder(shape=(2, 4, 5))
        >>> val = np.array([[1, 2], [3, 4]])
        >>> kvar = K.variable(value=val)
        >>> K.ndim(inputs)
        3
        >>> K.ndim(kvar)
        2
    ```
    """
    return x.get_shape().ndims

def dot(x, y):
    """Modified from keras==2.0.6
    Multiplies 2 tensors (and/or variables) and returns a *tensor*.
    For operands of rank > 2 the Theano convention is reproduced: the last
    axis of `x` is contracted with the second-to-last axis of `y`
    (e.g. `(2, 3) * (4, 3, 5) -> (2, 4, 5)`).
    # Arguments
        x: Tensor or variable.
        y: Tensor or variable.
    # Returns
        A tensor, dot product of `x` and `y`.
    """
    def _mixed_shape(tensor):
        # Prefer statically known sizes; fall back to the runtime size per axis.
        static_dims = tensor.get_shape().as_list()
        dynamic_dims = tf.unstack(tf.shape(tensor))
        return tuple(dyn if stat is None else stat
                     for stat, dyn in zip(static_dims, dynamic_dims))

    if ndim(x) is not None and (ndim(x) > 2 or ndim(y) > 2):
        x_shape = _mixed_shape(x)
        y_shape = _mixed_shape(y)

        # Move y's contracted (second-to-last) axis to the front.
        y_axes = list(range(ndim(y)))
        contracted_axis = y_axes.pop(-2)
        y_perm = [contracted_axis] + y_axes

        # Flatten both operands to matrices, multiply, then restore the outer axes.
        x_flat = tf.reshape(x, [-1, x_shape[-1]])
        y_flat = tf.reshape(tf.transpose(y, perm=y_perm), [y_shape[-2], -1])
        return tf.reshape(tf.matmul(x_flat, y_flat),
                          x_shape[:-1] + y_shape[:-2] + y_shape[-1:])

    if isinstance(x, tf.SparseTensor):
        return tf.sparse_tensor_dense_matmul(x, y)
    return tf.matmul(x, y)
    
def initializer():
    """Return a new variance-scaling initializer (factor 1.0, FAN_AVG, uniform).

    This is the Xavier/Glorot-uniform parameterisation of
    ``variance_scaling_initializer``.
    """
    return tf.contrib.layers.variance_scaling_initializer(
        factor=1.0, mode='FAN_AVG', uniform=True, dtype=tf.float32)


def initializer_relu():
    """Return a new variance-scaling initializer (factor 2.0, FAN_IN, normal).

    This is the He/MSRA parameterisation of ``variance_scaling_initializer``.
    """
    return tf.contrib.layers.variance_scaling_initializer(
        factor=2.0, mode='FAN_IN', uniform=False, dtype=tf.float32)


# L2 weight regularizer shared by the attention variables and the model loss.
regularizer = tf.contrib.layers.l2_regularizer(scale=3e-7)

def optimized_trilinear_for_attention(args, c_maxlen, q_maxlen, input_keep_prob=1.0,
    scope='efficient_trilinear',
    bias_initializer=tf.zeros_initializer(),
    kernel_initializer=initializer()):
    """Compute the trilinear similarity matrix between two sequences.

    For inputs ``c = args[0]`` and ``q = args[1]`` the result is

        S[b, i, j] = w0 . c[b, i] + w1 . q[b, j] + (c[b, i] * w_mul) . q[b, j] + bias

    evaluated as three separate terms so the full ``c x q x dim`` tensor is
    never materialised.

    Args:
        args: list of exactly two rank-3 tensors ``(batch, len, dim)`` with the
            same last dimension.
        c_maxlen: length of ``args[0]`` along axis 1 (used as a tile multiple).
        q_maxlen: length of ``args[1]`` along axis 1 (used as a tile multiple).
        input_keep_prob: keep probability for the dropout applied to both inputs.
        scope: variable scope holding the kernels and the bias.
        bias_initializer: initializer for the scalar bias.
        kernel_initializer: initializer for the three kernels.

    Returns:
        Tensor of shape ``(batch, c_len, q_len)``.

    Raises:
        ValueError: if an input is not rank 3 or the last dimensions differ.
    """
    assert len(args) == 2, "just use for computing attention with two input"
    arg0_shape = args[0].get_shape().as_list()
    arg1_shape = args[1].get_shape().as_list()
    if len(arg0_shape) != 3 or len(arg1_shape) != 3:
        raise ValueError("`args` must be 3 dims (batch_size, len, dimension)")
    if arg0_shape[2] != arg1_shape[2]:
        raise ValueError("the last dimension of `args` must equal")
    arg_size = arg0_shape[2]
    dtype = args[0].dtype
    droped_args = [tf.nn.dropout(arg, input_keep_prob) for arg in args]
    with tf.variable_scope(scope):
        weights4arg0 = tf.get_variable(
            "linear_kernel4arg0", [arg_size, 1],
            dtype=dtype,
            regularizer=regularizer,
            initializer=kernel_initializer)
        weights4arg1 = tf.get_variable(
            "linear_kernel4arg1", [arg_size, 1],
            dtype=dtype,
            regularizer=regularizer,
            initializer=kernel_initializer)
        weights4mlu = tf.get_variable(
            "linear_kernel4mul", [1, 1, arg_size],
            dtype=dtype,
            regularizer=regularizer,
            initializer=kernel_initializer)
        biases = tf.get_variable(
            "linear_bias", [1],
            dtype=dtype,
            regularizer=regularizer,
            initializer=bias_initializer)
        # w0 . c, broadcast along the query axis
        subres0 = tf.tile(dot(droped_args[0], weights4arg0), [1, 1, q_maxlen])
        # w1 . q, broadcast along the context axis
        subres1 = tf.tile(tf.transpose(dot(droped_args[1], weights4arg1), perm=(0, 2, 1)), [1, c_maxlen, 1])
        # (c * w_mul) . q for every (context, query) position pair
        subres2 = batch_dot(droped_args[0] * weights4mlu, tf.transpose(droped_args[1], perm=(0, 2, 1)))
        # The scalar bias is broadcast over the whole matrix. The result must be
        # assigned; a discarded bias op would leave the bias variable unused.
        res = subres0 + subres1 + subres2 + biases

    return res

def context2query(c: tf.Tensor, q: tf.Tensor, mask_c: tf.Tensor, mask_q: tf.Tensor, input_keep_prob: float):
    """Context-query attention built on the trilinear similarity matrix.

    Args:
        c: context tensor; axis 1 is the context length.
        q: query tensor; axis 1 is the query length.
        mask_c: context mask, multiplied into the row-normalised attention.
        mask_q: query mask, handed to ``mask_matrix``.
        input_keep_prob: dropout keep probability forwarded to the similarity
            function.

    Returns:
        list ``[c, c2q, c * c2q, c * q2c, S_]`` where ``S_`` is the masked,
        row-softmaxed similarity matrix.
    """
    with tf.variable_scope("Context_to_Query_Attention_Layer"):
        similarity = optimized_trilinear_for_attention(
            [c, q], tf.shape(c)[1], tf.shape(q)[1], input_keep_prob)
        # NOTE(review): `mask_matrix` is not defined in the visible part of this
        # notebook - confirm where it comes from before calling this function.
        similarity = mask_matrix(mask_q, similarity, tf.shape(c)[1])

        # Softmax over the query axis, then zero the rows of masked context positions.
        row_attention = tf.nn.softmax(similarity)
        context_mask = tf.tile(tf.expand_dims(mask_c, -1), [1, 1, tf.shape(q)[1]])
        row_attention *= context_mask
        # Softmax over the context axis, transposed to (batch, q_len, c_len).
        col_attention_t = tf.transpose(tf.nn.softmax(similarity, axis = 1), (0, 2, 1))

        c2q = tf.matmul(row_attention, q)
        q2c = tf.matmul(tf.matmul(row_attention, col_attention_t), c)

        return [c, c2q, c * c2q, c * q2c, row_attention]
    
def linear(context: tf.Tensor, c_maxlen):
    """Collapse the sequence axis of ``context`` with one learned weight per position.

    A ``(1, c_maxlen)`` weight row ``Wl`` is tiled over the batch and multiplied
    with ``context``, which the matmul requires to be
    ``(batch, c_maxlen, dim)``.

    Args:
        context: rank-3 tensor ``(batch, c_maxlen, dim)``.
        c_maxlen: static sequence length; fixes the shape of ``Wl``.

    Returns:
        Tensor of shape ``(batch, dim)``.
    """
    with tf.variable_scope("Linear"):
        wl = tf.get_variable("Wl", shape=[1, c_maxlen],
                             initializer=tf.contrib.layers.xavier_initializer())
        # (batch, 1, c_maxlen) @ (batch, c_maxlen, dim) -> (batch, 1, dim)
        out = tf.matmul(tf.tile(tf.expand_dims(wl, 0), [tf.shape(context)[0], 1, 1]), context)

        # Drop only the singleton sequence axis. An unqualified squeeze would also
        # remove the batch axis for a batch of one (or a feature axis of size 1).
        return tf.squeeze(out, axis=1)
    
def embed_tensor(inputs, pretrained_embs, name, trainable=False):
    """Look up ``inputs`` in an embedding table of trainable and pretrained rows.

    A trainable block of ``hparams.tgt_vocab_size - len(voc_holder.glove_weights)``
    rows is created under ``name`` and stacked on top of ``pretrained_embs``,
    so the lowest ids map to trainable vectors and the rest to pretrained ones.

    Args:
        inputs: integer ids to look up.
        pretrained_embs: pretrained embedding rows appended after the trainable block.
        name: name of the trainable embedding variable.
        trainable: accepted for interface compatibility but not used; the new
            block is always created with ``trainable=True``.

    Returns:
        tuple ``(embedded_inputs, full_embedding_table)``.
    """
    num_trainable_rows = hparams.tgt_vocab_size - len(voc_holder.glove_weights)

    train_embeddings = tf.get_variable(
        name=name,
        shape=[num_trainable_rows, hparams.embedding_size],
        initializer=tf.random_uniform_initializer(-0.04, 0.04),
        trainable=True)

    embeddings = tf.concat([train_embeddings, pretrained_embs], axis=0)
    embedded_inputs = tf.nn.embedding_lookup(embeddings, inputs)

    return embedded_inputs, embeddings

def parametric_relu(_x):
    """Parametric ReLU with one learned negative slope per channel of the last axis.

    Creates (or reuses, depending on the enclosing variable scope) a variable
    named ``'alpha'`` sized to the last dimension of ``_x``.

    Returns:
        ``relu(_x) + alpha * min(_x, 0)``, where the negative part is computed
        as ``(_x - |_x|) * 0.5``.
    """
    slopes = tf.get_variable('alpha', _x.get_shape()[-1],
                             initializer=initializer_relu(),
                             dtype=tf.float32)
    # (_x - |_x|) / 2 equals _x where _x < 0 and zero elsewhere.
    return tf.nn.relu(_x) + slopes * (_x - tf.abs(_x)) * 0.5

def gelu_fast(_x):
    """Tanh approximation of the GELU activation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    """
    cubic_term = 0.044715 * tf.pow(_x, 3)
    tanh_argument = tf.sqrt(2 / np.pi) * (_x + cubic_term)
    return 0.5 * _x * (1 + tf.tanh(tanh_argument))

In [10]:
def input_fn_builder(x, y,  is_training, negatives):
    """Build an ``input_fn(batch_size)`` that yields (query, response) pairs.

    Every example pairs a random query ``x[i]`` with a response ``y[it]``:

    * a positive pair (``it == i``, label 1.0) when ``negatives`` is false, or
      with probability 0.5 otherwise;
    * else a randomly drawn response, labelled 1.0 only if its bag-of-token
      cosine similarity to the true response exceeds 0.7, and 0.0 otherwise.

    The underlying generator never terminates, so callers must bound the number
    of steps themselves.

    Args:
        x: indexable collection of query id sequences of length
            ``hparams.encoder_length``.
        y: indexable collection of response id sequences of length
            ``hparams.decoder_length``.
        is_training: when true the dataset is additionally shuffled (buffer of
            300) and repeated.
        negatives: whether randomly drawn (mostly negative) pairs are generated.

    Returns:
        A function ``input_fn(batch_size)`` returning the ``get_next()`` tensors
        ``({'qry': ..., 'ctx': ...}, labels)`` of a one-shot iterator.
    """
    def cosine_similarity(v1, v2):
        """Cosine similarity of the token-presence vectors of two id sequences."""
        # NOTE(review): every id in the sequence is counted, including id 0, which
        # is presumably padding and would then be "shared" by all padded
        # sequences - confirm this is intended.
        mx = hparams.src_vocab_size # max(np.max(v1), np.max(v2)) + 1
        v1_tmp = np.zeros(mx)
        v1_tmp[v1] = 1.
        v1 = v1_tmp

        v2_tmp = np.zeros(mx)
        v2_tmp[v2] = 1.
        v2 = v2_tmp

        return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

    def data_generator():
        while True:
            i, it = np.random.randint(0, len(x), 2)
            if not negatives or np.random.binomial(1, 0.5, 1)[0] == 0:
                # positive example: the query together with its own response
                it = i
                sim = 1.
            else:
                sim = cosine_similarity(y[i], y[it])

            yield {'qry': x[i], 'ctx': y[it]}, [float(sim > .7)]

    def input_fn(batch_size):
        d = tf.data.Dataset.from_generator(
            data_generator, ({'qry': tf.int32, 'ctx': tf.int32}, tf.float32),
            # One-element tuples: a parenthesised int such as `(n)` is just `n`.
            output_shapes=({'qry': (hparams.encoder_length,),
                            'ctx': (hparams.decoder_length,)}, (1,)))

        if is_training:
            d = d.shuffle(300)
            d = d.repeat()

        d = d.batch(batch_size)

        return d.make_one_shot_iterator().get_next()

    return input_fn

def eval_input_fn_builder(questions, candidates):
    """Build an ``input_fn(batch_size)`` for ranking candidate responses.

    Each dataset element holds one question repeated once per candidate
    together with all of that question's encoded candidates.

    Args:
        questions: iterable of encoded questions of length
            ``hparams.encoder_length``.
        candidates: iterable of strings, one per question, containing the
            candidate responses separated by ``"@ @ @"``.

    Returns:
        A function ``input_fn(batch_size)`` returning a features dict whose
        ``'qry'`` tensor is ``(n, hparams.encoder_length)`` and whose ``'ctx'``
        tensor is ``(n, hparams.decoder_length)``.
    """
    def data_generator():
        for q, cl in zip(questions, candidates):
            # Encode with hparams.decoder_length, the length declared in output_shapes
            # below, instead of a global that is only assigned in a later cell.
            ctx = np.array([voc_holder.to_word_idx(c, hparams.decoder_length)
                            for c in cl.split("@ @ @")])
            qry = np.tile(q, [ctx.shape[0], 1])

            # A new dict per element, so yielded values are never mutated afterwards.
            yield {'ctx': ctx, 'qry': qry}

    def map_fn(feed_dict):
        # Merge the batch axis and the per-question candidate axis.
        return {
            'qry': tf.reshape(feed_dict['qry'], (-1, hparams.encoder_length)),
            'ctx': tf.reshape(feed_dict['ctx'], (-1, hparams.decoder_length)),
        }

    def input_fn(batch_size):
        d = tf.data.Dataset.from_generator(data_generator, ({'qry': tf.int32, 'ctx': tf.int32}),
            output_shapes=({'qry': (None, hparams.encoder_length), 'ctx': (None, hparams.decoder_length)}))

        d = d.batch(batch_size)
        d = d.map(map_fn)

        return d.make_one_shot_iterator().get_next()
    return input_fn

input_fn = input_fn_builder(train_x, train_y, True, True)
test_input_fn = input_fn_builder(test_x, test_y, False, True)

In [15]:
def model_fn_builder(voc_holder, hparams, emb_weights):
    """Create the Estimator ``model_fn`` for the QANet-style response scorer.

    The model embeds a query (``features['qry']``) and a candidate response
    (``features['ctx']``), encodes both with shared residual blocks, combines
    them with context-query attention, runs a stack of model-encoder blocks and
    projects the result to a single logit per pair.

    Args:
        voc_holder: vocabulary holder; ``glove_weights`` is read when
            ``hparams.use_glove`` is set.
        hparams: hyper-parameters (``dropout``, ``d``, ``nh``, ``use_glove``,
            ``l2_norm``, ``learning_rate``, ``decay_rate``, ``batch_size``,
            ``max_gradient_norm``).
        emb_weights: embedding matrix used when ``hparams.use_glove`` is false.

    Returns:
        ``model_fn(features, labels, mode, params)`` suitable for
        ``tf.estimator.Estimator``.
    """
    def model_fn(features, labels, mode, params):

        from QANet.layers import residual_block, highway, conv, mask_logits, \
            total_params
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        qry = features["qry"]
        ctx = features["ctx"]

        c_maxlen = tf.shape(ctx)[-1]
        q_maxlen = tf.shape(qry)[-1]

        # Dropout is only active while training.
        dropout = hparams.dropout if is_training else 0.0

        # Non-zero ids are treated as real tokens.
        c_mask = tf.cast(ctx, tf.bool)
        q_mask = tf.cast(qry, tf.bool)

        c_len = tf.reduce_sum(tf.cast(c_mask, tf.int32), axis=1)
        q_len = tf.reduce_sum(tf.cast(q_mask, tf.int32), axis=1)

        with tf.variable_scope("Input_Embedding_Layer"):

            if (hparams.use_glove):
                pretrained_embs = tf.get_variable(
                    name="embs_pretrained",
                    initializer=tf.constant_initializer(
                       voc_holder.glove_weights, dtype=tf.float32),
                    shape=voc_holder.glove_weights.shape,
                    trainable=False)
                # Only the combined table is needed; the lookup result is discarded.
                _, emb_tensor = \
                    embed_tensor(qry, pretrained_embs, "embedding_encoder_glove")
            else:
                emb_tensor = tf.convert_to_tensor(emb_weights)

            c_emb_ = tf.nn.dropout(tf.nn.embedding_lookup(emb_tensor, ctx), 1.0 - dropout)
            q_emb_ = tf.nn.dropout(tf.nn.embedding_lookup(emb_tensor, qry), 1.0 - dropout)

            # The highway network is shared between context and query (reuse=True).
            c_emb = highway(c_emb_, size = hparams.d, scope = "highway", dropout = dropout, reuse = None)
            q_emb = highway(q_emb_, size = hparams.d, scope = "highway", dropout = dropout, reuse = True)

        with tf.variable_scope("Embedding_Encoder_Layer"):
            c = residual_block(inputs=c_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = c_mask,
                num_filters = hparams.d,
                num_heads = hparams.nh,
                seq_len = c_len,
                scope = "Encoder_Residual_Block",
                bias = False,
                dropout = dropout)

            q = residual_block(inputs=q_emb,
                num_blocks = 1,
                num_conv_layers = 4,
                kernel_size = 7,
                mask = q_mask,
                num_filters = hparams.d,
                num_heads = hparams.nh,
                seq_len = q_len,
                scope = "Encoder_Residual_Block",
                reuse = True, # Share the weights between passage and question
                bias = False,
                dropout = dropout)

        with tf.variable_scope("Context_to_Query_Attention_Layer"):
            S = optimized_trilinear_for_attention([c, q], c_maxlen, q_maxlen, input_keep_prob = 1.0 - dropout)
            mask_q = tf.expand_dims(q_mask, 1)
            S_ = tf.nn.softmax(mask_logits(S, mask = mask_q))
            mask_c = tf.expand_dims(c_mask, 2)
            S_T = tf.transpose(tf.nn.softmax(mask_logits(S, mask = mask_c), axis = 1),(0,2,1))
            c2q = tf.matmul(S_, q)
            q2c = tf.matmul(tf.matmul(S_, S_T), c)
            attention_outputs = [c, c2q, c * c2q, c * q2c]

        with tf.variable_scope("Model_Encoder_Layer"):
            inputs = tf.concat(attention_outputs, axis = -1)
            enc = [conv(inputs, hparams.d, name = "input_projection")]
            for i in range(2):
                if i % 2 == 0: # dropout every 2 blocks
                    enc[i] = tf.nn.dropout(enc[i], 1.0 - dropout)
                enc.append(
                    residual_block(enc[i],
                        num_blocks = 7,
                        num_conv_layers = 2,
                        kernel_size = 5,
                        mask = c_mask,
                        num_filters = hparams.d,
                        num_heads = hparams.nh,
                        seq_len = c_len,
                        scope = "Model_Encoder",
                        bias = False,
                        reuse = True if i > 0 else None,
                        dropout = dropout)
                    )

        with tf.variable_scope("Output_Layer"):
            proj_logits = tf.squeeze(conv(tf.concat([enc[1], enc[2]],axis = -1),1, bias = False, name = "first_logits"),-1)

            proj_logits = tf.nn.dropout(proj_logits, 1.0 - dropout)
            logits = tf.layers.dense(proj_logits, 1, kernel_initializer=initializer())

            yp = tf.sigmoid(logits)
            # Compute predictions.
            if mode == tf.estimator.ModeKeys.PREDICT:
                predictions = {
                    'prob': yp,
                    'logits': logits,
                }

                return tf.estimator.EstimatorSpec(mode, predictions=predictions)

            correct_pred = tf.equal(tf.round(yp), labels)
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

            losses = tf.nn.sigmoid_cross_entropy_with_logits(labels = labels, logits = logits)
            loss = tf.reduce_mean(losses)
            # Truthiness test, so that l2_norm=False really disables the term.
            if hparams.l2_norm:
                # NOTE(review): REGULARIZATION_LOSSES already holds regularizer
                # outputs, so this applies the L2 regularizer to loss tensors rather
                # than to weights; summing the collection may be what was intended.
                # Left unchanged to keep the training objective as it was.
                variables = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
                l2_loss = tf.contrib.layers.apply_regularization(regularizer, variables)
                loss += l2_loss
                print("Adding l2")

        metrics = {'accuracy': accuracy, "loss": loss}
        if mode == tf.estimator.ModeKeys.EVAL:
            # Compute evaluation metrics.
            accuracy = tf.metrics.accuracy(labels=labels,
                                           predictions=tf.round(yp),
                                           name='eval_acc_op')
            eval_loss = tf.metrics.mean(values=losses, name='eval_loss_op')

            metrics = {'eval_accuracy': accuracy,
                       "eval_loss": eval_loss}
            return tf.estimator.EstimatorSpec(
                mode, loss=loss, eval_metric_ops=metrics)

        # Create training op.
        assert is_training

        with tf.variable_scope("training"):
            # NOTE: `train_x` is read from the notebook's global namespace.
            iters_per_epoch = int(np.ceil(train_x.shape[0] / hparams.batch_size))

            global_step = tf.train.get_global_step()

            # Decay the learning rate once per epoch by the configured rate.
            lr = tf.train.exponential_decay(
                hparams.learning_rate, global_step, iters_per_epoch,
                hparams.decay_rate, staircase=True)

            optimizer = tf.train.AdamOptimizer(lr, beta1 = 0.8, beta2 = 0.999, epsilon = 1e-7)

            # Distinct name so the `params` argument of model_fn is not shadowed.
            trainable_vars = tf.trainable_variables()
            gradients = tf.gradients(loss, trainable_vars)
            clipped_gradients, _ = tf.clip_by_global_norm(gradients,
                                                          hparams.max_gradient_norm)

            train_op = optimizer.apply_gradients(
                zip(clipped_gradients, trainable_vars), global_step=global_step)

            total_params()
            logging_hook = tf.train.LoggingTensorHook(metrics, every_n_iter=100)

        tf.summary.scalar('cross_entropy', loss)
        tf.summary.scalar('accuracy', accuracy)

        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])

    return model_fn

model_fn = model_fn_builder(voc_holder, hparams, elmo)

In [16]:
now = str(int(datetime.now().timestamp()))
summaries_dir = './logs'
# summaries_dir + '/test' + now

print('Model id #{}'.format(now))


run_config = tf.estimator.RunConfig()
estimator = tf.estimator.Estimator(
    model_fn=model_fn, config=run_config, model_dir=(summaries_dir + '/model' + now))

Model id #1545508358
INFO:tensorflow:Using config: {'_model_dir': './logs/model1545508358', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f24c71ebf60>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
train_spec = tf.estimator.TrainSpec(lambda: input_fn(hparams.batch_size), 
                                    max_steps = 2*len(train_x)//hparams.batch_size * hparams.max_epochs)
eval_spec = tf.estimator.EvalSpec(lambda: test_input_fn(hparams.batch_size))

results = tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
results

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
Adding l2


In [18]:
# Load the retrieved candidates: one row per test question with its reference
# answer and the "@ @ @"-separated candidate responses.
all_dict = pd.read_csv("/home/momchil/Desktop/elastic_top10_all_dict.tsv", sep='\t')

c_maxlen = hparams.decoder_length
q_maxlen = hparams.encoder_length

questions = []
references = []
candidates = []

# Iterate over the columns directly instead of the deprecated `.ix` indexer.
rows = zip(all_dict['Reference'], all_dict['Question'], all_dict['Hypothesis'])
for reference, question, hypothesis in tqdm(rows, total=all_dict.shape[0]):

    r = voc_holder.to_word_idx(reference, -1)
    q = voc_holder.to_word_idx(question, q_maxlen)
    candidates.append(hypothesis)
    questions.append(q)
    # keep only the non-zero ids of the reference
    references.append(r[r.nonzero()])

HBox(children=(IntProgress(value=0, max=4044), HTML(value='')))




In [23]:
# correct solution:
def softmax(x):
    """Return the softmax of ``x``, normalised along axis 0.

    The global maximum of ``x`` is subtracted before exponentiating so that large
    scores do not overflow. For a 2-D input each column is normalised on its own.
    """
    shifted = x - np.max(x)
    exponentials = np.exp(shifted)
    column_totals = exponentials.sum(axis=0)  # normalise over rows, not over everything
    return exponentials / column_totals

def eval_predictions(predictions, references, is_greedy):
    """Choose one candidate answer per example and score the choices against the references.

    Uses the notebook-level ``candidates``, ``voc_holder``, ``c_maxlen`` and ``hparams``.

    Args:
        predictions: sequence of per-example score arrays. ``predictions[i]`` must be at
            least 2-D; only its first column is used, with one row per "@ @ @"-separated
            candidate in ``candidates[i]``.
        references: sequence of reference word-index arrays, one per example.
        is_greedy: if true, take the highest-scoring candidate; otherwise sample a
            candidate with probabilities given by the softmax of the scores.

    Returns:
        The result of ``evaluate_words_index`` for ``hparams.evaluation_metrics``.

    Raises:
        Any exception raised while selecting a candidate is re-raised unchanged, after
        the index of the failing example has been printed.
    """
    hypothesis = []

    for i, probs in enumerate(predictions):
        try:
            encoded = np.array([voc_holder.to_word_idx(text, c_maxlen)
                                for text in candidates[i].split("@ @ @")])
            if is_greedy:
                idx = probs.argmax(0)[0]
            else:
                idx = np.random.choice(range(len(probs)), 1, p=softmax(probs).T[0])[0]
            h = encoded[idx]
            # Keep only the non-zero entries of the chosen candidate's word indices.
            hypothesis.append(h[h.nonzero()])
        except Exception:
            # Report the example index: `idx` may not be assigned yet when the failure
            # happens, and referencing it here would mask the original error.
            print("Failed on example", i)
            raise

    references = np.array(references)
    hypothesis = np.array(hypothesis)
    eval_conf = get_evaluation_conf(None, hparams, None, None, voc_holder)
    evaluation = evaluate_words_index(references, hypothesis, eval_conf, hparams.evaluation_metrics, True)
    return evaluation

# Score the candidates of every question with the trained estimator, one example per batch.
eval_input_fn = eval_input_fn_builder(questions, candidates)
pred_gen = estimator.predict(
    lambda: eval_input_fn(1),
    predict_keys='logits',
    yield_single_examples=False,
)
predictions = [batch['logits'] for batch in tqdm(pred_gen, total=len(all_dict))]

# Sampling is random, so repeat the sampled evaluation a few times to see the spread.
N_SAMPLED_RUNS = 5
for _ in range(N_SAMPLED_RUNS):
    evaluation = eval_predictions(predictions, references, False)
    print(format_metrics(evaluation), end='\n\n')

# Deterministic baseline: always take the top-scoring candidate.
print("Greedy")

evaluation = eval_predictions(predictions, references, True)
print(format_metrics(evaluation))

HBox(children=(IntProgress(value=0, max=4044), HTML(value='')))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./logs/model1545508358/model.ckpt-18382
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

BLEU@2: 15.75774442253442
Embedding Average: 78.22661547558036
Greedy Matching: 31.072027783808416
ROUGE_L: 24.61709790859387
Vector Extrema: 40.422141354514295


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

BLEU@2: 15.916557220397848
Embedding Average: 78.3932229091149
Greedy Matching: 31.122681689190994
ROUGE_L: 24.684578976120715
Vector Extrema: 40.59055378862006


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

BLEU@2: 15.941834567857804
Embedding Average: 78.35599346513433
Greedy Matching: 31.155852705404524
ROUGE_L: 24.655845612572698
Vector Extrema: 40.6492684395473


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

BLEU@2: 15.78639223543167
Embedding Average: 78.38821640725462
Greedy Matching: 31.209832738147625
ROUGE_L: 24.60141756206627
Vector Extrema: 40.703585068317885


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

BLEU@2: 15.693864391989226
Embedding Average: 78.20456625552617
Greedy Matching: 31.1091629957454
ROUGE_L: 24.546035216329482
Vector Extrema: 40.487256244664096
Greedy


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

BLEU@2: 15.178345469681423
Embedding Average: 78.34437802020314
Greedy Matching: 31.12752050195541
ROUGE_L: 24.127489239551984
Vector Extrema: 40.809929125897085


In [None]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))`` for a scalar or numpy array.

    Computed as ``exp(-log(1 + exp(-x)))`` via ``np.logaddexp`` so that large negative
    inputs do not overflow in ``exp(-x)``; the result is mathematically the same.
    """
    return np.exp(-np.logaddexp(0, -x))

# Scratch comparison of ways to turn one example's candidate scores into weights.
# NOTE(review): in the cells visible here `probs` is only a loop variable inside
# eval_predictions — confirm it is defined at notebook level before running this cell.
first_column = probs.T[0]
print(first_column)

# Softmax over the candidates.
print(softmax(probs).T[0])

# Sigmoid of every score, divided by the sum of the sigmoids.
sigmoid_total = sum([sigmoid(x) for x in probs])
print([sigmoid(x) for x in first_column] / sigmoid_total)

# Scores offset by the minimum score, divided by the sum of the offset scores.
# NOTE(review): this adds np.min(probs); shifting scores to be non-negative would
# subtract it instead — confirm which was intended.
offset = np.min(probs)
print([x + offset for x in first_column] / sum([x + offset for x in probs]))

In [None]:
### for w1, w2 in [('ipad', 'iphone'), ('hello', 'hi'), ('yes', 'no'), ('talk', 'bad'), ('love', 'hate')]:
#     print(np.dot(bert[voc_holder.vocab[w1]], bert[voc_holder.vocab[w2]]) / \
#         (np.linalg.norm(bert[voc_holder.vocab[w1]]) * np.linalg.norm(bert[voc_holder.vocab[w2]])))
#     print(np.dot(elmo[voc_holder.vocab[w1]], elmo[voc_holder.vocab[w2]]) / \
#         (np.linalg.norm(elmo[voc_holder.vocab[w1]]) * np.linalg.norm(elmo[voc_holder.vocab[w2]])))
#     print()
# # np.mean(emb_weights, axis=1).tolist()