In [1]:
import numpy as np
import tensorflow as tf

# Switch TensorFlow (1.x API) to eager execution so ops run immediately and
# return concrete values; this has to happen before any other TensorFlow op
# is created in the session.
tf.enable_eager_execution()

  from ._conv import register_converters as _register_converters


In [2]:
# Sanity check: as a bare expression in the middle of a cell the result was
# silently discarded, so assert it instead to fail fast when eager mode is off.
assert tf.executing_eagerly()

# Fix the global NumPy seed so the sample tensors below are reproducible.
np.random.seed(1234)

# Toy problem dimensions.
batch_size = 2
N = 10   # maximum context length
M = 5    # maximum question length
d = 128  # size of the last (feature) axis

# Standard-normal sample inputs; the context is drawn before the query so the
# random stream is consumed in the same order as before.
context = np.random.standard_normal(size=(batch_size, N, d))
query = np.random.standard_normal(size=(batch_size, M, d))

from BiDAF

> $\alpha(h, u) = w_{(S)}^\top [h; u; h \circ u]$, where $w_{(S)} \in \mathbb{R}^{6d}$ is a trainable weight vector, $\circ$ is elementwise multiplication, $[;]$ is vector concatenation across row, and implicit multiplication is matrix multiplication

tri_linear: https://github.com/allenai/bi-att-flow/blob/master/my/tensorflow/nn.py#L125

### Context-Query Attention

> We use C and Q to denote the encoded context and query. The context-to-query attention is constructed as follows: We first compute the similarities between each pair of context and query words, rendering a similarity matrix $S \in \mathbb{R}^{n \times m}$. We then normalize each row of $S$ by applying the softmax function, getting a matrix $\bar{S}$. Then the context-to-query attention is computed as $A = \bar{S} \cdot Q^T \in \mathbb{R}^{n \times d}$. The similarity function used here is the trilinear function (Seo et al., 2016):
$f(q, c) = W_0[q, c, q \odot c]$,
where $\odot$ is the element-wise multiplication and $W_0$ is a trainable variable.

In [65]:
class ConfusionMaxtirx(tf.keras.layers.Layer):
    """Masked trilinear similarity between every context/query position pair.

    For each context vector ``c_i`` and query vector ``q_j`` the layer computes
    ``W^T [c_i; q_j; c_i * q_j]`` with one trainable weight of shape
    ``(3 * dim, 1)``.  Pairs in which either position is masked out are pushed
    to a very large negative value through ``exp_mask``.

    Input (a list of four tensors):
      context: (batch_size, N, dim)
      query: (batch_size, M, dim)
      context_mask: (batch_size, N), non-zero entries mark positions to keep
      query_mask: (batch_size, M), non-zero entries mark positions to keep

    Output: (batch_size, N, M)

    Args:
      initializer: Keras initializer (name, config dict or instance) used for
        the trilinear weight.
      **kwargs: forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, initializer='glorot_uniform', **kwargs):
        # Set up the Keras base class before assigning any attribute on the layer.
        super(ConfusionMaxtirx, self).__init__(**kwargs)
        # Normalise to an initializer object so it can be serialised in get_config.
        self._initializer = tf.keras.initializers.get(initializer)

    def build(self, input_shape):
        """Create the (3 * dim, 1) trilinear weight from the context shape."""
        c_shape, _, _, _ = input_shape

        self._W = self.add_weight(
            'weight',
            [c_shape[-1] * 3, 1],
            initializer=self._initializer)

        super(ConfusionMaxtirx, self).build(input_shape)

    def call(self, x):
        """Return the masked similarity logits, shape (batch_size, N, M)."""
        c, q, c_mask, q_mask = x
        # N, M and d are read from the static shapes and used as tile/reshape sizes.
        N, M, d = c.shape[1], q.shape[1], c.shape[-1]

        # Pair every context position with every query position:
        # (batch_size, N, M, d)
        c = tf.tile(tf.expand_dims(c, 2), [1, 1, M, 1])
        q = tf.tile(tf.expand_dims(q, 1), [1, N, 1, 1])
        # Flatten the pair grid: (batch_size, N * M, d)
        c = tf.reshape(c, [-1, N * M, d])
        q = tf.reshape(q, [-1, N * M, d])
        c_q = c * q
        # Trilinear feature [c; q; c * q]: (batch_size, N * M, d * 3)
        S = tf.concat([c, q, c_q], 2)
        # Project onto the weight and restore the grid: (batch_size, N, M)
        logits = tf.reshape(tf.tensordot(S, self._W, [[2], [0]]), [-1, N, M])

        # Expand both masks to the pair grid: (batch_size, N, M)
        c_mask = tf.cast(tf.tile(tf.expand_dims(c_mask, 2), [1, 1, M]), tf.bool)
        q_mask = tf.cast(tf.tile(tf.expand_dims(q_mask, 1), [1, N, 1]), tf.bool)

        # A pair is kept only when both its context and query positions are kept.
        return exp_mask(logits, c_mask & q_mask)

    def compute_output_shape(self, input_shape):
        """Output shape is (batch_size, N, M), taken from the context and query shapes."""
        c_shape, q_shape, _, _ = input_shape
        return tf.TensorShape([
            c_shape[0], c_shape[1], q_shape[1]])

    def get_config(self):
        """Include the initializer so the layer can be rebuilt from its config."""
        config = super(ConfusionMaxtirx, self).get_config()
        config['initializer'] = tf.keras.initializers.serialize(self._initializer)
        return config

# Additive penalty applied to masked-out logits.
VERY_NEGATIVE_NUMBER = -1e30


def exp_mask(val, mask):
    """Add ``VERY_NEGATIVE_NUMBER`` to the entries of ``val`` that ``mask`` turns off.

    ``mask`` is cast to float32; entries where it is 1/True leave ``val``
    unchanged, entries where it is 0/False receive the large negative penalty.
    """
    masked_out = 1. - tf.cast(mask, tf.float32)
    penalty = masked_out * VERY_NEGATIVE_NUMBER
    return val + penalty

In [66]:
# Toy batch used to inspect the mask arithmetic by hand.
_c = np.random.randn(2, 10, 128)
_q = np.random.randn(2, 5, 128)
# 1.0 = position is kept, 0.0 = position is masked out.
_c_mask = np.array([[1.] * 8 + [0.] * 2] * 2)
_q_mask = np.array([[1.] * 4 + [0.] * 1] * 2)

# Take the tile sizes from the arrays instead of repeating the literals 10 and 5.
_n_context, _n_query = _c.shape[1], _q.shape[1]

# Expand each mask to the (batch, context, query) grid.
__c_mask = tf.cast(tf.tile(tf.expand_dims(_c_mask, 2), [1, 1, _n_query]), tf.bool)
__q_mask = tf.cast(tf.tile(tf.expand_dims(_q_mask, 1), [1, _n_context, 1]), tf.bool)

# Combine once and reuse the result for both the printout and the penalty.
__cq_mask = __c_mask & __q_mask
print(__cq_mask)

# Additive penalty that exp_mask applies: 0 where kept, -1e30 where masked out.
(1. - tf.cast(__cq_mask, tf.float32)) * VERY_NEGATIVE_NUMBER

tf.Tensor(
[[[ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [False False False False False]
  [False False False False False]]

 [[ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [ True  True  True  True False]
  [False False False False False]
  [False False False False False]]], shape=(2, 10, 5), dtype=bool)


<tf.Tensor: id=935, shape=(2, 10, 5), dtype=float32, numpy=
array([[[-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-1.e+30, -1.e+30, -1.e+30, -1.e+30, -1.e+30],
        [-1.e+30, -1.e+30, -1.e+30, -1.e+30, -1.e+30]],

       [[-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
        [-0.e+00, -0.e+00, -0.e+00, -0.e+00, -1.e+30],
   

In [69]:
import unittest

from unittest import TestCase

class SimilarityMaxrixTest(TestCase):
    """Checks ConfusionMaxtirx against a NumPy reference on a small masked batch."""

    def setUp(self):
        # Seed at test time (not when the cell is defined) so the random inputs
        # are the same on every run, whatever was executed in between.
        np.random.seed(1234)

    def test_similarity_matrix(self):
        in_context = tf.keras.layers.Input(shape=(10, 128))
        in_query = tf.keras.layers.Input(shape=(5, 128))
        in_context_mask = tf.keras.layers.Input(shape=(10,))
        in_query_mask = tf.keras.layers.Input(shape=(5,))

        # All-ones weights make the expected score easy to compute by hand.
        S = ConfusionMaxtirx(
            initializer=tf.keras.initializers.Ones()
        )([in_context, in_query, in_context_mask, in_query_mask])

        model = tf.keras.models.Model(
            inputs=[in_context, in_query,
                    in_context_mask, in_query_mask], outputs=S)
        model.compile(
            optimizer=tf.train.GradientDescentOptimizer(0.001),
            loss='mse')

        context = np.random.randn(2, 10, 128)
        query = np.random.randn(2, 5, 128)
        # The last 2 context positions and the last query position are masked out.
        context_mask = np.array([[1.] * 8 + [0.] * 2] * 2)
        query_mask = np.array([[1.] * 4 + [0.] * 1] * 2)
        prediction = model.predict([context, query, context_mask, query_mask])

        c_ = context[0]
        q_ = query[0]

        print(prediction)

        # With every weight equal to one the score of a pair is
        # sum(c) + sum(q) + sum(c * q); check a 2 x 3 corner of unmasked pairs.
        for i in (0, 1):
            for j in (0, 1, 2):
                with self.subTest(context_pos=i, query_pos=j):
                    expected = sum(
                        np.sum(x) for x in [c_[i], q_[j], c_[i] * q_[j]])
                    actual = prediction[0][i][j]
                    self.assertTrue(
                        np.isclose(expected, actual),
                        msg='expected %r, got %r' % (expected, actual))

        # Rows of masked context positions must be pushed to a huge negative value.
        self.assertTrue((prediction[0][8] < -1e29).all())
        self.assertTrue((prediction[0][9] < -1e29).all())

        # So must the column of the masked query position.
        self.assertTrue((prediction[0][:, 4] < -1e29).all())

In [70]:
# Run the tests defined in this notebook. argv is overridden so unittest does
# not try to parse the kernel's own command-line arguments (the first element
# only stands in for the program name), and exit=False keeps unittest from
# calling sys.exit() when the run finishes.
unittest.main(argv=['first-arg-is-ignored'], exit=False)

  if d.decorator_argspec is not None), _inspect.getargspec(target))
.

tf.Tensor(
[[[-26.680727     8.706963     4.0591383    1.026109    20.791464  ]
  [  8.203361    26.703358    10.090115    28.830126     4.4024324 ]
  [ -0.2873788   22.457983    39.042847    35.48367     17.648518  ]
  [-19.38573    -25.331831   -18.605465    -4.207034    -8.938112  ]
  [ -7.7391853    8.7559395   29.606297    23.942646    -5.004421  ]
  [ -8.550956    13.350004     5.2746067    1.4550667   -5.768668  ]
  [-19.972979    -9.359817   -22.498337    11.988964    16.971111  ]
  [ -9.590694    -0.09886384  -7.1513443  -16.830381   -17.28229   ]
  [ -6.801503     2.1188016    9.362334    -0.9295566   -4.843854  ]
  [  4.9242787    8.273638    -4.1670313   24.31109     -6.914058  ]]

 [[-10.826447    21.88923      2.2997658   12.595575    13.492126  ]
  [ 11.372191     0.44924843   6.1759567   13.439011   -23.922638  ]
  [  2.4858618   11.282563     3.6231716   21.716213     3.372271  ]
  [-42.06301    -19.71345     20.213993     8.019485   -14.077389  ]
  [  8.271914   -21.8


----------------------------------------------------------------------
Ran 1 test in 0.009s

OK


<unittest.main.TestProgram at 0x7f946c2eddd8>

In [16]:
# TODO: unfinished stub. The original line `class Softmax(tf.keras)` had no
# colon, no body and an invalid base, so it was a SyntaxError that kept this
# whole cell from running. The row- and column-wise softmax are applied with
# inline Lambda layers where the attention model is assembled.

class ContextQueryAttention(tf.keras.layers.Lambda):
    """Lambda layer computing context-to-query attention as ``S_ @ q``."""

    def __init__(self, **kwargs):
        def attend(x):
            """context-to-query attention

              Args:
                x:
                  S_: (batch_size, N, M)
                    similarity matrix after a row-wise softmax
                  q: (batch_size, M, d)
            """
            row_softmax, query_vectors = x
            attended = tf.matmul(row_softmax, query_vectors)
            return attended

        super(ContextQueryAttention, self).__init__(function=attend, **kwargs)

    def compute_output_shape(self, input_shape):
        """Return (batch_size, N, d) from the similarity and query shapes."""
        similarity_shape = input_shape[0]
        query_shape = input_shape[1]
        return tf.TensorShape(
            [similarity_shape[0], similarity_shape[1], query_shape[-1]])

In [30]:
class QueryContextAttention(tf.keras.layers.Lambda):
    """Lambda layer computing query-to-context attention, ``S_ @ S__^T @ c``."""

    def __init__(self, **kwargs):
        def fn(x):
            """query-to-context attention

              Args:
                x:
                  S_: (batch_size, N, M)
                    similarity matrix after a row-wise softmax
                  S__: (batch_size, N, M)
                    similarity matrix after a column-wise softmax
                  c: (batch_size, N, d)

              Returns:
                (batch_size, N, d) tensor.
            """
            S_, S__, c = x
            # Multiply right-to-left: S__^T @ c is only (batch_size, M, d), so
            # the (batch_size, N, N) product S_ @ S__^T is never materialised.
            # The result is the same product, up to floating-point rounding.
            return tf.matmul(S_, tf.matmul(S__, c, transpose_a=True))

        super(QueryContextAttention, self).__init__(
            function=fn, **kwargs)

    def compute_output_shape(self, input_shape):
        """Return (batch_size, N, d) from the similarity and context shapes."""
        d = input_shape[2][-1]
        N = input_shape[0][1]
        return tf.TensorShape([input_shape[0][0], N, d])

In [27]:
# Symbolic inputs sized from the configuration cell (N context positions,
# M query positions, d features) instead of repeating the literals.
in_context = tf.keras.layers.Input(shape=(N, d))
in_query = tf.keras.layers.Input(shape=(M, d))
in_context_mask = tf.keras.layers.Input(shape=(N,))
in_query_mask = tf.keras.layers.Input(shape=(M,))

# The similarity layer defined in this notebook is ConfusionMaxtirx and it
# takes the two masks as well; the previously referenced `SimilarityMaxtirx`
# is not defined anywhere, so this cell failed on a fresh kernel.
S = ConfusionMaxtirx(initializer=tf.keras.initializers.Ones())(
    [in_context, in_query, in_context_mask, in_query_mask])
# Row-wise (over query positions) and column-wise (over context positions) softmax.
S_ = tf.keras.layers.Lambda(lambda x: tf.nn.softmax(x, 2))(S)
S__ = tf.keras.layers.Lambda(lambda x: tf.nn.softmax(x, 1))(S)
A = ContextQueryAttention()([S_, in_query])
B = QueryContextAttention()([S_, S__, in_context])

model = tf.keras.models.Model(
    inputs=[in_context, in_query, in_context_mask, in_query_mask],
    outputs=[A, B])
model.compile(
    optimizer=tf.train.GradientDescentOptimizer(0.001),
    loss='mse')

# The sample tensors have no padding, so every position is kept.
context_mask = np.ones((batch_size, N))
query_mask = np.ones((batch_size, M))
prediction = model.predict([context, query, context_mask, query_mask])
prediction



[array([[[ 0.42624378, -0.42186388, -0.3423511 , ...,  0.4141646 ,
           0.2401321 , -0.20993373],
         [ 0.286063  , -0.3810072 , -0.675566  , ...,  0.5241299 ,
           0.3081832 , -0.2659616 ],
         [ 0.3023329 , -0.25960472, -0.5013888 , ...,  0.8884078 ,
           0.0510106 ,  0.09299337],
         ...,
         [ 0.39595932, -0.34718075, -0.7454579 , ...,  0.6564402 ,
          -0.49691024, -0.253864  ],
         [ 0.3028922 , -0.25585917, -0.49765366, ...,  0.90051794,
           0.04516283,  0.10472202],
         [ 0.27646336, -0.38716143, -0.6815043 , ...,  0.5068797 ,
           0.39329955, -0.27373126]],
 
        [[ 0.19743341, -0.40965334, -0.68006325, ..., -0.32033688,
           0.6024212 , -0.33785802],
         [ 0.47385624, -0.6860708 , -0.86413115, ..., -0.41086742,
           1.0548424 , -0.06912111],
         [ 0.5347344 , -0.760919  , -0.89348876, ..., -0.40699884,
           1.1184912 , -0.06335605],
         ...,
         [ 0.5347473 , -0.7609319

In [29]:
# Show the shapes of the two model outputs (A first, then B).
for output_index in (0, 1):
    print(prediction[output_index].shape)

(2, 10, 128)
(2, 10, 128)


3まで終わった