In [1]:
import tensorflow as tf
import numpy as np

# Switch TensorFlow to eager execution so the ops in the cells below run
# immediately and return concrete values instead of building a graph.
tf.enable_eager_execution()

  from ._conv import register_converters as _register_converters


In [2]:
# Sanity check: confirm that eager execution is active.
tf.executing_eagerly()

True

In [3]:
# Word embedding: toy dimensions used by the cells below

batch_size = 2
N = 10 # maximum context length (words per context)
M = 5 # maximum question length
p1 = 300 # word embedding size

> All the out-of-vocabulary words are mapped to an <UNK> token, whose embedding is trainable with random initialization. 


In [4]:
from collections import namedtuple


class Inputs(namedtuple('Inputs', 'context_words context_word_unk_label')):
    """Toy model inputs: word vectors plus a per-word UNK flag."""


# Fixed seed so the random word vectors are reproducible.
np.random.seed(1234)

inputs = Inputs(
    # (batch_size, N, p1) random stand-ins for word vectors
    context_words=np.random.randn(batch_size, N, p1),
    # (batch_size, N) boolean flags: only the first word of each context is UNK
    context_word_unk_label=np.tile(np.arange(N) == 0, (batch_size, 1)))

In [5]:
# Single trainable vector shared by every out-of-vocabulary word.
unk_word_embedding = tf.get_variable('unk_word_embedding', shape=(1, p1))

# Repeat the (batch_size, N) UNK flags along the embedding axis
# -> (batch_size, N, p1), the same shape as the word vectors.
unk_mask = tf.tile(
    tf.expand_dims(inputs.context_word_unk_label, -1),
    [1, 1, p1])
# Looking up index 0 at every position yields the UNK vector everywhere
# -> (batch_size, N, p1).
unk_vectors = tf.nn.embedding_lookup(
    unk_word_embedding,
    tf.zeros_like(inputs.context_word_unk_label, dtype=tf.int32))
# Take the UNK vector where flagged, the original word vector elsewhere.
context_words = tf.where(unk_mask, unk_vectors, inputs.context_words)

# The first word is flagged as UNK, so its vector must have been replaced ...
assert not np.allclose(context_words[0][0], inputs.context_words[0][0])
# ... while the second word is not flagged and must pass through unchanged.
assert np.allclose(context_words[0][1], inputs.context_words[0][1])

> The character embedding is obtained as follows: Each character is represented as a trainable vector of dimension $p_2 = 200$, meaning each word can be viewed as the concatenation of the embedding vectors for each of its characters. The length of each word is either truncated or padded to 16. We take maximum value of each row of this matrix to get a fixed-size vector representation of each word. Finally, the output of a given word $x$ from this layer is the concatenation $[x_w; x_c] \in \mathbb{R}^{p_1 + p_2}$, where $x_w$ and $x_c$ are the word embedding and the convolution output of character embedding of $x$ respectively.

In [6]:
C = 16  # maximum word length (characters per word)
char_vocab_size = 1000
p2 = 200  # character embedding size


class Inputs(namedtuple(
        'Inputs',
        'context_words context_word_unk_label context_char_ids')):
    """Toy model inputs: word vectors, per-word UNK flags and character ids."""


# Fixed seed so the random inputs are reproducible.
np.random.seed(1234)

inputs = Inputs(
    # (batch_size, N, p1) random stand-ins for word vectors
    context_words=np.random.randn(batch_size, N, p1),
    # (batch_size, N) boolean flags: only the first word of each context is UNK
    context_word_unk_label=np.tile(np.arange(N) == 0, (batch_size, 1)),
    # (batch_size, N, C) random character ids in [0, char_vocab_size)
    context_char_ids=np.random.randint(
        0, char_vocab_size, size=(batch_size, N, C)))

In [7]:
# Trainable lookup table holding one p2-dimensional vector per character id.
context_char_embedding = tf.get_variable(
    name='context_char_embedding',
    shape=(char_vocab_size, p2))

# Gather one vector per character id:
# (batch_size, N, C) -> (batch_size, N, C, p2)
context_chars = tf.nn.embedding_lookup(
    params=context_char_embedding,
    ids=inputs.context_char_ids)

In [8]:
# conv2d with 'VALID' padding maps
#    (batch, height, width, in_channels)
#    -> (batch, height - fh + 1, width - fw + 1, out_channels)
#
# Here every word is a 1 x C "image" with p2 channels and the kernel is
# 1 x filter_size:
#    (batch, 1, C, p2) -> (batch, 1, C - filter_size + 1, p2)
# where the conv batch axis is really batch_size * N (every word of every context).

filter_size = 7

# Merge the batch and word axes, then add a height axis of size 1:
# (batch_size, N, C, p2) -> (batch_size * N, C, p2) -> (batch_size * N, 1, C, p2)
context_chars = tf.reshape(context_chars, [-1, C, p2])
context_chars = tf.expand_dims(context_chars, 1)
kernel = tf.get_variable('char_filter', [1, filter_size, p2, p2])
# Convolve along the character axis:
# -> (batch_size * N, 1, C - filter_size + 1, p2)
context_chars = tf.nn.conv2d(
    context_chars, kernel, strides=[1, 1, 1, 1], padding='VALID')
# Max-pool over the character axis -> (batch_size * N, 1, p2).
# NOTE: the result is NOT reshaped back to (batch_size, N, p2) in this cell.
context_chars = tf.reduce_max(context_chars, axis=2)
context_chars

<tf.Tensor: id=82, shape=(20, 1, 200), dtype=float32, numpy=
array([[[0.08703647, 0.10834691, 0.05828177, ..., 0.04130189,
         0.06452552, 0.06336806]],

       [[0.04885535, 0.07087737, 0.09226833, ..., 0.03336651,
         0.06198725, 0.08716781]],

       [[0.05197889, 0.08440043, 0.05774486, ..., 0.08644083,
         0.06589647, 0.0719725 ]],

       ...,

       [[0.09894997, 0.07307689, 0.05758217, ..., 0.03159501,
         0.05708284, 0.09963546]],

       [[0.06574344, 0.03455252, 0.04887037, ..., 0.03554013,
         0.07093097, 0.098703  ]],

       [[0.06985399, 0.08604864, 0.04421144, ..., 0.08145757,
         0.09867382, 0.13105427]]], dtype=float32)>

In [13]:
class WordEmbedding(tf.keras.layers.Layer):
    """Replace the vectors of out-of-vocabulary words with a trainable <UNK> vector.

    The layer takes a pair ``[words, word_unk_label]`` and returns a tensor
    with the same shape as ``words``.
    """

    def build(self, input_shape):
        """Create the single trainable <UNK> embedding row.

          input_shape: pair (shape of words, shape of word_unk_label); only
            the last dimension of the words shape is used.
        """
        words_shape, _ = input_shape
        embedding_dim = words_shape[-1]

        self._unk_embedding = self.add_weight(
            name='unk_embedding',
            shape=[1, embedding_dim],
            initializer='glorot_uniform')

        super(WordEmbedding, self).build(input_shape)

    def call(self, x):
        """Replace the words flagged by the UNK label with the <UNK> embedding.

          x:
            words: (batch_size, N, word_embedding_dim)
            word_unk_label: (batch_size, N)
        """
        words, word_unk_label = x
        # The flags may not arrive as booleans (the original author observed
        # floats here), so cast them explicitly.
        unk_mask = tf.cast(word_unk_label, tf.bool)

        # "All the out-of-vocabulary words are mapped to an <UNK> token,
        # whose embedding is trainable with random initialization."

        # Repeat each flag along the embedding axis so the mask has the same
        # shape as `words`.
        tiled_mask = tf.tile(
            tf.expand_dims(unk_mask, -1), [1, 1, words.shape[-1]])
        # Index 0 at every position -> the <UNK> row everywhere.
        unk_vectors = tf.nn.embedding_lookup(
            self._unk_embedding,
            tf.zeros_like(unk_mask, dtype=tf.int32))
        # (batch_size, N, word_embedding_dim)
        return tf.where(tiled_mask, unk_vectors, words)

    def compute_output_shape(self, input_shape):
        """The output has the same shape as the `words` input."""
        words_shape = input_shape[0]
        return words_shape

# !!!!!! assertion
    
class CharacterEmbedding(tf.keras.layers.Layer):
    """Character-level word encoder: embed characters, convolve, max-pool.

    Maps integer character ids of shape (batch_size, N, C) to one
    fixed-size vector per word, shape (batch_size, N, num_filters).
    """

    def __init__(self, vocab_size, dim, filter_size, num_filters=None, **kwargs):
        """Constructor.

          vocab_size: size of the character vocabulary
          dim: dimension of each character embedding
          filter_size: width of the convolution filter (in characters)
          num_filters: number of convolution output channels, i.e. the size
            of the returned per-word vector. Defaults to `dim`, which is the
            behaviour this layer had before the parameter existed.
        """
        # Initialise the Keras base class before setting any attribute.
        super(CharacterEmbedding, self).__init__(**kwargs)
        self._vocab_size = vocab_size
        self._dim = dim
        self._filter_size = filter_size
        self._num_filters = dim if num_filters is None else num_filters

    def build(self, input_shape):
        """Create the embedding table, the convolution filter and its bias."""
        self._embedding = self.add_weight(
            'embedding',
            [self._vocab_size, self._dim],
            initializer='glorot_uniform')
        self._filter = self.add_weight(
            'filter',
            # [height, width, in_channels, out_channels]: in_channels is the
            # character embedding size, out_channels the number of filters.
            [1, self._filter_size, self._dim, self._num_filters],
            initializer='glorot_uniform')
        self._bias = self.add_weight(
            'bias',
            [1, 1, 1, self._num_filters],
            initializer='zeros')
        super(CharacterEmbedding, self).build(input_shape)

    def call(self, x):
        """Convert character ids into one vector per word.

          x: (batch_size, N, C)
              C is the maximum number of characters per word (16 in this
              notebook)

          returns: (batch_size, N, num_filters)
        """
        x = tf.cast(x, tf.int32)
        # Use plain Python ints for the static dimensions so they can be used
        # directly in reshape lists and arithmetic.
        N, C = x.shape.as_list()[1:]
        pooled_width = C - self._filter_size + 1

        # from BiDAF
        # Characters are embedded into vectors, which can be
        # considered as 1D inputs to the CNN, and whose size is
        # the input channel size of the CNN.
        # The outputs of the CNN are max-pooled over the entire
        # width to obtain a fixed-size vector for each word.

        # (batch_size, N, C, dim)
        x_ = tf.nn.embedding_lookup(self._embedding, x)
        # (batch_size * N, C, dim)
        x_ = tf.reshape(x_, [-1, C, self._dim])
        # (batch_size * N, 1, C, dim)
        x_ = tf.expand_dims(x_, 1)
        # (batch_size * N, 1, C - filter_size + 1, num_filters)
        x_ = tf.nn.conv2d(x_, self._filter, [1, 1, 1, 1], 'VALID') + self._bias
        # (batch_size, N, C - filter_size + 1, num_filters)
        x_ = tf.reshape(x_, [-1, N, pooled_width, self._num_filters])
        # Max over the character axis: (batch_size, N, num_filters)
        return tf.reduce_max(tf.nn.relu(x_), 2)

    def compute_output_shape(self, input_shape):
        """(batch_size, N, C) -> (batch_size, N, num_filters)."""
        return tf.TensorShape(
            [input_shape[0], input_shape[1], self._num_filters])

In [14]:
# Toy hyper-parameters (repeated here so this cell states everything it uses).
batch_size = 2
N = 10  # maximum context length
M = 5  # maximum question length
p1 = 300  # word embedding size
C = 16  # maximum word length
char_vocab_size = 1000
p2 = 200  # character embedding size
filter_size = 7

# Symbolic inputs; the batch axis is implicit.
context_words = tf.keras.layers.Input(shape=(N, p1))
context_word_unk_label = tf.keras.layers.Input(shape=(N,))
context_char_ids = tf.keras.layers.Input(shape=(N, C), dtype='int32')

word_embedding_layer = WordEmbedding()
char_embedding_layer = CharacterEmbedding(
    vocab_size=char_vocab_size, dim=p2, filter_size=filter_size)

# Word branch: UNK-aware word vectors.
context_word_emb = word_embedding_layer(
    [context_words, context_word_unk_label])
# Character branch: one vector per word built from its characters.
context_char_emb = char_embedding_layer(context_char_ids)

print(context_char_emb)
print(context_word_emb)

# Join both branches along the feature axis: [word ; char].
con = tf.keras.layers.Concatenate(axis=2)
context = con([context_word_emb, context_char_emb])

model = tf.keras.models.Model(
    inputs=[context_words, context_word_unk_label, context_char_ids],
    outputs=context)
model.compile(
    loss='mse',
    optimizer=tf.train.GradientDescentOptimizer(0.001))

DeferredTensor('None', shape=(?, 10, 200), dtype=float32)
DeferredTensor('None', shape=(?, 10, 300), dtype=float32)


In [15]:
from collections import namedtuple


class Inputs_(namedtuple(
        'Inputs_',
        'context_words context_word_unk_label context_char_ids')):
    """Concrete toy inputs, in the same order as the model's Input layers."""


# Fixed seed so the random inputs are reproducible.
np.random.seed(1234)

inputs = Inputs_(
    # (batch_size, N, p1) random stand-ins for word vectors
    context_words=np.random.randn(batch_size, N, p1),
    # (batch_size, N) boolean flags: only the first word of each context is UNK
    context_word_unk_label=np.tile(np.arange(N) == 0, (batch_size, 1)),
    # (batch_size, N, C) random character ids in [0, char_vocab_size)
    context_char_ids=np.random.randint(
        0, char_vocab_size, size=(batch_size, N, C)))

In [16]:
# Print the layer table (output shapes and parameter counts) of the embedding model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 10, 300)      0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, 10)           0                                            
__________________________________________________________________________________________________
input_9 (InputLayer)            (None, 10, 16)       0                                            
__________________________________________________________________________________________________
word_embedding_3 (WordEmbedding (None, 10, 300)      300         input_7[0][0]                    
                                                                 input_8[0][0]                    
__________

In [None]:
# Forward pass on the toy batch. The output should have shape
# (batch_size, N, p1 + p2), since the word and character embeddings are
# concatenated on the last axis.
# NOTE: this cell has no execution count, i.e. it was not run in the saved
# notebook. `inputs` must still be the Inputs_ tuple here; a later cell
# rebinds the name `inputs` to a Keras Input tensor.
model.predict(list(inputs)).shape

> The concatenation of the character and word embedding vectors is passed to a two-layer Highway Network (Srivastava et al., 2015).

BiDAFより

In [17]:
class HighwayLayer(tf.keras.layers.Layer):
    """One highway layer: ``y = H(x) * T(x) + x * (1 - T(x))``.

    ``T`` is a sigmoid transform gate and ``H`` a ReLU transform. Both are
    affine maps over the last axis of the input, so the output has the same
    shape as the input. Inputs of rank 2 (batch, d) and of higher rank,
    e.g. (batch, N, d), are supported.
    """

    def __init__(self, **kwargs):
        super(HighwayLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the square gate/transform weights for feature size d."""
        d = input_shape[-1]

        self._W_T = self.add_weight(
            'weight_transform',
            [d, d],
            initializer='glorot_uniform')
        self._b_T = self.add_weight(
            'bias_transform',
            [d],
            initializer='zeros')
        self._W = self.add_weight(
            'weight',
            [d, d],
            initializer='glorot_uniform')
        self._b = self.add_weight(
            'bias',
            [d],
            initializer='zeros')
        # Mark the layer as built, consistent with the other layers in this notebook.
        super(HighwayLayer, self).build(input_shape)

    def _affine(self, input, weight, bias):
        """Return ``input @ weight + bias``, contracting only the last axis of `input`."""
        rank = len(input.shape)
        if rank > 2:
            # tf.matmul does not accept a rank-3 input with a rank-2 weight on
            # older TensorFlow releases; tensordot contracts the last axis of
            # `input` with the first axis of `weight` for any rank.
            projected = tf.tensordot(input, weight, [[rank - 1], [0]])
        else:
            projected = tf.matmul(input, weight)
        return projected + bias

    def call(self, input):
        """Apply the highway transform to `input` (..., d) -> (..., d)."""
        # Transform gate in (0, 1): how much of H(x) to let through.
        T = tf.sigmoid(self._affine(input, self._W_T, self._b_T))
        # Candidate transform.
        H = tf.nn.relu(self._affine(input, self._W, self._b))
        # The remaining (1 - T) share carries the input through unchanged.
        return H * T + (1. - T) * input

    def compute_output_shape(self, input_shape):
        """The output shape equals the input shape."""
        return input_shape

In [18]:
# Smoke test: wire a single HighwayLayer into a Keras model on 10-dim inputs.
inputs = tf.keras.layers.Input(shape=(10,))

x = HighwayLayer()(inputs)

model = tf.keras.models.Model(inputs=inputs, outputs=x)
model.compile(
    loss='mse',
    optimizer=tf.train.GradientDescentOptimizer(0.001))

In [19]:
class HighwayNetwork(tf.keras.models.Model):
    """A stack of `num_layers` HighwayLayer instances applied in sequence.

    The highway layers act on the last (feature) axis only, so the output
    has the same shape as the input.
    """

    def __init__(self, num_layers, **kwargs):
        """Constructor.

          num_layers: number of HighwayLayer instances to stack
        """
        # The base class must be initialised first: Keras keeps its own
        # bookkeeping in a private `_layers` attribute and (re)initialises it
        # in __init__, so a list stored under that name beforehand can be
        # silently discarded, turning this model into an identity function.
        super(HighwayNetwork, self).__init__(**kwargs)
        self._num_highway_layers = num_layers
        # Attach every layer as its own attribute; attribute assignment is how
        # a subclassed Model registers sub-layers (and their weights).
        for index in range(num_layers):
            setattr(self, 'highway_%d' % index, HighwayLayer())

    def call(self, input):
        """Run `input` (..., d) through every highway layer in order."""
        # Collapse all leading axes so each layer always receives a plain
        # (rows, features) matrix, regardless of the rank of `input`.
        feature_dim = int(input.shape[-1])
        y = tf.reshape(input, [-1, feature_dim])
        for index in range(self._num_highway_layers):
            y = getattr(self, 'highway_%d' % index)(y)
        # Restore the original leading axes.
        output = tf.reshape(y, tf.shape(input))
        if not tf.executing_eagerly():
            # The dynamic reshape drops static shape information in graph
            # mode; restore it for downstream layers.
            output.set_shape(input.shape)
        return output

    def compute_output_shape(self, input_shape):
        """The output shape equals the input shape."""
        return input_shape

In [20]:
# Wrap a two-layer highway network in a Keras model; 500 = p1 + p2, the size
# of the concatenated word and character embedding.
inputs = tf.keras.layers.Input(shape=(10, 500))

x = HighwayNetwork(num_layers=2)(inputs)

model = tf.keras.models.Model(inputs=inputs, outputs=x)
model.compile(
    loss='mse',
    optimizer=tf.train.GradientDescentOptimizer(0.001))

In [21]:
# Shape check only: a random (2, 10, 500) batch should come back with the same shape.
model.predict(np.random.randn(2, 10, 500)).shape

(2, 10, 500)

やっとInput Embedding Layer終わり