# Match Pyramid

参考实现：

* [MatchPyramid-for-semantic-matching](https://github.com/ddddwy/MatchPyramid-for-semantic-matching/blob/master/match_pyramid.py)
* [MatchZoo](https://github.com/NTMC-Community/MatchZoo)

In [1]:
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -q tensorflow==2.0.0a
    

In [2]:
import tensorflow as tf

tf.__version__

'2.0.0-alpha0'

In [25]:
# Hyper-parameters for the MatchPyramid model built by build_model().
config = dict(
    # Fixed sequence lengths of the two model inputs.
    query_max_len=20,
    doc_max_len=100,
    # build_model() indexes the three per-layer lists below with
    # range(num_conv_layers), so only their first `num_conv_layers` entries
    # are read.
    num_conv_layers=2,
    filters=[8, 16, 32],
    kernel_size=[[5, 5], [3, 3], [3, 3]],
    pool_size=[[2, 2], [2, 2], [2, 2]],
    dropout=0.5,
    batch_size=32,
    # Shape of the shared embedding table.
    vocab_size=100,
    embedding_size=128,
)

In [45]:

def build_model(config):
    """Assemble the MatchPyramid Keras model described by ``config``.

    Query and document token ids go through one shared embedding layer. The
    dot product of the two embedded sequences is reshaped into a
    single-channel matching matrix, which is passed through
    ``config['num_conv_layers']`` Conv2D + MaxPooling2D stages, flattened,
    and fed to a dropout + dense head ending in one sigmoid unit.

    Args:
        config: dict providing 'query_max_len', 'doc_max_len', 'vocab_size',
            'embedding_size', 'num_conv_layers', 'filters', 'kernel_size',
            'pool_size' and 'dropout'.

    Returns:
        A ``tf.keras.Model`` with inputs ``[q_input, d_input]`` and outputs
        ``[matrix, out]`` (the matching matrix and the sigmoid score).
    """
    layers = tf.keras.layers

    q_input = layers.Input(shape=(config['query_max_len'],), name='q_input')
    d_input = layers.Input(shape=(config['doc_max_len'],), name='d_input')

    # One embedding table is applied to both inputs.
    shared_embedding = layers.Embedding(
        config['vocab_size'], config['embedding_size'], name='embedding')
    q_vectors = shared_embedding(q_input)
    d_vectors = shared_embedding(d_input)

    # Word-by-word interaction: dot product over the embedding axis.
    interaction = layers.Dot(axes=-1, name='dot')([q_vectors, d_vectors])
    print('dot shape: ', interaction.shape)

    # Add a trailing channel axis so the matrix can be consumed by Conv2D.
    matrix_shape = (config['query_max_len'], config['doc_max_len'], 1)
    matrix = layers.Reshape(matrix_shape, name='matrix')(interaction)
    print('matrix shape: ', matrix.shape)

    features = matrix
    for layer_idx in range(config['num_conv_layers']):
        conv = layers.Conv2D(
            filters=config['filters'][layer_idx],
            kernel_size=config['kernel_size'][layer_idx],
            padding='same',
            activation='relu')
        features = conv(features)
        pool = layers.MaxPooling2D(pool_size=tuple(config['pool_size'][layer_idx]))
        features = pool(features)

    # Classification head.
    flat = layers.Flatten()(features)
    dropped = layers.Dropout(config['dropout'])(flat)
    hidden = layers.Dense(32, activation='relu')(dropped)
    out = layers.Dense(1, activation='sigmoid', name='out')(hidden)

    return tf.keras.Model(inputs=[q_input, d_input], outputs=[matrix, out])
                                                  

In [47]:

# Build the network from the hyper-parameters above and print its layer table.
model = build_model(config)
model.summary()

# Only the 'out' head is given a loss; the 'matrix' output has no loss entry.
model.compile(optimizer='sgd', loss={'out': 'binary_crossentropy'})

W0508 17:10:30.348741 140249640052544 training_utils.py:1152] Output matrix missing from loss dictionary. We assume this was done on purpose. The fit and evaluate APIs will not be expecting any data to be passed to matrix.


dot shape:  (None, 20, 100)
matrix shape:  (None, 20, 100, 1)
Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
q_input (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
d_input (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           multiple             12800       q_input[0][0]                    
                                                                 d_input[0][0]                    
__________________________________________________________________________________________________
dot (Dot)                     

In [43]:
from tensorflow.python.ops import lookup_ops

# TODO: set these to the real vocabulary files (one token per line). With an
# empty path, index_table_from_file raises
# "ValueError: vocabulary_file must be specified and must not be empty."
query_vocab_file = ''
doc_vocab_file = ''

# token -> id tables; out-of-vocabulary tokens map to id 0.
query_str2id = lookup_ops.index_table_from_file(query_vocab_file, default_value=0)
doc_str2id = lookup_ops.index_table_from_file(doc_vocab_file, default_value=0)
# id -> token tables; unknown ids map to the string 'unk'.
query_id2str = lookup_ops.index_to_string_table_from_file(query_vocab_file, default_value='unk')
doc_id2str = lookup_ops.index_to_string_table_from_file(doc_vocab_file, default_value='unk')

# The str2id tables produce int64 ids, and padded_batch requires each padding
# value to have the same dtype as the component it pads, so the padding id
# must be int64 rather than int32.
unk_id = tf.constant(0, dtype=tf.int64)

ValueError: vocabulary_file must be specified and must not be empty.

In [None]:
# Settings consumed by the tf.data input-pipeline builders below.
dataset_config = dict(
    shuffle_size=10000000,
    num_parallel_calls=4,
    # Token sequences are truncated and padded to these lengths.
    query_max_len=20,
    doc_max_len=100,
    batch_size=32,
    predict_batch_size=32,
)

def _common_process_dataset(dataset, config):
    """Shuffle raw text lines and convert each to ``(query_ids, doc_ids, label)``.

    Each line is split on '@' into three fields: query, document and label.
    Query and document are split on single spaces, truncated to
    ``config['query_max_len']`` / ``config['doc_max_len']`` tokens and mapped
    to ids through the module-level ``query_str2id`` / ``doc_str2id`` tables.
    The label field is parsed as an int32.

    Args:
        dataset: a ``tf.data.Dataset`` of scalar string lines.
        config: dict providing 'shuffle_size', 'num_parallel_calls',
            'query_max_len' and 'doc_max_len'.

    Returns:
        A ``tf.data.Dataset`` of ``(query_ids, doc_ids, label)`` tuples with
        variable-length id vectors (not yet padded or batched).
    """

    def _parse_line(line):
        # TF2 names: tf.strings.split / tf.strings.to_number replace the TF1
        # tf.string_split / tf.string_to_number. For a scalar input,
        # tf.strings.split returns a 1-D string tensor.
        # NOTE: unlike TF1's string_split (skip_empty=True), an explicit
        # separator keeps empty strings between consecutive separators.
        fields = tf.strings.split(line, sep='@')  # split once, not three times
        query_tokens = tf.strings.split(fields[0], sep=' ')[:config['query_max_len']]
        doc_tokens = tf.strings.split(fields[1], sep=' ')[:config['doc_max_len']]
        label = tf.strings.to_number(fields[2], out_type=tf.int32)
        return query_str2id.lookup(query_tokens), doc_str2id.lookup(doc_tokens), label

    # NOTE(review): shuffling happens here for every caller, including the
    # eval and predict pipelines.
    dataset = dataset.shuffle(config['shuffle_size'])
    # A single map stage does the parsing that was previously spread over
    # four map+prefetch stages; the constant is spelled AUTOTUNE.
    dataset = dataset.map(
        _parse_line,
        num_parallel_calls=config['num_parallel_calls']
    ).prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

def _build_dataset(dataset, config):
    """Turn raw text lines into padded ``((query_ids, doc_ids), label)`` batches.

    Args:
        dataset: a ``tf.data.Dataset`` of raw text lines, passed through
            ``_common_process_dataset``.
        config: dict providing 'batch_size', 'query_max_len', 'doc_max_len'
            and 'num_parallel_calls', plus the keys read by
            ``_common_process_dataset``.

    Returns:
        A ``tf.data.Dataset`` of ``((q, d), l)`` batches where ``q`` and ``d``
        are padded to 'query_max_len' / 'doc_max_len'.
    """
    dataset = _common_process_dataset(dataset, config)

    # Padding values must have the same dtype as the component they pad; the
    # lookup tables produce int64 ids, so cast the padding id accordingly.
    pad_id = tf.cast(unk_id, tf.int64)
    dataset = dataset.padded_batch(
        batch_size=config['batch_size'],
        # The keyword is `padded_shapes`; plain shape lists replace
        # tf.Dimension, which is not in the TF2 namespace.
        padded_shapes=([config['query_max_len']],
                       [config['doc_max_len']],
                       []),
        padding_values=(pad_id, pad_id, 0)
    )
    # Group the two inputs into one tuple so each element is (inputs, label).
    dataset = dataset.map(
        lambda q, d, l: ((q, d), l),
        num_parallel_calls=config['num_parallel_calls']
    ).prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

def build_train_dataset(train_files, config):
    """Create the training input pipeline from a list of text files.

    Args:
        train_files: file paths; every file is read line by line.
        config: dataset settings. The optional 'skip_count' key (default 0)
            gives the number of leading lines skipped in each file.

    Returns:
        The dataset produced by ``_build_dataset`` for those lines.
    """

    def _read_lines(path):
        # Drop the first `skip_count` lines of each file.
        return tf.data.TextLineDataset(path).skip(config.get('skip_count', 0))

    lines = tf.data.Dataset.from_tensor_slices(train_files).flat_map(_read_lines)
    return _build_dataset(lines, config)

def build_eval_dataset(eval_files, config):
    """Create the evaluation input pipeline from a list of text files.

    Unlike the train and predict builders, no leading lines are skipped.

    Args:
        eval_files: file paths; every file is read line by line.
        config: dataset settings forwarded to ``_build_dataset``.

    Returns:
        The dataset produced by ``_build_dataset`` for those lines.
    """
    dataset = tf.data.Dataset.from_tensor_slices(eval_files)
    dataset = dataset.flat_map(lambda x: tf.data.TextLineDataset(x))
    # `config` was previously omitted here, which raised a TypeError.
    dataset = _build_dataset(dataset, config)
    return dataset

def build_predict_dataset(predict_files, config):
    """Create the prediction input pipeline from a list of text files.

    Assumes the predict files also carry a label field; the label is parsed
    and then dropped.

    Args:
        predict_files: file paths; every file is read line by line.
        config: dict providing 'predict_batch_size', 'query_max_len',
            'doc_max_len' and 'num_parallel_calls', plus the keys read by
            ``_common_process_dataset``. The optional 'skip_count' key
            (default 0) gives the number of leading lines skipped per file.

    Returns:
        A ``tf.data.Dataset`` of ``{'q_input': q, 'd_input': d}`` batches,
        keyed by the model's input layer names.
    """
    dataset = tf.data.Dataset.from_tensor_slices(predict_files)
    dataset = dataset.flat_map(lambda x: tf.data.TextLineDataset(x).skip(config.get('skip_count', 0)))
    # NOTE(review): _common_process_dataset shuffles its input, so the
    # prediction order does not follow the line order of the files.
    dataset = _common_process_dataset(dataset, config)
    dataset = dataset.map(
        lambda q, d, l: (q, d),
        num_parallel_calls=config['num_parallel_calls']
    ).prefetch(tf.data.experimental.AUTOTUNE)

    # Padding values must match the int64 dtype of the looked-up ids.
    pad_id = tf.cast(unk_id, tf.int64)
    dataset = dataset.padded_batch(
        batch_size=config['predict_batch_size'],
        padded_shapes=([config['query_max_len']],
                       [config['doc_max_len']]),
        padding_values=(pad_id, pad_id)
    )
    # A bare (q, d) tuple would be read by Keras as (inputs, targets). A dict
    # keyed by the Input layer names feeds both tensors as model inputs.
    dataset = dataset.map(lambda q, d: {'q_input': q, 'd_input': d})
    return dataset


In [None]:
# train and eval model
import os

# TODO: set these to the real training / evaluation data file paths. The
# dataset builders take the file list as their first argument.
train_files = ['']
eval_files = ['']

train_dataset = build_train_dataset(train_files, dataset_config)
eval_dataset = build_eval_dataset(eval_files, dataset_config)

model_dir = '/opt/algo_nfs/kdd_luozhouyang/models/match_pyramid/20190508'
# os.makedirs creates missing parent directories; exist_ok makes it a no-op
# when the directory is already there.
os.makedirs(model_dir, exist_ok=True)

ckpt_path = os.path.join(model_dir, 'mp-{epoch:04d}.ckpt')
log_dir = os.path.join(model_dir, 'log')

callbacks = [
    tf.keras.callbacks.ModelCheckpoint(ckpt_path, save_weights_only=True),
    tf.keras.callbacks.TensorBoard(log_dir),
    tf.keras.callbacks.EarlyStopping(patience=10),
]

# NOTE(review): fit() defaults to epochs=1, so EarlyStopping(patience=10)
# cannot trigger unless an `epochs` value is passed here.
model.fit(train_dataset, validation_data=eval_dataset, callbacks=callbacks)


In [None]:
# test model

# TODO: set this to the real prediction data file paths. The dataset builder
# takes the file list as its first argument.
predict_files = ['']
predict_dataset = build_predict_dataset(predict_files, dataset_config)

# predict() returns one NumPy array per model output, in the order the
# outputs were declared ([matrix, out]); the result is a list, not a dict.
matrix, out = model.predict(predict_dataset)

# These are already NumPy arrays, so no .numpy() call is needed.
print(matrix)
print(out)