# DSSM keras


## 关键问题

* 与 word2vec 不同：word2vec 的词典规模不大，做负采样很容易；而 DSSM 网络的 query 和 doc 千变万化，无法全部加载进内存后再做负采样，因此应该在准备训练数据的阶段就把负样本准备好。
* 计算query和doc的cosine相似度，是不是直接使用keras的Dot层即可？



In [2]:
import tensorflow as tf

In [1]:


class NegativeSamplingModelBuilder:
    """Base builder for query/doc models scored against pre-sampled negatives.

    Stores hyper-parameters and provides the shared ``cosine`` scoring op.
    Subclasses must implement ``build_model`` and ``build_observable_model``.

    Args:
        neg_num: Number of extra doc candidates per query; ``cosine`` scores
            ``neg_num + 1`` docs for every query row.
        params: Optional dict of overrides merged on top of
            ``default_params()``.
    """

    def __init__(self, neg_num=10, params=None):
        self.neg_num = neg_num

        merged_params = self.default_params()
        if params:
            merged_params.update(params)
        self.params = merged_params

        # Caches filled in by the subclass build_* methods.
        self.model = None
        self.observable_model = None

    def build_model(self):
        """Build and return the trainable model (implemented by subclasses)."""
        raise NotImplementedError()

    def build_observable_model(self):
        """Build and return a model exposing intermediate outputs (subclasses)."""
        raise NotImplementedError()

    def cosine(self, x):
        """Return cosine similarities between each query and its candidate docs.

        Args:
            x: A ``(query, doc)`` pair of 2-D tensors. ``doc`` is expected to
                hold ``neg_num + 1`` times as many rows as ``query``, laid out
                as ``neg_num + 1`` consecutive groups that each follow the
                row order of ``query`` (this mirrors how ``query`` is tiled
                below).

        Returns:
            A tensor with one row per query and ``neg_num + 1`` columns;
            column ``k`` holds the similarity to the doc from group ``k``.
        """
        query, doc = x
        copies = self.neg_num + 1

        # Repeat the whole query batch once per doc group so rows line up.
        query_norm = tf.tile(tf.sqrt(tf.reduce_sum(tf.square(query), 1, True)), [copies, 1])
        doc_norm = tf.sqrt(tf.reduce_sum(tf.square(doc), 1, True))

        prod = tf.reduce_sum(tf.multiply(tf.tile(query, [copies, 1]), doc), 1, True)
        prod_norm = tf.multiply(query_norm, doc_norm)

        cos = tf.truediv(prod, prod_norm)

        # -1 lets TensorFlow infer the batch size from the data, so a final
        # partial batch or a prediction batch whose size differs from
        # params['batch_size'] is reshaped correctly instead of failing.
        cos = tf.transpose(tf.reshape(tf.transpose(cos), [copies, -1]))
        return cos

    def default_params(self):
        """Return a fresh dict of default hyper-parameters."""
        return {
            'vocab_size': 100,
            'vec_dim': 256,
            'batch_size': 32,
            'query_max_len': 10,
            'doc_max_len': 100,
        }


class MLPNegativeSamplingModelBuilder(NegativeSamplingModelBuilder):
    """Negative-sampling builder whose two towers are plain Dense stacks.

    Inputs go straight into Dense layers (multi-hot style features) rather
    than through an embedding lookup.
    """

    def build_model(self):
        """Build, compile and cache the training model; reuse it afterwards."""
        if not self.model:
            layers = tf.keras.layers
            cfg = self.params

            # Query tower.
            query_input = layers.Input(shape=(cfg['query_max_len'],), name='query_input')
            query_hidden = layers.Dense(1024, name='query_dense_1')(query_input)
            query_vec = layers.Dense(cfg['vec_dim'], name='query_vec')(query_hidden)

            # Doc tower.
            doc_input = layers.Input(shape=(cfg['doc_max_len'],), name='doc_input')
            doc_hidden = layers.Dense(1024, name='doc_dense_1')(doc_input)
            doc_vec = layers.Dense(cfg['vec_dim'], name='doc_vec')(doc_hidden)

            # Score every query against its candidates, then normalise per row.
            scores = layers.Lambda(self.cosine, name='cosine')([query_vec, doc_vec])
            probs = layers.Activation('softmax')(scores)

            # Only column 0 of the softmax is exposed as the model output.
            similarity = layers.Lambda(
                lambda t: tf.slice(t, [0, 0], [-1, 1]),
                name='similarity',
            )(probs)

            net = tf.keras.Model(inputs=[query_input, doc_input], outputs=[similarity])
            net.compile(
                optimizer='sgd',
                loss='binary_crossentropy',
                metrics=['accuracy', tf.keras.metrics.Recall(), tf.keras.metrics.Precision()],
            )
            net.summary()

            self.model = net
        return self.model

    def build_observable_model(self):
        """Return a cached model exposing both tower vectors and the similarity."""
        if not self.observable_model:
            base = self.build_model()
            watched = ('query_vec', 'doc_vec', 'similarity')
            self.observable_model = tf.keras.Model(
                inputs=base.input,
                outputs=[base.get_layer(layer_name).output for layer_name in watched],
            )
        return self.observable_model


In [None]:
def build_mlp_model(config):
    """Unfinished stub for an MLP model builder.

    NOTE(review): ``config`` is never read, nothing is returned, and ``shape``
    is not defined inside this function. Unless a global ``shape`` exists
    outside the visible part of the file, calling this raises ``NameError``.
    """
    query_input = tf.keras.layers.Input(shape)

In [16]:
import tensorflow as tf

class Attention(tf.keras.Model):
    """Additive (tanh-scored) attention over a sequence of feature vectors.

    Args:
        units: Width of the two projection layers used to compute the score.
    """

    def __init__(self, units):
        super().__init__()
        dense = tf.keras.layers.Dense
        self.W1 = dense(units)
        self.W2 = dense(units)
        self.V = dense(1)

    def call(self, features, hidden):
        """Return ``(context_vector, attention_weights)`` for one query state.

        Assumes ``features`` is (batch, steps, dim) and ``hidden`` is
        (batch, dim) -- TODO confirm against the caller.
        """
        # Add an axis at position 1 so the projected state broadcasts over
        # axis 1 of the projected features.
        expanded_hidden = tf.expand_dims(hidden, 1)
        energies = tf.nn.tanh(self.W1(features) + self.W2(expanded_hidden))

        # Softmax over axis 1, then a weighted sum of the features on that axis.
        attention_weights = tf.nn.softmax(self.V(energies), axis=1)
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)

        return context_vector, attention_weights


def build_lstm_model():
    """Build a functional two-input model with a shared embedding and LSTM.

    Returns:
        A ``tf.keras.Model`` taking ``[query_input, doc_input]`` and producing
        two outputs: the sigmoid head named ``'out'`` (fed by a normalised
        ``Dot`` layer) and the hand-written cosine similarity.
    """
    layers = tf.keras.layers

    def cosine(pair):
        # Row-wise cosine similarity, kept as a column vector.
        q, d = pair
        q_len = tf.sqrt(tf.reduce_sum(tf.square(q), axis=1, keepdims=True))
        d_len = tf.sqrt(tf.reduce_sum(tf.square(d), axis=1, keepdims=True))
        inner = tf.reduce_sum(tf.multiply(q, d), axis=1, keepdims=True)
        cos = tf.truediv(inner, tf.multiply(q_len, d_len))
        print('cos:', cos)
        return cos

    query_input = layers.Input(shape=(6,))
    doc_input = layers.Input(shape=(10,))

    # One embedding table and one LSTM are reused by both inputs.
    shared_embedding = layers.Embedding(100, 128)
    query_embedded = shared_embedding(query_input)
    doc_embedded = shared_embedding(doc_input)

    shared_lstm = layers.LSTM(32)
    query_state = shared_lstm(query_embedded)
    doc_state = shared_lstm(doc_embedded)
    print(query_state)
    print(doc_state)

    # Layers are created in the same order as before so that Keras'
    # auto-generated layer names stay the same.
    query_hidden = layers.Dense(1024)(query_state)
    doc_hidden = layers.Dense(1024)(doc_state)

    query_vec = layers.Dense(256)(query_hidden)
    doc_vec = layers.Dense(256)(doc_hidden)

    vec_pair = [query_vec, doc_vec]
    cos = layers.Lambda(cosine)(vec_pair)

    normalized_dot = layers.Dot(axes=1, normalize=True)(vec_pair)
    out = layers.Dense(1, activation='sigmoid', name='out')(normalized_dot)

    return tf.keras.Model(inputs=[query_input, doc_input], outputs=[out, cos])

# Build the functional model and print its layer table.
model = build_lstm_model()
model.summary()

# Only the sigmoid head named 'out' gets a loss; the second (cosine) output
# is left out of the loss dictionary.
model.compile(optimizer='sgd', loss={'out': 'binary_crossentropy'})

W0423 16:32:22.415776 4506338752 training_utils.py:1152] Output lambda_6 missing from loss dictionary. We assume this was done on purpose. The fit and evaluate APIs will not be expecting any data to be passed to lambda_6.


Tensor("unified_lstm_15/strided_slice_3:0", shape=(None, 32), dtype=float32)
Tensor("unified_lstm_15_1/strided_slice_3:0", shape=(None, 32), dtype=float32)
cos: Tensor("lambda_6/truediv:0", shape=(None, 1), dtype=float32)
Model: "model_10"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_31 (InputLayer)           [(None, 6)]          0                                            
__________________________________________________________________________________________________
input_32 (InputLayer)           [(None, 10)]         0                                            
__________________________________________________________________________________________________
embedding_15 (Embedding)        multiple             12800       input_31[0][0]                   
                                                                 in

In [3]:
def cosine_similarity(x):
    """Return the row-wise cosine similarity of a ``(q, d)`` tensor pair.

    Both tensors are reduced along axis 1 with the reduced axis kept, so the
    result has a size-1 axis 1.
    """
    def _row_length(t):
        # Euclidean length of every row, kept as a column.
        return tf.sqrt(tf.reduce_sum(tf.square(t), axis=1, keepdims=True))

    q, d = x
    q_len = _row_length(q)
    d_len = _row_length(d)
    inner = tf.reduce_sum(tf.multiply(q, d), axis=1, keepdims=True)
    return tf.truediv(inner, tf.multiply(q_len, d_len))

In [4]:
# Print the installed TensorFlow version.
print(tf.__version__)


class LSTMModel(tf.keras.Model):
    """Subclassed query/doc matcher with a shared embedding and two LSTMs.

    ``call`` returns a dict with the hand-written cosine similarity under
    ``'cos'`` and the sigmoid head (fed by a normalised ``Dot``) under
    ``'out'``.

    Args:
        params: Accepted but not read by this class.
    """

    def __init__(self, params=None):
        super().__init__(name='lstm_model')
        layers = tf.keras.layers

        # One embedding table shared by query and doc.
        self.embedding = layers.Embedding(100, 128)

        # NOTE(review): these two layers are created but never applied in
        # call() -- confirm whether they were meant to sit between the LSTM
        # and the final projection.
        self.query_dense = layers.Dense(1024)
        self.doc_dense = layers.Dense(1024)

        # Separate recurrent encoders for each side.
        self.query_lstm = layers.LSTM(32)
        self.doc_lstm = layers.LSTM(32)

        self.query_dense_2 = layers.Dense(256, name='query_vec')
        self.doc_dense_2 = layers.Dense(256, name='doc_vec')

        self.cosine = layers.Lambda(cosine_similarity, name='similarity')
        self.dot = layers.Dot(axes=1, normalize=True, name='dot')
        self.out = layers.Dense(1, activation='sigmoid', name='out')

    def call(self, inputs, training=True, mask=None):
        """Encode ``(query, doc)`` and return ``{'cos': ..., 'out': ...}``."""
        query, doc = inputs

        query_emb = self.embedding(query)
        doc_emb = self.embedding(doc)
        print('query embedding shape: ', query_emb.shape)
        print('doc embedding shape: ', doc_emb.shape)

        query_state = self.query_lstm(query_emb)
        doc_state = self.doc_lstm(doc_emb)
        print('query lstm shape: ', query_state.shape)
        print('doc lstm shape: ', doc_state.shape)

        vec_pair = [self.query_dense_2(query_state), self.doc_dense_2(doc_state)]

        cos = self.cosine(vec_pair)
        out = self.out(self.dot(vec_pair))

        return {'cos': cos, 'out': out}
        
model = LSTMModel()
# Using loss = {'out': 'binary_crossentropy'} raises an error saying 'out'
# cannot be found, so the auto-assigned output name 'output_2' is used.
model.compile(optimizer='sgd', loss={'output_2': 'binary_crossentropy'})


# Toy data: two query rows, two doc rows and one label per pair.
q = [[1, 2, 3, 4], [2, 3, 4, 1]]
d = [[2, 1, 3, 4], [3, 4, 1, 2]]
l = [[1], [0]]

qd = tf.data.Dataset.from_tensor_slices(q)
dd = tf.data.Dataset.from_tensor_slices(d)
ld = tf.data.Dataset.from_tensor_slices(l)

# From here on `d` is the zipped dataset of (query, doc, label) triples.
d = tf.data.Dataset.zip((qd, dd, ld))
for v in d:
    print('v:', v, '\n')
print('===============================')

# Shuffle, regroup into ((query, doc), label), repeat four times and batch
# two examples at a time.
d = (
    d.shuffle(100)
    .map(lambda query, doc, label: ((query, doc), label))
    .repeat(4)
    .batch(2)
)

# for v in iter(d):
#     print(v)
#     print('---------------------------')
    
# Train on the toy dataset.
h = model.fit(d)

# Predict on the same dataset; the first returned array is the cosine
# similarity and the second is the sigmoid output.
pred_d = d
outputs = model.predict(pred_d)
cos = outputs[0]
prob = outputs[1]
print('cos: \n', cos)
print('prob: \n', prob)

2.0.0-alpha0
v: (<tf.Tensor: id=12, shape=(4,), dtype=int32, numpy=array([1, 2, 3, 4], dtype=int32)>, <tf.Tensor: id=13, shape=(4,), dtype=int32, numpy=array([2, 1, 3, 4], dtype=int32)>, <tf.Tensor: id=14, shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>) 

v: (<tf.Tensor: id=18, shape=(4,), dtype=int32, numpy=array([2, 3, 4, 1], dtype=int32)>, <tf.Tensor: id=19, shape=(4,), dtype=int32, numpy=array([3, 4, 1, 2], dtype=int32)>, <tf.Tensor: id=20, shape=(1,), dtype=int32, numpy=array([0], dtype=int32)>) 

query embedding shape:  (None, 4, 128)
doc embedding shape:  (None, 4, 128)


W0423 20:00:34.438781 4481783232 training_utils.py:1152] Output output_1 missing from loss dictionary. We assume this was done on purpose. The fit and evaluate APIs will not be expecting any data to be passed to output_1.


query lstm shape:  (None, 32)
doc lstm shape:  (None, 32)
cos: 
 [[-0.01785314]
 [ 0.0249357 ]
 [ 0.0249357 ]
 [-0.01785314]
 [-0.01785314]
 [ 0.0249357 ]
 [ 0.0249357 ]
 [-0.01785314]]
prob: 
 [[0.4987174]
 [0.5017917]
 [0.5017917]
 [0.4987174]
 [0.4987174]
 [0.5017917]
 [0.5017917]
 [0.4987174]]
