# 基于Bert训练一个文本相似度模型

bert+maxpooling +cosine 计算相似度，只使用正例进行训练（约占训练集1/3 1w条数据）

In [1]:
from transformers import AutoTokenizer,TFAutoModel,AutoConfig,TFAutoModelForSequenceClassification
import tensorflow as tf
from tensorflow import keras 
import json
from collections import defaultdict
import numpy as np
gpus = tf.config.list_physical_devices('GPU')
if len(gpus)>1:
    tf.config.set_visible_devices(gpus[1], 'GPU')
    print('use gpu1')

use gpu1


In [2]:
# config = AutoConfig.from_pretrained('bert-base-chinese')
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = TFAutoModel.from_pretrained('bert-base-chinese')

2022-03-14 17:55:12.277740: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-14 17:55:12.836827: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6390 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080, pci bus id: 0000:b3:00.0, compute capability: 7.5
Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if y

## 构造数据集

### 这里使用 AFQMC 蚂蚁金融语义相似度 Ant Financial Question Matching Corpus， 但只使用其中的正例模拟常见应用场景

In [3]:
sentence_pairs=[]
with open('train.json') as f:
    for l in f.readlines():
        data = json.loads(l)
        if data['label'] == '1':
            sentence_pairs.append((data['sentence1'],data['sentence2']))

In [4]:
print('训练数据对数：',len(sentence_pairs))
print('训练数据平均长度：', sum([len(s1)+len(s2) for s1,s2 in sentence_pairs])/(len(sentence_pairs)*2))
print('训练数据最大长度：', max([max(len(s1),len(s2)) for s1,s2 in sentence_pairs]))

训练数据对数： 10573
训练数据平均长度： 13.40915539581954
训练数据最大长度： 90


In [5]:
MAX_LEN = 50

In [6]:
s0_encoded = tokenizer([p[0] for p in sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")
s1_encoded = tokenizer([p[1] for p in sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")

In [7]:
# 读取dev数据
dev_sentence_pairs=[]
dev_labels=[]
with open('dev.json') as f:
    for l in f.readlines():
        data = json.loads(l)
        dev_sentence_pairs.append((data['sentence1'],data['sentence2']))
        dev_labels.append(True if data['label']=='1' else False)
dev_s0_encoded = tokenizer([p[0] for p in dev_sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")
dev_s1_encoded = tokenizer([p[1] for p in dev_sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")
dev_s1_input_ids = dev_s0_encoded['input_ids']
dev_s1_token_type_ids = dev_s0_encoded['token_type_ids']
dev_s1_attention_mask = dev_s0_encoded['attention_mask']
dev_s2_input_ids = dev_s1_encoded['input_ids']
dev_s2_token_type_ids = dev_s1_encoded['token_type_ids']
dev_s2_attention_mask = dev_s1_encoded['attention_mask']
dev_labels = tf.constant(dev_labels)

## 构建模型

In [8]:
# model.trainable = False

@tf.function
def max_pooling_with_mask(paras):
    sent_word_embeddings,sent_indices = paras
    not_padding = tf.math.not_equal(sent_indices,0)
    not_cls = tf.math.not_equal(sent_indices,101) # cls
    not_seg = tf.math.not_equal(sent_indices,102) # sep
    mask = tf.math.logical_and(not_padding,not_cls)
    mask = tf.math.logical_and(mask,not_seg)
    mask_f = tf.cast(mask,tf.float32)
    mask_f = tf.expand_dims(mask_f,axis=-1)
    return tf.reduce_mean(tf.multiply(sent_word_embeddings,mask_f),axis=1)

def convert_bert_to_sentence_embedding_model(bert_model,sentence_embedding_size=256):
    input_ids = keras.Input(shape=(None,),dtype=tf.int32)
    token_type_ids = keras.Input(shape=(None,),dtype=tf.int32)
    attention_mask = keras.Input(shape=(None,),dtype=tf.int32)
    output = bert_model(input_ids,token_type_ids,attention_mask)
    bert_embeddings = output.last_hidden_state
    sentence_embeddings = keras.layers.Lambda(max_pooling_with_mask,name='lambda_max_pooling')([bert_embeddings,input_ids]) # 这里使用max_pooling作为句子的embedding
    # sentence_embeddings = keras.layers.Dense(sentence_embedding_size,name='dense_layer',activation='relu')(sentence_embeddings)
    normalized_sentence_embeddings = keras.layers.Lambda(lambda xt: tf.nn.l2_normalize(xt,axis=1))(sentence_embeddings)

    return keras.Model([input_ids,token_type_ids,attention_mask],normalized_sentence_embeddings,name='sentence_embedding_model')

In [9]:
sentence_embedding_model = convert_bert_to_sentence_embedding_model(model)

In [10]:
def create_siamese_model(sentence_embedding_model):
    s1_input_ids = keras.Input(shape=(None,),dtype=tf.int32)
    s1_token_type_ids = keras.Input(shape=(None,),dtype=tf.int32)
    s1_attention_mask = keras.Input(shape=(None,),dtype=tf.int32)
    s2_input_ids = keras.Input(shape=(None,),dtype=tf.int32)
    s2_token_type_ids = keras.Input(shape=(None,),dtype=tf.int32)
    s2_attention_mask = keras.Input(shape=(None,),dtype=tf.int32)

    s1_embeddings = sentence_embedding_model([s1_input_ids,s1_token_type_ids,s1_attention_mask])
    s2_embeddings = sentence_embedding_model([s2_input_ids,s2_token_type_ids,s2_attention_mask]) 
    stack_embeddings = keras.layers.Lambda(lambda x: tf.stack(x,axis=1))([s1_embeddings,s2_embeddings])
    return keras.Model([s1_input_ids,s1_token_type_ids,s1_attention_mask,s2_input_ids,s2_token_type_ids,s2_attention_mask],stack_embeddings)

In [11]:
siamese_model = create_siamese_model(sentence_embedding_model)

In [12]:
siamese_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_5 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_7 (InputLayer)           [(None, None)]       0           []                               
                                                                                              

In [13]:
@tf.function
def negative_ranking_loss_func(_, embeddings):
    query_embeddings,question_embeddings = embeddings[:,0],embeddings[:,1]
    # tf.print('min in embedding',tf.reduce_min(query_embeddings))
    score_matrix = tf.linalg.matmul(query_embeddings,tf.transpose(question_embeddings))
    # tf.print('min in loss',tf.reduce_min(score_matrix))
    labels = tf.one_hot(tf.range(len(embeddings)), len(embeddings))
    loss = tf.nn.softmax_cross_entropy_with_logits(labels,score_matrix*10)
    return loss

adam_opt = keras.optimizers.Adam(learning_rate=2e-5)
siamese_model.compile(loss=negative_ranking_loss_func, optimizer=adam_opt)

## 模型训练

In [14]:
s1_input_ids = s0_encoded['input_ids']
s1_token_type_ids = s0_encoded['token_type_ids']
s1_attention_mask = s0_encoded['attention_mask']
s2_input_ids = s1_encoded['input_ids']
s2_token_type_ids = s1_encoded['token_type_ids']
s2_attention_mask = s1_encoded['attention_mask']
y_placeholder = tf.ones(len(s1_input_ids),dtype=tf.bool)

In [15]:
def check_accuracy_manually(model,data_group):
    dev_s1_input_ids,dev_s1_token_type_ids,dev_s1_attention_mask,dev_s2_input_ids,dev_s2_token_type_ids,dev_s2_attention_mask,dev_labels = data_group
        
    embeddings = np.asarray(model.predict([dev_s1_input_ids,dev_s1_token_type_ids,dev_s1_attention_mask,dev_s2_input_ids,dev_s2_token_type_ids,dev_s2_attention_mask]))
    queries_embeddings = embeddings[:,0]
    questions_embeddings = embeddings[:,1]
    scores = tf.linalg.matmul(queries_embeddings,tf.transpose(questions_embeddings))
    scores = tf.linalg.diag_part(scores)
    predict_labels = tf.math.greater(scores,0.5)
    # tf.print('min',tf.reduce_min(scores))
    return tf.math.reduce_mean(tf.cast(tf.math.equal(predict_labels,dev_labels),tf.float32))
    

class CalcAccuracyCallback(keras.callbacks.Callback): 
    def __init__(self,val_data):
        super(keras.callbacks.Callback, self).__init__()
        self.val_data = val_data 
    def on_epoch_end(self, epoch, logs=None):
        accuracy = check_accuracy_manually(self.model,self.val_data)
        tf.print("accuracy on dev set is ",accuracy)

In [16]:
siamese_model.fit([s1_input_ids,s1_token_type_ids,s1_attention_mask,s2_input_ids,s2_token_type_ids,s2_attention_mask],y_placeholder,
                  # validation_data = ([dev_s1_input_ids,dev_s1_token_type_ids,dev_s1_attention_mask,dev_s2_input_ids,dev_s2_token_type_ids,dev_s2_attention_mask],dev_labels),
                  callbacks=[CalcAccuracyCallback((s1_input_ids[:1000],s1_token_type_ids[:1000],s1_attention_mask[:1000],s2_input_ids[:1000],s2_token_type_ids[:1000],s2_attention_mask[:1000],y_placeholder[:1000])),
                            CalcAccuracyCallback((dev_s1_input_ids,dev_s1_token_type_ids,dev_s1_attention_mask,dev_s2_input_ids,dev_s2_token_type_ids,dev_s2_attention_mask,dev_labels))],
                  epochs=10,batch_size=32)

Epoch 1/10
accuracy on dev set is  0.993
min -0.211176127
accuracy on dev set is  0.483549595
Epoch 2/10
accuracy on dev set is  0.995
min -0.207627222
accuracy on dev set is  0.512279868
Epoch 3/10
  2/331 [..............................] - ETA: 1:58 - loss: 0.2177

KeyboardInterrupt: 

In [None]:
# siamese_model.fit([s1_input_ids,s1_token_type_ids,s1_attention_mask,s2_input_ids,s2_token_type_ids,s2_attention_mask],y_placeholder,
#                   validation_data = ([dev_s1_input_ids,dev_s1_token_type_ids,dev_s1_attention_mask,dev_s2_input_ids,dev_s2_token_type_ids,dev_s2_attention_mask],dev_labels),
#                   callbacks=[CalcAccuracyCallback((dev_s1_input_ids,dev_s1_token_type_ids,dev_s1_attention_mask,dev_s2_input_ids,dev_s2_token_type_ids,dev_s2_attention_mask,dev_labels))],
#                   epochs=100,batch_size=16)

In [None]:
sentence1 = "今天下午可能会下雨"
sentence2= '今天天气很晴朗'
sentence3= '天气预报说下午有雨'
sentence4= '北京是中国的首都'

sentence_list = [sentence1,sentence2,sentence3,sentence4]
encoded_inputs = tokenizer(sentence_list,return_tensors='tf',padding=True)
sentence_embeddins = sentence_embedding_model(encoded_inputs)
for i,s1 in enumerate(sentence_list):
    for j,s2 in enumerate(sentence_list):
        print(f'[{s1}]和[{s2}]的相似度为：',np.matmul(sentence_embeddins[i], tf.transpose(sentence_embeddins[j])))
 

In [27]:
correct_count = 0
for (s1,s2),label in zip(dev_sentence_pairs,dev_labels):
    s1_embedding = sentence_embedding_model(tokenizer(s1,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN),training=False)[0]
    s2_embedding = sentence_embedding_model(tokenizer(s2,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN),training=False)[0]
    print(f'[{label}][{s1}]和[{s2}]的相似度为：',np.matmul(s1_embedding, s2_embedding))
    if (np.matmul(s1_embedding, s2_embedding)>0.5) ==label :
        correct_count+=1
        print('!!!')
print(correct_count)        

[False][双十一花呗提额在哪]和[里可以提花呗额度]的相似度为： 0.99999654
[False][花呗支持高铁票支付吗]和[为什么友付宝不支持花呗付款]的相似度为： 0.9295593
[True][我的蚂蚁花呗支付金额怎么会有限制]和[我到支付宝实体店消费用花呗支付受金额限制]的相似度为： 0.94713557
!!!
[False][为什么有花呗额度不能分期付款]和[花呗分期额度不足]的相似度为： 0.9656383
[False][赠品不能设置用花呗付款]和[怎么不能花呗分期付款]的相似度为： 0.7830473
[True][为什么这个订单不可以花呗支付]和[为什么支付时没有出现用花呗支付]的相似度为： 0.8588219
!!!
[False][使用了花呗没有记录]和[银行扣了，还款，花呗]的相似度为： 0.9899275
[False][一个身份证能办几个花呗]和[花呗身份验证为何不亮]的相似度为： 0.9406398
[False][花呗咋扫不上]和[花呗扫一扫不能付钱]的相似度为： 0.78438926
[False][我的花呗额度为什么还没有增长]和[为什么花呗额度才有***块]的相似度为： 0.95101166
[True][花呗收款额度限制]和[收钱码，对花呗支付的金额有限制吗]的相似度为： 0.9164518
!!!
[False][为什么我的借呗跟花呗不见了]和[我现在登陆后以前的花呗，借呗，余额宝里的钱都不见了怎么办]的相似度为： 0.9557075
[True][我的账号是新号，为什么花呗是以前的号码]和[花呗号吗是以前的号码]的相似度为： 0.7657628
!!!
[False][花呗提前分期利息怎么算]和[花呗分期提前结清会有手续费吗]的相似度为： 0.69271183
[False][记错借呗的还款日期 这几天没去注意 怎么办]和[支付借呗怎么修改还款日期]的相似度为： 0.83016014
[False][***月份的花呗为什么出现在***月份的账单里]和[花呗的往期账单怎么查]的相似度为： 0.8865557
[False][为什么借呗开通了又自动关闭了]和[还了钱关闭借呗]的相似度为： 0.9068506
[False][为什么借呗会收回去]和[为什么我这次借呗利益这么高]的相似度为： 0.9116498
[F

In [17]:
s1_encoded = tokenizer([p[0] for p in dev_sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")
s1_embeddings = sentence_embedding_model.predict([s1_encoded['input_ids'],s1_encoded['token_type_ids'],s1_encoded['attention_mask']])
s2_embeddings = sentence_embedding_model.predict(tokenizer([p[1] for p in dev_sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf").values())
dev_score = tf.linalg.diag_part(tf.linalg.matmul(s1_embeddings,tf.transpose(s2_embeddings)))

In [23]:
tf.reduce_sum(tf.cast(tf.equal(dev_score>0.5,dev_labels),dtype=tf.int16))

<tf.Tensor: shape=(), dtype=int16, numpy=2217>

<tf.Tensor: shape=(4316,), dtype=bool, numpy=array([False, False,  True, ..., False, False,  True])>

In [None]:
np.matmul(s1_embeddings[1], s2_embeddings[1])

In [None]:
s1,s2 = dev_sentence_pairs[1]

In [None]:
s1


In [None]:
s1_embedding = sentence_embedding_model.predict(tokenizer([s1,'万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃'],return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN).values())[0]
s2_embedding = sentence_embedding_model.predict(tokenizer([s2,'万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃'],return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN).values())[0]
np.matmul(s1_embedding,s2_embedding)

In [None]:
s1_embedding = sentence_embedding_model.predict(tokenizer(s1,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN).values())[0]
s2_embedding = sentence_embedding_model.predict(tokenizer(s2,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN).values())[0]
np.matmul(s1_embedding,s2_embedding)

In [None]:
s1_embedding = sentence_embedding_model(tokenizer([s1,'万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃'],return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN))[0]
s2_embedding = sentence_embedding_model(tokenizer([s2,'万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃'],return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN))[0]
np.matmul(s1_embedding,s2_embedding)

In [None]:
s1_embedding = sentence_embedding_model(tokenizer(s1,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN),training=False)[0]
s2_embedding = sentence_embedding_model(tokenizer(s2,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN),training=False)[0]
np.matmul(s1_embedding,s2_embedding)

In [None]:
s0_embedding = model(tokenizer(s1,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN),training=False)[0]
s1_embedding = model.predict(tokenizer(s1,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN).values())[0]
s1_embedding - s0_embedding.numpy()

In [None]:
tokenizer([p[0] for p in dev_sentence_pairs[:5]],padding=True,truncation=True,max_length=50)

In [None]:
s1_short = model(tokenizer(s1,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN))[0]

In [None]:
s1_long = model(tokenizer([s1,'大家好我们都是中国人中国人就是厉害每天工作20个小时哈哈哈'],return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN))[0]

In [None]:
tokenizer(s1,return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN)

In [None]:
e1 = sentence_embedding_model(tokenizer([s1,'万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃'],return_tensors='tf',padding=True,truncation=True,max_length=MAX_LEN))[0]
s1e = tokenizer([s1,'万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃万求将集唉就产就吃'], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")
e2 = sentence_embedding_model.predict([s1e['input_ids'],s1e['token_type_ids'],s1e['attention_mask']])

In [None]:
e2[0]