# 基于Bert训练一个文本相似度模型
1. 使用分类方法训练句子嵌入,使用所有正负例数据
2. 句子嵌入采用meanpooling +normalize
3. 分类用 cosing+sigmoid,embedding结果更具有解释性，计算相似度时可以调整句子顺序

In [1]:
from transformers import AutoTokenizer,TFAutoModel,AutoConfig,TFAutoModelForSequenceClassification
import tensorflow as tf
from tensorflow import keras 
import json
from collections import defaultdict
import numpy as np
gpus = tf.config.list_physical_devices('GPU')
if len(gpus)>1:
    tf.config.set_visible_devices(gpus[1], 'GPU')
    print('use gpu1')
    
RANDOM_SEED=68
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

use gpu1


In [2]:
# config = AutoConfig.from_pretrained('bert-base-chinese')
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = TFAutoModel.from_pretrained('bert-base-chinese')

2022-03-16 17:12:09.462326: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-16 17:12:13.566405: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 4098 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080, pci bus id: 0000:b3:00.0, compute capability: 7.5
Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if y

## 构造数据集

### 这里使用 AFQMC 蚂蚁金融语义相似度 Ant Financial Question Matching Corpus

In [3]:
sentence_pairs=[]
labels=[]
with open('train.json') as f:
    for l in f.readlines():
        data = json.loads(l)
        labels.append(True if data['label']=='1' else False)
        sentence_pairs.append((data['sentence1'],data['sentence2']))

In [4]:
print('训练数据对数：',len(sentence_pairs))
print('训练数据平均长度：', sum([len(s1)+len(s2) for s1,s2 in sentence_pairs])/(len(sentence_pairs)*2))
print('训练数据最大长度：', max([max(len(s1),len(s2)) for s1,s2 in sentence_pairs]))

训练数据对数： 34334
训练数据平均长度： 13.366298712646357
训练数据最大长度： 112


In [5]:
MAX_LEN = 50

In [6]:
s0_encoded = tokenizer([p[0] for p in sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")
s1_encoded = tokenizer([p[1] for p in sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")
labels = tf.reshape(tf.constant(labels),shape=(-1,1))

In [7]:
# 读取dev数据
dev_sentence_pairs=[]
dev_labels=[]
with open('dev.json') as f:
    for l in f.readlines():
        data = json.loads(l)
        dev_sentence_pairs.append((data['sentence1'],data['sentence2']))
        dev_labels.append(True if data['label']=='1' else False)
dev_s0_encoded = tokenizer([p[0] for p in dev_sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")
dev_s1_encoded = tokenizer([p[1] for p in dev_sentence_pairs], padding=True, truncation=True, max_length=MAX_LEN,return_tensors="tf")
dev_s1_input_ids = dev_s0_encoded['input_ids']
dev_s1_token_type_ids = dev_s0_encoded['token_type_ids']
dev_s1_attention_mask = dev_s0_encoded['attention_mask']
dev_s2_input_ids = dev_s1_encoded['input_ids']
dev_s2_token_type_ids = dev_s1_encoded['token_type_ids']
dev_s2_attention_mask = dev_s1_encoded['attention_mask']
dev_labels = tf.reshape(tf.constant(dev_labels),shape=(-1,1))

## 构建模型

In [8]:
@tf.function
def mean_pooling_with_mask(paras):
    sent_word_embeddings,sent_indices = paras
    not_padding = tf.math.not_equal(sent_indices,0)
    not_cls = tf.math.not_equal(sent_indices,101) # cls
    not_seg = tf.math.not_equal(sent_indices,102) # sep
    mask = tf.math.logical_and(not_padding,not_cls)
    mask = tf.math.logical_and(mask,not_seg)
    mask_f = tf.cast(mask,tf.float32)
    mask_f = tf.expand_dims(mask_f,axis=-1)
    return tf.reduce_mean(tf.multiply(sent_word_embeddings,mask_f),axis=1)


def convert_bert_to_sentence_embedding_model(bert_model):
    input_ids = keras.Input(shape=(None,),dtype=tf.int32)
    token_type_ids = keras.Input(shape=(None,),dtype=tf.int32)
    attention_mask = keras.Input(shape=(None,),dtype=tf.int32)
    output = bert_model(input_ids,attention_mask,token_type_ids) # 输入顺序需要和TFBERTModel方法的参数顺序对应
    bert_embeddings = output.last_hidden_state
    sentence_embeddings = keras.layers.Lambda(mean_pooling_with_mask,name='lambda_mean_pooling')([bert_embeddings,input_ids]) # 这里使用max_pooling作为句子的embedding
    normalized_sentence_embeddings = keras.layers.Lambda(lambda xt: tf.nn.l2_normalize(xt,axis=1))(sentence_embeddings)

    return keras.Model([input_ids,token_type_ids,attention_mask],sentence_embeddings,name='sentence_embedding_model')

In [9]:
sentence_embedding_model = convert_bert_to_sentence_embedding_model(model)

In [10]:
# 这里用dense层，也可以考虑计算cos距离
# @tf.function
def create_siamese_model(sentence_embedding_model):
    s1_input_ids = keras.Input(shape=(None,),dtype=tf.int32)
    s1_token_type_ids = keras.Input(shape=(None,),dtype=tf.int32)
    s1_attention_mask = keras.Input(shape=(None,),dtype=tf.int32)
    s2_input_ids = keras.Input(shape=(None,),dtype=tf.int32)
    s2_token_type_ids = keras.Input(shape=(None,),dtype=tf.int32)
    s2_attention_mask = keras.Input(shape=(None,),dtype=tf.int32)

    s1_embeddings = sentence_embedding_model([s1_input_ids,s1_token_type_ids,s1_attention_mask])
    s2_embeddings = sentence_embedding_model([s2_input_ids,s2_token_type_ids,s2_attention_mask]) 

    scores = tf.linalg.diag_part(tf.linalg.matmul(s1_embeddings,tf.transpose(s2_embeddings))) * 10
    scores = tf.reshape(scores,(-1,1))
    probs = keras.layers.Dense(1,activation='sigmoid')(scores)   
    return keras.Model([s1_input_ids,s1_token_type_ids,s1_attention_mask,s2_input_ids,s2_token_type_ids,s2_attention_mask],probs)

In [11]:
siamese_model = create_siamese_model(sentence_embedding_model)

In [12]:
adam_opt = keras.optimizers.Adam(learning_rate=2e-5)
bce = keras.losses.BinaryCrossentropy()
siamese_model.compile(loss=bce, optimizer=adam_opt,metrics=['accuracy'])

## 模型训练

In [13]:
s1_input_ids = s0_encoded['input_ids']
s1_token_type_ids = s0_encoded['token_type_ids']
s1_attention_mask = s0_encoded['attention_mask']
s2_input_ids = s1_encoded['input_ids']
s2_token_type_ids = s1_encoded['token_type_ids']
s2_attention_mask = s1_encoded['attention_mask']
# y_placeholder = tf.ones(len(s1_input_ids),dtype=tf.bool)

In [14]:
siamese_model.fit([s1_input_ids,s1_token_type_ids,s1_attention_mask,s2_input_ids,s2_token_type_ids,s2_attention_mask],labels,
                  validation_data = ([dev_s1_input_ids,dev_s1_token_type_ids,dev_s1_attention_mask,dev_s2_input_ids,dev_s2_token_type_ids,dev_s2_attention_mask],dev_labels),
                  # callbacks=[CalcAccuracyCallback((s1_input_ids[:1000],s1_token_type_ids[:1000],s1_attention_mask[:1000],s2_input_ids[:1000],s2_token_type_ids[:1000],s2_attention_mask[:1000],labels[:1000])),
                            # CalcAccuracyCallback((dev_s1_input_ids,dev_s1_token_type_ids,dev_s1_attention_mask,dev_s2_input_ids,dev_s2_token_type_ids,dev_s2_attention_mask,dev_labels))],
                  epochs=5,batch_size=16)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f72e4a68fa0>

In [15]:
sentence1 = "今天下午可能会下雨"
sentence2= '今天天气很晴朗'
sentence3= '天气预报说下午有雨'
sentence4= '北京是中国的首都'

sentence_list = [sentence1,sentence2,sentence3,sentence4]
encoded_inputs = tokenizer(sentence_list,return_tensors='tf',padding=True)
sentence_embeddins = sentence_embedding_model(encoded_inputs)
for i,s1 in enumerate(sentence_list):
    for j,s2 in enumerate(sentence_list):
        print(f'[{s1}]和[{s2}]的相似度为：',siamese_model.predict((*tokenizer(s1,return_tensors='tf').values(),*tokenizer(s2,return_tensors='tf').values()))[0,0])
 

[今天下午可能会下雨]和[今天下午可能会下雨]的相似度为： 2.0658917e-17
[今天下午可能会下雨]和[今天天气很晴朗]的相似度为： 6.454774e-12
[今天下午可能会下雨]和[天气预报说下午有雨]的相似度为： 2.5447382e-06
[今天下午可能会下雨]和[北京是中国的首都]的相似度为： 5.2464917e-09
[今天天气很晴朗]和[今天下午可能会下雨]的相似度为： 6.454774e-12
[今天天气很晴朗]和[今天天气很晴朗]的相似度为： 2.0045364e-35
[今天天气很晴朗]和[天气预报说下午有雨]的相似度为： 2.4299438e-09
[今天天气很晴朗]和[北京是中国的首都]的相似度为： 1.0232005e-12
[天气预报说下午有雨]和[今天下午可能会下雨]的相似度为： 2.5447382e-06
[天气预报说下午有雨]和[今天天气很晴朗]的相似度为： 2.4299438e-09
[天气预报说下午有雨]和[天气预报说下午有雨]的相似度为： 1.9866324e-12
[天气预报说下午有雨]和[北京是中国的首都]的相似度为： 3.7954134e-08
[北京是中国的首都]和[今天下午可能会下雨]的相似度为： 5.2464917e-09
[北京是中国的首都]和[今天天气很晴朗]的相似度为： 1.0232005e-12
[北京是中国的首都]和[天气预报说下午有雨]的相似度为： 3.7954134e-08
[北京是中国的首都]和[北京是中国的首都]的相似度为： 8.345166e-35


In [16]:
label_pred = siamese_model.predict([dev_s1_input_ids, dev_s1_token_type_ids, dev_s1_attention_mask, dev_s2_input_ids, dev_s2_token_type_ids, dev_s2_attention_mask])


In [17]:
correct = 0
tp=0
tn=0
fp=0
fn=0
for score,label in zip(label_pred,dev_labels):
    pred = True if  score>0.5 else False
    if label == pred:
        correct+=1
        if label :
            tp+=1
        else:
            tn+=1
    else:
        if pred :
            fp+=1
        else:
            fn+=1
print('测试集正确率',correct/len(dev_labels))
print(f'TP:{tp}, TN:{tn}, FP:{fp}, FN:{fn}')

测试集正确率 0.6865152919369787
TP:32, TN:2931, FP:47, FN:1306
