In [1]:
from transformers import BertTokenizer,TFAutoModel
import tensorflow as tf
SEED = 42
tf.random.set_seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


## 本实验使用transformers库来进行
https://github.com/huggingface/transformers

https://huggingface.co/


In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = TFAutoModel.from_pretrained('bert-base-chinese')

Some layers from the model checkpoint at bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [3]:
# 如何使用transformers从本地读取模型
# save model locally and load model from local file system.
# config.save_pretrained('/data/hchan/huggingface-bert-base-chinese')
# tokenizer.save_pretrained('/data/hchan/huggingface-bert-base-chinese')
# model.save_pretrained('/data/hchan/huggingface-bert-base-chinese')
# config = AutoConfig.from_pretrained('/data/hchan/huggingface-bert-base-chinese')
# tokenizer = BertTokenizer.from_pretrained('/data/hchan/huggingface-bert-base-chinese')
# model = TFAutoModel.from_pretrained('/data/hchan/huggingface-bert-base-chinese')

### Bert 模型输入的一些规则
1. 句首加[cls],每个句子句尾加[sep]
2. token_type_ids 用来区分多个句子的情况
3. attention_mask 用来区别句子内容和padding

In [4]:
text = "今天我们来学习BERT的用法"
print('tokenized text:',tokenizer.tokenize(text))
encoded_input = tokenizer(text, return_tensors='tf')
print('encoded_tensor:',tokenizer(text))
print("decoded model input:",tokenizer.decode(tokenizer(text)['input_ids']))

tokenized text: ['今', '天', '我', '们', '来', '学', '习', '[UNK]', '的', '用', '法']
encoded_tensor: {'input_ids': [101, 791, 1921, 2769, 812, 3341, 2110, 739, 100, 4638, 4500, 3791, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
decoded model input: [CLS] 今 天 我 们 来 学 习 [UNK] 的 用 法 [SEP]


In [5]:
output = model(encoded_input)
output.last_hidden_state

<tf.Tensor: shape=(1, 13, 768), dtype=float32, numpy=
array([[[ 0.55845565, -0.2177088 , -1.1836438 , ...,  0.44294152,
          0.4910655 ,  0.37872693],
        [-0.16212857, -0.22858876,  0.22160313, ..., -1.3752859 ,
         -0.19599125, -0.13314222],
        [ 0.59043   , -0.6807966 , -1.5145776 , ..., -0.38338068,
          1.0174727 ,  0.23099777],
        ...,
        [ 0.33431968, -0.06720637,  0.05515486, ...,  0.04679219,
          0.402681  ,  0.09251919],
        [ 0.5097357 , -0.8889353 , -0.5622668 , ..., -0.56882447,
          0.42690647,  0.39236704],
        [-0.43672717, -0.31467965, -1.1357595 , ..., -0.86004853,
          0.45863438, -0.09646266]]], dtype=float32)>

## BERT词嵌入是和上下文有关的，对比三种语境下的中国和美国的差异

In [6]:
text1='中国是个发展中国家'
output1 = model(tokenizer(text1, return_tensors='tf'))
embedding_china = tf.math.reduce_mean(output1.last_hidden_state[0,1:3,:],axis=0)
text2='美国是个发达的国家'
output2 = model(tokenizer(text2, return_tensors='tf'))
embedding_us = tf.math.reduce_mean(output2.last_hidden_state[0,1:3,:],axis=0)
tf.math.reduce_sum(tf.math.square(embedding_china-embedding_us))

<tf.Tensor: shape=(), dtype=float32, numpy=122.12986>

In [7]:
text1='中国是个强大的国家'
output1 = model(tokenizer(text1, return_tensors='tf'))
embedding_china = tf.math.reduce_mean(output1.last_hidden_state[0,1:3,:],axis=0)
text2='美国大学在世界上排名领先'
output2 = model(tokenizer(text2, return_tensors='tf'))
embedding_us = tf.math.reduce_mean(output2.last_hidden_state[0,1:3,:],axis=0)
tf.math.reduce_sum(tf.math.square(embedding_china-embedding_us))

<tf.Tensor: shape=(), dtype=float32, numpy=191.0534>

In [8]:
text1='中国是个强大的国家'
output1 = model(tokenizer(text1, return_tensors='tf'))
embedding_china = tf.math.reduce_mean(output1.last_hidden_state[0,1:3,:],axis=0)
text2='世界上没有完美国家'
output2 = model(tokenizer(text2, return_tensors='tf'))
embedding_us = tf.math.reduce_mean(output2.last_hidden_state[0,7:9,:],axis=0)
tf.math.reduce_sum(tf.math.square(embedding_china-embedding_us))

<tf.Tensor: shape=(), dtype=float32, numpy=272.4551>

# 额外内容，用词嵌入的平均值计算句子相似度

In [9]:
sentence1 = "今天下午可能会下雨"
sentence2= '今天天气很晴朗'
sentence3= '天气预报说下午有雨'
sentence4= '北京是中国的首都'

sentence_list = [sentence1,sentence2,sentence3,sentence4]
encoded_inputs = tokenizer(sentence_list,return_tensors='tf',padding=True)
encoded_inputs

{'input_ids': <tf.Tensor: shape=(4, 11), dtype=int32, numpy=
array([[ 101,  791, 1921,  678, 1286, 1377, 5543,  833,  678, 7433,  102],
       [ 101,  791, 1921, 1921, 3698, 2523, 3252, 3306,  102,    0,    0],
       [ 101, 1921, 3698, 7564, 2845, 6432,  678, 1286, 3300, 7433,  102],
       [ 101, 1266,  776, 3221,  704, 1744, 4638, 7674, 6963,  102,    0]],
      dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(4, 11), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(4, 11), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]], dtype=int32)>}

In [10]:
sentence_embeddins = tf.math.reduce_mean(model(encoded_inputs).last_hidden_state,axis=1)
for i,s1 in enumerate(sentence_list):
    for j,s2 in enumerate(sentence_list):
        print(f'[{s1}]和[{s2}]的L2距离为：',tf.math.reduce_sum(tf.math.square(sentence_embeddins[i]-sentence_embeddins[j])).numpy())
 

[今天下午可能会下雨]和[今天下午可能会下雨]的L2距离为： 0.0
[今天下午可能会下雨]和[今天天气很晴朗]的L2距离为： 97.23857
[今天下午可能会下雨]和[天气预报说下午有雨]的L2距离为： 54.660233
[今天下午可能会下雨]和[北京是中国的首都]的L2距离为： 185.97879
[今天天气很晴朗]和[今天下午可能会下雨]的L2距离为： 97.23857
[今天天气很晴朗]和[今天天气很晴朗]的L2距离为： 0.0
[今天天气很晴朗]和[天气预报说下午有雨]的L2距离为： 91.68937
[今天天气很晴朗]和[北京是中国的首都]的L2距离为： 199.58102
[天气预报说下午有雨]和[今天下午可能会下雨]的L2距离为： 54.660233
[天气预报说下午有雨]和[今天天气很晴朗]的L2距离为： 91.68937
[天气预报说下午有雨]和[天气预报说下午有雨]的L2距离为： 0.0
[天气预报说下午有雨]和[北京是中国的首都]的L2距离为： 184.15384
[北京是中国的首都]和[今天下午可能会下雨]的L2距离为： 185.97879
[北京是中国的首都]和[今天天气很晴朗]的L2距离为： 199.58102
[北京是中国的首都]和[天气预报说下午有雨]的L2距离为： 184.15384
[北京是中国的首都]和[北京是中国的首都]的L2距离为： 0.0
