In [1]:
import os
import shutil
import json
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
# from official.nlp import optimization  # to create AdamW optimizer

RANDOM_SEED=68
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [2]:
# all the caches are default to /tmp/tfhub_modules
#preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_zh_preprocess/3")  #https://tfhub.dev/tensorflow/bert_zh_preprocess/3
preprocessor = hub.load("https://tfhub.dev/tensorflow/bert_zh_preprocess/3")
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/4",trainable=True)  #https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/4

2022-03-16 10:41:03.471583: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-16 10:41:04.504758: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6642 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080, pci bus id: 0000:17:00.0, compute capability: 7.5
2022-03-16 10:41:04.505485: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 6659 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080, pci bus id: 0000:b3:00.0, compute capability: 7.5


# 数据预处理
### 数据:CLUE Fine-Grain NER   https://www.cluebenchmarks.com/introduce.html

地址（address），ADD
书名（book），BOOK
公司（company），COM
游戏（game），GA
政府（goverment），GOV
电影（movie），MOV
姓名（name），NAME
组织机构（organization），ORG
职位（position），POS
景点（scene）SCENE

使用B I 标注方式

In [3]:
#{"text": "浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，", "label": {"name": {"叶老桂": [[9, 11]]}, "company": {"浙商银行": [[0, 3]]}}}
sentences=[]
labels=[]
label_set=set()
with open('train.json') as f:
    for l in f.readlines():
        data = json.loads(l)
        text = data['text']
        label = ['O' for _ in text]
        for label_name,label_dict in data['label'].items():
            label_set.add(label_name)
            for _,pos_list in label_dict.items():
                for s,e in pos_list:
                    is_first = True
                    for i in range(s,e+1):
                        label[i] = label_name+'-'+ ('B' if is_first else "I")
                        is_first=False
        
        sentences.append(text)
        labels.append(label)

In [4]:
print('句子平均长度',sum([len(label) for label in  labels])/len(labels))
print('句子最大长度',max([len(label) for label in  labels]))

句子平均长度 37.38034983252698
句子最大长度 50


In [5]:
#这里我们选取55作为模型最大长度
MAX_LEN=55

In [6]:
id_label_dict={}
i=0
for label_name in label_set:
    id_label_dict[i]=label_name+'-B'
    i+=1
    id_label_dict[i]=label_name+'-I'
    i+=1

id_label_dict[i]='O'
label_id_dict = {v:k for k,v in id_label_dict.items()}
id_label_dict

{0: 'book-B',
 1: 'book-I',
 2: 'organization-B',
 3: 'organization-I',
 4: 'government-B',
 5: 'government-I',
 6: 'position-B',
 7: 'position-I',
 8: 'game-B',
 9: 'game-I',
 10: 'company-B',
 11: 'company-I',
 12: 'address-B',
 13: 'address-I',
 14: 'name-B',
 15: 'name-I',
 16: 'movie-B',
 17: 'movie-I',
 18: 'scene-B',
 19: 'scene-I',
 20: 'O'}

In [7]:
def convert_labels_to_tensor(labels,max_len=MAX_LEN,label_id_dict=label_id_dict):
    for label in labels:
        pad = max_len-len(label)-1  # should add O at the start of sentence
        label.insert(0,'O')
        label.extend('O'*pad)
        label = label[:MAX_LEN]
        label[-1]='O'
    labels_id = tf.constant([[label_id_dict[l] for l in label] for label in labels])
    onehot_labels_id = tf.one_hot(labels_id,depth=21,axis=2)
    return onehot_labels_id

In [8]:
onehot_label_tensor = convert_labels_to_tensor(labels)

In [9]:
#{"text": "浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，", "label": {"name": {"叶老桂": [[9, 11]]}, "company": {"浙商银行": [[0, 3]]}}}
dev_sentences=[]
dev_labels=[]
with open('dev.json') as f:
    for l in f.readlines():
        data = json.loads(l)
        text = data['text']
        label = ['O' for _ in text]
        for label_name,label_dict in data['label'].items():
            for _,pos_list in label_dict.items():
                for s,e in pos_list:
                    is_first = True
                    for i in range(s,e+1):
                        label[i] = label_name+'-'+ ('B' if is_first else "I")
                        is_first=False
        
        dev_sentences.append(text)
        dev_labels.append(label)
dev_onehot_label_tensor = convert_labels_to_tensor(dev_labels)

In [10]:
l1 = hub.KerasLayer(preprocessor.tokenize)
l2 = hub.KerasLayer(preprocessor.bert_pack_inputs,arguments=dict(seq_length=55))

In [11]:
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    encoder_inputs = l1(text_input)
    encoder_inputs = l2([encoder_inputs])
    outputs = encoder(encoder_inputs)
    net = outputs['sequence_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    net = tf.keras.layers.Dense(21, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [12]:
model = build_classifier_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     (None, None, None)   0           ['text[0][0]']                   
                                                                                                  
 keras_layer_2 (KerasLayer)     {'input_type_ids':   0           ['keras_layer_1[0][0]']          
                                (None, 55),                                                       
                                 'input_word_ids':                                                
                                (None, 55),                                                   

In [13]:
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metrics = tf.keras.metrics.CategoricalAccuracy()
optimizer = tf.keras.optimizers.Adam(2e-5)
model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [14]:
sentences_tensor = tf.convert_to_tensor(sentences)
dev_sentences_tensor = tf.convert_to_tensor(dev_sentences)

In [16]:
model.fit(sentences_tensor,onehot_label_tensor,
          validation_data=(dev_sentences_tensor,dev_onehot_label_tensor),epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8cc5da32e0>

In [17]:
# 查看一下第i条数据的结果
i=1
output = model.predict([sentences[i]])[0]
print('句子',sentences[i])
print('label',labels[i])
','.join([ id_label_dict[i] for i in  tf.argmax(output,axis=1).numpy()])

句子 生生不息CSOL生化狂潮让你填弹狂扫
label ['O', 'O', 'O', 'O', 'O', 'game-B', 'game-I', 'game-I', 'game-I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


'O,O,O,O,O,game-B,game-I,game-I,game-I,game-I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O'