In [1]:
import os
import shutil
import json
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
import tensorflow_addons as tfa
import datetime

RANDOM_SEED=68
tf.random.set_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [2]:
# all the caches are default to /tmp/tfhub_modules
#preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_zh_preprocess/3")  #https://tfhub.dev/tensorflow/bert_zh_preprocess/3
preprocessor = hub.load("https://tfhub.dev/tensorflow/bert_zh_preprocess/3")
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/4",trainable=True)  #https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/4

2022-03-18 10:52:20.446663: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-18 10:52:21.465855: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6642 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080, pci bus id: 0000:17:00.0, compute capability: 7.5
2022-03-18 10:52:21.466577: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 6659 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080, pci bus id: 0000:b3:00.0, compute capability: 7.5


# 数据预处理
### 数据:CLUE Fine-Grain NER   https://www.cluebenchmarks.com/introduce.html

地址（address），ADD
书名（book），BOOK
公司（company），COM
游戏（game），GA
政府（goverment），GOV
电影（movie），MOV
姓名（name），NAME
组织机构（organization），ORG
职位（position），POS
景点（scene）SCENE

使用B I 标注方式

In [3]:
#{"text": "浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，", "label": {"name": {"叶老桂": [[9, 11]]}, "company": {"浙商银行": [[0, 3]]}}}
sentences=[]
labels=[]
label_set=set()
with open('train.json') as f:
    for l in f.readlines():
        data = json.loads(l)
        text = data['text']
        label = ['O' for _ in text]
        for label_name,label_dict in data['label'].items():
            label_set.add(label_name)
            for _,pos_list in label_dict.items():
                for s,e in pos_list:
                    is_first = True
                    for i in range(s,e+1):
                        label[i] = label_name+'-'+ ('B' if is_first else "I")
                        is_first=False
        
        sentences.append(text)
        labels.append(label)

In [4]:
print('句子平均长度',sum([len(label) for label in  labels])/len(labels))
print('句子最大长度',max([len(label) for label in  labels]))

句子平均长度 37.38034983252698
句子最大长度 50


In [5]:
#这里我们选取55作为模型最大长度
MAX_LEN=55

In [6]:
id_label_dict={}
i=0
for label_name in label_set:
    id_label_dict[i]=label_name+'-B'
    i+=1
    id_label_dict[i]=label_name+'-I'
    i+=1

id_label_dict[i]='O'
label_id_dict = {v:k for k,v in id_label_dict.items()}
CLASS_NUM=len(id_label_dict)
id_label_dict

{0: 'scene-B',
 1: 'scene-I',
 2: 'company-B',
 3: 'company-I',
 4: 'organization-B',
 5: 'organization-I',
 6: 'government-B',
 7: 'government-I',
 8: 'address-B',
 9: 'address-I',
 10: 'movie-B',
 11: 'movie-I',
 12: 'name-B',
 13: 'name-I',
 14: 'game-B',
 15: 'game-I',
 16: 'book-B',
 17: 'book-I',
 18: 'position-B',
 19: 'position-I',
 20: 'O'}

In [7]:
def convert_labels_to_tensor(labels,max_len=MAX_LEN,label_id_dict=label_id_dict):
    for label in labels:
        pad = max_len-len(label)-1  # should add O at the start of sentence
        label.insert(0,'O')
        label.extend('O'*pad)
        label = label[:MAX_LEN]
        label[-1]='O'
    labels_id = tf.constant([[label_id_dict[l] for l in label] for label in labels])
    # onehot_labels_id = tf.one_hot(labels_id,depth=21,axis=2)
    return labels_id

In [8]:
sentences_tensor = tf.convert_to_tensor(sentences)
label_tensor = convert_labels_to_tensor(labels)

In [9]:
max([len(l) for l in labels])

55

In [10]:
#{"text": "浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，", "label": {"name": {"叶老桂": [[9, 11]]}, "company": {"浙商银行": [[0, 3]]}}}
dev_sentences=[]
dev_labels=[]
with open('dev.json') as f:
    for l in f.readlines():
        data = json.loads(l)
        text = data['text']
        label = ['O' for _ in text]
        for label_name,label_dict in data['label'].items():
            for _,pos_list in label_dict.items():
                for s,e in pos_list:
                    is_first = True
                    for i in range(s,e+1):
                        label[i] = label_name+'-'+ ('B' if is_first else "I")
                        is_first=False
        
        dev_sentences.append(text)
        dev_labels.append(label)
dev_label_tensor = convert_labels_to_tensor(dev_labels)
dev_sentences_tensor = tf.convert_to_tensor(dev_sentences)

In [11]:
l1 = hub.KerasLayer(preprocessor.tokenize)
l2 = hub.KerasLayer(preprocessor.bert_pack_inputs,arguments=dict(seq_length=55))

In [12]:
def build_classifier_model():
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    encoder_inputs = l1(text_input)
    encoder_inputs = l2([encoder_inputs])
    outputs = encoder(encoder_inputs)
    net = outputs['sequence_output']
    net = tf.keras.layers.Dropout(0.1)(net)
    potential_logits = tf.keras.layers.Dense(CLASS_NUM, activation=None, name='classifier')(net)
    decoded_sequence, potentials, sequence_length, chain_kernel = tfa.layers.CRF(units=CLASS_NUM,use_kernel=False)(potential_logits)
    return tf.keras.Model(text_input, [decoded_sequence, potentials, sequence_length, chain_kernel])

In [13]:
model = build_classifier_model()
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer_1 (KerasLayer)     (None, None, None)   0           ['text[0][0]']                   
                                                                                                  
 keras_layer_2 (KerasLayer)     {'input_mask': (Non  0           ['keras_layer_1[0][0]']          
                                e, 55),                                                           
                                 'input_type_ids':                                                
                                (None, 55),                                                   

In [14]:
train_ds = tf.data.Dataset.from_tensor_slices((sentences_tensor,label_tensor))
dev_ds = tf.data.Dataset.from_tensor_slices((dev_sentences_tensor,dev_label_tensor))
batched_train_ds = train_ds.shuffle(1000).batch(16)
batched_dev_ds = dev_ds.batch(32)
optimizer = tf.keras.optimizers.Adam(2e-5)
m = tf.keras.metrics.Accuracy()


In [15]:
@tf.function # take tensor 作为输入的函数可以使用静态图优化
def train_step(x_batch,y_batch):
    with tf.GradientTape() as tape:
        out = model(x_batch,training=True)
        decoded_sequence, potentials, sequence_length, chain_kernel = out
        losses = -tfa.text.crf_log_likelihood(potentials, y_batch, sequence_length, chain_kernel)[0] #似然大致可以理解成概率，对数不改变符号方向，我们希望正确的概率越大越好
        loss = tf.reduce_mean(losses)

    grads = tape.gradient(loss,model.trainable_weights)
    optimizer.apply_gradients(zip(grads,model.trainable_weights))
    m.update_state(decoded_sequence, y_batch)
    return loss

@tf.function
def test_step(x_batch_val,y_batch_val):
    val_decoded_sequence, _, _, _ = model(x_batch_val, training=False)
    m.update_state(y_batch_val, val_decoded_sequence)

In [16]:
epochs = 3
for epoch in range(epochs):
    for step, (x_batch,y_batch) in enumerate(batched_train_ds):
        loss = train_step(x_batch,y_batch)
        if step%200==0:
            print(f'{datetime.datetime.now()} - {epoch}-{step} loss: {loss}')
        
    epoch_acc = m.result()
    m.reset_states()
    print(f'training acc on epoch {epoch} is: {epoch_acc}')

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in batched_dev_ds:
        test_step(x_batch_val, y_batch_val)
        
    val_epoch_acc = m.result()
    print(f'VAL acc on epoch {epoch} is: {val_epoch_acc}')
    m.reset_states()
        

2022-03-18 10:52:46.741876: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: cond/branch_executed/_1360


2022-03-18 10:52:49.928500 - 0-0 loss: 171.75064086914062
2022-03-18 10:53:28.179889 - 0-200 loss: 22.44293212890625
2022-03-18 10:54:06.528936 - 0-400 loss: 18.585840225219727
2022-03-18 10:54:44.928932 - 0-600 loss: 19.53022575378418


2022-03-18 10:55:03.991768: W tensorflow/core/grappler/optimizers/loop_optimizer.cc:907] Skipping loop optimization for Merge node with control input: cond_1/branch_executed/_1384


training acc on epoch 0 is: 0.8942044377326965
VAL acc on epoch 0 is: 0.9260001182556152
2022-03-18 10:55:15.909097 - 1-0 loss: 13.712574005126953
2022-03-18 10:55:54.366822 - 1-200 loss: 13.386499404907227
2022-03-18 10:56:32.795324 - 1-400 loss: 9.843090057373047
2022-03-18 10:57:11.054718 - 1-600 loss: 7.803056716918945
training acc on epoch 1 is: 0.9291978478431702
VAL acc on epoch 1 is: 0.9304677248001099
2022-03-18 10:57:31.733265 - 2-0 loss: 12.836954116821289
2022-03-18 10:58:10.194211 - 2-200 loss: 11.2840576171875
2022-03-18 10:58:48.520130 - 2-400 loss: 8.996906280517578
2022-03-18 10:59:26.990509 - 2-600 loss: 6.702487945556641
training acc on epoch 2 is: 0.943480372428894
VAL acc on epoch 2 is: 0.9321870803833008


In [17]:
# 查看一下第i条数据的结果
i=1
output = model.predict([sentences[i]])[0]
print('句子',sentences[i])
print('label',labels[i])
','.join([ id_label_dict[i] for i in  output[0]])

句子 生生不息CSOL生化狂潮让你填弹狂扫
label ['O', 'O', 'O', 'O', 'O', 'game-B', 'game-I', 'game-I', 'game-I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


'O,O,O,O,O,game-B,game-I,game-I,game-I,game-I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O'

In [18]:
#看一下不做静态图优化的速度
def train_step(x_batch,y_batch):
    with tf.GradientTape() as tape:
        out = model(x_batch,training=True)
        decoded_sequence, potentials, sequence_length, chain_kernel = out
        losses = -tfa.text.crf_log_likelihood(potentials, y_batch, sequence_length, chain_kernel)[0] #似然大致可以理解成概率，对数不改变符号方向，我们希望正确的概率越大越好
        loss = tf.reduce_mean(losses)

    grads = tape.gradient(loss,model.trainable_weights)
    optimizer.apply_gradients(zip(grads,model.trainable_weights))
    m.update_state(decoded_sequence, y_batch)
    return loss

def test_step(x_batch_val,y_batch_val):
    val_decoded_sequence, _, _, _ = model(x_batch_val, training=False)
    m.update_state(y_batch_val, val_decoded_sequence)
epochs = 3
for epoch in range(epochs):
    for step, (x_batch,y_batch) in enumerate(batched_train_ds):
        loss = train_step(x_batch,y_batch)
        if step%200==0:
            print(f'{datetime.datetime.now()} - {epoch}-{step} loss: {loss}')
        
    epoch_acc = m.result()
    m.reset_states()
    print(f'training acc on epoch {epoch} is: {epoch_acc}')

    # Run a validation loop at the end of each epoch.
    for x_batch_val, y_batch_val in batched_dev_ds:
        test_step(x_batch_val, y_batch_val)
        
    val_epoch_acc = m.result()
    print(f'VAL acc on epoch {epoch} is: {val_epoch_acc}')
    m.reset_states()

2022-03-18 11:00:05.704831 - 0-0 loss: 6.388515472412109
2022-03-18 11:01:48.147187 - 0-200 loss: 6.950614929199219
2022-03-18 11:03:31.104976 - 0-400 loss: 3.645936965942383
2022-03-18 11:05:13.852246 - 0-600 loss: 12.634344100952148
training acc on epoch 0 is: 0.9539449214935303
VAL acc on epoch 0 is: 0.9376835823059082
2022-03-18 11:06:10.595539 - 1-0 loss: 8.483667373657227
2022-03-18 11:07:53.323358 - 1-200 loss: 7.234052658081055
2022-03-18 11:09:36.503812 - 1-400 loss: 5.007223129272461
2022-03-18 11:11:19.132952 - 1-600 loss: 6.304544448852539
training acc on epoch 1 is: 0.9609652757644653
VAL acc on epoch 1 is: 0.9372910261154175
2022-03-18 11:12:06.829385 - 2-0 loss: 6.307212829589844
2022-03-18 11:13:49.756716 - 2-200 loss: 4.997684478759766
2022-03-18 11:15:32.301416 - 2-400 loss: 1.9620513916015625
2022-03-18 11:17:14.463546 - 2-600 loss: 3.554637908935547
training acc on epoch 2 is: 0.9674002528190613
VAL acc on epoch 2 is: 0.9336763024330139
