In [1]:
import os
import shutil
import json
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import numpy as np
# from official.nlp import optimization  # to create AdamW optimizer

# Seed both NumPy's and TensorFlow's global random generators with the same
# value so that repeated runs start from the same random state.
RANDOM_SEED = 68
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# 数据预处理
1. TNEWS 今日头条中文新闻分类

In [2]:
# TNEWS category names listed in class-id order: a name's position in this
# tuple is the integer label used for training.
_TNEWS_CATEGORIES = (
    "news_story",
    "news_culture",
    "news_entertainment",
    "news_sports",
    "news_finance",
    "news_house",
    "news_car",
    "news_edu",
    "news_tech",
    "news_military",
    "news_travel",
    "news_world",
    "news_stock",
    "news_agriculture",
    "news_game",
)

# Forward map (category name -> class id) and its inverse (class id -> name).
desc_label_dict = {name: idx for idx, name in enumerate(_TNEWS_CATEGORIES)}
label_desc_dict = dict(enumerate(_TNEWS_CATEGORIES))

# Number of target classes; used as the width of the classifier head.
CLASS_SIZE = len(desc_label_dict)

In [3]:
# Load the training split. The file holds one JSON object per line; each
# object provides a 'sentence' string and a 'label_desc' category name.
sentences = []
labels = []
# Explicit UTF-8: the sentences are Chinese text, and without `encoding`
# open() falls back to the platform locale, which is not UTF-8 everywhere.
with open('train.json', encoding='utf-8') as f:
    for line in f:  # stream the file instead of materialising readlines()
        if not line.strip():
            continue  # tolerate blank / trailing lines instead of crashing in json.loads
        data = json.loads(line)
        labels.append(desc_label_dict[data['label_desc']])
        sentences.append(data['sentence'])

# String tensor of raw sentences and one-hot targets with one column per class.
sentences_tensor = tf.convert_to_tensor(sentences)
labels_tensor = tf.one_hot(labels, CLASS_SIZE)

2022-03-21 15:56:01.351535: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-21 15:56:02.378157: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6642 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080, pci bus id: 0000:17:00.0, compute capability: 7.5
2022-03-21 15:56:02.378860: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 6659 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 2080, pci bus id: 0000:b3:00.0, compute capability: 7.5


In [4]:
# Length statistics of the training sentences, measured with len() on each
# string: first the mean length, then the maximum length.
# NOTE(review): the recorded maximum (145) is larger than the 128-wide
# preprocessor outputs shown in the model summary, so the longest sentences
# are presumably truncated - confirm.
sentence_lengths = [len(s) for s in sentences]
print('训练集平均长度', sum(sentence_lengths) / len(sentences))
print('训练集最大长度', max(sentence_lengths))

训练集平均长度 22.13124062968516
训练集最大长度 145


In [5]:
# Load the dev (validation) split. The file holds one JSON object per line;
# each object provides a 'sentence' string and a 'label_desc' category name.
dev_sentences = []
dev_labels = []
# Explicit UTF-8: the sentences are Chinese text, and without `encoding`
# open() falls back to the platform locale, which is not UTF-8 everywhere.
with open('dev.json', encoding='utf-8') as f:
    for line in f:  # stream the file instead of materialising readlines()
        if not line.strip():
            continue  # tolerate blank / trailing lines instead of crashing in json.loads
        data = json.loads(line)
        dev_labels.append(desc_label_dict[data['label_desc']])
        dev_sentences.append(data['sentence'])

# String tensor of raw sentences and one-hot targets with one column per class.
dev_sentences_tensor = tf.convert_to_tensor(dev_sentences)
dev_labels_tensor = tf.one_hot(dev_labels, CLASS_SIZE)

## 模型构建

In [6]:
# TF Hub handles for the Chinese BERT preprocessing model and its encoder.
# Per the original note, downloaded modules are cached under
# /tmp/tfhub_modules by default.
BERT_PREPROCESS_HANDLE = "https://tfhub.dev/tensorflow/bert_zh_preprocess/3"
BERT_ENCODER_HANDLE = "https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/4"

preprocessor = hub.KerasLayer(BERT_PREPROCESS_HANDLE)
# trainable=True: the encoder weights are updated during fit (fine-tuning).
encoder = hub.KerasLayer(BERT_ENCODER_HANDLE, trainable=True)

In [7]:
def build_classifier_model(output_size=1,preprocessor=preprocessor,encoder=encoder,dropout_rate=0.6):
    """Assemble a text classifier: preprocessing -> encoder -> dropout -> linear head.

    Args:
        output_size: Number of units in the final Dense layer (one logit per class).
        preprocessor: Layer applied to the raw string input; its output is fed
            to `encoder`. Defaults to the notebook-level `preprocessor`.
        encoder: Layer whose output exposes a 'pooled_output' entry. Defaults
            to the notebook-level `encoder`.
        dropout_rate: Rate of the Dropout layer placed between the pooled
            encoder output and the classifier head. Defaults to 0.6, the value
            that used to be hard-coded, so existing calls behave as before.

    Returns:
        A tf.keras.Model mapping a batch of strings to the raw (un-activated)
        outputs of the 'classifier' Dense layer.

    Note:
        The `preprocessor` / `encoder` defaults are bound when this cell runs;
        re-run it after re-creating those layers to pick up the new objects.
    """
    # Scalar string per example: tokenisation happens inside the model.
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    encoder_inputs = preprocessor(text_input)
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    # activation=None keeps the outputs as logits; pair with a from_logits loss.
    net = tf.keras.layers.Dense(output_size, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [8]:
# Build the classifier with one output unit per TNEWS category, then print
# its layer summary.
classifier_model = build_classifier_model(output_size=CLASS_SIZE)
classifier_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [9]:
# Sanity check before fine-tuning: run one education-news headline through
# the model and print the softmax over its 15 class logits.
text_test = ['上课时学生手机响个不停，老师一怒之下把手机摔了，家长拿发票让老师赔，大家怎么看待这种事？']  # education news
bert_raw_result = classifier_model(tf.constant(text_test))
class_probabilities = tf.math.softmax(bert_raw_result, axis=-1)
print(class_probabilities)

tf.Tensor(
[[0.06891412 0.07392806 0.00739026 0.0428463  0.01439426 0.04456672
  0.06459893 0.1343775  0.05999255 0.1392031  0.10317845 0.09619764
  0.00053205 0.14113835 0.00874169]], shape=(1, 15), dtype=float32)


In [10]:
# Accuracy over one-hot targets, matching the one-hot label tensors.
metrics = tf.keras.metrics.CategoricalAccuracy()
# from_logits=True because the classifier head has no activation, so the
# loss must apply the softmax itself.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

In [11]:
# Adam with a learning rate of 2e-5.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

In [12]:
# Attach the optimizer, loss and metric defined in the previous cells.
classifier_model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics,
)

In [13]:
# Fine-tune on the training split, validating on the dev split after each
# epoch; `history` keeps the per-epoch metrics.
# NOTE(review): there is no early stopping or checkpointing here, and the
# recorded dev loss after 10 epochs (about 2.33 at about 55% accuracy)
# suggests overfitting - consider tf.keras.callbacks.EarlyStopping.
history = classifier_model.fit(
    x=sentences_tensor,
    y=labels_tensor,
    validation_data=(dev_sentences_tensor, dev_labels_tensor),
    epochs=10,
    batch_size=16,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Re-run the sample headline through the fine-tuned model and print the
# index of the most probable class.
bert_raw_result = classifier_model(tf.constant(text_test))
predicted_class_ids = tf.math.argmax(tf.math.softmax(bert_raw_result), axis=1)
print(predicted_class_ids)

tf.Tensor([7], shape=(1,), dtype=int64)


In [15]:
# Translate the predicted class id into its category name. The id is taken
# from the model output (`bert_raw_result` from the previous cell) instead of
# a hand-copied constant, so the displayed name always matches the prediction.
# argmax over the logits selects the same class as argmax over their softmax.
label_desc_dict[int(tf.math.argmax(bert_raw_result, axis=1).numpy()[0])]

'news_edu'

In [16]:
# Final loss and categorical accuracy on the dev split, returned as a list
# in that order.
classifier_model.evaluate(x=dev_sentences_tensor, y=dev_labels_tensor)



[2.3313093185424805, 0.5536999702453613]