In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

2022-05-31 14:50:45.604995: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [32]:
model_name = "bert-base-chinese" # https://huggingface.co/bert-base-chinese
max_len = 384
# download from [here](https://github.com/lemonhu/NER-BERT-pytorch/tree/master/data/msra)
train_data_path = "dataset/msra_train_bio"
test_data_path = "dataset/msra_test_bio"
tags_data_path = "dataset/tags.txt"

In [4]:
def preprocess_data(filename, max_data_len):
    with open(filename) as f:
        sequences = {
            'sentence': [],
            'word': [],
            'tag': []
        }
        sequnce_index = 0
        for line in f:
            if (max_data_len and len(sequences['word']) > max_data_len):
                break
            if line and line != '\n':
                sequences['sentence'].append(str(sequnce_index))
                splited = line.strip().split('\t')
                word, tag = splited
                sequences['word'].append(word)
                sequences['tag'].append(tag)
            else:
                sequnce_index += 1

    return sequences


def process_data(path, max_data_len):
    data = preprocess_data(path, max_data_len)
    df = pd.DataFrame(data)
    enc_tag = preprocessing.LabelEncoder()
    df.loc[:, 'tag'] = enc_tag.fit_transform(df['tag'])
    sentences = df.groupby('sentence')["word"].apply(list).values
    tag = df.groupby('sentence')['tag'].apply(list).values
    return sentences, tag, enc_tag


In [38]:
test_data = preprocess_data(test_data_path, 155)
print(pd.DataFrame(test_data))

    sentence word    tag
0          0    中  B-ORG
1          0    共  I-ORG
2          0    中  I-ORG
3          0    央  I-ORG
4          0    致      O
..       ...  ...    ...
151        1    基      O
152        1    本      O
153        1    任      O
154        1    务      O
155        1    。      O

[156 rows x 3 columns]


In [40]:
def process_data_1():
    df = pd.DataFrame(test_data)
    enc_tag = preprocessing.LabelEncoder()
    df.loc[:, 'tag'] = enc_tag.fit_transform(df['tag'])
    sentences = df.groupby('sentence')["word"].apply(list).values
    tag = df.groupby('sentence')['tag'].apply(list).values
    return sentences, tag, enc_tag

sentences, tag, enc_tag = process_data_1()
print(pd.DataFrame(sentences))
print(pd.DataFrame(tag))

                                                   0
0  [中, 共, 中, 央, 致, 中, 国, 致, 公, 党, 十, 一, 大, 的, 贺, ...
1  [在, 过, 去, 的, 五, 年, 中, ，, 致, 公, 党, 在, 邓, 小, 平, ...
                                                   0
0  [0, 2, 2, 2, 4, 0, 2, 2, 2, 2, 2, 2, 2, 4, 4, ...
1  [4, 4, 4, 4, 4, 4, 4, 4, 0, 2, 2, 4, 1, 3, 3, ...


In [45]:
num_tags = 7
def create_inputs_targets_1(sentences, tags, tag_encoder):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "tags": []
    }

    for sentence, tag in zip(sentences, tags):
        input_ids = []
        target_tags = []
        for idx, word in enumerate(sentence):
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
            num_tokens = len(ids)
            # keep the dim of input_ids and target_tags, e.g. [6] *2 = [6, 6]
            target_tags.extend([tag[idx]] * num_tokens)

        # Pad truncate, reserve space for[CLS] and [SEP]
        input_ids = input_ids[:max_len - 2]
        target_tags = target_tags[:max_len - 2]

        # https://huggingface.co/docs/transformers/v4.19.2/en/glossary#token-type-ids

        input_ids = [101] + input_ids + [102]
        target_tags = [num_tags-1] + target_tags + [num_tags-1]  # [num_tags-1] is O
        token_type_ids = [0] * len(input_ids) # All is zero means that there is no relation between two sentences
        # This argument indicates to the model which tokens should be attended to, and which should not.
        attention_mask = [1] * len(input_ids) # The token need to be attended to
        padding_len = max_len - len(input_ids)

        input_ids = input_ids + ([0] * padding_len) # Add padded tokens
        attention_mask = attention_mask + ([0] * padding_len) # Add padded attention mask
        token_type_ids = token_type_ids + ([0] * padding_len) # Fill padded token type ids with zero
        target_tags = target_tags + ([num_tags] * padding_len) # Fill padded target tags with `undefined` tag

        dataset_dict["input_ids"].append(input_ids)
        dataset_dict["token_type_ids"].append(token_type_ids)
        dataset_dict["attention_mask"].append(attention_mask)
        dataset_dict["tags"].append(target_tags)
        assert len(
            target_tags) == max_len, f'{len(input_ids)}, {len(target_tags)}'

    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = dataset_dict["tags"]
    return x, y, tag_encoder


x, y, tag_encoder = create_inputs_targets_1(sentences, tag, enc_tag)

print(x[0])
print(y[0])

[[ 101  704 1066  704 1925 5636  704 1744 5636 1062 1054 1282  671 1920
  4638 6590 6404 1392  855  807 6134  510 1392  855 1398 2562 8038 1762
   704 1744 5636 1062 1054 5018 1282  671 3613 1059 1744  807 6134 1920
   833 7384 7028 1374 2458  722 7354 8024  704 1744 1066  772 1054  704
  1925 1999 1447  833 6474 1403 1920  833 6134 4850 4178 4164 4638 4867
  6590 8024 1403 5636 1062 1054 4638 1398 2562  812 5636  809  779 1147
  4638 7309  952 8013  102    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0   

In [5]:
def create_inputs_targets(path, max_data_len):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "tags": []
    }
    sentences, tags, tag_encoder = process_data(path, max_data_len)

    for sentence, tag in zip(sentences, tags):
        input_ids = []
        target_tags = []
        for idx, word in enumerate(sentence):
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)
            num_tokens = len(ids)
            # keep the dim of input_ids and target_tags, e.g. [6] *2 = [6, 6]
            target_tags.extend([tag[idx]] * num_tokens)

        # Pad truncate, reserve space for[CLS] and [SEP]
        input_ids = input_ids[:max_len - 2]
        target_tags = target_tags[:max_len - 2]

        # https://huggingface.co/docs/transformers/v4.19.2/en/glossary#token-type-ids

        input_ids = [101] + input_ids + [102]
        target_tags = [num_tags-1] + target_tags + [num_tags-1]  # [num_tags-1] is O
        token_type_ids = [0] * len(input_ids) # All is zero means that there is no relation between two sentences
        # This argument indicates to the model which tokens should be attended to, and which should not.
        attention_mask = [1] * len(input_ids) # The token need to be attended to
        padding_len = max_len - len(input_ids)

        input_ids = input_ids + ([0] * padding_len) # Add padded tokens
        attention_mask = attention_mask + ([0] * padding_len) # Add padded attention mask
        token_type_ids = token_type_ids + ([0] * padding_len) # Fill padded token type ids with zero
        target_tags = target_tags + ([num_tags] * padding_len) # Fill padded target tags with `undefined` tag

        dataset_dict["input_ids"].append(input_ids)
        dataset_dict["token_type_ids"].append(token_type_ids)
        dataset_dict["attention_mask"].append(attention_mask)
        dataset_dict["tags"].append(target_tags)
        assert len(
            target_tags) == max_len, f'{len(input_ids)}, {len(target_tags)}'

    for key in dataset_dict:
        dataset_dict[key] = np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    y = dataset_dict["tags"]
    return x, y, tag_encoder


In [6]:
num_tags = pd.read_csv(tags_data_path, sep='\t', names=["tag"])["tag"].nunique()

In [None]:
# Save the slow pretrained tokenizer
slow_tokenizer = BertTokenizer.from_pretrained(model_name)
save_path = "{}_tokens/".format(model_name)
if not os.path.exists(save_path):
    os.makedirs(save_path)
slow_tokenizer.save_pretrained(save_path)

In [7]:
# Load the fast tokenizer from saved file
tokenizer = BertWordPieceTokenizer("{}_tokens/vocab.txt".format(model_name), lowercase=True)

In [8]:
x_train, y_train, tag_encoder = create_inputs_targets(train_data_path, max_data_len=110829) # 221657

In [47]:
is_equal = tf.math.equal(7, 7) # True
print(is_equal)
mask = tf.math.logical_not(is_equal) # False
print(mask)
mask = tf.cast(mask, dtype=tf.float32) # 0
print(mask)
loss_ = 0.2
loss_ *= mask
print(loss_)

tf.Tensor(True, shape=(), dtype=bool)
tf.Tensor(False, shape=(), dtype=bool)
tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(0.0, shape=(), dtype=float32)


In [9]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=False, reduction=tf.keras.losses.Reduction.NONE
)

# Each batch of data will consist of variable sized sentence tokens with
# appropriate padding in both input and target.
# During loss calculation, we ignore the loss corresponding padding tokens
# in the target.
def masked_ce_loss(real, pred):
    loss_ = loss_object(real, pred)

    # [pad] -> num_tags(undefined) True -logical_not-> False -cast-> 0 
    mask = tf.math.logical_not(tf.math.equal(real, num_tags))
    mask = tf.cast(mask, dtype=loss_.dtype)

    loss_ *= mask

    return tf.reduce_mean(loss_)


def create_model(num_tags):
    # BERT encoder
    encoder = TFBertModel.from_pretrained(model_name)

    # NER Model
    # Input() is used to instantiate a Keras tensor.
    input_ids = keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = keras.layers.Input(shape=(max_len,), dtype=tf.int32)

    outputs = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )
    # last_hidden_state, more details [here](https://huggingface.co/docs/transformers/v4.19.2/en/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions)
    embedding = outputs[0]
    embedding = keras.layers.Dropout(0.3)(embedding)
    tag_logits = keras.layers.Dense(num_tags+1, activation='softmax')(embedding)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[tag_logits],
    )
    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(optimizer=optimizer, loss=masked_ce_loss, metrics=['accuracy'])
    return model

In [None]:
model = create_model(num_tags)

In [48]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 384)]        0           []                               
                                                                                                  
 input_3 (InputLayer)           [(None, 384)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 384)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  102267648   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_3[0][0]',            

In [None]:
bs = 64

history = model.fit(
    x_train,
    y_train,
    epochs=1,
    verbose=1,
    batch_size=bs,
    validation_split=0.1
)

In [11]:
dataset_name = 'ner'
saved_model_path = './{}_bert'.format(dataset_name.replace('/', '_'))

In [None]:
model.save(saved_model_path, include_optimizer=False)

In [12]:
reloaded_model = tf.saved_model.load(saved_model_path)

In [13]:
def create_test_input_from_text(texts, use_tensor=False):
    dataset_dict = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": []
    }
    for sentence in texts:
        input_ids = []
        for idx, word in enumerate(sentence.split()):
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids.ids)

        # Pad and create attention masks.
        # Skip if truncation is needed
        input_ids = input_ids[:max_len - 2]

        input_ids = [101] + input_ids + [102]
        n_tokens = len(input_ids)
        token_type_ids = [0] * len(input_ids)
        attention_mask = [1] * len(input_ids)
        padding_len = max_len - len(input_ids)

        input_ids = input_ids + ([0] * padding_len)
        attention_mask = attention_mask + ([0] * padding_len)
        token_type_ids = token_type_ids + ([0] * padding_len)

        dataset_dict["input_ids"].append(input_ids)
        dataset_dict["token_type_ids"].append(token_type_ids)
        dataset_dict["attention_mask"].append(attention_mask)

    for key in dataset_dict:
        dataset_dict[key] = tf.constant(dataset_dict[key]) if use_tensor else np.array(dataset_dict[key])

    x = [
        dataset_dict["input_ids"],
        dataset_dict["token_type_ids"],
        dataset_dict["attention_mask"],
    ]
    return x, n_tokens

In [26]:
def predict_from_text(text, model):
    x_test, n_tokens = create_test_input_from_text([text], use_tensor=True)
    pred_test = model.predict(x_test) if hasattr(model, 'predict') else model(x_test)
    # ignore predictions of padding tokens
    pred_tags = np.argmax(pred_test, 2)[0][:n_tokens]
    le_dict = dict(zip(tag_encoder.transform(
        tag_encoder.classes_), tag_encoder.classes_))
    tags = [le_dict.get(_, '[pad]') for _ in pred_tags]
    res = []
    words = {
        'word': '',
        'tag': None
    }
    for idx, tag in enumerate(tags):
        token = x_test[0][0][idx]
        token = token.numpy()
        if(token == 101 or token == 102 or token == None):
            continue
        if(tag != 'O' and tag != '[pad]'):
            pre, suf = tag.split('-')
            words['tag'] = suf
            word = tokenizer.decode([token])
            words['word'] =  words['word'] + word if words['word'] else word
        else:
            if(words['tag']):
                res.append(words)
            words = {
                'word': '',
                'tag': None
            }
    return pd.DataFrame(res)


In [28]:
test_inputs = '李华住在朝阳区香河园街道西坝河北里社区，在5月4号去过天安门广场，5号下午去了太阳宫凯德茂商场。'
pre_train_model = model
print(predict_from_text(test_inputs, pre_train_model))

             word  tag
0  李华住在朝阳区香河园街道西坝  LOC
1            北里社区  LOC
2          在5月4号去  LOC
3               安  LOC
4              广场  LOC
5  5号下午去了太阳宫凯德茂商场  LOC


In [29]:
test_inputs = '李华住在朝阳区香河园街道西坝河北里社区，在5月4号去过天安门广场，5号下午去了太阳宫凯德茂商场。'
trained_model = reloaded_model
print(predict_from_text(test_inputs, trained_model))

              word  tag
0               李华  PER
1  朝阳区香河园街道西坝河北里社区  LOC
2            天安门广场  LOC
3         太阳宫凯德茂商场  LOC


In [30]:
news = '感染者1698：通过主动就诊发现，现住通州区张家湾镇环湖小镇，快递配送员。5月17日曾前往东城区东方广场送快递，自述5月21日、22日先后出现腹泻、发热等症状，5月22日前往医院就诊，5月23日报告核酸检测结果为阳性，当日诊断为确诊病例，临床分型为轻型。'
trained_model = reloaded_model
print(predict_from_text(news, trained_model))

          word  tag
0  通州区张家湾镇环湖小镇  LOC
1      东城区东方广场  LOC
