In [None]:
import os
import io
import re
import json
import string
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn import preprocessing
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig


In [None]:
# --- Dataset locations (tab-separated BIO files plus the tag inventory) ---
tags_data_path = "dataset/tags.txt"
train_data_path = "dataset/msra_train_bio"
test_data_path = "dataset/msra_test_bio"

# --- Model settings ---
# Pretrained checkpoint name used for both the tokenizer and the encoder.
model_name = "bert-base-chinese"
# Fixed sequence length: inputs are truncated/padded to this many positions.
max_len = 384
# Default-constructed BERT config; not referenced by the other visible cells.
configuration = BertConfig()


In [None]:
def preprocess_data(filename):
    """Read a tab-separated BIO file into flat, parallel columns.

    Every non-blank line must hold ``word<TAB>tag``. A blank or
    whitespace-only line closes the current sentence and advances the
    sentence counter.

    Args:
        filename: Path of the BIO file. It is decoded as UTF-8.

    Returns:
        dict with three equally long lists: ``'sentence'`` (the sentence
        index of each row, as ``str``), ``'word'`` and ``'tag'``.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    sequences = {
        'sentence': [],
        'word': [],
        'tag': []
    }
    sequence_index = 0
    # Explicit encoding: the platform default is locale dependent and would
    # mis-decode (or fail on) Chinese text on non-UTF-8 systems.
    with open(filename, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank or whitespace-only separator line: next sentence.
                sequence_index += 1
                continue

            fields = stripped.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    f"{filename}:{line_number}: expected 'word<TAB>tag', "
                    f"got {stripped!r}"
                )
            word, tag = fields
            sequences['sentence'].append(str(sequence_index))
            sequences['word'].append(word)
            sequences['tag'].append(tag)

    return sequences

def process_data(path):
    """Load a BIO file and return per-sentence words and integer tag ids.

    Args:
        path: Path of the BIO file; forwarded to ``preprocess_data``.

    Returns:
        Tuple ``(sentences, tag, enc_tag)`` where ``sentences`` is an array
        holding one list of words per sentence, ``tag`` is the parallel
        array of lists of encoded tag ids, and ``enc_tag`` is the
        ``LabelEncoder`` fitted on this file's tags. Sentences are returned
        in the order they appear in the file.
    """
    df = pd.DataFrame(preprocess_data(path))

    # NOTE(review): the encoder is fitted on the tags present in *this* file,
    # so two files with different tag sets would get different id mappings.
    enc_tag = preprocessing.LabelEncoder()
    df['tag'] = enc_tag.fit_transform(df['tag'])

    # Sentence ids are strings, so the default sorted groupby would order
    # them lexicographically ("0", "1", "10", ...). sort=False keeps the
    # groups in order of first appearance, i.e. file order.
    grouped = df.groupby('sentence', sort=False)
    sentences = grouped['word'].apply(list).values
    tag = grouped['tag'].apply(list).values
    return sentences, tag, enc_tag



In [None]:
def create_inputs_targets(path):
    """Build fixed-length model inputs and per-token tag targets from a BIO file.

    Each word is tokenized with the notebook-level ``tokenizer`` and its tag
    id is repeated once per produced token. Every sequence is truncated to
    ``max_len - 2`` tokens, wrapped with the ids 101 / 102, and padded with
    zeros up to ``max_len``.

    Args:
        path: Path of the BIO file; forwarded to ``process_data``.

    Returns:
        Tuple ``(x, y, tag_encoder)``: ``x`` is the list
        ``[input_ids, token_type_ids, attention_mask]`` of NumPy arrays,
        ``y`` is the NumPy array of tag ids, and ``tag_encoder`` is the
        label encoder returned by ``process_data``.
    """
    sentences, tags, tag_encoder = process_data(path)

    ids_rows = []
    type_rows = []
    mask_rows = []
    tag_rows = []

    # Two positions are reserved for the leading/trailing special tokens.
    body_limit = max_len - 2

    for words, word_tags in zip(sentences, tags):
        token_ids = []
        token_tags = []
        for word, word_tag in zip(words, word_tags):
            encoding = tokenizer.encode(word, add_special_tokens=False)
            token_ids.extend(encoding.ids)
            # Every token produced from the word inherits the word's tag.
            token_tags.extend([word_tag] * len(encoding))

        # 101 / 102: presumably the [CLS] / [SEP] ids of the vocab -- verify.
        token_ids = [101] + token_ids[:body_limit] + [102]
        # NOTE(review): 16 is a hard-coded tag id for the two special
        # positions; confirm which label it maps to for this dataset.
        token_tags = [16] + token_tags[:body_limit] + [16]

        used_len = len(token_ids)
        padding_len = max_len - used_len

        input_ids = token_ids + [0] * padding_len
        token_type_ids = [0] * used_len + [0] * padding_len
        attention_mask = [1] * used_len + [0] * padding_len
        # 17 marks padded positions; masked_ce_loss ignores this id.
        target_tags = token_tags + [17] * padding_len

        ids_rows.append(input_ids)
        type_rows.append(token_type_ids)
        mask_rows.append(attention_mask)
        tag_rows.append(target_tags)
        assert len(target_tags) == max_len, f'{len(input_ids)}, {len(target_tags)}'

    x = [np.array(ids_rows), np.array(type_rows), np.array(mask_rows)]
    y = np.array(tag_rows)
    return x, y, tag_encoder

In [None]:
# Tag inventory: a headerless tab-separated file read into one "Tags" column
# (presumably one tag per row -- verify against the file).
all_tags = pd.read_csv(
    tags_data_path,
    names=["Tags"],
    sep='\t',
)


In [None]:
# Save the slow pretrained tokenizer so that its files are available on disk
# under "<model_name>_tokens/" for the fast tokenizer to load.
slow_tokenizer = BertTokenizer.from_pretrained(model_name)
save_path = "{}_tokens/".format(model_name)
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and a no-op when the directory exists.
os.makedirs(save_path, exist_ok=True)
slow_tokenizer.save_pretrained(save_path)


In [None]:
# Load the fast tokenizer from saved file
# The path must match the directory written by the tokenizer-saving cell
# ("<model_name>_tokens/"), so that cell has to have been run at least once.
# NOTE(review): lowercase=True lowercases the input before vocabulary lookup;
# confirm this matches the casing convention of the saved vocab.
tokenizer = BertWordPieceTokenizer(
    "{}_tokens/vocab.txt".format(model_name), lowercase=True)

In [None]:
# Unreduced sparse cross-entropy: Reduction.NONE keeps one loss value per
# position so masked_ce_loss can zero out padding before aggregating.
# from_logits=False because the model head already applies softmax.
loss_object = keras.losses.SparseCategoricalCrossentropy(
    reduction=keras.losses.Reduction.NONE,
    from_logits=False,
)


def masked_ce_loss(real, pred):
    """Sparse cross-entropy averaged over non-padding positions only.

    Positions whose target id equals 17 (the padding tag written by
    ``create_inputs_targets``) contribute neither to the summed loss nor to
    the normalising count, so the loss scale does not depend on how much
    padding a batch contains.

    Args:
        real: Target tag ids; the value 17 marks padded positions.
        pred: Predicted class probabilities for each position.

    Returns:
        Scalar tensor: the summed per-position loss of the unmasked
        positions divided by their count (0 if every position is masked).
    """
    per_token_loss = loss_object(real, pred)

    mask = tf.cast(tf.math.not_equal(real, 17), dtype=per_token_loss.dtype)
    per_token_loss *= mask

    # Normalise by the number of real tokens rather than by every position;
    # divide_no_nan guards the all-padding case.
    return tf.math.divide_no_nan(
        tf.reduce_sum(per_token_loss), tf.reduce_sum(mask)
    )


def create_model(num_tags):
    """Build and compile the BERT-based token classification model.

    The pretrained encoder named by ``model_name`` is followed by dropout
    and a per-position softmax layer with ``num_tags + 1`` outputs.

    Args:
        num_tags: Number of entity tags. One extra output class is added on
            top of it (presumably for the padding tag id -- verify that the
            padding id used in the targets equals ``num_tags``).

    Returns:
        A compiled ``keras.Model`` taking
        ``[input_ids, token_type_ids, attention_mask]`` (each of length
        ``max_len``) and returning per-position class probabilities.
    """
    # BERT encoder
    encoder = TFBertModel.from_pretrained(model_name)

    # NER Model
    input_ids = keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    token_type_ids = keras.layers.Input(shape=(max_len,), dtype=tf.int32)
    attention_mask = keras.layers.Input(shape=(max_len,), dtype=tf.int32)

    outputs = encoder(
        input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask
    )
    # First encoder output: the per-token hidden states.
    embedding = outputs[0]
    embedding = keras.layers.Dropout(0.3)(embedding)
    # Softmax is applied here, so these are probabilities (not logits) and
    # the loss must be configured with from_logits=False.
    tag_probs = keras.layers.Dense(
        num_tags + 1, activation='softmax')(embedding)

    model = keras.Model(
        inputs=[input_ids, token_type_ids, attention_mask],
        outputs=[tag_probs],
    )
    # `learning_rate` is the supported argument name; `lr` is a deprecated
    # alias that newer Keras optimizers reject.
    optimizer = keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(optimizer=optimizer, loss=masked_ce_loss,
                  metrics=['accuracy'])
    return model


In [None]:
# Number of entity tags = number of rows in the tag inventory.
num_tags = len(all_tags)

# Probe for a TPU. Resolver construction fails when none is reachable, and
# the notebook then falls back to the default device placement.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    use_tpu = True
except Exception:
    # Deliberately broad (the failure type varies by environment) but, unlike
    # a bare `except:`, it lets KeyboardInterrupt / SystemExit propagate.
    use_tpu = False

if use_tpu:
    # Create distribution strategy
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

    # Create model inside the strategy scope so it is built under the strategy
    with strategy.scope():
        model = create_model(num_tags)
else:
    model = create_model(num_tags)

model.summary()

In [None]:
# Build the training tensors: x_train is the list
# [input_ids, token_type_ids, attention_mask], y_train holds the per-token tag
# ids, and tag_encoder is the label encoder fitted on the training file.
# Requires the notebook-level `tokenizer` to have been created first.
x_train, y_train, tag_encoder = create_inputs_targets(train_data_path)
