# Chinese Word Segmentation
Using tokenizer, padding and binary classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
# import keras_tuner as kt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras. preprocessing.sequence import pad_sequences

 The versions of TensorFlow you are currently using is 2.9.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [3]:
train = pd.read_csv('as_training.utf8', names=['data'])
print(train.head())

                    data
0                   時間　：
1  三月　十日　（　星期四　）　上午　十時　。
2                   地點　：
3      學術　活動　中心　一樓　簡報室　。
4                   主講　：


## Create data and labels

In [4]:
y = train['data'].copy()
X = train['data'].str.replace('\u3000', '')
print(X.head())
print(type(X))
print(y.head())

0               時間：
1    三月十日（星期四）上午十時。
2               地點：
3      學術活動中心一樓簡報室。
4               主講：
Name: data, dtype: object
<class 'pandas.core.series.Series'>
0                     時間　：
1    三月　十日　（　星期四　）　上午　十時　。
2                     地點　：
3        學術　活動　中心　一樓　簡報室　。
4                     主講　：
Name: data, dtype: object


In [5]:
def create_labels(data):
    label = []
    index = 0

    while index < len(data) - 1:
        if data[index + 1] == '\u3000':
            label.append(1)
            index += 2
        else:
            label.append(0)
            index += 1
            
    if index == len(data) - 1:
        label.append(1)
    return label

y = y.apply(create_labels)

In [6]:
print(y)
print(type(y))

0                                          [0, 1, 1]
1         [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1]
2                                          [0, 1, 1]
3               [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1]
4                                          [0, 1, 1]
                             ...                    
708948             [0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1]
708949                      [0, 1, 1, 1, 0, 0, 1, 1]
708950                                     [0, 1, 1]
708951    [0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1]
708952          [0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1]
Name: data, Length: 708953, dtype: object
<class 'pandas.core.series.Series'>


## Tokenization and Padding

In [7]:
tokenizer = Tokenizer(oov_token='<OOV>', split='\u3000', char_level=True)
tokenizer.fit_on_texts(train['data'])
char_index = tokenizer.word_index
total_chars = len(json.loads(tokenizer.get_config()['word_counts']))

In [8]:
print(list(char_index.keys())[:100])
print(len(char_index))

['<OOV>', '\u3000', '，', '的', '。', '一', '是', '不', '有', '在', '人', '我', '了', '、', '這', '大', '以', '為', '個', '中', '他', '會', '來', '國', '上', '時', '要', '們', '就', '到', '生', '學', '「', '可', '」', '年', '也', '之', '說', '對', '出', '而', '能', '地', '成', '得', '家', '多', '所', '自', '十', '於', '後', '都', '子', '好', '：', '過', '下', '與', '用', '方', '作', '如', '小', '和', '你', '心', '因', '發', '行', '天', '經', '那', '然', '分', '動', '現', '前', '？', '日', '去', '事', '公', '三', '法', '∥', '麼', '及', '長', '很', '業', '！', '同', '當', '開', '民', '其', '但', '二']
6086


In [9]:
max_length = X.str.len().max() # 188
# trunc_type = 'post'
padding = 'pre'

X_sequences = tokenizer.texts_to_sequences(X)
padded_X = pad_sequences(X_sequences, maxlen=max_length, padding=padding)
padded_y = pad_sequences(y, maxlen=max_length, padding=padding)
print(padded_X[1])
print(padded_X.shape)

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0  85 181  51  81 115 572
 221 164 114  25 721  51  26   5]
(708953, 188)


In [10]:
embedding_dim = 64

xIn = Input(shape=(max_length,))
x = Embedding(total_chars, embedding_dim, mask_zero=True, input_length=max_length)(xIn) # mask_zero will ignore timestamps with 0 (aka ignoring the padding)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(128, return_sequences=True))(x)
# x = Bidirectional(LSTM(128, return_sequences=True))(x)

# x = Flatten()(x) don't need to flatten, just put output layer as 2 neurons
# x = Dense(64, activation='swish')(x)
# x = Dense(64, activation='swish')(x)
x = Dense(64, activation='swish')(x)
# x = Dense(64, activation='swish')(x)
xOut = Dense(2, activation='linear')(x) # softmax is computed by loss function, so don't use activation="softmax" here

model = Model(inputs=xIn, outputs=xOut)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 188)]             0         
                                                                 
 embedding (Embedding)       (None, 188, 64)           389440    
                                                                 
 bidirectional (Bidirectiona  (None, 188, 256)         197632    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 188, 256)         394240    
 nal)                                                            
                                                                 
 dense (Dense)               (None, 188, 64)           16448     
                                                                 
 dense_1 (Dense)             (None, 188, 2)            130   

In [11]:
class MaskedSequenceLoss(tf.keras.losses.Loss):
    def __init__(
        self,
        average_across_timesteps=False,
        average_across_batch=False,
        sum_over_timesteps=True,
        sum_over_batch=True,
        softmax_loss_function=None,
        name=None,
        reduction=None, # dummy arg so it can be used as custom object when loading saved model
    ):
        super().__init__()
        self.opts = {
            "average_across_timesteps": average_across_timesteps,
            "average_across_batch": average_across_batch,
            "sum_over_timesteps": sum_over_timesteps,
            "sum_over_batch": sum_over_batch,
            "softmax_loss_function": softmax_loss_function,
            "name": name,
        }
    
    def call(self, y_true, y_pred):
        return tfa.seq2seq.sequence_loss(y_pred, y_true,
                                         weights=tf.cast(y_pred._keras_mask, tf.float32) if hasattr(y_pred, "_keras_mask") else tf.ones(y_true.shape),
                                         **self.opts)

def binary_crossentropy_arg_names_changed(labels, logits):
#     print(labels.numpy(), logits.numpy())
    output = tf.nn.sigmoid_cross_entropy_with_logits(tf.cast(labels, tf.float32)[..., tf.newaxis], logits)
    print(output)
    return output

model.compile(optimizer='adam', loss=MaskedSequenceLoss(), metrics=['acc'])

In [14]:
epochs = 100

callbacks = [
    tf.keras.callbacks.ModelCheckpoint('./8_best_model', monitor='acc', save_best_only=True),
    tf.keras.callbacks.EarlyStopping(monitor='acc', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='acc', factor=0.1, patience=3, verbose=1)
]

# padded_X = tf.convert_to_tensor(padded_X)
# padded_y = tf.convert_to_tensor(padded_y)
print(type(padded_y))
history = model.fit(padded_X, padded_y, batch_size=570, epochs=epochs, validation_split=0.2, callbacks=callbacks)

<class 'numpy.ndarray'>
Epoch 1/100
143/996 [===>..........................] - ETA: 50:11 - loss: 0.0258 - acc: 0.7909

KeyboardInterrupt: 

## Evaluate model

In [None]:
# custom_objects = { "MaskedSequenceLoss": MaskedSequenceLoss }
# with tf.keras.utils.custom_object_scope(custom_objects):
#     model = tf.keras.models.load_model("saved-models/bidirectional-lstm/epoch8_valloss0.0042")

In [None]:
# show predicted results in sentences
def segment_sentence(sentence, skip_array):
    assert len(sentence) == len(skip_array)
    segmented_sentence = ""
    for i in range(len(sentence)):
        segmented_sentence += sentence[i]
        if skip_array[i] == 1:
            segmented_sentence += " "
    return segmented_sentence

In [None]:
test_sentences = [
    "公教学生是个具有高尚情操、坚韧个性，同时热爱生活，热爱学习，并且愿为人群服务的领袖、双语学者、与彬彬君子。", # Fail
    "明天更有一場「希望大樹」締造最多雙胞胎集合挑戰金氏世界紀錄活動。", # OK
    "張玨的這番話讓目前還在台大唸博士班的郭淑珍及她的雙胞胎妹妹郭淑玲感受最深", # OK except that it splits 張玨
    "然而，就其思想倾向而言，它却是属于日本战后派的，是战后派文学的一个组成部分。", # Fail. Output: '然 而 ， 就 其 思 想 倾 向而 言 ， 它 却是 属 于 日 本 战 后 派 的， 是 战 后 派 文 学 的一 个 组成 部分 。 '
    "如果說電影《遠離賭城》是尼可拉斯凱吉藝術成就上的轉捩點", # OK except that it doesn't separate 如果說
    "吳宇森正計劃拍攝一部二次大戰的電影《Ｗｉｎｄｔａｌｋｅｒｓ》", # OK (二次大戰 should not be separated)
    "雄立獅島式是炎黃萬世其無疆",
    "你好我的名字是傑夫",
    "不過成員練唱時投入的程度可不輸給一般專業合唱團",
    "你他媽到底在說我什麼，你這個小婊子？我會讓你知道我畢業於海豹突擊隊班，我曾參與過無數次對基地組織的秘密突襲，並確認殺死了 300 多人。我接受過大猩猩戰爭的訓練，我是整個美國武裝部隊中的頂級狙擊手。你對我來說什麼都不是，只是另一個目標。我會用地球上從未見過的精確度把你他媽擦掉，記住我他媽的話。你認為你可以在互聯網上對我說那些狗屎嗎？再想想，混蛋。在我們說話的時候，我正在聯繫我在美國的秘密間諜網絡，你的 IP 正在被追踪，所以你最好為風暴做好準備，蛆蟲。這場風暴會摧毀你稱之為生命的可悲小東西。你他媽死定了，孩子。我可以在任何地方，任何時間，我可以用七百多種方式殺死你，而這只是我的徒手。我不僅在徒手格斗方面受過廣泛的訓練，而且我還可以使用美國海軍陸戰隊的整個武器庫，我會盡其所能地使用它來將你的悲慘屁股從大陸上抹去，你這個小混蛋。如果你能知道你那小小的“聰明”評論會給你帶來什麼樣的邪惡報應，也許你會忍住你的舌頭。但你不能，你沒有，現在你要付出代價，你這個該死的白痴。我會在你身上發火，你會淹死的。你他媽死定了，孩子。"[:187],
    "你瞅啥！瞅你咋地！再瞅一个试试！试试就试试！",
]

for test_sentence in test_sentences:
    test_sentence_sequence = tokenizer.texts_to_sequences([test_sentence])[0]
    test_sentence_sequence_padded = pad_sequences([test_sentence_sequence],
                                                                                  maxlen=max_length)[0]

    actual_pred_start_idx = max_length - len(test_sentence)
    test_preds = model.predict(test_sentence_sequence_padded[tf.newaxis, ...])[0, actual_pred_start_idx:]
    probabilities = tf.nn.softmax(test_preds)
    skip_array = tf.argmax(probabilities, axis=-1)

    segment_sentence(test_sentence, skip_array)

## Save Model

In [None]:
import datetime as dt
dtime = dt.time()
now = dt.datetime.now()
now.strftime("%Y-%m-%d %H-%M-%S")

model.save(f'8_Chinese_Word_Segmentation/8_saved_models/{now}.h5')