# Chinese Word Segmentation
Using tokenizer, padding and binary classification

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import keras_tuner as kt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras. preprocessing.sequence import pad_sequences
import tensorflow_addons as tfa

In [2]:
train = pd.read_csv('C:/Users/mandy/OneDrive/2022 IRS/ML/Datasets/Chinese Word Segmentation/as_training.utf8', names=['data'])
print(train.head())

                    data
0                   時間　：
1  三月　十日　（　星期四　）　上午　十時　。
2                   地點　：
3      學術　活動　中心　一樓　簡報室　。
4                   主講　：


## Create data and labels

In [3]:
y = train['data'].copy()
X = train['data'].str.replace('\u3000', '')
print(X.head())
print(type(X))
print(y.head())

0               時間：
1    三月十日（星期四）上午十時。
2               地點：
3      學術活動中心一樓簡報室。
4               主講：
Name: data, dtype: object
<class 'pandas.core.series.Series'>
0                     時間　：
1    三月　十日　（　星期四　）　上午　十時　。
2                     地點　：
3        學術　活動　中心　一樓　簡報室　。
4                     主講　：
Name: data, dtype: object


In [4]:
def create_labels(data):
    label = []
    index = 0

    while index < len(data) - 1:
        if data[index + 1] == '\u3000':
            label.append(1)
            index += 2
        else:
            label.append(0)
            index += 1
            
    if index == len(data) - 1:
        label.append(1)
    return label

y = y.apply(create_labels)

In [5]:
print(y)
print(type(y))

0                                          [0, 1, 1]
1         [0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1]
2                                          [0, 1, 1]
3               [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1]
4                                          [0, 1, 1]
                             ...                    
708948             [0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1]
708949                      [0, 1, 1, 1, 0, 0, 1, 1]
708950                                     [0, 1, 1]
708951    [0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1]
708952          [0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1]
Name: data, Length: 708953, dtype: object
<class 'pandas.core.series.Series'>


## Tokenization and Padding

In [6]:
tokenizer = Tokenizer(oov_token='<OOV>', split='\u3000', char_level=True)
tokenizer.fit_on_texts(train['data'])
char_index = tokenizer.word_index
total_chars = len(char_index) + 1

In [7]:
print(list(char_index.keys())[:100])
print(len(char_index))

['<OOV>', '\u3000', '，', '的', '。', '一', '是', '不', '有', '在', '人', '我', '了', '、', '這', '大', '以', '為', '個', '中', '他', '會', '來', '國', '上', '時', '要', '們', '就', '到', '生', '學', '「', '可', '」', '年', '也', '之', '說', '對', '出', '而', '能', '地', '成', '得', '家', '多', '所', '自', '十', '於', '後', '都', '子', '好', '：', '過', '下', '與', '用', '方', '作', '如', '小', '和', '你', '心', '因', '發', '行', '天', '經', '那', '然', '分', '動', '現', '前', '？', '日', '去', '事', '公', '三', '法', '∥', '麼', '及', '長', '很', '業', '！', '同', '當', '開', '民', '其', '但', '二']
6086


In [8]:
max_length = X.str.len().max() # 188
trunc_type = 'post'
padding = 'post'

X_sequences = tokenizer.texts_to_sequences(X)
padded_X = pad_sequences(X_sequences, maxlen=max_length, truncating=trunc_type, padding=padding)
padded_y = pad_sequences(y, maxlen=max_length, truncating=trunc_type, padding=padding)
print(padded_X[1])
print(padded_X.shape)

[ 85 181  51  81 115 572 221 164 114  25 721  51  26   5   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0]
(708953, 188)


In [23]:
embedding_dim = 256

xIn = Input((188))
x = Embedding(total_chars, embedding_dim, input_length=188, mask_zero=True)(xIn)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = Flatten()(x)
# x = Dense(64, activation='swish')(x)
# x = Dense(64, activation='swish')(x)
x = Dense(64, activation='swish')(x)
x = Dense(64, activation='swish')(x)
xOut = Dense(188, activation='sigmoid')(x)

model = Model(inputs=xIn, outputs=xOut)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_13 (InputLayer)       [(None, 188)]             0         
                                                                 
 embedding_11 (Embedding)    (None, 188, 256)          1558272   
                                                                 
 bidirectional_16 (Bidirecti  (None, 188, 128)         164352    
 onal)                                                           
                                                                 
 bidirectional_17 (Bidirecti  (None, 188, 128)         98816     
 onal)                                                           
                                                                 
 flatten (Flatten)           (None, 24064)             0         
                                                                 
 dense_17 (Dense)            (None, 64)                1540

In [None]:
# need this to ensure loss function is calculated correctly when using mask_zero=True

class MaskedSequenceLoss(losses.Loss):
    def _init_(
        self,
        average_across_timesteps=False,
        average_across_batch=False,
        sum_over_timesteps=True,
        sum_over_batch=True,
        softmax_loss_function=None,
        name=None,
        reduction=None, # dummy arg so it can be used as custom object when loading saved model
    ):
        super()._init_()
        self.opts = {
            "average_across_timesteps": average_across_timesteps,
            "average_across_batch": average_across_batch,
            "sum_over_timesteps": sum_over_timesteps,
            "sum_over_batch": sum_over_batch,
            "softmax_loss_function": softmax_loss_function,
            "name": name,
        }
    
    def call(self, y_true, y_pred):
        return tfa.seq2seq.sequence_loss(y_pred, y_true,
                                         weights=tf.cast(y_pred._keras_mask, tf.float32) if hasattr(y_pred, "_keras_mask") else tf.ones(y_true.shape),
                                         **self.opts)

In [24]:
epochs = 30

callbacks = [
    tf.keras.callbacks.ModelCheckpoint('./8_best_model_weights', monitor='acc', save_best_only=True),
    tf.keras.callbacks.EarlyStopping(monitor='acc', patience=5, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='acc', factor=0.1, patience=3, verbose=1)
]

padded_X = tf.convert_to_tensor(padded_X)
padded_y = tf.convert_to_tensor(padded_y)

history = model.fit(padded_X, padded_y, batch_size=32, validation_split=0.2, epochs=epochs, callbacks=callbacks, use_multiprocessing=True)

<class 'tensorflow.python.framework.ops.EagerTensor'>
Epoch 1/30
