In [1]:
!pip install tensorflow-addons

import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, losses, optimizers, callbacks
import tensorflow_addons as tfa
import json
import pickle
import os



# Load and label data

In [1]:
!wget http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip

--2022-05-07 19:42:50--  http://sighan.cs.uchicago.edu/bakeoff2005/data/icwb2-data.zip
Resolving sighan.cs.uchicago.edu (sighan.cs.uchicago.edu)... 128.135.164.125
Connecting to sighan.cs.uchicago.edu (sighan.cs.uchicago.edu)|128.135.164.125|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 52640127 (50M) [application/zip]
Saving to: ‘icwb2-data.zip’


2022-05-07 19:43:32 (1.35 MB/s) - ‘icwb2-data.zip’ saved [52640127/52640127]



In [2]:
!unzip icwb2-data.zip

Archive:  icwb2-data.zip
   creating: icwb2-data/
   creating: icwb2-data/doc/
  inflating: icwb2-data/doc/instructions.txt  
  inflating: icwb2-data/doc/result_instructions.txt  
   creating: icwb2-data/gold/
  inflating: icwb2-data/gold/as_testing_gold.txt  
  inflating: icwb2-data/gold/as_testing_gold.utf8  
  inflating: icwb2-data/gold/as_training_words.txt  
  inflating: icwb2-data/gold/as_training_words.utf8  
  inflating: icwb2-data/gold/cityu_test_gold.txt  
  inflating: icwb2-data/gold/cityu_test_gold.utf8  
  inflating: icwb2-data/gold/cityu_training_words.txt  
  inflating: icwb2-data/gold/cityu_training_words.utf8  
  inflating: icwb2-data/gold/msr_test_gold.txt  
  inflating: icwb2-data/gold/msr_test_gold.utf8  
  inflating: icwb2-data/gold/msr_training_words.txt  
  inflating: icwb2-data/gold/msr_training_words.utf8  
  inflating: icwb2-data/gold/pku_test_gold.txt  
  inflating: icwb2-data/gold/pku_test_gold.utf8  
  inflating: icwb2-data/gold/pku_training_words.txt  
  i

## Read saved dataset (only run if it exists)

In [2]:
with open("tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

df = pd.read_csv("df.csv")

In [17]:
def label(sentence):
    labels = []
    for i in range(1, len(sentence)):
        if sentence[i-1] != "\u3000": # trailing punctuation will be ignored if it is preceded by a space (which I assume to always be true)
            if sentence[i] == "\u3000":
                labels.append(1)
            else:
                labels.append(0)
    return labels

In [12]:
df = pd.read_csv("icwb2-data/training/as_training.utf8", names=["sentences_split"])
df["sentences"] = df["sentences_split"].str.replace("\u3000", "").str[:-1] # remove spaces + strip trailing punctuation
df["labels"] = df["sentences_split"].map(label)
df["labels_padded"] = list(tf.keras.preprocessing.sequence.pad_sequences(df["labels"],
                                                                         value=0, # ignored during loss calculation
                                                                         padding="pre"))

# Preprocess data

In [22]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(df["sentences"])

In [24]:
df["sentences_word_indices"] = tokenizer.texts_to_sequences(df["sentences"])

In [33]:
df["sentences_padded"] = list(tf.keras.preprocessing.sequence.pad_sequences(df["sentences_word_indices"],
                                                                            value=0, # masked by Embedding layer
                                                                            padding="pre"))

## Save tokenizer and DataFrame to disk

In [8]:
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

df.to_csv("df.csv")

# Create model

## Define model architecture

In [39]:
max_sentence_length = len(df["sentences_padded"].iloc[0])
vocab_size = len(json.loads(tokenizer.get_config()["word_counts"]))

### Bidirectional LSTM

In [40]:
xIn = layers.Input(shape=(max_sentence_length,))
x = layers.Embedding(input_dim=vocab_size,
                     output_dim=200,
                     mask_zero=True)(xIn)
# merge forward and backward outputs by concatenating outputs along last dimension, NOT by averaging (potentially causes information loss)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(x)
x = layers.Dense(64, activation="relu")(x)
xOut = layers.Dense(2, activation="linear")(x) # softmax is computed by loss function, so don't use activation="softmax" here

model = tf.keras.Model(xIn, xOut)
model.summary()

2022-05-13 16:49:13.395013: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-13 16:49:13.807431: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-13 16:49:13.808532: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-13 16:49:13.819218: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 187)]             0         
                                                                 
 embedding (Embedding)       (None, 187, 200)          1215400   
                                                                 
 bidirectional (Bidirectiona  (None, 187, 256)         336896    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 187, 256)         394240    
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 187, 256)         394240    
 nal)                                                            
                                                             

### Unidirectional LSTM

In [9]:
xIn = layers.Input(shape=(max_sentence_length,))
x = layers.Embedding(input_dim=vocab_size,
                     output_dim=200,
                     mask_zero=True)(xIn)
# merge forward and backward outputs by concatenating outputs along last dimension, NOT by averaging (potentially causes information loss)
x = layers.LSTM(128, return_sequences=True)(x)
x = layers.LSTM(128, return_sequences=True)(x)
x = layers.LSTM(128, return_sequences=True)(x)
x = layers.Dense(64, activation="relu")(x)
xOut = layers.Dense(2, activation="linear")(x) # softmax is computed by loss function, so don't use activation="softmax" here
# xOut = layers.Flatten()(x)

model = tf.keras.Model(xIn, xOut)
model.summary()

2022-05-11 16:53:37.502209: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-11 16:53:37.950587: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-11 16:53:37.951700: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-11 16:53:37.962485: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 187)]             0         
                                                                 
 embedding (Embedding)       (None, 187, 200)          1215400   
                                                                 
 lstm (LSTM)                 (None, 187, 128)          168448    
                                                                 
 lstm_1 (LSTM)               (None, 187, 128)          131584    
                                                                 
 lstm_2 (LSTM)               (None, 187, 128)          131584    
                                                                 
 dense (Dense)               (None, 187, 64)           8256      
                                                                 
 dense_1 (Dense)             (None, 187, 2)            130   

### Multi-head attention

In [None]:
xIn = layers.Input(shape=(max_sentence_length,))
x = layers.Embedding(input_dim=vocab_size,
                     output_dim=200,
                     mask_zero=True)(xIn)


## Define custom masked loss

In [42]:
class MaskedSequenceLoss(losses.Loss):
    def __init__(
        self,
        average_across_timesteps=False,
        average_across_batch=False,
        sum_over_timesteps=True,
        sum_over_batch=True,
        softmax_loss_function=None,
        name=None,
        reduction=None, # dummy arg so it can be used as custom object when loading saved model
    ):
        super().__init__()
        self.opts = {
            "average_across_timesteps": average_across_timesteps,
            "average_across_batch": average_across_batch,
            "sum_over_timesteps": sum_over_timesteps,
            "sum_over_batch": sum_over_batch,
            "softmax_loss_function": softmax_loss_function,
            "name": name,
        }
    
    def call(self, y_true, y_pred):
        return tfa.seq2seq.sequence_loss(y_pred, y_true,
                                         weights=tf.cast(y_pred._keras_mask, tf.float32) if hasattr(y_pred, "_keras_mask") else tf.ones(y_true.shape),
                                         **self.opts)

def binary_crossentropy_arg_names_changed(labels, logits):
#     print(labels.numpy(), logits.numpy())
    output = tf.nn.sigmoid_cross_entropy_with_logits(tf.cast(labels, tf.float32)[..., tf.newaxis], logits)
    print(output)
    return output
#     return tf.keras.losses.binary_crossentropy(y_true=labels, y_pred=logits, from_logits=True)

In [43]:
model.compile(optimizer=optimizers.Adam(learning_rate=1e-3),
#               loss=MaskedSequenceLoss(softmax_loss_function=binary_crossentropy_arg_names_changed),
#               loss=MaskedSequenceLoss(softmax_loss_function=tf.nn.sigmoid_cross_entropy_with_logits),
              loss=MaskedSequenceLoss(),
              metrics=["accuracy"])

## Train model

In [44]:
model.fit(x=tf.stack(df["sentences_padded"]),
          y=tf.stack(df["labels_padded"]),
          batch_size=720,
          epochs=1000,
          validation_split=0.2,
          callbacks=[
              callbacks.ReduceLROnPlateau(patience=10, verbose=1),
              callbacks.EarlyStopping(patience=20, verbose=1, restore_best_weights=True),
              callbacks.ModelCheckpoint(filepath=os.path.join("saved-models", "unidirectional-lstm", "epoch{epoch}_valloss{val_loss:.4f}"), verbose=1, save_best_only=True),
          ]) 

Epoch 1/1000

KeyboardInterrupt: 

## Evaluate model

In [46]:
custom_objects = { "MaskedSequenceLoss": MaskedSequenceLoss }
with tf.keras.utils.custom_object_scope(custom_objects):
    model = tf.keras.models.load_model("saved-models/bidirectional-lstm/epoch8_valloss0.0042")

2022-05-13 17:19:18.479087: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 14 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.
2022-05-13 17:19:20.295208: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 14 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.
2022-05-13 17:19:22.050232: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 14 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.
2022-05-13 17:19:22.088671: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 14 outputs but the _output_shapes attribute specifies shapes for 48 outputs. Output shapes may be inaccurate.
2022-05-13 17:19:22.357560: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 14 outputs but 

In [47]:
def segment_sentence(sentence, skip_array):
    assert len(sentence) == len(skip_array)
    segmented_sentence = ""
    for i in range(len(sentence)):
        segmented_sentence += sentence[i]
        if skip_array[i] == 1:
            segmented_sentence += " "
    return segmented_sentence

In [66]:
test_sentences = [
    "公教学生是个具有高尚情操、坚韧个性，同时热爱生活，热爱学习，并且愿为人群服务的领袖、双语学者、与彬彬君子。", # Fail
    "明天更有一場「希望大樹」締造最多雙胞胎集合挑戰金氏世界紀錄活動。", # OK
    "張玨的這番話讓目前還在台大唸博士班的郭淑珍及她的雙胞胎妹妹郭淑玲感受最深", # OK except that it splits 張玨
    "然而，就其思想倾向而言，它却是属于日本战后派的，是战后派文学的一个组成部分。", # Fail. Output: '然 而 ， 就 其 思 想 倾 向而 言 ， 它 却是 属 于 日 本 战 后 派 的， 是 战 后 派 文 学 的一 个 组成 部分 。 '
    "如果說電影《遠離賭城》是尼可拉斯凱吉藝術成就上的轉捩點", # OK except that it doesn't separate 如果說
    "吳宇森正計劃拍攝一部二次大戰的電影《Ｗｉｎｄｔａｌｋｅｒｓ》", # OK (二次大戰 should not be separated)
    "雄立獅島式是炎黃萬世其無疆",
    "你好我的名字是傑夫",
    "不過成員練唱時投入的程度可不輸給一般專業合唱團",
    "你他媽到底在說我什麼，你這個小婊子？我會讓你知道我畢業於海豹突擊隊班，我曾參與過無數次對基地組織的秘密突襲，並確認殺死了 300 多人。我接受過大猩猩戰爭的訓練，我是整個美國武裝部隊中的頂級狙擊手。你對我來說什麼都不是，只是另一個目標。我會用地球上從未見過的精確度把你他媽擦掉，記住我他媽的話。你認為你可以在互聯網上對我說那些狗屎嗎？再想想，混蛋。在我們說話的時候，我正在聯繫我在美國的秘密間諜網絡，你的 IP 正在被追踪，所以你最好為風暴做好準備，蛆蟲。這場風暴會摧毀你稱之為生命的可悲小東西。你他媽死定了，孩子。我可以在任何地方，任何時間，我可以用七百多種方式殺死你，而這只是我的徒手。我不僅在徒手格斗方面受過廣泛的訓練，而且我還可以使用美國海軍陸戰隊的整個武器庫，我會盡其所能地使用它來將你的悲慘屁股從大陸上抹去，你這個小混蛋。如果你能知道你那小小的“聰明”評論會給你帶來什麼樣的邪惡報應，也許你會忍住你的舌頭。但你不能，你沒有，現在你要付出代價，你這個該死的白痴。我會在你身上發火，你會淹死的。你他媽死定了，孩子。"[:187],
    "你瞅啥！瞅你咋地！再瞅一个试试！试试就试试！",
]

test_sentence = test_sentences[10]
test_sentence_sequence = tokenizer.texts_to_sequences([test_sentence])[0]
test_sentence_sequence_padded = tf.keras.preprocessing.sequence.pad_sequences([test_sentence_sequence],
                                                                              maxlen=max_sentence_length)[0]

actual_pred_start_idx = max_sentence_length - len(test_sentence)
test_preds = model.predict(test_sentence_sequence_padded[tf.newaxis, ...])[0, actual_pred_start_idx:]
probabilities = tf.nn.softmax(test_preds)
skip_array = tf.argmax(probabilities, axis=-1)

segment_sentence(test_sentence, skip_array)

'你 瞅 啥 ！ 瞅 你 咋 地 ！ 再 瞅 一 个 试 试 ！ 试 试 就 试 试 ！ '