# Tokenization (English)

In [1]:
# English word segmentation: lower-cases the sentence, strips punctuation
# and splits it on spaces.
from tensorflow.keras.preprocessing.text import text_to_word_sequence

sentence = "I love jogging, and you?"
words = text_to_word_sequence(sentence)
print("英文斷詞：", words)

英文斷詞： ['i', 'love', 'jogging', 'and', 'you']


# Tokenization (Chinese)

In [2]:
# Jieba
# Spacy (工業級, 使用BERT為基礎)

# Install jieba（結巴）
!pip install jieba

# Get the Tokenization Dictionary for Traditional Chinese
import os
Dictionary_File = 'dict.txt.big'

if not os.path.isfile(Dictionary_File):
    os.system('wget https://raw.githubusercontent.com/cnchi/datasets/master/' + Dictionary_File)

# Get the Stop Words File for Traditional Chinese
# 了, 吧, 啦, etc.
StopWords_File = "stopWords_big5.txt"

if not os.path.isfile(StopWords_File):
    os.system('wget https://raw.githubusercontent.com/cnchi/datasets/master/' + StopWords_File)



In [3]:
import jieba

# Set Dictionary for Traditional Chinese
# (left disabled: jieba then builds its prefix dict from its default dictionary)
# jieba.set_dictionary(Dictionary_File)

# Tokenization
result = list(jieba.cut("我喜歡跑步，你呢？"))
print("中文斷詞（有標點）：", result)

# Remove punctuation, digits and whitespace.
# The backslash is written as "\\" because "\(" is not a valid escape
# sequence; the resulting set of characters is unchanged.
punctuation = set("$!&#%\\()+-*/_,. 　?:;'\"<=>^`|~[]{}’0123456789?_“”、。《》！，：；？「」（）")
print("中文斷詞（無標點）：", [word for word in result if word not in punctuation])

# Remove Stop Words from Files
# One stop word per line; surrounding whitespace (including the trailing \n)
# is stripped and blank lines are skipped.
with open(StopWords_File, "rt", encoding="utf-8") as f:
    stopWords = {line.strip() for line in f if line.strip()}
print("中文斷詞（更精簡）：", [word for word in result if word not in stopWords])

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.746 seconds.
DEBUG:jieba:Loading model cost 0.746 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


中文斷詞（有標點）： ['我', '喜歡', '跑步', '，', '你', '呢', '？']
中文斷詞（無標點）： ['我', '喜歡', '跑步', '你', '呢']
中文斷詞（更精簡）： ['喜歡', '跑步']


# Text to Integer Sequences

In [4]:
# Create a Tokenizer object
from tensorflow.keras.preprocessing.text import Tokenizer

tk = Tokenizer(
        num_words=None,     # no cap on the vocabulary size
        # Characters removed from the text before splitting. The backslash is
        # written as "\\" because "\]" is not a valid escape sequence; the
        # resulting string is unchanged.
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,         # lower-case the text
        split=' ',          # words are separated by spaces
        char_level=False,   # tokens are words, not single characters
        oov_token='NiD'     # placeholder for words missing from the vocabulary
    )

In [5]:
# Build the vocabulary (word <-> index mapping) from a corpus
corpus = [
    "I love jogging, and you?",
    "I love reading!",
]
tk.fit_on_texts(corpus)

# Show the mapping table in both directions:
# first WORD -> NUMBER, then NUMBER -> WORD
for mapping in (tk.word_index, tk.index_word):
    print(mapping)

{'NiD': 1, 'i': 2, 'love': 3, 'jogging': 4, 'and': 5, 'you': 6, 'reading': 7}
{1: 'NiD', 2: 'i', 3: 'love', 4: 'jogging', 5: 'and', 6: 'you', 7: 'reading'}


In [6]:
# Sentences to encode; "too" does not occur in the corpus the tokenizer
# was fitted on.
input_text = [
    "I love jogging!",
    "and I love reading, too!",
]

# Text -> sequences of integer indices
seq = tk.texts_to_sequences(input_text)
print(seq)

# Sequences of integer indices -> text
text = tk.sequences_to_texts(seq)
print(text)

[[2, 3, 4], [5, 2, 3, 7, 1]]
['i love jogging', 'and i love reading NiD']


# Sequence Padding

In [7]:
# Bring every sequence to the same length
from tensorflow.keras.preprocessing.sequence import pad_sequences

padding_options = dict(
    maxlen=5,            # target length of every sequence
    dtype="int32",
    padding="pre",       # pad the front
    truncating="post",   # truncate the tail
    value=0,             # filler used for padding
)
padded_seq = pad_sequences(seq, **padding_options)

print(padded_seq)

[[0 0 2 3 4]
 [5 2 3 7 1]]


# Encoding

In [8]:
# One-Hot Encoding
from tensorflow.keras.utils import to_categorical

print("獨熱編碼 -------------")
print(to_categorical(padded_seq))

獨熱編碼 -------------
[[[1. 0. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [9]:
# Multi-Hot Encoding
print("多熱編碼 -------------")
print(tk.texts_to_matrix(input_text))

多熱編碼 -------------
[[0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 1. 1. 1. 0. 1. 0. 1.]]


In [10]:
# Word Embedding
import tensorflow as tf
from tensorflow.keras import layers

# Index 0 is the padding value and word indices start at 1, so the lookup
# table needs len(word_index) + 1 rows (8 for the corpus fitted earlier).
vocab_size = len(tk.word_index) + 1
embedding_dim = 3   # length of each word vector

# The layer's default initializer is an unseeded RandomUniform, which prints
# different vectors on every run; seeding it makes the output reproducible.
emb = layers.Embedding(
        vocab_size,
        embedding_dim,
        embeddings_initializer=tf.keras.initializers.RandomUniform(seed=42)
    )

# tf.constant(): Convert immediate values into tensor
result = emb(tf.constant(padded_seq))
print("詞向量嵌入 -------------")
print(result.numpy())

詞向量嵌入 -------------
[[[-0.04549634  0.04842453 -0.01803239]
  [-0.04549634  0.04842453 -0.01803239]
  [-0.02587713  0.00374747  0.01315535]
  [ 0.03545367 -0.01526957 -0.01657917]
  [ 0.01964043 -0.0327925   0.04979056]]

 [[ 0.04811317 -0.03244107 -0.04246626]
  [-0.02587713  0.00374747  0.01315535]
  [ 0.03545367 -0.01526957 -0.01657917]
  [-0.01841354 -0.03726889 -0.04098666]
  [ 0.03419845  0.01358436 -0.0463225 ]]]
