In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

In [None]:
# Only let TensorFlow's Python logger emit messages at ERROR level or above.
tf.get_logger().setLevel('ERROR')
# Current working directory as a pathlib.Path.
pwd = pathlib.Path.cwd()

In [None]:
# Remove any checkout left over from a previous run, then clone the corpus
# repository afresh so re-running this cell does not fail on an existing directory.
!rm -rf nlp_data
!git clone https://github.com/lquyet/nlp_data.git

In [None]:
# Load the two sides of the corpus. `from_tensor_slices` below pairs the files
# by index, so line i of the .lo file goes with line i of the .vi file.
with open("/kaggle/working/nlp_data/v5/train2023_cleaned.vi", "r", encoding="utf-8") as f:
    # Strip only the line terminator: `readlines()` would leave a trailing
    # "\n" on every example, which then shows up in printed/decoded samples.
    vi = [line.rstrip("\n") for line in f]

with open("/kaggle/working/nlp_data/v5/train2023_cleaned.lo", "r", encoding="utf-8") as f:
    lo = [line.rstrip("\n") for line in f]

# Fail early with a readable message instead of an opaque TensorFlow shape
# error when the two files do not have the same number of lines.
if len(lo) != len(vi):
    raise ValueError(
        f"Parallel corpus is misaligned: {len(lo)} Lao lines vs {len(vi)} Vietnamese lines"
    )

# Each dataset element is a (lao, vietnamese) pair of scalar string tensors.
train_examples = tf.data.Dataset.from_tensor_slices((lo, vi))

In [None]:
# Peek at the first sentence pair. The loop variables deliberately do not reuse
# the names `lo` / `vi`, so the corpus lists are not replaced by tensors.
for lo_sample, vi_sample in train_examples.take(1):
  print("Laos: ", lo_sample.numpy().decode('utf-8'))
  print("Viet:   ", vi_sample.numpy().decode('utf-8'))

In [None]:
# Split the first 100000 sentence pairs into one single-language dataset per
# side; each element of `train_examples` is a (lao, vietnamese) pair.
train_lo = train_examples.take(100000).map(lambda src, tgt: src)
train_vi = train_examples.take(100000).map(lambda src, tgt: tgt)

In [None]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [None]:
# Options forwarded to `text.BertTokenizer`.
# NOTE(review): the TensorFlow Text docs describe lower_case=True as also
# applying NFD normalization and stripping accent marks -- confirm this is
# acceptable for Vietnamese/Lao text, which relies on combining marks.
bert_tokenizer_params = {"lower_case": True}

# Special tokens. The START/END ids used elsewhere in this notebook are derived
# from the positions of "[START]" and "[END]" in this list, so order matters.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Keyword arguments shared by both vocabulary-learning calls.
bert_vocab_args = {
    # The target vocabulary size
    "vocab_size": 32000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

In [None]:
%%time
# Learn the Lao WordPiece vocabulary from the Lao side of the training subset,
# fed in batches of 1000 lines with a prefetch buffer of 2 batches.
lo_vocab = bert_vocab.bert_vocab_from_dataset(
    train_lo.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [None]:
# Spot-check the Lao vocabulary: the head, two interior windows, and the tail.
for _window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
  print(lo_vocab[_window])

In [None]:
def write_vocab_file(filepath, vocab):
  """Write a vocabulary to disk, one token per line, encoded as UTF-8.

  Args:
    filepath: Destination path; an existing file is overwritten.
    vocab: Iterable of tokens, written in iteration order.
  """
  # The encoding is pinned so non-ASCII tokens are written the same way
  # regardless of the platform's default locale encoding.
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

In [None]:
# Persist the Lao vocabulary; the tokenizers built later read it from this path.
write_vocab_file('lo_vocab.txt', lo_vocab)

In [None]:
%%time
# Learn the Vietnamese WordPiece vocabulary from the Vietnamese side of the
# training subset, fed in batches of 1000 lines with a prefetch buffer of 2.
vi_vocab = bert_vocab.bert_vocab_from_dataset(
    train_vi.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [None]:
# Spot-check the Vietnamese vocabulary: the head, two interior windows, and the tail.
for _window in (slice(None, 10), slice(100, 110), slice(1000, 1010), slice(-10, None)):
  print(vi_vocab[_window])

In [None]:
# Persist the Vietnamese vocabulary; the tokenizers built later read it from this path.
write_vocab_file('vi_vocab.txt', vi_vocab)

In [None]:
ls *.txt

In [None]:
# Build one BertTokenizer per language from the vocabulary files written above,
# using the same tokenizer options that were used to learn the vocabularies.
lo_tokenizer = text.BertTokenizer('lo_vocab.txt', **bert_tokenizer_params)
vi_tokenizer = text.BertTokenizer('vi_vocab.txt', **bert_tokenizer_params)

In [None]:
# Take the first batch of three sentence pairs and print the Vietnamese side.
# `vi_examples` stays bound after the loop and is reused by later cells.
for lo_examples, vi_examples in train_examples.batch(3).take(1):
  for ex in vi_examples:
    print(ex.numpy().decode("utf-8"))

In [None]:
# Tokenize the examples -> (batch, word, word-piece)
# Tokenize the examples -> (batch, word, word-piece), then merge the word and
# word-piece axes in the same expression -> (batch, tokens).
token_batch = vi_tokenizer.tokenize(vi_examples).merge_dims(-2, -1)

# Show the token ids of each example as a plain Python list.
for ex in token_batch.to_list():
  print(ex)

In [None]:
# Lookup each token id in the vocabulary.
# Look up each token id in the vocabulary.
txt_tokens = tf.gather(vi_vocab, token_batch)
# Join each row's word pieces with spaces; as the cell's last expression the
# result is displayed.
tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1)

In [None]:
# Let the tokenizer convert the ids back into words, then join each row with
# spaces for display.
words = vi_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

In [None]:
# Positions of "[START]" and "[END]" within `reserved_tokens`, used as their
# token ids: argmax over the boolean comparison returns the index of the match.
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  """Wrap every row of `ragged` with the START and END token ids.

  Args:
    ragged: Batch of token ids; its first dimension is the number of rows.

  Returns:
    The input with one START column prepended and one END column appended
    along axis 1.
  """
  # One marker per row, shaped [rows, 1] so it concatenates along axis 1.
  column_shape = [ragged.bounding_shape()[0], 1]
  start_column = tf.fill(column_shape, START)
  end_column = tf.fill(column_shape, END)
  return tf.concat([start_column, ragged, end_column], axis=1)

In [None]:
# Detokenize the batch after wrapping each row with START/END ids, then join
# each row with spaces for display.
words = vi_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

In [None]:
def cleanup_text(reserved_tokens, token_txt):
  """Remove reserved tokens from detokenized text and join rows into strings.

  Args:
    reserved_tokens: Special token strings; every one except "[UNK]" is dropped.
    token_txt: String tensor of tokens whose last axis is joined.

  Returns:
    String tensor with the last axis joined by single spaces.
  """
  # Alternation of the escaped reserved tokens; "[UNK]" is kept in the output.
  droppable_pattern = "|".join(
      re.escape(tok) for tok in reserved_tokens if tok != "[UNK]")

  # Keep only the cells that are not, in their entirety, a reserved token.
  is_reserved = tf.strings.regex_full_match(token_txt, droppable_pattern)
  kept = tf.ragged.boolean_mask(token_txt, ~is_reserved)

  # Join them into strings.
  return tf.strings.reduce_join(kept, separator=' ', axis=-1)

In [None]:
# Show the raw Vietnamese example batch for comparison with the round trip below.
vi_examples.numpy()

In [None]:
# Tokenize the examples into flat (batch, tokens) ids and detokenize them again.
token_batch = vi_tokenizer.tokenize(vi_examples).merge_dims(-2,-1)
words = vi_tokenizer.detokenize(token_batch)
# Last expression of the cell: display the detokenized words.
words

In [None]:
# Drop the reserved tokens (except "[UNK]") and join each row into one string.
cleanup_text(reserved_tokens, words).numpy()

In [None]:
class CustomTokenizer(tf.Module):
  """Exportable wrapper around `text.BertTokenizer` for a single language.

  Bundles the tokenizer, its vocabulary file and the reserved-token handling so
  the whole object can be saved with `tf.saved_model.save` and reloaded.
  """

  def __init__(self, reserved_tokens, vocab_path):
    """Build the tokenizer and trace the functions that will be exported.

    Args:
      reserved_tokens: List of special token strings; all except "[UNK]" are
        removed from detokenized output.
      vocab_path: Path to a vocabulary file with one token per line.
    """
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # Decode explicitly as UTF-8 so non-ASCII tokens are read the same way
    # regardless of the platform's default locale encoding.
    vocab = pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shape [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Convert a batch of strings to token ids wrapped in START/END ids."""
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Convert token ids back to text, one joined string per row.

    Reserved tokens other than "[UNK]" are removed by `cleanup_text`.
    """
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Return the vocabulary entry for each token id."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Return the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Return the vocabulary file tracked as a saved-model asset."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Return the reserved tokens as a constant string tensor."""
    return tf.constant(self._reserved_tokens)

In [None]:
# Container module holding one CustomTokenizer per language, so both can be
# saved together as a single object.
tokenizers = tf.Module()
tokenizers.lo = CustomTokenizer(reserved_tokens, 'lo_vocab.txt')
tokenizers.vi = CustomTokenizer(reserved_tokens, 'vi_vocab.txt')

In [None]:
# Directory name for the export; also reused below when zipping the result.
model_name = 'btl_nlp_lao_viet'
tf.saved_model.save(tokenizers, model_name)

In [None]:
# Reload the export and query the Vietnamese vocabulary size as a sanity check.
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.vi.get_vocab_size().numpy()

In [None]:
# Tokenize a one-sentence batch with the reloaded Vietnamese tokenizer and
# show the resulting ids.
tokens = reloaded_tokenizers.vi.tokenize(['Xin chao Viet Nam'])
tokens.numpy()

In [None]:
# Map each id back to its vocabulary entry to see the individual word pieces.
text_tokens = reloaded_tokenizers.vi.lookup(tokens)
text_tokens

In [None]:
# Detokenize the ids back into text with the reloaded tokenizer.
round_trip = reloaded_tokenizers.vi.detokenize(tokens)

# The batch holds a single sentence, so print its first (only) entry.
print(round_trip.numpy()[0].decode('utf-8'))

In [None]:
# Archive the export directory; `{model_name}` is interpolated from the Python variable.
!zip -r {model_name}.zip {model_name}

In [None]:
# Report the size of the zip archive(s) in the working directory.
!du -h *.zip