In [1]:
%load_ext autoreload
%autoreload 2

import os
import io
import tempfile
import pandas as pd
import codecs
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer

In [2]:
# Locations of the bsd_ja_en training corpora and of the SentencePiece output
# directory. Nothing is read here; this cell only builds paths.
DATA_DIR = "/mnt/dl/NLP/bsd_ja_en/data"
ja_fname = os.path.join(DATA_DIR, "train.ja")
en_fname = os.path.join(DATA_DIR, "train.en")
savepath = os.path.join(DATA_DIR, "sentencepiece")
# Create the output directory up front; exist_ok=True keeps re-runs of this cell harmless.
os.makedirs(savepath, exist_ok=True)

In [3]:
# Open a named temporary file in text read/write mode with an ".en-ja" suffix.
# NOTE(review): the handle is never closed in the cells shown here, and only
# its .name is inspected; confirm whether it is still needed.
temp = tempfile.NamedTemporaryFile(suffix=".en-ja", mode="w+")

In [4]:
temp.name

'/tmp/tmpg7624l8y.en-ja'

In [5]:
# Train a SentencePiece model with a 1024-piece vocabulary on the English
# corpus and serialize it to en_model_fname.
en_model_fname = os.path.join(savepath, "train.en.m")
# The context manager closes (and therefore flushes) the binary file as soon
# as training finishes, so the model is completely on disk before any later
# cell loads it from en_model_fname.
with open(en_model_fname, "wb") as en_writer:
    # NOTE(review): with model_writer the trained model is delivered through
    # en_writer.write(); en_spm is kept only so the name stays defined.
    # Confirm whether train() returns anything useful in this mode.
    en_spm = SentencePieceTrainer.train(
        input=en_fname,
        vocab_size=1024,
        model_writer=en_writer,
    )

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /mnt/dl/NLP/bsd_ja_en/data/train.en
  input_format: 
  model_prefix: 
  model_type: UNIGRAM
  vocab_size: 1024
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privac

In [6]:
en_sp = SentencePieceProcessor(model_file=en_model_fname)

In [7]:
en_sp

<sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7f64a8440f00> >

In [8]:
en_sp.bos_id()

1

In [9]:
en_sp.unk_id()

0

In [10]:
# Read the first line of the English corpus as a sample sentence (the trailing
# newline is kept). The encoding is passed explicitly so decoding does not
# depend on the platform locale; assumes the corpus is UTF-8 — TODO confirm.
with open(en_fname, "r", encoding="utf-8") as f:
    sent = f.readline()

In [11]:
sent

"So let's pretend we have to export a product to Japan today.\n"

In [12]:
en_sp.get_piece_size()

1024

In [13]:
en_sp.GetPieceSize()

1024

In [14]:
# "unk" (no angle brackets) comes back as id 0, the id of the '<unk>' piece;
# apparently a string that is not a vocabulary piece resolves to the unknown id.
en_sp.PieceToId("unk")

0

In [15]:
# "bos" is not the name of the BOS piece: this lookup yields 0, while the real
# BOS piece is '<s>' with id 1 (see bos_id() and PieceToId("<s>")).
en_sp.PieceToId("bos")

0

In [16]:
en_sp.PieceToId("<s>"), en_sp.PieceToId("</s>")

(1, 2)

In [17]:
en_sp.PieceToId("<unk>")

0

In [18]:
en_sp.IdToPiece(0)

'<unk>'

In [19]:
en_sp.IdToPiece(100)

'▁Oh'

In [20]:
en_sp.PieceToId("▁go")

105

In [21]:
# Build a lookup from every piece string in the English model to its integer id.
vocab = {
    en_sp.IdToPiece(piece_id): piece_id
    for piece_id in range(en_sp.get_piece_size())
}

In [22]:
sorted(vocab)

['!',
 '"',
 "'",
 ',',
 '-',
 '.',
 '0',
 '00',
 '1',
 '2',
 '3',
 '4',
 '5',
 '9',
 '</s>',
 '<s>',
 '<unk>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'R',
 'S',
 'T',
 'U',
 'W',
 'Y',
 'a',
 'able',
 'ac',
 'ack',
 'age',
 'ake',
 'al',
 'ally',
 'am',
 'an',
 'ance',
 'ant',
 'ar',
 'ashi',
 'ate',
 'ation',
 'ative',
 'b',
 'body',
 'c',
 'ccording',
 'ce',
 'cha',
 'chi',
 'clock',
 'com',
 'ctual',
 'd',
 'dditional',
 'e',
 'ec',
 'ed',
 'el',
 'en',
 'end',
 'ent',
 'er',
 'ers',
 'es',
 'ever',
 'f',
 'fully',
 'g',
 'ge',
 'giving',
 'h',
 'ha',
 'house',
 'i',
 'ic',
 'id',
 'ies',
 'if',
 'ight',
 'il',
 'ill',
 'im',
 'in',
 'ing',
 'ion',
 'ir',
 'is',
 'ite',
 'ity',
 'ive',
 'ize',
 'j',
 'k',
 'ki',
 'l',
 'la',
 'le',
 'less',
 'li',
 'll',
 'lo',
 'ly',
 'm',
 'mail',
 'ment',
 'n',
 'nce',
 'ne',
 'o',
 'ock',
 'ok',
 'ol',
 'on',
 'op',
 'or',
 'ose',
 'ough',
 'ous',
 'out',
 'ow',
 'p',
 'per',
 'q',

In [23]:
sent

"So let's pretend we have to export a product to Japan today.\n"

In [24]:
en_sp.encode(sent)

[98, 162, 11, 4, 101, 50, 9, 400, 35, 36, 10, 785, 13, 189, 10, 434, 173, 3]

In [25]:
[en_sp.IdToPiece(i) for i in en_sp.encode(sent)]

['▁So',
 '▁let',
 "'",
 's',
 '▁p',
 're',
 't',
 'end',
 '▁we',
 '▁have',
 '▁to',
 '▁export',
 '▁a',
 '▁product',
 '▁to',
 '▁Japan',
 '▁today',
 '.']

In [26]:
en_sp.EncodeAsIds(sent)

[98, 162, 11, 4, 101, 50, 9, 400, 35, 36, 10, 785, 13, 189, 10, 434, 173, 3]

In [27]:
en_sp.EncodeAsPieces(sent)

['▁So',
 '▁let',
 "'",
 's',
 '▁p',
 're',
 't',
 'end',
 '▁we',
 '▁have',
 '▁to',
 '▁export',
 '▁a',
 '▁product',
 '▁to',
 '▁Japan',
 '▁today',
 '.']

In [28]:
en_sp.DecodePieces(en_sp.EncodeAsPieces(sent))

"So let's pretend we have to export a product to Japan today."

In [29]:
en_sp.DecodeIds(en_sp.encode(sent))

"So let's pretend we have to export a product to Japan today."

In [30]:
en_sp.SampleEncodeAsIds(sent, nbest_size=2)

[98, 162, 11, 4, 101, 50, 9, 400, 35, 36, 10, 785, 13, 189, 10, 434, 173, 3]

In [31]:
en_sp.SampleEncodeAsPieces(sent, nbest_size=2)

['▁So',
 '▁let',
 "'",
 's',
 '▁',
 'p',
 're',
 't',
 'end',
 '▁we',
 '▁have',
 '▁to',
 '▁export',
 '▁a',
 '▁product',
 '▁to',
 '▁Japan',
 '▁today',
 '.']

In [32]:
en_sp.SampleEncodeAsIds(sent, nbest_size=30)

[98,
 162,
 11,
 4,
 101,
 30,
 14,
 9,
 90,
 18,
 35,
 36,
 10,
 785,
 13,
 189,
 10,
 434,
 173,
 3]

In [33]:
en_sp.SampleEncodeAsPieces(sent, nbest_size=30)

['▁So',
 '▁let',
 "'",
 's',
 '▁',
 'p',
 're',
 't',
 'en',
 'd',
 '▁we',
 '▁have',
 '▁to',
 '▁export',
 '▁',
 'a',
 '▁product',
 '▁to',
 '▁Japan',
 '▁today',
 '.']

In [34]:
# Train a SentencePiece model with a 2048-piece vocabulary on the Japanese
# corpus and serialize it to ja_model_fname.
ja_model_fname = os.path.join(savepath, "train.ja.m")
# The context manager closes (and therefore flushes) the binary file as soon
# as training finishes, so the model is completely on disk before it is
# loaded back from ja_model_fname.
with open(ja_model_fname, "wb") as ja_writer:
    # NOTE(review): with model_writer the trained model is delivered through
    # ja_writer.write(); ja_spm is kept only so the name stays defined.
    # Confirm whether train() returns anything useful in this mode.
    ja_spm = SentencePieceTrainer.train(
        input=ja_fname,
        model_writer=ja_writer,
        vocab_size=1024 * 2,
    )

sentencepiece_trainer.cc(77) LOG(INFO) Starts training with : 
trainer_spec {
  input: /mnt/dl/NLP/bsd_ja_en/data/train.ja
  input_format: 
  model_prefix: 
  model_type: UNIGRAM
  vocab_size: 2048
  self_test_sample_size: 0
  character_coverage: 0.9995
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0
  differential_privac

In [35]:
ja_sp = SentencePieceProcessor()
ja_sp.Load(ja_model_fname)

True

In [36]:
ja_sp

<sentencepiece.SentencePieceProcessor; proxy of <Swig Object of type 'sentencepiece::SentencePieceProcessor *' at 0x7f64a8440a80> >

In [37]:
# Read the first line of the Japanese corpus as a sample sentence (the trailing
# newline is kept). The encoding is passed explicitly because decoding Japanese
# text with a non-UTF-8 locale default would fail or garble it; assumes the
# corpus is UTF-8 — TODO confirm.
with open(ja_fname, encoding="utf-8") as f:
    ja_sent = f.readline()

In [38]:
ja_sent

'では、今日日本へ商品を輸出すると仮定しましょう。\n'

In [39]:
ja_sp.encode(ja_sent)

[98, 5, 194, 205, 236, 145, 9, 1988, 53, 32, 14, 983, 203, 507, 4]

In [40]:
ja_sp.EncodeAsIds(ja_sent)

[98, 5, 194, 205, 236, 145, 9, 1988, 53, 32, 14, 983, 203, 507, 4]

In [41]:
ja_sp.EncodeAsPieces(ja_sent)

['▁では',
 '、',
 '今日',
 '日本',
 'へ',
 '商品',
 'を',
 '輸',
 '出',
 'する',
 'と',
 '仮',
 '定',
 'しましょう',
 '。']

In [42]:
ja_sp.DecodeIds(ja_sp.encode(ja_sent))

'では、今日日本へ商品を輸出すると仮定しましょう。'

In [43]:
ja_sp.DecodePieces(ja_sp.EncodeAsPieces(ja_sent))

'では、今日日本へ商品を輸出すると仮定しましょう。'

In [None]:
import fastBPE