In [1]:
#!pip install --upgrade sentencepiece

In [2]:
import pickle
import sentencepiece as spm
from time import time

In [3]:
# Model prefix template; '{}' is filled with the vocabulary size.
MODEL_SAVE = '/home/kvassay/data/z/models/sentencepiece/sp_lc_{}'
# Pickled dataset that the notebook unpacks into three parts.
DATASET = '/home/kvassay/data/z/data/reviews_train_test_dev1.pickle'
# Scratch text file that the training texts are dumped into.
TMP_DATASET_FILE = '/tmp/spt_train.txt'
# One model is trained for each of these vocabulary sizes.
VOCAB_SIZES = [8000, 16000, 32000, 64000]

## Load data

In [4]:
# NOTE: unpickling can execute arbitrary code - only load trusted files.
# The pickle holds three parts; the third one is not used in this notebook.
with open(DATASET, mode='rb') as dataset_file:
    train, dev, _ = pickle.load(dataset_file)

## Define training helpers

In [5]:
def prepare_training_file(dataset, save_path, limit_data=None):
    """Dump lowercased record texts into a plain-text training file.

    Each kept record contributes its lowercased ``'text'`` value followed by
    a newline.

    Args:
        dataset: iterable of records; every truthy record must support
            ``record['text']`` returning a string. Falsy records (``None``,
            empty dicts, ...) are skipped.
        save_path: path of the text file to (over)write.
        limit_data: optional cap on the number of kept records, applied
            after falsy records are dropped. ``None`` keeps all of them.
    """
    # Slice semantics are preserved; [:None] simply means "no limit", which
    # replaces the previous magic constant 10000000000.
    records = [record for record in dataset if record][:limit_data]
    # Fix: honour ``save_path`` instead of the global TMP_DATASET_FILE, and
    # write UTF-8 explicitly so the result does not depend on the locale.
    with open(save_path, 'w', encoding='utf-8') as f:
        f.writelines(record['text'].lower() + '\n' for record in records)
        
def train_sentpiece(dataset_path, vocab_size, save_path):
    """Train a SentencePiece model through the flag-string trainer call.

    Args:
        dataset_path: value passed as the trainer's ``--input`` flag.
        vocab_size: value passed as the ``--vocab_size`` flag.
        save_path: value passed as the ``--model_prefix`` flag.
    """
    flag_template = '--input={} --model_prefix={} --vocab_size={}'
    trainer_flags = flag_template.format(dataset_path, save_path, vocab_size)
    spm.SentencePieceTrainer.Train(trainer_flags)
        

## Prepare training dataset

In [6]:
# Dump every training text (no record limit) into the scratch file.
prepare_training_file(dataset=train, save_path=TMP_DATASET_FILE)

## Train models

In [7]:
# Train one model per configured vocabulary size and report the wall-clock time.
for vocab_size in VOCAB_SIZES:
    model_prefix = MODEL_SAVE.format(vocab_size)
    started_at = time()
    train_sentpiece(TMP_DATASET_FILE, vocab_size, model_prefix)
    elapsed = time() - started_at
    print('Training {} tokens model took {} seconds.'.format(vocab_size, elapsed))

Training 8000 tokens model took 169.7974009513855 seconds.
Training 16000 tokens model took 159.92341208457947 seconds.
Training 32000 tokens model took 156.61435079574585 seconds.
Training 64000 tokens model took 153.4472267627716 seconds.


## Sanity check

In [19]:
# Model file loaded for the sanity check below.
test_model = '/home/kvassay/data/z/models/sentencepiece/sp_lc_64000.model'

In [20]:
# Index of the dev record used as the sanity-check example.
ID = 1951

In [21]:
# Display the raw (not lowercased) text of the selected dev record.
dev[ID]['text']

'My dog eats Nature\'s Recipe "Easy to Digest" dry food everyday. It really helped with her digestive issues. I thought the wet food version would be okay, but it made me completely ill. She was dry-heaving, had major gas issues and was quite lethargic. About 12 hours after her final dose of it, all of her symptoms disappeared. The dry food is still great, but this wet food is really awful.'

In [22]:
# Create a processor and load the trained model file into it.
# Load() is the last expression, so its result is what the cell displays.
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(test_model)

True

#### Encode

In [23]:
# Lowercase the text before encoding, mirroring how the training file was
# written (texts were lowercased when the training data was dumped).
tokenizer.EncodeAsPieces(dev[ID]['text'].lower())

['▁my',
 '▁dog',
 '▁eats',
 '▁nature',
 "'",
 's',
 '▁recipe',
 '▁"',
 'easy',
 '▁to',
 '▁digest',
 '"',
 '▁dry',
 '▁food',
 '▁everyday',
 '.',
 '▁it',
 '▁really',
 '▁helped',
 '▁with',
 '▁her',
 '▁digestive',
 '▁issues',
 '.',
 '▁i',
 '▁thought',
 '▁the',
 '▁wet',
 '▁food',
 '▁version',
 '▁would',
 '▁be',
 '▁okay',
 ',',
 '▁but',
 '▁it',
 '▁made',
 '▁me',
 '▁completely',
 '▁i',
 'll',
 '.',
 '▁she',
 '▁was',
 '▁dry',
 '-',
 'he',
 'aving',
 ',',
 '▁had',
 '▁major',
 '▁gas',
 '▁issues',
 '▁and',
 '▁was',
 '▁quite',
 '▁lethargic',
 '.',
 '▁about',
 '▁12',
 '▁hours',
 '▁after',
 '▁her',
 '▁final',
 '▁dose',
 '▁of',
 '▁it',
 ',',
 '▁all',
 '▁of',
 '▁her',
 '▁symptoms',
 '▁disappeared',
 '.',
 '▁the',
 '▁dry',
 '▁food',
 '▁is',
 '▁still',
 '▁great',
 ',',
 '▁but',
 '▁this',
 '▁wet',
 '▁food',
 '▁is',
 '▁really',
 '▁awful',
 '.']

#### Decode

In [24]:
# Round trip: encode the text into pieces, then decode the pieces back.
# NOTE(review): the text is NOT lowercased here, unlike the encode cell and
# the lowercased training data - confirm that this is intended.
tokenizer.DecodePieces(tokenizer.EncodeAsPieces(dev[ID]['text']))

'My dog eats Nature\'s Recipe "Easy to Digest" dry food everyday. It really helped with her digestive issues. I thought the wet food version would be okay, but it made me completely ill. She was dry-heaving, had major gas issues and was quite lethargic. About 12 hours after her final dose of it, all of her symptoms disappeared. The dry food is still great, but this wet food is really awful.'