In [1]:
#!pip install --upgrade sentencepiece

In [7]:
import pickle
import sentencepiece as spm
from time import time

In [8]:
MODEL_SAVE='/Users/matejkvassay/data/sz/models/sentencepiece/sp_lc_{}' 
DATASET='/Users/matejkvassay/data/sz/data/reviews_train_test_dev1.pickle'
TMP_DATASET_FILE='/tmp/spt_train.txt'
VOCAB_SIZES=[1000,8000]

## Load data

In [9]:
with open(DATASET,'rb') as f:
    train,dev,_=pickle.load(f)

## Train model

In [5]:
def prepare_training_file(dataset, save_path, limit_data=None):
    with open(TMP_DATASET_FILE, 'w') as f:
        if limit_data is None:
            limit=10000000000
        else:
            limit=limit_data
        f.writelines([x['text'].lower()+'\n' for x in dataset if x][:limit])
        
def train_sentpiece(dataset_path, vocab_size, save_path):
    spm.SentencePieceTrainer.Train(
        '--input={} --model_prefix={} --vocab_size={}'.format(
            dataset_path,
            save_path,
            vocab_size))
        

## Prepare training dataset

In [6]:
prepare_training_file(train,TMP_DATASET_FILE, limit_data=None)

## Train models

In [7]:
for vocab_size in VOCAB_SIZES:
    start=time()
    train_sentpiece(TMP_DATASET_FILE, vocab_size, MODEL_SAVE.format(vocab_size))
    end=time()
    print('Training {} tokens model took {} seconds.'.format(vocab_size,end-start))

Training 1000 tokens model took 287.24037885665894 seconds.
Training 8000 tokens model took 235.29969000816345 seconds.


## Sanity check

In [25]:
test_model='/Users/matejkvassay/data/sz/models/sentencepiece/sp_8000.model'

In [32]:
ID=1151

In [33]:
dev[ID]['text']

'I got this for my wife. She likes to cook Pamelas mixes with molasses. It must be good because she eats it all the time. The price was awesome.\n\n\n\nTwo antlers up.'

In [34]:
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(test_model)

True

#### Encode

In [35]:
tokenizer.EncodeAsPieces(dev[ID]['text'].lower())

['▁i',
 '▁got',
 '▁this',
 '▁for',
 '▁my',
 '▁wife',
 '.',
 '▁she',
 '▁likes',
 '▁to',
 '▁cook',
 '▁pa',
 'me',
 'la',
 's',
 '▁mixes',
 '▁with',
 '▁molasses',
 '.',
 '▁it',
 '▁must',
 '▁be',
 '▁good',
 '▁because',
 '▁she',
 '▁eats',
 '▁it',
 '▁all',
 '▁the',
 '▁time',
 '.',
 '▁the',
 '▁price',
 '▁was',
 '▁awesome',
 '.',
 '▁two',
 '▁antler',
 's',
 '▁up',
 '.']

#### Decode

In [36]:
tokenizer.DecodePieces(tokenizer.EncodeAsPieces(dev[ID]['text']))

'I got this for my wife. She likes to cook Pamelas mixes with molasses. It must be good because she eats it all the time. The price was awesome. Two antlers up.'