In [1]:
#!pip install --upgrade sentencepiece

In [2]:
import pickle
import sentencepiece as spm
from time import time

In [3]:
# Path template for the trained models; '{}' is filled in with the vocabulary size.
MODEL_SAVE='/home/kvassay/data/z/models/sentencepiece/sp_lc_{}' 
# Pickled dataset; it is unpacked into train / dev / (unused) parts when loaded.
DATASET='/home/kvassay/data/z/data/reviews_train_test_dev1.pickle'
# Plain-text file the training texts are written to and that the trainer reads from.
TMP_DATASET_FILE='/tmp/spt_train.txt'
# One model is trained for each of these vocabulary sizes.
VOCAB_SIZES=[100,500,700,1000,2000]

## Load data

In [4]:
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# this with a DATASET file that comes from a trusted source.
# The pickle unpacks into three objects; the third one is discarded here.
with open(DATASET,'rb') as f:
    train,dev,_=pickle.load(f)

## Training helper functions

In [5]:
def prepare_training_file(dataset, save_path, limit_data=None):
    """Write the lowercased texts of ``dataset`` to ``save_path``, one per line.

    Args:
        dataset: iterable of records; every truthy record must provide a
            ``'text'`` string. Falsy records (``None``, empty dicts) are skipped.
        save_path: path of the text file to create or overwrite.
        limit_data: maximum number of lines to write (applied after falsy
            records are skipped); ``None`` writes all of them.
    """
    lines = [record['text'].lower() + '\n' for record in dataset if record]
    # Write to the path the caller asked for (previously the argument was
    # ignored and a module-level constant was used instead). UTF-8 is set
    # explicitly so the file does not depend on the platform default encoding.
    with open(save_path, 'w', encoding='utf-8') as f:
        # Slicing with None keeps every line, so no sentinel limit is needed.
        f.writelines(lines[:limit_data])
        
def train_sentpiece(dataset_path, vocab_size, save_path):
    """Train a SentencePiece model on a text file.

    Args:
        dataset_path: path of the training text file, passed as ``--input``.
        vocab_size: target vocabulary size, passed as ``--vocab_size``.
        save_path: output prefix, passed as ``--model_prefix``.
    """
    trainer_flags = '--input={} --model_prefix={} --vocab_size={}'.format(
        dataset_path, save_path, vocab_size)
    spm.SentencePieceTrainer.Train(trainer_flags)
        

## Prepare training dataset

In [6]:
# Dump the lowercased training texts to the temporary training file; no limit on the number of records.
prepare_training_file(train,TMP_DATASET_FILE, limit_data=None)

## Train models

In [7]:
# Train one model per configured vocabulary size and report how long each run took.
for vocab_size in VOCAB_SIZES:
    model_prefix = MODEL_SAVE.format(vocab_size)
    start = time()
    train_sentpiece(TMP_DATASET_FILE, vocab_size, model_prefix)
    end = time()
    report = 'Training {} tokens model took {} seconds.'.format(vocab_size, end - start)
    print(report)

Training 100 tokens model took 179.31135082244873 seconds.
Training 500 tokens model took 173.569965839386 seconds.
Training 700 tokens model took 172.5167191028595 seconds.
Training 1000 tokens model took 172.21199893951416 seconds.
Training 2000 tokens model took 169.3970229625702 seconds.


## Sanity check

In [67]:
# Model file used for the sanity check below.
# NOTE(review): presumably the artifact written by the training loop for vocab_size=2000 — confirm.
test_model='/home/kvassay/data/z/models/sentencepiece/sp_lc_2000.model'

In [68]:
# Index of the dev-set record used as the sanity-check example.
ID=1951

In [69]:
# Show the original text of the selected dev record before tokenization.
dev[ID]['text']

'My dog eats Nature\'s Recipe "Easy to Digest" dry food everyday. It really helped with her digestive issues. I thought the wet food version would be okay, but it made me completely ill. She was dry-heaving, had major gas issues and was quite lethargic. About 12 hours after her final dose of it, all of her symptoms disappeared. The dry food is still great, but this wet food is really awful.'

In [70]:
# Create a processor and load the model file chosen for the sanity check;
# the result of Load is displayed as the cell output.
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(test_model)

True

#### Encode

In [71]:
# Lowercase the text first, matching how the training file was prepared.
tokenizer.EncodeAsPieces(dev[ID]['text'].lower())

['▁my',
 '▁dog',
 '▁eat',
 's',
 '▁nature',
 "'",
 's',
 '▁recipe',
 '▁"',
 'ea',
 's',
 'y',
 '▁to',
 '▁digest',
 '"',
 '▁dry',
 '▁food',
 '▁everyday',
 '.',
 '▁it',
 '▁really',
 '▁help',
 'ed',
 '▁with',
 '▁her',
 '▁digest',
 'ive',
 '▁issues',
 '.',
 '▁i',
 '▁thought',
 '▁the',
 '▁we',
 't',
 '▁food',
 '▁version',
 '▁would',
 '▁be',
 '▁okay',
 ',',
 '▁but',
 '▁it',
 '▁made',
 '▁me',
 '▁completely',
 '▁i',
 'll',
 '.',
 '▁she',
 '▁was',
 '▁dry',
 '-',
 'he',
 'a',
 'v',
 'ing',
 ',',
 '▁had',
 '▁major',
 '▁ga',
 's',
 '▁issues',
 '▁and',
 '▁was',
 '▁quite',
 '▁let',
 'h',
 'ar',
 'g',
 'ic',
 '.',
 '▁about',
 '▁12',
 '▁hours',
 '▁after',
 '▁her',
 '▁fi',
 'n',
 'al',
 '▁do',
 'se',
 '▁of',
 '▁it',
 ',',
 '▁all',
 '▁of',
 '▁her',
 '▁',
 's',
 'y',
 'mp',
 'to',
 'm',
 's',
 '▁dis',
 'a',
 'pp',
 'ear',
 'ed',
 '.',
 '▁the',
 '▁dry',
 '▁food',
 '▁is',
 '▁still',
 '▁great',
 ',',
 '▁but',
 '▁this',
 '▁we',
 't',
 '▁food',
 '▁is',
 '▁really',
 '▁awful',
 '.']

#### Decode

In [29]:
# Round-trip check: encode the text into pieces and decode them back into a string.
# NOTE(review): unlike the Encode cell, the text is not lowercased here although
# the training file was lowercased — confirm this is intended.
tokenizer.DecodePieces(tokenizer.EncodeAsPieces(dev[ID]['text']))

'My dog eats Nature\'s Recipe "Easy to Digest" dry food everyday. It really helped with her digestive issues. I thought the wet food version would be okay, but it made me completely ill. She was dry-heaving, had major gas issues and was quite lethargic. About 12 hours after her final dose of it, all of her symptoms disappeared. The dry food is still great, but this wet food is really awful.'