In [1]:
#!pip install --upgrade sentencepiece

In [2]:
import pickle
import sentencepiece as spm
from time import time

In [3]:
# NOTE(review): the paths below are absolute and machine-specific; adjust them
# before running this notebook elsewhere.
# Prefix for trained model files; '{}' is filled in with the vocabulary size.
MODEL_SAVE='/home/kvassay/data/z/models/sentencepiece/sp_{}' 
# Pickled dataset; it is unpacked below as a 3-tuple (train, dev, unused).
DATASET='/home/kvassay/data/z/data/reviews_train_test_dev1.pickle'
# Plain-text file the training texts are dumped to and the trainer reads from.
TMP_DATASET_FILE='/tmp/spt_train.txt'
# One model is trained for each of these vocabulary sizes.
VOCAB_SIZES=[100,500,1000,8000]

## Load data

In [4]:
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point DATASET at a file from a trusted source.
# The pickle unpacks into three items; the third is discarded here.
with open(DATASET,'rb') as f:
    train,dev,_=pickle.load(f)

## Define training helpers

In [5]:
def prepare_training_file(dataset, save_path, limit_data=None):
    """Write the ``'text'`` field of each record to a plain-text file.

    Each kept record contributes its text followed by a single newline.
    Falsy records (e.g. ``None`` or an empty dict) are skipped before the
    limit is applied.

    Args:
        dataset: Iterable of mapping-like records with a ``'text'`` string.
        save_path: Path of the text file to create or overwrite.
        limit_data: Maximum number of (non-falsy) records to write, applied
            as a slice bound; ``None`` writes all of them.

    Returns:
        None. The only effect is the file written at ``save_path``.
    """
    # Slicing with None keeps everything, so no sentinel "huge number" is needed.
    kept_records = [record for record in dataset if record][:limit_data]
    # Write to the path the caller asked for (previously a module-level
    # constant was used and ``save_path`` was silently ignored). UTF-8 is set
    # explicitly so the result does not depend on the platform's default.
    with open(save_path, 'w', encoding='utf-8') as f:
        f.writelines(record['text'] + '\n' for record in kept_records)
        
def train_sentpiece(dataset_path, vocab_size, save_path):
    """Train a SentencePiece model by passing a flag string to the trainer.

    Args:
        dataset_path: Value passed as ``--input``.
        vocab_size: Value passed as ``--vocab_size``.
        save_path: Value passed as ``--model_prefix``.

    Returns:
        None. The trainer's return value is not propagated.
    """
    # The three values are interpolated verbatim into one space-separated
    # command-line style string.
    flag_template = '--input={} --model_prefix={} --vocab_size={}'
    trainer_args = flag_template.format(dataset_path, save_path, vocab_size)
    spm.SentencePieceTrainer.Train(trainer_args)
        

## Prepare training dataset

In [6]:
# Dump the training texts, with no record limit, into the temporary file that
# the trainer reads as its input.
prepare_training_file(train,TMP_DATASET_FILE, limit_data=None)

## Train models

In [7]:
# Train one model per configured vocabulary size and report how long each
# training run took (wall-clock seconds).
for vocab_size in VOCAB_SIZES:
    start=time()
    # The model prefix is MODEL_SAVE with the vocabulary size substituted in.
    train_sentpiece(TMP_DATASET_FILE, vocab_size, MODEL_SAVE.format(vocab_size))
    end=time()
    print('Training {} tokens model took {} seconds.'.format(vocab_size,end-start))

Training 100 tokens model took 188.0792109966278 seconds.
Training 500 tokens model took 179.54791069030762 seconds.
Training 1000 tokens model took 177.5086886882782 seconds.
Training 8000 tokens model took 169.26805591583252 seconds.


## Sanity check

In [21]:
# Model file to sanity-check; the path matches the MODEL_SAVE prefix for
# vocabulary size 500 plus a '.model' suffix.
test_model='/home/kvassay/data/z/models/sentencepiece/sp_500.model'

In [22]:
# Index of the dev-set record used as the sanity-check example below.
ID=1125

In [23]:
# Show the raw text of the chosen dev record for comparison with the
# encode/decode results further down.
dev[ID]['text']

'First off, I just have to say that this was a large bag of dog food, so if you\'ve got a smaller dog or a canine friend with a smaller appetite, this bag will go a long way. Also, be careful when carrying this, as the top of this is resealable (yay!) with a ziplock top and this could scratch your neck if you\'re not a little wary. (Just saying this because I ended up with a long scratch down my neck from carrying this on my shoulder and wanted others to avoid the same fate.)\n\n\n\nMy friend\'s canine is a little picky about what he eats, with dry dog food being his "last resort". I figured that if I could get this dog to eat and even remotely enjoy this food, then it would be a pretty good recommendation. Now while he didn\'t take to the Beneful like a fish to water or a cat to milk, he did eat it, which says quite a bit considering how he can be about food.\n\n\n\nI don\'t think that Beneful will become a regular addition to the dog\'s diet but that he eats this means that my friend

In [24]:
# Create a processor and load the trained model file into it; the cell output
# is the value returned by Load.
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(test_model)

True

#### Encode

In [25]:
# Split the example text into subword pieces (returned as a list of strings).
# NOTE(review): this displays the full piece list; consider slicing it to keep
# the output short.
tokenizer.EncodeAsPieces(dev[ID]['text'])

['▁F',
 'ir',
 's',
 't',
 '▁of',
 'f',
 ',',
 '▁I',
 '▁just',
 '▁have',
 '▁to',
 '▁sa',
 'y',
 '▁that',
 '▁this',
 '▁was',
 '▁a',
 '▁large',
 '▁bag',
 '▁of',
 '▁dog',
 '▁food',
 ',',
 '▁so',
 '▁if',
 '▁you',
 "'",
 've',
 '▁go',
 't',
 '▁a',
 '▁small',
 'er',
 '▁dog',
 '▁or',
 '▁a',
 '▁can',
 'ine',
 '▁f',
 'ri',
 'end',
 '▁with',
 '▁a',
 '▁small',
 'er',
 '▁a',
 'pp',
 'et',
 'it',
 'e',
 ',',
 '▁this',
 '▁bag',
 '▁will',
 '▁go',
 '▁a',
 '▁long',
 '▁way',
 '.',
 '▁A',
 'l',
 's',
 'o',
 ',',
 '▁be',
 '▁car',
 'e',
 'ful',
 '▁when',
 '▁car',
 'ry',
 'ing',
 '▁this',
 ',',
 '▁as',
 '▁the',
 '▁to',
 'p',
 '▁of',
 '▁this',
 '▁is',
 '▁re',
 'se',
 'al',
 'able',
 '▁(',
 'y',
 'a',
 'y',
 '!',
 ')',
 '▁with',
 '▁a',
 '▁',
 'z',
 'i',
 'p',
 'lo',
 'ck',
 '▁to',
 'p',
 '▁and',
 '▁this',
 '▁could',
 '▁',
 's',
 'c',
 'ra',
 't',
 'ch',
 '▁your',
 '▁',
 'ne',
 'ck',
 '▁if',
 '▁you',
 "'",
 're',
 '▁not',
 '▁a',
 '▁little',
 '▁w',
 'ar',
 'y',
 '.',
 '▁(',
 'J',
 'us',
 't',
 '▁sa',
 'y',
 'in

#### Decode

In [26]:
# Round trip: encode the example text into pieces and decode them back.
# In the displayed output the runs of newlines from the original text come back
# as single spaces, so the round trip is not byte-exact for this example.
tokenizer.DecodePieces(tokenizer.EncodeAsPieces(dev[ID]['text']))

'First off, I just have to say that this was a large bag of dog food, so if you\'ve got a smaller dog or a canine friend with a smaller appetite, this bag will go a long way. Also, be careful when carrying this, as the top of this is resealable (yay!) with a ziplock top and this could scratch your neck if you\'re not a little wary. (Just saying this because I ended up with a long scratch down my neck from carrying this on my shoulder and wanted others to avoid the same fate.) My friend\'s canine is a little picky about what he eats, with dry dog food being his "last resort". I figured that if I could get this dog to eat and even remotely enjoy this food, then it would be a pretty good recommendation. Now while he didn\'t take to the Beneful like a fish to water or a cat to milk, he did eat it, which says quite a bit considering how he can be about food. I don\'t think that Beneful will become a regular addition to the dog\'s diet but that he eats this means that my friend will have ano