In [1]:
#!pip install --upgrade sentencepiece

In [2]:
import pickle
import sentencepiece as spm
from time import time

In [3]:
# Model path prefix; '{}' is filled in with the vocabulary size at training time.
MODEL_SAVE = '/Users/matejkvassay/data/sz/models/sentencepiece/sp_{}'
# Pickle that is unpacked into three values (train, dev, and one unused) when loaded.
DATASET = '/Users/matejkvassay/data/sz/data/reviews_train_test_dev1.pickle'
# Plain-text training file handed to the SentencePiece trainer.
TMP_DATASET_FILE = '/tmp/spt_train.txt'
# One model is trained per vocabulary size listed here.
VOCAB_SIZES = [1000, 8000]

## Load data

In [4]:
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(DATASET, 'rb') as f:
    splits = pickle.load(f)

# The pickle holds three values; only the first two are used in this notebook.
train, dev, _ = splits

## Define training helpers

In [5]:
def prepare_training_file(dataset, save_path, limit_data=None):
    """Dump the 'text' field of each record to a text file for SentencePiece training.

    Each record's text is written followed by a newline. A text that itself
    contains newlines therefore occupies several lines of the output file.

    Args:
        dataset: iterable of records indexable by 'text'; falsy entries
            (e.g. None or an empty dict) are skipped.
        save_path: path of the file to write; an existing file is overwritten.
        limit_data: optional cap on how many records are written, applied
            after falsy entries are dropped. None (default) writes everything.
    """
    records = [record for record in dataset if record]
    if limit_data is not None:
        records = records[:limit_data]
    # Write to the path the caller asked for (previously the global
    # TMP_DATASET_FILE was used and save_path was silently ignored), and pin
    # UTF-8 so the result does not depend on the platform's default encoding.
    with open(save_path, 'w', encoding='utf-8') as f:
        # Generator: avoids materialising a second copy of every text in memory.
        f.writelines(record['text'] + '\n' for record in records)
        
def train_sentpiece(dataset_path, vocab_size, save_path):
    """Run the SentencePiece trainer on a text file.

    Args:
        dataset_path: text file passed to the trainer as --input.
        vocab_size: value passed as --vocab_size.
        save_path: value passed as --model_prefix.

    NOTE(review): the flags are joined with single spaces into one string, so
    a path containing spaces would presumably be mis-parsed -- confirm.
    """
    flags = ' '.join([
        '--input={}'.format(dataset_path),
        '--model_prefix={}'.format(save_path),
        '--vocab_size={}'.format(vocab_size),
    ])
    spm.SentencePieceTrainer.Train(flags)
        

## Prepare training dataset

In [6]:
# Dump every training text (no limit) to the temporary training file.
prepare_training_file(dataset=train, save_path=TMP_DATASET_FILE, limit_data=None)

## Train models

In [7]:
# Train one model per configured vocabulary size and report the wall-clock time of each run.
for vocab_size in VOCAB_SIZES:
    model_prefix = MODEL_SAVE.format(vocab_size)
    start = time()
    train_sentpiece(TMP_DATASET_FILE, vocab_size, model_prefix)
    end = time()
    elapsed = end - start
    print('Training {} tokens model took {} seconds.'.format(vocab_size, elapsed))

Training 1000 tokens model took 288.54957485198975 seconds.
Training 8000 tokens model took 246.3753662109375 seconds.


## Sanity check

In [8]:
# Derive the path from the config instead of repeating the literal, so the sanity
# check keeps pointing at a model this notebook actually trained. SentencePiece
# writes the model to '<model_prefix>.model'; the last configured size is 8000.
test_model = MODEL_SAVE.format(VOCAB_SIZES[-1]) + '.model'

In [9]:
# Index into `dev` of the review used for the sanity check.
ID = 1125

In [10]:
# Display the raw review text that is encoded and decoded in the cells below.
dev[ID]['text']

'First off, I just have to say that this was a large bag of dog food, so if you\'ve got a smaller dog or a canine friend with a smaller appetite, this bag will go a long way. Also, be careful when carrying this, as the top of this is resealable (yay!) with a ziplock top and this could scratch your neck if you\'re not a little wary. (Just saying this because I ended up with a long scratch down my neck from carrying this on my shoulder and wanted others to avoid the same fate.)\n\n\n\nMy friend\'s canine is a little picky about what he eats, with dry dog food being his "last resort". I figured that if I could get this dog to eat and even remotely enjoy this food, then it would be a pretty good recommendation. Now while he didn\'t take to the Beneful like a fish to water or a cat to milk, he did eat it, which says quite a bit considering how he can be about food.\n\n\n\nI don\'t think that Beneful will become a regular addition to the dog\'s diet but that he eats this means that my friend

In [11]:
# Load the trained model file into a processor; the recorded output of Load() is True.
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(test_model)

True

#### Encode

In [12]:
# Split the sample review into subword pieces. In the recorded output, pieces that
# start a new word carry a leading '▁' (e.g. '▁can' followed by 'ine' for "canine").
tokenizer.EncodeAsPieces(dev[ID]['text'])

['▁First',
 '▁off',
 ',',
 '▁I',
 '▁just',
 '▁have',
 '▁to',
 '▁say',
 '▁that',
 '▁this',
 '▁was',
 '▁a',
 '▁large',
 '▁bag',
 '▁of',
 '▁dog',
 '▁food',
 ',',
 '▁so',
 '▁if',
 '▁you',
 "'",
 've',
 '▁got',
 '▁a',
 '▁smaller',
 '▁dog',
 '▁or',
 '▁a',
 '▁can',
 'ine',
 '▁friend',
 '▁with',
 '▁a',
 '▁smaller',
 '▁appetite',
 ',',
 '▁this',
 '▁bag',
 '▁will',
 '▁go',
 '▁a',
 '▁long',
 '▁way',
 '.',
 '▁Also',
 ',',
 '▁be',
 '▁careful',
 '▁when',
 '▁carrying',
 '▁this',
 ',',
 '▁as',
 '▁the',
 '▁top',
 '▁of',
 '▁this',
 '▁is',
 '▁resealable',
 '▁(',
 'y',
 'ay',
 '!)',
 '▁with',
 '▁a',
 '▁ziplock',
 '▁top',
 '▁and',
 '▁this',
 '▁could',
 '▁scratch',
 '▁your',
 '▁neck',
 '▁if',
 '▁you',
 "'",
 're',
 '▁not',
 '▁a',
 '▁little',
 '▁war',
 'y',
 '.',
 '▁(',
 'J',
 'ust',
 '▁saying',
 '▁this',
 '▁because',
 '▁I',
 '▁ended',
 '▁up',
 '▁with',
 '▁a',
 '▁long',
 '▁scratch',
 '▁down',
 '▁my',
 '▁neck',
 '▁from',
 '▁carrying',
 '▁this',
 '▁on',
 '▁my',
 '▁should',
 'er',
 '▁and',
 '▁wanted',
 '▁others

#### Decode

In [13]:
# Round-trip check: encode to pieces, then decode back to text. Per the recorded
# output the words are recovered, but the '\n\n\n\n' paragraph breaks of the
# original come back as single spaces, so the round trip is not byte-exact.
tokenizer.DecodePieces(tokenizer.EncodeAsPieces(dev[ID]['text']))

'First off, I just have to say that this was a large bag of dog food, so if you\'ve got a smaller dog or a canine friend with a smaller appetite, this bag will go a long way. Also, be careful when carrying this, as the top of this is resealable (yay!) with a ziplock top and this could scratch your neck if you\'re not a little wary. (Just saying this because I ended up with a long scratch down my neck from carrying this on my shoulder and wanted others to avoid the same fate.) My friend\'s canine is a little picky about what he eats, with dry dog food being his "last resort". I figured that if I could get this dog to eat and even remotely enjoy this food, then it would be a pretty good recommendation. Now while he didn\'t take to the Beneful like a fish to water or a cat to milk, he did eat it, which says quite a bit considering how he can be about food. I don\'t think that Beneful will become a regular addition to the dog\'s diet but that he eats this means that my friend will have ano