In [1]:
#!pip install --upgrade sentencepiece

In [7]:
import pickle
import sentencepiece as spm
from time import time

In [8]:
MODEL_SAVE='/home/kvassay/data/z/models/sentencepiece/sp_lc_{}' 
DATASET='/home/kvassay/data/z/data/reviews_train_test_dev1.pickle'
TMP_DATASET_FILE='/tmp/spt_train.txt'
VOCAB_SIZES=[100,500,1000,8000]

## Load data

In [9]:
with open(DATASET,'rb') as f:
    train,dev,_=pickle.load(f)

## Train model

In [5]:
def prepare_training_file(dataset, save_path, limit_data=None):
    with open(TMP_DATASET_FILE, 'w') as f:
        if limit_data is None:
            limit=10000000000
        else:
            limit=limit_data
        f.writelines([x['text'].lower()+'\n' for x in dataset if x][:limit])
        
def train_sentpiece(dataset_path, vocab_size, save_path):
    spm.SentencePieceTrainer.Train(
        '--input={} --model_prefix={} --vocab_size={}'.format(
            dataset_path,
            save_path,
            vocab_size))
        

## Prepare training dataset

In [6]:
prepare_training_file(train,TMP_DATASET_FILE, limit_data=None)

## Train models

In [7]:
for vocab_size in VOCAB_SIZES:
    start=time()
    train_sentpiece(TMP_DATASET_FILE, vocab_size, MODEL_SAVE.format(vocab_size))
    end=time()
    print('Training {} tokens model took {} seconds.'.format(vocab_size,end-start))

Training 1000 tokens model took 287.24037885665894 seconds.
Training 8000 tokens model took 235.29969000816345 seconds.


## Sanity check

In [37]:
test_model='/Users/matejkvassay/data/sz/models/sentencepiece/sp_1000.model'

In [43]:
ID=1951

In [44]:
dev[ID]['text']

'My dog eats Nature\'s Recipe "Easy to Digest" dry food everyday. It really helped with her digestive issues. I thought the wet food version would be okay, but it made me completely ill. She was dry-heaving, had major gas issues and was quite lethargic. About 12 hours after her final dose of it, all of her symptoms disappeared. The dry food is still great, but this wet food is really awful.'

In [45]:
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(test_model)

True

#### Encode

In [46]:
tokenizer.EncodeAsPieces(dev[ID]['text'].lower())

['▁my',
 '▁dog',
 '▁eat',
 's',
 '▁',
 'n',
 'at',
 'ure',
 "'",
 's',
 '▁recipe',
 '▁"',
 'ea',
 's',
 'y',
 '▁to',
 '▁di',
 'g',
 'est',
 '"',
 '▁dry',
 '▁food',
 '▁every',
 'day',
 '.',
 '▁it',
 '▁really',
 '▁help',
 'ed',
 '▁with',
 '▁her',
 '▁di',
 'g',
 'est',
 'ive',
 '▁issue',
 's',
 '.',
 '▁i',
 '▁thought',
 '▁the',
 '▁we',
 't',
 '▁food',
 '▁version',
 '▁would',
 '▁be',
 '▁o',
 'k',
 'a',
 'y',
 ',',
 '▁but',
 '▁it',
 '▁made',
 '▁me',
 '▁completely',
 '▁',
 'ill',
 '.',
 '▁she',
 '▁was',
 '▁dry',
 '-',
 'he',
 'a',
 'ving',
 ',',
 '▁had',
 '▁ma',
 'j',
 'or',
 '▁g',
 'as',
 '▁issue',
 's',
 '▁and',
 '▁was',
 '▁quite',
 '▁let',
 'h',
 'ar',
 'g',
 'ic',
 '.',
 '▁about',
 '▁12',
 '▁',
 'ho',
 'ur',
 's',
 '▁after',
 '▁her',
 '▁f',
 'in',
 'al',
 '▁do',
 'se',
 '▁of',
 '▁it',
 ',',
 '▁all',
 '▁of',
 '▁her',
 '▁',
 's',
 'y',
 'm',
 'p',
 't',
 'o',
 'm',
 's',
 '▁dis',
 'a',
 'pp',
 'ear',
 'ed',
 '.',
 '▁the',
 '▁dry',
 '▁food',
 '▁is',
 '▁still',
 '▁great',
 ',',
 '▁but',
 '▁t

#### Decode

In [47]:
tokenizer.DecodePieces(tokenizer.EncodeAsPieces(dev[ID]['text']))

'My dog eats Nature\'s Recipe "Easy to Digest" dry food everyday. It really helped with her digestive issues. I thought the wet food version would be okay, but it made me completely ill. She was dry-heaving, had major gas issues and was quite lethargic. About 12 hours after her final dose of it, all of her symptoms disappeared. The dry food is still great, but this wet food is really awful.'