In [1]:
#!pip install --upgrade sentencepiece

In [7]:
import pickle
import sentencepiece as spm
from time import time

In [8]:
MODEL_SAVE='/Users/matejkvassay/data/sz/models/sentencepiece/sp_enwiki_lc_{}' 
DATASET='/Users/matejkvassay/data/sz/data/reviews_train_test_dev1.pickle'
TMP_DATASET_FILE='/tmp/spt_train.txt'
VOCAB_SIZES=[1000,8000]

## Load data

In [9]:
with open(DATASET,'rb') as f:
    train,dev,_=pickle.load(f)

## Train model

In [5]:
def prepare_training_file(dataset, save_path, limit_data=None):
    with open(TMP_DATASET_FILE, 'w') as f:
        if limit_data is None:
            limit=10000000000
        else:
            limit=limit_data
        f.writelines([x['text'].lower()+'\n' for x in dataset if x][:limit])
        
def train_sentpiece(dataset_path, vocab_size, save_path):
    spm.SentencePieceTrainer.Train(
        '--input={} --model_prefix={} --vocab_size={}'.format(
            dataset_path,
            save_path,
            vocab_size))
        

## Prepare training dataset

In [6]:
prepare_training_file(train,TMP_DATASET_FILE, limit_data=None)

## Train models

In [7]:
for vocab_size in VOCAB_SIZES:
    start=time()
    train_sentpiece(TMP_DATASET_FILE, vocab_size, MODEL_SAVE.format(vocab_size))
    end=time()
    print('Training {} tokens model took {} seconds.'.format(vocab_size,end-start))

Training 1000 tokens model took 287.24037885665894 seconds.
Training 8000 tokens model took 235.29969000816345 seconds.


## Sanity check

In [25]:
test_model='/Users/matejkvassay/data/sz/models/sentencepiece/sp_8000.model'

In [26]:
ID=115

In [27]:
dev[ID]['text']

'howdy y\'all,\n\n\n\nthis is GOOD stuff! it\'s a tad too hot for me, so i either buy the "milder" version or add some non-spicy canned tomatoes to it.\n\n\n\nthe price is ... outrageous, tho. at about twenty-nine dollars for a 12-pack, it comes to nearly two fifty for ONE CAN. since it\'s listed at ro-tel for a suggested price of one thirty - and available at most grocery stores for one dollar - that\'s far, far too much.\n\n\n\nhopefully, the price will eventually drop. then i\'ll buy LOTS of it! [*grin*]\n\n\n\ntake care,\n\nlee'

In [28]:
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(test_model)

True

#### Encode

In [29]:
tokenizer.EncodeAsPieces(dev[ID]['text'].lower())

['▁how',
 'dy',
 '▁',
 'y',
 "'",
 'all',
 ',',
 '▁this',
 '▁is',
 '▁good',
 '▁stuff',
 '!',
 '▁it',
 "'",
 's',
 '▁a',
 '▁tad',
 '▁too',
 '▁hot',
 '▁for',
 '▁me',
 ',',
 '▁so',
 '▁i',
 '▁either',
 '▁buy',
 '▁the',
 '▁"',
 'm',
 'il',
 'der',
 '"',
 '▁version',
 '▁or',
 '▁add',
 '▁some',
 '▁non',
 '-',
 'spicy',
 '▁canned',
 '▁tomatoes',
 '▁to',
 '▁it',
 '.',
 '▁the',
 '▁price',
 '▁is',
 '▁...',
 '▁outrageous',
 ',',
 '▁tho',
 '.',
 '▁at',
 '▁about',
 '▁twenty',
 '-',
 'n',
 'ine',
 '▁dollars',
 '▁for',
 '▁a',
 '▁12-',
 'pack',
 ',',
 '▁it',
 '▁comes',
 '▁to',
 '▁nearly',
 '▁two',
 '▁fifty',
 '▁for',
 '▁one',
 '▁can',
 '.',
 '▁since',
 '▁it',
 "'",
 's',
 '▁listed',
 '▁at',
 '▁ro',
 '-',
 't',
 'el',
 '▁for',
 '▁a',
 '▁suggested',
 '▁price',
 '▁of',
 '▁one',
 '▁thirty',
 '▁-',
 '▁and',
 '▁available',
 '▁at',
 '▁most',
 '▁grocery',
 '▁stores',
 '▁for',
 '▁one',
 '▁dollar',
 '▁-',
 '▁that',
 "'",
 's',
 '▁far',
 ',',
 '▁far',
 '▁too',
 '▁much',
 '.',
 '▁hopefully',
 ',',
 '▁the',
 '▁pric

#### Decode

In [24]:
tokenizer.DecodePieces(tokenizer.EncodeAsPieces(dev[ID]['text']))

'howdy y\'all, this is GOOD stuff! it\'s a tad too hot for me, so i either buy the "milder" version or add some non-spicy canned tomatoes to it. the price is ... outrageous, tho. at about twenty-nine dollars for a 12-pack, it comes to nearly two fifty for ONE CAN. since it\'s listed at ro-tel for a suggested price of one thirty - and available at most grocery stores for one dollar - that\'s far, far too much. hopefully, the price will eventually drop. then i\'ll buy LOTS of it! [*grin*] take care, lee'