In [25]:
import torch
from torch.utils.data import Dataset
import pickle
import string

In [26]:
from mininlp.data import Tokenizer

# Construct a tokenizer with default arguments. The name `tokenizer` is rebound
# in a later cell to a tokenizer built from an explicit token set.
tokenizer = Tokenizer()

In [27]:
"""Token are every ascii character and special tokens for start of sentence, 
end of sentence, padding, unknown and mask."""
from mininlp.data import assci_tokens
# The vocabulary prints as a Python set, so the element order shown is arbitrary
# and, for string elements, can change between interpreter sessions
# (string hash randomization).
print(assci_tokens())

{'%', 'x', 'd', '6', 'z', '"', '<pad>', '<mask>', 'k', "'", 'X', 'R', '\\', '9', 'T', 'S', '=', 'M', ' ', 'y', 'B', '-', '\t', 'l', 'o', '(', 'Y', '\x0b', '<unk>', ',', ')', 'g', 'v', '|', 'j', 'q', '\n', '1', 'C', '<', 'c', 'm', 'b', 'f', 'a', 't', '.', '2', '?', '{', '3', 'H', 'A', '4', '`', '~', '*', 'J', '/', ':', '\r', 'i', '+', 'h', 'w', '&', '!', '>', '<eos>', 'W', 's', '7', '5', '\x0c', 'e', 'Z', 'K', 'P', 'V', 'u', 'F', '@', 'N', '$', 'r', '8', '#', 'E', '[', '_', ']', '^', 'Q', 'D', '<sos>', 'p', 'U', 'n', ';', '0', 'L', 'G', 'O', 'I', '}'}


In [28]:
# Fetch the vocabulary explicitly and rebind `tokenizer` to an instance built from it.
# NOTE(review): the vocabulary prints as a set; if Tokenizer assigns ids by
# iteration order, ids may differ between kernel sessions -- confirm.
tokens = assci_tokens()
tokenizer = Tokenizer(tokens)

In [29]:
"""Test if the tokenizer is able to convert tokens to ids and vice versa."""

# id -> token -> id must round-trip for every known id.
# The loop variable is `token_id` rather than `id` so the builtin `id()` is not
# shadowed (and left rebound) for the rest of the notebook.
for token_id in tokenizer._tokens:
    token = tokenizer._tokens[token_id]
    assert tokenizer._token_ids[token] == token_id, (
        f"id {token_id!r} -> token {token!r} -> id {tokenizer._token_ids[token]!r}"
    )

# token -> id -> token must round-trip as well ("and vice versa").
for token in tokenizer._token_ids:
    token_id = tokenizer._token_ids[token]
    assert tokenizer._tokens[token_id] == token, (
        f"token {token!r} -> id {token_id!r} -> token {tokenizer._tokens[token_id]!r}"
    )

In [30]:
"""Test if the tokenizer is able to encode and decode a string."""

test_string = "Hello, World! \nLovely day, isn't it?"

# Encode the text to ids, decode the ids back to tokens, and check that
# concatenating the decoded tokens reproduces the original text.
test_encoded = tokenizer.encode(test_string)
test_decoded = tokenizer.decode(test_encoded)
roundtrip_text = "".join(test_decoded)
assert roundtrip_text == test_string

# Show the input text, its ids and the decoded tokens, one per print call.
for shown in (test_string, test_encoded, test_decoded):
    print(shown)

Hello, World! 
Lovely day, isn't it?
tensor([ 51,  74,  23,  23,  24,  29,  18,  69,  24,  84,  23,   2,  66,  18,
         36, 100,  24,  32,  74,  23,  19,  18,   2,  44,  19,  29,  18,  61,
         70,  97,   9,  45,  18,  61,  45,  48], dtype=torch.int32)
['H', 'e', 'l', 'l', 'o', ',', ' ', 'W', 'o', 'r', 'l', 'd', '!', ' ', '\n', 'L', 'o', 'v', 'e', 'l', 'y', ' ', 'd', 'a', 'y', ',', ' ', 'i', 's', 'n', "'", 't', ' ', 'i', 't', '?']


In [31]:
"""Test if the tokenizer is able to save and load itself."""

# Persist the tokenizer built above.
# NOTE(review): saved under "tokenizer" but loaded from "tokenizer.pkl" below --
# presumably save() appends the ".pkl" suffix; confirm against Tokenizer.save.
# The file path is relative, so it is resolved against the current working directory.
tokenizer.save("tokenizer")

# Load the saved state into a second, default-constructed instance so the two
# can be compared in the following cells.
tokenizer2 = Tokenizer()
tokenizer2.load("tokenizer.pkl")

In [32]:
"""Test if the loaded tokenizer is the same as the original tokenizer."""

# Compare the two private lookup tables of the original and the loaded
# tokenizer, one attribute at a time, in the same order as before.
for table_name in ("_tokens", "_token_ids"):
    assert getattr(tokenizer, table_name) == getattr(tokenizer2, table_name)

In [33]:
"""Test if the orginal tokenizer and the loaded one encodes and decodes a string to the same tokens ids."""

# Encode the same text with both tokenizers.
ids_from_loaded = tokenizer2.encode(test_string)
ids_from_original = tokenizer.encode(test_string)

# Direct check of the stated claim: both tokenizers must yield identical id
# tensors (same shape and same elements).
assert torch.equal(ids_from_original, ids_from_loaded), (
    "original and loaded tokenizer encode the same string to different ids"
)

# Cross-check 1: ids from the loaded tokenizer decode correctly with the original.
test_encoded = ids_from_loaded
test_decoded = tokenizer.decode(test_encoded)
assert test_string == "".join(test_decoded)

# Cross-check 2: ids from the original tokenizer decode correctly with the loaded one.
# `test_encoded` / `test_decoded` keep these final values, as in the previous version.
test_encoded = ids_from_original
test_decoded = tokenizer2.decode(test_encoded)
assert test_string == "".join(test_decoded)

print(test_decoded)
print(test_encoded)

['H', 'e', 'l', 'l', 'o', ',', ' ', 'W', 'o', 'r', 'l', 'd', '!', ' ', '\n', 'L', 'o', 'v', 'e', 'l', 'y', ' ', 'd', 'a', 'y', ',', ' ', 'i', 's', 'n', "'", 't', ' ', 'i', 't', '?']
tensor([ 51,  74,  23,  23,  24,  29,  18,  69,  24,  84,  23,   2,  66,  18,
         36, 100,  24,  32,  74,  23,  19,  18,   2,  44,  19,  29,  18,  61,
         70,  97,   9,  45,  18,  61,  45,  48], dtype=torch.int32)


In [34]:
from mininlp.data import SequenceDataset

# Single location for the corpus path shared by both calls below.
corpus_path = "../data/anna.txt"

# Tokenize the whole document with the tokenizer built earlier.
encoded_document = tokenizer.tokenize_document(corpus_path)

# Build the dataset from the same file and tokenizer.
# NOTE(review): 32 matches the length of the tensors displayed for dataset[0];
# the meaning of 1000 is not visible in this notebook -- confirm against
# SequenceDataset's signature.
dataset = SequenceDataset(corpus_path, tokenizer, 32, 1000)

In [35]:
# Display the first dataset item; as the cell's last expression it is rendered
# by the notebook without an explicit print.
dataset[0]

(tensor([  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           6,  94,  18, 103], dtype=torch.int32),
 tensor([  6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,   6,
           6,  18, 103,  97], dtype=torch.int32))

In [36]:
# Decode and print a few samples as (input tokens, target tokens).
# Each sample is fetched from the dataset once and unpacked, instead of indexing
# the dataset separately for the input and for the target; this avoids redundant
# lookups and guarantees both halves come from the same returned item.
for sample_index in (50, 46, 5):
    input_ids, target_ids = dataset[sample_index]
    print(tokenizer.decode(input_ids), tokenizer.decode(target_ids))

['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<sos>', 'h', 'i', 'c', 'h', ',', ' ', 't', 'h'] ['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 'h', 'i', 'c', 'h', ',', ' ', 't', 'h', 'o']
['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<sos>', ' ', 'H', 'e', ' ', 'a', 'l', 'w', 'a', 'y', 's', ' ', 't', 'o', 'o', 'k', ','] ['<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', ' ', 'H', 'e', ' ', 'a', 'l', 'w', 'a', 'y', 's', ' ', 't', 'o', 'o', 'k', ',', ' ']
['<pad>', '<pad>', '<pad>', '<pad>', '<pad>'