In [1]:
# using the reinforce environment in d:\reinforce along with below libraries
# pip install transformers accelerate torchtext datasets evaluate peft
# Will be using python and transformers related functions / concepts only

from transformers import AutoTokenizer
from datasets import load_dataset
import warnings
warnings.filterwarnings("ignore")

ds_wiki = load_dataset("wikitext", name="wikitext-2-raw-v1",
                       split='train')
ds_wiki

  from .autonotebook import tqdm as notebook_tqdm
Generating test split: 100%|██████████| 4358/4358 [00:00<00:00, 173312.76 examples/s]
Generating train split: 100%|██████████| 36718/36718 [00:00<00:00, 158682.84 examples/s]
Generating validation split: 100%|██████████| 3760/3760 [00:00<00:00, 73672.28 examples/s]


Dataset({
    features: ['text'],
    num_rows: 36718
})

In [2]:
def return_text(row):
    return row['text']

In [45]:
ds_wiki_gen = (return_text(row) for row in ds_wiki)
ds_wiki_iter = iter(ds_wiki_gen)

# for i, row in enumerate(ds_wiki_iter):
    # print(row)
    # if i > 3:
        # break

In [6]:
model = 'gpt2'
gpt_tokenizer = AutoTokenizer.from_pretrained(model)

In [7]:
# check if the tokenizer is fast
gpt_tokenizer.is_fast

True

In [9]:
gpt_vocab_dict = gpt_tokenizer.get_vocab()
# len of gpt_vocab
len(gpt_vocab_dict.items())

50257

In [10]:
# lets retrain the tokenizer with lesser vocab
new_gpt_tokenizer = gpt_tokenizer.train_new_from_iterator(ds_wiki_iter,
                                                          vocab_size=25000)

In [11]:
new_vocab_dict = new_gpt_tokenizer.get_vocab()
len(new_vocab_dict.items())

25000

In [12]:
# manually splitting and encoding a sentence. How tokenizer works. Inside out
example = "There is more to what meets the eye in coding."
gpt_tokenizer.encode(example)

[1858, 318, 517, 284, 644, 11185, 262, 4151, 287, 19617, 13]

In [13]:
example_encode = [1858, 318, 517, 284, 644, 11185, 262, 4151, 287, 19617, 13]
gpt_tokenizer.decode(example_encode)

'There is more to what meets the eye in coding.'

In [14]:
# manually splitting and encoding a sentence. How tokenizer works. Inside out
example = "There is more to what meets the eye in coding."
new_gpt_tokenizer.encode(example)

[52, 2246, 364, 667, 290, 1304, 7172, 261, 4791, 284, 12656, 286, 14]

In [16]:
example_encode = [52, 2246, 364, 667, 290, 1304, 7172, 261, 4791, 284, 12656, 286, 14]
print(gpt_tokenizer.decode(example_encode))  # thats why tokenizers are important
print(new_gpt_tokenizer.decode(example_encode))

UACersirst andider consumeron hus to pipe of/
There is more to what meets the eye in coding.


In [19]:
# lets try manually to get the word / its indices from tokenizer vocab
gpt_tokenizer.tokenize(example)

['There',
 'Ġis',
 'Ġmore',
 'Ġto',
 'Ġwhat',
 'Ġmeets',
 'Ġthe',
 'Ġeye',
 'Ġin',
 'Ġcoding',
 '.']

In [20]:
# lets try manually to get the word / its indices from tokenizer vocab
new_gpt_tokenizer.tokenize(example)

['T',
 'here',
 'Ġis',
 'Ġmore',
 'Ġto',
 'Ġwhat',
 'Ġmeets',
 'Ġthe',
 'Ġeye',
 'Ġin',
 'Ġcod',
 'ing',
 '.']

In [25]:
store = []
# We are seeing the tokenizing and encoding here
for ele in new_gpt_tokenizer.tokenize(example):
    store.append(new_vocab_dict[ele])

print(store, 'new 25k vocab')
store = []
for ele in new_gpt_tokenizer.tokenize(example):
    store.append(gpt_vocab_dict[ele])
print(store, 'orig 50k+ vocab')

[52, 2246, 364, 667, 290, 1304, 7172, 261, 4791, 284, 12656, 286, 14] new 25k vocab
[51, 1456, 318, 517, 284, 644, 11185, 262, 4151, 287, 14873, 278, 13] orig 50k+ vocab


In [26]:
new_gpt_tokenizer.save_pretrained('new_25k_tokenizer')

('new_25k_tokenizer\\tokenizer_config.json',
 'new_25k_tokenizer\\special_tokens_map.json',
 'new_25k_tokenizer\\vocab.json',
 'new_25k_tokenizer\\merges.txt',
 'new_25k_tokenizer\\added_tokens.json',
 'new_25k_tokenizer\\tokenizer.json')

In [27]:
# building tokenizer from scratch
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
) 

In [46]:
# We can create WordPiece / Byte Pair / Unigram tokenizer

bpe_tokenizer = Tokenizer(models.BPE())  # start with BPE model
# no normalizer process for BPE (this is same as normalisation)
bpe_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
# lets tokenize a string string
bpe_tokenizer.pre_tokenizer.pre_tokenize_str("test tokenizing a string")

[('test', (0, 4)),
 ('Ġtokenizing', (4, 15)),
 ('Ġa', (15, 17)),
 ('Ġstring', (17, 24))]

In [47]:
# attaching a trainer to the tokenizer 
bpe_trainer = trainers.BpeTrainer(vocab_size=25000,
                                  special_token=["<|endoftext|>"])
bpe_tokenizer.train_from_iterator(ds_wiki_iter, trainer=bpe_trainer)

In [40]:
gpt_tokenizer.encode("Testing Encoding")

[44154, 14711, 7656]

In [48]:
bpe_tokenizer.encode("Testing encoding")

Encoding(num_tokens=5, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [49]:
bpe_tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
bpe_tokenizer.decoder = decoders.ByteLevel()

In [50]:
bpe_tokenizer.get_vocab()

{'ĠCommon': 4803,
 'iro': 4306,
 'ĠBeau': 10082,
 'ĠCloud': 10917,
 'ĠBoeing': 21630,
 'ĠAndersons': 22027,
 'ky': 2178,
 'ersey': 5269,
 'Ġagricult': 5273,
 'ĠEngineers': 12784,
 'ĠBrad': 3343,
 'rary': 21034,
 'Ġtherapeut': 23038,
 'Ġmillimeter': 15038,
 'ection': 1306,
 'ĠZel': 22593,
 'Ġearnings': 19123,
 'ves': 1451,
 'Ġtakes': 3543,
 'icate': 7310,
 'ĠBlues': 11105,
 'ĠGuth': 12650,
 'Ġbusiness': 2694,
 'ĠEmma': 13307,
 'Ġemployee': 15117,
 'Ġtouching': 15574,
 'Ġearn': 8635,
 'ouk': 15777,
 'ĠImmediately': 20075,
 'Ġtrenches': 23285,
 'ĠBeth': 11659,
 'Ġenhanced': 16143,
 'Ġmortal': 16607,
 'Ġtel': 11015,
 'Ġnuc': 2151,
 'ĠLiv': 17385,
 'Ġparallels': 17127,
 'ify': 3728,
 'Ġow': 7461,
 'if': 346,
 'Ġrounded': 12465,
 'Ġecclesi': 16171,
 'ĠExcept': 23967,
 'ts': 744,
 'ĠCavalry': 14903,
 'Ġburials': 24305,
 'uv': 7538,
 'Ġmock': 16804,
 'ellar': 8410,
 'Ġmissing': 6156,
 'Ġfeature': 3134,
 'aced': 1191,
 'ĠHoysala': 15710,
 'Ġquality': 4133,
 'Ġplate': 11064,
 'Ġsupposed': 6030,


In [56]:
encoding = bpe_tokenizer.encode("This is one sentence.",
                            "With this one we have a pair.")
print(encoding.ids)
print(encoding.tokens)
print(encoding.attention_mask)

[51, 71, 214, 300, 495, 5689, 13, 54, 286, 525, 495, 614, 489, 195, 3288, 13]
['T', 'h', 'is', 'Ġis', 'Ġone', 'Ġsentence', '.', 'W', 'ith', 'Ġthis', 'Ġone', 'Ġwe', 'Ġhave', 'Ġa', 'Ġpair', '.']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
