## Custom Tokenizer

In [1]:
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Two toy sentences that together form the whole corpus.
sample1, sample2 = 'We are learning AI', 'AI is a CS topic'
data = [sample1, sample2]

# Cap on the vocabulary size, and the sequence length for the toy example.
vocab_size, sequence_length = 8, 5

In [2]:
# torchtext's 'basic_english' tokenizer turns a string into a list of tokens.
tokenizer = get_tokenizer('basic_english')

def yield_tokens(examples):
    """Lazily tokenize each text in ``examples``.

    Yields one token list per input text, in input order, using the
    module-level ``tokenizer``.
    """
    yield from (tokenizer(text) for text in examples)

# Build the vocabulary from the tokenized corpus. `max_tokens` caps the total
# number of entries (the special tokens count towards it), so the rarest
# words are left out of the vocabulary.
vocab = build_vocab_from_iterator(
    yield_tokens(data),
    specials=["<unk>", "<pad>"],
    max_tokens=vocab_size,
)

# Tokens that are not in the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

In [10]:
# Split the first sample into tokens.
tokens = tokenizer(sample1)
print(tokens)

# Map every token to its vocabulary index in one call
# (out-of-vocabulary tokens get the default index).
sample1_ids = vocab.lookup_indices(tokens)
print(sample1_ids)

['we', 'are', 'learning', 'ai']
[0, 4, 7, 2]


## Built-in Tokenizer

In [1]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
)

In [2]:
# Encode the sentence with the default settings; the result holds the input
# ids together with the token type ids and the attention mask.
tokens = tokenizer(
    'we are learning ai',
)
print(tokens)

{'input_ids': [101, 2057, 2024, 4083, 9932, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [3]:
# Encode the same sentence, this time without the model's special tokens,
# so only the ids of the words themselves are returned.
tokens = tokenizer(
    'we are learning ai',
    add_special_tokens=False,
)
print(tokens)

{'input_ids': [2057, 2024, 4083, 9932], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1]}


In [7]:
# Split the text with the model's own tokenizer instead of str.split():
# whitespace splitting skips the tokenizer's normalisation and sub-word
# segmentation, so cased, punctuated or multi-piece words would silently be
# mapped to the unknown-token id.
tokens = tokenizer.tokenize('we are learning ai')

# Look up the vocabulary id of each token (no special tokens are added).
tokenizer.convert_tokens_to_ids(tokens)

[2057, 2024, 4083, 9932]

In [8]:
# Decode a single id back to text. The id is passed inside a list because a
# sequence of ids is handled uniformly by every tokenizer implementation,
# whereas a bare int is not (some slow tokenizers split the result into
# characters).
tokenizer.decode([2057])

'we'

## Built-in Tokenizer (2)

In [1]:
from transformers import AutoTokenizer

# Load the tokenizer published with the BERTweet POS-tagging checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "TweebankNLP/bertweet-tb2_ewt-pos-tagging",
)

In [2]:
# Encode the sentence with the default settings; the result holds the input
# ids together with the token type ids and the attention mask.
tokens = tokenizer(
    'we are learning ai',
)
print(tokens)

{'input_ids': [0, 54, 41, 2265, 4490, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [3]:
# Split the text with the model's own tokenizer instead of str.split():
# whitespace splitting skips the tokenizer's normalisation and sub-word
# segmentation, so cased, punctuated or multi-piece words would silently be
# mapped to the unknown-token id.
tokens = tokenizer.tokenize('we are learning ai')

# Look up the vocabulary id of each token (no special tokens are added).
tokenizer.convert_tokens_to_ids(tokens)

[54, 41, 2265, 4490]

In [4]:
# Pass the id inside a list. With a bare int this slow tokenizer first turns
# the id into the string 'we' and then iterates over that string character by
# character, which produced the spurious output 'w e'.
tokenizer.decode([54])

'we'