In [1]:
# Following the Hugging Face NLP Course (chapter 2, section 4):
# https://huggingface.co/learn/nlp-course/chapter2/4

In [2]:
# Example sentence reused as the input of every tokenization cell in this notebook.
sentence = "Jim Henson was a puppeteer"

In [3]:
# Word tokenization: split the sentence on whitespace, one token per word.
# Limit: the vocabulary (total number of distinct words) becomes too big.
tokenized_text = sentence.split()
tokenized_text

['Jim', 'Henson', 'was', 'a', 'puppeteer']

In [4]:
# Character tokenization: one token per character (spaces included).
# Limit: the context (total number of tokens in a sentence) becomes too big.
# list() over a string yields its characters directly; no comprehension needed.
tokenized_text = list(sentence)
tokenized_text

['J',
 'i',
 'm',
 ' ',
 'H',
 'e',
 'n',
 's',
 'o',
 'n',
 ' ',
 'w',
 'a',
 's',
 ' ',
 'a',
 ' ',
 'p',
 'u',
 'p',
 'p',
 'e',
 't',
 'e',
 'e',
 'r']

In [5]:
# Subword tokenization
# Example with BERT: load the tokenizer that belongs to the
# "bert-base-cased" checkpoint.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [6]:
# Split the sentence into subword tokens; in the output, a "##" prefix marks
# a piece that continues the previous token (e.g. "He" + "##nson").
tokenizer.tokenize(sentence)

['Jim', 'He', '##nson', 'was', 'a', 'puppet', '##eer']

In [7]:
# Save the tokenizer files into the local ".tokenizers" directory;
# the call returns the paths of the files it wrote.
tokenizer.save_pretrained(".tokenizers")

('.tokenizers/tokenizer_config.json',
 '.tokenizers/special_tokens_map.json',
 '.tokenizers/vocab.txt',
 '.tokenizers/added_tokens.json')

In [8]:
# TOKENIZER pipeline
# 1) Tokenization: split the text into subword tokens.
# NOTE: this rebinds `tokenizer`, previously created with BertTokenizer, to
# one loaded through AutoTokenizer for the same "bert-base-cased" checkpoint.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
tokens = tokenizer.tokenize(sentence)
tokens

['Jim', 'He', '##nson', 'was', 'a', 'puppet', '##eer']

In [9]:
# 2) From tokens to input IDs: map each token to its integer ID.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[3104, 1124, 15703, 1108, 170, 16797, 8284]

In [10]:
# 3) Decoding: turn the IDs back into a string; the subword pieces are
# merged back into whole words.
decoded_str = tokenizer.decode(ids)
decoded_str

'Jim Henson was a puppeteer'

In [11]:
# The same pipeline applies to other models:
# 1) Tokenization
# model_max_length is passed explicitly so the tokenizer does not rely on the
# legacy implicit 512-token limit of "t5-large", which transformers warns
# about when the argument is omitted. It does not affect how text is tokenized.
tokenizer2 = AutoTokenizer.from_pretrained("t5-large", model_max_length=512)
tokens2 = tokenizer2.tokenize(sentence)
tokens2

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


['▁Jim', '▁He', 'n', 'son', '▁was', '▁', 'a', '▁puppet', 'e', 'er']

In [12]:
# 2) From tokens to input IDs: map each T5 token to its integer ID.
ids2 = tokenizer2.convert_tokens_to_ids(tokens2)
ids2

[6006, 216, 29, 739, 47, 3, 9, 26141, 15, 49]

In [13]:
# 3) Decoding: turn the T5 IDs back into the original string.
tokenizer2.decode(ids2)

'Jim Henson was a puppeteer'