<a href="https://colab.research.google.com/github/loganathanspr/nlp_course/blob/main/tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tokenizers (PyTorch)

Install the Transformers, Datasets, and Evaluate libraries to run this notebook.

In [2]:
!pip install datasets evaluate transformers[sentencepiece]

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [3]:
tokenized_text = "This is an NLP course from HuggingFace".split()
print(tokenized_text)

['This', 'is', 'an', 'NLP', 'course', 'from', 'HuggingFace']


In [4]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
text = "This is an NLP course from HuggingFace"

In [6]:
tokenizer(text)

{'input_ids': [101, 1188, 1110, 1126, 21239, 2101, 1736, 1121, 20164, 10932, 2271, 7954, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [8]:
tokenizer(text)

{'input_ids': [101, 1188, 1110, 1126, 21239, 2101, 1736, 1121, 20164, 10932, 2271, 7954, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [9]:
tokenizer("Using a Transformer network is simple")

{'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
tokenizer.save_pretrained("my_tokenizer")

('my_tokenizer/tokenizer_config.json',
 'my_tokenizer/special_tokens_map.json',
 'my_tokenizer/vocab.txt',
 'my_tokenizer/added_tokens.json',
 'my_tokenizer/tokenizer.json')

In [11]:
!ls -lsh my_tokenizer/

total 876K
4.0K -rw-r--r-- 1 root root  125 Sep 23 13:34 special_tokens_map.json
4.0K -rw-r--r-- 1 root root  315 Sep 23 13:34 tokenizer_config.json
656K -rw-r--r-- 1 root root 654K Sep 23 13:34 tokenizer.json
212K -rw-r--r-- 1 root root 209K Sep 23 13:34 vocab.txt


In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
sequence = "Hello world, the world has changed after the release of ChatGPT"
tokens = tokenizer.tokenize(sequence)
print(tokens)

['Hello', 'world', ',', 'the', 'world', 'has', 'changed', 'after', 'the', 'release', 'of', 'Cha', '##t', '##GP', '##T']


In [13]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[8667, 1362, 117, 1103, 1362, 1144, 2014, 1170, 1103, 1836, 1104, 24705, 1204, 17095, 1942]


In [21]:
# Test yourself
my_sequences = [
    "I’ve been waiting for a HuggingFace course my whole life.",
    "I hate this so much!"
]
tokenized_seq = tokenizer.tokenize(my_sequences)
token_ids = tokenizer.convert_tokens_to_ids(tokenized_seq)
tokenizer.decode(token_ids)

'I ’ ve been waiting for a HuggingFace course my whole life. I hate this so much!'

In [22]:
decoded_string = tokenizer.decode([7993, 170, 11303, 1200, 2443, 1110, 3014])
print(decoded_string)

Using a transformer network is simple
