# Tokenizers

In [2]:
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer

In [9]:
# Log in to Hugging Face

hf_token = userdata.get('HF_API_KEY')
if hf_token and hf_token.startswith("hf_"):
  print("HF key looks good so far")
else:
  print("HF key is not set - please click the key in the left sidebar")
login(hf_token, add_to_git_credential=True)

# Check Google Colab GPU

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)
  if gpu_info.find('Tesla T4') >= 0:
    print("Success - Connected to a T4")
  else:
    print("NOT CONNECTED TO A T4")

HF key looks good so far
/bin/bash: line 1: nvidia-smi: command not found
NOT CONNECTED TO A T4


In [17]:
# Load the tokenizer for the Llama 3.3 70B Instruct checkpoint from the Hugging Face Hub.
# trust_remote_code is deliberately not enabled: it opts in to executing Python
# code shipped in the model repository.
# NOTE(review): assumes this checkpoint loads without custom code — confirm.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.3-70B-Instruct')

In [18]:
# Sample sentence used to demonstrate tokenization.
text = "I am excited to show Tokenizers in action to my LLM engineers"
# Encode the sentence into a list of integer token ids.
tokens = tokenizer.encode(text)
# Bare last expression so the notebook displays the id list.
tokens

[128000,
 40,
 1097,
 12304,
 311,
 1501,
 9857,
 12509,
 304,
 1957,
 311,
 856,
 445,
 11237,
 25175]

In [19]:
# Compare three ways of measuring the same text: characters, words and tokens.
character_count = len(text)
# split() with no argument splits on any run of whitespace, so repeated spaces,
# tabs or newlines do not produce empty "words" the way split(' ') would.
word_count = len(text.split())
token_count = len(tokens)
print(f"There are {character_count} characters, {word_count} words and {token_count} tokens")

There are 61 characters, 12 words and 15 tokens


In [20]:
# Convert the token ids back into a single string. The displayed result shows
# the '<|begin_of_text|>' special token ahead of the original sentence.
tokenizer.decode(tokens)

'<|begin_of_text|>I am excited to show Tokenizers in action to my LLM engineers'

In [21]:
# Called with the flat list of ids: the displayed result holds one decoded
# string per token id, which makes token boundaries and leading spaces visible.
tokenizer.batch_decode(tokens)

['<|begin_of_text|>',
 'I',
 ' am',
 ' excited',
 ' to',
 ' show',
 ' Token',
 'izers',
 ' in',
 ' action',
 ' to',
 ' my',
 ' L',
 'LM',
 ' engineers']

In [None]:
# Display the tokenizer's vocabulary.
# NOTE(review): presumably a token-string -> id mapping with a very large number
# of entries — confirm, and consider displaying only a slice of it.
tokenizer.vocab

In [None]:
# Display what the tokenizer reports as its added vocabulary.
# NOTE(review): presumably the special tokens such as '<|begin_of_text|>' — confirm.
tokenizer.get_added_vocab()

# Instruct variants of models

In [27]:
# A two-turn conversation: a system instruction followed by the user's request.
system_message = {"role": "system", "content": "You are a helpful assistant"}
user_message = {"role": "user", "content": "Tell a light-hearted joke for a room of Data Scientists"}
messages = [system_message, user_message]

# Render the conversation as text (tokenize=False) using the tokenizer's chat
# template, with the generation prompt appended, then show the result.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke for a room of Data Scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>




# Trying new models

In [28]:
# Hugging Face Hub repository ids of the additional models whose tokenizers are
# loaded with AutoTokenizer.from_pretrained in the cells that follow.
PHI4 = "microsoft/Phi-4-mini-instruct"
DEEPSEEK = "deepseek-ai/DeepSeek-V3.1"
QWEN_CODER = "Qwen/Qwen2.5-Coder-7B-Instruct"

In [29]:
# Load the Phi 4 tokenizer so it can be compared against the Llama tokenizer.
phi4_tokenizer = AutoTokenizer.from_pretrained(PHI4)

text = "I am curiously excited to show Hugging Face Tokenizers in action to my LLM engineers"

# For each tokenizer: print its label, the token ids, then the per-token strings.
# After the loop, `tokens` holds the ids from the last tokenizer (Phi 4).
for model_label, model_tokenizer in (("Llama:", tokenizer), ("\nPhi 4:", phi4_tokenizer)):
  print(model_label)
  tokens = model_tokenizer.encode(text)
  print(tokens)
  print(model_tokenizer.batch_decode(tokens))


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/249 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/587 [00:00<?, ?B/s]

Llama:
[128000, 40, 1097, 2917, 13610, 12304, 311, 1501, 473, 36368, 19109, 9857, 12509, 304, 1957, 311, 856, 445, 11237, 25175]
['<|begin_of_text|>', 'I', ' am', ' cur', 'iously', ' excited', ' to', ' show', ' H', 'ugging', ' Face', ' Token', 'izers', ' in', ' action', ' to', ' my', ' L', 'LM', ' engineers']

Phi 4:
[40, 939, 4396, 23138, 15209, 316, 2356, 59116, 4512, 29049, 17951, 24223, 306, 3736, 316, 922, 451, 19641, 32437]
['I', ' am', ' cur', 'iously', ' excited', ' to', ' show', ' Hug', 'ging', ' Face', ' Token', 'izers', ' in', ' action', ' to', ' my', ' L', 'LM', ' engineers']


In [30]:
# Render the same conversation through each tokenizer's chat template so the
# prompt formats can be compared side by side.
for model_label, model_tokenizer in (("Llama:", tokenizer), ("\nPhi 4:", phi4_tokenizer)):
  print(model_label)
  print(model_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

Llama:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke for a room of Data Scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Phi 4:
<|system|>You are a helpful assistant<|end|><|user|>Tell a light-hearted joke for a room of Data Scientists<|end|><|assistant|>


In [31]:
# Load the DeepSeek tokenizer and encode the same sentence with all three tokenizers.
deepseek_tokenizer = AutoTokenizer.from_pretrained(DEEPSEEK)

text = "I am curiously excited to show Hugging Face Tokenizers in action to my LLM engineers"

# Print each tokenizer's ids in turn, with a blank line between consecutive results.
for position, model_tokenizer in enumerate((tokenizer, phi4_tokenizer, deepseek_tokenizer)):
  if position > 0:
    print()
  print(model_tokenizer.encode(text))

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

[128000, 40, 1097, 2917, 13610, 12304, 311, 1501, 473, 36368, 19109, 9857, 12509, 304, 1957, 311, 856, 445, 11237, 25175]

[40, 939, 4396, 23138, 15209, 316, 2356, 59116, 4512, 29049, 17951, 24223, 306, 3736, 316, 922, 451, 19641, 32437]

[0, 43, 1030, 108771, 15046, 304, 1801, 24133, 5426, 11906, 47948, 24524, 295, 4271, 304, 1026, 33792, 47, 26170]


In [32]:
# Render the same conversation with the chat template of each of the three
# tokenizers, printing a label before each rendered prompt.
labelled_tokenizers = (
  ("Llama:", tokenizer),
  ("\nPhi:", phi4_tokenizer),
  ("\nDeepSeek:", deepseek_tokenizer),
)
for model_label, model_tokenizer in labelled_tokenizers:
  print(model_label)
  print(model_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

Llama:
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell a light-hearted joke for a room of Data Scientists<|eot_id|><|start_header_id|>assistant<|end_header_id|>



Phi:
<|system|>You are a helpful assistant<|end|><|user|>Tell a light-hearted joke for a room of Data Scientists<|end|><|assistant|>

DeepSeek:
<｜begin▁of▁sentence｜>You are a helpful assistant<｜User｜>Tell a light-hearted joke for a room of Data Scientists<｜Assistant｜></think>


In [33]:
# Load the Qwen coder tokenizer and show how it splits a small Python snippet.
qwen_tokenizer = AutoTokenizer.from_pretrained(QWEN_CODER)
code = """
def hello_world(person):
  print("Hello", person)
"""
tokens = qwen_tokenizer.encode(code)
# One line per token, formatted as "<id>=<decoded text>".
for token in tokens:
  print(token, qwen_tokenizer.decode(token), sep="=")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

198=

750=def
23811= hello
31792=_world
29766=(person
982=):

220= 
1173= print
445=("
9707=Hello
497=",
1697= person
340=)

