## Tokenizer

A tokenizer translates between text and tokens with its `encode()` and `decode()` methods. It contains a vocabulary that can include special tokens to signal information to the model (for example, the start of a prompt). It can also include a chat template that knows how to format chat messages for the model.

In [None]:
## imports
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer

### logging into huggingface

In [None]:
# Read the Hugging Face access token stored under the name HF_TOKEN in Colab's user data,
# then authenticate this session with it.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

### Accessing Llama 3.1 from Meta

1. Go to https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct
2. Accept the terms of service while signed in to the Hugging Face account that the token used above belongs to.

In [None]:
# Load the tokenizer for the Llama 3.1 8B Instruct checkpoint.
# trust_remote_code is left at its default (False): this tokenizer is implemented inside
# transformers, so there is no need to allow code from the Hub repository to run locally.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B-Instruct')

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Sample sentence that the following cells encode and decode.
text = "I am excited to show tokenizers in action to the LLM Engineers"
# encode() turns the string into a list of integer token ids.
tokens = tokenizer.encode(text)
# Bare last expression so the notebook displays the id list.
tokens

[128000,
 40,
 1097,
 12304,
 311,
 1501,
 4037,
 12509,
 304,
 1957,
 311,
 279,
 445,
 11237,
 49796]

In [None]:
# Number of token ids produced for the sentence (the count includes the leading 128000 id).
len(tokens)

15

In [None]:
# Number of characters in the same sentence, for comparison with the token count.
len(text)

62

In [None]:
# Convert the whole id list back into a single string.
tokenizer.decode(tokens)

'<|begin_of_text|>I am excited to show tokenizers in action to the LLM Engineers'

In [None]:
# Given the flat id list, batch_decode returns one string per id, which shows how the
# sentence was split into individual tokens.
tokenizer.batch_decode(tokens)

['<|begin_of_text|>',
 'I',
 ' am',
 ' excited',
 ' to',
 ' show',
 ' token',
 'izers',
 ' in',
 ' action',
 ' to',
 ' the',
 ' L',
 'LM',
 ' Engineers']

In [None]:
# Mapping from token string to integer id. Displaying it prints the entire vocabulary,
# so the output of this cell is very long.
tokenizer.vocab

{'Tester': 59799,
 'ĠiÃ§': 27615,
 'Ø¯Ø§Ø±ÛĮ': 113549,
 '442': 20502,
 'ĠJoe': 13142,
 '471': 20617,
 '.Foundation': 76837,
 'Ġforbidden': 37913,
 'ĠBYU': 93454,
 'Ġpeuvent': 56311,
 'Ġgoggles': 95090,
 'à¸±à¸ĩ': 100535,
 'nama': 18932,
 "='/": 22007,
 'izaciÃ³n': 42600,
 'âĢł': 84362,
 'à¸±à¸ļà¸£': 111567,
 '_pointer': 22140,
 'ĠOptim': 31197,
 'upyter': 73952,
 'xt': 2302,
 'smith': 34117,
 'Ã¡vka': 123765,
 '_look': 25257,
 'ĠnÆ°á»Ľng': 122843,
 'ourcing': 43662,
 'Ġang': 6590,
 'Ġblock': 2565,
 'ĠÚ©Ø±Ø¯ÙĨ': 105372,
 '<size': 31223,
 'Ġvoluntarily': 54523,
 ')")Ċ': 19652,
 'Ġä½į': 117602,
 'ĠGonzalez': 52186,
 'ĠCollapse': 76918,
 'à¥įà¤¡à¤²': 123241,
 'duÄŁunu': 118668,
 'ĠPropertyChangedEventArgs': 93216,
 'æ£®': 105466,
 'ÑģÐºÐ¾Ð¹': 101469,
 'DIC': 94261,
 'ĠkullanÄ±l': 111514,
 'norm': 20609,
 'Ġjustices': 71712,
 'çĮ«': 108429,
 'Ġsurprise': 13051,
 'ĠEconomy': 38661,
 'ulner': 59501,
 'Fake': 53417,
 'rubu': 119277,
 '.weights': 58718,
 'endez': 73602,
 'ĠFootball': 21424,
 'è

In [None]:
## special vocab
# Show the added tokens (for example <|begin_of_text|> and <|eot_id|>) together with their ids.
tokenizer.get_added_vocab()

{'<|begin_of_text|>': 128000,
 '<|end_of_text|>': 128001,
 '<|reserved_special_token_0|>': 128002,
 '<|reserved_special_token_1|>': 128003,
 '<|finetune_right_pad_id|>': 128004,
 '<|reserved_special_token_2|>': 128005,
 '<|start_header_id|>': 128006,
 '<|end_header_id|>': 128007,
 '<|eom_id|>': 128008,
 '<|eot_id|>': 128009,
 '<|python_tag|>': 128010,
 '<|reserved_special_token_3|>': 128011,
 '<|reserved_special_token_4|>': 128012,
 '<|reserved_special_token_5|>': 128013,
 '<|reserved_special_token_6|>': 128014,
 '<|reserved_special_token_7|>': 128015,
 '<|reserved_special_token_8|>': 128016,
 '<|reserved_special_token_9|>': 128017,
 '<|reserved_special_token_10|>': 128018,
 '<|reserved_special_token_11|>': 128019,
 '<|reserved_special_token_12|>': 128020,
 '<|reserved_special_token_13|>': 128021,
 '<|reserved_special_token_14|>': 128022,
 '<|reserved_special_token_15|>': 128023,
 '<|reserved_special_token_16|>': 128024,
 '<|reserved_special_token_17|>': 128025,
 '<|reserved_special_to

## Instruct Variants of Model

Several models are specifically trained to carry out chats with users. Typically they are denoted with `Instruct` at the end of the model name, as in `Llama-3.1-8B-Instruct`.

In [None]:
# Load the tokenizer of the instruct checkpoint; its chat template is used in the next cell.
# trust_remote_code is left at its default (False): this tokenizer is implemented inside
# transformers, so there is no need to allow code from the Hub repository to run locally.
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B-Instruct')

In [None]:
# Conversation expressed as a list of role/content dictionaries.
messages = [
    {"role": "system", "content": "You are a very helpful assistant"},
    {"role": "user", "content": "Tell me a pick-up line as I want to put that into a dating site as a data scientist."},
]

# Render the conversation with the tokenizer's chat template. tokenize=False returns the
# formatted text instead of ids; add_generation_prompt=True appends the header that opens
# the assistant's turn.
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a very helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell me a pick-up line as I want to put that into a dating site as a data scientist.<|eot_id|><|start_header_id|>assistant<|end_header_id|>




## trying new models

In [None]:
# Hugging Face Hub model ids of the additional tokenizers loaded in the cells below.
PHI3_MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
QWEN2_MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
STARCODER2_MODEL_NAME = "bigcode/starcoder2-3b"

In [None]:
# Sample sentence used to compare how each tokenizer splits the same text.
text = "I am excited to show tokenizers in action to the LLM Engineers"

In [None]:
# Load the Phi-3 tokenizer so its ids can be compared with Llama's for the same sentence.
phi3_tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_NAME)

# Llama ids first, then a blank line, then the Phi-3 ids.
print(tokenizer.encode(text), phi3_tokenizer.encode(text), sep="\n\n")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

[128000, 40, 1097, 12304, 311, 1501, 4037, 12509, 304, 1957, 311, 279, 445, 11237, 49796]

[306, 626, 24173, 304, 1510, 5993, 19427, 297, 3158, 304, 278, 365, 26369, 10863, 414]


In [None]:
# Re-encode the sentence with the Phi-3 tokenizer (this overwrites the earlier `tokens`).
tokens = phi3_tokenizer.encode(text)
# Decode each id separately to see the individual pieces.
print(phi3_tokenizer.batch_decode(tokens))

## Unlike the Llama ids, which begin with 128000 (<|begin_of_text|>), the Phi-3 encoding
## shown here has no start-of-sequence token prepended.

['I', 'am', 'excited', 'to', 'show', 'token', 'izers', 'in', 'action', 'to', 'the', 'L', 'LM', 'Engine', 'ers']


In [None]:
## chat template comparison - llama and phi

# Render the same conversation with each tokenizer's chat template, then print the two
# results with a dashed line between them.
rendered_prompts = [
    tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    for tok in (tokenizer, phi3_tokenizer)
]
print(*rendered_prompts, sep="\n-----------\n")

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a very helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell me a pick-up line as I want to put that into a dating site as a data scientist.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


-----------
<|system|>
You are a very helpful assistant<|end|>
<|user|>
Tell me a pick-up line as I want to put that into a dating site as a data scientist.<|end|>
<|assistant|>



In [None]:
## similar comparison with llama and qwen

# Load the Qwen2 tokenizer and print both tokenizers' ids for the same sentence.
qwen2_tokenizer = AutoTokenizer.from_pretrained(QWEN2_MODEL_NAME)
print("LLama3.1-8b-instruct tokenizer")
print(tokenizer.encode(text))
print("QWEN2 tokenization")
print(qwen2_tokenizer.encode(text))

LLama3.1-8b-instruct tokenizer
[128000, 40, 1097, 12304, 311, 1501, 4037, 12509, 304, 1957, 311, 279, 445, 11237, 49796]
QWEN2 tokenization
[40, 1079, 12035, 311, 1473, 3950, 12230, 304, 1917, 311, 279, 444, 10994, 48696]


In [None]:
## apply to chat template

# Each entry pairs the heading to print with the tokenizer whose template is rendered.
labelled_tokenizers = [
    ("Llama3.1's structure", tokenizer),
    ("Phi3's structure", phi3_tokenizer),
    ("QWEN2's structure", qwen2_tokenizer),
]

for position, (label, tok) in enumerate(labelled_tokenizers):
    # The dashed separator goes between sections only, not before the first one.
    if position > 0:
        print("-----------")
    print(label)
    print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))

Llama3.1's structure
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a very helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>

Tell me a pick-up line as I want to put that into a dating site as a data scientist.<|eot_id|><|start_header_id|>assistant<|end_header_id|>


-----------
Phi3's structure
<|system|>
You are a very helpful assistant<|end|>
<|user|>
Tell me a pick-up line as I want to put that into a dating site as a data scientist.<|end|>
<|assistant|>

-----------
QWEN2's structure
<|im_start|>system
You are a very helpful assistant<|im_end|>
<|im_start|>user
Tell me a pick-up line as I want to put that into a dating site as a data scientist.<|im_end|>
<|im_start|>assistant



In [None]:
# Load the StarCoder2 tokenizer and apply it to a small Python snippet instead of English text.
# trust_remote_code is left at its default (False): loading this tokenizer does not need
# code from the Hub repository to be executed locally.
starcoder2_tokenizer = AutoTokenizer.from_pretrained(STARCODER2_MODEL_NAME)

code = """
def print_hello(person):
  return f"Hello {person}"
"""

# Encode the snippet, then print every id next to the text it decodes to.
tokens = starcoder2_tokenizer.encode(code)
for token in tokens:
    print(f"{token} = {starcoder2_tokenizer.decode(token)}")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/958 [00:00<?, ?B/s]

222 = 

610 = def
1489 =  print
100 = _
7670 = hello
45 = (
6427 = person
731 = ):
353 = 
 
461 =  return
315 =  f
39 = "
8302 = Hello
320 =  {
6427 = person
3864 = }"
222 = 

