In [1]:
# Tokenizers translate text into data that the model can process,
# i.e. they convert text to numbers.
# Based on the Hugging Face NLP course: https://huggingface.co/learn/nlp-course/chapter2/4

In [2]:
# Word-based tokenization: every whitespace-separated word becomes one token.
# split() called without a separator splits on any run of whitespace.
raw_text = "Ford and Zaphod were cousins"
tokenized_text = raw_text.split()
print(tokenized_text)

['Ford', 'and', 'Zaphod', 'were', 'cousins']


In [3]:
from transformers import BertTokenizer

# Load the pretrained tokenizer that belongs to the cased BERT base checkpoint.
checkpoint = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Calling the tokenizer on a sentence returns the model inputs; leaving the call
# as the last expression displays them (input_ids, token_type_ids, attention_mask).
tokenizer("Ford and Zaphod were cousins")

{'input_ids': [101, 4100, 1105, 163, 25890, 5412, 1127, 14904, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [4]:
# Save the tokenizer files to a local folder.
# The target directory defaults to the original location but can be overridden
# through the TOKENIZER_DIR environment variable, so the notebook also runs on
# machines where /m2-data does not exist.
import os

TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "/m2-data/llm/tokenizer")

# save_pretrained returns the paths of the files it wrote; leaving the call as
# the last expression displays them as the cell output.
tokenizer.save_pretrained(TOKENIZER_DIR)

('/m2-data/llm/tokenizer/tokenizer_config.json',
 '/m2-data/llm/tokenizer/special_tokens_map.json',
 '/m2-data/llm/tokenizer/vocab.txt',
 '/m2-data/llm/tokenizer/added_tokens.json')

In [5]:
# Encoding = tokenize the text, then convert the tokens to numeric input IDs.
# This cell does the first step. The tokenizer works on subwords: it breaks
# words up into components the model knows (continuation pieces show up with
# a "##" prefix in the printed output).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

sequence = "Using a Transformer network is simple"
sequence2 = "Ford and Zaphod were cousins"

# Tokenize both sentences, then print the results in the same order.
tokens, tokens2 = (tokenizer.tokenize(text) for text in (sequence, sequence2))
print(tokens)
print(tokens2)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']
['Ford', 'and', 'Z', '##aph', '##od', 'were', 'cousins']


In [6]:
# Encoding, second step: convert the token strings of the first sentence into
# their integer IDs (one ID per token, as the printed list shows).
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [7]:
# Same conversion for the second sentence. The result matches the `input_ids`
# produced earlier by calling the tokenizer directly on this sentence, except
# for the leading 101 and trailing 102 that the direct call added
# (presumably BERT's [CLS]/[SEP] special tokens - TODO confirm).
ids2 = tokenizer.convert_tokens_to_ids(tokens2)
print(ids2)

[4100, 1105, 163, 25890, 5412, 1127, 14904]


In [10]:
# decoding - turning the input IDs back into text; the subword pieces are
# merged back into whole words, reproducing the original sentence
print(tokenizer.decode(ids2))

2024-07-23 18:00:25.245467: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-23 18:00:25.264578: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-23 18:00:25.270161: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-23 18:00:25.283746: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Ford and Zaphod were cousins
