# Understanding tokenization and embeddings 

In [24]:
# LIBRARIES IMPORT
from typing import List
from urllib.parse import quote

import tiktoken
from openai import OpenAI
from transformers import AutoTokenizer

In [25]:
# Embedding model used by default for every embedding request in this notebook.
EMBEDDING_MODEL = "text-embedding-3-large"
# OpenAI API client. No key is passed explicitly, so credentials are presumably
# picked up from the environment (e.g. OPENAI_API_KEY) -- verify before running.
client = OpenAI()

Tokenization is the process of splitting text into smaller units called tokens. In the context of NLP, tokens can be words, subwords, or characters (and, at a coarser granularity, phrases or sentences). Tokenization is a crucial step in NLP because it breaks the text down into smaller units that can be used for further analysis.

First step : Define the text to be tokenized.

In [26]:
# Raw question string; it is the input to both tokenizers used further down.
question_chars = "What is DNA ?"
print(f"question : {question_chars}")

question : What is DNA ?


Second step : Tokenize the text using the tokenizer.

In [27]:
# Load the tokenizer associated with the "bert-base-uncased" checkpoint.
t = AutoTokenizer.from_pretrained("bert-base-uncased")
# `tokenize` returns the token strings themselves (not integer ids).
question_tokens = t.tokenize(question_chars)
print(f"Splitting the question into tokens : {question_tokens}")

Splitting the question into tokens : ['what', 'is', 'dna', '?']


In [28]:
# Tokenize the same question again, this time with tiktoken's "cl100k_base" encoding.
encoding = tiktoken.get_encoding("cl100k_base")
# `encode` returns integer token ids rather than token strings.
question_enc = encoding.encode(question_chars)
print(f"Number of tokens : {len(question_enc)}")
print(f"Tokens id : {question_enc}")

Number of tokens : 4
Tokens id : [3923, 374, 15922, 949]


Third step : Get the embeddings for each token.

In [29]:
def get_embeddings_raw(text: str, model: str = EMBEDDING_MODEL) -> List[float]:
    """Get the raw embedding vector of a text from OpenAI.

    Args:
        text: The string to embed. It is sent to the API as a
            single-element batch.
        model: Name of the OpenAI embedding model to query.

    Returns:
        The embedding of ``text`` as a list of floats.
    """
    response = client.embeddings.create(input=[text], model=model)
    # Exactly one input was sent, so the only result sits at index 0.
    return response.data[0].embedding

In [30]:
# Embed every token string on its own and dump each vector to its own text file.
for token in question_tokens:
    print(f"Token : {token}")
    token_emb = get_embeddings_raw(token)
    print(f"Token embedding shape : {len(token_emb)}")
    # A token can contain characters that are invalid or unsafe in a file name
    # ('?' is rejected on Windows, '/' would be read as a directory separator),
    # so percent-encode everything except letters, digits and "_.-~".
    emb_path = f"{quote(token, safe='')}_emb.txt"
    # save the embeddings in a text file
    with open(emb_path, "w", encoding="utf-8") as f:
        f.write(f"{token_emb}\n")
    print(f"Token embedding saved in {emb_path}")


Token : what
Token embedding shape : 3072
Token embedding saved in what_emb.txt
Token : is
Token embedding shape : 3072
Token embedding saved in is_emb.txt
Token : dna
Token embedding shape : 3072
Token embedding saved in dna_emb.txt
Token : ?
Token embedding shape : 3072
Token embedding saved in ?_emb.txt
