<a href="https://colab.research.google.com/github/pelinbalci/LLM_Notebooks/blob/main/Tokenization_and_Embedding_Vectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction
This notebook demonstrates several ways to tokenize text and shows how to create embedding vectors with the `transformers` library.

In [4]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m97.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25h

# Tokenization

Tokenization is the process of breaking down a text or sequence of words into smaller units called tokens. Tokens are typically words, subwords, or characters, depending on the level of tokenization. Tokenization is a crucial preprocessing step before feeding text data into NLP models, as models operate on tokens rather than raw text.

In [2]:
# Sample sentence reused by the tokenizer examples that follow.
text = "I would like to tokenize this sentence."

In [5]:
from transformers import AutoTokenizer
# Load the tokenizer published with the EleutherAI/pythia-70m checkpoint.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")
# Calling the tokenizer on a string returns its token ids plus an attention mask.
tokenized_inputs = tokenizer(text)
# Bare last expression so the notebook displays the result.
tokenized_inputs

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

{'input_ids': [42, 651, 751, 281, 10669, 907, 436, 6197, 15], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

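As you can see, the sentence contains 7 words, but `input_ids` has 9 entries. The tokenizer works on subword units: the final period becomes its own token (id 15), and a word that is not in the vocabulary as a whole (here presumably "tokenize") is split into two pieces.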

In [None]:
# Uncomment these two lines to see the error
# tokenized_inputs = tokenizer(text, return_tensors="np", padding=True)
# tokenized_inputs

# Error:  Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)`
# or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

In [8]:
# return_tensors="np" returns NumPy arrays with a leading batch dimension instead of plain lists.
tokenized_inputs = tokenizer(text, return_tensors="np")
tokenized_inputs

{'input_ids': array([[   42,   651,   751,   281, 10669,   907,   436,  6197,    15]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
# return_tensors="pt" returns the same ids as PyTorch tensors.
tokenized_inputs = tokenizer(text, return_tensors="pt")
tokenized_inputs

{'input_ids': tensor([[   42,   651,   751,   281, 10669,   907,   436,  6197,    15]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [10]:
# This tokenizer has no padding token by default, so reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token
# With a single sentence there is nothing to pad against, so the ids come back unchanged.
tokenized_inputs = tokenizer(text, return_tensors="np", padding=True)
tokenized_inputs

{'input_ids': array([[   42,   651,   751,   281, 10669,   907,   436,  6197,    15]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [11]:
# Truncate from the left: tokens are dropped from the start, keeping the last `max_length` ids.
# The setting is stored on the tokenizer, so it also applies to later calls.
tokenizer.truncation_side = "left"
tokenized_inputs = tokenizer(text, return_tensors="np", padding=True, truncation=True, max_length=5)
tokenized_inputs

{'input_ids': array([[10669,   907,   436,  6197,    15]]), 'attention_mask': array([[1, 1, 1, 1, 1]])}

Let's apply it to a list of texts:

In [14]:
# Three inputs of different lengths, to show padding and truncation across a batch.
texts = ["I would like to tokenize this sentence.", "This is another one.", "One"]

In [15]:
# Batch call: texts longer than 5 tokens are truncated, shorter ones are padded with id 0
# and get attention_mask 0 at the padded positions.
tokenized_inputs = tokenizer(texts, return_tensors="np", padding=True, truncation=True, max_length=5)
tokenized_inputs

{'input_ids': array([[10669,   907,   436,  6197,    15],
       [ 1552,   310,  1529,   581,    15],
       [ 4041,     0,     0,     0,     0]]), 'attention_mask': array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 0, 0, 0, 0]])}

Apply it to a DataFrame and convert the result to the Hugging Face `datasets` format:

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-70m")

def tokenize_function(examples, max_length=2048):
    """Tokenize the first example of a batch, keeping at most `max_length` tokens.

    Meant to be used with `Dataset.map(..., batched=True, batch_size=1)`:
    only element ``[0]`` of each column is read, so any further examples in
    a larger batch are ignored.

    The text is built by concatenating the first matching pair of columns
    (question/answer, input/output, prompt/completion); if none of the pairs
    is present, the ``text`` column is used.

    Args:
        examples: Mapping from column name to the list of values in the batch.
        max_length: Maximum number of tokens to keep. Longer texts are
            truncated from the left, i.e. the end of the text is kept.

    Returns:
        The tokenizer output for the text, with values as NumPy arrays.

    Raises:
        KeyError: If no known column pair and no ``text`` column is present.
    """
    field_pairs = (
        ("question", "answer"),
        ("input", "output"),
        ("prompt", "completion"),  # our dataset
    )
    for first_key, second_key in field_pairs:
        if first_key in examples and second_key in examples:
            text = examples[first_key][0] + examples[second_key][0]
            break
    else:
        text = examples["text"][0]

    # Not needed by the unpadded call below, but ensures the shared tokenizer
    # has a pad token for any later padded call.
    tokenizer.pad_token = tokenizer.eos_token
    # Keep the end of over-long texts rather than the beginning.
    tokenizer.truncation_side = "left"

    # A single pass is enough: truncating at `max_length` leaves shorter texts
    # untouched, so there is no need to tokenize once just to measure the length.
    return tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length,
    )


# Requires the `datasets` package, plus an Excel reader such as `openpyxl` for pd.read_excel.
import datasets
import pandas as pd

# Load the raw examples from the spreadsheet.
df = pd.read_excel("data_30.xlsx")
# Define the output JSONL file name
filename = 'output.jsonl'

# Write one JSON object per row (JSON Lines) in a single call instead of looping over rows.
df.to_json(filename, orient="records", lines=True)

finetuning_dataset_loaded = datasets.load_dataset("json", data_files=filename, split="train")

# batch_size=1 because tokenize_function only reads the first example of each batch.
tokenized_dataset = finetuning_dataset_loaded.map(
    tokenize_function,
    batched=True,
    batch_size=1,
    drop_last_batch=True
)

# Use the input ids as the labels column as well.
tokenized_dataset = tokenized_dataset.add_column("labels", tokenized_dataset["input_ids"])

Use BertTokenizer

In [16]:
from transformers import BertTokenizer

# Load a pre-trained BERT tokenizer
# NOTE: this rebinds `tokenizer`, replacing the Pythia tokenizer used in the earlier cells.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize a sentence
# tokenize() returns the subword strings rather than ids. The text comes back lowercased,
# and a '##' prefix marks a piece that continues the previous token.
text = "Tokenization is important for NLP."
tokens = tokenizer.tokenize(text)

print(tokens)
# Output: ['token', '##ization', 'is', 'important', 'for', 'nl', '##p', '.']


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['token', '##ization', 'is', 'important', 'for', 'nl', '##p', '.']


In [18]:
# Calling the tokenizer (instead of .tokenize()) returns ids. There are two more entries than
# the 8 tokens printed above: 101 at the start and 102 at the end (BERT's [CLS] and [SEP] special tokens).
tokenized_inputs = tokenizer(text)
tokenized_inputs

{'input_ids': [101, 19204, 3989, 2003, 2590, 2005, 17953, 2361, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

What if we use AutoTokenizer for bert?

Below, you will see that these two classes give exactly the same output. There is no specific advantage of using BertTokenizer.

In [19]:
from transformers import AutoTokenizer
# AutoTokenizer resolves the appropriate tokenizer class for the given checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Same sentence as in the BertTokenizer cell, for a direct comparison of the ids.
tokenized_inputs = tokenizer(text)
tokenized_inputs

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': [101, 19204, 3989, 2003, 2590, 2005, 17953, 2361, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

# Embedding Vectors

Embedding vectors, also known as word embeddings or token embeddings, are dense numerical representations of tokens in a vector space. These vectors capture the semantic and syntactic meaning of words or subwords, allowing NLP models to work with continuous numerical data rather than discrete tokens. Word embeddings are typically learned during pre-training or obtained from pre-trained models. (ChatGPT)

In [24]:
from transformers import BertModel, BertTokenizer
import torch

# Instantiate the pre-trained BERT encoder and the tokenizer of the same checkpoint
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode one sentence as PyTorch tensors
text = "Tokenization is important for NLP."
inputs = tokenizer(text, return_tensors='pt')
print("Inputs: ", inputs, sep="\n")

# Forward pass through the encoder
outputs = model(**inputs)
print("Outputs:", outputs, sep="\n")

# last_hidden_state holds one embedding vector per input token
embeddings = outputs.last_hidden_state
print("embeddings: ", embeddings, sep="\n")



Inputs: 
{'input_ids': tensor([[  101, 19204,  3989,  2003,  2590,  2005, 17953,  2361,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Outputs:
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.2603, -0.5215, -0.4275,  ..., -0.3853, -0.1231,  0.7852],
         [-0.4855, -0.1703, -0.7399,  ..., -0.0307,  0.0453,  0.8642],
         [-0.4281, -0.4596, -1.0244,  ..., -0.5987, -0.3673,  0.6417],
         ...,
         [-0.5963, -0.3015, -0.0608,  ..., -0.2188, -0.9433,  0.3536],
         [ 0.6983, -0.2541, -0.6408,  ...,  0.3179, -0.7106, -0.2647],
         [ 0.5300, -0.1785, -0.3590,  ...,  0.4909, -0.9151, -0.2356]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.8598, -0.4651, -0.9185,  0.5748,  0.7357, -0.3219,  0.6282,  0.3219,
         -0.8381, -1.0000, -0.4022,  0.6677,  0.9573,  0.5144,  0.7529, -0.6316,
         -0.0315, -0.4661,  0.3756, -0.11

In [22]:
# One 768-dimensional vector per input id: 10 ids, including the leading 101 and trailing 102.
print(embeddings.shape)
# Output: torch.Size([1, 10, 768])

# Take the first token's vector (batch item 0, position 0) and detach it from the
# autograd graph so it can be converted to a NumPy array for inspection.
first_token_embedding = embeddings[0][0].detach().numpy()

print(first_token_embedding)
# Output: A 768-dimensional array representing the first token's embedding.

torch.Size([1, 10, 768])
[-2.60313600e-01 -5.21473527e-01 -4.27503765e-01 -1.41868681e-01
 -5.00244915e-01 -3.32750469e-01 -4.67772745e-02  3.80777344e-02
  1.21736526e-01 -3.43897939e-01 -4.24083322e-01  2.30404753e-02
 -4.74672019e-01 -5.85825890e-02 -7.62139335e-02 -3.25774134e-04
 -1.14794020e-02  4.79494840e-01  5.80197526e-03 -2.54686564e-01
 -2.11137116e-01 -1.19926848e-01 -1.96198866e-01 -7.40247443e-02
 -5.69903068e-02 -2.44257063e-01  9.69227999e-02 -3.41107339e-01
  4.24124569e-01  4.56092805e-02 -6.62460625e-01  5.61993003e-01
  1.23753197e-01 -2.07318410e-01  7.13763654e-01 -1.45127878e-01
  1.46036401e-01 -2.05111772e-01  5.26602805e-01  2.41677612e-01
 -4.37333077e-01 -2.51780689e-01  1.64993137e-01  1.07946761e-01
 -4.80694413e-01 -2.79489368e-01 -3.33033085e+00 -4.56374623e-02
 -5.38615406e-01 -7.11098194e-01 -3.30806524e-01 -9.77438539e-02
 -1.16210885e-01  9.26752031e-01  4.34221447e-01  3.11039537e-01
 -1.88756347e-01  2.74082124e-01  8.18452001e-01  1.54637560e-01


Sentence transformers

In [27]:
!pip install -q sentence_transformers

In [28]:
from sentence_transformers import SentenceTransformer
# NOTE(review): 'bert-base-uncased' is a plain BERT checkpoint rather than a model published for
# sentence embeddings; presumably the library adds a default pooling layer on top — confirm.
# This rebinds `model`, replacing the BertModel loaded in the earlier cell.
model = SentenceTransformer('bert-base-uncased')
# `text` is the sentence defined in the BERT cells above; encode() returns a single vector for it.
sentence_embeddings = model.encode(text)

Downloading (…)40076/.gitattributes:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading (…)eb64c8a40076/LICENSE:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

Downloading (…)64c8a40076/README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading (…)c8a40076/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)CoreML/model.mlmodel:   0%|          | 0.00/165k [00:00<?, ?B/s]

Downloading weight.bin:   0%|          | 0.00/532M [00:00<?, ?B/s]

Downloading (…)ackage/Manifest.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

Downloading model.onnx:   0%|          | 0.00/532M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)40076/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)64c8a40076/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [30]:
# A single 768-dimensional vector for the whole sentence, rather than one vector per token.
sentence_embeddings.shape

(768,)