## Token Embeddings

In [None]:
from torch import tensor

# Four example token IDs; they are passed to embedding_layer further down to fetch their vectors
input_ids=tensor([2, 3, 5, 1])

In [None]:
import torch

# Size of the toy vocabulary, i.e. how many distinct token IDs exist
vocab_size=6
# Length of the vector that represents each token
output_dim=3

# Fix the random seed so the randomly initialised weights are reproducible
torch.manual_seed(123)

# Lookup table holding one row of output_dim values per token ID
embedding_layer=torch.nn.Embedding(
  num_embeddings=vocab_size,
  embedding_dim=output_dim,
)

In [None]:
# Show the weight matrix: one row per token ID (vocab_size rows, output_dim columns)
print(embedding_layer.weight)

In [None]:
# Look up the single token ID 3; the result is row 3 (zero-based) of the weight matrix
print(embedding_layer(tensor([3])))

In [None]:
# Look up every ID in input_ids at once; the output has one embedding row per token ID
print(embedding_layer(input_ids))

### Positional Embeddings - Encoding Word Positions

In [None]:
# Realistic sizes: 50257 is the vocabulary size of the GPT-2 BPE tokenizer
# loaded later via tiktoken; each token is represented by a 256-value vector
vocab_size=50257
output_dim=256

# Lookup table with one output_dim-sized row per token ID
token_embedding_layer=torch.nn.Embedding(
  num_embeddings=vocab_size,
  embedding_dim=output_dim,
)

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch import tensor

class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-ID chunks.

  The text is tokenized once. A window of ``max_length`` tokens is then moved
  over the token IDs in steps of ``stride``; every input chunk is paired with
  a target chunk that is the same window shifted one token to the right.
  If the text has ``max_length`` tokens or fewer, the dataset is empty.

  Args:
    txt: Text to tokenize.
    tokenizer: Object whose ``encode(text, allowed_special=...)`` method
      returns a list of integer token IDs.
    max_length: Number of tokens per chunk; must be at least 1.
    stride: Distance between the starts of consecutive chunks; must be at least 1.

  Raises:
    ValueError: If ``max_length`` or ``stride`` is smaller than 1.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    # A non-positive max_length would silently build empty (float) tensors and a
    # negative stride would silently build an empty dataset, so fail early instead
    if max_length < 1:
      raise ValueError(f"max_length must be at least 1, got {max_length}")
    if stride < 1:
      raise ValueError(f"stride must be at least 1, got {stride}")

    self.input_ids=[]
    self.target_ids=[]

    # Tokenize the entire text
    token_ids=tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # Slide the window over the token IDs; stopping at len(token_ids)-max_length
    # guarantees the shifted target chunk still contains max_length tokens
    for start in range(0, len(token_ids)-max_length, stride):
      end=start+max_length
      self.input_ids.append(tensor(token_ids[start:end]))
      self.target_ids.append(tensor(token_ids[start+1:end+1]))

  def __len__(self):
    """Return the number of (input, target) chunk pairs."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input, target) tensor pair stored at position ``idx``."""
    return self.input_ids[idx], self.target_ids[idx]

In [None]:
import tiktoken

def create_data_loader_v1(txt, batch_size=4, max_length=256,
                          stride=128, shuffle=True, drop_last=True,
                          num_workers=0):
  """Build a DataLoader of sliding-window (input, target) chunks from raw text.

  The text is encoded with tiktoken's "gpt2" encoding and wrapped in a
  GPTDatasetV1 built with ``max_length`` and ``stride``. The remaining
  arguments (``batch_size``, ``shuffle``, ``drop_last``, ``num_workers``)
  are handed to the DataLoader unchanged.
  """
  gpt2_encoding=tiktoken.get_encoding("gpt2")

  windowed_dataset=GPTDatasetV1(txt, tokenizer=gpt2_encoding,
                                max_length=max_length, stride=stride)

  loader_options=dict(batch_size=batch_size, shuffle=shuffle,
                      drop_last=drop_last, num_workers=num_workers)
  return DataLoader(windowed_dataset, **loader_options)

In [None]:
# Read the whole file into memory; the "utf-8-sig" codec drops a leading BOM if the file has one
with open("verdict.txt", 'r', encoding='utf-8-sig') as f:
  raw_text=f.read()

In [None]:
# Number of tokens per sample for this demo
max_length=4

# stride equals max_length, so consecutive input windows do not overlap;
# shuffle=False keeps the samples in text order
dataloader=create_data_loader_v1(
  raw_text, batch_size=8, max_length=max_length,
  stride=max_length, shuffle=False
)

data_iter=iter(dataloader)
# First batch: 8 samples of 4 token IDs each; targets are the inputs shifted one token to the right
inputs, targets=next(data_iter)

In [None]:
# inputs still holds token IDs (one row per sample, one column per position);
# the 256-dimensional vectors are produced afterwards by token_embedding_layer
print(f"Inputs: {inputs}")
print(f"Input size: {inputs.shape}")

In [None]:
# Map every token ID to its 256-dimensional vector; expected shape is (8, 4, 256)
token_embeddings=token_embedding_layer(inputs)
print(token_embeddings.shape)

In [None]:
# One position per token in a window, so the position table needs context_length rows
context_length=max_length

# Same width (output_dim) as the token embeddings so the two can be added element-wise
pos_embedding_layer=torch.nn.Embedding(context_length, output_dim)

- We need to add one position vector to each of these 4 token embeddings.

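- The same 4 position embeddings are reused for every input sample in the batch, because a position embedding depends only on a token's position (0 to 3) within the sample, not on which sample it belongs to.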

- So we have to generate 4 positional embedding vectors from the positional embedding matrix.

In [None]:
# torch.arange(max_length) gives the position indices 0..max_length-1;
# looking them up yields one vector per position (max_length rows, output_dim columns)
pos_embeddings=pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

In [None]:
# Broadcasting: the (4, 256) position embeddings are added to each of the
# 8 samples in the (8, 4, 256) token embeddings
input_embeddings=token_embeddings+pos_embeddings
print(input_embeddings.shape)

In [None]:
# Display the combined token + position embeddings
input_embeddings

### Recap of Data Pre-Processing Pipeline - Stage 1

The 4 steps involved are:

1. `Tokenization` - Converting input text to individual tokens and then to respective token IDs.
   - Word based tokenization
   - Subword based tokenization (BPE tokenizer)
   - Character based tokenization

2. `Token Embeddings` - Converting token IDs to vectors.

3. `Position Embeddings` - Encoding information about position.

4. `Input Embeddings` - Adding the token embeddings and position embeddings; the result is given as input for LLM training.