### **Tutorial 08: Tokenizer**

This tutorial explains the concept of a tokenizer and demonstrates how to implement and use basic tokenizers in Python, starting with a character-level tokenizer. A tokenizer is a tool that converts text into tokens (smaller units, such as words or characters) and, in some cases, converts those tokens back into text.

In this tutorial, we build:
1. A character-level tokenizer that works at character granularity, encoding each character as a unique token ID and decoding token IDs back to text.
2. A word-level tokenizer that works at word granularity, encoding each word as a unique token ID and decoding token IDs back to text.
3. An n-gram tokenizer that generates overlapping n-grams (contiguous runs of n tokens) from a token sequence, a fundamental concept in language modeling.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

In [15]:
import torch
import numpy as np
class CharTokenizer:
    """Character-level tokenizer: one integer token ID per distinct character.

    Attributes:
        token_id_for_char: dict mapping each character to its token ID.
        char_for_token_id: dict mapping each token ID back to its character.
    """

    def __init__(self, vocabulary):
        """Build both lookup tables from an ordered vocabulary.

        Args:
            vocabulary: Iterable of characters. A character's position in the
                iteration order becomes its token ID.
        """
        # Materialize once: the vocabulary is walked twice, and a one-shot
        # iterator (e.g. a generator) would be exhausted after the first pass.
        vocabulary = list(vocabulary)
        self.token_id_for_char = {char: token_id for token_id, char in enumerate(vocabulary)}
        self.char_for_token_id = dict(enumerate(vocabulary))

    @staticmethod
    def train_from_text(text):
        """Create a tokenizer whose vocabulary is the sorted set of characters in `text`."""
        # set() removes duplicates; sorting makes the ID assignment deterministic.
        return CharTokenizer(sorted(set(text)))

    def encode(self, text):
        """Convert a string into a 1-D `torch.long` tensor of token IDs.

        Raises:
            KeyError: If `text` contains a character that is not in the vocabulary.
        """
        try:
            token_ids = [self.token_id_for_char[char] for char in text]
        except KeyError as err:
            # Same exception type as a plain dict lookup, but with a message
            # that says what went wrong.
            raise KeyError(
                f"character {err.args[0]!r} is not in the tokenizer vocabulary"
            ) from None
        return torch.tensor(token_ids, dtype=torch.long)

    def decode(self, token_ids):
        """Convert token IDs back into a string.

        Args:
            token_ids: A tensor/array exposing `.tolist()` (1-D or 0-d), or any
                plain iterable of integer token IDs.

        Raises:
            KeyError: If an ID is not in the vocabulary.
        """
        ids = token_ids.tolist() if hasattr(token_ids, "tolist") else list(token_ids)
        if not isinstance(ids, list):
            # A 0-d tensor's tolist() yields a bare scalar rather than a list.
            ids = [ids]
        return ''.join(self.char_for_token_id[token_id] for token_id in ids)

    def vocabulary_size(self):
        """Return the number of distinct characters in the vocabulary."""
        return len(self.token_id_for_char)

In [16]:
class WordTokenizer:
    """Word-level tokenizer: one integer token ID per distinct whitespace-separated word.

    Words are obtained with `str.split()`, so the original spacing is not
    preserved: `decode` joins words with a single space.

    Attributes:
        token_id_for_word: dict mapping each word to its token ID.
        word_for_token_id: dict mapping each token ID back to its word.
    """

    def __init__(self, vocabulary):
        """Build both lookup tables from an ordered vocabulary.

        Args:
            vocabulary: Iterable of words. A word's position in the iteration
                order becomes its token ID.
        """
        # Materialize once: the vocabulary is walked twice, and a one-shot
        # iterator (e.g. a generator) would be exhausted after the first pass.
        vocabulary = list(vocabulary)
        self.token_id_for_word = {word: token_id for token_id, word in enumerate(vocabulary)}
        self.word_for_token_id = dict(enumerate(vocabulary))

    @staticmethod
    def train_from_text(text):
        """Create a tokenizer from the unique words in the given text."""
        # set() removes duplicates; sorting makes the ID assignment deterministic.
        return WordTokenizer(sorted(set(text.split())))

    def encode(self, text):
        """Convert a string of text into a 1-D `torch.long` tensor of token IDs.

        Raises:
            KeyError: If `text` contains a word that is not in the vocabulary.
        """
        try:
            token_ids = [self.token_id_for_word[word] for word in text.split()]
        except KeyError as err:
            # Same exception type as a plain dict lookup, but with a message
            # that says what went wrong.
            raise KeyError(
                f"word {err.args[0]!r} is not in the tokenizer vocabulary"
            ) from None
        return torch.tensor(token_ids, dtype=torch.long)

    def decode(self, token_ids):
        """Convert token IDs back into a space-separated string.

        Args:
            token_ids: A tensor/array exposing `.tolist()` (1-D or 0-d), or any
                plain iterable of integer token IDs.

        Raises:
            KeyError: If an ID is not in the vocabulary.
        """
        ids = token_ids.tolist() if hasattr(token_ids, "tolist") else list(token_ids)
        if not isinstance(ids, list):
            # A 0-d tensor's tolist() yields a bare scalar rather than a list.
            ids = [ids]
        return ' '.join(self.word_for_token_id[token_id] for token_id in ids)

    def vocabulary_size(self):
        """Return the size of the tokenizer's vocabulary."""
        return len(self.token_id_for_word)

In [17]:
class NGramsTokenizer:
    """Produce overlapping n-grams (sliding windows of length n) from a token sequence."""

    def __init__(self, n):
        """Store the window length.

        Args:
            n: Number of tokens per n-gram; must be at least 1.

        Raises:
            ValueError: If `n` is less than 1.
        """
        # A non-positive n would silently produce empty or nonsensical slices.
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        self.n = n

    def generate_ngrams(self, tokens):
        """Generate n-grams from a sequence of tokens.

        Args:
            tokens: Any object supporting `len()` and slicing (list, str,
                1-D tensor, ...).

        Returns:
            A list of slices `tokens[i:i+n]`, one per start position, in order.
            The list is empty when `tokens` is shorter than n.
        """
        window = self.n
        window_count = len(tokens) - window + 1  # <= 0 when tokens is too short
        return [tokens[start:start + window] for start in range(window_count)]


In [20]:
def is_extraction_successful(text):
    """Return True only when `text` is a string containing non-whitespace characters."""
    if not isinstance(text, str):
        return False
    return bool(text.strip())

In [None]:
from utils import text_helper

url = "https://medium.com/letters-to-my-younger-self-embracing-emotions-and/bridging-theory-and-practice-1b277456400d"
text = text_helper.extract_medium_post_content(url)
if not is_extraction_successful(text):
    # Stop here: training a tokenizer on missing or empty text would only fail
    # further down with a less helpful error.
    raise RuntimeError("Failed to extract valid content from the URL.")
print("Content successfully extracted.")

# Train once on the article. `tokenizer` and `text` are both reused by the
# dataset cell further down, so neither name is rebound in this cell.
tokenizer = CharTokenizer.train_from_text(text)
char_tokenizer = tokenizer

# NOTE: encode() raises KeyError if the article lacks any character of this string.
encoded = char_tokenizer.encode("Hello world")
print(encoded)

ngrams_tokenizer = NGramsTokenizer(n=3)
ngrams = ngrams_tokenizer.generate_ngrams(encoded)
print("Character-level n-grams:", ngrams)

# Use a separate variable for the word-level sample so the article held in
# `text` is not overwritten.
sample_text = "the quick brown fox"
word_tokenizer = WordTokenizer.train_from_text(sample_text)
encoded = word_tokenizer.encode(sample_text)
print(encoded)

ngrams_tokenizer = NGramsTokenizer(n=2)
ngrams = ngrams_tokenizer.generate_ngrams(encoded)
print("Word-level n-grams:", ngrams)



In [8]:
from torch.utils.data import DataLoader, RandomSampler
from utils import sequence_pairs_dataset
# Encode whatever `text` currently holds with the character-level `tokenizer`.
# NOTE(review): `text` should be the full article here; a short sample string
# is unlikely to fill even one 64-token block -- confirm against the earlier cell.
tokenized_text = tokenizer.encode(text)
dataset = sequence_pairs_dataset.SequencePairsDataset(tokenized_text, block_size=64)
print(len(dataset))

# Draw random batches of two items, sampling dataset indices with replacement.
sampler = RandomSampler(dataset, replacement=True)
dataloader = DataLoader(dataset, batch_size=2, sampler=sampler)
x, y = next(iter(dataloader))  # presumably (input, target) sequences -- verify against SequencePairsDataset

# Print both decodes explicitly: a notebook only displays the value of a
# cell's last bare expression, so the first one would otherwise be lost.
print(tokenizer.decode(x[0]))
print(tokenizer.decode(y[0]))