# Tokenization

### Text standardization

In [1]:
import re
import unicodedata


def standardize_text(text: str) -> str:
    """Return a lowercase, ASCII-only, punctuation-free version of *text*.

    The steps are applied in this order:

    1. NFKD-decompose the text and drop every code point that is not ASCII,
       so accented letters keep their base letter ("ã" -> "a").
    2. Lowercase the result.
    3. Delete every character that is not a letter, a digit or whitespace
       (underscores are deleted too).
    4. Collapse each run of whitespace to one space and trim both ends.

    Args:
        text: The raw input string.

    Returns:
        The standardized string; an empty string if nothing survives.
    """
    # Fold to ASCII *before* lowercasing: compatibility decomposition can
    # itself yield uppercase letters (e.g. the trademark sign becomes "TM"),
    # which an earlier lower() call would never see.
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_text = decomposed.encode("ascii", "ignore").decode("ascii")
    lowered = ascii_text.lower()
    # "\w" treats "_" as a word character, so it must be listed explicitly
    # for it to be removed along with the rest of the punctuation.
    without_punctuation = re.sub(r"[^\w\s]|_", "", lowered)
    return re.sub(r"\s+", " ", without_punctuation).strip()


# Two variants of a sentence that differ in casing, accents and punctuation
sentence1 = "dusk fell, i was gazing at the Sao Paulo skyline. Isnt urban life vibrant??"
sentence2 = "Dusk fell; I gazed at the São Paulo skyline. Isn't urban life vibrant?"

# Run both variants through the standardizer and show one result per line
std_sentence1, std_sentence2 = map(standardize_text, (sentence1, sentence2))
print(std_sentence1, std_sentence2, sep="\n")

dusk fell i was gazing at the sao paulo skyline isnt urban life vibrant
dusk fell i gazed at the sao paulo skyline isnt urban life vibrant


### Word-level tokenization

In [2]:
# Word-level tokens: with no separator given, split() cuts on whitespace runs
text = "dusk fell i gazed at the sao paulo skyline isnt urban life vibrant"
tokens = text.split(sep=None)
print(tokens)

['dusk', 'fell', 'i', 'gazed', 'at', 'the', 'sao', 'paulo', 'skyline', 'isnt', 'urban', 'life', 'vibrant']


### Character-level tokenization

In [3]:
# Character-level tokens: every character, the space included, is one token
text = "Dusk fell"
tokens = [*text]
print(tokens)

['D', 'u', 's', 'k', ' ', 'f', 'e', 'l', 'l']


### Sub-word tokenization

In [5]:
from transformers import BertTokenizer

# Load the pretrained "bert-base-uncased" tokenizer, then split the sample
# sentence into sub-word pieces with it
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text = "I have a new GPU!"
tokens = tokenizer.tokenize(text)
print(tokens)

['i', 'have', 'a', 'new', 'gp', '##u', '!']


### BPE