# **Collection of Different Tokenizers**

### **Author: Partha Seetala**

**Video Tutorial: https://www.youtube.com/watch?v=Oy48ZLHAUg0**

*   Rule based tokenization (character, whitespace, word boundary)
*   NLTK Tokenizer
*   gensim Tokenizer
*   HuggingFace Tokenizer



# **Input text string**

In [None]:
# Sample sentence used by every tokenizer in this notebook. It contains a contraction,
# a quoted made-up word and trailing punctuation so the differences between tokenizers show up.
raw_text = "I can't believe how unbelievably fast AI can handle 'BERTification'!"

# `text` is the lower-cased copy consumed by the rule-based and NLTK cells;
# the gensim, Hugging Face and spaCy cells are given `raw_text` instead.
text = raw_text.lower()  # Lower case it (capitals don't really matter)

# **Utility Functions**

In [None]:
# Load our hugging face token
# Colab-only cell: mounts Google Drive, reads the token from a file stored there
# and exports it through the HF_TOKEN environment variable.
from google.colab import drive
import os

drive.mount('/content/drive')

# strip() removes the trailing newline / surrounding whitespace from the file contents
with open('/content/drive/My Drive/cidl/hf.token', 'r') as file:
    huggingface_token = file.read().strip()

# NOTE(review): presumably picked up by the Hugging Face libraries used further down -- confirm
os.environ['HF_TOKEN'] = huggingface_token

Mounted at /content/drive


In [None]:
# @title
def makevocab(tokens, tok2id=None, id2tok=None):
    """Build token <-> id lookup tables from a sequence of tokens.

    Every distinct token receives the next free integer id, in order of
    first appearance.

    Args:
        tokens: Iterable of hashable tokens.
        tok2id: Optional existing token -> id mapping to extend in place.
            A fresh dict is created when omitted.
        id2tok: Optional existing id -> token mapping to extend in place.
            A fresh dict is created when omitted.

    Returns:
        Tuple ``(tok2id, id2tok)``.
    """
    # Use None sentinels instead of ``{}`` defaults: a default dict is created
    # once at definition time and would be shared by every call, so tokens from
    # one vocabulary would leak into the next one.
    if tok2id is None:
        tok2id = {}
    if id2tok is None:
        id2tok = {}

    # Continue numbering after the ids already handed out so that extending a
    # caller-supplied vocabulary never reuses an id.
    next_id = max(tok2id.values(), default=-1) + 1
    for tok in tokens:
        if tok not in tok2id:
            tok2id[tok] = next_id
            id2tok[next_id] = tok
            next_id += 1

    return tok2id, id2tok

def show(vocab, tokens):
    """Print the token sequence, then the vocabulary, each with its size.

    Args:
        vocab: Mapping of token -> id; printed as ``token=id`` pairs in
            iteration order.
        tokens: Sized sequence of tokens; printed in order.
    """
    # Header first, then the whole comma-separated listing and the closing bracket.
    print("\nTokens: ", len(tokens), "\n[", end="")
    token_listing = "".join("{}, ".format(item) for item in tokens)
    print(token_listing + "]")

    print("\nVocab: ", len(vocab), "\n[", end="")
    vocab_listing = "".join("{}={}, ".format(key, vocab[key]) for key in vocab)
    print(vocab_listing + "]")

# **Basic Tokenizers**

**Most Basic Tokenizer #1 (``Character Based``)**

In [None]:
# Character-level tokenization: list() over a string yields one token per character,
# including spaces and punctuation.
tokens = list(text)
vocab, _ = makevocab(tokens)

show(vocab, tokens)


Tokens:  68 
[i,  , c, a, n, ', t,  , b, e, l, i, e, v, e,  , h, o, w,  , u, n, b, e, l, i, e, v, a, b, l, y,  , f, a, s, t,  , a, i,  , c, a, n,  , h, a, n, d, l, e,  , ', b, e, r, t, i, f, i, c, a, t, i, o, n, ', !, ]

Vocab:  21 
[i=0,  =1, c=2, a=3, n=4, '=5, t=6, b=7, e=8, l=9, v=10, h=11, o=12, w=13, u=14, y=15, f=16, s=17, d=18, r=19, !=20, ]


**Most Basic Tokenizer #2 (``Whitespace Based``)**

In [None]:
# Whitespace tokenization: split() without arguments breaks on runs of whitespace,
# so punctuation stays attached to the neighbouring word.
tokens = text.split()
vocab, _ = makevocab(tokens)

show(vocab, tokens)


Tokens:  10 
[i, can't, believe, how, unbelievably, fast, ai, can, handle, 'bertification'!, ]

Vocab:  30 
[i=0,  =1, c=2, a=3, n=4, '=5, t=6, b=7, e=8, l=9, v=10, h=11, o=12, w=13, u=14, y=15, f=16, s=17, d=18, r=19, !=20, can't=0, believe=1, how=2, unbelievably=3, fast=4, ai=5, can=6, handle=7, 'bertification'!=8, ]


**Most Basic Tokenizer #3 (``Word Based``)**
Enhancement to the whitespace tokenizer where we also split across punctuation symbols

In [None]:
import re
# \w+ matches a run of word characters; [^\w\s] matches one character that is neither
# a word character nor whitespace, so every punctuation mark becomes its own token.
word_tokens = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
vocab, _ = makevocab(word_tokens)

show(vocab, word_tokens)


Tokens:  15 
[i, can, ', t, believe, how, unbelievably, fast, ai, can, handle, ', bertification, ', !, ]

Vocab:  31 
[i=0,  =1, c=2, a=3, n=4, '=5, t=6, b=7, e=8, l=9, v=10, h=11, o=12, w=13, u=14, y=15, f=16, s=17, d=18, r=19, !=20, can't=0, believe=1, how=2, unbelievably=3, fast=4, ai=5, can=6, handle=7, 'bertification'!=8, bertification=0, ]


In [None]:
def buildvocab(sentences):
    """Build a token -> id vocabulary from whitespace-tokenized sentences.

    Each sentence is lower-cased and split on whitespace. Distinct tokens are
    numbered 0, 1, 2, ... in the order they are first seen.

    Args:
        sentences: Iterable of strings.

    Returns:
        Dict mapping each token to its integer id.
    """
    tok2id = {}
    for sent in sentences:
        for tok in sent.lower().split():
            # setdefault keeps the id of a known token; a new token gets the
            # current vocabulary size, i.e. the next unused id.
            tok2id.setdefault(tok, len(tok2id))
    return tok2id

def printvocab(vocab):
    """Print the vocabulary as left-aligned ``token=N,`` columns, six per row.

    ``N`` is the token's position in iteration order (starting at 0), not a
    value looked up in ``vocab``.

    Args:
        vocab: Sized iterable of tokens (e.g. the dict returned by ``buildvocab``).
    """
    per_row = 6
    print("Vocab: ", len(vocab), " tokens\n[", end="")
    for position, token in enumerate(vocab, start=1):
        entry = "{}={},".format(token, position - 1)
        # Start a new, one-space-indented row after every sixth entry.
        terminator = "\n " if position % per_row == 0 else ""
        print("{:<15} ".format(entry), end=terminator)
    print("]\n")

def check_tokens_in_vocab(vocab, sentences):
    """Report, per sentence and in total, how many tokens are in ``vocab``.

    Each sentence is lower-cased and split on whitespace. For every sentence
    the tokens, the found tokens and the out-of-vocabulary (OOV) tokens are
    printed with counts and percentages, followed by one summary line.

    Args:
        vocab: Container supporting ``in`` (e.g. the dict from ``buildvocab``).
        sentences: Iterable of strings.
    """
    print("-" * 80)
    ntoks = 0
    nfound = 0
    noov = 0
    for sent in sentences:
        text = sent.lower()
        tokens = text.split()
        print(text)

        found = [tok for tok in tokens if tok in vocab]
        oov = [tok for tok in tokens if tok not in vocab]
        ntoks += len(tokens)
        nfound += len(found)
        noov += len(oov)

        # A blank sentence has no tokens; divide by 1 so it reports 0%
        # instead of raising ZeroDivisionError.
        denom = len(tokens) or 1
        print("tokens: ", len(tokens), "    ", tokens)
        print(" found: ", len(found), "{:3.0f}%".format(100.0 * len(found) / denom), found)
        print("   OOV: ", len(oov), "{:3.0f}%".format(100.0 * len(oov) / denom), oov)
        print("\n")

    # Same guard for the summary when no tokens were seen at all.
    total = ntoks or 1
    print("Total tokens: {:2d}, found={:2d} ({:2.0f}%),  OOV={:2d} ({:2.0f}%)".
          format(ntoks, nfound, 100 * (nfound / total), noov, 100.0 * (noov / total)))

In [None]:
# Training corpus tokenized as whole words (split on whitespace by buildvocab)
training1 = [
    "the football player is playing the play the coach called",
    "my dog runs faster than most runners",
    "pack your toothbrush and toothpaste everytime"
]


vocab1 = buildvocab(training1)
printvocab(vocab1)


# Inference sentences use different surface forms of the training words
# ("played" vs "playing", "running" vs "runs", "toothache" vs "toothbrush"),
# so whole-word matching reports them as out-of-vocabulary.
inference1 = [
    "my dog played ball with her foot",
    "coach has been running everyday",
    "I have a toothache so I brush everyday"
]

check_tokens_in_vocab(vocab1, inference1)

Vocab:  21  tokens
[the=0,          football=1,     player=2,       is=3,           playing=4,      play=5,         
 coach=6,        called=7,       my=8,           dog=9,          runs=10,        faster=11,      
 than=12,        most=13,        runners=14,     pack=15,        your=16,        toothbrush=17,  
 and=18,         toothpaste=19,  everytime=20,   ]

--------------------------------------------------------------------------------
my dog played ball with her foot
tokens:  7      ['my', 'dog', 'played', 'ball', 'with', 'her', 'foot']
 found:  2  29% ['my', 'dog']
   OOV:  5  71% ['played', 'ball', 'with', 'her', 'foot']


coach has been running everyday
tokens:  5      ['coach', 'has', 'been', 'running', 'everyday']
 found:  1  20% ['coach']
   OOV:  4  80% ['has', 'been', 'running', 'everyday']


i have a toothache so i brush everyday
tokens:  8      ['i', 'have', 'a', 'toothache', 'so', 'i', 'brush', 'everyday']
 found:  0   0% []
   OOV:  8 100% ['i', 'have', 'a', 'toothac

In [None]:
# Same corpus as training1, but with words hand-split into sub-word pieces
# ("football" -> "foot ball", "player" -> "play er", ...) separated by spaces,
# so the whitespace split in buildvocab yields sub-word tokens.
training2 = [
    "the foot ball play er is play ing the play the coach called",
    "my dog runs fast er than most run ners",
    "pack your tooth brush and tooth paste every time"
]


vocab2 = buildvocab(training2)
printvocab(vocab2)


# Inference sentences are hand-split the same way, so pieces such as
# "play", "run", "tooth" and "every" can now be found in the vocabulary.
inference2 = [
    "my dog play ed ball with her foot",
    "coach has been run ning every day",
    "I have a tooth ache so I brush every day"
]

check_tokens_in_vocab(vocab2, inference2)

Vocab:  25  tokens
[the=0,          foot=1,         ball=2,         play=3,         er=4,           is=5,           
 ing=6,          coach=7,        called=8,       my=9,           dog=10,         runs=11,        
 fast=12,        than=13,        most=14,        run=15,         ners=16,        pack=17,        
 your=18,        tooth=19,       brush=20,       and=21,         paste=22,       every=23,       
 time=24,        ]

--------------------------------------------------------------------------------
my dog play ed ball with her foot
tokens:  8      ['my', 'dog', 'play', 'ed', 'ball', 'with', 'her', 'foot']
 found:  5  62% ['my', 'dog', 'play', 'ball', 'foot']
   OOV:  3  38% ['ed', 'with', 'her']


coach has been run ning every day
tokens:  7      ['coach', 'has', 'been', 'run', 'ning', 'every', 'day']
 found:  3  43% ['coach', 'run', 'every']
   OOV:  4  57% ['has', 'been', 'ning', 'day']


i have a tooth ache so i brush every day
tokens:  10      ['i', 'have', 'a', 'tooth', 'a

# **Using NLTK's Punkt Library**

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.data import find
import os

# Download Punkt tokenizer for sentence boundary detection in natural language processing
def download_punkt():
    """Ensure NLTK's Punkt tokenizer data is installed, downloading what is missing.

    Both the ``punkt`` package and the ``punkt_tab`` package are checked:
    recent NLTK releases load the tokenizer from ``punkt_tab`` and raise
    ``LookupError`` from ``word_tokenize`` / ``sent_tokenize`` when only the
    older ``punkt`` package is present.
    """
    for resource in ('punkt', 'punkt_tab'):
        try:
            find('tokenizers/{}'.format(resource))
        except LookupError:
            # Resource not found in any NLTK data directory; fetch it.
            nltk.download(resource)

# Make sure the Punkt data is available before tokenizing
download_punkt()


# Tokenize the text using NLTK's word_tokenize
# NOTE: `text` is the lower-cased string assigned in the first code cell of this notebook;
# that cell has to be run first (the recorded output of this cell shows the NameError
# raised when it was not).
nltk_tokens = word_tokenize(text)

print("NLTK based tokenization: {} tokens".format(len(nltk_tokens)))
print(nltk_tokens)

# Tokenize the text using NLTK's sent_tokenize
nltk_sentences = sent_tokenize(text)

print("\nNLTK based sentence tokenization: {} sentences".format(len(nltk_sentences)))
print(nltk_sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


NameError: name 'text' is not defined

# **Using ``gensim`` Library**

In [None]:
!pip install --upgrade numpy
!pip install --upgrade gensim

In [None]:
from gensim.utils import simple_preprocess


# Gensim simple_preprocess() does the following:
# (1) Converts text to lower-case (which is why raw_text can be passed in directly)
# (2) Splits text into individual words
# (3) Removes punctuation marks
# (4) Filters by length -- by default discards tokens shorter than 2 characters or longer than 15 characters (so long URLs/SHA hashes will be discarded)
# (5) Only retains alphabetic tokens -- per the gensim.utils.tokenize documentation, digits are not part of a token
# In the output below "I" and the "t" of "can't" are missing because they are a single character long.
gensim_tokens = simple_preprocess(raw_text)

print("Gensim based tokenization: {} tokens".format(len(gensim_tokens)))
print(gensim_tokens)

Gensim based tokenization: 9 tokens
['can', 'believe', 'how', 'unbelievably', 'fast', 'ai', 'can', 'handle', 'bertification']


# **Using HuggingFace Library**

In [None]:
# Load our hugging face token
# Colab-only cell (same steps as the token cell under "Utility Functions"): mounts Google Drive,
# reads the token from a file stored there and exports it through the HF_TOKEN environment variable.
from google.colab import drive
import os

drive.mount('/content/drive')

# strip() removes the trailing newline / surrounding whitespace from the file contents
with open('/content/drive/My Drive/cidl/hf.token', 'r') as file:
    huggingface_token = file.read().strip()

# NOTE(review): presumably picked up by the Hugging Face libraries used below -- confirm
os.environ['HF_TOKEN'] = huggingface_token

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### **Using ``Huggingface`` Tokenizer -- ``Byte-Pair Encoding (BPE) tokenizer``**

In [None]:
from transformers import GPT2Tokenizer

# Load the pre-trained GPT-2 tokenizer (which uses BPE)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# tokenize() returns the sub-word token strings; raw_text keeps its original casing here
gpt2_tokens = tokenizer.tokenize(raw_text)


print("GPT2 (BPE) based tokenization: {} tokens".format(len(gpt2_tokens)))
print(gpt2_tokens)

# Get IDs for each token
gpt2_token_ids = tokenizer.convert_tokens_to_ids(gpt2_tokens)

print("Token IDs:", gpt2_token_ids)

# In GPT-2, the Ġ character indicates the start of a new word following a space. So "Ġtime" indicates " time" (with a preceding space)

GPT2 (BPE) based tokenization: 16 tokens
['I', 'Ġcan', "'t", 'Ġbelieve', 'Ġhow', 'Ġunbelievably', 'Ġfast', 'ĠAI', 'Ġcan', 'Ġhandle', "Ġ'", 'BER', 'T', 'ification', "'", '!']
Token IDs: [40, 460, 470, 1975, 703, 48943, 3049, 9552, 460, 5412, 705, 13246, 51, 2649, 6, 0]


### **Using `Huggingface` Tokenizer -- `Wordpiece tokenizer`**

In [None]:
from transformers import AutoTokenizer

# Load the pre-trained tokenizer (WordPiece example from BERT)
# NOTE: this rebinds `tokenizer`, replacing the GPT-2 tokenizer from the previous cell
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# tokenize() returns the WordPiece token strings; in the output below the tokens are
# lower-cased and a leading "##" marks a piece that continues the preceding piece
bert_tokens = tokenizer.tokenize(raw_text)

print("BERT (wordpiece) based tokenization: {} tokens".format(len(bert_tokens)))
print(bert_tokens)

# Get IDs for each token
bert_token_ids = tokenizer.convert_tokens_to_ids(bert_tokens)

print("Token IDs:", bert_token_ids)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

BERT (wordpiece) based tokenization: 20 tokens
['i', 'can', "'", 't', 'believe', 'how', 'un', '##bel', '##ie', '##va', '##bly', 'fast', 'ai', 'can', 'handle', "'", 'bert', '##ification', "'", '!']
Token IDs: [1045, 2064, 1005, 1056, 2903, 2129, 4895, 8671, 2666, 3567, 6321, 3435, 9932, 2064, 5047, 1005, 14324, 9031, 1005, 999]


In [None]:
# STEP1: TOKENIZE MY TRAINING DATA TO BUILD VOCAB
# NOTE: illustrative sketch -- documents, break_doc_to_sentence, tokenize_sentence,
# add_to_vocab, tokenize and vocab are placeholders that are not defined in this notebook.
import numpy as np

for doc in documents:
    for sent in break_doc_to_sentence(doc):
        tokens = tokenize_sentence(sent)
        for tok in tokens:
            if tok not in vocab:
                add_to_vocab(tok)

# Here I have a vocab [{token -> ID}]
token_count = len(vocab)   # 370

# STEP2: BUILD ONE-HOT-ENCODING EMBEDDINGS FOR EACH TOKEN IN MY VOCAB
# Square matrix: one row per token; np.zeros takes the shape as a single tuple
emb = np.zeros((token_count, token_count))

# Set the diagonal entry of each token's row, so row i is the one-hot vector for token id i
for tok, tok_id in vocab.items():
    emb[tok_id, tok_id] = 1

(vocab, emb)

"Jagadish asked a good question"

tokens = tokenize("Jagadish asked a good question")
for tok in tokens:
    tokid = vocab[tok] # 5
    toke = emb[tokid] # [0, 0, 0, 0, 0, 1, 0, ...., 0]


# **Using ``spaCy`` Library**

[https://spacy.io/](https://spacy.io/)

spaCy is a high-performance NLP library for production use cases. Its tokenizer is built on top of a rules engine. The following rules are used:

1. **Prefix, Suffix, and Infix Handling**
    - Prefix rule: \$1000 will be tokenized as ["$", "1000"].
    - Suffix rule: 'happy.' will be tokenized as ["happy", "."].
    - Infix rule: 3.14 will remain as ["3.14"], but 'well-known' will be tokenized as ["well", "-", "known"].
2. **Whitespace Rule (if there is a whitespace a new token is considered)**
3. **Exceptions**
    - Contractions: "don't" will be split into ["do", "n't"].
    - Abbreviations: Common abbreviations like "U.S." are treated as a single token instead of being split.
4. **URLs, Emails, and Special Characters**
    - URLs like http://spacy.io are treated as a single token.
    - Emails like myname@gmail.com are also treated as a single token.
    - Emoji or special characters (like 😊) are kept as individual tokens.
5. **Punctuation Rules**
    - "hello." becomes ["hello", "."]
    - "(hello)" becomes ["(", "hello", ")"]

In [None]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m89.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

# Load the spaCy language model
nlp = spacy.load("en_core_web_sm")

# Process the text using the spaCy pipeline. This is where tokenization happens.
doc = nlp(raw_text)

# Extract the text of each token
spacy_tokens = [token.text for token in doc]

print("spaCy based tokenization: {} tokens".format(len(spacy_tokens)))
print(spacy_tokens)

spaCy based tokenization: 14 tokens
['I', 'ca', "n't", 'believe', 'how', 'unbelievably', 'fast', 'AI', 'can', 'handle', "'", 'BERTification', "'", '!']


In [None]:
# Get token IDs by accessing the token's lexeme hash
# Lexemes are part of spaCy’s efficient system to avoid storing repeated info
# Indexing token.vocab.strings with the token text returns an integer; identical texts
# (the two "'" tokens in the output below) map to the same value.
spacy_lexeme = [token.vocab.strings[token.text] for token in doc]

print("Token lexeme:", spacy_lexeme)

Token lexeme: [4690420944186131903, 13913074996085095135, 2043519015752540944, 7936246238499659209, 16331095434822636218, 11010712746389516034, 1826119438242743099, 5530044837203964789, 6635067063807956629, 8691213008739214150, 11221368173670222813, 10580551838355052375, 11221368173670222813, 17494803046312582752]


In [None]:
# Iterate through tokens and print important attributes for each token
# (`doc` is the spaCy Doc produced by nlp(raw_text) in an earlier cell)
for i,token in enumerate(doc):
    print(f"[{i}] Token: {token.text}")       # The actual text of the token
    print(f"  Lemma: {token.lemma_}")         # The lemma (root form) of the token
    print(f"  Part of Speech: {token.pos_}")  # Part of speech of the token (e.g., NOUN, VERB)
    print(f"  Dependency: {token.dep_}")      # Syntactic dependency label (e.g., nsubj for subject)
    print(f"  Is Stop Word: {token.is_stop}") # Is the token a stop word?
    print(f"  Is Alpha: {token.is_alpha}")    # Does the token consist only of alphabetic characters?
    print(f"  Shape: {token.shape_}")         # Shape of the token, e.g., Xxxx for capitalized words
    print(f"  Is Punctuation: {token.is_punct}") # Is the token a punctuation mark?
    print(f"  Is Space: {token.is_space}")    # Is the token a whitespace character?
    print("-" * 30)

[0] Token: I
  Lemma: I
  Part of Speech: PRON
  Dependency: nsubj
  Is Stop Word: True
  Is Alpha: True
  Shape: X
  Is Punctuation: False
  Is Space: False
------------------------------
[1] Token: ca
  Lemma: can
  Part of Speech: AUX
  Dependency: aux
  Is Stop Word: True
  Is Alpha: True
  Shape: xx
  Is Punctuation: False
  Is Space: False
------------------------------
[2] Token: n't
  Lemma: not
  Part of Speech: PART
  Dependency: neg
  Is Stop Word: True
  Is Alpha: False
  Shape: x'x
  Is Punctuation: False
  Is Space: False
------------------------------
[3] Token: believe
  Lemma: believe
  Part of Speech: VERB
  Dependency: ROOT
  Is Stop Word: False
  Is Alpha: True
  Shape: xxxx
  Is Punctuation: False
  Is Space: False
------------------------------
[4] Token: how
  Lemma: how
  Part of Speech: SCONJ
  Dependency: advmod
  Is Stop Word: True
  Is Alpha: True
  Shape: xxx
  Is Punctuation: False
  Is Space: False
------------------------------
[5] Token: unbelievably
  