In [None]:
# Tokenization applies only to strings: it splits text into smaller units (tokens).

In [1]:
# Silence every warning for the rest of the session so the tokenizer demos
# below print only their own results.
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and returns None."""
    return None


# Two layers: replace the emitter itself, then also install a global "ignore"
# filter for warnings that are raised through any other path.
warnings.warn = warn
warnings.filterwarnings("ignore")

In [2]:
# Word-based tokenization with NLTK's word_tokenize.
# Punctuation comes out as its own token (the final "." below).
import nltk  # Natural Language Toolkit (a Python library)
nltk.download("punkt")  # Punkt tokenizer models, used for sentence and word tokenization.
nltk.download("punkt_tab")  # Punkt data in tabular form; recent NLTK releases appear to load this instead of "punkt" - TODO confirm for the installed version
from nltk.tokenize import word_tokenize

text = "This is a sample sentence for word tokenization."
tokens = word_tokenize(text)
print(tokens)

['This', 'is', 'a', 'sample', 'sentence', 'for', 'word', 'tokenization', '.']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Menoo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Menoo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
# Or use the built-in str.split() for basic tokenization: it splits on whitespace
# only, so punctuation stays attached to the neighbouring word ("doin?").
text = 'Hey thery how you doin?'
text.split()

['Hey', 'thery', 'how', 'you', 'doin?']

In [4]:
# Rule-based tokenization with spaCy.
# Unlike str.split(), the rules break contractions apart ("couldn't" -> "could" + "n't")
# and emit punctuation as separate tokens.
import spacy

text = "I couldn't help the dog. Can't you do it? Don't be afraid if you are."
nlp = spacy.load("en_core_web_sm")  # load spaCy's small English model
doc = nlp(text)  # process the text; the result can be iterated token by token
print(doc)  # printing the processed document shows the original text

# Collect the surface string (.text) of every token.
token_list = [token.text for token in doc]
print(token_list)

# Per-token details:
#   .pos_ - grammatical category of the word (e.g. NOUN, VERB, AUX)
#   .dep_ - the word's grammatical relation to other words in the sentence
for token in doc:
    print(token.text, token.pos_, token.dep_)

I couldn't help the dog. Can't you do it? Don't be afraid if you are.
['I', 'could', "n't", 'help', 'the', 'dog', '.', 'Ca', "n't", 'you', 'do', 'it', '?', 'Do', "n't", 'be', 'afraid', 'if', 'you', 'are', '.']
I PRON nsubj
could AUX aux
n't PART neg
help VERB ROOT
the DET det
dog NOUN dobj
. PUNCT punct
Ca AUX aux
n't PART neg
you PRON nsubj
do VERB ROOT
it PRON dobj
? PUNCT punct
Do AUX aux
n't PART neg
be AUX ROOT
afraid ADJ acomp
if SCONJ mark
you PRON nsubj
are AUX advcl
. PUNCT punct


In [5]:
# Subword-based tokenization (BertTokenizer).
# A word can be split into smaller pieces; a "##" prefix marks a piece that
# continues the previous one ("tokenization" -> "token" + "##ization").
# With the "uncased" checkpoint the output is lowercased ("IBM" -> "ibm").
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.tokenize("IBM taught me tokenization")

['ibm', 'taught', 'me', 'token', '##ization']

In [6]:
# Character-based tokenization: list() on a string yields one token per
# character, spaces included.
text = "IBM taught me tokenization"
char_tokens = list(text)
print(char_tokens)

['I', 'B', 'M', ' ', 't', 'a', 'u', 'g', 'h', 't', ' ', 'm', 'e', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n']


In [7]:
# Subword-based tokenization with XLNetTokenizer (SentencePiece with a Unigram language model).
# Pieces that start a word carry a leading "▁"; continuation pieces ("ization") do not.
from transformers import XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
tokenizer.tokenize("IBM taught me tokenization.")

['▁IBM', '▁taught', '▁me', '▁token', 'ization', '.']

In [64]:
# Tokenization with PyTorch (torchtext).
# Toy dataset of (label, text) tuples; the cells below tokenize only the text part.
# Some texts deliberately carry extra spaces and comma-joined phrases.
dataset = [
    (1,"Introduction to NLP"),
    (2,"Basics of PyTorch"),
    (1,"NLP Techniques for Text Classification"),
    (3,"Named Entity Recognition with PyTorch"),
    (3,"Sentiment Analysis using PyTorch"),
    (3,"Machine Translation with PyTorch"),
    (1," NLP Named Entity,Sentiment Analysis,Machine Translation "),
    (1," Machine Translation with NLP "),
    (1," Named Entity vs Sentiment Analysis  NLP ")]

In [65]:
from torchtext.data.utils import get_tokenizer
# NOTE: the author switched to Python 3.10.7 to get this import working;
# presumably torchtext had no compatible build for the newer interpreter - verify for your environment.

# Rebinds `tokenizer` (previously a Hugging Face tokenizer); yield_tokens() below reads this name.
tokenizer = get_tokenizer("basic_english")
# basic_english: lowercases the text and splits it into words, with punctuation as separate tokens
tokenizer('HeLlo how You doiN!')

['hello', 'how', 'you', 'doin', '!']

In [66]:
# Indexing: dataset[0] is the first (label, text) tuple and [1] selects its text,
# which is then tokenized.
tokenizer(dataset[0][1])

['introduction', 'to', 'nlp']

In [67]:
def yield_tokens(data_iter):
    """Lazily tokenize the text of every ``(label, text)`` pair in ``data_iter``.

    This is a generator: ``yield`` hands back one token list per record and
    resumes from the same spot on the next ``next()`` call. A plain ``return``
    would stop after the first record, and its result could not be passed to
    ``next()``.

    Uses the notebook-level ``tokenizer`` that is bound when the generator is
    advanced.
    """
    for _label, sentence in data_iter:  # the label is not needed for tokenization
        yield tokenizer(sentence)

# Calling a generator function does not run its body; it only creates the generator object.
my_iterator = yield_tokens(dataset)

# Each next() resumes the generator and consumes one record, so after these three
# calls `my_iterator` is positioned at the fourth dataset entry. Later cells that
# reuse `my_iterator` continue from there, not from the beginning.
print(next(my_iterator))
print(next(my_iterator))
print(next(my_iterator))

['introduction', 'to', 'nlp']
['basics', 'of', 'pytorch']
['nlp', 'techniques', 'for', 'text', 'classification']


In [68]:
# Create a word-to-index mapping (vocabulary).
from torchtext.vocab import build_vocab_from_iterator

# build_vocab_from_iterator() takes an iterator of token lists and creates the
# word-to-index mapping used to convert text into numerical form for ML models.
# A fresh generator is passed here, so the partly consumed `my_iterator` is not involved.
# "<unk>" is registered as a special token; the author notes it gets index 0.
vocab = build_vocab_from_iterator(yield_tokens(dataset), specials=["<unk>"])
# vocab["<unk>"] retrieves the index of the special token that stands for unknown words.
# Setting it as the default index makes a lookup of any word missing from the
# vocabulary return that index instead of raising an error.
vocab.set_default_index(vocab["<unk>"])


In [None]:
def get_tokenized_sentence_and_indices(iterator):
    """Take the next token list from ``iterator`` and map each token to its index.

    Args:
        iterator: an iterator that yields lists of tokens; it is advanced by
            exactly one item.

    Returns:
        A ``(tokens, indices)`` tuple, where ``indices[i]`` is the entry of the
        notebook-level ``vocab`` for ``tokens[i]``.

    Raises:
        StopIteration: if ``iterator`` has no items left.
    """
    sentence_tokens = next(iterator)
    indices = []
    for tok in sentence_tokens:
        indices.append(vocab[tok])
    return sentence_tokens, indices

# Each run of this cell pulls exactly ONE sentence from `my_iterator` and shows
# its tokens and indices.
#
# Why sentences used to be skipped: the generator is stateful. An extra
# `next(my_iterator)` after the call below threw away one sentence on every
# run, so only every second sentence was displayed. That call is removed.
# `my_iterator` was also already advanced three times right after it was
# created, so the first run here shows the fourth dataset entry.
#
# Once every sentence has been consumed, next() raises StopIteration; re-create
# the generator with `my_iterator = yield_tokens(dataset)` to start over.
tokenized_sentence, token_indices = get_tokenized_sentence_and_indices(my_iterator)

print("Tokenized Sentence:", tokenized_sentence)
print("Token Indices:", token_indices)

Tokenized Sentence: ['machine', 'translation', 'with', 'nlp']
Token Indices: [5, 8, 9, 1]


In [None]:
lines = ["IBM taught me tokenization",
         "Special tokenizers are ready and they will blow your mind",
         "just saying hi!"]

# Special tokens. They are passed to the vocabulary builder below as `specials`
# so that each one gets a reserved entry regardless of how often it appears:
#   <unk>: "Unknown" token
#   <pad>: "Padding" token
#   <bos>: "Beginning of Sequence" token
#   <eos>: "End of Sequence" token
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# 'spacy' selects spaCy's tokenization; language='en_core_web_sm' picks its
# small English model.
tokenizer_en = get_tokenizer('spacy', language='en_core_web_sm')

tokens = []
max_length = 0

for line in lines:
    # Wrap every sentence in <bos> ... <eos> so a model can tell where a
    # sequence starts and ends, e.g. ["<bos>", "hello", "world", "<eos>"].
    tokenized_line = ['<bos>'] + tokenizer_en(line) + ['<eos>']
    tokens.append(tokenized_line)
    max_length = max(max_length, len(tokenized_line))

# Right-pad every sequence to the length of the longest one.
# Example with max_length = 5:
#   ["hello", "world"] + ['<pad>'] * (5 - 2)
#   -> ["hello", "world", "<pad>", "<pad>", "<pad>"]
for i in range(len(tokens)):
    tokens[i] = tokens[i] + ['<pad>'] * (max_length - len(tokens[i]))

print("Lines after adding special tokens:\n", tokens)

# Fixes: (1) the default-index lookup used the misspelled key '<unk' (missing
# '>'), which is not in the vocabulary and therefore raised; (2) all four
# special symbols are now registered as specials instead of only '<unk>'.
# Note this rebinds `vocab`, replacing the vocabulary built from `dataset`.
vocab = build_vocab_from_iterator(tokens, specials=special_symbols)
vocab.set_default_index(vocab['<unk>'])

In [None]:
new_line = "I learned about embeddings and attention mechanisms."

# Same recipe as for the training lines: tokenize, then wrap in <bos>/<eos>.
tokenized_new_line = ['<bos>'] + tokenizer_en(new_line) + ['<eos>']

# Right-pad up to `max_length`. If the line is already longer, the multiplier is
# negative, the padding list is empty, and the line is left as is (not truncated).
pad_count = max_length - len(tokenized_new_line)
new_line_padded = tokenized_new_line + ['<pad>'] * pad_count

# Map each token to its vocabulary id, with an explicit fallback to '<unk>' for
# tokens the vocabulary has never seen.
new_line_ids = [vocab[token if token in vocab else '<unk>'] for token in new_line_padded]

print("Token IDs for new line:", new_line_ids)