In [1]:
import nltk
# Fetch the 'punkt_tab' tokenizer data package into the local NLTK data dir.
# Presumably this is what word_tokenize / sent_tokenize rely on in the cells
# below (TODO confirm against the installed NLTK version).
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [2]:
import nltk
import spacy
from gensim.utils import simple_preprocess
from nltk.tokenize import (
    MWETokenizer,
    RegexpTokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    sent_tokenize,
    word_tokenize,
    wordpunct_tokenize,
)
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from textblob import TextBlob

In [5]:
# Sample paragraph fed to the tokenizer cells below. It contains emoji
# (including a zero-width-joiner sequence), a hashtag, em dashes, an ellipsis,
# a contraction, single-quoted speech and doubled spaces, so the differences
# between tokenizers are visible in their outputs.
text = '''Sentiment analysis 💬🤖❤️ is revolutionizing how? businesses interpret customer #feedback!
Companies use NLP models  to analyze tweets, reviews & comments—identifying positive, negative, or neutral sentiments 🎭📊.
But it isn't  always accurate... Context matters! 'This movie is sick!' could mean amazing or terrible—confusing AI! 🤔😵‍💫'''



In [6]:
# 1. Word Tokenization
# Splits the sample into word and punctuation tokens with NLTK's word_tokenize.
# In the recorded output "isn't" becomes 'is' + "n't" and '#feedback' becomes
# '#' + 'feedback', while em-dash-joined words ('comments—identifying') and
# runs of adjacent emoji stay as single tokens.
word_tokens = word_tokenize(text)  # also the input of the multi-word-expression cell
print("Word Tokenization:", word_tokens)
print("Word tonizer will divide the paragraph or sentence into tokens of each word. ")

Word Tokenization: ['Sentiment', 'analysis', '💬🤖❤️', 'is', 'revolutionizing', 'how', '?', 'businesses', 'interpret', 'customer', '#', 'feedback', '!', 'Companies', 'use', 'NLP', 'models', 'to', 'analyze', 'tweets', ',', 'reviews', '&', 'comments—identifying', 'positive', ',', 'negative', ',', 'or', 'neutral', 'sentiments', '🎭📊', '.', 'But', 'it', 'is', "n't", 'always', 'accurate', '...', 'Context', 'matters', '!', "'This", 'movie', 'is', 'sick', '!', "'", 'could', 'mean', 'amazing', 'or', 'terrible—confusing', 'AI', '!', '🤔😵\u200d💫']
Word tonizer will divide the paragraph or sentence into tokens of each word. 


In [8]:
# 2. Sentence Tokenization
# Breaks the sample into sentences with NLTK's sent_tokenize. In the recorded
# output it also splits after the mid-sentence "how?" and after the quoted
# "'This movie is sick!'", and the trailing emoji end up as their own item.
sent_tokens = sent_tokenize(text)
print("\nSentence Tokenization:", sent_tokens)  # leading "\n" prints a blank line before the label
print("Sentence tokenizer - will divide the paragraph or sentences into token of each sentence.")


Sentence Tokenization: ['Sentiment analysis 💬🤖❤️ is revolutionizing how?', 'businesses interpret customer #feedback!', 'Companies use NLP models  to analyze tweets, reviews & comments—identifying positive, negative, or neutral sentiments 🎭📊.', "But it isn't  always accurate...", 'Context matters!', "'This movie is sick!'", 'could mean amazing or terrible—confusing AI!', '🤔😵\u200d💫']
Sentence tokenizer - will divide the paragraph or sentences into token of each sentence.


In [9]:
# 3. Punctuation-based Tokenization
# wordpunct_tokenize is imported in the notebook's import cell alongside the
# other NLTK tokenizers, so this cell no longer carries its own import.
# In the recorded output the contraction "isn't" is split into 'isn', "'", 't'
# and words are broken at the em dash, while adjacent punctuation such as "!'"
# stays together as one token.
tokens = wordpunct_tokenize(text)  # note: `tokens` is rebound by later cells
print("Punctuation Tokenization:", tokens)
print("Punctuation Tokenization:- Tokenizes paragraph or text based on punctuations marks & punctuations will have separate token.")

Punctuation Tokenization: ['Sentiment', 'analysis', '💬🤖❤️', 'is', 'revolutionizing', 'how', '?', 'businesses', 'interpret', 'customer', '#', 'feedback', '!', 'Companies', 'use', 'NLP', 'models', 'to', 'analyze', 'tweets', ',', 'reviews', '&', 'comments', '—', 'identifying', 'positive', ',', 'negative', ',', 'or', 'neutral', 'sentiments', '🎭📊.', 'But', 'it', 'isn', "'", 't', 'always', 'accurate', '...', 'Context', 'matters', '!', "'", 'This', 'movie', 'is', 'sick', "!'", 'could', 'mean', 'amazing', 'or', 'terrible', '—', 'confusing', 'AI', '!', '🤔😵\u200d💫']
Punctuation Tokenization:- Tokenizes paragraph or text based on punctuations marks & punctuations will have separate token.


In [10]:
# 4. Treebank Word Tokenizer
# Tokenizes the sample with a default-constructed TreebankWordTokenizer.
# In the recorded output the contraction handling matches word_tokenize
# ("isn't" -> 'is', "n't"), but the period that follows the emoji pair stays
# attached to it instead of being emitted as a separate '.' token.
tokenizer = TreebankWordTokenizer()  # note: `tokenizer` is rebound by the Keras cell further down
tokens = tokenizer.tokenize(text)  # rebinds `tokens` from the punctuation-based cell
print("Treebank Tokenization:", tokens)
print("Treebank Tokenization: splits contractions, separates punctuation, and handles special cases like possessives ('s).) " )
print( "isn't → [is, n't] ")

Treebank Tokenization: ['Sentiment', 'analysis', '💬🤖❤️', 'is', 'revolutionizing', 'how', '?', 'businesses', 'interpret', 'customer', '#', 'feedback', '!', 'Companies', 'use', 'NLP', 'models', 'to', 'analyze', 'tweets', ',', 'reviews', '&', 'comments—identifying', 'positive', ',', 'negative', ',', 'or', 'neutral', 'sentiments', '🎭📊.', 'But', 'it', 'is', "n't", 'always', 'accurate', '...', 'Context', 'matters', '!', "'This", 'movie', 'is', 'sick', '!', "'", 'could', 'mean', 'amazing', 'or', 'terrible—confusing', 'AI', '!', '🤔😵\u200d💫']
Treebank Tokenization: splits contractions, separates punctuation, and handles special cases like possessives ('s).) 
isn't → [is, n't] 


In [11]:
# 5. Tweet Tokenizer
# Tokenizes the sample with a default-constructed TweetTokenizer.
# In the recorded output '#feedback' and "isn't" are kept whole, the emoji runs
# are split into individual tokens, and words are separated at the em dash.
# The sample contains no @mentions, so that part of the printed description is
# not exercised by this input.
tweet_tokenizer = TweetTokenizer()
tweet_tokens = tweet_tokenizer.tokenize(text)
print("\nTweet Tokenization:", tweet_tokens)  # leading "\n" prints a blank line before the label
print("Tweet Tokenization: Handles hashtags, emojis and mentions.")


Tweet Tokenization: ['Sentiment', 'analysis', '💬', '🤖', '❤', '️', 'is', 'revolutionizing', 'how', '?', 'businesses', 'interpret', 'customer', '#feedback', '!', 'Companies', 'use', 'NLP', 'models', 'to', 'analyze', 'tweets', ',', 'reviews', '&', 'comments', '—', 'identifying', 'positive', ',', 'negative', ',', 'or', 'neutral', 'sentiments', '🎭', '📊', '.', 'But', 'it', "isn't", 'always', 'accurate', '...', 'Context', 'matters', '!', "'", 'This', 'movie', 'is', 'sick', '!', "'", 'could', 'mean', 'amazing', 'or', 'terrible', '—', 'confusing', 'AI', '!', '🤔', '😵\u200d💫']
Tweet Tokenization: Handles hashtags, emojis and mentions.


In [12]:
# 6. Multi-Word Expression Tokenizer
# Register each phrase one at a time, then re-tokenize the NLTK word tokens so
# that any registered phrase appearing as consecutive tokens is merged.
# On this input only ('Sentiment', 'analysis') merges: 'customer' and
# 'feedback' are separated by a '#' token in word_tokens, and
# 'contextual embeddings' does not occur in the sample.
mwe_tokenizer = MWETokenizer()
mwe_tokenizer.add_mwe(('Sentiment', 'analysis'))
mwe_tokenizer.add_mwe(('contextual', 'embeddings'))
mwe_tokenizer.add_mwe(('customer', 'feedback'))

mwe_tokens = mwe_tokenizer.tokenize(word_tokens)
print("\nMulti-Word Expression Tokenization:", mwe_tokens)
print("Multiword Tokenizer: The MWE Tokenizer in NLTK is used to detect and merge specific multi-word phrases into single tokens.")



Multi-Word Expression Tokenization: ['Sentiment_analysis', '💬🤖❤️', 'is', 'revolutionizing', 'how', '?', 'businesses', 'interpret', 'customer', '#', 'feedback', '!', 'Companies', 'use', 'NLP', 'models', 'to', 'analyze', 'tweets', ',', 'reviews', '&', 'comments—identifying', 'positive', ',', 'negative', ',', 'or', 'neutral', 'sentiments', '🎭📊', '.', 'But', 'it', 'is', "n't", 'always', 'accurate', '...', 'Context', 'matters', '!', "'This", 'movie', 'is', 'sick', '!', "'", 'could', 'mean', 'amazing', 'or', 'terrible—confusing', 'AI', '!', '🤔😵\u200d💫']
Multiword Tokenizer: The MWE Tokenizer in NLTK is used to detect and merge specific multi-word phrases into single tokens.


In [13]:
# 7. TextBlob Word Tokenization
# Wraps the sample in a TextBlob and reads its `.words` attribute.
# In the recorded output no standalone punctuation tokens remain ('?', '!',
# ',', '#', '&', '...' are gone), although the leading quote in "'This" and the
# "n't" piece of the contraction are still present.
blob = TextBlob(text)
textblob_tokens = blob.words
print("\nTextBlob Word Tokenization:", textblob_tokens)  # leading "\n" prints a blank line before the label
print("TextBlob Tokenizer:- Splits text into words and sentences using simple rules, removing punctuation from words but keeping sentence structure intact.")
print(".?! removed")


TextBlob Word Tokenization: ['Sentiment', 'analysis', '💬🤖❤️', 'is', 'revolutionizing', 'how', 'businesses', 'interpret', 'customer', 'feedback', 'Companies', 'use', 'NLP', 'models', 'to', 'analyze', 'tweets', 'reviews', 'comments—identifying', 'positive', 'negative', 'or', 'neutral', 'sentiments', '🎭📊', 'But', 'it', 'is', "n't", 'always', 'accurate', 'Context', 'matters', "'This", 'movie', 'is', 'sick', 'could', 'mean', 'amazing', 'or', 'terrible—confusing', 'AI', '🤔😵\u200d💫']
TextBlob Tokenizer:- Splits text into words and sentences using simple rules, removing punctuation from words but keeping sentence structure intact.
.?! removed


In [None]:
# 8. spaCy Tokenization
# Loads the "en_core_web_sm" pipeline by name and collects the text of every
# token it produces for the sample. No install/download step for that pipeline
# is visible in this section, so it presumably has to be present in the
# environment already (TODO confirm).
# In the recorded output whitespace shows up as tokens ('\n', ' ') and the
# zero-width joiner inside the last emoji sequence becomes its own token.
nlp = spacy.load("en_core_web_sm")
spacy_tokens = [token.text for token in nlp(text)]
print("\nspaCy Tokenization:", spacy_tokens)  # leading "\n" prints a blank line before the label
print("spaCy is the most powerful tokenizer as it handles contractions, punctuation, emojis, and special characters efficiently.")





spaCy Tokenization: ['Sentiment', 'analysis', '💬', '🤖', '❤', '️', 'is', 'revolutionizing', 'how', '?', 'businesses', 'interpret', 'customer', 'feedback', '!', '\n', 'Companies', 'use', 'NLP', 'models', ' ', 'to', 'analyze', 'tweets', ',', 'reviews', '&', 'comments', '—', 'identifying', 'positive', ',', 'negative', ',', 'or', 'neutral', 'sentiments', '🎭', '📊', '.', '\n', 'But', 'it', 'is', "n't", ' ', 'always', 'accurate', '...', 'Context', 'matters', '!', "'", 'This', 'movie', 'is', 'sick', '!', "'", 'could', 'mean', 'amazing', 'or', 'terrible', '—', 'confusing', 'AI', '!', '🤔', '😵', '\u200d', '💫']
spaCy is the most powerful tokenizer as it handles contractions, punctuation, emojis, and special characters efficiently.


In [None]:
# 9. Gensim word tokenizer
# The keyword arguments below are simple_preprocess's own defaults, spelled out
# so the filtering is visible: accents are left alone (deacc=False) and only
# tokens of 2 to 15 characters are kept. That length filter is why the lone
# 't' from "isn't" is missing from the recorded output. Emoji and punctuation
# are dropped and everything is lowercased.
gensim_tokens = simple_preprocess(
    text,
    deacc=False,
    min_len=2,
    max_len=15,
)
print("\nGensim Tokenization:", gensim_tokens)
print("Removes punctuations and converts to lowercase also is fast.")


Gensim Tokenization: ['sentiment', 'analysis', 'is', 'revolutionizing', 'how', 'businesses', 'interpret', 'customer', 'feedback', 'companies', 'use', 'nlp', 'models', 'to', 'analyze', 'tweets', 'reviews', 'comments', 'identifying', 'positive', 'negative', 'or', 'neutral', 'sentiments', 'but', 'it', 'isn', 'always', 'accurate', 'context', 'matters', 'this', 'movie', 'is', 'sick', 'could', 'mean', 'amazing', 'or', 'terrible', 'confusing', 'ai']
Removes punctuations and converts to lowercase also is fast.


In [None]:
# 10. Tokenization with Keras
# Tokenizer is imported in the notebook's import cell next to
# text_to_word_sequence, so this cell no longer carries its own import.
# NOTE: this rebinds `tokenizer`, which held a TreebankWordTokenizer until now.
tokenizer = Tokenizer()

# Build the vocabulary from the single sample document.
tokenizer.fit_on_texts([text])

# One input text in -> a list holding one sequence of integer ids.
keras_tokens = tokenizer.texts_to_sequences([text])
print("Keras Numeric Tokenization:", keras_tokens)

# word -> id mapping. In the recorded output the repeated words 'is' and 'or'
# get ids 1 and 2, the rest are numbered in order of first appearance, and all
# words are lowercased.
word_index = tokenizer.word_index
print("\nWord Index:", word_index)
print("Maps each token to numeric values")

Keras Numeric Tokenization: [[3, 4, 5, 1, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 2, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 1, 35, 36, 37, 38, 39, 2, 40, 41, 42]]

Word Index: {'is': 1, 'or': 2, 'sentiment': 3, 'analysis': 4, '💬🤖❤️': 5, 'revolutionizing': 6, 'how': 7, 'businesses': 8, 'interpret': 9, 'customer': 10, 'feedback': 11, 'companies': 12, 'use': 13, 'nlp': 14, 'models': 15, 'to': 16, 'analyze': 17, 'tweets': 18, 'reviews': 19, 'comments—identifying': 20, 'positive': 21, 'negative': 22, 'neutral': 23, 'sentiments': 24, '🎭📊': 25, 'but': 26, 'it': 27, "isn't": 28, 'always': 29, 'accurate': 30, 'context': 31, 'matters': 32, "'this": 33, 'movie': 34, 'sick': 35, "'": 36, 'could': 37, 'mean': 38, 'amazing': 39, 'terrible—confusing': 40, 'ai': 41, '🤔😵\u200d💫': 42}
Maps each token to numeric values


In [None]:
# Treebank Tokenizer: Follows Penn Treebank rules, splits contractions.
# TreebankWordTokenizer is already imported in the notebook's import cell, so
# it is not re-imported here. A fresh instance is still created because
# `tokenizer` was rebound to the Keras Tokenizer in the previous cell.
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize("Can't")
tokens  # bare last expression so the notebook displays the result: ['Ca', "n't"]

['Ca', "n't"]