In [None]:
import nltk
from nltk.tokenize import WhitespaceTokenizer, WordPunctTokenizer, TreebankWordTokenizer, TweetTokenizer
from nltk.tokenize.mwe import MWETokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [None]:
# Download necessary NLTK resources
# nltk.download() returns False when a package cannot be fetched. Check every
# result so a failed download is reported in this cell instead of being
# silently ignored and surfacing later in the cell that needs the data.
failed_resources = []
for resource in ("punkt", "wordnet", "omw-1.4"):
    if not nltk.download(resource):
        failed_resources.append(resource)

if failed_resources:
    print("Failed to download NLTK resources:", failed_resources)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Sample sentence fed to every tokenizer in the cells below.
text = (
    "NLTK is a powerful library for NLP! "
    "It's easy-to-use and supports multiple tokenization techniques."
)

In [None]:
# Tokenization: build one instance of each tokenizer being compared.
whitespace_tokenizer = WhitespaceTokenizer()
punct_tokenizer = WordPunctTokenizer()
treebank_tokenizer = TreebankWordTokenizer()
tweet_tokenizer = TweetTokenizer()

# Register the multi-word expression "easy to use"; matched token runs are
# joined with the separator ("_" is the library default, spelled out here).
mwe_tokenizer = MWETokenizer(
    mwes=[("easy", "to", "use")],
    separator="_",
)

In [None]:
print("Whitespace Tokenization:", whitespace_tokenizer.tokenize(text))
print("Punctuation-based Tokenization:", punct_tokenizer.tokenize(text))
print("Treebank Tokenization:", treebank_tokenizer.tokenize(text))
print("Tweet Tokenization:", tweet_tokenizer.tokenize(text))
# MWETokenizer merges runs of already-split tokens, so "easy-to-use" has to be
# broken into "easy", "to", "use" first. A plain text.split() keeps it as one
# token, the registered ("easy", "to", "use") expression never matches, and the
# result is indistinguishable from whitespace tokenization.
print("MWE Tokenization:", mwe_tokenizer.tokenize(text.replace("-", " ").split()))

Whitespace Tokenization: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP!', "It's", 'easy-to-use', 'and', 'supports', 'multiple', 'tokenization', 'techniques.']
Punctuation-based Tokenization: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP', '!', 'It', "'", 's', 'easy', '-', 'to', '-', 'use', 'and', 'supports', 'multiple', 'tokenization', 'techniques', '.']
Treebank Tokenization: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP', '!', 'It', "'s", 'easy-to-use', 'and', 'supports', 'multiple', 'tokenization', 'techniques', '.']
Tweet Tokenization: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP', '!', "It's", 'easy-to-use', 'and', 'supports', 'multiple', 'tokenization', 'techniques', '.']
MWE Tokenization: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'NLP!', "It's", 'easy-to-use', 'and', 'supports', 'multiple', 'tokenization', 'techniques.']


In [None]:
# Stemming: run the same word list through two stemmers for comparison.
porter = PorterStemmer()
snowball = SnowballStemmer("english")

words = ["running", "flies", "easily", "studies"]

# map() over the bound stem method yields the stems in the same order as `words`.
print("Porter Stemmer:", list(map(porter.stem, words)))
print("Snowball Stemmer:", list(map(snowball.stem, words)))


Porter Stemmer: ['run', 'fli', 'easili', 'studi']
Snowball Stemmer: ['run', 'fli', 'easili', 'studi']


In [None]:
# Lemmatization
lemmatizer = WordNetLemmatizer()

# lemmatize() looks every word up as a noun unless `pos` is given, which is
# why the verb form "running" comes back unchanged on this first line.
print("Lemmatization:", [lemmatizer.lemmatize(word) for word in words])

# Passing pos="v" looks the same words up as verbs, so inflected verb forms
# such as "running" are reduced to their base form.
print("Lemmatization (pos='v'):", [lemmatizer.lemmatize(word, pos="v") for word in words])

Lemmatization: ['running', 'fly', 'easily', 'study']
