In [9]:
# Fetch the NLTK data packages requested by this notebook. Each call returns a
# status flag; the value of the last call is what the cell displays.
import nltk

# 'punkt' tokenizer models. NOTE(review): none of the tokenizers used in the
# cells visible here obviously load punkt - confirm it is still needed.
nltk.download('punkt')
# WordNet data, presumably for the WordNetLemmatizer cells further down.
nltk.download('wordnet')
# 'omw-1.4' package, downloaded alongside WordNet.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [10]:
# Sample sentence shared by every tokenizer cell below.
# The pasted text carries three invisible zero-width characters (U+200D before
# "Twenty20", U+200B before "back", U+200C before the second "for"). They are
# spelled as escapes here so they are visible in the source; the string value
# is unchanged, and they still show up inside the tokens printed below.
text = (
    "Australia fast bowler Pat Cummins was ruled out on Saturday "
    "(January 31, 2026) for next month's \u200dTwenty20 World Cup "
    "after failing to recover from a nagging \u200bback injury, "
    "and Ben Dwarshuis will replace him \u200cfor the global showpiece."
)

In [11]:
# Whitespace tokenization: tokens are the chunks between runs of whitespace,
# so punctuation stays attached to the neighbouring word (e.g. '31,').
from nltk.tokenize import WhitespaceTokenizer

wt = WhitespaceTokenizer()
# Header and token list go out as two lines via sep.
print("Whitespace Tokenization:", wt.tokenize(text), sep="\n")

Whitespace Tokenization:
['Australia', 'fast', 'bowler', 'Pat', 'Cummins', 'was', 'ruled', 'out', 'on', 'Saturday', '(January', '31,', '2026)', 'for', 'next', "month's", '\u200dTwenty20', 'World', 'Cup', 'after', 'failing', 'to', 'recover', 'from', 'a', 'nagging', '\u200bback', 'injury,', 'and', 'Ben', 'Dwarshuis', 'will', 'replace', 'him', '\u200cfor', 'the', 'global', 'showpiece.']


In [12]:
# Word/punctuation tokenization: word characters and punctuation characters
# end up in separate tokens (e.g. "month's" -> 'month', "'", 's').
from nltk.tokenize import WordPunctTokenizer

wpt = WordPunctTokenizer()
# The leading "\n" in the header leaves a blank line before this section.
print("\nPunctuation-based Tokenization:", wpt.tokenize(text), sep="\n")


Punctuation-based Tokenization:
['Australia', 'fast', 'bowler', 'Pat', 'Cummins', 'was', 'ruled', 'out', 'on', 'Saturday', '(', 'January', '31', ',', '2026', ')', 'for', 'next', 'month', "'", 's', '\u200d', 'Twenty20', 'World', 'Cup', 'after', 'failing', 'to', 'recover', 'from', 'a', 'nagging', '\u200b', 'back', 'injury', ',', 'and', 'Ben', 'Dwarshuis', 'will', 'replace', 'him', '\u200c', 'for', 'the', 'global', 'showpiece', '.']


In [13]:
# Grammar-aware tokenization (Penn Treebank conventions): punctuation is split
# off and the possessive clitic is kept whole (e.g. "month's" -> 'month', "'s").
from nltk.tokenize import TreebankWordTokenizer

tbt = TreebankWordTokenizer()
# The leading "\n" in the header leaves a blank line before this section.
print("\nTreebank Tokenization:", tbt.tokenize(text), sep="\n")


Treebank Tokenization:
['Australia', 'fast', 'bowler', 'Pat', 'Cummins', 'was', 'ruled', 'out', 'on', 'Saturday', '(', 'January', '31', ',', '2026', ')', 'for', 'next', 'month', "'s", '\u200dTwenty20', 'World', 'Cup', 'after', 'failing', 'to', 'recover', 'from', 'a', 'nagging', '\u200bback', 'injury', ',', 'and', 'Ben', 'Dwarshuis', 'will', 'replace', 'him', '\u200cfor', 'the', 'global', 'showpiece', '.']


In [14]:
# Tweet tokenization: NLTK's TweetTokenizer with its default settings,
# applied to the same sample sentence for comparison.
from nltk.tokenize import TweetTokenizer

tt = TweetTokenizer()
# The leading "\n" in the header leaves a blank line before this section.
print("\nTweet Tokenization:", tt.tokenize(text), sep="\n")


Tweet Tokenization:
['Australia', 'fast', 'bowler', 'Pat', 'Cummins', 'was', 'ruled', 'out', 'on', 'Saturday', '(', 'January', '31', ',', '2026', ')', 'for', 'next', "month's", ' \u200dT', 'wenty', '20', 'World', 'Cup', 'after', 'failing', 'to', 'recover', 'from', 'a', 'nagging', '\u200b', 'back', 'injury', ',', 'and', 'Ben', 'Dwarshuis', 'will', 'replace', 'him', '\u200cfor', 'the', 'global', 'showpiece', '.']


In [15]:
# Multi-word expressions: listed word sequences are merged into a single
# token, joined with "_" (e.g. 'machine', 'learning' -> 'machine_learning').
from nltk.tokenize import MWETokenizer

mwe = MWETokenizer([
    ('machine', 'learning'),
    ('artificial', 'intelligence'),
])
sentence = "I love machine learning and artificial intelligence"
# MWETokenizer works on an already-tokenized list, hence the plain split().
print("\nMWE Tokenization:", mwe.tokenize(sentence.split()), sep="\n")


MWE Tokenization:
['I', 'love', 'machine_learning', 'and', 'artificial_intelligence']


In [16]:
# Porter stemming: rule-based suffix stripping; the stems need not be real
# words (e.g. 'flies' -> 'fli').
from nltk.stem import PorterStemmer

ps = PorterStemmer()
# Sample words; this list is reused by the Snowball cell that follows.
words = [
    "running",
    "flies",
    "easily",
    "fairness",
    "Properly",
]

print("\nPorter Stemming:")
for w in words:
    print(f"{w} → {ps.stem(w)}")


Porter Stemming:
running → run
flies → fli
easily → easili
fairness → fair
Properly → properli


In [17]:
# Snowball stemming with the English rules, run over the `words` list
# defined in the Porter cell so the two stemmers can be compared.
from nltk.stem import SnowballStemmer

ss = SnowballStemmer("english")
print("\nSnowball Stemming:")
for w in words:
    print(f"{w} → {ss.stem(w)}")


Snowball Stemming:
running → run
flies → fli
easily → easili
fairness → fair
Properly → proper


In [18]:
# WordNet lemmatization without a part-of-speech hint. No POS is passed to
# lemmatize(), so 'running' and 'better' come back unchanged.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# Replaces the `words` list used by the stemming cells.
words = [
    "running",
    "better",
    "cars",
    "flies",
]

print("\nLemmatization:")
for w in words:
    print(f"{w} → {lemmatizer.lemmatize(w)}")


Lemmatization:
running → running
better → better
cars → car
flies → fly


In [19]:
# Fetch the data needed for POS-aware lemmatization. `nltk` is already
# imported by the first download cell; the import is repeated here.
import nltk

# English model for the tagger behind nltk.pos_tag (used by get_wordnet_pos below).
nltk.download('averaged_perceptron_tagger_eng')
# 'wordnet' and 'omw-1.4' were already requested by the first download cell;
# these repeat calls just report the packages as up to date.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [20]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk

# Same sample words as the POS-free lemmatization run, so the two outputs can
# be compared line by line.
words = [
    "running",
    "better",
    "cars",
    "flies",
]

# Fresh lemmatizer instance for the POS-aware run.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the tag NLTK assigns to ``word``.

    The word is tagged on its own (a one-element list passed to
    ``nltk.pos_tag``) and only the first letter of the resulting tag,
    upper-cased, is inspected:

    * ``J`` -> ``wordnet.ADJ``
    * ``V`` -> ``wordnet.VERB``
    * ``N`` -> ``wordnet.NOUN``
    * ``R`` -> ``wordnet.ADV``

    Any other initial falls back to ``wordnet.NOUN``.
    """
    # pos_tag returns [(token, tag)]; keep just the tag's leading letter.
    initial = nltk.pos_tag([word])[0][1][0].upper()
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return pos_by_initial.get(initial, wordnet.NOUN)

# POS-aware lemmatization: each word is tagged first and the matching WordNet
# POS is handed to the lemmatizer.
print("Lemmatization:")
# map() is lazy, so get_wordnet_pos runs once per word, right before it is printed.
for w, pos in zip(words, map(get_wordnet_pos, words)):
    print(f"{w} → {lemmatizer.lemmatize(w, pos)}")

Lemmatization:
running → run
better → well
cars → car
flies → fly
