In [None]:
# Perform tokenization (Whitespace, Punctuation-based, Treebank, Tweet, MWE) using NLTK
# Use Porter Stemmer and Snowball Stemmer for stemming
# Perform lemmatization using WordNet


In [None]:
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import (
    MWETokenizer,
    TreebankWordTokenizer,
    TweetTokenizer,
    word_tokenize,
    wordpunct_tokenize,
)


In [None]:
# Download the NLTK data packages that the cells below rely on.
# Tokenizer models for word_tokenize: older NLTK releases load 'punkt', while
# recent releases (3.9+) load 'punkt_tab' and raise LookupError if it is
# missing, so both are fetched to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet data for the lemmatization cell (WordNetLemmatizer).
nltk.download('wordnet')
nltk.download('omw-1.4')


In [None]:
# Sample sentence shared by every tokenization, stemming and lemmatization cell.
text = (
    "The Uniform Civil Code promotes equality and justice "
    "in a diverse nation like India."
)

In [None]:
# Whitespace Tokenization
# With no separator given, str.split cuts on runs of whitespace and drops
# empty strings; punctuation stays attached to the word it touches.
whitespace_tokens = text.split(sep=None)
print("Whitespace Tokens:", whitespace_tokens)

In [None]:
# Punctuation-based Tokenization
# wordpunct_tokenize separates the text into runs of word characters and runs
# of punctuation, so every punctuation sequence becomes its own token.
# word_tokenize is deliberately not used here: it applies Treebank-style
# rules and would only repeat what the Treebank cell demonstrates.
punctuation_tokens = wordpunct_tokenize(text)
print("Punctuation Tokens:", punctuation_tokens)

In [None]:
# Treebank Tokenization
# Tokenize the sample sentence with NLTK's TreebankWordTokenizer.
treebank = TreebankWordTokenizer()
treebank_tokens = treebank.tokenize(text=text)
print("Treebank Tokens:", treebank_tokens)

In [None]:
# Tweet Tokenization
# Tokenize the sample sentence with NLTK's TweetTokenizer, using its
# default constructor options.
tweet = TweetTokenizer()
tweet_tokens = tweet.tokenize(text=text)
print("Tweet Tokens:", tweet_tokens)

In [None]:
# Multi-Word Expression (MWE) Tokenization
# Register each multi-word expression, then re-tokenize the word_tokenize
# output so that a registered sequence is merged into a single token
# (joined with the tokenizer's default separator).
mwe = MWETokenizer()
mwe.add_mwe(('Uniform', 'Civil', 'Code'))
mwe.add_mwe(('diverse', 'nation'))
mwe_tokens = mwe.tokenize(word_tokenize(text))
print("MWE Tokens:", mwe_tokens)

In [None]:
# Stemming
# Run the tokens from the punctuation-based cell through two stemmers so
# their outputs can be compared side by side.
porter = PorterStemmer()
snowball = SnowballStemmer('english')

# One stem per input token, in the original token order.
porter_stems = list(map(porter.stem, punctuation_tokens))
snowball_stems = list(map(snowball.stem, punctuation_tokens))

print("Porter Stemmer:", porter_stems)
print("Snowball Stemmer:", snowball_stems)

In [None]:
# Lemmatization
# Map every token from the punctuation-based cell to its WordNet lemma.
# Only the word is passed to lemmatize(), so the lemmatizer's default
# part-of-speech setting applies to every token.
lemmatizer = WordNetLemmatizer()
lemmatized_words = list(map(lemmatizer.lemmatize, punctuation_tokens))
print("Lemmatized Words:", lemmatized_words)