In [1]:
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

In [2]:
# Download required resources
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Test Corpus
corpus = [
    "The cats are playing in the garden.",
    "The children are playing with their toys.",
    "The quick brown fox jumps over the lazy dog."
]

In [4]:
# Instatiating Objects For Stemmers and Lemmatizers
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()


In [5]:
# Stemming Function
def stem_text(text, stemmer):
    tokens = word_tokenize(text)
    return ' '.join([stemmer.stem(token) for token in tokens])

In [6]:
# Lemmatization Function
def lemmatize_text(text, lemmatizer):
    tokens = word_tokenize(text)
    return ' '.join([lemmatizer.lemmatize(token, pos=get_wordnet_pos(token)) for token in tokens])

In [7]:
# Utility For Lemmatization Function (Maps POS tags from the Penn Treebank format to WordNet POS tags)
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [8]:
# Actual Function For Calling Stemmers and Lemmatizers
def process_corpus(corpus):
    for sentence in corpus:
        print(f"Original: {sentence}")

        # Stemming
        print(f"Porter Stemmer: {stem_text(sentence, porter_stemmer)}")
        print(f"Lancaster Stemmer: {stem_text(sentence, lancaster_stemmer)}")
        print(f"Snowball Stemmer: {stem_text(sentence, snowball_stemmer)}")

        # Lemmatization
        print(f"WordNet Lemmatizer: {lemmatize_text(sentence, wordnet_lemmatizer)}")
        print()

In [9]:
# Calling The Above Function And Verifying The Results
process_corpus(corpus)

Original: The cats are playing in the garden.
Porter Stemmer: the cat are play in the garden .
Lancaster Stemmer: the cat ar play in the gard .
Snowball Stemmer: the cat are play in the garden .
WordNet Lemmatizer: The cat are playing in the garden .

Original: The children are playing with their toys.
Porter Stemmer: the children are play with their toy .
Lancaster Stemmer: the childr ar play with their toy .
Snowball Stemmer: the children are play with their toy .
WordNet Lemmatizer: The child are playing with their toy .

Original: The quick brown fox jumps over the lazy dog.
Porter Stemmer: the quick brown fox jump over the lazi dog .
Lancaster Stemmer: the quick brown fox jump ov the lazy dog .
Snowball Stemmer: the quick brown fox jump over the lazi dog .
WordNet Lemmatizer: The quick brown fox jump over the lazy dog .

