<a href="https://colab.research.google.com/github/martinthetechie/nlp-guide/blob/main/stemming_and_lemmatization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Download the NLTK data this notebook relies on: a tokenizer model, the
# WordNet corpus and a part-of-speech tagger model.
#
# Both the legacy and the newer resource names are requested. Recent NLTK
# releases (3.9 and later, per the NLTK release notes) look up "punkt_tab"
# and "averaged_perceptron_tagger_eng" instead of "punkt" and
# "averaged_perceptron_tagger", so fetching only the legacy names leads to a
# LookupError on a fresh kernel with a current NLTK install.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
)
for resource_name in NLTK_RESOURCES:
    nltk.download(resource_name)

# Create the stemmer and the lemmatizer once; both are reused below.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Sample workplace vocabulary to normalise
terms = [
    "emailing",
    "replying",
    "meeting",
    "presentations",
    "reporting",
    "scheduling",
]

# Stemming: apply the Porter stemmer to every term, keeping the input order
stemmed_terms = list(map(stemmer.stem, terms))

# Lemmatization
def get_wordnet_pos(treebank_tag):
    """Translate a Treebank-style POS tag into a WordNet POS constant.

    Only the first character of the tag is significant:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty string) falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing (rather than indexing) keeps an empty tag on the fallback path.
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

# Lemmatize every term using the POS tag predicted for that term.
#
# Each term is tagged as a single token (nltk.pos_tag([term])) so the tag
# always describes exactly the string that is handed to the lemmatizer.
# Tokenizing first and keeping only the first token's tag would describe a
# different string whenever a term splits into several tokens.
# Note: the tagger sees each word without sentence context, so "-ing" forms
# may be tagged as nouns and then left unchanged by the lemmatizer.
lemmatized_terms = [
    lemmatizer.lemmatize(term, pos=get_wordnet_pos(nltk.pos_tag([term])[0][1]))
    for term in terms
]

# Show the original terms next to their stemmed and lemmatized forms,
# one labelled list per line.
print(
    f"Original Terms: {terms}",
    f"Stemmed Terms: {stemmed_terms}",
    f"Lemmatized Terms: {lemmatized_terms}",
    sep="\n",
)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Original Terms: ['emailing', 'replying', 'meeting', 'presentations', 'reporting', 'scheduling']
Stemmed Terms: ['email', 'repli', 'meet', 'present', 'report', 'schedul']
Lemmatized Terms: ['email', 'reply', 'meeting', 'presentation', 'reporting', 'schedule']


<h2>Stemming</h2>

In [3]:
import nltk
from nltk.stem import PorterStemmer

# Stemming on its own: run the Porter stemmer over the sample words.
stemmer = PorterStemmer()
words = [
    "emailing",
    "replying",
    "meeting",
    "presentations",
    "reporting",
    "scheduling",
]
stemmed_words = list(map(stemmer.stem, words))
print("Stemmed Words:", stemmed_words)

Stemmed Words: ['email', 'repli', 'meet', 'present', 'report', 'schedul']


In [4]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# WordNet-based lemmatizer used for the lemmatization demo in this cell
lemmatizer = WordNetLemmatizer()

# Sample words to lemmatize
words = [
    "emailing",
    "replying",
    "meeting",
    "presentations",
    "reporting",
    "scheduling",
]
# Helper that picks the WordNet part of speech to use when lemmatizing a word
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the constant: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.
    """
    tagged_pairs = nltk.pos_tag([word])
    treebank_tag = tagged_pairs[0][1]

    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Slicing keeps an empty tag on the noun fallback instead of raising.
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

# Lemmatize each word with the part of speech predicted for that word.
lemmas = list(
    map(
        lambda token: lemmatizer.lemmatize(token, pos=get_wordnet_pos(token)),
        words,
    )
)
print("Lemmas:", lemmas)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


[('emailing', 'VBG')]
[('replying', 'VBG')]
[('meeting', 'NN')]
[('presentations', 'NNS')]
[('reporting', 'NN')]
[('scheduling', 'VBG')]
Lemmas: ['email', 'reply', 'meeting', 'presentation', 'reporting', 'schedule']
