In [1]:
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')


## resource:https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

## Create stemmer & Lemmatizer
stemmer=PorterStemmer()
lemmatizer = WordNetLemmatizer() 

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...


### Example

In [2]:
print('Stemming amusing : {}'.format(stemmer.stem('amusing')))
print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing',pos = 'v')))

Stemming amusing : amus
lemmatization amusing : amuse


### Use tokenize + stemming to obtain the root of every word.

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [4]:
# Define the sentence to be lemmatized
sentence = "We went out often, hiding from sight, desperately searching for food."

# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)
print(word_list)
#> ['We', 'went', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'searching', 'for', 'food', '.']

stemming_output = ' '.join([w for w in word_list])
print(stemming_output)
#> We went out often , hide from sight , desper search for food.


['We', 'went', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'searching', 'for', 'food', '.']
We went out often , hiding from sight , desperately searching for food .


### Use tokenize + lemmatize to obtain the lemma of every word.

In [5]:
# Define the sentence to be lemmatized
sentence = "We went out often, hiding from sight, desperately searching for food."

# Tokenize: Split the sentence into words
word_list = nltk.word_tokenize(sentence)
print(word_list)
#> ['We', 'went', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'searching', 'for', 'food', '.']

# Lemmatize list of words and join
lemmatized_output = ' '.join([w for w in word_list])
print(lemmatized_output)
#> We went out often , hiding from sight , desperately searching for food .

['We', 'went', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'searching', 'for', 'food', '.']
We went out often , hiding from sight , desperately searching for food .


### Sometimes the lemma of a word might change depending to it's Part Of Speech.

In [6]:
print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing',pos = 'v'))) ## Verb
print('lemmatization amusing : {}'.format(lemmatizer.lemmatize('amusing',pos = 'a'))) ## Adj

lemmatization amusing : amuse
lemmatization amusing : amusing


### Use pos_tag + lemmatize to obtain the lemma of every word.

In [7]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [8]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """map the pos_tag result to the pos format of the lemmatizer."""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


In [9]:
word = 'using'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

use


### Lemmatize every word in the sentence then add POS tag

In [11]:
sentence = "We went out often, hiding from sight, desperately searching for food."
word_list = nltk.word_tokenize(sentence)
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in word_list])

#> ['We', 'go', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'search', 'for', 'food', '.']


['We', 'go', 'out', 'often', ',', 'hiding', 'from', 'sight', ',', 'desperately', 'search', 'for', 'food', '.']
