In [1]:
import nltk
import spacy
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

##### __Stemming in NLTK__

In [2]:
# Stemming chops common suffixes off a word using fixed rules, leaving a crude word stem.
# The stem does not have to be a dictionary word ("ability" -> "abil"), and irregular forms
# are left as they are ("ate" stays "ate"). Mapping a word to its dictionary form (its lemma)
# is lemmatization, which is covered in the spaCy section below.

stemmer = PorterStemmer()
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]

# Print each word next to the stem the Porter algorithm produces for it.
for word in words:
    print(word, " |  ", stemmer.stem(word))

eating  |   eat
eats  |   eat
eat  |   eat
ate  |   ate
adjustable  |   adjust
rafting  |   raft
ability  |   abil
meeting  |   meet


##### __Lemmatization in spaCy__

In [3]:
# Load the small English pipeline; it is reused by the cells below.
nlp = spacy.load("en_core_web_sm")

# Lemmatization maps each token to its dictionary form, so irregular forms are resolved too
# (compare "ate" and "better" here with the stemmer output above).
doc = nlp("eating eats eat ate adjustable rafting ability meeting better")
for token in doc:
    # token.lemma_ is the lemma as text; token.lemma is the integer hash value of that lemma.
    print(token, " | ", token.lemma_, " | ", token.lemma)

# The lemmatizer sometimes gets words it does not know (for example slang) wrong,
# so for those cases we have to customize how lemmas are assigned.

eating  |  eat  |  9837207709914848172
eats  |  eat  |  9837207709914848172
eat  |  eat  |  9837207709914848172
ate  |  eat  |  9837207709914848172
adjustable  |  adjustable  |  6033511944150694480
rafting  |  raft  |  7154368781129989833
ability  |  ability  |  11565809527369121409
meeting  |  meeting  |  14798207169164081740
better  |  well  |  4525988469032889948


In [4]:
# Show the names of the components that make up the loaded pipeline.
pipeline_components = nlp.pipe_names
print(pipeline_components)

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


#### __Customizing lemmatizer__

In [10]:
# Lemmas are produced by the `lemmatizer` component. The `attribute_ruler` is a separate
# component that is already part of the pipeline (see nlp.pipe_names): it assigns token
# attributes, such as LEMMA, to every token that matches a pattern. We fetch the existing
# component and register our own rule on it; nothing has to be re-added to the pipeline.
ar = nlp.get_pipe("attribute_ruler")
print(ar)

# Slang spellings that should all be reported with the lemma "Brother".
slang_patterns = [[{"TEXT": "Bro"}], [{"TEXT": "Brah"}]]
slang_attrs = {"LEMMA": "Brother"}

# ar.add() appends a new rule on every call, so only add it when it is not registered yet.
# Without this guard, each re-run of the cell stacks another duplicate rule on the shared `nlp`.
already_registered = any(
    rule["patterns"] == slang_patterns and rule["attrs"] == slang_attrs
    for rule in ar.patterns
)
if not already_registered:
    ar.add(slang_patterns, slang_attrs)

doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")

# "Bro" and "Brah" now come out as "Brother"; every other token keeps its regular lemma.
for token in doc:
    print(token, " | ", token.lemma_)

<spacy.pipeline.attributeruler.AttributeRuler object at 0x7f104eae0700>
Bro  |  Brother
,  |  ,
you  |  you
wanna  |  wanna
go  |  go
?  |  ?
Brah  |  Brother
,  |  ,
do  |  do
n't  |  not
say  |  say
no  |  no
!  |  !
I  |  I
am  |  be
exhausted  |  exhaust
