Stemming in NLTK

In [18]:
from nltk.stem import PorterStemmer

In [19]:
stemmer = PorterStemmer()
words = ["eating", "eats", "eat", "ate", "adjustable", "rafting", "ability", "meeting"]
for word in words:
    print(word, "|", stemmer.stem(word))

eating | eat
eats | eat
eat | eat
ate | ate
adjustable | adjust
rafting | raft
ability | abil
meeting | meet


Lemmatization in Spacy


In [20]:
import spacy

In [21]:
nlp = spacy.load("en_core_web_sm")

doc = nlp("Mando talked for 3 hours although talking isn't his thing")
doc = nlp("eating eats eat ate adjustable rafting ability meeting better")
for token in doc:
    print(token, " | ", token.lemma_)

eating  |  eat
eats  |  eat
eat  |  eat
ate  |  eat
adjustable  |  adjustable
rafting  |  raft
ability  |  ability
meeting  |  meet
better  |  well


In [22]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

Customizing lemmatizer


In [23]:
arr = nlp.get_pipe('attribute_ruler')
arr.add([[{"TEXT":"Bro"}],[{"TEXT":"Brah"}]],{"LEMMA":"Brother"})
doc = nlp("Bro, you wanna go? Brah, don't say no! I am exhausted")

token = [(tokens.text,tokens.lemma_) for tokens in doc]
token

[('Bro', 'Brother'),
 (',', ','),
 ('you', 'you'),
 ('wanna', 'wanna'),
 ('go', 'go'),
 ('?', '?'),
 ('Brah', 'Brother'),
 (',', ','),
 ('do', 'do'),
 ("n't", 'not'),
 ('say', 'say'),
 ('no', 'no'),
 ('!', '!'),
 ('I', 'I'),
 ('am', 'be'),
 ('exhausted', 'exhaust')]

Exercise

In [26]:


doc = nlp("running painting walking dressing likely children who good ate fishing")
lemmatization = [(word,word.lemma_) for word in doc]
lemmatization

[(running, 'run'),
 (painting, 'painting'),
 (walking, 'walking'),
 (dressing, 'dress'),
 (likely, 'likely'),
 (children, 'child'),
 (who, 'who'),
 (good, 'good'),
 (ate, 'eat'),
 (fishing, 'fish')]

In [36]:
text = """Latha is very multi talented girl.She is good at many skills like dancing, running, singing, playing.She also likes eating Pav Bhagi. she has a 
habit of fishing and swimming too.Besides all this, she is a wonderful at cooking too.
"""

#step1: Word tokenizing
doc = nlp(text)


#step2: getting the base form for each token using spacy 'lemma_'
lemmatizations = [token.text for token in doc]
print("Lemma word list: ",lemmatizations)

#step3: joining all words in a list into string using 'join()'
full_text_lemma = " ".join(lemmatizations)
    
print(full_text_lemma)

Lemma word list:  ['Latha', 'is', 'very', 'multi', 'talented', 'girl', '.', 'She', 'is', 'good', 'at', 'many', 'skills', 'like', 'dancing', ',', 'running', ',', 'singing', ',', 'playing', '.', 'She', 'also', 'likes', 'eating', 'Pav', 'Bhagi', '.', 'she', 'has', 'a', '\n', 'habit', 'of', 'fishing', 'and', 'swimming', 'too', '.', 'Besides', 'all', 'this', ',', 'she', 'is', 'a', 'wonderful', 'at', 'cooking', 'too', '.', '\n']
Latha is very multi talented girl . She is good at many skills like dancing , running , singing , playing . She also likes eating Pav Bhagi . she has a 
 habit of fishing and swimming too . Besides all this , she is a wonderful at cooking too . 

