### Stemming

Stemming is the process of reducing words to their root or stem form by removing suffixes, allowing different variations of a word to be treated as the same.

In [1]:
import nltk
from nltk.stem.porter import PorterStemmer

In [2]:
# Sample vocabulary for the stemming demos: forms of "run" plus two "-ly" adverbs.
words = ['run', 'runner', 'ran', 'runs', 'easily', 'fairly']
# Porter stemmer instance applied to each entry of `words`.
p_stemmer = PorterStemmer()

In [3]:
# Print every word next to the stem the Porter stemmer produces for it.
for word in words:
    print(f'{word}----->{p_stemmer.stem(word)}')

run----->run
runner----->runner
ran----->ran
runs----->run
easily----->easili
fairly----->fairli


In [4]:
from nltk.stem.snowball import SnowballStemmer

In [5]:
# Snowball stemmer configured for English, for comparison with the Porter stemmer.
s_stemmer = SnowballStemmer('english')

In [6]:
# Print every word next to the stem the Snowball stemmer produces for it.
for word in words:
    print(f'{word} ----->{s_stemmer.stem(word)}')

run ----->run
runner ----->runner
ran ----->ran
runs ----->run
easily ----->easili
fairly ----->fair


### Lemmas


A lemma is the base or canonical form of a word, and lemmatization is the process of reducing words to their base forms. It helps normalize text data and extract core meanings, improving accuracy in various natural language processing tasks.

Lemmas tend to be more accurate than stems, but lemmatization is more computationally expensive because it takes the context around the word into account.

In [7]:
import spacy

# Load the `en_core_web_sm` pipeline and parse a sentence containing several forms of "run".
nlp = spacy.load('en_core_web_sm')
doc1 = nlp("I am a runner in a race because I love to run since I ran today")

# One row per token: text, part-of-speech tag, integer lemma ID, lemma string.
for token in doc1:
    print(token.text, token.pos_, token.lemma, token.lemma_, sep=' \t ')

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
runner 	 NOUN 	 12640964157389618806 	 runner
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
race 	 NOUN 	 8048469955494714898 	 race
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
run 	 VERB 	 12767647472892411841 	 run
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
today 	 NOUN 	 11042482332948150395 	 today


In [8]:
# Alternative way of showing lemmas
def show_lemmas(text):
    """Print one aligned row per token: text, POS tag, integer lemma ID, lemma string.

    `text` is an iterable of token-like objects exposing `text`, `pos_`,
    `lemma` and `lemma_` (for example a parsed spaCy document). The first
    three columns are left-aligned and padded to 12, 6 and 22 characters;
    values longer than the column width are printed in full, not truncated.
    """
    for token in text:
        columns = (
            token.text.ljust(12),
            token.pos_.ljust(6),
            str(token.lemma).ljust(22),
            token.lemma_,
        )
        print(' '.join(columns))
        
# Parse a short sentence and tabulate its lemmas with show_lemmas.
doc2 = nlp("I saw ten mice today")
show_lemmas(doc2)

I            PRON   4690420944186131903    I
saw          VERB   11925638236994514241   see
ten          NUM    7970704286052693043    ten
mice         NOUN   1384165645700560590    mouse
today        NOUN   11042482332948150395   today


### Stop Words
How to access spaCy's default stop words (326 of them) and how to add or remove entries.

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')
# Show the default stop-word set with the built-in print(); the pipeline
# object returned by spacy.load has no `print` method, so `nlp.print(...)`
# raises AttributeError.
print(nlp.Defaults.stop_words)

In [9]:
# Size of the default stop-word set (last expression in the cell, so its value is displayed).
len(nlp.Defaults.stop_words)

326

In [10]:
# How to tell whether a word is a stop word: read the `is_stop` flag on its vocabulary entry.
for candidate in ('is', 'mystery'):
    display(candidate, nlp.vocab[candidate].is_stop)

'is'

True

'mystery'

False

In [11]:
# How to add a stop word: flag its vocabulary entry and add it to the defaults set.
nlp.vocab['btw'].is_stop = True
nlp.Defaults.stop_words.add('btw')

# Validation: report the size of the default set after the addition.
len(nlp.Defaults.stop_words)

327

In [12]:
# How to remove a stop word: drop it from the defaults set AND clear the flag
# on its vocabulary entry, mirroring the two steps used when adding one.
# `discard` (rather than `remove`) keeps this cell re-runnable: it does not
# raise KeyError if 'beyond' has already been removed.
nlp.Defaults.stop_words.discard('beyond')
nlp.vocab['beyond'].is_stop = False
# Validation: the flag should now read False.
nlp.vocab['beyond'].is_stop

False