# Stemming and Lemmatization

Stemming is a process of REDUCING a word to its base/root form.

Stemming is a rule-based approach that does not necessarily produce a valid word (for example, the Porter stemmer below turns "easily" into "easili").

In [20]:
!python --version
!pip show nltk

Python 3.11.7
Name: nltk
Version: 3.8.1
Summary: Natural Language Toolkit
Home-page: https://www.nltk.org/
Author: NLTK Team
Author-email: nltk.team@gmail.com
License: Apache License, Version 2.0
Location: /opt/anaconda3/lib/python3.11/site-packages
Requires: click, joblib, regex, tqdm
Required-by: sentence-transformers


In [1]:
import nltk


In [2]:
# Load the spaCy pipeline named 'en_core_web_sm'. The resulting `model` object
# is called on text in later cells and also exposes the default stop-word set.
# NOTE(review): assumes the 'en_core_web_sm' package is already installed in
# this environment — confirm before a fresh Restart & Run All.
import spacy
model = spacy.load('en_core_web_sm')

spaCy model architecture:

https://spacy.io/usage/layers-architectures/

In [3]:
# Sample text: inflected forms of "run" and "cache" plus two adverbs.
# (Python 3 str literals are already Unicode, so no u-prefix is required.)
textData = "running run easily fairly cache caching cached"

# Run the sample text through the spaCy pipeline; the result is kept in `data`
# for the spaCy lemmatization cell further down.
data = model(textData)

# Stemming

In [4]:
# Stemming using NLTK: first fetch the NLTK resources this notebook relies on.
# nltk.download() returns a boolean; the last call's value (True) is what the
# cell echoes.

nltk.download('punkt')    # tokenizer models used by word_tokenize
nltk.download('wordnet')  # WordNet corpus used by WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oysterable/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/oysterable/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Tokenize the sample text into individual word tokens with NLTK.
# `words` is the input for both the stemming and the lemmatization cells.
from nltk.tokenize import word_tokenize
words = word_tokenize(textData)

In [6]:
# Build a Porter stemmer and apply it to every token in `words`.
stemmer = nltk.PorterStemmer()
stemmedWords = list(map(stemmer.stem, words))
# Display the stems; note that results such as 'easili' and 'cach' are not
# dictionary words.
stemmedWords

['run', 'run', 'easili', 'fairli', 'cach', 'cach', 'cach']

### Stemming using spaCy: spaCy recommends lemmatization over stemming, so it does not provide a stemmer.

# Lemmatization
It is a more sophisticated process that reduces a word to its base form (lemma), where the base form is always a valid dictionary word.

In [7]:
from nltk.stem import WordNetLemmatizer

# Lemmatize every token with NLTK's WordNet lemmatizer.
# pos='v' makes the lemmatizer treat each token as a verb; in the saved output
# the adverbs ('easily', 'fairly') come back unchanged.
lemmatizer = WordNetLemmatizer()
lemmaWords = list(map(lambda tok: lemmatizer.lemmatize(tok, pos='v'), words))
lemmaWords

['run', 'run', 'easily', 'fairly', 'cache', 'cache', 'cache']

In [8]:
# Collect the lemma spaCy assigned to each token of the processed text in `data`.
lemmaWordsFromSpacy = list(map(lambda tok: tok.lemma_, data))
lemmaWordsFromSpacy

['run', 'run', 'easily', 'fairly', 'cache', 'cache', 'cache']

# Stopwords

Stopwords are words that carry little SEMANTIC meaning; depending on the task, they may or may not add useful context or information to the text data.

Reference: https://www.ranks.nl/stopwords

In [9]:
#Stopwords removal in NLTK
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [16]:
# Fetch the NLTK stopword corpus (the saved log shows it was already up to date).
nltk.download('stopwords')

# English stopwords from NLTK; used by the filtering cell below.
stopWordList = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/oysterable/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Sample sentence for the stop-word removal examples.
# Typo in the sample fixed ("I an on" -> "I am on"). Both "an" and "am" are
# stop words in the NLTK and spaCy lists, so the filtered results shown in the
# following cells are unchanged.
textData = "Welcome to Las Vegas. I am on vacation and I am swimming in the pool"
# Tokenize the sentence into words with NLTK.
words = word_tokenize(textData)

In [19]:
# Filter the stop words out of the tokenized sentence.
# "Normalization" here simply means lowercasing each token before the lookup,
# so capitalized tokens such as 'I' still match their stop-word entry.

# Membership tests against a list scan it linearly for every token; building a
# set once gives the hashed ("dictionary") lookup this cell is meant to use.
stopWordSet = set(stopWordList)
filteredWords = [word for word in words if word.lower() not in stopWordSet]
print(filteredWords)

['Welcome', 'Las', 'Vegas', '.', 'vacation', 'swimming', 'pool']


In [18]:
# Same stop-word filtering with spaCy: process the sentence, then keep the text
# of every token whose is_stop flag is not set.
data = model(textData)
filteredData = [
    tok.text
    for tok in data
    if not tok.is_stop
]
filteredData

['Welcome', 'Las', 'Vegas', '.', 'vacation', 'swimming', 'pool']

In [27]:
# Add a custom stopword to the existing spaCy model.
# Goal: add 'eric' as a stopword.
# Two steps: add the string to the language defaults' stop-word set, then set
# the is_stop flag on the matching vocabulary entry. Only the exact lowercase
# string 'eric' is touched here.
model.Defaults.stop_words.add('eric')
model.vocab['eric'].is_stop=True

In [28]:
# Display the default stop-word set to check that 'eric' is now present.
# NOTE(review): the saved output also lists 'Eric', which no visible cell adds —
# presumably left over from an earlier run; confirm with Restart & Run All.
model.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'Eric',
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'eric',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four

In [29]:
# Remove the custom stopword 'eric' from the default stop-word set and clear
# the is_stop flag on its vocabulary entry.
# discard() is used instead of remove() so that re-running this cell, or
# running it before the "add" cell, does not raise KeyError when 'eric' is
# already absent.

model.Defaults.stop_words.discard('eric')
model.vocab['eric'].is_stop=False

In [30]:
# Display the default stop-word set again to check that 'eric' is gone.
# NOTE(review): 'Eric' (capitalized) is still listed in the saved output and is
# not removed by any visible cell — presumably stale kernel state; confirm.
model.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'Eric',
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from