In [1]:
import nltk

# <b> PorterStemmer</b>

In [2]:
from nltk.stem import PorterStemmer

In [3]:
porter = PorterStemmer()

In [4]:
print(porter.stem('walks'))


walk


In [5]:
porter.stem('walking')

# meaningful: the stem 'walk' is a real word

'walk'

In [6]:
porter.stem('replacement')

# not meaningful: the stem 'replac' is not a real word

'replac'

In [7]:
print(porter.stem('movement'))
print(porter.stem('running'))
print(porter.stem('bosses'))
print(porter.stem('berry'))

movement
run
boss
berri


# <b>Snowball Stemmer</b>

In [8]:
from nltk.stem.snowball import SnowballStemmer

In [9]:
snowballstemmer = SnowballStemmer(language='english')

In [10]:
print(snowballstemmer.stem('walks'))
print(snowballstemmer.stem('replacement'))
print(snowballstemmer.stem('berry'))

walk
replac
berri


In [11]:
print(SnowballStemmer.languages)

# These are the languages that can be used with SnowballStemmer

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


# <b>Lemmatization</b>

In [12]:
sentence = "Lemmatization is sophosticated to stemming".split()

In [13]:
# Stemming: print the Porter stem of every token in the sentence.

for token in sentence:
    stem = porter.stem(token)
    print(stem)

lemmat
is
sophost
to
stem


In [14]:
#Using Lemmatization

# Lemmatization is case-sensitive, whereas stemming is not (the stemmer lowercases its input)

In [15]:
from nltk.stem import WordNetLemmatizer

In [16]:
# Download the WordNet dataset, which holds the word-to-lemma collection (conceptually like an N x 2 array)

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
#corpus means dataset

from nltk.corpus import wordnet

In [18]:
lemmatizer = WordNetLemmatizer()

In [19]:
print(wordnet.VERB) #constants
print(wordnet.NOUN)

v
n


In [20]:
nltk.download('omw-1.4')

# Downloaded only to resolve an error that was raised without this package

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [21]:
print(lemmatizer.lemmatize('Walking'))
print(lemmatizer.lemmatize('mouse'))
print(lemmatizer.lemmatize('was'))
print(lemmatizer.lemmatize('mice',pos='n'))

print(lemmatizer.lemmatize('was',pos='v'))  ##pos = 'v' means context = verb

#if we use stem we can see the difference
print("\n")
print(porter.stem('mouse'))
print(porter.stem('mice'))

Walking
mouse
wa
mouse
be


mous
mice


# <b>How to use Parts of speech for context in lemmatization</b>

In [22]:
# averaged_perceptron_tagger is the tagger resource that nltk.pos_tag needs to label each word with its part of speech

nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [23]:
sentence = "I ate Mango".split()

In [24]:
word_and_tag = nltk.pos_tag(sentence)
print(word_and_tag)

[('I', 'PRP'), ('ate', 'VBP'), ('Mango', 'NNS')]


In [25]:
# Convert POS tags like 'JJ', 'NNS', ... into wordnet constants like wordnet.ADJ, wordnet.NOUN, ...

def get_POS(token_tag):
    """Translate a POS tag string into the matching wordnet POS constant.

    The decision is made on the tag's leading letter:
    'J' -> wordnet.ADJ, 'N' -> wordnet.NOUN, 'V' -> wordnet.VERB,
    'R' -> wordnet.ADV. Any other tag falls back to wordnet.NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('N', wordnet.NOUN),  # wordnet.NOUN is a constant whose value is 'n'
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if token_tag.startswith(prefix):
            return pos
    # No known prefix matched: treat the word as a noun.
    return wordnet.NOUN
        
        
#wordnet.ADJ, wordnet.NOUN ,etc are constants

In [26]:
# Lemmatize each word, using its tag to pick the wordnet part of speech.
for token, tag in word_and_tag:
    wordnet_pos = get_POS(tag)
    print(lemmatizer.lemmatize(token, pos=wordnet_pos))

I
eat
Mango


In [27]:
# Another example: tag a longer sentence, then lemmatize every word
# with the wordnet part of speech derived from its tag.

sentence = "The stripped bats are hanging on their feet for the best".split()
word_and_tag = nltk.pos_tag(sentence)

for token, tag in word_and_tag:
    wordnet_pos = get_POS(tag)
    print(lemmatizer.lemmatize(token, pos=wordnet_pos))

The
stripped
bat
be
hang
on
their
foot
for
the
best


### comparing the result with stemmer

In [28]:
# First pass: take the words from the (word, tag) pairs.
# The tag is ignored because stemming only needs the word itself.
for word, _tag in word_and_tag:
    print(porter.stem(word))

print('****************************')

# Second pass: take the same words straight from the sentence list.
for word in sentence:
    print(porter.stem(word))

# Both passes are valid and print the same stems.

the
strip
bat
are
hang
on
their
feet
for
the
best
****************************
the
strip
bat
are
hang
on
their
feet
for
the
best


In [29]:
#another method to find Parts of speech

def get_POS_revised(token):
    """Return the wordnet POS constant for a single word.

    The word is tagged on its own with nltk.pos_tag, and the first letter of
    the resulting tag is mapped to a wordnet constant:
    'J' -> wordnet.ADJ, 'N' -> wordnet.NOUN, 'R' -> wordnet.ADV,
    'V' -> wordnet.VERB. Any other tag falls back to wordnet.NOUN.

    Note: the word is passed to the tagger as a one-element list, so the
    tagger sees no surrounding sentence. The tag (and therefore the lemma)
    can differ from what tagging the whole sentence at once would give.
    """
    # Named tag_to_pos rather than `dict` so the built-in dict type is not shadowed.
    tag_to_pos = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
        'V': wordnet.VERB,
    }
    # pos_tag returns a list of (word, tag) pairs; [0][1] is the tag of the only word.
    tag = nltk.pos_tag([token])[0][1]
    # tag[:1] keeps only the first letter of the tag and, unlike tag[0],
    # does not raise IndexError if the tag is an empty string.
    first_letter = tag[:1].upper()
    # .get() returns wordnet.NOUN when first_letter is not a key of tag_to_pos.
    return tag_to_pos.get(first_letter, wordnet.NOUN)

In [30]:
sentence = "The stripped bats are hanging on their feet for the best".split()

# Lemmatize each word, looking up its part of speech one word at a time.
for token in sentence:
    wordnet_pos = get_POS_revised(token)
    print(lemmatizer.lemmatize(token, pos=wordnet_pos))

The
strip
bat
be
hang
on
their
foot
for
the
best
