# Simple tokenization and stemming

In [1]:
from nltk.tokenize import word_tokenize

In [2]:
from nltk.stem import PorterStemmer

In [3]:
# Sample sentence for the tokenization and stemming demo.
text = "Running runners run in a race."

In [6]:
import nltk

# word_tokenize loads NLTK's 'punkt' tokenizer models. Fetch them before the
# first tokenization so this cell also runs on a machine where they have not
# been installed yet; the notebook otherwise downloads 'punkt' only in a much
# later cell. quiet=True keeps the download log out of the cell output.
nltk.download('punkt', quiet=True)

# Split the sample sentence into word and punctuation tokens.
tokens = word_tokenize(text)

In [7]:
# Porter stemmer instance used for the first stemming example.
stemmer = PorterStemmer()

In [8]:
# Stem every token of the sample sentence, keeping the original order.
stemmed = list(map(stemmer.stem, tokens))

In [9]:
# Show the stemmed tokens.
print("Stemmed Words:", stemmed)

Stemmed Words: ['run', 'runner', 'run', 'in', 'a', 'race', '.']


# Advanced Lemmatization

In [10]:
import nltk

In [11]:
from nltk.corpus import wordnet

In [12]:
from nltk.stem import WordNetLemmatizer

In [14]:
from nltk import pos_tag,word_tokenize

In [15]:
# Download the 'averaged_perceptron_tagger' resource into the local nltk_data
# directory (the log below shows it being unzipped under taggers/).
# Presumably this is the model that pos_tag relies on further down -- confirm
# against the NLTK version in use.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\2mscds29\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [18]:
# Download the 'wordnet' package into the local nltk_data directory.
# Presumably this is the data WordNetLemmatizer reads when lemmatizing --
# confirm against the NLTK version in use.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\2mscds29\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
# Download the 'punkt' package into the local nltk_data directory.
# NOTE(review): word_tokenize is first called near the top of the notebook,
# before this cell runs; presumably it needs this resource -- confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\2mscds29\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
# Sample sentence for the POS-aware lemmatization demo.
txt = "The striped bats are hanging on their feet."

In [22]:
# Tokenize the lemmatization sentence. This rebinds `tokens`, which previously
# held the tokens of the stemming example.
tokens = word_tokenize(txt)

In [23]:
# WordNet-based lemmatizer used in the cell below.
lemmatizer = WordNetLemmatizer()

In [29]:
def get_wordnet_pos(treebank_tag):
    """Map a Treebank-style POS tag to the corresponding WordNet POS constant.

    The decision is made on the tag's leading letter:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag falls back to noun.

    Args:
        treebank_tag: POS tag string, e.g. the second item of a pair
            returned by ``pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
        ``wordnet.ADV``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags (determiners, punctuation, ...) are treated as nouns.
    return wordnet.NOUN
# POS-tag the tokens, then lemmatize each word using the WordNet POS that
# corresponds to its tag.
lemmatized = [
    lemmatizer.lemmatize(word, get_wordnet_pos(tag))
    for word, tag in pos_tag(tokens)
]
print("Lemmatized Words:", lemmatized)

Lemmatized Words: ['The', 'striped', 'bat', 'be', 'hang', 'on', 'their', 'foot', '.']


# Types of stemming

# Snowball stemmer

In [31]:
from nltk.stem.snowball import SnowballStemmer

# English Snowball stemmer. This rebinds `stemmer`, which held the
# PorterStemmer instance earlier in the notebook.
stemmer = SnowballStemmer("english")
words = ["running", "ran", "runner", "easily", "fairly", "happiness"]

# Stem each word, then print every word next to its stem.
stems = list(map(stemmer.stem, words))
for word, stem in zip(words, stems):
    print(f"{word} → {stem}")


running → run
ran → ran
runner → runner
easily → easili
fairly → fair
happiness → happi


# Lancaster stemmer

In [45]:

from nltk.stem import LancasterStemmer

# Lancaster stemmer applied to a small word list.
lan = LancasterStemmer()
words = ["activation", "controlling", "defensible", "happily"]
stems = list(map(lan.stem, words))
print(stems)

['act', 'control', 'defens', 'happy']


# Lovins stemmer

In [51]:
!pip install Stemming

Collecting Stemming
  Downloading https://files.pythonhosted.org/packages/d1/eb/fd53fb51b83a4e3b8e98cfec2fa9e4b99401fce5177ec346e4a5c61df71e/stemming-1.0.1.tar.gz
Building wheels for collected packages: Stemming
  Running setup.py bdist_wheel for Stemming: started
  Running setup.py bdist_wheel for Stemming: finished with status 'done'
  Stored in directory: C:\Users\2mscds29\AppData\Local\pip\Cache\wheels\e8\05\2e\2ddeb64d4464b854b48323f9676528c17560da7d153db7b0e2
Successfully built Stemming
Installing collected packages: Stemming
Successfully installed Stemming-1.0.1


In [64]:
import nltk

from stemming.lovins import stem
from nltk.tokenize import word_tokenize

# Same word list as the Lancaster example, stemmed with the Lovins `stem`
# function imported from stemming.lovins in this cell.
words = ["activation", "controlling", "defensible", "happily"]
stems = list(map(stem, words))
print(stems)

['activ', 'control', 'defens', 'hap']


# Regex stemmer

In [65]:
import re
def regex_stemmer(tokens, min_length=0):
    """Strip a trailing ``ing``, ``ed`` or ``s`` from a single token.

    Despite the plural parameter name (kept for backward compatibility),
    this function works on ONE string; apply it per token to stem a list.

    Args:
        tokens: The word (a ``str``) to stem.
        min_length: Tokens shorter than this many characters are returned
            unchanged. This protects short words from being reduced to a
            fragment (e.g. ``'is'`` -> ``'i'``). The default of 0 disables
            the guard, so every token is processed.

    Returns:
        The token with the matched suffix removed, or the token itself when
        no suffix matches or it is shorter than ``min_length``.
    """
    if len(tokens) < min_length:
        return tokens
    # `$` anchors the alternation to the end of the token, so only a suffix
    # is ever removed.
    return re.sub(r'(ing|ed|s)$', '', tokens)
# Apply the regex stemmer to every token; `tokens` still holds the tokens of
# `txt` from the lemmatization section.
list(map(regex_stemmer, tokens))

['The', 'strip', 'bat', 'are', 'hang', 'on', 'their', 'feet', '.']