# Stemming Words with NLTK

In [1]:
from nltk import word_tokenize
from nltk.stem import *

import pandas as pd

### PorterStemmer
* PorterStemmer uses Suffix Stripping to produce stems.

In [2]:
# Sample vocabulary for comparing stemmers. Related word forms are kept
# together so it is easy to see whether a stemmer maps them to one stem.
input_tokens = [
    # overwhelm*
    'overwhelming',
    'overwhelmingly',
    # hush*
    'hushed',
    'hush',
    # function*
    'functional',
    'functionally',
    # irregular forms of "lie"
    'lying',
    'lied',
    # adverb ending in -ly
    'fairly',
    # stabil*
    'destabilize',
    'stability',
    # friend*
    'friendship',
    'friendships',
    'friendly',
    'friendless',
    # connect*
    'connect',
    'connections',
    'connected',
    # short function words
    'the',
    'these',
    'those',
    # motiv*
    'motivational',
    'motivate',
    'motivating',
]

In [3]:
# Run every sample token through the Porter stemmer; the resulting list is
# aligned index-for-index with `input_tokens`.
ps = PorterStemmer()
ps_stemmed_tokens = [ps.stem(sample_token) for sample_token in input_tokens]

In [4]:
# Tabulate each original word next to its Porter stem.
stems_df = pd.DataFrame.from_dict(
    {'words': input_tokens, 'Porter Stemmer': ps_stemmed_tokens}
)

# Bare last expression so the notebook renders the table.
stems_df

Unnamed: 0,words,Porter Stemmer
0,overwhelming,overwhelm
1,overwhelmingly,overwhelmingli
2,hushed,hush
3,hush,hush
4,functional,function
5,functionally,function
6,lying,lie
7,lied,lie
8,fairly,fairli
9,destabilize,destabil


### LancasterStemmer
* The LancasterStemmer (Paice-Husk stemmer) is an iterative algorithm with rules saved externally.
* LancasterStemmer is simple, but because it applies its rules iteratively it stems aggressively, so over-stemming may occur.
* Over-stemming produces stems that are not linguistic roots, or that have no meaning on their own.

In [6]:
# Apply the Lancaster stemmer to the same sample tokens, preserving their
# order so the results line up with `input_tokens`.
ls = LancasterStemmer()
ls_stemmed_tokens = list(map(ls.stem, input_tokens))

In [7]:
# Start from the original words, then attach the Lancaster stems as a second
# column. The column name contains a space, hence the ** dict form.
stems_df = pd.DataFrame({'words': input_tokens}).assign(
    **{'Lancaster Stemmer': ls_stemmed_tokens}
)

# Bare last expression so the notebook renders the table.
stems_df

Unnamed: 0,words,Lancaster Stemmer
0,overwhelming,overwhelm
1,overwhelmingly,overwhelm
2,hushed,hush
3,hush,hush
4,functional,funct
5,functionally,funct
6,lying,lying
7,lied,lied
8,fairly,fair
9,destabilize,dest


In [8]:
# Side-by-side comparison: original word, Porter stem, Lancaster stem.
stems_df = pd.DataFrame(
    dict(
        [
            ('words', input_tokens),
            ('Porter Stemmer', ps_stemmed_tokens),
            ('Lancaster Stemmer', ls_stemmed_tokens),
        ]
    )
)

# Bare last expression so the notebook renders the table.
stems_df

Unnamed: 0,words,Porter Stemmer,Lancaster Stemmer
0,overwhelming,overwhelm,overwhelm
1,overwhelmingly,overwhelmingli,overwhelm
2,hushed,hush,hush
3,hush,hush,hush
4,functional,function,funct
5,functionally,function,funct
6,lying,lie,lying
7,lied,lie,lied
8,fairly,fairli,fair
9,destabilize,destabil,dest


### SnowballStemmer
* Snowball lets one define a custom set of stemming rules for any language. That is why NLTK provides SnowballStemmer, which can be used to create stemmers for non-English languages as well as English.

In [9]:
# Show the language names exposed by SnowballStemmer's `languages` attribute;
# 'english' from this tuple is the one passed to the constructor in the next cell.
print(SnowballStemmer.languages)

('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [12]:
# Stem the sample tokens with the English Snowball stemmer; output order
# matches `input_tokens`.
ss = SnowballStemmer('english')
ss_stemmed_tokens = [ss.stem(sample_token) for sample_token in input_tokens]

In [13]:
# Pair each column label with its values, then build the table of words and
# their Snowball stems.
stems_df = pd.DataFrame(
    dict(zip(['words', 'Snowball Stemmer'], [input_tokens, ss_stemmed_tokens]))
)

# Bare last expression so the notebook renders the table.
stems_df

Unnamed: 0,words,Snowball Stemmer
0,overwhelming,overwhelm
1,overwhelmingly,overwhelm
2,hushed,hush
3,hush,hush
4,functional,function
5,functionally,function
6,lying,lie
7,lied,lie
8,fairly,fair
9,destabilize,destabil


In [38]:
# Final comparison table: one row per word, one column per stemmer.
stems_df = pd.DataFrame.from_dict(
    {
        'words': input_tokens,
        'Porter Stemmer': ps_stemmed_tokens,
        'Lancaster Stemmer': ls_stemmed_tokens,
        'Snowball Stemmer': ss_stemmed_tokens,
    },
    orient='columns',
)

# Bare last expression so the notebook renders the table.
stems_df

Unnamed: 0,words,Porter Stemmer,Lancaster Stemmer,Snowball Stemmer
0,overwhelming,overwhelm,overwhelm,overwhelm
1,overwhelmingly,overwhelmingli,overwhelm,overwhelm
2,hushed,hush,hush,hush
3,hush,hush,hush,hush
4,functional,function,funct,function
5,functionally,function,funct,function
6,lying,lie,lying,lie
7,lied,lie,lied,lie
8,fairly,fairli,fair,fair
9,destabilize,destabil,dest,destabil


In [15]:
# Read the whole text file into memory as a single string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default codec.
# NOTE(review): assumes the file is UTF-8 (plain ASCII also decodes) — confirm.
with open('./datasets/RabindraNath.txt', 'r', encoding='utf-8') as f:
    file_contents = f.read()

# Print only the first 1000 characters as a sanity check; printing the full
# text floods the notebook output. `file_contents` still holds everything.
print(file_contents[:1000])

The Project Gutenberg EBook of The Hungry Stones And Other Stories, by 
Rabindranath Tagore

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org


Title: The Hungry Stones And Other Stories

Author: Rabindranath Tagore

Translator: The author and Mr. C. F. Andrews

Posting Date: December 22, 2008 [EBook #2518]
Release Date: February, 2001

Language: English


*** START OF THIS PROJECT GUTENBERG EBOOK THE HUNGRY STONES ***




Produced by Alev Akman





THE HUNGRY STONES AND OTHER STORIES

By Rabindranath Tagore




Contents:

     The Hungry Stones
     The Victory
     Once There Was A King
     The Home-coming
     My Lord, The Baby
     The Kingdom Of Cards
     The Devotee
     Vision
     The Babus Of Nayanjore
     Living Or Dead?
     "We Crown Thee King"
     The Renunciation
     Th

In [16]:
# Split the raw text into a flat list of tokens with NLTK's word_tokenize.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# downloaded beforehand — confirm for the target environment.
word_tokens = word_tokenize(file_contents)

In [17]:
# Rebuild the English Snowball stemmer with ignore_stopwords=True and stem
# every token of the text, keeping the original token order.
# NOTE(review): ignore_stopwords=True presumably leaves stopwords unstemmed
# and relies on NLTK's stopwords corpus being available — confirm.
ss = SnowballStemmer('english', ignore_stopwords=True)
ss_stemmed_words = list(map(ss.stem, word_tokens))

In [18]:
# Preview the stemmed text by joining only the first 200 stems; rendering the
# entire stemmed text as one string overwhelms the notebook output.
# `ss_stemmed_words` itself is left untouched and still holds every stem.
" ".join(ss_stemmed_words[:200])

"the project gutenberg ebook of the hungri stone and other stori , by rabindranath tagor this ebook is for the use of anyon anywher at no cost and with almost no restrict whatsoev . you may copi it , give it away or re-us it under the term of the project gutenberg licens includ with this ebook or onlin at www.gutenberg.org titl : the hungri stone and other stori author : rabindranath tagor translat : the author and mr. c. f. andrew post date : decemb 22 , 2008 [ ebook # 2518 ] releas date : februari , 2001 languag : english *** start of this project gutenberg ebook the hungri stone *** produc by alev akman the hungri stone and other stori by rabindranath tagor content : the hungri stone the victori once there was a king the home-com my lord , the babi the kingdom of card the devote vision the babus of nayanjor live or dead ? `` we crown thee king '' the renunci the cabuliwallah [ the fruitsel from cabul ] prefac : the stori contain in this volum were translat by sever hand . the versio