- 1 | case folding 
- 2 | stop word removal
- 3 | stemming
- 4 | Lemmatization

In [4]:
# Load spaCy's small English pipeline and run it over a sample sentence;
# the resulting `doc` is reused by the spaCy cells that follow.
import spacy
nlp = spacy.load('en_core_web_sm')
# Note: there is no space after "Dr." in this sentence, yet the token list
# printed further down still shows 'Dr.' and 'Lovato' as separate tokens.
s = "He told Dr.Lovato that he was done with the tests and would post the results shortly."
doc = nlp(s)

# 1 | spacy

In [5]:
# Iterating a Doc yields its tokens, so list() materialises them directly.
print(list(doc))

[He, told, Dr., Lovato, that, he, was, done, with, the, tests, and, would, post, the, results, shortly, .]


##  1 | case folding

In [None]:
# Case folding: `lower_` is the lowercased string form of each token.
print([t.lower_ for t in doc])

In [7]:
# Views can be conditional: the token that starts a sentence is kept as-is
# (the Token object itself, which is why it prints without quotes), while
# every other token is replaced by its lowercased string.
print([t if t.is_sent_start else t.lower_ for t in doc])

[He, 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']


## 2 | stop word removal

In [8]:
# Inspect the stop-word collection that ships with spaCy's language
# defaults, followed by the number of entries it contains.
default_stop_words = nlp.Defaults.stop_words
print(default_stop_words)
print(len(default_stop_words))

{'whereas', 'nine', 'wherever', 'anything', 'quite', 're', 'beforehand', 'somehow', 'below', 'at', 'meanwhile', 'from', 'again', 'less', 'everyone', 'do', 'onto', 'somewhere', 'can', 'whom', 'or', 'top', '‘s', 'mostly', 'five', 'ourselves', 'used', 'nor', 'go', 'just', 'much', 'none', 'would', 'she', 'fifty', 'more', 'latterly', '’ll', 'down', 'beyond', 'who', 'bottom', 'yours', 'whoever', 'there', 'well', 'after', 'i', 'those', 'except', 'same', 'what', 'together', 'he', 'rather', 'the', '’d', 'therein', 'within', 'already', 'also', 'a', 'am', 'part', 'out', 'had', 'yourself', 'eight', 'herein', 'seem', 'was', 'our', 'fifteen', 'some', 'whenever', 'eleven', 'whether', 'everywhere', 'everything', 'whence', '’ve', 'never', 'often', 'empty', 'every', 'and', 'sixty', 'one', 'towards', 'besides', "'ll", 'around', 'others', 'call', 'own', 'each', 'in', 'whereupon', 'on', 'too', 'by', 'over', 'thereafter', 'alone', 'you', 'seems', 'hereafter', 'which', 'serious', 'should', 'how', 'forty', 's

In [9]:
# Stop-word removal: keep only the tokens whose `is_stop` flag is false.
print([t for t in doc if not t.is_stop])

[told, Dr., Lovato, tests, post, results, shortly, .]


In [24]:

# Using spaCy 2.0.11, you can update its stop-word set using one of the
# following. `nlp.Defaults.stop_words` is used here like a regular Python
# set, so the usual set operations apply.
# NOTE(review): on newer spaCy releases `Token.is_stop` may not reflect these
# edits for words already in the vocabulary; setting
# `nlp.vocab["word"].is_stop` is the per-word switch -- confirm against the
# installed version.

import spacy
nlp = spacy.load("en_core_web_sm")

# To add a single stopword:
nlp.Defaults.stop_words.add("my_new_stopword")
print(nlp.Defaults.stop_words)

# To add several stopwords at once:
nlp.Defaults.stop_words |= {"my_new_stopword1", "my_new_stopword2"}

# To remove a single stopword:
# discard() is used instead of remove(): remove() raises KeyError when the
# word is absent, which is likely to happen if this cell is run a second
# time in the same session (the removals below have already taken effect).
nlp.Defaults.stop_words.discard("whatever")

# To remove several stopwords at once:
nlp.Defaults.stop_words -= {"whatever", "whenever"}

# Note: To see the current set of stopwords, use:
print(nlp.Defaults.stop_words)


{'whereas', 'nine', 'wherever', 'anything', 'quite', 're', 'beforehand', 'somehow', 'below', 'at', 'meanwhile', 'from', 'again', 'less', 'everyone', 'do', 'onto', 'somewhere', 'can', 'whom', 'or', 'top', '‘s', 'mostly', 'five', 'ourselves', 'used', 'nor', 'go', 'just', 'much', 'none', 'would', 'she', 'fifty', 'more', 'latterly', '’ll', 'down', 'beyond', 'who', 'bottom', 'yours', 'whoever', 'there', 'well', 'after', 'i', 'those', 'except', 'same', 'what', 'together', 'he', 'rather', 'the', '’d', 'therein', 'within', 'already', 'also', 'a', 'am', 'part', 'out', 'had', 'yourself', 'eight', 'herein', 'seem', 'was', 'our', 'fifteen', 'some', 'whenever', 'eleven', 'whether', 'everywhere', 'everything', 'whence', '’ve', 'never', 'often', 'empty', 'every', 'and', 'sixty', 'one', 'towards', 'besides', "'ll", 'around', 'others', 'call', 'own', 'each', 'in', 'whereupon', 'on', 'too', 'by', 'over', 'thereafter', 'alone', 'you', 'seems', 'hereafter', 'which', 'serious', 'should', 'how', 'forty', 's

## 3 |  Lemmatization

In [10]:
# Lemmatization: pair each token's surface text with its lemma.
print([(t.text, t.lemma_) for t in doc])

[('He', 'he'), ('told', 'tell'), ('Dr.', 'Dr.'), ('Lovato', 'Lovato'), ('that', 'that'), ('he', 'he'), ('was', 'be'), ('done', 'do'), ('with', 'with'), ('the', 'the'), ('tests', 'test'), ('and', 'and'), ('would', 'would'), ('post', 'post'), ('the', 'the'), ('results', 'result'), ('shortly', 'shortly'), ('.', '.')]


In [19]:
#
# EXERCISE: Find out how to initialize the SnowballStemmer, then tokenize
# and stem the sentence below.
#
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

sen = 'He told Dr. Lovato that he was done with the tests and would post the results shortly.'

# Part 1: stem a whole sentence.
# NOTE: `s` is rebound here; it previously held the sentence string used in
# the spaCy cell at the top of the notebook.
s = SnowballStemmer(language='english')

# Split the sentence into tokens and show them before stemming.
token = word_tokenize(sen)
print(token)

# Stem every token and show the result.
print([s.stem(tok) for tok in token])

# Part 2: stem a hand-picked word list to compare related word forms.
snow_stemmer = SnowballStemmer(language='english')

words = [
    'cared', 'university', 'fairly', 'easily', 'singing',
    'sings', 'sung', 'singer', 'sportingly',
]

# One stem per input word, in the same order as `words`.
stem_words = [snow_stemmer.stem(entry) for entry in words]

# Print each word next to its stem.
for original, stemmed in zip(words, stem_words):
    print(f"{original} ----> {stemmed}")

['He', 'told', 'Dr.', 'Lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'tests', 'and', 'would', 'post', 'the', 'results', 'shortly', '.']
['he', 'told', 'dr.', 'lovato', 'that', 'he', 'was', 'done', 'with', 'the', 'test', 'and', 'would', 'post', 'the', 'result', 'short', '.']
cared ----> care
university ----> univers
fairly ----> fair
easily ----> easili
singing ----> sing
sings ----> sing
sung ----> sung
singer ----> singer
sportingly ----> sport


# 2 | nltk

In [31]:
import nltk
# Sample sentence for the NLTK walkthrough; it contains contractions, mixed
# case and irregular punctuation, and is tokenized in the stop-word cell.
word = "i don't wanna go out today. it's not good MR.amal!. i am bored outside"

## 1 | case folding

In [25]:
# Use Python's built-in string methods (str.lower() / str.upper()) for case folding.

## 2 | stop word removal

In [34]:
# nltk.download('stopwords') # download the stopwords from nltk
from nltk.corpus import stopwords
# Imported here as well so this cell does not depend on an earlier cell
# having been run first.
from nltk.tokenize import word_tokenize

# NLTK's English stop words, printed together with their count.
eng_stop_words = stopwords.words('english')
print(eng_stop_words)
print(len(eng_stop_words))


word_tokens = word_tokenize(word)
# The stop words are simply a list: add/remove your own entries on
# `eng_stop_words` before the lookup set below is built.
# A set gives constant-time membership tests instead of scanning the list
# for every token. Tokens are lowercased for the comparison only, because
# the entries printed above are lowercase and a capitalised stop word
# (e.g. "The") would otherwise pass through the filter. The original token
# text is what gets kept.
stop_word_lookup = set(eng_stop_words)
print("word -->", word)
print([w for w in word_tokens if w.lower() not in stop_word_lookup])



['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

## 3 | stemming

Porter stemmer: The most commonly used algorithm, chosen for its speed, low error rate, and simplicity. It is based on the observation that suffixes in the English language are built from smaller and simpler suffixes. It is limited to English words. Import as: from nltk.stem.porter import PorterStemmer.
rules: https://vijinimallawaarachchi.com/2017/05/09/porter-stemming-algorithm/


Snowball stemmer: This is a multilingual stemmer. Thus, it supports other languages. It is more aggressive than the Porter stemmer. Import as: from nltk.stem import SnowballStemmer.

Lancaster stemmer: More aggressive and dynamic than the other stemmers, but its output can be confusing when short words are involved. This stemmer is also less efficient. Import as: from nltk.stem.lancaster import LancasterStemmer.




Over-stemming (false positives, affects precision): two separate inflected words are stemmed to the same root, but should not have been.
Under-stemming (false negatives, affects recall): two separate inflected words should be stemmed to the same root, but are not.



In [35]:
from nltk.stem.porter import PorterStemmer

# Build one Porter stemmer and run every token from the stop-word cell
# (`word_tokens`) through it.
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, word_tokens))

# Show the tokens before and after stemming for comparison.
print(f"===Unstemmed tokens==== {word_tokens}")
print(f"===Stemmed tokens==== {stemmed_tokens}")


===Unstemmed tokens==== ['i', 'do', "n't", 'wan', 'na', 'go', 'out', 'today', '.', 'it', "'s", 'not', 'good', 'MR.amal', '!', '.', 'i', 'am', 'bored', 'outside']
===Stemmed tokens==== ['i', 'do', "n't", 'wan', 'na', 'go', 'out', 'today', '.', 'it', "'s", 'not', 'good', 'mr.amal', '!', '.', 'i', 'am', 'bore', 'outsid']


## 4 | lemmatization

In [38]:
from nltk.stem import WordNetLemmatizer

# One-time corpus downloads needed by the WordNet lemmatizer:
# nltk.download('omw-1.4')
# nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

# NOTE(review): lemmatize() receives only the token (no `pos` argument), and
# the printed result is identical to the input tokens -- presumably because
# NLTK treats every word as a noun by default. Confirm, and pass `pos` if
# verb forms such as 'am' / 'bored' should be reduced.
lemmatized_tokens = list(map(lemmatizer.lemmatize, word_tokens))
print(lemmatized_tokens)

[nltk_data] Downloading package omw-1.4 to /home/pavithra/nltk_data...
[nltk_data] Downloading package wordnet to /home/pavithra/nltk_data...


['i', 'do', "n't", 'wan', 'na', 'go', 'out', 'today', '.', 'it', "'s", 'not', 'good', 'MR.amal', '!', '.', 'i', 'am', 'bored', 'outside']


## 5 | word frequency matcher

In [2]:
from nltk import FreqDist
from nltk.tokenize import word_tokenize

# Tokenize a short sample and count how often each token occurs.
sample_tokens = word_tokenize("i am the bose")
freq_distribution = FreqDist(sample_tokens)

# Up to 10 most frequent tokens with their counts; left as the cell's last
# expression so the notebook displays it.
freq_distribution.most_common(10)

[('i', 1), ('am', 1), ('the', 1), ('bose', 1)]