<a href="https://colab.research.google.com/github/naveenk5199/nlp_practice/blob/main/removing_stopwords_text_normalisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk

In [None]:
# Fetch the NLTK 'stopwords' corpus so stopwords.words(...) can be used in the cells below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Show NLTK's English stop-word list as a set (the entries displayed are all lower-case).
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [None]:
# Sample passage used by every stop-word demo below.
# The misspellings (e.g. "monastry", "rihgts", "becuase") are part of the data —
# presumably kept for a later text-normalisation step; verify before correcting them.
text = """He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were."""

In [None]:
# Build the NLTK English stop-word set once, for fast membership tests
stop_words = set(stopwords.words('english'))

In [None]:
# Fetch the Punkt tokenizer data; word_tokenize is called on the sample text below.
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Split the sample text into word and punctuation tokens with NLTK
# (56 tokens for this passage; "wood-cuting" stays a single token).
word_tokens = word_tokenize(text)

In [None]:
# Keep only the tokens that are not NLTK stop words.
# The stop-word list is lower-case, so each token is lower-cased for the
# membership test; a plain `token not in stop_words` lets capitalised stop
# words such as the sentence-initial "He" slip through.
# The kept tokens retain their original casing.
filtered_word_tokens = [
    token for token in word_tokens if token.lower() not in stop_words
]

In [None]:
# Re-join the NLTK tokens for display and report how many there are
joined_tokens = ' '.join(word_tokens)
token_count = len(word_tokens)
print('Original sentence: :', joined_tokens)
print('Length of tokens: ', token_count)

Original sentence: : He determined to drop his litigation with the monastry , and relinguish his claims to the wood-cuting and fishery rihgts at once . He was the more ready to do this becuase the rights had become much less valuable , and he had indeed the vaguest idea where the wood and river in question were .
Length of tokens:  56


In [None]:
# Re-join the tokens that survived stop-word removal and report their count
filtered_joined = ' '.join(filtered_word_tokens)
filtered_count = len(filtered_word_tokens)
print('Filtered sentence: ', filtered_joined)
print('Length of filtered tokens: ', filtered_count)

Filtered sentence:  He determined drop litigation monastry , relinguish claims wood-cuting fishery rihgts . He ready becuase rights become much less valuable , indeed vaguest idea wood river question .
Length of filtered tokens:  28


# **Removing Stopwords using spaCy**

In [None]:
from spacy.lang.en import English

In [None]:
# Create a blank English pipeline; only tokenisation is needed for this demo.
# NOTE(review): English() does not load a tagger, parser, NER or word vectors —
# those come from a trained package via spacy.load(...); confirm if they are needed.
nlp = English()

In [None]:
# Run the pipeline on the sample text; the result is iterated token by token below
nlp_doc = nlp(text)

In [None]:
# Collect the surface text of every token in the spaCy document
token_list = [tok.text for tok in nlp_doc]

print('Original sentence: ', ' '.join(token_list))
print('Length of tokens: ', len(token_list))



Original sentence:  He determined to drop his litigation with the monastry , and relinguish his claims to the wood - cuting and 
 fishery rihgts at once . He was the more ready to do this becuase the rights had become much less valuable , and he had 
 indeed the vaguest idea where the wood and river in question were .
Length of tokens:  60


In [None]:
# import list of stop words
from spacy.lang.en.stop_words import STOP_WORDS


In [None]:
# Confirm STOP_WORDS is a plain Python set, so `in` membership tests work directly
print(type(STOP_WORDS))

<class 'set'>


In [None]:
# Create list of filtered word tokens by removing spaCy stop words.
# The token is lower-cased for the membership test: comparing the raw token
# against STOP_WORDS leaves capitalised stop words such as the
# sentence-initial "He" in the result. Kept tokens retain their original casing.
# Note: token_list also contains the newline tokens spaCy emits for the line
# breaks in `text`; they are not stop words, so they are kept and counted.
filtered_word_tokens_spacy = [
    token for token in token_list if token.lower() not in STOP_WORDS
]

print('Filtered text from nltk: ', ' '.join(filtered_word_tokens))
print('Filered text from spaCy: ', ' '.join(filtered_word_tokens_spacy))
print('Length of filtered text from nltk: ', len(filtered_word_tokens))
print('Length of filtered text from spaCy: ', len(filtered_word_tokens_spacy))

Filtered text from nltk:  He determined drop litigation monastry , relinguish claims wood-cuting fishery rihgts . He ready becuase rights become much less valuable , indeed vaguest idea wood river question .
Filered text from spaCy:  He determined drop litigation monastry , relinguish claims wood - cuting 
 fishery rihgts . He ready becuase rights valuable , 
 vaguest idea wood river question .
Length of filtered text from nltk:  28
Length of filtered text from spaCy:  28


# **Stopwords removal using Gensim**

In [None]:
# import Gensim packages
from gensim.parsing.preprocessing import remove_stopwords

In [None]:
# Load stopwords: no separate step needed — remove_stopwords is called below without a stop-word list

In [None]:
# Tokenise text: no separate step needed — remove_stopwords is given the raw string below

In [None]:
# Remove stop words straight from the raw string — no separate tokenising step.
# As the recorded output shows, punctuation stays attached to words, so "once." and
# "were." are kept, the capitalised "He" is not treated as a stop word, and the
# result comes back as a single line.
filtered_text = remove_stopwords(text)

In [None]:
# Side-by-side view of the original text and the three filtered versions
comparison_rows = [
  ('Original text: ', text),
  ('Filtered text from NLTK: ', ' '.join(filtered_word_tokens)),
  ('Filered text from spaCy: ', ' '.join(filtered_word_tokens_spacy)),
  ('Filtered text from Gensim:', filtered_text),
]
for label, value in comparison_rows:
  print(label, value)

Original text:  He determined to drop his litigation with the monastry, and relinguish his claims to the wood-cuting and 
fishery rihgts at once. He was the more ready to do this becuase the rights had become much less valuable, and he had 
indeed the vaguest idea where the wood and river in question were.
Filtered text from NLTK:  He determined drop litigation monastry , relinguish claims wood-cuting fishery rihgts . He ready becuase rights become much less valuable , indeed vaguest idea wood river question .
Filered text from spaCy:  He determined drop litigation monastry , relinguish claims wood - cuting 
 fishery rihgts . He ready becuase rights valuable , 
 vaguest idea wood river question .
Filtered text from Gensim: He determined drop litigation monastry, relinguish claims wood-cuting fishery rihgts once. He ready becuase rights valuable, vaguest idea wood river question were.
