In [1]:
import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer 
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

In [2]:
# Install the nltk component for several tasks
nltk.download('punkt')     
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
sentence= "The quick brown fox jumps over the lazy dog"

#**Tokenize**

In [4]:
tokens = word_tokenize(sentence)

print (tokens)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


# **POS Tagging**

In [5]:
nltk.pos_tag(tokens)

[('The', 'DT'),
 ('quick', 'JJ'),
 ('brown', 'NN'),
 ('fox', 'NN'),
 ('jumps', 'VBZ'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN')]

# **Stop Words Removal**

In [6]:
stop_words = set(stopwords.words('english'))

print (stop_words)

{'under', 'does', 'itself', 's', 'his', 'but', 'ourselves', 'through', 'to', "should've", "wasn't", 'y', 'were', 'then', 'haven', 'why', 'hadn', "aren't", 'is', 'our', 'll', 'ain', 'few', 'some', 'mightn', 'which', 're', 'for', 'nor', 'those', 'can', 'the', 'against', 'ma', 'aren', 'between', 'not', 'them', 'are', "weren't", 'has', 'she', 'into', 'on', 'up', "mightn't", 'am', 'should', "you're", 'same', 'where', 'over', "haven't", 'was', 'than', "mustn't", 'having', 'had', 've', 'by', 'yourselves', 'with', 'me', 'before', 'once', 'yours', "it's", 'because', 'we', 'more', 'both', "that'll", 'a', 'there', 'such', "don't", 'hasn', 'herself', 'if', 'these', "you'll", 'be', 'wouldn', 'after', 'no', 'needn', 'only', 'about', 'as', 'shouldn', "wouldn't", 't', 'very', "hadn't", 'it', 'while', 'most', 'weren', 'myself', 'all', 'or', 'now', 'shan', 'any', 'he', "won't", 'have', 'they', 'at', 'other', "doesn't", 'wasn', 'him', 'further', "you've", 'how', 'of', 'that', 'their', 'themselves', 'do',

In [7]:
tokens = [w for w in tokens if not w in stop_words]

print(tokens)

['The', 'quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']


# **Stemming**

In [8]:
porter = PorterStemmer()
stems = []
for t in tokens:    
    stems.append(porter.stem(t))
print(stems)

['the', 'quick', 'brown', 'fox', 'jump', 'lazi', 'dog']


# **Lemmatizing**

In [9]:
lemmatizer = WordNetLemmatizer()
lemmas=[]
for t in tokens:
  lemmas.append(lemmatizer.lemmatize(t))
print (lemmas)

['The', 'quick', 'brown', 'fox', 'jump', 'lazy', 'dog']
