##### Install NLTK

In [1]:
pip install nltk

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


##### Install modules inside the nltk

In [4]:
import nltk

In [6]:
nltk.download('punkt')  # Tokenization
nltk.download('stopwords') # Stopwords Removal
nltk.download('averaged_perceptron_tagger') # POS Tagging
nltk.download('omw-1.4')  # Supporting
nltk.download('wordnet')  # Lemmatization

[nltk_data] Downloading package punkt to /home/mitu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mitu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mitu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /home/mitu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/mitu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

##### Tokenization

In [15]:
sent = 'They told that their ages are 23, 26 and 31 respectively.'

In [16]:
# Find the average of ages mentioned in the sentence.

In [17]:
sent.split()

['They',
 'told',
 'that',
 'their',
 'ages',
 'are',
 '23,',
 '26',
 'and',
 '31',
 'respectively.']

In [18]:
ages = []
for word in sent.split():
    if word.isdigit():
        ages.append(int(word))

In [19]:
ages

[26, 31]

In [20]:
sum(ages) / len(ages)

28.5

In [21]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [22]:
sent = 'Hello friends! How are you? Welcome to the world of Python Programming.'

In [23]:
sent_tokenize(sent)

['Hello friends!',
 'How are you?',
 'Welcome to the world of Python Programming.']

In [24]:
tokens = word_tokenize(sent)

In [25]:
tokens

['Hello',
 'friends',
 '!',
 'How',
 'are',
 'you',
 '?',
 'Welcome',
 'to',
 'the',
 'world',
 'of',
 'Python',
 'Programming',
 '.']

##### Stemming

In [27]:
from nltk.stem import PorterStemmer

In [28]:
ps = PorterStemmer()

In [36]:
word = 'players'
print(ps.stem(word))

player


In [37]:
from nltk.stem import LancasterStemmer

In [38]:
ls = LancasterStemmer()

In [39]:
word = 'players'
print(ls.stem(word))

play


##### Lemmatization

In [41]:
from nltk.stem import WordNetLemmatizer

In [42]:
wnl = WordNetLemmatizer()

In [51]:
word = 'better'
print(wnl.lemmatize(word, pos = 'v'))  # Verb
print(wnl.lemmatize(word, pos = 'n'))  # Noun
print(wnl.lemmatize(word, pos = 'a'))  # Adjective
print(wnl.lemmatize(word, pos = 'r'))  # Adverb

better
better
good
well


##### Punctuation Removal

In [53]:
words = word_tokenize(sent)

In [54]:
words

['Hello',
 'friends',
 '!',
 'How',
 'are',
 'you',
 '?',
 'Welcome',
 'to',
 'the',
 'world',
 'of',
 'Python',
 'Programming',
 '.']

In [55]:
clean_words = []
for word in words:
    if word.isalpha():
        clean_words.append(word)

In [56]:
clean_words

['Hello',
 'friends',
 'How',
 'are',
 'you',
 'Welcome',
 'to',
 'the',
 'world',
 'of',
 'Python',
 'Programming']

##### Stopwords removal

In [58]:
from nltk.corpus import stopwords

In [59]:
swords = stopwords.words('english')

In [60]:
swords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [61]:
clean_words = []
for word in words:
    if word not in swords:
        clean_words.append(word)

In [62]:
clean_words

['Hello',
 'friends',
 '!',
 'How',
 '?',
 'Welcome',
 'world',
 'Python',
 'Programming',
 '.']

In [63]:
clean_words = [word for word in words if word not in swords]

In [64]:
clean_words

['Hello',
 'friends',
 '!',
 'How',
 '?',
 'Welcome',
 'world',
 'Python',
 'Programming',
 '.']