In [None]:
# Install the NLTK library if not already installed
!pip install nltk

# Import necessary libraries
import nltk
import string
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download required NLTK resources (a no-op when they are already up to date)
nltk.download('punkt')       # tokenizer models used by older NLTK releases
nltk.download('punkt_tab')   # tokenizer tables that word_tokenize loads on NLTK 3.9+
nltk.download('stopwords')   # for stop word removal

# Sample simple sentence
text = "The dog is playing in the garden."

# 1. Tokenization – splits the sentence into individual word and punctuation tokens
tokens = word_tokenize(text)
print("1. Tokenized Words:\n", tokens)

# 2. Filtration – removes punctuation, numbers and any token that is not purely letters.
# str.isalpha() is Unicode-aware, so letters from non-English scripts still pass this step.
filtered_tokens = [word for word in tokens if word.isalpha()]
print("\n2. After Filtration (Only alphabetic words):\n", filtered_tokens)

# 3. Script Validation – keeps only words made entirely of the ASCII letters a-z / A-Z.
# The pattern is compiled once outside the loop; fullmatch() requires the whole token
# to match, so no ^/$ anchors are needed (and a trailing newline is never tolerated).
ascii_word_pattern = re.compile(r"[a-zA-Z]+")
english_words = [word for word in filtered_tokens if ascii_word_pattern.fullmatch(word)]
print("\n3. After Script Validation (English words only):\n", english_words)

# 4. Stop Word Removal – removes common words like "the", "is", "in", etc.
# Each word is lower-cased for the lookup so that capitalised forms such as "The"
# are removed too; a set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))
non_stopwords = [word for word in english_words if word.lower() not in stop_words]
print("\n4. After Stop Word Removal:\n", non_stopwords)

# 5. Stemming – reduces words to their root form (e.g., "playing" → "play")
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in non_stopwords]
print("\n5. After Stemming:\n", stemmed_words)


1. Tokenized Words:
 ['The', 'dog', 'is', 'playing', 'in', 'the', 'garden', '.']

2. After Filtration (Only alphabetic words):
 ['The', 'dog', 'is', 'playing', 'in', 'the', 'garden']

3. After Script Validation (English words only):
 ['The', 'dog', 'is', 'playing', 'in', 'the', 'garden']

4. After Stop Word Removal:
 ['dog', 'playing', 'garden']

5. After Stemming:
 ['dog', 'play', 'garden']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
