In [1]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [7]:
# Fetch the NLTK data packages this notebook relies on: 'punkt_tab'
# (unzipped under tokenizers/, see the log below), 'stopwords' and 'wordnet'.
# Packages that are already installed are only reported as up to date.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so it was re-run after later cells -- confirm the notebook
# still passes Restart Kernel -> Run All.
nltk.download('punkt_tab')
nltk.download('stopwords')
# The return value of this last call is what the cell echoes as its output.
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Sample sentence for the step-by-step walkthrough; it deliberately mixes
# capital letters, parentheses, a period, an apostrophe and an exclamation
# mark so each cleaning step has something visible to do.
text = "Natural Language Processing (NLP) helps computers understand human language. It's awesome!"
print("🔹 Original Text:\n", text)

🔹 Original Text:
 Natural Language Processing (NLP) helps computers understand human language. It's awesome!


In [4]:
# Normalise case into a new variable; `text` itself is left untouched so the
# cell can be re-run safely.
text_lower = text.lower()

In [5]:
# Delete every character listed in string.punctuation from the lower-cased
# text using a deletion table, then show the result.
text_clean = text_lower.translate(str.maketrans("", "", string.punctuation))
print("✅ Cleaned Text:\n", text_clean)

✅ Cleaned Text:
 natural language processing nlp helps computers understand human language its awesome


In [8]:
# Split the punctuation-free string into a list of word tokens.
# NOTE(review): word_tokenize presumably needs the 'punkt_tab' data fetched
# by the download cell -- confirm that cell runs first on a fresh kernel.
tokens = word_tokenize(text_clean)
print("🧩 Tokens:\n", tokens)

🧩 Tokens:
 ['natural', 'language', 'processing', 'nlp', 'helps', 'computers', 'understand', 'human', 'language', 'its', 'awesome']


In [9]:
# English stop words held in a set so each membership check is a hash lookup.
stop_words = set(stopwords.words('english'))
# Keep, in order, only the tokens that are absent from the stop-word set.
filtered_tokens = list(filter(lambda tok: tok not in stop_words, tokens))
print("🚫 After Stopword Removal:\n", filtered_tokens)

🚫 After Stopword Removal:
 ['natural', 'language', 'processing', 'nlp', 'helps', 'computers', 'understand', 'human', 'language', 'awesome']


In [10]:
# Reduce each remaining token to its Porter stem; the stems need not be
# dictionary words (see the output below).
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print("🌱 After Stemming:\n", stemmed_tokens)

🌱 After Stemming:
 ['natur', 'languag', 'process', 'nlp', 'help', 'comput', 'understand', 'human', 'languag', 'awesom']


In [11]:
# Lemmatise the same filtered tokens for comparison with the stemming cell.
# No part-of-speech tag is passed, so the lemmatizer's default is used.
lemmatizer = WordNetLemmatizer()
lemm_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
print("🧠 After Lemmatization:\n", lemm_tokens)

🧠 After Lemmatization:
 ['natural', 'language', 'processing', 'nlp', 'help', 'computer', 'understand', 'human', 'language', 'awesome']


In [12]:
def preprocess_text(text):
    """Turn raw text into a list of lemmatised, stop-word-free tokens.

    The steps run in this order: lower-case the text, delete every character
    in ``string.punctuation``, tokenise with ``word_tokenize``, drop English
    stop words, and lemmatise what remains with ``WordNetLemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The processed tokens as a list of strings, in their original order.
    """
    # Lower-case, then remove all punctuation characters in a single pass.
    cleaned = text.lower().translate(str.maketrans("", "", string.punctuation))

    # Load the stop-word list once and keep it as a set. Evaluating
    # stopwords.words('english') inside the filter condition would rebuild
    # and linearly scan the whole list for every single token.
    stop_words = set(stopwords.words('english'))

    # NOTE(review): punctuation is stripped before stop-word filtering, so a
    # contraction such as "don't" reaches the filter as "dont" and may no
    # longer match the stop-word list -- confirm this is intended.
    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]


In [13]:
# Run the full pipeline function on a fresh sentence as an end-to-end check.
sample = "NLTK is a powerful Python library for text processing!"
print("✅ Final Output:", preprocess_text(sample))

✅ Final Output: ['nltk', 'powerful', 'python', 'library', 'text', 'processing']
