In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string

# First-time setup (download resources)
# Each entry is fetched into the local nltk_data directory; packages that are
# already present are reported as up-to-date and skipped.
# 'punkt_tab' is requested in addition to 'punkt': recent NLTK releases load
# the Punkt tokenizer models from 'punkt_tab', so with only 'punkt' installed
# word_tokenize() raises LookupError on a fresh environment. 'punkt' is kept
# for older NLTK releases that still read the original package.
NLTK_RESOURCES = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Sample text used by every preprocessing step below
text = "The quick brown foxes were jumping over the lazy dogs, happily barking."



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nisharamprasath/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nisharamprasath/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nisharamprasath/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# 1. Tokenization
# Split the sample sentence into word and punctuation tokens.
# "english" is word_tokenize's default language; it is spelled out here only
# to make the choice visible.
tokens = word_tokenize(text, language="english")
print("Tokens:", tokens)



Tokens: ['The', 'quick', 'brown', 'foxes', 'were', 'jumping', 'over', 'the', 'lazy', 'dogs', ',', 'happily', 'barking', '.']


In [4]:

# 2. Lowercasing
# Normalize case so that later comparisons (e.g. against the lowercase
# stopword list) treat "The" and "the" as the same token.
tokens = list(map(str.lower, tokens))
print("Lowercased:", tokens)



Lowercased: ['the', 'quick', 'brown', 'foxes', 'were', 'jumping', 'over', 'the', 'lazy', 'dogs', ',', 'happily', 'barking', '.']


In [5]:


# 3. Removing Punctuation
# Drop every token made up solely of punctuation characters.
# The check is done per character: `word not in string.punctuation` would be
# a *substring* test against the punctuation string, which lets
# multi-character tokens such as '...', '--', '``' or "''" slip through.
punctuation_chars = set(string.punctuation)
tokens = [
    word
    for word in tokens
    # all() is True for an empty string, so empty tokens are dropped as well.
    if not all(char in punctuation_chars for char in word)
]
print("No Punctuation:", tokens)



No Punctuation: ['the', 'quick', 'brown', 'foxes', 'were', 'jumping', 'over', 'the', 'lazy', 'dogs', 'happily', 'barking']


In [6]:


# 4. Removing Stopwords
# Build the English stopword list as a set for constant-time lookups, then
# keep only the tokens that are not in it.
stop_words = set(stopwords.words('english'))
tokens = list(filter(lambda token: token not in stop_words, tokens))
print("No Stopwords:", tokens)



No Stopwords: ['quick', 'brown', 'foxes', 'jumping', 'lazy', 'dogs', 'happily', 'barking']


In [7]:

# 5a. Stemming
# Apply the Porter stemmer to each remaining token. The result is stored
# separately so `tokens` stays available for the lemmatization step.
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, tokens))
print("Stemmed:", stemmed)




Stemmed: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', 'happili', 'bark']


In [8]:


# 5b. Lemmatization
# Look up each token with the WordNet lemmatizer. No part-of-speech argument
# is passed, so the lemmatizer's default (noun) applies to every token; verb
# forms such as "jumping" therefore come back unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized = []
for token in tokens:
    lemmatized.append(lemmatizer.lemmatize(token))
print("Lemmatized:", lemmatized)


Lemmatized: ['quick', 'brown', 'fox', 'jumping', 'lazy', 'dog', 'happily', 'barking']
