In [1]:
# Q1: Text Preprocessing
# Importing required libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import string

# Download required nltk packages (only run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MSI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [25]:
# Sample raw text (with uppercase, punctuation, numbers, stopwords)
raw_text = "I AM Loving the NLP class, but sometimes it feels Confusing!!! In 2025, it's still fun."
print("Raw Text:\n", raw_text)


Raw Text:
 I AM Loving the NLP class, but sometimes it feels Confusing!!! In 2025, it's still fun.


In [5]:
# Step 1: Lowercasing
text_lower = raw_text.lower()
print("After Lowercasing:\n", text_lower)

After Lowercasing:
 i am loving the nlp class, but sometimes it feels confusing!!! in 2025, it's still fun.


In [7]:
# Step 2: Removing punctuation and numbers
text_clean = "".join([char for char in text_lower if char.isalpha() or char.isspace()])
print("After Removing Punctuation & Numbers:\n", text_clean)

After Removing Punctuation & Numbers:
 i am loving the nlp class but sometimes it feels confusing in  its still fun


In [9]:
# Step 3: Tokenization (splitting into words)
tokens = word_tokenize(text_clean)
print("After Tokenization:\n", tokens)

After Tokenization:
 ['i', 'am', 'loving', 'the', 'nlp', 'class', 'but', 'sometimes', 'it', 'feels', 'confusing', 'in', 'its', 'still', 'fun']


In [11]:
# Step 4: Stopword removal
stop_words = set(stopwords.words("english"))
tokens_no_stop = [word for word in tokens if word not in stop_words]
print("After Stopword Removal:\n", tokens_no_stop)

After Stopword Removal:
 ['loving', 'nlp', 'class', 'sometimes', 'feels', 'confusing', 'still', 'fun']


In [13]:
# Step 5: Stemming
ps = PorterStemmer()
stemmed = [ps.stem(word) for word in tokens_no_stop]
print("After Stemming:\n", stemmed)

After Stemming:
 ['love', 'nlp', 'class', 'sometim', 'feel', 'confus', 'still', 'fun']


In [15]:
# Step 6: Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(word) for word in tokens_no_stop]
print("After Lemmatization:\n", lemmatized)

After Lemmatization:
 ['loving', 'nlp', 'class', 'sometimes', 'feel', 'confusing', 'still', 'fun']


In [17]:
# Step 7: POS tagging (Part-of-Speech)
pos_tags = pos_tag(tokens_no_stop)
print("POS Tagging:\n", pos_tags)

POS Tagging:
 [('loving', 'VBG'), ('nlp', 'JJ'), ('class', 'NN'), ('sometimes', 'RB'), ('feels', 'VBZ'), ('confusing', 'VBG'), ('still', 'RB'), ('fun', 'VB')]


In [19]:
# Q2: Bag of Words (BOW)
from sklearn.feature_extraction.text import CountVectorizer

# Given corpus
corpus = [
    "I am loving the NLP class, but sometimes it feels confusing!!!",
    "NLP is a fascinating field — it deals with text, speech, and language understanding."
]
print("Original Corpus:\n", corpus)

Original Corpus:
 ['I am loving the NLP class, but sometimes it feels confusing!!!', 'NLP is a fascinating field — it deals with text, speech, and language understanding.']


In [21]:
# Step 1: Create CountVectorizer
vectorizer = CountVectorizer()

# Step 2: Fit and transform corpus
X = vectorizer.fit_transform(corpus)

# Step 3: Get vocabulary
vocab = vectorizer.get_feature_names_out()
print("Vocabulary:\n", vocab)

# Step 4: Bag of Words Representation
bow = X.toarray()
print("Bag of Words (BOW):\n", bow)

Vocabulary:
 ['am' 'and' 'but' 'class' 'confusing' 'deals' 'fascinating' 'feels'
 'field' 'is' 'it' 'language' 'loving' 'nlp' 'sometimes' 'speech' 'text'
 'the' 'understanding' 'with']
Bag of Words (BOW):
 [[1 0 1 1 1 0 0 1 0 0 1 0 1 1 1 0 0 1 0 0]
 [0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 0 1 1]]
