In [2]:
# ------------------------------------------------------------
# IMPORT REQUIRED LIBRARIES
# ------------------------------------------------------------

import nltk                                      # Main NLP library
from nltk.tokenize import word_tokenize, sent_tokenize   # For word & sentence splitting
from nltk.corpus import stopwords               # English stopwords
from nltk.stem import PorterStemmer             # For stemming
from nltk.stem import WordNetLemmatizer         # For lemmatization
from sklearn.feature_extraction.text import CountVectorizer  # Bag-of-Words
import re                                       # For cleaning text
import pandas as pd                             # For DataFrame display


# ------------------------------------------------------------
# DOWNLOAD REQUIRED NLTK RESOURCES
# ------------------------------------------------------------

nltk.download('punkt')          # Tokenizer models
nltk.download('stopwords')      # Stopword list
nltk.download('wordnet')        # Dictionary for Lemmatizer
nltk.download('punkt_tab') 

# ------------------------------------------------------------
# SAMPLE PARAGRAPH FOR NLP PROCESSING
# ------------------------------------------------------------

paragraph = """'this is IT 7 sem H div',
       'students of IT sem 7 are good',
       'sem system is good foe education',
       'Education is good for human',
        'we want to carry semantic meaning',
       'understand the meaning of words',
       'It is a department'"""


# ------------------------------------------------------------
# 1️⃣ SENTENCE TOKENIZATION
# ------------------------------------------------------------

sentences = sent_tokenize(paragraph)   # Split paragraph into individual sentences
print("\nSentence Tokenization Output:\n", sentences)


# ------------------------------------------------------------
# 2️⃣ WORD TOKENIZATION
# ------------------------------------------------------------

words = word_tokenize(paragraph)       # Split text into individual words/punctuation
print("\nWord Tokenization Output:\n", words)
print("\nTotal Words =", len(words))


# ------------------------------------------------------------
# 3️⃣ PRINTING WORDS SPLIT BY SENTENCES (Manual split)
# ------------------------------------------------------------

print("\nWords Printed Sentence by Sentence:\n")
for sentence in sentences:           
    for word in sentence.split(" "):   # Split sentence by space
        print(word)


# ------------------------------------------------------------
# 4️⃣ STEMMING EXAMPLE
# ------------------------------------------------------------

stemmer = PorterStemmer()             # Initialize stemmer

filtered_sentences = []               # To store processed sentences

for i in range(len(sentences)):
    words_in_sentence = word_tokenize(sentences[i])   # Tokenize each sentence
    # Remove stopwords + apply stemming
    processed_words = [
        stemmer.stem(word)            # Apply stemming
        for word in words_in_sentence
        if word.lower() not in stopwords.words("english")  # Remove stopwords
    ]
    filtered_sentences.append(' '.join(processed_words))  # Join back into text

print("\nStemmed Sentences:\n", filtered_sentences)


# ------------------------------------------------------------
# 5️⃣ LEMMATIZATION EXAMPLE
# ------------------------------------------------------------

lemmatizer = WordNetLemmatizer()      # Initialize lemmatizer

lemmatized_sentences = []             # To store processed sentences

for i in range(len(sentences)):
    words_in_sentence = word_tokenize(sentences[i])
    # Remove stopwords + apply lemmatization
    processed_words = [
        lemmatizer.lemmatize(word)    # Apply lemmatization
        for word in words_in_sentence
        if word.lower() not in stopwords.words("english")
    ]
    lemmatized_sentences.append(' '.join(processed_words))

print("\nLemmatized Sentences:\n", lemmatized_sentences)


# ------------------------------------------------------------
# 6️⃣ STEMMING VS LEMMATIZATION EXAMPLE ON CUSTOM TEXT
# ------------------------------------------------------------

text = "I have three visions for India and indian people and peoples."
word_tokens = word_tokenize(text)

# Stemming
stemmed_output = [
    stemmer.stem(word)
    for word in word_tokens
    if word.lower() not in stopwords.words('english')
]

# Lemmatization
lemmatized_output = [
    lemmatizer.lemmatize(word)
    for word in word_tokens
    if word.lower() not in stopwords.words('english')
]

print("\nStemming Output:\n", stemmed_output)
print("\nLemmatization Output:\n", lemmatized_output)


# ------------------------------------------------------------
# 7️⃣ CLEAN THE PARAGRAPH FOR BAG-OF-WORDS (STEMMING VERSION)
# ------------------------------------------------------------

sentences = sent_tokenize(paragraph)
corpus_stem = []                      # Store cleaned & stemmed sentences

for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])  # Remove non-letters
    review = review.lower()                           # Convert to lowercase
    review = review.split()                           # Split into words
    # Remove stopwords + apply stemming
    review = [stemmer.stem(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)                         # Join words back
    corpus_stem.append(review)

print("\nStemmed Corpus:\n", corpus_stem)


# ------------------------------------------------------------
# 8️⃣ CLEAN THE PARAGRAPH FOR BAG-OF-WORDS (LEMMATIZATION VERSION)
# ------------------------------------------------------------

corpus_lemma = []

for i in range(len(sentences)):
    review = re.sub('[^a-zA-Z]', ' ', sentences[i])
    review = review.lower()
    review = review.split()
    # Remove stopwords + apply lemmatization
    review = [lemmatizer.lemmatize(word) for word in review if word not in stopwords.words('english')]
    review = ' '.join(review)
    corpus_lemma.append(review)

print("\nLemmatized Corpus:\n", corpus_lemma)


# ------------------------------------------------------------
# 9️⃣ BAG OF WORDS USING CountVectorizer
# ------------------------------------------------------------

cv = CountVectorizer()                          # Create BOW model
X = cv.fit_transform(corpus_lemma).toarray()    # Convert text → vectors

df_bow = pd.DataFrame(X, columns=cv.get_feature_names_out())   # Create DataFrame
print("\nBag of Words Matrix:\n")
print(df_bow)


[nltk_data] Downloading package punkt to /Users/om/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/om/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/om/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/om/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Sentence Tokenization Output:
 ["'this is IT 7 sem H div',\n       'students of IT sem 7 are good',\n       'sem system is good foe education',\n       'Education is good for human',\n        'we want to carry semantic meaning',\n       'understand the meaning of words',\n       'It is a department'"]

Word Tokenization Output:
 ["'this", 'is', 'IT', '7', 'sem', 'H', 'div', "'", ',', "'students", 'of', 'IT', 'sem', '7', 'are', 'good', "'", ',', "'sem", 'system', 'is', 'good', 'foe', 'education', "'", ',', "'Education", 'is', 'good', 'for', 'human', "'", ',', "'we", 'want', 'to', 'carry', 'semantic', 'meaning', "'", ',', "'understand", 'the', 'meaning', 'of', 'words', "'", ',', "'It", 'is', 'a', 'department', "'"]

Total Words = 53

Words Printed Sentence by Sentence:

'this
is
IT
7
sem
H
div',







'students
of
IT
sem
7
are
good',







'sem
system
is
good
foe
education',







'Education
is
good
for
human',








'we
want
to
carry
semantic
meaning',







'understand
the
meani