In [6]:
# Install required libraries if not installed:
# pip install nltk scikit-learn pandas

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [23]:
# Download the NLTK resources this notebook relies on.
# Recent NLTK releases name the tokenizer and tagger data 'punkt_tab' and
# 'averaged_perceptron_tagger_eng'; older releases used 'punkt' and
# 'averaged_perceptron_tagger' instead.
NLTK_RESOURCES = (
    'punkt_tab',                       # tokenizer data used by word_tokenize
    'averaged_perceptron_tagger_eng',  # tagger data used by nltk.pos_tag
    'stopwords',                       # stop-word lists read via stopwords.words
    'wordnet',                         # dictionary used by WordNetLemmatizer
)

# One loop instead of four copy-pasted calls; the progress log is unchanged.
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# Input sentence that the preprocessing steps operate on
text = 'The quick brown fox jumps over the lazy dog.'

In [16]:
# Step 1: Tokenization -- split the sentence into word and punctuation tokens
tokens = word_tokenize(text)
print("\nTokens:", tokens, sep="\n")



Tokens:
['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [17]:
# Step 2: POS Tagging -- pair every token with its part-of-speech tag
pos_tags = nltk.pos_tag(tokens)
print(f"\nPOS Tags:\n{pos_tags}")


POS Tags:
[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


In [18]:
# Step 3: Stop Words Removal
# Tokens are lower-cased before the lookup so that "The" is matched as well as "the".
stop_words = set(stopwords.words('english'))

filtered_tokens = []
for token in tokens:
    if token.lower() in stop_words:
        continue  # drop stop words, keep everything else in original order
    filtered_tokens.append(token)

print("\nTokens after Stop Words Removal:", filtered_tokens, sep="\n")


Tokens after Stop Words Removal:
['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', '.']


In [19]:
# Step 4: Stemming -- run every remaining token through the Porter stemmer
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, filtered_tokens))
print(f"\nTokens after Stemming:\n{stemmed_tokens}")


Tokens after Stemming:
['quick', 'brown', 'fox', 'jump', 'lazi', 'dog', '.']


In [20]:
# Step 5: Lemmatization
# lemmatize() is called without a POS argument, so NLTK's default part of
# speech (noun) is applied to every token.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = list(map(lemmatizer.lemmatize, filtered_tokens))
print("\nTokens after Lemmatization:", lemmatized_tokens, sep="\n")



Tokens after Lemmatization:
['quick', 'brown', 'fox', 'jump', 'lazy', 'dog', '.']


In [24]:
# -------------------------------
# Step 6: Term Frequency (TF) and TF-IDF
# Two short documents are used so that some terms are shared between them
# and others appear in only one.

documents = [
    "The quick brown fox jumps over the lazy dog.",
    "Never jump over the lazy dog quickly."
]

# Term Frequency (TF): per-document counts of each non-stop-word term
vectorizer = CountVectorizer(stop_words='english')
tf_matrix = vectorizer.fit_transform(documents)

print("\nTerm Frequency (TF) Matrix:")
print(vectorizer.get_feature_names_out())
print(tf_matrix.toarray())

# TF-IDF: term weights computed by TfidfVectorizer on the same documents
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# The feature names double as the column labels of the TF-IDF table below
feature_names = tfidf_vectorizer.get_feature_names_out()
print("\nFeatures/Words:")
print(feature_names)

# Show the TF-IDF matrix as a labelled table: one row per document, one
# column per term. `pd` comes from the import cell at the top of the notebook.
df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
print("\nTF-IDF Matrix (Proper Table):")
print(df)



Term Frequency (TF) Matrix:
['brown' 'dog' 'fox' 'jump' 'jumps' 'lazy' 'quick' 'quickly']
[[1 1 1 0 1 1 1 0]
 [0 1 0 1 0 1 0 1]]

Features/Words:
['brown' 'dog' 'fox' 'jump' 'jumps' 'lazy' 'quick' 'quickly']

TF-IDF Matrix (Proper Table):
      brown       dog       fox      jump     jumps      lazy     quick  \
0  0.446656  0.317800  0.446656  0.000000  0.446656  0.317800  0.446656   
1  0.000000  0.409937  0.000000  0.576152  0.000000  0.409937  0.000000   

    quickly  
0  0.000000  
1  0.576152  


In [None]:
# Step 1: Library Installation and Import
# We use two main libraries:

# NLTK (Natural Language Toolkit) — for text preprocessing like tokenization, stopword removal, stemming, lemmatization, etc.

# Scikit-learn — for calculating Term Frequency (TF) and Inverse Document Frequency (IDF).

# We install and import required modules like word_tokenize, stopwords, PorterStemmer, WordNetLemmatizer, CountVectorizer, and TfidfVectorizer.

# Step 2: Downloading Necessary Resources
# NLTK requires certain datasets (called corpora) for tokenization and other tasks.

# We download:

# 'punkt_tab' — for tokenization.

# 'averaged_perceptron_tagger_eng' — for Part of Speech (POS) tagging.

# 'stopwords' — list of common words like "the", "is", etc.

# 'wordnet' — for lemmatization.

# Step 3: Sample Document
# We define a simple sample sentence:

# "The quick brown fox jumps over the lazy dog."
# Step 4: Text Preprocessing
# 4.1 Tokenization
# Break the sentence into individual words (tokens).

# Example output:

# ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']
# 4.2 POS Tagging
# Assign Part of Speech (like noun, verb, adjective) to each token.

# Example output:

# [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ...]
# Full form and meaning of the POS tags that appear in this output:


# | Tag | Full Form                                | Meaning                                           |
# |-----|------------------------------------------|---------------------------------------------------|
# | NN  | Noun, Singular                           | Names of people, places, things (e.g., dog, fox)  |
# | JJ  | Adjective                                | Describes a noun (e.g., quick, lazy)              |
# | DT  | Determiner                               | Words that modify nouns (e.g., the, a, an)        |
# | VBZ | Verb, 3rd person singular present        | Action word for he/she/it (e.g., jumps)           |
# | IN  | Preposition or subordinating conjunction | Links words (e.g., over, in, on)                  |
# 4.3 Stop Words Removal
# Remove very common words that do not add much meaning (e.g., "the", "is", "in", "on").

# We keep only meaningful words.

# Example output:

# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', '.']
# 4.4 Stemming
# Reduce words to a root form by stripping suffixes with fixed rules (the Porter stemmer).

# Example:

# "jumps" → "jump"

# "lazy" → "lazi"

# Stemming does not always produce real English words but is useful for matching similar words.

# 4.5 Lemmatization
# Like stemming, but it looks words up in the WordNet dictionary instead of applying suffix-stripping rules.

# Converts words to real dictionary base forms.

# Example:

# "jumps" → "jump"

# "dogs" → "dog"

# Step 5: Document Representation
# 5.1 Term Frequency (TF)
# How often a word appears in the document.

# We use CountVectorizer to build a matrix where:

# Each row = a document

# Each column = a term (word)

# Each cell = count of how many times the word appeared.

# Example:


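#       brown  dog  fox  jump  jumps  lazy  quick  quickly
# Doc1      1    1    1     0      1     1      1        0
# Doc2      0    1    0     1      0     1      0        1
# ("jump" and "jumps" are separate columns because CountVectorizer does not stem.)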
# 5.2 TF-IDF (Term Frequency - Inverse Document Frequency)
# TF-IDF gives more importance to rare words across documents.

# Common words get lower weight, rare words get higher weight.

# Helps highlight important terms in a collection of documents.

# Example:

# Doc1: {'brown': 0.447, 'dog': 0.318, 'fox': 0.447, ...}
# Doc2: {'dog': 0.410, 'jump': 0.576, 'lazy': 0.410, ...}
# ✅ Final Output
# At the end:

# You have all the preprocessing steps applied to your sample text.

# You also have a numerical representation (TF and TF-IDF) ready for machine learning or analysis.

