In [None]:
# Step 0: Import Required Libraries (nltk, scikit-learn and numpy must already be installed)
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [None]:
# Step 1: Download NLTK resources (only needed the first time; repeat runs
# find the data already installed and skip the transfer).
#
# Everything the preprocessing pipeline needs is requested here in one place.
# The list includes 'punkt_tab' and 'averaged_perceptron_tagger_eng', which
# this notebook otherwise only fetches in separate cells further down.
# NOTE(review): those two names are presumably the ones newer NLTK releases
# look up for word_tokenize / pos_tag -- confirm against the installed version.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'omw-1.4',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

In [None]:
# Step 2: Sample Documents
# A small in-memory corpus of three English sentences; every later step
# (preprocessing, TF, TF-IDF) reads from this list.
documents = [
    "Natural Language Processing is a field of Artificial Intelligence.",
    "It deals with the interaction between computers and humans using natural language.",
    "NLP techniques are widely used in text analytics and sentiment analysis.",
]

In [None]:
# Step 3: Text Preprocessing Function
def preprocess_text(text):
    """Run the notebook's preprocessing pipeline on a single document.

    The steps are applied in order: lowercase, tokenize, drop punctuation
    tokens, drop English stop words, POS-tag, stem, and lemmatize.

    Args:
        text: The raw document as a string.

    Returns:
        A dict with the intermediate results of each stage:
            'tokens'     -- lowercased tokens with punctuation tokens removed
            'filtered'   -- 'tokens' with English stop words removed
            'pos_tags'   -- result of pos_tag() on 'filtered'
            'stemmed'    -- PorterStemmer output for each filtered token
            'lemmatized' -- WordNetLemmatizer output for each filtered token
    """
    # 3.1 Convert to lowercase
    text = text.lower()

    # 3.2 Tokenization
    tokens = word_tokenize(text)

    # 3.3 Remove punctuation
    # Check character by character. `word in string.punctuation` is a
    # substring test against one long string, so multi-character punctuation
    # tokens such as "...", "--", "``" or "''" were not removed by it.
    punctuation_chars = set(string.punctuation)
    tokens = [
        word for word in tokens
        if not all(char in punctuation_chars for char in word)
    ]

    # 3.4 Remove Stop Words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # 3.5 POS Tagging
    pos_tags = pos_tag(filtered_tokens)

    # 3.6 Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # 3.7 Lemmatization
    # Pass the part of speech from step 3.5 to the lemmatizer. Without it
    # lemmatize() uses pos='n' for every word, so verb forms such as "used"
    # or "using" come back unchanged. The tag's first letter selects the
    # WordNet category; anything unrecognised falls back to noun.
    wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [
        lemmatizer.lemmatize(word, pos=wordnet_pos_by_prefix.get(tag[:1], 'n'))
        for word, tag in pos_tags
    ]

    return {
        'tokens': tokens,
        'filtered': filtered_tokens,
        'pos_tags': pos_tags,
        'stemmed': stemmed_tokens,
        'lemmatized': lemmatized_tokens
    }

In [None]:
# Fetch the 'punkt_tab' resource in addition to the 'punkt' download in Step 1.
# NOTE(review): presumably this is what word_tokenize loads on newer NLTK
# releases -- confirm against the installed NLTK version.
nltk.download('punkt_tab')

In [None]:
# Requests 'averaged_perceptron_tagger' again; Step 1 already downloads this
# same resource and the next cell does too.
# NOTE(review): this cell looks redundant and can probably be removed.
nltk.download('averaged_perceptron_tagger')

In [None]:
# Tagger data for the pos_tag() call in preprocess_text.
# The first line repeats a download already requested in Step 1.
# NOTE(review): 'averaged_perceptron_tagger_eng' is presumably the resource
# name newer NLTK releases look up for English tagging -- confirm.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
# Step 4: Preprocess and Print Each Document
# Each pair is (printed heading, key in the dict returned by preprocess_text).
# The headings are padded to the same width so the values line up.
report_rows = (
    ("Original Tokens:     ", 'tokens'),
    ("After Stop Removal:  ", 'filtered'),
    ("POS Tags:            ", 'pos_tags'),
    ("After Stemming:      ", 'stemmed'),
    ("After Lemmatization: ", 'lemmatized'),
)

for i, doc in enumerate(documents):
    # Documents are numbered from 1 in the output.
    print(f"\n--- Document {i+1} ---")
    result = preprocess_text(doc)
    for label, key in report_rows:
        print(label, result[key])

In [None]:
# Step 5: Term Frequency (TF) Matrix using CountVectorizer
print("\n--- Term Frequency (TF) Matrix ---")

# Fit on the raw sample documents (not the output of preprocess_text),
# with the vectorizer's stop_words option set to 'english'.
tf_vectorizer = CountVectorizer(stop_words='english')
tf_matrix = tf_vectorizer.fit_transform(documents)

# Show the learned vocabulary, then the matrix converted with toarray().
tf_vocabulary = tf_vectorizer.get_feature_names_out()
print("Vocabulary:", tf_vocabulary)
tf_counts = tf_matrix.toarray()
print(tf_counts)

In [None]:
# Step 6: TF-IDF Matrix using TfidfVectorizer
print("\n--- TF-IDF Matrix ---")

# Fit on the raw sample documents (not the output of preprocess_text),
# with the vectorizer's stop_words option set to 'english'.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Show the learned vocabulary, then the weights rounded to three decimals
# so the printed matrix stays readable.
tfidf_vocabulary = tfidf_vectorizer.get_feature_names_out()
print("Vocabulary:", tfidf_vocabulary)
tfidf_weights = tfidf_matrix.toarray()
print(np.round(tfidf_weights, 3))