In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [4]:
# Download necessary NLTK data files (each call is a no-op when the package is already up to date).
nltk.download('punkt')
# NLTK 3.9+ makes word_tokenize load the 'punkt_tab' resource instead of the
# pickled 'punkt' models; without it a fresh-kernel run fails with LookupError.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yatha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yatha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yatha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Tokenization, Lemmatization, Stopwords, Removing Special Characters

In [20]:
def preprocess_paragraph(paragraph):
    """Normalize a paragraph into a space-separated string of lemmatized words.

    Steps, in order: replace every character that is not an ASCII letter,
    digit, or whitespace with a space; lowercase; tokenize with NLTK's
    ``word_tokenize``; drop English stopwords; lemmatize each remaining
    token with ``WordNetLemmatizer`` using its default part of speech.

    Args:
        paragraph (str): Raw text to clean.

    Returns:
        str: The surviving lemmas joined by single spaces, in their original
        order. Empty string when no token survives.
    """
    # Replace (rather than delete) special characters with a space. Deleting
    # them fuses words separated only by punctuation ("tokenized,and" ->
    # "tokenizedand", "state-of-the-art" -> "stateoftheart") and turns
    # contractions into tokens such as "dont" that are not in the stopword
    # list. With a space, "don't" becomes "don t", and both pieces are
    # covered by NLTK's English stopwords.
    paragraph = re.sub(r'[^A-Za-z0-9\s]', ' ', paragraph)

    # Lowercase before stopword filtering: the NLTK stopword list is lowercase.
    paragraph = paragraph.lower()

    # Tokenize the paragraph into words (extra whitespace is ignored).
    tokens = word_tokenize(paragraph)

    # Remove stopwords; a set gives O(1) membership checks.
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the remaining tokens.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return ' '.join(lemmatized_tokens)

In [30]:
# Example usage: run one paragraph through preprocess_paragraph and print the result.
# Note: despite its name, processed_tokens is a single space-joined string, not a list.
paragraph = "This is an example paragraph! It will be processed to remove stopwords, converted to lowercase, tokenized, and lemmatized."
processed_tokens = preprocess_paragraph(paragraph)
print(processed_tokens)

example paragraph processed remove stopwords converted lowercase tokenized lemmatized


str

In [22]:
# Small three-document corpus used as input for the Bag of Words and TF-IDF sections.
paragraphs = [
    "This is the first example paragraph. It will be processed to remove stopwords, converted to lowercase, tokenized, and lemmatized.",
    "Here is another example! This paragraph will also go through the same preprocessing steps.",
    "Preprocessing is an important step in natural language processing."
]

In [23]:
# Clean every paragraph with preprocess_paragraph, keeping the input order
preprocessed_paragraphs = list(map(preprocess_paragraph, paragraphs))

## Bag Of Words

In [24]:
# Apply Bag of Words model: fit a CountVectorizer on the cleaned paragraphs and
# encode them in the same step. Each row of bow_matrix is one paragraph and each
# column is one vocabulary term; call .toarray() to view it as a dense array.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(preprocessed_paragraphs)

In [25]:
# Show the term counts as a dense array (one row per paragraph)
print("Bag of Words matrix:", bow_matrix.toarray(), sep="\n")

Bag of Words matrix:
[[0 0 1 1 1 0 0 0 1 1 0 1 0 1 0 1 0 1 1]
 [1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0]
 [0 0 0 0 0 0 1 1 0 0 1 0 1 0 1 0 1 0 0]]


In [26]:
# Column labels for the Bag of Words matrix, in column order
print("Feature names (Bag of Words):", vectorizer.get_feature_names_out(), sep="\n")

Feature names (Bag of Words):
['also' 'another' 'converted' 'example' 'first' 'go' 'important'
 'language' 'lemmatized' 'lowercase' 'natural' 'paragraph' 'preprocessing'
 'processed' 'processing' 'remove' 'step' 'stopwords' 'tokenized']


## TF-IDF

In [27]:
# Apply TF-IDF: fit a TfidfVectorizer on the same cleaned paragraphs and encode
# them in the same step. Rows are paragraphs and columns are vocabulary terms;
# call .toarray() to view the weights as a dense array.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_paragraphs)

In [28]:
# Show the TF-IDF weights as a dense array (one row per paragraph)
print("TF-IDF matrix:", tfidf_matrix.toarray(), sep="\n")

TF-IDF matrix:
[[0.         0.         0.33046705 0.25132871 0.33046705 0.
  0.         0.         0.33046705 0.33046705 0.         0.25132871
  0.         0.33046705 0.         0.33046705 0.         0.33046705
  0.33046705]
 [0.43381609 0.43381609 0.         0.32992832 0.         0.43381609
  0.         0.         0.         0.         0.         0.32992832
  0.32992832 0.         0.         0.         0.32992832 0.
  0.        ]
 [0.         0.         0.         0.         0.         0.
  0.44036207 0.44036207 0.         0.         0.44036207 0.
  0.3349067  0.         0.44036207 0.         0.3349067  0.
  0.        ]]


In [29]:
# Column labels for the TF-IDF matrix, in column order
print("Feature names (TF-IDF):", tfidf_vectorizer.get_feature_names_out(), sep="\n")

Feature names (TF-IDF):
['also' 'another' 'converted' 'example' 'first' 'go' 'important'
 'language' 'lemmatized' 'lowercase' 'natural' 'paragraph' 'preprocessing'
 'processed' 'processing' 'remove' 'step' 'stopwords' 'tokenized']
