Assignment 3:
Text Cleaning, Lemmatization, Stop Word Removal, Label Encoding & TF-IDF

In [1]:
# Toy corpus: every sentence is written next to its sentiment label, then the
# pairs are unzipped into the two parallel lists used by the later cells.
texts, labels = map(list, zip(
    ("I love NLP! It is very interesting.", "positive"),
    ("Text cleaning is an important step in NLP.", "neutral"),
    ("TF-IDF converts text into numerical features.", "neutral"),
))


Text Cleaning

In [2]:
import re

def clean_text(text):
    """Normalize a raw string into lowercase, letters-only, single-spaced text.

    Steps, in order:
      1. lower-case the input;
      2. delete every character that is not ``a``-``z`` or whitespace
         (digits and punctuation are removed without leaving a gap, so
         "TF-IDF" becomes "tfidf");
      3. collapse each run of whitespace into one space;
      4. trim leading and trailing whitespace.

    Returns the cleaned string (empty if nothing survives).
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

# Run every raw sentence through clean_text, keeping the original order.
cleaned_texts = list(map(clean_text, texts))
print("Cleaned Texts:", cleaned_texts)


Cleaned Texts: ['i love nlp it is very interesting', 'text cleaning is an important step in nlp', 'tfidf converts text into numerical features']


Lemmatization (Using WordNet)

In [3]:
import nltk
from nltk.stem import WordNetLemmatizer

# Request the NLTK data packages before building the lemmatizer.
# NOTE(review): presumably both are needed by WordNetLemmatizer — confirm for
# the installed NLTK version.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

# Split each cleaned sentence on whitespace, lemmatize token by token, and
# re-join with single spaces. lemmatize() receives only the token (no
# part-of-speech argument); in the recorded run this changed just
# "converts" -> "convert" and "features" -> "feature".
lemmatized_texts = [
    " ".join(lemmatizer.lemmatize(token) for token in sentence.split())
    for sentence in cleaned_texts
]

print("Lemmatized Texts:", lemmatized_texts)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MANSI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MANSI\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Lemmatized Texts: ['i love nlp it is very interesting', 'text cleaning is an important step in nlp', 'tfidf convert text into numerical feature']


Stop Word Removal

In [4]:
from nltk.corpus import stopwords

# `nltk` itself is imported in the lemmatization cell; this cell only adds
# the stop-word corpus on top of it.
nltk.download('stopwords')

# A set gives constant-time membership checks in the filter below.
stop_words = set(stopwords.words('english'))

# Keep only the tokens that are not English stop words, then rebuild each
# sentence as a space-separated string.
filtered_texts = [
    " ".join(token for token in sentence.split() if token not in stop_words)
    for sentence in lemmatized_texts
]

print("After Stop Word Removal:", filtered_texts)


After Stop Word Removal: ['love nlp interesting', 'text cleaning important step nlp', 'tfidf convert text numerical feature']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MANSI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Label Encoding

In [5]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the label strings and convert them to integer codes in
# one step (the recorded run maps "neutral" -> 0 and "positive" -> 1).
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Show the raw labels and their codes one after the other for a visual check.
for caption, shown in (
    ("Original Labels:", labels),
    ("Encoded Labels:", encoded_labels),
):
    print(caption, shown)


Original Labels: ['positive', 'neutral', 'neutral']
Encoded Labels: [1 0 0]


TF-IDF Representation

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with default settings, fitted on the stop-word-filtered
# sentences; fit_transform returns one row per sentence.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(filtered_texts)

# Column order of the matrix, followed by the matrix itself converted with
# .toarray() so that every cell is printed.
feature_names = tfidf_vectorizer.get_feature_names_out()
dense_matrix = tfidf_matrix.toarray()

print("TF-IDF Feature Names:", feature_names)
print("TF-IDF Matrix:\n", dense_matrix)


TF-IDF Feature Names: ['cleaning' 'convert' 'feature' 'important' 'interesting' 'love' 'nlp'
 'numerical' 'step' 'text' 'tfidf']
TF-IDF Matrix:
 [[0.         0.         0.         0.         0.62276601 0.62276601
  0.4736296  0.         0.         0.         0.        ]
 [0.49047908 0.         0.         0.49047908 0.         0.
  0.37302199 0.         0.49047908 0.37302199 0.        ]
 [0.         0.46735098 0.46735098 0.         0.         0.
  0.         0.46735098 0.         0.35543247 0.46735098]]
