# NLP Preprocessing
# 1. Import Required Libraries


In [34]:
# Basic NLP Libraries
import nltk
import re  #Regular Expressions
import string

# Download the NLTK resources used by the cells below. nltk.download()
# reports packages that are already present as up-to-date, so re-running
# this cell is cheap.
# Recent NLTK releases load the tokenizer and tagger data under new resource
# names ('punkt_tab', 'averaged_perceptron_tagger_eng'); requesting both the
# legacy and the new names keeps word_tokenize() / pos_tag() from raising
# LookupError regardless of the installed NLTK version.
for resource in (
    'punkt',                           # For tokenization (legacy name)
    'punkt_tab',                       # For tokenization (newer NLTK)
    'stopwords',                       # For stopword removal
    'wordnet',                         # For lemmatization
    'averaged_perceptron_tagger',      # For POS tagging (legacy name)
    'averaged_perceptron_tagger_eng',  # For POS tagging (newer NLTK)
):
    nltk.download(resource)

# Specific Tools
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import warnings

# Install a blanket "ignore" filter so that no warning of any category is
# shown for the rest of the session.
warnings.simplefilter('ignore')





[nltk_data] Downloading package punkt to C:\Users\3
[nltk_data]     STAR\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\3
[nltk_data]     STAR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\3
[nltk_data]     STAR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\3 STAR\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# 2. Sample Text

In [4]:
# Sample sentence that the following preprocessing steps operate on.
text = "Hello there! Welcome to the world of NLP. Let's clean, process, and understand text better."

# Show the untouched input (label and text are separated by print's default single space).
print("Original Text:\n", text, sep=" ")


Original Text:
 Hello there! Welcome to the world of NLP. Let's clean, process, and understand text better.


# 3. Text Cleaning
- Removing special characters, numbers, and extra spaces




In [8]:
# Step 1: keep only the characters that are not listed in string.punctuation.
text_cleaned = "".join(ch for ch in text if ch not in string.punctuation)

# Steps 2 and 3: delete every run of digits, then split on whitespace and
# re-join with single spaces (this also trims leading/trailing whitespace).
text_cleaned = " ".join(re.sub(r'\d+', '', text_cleaned).split())

print("Cleaned Text:\n", text_cleaned)


Cleaned Text:
 Hello there Welcome to the world of NLP Lets clean process and understand text better


# 4. Lowercasing

In [11]:
# Fold the cleaned text to lowercase so that differently-cased spellings of
# a word compare equal in the later steps.
text_lower = str.lower(text_cleaned)
print("Lowercased Text:\n", text_lower)


Lowercased Text:
 hello there welcome to the world of nlp lets clean process and understand text better


# 5. Tokenization
- Breaking text into words (word tokenization)

In [14]:
# Split the lowercased string into a list of word-level tokens.
tokens = nltk.tokenize.word_tokenize(text_lower)
print("Tokens:\n", tokens)


Tokens:
 ['hello', 'there', 'welcome', 'to', 'the', 'world', 'of', 'nlp', 'lets', 'clean', 'process', 'and', 'understand', 'text', 'better']


# 6. Stopword Removal

In [17]:
# English stopword list from NLTK, held in a set for fast membership checks.
stop_words = {*stopwords.words('english')}

# Keep only the tokens that do not appear in the stopword set.
tokens_without_stopwords = [tok for tok in tokens if tok not in stop_words]
print("Tokens after Stopword Removal:\n", tokens_without_stopwords)


Tokens after Stopword Removal:
 ['hello', 'welcome', 'world', 'nlp', 'lets', 'clean', 'process', 'understand', 'text', 'better']


# 7. Stemming
- Crude root extraction using PorterStemmer

In [20]:
# Porter stemmer instance used to reduce each token to its stem.
stemmer = PorterStemmer()

# Run the stemmer over every remaining token, preserving order.
stemmed_tokens = list(map(stemmer.stem, tokens_without_stopwords))
print("Stemmed Tokens:\n", stemmed_tokens)


Stemmed Tokens:
 ['hello', 'welcom', 'world', 'nlp', 'let', 'clean', 'process', 'understand', 'text', 'better']


# 8. Lemmatization
- Smarter root form extraction

In [23]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to the WordNet POS letter used by lemmatize().

    Adjective (J*), verb (V*) and adverb (R*) tags map to 'a', 'v' and 'r';
    every other tag falls back to 'n', which is also lemmatize()'s default.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


# Apply lemmatization with a part-of-speech hint for each token. Without the
# hint the lemmatizer treats every word as a noun, so verbs and adjectives
# (e.g. "running") would be returned unreduced.
lemmatized_tokens = [
    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
    for word, tag in nltk.pos_tag(tokens_without_stopwords)
]
print("Lemmatized Tokens:\n", lemmatized_tokens)


Lemmatized Tokens:
 ['hello', 'welcome', 'world', 'nlp', 'let', 'clean', 'process', 'understand', 'text', 'better']


Note: Unlike stemming, lemmatization returns proper dictionary words. For example, the stemmer above produced 'welcom', whereas the lemmatizer kept 'welcome'.

# 9. POS Tagging 

In [28]:
# Perform Part-of-Speech (POS) tagging
pos_tags = nltk.pos_tag(tokens_without_stopwords)
print("POS Tags:\n", pos_tags)


POS Tags:
 [('hello', 'NN'), ('welcome', 'JJ'), ('world', 'NN'), ('nlp', 'NN'), ('lets', 'NNS'), ('clean', 'JJ'), ('process', 'NN'), ('understand', 'VBP'), ('text', 'NN'), ('better', 'NN')]


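- NN → Noun, singular (NNS → plural noun)

- VB → Verb, base form (VBP → present-tense verb, non-3rd-person singular)

- JJ → Adjective

- JJR → Comparative adjective (e.g., "better" in a full sentence). Note that in the output above "better" is tagged NN instead, probably because punctuation and stopwords were removed before tagging, so the tagger no longer sees the sentence context.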

## NOTE
- For serious projects, lemmatization is preferred over stemming.

- Stopword lists should be customized based on your specific NLP task.

- Vectorization (BoW, TF-IDF, Word2Vec) comes after preprocessing.