1️⃣ Basic Text Cleaning (Lowercasing, Removing Punctuation & Digits)

In [2]:
import re

def clean_text(text):
    """Normalize raw text for simple NLP preprocessing.

    Steps, in order: lowercase, drop digit runs, drop punctuation
    (underscores included), then collapse whitespace.

    Args:
        text: The input string.

    Returns:
        The cleaned string, with words separated by single spaces and no
        leading or trailing whitespace.
    """
    text = text.lower()  # Lowercasing
    text = re.sub(r'\d+', '', text)  # Remove digits
    # `\w` counts "_" as a word character, so it has to be listed explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)  # Remove punctuation
    # Deleting tokens such as "2024" leaves doubled spaces behind; split/join
    # collapses every whitespace run and trims both ends in one pass.
    return ' '.join(text.split())


text = "Hello, NLP! Text preprocessing in 2024 is fun.123"
cleaned_text = clean_text(text)
print(cleaned_text)  # Output: "hello nlp text preprocessing in is fun"

hello nlp text preprocessing in  is fun


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Two tiny example documents to compare.
texts = ["I love AI", "AI is amazing"]

# Fit the vocabulary and build the TF-IDF document-term matrix in one step.
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(texts)

# One row on each side, so the result is a single-element 2-D array.
sim = cosine_similarity(tfidf[0], tfidf[1])
print(sim[0, 0])  # similarity score between 0 and 1

2️⃣ Tokenization (NLTK & spaCy)

In [1]:
import nltk
from nltk.tokenize import word_tokenize
import spacy

# word_tokenize relies on the Punkt sentence-tokenizer data. NLTK 3.9+ loads it
# from the 'punkt_tab' resource while older releases read 'punkt', so request
# both to keep this cell working across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

text = "Natural Language Processing (NLP) is awesome!"
tokens_nltk = word_tokenize(text)
print(tokens_nltk)  # ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'awesome', '!']

# Needs the small English pipeline to be installed beforehand:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
tokens_spacy = [token.text for token in nlp(text)]
print(tokens_spacy)  # ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'is', 'awesome', '!']

ModuleNotFoundError: No module named 'spacy'

3️⃣ Stopword Removal (NLTK)

In [None]:
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus.
nltk.download('stopwords')

# A set gives constant-time membership checks during filtering.
stop_words = set(stopwords.words('english'))

# Compare case-insensitively, but keep each surviving token's original casing.
filtered_tokens = list(
    filter(lambda tok: tok.lower() not in stop_words, tokens_nltk)
)
print(filtered_tokens)  # ['Natural', 'Language', 'Processing', '(', 'NLP', ')', 'awesome', '!']

6️⃣ Removing URLs, Emails & Special Characters

In [None]:
def remove_noise(text):
    """Strip email addresses, URLs and non-alphanumeric symbols from text.

    Args:
        text: The input string.

    Returns:
        The text with every email address, every ``http(s)://`` or ``www.``
        link, and every character other than ASCII letters, digits and
        whitespace removed. Whitespace left around removed items is kept.
    """
    # Emails go first: otherwise the URL pattern can eat the "www." host of an
    # address such as bob@www.example.com and strand the local part ("bob").
    text = re.sub(r'\S+@\S+', '', text)  # Remove emails
    # Require "://" after the scheme (or "www." at a word boundary) so ordinary
    # words that merely start with "http", e.g. "httpd", are not treated as links.
    text = re.sub(r'https?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)  # Remove URLs
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
    return text

# Sample input containing one email address and one URL.
text = "Contact me at hello@example.com or visit https://example.com!"
cleaned_text = remove_noise(text)
# The address and the link are stripped; the spaces that surrounded them remain.
print(cleaned_text)  # "Contact me at  or visit "