### **📚 Text Preprocessing Master File for NLP / ML / GenAI**

In [None]:
# ✅ 1. Import Required Libraries
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

# ✅ 2. Download Required NLTK Resources (Only run once)
# These resources include stopword lists, tokenizers, and lemmatization dictionaries.
# nltk.download() reports and skips packages that are already up to date,
# so re-running this cell is harmless.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.9+ loads the Punkt tokenizer models from 'punkt_tab'; without it
# word_tokenize() raises LookupError on a fresh install.
nltk.download('punkt_tab')
nltk.download('wordnet')

# ✅ 3. Define Individual Cleaning Functions
# Each function handles a specific part of the text cleaning pipeline.

# 🔹 Convert all characters to lowercase for uniformity
def to_lower(text):
    """Return a lowercase copy of ``text``.

    Args:
        text: The string to normalize.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

# 🔹 Remove emojis from text to avoid noise in analysis
# Compiled once at import time. Matches runs of emoji code points, each
# optionally followed by variation selector-16 (U+FE0F) or a zero-width joiner
# (U+200D) so that composed sequences (e.g. family emoji, "❤️") are removed
# without leaving invisible residue behind.
_EMOJI_PATTERN = re.compile(
    "(?:["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (includes skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "]"
    "[\uFE0F\u200D]*)+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """Strip emoji characters from ``text``.

    Covers the common Unicode emoji blocks listed in ``_EMOJI_PATTERN``.
    A variation selector or zero-width joiner is removed only when it
    directly follows a matched emoji, so these characters are left alone
    in ordinary text.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every matched emoji sequence deleted. Surrounding
        whitespace is left as-is.
    """
    return _EMOJI_PATTERN.sub('', text)

# 🔹 Optional: Convert emojis to their textual description (e.g., 😀 → :grinning_face:)
def demojize_emojis(text):
    """Replace emojis in ``text`` with their textual names.

    Delegates to ``emoji.demojize`` with its default settings.

    Args:
        text: The string that may contain emoji characters.

    Returns:
        Whatever ``emoji.demojize(text)`` returns.
    """
    described = emoji.demojize(text)
    return described

# 🔹 Remove HTML tags (e.g., <br>, <p>, etc.)
def remove_html_tags(text):
    """Strip HTML markup from ``text`` and return the visible text.

    The text fragments are joined with a single space. Without a separator,
    words on either side of a tag are fused together (for example
    ``"time.<br /><br />This"`` would become ``"time.This"``), which
    corrupts later tokenization.

    Args:
        text: A string that may contain HTML tags.

    Returns:
        The text content of the parsed document. Runs of spaces may appear
        where tags were adjacent.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ")

# 🔹 Remove URLs (e.g., http://example.com)
def remove_urls(text):
    """Delete URLs from ``text``.

    A URL is recognised as either ``http://`` / ``https://`` followed by
    non-whitespace, or a token beginning with ``www.``. Matching is
    case-insensitive and anchored at a word boundary, so ordinary words
    that merely start with "http" (e.g. "httpd") or contain "www."
    mid-word are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each matched URL replaced by the empty string.
        Surrounding whitespace is not collapsed.
    """
    return re.sub(r'\bhttps?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

# 🔹 Remove punctuation and special characters
# Keeps only word characters (letters, digits, and underscores) and whitespace
def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace.

    Because ``\\w`` includes the underscore, underscores are kept along
    with letters and digits.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all other characters removed (not replaced by spaces).
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    stripped = non_word_or_space.sub('', text)
    return stripped

# 🔹 Tokenize text (split into individual words)
def tokenize_text(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        The token sequence produced by ``word_tokenize(text)``.
    """
    tokens = word_tokenize(text)
    return tokens

# 🔹 Remove common stopwords like "the", "is", "and"
def remove_stopwords(tokens):
    """Filter English stopwords out of ``tokens``.

    The comparison is exact (case-sensitive) against NLTK's English
    stopword list, which is loaded on every call.

    Args:
        tokens: An iterable of word strings.

    Returns:
        A new list containing, in order, the tokens that are not in the
        stopword list.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stops:
            continue
        kept.append(token)
    return kept

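# 🔹 Lemmatize words (e.g., "cats" → "cat") for root-word consistency
# Note: lemmatize() is called without a part-of-speech tag, so every word is treated as a noun;
# verb forms such as "running" are therefore left unchanged.
# This improves model generalization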
def lemmatize_tokens(tokens):
    """Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    ``lemmatize`` is called with the token only, so the lemmatizer's
    default part of speech applies to every word.

    Args:
        tokens: An iterable of word strings.

    Returns:
        A new list with the lemmatized form of each token, in input order.
    """
    wnl = WordNetLemmatizer()
    return list(map(wnl.lemmatize, tokens))

# ✅ 4. Create a Full Cleaning Pipeline
# This function applies all steps in sequence to fully prepare text for NLP tasks
# For example: turning a noisy movie review with emojis, stopwords, and punctuation into clean, model-ready tokens.

def full_clean(text):
    """Run the complete cleaning pipeline on a raw string.

    String-level stages run first, in this order: lowercase, strip HTML
    tags, strip URLs, strip emojis, strip punctuation. The result is then
    tokenized, stopwords are dropped, and the remaining tokens are
    lemmatized.

    NOTE(review): punctuation is stripped before stopword removal, so
    contractions such as "there's" reach the stopword filter as "theres".
    Confirm this ordering is intended.

    Args:
        text: The raw input string.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    string_stages = (
        to_lower,            # Step 1: Lowercase
        remove_html_tags,    # Step 2: Remove HTML tags
        remove_urls,         # Step 3: Remove URLs
        remove_emojis,       # Step 4: Remove emojis
        remove_punctuation,  # Step 5: Remove punctuation
    )
    token_stages = (
        remove_stopwords,    # Step 7: Remove stopwords
        lemmatize_tokens,    # Step 8: Lemmatize words
    )

    cleaned = text
    for stage in string_stages:
        cleaned = stage(cleaned)

    words = tokenize_text(cleaned)  # Step 6: Tokenize words
    for stage in token_stages:
        words = stage(words)

    return ' '.join(words)

# ✅ 5. Example Test Case
# This demonstrates how to use the pipeline on a sample text string.
# 🧪 You can replace this text with any raw input like reviews, tweets, or chat logs.

# Demo entry point: the body runs only when __name__ is '__main__'
# (i.e. the file is executed directly rather than imported).
if __name__ == '__main__':
    # Sample review containing <br /> tags, a URL, mixed case, contractions,
    # and punctuation, so most cleaning steps have something to act on.
    original_text = "Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />Visit http://example.com to read more! OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."
    # print() separates its arguments with a space, so each text starts one
    # space in on the line after its heading.
    print("Original Text:\n", original_text)
    print("\nCleaned Text:\n", full_clean(original_text))
