In [2]:
#Environment Setup
# Install the libraries (run this in your terminal or Jupyter Notebook)
!pip install nltk pandas scikit-learn

# Run this in your Python script or notebook once
import nltk
nltk.download('punkt')        # For tokenization
nltk.download('stopwords')    # For stopword list
nltk.download('wordnet')      # For lemmatization



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
#Reusable Preprocessing Function/Class
#Core Imports and Initializations
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# --- Initialize Global Resources ---
# 1. Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()
# 2. Get English Stopwords
STOP_WORDS = set(stopwords.words('english'))

In [4]:
# Preprocessing Functions
## Step 1: Convert Text to Lowercase
def to_lowercase(text):
    """Converts all text to lowercase."""
    if isinstance(text, str):
        return text.lower()
    return text

## Step 2: Remove Punctuation & Special Characters (and remove numbers/digits)
def remove_punc_and_numbers(text, keep_numbers=False):
    """
    Removes punctuation and special characters.
    Optionally removes numbers based on 'keep_numbers'.
    """
    if not isinstance(text, str):
        return ""

    # Replace all non-word characters (excluding numbers, if we decide to keep them)
    # The regex r'[^\w\s]' targets anything that isn't a word character or whitespace.
    # We will refine this after considering numbers.

    # 1. Remove URLs (a good extra step!)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 2. Remove non-alphabetic characters (including punctuation)
    if keep_numbers:
        # Keep letters and numbers
        text = re.sub(r'[^a-z0-9\s]', '', text)
    else:
        # Keep ONLY letters
        text = re.sub(r'[^a-z\s]', '', text)

    # Remove extra whitespace created by the removal process
    text = re.sub(r'\s+', ' ', text).strip()
    return text

## Step 3 & 4: Tokenization & Stopword Removal
def tokenize_and_remove_stopwords(text):
    """
    Splits text into tokens (words) and removes common stopwords.
    Returns a list of tokens.
    """
    if not isinstance(text, str):
        return []

    # Tokenization
    tokens = word_tokenize(text)

    # Stopword Removal
    filtered_tokens = [word for word in tokens if word not in STOP_WORDS]

    return filtered_tokens

## Step 5: Apply Stemming or Lemmatization (We'll use Lemmatization for better accuracy)
def lemmatize_tokens(tokens):
    """
    Applies lemmatization to a list of tokens.
    Returns a list of lemmatized tokens.
    """
    # Note: Lemmatizer is initialized globally at the top
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

In [5]:
#The Complete Preprocessing Pipeline
def preprocess_text_pipeline(text, keep_numbers=False, return_string=True):
    """
    The main reusable text preprocessing pipeline.

    Args:
        text (str): The raw input text.
        keep_numbers (bool): If True, numbers are kept.
        return_string (bool): If True, joins the tokens back into a single string.
                              If False, returns a list of tokens.

    Returns:
        str or list: The cleaned text/tokens.
    """
    # 1. Lowercasing
    text = to_lowercase(text)

    # 2. Punctuation & Number Removal
    # We pass the lowercase text to remove_punc_and_numbers
    text = remove_punc_and_numbers(text, keep_numbers=keep_numbers)

    # 3. Tokenization & Stopword Removal
    tokens = tokenize_and_remove_stopwords(text)

    # 4. Lemmatization
    cleaned_tokens = lemmatize_tokens(tokens)

    # 5. Return Format
    if return_string:
        # Re-join tokens into a cleaned sentence for the final output
        return " ".join(cleaned_tokens)
    else:
        return cleaned_tokens

In [6]:
#Load Dataset
# Create a sample DataFrame (simulating loading a CSV/JSON file)
data = {
    'review_id': [1, 2, 3],
    'raw_text': [
        "The battery life is AMAZING, lasting 48 hours! I'm studying this product.",
        "Terrible service and very slow shipping. The price was too high.",
        "I was surprised! This is a 10/10 product. Not bad. www.example.com"
    ]
}
df = pd.DataFrame(data)

# Inspect first few rows (simulating: df.head())
print("--- Raw Dataset (First Few Rows) ---")
print(df)

--- Raw Dataset (First Few Rows) ---
   review_id                                           raw_text
0          1  The battery life is AMAZING, lasting 48 hours!...
1          2  Terrible service and very slow shipping. The p...
2          3  I was surprised! This is a 10/10 product. Not ...


In [9]:
#Apply the Pipeline and Save Cleaned Text
# Apply the pipeline to the entire 'raw_text' column
# We will choose to KEEP numbers for this example, as they might be relevant in reviews (e.g., ratings, dates, etc.)
df['cleaned_text'] = df['raw_text'].apply(
    lambda x: preprocess_text_pipeline(x, keep_numbers=True, return_string=True)
)

print("\n--- Cleaned Dataset ---")
print(df[['raw_text', 'cleaned_text']])


--- Cleaned Dataset ---
                                            raw_text  \
0  The battery life is AMAZING, lasting 48 hours!...   
1  Terrible service and very slow shipping. The p...   
2  I was surprised! This is a 10/10 product. Not ...   

                                        cleaned_text  
0  battery life amazing lasting 48 hour im studyi...  
1          terrible service slow shipping price high  
2                         surprised 1010 product bad  


In [8]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
# Save Cleaned Dataset
# Save the cleaned dataset to a new CSV file
# Only save the relevant columns
output_filename = 'cleaned_reviews_dataset.csv'
df[['review_id', 'cleaned_text']].to_csv(output_filename, index=False)

print(f"\n✅ Cleaned dataset successfully saved to: {output_filename}")


✅ Cleaned dataset successfully saved to: cleaned_reviews_dataset.csv


In [11]:
import pandas as pd

# Load the file you just saved
df_cleaned = pd.read_csv('cleaned_reviews_dataset.csv')

# Display the first few rows to inspect
print(df_cleaned.head())

   review_id                                       cleaned_text
0          1  battery life amazing lasting 48 hour im studyi...
1          2          terrible service slow shipping price high
2          3                         surprised 1010 product bad


In [12]:
import nltk
# Download the resource needed for Part-of-Speech tagging
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [13]:
from nltk.corpus import wordnet
from nltk import pos_tag

# Function to convert NLTK tag to WordNet tag format
def get_wordnet_pos(treebank_tag):
    """Converts NLTK POS tags to WordNet POS tags for the Lemmatizer."""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN # Default to noun

# The new, improved Lemmatization function
def lemmatize_tokens_pos_aware(tokens):
    """
    Applies POS-aware lemmatization to a list of tokens.
    """
    lemmatized_tokens = []
    # 1. Get POS tags for all tokens
    tagged_tokens = pos_tag(tokens)

    for word, tag in tagged_tokens:
        # 2. Convert NLTK tag to WordNet tag
        wntag = get_wordnet_pos(tag)

        # 3. Lemmatize using the correct tag (using the globally defined 'lemmatizer')
        lemma = lemmatizer.lemmatize(word, pos=wntag)
        lemmatized_tokens.append(lemma)

    return lemmatized_tokens

In [14]:
def preprocess_text_pipeline(text, keep_numbers=False, return_string=True):
    # 1. Lowercasing
    text = to_lowercase(text)

    # 2. Punctuation & Number Removal
    text = remove_punc_and_numbers(text, keep_numbers=keep_numbers)

    # 3. Tokenization & Stopword Removal
    tokens = tokenize_and_remove_stopwords(text)

    # 4. Lemmatization (CALL THE NEW FUNCTION HERE!)
    cleaned_tokens = lemmatize_tokens_pos_aware(tokens)

    # 5. Return Format
    if return_string:
        return " ".join(cleaned_tokens)
    else:
        return cleaned_tokens

In [15]:
# Re-apply the pipeline to the entire 'raw_text' column
df['cleaned_text'] = df['raw_text'].apply(
    lambda x: preprocess_text_pipeline(x, keep_numbers=True, return_string=True)
)

print("\n--- Cleaned Dataset ---")
print(df[['raw_text', 'cleaned_text']])

LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [16]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [17]:
# Re-apply the pipeline to the entire 'raw_text' column
df['cleaned_text'] = df['raw_text'].apply(
    lambda x: preprocess_text_pipeline(x, keep_numbers=True, return_string=True)
)

print("\n--- Cleaned Dataset ---")
print(df[['raw_text', 'cleaned_text']])


--- Cleaned Dataset ---
                                            raw_text  \
0  The battery life is AMAZING, lasting 48 hours!...   
1  Terrible service and very slow shipping. The p...   
2  I was surprised! This is a 10/10 product. Not ...   

                                       cleaned_text  
0  battery life amaze last 48 hour im study product  
1         terrible service slow shipping price high  
2                        surprised 1010 product bad  
