In [2]:
# Fetch the NLTK resources used by the preprocessing code further down.
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')   # fetched alongside 'punkt'; a later cell labels this download the fix for a LookupError
nltk.download('stopwords')   # read later through stopwords.words('english')
nltk.download('wordnet')     # presumably the data behind WordNetLemmatizer — TODO confirm


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# ===============================
# Install & Download Dependencies
# ===============================
!pip install nltk spacy pandas

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')   # Fix for LookupError
nltk.download('stopwords')
nltk.download('wordnet')

# Download SpaCy English model
!python -m spacy download en_core_web_sm




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m113.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
# ===============================
# 📌 TEXT PREPROCESSING PIPELINE
# ===============================

# Install dependencies
!pip install nltk spacy pandas

# Download required NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Download SpaCy model
!python -m spacy download en_core_web_sm

# ===============================
# 1. Load Dataset
# ===============================
import pandas as pd

# Example dataset (you can replace this with your own CSV/JSON/text file)
# Toy corpus: three sentences mixing upper/lower case, punctuation, digits and
# inflected word forms, so each cleaning step has something to act on.
data = dict(
    text=[
        "Hello WORLD! This is a sample sentence, with numbers like 123.",
        "NLTK & SpaCy are amazing tools for NLP preprocessing!!",
        "The cats are running, studied hard, and will be studies again...",
    ],
)
df = pd.DataFrame(data=data)

# Heading and frame go out in one call; sep="\n" keeps them on separate lines.
print("🔹 Original Dataset:", df, sep="\n")

# ===============================
# 2. Define Preprocessing Pipeline
# ===============================
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy

# Load the small English spaCy pipeline.
# NOTE(review): `nlp` is not referenced by TextPreprocessor below, which
# lemmatizes with NLTK's WordNetLemmatizer instead — confirm whether anything
# later in the notebook still needs this object.
nlp = spacy.load("en_core_web_sm")

# English stopwords from NLTK, held in a set. TextPreprocessor.preprocess reads
# this module-level name when it filters tokens.
stop_words = set(stopwords.words('english'))

class TextPreprocessor:
    """Configurable cleaning pipeline that turns raw text into a list of tokens.

    Steps, in order: lowercase, delete punctuation, optionally delete digits,
    tokenize with NLTK, drop stopwords, then optionally stem and/or lemmatize.

    Args:
        remove_numbers: When True, every run of digits is deleted from the text.
        use_stemming: When True, tokens are reduced with the Porter stemmer.
        use_lemmatization: When True, tokens are passed through the WordNet
            lemmatizer. If stemming is enabled as well, the lemmatizer receives
            the already-stemmed tokens (stemming always runs first).
    """

    def __init__(self, remove_numbers=True, use_stemming=False, use_lemmatization=True):
        self.remove_numbers = remove_numbers
        self.use_stemming = use_stemming
        self.use_lemmatization = use_lemmatization
        # Both helpers are created regardless of the flags; the flags only
        # decide whether preprocess() calls them.
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

    @staticmethod
    def _is_missing(value):
        """Return True for None and for float NaN (NaN is the only float unequal to itself)."""
        return value is None or (isinstance(value, float) and value != value)

    def preprocess(self, text):
        """Clean one piece of text and return its tokens.

        Args:
            text: The raw string. None and float NaN are treated as "no text".

        Returns:
            A list of token strings; empty when the input is missing or when
            nothing but whitespace is left after cleaning.
        """
        # Missing cells (None / NaN) have no .lower(); report them as "no
        # tokens" instead of failing with AttributeError.
        if self._is_missing(text):
            return []

        # 1. Lowercase
        text = text.lower()

        # 2. Remove punctuation & special characters.
        #    Characters are deleted, not replaced by a space, so the pieces
        #    around them are joined ("well-known" -> "wellknown").
        text = re.sub(r'[^\w\s]', '', text)

        # 3. Remove numbers if irrelevant
        if self.remove_numbers:
            text = re.sub(r'\d+', '', text)

        # Nothing left to tokenize once punctuation/digits are gone.
        if not text.strip():
            return []

        # 4. Tokenization
        tokens = word_tokenize(text)

        # 5. Remove stopwords (module-level `stop_words` set)
        tokens = [word for word in tokens if word not in stop_words]

        # 6. Stemming and lemmatization are independent switches; with both
        #    enabled the lemmatizer runs on the stemmer's output.
        if self.use_stemming:
            tokens = [self.stemmer.stem(word) for word in tokens]
        if self.use_lemmatization:
            tokens = [self.lemmatizer.lemmatize(word) for word in tokens]

        return tokens

    def preprocess_text_column(self, df, text_column="text"):
        """Clean every value of one DataFrame column.

        Two columns are written onto ``df`` itself (the frame is modified in
        place, not copied): ``cleaned_tokens`` with the token list of each row
        and ``cleaned_text`` with those tokens joined by single spaces.

        Args:
            df: Frame holding the raw text.
            text_column: Name of the column to clean.

        Returns:
            The same ``df`` object, now carrying the two extra columns.
        """
        df["cleaned_tokens"] = df[text_column].apply(self.preprocess)
        df["cleaned_text"] = df["cleaned_tokens"].apply(" ".join)
        return df

# ===============================
# 3. Apply Pipeline
# ===============================
# Every optional stage is switched on for this demo run.
processor = TextPreprocessor(
    remove_numbers=True,
    use_stemming=True,
    use_lemmatization=True,
)

# Clean the "text" column; the call hands back the frame with the
# cleaned_tokens / cleaned_text columns attached.
cleaned_df = processor.preprocess_text_column(df, text_column="text")

# Heading and frame in one call, each on its own line.
print("\n✅ Cleaned Dataset:", cleaned_df, sep="\n")

# ===============================
# 4. Save Cleaned Dataset
# ===============================
# CSV export, leaving the row index out of the file
cleaned_df.to_csv(path_or_buf="cleaned_dataset.csv", index=False)

# JSON export: record-oriented, written one record per line
cleaned_df.to_json(path_or_buf="cleaned_dataset.json", lines=True, orient="records")

print("\n📂 Files saved: cleaned_dataset.csv & cleaned_dataset.json")

# ===============================
# 5. Show Before & After
# ===============================
# Walk both columns by position. Looking rows up as df['text'][i] treats i as
# an index *label*, which raises KeyError (or picks the wrong row) as soon as
# the frame's index is not exactly 0..n-1, e.g. after filtering rows or loading
# data with its own index.
for original_text, cleaned_text in zip(df['text'], cleaned_df['cleaned_text']):
    print("\n🔹 Original:", original_text)
    print("🔹 Cleaned :", cleaned_text)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
🔹 Original Dataset:
                                                text
0  Hello WORLD! This is a sample sentence, with n...
1  NLTK & SpaCy are amazing tools for NLP preproc...
2  The cats are running, studied hard, and will b...

✅ Cleaned Dataset:
                                                text  \
0  Hello WORLD! This is a sample sentence, with n...   
1  NLTK & SpaCy are amazing tools for NLP preproc...   
2  The cats are running, studied hard, an