Install the required packages

In [None]:
!pip install datasets pandas nltk beautifulsoup4 -q


In [None]:
!pip install -q datasets huggingface_hub


In [None]:
from huggingface_hub import login
# Authenticate with the Hugging Face Hub; called without arguments, so the
# access token is requested interactively.
# NOTE(review): presumably only required if the dataset is gated/private — confirm.
login()


Load the Dataset

In [None]:
from datasets import load_dataset

# Load the email dataset from the Hugging Face Hub by its repository id.
ds = load_dataset("jason23322/high-accuracy-email-classifier")

# Print the loaded object to inspect what was returned
# (presumably a DatasetDict listing splits and columns — confirm from the output).
print(ds)


In [None]:
# Materialise the 'train' split as a pandas DataFrame for the steps below
df = ds["train"].to_pandas()

# Quick sanity check: dimensions, then the first few rows
print(f"Dataset shape: {df.shape}")
print(df.head())


Check whether any fields are empty (null)

In [None]:
# Column names, followed by the count of missing values in each column
print(df.columns)
print(df.isna().sum())


Clean the dataset: remove HTML tags, email signatures (basic rule), URLs, special characters and numbers; convert to lowercase; collapse extra whitespace

In [None]:
from datasets import load_dataset
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources.
# 'punkt_tab' is the tokenizer data that word_tokenize loads in recent NLTK
# releases; 'punkt' is kept so older installs keep working. quiet=True matches
# the download calls in the preprocessing cell and keeps the output short.
for _resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_resource, quiet=True)

def clean_email(text):
    """Normalise a raw email body into lowercase, letters-only text.

    Steps, in order: strip HTML markup, drop a trailing signature block,
    remove URLs, replace every character that is not an ASCII letter or
    whitespace with a space, lowercase, and collapse whitespace runs.

    Args:
        text: Raw email content. Non-string values are converted with
            ``str``; ``None`` and float NaN (a missing DataFrame cell)
            produce an empty string.

    Returns:
        The cleaned text as a lowercase string with single spaces between
        words and no leading/trailing whitespace.
    """
    # Missing values would otherwise be stringified into the words "none"/"nan".
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Convert to string
    text = str(text)

    # Remove HTML; the separator stops words in adjacent tags from fusing
    # (e.g. "<p>Hi</p><p>there</p>" -> "Hi there" instead of "Hithere").
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Remove email signatures (simple rule): a line consisting only of "--"
    # marks the start of the signature, so drop that line and everything
    # after it. An inline " -- " inside a sentence is left alone.
    text = re.sub(r'(?ms)^[ \t]*--[ \t\r]*$.*', '', text)

    # Remove URLs: require a scheme or a "www." prefix so ordinary words that
    # merely contain "http"/"www" survive; case-insensitive because
    # lowercasing happens later.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', ' ', text, flags=re.IGNORECASE)

    # Replace special characters & numbers with a space (not the empty string)
    # so that "hello,world" becomes two words rather than "helloworld".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


Preprocess the cleaned text into an NLP-ready form for ML: tokenization, stopword removal, lemmatization

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data used by the preprocessing function
for _resource in ('stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Shared, module-level helpers built once and reused for every row
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Tokenize ``text``, drop stopwords, lemmatize, and re-join with spaces.

    Relies on the module-level ``word_tokenize``, ``stop_words`` and
    ``lemmatizer`` objects.

    Args:
        text: The string to process.

    Returns:
        A single string of the surviving, lemmatized tokens separated by
        one space each.
    """
    # Keep only tokens that are not in the stopword set
    kept = (tok for tok in word_tokenize(text) if tok not in stop_words)

    # Lemmatize each surviving token and stitch the result back together
    return " ".join(lemmatizer.lemmatize(tok) for tok in kept)

In [None]:
# Stage 1: raw email text -> cleaned text
df['clean_text'] = df['text'].map(clean_email)

# Stage 2: cleaned text -> tokenized, stopword-free, lemmatized text
df['processed_text'] = df['clean_text'].map(preprocess_text)

# Compare the original and fully processed text side by side
print(df.loc[:, ['text', 'processed_text']].head())


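Encode the category labels of the cleaned and preprocessed data as integers. Note that `LabelEncoder` assigns codes in sorted (alphabetical) order of the class names — e.g. complaint->0, feedback->1, request->2, spam->3 — so inspect `label_encoder.classes_` for the actual mapping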

In [None]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder numbers the classes by their sorted order, not by order of
# appearance, so the integer ids are determined entirely by the label names.
label_encoder = LabelEncoder()

df['encoded_label'] = label_encoder.fit_transform(df['category'])

# Show the complete label -> integer mapping; head() alone only reveals the
# classes that happen to occur in the first five rows.
label_mapping = {label: code for code, label in enumerate(label_encoder.classes_)}
print("Label mapping:", label_mapping)

print(df[['category', 'encoded_label']].head())

Save the processed dataset as a CSV file

In [None]:
# Write the DataFrame to a CSV file in the current working directory;
# index=False leaves the row index out of the file.
df.to_csv("clean_email_dataset.csv", index=False)


Optional: also store a copy in Google Drive (Google Colab only)

In [None]:
# google.colab is only importable inside a Google Colab runtime.
from google.colab import drive
# Mount Google Drive at /content/drive so it can be used like a local folder.
drive.mount('/content/drive')


In [None]:
# Write the same CSV under the mounted Drive path (requires the mount cell above
# to have been run); index=False leaves the row index out of the file.
df.to_csv("/content/drive/MyDrive/clean_email_dataset.csv", index=False)
