Install the requirements packages

In [1]:
!pip install datasets pandas nltk beautifulsoup4 -q


In [2]:
!pip install -q datasets huggingface_hub


In [3]:
from huggingface_hub import login
login()


Load the Dataset

In [4]:
from datasets import load_dataset

ds = load_dataset("jason23322/high-accuracy-email-classifier")

print(ds)


README.md:   0%|          | 0.00/3.67k [00:00<?, ?B/s]

train.json:   0%|          | 0.00/4.15M [00:00<?, ?B/s]

test.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'subject', 'body', 'text', 'category', 'category_id'],
        num_rows: 10780
    })
    test: Dataset({
        features: ['id', 'subject', 'body', 'text', 'category', 'category_id'],
        num_rows: 2697
    })
})


In [5]:
# Convert train split to pandas DataFrame
df = ds['train'].to_pandas()

print("Dataset shape:", df.shape)
print(df.head())


Dataset shape: (10780, 6)
                id                                    subject  \
0   promotions_582  Anniversary Special: Buy one get one free   
1        spam_1629         Your Amazon was used on new device   
2         spam_322                    Re: Your Google inquiry   
3  social_media_80         Digital Ritual Experience Creation   
4       forum_1351  Your post was moved to "Programming Help"   

                                                body  \
0  As our loyal customer, get exclusive $60 off $...   
1  Your $5000 refund is processed. Claim: bit.ly/...   
2  Hi, following up about your Google application...   
3  Cross-cultural ceremony design. Join: virtualr...   
4  Trending: "cooking" (258 comments). View: supp...   

                                                text      category  \
0  Anniversary Special: Buy one get one free As o...    promotions   
1  Your Amazon was used on new device Your $5000 ...          spam   
2  Re: Your Google inquiry Hi, follo

Check if any empty fields present

In [6]:
print(df.columns)
print(df.isnull().sum())


Index(['id', 'subject', 'body', 'text', 'category', 'category_id'], dtype='object')
id             0
subject        0
body           0
text           0
category       0
category_id    0
dtype: int64


Clean the datasets like (Removal of HTML tags, special characters, extra spaces, URLs, signature (basic), numbers, convert to lowercase, )

In [10]:
from datasets import load_dataset
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def clean_email(text):

    # Convert to string
    text = str(text)

    # Remove HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove email signatures (simple rule)
    text = re.sub(r'(--\s.*)', '', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove special characters & numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing the cleaned text (fully NLP-ready version for ML) like (Tokenization, Stopword removal, Lemmatization)

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab', quiet=True)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join back to string
    return " ".join(tokens)

In [14]:
# Clean raw text
df['clean_text'] = df['text'].apply(clean_email)

# Apply preprocessing
df['processed_text'] = df['clean_text'].apply(preprocess_text)

print(df[['text', 'processed_text']].head())


                                                text  \
0  Anniversary Special: Buy one get one free As o...   
1  Your Amazon was used on new device Your $5000 ...   
2  Re: Your Google inquiry Hi, following up about...   
3  Digital Ritual Experience Creation Cross-cultu...   
4  Your post was moved to "Programming Help" Tren...   

                                      processed_text  
0  anniversary special buy one get one free loyal...  
1  amazon used new device refund processed claim ...  
2  google inquiry hi following google application...  
3  digital ritual experience creation crosscultur...  
4  post moved programming help trending cooking c...  


Label the cleaned and preprocessed (complaint->0, request->1, feedback->2, spam->3)

In [16]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

df['encoded_label'] = label_encoder.fit_transform(df['category'])

print(df[['category', 'encoded_label']].head())

       category  encoded_label
0    promotions              1
1          spam              3
2          spam              3
3  social_media              2
4         forum              0


convert the processed dataset into csv

In [17]:
df.to_csv("clean_email_dataset.csv", index=False)


Optional (if you want can store in drive)

In [19]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [20]:
df.to_csv("/content/drive/MyDrive/clean_email_dataset.csv", index=False)
