In [11]:
import os
from dotenv import load_dotenv
from datasets import load_dataset
from huggingface_hub import login

In [12]:
load_dotenv()


True

In [13]:
# Read the Hugging Face access token from the environment; os.getenv returns
# None when the variable is not set.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")

# Pass the token through so it is actually used when talking to the Hub
# (previously it was read and then ignored). token=None keeps the library's
# default behaviour, so the cell still works without a token.
ds = load_dataset(
    "jason23322/high-accuracy-email-classifier",
    token=HUGGINGFACE_TOKEN,
)

In [14]:
import pandas as pd

In [15]:
# Convert each split to a pandas DataFrame with the library's own converter
# rather than pd.DataFrame(...), which walks the dataset one row at a time.
train_data = ds["train"].to_pandas()
test_data = ds["test"].to_pandas()

In [16]:
train_data.head()

Unnamed: 0,id,subject,body,text,category,category_id
0,promotions_582,Anniversary Special: Buy one get one free,"As our loyal customer, get exclusive $60 off $...",Anniversary Special: Buy one get one free As o...,promotions,1
1,spam_1629,Your Amazon was used on new device,Your $5000 refund is processed. Claim: bit.ly/...,Your Amazon was used on new device Your $5000 ...,spam,3
2,spam_322,Re: Your Google inquiry,"Hi, following up about your Google application...","Re: Your Google inquiry Hi, following up about...",spam,3
3,social_media_80,Digital Ritual Experience Creation,Cross-cultural ceremony design. Join: virtualr...,Digital Ritual Experience Creation Cross-cultu...,social_media,2
4,forum_1351,"Your post was moved to ""Programming Help""","Trending: ""cooking"" (258 comments). View: supp...","Your post was moved to ""Programming Help"" Tren...",forum,0



#### Label Distribution

In [17]:
train_data['category'].value_counts()

category
forum           1800
verify_code     1800
promotions      1796
social_media    1796
spam            1794
updates         1794
Name: count, dtype: int64

#### Checking for Null Values

In [18]:
train_data.isna().sum()

id             0
subject        0
body           0
text           0
category       0
category_id    0
dtype: int64

DataFrame summary (dtypes and non-null counts)


In [19]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10780 entries, 0 to 10779
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           10780 non-null  object
 1   subject      10780 non-null  object
 2   body         10780 non-null  object
 3   text         10780 non-null  object
 4   category     10780 non-null  object
 5   category_id  10780 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 505.4+ KB


Category Analysis

In [20]:
train_data['category'].unique()

array(['promotions', 'spam', 'social_media', 'forum', 'verify_code',
       'updates'], dtype=object)

Dataset Overview


In [21]:
# Report (rows, columns) of the training frame.
print(f"Dataset Shape: {train_data.shape}")

Dataset Shape: (10780, 6)


In [22]:
train_data.columns.tolist()

['id', 'subject', 'body', 'text', 'category', 'category_id']

In [23]:
train_data.dtypes

id             object
subject        object
body           object
text           object
category       object
category_id     int64
dtype: object

In [24]:
train_data['subject'].describe()

count                          10780
unique                          2575
top       Special offer just for you
freq                             107
Name: subject, dtype: object

Imbalance ratio

In [25]:
# Ratio of the most frequent category to the least frequent one;
# a value of 1.0 would mean the classes are perfectly balanced.
count = train_data['category'].value_counts()
largest_class, smallest_class = count.max(), count.min()
largest_class / smallest_class

1.0033444816053512

Word Frequency Analysis

In [26]:
from collections import Counter
import re

# Compile the word pattern once rather than on every loop iteration.
WORD_PATTERN = re.compile(r'\b\w+\b')

# For each category, print the 20 most frequent lowercase word tokens
# found in the concatenated 'text' of that category's emails.
for category in train_data['category'].unique():
    in_category = train_data['category'] == category
    corpus = ' '.join(train_data.loc[in_category, 'text'])
    tokens = WORD_PATTERN.findall(corpus.lower())
    top_words = Counter(tokens).most_common(20)
    print(f"\n{category}: {top_words}")


promotions: [('com', 1780), ('off', 1621), ('example', 1594), ('code', 1216), ('shop', 693), ('items', 656), ('now', 599), ('for', 588), ('sale', 561), ('free', 530), ('get', 512), ('clearance', 464), ('extra', 444), ('ends', 434), ('with', 431), ('use', 431), ('time', 427), ('to', 407), ('new', 388), ('exclusive', 368)]

spam: [('your', 1307), ('com', 1155), ('now', 745), ('delivery', 649), ('account', 645), ('scam', 613), ('site', 559), ('phishing', 555), ('bit', 542), ('ly', 542), ('fakeprize', 542), ('claim', 525), ('update', 503), ('within', 487), ('payment', 470), ('refund', 405), ('you', 396), ('to', 376), ('or', 339), ('in', 338)]

social_media: [('com', 2067), ('social', 1204), ('platform', 905), ('group', 719), ('in', 676), ('new', 550), ('view', 508), ('see', 473), ('you', 471), ('live', 448), ('your', 434), ('from', 413), ('tech', 398), ('notifications', 371), ('profile', 344), ('post', 342), ('friends', 340), ('and', 331), ('comments', 310), ('reply', 308)]

forum: [('thr

Text length analysis

In [27]:
# Per-email length features: whitespace-separated word count and character count.
text_column = train_data['text']
train_data['word'] = text_column.str.split().str.len()
train_data['char'] = text_column.str.len()

word_counts = train_data['word']

print("TEXT LENGTH STATISTICS")
print("\nWord Count:")
print(word_counts.describe())

# Word-count summary broken down by category.
print("\nBy Category:")
per_category = train_data.groupby('category')['word'].agg(['mean', 'median', 'min', 'max'])
print(per_category)

# Count emails at the extremes of the length distribution.
n_very_short = (word_counts < 10).sum()
n_very_long = (word_counts > 500).sum()
print(f"\nVery short emails (<10 words): {n_very_short}")
print(f"Very long emails (>500 words): {n_very_long}")


TEXT LENGTH STATISTICS

Word Count:


count    10780.000000
mean        16.033024
std          6.412992
min          7.000000
25%         13.000000
50%         16.000000
75%         18.000000
max        236.000000
Name: word, dtype: float64

By Category:
                   mean  median  min  max
category                                 
forum         13.054444    12.0    7   72
promotions    18.075167    18.0    7  231
social_media  14.866927    15.0    7   79
spam          18.227425    18.0    8  192
updates       16.189521    16.0    7  236
verify_code   15.794444    15.0    7  209

Very short emails (<10 words): 501
Very long emails (>500 words): 0


In [28]:
# Emails with fewer than 10 words, based on the 'word' column of train_data.
short_emails = train_data[train_data['word'] < 10]

print("Short email dist by category:")
print(short_emails['category'].value_counts())

print("\nSAMPLES OF VERY SHORT EMAILS")
# Show at most 10 examples. A fixed random_state makes the sampled rows (and
# therefore this cell's output) identical on every Restart & Run All.
n_samples = min(10, len(short_emails))
for _, row in short_emails.sample(n_samples, random_state=42).iterrows():
    print(f"\nCategory: {row['category']}")
    print(f"Length: {row['word']} words")
    print(f"Text: '{row['text']}'")
    print("-" * 60)

Short email dist by category:
category
forum           293
social_media     82
updates          73
verify_code      38
spam              8
promotions        7
Name: count, dtype: int64

SAMPLES OF VERY SHORT EMAILS

Category: forum
Length: 9 words
Text: 'New featured guide: books {event} starts {time}. Prepare: support.site/ticket/456.'
------------------------------------------------------------

Category: social_media
Length: 7 words
Text: 'Live Shopping Event Limited-edition jewelry. Join: social.com/live/shopping-event.'
------------------------------------------------------------

Category: forum
Length: 8 words
Text: 'AMA session: Tech CEO User: TechGuru99 thread. wiki.site/page/789.'
------------------------------------------------------------

Category: updates
Length: 9 words
Text: 'Payment received: Invoice INV-2345 Confirm/cancel: service.com/status Address (same building).'
------------------------------------------------------------

Category: forum
Length: 9 words
Text: '

## Preprocessing


Making a working copy of the original train/test data

In [29]:

import numpy as np

In [30]:
# Re-materialise untouched frames from the loaded dataset so preprocessing
# starts from the original columns. Dataset.to_pandas() is the library's own
# converter and avoids pd.DataFrame(...) walking the dataset row by row.
org_train = ds["train"].to_pandas()
org_test = ds["test"].to_pandas()

In [31]:
# Work on copies so org_train / org_test stay unmodified for reference.
train, test = org_train.copy(), org_test.copy()

In [32]:
len(train)

10780

In [33]:
len(test)

2697

Removing missing values

In [34]:
# Keep only rows where both the email text and its label are present.
has_text_and_label = train[['text', 'category']].notna().all(axis=1)
train = train[has_text_and_label]
print(f"After removing NaN: {len(train)} emails")

After removing NaN: 10780 emails


In [35]:
# Drop rows whose text is empty once surrounding whitespace is stripped.
has_content = train['text'].str.strip().ne('')
train = train.loc[has_content]
print(f"After removing empty strings: {len(train)} emails")

After removing empty strings: 10780 emails
