In [86]:

import numpy as np
import pandas as pd


In [87]:
import os
from dotenv import load_dotenv
from datasets import load_dataset
from huggingface_hub import login

In [88]:
load_dotenv()
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
ds = load_dataset("jason23322/high-accuracy-email-classifier")

making copies

In [89]:
org_train= pd.DataFrame(ds["train"])
org_test= pd.DataFrame(ds["test"])

In [90]:
train=org_train.copy()
test=org_test.copy()

removing missing value

In [91]:
len(train)

10780

In [92]:
len(test)

2697

In [93]:
train = train.dropna(subset=['text', 'category'])
print(f"After removing NaN: {len(train)} emails")

After removing NaN: 10780 emails


In [94]:
train = train[train['text'].str.strip() != '']
print(f"After removing empty strings: {len(train)} emails")

After removing empty strings: 10780 emails


In [95]:
test = test.dropna(subset=['text', 'category'])
print(f"After removing NaN: {len(test)} emails")

After removing NaN: 2697 emails


In [96]:
test = test[test['text'].str.strip() != '']
print(f"After removing empty strings: {len(test)} emails")

After removing empty strings: 2697 emails


Handle duplicate mails

In [97]:
dup_train = train['text'].duplicated().sum()
print(f"duplicate emails:{dup_train}")

duplicate emails:707


In [98]:
train = train.drop_duplicates(subset=['text'], keep='first')
print(f"After removing duplicate  emails: {len(train)}")

After removing duplicate  emails: 10073


In [99]:
dup_test = test['text'].duplicated().sum()
print(f"duplicate emails in test set: {dup_test}")

duplicate emails in test set: 53


In [100]:
test = test.drop_duplicates(subset=['text'], keep='first')
print(f"After removing duplicate  emails: {len(test)}")

After removing duplicate  emails: 2644


In [101]:
len(train), len(test)

(10073, 2644)

html tags removal


In [102]:
from bs4 import BeautifulSoup
import re

In [103]:

def rem_html(text):
    try:
        sp = BeautifulSoup(text, 'html.parser')
        text = sp.get_text(separator=' ')
    except:
        text = re.sub(r'<[^>]+>', ' ', text)
    return text

In [104]:
count_train = train['text'].str.contains('<html|<div|<p|<br', case=False, na=False).sum()
print(f"Emails with HTML tags: {count_train}")

Emails with HTML tags: 0


In [105]:
count_test= train['text'].str.contains('<html|<div|<p|<br', case=False, na=False).sum()
print(f"Emails with HTML tags: {count_test}")

Emails with HTML tags: 0


handling email signatures

In [106]:
sign_patterns = [

    r'sent from my iphone',
    r'sent from my ipad',
    r'sent from my android',
    r'sent from my samsung',
    r'sent from my mobile',
    r'sent via my mobile',
    r'get outlook for ios',
    r'get outlook for android',

    r'best regards[\s\S]*', 
    r'kind regards[\s\S]*',
    r'warm regards[\s\S]*',
    r'with regards[\s\S]*',
    r'yours sincerely[\s\S]*',
    r'sincerely yours[\s\S]*',
    r'sincerely[\s\S]*',

    r'best wishes[\s\S]*',
    r'many thanks[\s\S]*',
    r'thanks and regards[\s\S]*',
    r'thanks in advance[\s\S]*',
    r'thank you in advance[\s\S]*',
    r'cheers[\s\S]*',

    r'regards[\s\S]*',
    r'yours truly[\s\S]*',
    r'yours faithfully[\s\S]*',
    r'respectfully[\s\S]*',

    r'this email (is|was) (sent|generated)[\s\S]*',
    r'confidentiality notice[\s\S]*',
    r'disclaimer[\s\S]*',
    r'this message contains confidential[\s\S]*',
    r'unsubscribe[\s\S]*',
    
]

In [107]:
def rem_sign(txt):
    for pattern in sign_patterns:
        txt = re.sub(pattern, '', txt, flags=re.IGNORECASE)
    txt = re.sub(r'\n\s*\n', '\n', txt)
    txt = re.sub(r'\s+', ' ', txt)
    txt = txt.strip()
    return txt

In [108]:
def count_sign(column, patterns):
    
    tmatches = 0
    pat_match_count= {}
    
    for pattern in patterns:
        count = column.apply(
            lambda text: bool(re.search(pattern, str(text), flags=re.IGNORECASE))
        ).sum()
        
        if count > 0:
            pat_match_count[pattern] = count
            tmatches += count
    
    return tmatches, pat_match_count

In [109]:
bef_count_train, pat_details_train = count_sign(
    train['text'], 
    sign_patterns)
print(f"\n emails with signatures: {bef_count_train}")


 emails with signatures: 164


In [110]:
bef_count_test, pat_details_test = count_sign(
    test['text'], 
    sign_patterns)
print(f"\n emails with signatures: {bef_count_test}")


 emails with signatures: 38


In [111]:
train['text'] = train['text'].apply(rem_sign)

In [112]:
af_count_train, rem_train = count_sign(
    train['text'], 
    sign_patterns
)
print(f"\nemailswith signatures: {af_count_train}")


emailswith signatures: 0


In [113]:
test['text'] = test['text'].apply(rem_sign)

In [114]:
af_count_test, rem_test = count_sign(
    test['text'], 
    sign_patterns
)
print(f"\nemails with signatures: {af_count_test}")


emails with signatures: 0


Email thread removal

In [115]:
thread_pat = [

    r'-{3,}\s*original message\s*-{3,}[\s\S]*',   
    r'-{3,}\s*forwarded message\s*-{3,}[\s\S]*',  
    r'-{3,}\s*reply message\s*-{3,}[\s\S]*',    
    r'_{3,}\s*[\s\S]*',                         
    r'on\s+\w+,?\s+\w+\s+\d+,?\s+\d{4}.*?wrote:[\s\S]*',
    r'on\s+\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4}.*?wrote:[\s\S]*',
    r'on\s+\w+\s+\d+.*?at\s+\d+:\d+.*?wrote:[\s\S]*',
    r'(from|de)\s*:.*\n(to|para)\s*:.*\n[\s\S]*',
    r'from\s*:.*\n(sent|date)\s*:.*\n[\s\S]*',
    r'begin forwarded message[\s\S]*',
    r'forwarded by[\s\S]*',
    r'---------- forwarded message ---------[\s\S]*',

    
    r'(?m)^>+.*$',        

]

In [116]:
def count_thread(df_column, patterns):
    def has_thread(text):
        for pat in patterns:
            if re.search(pat, str(text), flags=re.IGNORECASE):
                return True  
        return False
    count = df_column.apply(has_thread).sum()
    return count

In [117]:
def remove_email_threads(txt):
    for pattern in thread_pat:
        txt = re.sub(pattern, '', txt, flags=re.IGNORECASE | re.MULTILINE)

    txt = re.sub(r'\n\s*\n', '\n', txt)
    txt = re.sub(r'\s+', ' ', txt)

    return txt.strip()

In [118]:
btrain = count_thread(train['text'], thread_pat)
print(f"  Emails with thread:{btrain}")

  Emails with thread:0


In [119]:
btest = count_thread(test['text'], thread_pat)
print(f"  Emails with thread:{btest}")

  Emails with thread:0


In [120]:
btrain = count_thread(train['text'], thread_pat)
print(f"  Emails with thread:{btrain}")

  Emails with thread:0


Expand Contractions 

In [121]:
import contractions

In [122]:
def expand_contractions(text):
    return contractions.fix(text)

In [123]:
contra_train = train['text'].str.contains(
    "don't|can't|won't|isn't|aren't|couldn't|i'm|it's",
    case=False, na=False
).sum()

print(f"\n emails with contractions: {contra_train}")


 emails with contractions: 365


In [124]:
contra_test = test['text'].str.contains(
    "don't|can't|won't|isn't|aren't|couldn't|i'm|it's",
    case=False, na=False
).sum()

print(f"\n emails with contractions: {contra_test}")


 emails with contractions: 95


In [125]:
train['text'] = train['text'].apply(expand_contractions)
test['text'] = test['text'].apply(expand_contractions)

In [126]:
contra_train = train['text'].str.contains(
    "don't|can't|won't|isn't|aren't|couldn't|i'm|it's",
    case=False, na=False
).sum()

print(f"\n emails with contractions: {contra_train}")


 emails with contractions: 0


In [127]:
contra_test = test['text'].str.contains(
    "don't|can't|won't|isn't|aren't|couldn't|i'm|it's",
    case=False, na=False
).sum()

print(f"\n emails with contractions: {contra_test}")


 emails with contractions: 0


Remove URLs & Email Addresses

In [128]:

url_train   = train['text'].str.contains(
              r'https?://|www\.', case=False, na=False).sum()

email_train= train['text'].str.contains(
              r'\S+@\S+\.\S+', case=False, na=False).sum()

print(f"URLs : {url_train}")
print(f"Email : {email_train} ")

URLs : 0
Email : 328 


In [129]:

url_test  = test['text'].str.contains(
              r'https?://|www\.', case=False, na=False).sum()

email_test= test['text'].str.contains(
              r'\S+@\S+\.\S+', case=False, na=False).sum()

print(f"URLs : {url_test} ")
print(f"Email : {email_test} ")

URLs : 0 
Email : 85 


In [130]:
def rem_email_addr(txt):
    
    txt = re.sub(r'\S+@\S+\.\S+', ' ', txt)
    
    txt = re.sub(r'\s+', ' ', txt)
    
    return txt.strip()

In [131]:
train['text'] = train['text'].apply(rem_email_addr)

e_train = train['text'].str.contains(
                    r'\S+@\S+\.\S+', case=False, na=False).sum()
print(f"Email : {e_train} ")


Email : 0 


In [132]:
test['text'] = test['text'].apply(rem_email_addr)

e_test = test['text'].str.contains(
                    r'\S+@\S+\.\S+', case=False, na=False).sum()
print(f"Email : {e_test} ")

Email : 0 


Special char removal

In [133]:
sp_char_train = train['text'].str.contains(
                      r'[^a-zA-Z0-9\s\.\,\!\?\-\']', na=False).sum()
print(f"special chars in email: {sp_char_train} ")

special chars in email: 10030 


In [134]:
sp_char_test = test['text'].str.contains(
                      r'[^a-zA-Z0-9\s\.\,\!\?\-\']', na=False).sum()
print(f"special chars in email: {sp_char_test} ")

special chars in email: 2630 


In [135]:
def rem_sp_chars(txt):
    txt = re.sub(r'[^a-zA-Z0-9\s\.\,\!\?\-\']', ' ', txt)
    txt = re.sub(r'\s+', ' ', txt)
    
    return txt.strip()

In [136]:
train['text'] = train['text'].apply(rem_sp_chars)

In [137]:
sp_char_train = train['text'].str.contains(
                      r'[^a-zA-Z0-9\s\.\,\!\?\-\']', na=False).sum()
print(f"special chars in email: {sp_char_train} ")

special chars in email: 0 


In [138]:
test['text'] = test['text'].apply(rem_sp_chars)


In [139]:
sp_char_test= test['text'].str.contains(
                      r'[^a-zA-Z0-9\s\.\,\!\?\-\']', na=False).sum()
print(f"special chars in email: {sp_char_test} ")

special chars in email: 0 


Lowercasing

In [140]:
print(train['text'].head())

0    Anniversary Special Buy one get one free As ou...
1    Your Amazon was used on new device Your 5000 r...
2    Re Your Google inquiry Hi, following up about ...
3    Digital Ritual Experience Creation Cross-cultu...
4    Your post was moved to Programming Help Trendi...
Name: text, dtype: object


In [141]:
train['text'] = train['text'].str.lower()

In [142]:
print(train['text'].head())

0    anniversary special buy one get one free as ou...
1    your amazon was used on new device your 5000 r...
2    re your google inquiry hi, following up about ...
3    digital ritual experience creation cross-cultu...
4    your post was moved to programming help trendi...
Name: text, dtype: object


In [143]:
print(test['text'].head())

0    Watch later Recommended story Group update Boo...
1    News from groups you follow Group Tech Enthusi...
2    Two-Factor Authentication Enforcement Notice R...
3    Security upgrade 2FA enabled Your monthly stat...
4    Verification PIN 907472 Use 404583 as your ver...
Name: text, dtype: object


In [144]:
test['text'] = test['text'].str.lower()

In [145]:
print(test['text'].head())

0    watch later recommended story group update boo...
1    news from groups you follow group tech enthusi...
2    two-factor authentication enforcement notice r...
3    security upgrade 2fa enabled your monthly stat...
4    verification pin 907472 use 404583 as your ver...
Name: text, dtype: object


whitespace normalizing

In [146]:
space_train = train['text'].str.contains(
                   r'  +|\t|\n', na=False).sum()
print(f"whitespace in email: {space_train} ")

whitespace in email: 0 


In [147]:
space_test = test['text'].str.contains(
                   r'  +|\t|\n', na=False).sum()
print(f"whitespace in email: {space_test} ")

whitespace in email: 0 


stopword removal

In [148]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

True

In [149]:
standard_sword= set(stopwords.words('english'))

In [150]:
neg_word= {
    'no', 'not', 'nor', 'never', 'neither',
    'nothing', 'nobody', 'nowhere', 'none',
    'cannot', 'without', 'against',
}

keep_word= {
    'very', 'too', 'much', 'more', 'most',
    'up', 'down', 'off', 'out', 'over', 'under',
    'above', 'below', 'few',
}

In [151]:
all_keep_word= neg_word | keep_word
final_stopword= standard_sword - all_keep_word

In [152]:
def rem_stopword(text):
    if not text or not isinstance(text, str):
        return text
    words = text.split()
    filtered = [w for w in words if w not in final_stopword]
    return ' '.join(filtered) if filtered else ''

In [153]:
before_train=train['text'].str.split().str.len().mean()
print(before_train)

17.48634964757272


In [154]:
before_test=test['text'].str.split().str.len().mean()
print(before_test)

17.824886535552192


In [155]:
train['text'] = train['text'].apply(rem_stopword)

after_train=train['text'].str.split().str.len().mean()
print(after_train)

15.007942023230418


In [156]:
test['text'] = test['text'].apply(rem_stopword)

after_test=test['text'].str.split().str.len().mean()
print(after_test)

15.36232980332829


Remove Short Emails

In [157]:
short_train = (train['text'].str.split().str.len() <5).sum()
short_test = (test['text'].str.split().str.len() < 5).sum()
print(short_train)
print(short_test)

0
0


Reset index

In [158]:

train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

Tokenization

In [159]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [160]:
tfidf = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1,2),
    min_df=5,
    max_df=0.95,
    sublinear_tf=True
)

In [161]:
X_train = tfidf.fit_transform(train['text'])
print(f"email,feature: {X_train.shape[0]},{X_train.shape[1]}")

email,feature: 10073,4263


In [162]:
X_test = tfidf.transform(test['text'])
print(f"email,feature: {X_test.shape[0]},{X_test.shape[1]}")

email,feature: 2644,4263


Label Encoding

In [163]:
from sklearn.preprocessing import LabelEncoder

In [164]:
le = LabelEncoder()
y_train = le.fit_transform(train['category'])
y_test  = le.transform(test['category'])

In [165]:
for label, encoded in zip(le.classes_, range(len(le.classes_))):
    print(f"  '{label}' → {encoded}")

  'forum' → 0
  'promotions' → 1
  'social_media' → 2
  'spam' → 3
  'updates' → 4
  'verify_code' → 5


In [166]:

print(f"  y_train: {y_train[:5]}")
print(f"  y_test: {y_test[:5]}")


  y_train: [1 3 3 2 0]
  y_test: [2 2 0 4 5]


In [167]:
import numpy as np
uni, count = np.unique(y_train, return_counts=True)
for u, c in zip(uni, count):
    print(f"  {le.classes_[u]} ({u}): {c} ({c/len(y_train)*100:.1f}%)")

  forum (0): 1572 (15.6%)
  promotions (1): 1765 (17.5%)
  social_media (2): 1689 (16.8%)
  spam (3): 1719 (17.1%)
  updates (4): 1559 (15.5%)
  verify_code (5): 1769 (17.6%)


Saving data

In [168]:
from scipy import sparse
sparse.save_npz("X_train.npz", X_train)
sparse.save_npz("X_test.npz", X_test)

np.save("y_train.npy", y_train)
np.save("y_test.npy", y_test)

In [169]:
import joblib
joblib.dump(tfidf, "tfidf_vector.pkl")
joblib.dump(le, "label_encoder.pkl")


['label_encoder.pkl']