SpamAssassin preprocessing and combining with Enron

In [None]:
import os
import pandas as pd

# Base directory of the SpamAssassin corpus; the "easy_ham", "hard_ham" and
# "spam" sub-folder names are joined onto it when the emails are loaded.
SA_BASE = "spamsassasin"   
def load_spamassassin_texts(folder):
    """Load the body text of every email file stored directly in ``folder``.

    Entries are visited in sorted filename order. Each file is decoded as
    latin-1, everything up to and including the first blank line (the header
    block) is dropped, and the remaining body is stripped of surrounding
    whitespace. A file that contains no blank line is kept whole.

    Args:
        folder: Path of the directory holding one email per file.

    Returns:
        list[str]: One stripped body per regular file that could be read.

    Raises:
        OSError: If ``folder`` itself cannot be listed.
    """
    texts = []
    for fname in sorted(os.listdir(folder)):
        fpath = os.path.join(folder, fname)
        # Sub-directories and other non-regular entries are not emails.
        if not os.path.isfile(fpath):
            continue
        try:
            with open(fpath, "r", encoding="latin-1") as f:
                raw = f.read()
        except OSError:
            # Best-effort load: skip files that cannot be opened or read, but
            # let any other (programming) error surface instead of hiding it.
            continue
        # Drop headers similar to Enron: split at the first blank line.
        _headers, separator, body = raw.partition("\n\n")
        texts.append((body if separator else raw).strip())
    return texts

# Read the three SpamAssassin sub-folders in a fixed order:
# easy ham, hard ham, then spam.
easy, hard, spam = (
    load_spamassassin_texts(os.path.join(SA_BASE, subfolder))
    for subfolder in ("easy_ham", "hard_ham", "spam")
)

print("easy:", len(easy), "hard:", len(hard), "spam:", len(spam))

# One row per email; both ham folders share the "ham" label and come first.
df_spam = pd.DataFrame({
    "clean_text": [*easy, *hard, *spam],
    "category": ["ham" for _ in range(len(easy) + len(hard))] + ["spam" for _ in spam],
})


easy: 2551 hard: 250 spam: 501


In [None]:
# Load enron_clean.csv and report how many rows were read.
df = pd.read_csv("enron_clean.csv")
print("Enron current emails:", len(df))

# Keep only emails whose text is longer than 20 characters, and tag every
# remaining row with the placeholder category "unknown".
df = df[df["clean_text"].str.len() > 20].assign(category="unknown")

# Retain just the two columns used downstream, with a fresh 0..n-1 index.
df_enron = df.loc[:, ["clean_text", "category"]].reset_index(drop=True)
print("Enron after filtering:", len(df_enron))


Enron current emails: 510504
Enron after filtering: 510504


In [18]:
# Preview the first rows of the filtered Enron frame.
df_enron.head()


Unnamed: 0,clean_text,category
0,Traveling to have a business meeting takes the...,unknown
1,test successful. way to go!!!,unknown
2,"Randy,\n Can you send me a schedule of the sal...",unknown
3,Let's shoot for Tuesday at 11:45.,unknown
4,"Greg,\n How about either next Tuesday or Thurs...",unknown


In [19]:
# (rows, columns) of the filtered Enron frame.
df_enron.shape

(510504, 2)

In [21]:
# Assemble the labelled SpamAssassin frame: easy and hard ham rows first
# (both labelled "ham"), followed by the spam rows.
df_spam = pd.DataFrame({
    "clean_text": [*easy, *hard, *spam],
    "category": ["ham" for _ in range(len(easy) + len(hard))] + ["spam" for _ in spam],
})

print(df_spam.shape)
df_spam.head()

(3302, 2)


Unnamed: 0,clean_text,category
0,"Date: Wed, 21 Aug 2002 10:54:46 -0500\n...",ham
1,"Martin A posted:\nTassos Papadopoulos, the Gre...",ham
2,Man Threatens Explosion In Moscow \n\nThursday...,ham
3,Klez: The Virus That Won't Die\n \nAlready the...,ham
4,"On Wed Aug 21 2002 at 15:46, Ulises Ponce wrot...",ham


In [22]:
# Stack the Enron rows on top of the SpamAssassin rows and renumber the index
# so it runs 0..n-1 across the combined frame.
df_combined = pd.concat([df_enron, df_spam], ignore_index=True)

print("Combined shape:", df_combined.shape)
# Fixed seed so the previewed rows are the same on every Restart & Run All.
df_combined.sample(5, random_state=42)

Combined shape: (513806, 2)


Unnamed: 0,clean_text,category
273827,it is tough to watch because they cant stop sh...,unknown
267734,Congratulations on making this not-unexpected ...,unknown
121855,"Hey Buddy, would you change my email address t...",unknown
18255,"Dear InvestorPlace.com Member,\nI'm about to r...",unknown
224860,Attached are the BGE filings at the US Distric...,unknown


In [23]:
# Shapes of the two source frames and of their concatenation, one per line.
print(df_enron.shape, df_spam.shape, df_combined.shape, sep="\n")

(510504, 2)
(3302, 2)
(513806, 2)


Combined Dataset Preprocessing

Step 1: Adding automatic category labels

In [None]:
import re

# Keyword lists
# Each entry is a regular expression that classify_category searches for in
# the lower-cased email text; \b anchors restrict matches to whole words.
request_keywords = [
    r"\bcan you\b", r"\bcould you\b", r"\bplease\b", r"\bhelp me\b",
    r"\bcan u\b", r"\bneed\b", r"\brequest\b"
]

# NOTE(review): because of the trailing \b these match only the exact word,
# e.g. r"\bcomplain\b" does not match "complaint" and r"\bissue\b" does not
# match "issues" -- confirm whether inflected forms should count.
complaint_keywords = [
    r"\bnot working\b", r"\bissue\b", r"\bproblem\b", r"\berror\b",
    r"\bwrong\b", r"\bfailure\b", r"\bcomplain\b", r"\brefund\b"
]

feedback_keywords = [
    r"\bthank you\b", r"\bthanks\b", r"\bappreciate\b",
    r"\bgreat\b", r"\bgood\b", r"\bfeedback\b"
]

def classify_category(text):
    """Assign a rule-based category to one email text.

    The text is lower-cased and tested against the keyword lists in a fixed
    priority order: request, then complaint, then feedback. The first list
    with any matching pattern decides the label.

    Args:
        text: The email text. Non-string values (for example ``None`` or the
            NaN pandas uses for missing text) are accepted.

    Returns:
        str: ``"request"``, ``"complaint"``, ``"feedback"``, or ``"other"``
        when no pattern matches or ``text`` is not a string.
    """
    # Missing values contain no keywords; label them instead of crashing on
    # the .lower() call below.
    if not isinstance(text, str):
        return "other"

    t = text.lower()

    # Order matters: an email matching several lists takes the earliest label.
    rules = (
        ("request", request_keywords),
        ("complaint", complaint_keywords),
        ("feedback", feedback_keywords),
    )
    for label, patterns in rules:
        if any(re.search(p, t) for p in patterns):
            return label

    return "other"

# Rows already labelled "spam" or "ham" keep that label; every other row
# (the Enron rows, category "unknown") takes the rule-based label instead.
# Series.where keeps the original value where the condition is True and uses
# the second argument elsewhere, which avoids a slow row-by-row apply(axis=1).
df_combined['auto_category'] = df_combined['category'].where(
    df_combined['category'].isin(['spam', 'ham']),
    df_combined['clean_text'].map(classify_category),
)

# Label distribution of the new column (at most the 10 most frequent labels).
df_combined['auto_category'].value_counts().head(10)


auto_category
request      287686
other        144013
feedback      60867
complaint     17938
ham            2801
spam            501
Name: count, dtype: int64

Step 2: Adding automatic urgency labels

In [26]:
import re

# Patterns are compiled once here rather than on every call; both are searched
# against lower-cased text.
_HIGH_URGENCY_RE = re.compile(
    r"\burgent\b|\basap\b|\bimmediately\b|\bcritical\b|\bsevere\b|\bimportant\b|\bnot working\b|\bdeadline\b"
)
_MEDIUM_URGENCY_RE = re.compile(
    r"\bplease\b|\bsoon\b|\bneed\b|\bcan you\b|\bcould you\b|\brequest\b"
)


def detect_urgency(text):
    """Assign a rule-based urgency level to one email text.

    The lower-cased text is checked for high-urgency keywords first, then for
    medium-urgency keywords; the first group that matches decides the level.

    Args:
        text: The email text. Non-string values (for example ``None`` or the
            NaN pandas uses for missing text) are accepted.

    Returns:
        str: ``"high"``, ``"medium"``, or ``"low"`` when nothing matches or
        ``text`` is not a string.
    """
    # Missing values contain no keywords; treat them as low urgency instead of
    # crashing on the .lower() call below.
    if not isinstance(text, str):
        return "low"

    t = text.lower()

    # High urgency signals take precedence over medium ones.
    if _HIGH_URGENCY_RE.search(t):
        return "high"

    # Medium urgency: requires action but not immediate.
    if _MEDIUM_URGENCY_RE.search(t):
        return "medium"

    # Otherwise low.
    return "low"

# Label every email in the combined frame with its rule-based urgency level.
df_combined['auto_urgency'] = df_combined['clean_text'].map(detect_urgency)

# Number of emails per urgency level.
df_combined['auto_urgency'].value_counts()


auto_urgency
medium    245930
low       208814
high       59062
Name: count, dtype: int64

In [29]:
# Write the labelled dataset to disk without the index column, then confirm
# how many rows were written.
df_combined.to_csv("final_email_dataset.csv", index=False)
print("Saved final_email_dataset.csv with", df_combined.shape[0], "rows")


Saved final_email_dataset.csv with 513806 rows


In [30]:
# Read the saved file back from disk to check what was written.
d_f = pd.read_csv("final_email_dataset.csv")

In [31]:
# Preview the first rows of the reloaded dataset.
d_f.head()

Unnamed: 0,clean_text,category,auto_category,auto_urgency
0,Traveling to have a business meeting takes the...,unknown,other,low
1,test successful. way to go!!!,unknown,other,low
2,"Randy,\n Can you send me a schedule of the sal...",unknown,request,medium
3,Let's shoot for Tuesday at 11:45.,unknown,other,low
4,"Greg,\n How about either next Tuesday or Thurs...",unknown,other,low


In [33]:
# (rows, columns) of the reloaded dataset.
d_f.shape

(513806, 4)