load the data

In [72]:
import os

def load_emails_from_folder(folder_path):
    emails = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):
            with open(file_path, encoding='latin-1') as f: 
                emails.append(f.read())
    return emails

# load the data sets
spam_emails = load_emails_from_folder(r'C:\Users\alexa\Documents\4. Semester Mechatronik-2025\Machine_Learning_und_Data_Science\mlds_spam_filter\data\spam')
ham_emails = load_emails_from_folder(r'C:\Users\alexa\Documents\4. Semester Mechatronik-2025\Machine_Learning_und_Data_Science\mlds_spam_filter\data\ham')


formate the emails

In [73]:
import re
import string
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

# Download necessary NLTK components
nltk.download('punkt_tab') 
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


# Load all emails from a folder
def load_emails_from_folder(folder_path):
    emails = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if os.path.isfile(file_path):
            with open(file_path, encoding='latin-1') as f:
                emails.append(f.read())
    return emails


# Preprocess a single email by cleaning and simplifying its contents
def preprocess_email(text):

    # Remove email header (everything before the first empty line)
    if "\n\n" in text:
        text = text.split("\n\n", 1)[1]
        
    # Convert text to lowercase
    text = text.lower()

    # Replace URLs with placeholder
    text = re.sub(r'http\\S+|www\\S+|https\\S+', 'URL', text)

    # Replace numbers with placeholder
    text = re.sub(r'\\d+', 'NUMBER', text)

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize text for stemming
    words = word_tokenize(text)

    # Apply stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]

    # Remove stopwords
    words = word_tokenize(text)
    words = [w for w in words if w not in stop_words]

    return ' '.join(stemmed_words)


# === Example usage ===
spam_emails = load_emails_from_folder(
    r'C:\\Users\\alexa\\Documents\\4. Semester Mechatronik-2025\\Machine_Learning_und_Data_Science\\mlds_spam_filter\\data\\spam'
)
ham_emails = load_emails_from_folder(
    r'C:\\Users\\alexa\\Documents\\4. Semester Mechatronik-2025\\Machine_Learning_und_Data_Science\\mlds_spam_filter\\data\\ham'
)

# Apply preprocessing to each email
preprocessed_spam = [preprocess_email(email) for email in spam_emails]
preprocessed_ham = [preprocess_email(email) for email in ham_emails]

# Optional: Print example of original and preprocessed email
print("Original:\n", spam_emails[0][:500])
print("\nPreprocessed:\n", preprocessed_spam[0][:500])

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original:
 mv 1 00001.bfc8d64d12b325ff385cca8d07b84288
mv 10 00010.7f5fb525755c45eb78efc18d7c9ea5aa
mv 100 00100.c60d1c697136b07c947fa180ba3e0441
mv 101 00101.2dfd7ee79ae439b8d9c38e783a137efa
mv 102 00102.2e3969075728dde7a328e05d19b35976
mv 103 00103.8c39bfed2079f865e9dfb75f4416a468
mv 104 00104.886f4a22362f4d3528c3e675878f17f7
mv 105 00105.9790e1c57fcbf7885b7cd1719fb4681b
mv 106 00106.fa6df8609cebb6f0f37aec3f70aa5b9a
mv 107 00107.f1d4194b57840ea6587b9a73ed88e075
mv 108 00108.4506c2ef846b80b9a7beb90315b227

Preprocessed:
 mv 1 00001bfc8d64d12b325ff385cca8d07b84288 mv 10 000107f5fb525755c45eb78efc18d7c9ea5aa mv 100 00100c60d1c697136b07c947fa180ba3e0441 mv 101 001012dfd7ee79ae439b8d9c38e783a137efa mv 102 001022e3969075728dde7a328e05d19b35976 mv 103 001038c39bfed2079f865e9dfb75f4416a468 mv 104 00104886f4a22362f4d3528c3e675878f17f7 mv 105 001059790e1c57fcbf7885b7cd1719fb4681b mv 106 00106fa6df8609cebb6f0f37aec3f70aa5b9a mv 107 00107f1d4194b57840ea6587b9a73ed88e075 mv 108 001084506c2ef846b8

split data in train and test

In [74]:
from sklearn.model_selection import train_test_split

# 1. Labels: 1 for Spam, 0 for Ham
labels_spam = [1] * len(preprocessed_spam)
labels_ham = [0] * len(preprocessed_ham)

# 2. Combine all data
all_emails = preprocessed_spam + preprocessed_ham
all_labels = labels_spam + labels_ham

# 3. Split into training and test data (e.g., 80% training, 20% testing)
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    all_emails,              # Input data (emails)
    all_labels,              # Corresponding labels
    test_size=0.2,           # 20% of the data for testing
    random_state=42,         # Ensure reproducibility
    stratify=all_labels      # Maintain label distribution in both sets
)





convert each email into a feature vector

In [75]:
from collections import Counter

# 1. Tokenize (emails have already been preprocessed → simple split is enough)
tokenized_spam = [email.split() for email in preprocessed_spam]
tokenized_ham = [email.split() for email in preprocessed_ham]
all_tokenized = tokenized_spam + tokenized_ham

# 2. Build vocabulary (e.g., top 1000 most frequent words)
def build_vocabulary(tokenized_emails, vocab_size=None):
    all_tokens = []
    for tokens in tokenized_emails:
        all_tokens.extend(tokens)  # Flatten list of token lists into a single list
    word_counts = Counter(all_tokens)  # Count word frequencies
    most_common = word_counts.most_common(vocab_size)  # Get top N words
    vocabulary = [word for word, _ in most_common]  # Extract just the words
    return vocabulary

vocabulary = build_vocabulary(all_tokenized, vocab_size=1000)

# 3. Function to create feature vector from tokens
def email_to_vector(tokens, vocabulary, binary=True):
    token_counts = Counter(tokens)  # Count tokens in the email
    vector = []
    for word in vocabulary:
        if binary:
            vector.append(1 if word in token_counts else 0)  # 1 if word is present
        else:
            vector.append(token_counts[word])  # Count how many times the word appears
    return vector

# 4. Vectorize all emails (binary and count-based)
binary_vectors_spam = [email_to_vector(tokens, vocabulary, binary=True) for tokens in tokenized_spam]
binary_vectors_ham = [email_to_vector(tokens, vocabulary, binary=True) for tokens in tokenized_ham]

count_vectors_spam = [email_to_vector(tokens, vocabulary, binary=False) for tokens in tokenized_spam]
count_vectors_ham = [email_to_vector(tokens, vocabulary, binary=False) for tokens in tokenized_ham]

# 5. Example output
print("Vocabulary (Top 20):", vocabulary[:20])
print("Binary vector (first spam email):", binary_vectors_spam[0][:20])
print("Count vector (first spam email):", count_vectors_spam[0][:20])

Vocabulary (Top 20): ['the', 'to', 'a', 'and', 'of', 'in', 'td', 'for', 'is', 'you', 'it', 'that', 'i', 'thi', 'tr', 'on', 'your', 'with', 'tabl', 'be']
Binary vector (first spam email): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Count vector (first spam email): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


learn the model and plot the results

In [76]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Tokenize the training and test data
tokenized_train = [email.split() for email in X_train_texts]
tokenized_test = [email.split() for email in X_test_texts]

# Convert emails to vectors (using the same vocabulary as before)
X_train_vectors = [email_to_vector(tokens, vocabulary, binary=True) for tokens in tokenized_train]
X_test_vectors = [email_to_vector(tokens, vocabulary, binary=True) for tokens in tokenized_test]

# Define classifiers
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(C=0.1, max_iter=1000),
    "Linear SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Train and evaluate all classifiers
for name, clf in classifiers.items():
    clf.fit(X_train_vectors, y_train)               # Train the model
    y_pred = clf.predict(X_test_vectors)            # Predict on the test set
    
    print(f"\n📌 {name}:\n")
    print(classification_report(y_test, y_pred, target_names=["Ham", "Spam"]))  # Print precision, recall, F1-score
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))              # Show confusion matrix


📌 Naive Bayes:

              precision    recall  f1-score   support

         Ham       0.98      0.98      0.98       560
        Spam       0.95      0.94      0.95       201

    accuracy                           0.97       761
   macro avg       0.97      0.96      0.96       761
weighted avg       0.97      0.97      0.97       761

Confusion Matrix:
 [[551   9]
 [ 12 189]]

📌 Logistic Regression:

              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       560
        Spam       0.97      0.97      0.97       201

    accuracy                           0.98       761
   macro avg       0.98      0.98      0.98       761
weighted avg       0.98      0.98      0.98       761

Confusion Matrix:
 [[555   5]
 [  7 194]]

📌 Linear SVM:

              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       560
        Spam       0.98      0.98      0.98       201

    accuracy                           0