# Baseline model for spam detection & evasion

### Importing libraries


In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import os

### Loading and understanding datasets

In [19]:

# load function, will be used for each dataset
def load_dataset(dataset_name, base_path="dataset"):
    """Read the train, test and validation CSV splits of one dataset.

    Files are looked up in ``<base_path>/<dataset_name>/``. For "enron1" and
    "enron2" every file name is prefixed with the dataset name
    (e.g. ``enron1_train.csv``); any other dataset uses the bare split name
    (e.g. ``train.csv``).

    Columns named 'email' / 'target' are renamed to 'text' / 'label', and the
    'label' column is integer-encoded with a LabelEncoder fitted on the
    training split only, then applied to the other two splits.

    Args:
        dataset_name: Name of the dataset folder (also the file prefix for
            the Enron datasets).
        base_path: Directory that contains the dataset folders.

    Returns:
        Tuple ``(train_df, test_df, val_df)`` of DataFrames.
    """
    # Only the Enron datasets carry the dataset name in their file names.
    prefix = f"{dataset_name}_" if dataset_name in ("enron1", "enron2") else ""
    dataset_dir = os.path.join(base_path, dataset_name)

    # Read each split and normalise its column names in one pass.
    frames = {}
    for split in ("train", "test", "val"):
        csv_path = os.path.join(dataset_dir, f"{prefix}{split}.csv")
        frames[split] = pd.read_csv(csv_path).rename(
            columns={'email': 'text', 'target': 'label'}
        )

    # Fit the encoder on the training labels, then reuse the same mapping
    # for the held-out splits so the codes agree across all three.
    encoder = LabelEncoder()
    frames["train"]['label'] = encoder.fit_transform(frames["train"]['label'])
    for split in ("test", "val"):
        frames[split]['label'] = encoder.transform(frames[split]['label'])

    return frames["train"], frames["test"], frames["val"]

In [13]:
# Dataset folder names iterated over by the exploration and training loops.
datasets = [
    "enron1",
    "enron2",
    "sms",
]

In [26]:

# exploring the datasets
def explore_dataset(dataset_name, train_df, test_df, val_df):
    """Print a plain-text overview of the three splits of a dataset.

    For each split (train, test, validation) this prints the row count, the
    column names and the column dtypes. It then prints, per split, the value
    counts of the 'label' column and the percentage of rows whose label is 1,
    and finally the first rows of the training split.

    Args:
        dataset_name: Name shown in the report header.
        train_df: Training split; must contain a 'label' column.
        test_df: Test split; must contain a 'label' column.
        val_df: Validation split; must contain a 'label' column.

    Returns:
        None. All output goes to stdout.
    """
    splits = [("Train", train_df), ("Test", test_df), ("Validation", val_df)]

    print(f"\n=== Exploring {dataset_name} Dataset ===")

    # Display basic info for each split
    for split_name, df in splits:
        print(f"\n{split_name} Split:")
        print(f"Number of rows: {df.shape[0]}")
        print(f"Columns: {list(df.columns)}")
        print("Data types:")
        print(df.dtypes)

    # Label distribution per split
    print("\nLabel Distribution:")
    for split_name, df in splits:
        print(f"\n{split_name} Split:")
        label_counts = df['label'].value_counts()
        print(label_counts)
        if len(df) == 0:
            # A percentage is undefined for an empty split; report that
            # instead of dividing by zero.
            print("Spam percentage: n/a (empty split)")
        else:
            # .get(1, 0) covers splits that contain no rows labelled 1.
            print(f"Spam percentage: {100 * label_counts.get(1, 0) / len(df):.2f}%")

    # Sample rows
    print("\nSample Rows from Train Split:")
    print(train_df.head())

In [32]:
# Load the three splits of every dataset and print its exploration summary.
# NOTE: `dataset`, `train_df`, `test_df` and `val_df` are notebook-level names,
# so once the loop finishes they refer to the last entry of `datasets`.
for dataset in datasets:
    print(f"\nProcessing {dataset} dataset...")
    train_df, test_df, val_df = load_dataset(dataset)
    explore_dataset(dataset, train_df, test_df, val_df)


Processing enron1 dataset...

=== Exploring enron1 Dataset ===

Train Split:
Number of rows: 3196
Columns: ['text', 'label']
Data types:
text     object
label     int64
dtype: object

Test Split:
Number of rows: 999
Columns: ['text', 'label']
Data types:
text     object
label     int64
dtype: object

Validation Split:
Number of rows: 799
Columns: ['text', 'label']
Data types:
text     object
label     int64
dtype: object

Label Distribution:

Train Split:
label
0    2260
1     936
Name: count, dtype: int64
Spam percentage: 29.29%

Test Split:
label
0    706
1    293
Name: count, dtype: int64
Spam percentage: 29.33%

Validation Split:
label
0    565
1    234
Name: count, dtype: int64
Spam percentage: 29.29%

Sample Rows from Train Split:
                                                text  label
0  Subject: prom dress shopping hi , just wanted ...      0
1  Subject: hi agaain hello , welcome to pharm la...      1
2  Subject: feedback monitor error - meter 984132...      0
3  Subject: 

In [28]:
# Prepare the data and extract TF-IDF features
def prepare_data(train_df, test_df, val_df, max_features=5000, stop_words='english'):
    """Turn the text of each split into TF-IDF features.

    The vectorizer is fitted on the training texts only and then applied to
    the test and validation texts, so no vocabulary or IDF statistics come
    from the held-out splits.

    Args:
        train_df: Training split with 'text' and 'label' columns.
        test_df: Test split with 'text' and 'label' columns.
        val_df: Validation split with 'text' and 'label' columns.
        max_features: Vocabulary size limit passed to TfidfVectorizer.
        stop_words: Stop-word setting passed to TfidfVectorizer.

    Returns:
        Tuple ``(X_train_tfidf, y_train, X_test_tfidf, y_test, X_val_tfidf,
        y_val, vectorizer)`` where the ``X_*`` entries are the TF-IDF
        matrices, the ``y_*`` entries are the label arrays, and
        ``vectorizer`` is the fitted TfidfVectorizer.
    """
    def _texts(df):
        # pd.read_csv reads an empty field as NaN, and TfidfVectorizer raises
        # on NaN documents; treat a missing message as an empty document.
        return df['text'].fillna('').astype(str).values

    X_train, y_train = _texts(train_df), train_df['label'].values
    X_test, y_test = _texts(test_df), test_df['label'].values
    X_val, y_val = _texts(val_df), val_df['label'].values

    # Convert text to TF-IDF features (fit on train only, reuse elsewhere)
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words=stop_words)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    X_val_tfidf = vectorizer.transform(X_val)

    return X_train_tfidf, y_train, X_test_tfidf, y_test, X_val_tfidf, y_val, vectorizer

In [31]:
# Train and evaluate a classifier (Naïve Bayes or Logistic Regression)
def train_and_evaluate(classifier, classifier_name, X_train, y_train, X_test, y_test, dataset_name):
    """Fit a classifier on the training data and print its test-set metrics.

    Accuracy, precision, recall and F1 are computed with binary averaging,
    i.e. for the class labelled 1, followed by a per-class classification
    report in which label 0 is named 'ham' and label 1 is named 'spam'.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        classifier_name: Name shown in the printed header.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        dataset_name: Dataset name shown in the printed header.

    Returns:
        The fitted ``classifier`` (the same object that was passed in).
    """
    # Train the classifier
    classifier.fit(X_train, y_train)

    # Predict on the test set
    y_pred = classifier.predict(X_test)

    # Calculate evaluation metrics; zero_division=0 reports 0 (without a
    # warning) when a metric is undefined, e.g. no positive predictions.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='binary', zero_division=0)

    # Print the results
    print(f"\nResults for {dataset_name} dataset ({classifier_name}):")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("\nDetailed Classification Report:")
    # labels=[0, 1] pins the row order to the two target names; without it the
    # report raises ValueError when only one class occurs in y_test/y_pred.
    print(classification_report(
        y_test,
        y_pred,
        labels=[0, 1],
        target_names=['ham', 'spam'],
        zero_division=0,
    ))

    return classifier

In [34]:
# Baseline experiment: for every dataset, fit both classifiers on the TF-IDF
# features of the training split and print their metrics on the test split.
# NOTE: every name assigned below (vectorizer, nb_classifier, lr_classifier,
# the X_*/y_* arrays, ...) is overwritten on each iteration, so after the loop
# they hold the objects built for the last entry of `datasets`.
for dataset in datasets:
        print(f"\nProcessing {dataset} dataset...")
        
        # Load the data
        train_df, test_df, val_df = load_dataset(dataset)
        
        # Prepare the data (TF-IDF features)
        # NOTE(review): X_val_tfidf and y_val are produced here but are not
        # passed to anything inside this loop.
        X_train_tfidf, y_train, X_test_tfidf, y_test, X_val_tfidf, y_val, vectorizer = prepare_data(train_df, test_df, val_df)
        
        # Train and evaluate Naïve Bayes
        # The return value (the fitted classifier) is discarded; nb_classifier
        # itself is fitted in place.
        nb_classifier = MultinomialNB()
        train_and_evaluate(nb_classifier, "Naïve Bayes", X_train_tfidf, y_train, X_test_tfidf, y_test, dataset)
        
        # Train and evaluate Logistic Regression
        # max_iter is set to 1000 and class weighting to 'balanced' rather
        # than the library defaults.
        lr_classifier = LogisticRegression(max_iter=1000, class_weight='balanced')
        train_and_evaluate(lr_classifier, "Logistic Regression", X_train_tfidf, y_train, X_test_tfidf, y_test, dataset)


Processing enron1 dataset...

Results for enron1 dataset (Naïve Bayes):
Accuracy: 0.9600
Precision: 0.9408
Recall: 0.9215
F1-Score: 0.9310

Detailed Classification Report:
              precision    recall  f1-score   support

         ham       0.97      0.98      0.97       706
        spam       0.94      0.92      0.93       293

    accuracy                           0.96       999
   macro avg       0.95      0.95      0.95       999
weighted avg       0.96      0.96      0.96       999


Results for enron1 dataset (Logistic Regression):
Accuracy: 0.9770
Precision: 0.9327
Recall: 0.9932
F1-Score: 0.9620

Detailed Classification Report:
              precision    recall  f1-score   support

         ham       1.00      0.97      0.98       706
        spam       0.93      0.99      0.96       293

    accuracy                           0.98       999
   macro avg       0.96      0.98      0.97       999
weighted avg       0.98      0.98      0.98       999


Processing enron2 dat