# Baseline model for spam detection & evasion

### Importing libraries


In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import os

### Loading and understanding datasets

In [12]:
# Loader shared by all datasets: reads the train, test and validation CSV splits
def load_dataset(dataset_name, base_path="dataset"):
    """Read the train, test and validation CSV splits of one dataset.

    The files are looked up in ``<base_path>/<dataset_name>/``. For
    ``"enron1"`` and ``"enron2"`` the file names carry the dataset name as a
    prefix (e.g. ``enron1_train.csv``); for any other name the plain
    ``train.csv`` / ``test.csv`` / ``val.csv`` names are used.

    Args:
        dataset_name: Name of the dataset; also the name of its sub-folder.
        base_path: Folder that contains one sub-folder per dataset.

    Returns:
        A ``(train_df, test_df, val_df)`` tuple of DataFrames. Note the
        order: test comes before validation.

    No error handling is done here; anything raised by ``pd.read_csv``
    (for instance when a file is missing) propagates to the caller.
    """
    # Only the two enron datasets prefix their file names; everything else
    # (currently "sms") uses the bare split names.
    prefix = f"{dataset_name}_" if dataset_name in ("enron1", "enron2") else ""

    # Read the splits in the same order they are returned.
    train_df, test_df, val_df = (
        pd.read_csv(os.path.join(base_path, dataset_name, f"{prefix}{split}.csv"))
        for split in ("train", "test", "val")
    )
    return train_df, test_df, val_df

In [13]:
# Names of the dataset folders to process, in this order.
datasets = ["enron1", "enron2", "sms"]

In [14]:
# Print a basic summary (row count, columns, dtypes, missing values) of every split
def explore_dataset(dataset_name, train_df, test_df, val_df):
    """Print a short summary of each split of a dataset.

    For the train, test and validation splits (in that order) this prints
    the number of rows, the column names, the per-column dtypes and the
    per-column count of missing values.

    Args:
        dataset_name: Name shown in the banner line.
        train_df: DataFrame holding the training split.
        test_df: DataFrame holding the test split.
        val_df: DataFrame holding the validation split.

    Returns:
        None. The function only writes to standard output.
    """
    print(f"\n=== Exploring {dataset_name} Dataset ===")

    labelled_splits = (
        ("Train", train_df),
        ("Test", test_df),
        ("Validation", val_df),
    )

    # Same report for every split; only the heading and the frame differ.
    for label, frame in labelled_splits:
        print(f"\n{label} Split:")
        print(f"Number of rows: {frame.shape[0]}")
        print(f"Columns: {list(frame.columns)}")
        print("Data types:")
        print(frame.dtypes)
        print("Missing values:")
        print(frame.isnull().sum())

In [15]:
# Load each dataset's splits and print a summary of them.
# After the loop, train_df / test_df / val_df hold the splits of the last
# dataset in `datasets`.
for dataset in datasets:
    print(f"\nProcessing {dataset} dataset...")
    train_df, test_df, val_df = load_dataset(dataset)
    explore_dataset(dataset, train_df, test_df, val_df)


Processing enron1 dataset...

=== Exploring enron1 Dataset ===

Train Split:
Number of rows: 3196
Columns: ['email', 'target']
Data types:
email     object
target    object
dtype: object
Missing values:
email     0
target    0
dtype: int64

Test Split:
Number of rows: 999
Columns: ['email', 'target']
Data types:
email     object
target    object
dtype: object
Missing values:
email     0
target    0
dtype: int64

Validation Split:
Number of rows: 799
Columns: ['email', 'target']
Data types:
email     object
target    object
dtype: object
Missing values:
email     0
target    0
dtype: int64

Processing enron2 dataset...

=== Exploring enron2 Dataset ===

Train Split:
Number of rows: 3727
Columns: ['email', 'target']
Data types:
email     object
target    object
dtype: object
Missing values:
email     0
target    0
dtype: int64

Test Split:
Number of rows: 1165
Columns: ['email', 'target']
Data types:
email     object
target    object
dtype: object
Missing values:
email     0
target    0