In [1]:
import os

import joblib
import numpy as np

from sklearn.model_selection import train_test_split
from utils import fetch_data, get_data_urls, get_labeled_files

# Working directories: DATA_DIR receives the downloaded corpus, RESULTS_DIR
# receives every artifact this notebook writes (pickles, database).
DATA_DIR = "data"
RESULTS_DIR = "results"
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
for directory in (DATA_DIR, RESULTS_DIR):
    os.makedirs(directory, exist_ok=True)

# Download the corpus (helpers come from the local `utils` module) and collect
# the message files together with their class labels.
data_urls = get_data_urls()
fetch_data(data_urls, DATA_DIR)
data_files, class_labels = get_labeled_files(DATA_DIR)

# Hold out 20% of the files for testing; the fixed random_state makes the
# split reproducible across kernel restarts.
train_files, test_files, y_train, y_test = train_test_split(
    data_files, class_labels, test_size=0.2, random_state=44
)

print(f"Training set size is\t {len(train_files)}")
print(f"Test set size is\t {len(test_files)}")

Training set size is	 7479
Test set size is	 1870


In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from message_transformer import MessageTransformer

# Column-wise feature extraction applied to the output of MessageTransformer
# (presumably a table exposing "tokens", "num_links" and "cap_max" columns --
# TODO confirm against message_transformer.py):
#   * "tokens"                 -> TF-IDF, vocabulary capped at 5000 terms
#   * "num_links", "cap_max"   -> standardised
#   * every other column       -> passed through unchanged
body_tfidf = TfidfVectorizer(max_features=5000)
column_vectorizer = ColumnTransformer(
    transformers=[
        ("tdidf_body_vectorizer", body_tfidf, "tokens"),
        ("std_scaler", StandardScaler(), ["num_links", "cap_max"]),
    ],
    remainder="passthrough",
)

# Full preprocessing chain: raw message files -> parsed fields -> feature matrix.
pipeline = Pipeline(
    steps=[
        ("message_trf", MessageTransformer()),
        ("vectorizer", column_vectorizer),
    ]
)

# Persist the pipeline definition. Note that it has not been fitted yet at this
# point. The trailing ';' suppresses the notebook's display of dump()'s result.
pipeline_path = os.path.join(RESULTS_DIR, "pipeline.pkl")
joblib.dump(pipeline, pipeline_path);

In [3]:
%%capture

# Fit the preprocessing pipeline on the training files and keep the resulting
# feature matrix; it is also written to disk so it can be reloaded later.
X_train = pipeline.fit_transform(train_files)
joblib.dump(X_train, os.path.join(RESULTS_DIR, "features.pkl"))
# Save the pipeline again now that it is fitted. The copy written when the
# pipeline was constructed predates fitting, so anything loading that file
# would get a pipeline that cannot transform new messages.
joblib.dump(pipeline, os.path.join(RESULTS_DIR, "pipeline.pkl"))

In [4]:
import xgboost as xgb

from sklearn.model_selection import GridSearchCV

# Candidate L2 ("lambda") and L1 ("alpha") regularisation strengths; both
# hyper-parameters are searched over the same three values.
candidate_strengths = (0.0, 0.0001, 0.01)
parameters = {name: candidate_strengths for name in ("lambda", "alpha")}

# Linear booster, tuned with an exhaustive 5-fold cross-validated grid search
# over the training features.
xgb_clf = xgb.XGBClassifier(booster="gblinear")
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=5)
grid_search.fit(X_train, y_train)

# Keep the best configuration found by the search and persist it next to the
# other artifacts.
classifier = grid_search.best_estimator_
classifier_path = os.path.join(RESULTS_DIR, "classifier.pkl")
joblib.dump(classifier, classifier_path)

print("Best parameters for linear classifier:", grid_search.best_params_)

Best parameters for linear classifier: {'alpha': 0.0, 'lambda': 0.0}


In [5]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

# Baseline for comparison: scikit-learn's DummyClassifier, which predicts
# without looking at the features.
dummy_clf = DummyClassifier()
dummy_clf.fit(X_train, y_train)

# 5-fold cross-validated scores on the training set for the baseline and for
# the tuned classifier (cross_val_score uses each estimator's default scorer).
dummy_cv_scores = cross_val_score(dummy_clf, X_train, y_train, cv=5)
xgb_cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)

dummy_train_accuracy = np.mean(dummy_cv_scores)
xgb_train_accuracy = np.mean(xgb_cv_scores)

print("XGB Accuracy:\t", xgb_train_accuracy)
print("Dummy Accuracy\t", dummy_train_accuracy)

XGB Accuracy:	 0.9802103267576413
Dummy Accuracy	 0.6187985799366873


In [7]:
import database_connector

DATABASE = os.path.join(RESULTS_DIR, "spam.db")
# Start from a clean database: remove a file left over from a previous run.
# A missing file is expected on the first run (or after clearing RESULTS_DIR)
# and must not abort the cell, so FileNotFoundError is deliberately ignored.
try:
    os.remove(DATABASE)
except FileNotFoundError:
    pass

# The connector receives the in-memory pipeline and classifier built above.
db_connector = database_connector.DatabaseConnector(DATABASE, pipeline, classifier)
# NOTE(review): the schema path is relative to the kernel's working directory.
db_connector.populate_schema("../schema.sql")

# Fill the feature table, then store the held-out test messages with their
# labels; each call commits its own changes.
db_connector.populate_feature_table(commit=True)
db_connector.populate_message_table(test_files, y_test, commit=True)

Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO ?192.168.0.100? salimma1@212.18.241.211 with plain by smtp.mail.vip.sc5.yahoo.com with SMTP; 10 Oct 2002 10:30:25 -0000
More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO ?192.168.0.100? salimma1@212.18.241.211 with plain by smtp.mail.vip.sc5.yahoo.com with SMTP; 10 Oct 2002 10:30:25 -0000
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
