In [11]:
import json
import os

import joblib
import numpy as np

from sklearn.model_selection import train_test_split
from utils import fetch_data, get_data_urls, get_labeled_files

# Working directories: DATA_DIR is scanned by get_labeled_files below, and
# RESULTS_DIR receives every artifact this notebook writes.
DATA_DIR = "data"
RESULTS_DIR = "results"
# makedirs(..., exist_ok=True) creates the directory only when it is missing,
# without the separate exists()/mkdir() check.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

# Fetch the corpus only when DATA_DIR is still empty, so a fresh checkout can
# run top to bottom while repeated runs skip the download.
# NOTE(review): assumes fetch_data(urls, directory) places the files inside
# `directory` -- confirm against utils.fetch_data.
if not os.listdir(DATA_DIR):
    data_urls = get_data_urls()
    fetch_data(data_urls, DATA_DIR)

# Hold out 20% of the labeled files for testing; the fixed random_state keeps
# the split identical between runs.
data_files, class_labels = get_labeled_files(DATA_DIR)
train_files, test_files, y_train, y_test = train_test_split(
    data_files, class_labels, test_size=0.2, random_state=44
)

print(f"Training set size is\t {len(train_files)}")
print(f"Test set size is\t {len(test_files)}")

Training set size is	 7479
Test set size is	 1870


In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from message_transformer import MessageTransformer

# Sub-pipeline applied to the "tokens" column: raw term counts (capped at the
# 5000 most frequent terms) re-weighted by TF-IDF.
body_tfidf_steps = [
    ("counter", CountVectorizer(max_features=5000)),
    ("tfidf", TfidfTransformer()),
]

# Column-wise feature assembly: TF-IDF for "tokens", standardisation for the
# two numeric columns, and every remaining column passed through unchanged.
column_vectorizer = ColumnTransformer(
    transformers=[
        ("tdidf_body_vectorizer", Pipeline(steps=body_tfidf_steps), "tokens"),
        ("std_scaler", StandardScaler(), ["num_links", "cap_max"]),
    ],
    remainder="passthrough",
)

# Full preprocessing chain: MessageTransformer first, then the column
# vectorizer on whatever it produces.
pipeline = Pipeline(
    steps=[
        ("message_trf", MessageTransformer()),
        ("vectorizer", column_vectorizer),
    ]
)

# The pipeline has not been fitted at this point, so this pickle stores the
# unfitted definition. The trailing ";" suppresses the cell's echoed output.
joblib.dump(pipeline, os.path.join(RESULTS_DIR, "pipeline.pkl"));

In [13]:
%%capture

from message import Message

# Build one Message per file entry for both splits.
train_messages = [Message(x) for x in train_files]
test_messages = [Message(x) for x in test_files]

# Fit the preprocessing pipeline on the training messages only and keep the
# resulting feature matrix.
X_train = pipeline.fit_transform(train_messages)

# Re-save the pipeline now that it is fitted: the copy written when the
# pipeline was constructed predates fitting, so it carries no learned state
# and cannot transform new messages after loading.
joblib.dump(pipeline, os.path.join(RESULTS_DIR, "pipeline.pkl"))
# Kept as the last expression so the cell's displayed result is unchanged.
joblib.dump(X_train, os.path.join(RESULTS_DIR, "features.pkl"))

Email content 'ms-tnef' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'enriched' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled
More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO mfrenchw2k mfrench42@62.254.163.42 with login by smtp.mail.vip.sc5.yahoo.com with SMTP; 13 Aug 2002 12:54:07 -0000
More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO mfrenchw2k mfrench42@62.254.163.42 with login by smtp.mail.vip.sc5.yahoo.com with SMTP; 13 Aug 2002 12:54:07 -0000
Email content 'pgp-si

['results/features.pkl']

In [14]:
import xgboost as xgb

from sklearn.model_selection import GridSearchCV

# Both regularisation terms of the linear booster ("lambda" and "alpha" in
# XGBoost's parameter naming) are searched over the same candidate values.
regularisation_grid = (0.0, 0.0001, 0.01)
parameters = {
    "lambda": regularisation_grid,
    "alpha": regularisation_grid,
}

# Exhaustive 5-fold cross-validated search over the 3 x 3 grid.
xgb_clf = xgb.XGBClassifier(booster="gblinear")
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=5)
grid_search.fit(X_train, y_train)

# Keep and persist the estimator selected by the search.
classifier = grid_search.best_estimator_
classifier_path = os.path.join(RESULTS_DIR, "classifier.pkl")
joblib.dump(classifier, classifier_path)

print("Best parameters for linear classifier:", grid_search.best_params_)

Best parameters for linear classifier: {'alpha': 0.0, 'lambda': 0.0}


In [15]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score

# Baseline that predicts from the class prior alone; it gives the accuracy
# floor the real classifier has to beat.
dummy_clf = DummyClassifier(strategy="prior")
dummy_clf.fit(X_train, y_train)

# Mean 5-fold cross-validated accuracy on the training data for both models.
dummy_cv_scores = cross_val_score(dummy_clf, X_train, y_train, cv=5)
dummy_train_accuracy = np.mean(dummy_cv_scores)
xgb_cv_scores = cross_val_score(classifier, X_train, y_train, cv=5)
xgb_train_accuracy = np.mean(xgb_cv_scores)

print("XGB Accuracy:\t", xgb_train_accuracy)
print("Dummy Accuracy\t", dummy_train_accuracy)

XGB Accuracy:	 0.981413535313791
Dummy Accuracy	 0.7432811689589183


In [16]:
%%capture

from sklearn.metrics import confusion_matrix

# Transform the held-out messages with the already-fitted pipeline
# (transform only, no refitting).
X_test = pipeline.transform(test_messages)

# Predict on each split, then tabulate true labels against predictions.
train_predictions = classifier.predict(X_train)
test_predictions = classifier.predict(X_test)
train_conf_mat = confusion_matrix(y_train, train_predictions)
test_conf_mat = confusion_matrix(y_test, test_predictions)


def conf_to_dict(mat):
    """Turn a 2x2 confusion matrix into a dict of plain-int counts.

    `mat` is indexed as ``mat[row, col]``; with rows as true labels and
    columns as predicted labels (the layout sklearn's ``confusion_matrix``
    returns), the off-diagonal cells are the false negatives (row 1, col 0)
    and false positives (row 0, col 1).

    Returns a dict with the keys "trueNegative", "falseNegative",
    "falsePositive" and "truePositive", in that order, each mapped to a
    built-in ``int`` so the result can be JSON-serialised.
    """
    # Output label -> (row, col) of the matrix cell that holds its count.
    cell_of = {
        "trueNegative": (0, 0),
        "falseNegative": (1, 0),
        "falsePositive": (0, 1),
        "truePositive": (1, 1),
    }
    return {label: int(mat[row, col]) for label, (row, col) in cell_of.items()}

# Collect the per-split confusion counts into one JSON-serialisable mapping.
# NOTE: this assignment rebinds `confusion_matrix`, hiding the sklearn function
# imported at the top of this cell; the name is kept so any later use of the
# dict keeps working.
confusion_matrix = {
    "trainingSet": conf_to_dict(train_conf_mat),
    "testingSet": conf_to_dict(test_conf_mat)
}

# Write the report next to the other artifacts in RESULTS_DIR. The with-block
# guarantees the file is flushed and closed even if serialisation fails.
with open(os.path.join(RESULTS_DIR, "confusion_matrix.json"), "w") as report_file:
    json.dump(confusion_matrix, report_file)

In [17]:
import database_connector

# Start from a clean database file on every run.
DATABASE = os.path.join(RESULTS_DIR, "spam.db")
if os.path.exists(DATABASE):
    os.remove(DATABASE)

# The connector is given the pipeline and classifier objects built earlier in
# this notebook; the schema is loaded from the SQL file one directory up.
db_connector = database_connector.DatabaseConnector(DATABASE, pipeline, classifier)
db_connector.populate_schema("../schema.sql")

# The feature table covers every message, training messages first.
all_messages = train_messages + test_messages
db_connector.populate_feature_table(all_messages, commit=True)

# The message table is filled one split at a time, each tagged with its split
# name.
for split_messages, split_labels, split_name in (
    (train_messages, y_train, "train"),
    (test_messages, y_test, "test"),
):
    db_connector.populate_message_table(
        split_messages, split_labels, split_name, commit=True
    )

