In [1]:
# https://stackoverflow.com/questions/34478398/import-local-function-from-a-module-housed-in-another-directory-with-relative-im

import os
import sys
# Put the notebook's parent directory on the import path so the `common`
# package imported below can be resolved. Guarded so that re-running the cell
# does not append the same entry twice.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

from common.utils import get_data_urls, fetch_data, get_labeled_files

In [2]:
# Disabled step: get_data_urls() / fetch_data() are imported above but are not
# called in this cell. Presumably they download the raw corpus and only need to
# run once -- confirm in common.utils before re-enabling.
#data_urls = get_data_urls()
#fetch_data(data_urls)

# get_labeled_files() is unpacked into two values: the message files and their
# class labels. They are split together further down, so they are assumed to be
# parallel sequences of equal length -- TODO confirm in common.utils.
data_files, class_labels = get_labeled_files()

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled files as a test set. The fixed random_state
# keeps the partition identical from one run to the next.
train_files, test_files, y_train, y_test = train_test_split(
    data_files,
    class_labels,
    test_size=0.2,
    random_state=44,
)

# Report the size of each partition.
print(f"Training set size is\t {len(train_files)}")
print(f"Test set size is\t {len(test_files)}")

Training set size is	 7479
Test set size is	 1870


In [4]:
from common.message import Message
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import mailparser

class MessageTransformer(BaseEstimator, TransformerMixin):
    """Convert a batch of e-mails into a DataFrame of extracted features.

    Every element of the input is turned into one feature record by calling
    ``extract_features()`` on a ``Message``; the records are then assembled
    with ``pd.DataFrame.from_records``.

    Parameters
    ----------
    input : {"message", "filename", "file", "content"}, default="message"
        How each element of ``X`` is interpreted:

        * ``"message"``  -- the element already provides ``extract_features()``
          (e.g. a ``Message``) and is used as-is.
        * ``"filename"`` -- the element is passed to ``mailparser.parse_from_file``.
        * ``"file"``     -- the element is passed to ``mailparser.parse_from_file_obj``.
        * ``"content"``  -- the element is passed to ``mailparser.parse_from_string``.
    """

    # Parsing mode -> name of the mailparser function that handles it. Names
    # (not function objects) are stored so the attribute is looked up on
    # whatever `mailparser` is bound to when transform() runs, and only for
    # the mode that is actually in use.
    _PARSER_NAME_BY_INPUT = {
        "content": "parse_from_string",
        "file": "parse_from_file_obj",
        "filename": "parse_from_file",
    }

    def __init__(self, input="message"):
        self.input = input
        # Not read by fit() or transform() in this class; kept so existing
        # code that accesses the attribute continues to work.
        self.messages = []

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Extract one feature record per element of ``X``.

        Parameters
        ----------
        X : iterable
            Items matching the configured ``input`` mode. A bare string is
            rejected because iterating over it would yield single characters.

        Returns
        -------
        pandas.DataFrame
            One row per element of ``X``, built from the extracted records.

        Raises
        ------
        ValueError
            If ``X`` is a string, or if ``input`` is not a supported mode.
        """
        if isinstance(X, str):
            raise ValueError("Must be a list or iterable, not a string")

        valid_inputs = ("message", *self._PARSER_NAME_BY_INPUT)
        if self.input not in valid_inputs:
            # Previously an unknown mode fell through every branch and failed
            # later with an unbound-variable error; fail early and clearly.
            raise ValueError(
                f"input must be one of {valid_inputs}, got {self.input!r}"
            )

        if self.input == "message":
            # Elements are already message objects: no parsing required.
            records = [item.extract_features() for item in X]
        else:
            parse = getattr(mailparser, self._PARSER_NAME_BY_INPUT[self.input])
            records = [Message(parse(item)).extract_features() for item in X]

        return pd.DataFrame.from_records(records)


# Two-stage feature pipeline:
#   1. "message_trf" -- a MessageTransformer with its default settings turns
#      the incoming messages into a DataFrame of extracted features.
#   2. "vectorizer"  -- a ColumnTransformer that TF-IDF-encodes the
#      "body_tokens" column, keeping at most 5000 terms.
# NOTE: the inner step name is spelled "tdidf_body_vectorizer"; it is looked up
# by that exact string when the fitted vectorizer is retrieved, so the spelling
# must stay in sync with that lookup.
pipeline = Pipeline(
    steps=[
        ("message_trf", MessageTransformer()),
        (
            "vectorizer",
            ColumnTransformer(
                transformers=[
                    (
                        "tdidf_body_vectorizer",
                        TfidfVectorizer(max_features=5000),
                        "body_tokens",
                    ),
                ],
            ),
        ),
    ],
)

In [31]:
from mailparser import mailparser
from common.message import Message

# Number of training files used for this prototype run.
N = 100

# Parse the first N training files into Message objects.
messages = []
for path in train_files[:N]:
    try:
        msg = Message(mailparser.parse_from_file(path))
    except OSError as err:
        # Best effort: substitute a blank message so that `messages` stays
        # aligned index-for-index with `labels`, but report the substitution
        # instead of hiding it -- the placeholder still carries a real label.
        print(
            f"Could not read {path!r} ({err}); using a blank placeholder message",
            file=sys.stderr,
        )
        msg = Message(mailparser.parse_from_string(" "))
    messages.append(msg)

# Labels for the same N files, in the same order as `messages`.
labels = y_train[:N]

# Fit the feature pipeline on the parsed messages and keep the resulting matrix.
features = pipeline.fit_transform(messages)

Email content 'ms-tnef' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled


In [32]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Search the same four strengths for both regularisation terms of the linear
# booster ("lambda" and "alpha" are XGBoost's names for the L2 and L1 penalties).
parameters = {
    name: (0.0, 0.1, 1.0, 5.0)
    for name in ("lambda", "alpha")
}

# Linear (gblinear) booster, tuned by exhaustive grid search with
# GridSearchCV's default cross-validation and scoring.
xgb_clf = xgb.XGBClassifier(booster="gblinear")
clf = GridSearchCV(estimator=xgb_clf, param_grid=parameters)
clf.fit(features, labels)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster='gblinear',
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_par

In [33]:
# Best model found by the grid search.
classifier = clf.best_estimator_
# Weights of that model's linear booster. They are later zipped with the TF-IDF
# feature names, which assumes one weight per feature -- TODO confirm the shape
# is 1-D (binary classification).
coefs = classifier.coef_

In [None]:
# TODO: unify pipeline and save model

In [35]:
from common.database_populator import DatabasePopulator

# Write the sampled messages and their labels to the database at ../db/spam.db,
# after applying the schema file.
db_populator = DatabasePopulator("../db/spam.db")
db_populator.populate_schema("../schema.sql")
db_populator.populate_message_table(messages, y_train[:N])

# Fitted TF-IDF vectorizer from inside the pipeline's ColumnTransformer.
vectorizer = pipeline["vectorizer"].named_transformers_["tdidf_body_vectorizer"]

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever the installed version provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# zip() silently truncates to the shorter input, which would store wrong or
# missing weights without any error -- check the pairing is one-to-one first.
assert len(feature_names) == len(coefs), (
    f"{len(feature_names)} feature names but {len(coefs)} coefficients"
)

# (feature name, weight) pairs for the feature table.
coefficients = list(zip(feature_names, coefs))

db_populator.populate_feature_table(coefficients)

In [20]:
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

# Baseline vs. tuned model, both scored with 5-fold cross-validation on the same
# TF-IDF matrix and labels. The baseline is fitted on `features` (rather than the
# raw Message objects) so it is trained on the same representation it is scored
# and would later be asked to predict on.
dummy_clf = DummyClassifier().fit(features, y_train[:N])

# NOTE: despite the variable names these are mean cross-validated accuracies.
# The XGB figure is optimistic: its hyperparameters and the TF-IDF vocabulary
# were both fitted on these same rows.
dummy_train_accuracy = np.mean(cross_val_score(dummy_clf, features, y_train[:N], cv=5))
xgb_train_accuracy = np.mean(cross_val_score(classifier, features, y_train[:N], cv=5))
print("XGB Accuracy:\t", xgb_train_accuracy)
print("Dummy Accuracy\t", dummy_train_accuracy)

XGB Accuracy:	 0.93
Dummy Accuracy	 0.616


In [723]:
import joblib

# Persist the fitted TF-IDF vectorizer to "vectorizer.pkl" (path is relative to
# the current working directory). Only the vectorizer is written by this cell,
# not the classifier. joblib files are pickle-based: load them only from
# trusted sources.
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']