In [1]:
# https://stackoverflow.com/questions/34478398/import-local-function-from-a-module-housed-in-another-directory-with-relative-im

import os
import sys
# Make the parent directory importable so the project-local `common` imports
# that follow can be resolved; the membership test keeps re-runs of this cell
# from appending the same entry twice.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)

from common.utils import get_data_urls, fetch_data, get_labeled_files

In [2]:
# Flip to True to run the fetch step; it is off by default, exactly as when
# these two calls were commented out.
# NOTE(review): fetch_data presumably downloads the files listed by
# get_data_urls -- confirm in common.utils.
FETCH_DATA = False

if FETCH_DATA:
    data_urls = get_data_urls()
    fetch_data(data_urls)

# Parallel sequences: one entry per e-mail file and its class label. They are
# split together by train_test_split in the next cell.
data_files, class_labels = get_labeled_files()

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled files for testing; the fixed random_state keeps
# the split reproducible between runs.
split_parts = train_test_split(
    data_files,
    class_labels,
    test_size=0.2,
    random_state=44,
)
train_files, test_files, y_train, y_test = split_parts

print(f"Training set size is\t {len(train_files)}")
print(f"Test set size is\t {len(test_files)}")

Training set size is	 7479
Test set size is	 1870


In [4]:
from common.message import Message
from common.message_transformer import MessageTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the "tokens" column only, with the vocabulary capped at 100 terms.
token_vectorizer = ColumnTransformer(
    transformers=[
        ("tdidf_body_vectorizer", TfidfVectorizer(max_features=100), "tokens"),
    ]
)

# Two-stage preprocessing: MessageTransformer first, then the column vectorizer.
# NOTE(review): MessageTransformer presumably emits a table with a "tokens"
# column -- confirm in common.message_transformer.
pipeline = Pipeline(
    steps=[
        ("message_trf", MessageTransformer()),
        ("vectorizer", token_vectorizer),
    ]
)

In [5]:
from mailparser import mailparser
from common.message import Message

# Number of training files used for this quick experiment.
N = 100

# Parse the first N training files into Message objects. A file that raises
# OSError is replaced by a blank message so that `messages` stays aligned
# one-to-one with `labels`; its path is recorded in `unreadable_files` so the
# substitution is visible instead of silent.
messages = []
unreadable_files = []
for path in train_files[:N]:
    try:
        msg = Message(mailparser.parse_from_file(path))
    except OSError:
        unreadable_files.append(path)
        msg = Message(mailparser.parse_from_string(" "))
    messages.append(msg)

labels = y_train[:N]

# Fit the preprocessing pipeline on the sample and vectorize it in one pass.
features = pipeline.fit_transform(messages)

Email content 'ms-tnef' not handled
Email content 'pgp-signature' not handled
Email content 'pgp-signature' not handled


In [6]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Regularisation grid for the linear booster. Only the zero-penalty point is
# active; the wider sweep is kept in the trailing comments.
parameters = {
    "lambda": (0.0,),  # (0.0, 0.1, 1.0, 5.0),
    "alpha": (0.0,),  # (0.0, 0.1, 1.0, 5.0),
}

xgb_clf = xgb.XGBClassifier(booster="gblinear")

# fit() returns the fitted search object, so construction and fitting chain.
clf = GridSearchCV(estimator=xgb_clf, param_grid=parameters).fit(features, labels)

# Best configuration found by the search (refitted on all of `features` under
# GridSearchCV's default refit=True).
classifier = clf.best_estimator_

In [7]:
import joblib

# Persist the fitted preprocessing pipeline and the tuned classifier to the
# working directory. The loop discards joblib.dump's return value, so the cell
# produces no output.
for artifact, filename in ((pipeline, "pipeline.pkl"), (classifier, "classifier.pkl")):
    joblib.dump(artifact, filename)

In [8]:
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

# Baseline to compare against: DummyClassifier predicts without using the
# feature values. It is fitted on the same `features`/`labels` pair as the
# real model (previously it was fitted on the raw `messages` list, which was
# inconsistent with every other call in this notebook).
dummy_clf = DummyClassifier().fit(features, labels)

# 5-fold cross-validated accuracy on the N-sample training subset.
# cross_val_score clones and refits the estimator on each fold, so the fits
# above do not influence these scores.
# NOTE(review): `features` was produced by a pipeline fitted on all N messages,
# so the TF-IDF vocabulary has seen the validation folds; cross-validating the
# whole pipeline would remove that leak.
dummy_train_accuracy = np.mean(cross_val_score(dummy_clf, features, labels, cv=5))
xgb_train_accuracy = np.mean(cross_val_score(classifier, features, labels, cv=5))
print("XGB Accuracy:\t", xgb_train_accuracy)
print("Dummy Accuracy\t", dummy_train_accuracy)

XGB Accuracy:	 0.8
Dummy Accuracy	 0.5700000000000001
