In [1]:
# https://stackoverflow.com/questions/34478398/import-local-function-from-a-module-housed-in-another-directory-with-relative-im

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from common.utils import get_data_urls, fetch_data, get_labeled_files

In [2]:
#data_urls = get_data_urls()
#fetch_data(data_urls)

data_files, class_labels = get_labeled_files()

In [3]:
from sklearn.model_selection import train_test_split

train_files, test_files, y_train, y_test = train_test_split(data_files, class_labels,
                                                            test_size=0.2, random_state=44)
print(f"Training set size is\t {len(train_files)}")
print(f"Test set size is\t {len(test_files)}")

Training set size is	 7479
Test set size is	 1870


In [4]:
from mailparser import mailparser
from common.message import Message

messages = []
for i, message in enumerate(train_files[:100]):
    m = Message(mailparser.parse_from_file(message), y_train[i])
    m._extract_body_features()
    messages.append(m)

More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO mfrenchw2k mfrench42@62.254.163.42 with login by smtp.mail.vip.sc5.yahoo.com with SMTP; 13 Aug 2002 09:18:55 -0000
More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO mfrenchw2k mfrench42@62.254.163.42 with login by smtp.mail.vip.sc5.yahoo.com with SMTP; 13 Aug 2002 09:18:55 -0000
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from chekitb5876.com 195.27.92.154 by asptown.co.kr 211.52.47.8 with Nmail V3.1 20010905 S for <jm@netnoteinc.com> from <jimmiester@hanmesoft.co.kr>; Sun, 02 Jun 2002 02:37:30 +0900
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*

In [37]:
import spacy
from spacy.tokenizer import Tokenizer
from spacy.util import compile_prefix_regex, compile_infix_regex, compile_suffix_regex

def custom_tokenizer(nlp):
    infixes = nlp.Defaults.infixes + tuple([r"\b[\[\(]\b"]) 
    infix_re = compile_infix_regex(infixes)
    prefix_re = compile_prefix_regex(nlp.Defaults.prefixes)
    suffix_re = compile_suffix_regex(nlp.Defaults.suffixes)
    return Tokenizer(
        nlp.vocab,
        prefix_search=prefix_re.search,
        suffix_search=suffix_re.search,
        infix_finditer=infix_re.finditer,
        token_match=None
    )

nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "textcat"])
nlp.tokenizer = custom_tokenizer(nlp)


In [5]:
from common.database_populator import DatabasePopulator

db_populator = DatabasePopulator("../db/spam.db")

In [6]:
import sqlite3

conn = sqlite3.connect("../db/spam.db")
cursor = conn.cursor()
conn.executescript(open("../schema.sql").read())
conn.commit()
conn.close()

In [9]:
db_populator.populate_message_table(messages)

In [10]:
from common.message import Message
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd

class MessageTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, input="filename"):
        '''string {‘filename’, ‘file’, ‘content’}, '''
        self.input = input
        self.messages = []
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        '''X is a list of files'''
        feature_aggregator = []
        if isinstance(X, str):
             raise ValueError("Must be a list or iterable, not a string")
        for x in X:
            if self.input == "content":
                mailparser_obj = mailparser.parse_from_string(x)
            elif self.input == "file":
                mailparser_obj = mailparser.parse_from_file_obj(x)
            elif self.input == "filename":
                mailparser_obj = mailparser.parse_from_file(x)
            
            feature_aggregator.append(Message(mailparser_obj, None).extract_features())
            
        return pd.DataFrame.from_records(feature_aggregator)


pipeline = Pipeline([
    ("message_trf", MessageTransformer()),
    ("vectorizer", 
         ColumnTransformer([
            ("tdidf_body_vectorizer", TfidfVectorizer(max_features=1000), "body_tokens"),
         ])
    )
])

In [11]:
test = pipeline.fit_transform(train_files[:100], y_train[:100])

More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO mfrenchw2k mfrench42@62.254.163.42 with login by smtp.mail.vip.sc5.yahoo.com with SMTP; 13 Aug 2002 09:18:55 -0000
More than one match found for (?:with(?! cipher)\s+(?P<with>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+from|\s+by|\s+id|\s+for|\s+via|;)) in from unknown HELO mfrenchw2k mfrench42@62.254.163.42 with login by smtp.mail.vip.sc5.yahoo.com with SMTP; 13 Aug 2002 09:18:55 -0000
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*[(]?envelope-sender|\s+by|\s+with(?! cipher)|\s+id|\s+for|\s+via|;)) in from chekitb5876.com 195.27.92.154 by asptown.co.kr 211.52.47.8 with Nmail V3.1 20010905 S for <jm@netnoteinc.com> from <jimmiester@hanmesoft.co.kr>; Sun, 02 Jun 2002 02:37:30 +0900
More than one match found for (?:(?:^|\s)from\s+(?P<from>.+?)(?:\s*[(]?envelope-from|\s*

In [12]:
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB
#xgb.XGBClassifier()
xgb_clf = xgb.XGBClassifier(booster="gblinear").fit(test, y_train[:100])
#naive_clf = MultinomialNB().fit(test, y_train[:400])

In [29]:
p = pipeline["vectorizer"].named_transformers_["tdidf_body_vectorizer"]
p.get_feature_names()

In [33]:
coefficients = []
for i in zip(p.get_feature_names(), xgb_clf.coef_):
    coefficients.append(i)

In [43]:
len(coefficients)

1000

In [41]:
conn = sqlite3.connect("../db/spam.db")
cursor = conn.cursor()
cursor.executemany("INSERT INTO feature(feature coefficient) values (?, ?)", coefficients)
conn.commit()
conn.close()

OperationalError: near "coefficient": syntax error

In [82]:
import numpy as np

from sklearn.model_selection import cross_val_score

xgb_train_accuracy = np.mean(cross_val_score(xgb_clf, test, y_train[:400], cv=5))
naive_train_accuracy = np.mean(cross_val_score(naive_clf, test, y_train[:400], cv=5))

In [85]:
len(train_files)

7483

In [83]:
print("XGB Accuracy:\t", xgb_train_accuracy)
print("Naive Bayes Accuracy:\t", naive_train_accuracy)

XGB Accuracy:	 0.9199999999999999
Naive Bayes Accuracy:	 0.9349999999999999


In [723]:
import joblib

joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']