Required imports and some constants setup

In [48]:
import numpy as np
import os
import glob

MARKER = 'Date: '
BASE = 'datasets'
FILTERED = '_filtered'

Initial data cleaning - potentially not required, maybe even harmful to the models performance. Still, I decided to classify emails only based on their contents

In [69]:
%%time
from pathlib import Path


non_spam_folders = ('easy_ham1', 'easy_ham2', 'hard_ham1', 'hard_ham2')
spam_folders = list(map(lambda n: f'spam{n}', range(1, 5)))


def leave_contents_only(filepath):
    path = Path(filepath)
    filtered_path = f'{BASE}/{FILTERED}'
    category_path = f'{filtered_path}/{path.parent.name}'

    if not os.path.exists(filtered_path):
        os.mkdir(filtered_path)
    if not os.path.exists(category_path):
        os.mkdir(category_path)

    with open(filepath, 'r', errors='ignore') as file:
        contents = file.read()

        try:
            index = contents.index(MARKER)
            contents = contents[index + len(MARKER):]
        except ValueError:
            print(f'Marker {MARKER} not found in {filepath}.')

    new_path = f'{category_path}/{path.name}'

    with open(new_path, 'w') as file:
        file.write(contents)


def create_clean_files(folders):
    for folder in folders:
        if os.path.exists(f'{BASE}/{FILTERED}/{folder}'):
            return

        path = f'{BASE}/{folder}/*'
        filepaths = glob.glob(path)

        for filepath in filepaths:
            leave_contents_only(filepath)

create_clean_files(non_spam_folders)
create_clean_files(spam_folders)

Wall time: 999 µs


Get splitted training and test data

In [93]:
from sklearn.model_selection import train_test_split


RANDOM_STATE = 42


def get_file_names(folders):
    files = []

    for folder in folders:
        for (dirpath, dirnames, filenames) in os.walk(f'{BASE}/{FILTERED}/{folder}'):
            files += [os.path.join(dirpath, file) for file in filenames]

    return files


spam_files = get_file_names(spam_folders)
non_spam_files = get_file_names(non_spam_folders)
y = np.hstack(
    (np.ones(len(spam_files), dtype=bool),
     np.zeros(len(non_spam_files), dtype=bool)))
X_train, X_test, y_train, y_test = train_test_split(spam_files + non_spam_files, y, test_size=.2, random_state=RANDOM_STATE, stratify=y)

In [98]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train).toarray()
X_test_vect = vectorizer.transform(X_test).toarray()

First choice - SVC

In [96]:
from sklearn.svm import SVC


np.random.seed(RANDOM_STATE)

svc_clf = SVC()

np.random.shuffle(X_train)
np.random.shuffle(y_train)

svc_clf.fit(X_train_vect, y_train)

preds = svc_clf.predict(X_test_vect)

In [97]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support


conf = confusion_matrix(y_test, preds)

display(conf)

precision, recall, f1, _ = precision_recall_fscore_support(y_test, preds)

display(precision)
display(recall)
display(f1)

[[1023   87]
 [ 562  197]]
[0.64542587 0.69366197]
[0.92162162 0.25955204]
[0.75918367 0.37775647]


Second choice - RFC

In [99]:
from sklearn.ensemble import RandomForestClassifier


rfc = RandomForestClassifier()

rfc.fit(X_train_vect, y_train)

preds = rfc.predict(X_test_vect)

conf = confusion_matrix(y_test, preds)

display(conf)

precision, recall, f1, _ = precision_recall_fscore_support(y_test, preds)

display(precision)
display(recall)
display(f1)

array([[1109,    1],
       [ 645,  114]], dtype=int64)

array([0.6322691 , 0.99130435])

array([0.9990991 , 0.15019763])

array([0.77444134, 0.26086957])