# Data Extraction

In [1]:
import io
import os
import re
import tarfile
from pathlib import Path

import chardet
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")

23/06/19 16:48:23 WARN Utils: Your hostname, mr.local resolves to a loopback address: 127.0.0.1; using 192.168.15.7 instead (on interface en0)
23/06/19 16:48:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/19 16:48:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
APACHE_SPAM_ASSASSIN = "https://spamassassin.apache.org/old/publiccorpus"
DATA_FOLDER = '../data'

In [4]:
def pull_data():
    response = requests.get(APACHE_SPAM_ASSASSIN)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    file_paths = [link.get('href') for link in soup.find_all('a')]
    file_paths = [path for path in file_paths if path.split('.')[-1] == 'bz2']

    available_data = set(os.listdir(DATA_FOLDER))
    file_paths = [path for path in file_paths if path not in available_data and path != 'corpus.parquet']

    if len(file_paths) == 0:
        print('No data to pull')

    for file_path in file_paths:
        print(f'Pulling {file_path}')
        response = requests.get(f"{APACHE_SPAM_ASSASSIN}/{file_path}")
        response.raise_for_status()

        file_object = io.BytesIO(response.content)
        tar = tarfile.open(fileobj=file_object, mode="r:bz2")

        extract_dir = Path(DATA_FOLDER)
        extract_path = extract_dir.joinpath(Path(file_path))

        extract_dir.mkdir(exist_ok=True)
        tar.extractall(extract_path)

        tar.close()

    size = get_directory_size("../data")
    print(f"Data directory size: {size} bytes")


def get_directory_size(directory):
    total = 0
    for dirpath, dirnames, filenames in os.walk(directory):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if not os.path.islink(fp):
                total += os.path.getsize(fp)

    return total

In [10]:
pull_data()

No data to pull
Data directory size: 121604826 bytes


In [5]:
def parse_data():
    successful_files = []
    failed_files = []
    corpus_df = pd.DataFrame(columns=['date', 'difficulty', 'category', 'collection', 'body'])
    for dirpath, dirnames, filenames in os.walk(DATA_FOLDER):
        print(dirpath)
        for filename in filenames:
            corpus_path = os.path.join(dirpath, filename)
            corpus_df, successful_files, failed_files = incorporate(corpus_df, corpus_path, dirpath, successful_files,
                                                                    failed_files)
    return corpus_df, successful_files, failed_files


def incorporate(df, corpus_path, dirpath, successful_files, failed_files):
    rawdata = open(corpus_path, 'rb').read()
    result = chardet.detect(rawdata)
    encoding = result['encoding']

    with open(corpus_path, 'r', encoding=encoding) as f:
        [(date, *ids)] = re.findall(r'([\d]{8})_([a-z]*)_?([a-z]*)_?([\d]*)\.tar\.bz2.*',
                                    dirpath)
        [difficulty, category, collection] = parse_ids(ids)
        try:
            body = f.read()
            row = pd.DataFrame({'date': date,
                                'difficulty': difficulty,
                                'category': category,
                                'collection': collection,
                                'body': body}, index=[0, 1, 2, 3, 4])
            df = pd.concat([df, row], ignore_index=True)
            successful_files.append(corpus_path)
        except Exception as E:
            print(E)
            failed_files.append(corpus_path)
    return df, successful_files, failed_files


def parse_ids(ids):
    cls, difficulty, collection = None, None, None
    match ids:
        case [cls, '', '']:
            difficulty = None
            collection = None
        case [difficulty, cls, '']:
            collection = None
        case [difficulty, cls, collection]:
            pass
        case _:
            pass

    return [difficulty, cls, collection]


def save_as_parquet(df):
    spark_df = spark.createDataFrame(df)
    spark_df.write.parquet(f'{DATA_FOLDER}/corpus.parquet')

In [6]:
if 'corpus.parquet' in set(os.listdir(DATA_FOLDER)):
    df = spark.read.parquet(f'{DATA_FOLDER}/corpus.parquet')
    corpus_df = df.toPandas()
else:
    corpus_df, successful_files, failed_files = parse_data()
    print(failed_files)
    save_as_parquet(corpus_df)

                                                                                

In [7]:
corpus_df

Unnamed: 0,date,difficulty,category,collection,body
0,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
1,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
2,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
3,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
4,20030228,hard,ham,,Return-Path: <bounce-lghtml-2534368@sprocket.l...
...,...,...,...,...,...
53650,20030228,easy,ham,,From exmh-workers-admin@redhat.com Tue Aug 27...
53651,20030228,easy,ham,,From exmh-workers-admin@redhat.com Tue Aug 27...
53652,20030228,easy,ham,,From exmh-workers-admin@redhat.com Tue Aug 27...
53653,20030228,easy,ham,,From exmh-workers-admin@redhat.com Tue Aug 27...


# Preprocessing

## Categorical or Binary Features

In [8]:
def extract_headers(row):
    row['has_headers'] = True
    try:
        [headers, body] = row['body'].split('\n\n', maxsplit=1)
        header_rows = headers.split('\n')
        current_key = None
        for h in header_rows:
            row, current_key = parse_header(row, h, current_key)
        row['body'] = body
    except Exception as e:
        row['header_error'] = e
        row['has_headers'] = False
    return row


def parse_header(row, h, current_key):
    match re.split(r"([\w\-]+)\:\s*", h):
        case ['', key, value]:
            row[key.lower()] = value
            current_key = key.lower()
        case [value]:
            if current_key:
                row[current_key] += value
        case _:
            pass
    return row, current_key

In [9]:
preproc_df = corpus_df.drop_duplicates(subset='body')
preproc_df = preproc_df.apply(extract_headers, axis=1)

In [10]:
reduced_preproc_df = preproc_df.dropna(thresh=1000, axis=1)

In [11]:
def extract_email_address(row, cols):
    for col in cols:
        if str(row[col]) and type(row[col]) != list:
            row[col] = re.findall(r"([^\s\<]+\@[^\s\>]+)", str(row[col])) or None
    return row

In [12]:
envelope_cols = ['delivered-to', 'errors-to', 'from', 'in-reply-to', 'list-id', 'message-id', 'received', 'references',
                 'reply-to', 'return-path', 'sender', 'to', 'x-beenthere']
extracted_emails_df = reduced_preproc_df.apply(lambda r: extract_email_address(r, envelope_cols), axis=1)

In [13]:
matching_cols = [
    ['delivered-to', 'to'],
    ['errors-to', 'from'],
    ['errors-to', 'return-path'],
    ['from', 'reply-to'],
    ['from', 'return-path'],
    ['from', 'sender'],
    ['x-beenthere', 'list-id']
]


def envelope_cols_match(row):
    for [col1, col2] in matching_cols:
        values_exist = row[col1] and row[col2]
        if not values_exist:
            return row
        row[f'feat-match-{col1}-{col2}'] = 1 if set(row[col1]) == set(row[col2]) else 0
    return row

In [14]:
matched_envelopes_df = extracted_emails_df.apply(envelope_cols_match, axis=1)

In [15]:
def get_feat_cols(df):
    return [col for col in df.columns if re.match(r"^feat\-.*$", col) is not None]

In [16]:
matched_envelopes_df[get_feat_cols(matched_envelopes_df)].describe()

Unnamed: 0,feat-match-delivered-to-to,feat-match-errors-to-from,feat-match-errors-to-return-path,feat-match-from-reply-to,feat-match-from-return-path,feat-match-from-sender
count,7760.0,4569.0,4568.0,1662.0,1662.0,1654.0
mean,0.147938,0.003502,0.992557,0.29302,0.010229,0.00786
std,0.355062,0.059079,0.085961,0.455285,0.100648,0.088333
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
message_ids = {}


def duplicate_message_id(row):
    if not row['message-id']:
        matched_envelopes_df.loc[row.name, 'feat-duplicate-message-id'] = 0
        return row

    for message_id in row['message-id']:
        if message_id not in message_ids:
            message_ids[message_id] = row.name
            matched_envelopes_df.loc[row.name, 'feat-duplicate-message-id'] = 0
        else:
            matched_envelopes_df.loc[message_ids[message_id], 'feat-duplicate-message-id'] = 1
    return row


matched_envelopes_df.apply(duplicate_message_id, axis=1)
duplicate_messages_df = matched_envelopes_df

In [18]:
def prepare_for_encoding(row, col):
    if pd.isna(row[col]):
        return row

    row[col] = row[col].lower()
    row[col] = row[col].split(' ')[0]
    return row


def one_hot_encode(df, col):
    cte_df = df.apply(lambda r: prepare_for_encoding(r, col), axis=1)
    encoded_cte_df = pd.get_dummies(cte_df[col], prefix=f'feat-encoded-{col}', dtype=int)
    return pd.concat([cte_df, encoded_cte_df], axis=1)


with_encoded_cte_df = one_hot_encode(duplicate_messages_df, 'content-transfer-encoding')

In [19]:
with_encoded_precedence_df = one_hot_encode(with_encoded_cte_df, 'precedence')
with_encoded_precedence_df[get_feat_cols(with_encoded_precedence_df)]

Unnamed: 0,feat-match-delivered-to-to,feat-match-errors-to-from,feat-match-errors-to-return-path,feat-match-from-reply-to,feat-match-from-return-path,feat-match-from-sender,feat-duplicate-message-id,feat-encoded-content-transfer-encoding_7bit,feat-encoded-content-transfer-encoding_8bit,feat-encoded-content-transfer-encoding_base64,feat-encoded-content-transfer-encoding_binary,feat-encoded-content-transfer-encoding_quoted-printable,feat-encoded-precedence_bulk,feat-encoded-precedence_first-class,feat-encoded-precedence_list,feat-encoded-precedence_normal
0,,,,,,,0.0,0,0,0,0,0,0,0,0,0
5,0.0,,,,,,1.0,1,0,0,0,0,1,0,0,0
10,,,,,,,1.0,1,0,0,0,0,0,0,0,0
15,,,,,,,1.0,0,0,0,0,1,0,0,0,0
20,,,,,,,1.0,1,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53629,0.0,0.0,1.0,0.0,0.0,0.0,,0,0,0,0,0,1,0,0,0
53634,0.0,,,,,,,0,1,0,0,0,0,0,0,0
53639,0.0,,,,,,,0,0,0,0,0,0,0,0,0
53644,1.0,0.0,1.0,,,,,1,0,0,0,0,1,0,0,0


In [20]:
with_encoded_mailman_version_df = one_hot_encode(with_encoded_precedence_df, 'x-mailman-version')
with_encoded_mailman_version_df[get_feat_cols(with_encoded_mailman_version_df)]

Unnamed: 0,feat-match-delivered-to-to,feat-match-errors-to-from,feat-match-errors-to-return-path,feat-match-from-reply-to,feat-match-from-return-path,feat-match-from-sender,feat-duplicate-message-id,feat-encoded-content-transfer-encoding_7bit,feat-encoded-content-transfer-encoding_8bit,feat-encoded-content-transfer-encoding_base64,...,feat-encoded-x-mailman-version_2.0.1,feat-encoded-x-mailman-version_2.0.10,feat-encoded-x-mailman-version_2.0.11,feat-encoded-x-mailman-version_2.0.3,feat-encoded-x-mailman-version_2.0.6,feat-encoded-x-mailman-version_2.0.7,feat-encoded-x-mailman-version_2.0.8,feat-encoded-x-mailman-version_2.0.9-sf.net,feat-encoded-x-mailman-version_2.1b5,feat-encoded-x-mailman-version_2.1b5+
0,,,,,,,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0.0,,,,,,1.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10,,,,,,,1.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
15,,,,,,,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,,,,,,,1.0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53629,0.0,0.0,1.0,0.0,0.0,0.0,,0,0,0,...,0,0,1,0,0,0,0,0,0,0
53634,0.0,,,,,,,0,1,0,...,0,0,0,0,0,0,0,0,0,0
53639,0.0,,,,,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53644,1.0,0.0,1.0,,,,,1,0,0,...,0,0,1,0,0,0,0,0,0,0


In [21]:
def parse_msmail_priority(row):
    if pd.isna(row['x-msmail-priority']):
        return row

    match row['x-msmail-priority']:
        case 'normal':
            row['feat-x-msmail-priority'] = 1
        case 'medium':
            row['feat-x-msmail-priority'] = 1
        case 'low':
            row['feat-x-msmail-priority'] = 2
        case 'high':
            row['feat-x-msmail-priority'] = 3
        case _:
            row['feat-x-msmail-priority'] = -1

    return row


with_msmail_priority_df = with_encoded_mailman_version_df.apply(lambda r: prepare_for_encoding(r, 'x-msmail-priority'),
                                                                axis=1)
with_categorical_msmail_priority_df = with_msmail_priority_df.apply(parse_msmail_priority, axis=1)
with_categorical_msmail_priority_df['feat-x-msmail-priority'].describe()

count    1149.000000
mean        1.075718
std         0.387537
min        -1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: feat-x-msmail-priority, dtype: float64

In [22]:
def parse_priority(row):
    if pd.isna(row['x-priority']):
        return row

    priority = row['x-priority']
    if re.findall(r"Highest", priority):
        row['feat-x-priority'] = 0
    else:
        row['feat-x-priority'] = int(re.search(r'\d', priority).group())
    return row


with_categorical_priority_df = with_categorical_msmail_priority_df.apply(parse_priority, axis=1)
with_categorical_priority_df['feat-x-priority'].describe()

count    1488.000000
mean        2.828629
std         0.588674
min         0.000000
25%         3.000000
50%         3.000000
75%         3.000000
max         5.000000
Name: feat-x-priority, dtype: float64

In [23]:
def parse_spam_status(row):
    spam_status = row['x-spam-status']
    if pd.isna(spam_status):
        return row

    match spam_status.split(', ')[0].lower():
        case 'no':
            row['feat-x-spam-status'] = 0
        case 'yes':
            row['feat-x-spam-status'] = 1
        case _:
            row['feat-x-spam-status'] = -1
    return row


with_binary_spam_status_df = with_categorical_priority_df.apply(parse_spam_status, axis=1)
with_binary_spam_status_df['feat-x-spam-status'].describe()

count    2155.000000
mean        0.007425
std         0.085866
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: feat-x-spam-status, dtype: float64

In [24]:
def parse_spam_level(row):
    spam_level = row['x-spam-level']
    if pd.isna(spam_level):
        return row

    row['feat-x-spam-level'] = len(spam_level.rstrip().lstrip())
    return row


with_categorical_spam_level_df = with_binary_spam_status_df.apply(parse_spam_level, axis=1)
with_categorical_spam_level_df['feat-x-spam-level'].describe()

count    2156.000000
mean        0.067718
std         0.628818
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        11.000000
Name: feat-x-spam-level, dtype: float64

In [25]:
embedding_cols = ['cc', 'content-type', 'subject', 'x-mailer']

In [26]:
def get_cc_count(row):
    if pd.isna(row['cc']):
        return row

    row['feat-cc-count'] = len(row['cc'].split(','))
    return row


with_cc_count_df = with_categorical_spam_level_df.apply(get_cc_count, axis=1)
with_cc_count_df['feat-cc-count'].describe()

count    1962.000000
mean        3.007136
std         6.011204
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        73.000000
Name: feat-cc-count, dtype: float64

In [27]:
ct_subfields = set()


def parse_content_type(row):
    content_type = row['content-type']
    if pd.isna(content_type):
        return row

    value = content_type.split(';')[0]
    row['content-type'] = value

    subfields = re.findall(r"([\w]+)\=[\"]?(.+?)[\"]?[\;|\n|$]", re.sub('[\t\s\n]+?', '', content_type))
    for (subfield, subvalue) in subfields:
        ct_subfields.add(f'content-type-{subfield.lower()}')
        row[f'content-type-{subfield}'] = subvalue
    return row


with_content_type_df = with_cc_count_df.apply(parse_content_type, axis=1)
with_content_type_df[list(ct_subfields)].describe()

Unnamed: 0,content-type-protocol,content-type-type,content-type-charset,content-type-micalg,content-type-delsp,content-type-format,content-type-boundary
count,69,39,585,180,2,13,131
unique,2,2,7,3,1,1,77
top,application/pgp-signature,multipart/alternative,us-ascii,pgp-sha1,yes,flowed,=_NextPart_2rfkindysadvnqw3nerasdf
freq,67,34,388,168,2,13,6


In [28]:
with_encoded_charset_df = one_hot_encode(with_content_type_df, 'content-type-charset')
with_encoded_charset_df[get_feat_cols(with_encoded_charset_df)]

Unnamed: 0,feat-cc-count,feat-duplicate-message-id,feat-encoded-content-transfer-encoding_7bit,feat-encoded-content-transfer-encoding_8bit,feat-encoded-content-transfer-encoding_base64,feat-encoded-content-transfer-encoding_binary,feat-encoded-content-transfer-encoding_quoted-printable,feat-encoded-precedence_bulk,feat-encoded-precedence_first-class,feat-encoded-precedence_list,...,feat-match-from-return-path,feat-match-from-sender,feat-x-msmail-priority,feat-x-priority,feat-x-spam-level,feat-x-spam-status,feat-encoded-content-type-charset_iso-8859-1,feat-encoded-content-type-charset_iso-8859-15,feat-encoded-content-type-charset_us-ascii,feat-encoded-content-type-charset_windows-1252
0,,0.0,0,0,0,0,0,0,0,0,...,,,,,,,0,0,0,0
5,,1.0,1,0,0,0,0,1,0,0,...,,,,,,,0,0,1,0
10,,1.0,1,0,0,0,0,0,0,0,...,,,,,,,0,0,0,0
15,,1.0,0,0,0,0,1,0,0,0,...,,,,,,,0,0,0,0
20,,1.0,1,0,0,0,0,1,0,0,...,,,,,,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53629,,,0,0,0,0,0,1,0,0,...,0.0,0.0,,,,,0,0,0,0
53634,2.0,,0,1,0,0,0,0,0,0,...,,,,,,,0,0,0,0
53639,,,0,0,0,0,0,0,0,0,...,,,,,,,0,0,0,0
53644,,,1,0,0,0,0,1,0,0,...,,,,,,,0,0,1,0


In [29]:
with_encoded_charset_df['feat-content-type-boundary'] = with_encoded_charset_df['content-type-boundary'].isna().astype(
    int)
with_binary_boundary_df = with_encoded_charset_df
with_binary_boundary_df[get_feat_cols(with_binary_boundary_df)]

Unnamed: 0,feat-cc-count,feat-duplicate-message-id,feat-encoded-content-transfer-encoding_7bit,feat-encoded-content-transfer-encoding_8bit,feat-encoded-content-transfer-encoding_base64,feat-encoded-content-transfer-encoding_binary,feat-encoded-content-transfer-encoding_quoted-printable,feat-encoded-precedence_bulk,feat-encoded-precedence_first-class,feat-encoded-precedence_list,...,feat-match-from-sender,feat-x-msmail-priority,feat-x-priority,feat-x-spam-level,feat-x-spam-status,feat-encoded-content-type-charset_iso-8859-1,feat-encoded-content-type-charset_iso-8859-15,feat-encoded-content-type-charset_us-ascii,feat-encoded-content-type-charset_windows-1252,feat-content-type-boundary
0,,0.0,0,0,0,0,0,0,0,0,...,,,,,,0,0,0,0,1
5,,1.0,1,0,0,0,0,1,0,0,...,,,,,,0,0,1,0,1
10,,1.0,1,0,0,0,0,0,0,0,...,,,,,,0,0,0,0,1
15,,1.0,0,0,0,0,1,0,0,0,...,,,,,,0,0,0,0,1
20,,1.0,1,0,0,0,0,1,0,0,...,,,,,,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53629,,,0,0,0,0,0,1,0,0,...,0.0,,,,,0,0,0,0,1
53634,2.0,,0,1,0,0,0,0,0,0,...,,,,,,0,0,0,0,1
53639,,,0,0,0,0,0,0,0,0,...,,,,,,0,0,0,0,1
53644,,,1,0,0,0,0,1,0,0,...,,,,,,0,0,1,0,1


In [30]:
def parse_mailer(row):
    x_mailer = row['x-mailer']
    if pd.isna(x_mailer):
        return row

    result = re.sub(r"[\(\[\{].*?[\)\]\}]", "", x_mailer)
    result = re.sub(r"([\w\-]*?([\d]+\.)+[\+\d\w\-]*?)", "", result)
    result = re.sub(r"([\d\/]+)", "", result)
    result = result.lower()
    result = re.sub(r"version|with", "", result)
    result = re.sub(r"[\W]", "", result)

    row['x-mailer'] = result
    return row


with_mailer_df = with_binary_boundary_df.apply(parse_mailer, axis=1)
with_mailer_df['x-mailer'].describe()

count                        3370
unique                        152
top       microsoftoutlookexpress
freq                          652
Name: x-mailer, dtype: object

In [31]:
def bin_mailers(row, mailers):
    mailer = row['x-mailer']
    if pd.isna(mailer):
        return row

    if mailer not in mailers:
        row['x-mailer'] = 'other'

    return row


def get_top_mailers(df):
    spam_counts = df[df['category'] == 'spam']['x-mailer'].value_counts().sort_values(ascending=False)
    ham_counts = df[df['category'] == 'ham']['x-mailer'].value_counts().sort_values(ascending=False)
    ham = pd.DataFrame(ham_counts).rename(columns={'count': 'ham'})
    spam = pd.DataFrame(spam_counts).rename(columns={'count': 'spam'})
    mailer_counts = pd.concat([ham, spam], axis=1).fillna(0)
    mailer_counts['total'] = mailer_counts['spam'] + mailer_counts['ham']
    mailer_counts['spam_ratio'] = mailer_counts['spam'] / mailer_counts['total']
    return list(mailer_counts.sort_values(by=['total', 'spam_ratio'], ascending=False).iloc[:10].index)


top_mailers = get_top_mailers(with_mailer_df)
with_binned_mailer_df = with_mailer_df.apply(lambda r: bin_mailers(r, top_mailers), axis=1)
with_binned_mailer_df['x-mailer'].describe()

count      3370
unique       11
top       other
freq       1180
Name: x-mailer, dtype: object

In [32]:
with_encoded_mailer_df = one_hot_encode(with_binned_mailer_df, 'x-mailer')
with_encoded_mailer_df[get_feat_cols(with_encoded_mailer_df)]

Unnamed: 0,feat-cc-count,feat-duplicate-message-id,feat-encoded-content-transfer-encoding_7bit,feat-encoded-content-transfer-encoding_8bit,feat-encoded-content-transfer-encoding_base64,feat-encoded-content-transfer-encoding_binary,feat-encoded-content-transfer-encoding_quoted-printable,feat-encoded-precedence_bulk,feat-encoded-precedence_first-class,feat-encoded-precedence_list,...,feat-encoded-x-mailer_exmh,feat-encoded-x-mailer_internetmailservice,feat-encoded-x-mailer_microsoftcdoforwindows,feat-encoded-x-mailer_microsoftoutlookbuild,feat-encoded-x-mailer_microsoftoutlookexpress,feat-encoded-x-mailer_microsoftoutlookimobuild,feat-encoded-x-mailer_mozilla,feat-encoded-x-mailer_other,feat-encoded-x-mailer_sylpheedclaws,feat-encoded-x-mailer_ximianevolution
0,,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,,1.0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
10,,1.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15,,1.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,,1.0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53629,,,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
53634,2.0,,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
53639,,,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
53644,,,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Text Features

In [34]:
with_encoded_mailer_df['subject']

0            [Lockergnome Windows Daily]  Sticker Courtesy
5                                                      NaN
10       Great deals on perfect Summer cameras! (CNET S...
15       How Microsoft plans to take over your living r...
20            [NOVICE] pl/pgsql and returns timestamp type
                               ...                        
53629                                                  NaN
53634                                                  NaN
53639                        Speaking in Texas this Friday
53644                                                  NaN
53649                                                  NaN
Name: subject, Length: 9154, dtype: object

In [36]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
model = AutoModel.from_pretrained('bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [72]:
embeddings = {}


def tokenize_email(row):
    subject = row['subject']
    if pd.isna(subject):
        return row

    encoded_input = tokenizer(subject, return_tensors='pt')
    output = model(**encoded_input)
    email_embedding = output.last_hidden_state.mean(dim=1)
    embeddings[row.name] = email_embedding.detach().numpy()
    return row


with_encoded_mailer_df.apply(tokenize_email, axis=1)
embeddings.keys()

dict_keys([0, 10, 15, 20, 25, 35, 40, 50, 55, 70, 75, 80, 85, 95, 110, 125, 130, 135, 140, 150, 155, 160, 165, 175, 180, 190, 195, 205, 210, 215, 220, 230, 245, 250, 255, 260, 265, 285, 290, 300, 305, 320, 325, 330, 345, 355, 360, 365, 375, 380, 385, 390, 395, 400, 405, 410, 415, 420, 425, 430, 440, 450, 470, 475, 485, 495, 500, 510, 525, 530, 535, 540, 550, 560, 565, 570, 575, 580, 585, 590, 595, 600, 605, 610, 615, 620, 625, 630, 635, 640, 650, 655, 660, 675, 680, 695, 715, 720, 725, 740, 745, 750, 755, 760, 765, 780, 790, 795, 800, 805, 810, 820, 825, 835, 845, 850, 855, 860, 865, 870, 880, 890, 900, 910, 915, 920, 930, 935, 945, 970, 975, 980, 985, 990, 995, 1000, 1015, 1025, 1035, 1045, 1050, 1060, 1070, 1075, 1080, 1085, 1095, 1100, 1105, 1110, 1115, 1120, 1125, 1140, 1150, 1160, 1175, 1195, 1200, 1210, 1215, 1220, 1225, 1230, 1245, 1250, 1260, 1265, 1275, 1290, 1305, 1310, 1320, 1330, 1335, 1340, 1355, 1360, 1365, 1370, 1375, 1385, 1390, 1395, 1410, 1415, 1420, 1430, 1455, 1460,

In [73]:
from sklearn.decomposition import PCA

embeddings_array = np.vstack(embeddings.values())
dimensions = 10
pca = PCA(n_components=dimensions)
reduced_embeddings = pca.fit_transform(embeddings_array)
reduced_embeddings

  embeddings_array = np.vstack(embeddings.values())


array([[-0.69894075, -1.0172611 ,  0.35746828, ..., -1.2071154 ,
         0.2889372 ,  0.3350447 ],
       [-1.7731428 ,  0.595579  ,  0.3318903 , ..., -1.1492454 ,
        -0.5359675 , -0.35100642],
       [-2.1570091 , -0.09000626, -0.41435495, ..., -0.77879965,
        -0.42611787,  0.06220271],
       ...,
       [-0.6522845 , -0.09001421, -1.0258795 , ..., -0.00417939,
         2.525723  , -0.3599519 ],
       [-0.44558877, -0.20607898, -0.21497835, ..., -1.027054  ,
        -0.21153268,  0.34008992],
       [ 0.986225  ,  0.6319695 , -1.0163678 , ..., -0.9033133 ,
        -0.25137243, -0.4272848 ]], dtype=float32)

In [74]:
with_tokenized_subject_df = with_encoded_mailer_df
i = 0
for dim in range(dimensions):
    with_tokenized_subject_df[f'feat-subject-embedding-{dim}'] = np.nan

for idx in embeddings.keys():
    for dim in range(dimensions):
        with_tokenized_subject_df.loc[idx, f'feat-subject-embedding-{dim}'] = reduced_embeddings[i][dim]
    i += 1

with_tokenized_subject_df[get_feat_cols(with_tokenized_subject_df)]

Unnamed: 0,feat-cc-count,feat-duplicate-message-id,feat-encoded-content-transfer-encoding_7bit,feat-encoded-content-transfer-encoding_8bit,feat-encoded-content-transfer-encoding_base64,feat-encoded-content-transfer-encoding_binary,feat-encoded-content-transfer-encoding_quoted-printable,feat-encoded-precedence_bulk,feat-encoded-precedence_first-class,feat-encoded-precedence_list,...,feat-subject-embedding-0,feat-subject-embedding-1,feat-subject-embedding-2,feat-subject-embedding-3,feat-subject-embedding-4,feat-subject-embedding-5,feat-subject-embedding-6,feat-subject-embedding-7,feat-subject-embedding-8,feat-subject-embedding-9
0,,0.0,0,0,0,0,0,0,0,0,...,-0.698941,-1.017261,0.357468,-0.379588,-1.680987,-0.042240,-0.179343,-1.207115,0.288937,0.335045
5,,1.0,1,0,0,0,0,1,0,0,...,,,,,,,,,,
10,,1.0,1,0,0,0,0,0,0,0,...,-1.773143,0.595579,0.331890,1.277573,-0.143662,-0.527984,0.594515,-1.149245,-0.535968,-0.351006
15,,1.0,0,0,0,0,1,0,0,0,...,-2.157009,-0.090006,-0.414355,0.713300,-0.002243,-0.657375,0.665949,-0.778800,-0.426118,0.062203
20,,1.0,1,0,0,0,0,1,0,0,...,-2.104642,-1.858202,0.353218,-0.002266,1.156176,0.989521,1.036399,1.093130,-0.053236,0.311117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53629,,,0,0,0,0,0,1,0,0,...,,,,,,,,,,
53634,2.0,,0,1,0,0,0,0,0,0,...,,,,,,,,,,
53639,,,0,0,0,0,0,0,0,0,...,0.986225,0.631970,-1.016368,-0.483061,0.258043,-0.659922,-0.543884,-0.903313,-0.251372,-0.427285
53644,,,1,0,0,0,0,1,0,0,...,,,,,,,,,,


In [91]:
def featurize_text(row, col, max_length):
    text = row[col]
    if pd.isna(text) or len(text) == 0:
        return row
    text = re.sub(r"[\t\s\n]+", "", text)
    row[f'feat-{col}-normalized-length'] = len(text) / max_length
    capital_letters, special_characters, digits = 0, 0, 0
    for s in text:
        if s.isupper():
            capital_letters += 1
        elif s.isdigit():
            digits += 1
        elif not s.isalnum():
            special_characters += 1
    row[f'feat-{col}-capital-letter-ratio'] = capital_letters / len(text)
    row[f'feat-{col}-special-character-ratio'] = special_characters / len(text)
    row[f'feat-{col}-digit-ratio'] = digits / len(text)

    return row


with_subject_features_df = with_tokenized_subject_df.apply(
    lambda r: featurize_text(r, max_length=with_tokenized_subject_df['subject'].str.len().max(), col='subject'),
    axis=1)

In [94]:
with_body_features_df = with_subject_features_df.apply(
    lambda r: featurize_text(r, max_length=with_subject_features_df['body'].str.len().max(), col='body'),
    axis=1)

In [96]:
spark_df = spark.createDataFrame(with_body_features_df[['category'] + get_feat_cols(with_body_features_df)])
spark_df.write.parquet(f'{DATA_FOLDER}/preprocessed.parquet', mode='overwrite')

23/06/19 18:33:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
23/06/19 18:33:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
23/06/19 18:33:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
23/06/19 18:33:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
23/06/19 18:33:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for 12 writers
23/06/19 18:33:27 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 58.46% for 13 writers
23/06/19 18:33:27 WARN MemoryManager: Total allocation exceeds 95.

# Training

In [97]:
df = spark.read.parquet(f'{DATA_FOLDER}/preprocessed.parquet')
preprocessed_df = df.toPandas()
preprocessed_df.describe()

Unnamed: 0,feat-cc-count,feat-content-type-boundary,feat-duplicate-message-id,feat-encoded-content-transfer-encoding_7bit,feat-encoded-content-transfer-encoding_8bit,feat-encoded-content-transfer-encoding_base64,feat-encoded-content-transfer-encoding_binary,feat-encoded-content-transfer-encoding_quoted-printable,feat-encoded-content-type-charset_iso-8859-1,feat-encoded-content-type-charset_iso-8859-15,...,feat-subject-normalized-length,feat-subject-special-character-ratio,feat-x-msmail-priority,feat-x-priority,feat-x-spam-level,feat-x-spam-status,feat-body-normalized-length,feat-body-capital-letter-ratio,feat-body-special-character-ratio,feat-body-digit-ratio
count,1962.0,9154.0,6109.0,9154.0,9154.0,9154.0,9154.0,9154.0,9154.0,9154.0,...,4798.0,4798.0,1149.0,1488.0,2156.0,2155.0,9154.0,9154.0,9154.0,9154.0
mean,3.007136,0.985689,0.478147,0.277802,0.105855,0.001092,0.002076,0.05189,0.006008,0.001639,...,0.156687,0.062115,1.075718,2.828629,0.067718,0.007425,0.010045,0.072883,0.144725,0.048042
std,6.011204,0.118775,0.499563,0.44794,0.307669,0.033036,0.045514,0.221817,0.077284,0.040449,...,0.076849,0.069987,0.387537,0.588674,0.628818,0.085866,0.029435,0.078673,0.075907,0.057052
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01005,0.0,-1.0,0.0,0.0,0.0,9e-05,0.0,0.001258,0.0
25%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.100503,0.0,1.0,3.0,0.0,0.0,0.00173,0.036157,0.080925,0.01101
50%,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.145729,0.05,1.0,3.0,0.0,0.0,0.00327,0.050542,0.134228,0.029155
75%,2.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.19598,0.095238,1.0,3.0,0.0,0.0,0.007013,0.073324,0.202592,0.066021
max,73.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.874372,0.714286,3.0,5.0,11.0,1.0,0.986811,0.966427,0.621425,0.65446
