In [125]:
import os
import pandas as pd
import re

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [126]:
# Generic header block used in place of a message's real headers.
# `{subject}` is the only placeholder; it is filled in with
# `CLEAN_HEADER.format(subject=...)`. Every other field is a fixed value,
# so all formatted messages share identical header metadata.
CLEAN_HEADER = """Subject: {subject}
Message-ID: <GTUBE1.1010101@example.net>
Date: Wed, 23 Jul 2003 23:30:00 +0200
From: Sender <sender@example.net>
To: Recipient <recipient@example.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit"""

**CLEANING CSV DATA**

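Prepend a generic, neutral header block to each message, filling in the subject taken from the message's first line (a placeholder subject is used when no subject line is found).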

In [127]:
# Load the two raw CSV corpora.
enron = pd.read_csv('data/csvs/enronSpamHam.csv')
ling = pd.read_csv('data/csvs/lingSpamHam.csv')

# Show which columns each file provides before trimming them down.
print(enron.columns, ling.columns, sep='\n')

# Keep only the message body and its label, renamed to the shorter
# column names used by the rest of the notebook.
_csv_columns = {'Body': 'text', 'Label': 'label'}
enron = enron[list(_csv_columns)].rename(columns=_csv_columns)
ling = ling[list(_csv_columns)].rename(columns=_csv_columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Body', 'Label'], dtype='object')
Index(['Unnamed: 0', 'Body', 'Label'], dtype='object')


In [128]:
# Pull the message-text and label columns out of each corpus.
enron_text, enron_labels = enron['text'], enron['label']
ling_text, ling_labels = ling['text'], ling['label']

In [129]:
def extract_subj_body_csv(text):
    """Split a raw CSV message into its subject and body.

    The first line is treated as the subject only when it has the form
    ``Subject: ...`` (case-insensitive); everything after it is the body.

    Args:
        text: The raw message text.

    Returns:
        A ``(subject, body)`` tuple of stripped strings. When the first line
        is not a subject line, the subject is ``"Placeholder Subject"`` and
        the body is the whole (stripped) message, so no content is lost.
        A message consisting of a subject line only yields an empty body.
    """
    # partition never raises on single-line input, unlike indexing the
    # result of split('\n', 1).
    first_line, _, rest = text.partition('\n')
    subj_match = re.match(r'^subject:(.*)$', first_line, re.IGNORECASE)
    if subj_match is None:
        # The first line is ordinary content, so keep it as part of the body.
        return "Placeholder Subject", text.strip()
    return subj_match.group(1).strip(), rest.strip()

In [130]:
def add_header_csv(text):
    """Return the message with the generic header block placed before its body.

    The subject and body come from ``extract_subj_body_csv``; the subject is
    substituted into ``CLEAN_HEADER`` and a blank line separates the header
    from the body.
    """
    subject, message_body = extract_subj_body_csv(text)
    header = CLEAN_HEADER.format(subject=subject)
    return header + '\n\n' + message_body

In [131]:
# Put the generic header on every message of both CSV corpora.
enron_formatted, ling_formatted = (
    texts.apply(add_header_csv) for texts in (enron_text, ling_text)
)

In [132]:
# Pair the formatted text (generic header added) with its labels ...
new_enron_formatted = pd.DataFrame(dict(text=enron_formatted, label=enron_labels))
new_ling_formatted = pd.DataFrame(dict(text=ling_formatted, label=ling_labels))

# ... and do the same for the original, untouched text.
new_enron_unformatted = pd.DataFrame(dict(text=enron_text, label=enron_labels))
new_ling_unformatted = pd.DataFrame(dict(text=ling_text, label=ling_labels))

In [133]:
# to_csv does not create missing parent directories, so make sure both
# output folders exist before writing (keeps a fresh checkout runnable).
for out_dir in ('data/formattedData', 'data/unformattedData'):
    os.makedirs(out_dir, exist_ok=True)

# Formatted variants: messages carrying the generic header block.
new_enron_formatted.to_csv('data/formattedData/enronFormatted.csv', index=False)
new_ling_formatted.to_csv('data/formattedData/lingFormatted.csv', index=False)

# Unformatted variants: the original message text, unchanged.
new_enron_unformatted.to_csv('data/unformattedData/enronUnformatted.csv', index=False)
new_ling_unformatted.to_csv('data/unformattedData/lingUnformatted.csv', index=False)

**CLEANING SPAMASSASSIN CORPUS DATA**

Replace each message's original headers with the generic, neutral header block, carrying over only the original subject line.

In [134]:
# Directories that are scanned for raw email files, one message per file.
# Messages read from the spam directory are labelled 1 and those from the
# ham directory are labelled 0 further down in this cell.
corpus_spam_path = "data/spam_assassin_corpus/spam_2"
corpus_ham_path = "data/spam_assassin_corpus/easy_ham"

def is_plaintext(text):
    """Tell whether an email's header block declares a text/plain content type.

    The header block is everything before the first blank line; the check
    is a case-insensitive substring match on ``content-type: text/plain``.
    """
    header_block, _, _ = text.partition("\n\n")
    return "content-type: text/plain" in header_block.strip().lower()

def get_plaintext_corpus(path, encoding="utf-8"):
    """Read every plain-text email stored in a directory.

    Args:
        path: Directory containing one email per file.
        encoding: Text encoding used to decode each file. Given explicitly
            so the set of decodable files does not depend on the machine's
            locale.

    Returns:
        A list with the full raw text of each file for which
        ``is_plaintext`` is true, ordered by file name.

    Notes:
        Reading is best-effort: entries that are not regular files, and
        files that cannot be read or decoded, are skipped rather than
        aborting the whole scan.
    """
    plaintext_emails = []
    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting list (and any CSV built from it) reproducible.
    for fn in sorted(os.listdir(path)):
        full_path = os.path.join(path, fn)
        if not os.path.isfile(full_path):
            # Subdirectories cannot be opened as files; skip them.
            continue
        try:
            with open(full_path, encoding=encoding) as f:
                text = f.read()
        except (OSError, UnicodeDecodeError):
            # Only I/O and decoding failures are skipped; anything else
            # (including KeyboardInterrupt) still propagates.
            continue
        if is_plaintext(text):
            plaintext_emails.append(text)
    return plaintext_emails

# Collect the plain-text messages of each class.
spam_corpus_text = get_plaintext_corpus(corpus_spam_path)
ham_corpus_text = get_plaintext_corpus(corpus_ham_path)

# One label per message: 1 for spam, 0 for ham.
spam_corpus_labels = [1] * len(spam_corpus_text)
ham_corpus_labels = [0] * len(ham_corpus_text)

print(len(spam_corpus_text), len(ham_corpus_text))

347 1891


In [135]:
def replace_header_corpus(email):
    """Swap an email's original header block for the generic one.

    The header block is everything before the first blank line. Only the
    subject is carried over from it into ``CLEAN_HEADER``.

    Args:
        email: Full raw text of one email (headers, blank line, body).

    Returns:
        ``CLEAN_HEADER`` with the subject filled in, a blank line, and the
        stripped original body. If the header block has no ``Subject:``
        line the subject is ``"Placeholder Subject"``; if there is no
        blank-line separator the body is empty.
    """
    # partition yields an empty body instead of raising when the message
    # has no blank line separating headers from body.
    header, _, body = email.partition("\n\n")

    # Anchor to the start of a line so only a real Subject header matches,
    # not a "subject:" substring inside another header's name or value.
    subj_match = re.search(
        r'^subject:(.*)$', header, flags=re.IGNORECASE | re.MULTILINE
    )
    subj = subj_match.group(1).strip() if subj_match else "Placeholder Subject"
    return CLEAN_HEADER.format(subject=subj) + "\n\n" + body.strip()

In [136]:
# Replace the original headers of every spam and ham message.
spam_corpus_text_formatted = list(map(replace_header_corpus, spam_corpus_text))
ham_corpus_text_formatted = list(map(replace_header_corpus, ham_corpus_text))

In [137]:
# Spam first, then ham; the labels are concatenated in the same order so
# each text stays aligned with its label.
corpus_text_formatted = spam_corpus_text_formatted + ham_corpus_text_formatted
corpus_labels = spam_corpus_labels + ham_corpus_labels

new_corpus_formatted = pd.DataFrame({'text': corpus_text_formatted, 'label': corpus_labels})

# to_csv does not create missing parent directories, so ensure the output
# folder exists before writing.
os.makedirs('data/formattedData', exist_ok=True)
new_corpus_formatted.to_csv('data/formattedData/SAcorpusFormatted.csv', index=False)

In [138]:
# Original (header-intact) messages, spam first then ham, matching the
# order in which corpus_labels was built.
corpus_text_unformatted = spam_corpus_text + ham_corpus_text
new_corpus_unformatted = pd.DataFrame({'text': corpus_text_unformatted, 'label': corpus_labels})

# to_csv does not create missing parent directories, so ensure the output
# folder exists before writing.
os.makedirs('data/unformattedData', exist_ok=True)
new_corpus_unformatted.to_csv('data/unformattedData/SAcorpusUnformatted.csv', index=False)
