In [14]:

from typing import List, Union
import datetime
import re
import os
from dateutil import parser
from tqdm import tqdm

import pandas as pd
import numpy as np
import email

### Utilities

In [13]:
def _split_addresses(msg, header: str):
    """Return the addresses of `header` as a list of strings, or NaN when the header is absent."""
    raw_value = msg[header]
    if raw_value is None:
        return np.nan
    # Remove all whitespace (including the line breaks of folded headers) before splitting on commas.
    return re.sub(r'\s+', '', str(raw_value)).split(',')


def _extract_body(msg) -> str:
    """Return the body of `msg`; a multipart message yields its text/* leaf parts joined by newlines."""
    if not msg.is_multipart():
        return msg.get_payload()
    text_parts = [
        part.get_payload()
        for part in msg.walk()
        if not part.is_multipart() and part.get_content_maintype() == 'text'
    ]
    return '\n'.join(text_parts)


def parse_email_document(path: str) -> Union[pd.DataFrame, None]:
    """
    Parses email raw file into a dataframe

    Parameters
    ----------
    path : str
        Location of one raw email file (headers, blank line, body).

    Returns
    -------
    pd.DataFrame or None
        A single-row dataframe with one column per header listed below plus
        ``body``. ``From``, ``To``, ``Cc`` and ``Bcc`` hold a list of addresses
        (whitespace removed, split on commas) or NaN when the header is missing;
        every other header holds the raw header value (None when missing).
        None is returned, after emitting a warning, when the file cannot be
        read or parsed, so batch callers can skip it.

    Notes
    -----
    Bytes that cannot be decoded with the default text encoding are replaced by
    U+FFFD instead of aborting the parse.
    """
    import warnings  # local import: only needed on the failure path

    try:
        # errors='replace' keeps files with stray non-decodable bytes parseable.
        with open(path, errors='replace') as f:
            contents = f.read()

        msg = email.message_from_string(contents)

        attributes = {
            "Message-ID": [msg["Message-ID"]],
            "Date": [msg["Date"]],
            "From": [_split_addresses(msg, 'From')],
            "To": [_split_addresses(msg, 'To')],
            "Subject": [msg["Subject"]],
            "Cc": [_split_addresses(msg, 'Cc')],
            "Mime-Version": [msg["Mime-Version"]],
            "Content-Type": [msg["Content-Type"]],
            "Content-Transfer-Encoding": [msg["Content-Transfer-Encoding"]],
            # Read from the Bcc header itself (it was previously copied from Cc).
            "Bcc": [_split_addresses(msg, 'Bcc')],
            "X-From": [msg["X-From"]],
            "X-To": [msg["X-To"]],
            "X-cc": [msg["X-cc"]],
            "X-bcc": [msg["X-bcc"]],
            "X-Folder": [msg["X-Folder"]],
            "X-Origin": [msg["X-Origin"]],
            "X-FileName": [msg["X-FileName"]],
            "body": [_extract_body(msg)],
        }
        return pd.DataFrame(attributes, columns=list(attributes))
    except Exception as exc:
        # Best effort: report the failure instead of hiding it, and let the caller skip this file.
        # `Exception` (not a bare except) so Ctrl-C still interrupts a long batch run.
        warnings.warn(f"Could not parse email file {path!r}: {exc!r}")
        return None


def get_email_paths(root_directory: str):
    """
    Collect the path of every file located under root_directory, at any depth.

    The tree is traversed with os.walk; each returned entry is the containing
    folder joined with the file name. Directories themselves are not listed.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(root_directory)
        for filename in filenames
    ]


def parse_multiple_emails_document(files: List[str]) -> pd.DataFrame:
    """
    Parse list of mails into a dataframe

    Parameters
    ----------
    files : List[str]
        Paths of the raw email files to parse.

    Returns
    -------
    pd.DataFrame
        One row per successfully parsed file, indexed 0..n-1. Files for which
        parse_email_document returns None are skipped. When nothing could be
        parsed (or `files` is empty) an empty dataframe with the expected
        columns is returned.
    """
    columns = ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Cc', 'Mime-Version',
       'Content-Type', 'Content-Transfer-Encoding', 'Bcc', 'X-From', 'X-To',
       'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'body']

    # Gather the per-file frames first and concatenate once: concatenating inside
    # the loop copies the accumulated frame on every iteration (quadratic cost).
    frames = [
        parsed
        for parsed in (parse_email_document(path) for path in tqdm(files))
        if parsed is not None
    ]

    if not frames:
        return pd.DataFrame(columns=columns)

    return pd.concat(frames, ignore_index=True)



In [17]:
def change_date_type(dates: Union[pd.DataFrame, pd.Series]) -> pd.Series:
    """
    Formats string column into datetime object

    Every value is parsed with dateutil's parser. Any timezone information and
    any sub-second component are discarded, so the result holds the wall-clock
    time exactly as written in the input, at second resolution.

    Parameters
    ----------
    dates : pd.Series (or any iterable of date strings)
        The textual dates to convert.

    Returns
    -------
    pd.Series
        datetime64 values in input order. When `dates` is a Series its index is
        kept, so assigning the result back to the originating dataframe column
        lines up row by row.
    """
    # Build the timestamps directly from the parsed datetimes. Going through a
    # "%d-%m-%Y" string and re-parsing it lets pandas read day-first text as
    # month-first, swapping day and month whenever the day is <= 12.
    parsed = [
        parser.parse(date).replace(tzinfo=None, microsecond=0)
        for date in dates
    ]

    index = dates.index if isinstance(dates, pd.Series) else None
    return pd.Series(pd.to_datetime(parsed), index=index)


def str_to_list(row):
    """Flatten the text form of a list into a plain comma-separated string.

    `row` is converted with str(), square brackets are trimmed from both ends
    and every single quote is removed. Despite the name, the result is a string
    (e.g. "a@x.com, b@x.com"), not a list.
    """
    text = str(row)
    without_brackets = text.strip("[]")
    return without_brackets.replace("'", "")


def parsed_email_processing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Basic email df formatting and cleaning

    Steps, applied to `df` in place (the same object is also returned):

    * ``Date``  - converted to datetimes with change_date_type.
    * ``body``  - newline and tab characters removed.
    * ``To`` / ``From`` - rendered as text, a leading bytes-literal prefix
      (the ``b`` of ``b'...'`` / ``b"..."``) is dropped, then str_to_list strips
      the surrounding brackets and single quotes.

    Parameters
    ----------
    df : pd.DataFrame
        Parsed emails with at least the columns Date, body, To and From.

    Returns
    -------
    pd.DataFrame
        `df` itself, after the modifications above.
    """
    # Only the "b" that opens a stringified bytes literal is removed. Replacing
    # every "b" would also delete the letter from the addresses themselves
    # (e.g. "bob.bowen@..." -> "o.owen@...").
    bytes_prefix = r"^b(?=['\"])"

    df['Date'] = change_date_type(df['Date'])

    df['body'] = df['body']\
        .str.replace('\n', '', regex=False)\
        .str.replace('\t', '', regex=False)

    for column in ('To', 'From'):
        df[column] = df[column].astype('str')\
            .str.replace(bytes_prefix, '', regex=True)\
            .apply(str_to_list)

    return df

### label selection and cleaning

#### Spam Data

In [20]:
# List every file found under the spam folder (subfolders included) ...
spam_paths = get_email_paths('../data/spam/')
# ... and parse them into a single dataframe.
spam_df = parse_multiple_emails_document(spam_paths)

100%|██████████| 1093/1093 [00:04<00:00, 233.22it/s]


In [62]:
# Label every parsed spam mail with the positive class (spam = 1); this adds the column to spam_df in place.
spam_df['spam'] = 1
# Preview the first rows of the labeled frame.
spam_df.head()

Unnamed: 0,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,body,spam
0,<8307461.1075860887277.JavaMail.evans@thyme>,"Tue, 3 Feb 2004 18:14:47 -0800 (PST)",[hotwebcash@lists.adversend.com],[m..presto@enron.com],Free Grants For Those In Need!,,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * HOTWEBCASH * $ * $ * $ * $\n\n...,1
1,<31088330.1075860887438.JavaMail.evans@thyme>,"Tue, 3 Feb 2004 18:14:47 -0800 (PST)",[hotwebcash@lists.adversend.com],[m..presto@enron.com],Your Opinion Counts...Win $1000!,,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * HOTWEBCASH * $ * $ * $ * $\n\n...,1
2,<8829953.1075860887541.JavaMail.evans@thyme>,"Tue, 3 Feb 2004 18:14:47 -0800 (PST)",[hotwebcash@lists.adversend.com],[m..presto@enron.com],LOSE POUNDS GUARANTEED!,,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * $ HOTWEBCASH $ * $ * $ * $ * $...,1
3,<25503194.1075860887462.JavaMail.evans@thyme>,"Tue, 3 Feb 2004 18:14:47 -0800 (PST)",[hotwebcash@lists.adversend.com],[m..presto@enron.com],Someone is searching for YOU!,,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * HOTWEBCASH * $ * $ * $ * $\n\n...,1
4,<3520246.1075860887301.JavaMail.evans@thyme>,"Tue, 3 Feb 2004 18:14:47 -0800 (PST)",[hotwebcash@lists.adversend.com],[m..presto@enron.com],"Take a Survey, Win a FREE New Computer!!",,1.0,text/plain; charset=us-ascii,7bit,,HotWebCash Newsletter <hotwebcash@lists.advers...,"Presto, Kevin M. </O=ENRON/OU=NA/CN=RECIPIENTS...",,,"\Kevin_Presto_Mar2002_1\Presto, Kevin M.\Junk ...",Presto-K,kpresto (Non-Privileged).pst,$ * $ * $ * $ * HOTWEBCASH * $ * $ * $ * $\n\n...,1


In [63]:
# Persist the labeled spam set as a gzip-compressed parquet file, written with the fastparquet engine.
spam_df.to_parquet('../data/labeled_data/spam.parquet.gzip', compression='gzip', engine='fastparquet')

#### Not Spam Data

In [21]:
# Load the full mail corpus from parquet; the non-spam candidates are sampled from it further down.
# NOTE(review): the path suggests this file is the output of an earlier preprocessing step — confirm its provenance.
complete_df = pd.read_parquet('../data/preprocessing_output/cleaned_mails_v0.parquet.gzip')

In [22]:
# Display the loaded corpus (pandas renders a truncated view: first and last rows only).
complete_df

Unnamed: 0,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,body
0,<17334447.1075857585446.JavaMail.evans@thyme>,2000-11-16 09:30:00,"[""msagel@home.com""]","[""jarnold@enron.com""]",Status,,1.0,text/plain; charset=ANSI_X3.4-1968,7bit,,"""Mark Sagel"" <msagel@home.com>","""John Arnold"" <jarnold@enron.com>",,,\John_Arnold_Dec2000\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,John:?I'm not really sure what happened betwee...
1,<19171686.1075857585034.JavaMail.evans@thyme>,2000-08-12 05:05:00,"[""slafontaine@gloalp.com""]","[""john.arnold@enron.com""]",re:summer inverses,,1.0,text/plain; charset=us-ascii,7bit,,slafontaine@globalp.com,John.Arnold@enron.com,,,\John_Arnold_Dec2000\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,i suck-hope youve made more money in natgas la...
2,<29887033.1075857630725.JavaMail.evans@thyme>,2001-05-15 09:43:00,"[""iceoperations@intcx.com""]","[""icehelpdesk@intcx.com"", ""internalmarketing@i...",The WTI Bullet swap contracts,,1.0,text/plain; charset=us-ascii,7bit,,ICE Operations <ICEOperations@intcx.com>,"**ICEHELPDESK <**ICEHELPDESK@intcx.com>, **Int...",,,\John_Arnold_Jun2001\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,"Hi, Following the e-mail you have received y..."
3,<29084893.1075849630138.JavaMail.evans@thyme>,2000-11-27 01:49:00,"[""jeff.youngflesh@enron.com""]","[""anthony.gilmore@enron.com"", ""colleen.koenig@...",Invitation: EBS/GSS Meeting w/Bristol Babcock ...,,1.0,text/plain; charset=us-ascii,7bit,,Jeff Youngflesh,"Anthony Gilmore, Colleen Koenig, Jennifer Stew...",,,\John_Arnold_Nov2001\Notes Folders\Notes inbox,ARNOLD-J,jarnold.nsf,Conference Room TBD. This meeting will be to ...
4,<30248874.1075857584813.JavaMail.evans@thyme>,2000-12-12 09:33:00,"[""caroline.aramo@enron.com""]","[""mike.grigsy@enron.com""]",Harvard Mgmt,"b'[""john.arnold@enron.com""]'",1.0,text/plain; charset=us-ascii,7bit,"b'[""john.arnold@enron.com""]'",Caroline Abramo,Mike Grigsby,John Arnold,,\John_Arnold_Dec2000\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,Mike- I have their trader coming into the offi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517306,<18618854.1075840028791.JavaMail.evans@thyme>,2001-11-10 09:39:47,"[""jodi.droll@xcelenergy.com""]","[""isas@wscc.com""]",RE: Scheduling Time constant,,1.0,text/plain; charset=us-ascii,7bit,,"Droll, Jodi <Jodi.Droll@XCELENERGY.COM>",Interchange Scheduling & Accounting Subcommitt...,,,"\ExMerge - Scholtes, Diana\STF\Current issues",SCHOLTES-D,,I (PSCO TP & CA) agree with Don. I thought we...
517307,<14350892.1075840028690.JavaMail.evans@thyme>,2001-10-17 16:11:39,"[""mark.hackney@aps.com""]","[""isas@wscc.com""]","FW: Standards Announcement - October 17, 2001",,1.0,text/plain; charset=us-ascii,7bit,,"Hackney, Mark W(Z39911) <Mark.Hackney@aps.com>",Interchange Scheduling & Accounting Subcommitt...,,,"\ExMerge - Scholtes, Diana\STF\Current issues",SCHOLTES-D,,FYI!!-----Original Message-----From: Barbara B...
517308,<21704474.1075840029683.JavaMail.evans@thyme>,2001-06-04 01:42:00,"[""paul.rice@pacificorp.com""]","[""isas@wscc.com""]",Late tags,,1.0,text/plain; charset=us-ascii,7bit,,"Rice, Paul <Paul.Rice@Pacificorp.com>",Interchange Scheduling & Accounting Subcommitt...,,,"\ExMerge - Scholtes, Diana\STF\E-TAG",SCHOLTES-D,,Nothing is easy is it?? At the risk of the cr...
517309,<9367927.1075840029633.JavaMail.evans@thyme>,2001-09-04 23:11:00,"[""gjcarter@pa.gov""]","[""\hara@enron.com"", ""khara@avistaenergy.com"", ...",RE: BCHA Automatic Denial/Approval,,1.0,text/plain; charset=us-ascii,7bit,,"Carter, Gloria J - TMS-DITT1 <gjcarter@bpa.gov>","'Hara, Kathy' <KHara@avistaenergy.com>, Cara ...",,,"\ExMerge - Scholtes, Diana\STF\E-TAG",SCHOLTES-D,,"I think you are right on! In addition, I would..."


In [41]:
# Candidate non-spam mails: rows whose sender text contains the literal string "enron.com".
# regex=False keeps "." from acting as a wildcard; na=False counts a missing sender as "no match"
# so the mask is strictly boolean.
bool_mask = complete_df['From'].str.contains("enron.com", regex=False, na=False)
# Fixed random_state so a fresh kernel run draws the same 400 mails every time.
not_spam_df = complete_df[bool_mask].sample(400, random_state=42)

In [42]:
# One-off export of the sampled mails to CSV.
# NOTE(review): presumably left disabled so that re-running the notebook does not overwrite the
# CSV that is read back further down — confirm before re-enabling.
# not_spam_df.to_csv('../data/no_spam/non_spam_contenients.csv')

## Labeling Data

The CSV is read back in here because the dataset was examined by hand, one mail at a time, to confirm that the mails were not spam.

In [55]:
# Reload the non-spam CSV: index_col=0 consumes the index column stored in the file, and
# reset_index(drop=True) then renumbers the rows from 0.
nonspam_df = pd.read_csv('../data/no_spam/non_spam_contenients.csv', index_col=0).reset_index(drop=True)

In [56]:
# Label every mail in the non-spam set with the negative class (spam = 0); this adds the column in place.
nonspam_df['spam'] = 0
# Preview the first rows of the labeled frame.
nonspam_df.head()

Unnamed: 0,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,body,spam
0,<8986238.1075840559090.JavaMail.evans@thyme>,25/01/02 9:22,"[""c..giron@enron.com""]","[""elena.chilkina@enron.com"", ""m..love@enron.com""]",NG-PRICE_FinancialOnly.xls,"b'[""ed.mcmichael@enron.com"", ""c..gossett@enron...",1,text/plain; charset=us-ascii,7bit,"b'[""ed.mcmichael@enron.com"", ""c..gossett@enron...","Giron, Darron C. </O=ENRON/OU=NA/CN=RECIPIENTS...","Chilkina, Elena </O=ENRON/OU=NA/CN=RECIPIENTS/...","McMichael Jr., Ed </O=ENRON/OU=NA/CN=RECIPIENT...",,"\ExMerge - Giron, Darron C.\Sent Items",GIRON-D,darron giron 6-26-02.PST,The file was too big to email. It contains al...,0
1,<29456384.1075857390009.JavaMail.evans@thyme>,05/08/01 5:31,"[""outlook.team@enron.com""]","[""angela.mendez@enron.com"", ""angeline.stewart@...",1-URGENT - Outlook Email Notification (new),,1,text/plain; charset=ANSI_X3.4-1968,quoted-printable,,Outlook Migration Team,"Angela Mendez, Angeline Stewart, Bobbie LaChap...",,,\Errol_McLaughlin_Jun2001\Notes Folders\All do...,McLaughlin-E,emclaug.nsf,OUTLOOK EMAIL NOTIFICATIONYour Date of Migrati...,0
2,<11995525.1075851754784.JavaMail.evans@thyme>,15/12/00 7:31,"[""samantha.oyd@enron.com""]","[""o.owen@enron.com"", ""laurel.adams@enron.com"",...",MIECO INC. - CORRECTION,,1,text/plain; charset=us-ascii,7bit,,Samantha Boyd,"Bob Bowen, Laurel Adams, Andrea R Guillen, Lar...",,,\Jason_Williams_Nov2001\Notes Folders\Legal,WILLIAMS-J,jwilli10.nsf,We have received an executed Master Agreement:...,0
3,<5560011.1075843043584.JavaMail.evans@thyme>,22/11/00 0:21,"[""susan.mara@enron.com""]","[""james.steffes@enron.com""]",Re: Enron's Response Today - Key Point to Focu...,"b'[""alan.comnes@enron.com"", ""bernadette.hawkin...",1,text/plain; charset=us-ascii,7bit,"b'[""alan.comnes@enron.com"", ""bernadette.hawkin...",Susan J Mara,James D Steffes,"Alan Comnes, bernadette Hawkins, Christopher F...",,\Jeff_Dasovich_Dec2000\Notes Folders\All docum...,DASOVICH-J,jdasovic.nsf,I know that WPTF is supporting Hebert's approa...,0
4,<13364954.1075842809447.JavaMail.evans@thyme>,14/05/01 3:03,"[""kim.ward@enron.com""]","[""arry.tycholiz@enron.com"", ""gerald.nemec@enro...",Palo Alto,,1,text/plain; charset=us-ascii,7bit,,Kim Ward,"Barry Tycholiz, Gerald Nemec",,,\Gerald_Nemec_Dec2000_June2001_2\Notes Folders...,NEMEC-G,gnemec.nsf,Do we need to meet this morning regarding our ...,0


In [61]:
# Persist the labeled non-spam set as a gzip-compressed parquet file, written with the fastparquet engine.
nonspam_df.to_parquet('../data/labeled_data/nonspam.parquet.gzip', compression='gzip', engine='fastparquet')