In [22]:
import pandas as pd
import numpy as np
import email
import re
from pprint import pprint
import sys
import os 
from typing import List
from tqdm import tqdm

In [4]:
def parse_email_document(path: str) -> pd.DataFrame:
    """
    Parses email raw file into a dataframe
    """
    
    # TODO: Some files have encoding troubles, as there are asci characteres that raises troubles in the open() block
    # TODO: Fix the encoding characters trouble
    try:
        with open(path) as f:
            contents = f.read()

        msg = email.message_from_string(contents)    

        if 'Cc' in msg:
            _cc = [re.sub('\s+','', msg['Cc']).split(',')] 
        else: 
            _cc = [np.nan]
            
        if 'Bcc' in msg:
            _bcc = [re.sub('\s+','', msg['Cc']).split(',')] 
        else: 
            _bcc = [np.nan]
            
        if 'To' in msg:
            _to = [re.sub('\s+','', msg['To']).split(',')]
        else:
            _to = [np.nan]
        
        attributes = {  
            "Message-ID": [msg["Message-ID"]],
            "Date": [msg["Date"]],
            "From": [re.sub('\s+','', msg['From']).split(',')],
            "To": _to,
            "Subject": [msg["Subject"]],
            "Cc": _cc,
            "Mime-Version": [msg["Mime-Version"]],
            "Content-Type": [msg["Content-Type"]],
            "Content-Transfer-Encoding": [msg["Content-Transfer-Encoding"]],
            "Bcc": _bcc,
            "X-From": [msg["X-From"]],
            "X-To": [msg["X-To"]],
            "X-cc": [msg["X-cc"]],
            "X-bcc": [msg["X-bcc"]],
            "X-Folder": [msg["X-Folder"]],
            "X-Origin": [msg["X-Origin"]],
            "X-FileName": [msg["X-FileName"]]
        }

        if msg.is_multipart():
            for part in email.get_payload():
                body = part.get_payload() 
        else:
            body = msg.get_payload() 
            
        attributes['body'] = body
        df = pd.DataFrame(attributes, columns=attributes.keys())
        return df
    except:
        pass

In [5]:
def get_email_paths(root_directory: str):
    """
    Get list of all files in subfolders of root directory
    """
    files_to_scratch = []
    for path, subdirs, files in os.walk(root_directory):
        
        for name in files:
            files_to_scratch.append(os.path.join(path, name))
            # print(os.path.join(path, name))
    
    return files_to_scratch

In [25]:
def parse_multiple_emails_document(files: List[str]) -> pd.DataFrame:
    """
    Parse list of mails into a dataframe
    """
    columns = ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Cc', 'Mime-Version',
       'Content-Type', 'Content-Transfer-Encoding', 'Bcc', 'X-From', 'X-To',
       'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'body']
    complete_df = pd.DataFrame(columns=columns)
    
    for path in tqdm(files):
        df = parse_email_document(path)
        complete_df = pd.concat([complete_df, df])
    
    return complete_df.reset_index(drop=True)

In [10]:
path = '/Users/luis.morales/Desktop/MLOpsBootcamp/MLOpsCapstoneProject/data/testing_data/blair-l/acctg___measurement_issues/5.'
root = '/Users/luis.morales/Desktop/MLOpsBootcamp/MLOpsCapstoneProject/data/testing_data/'

In [42]:
testing = get_email_paths(root)
testing

['/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/scholtes-d/stf/e_tag/3.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/scholtes-d/stf/e_tag/1.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/scholtes-d/stf/e_tag/2.']

In [18]:
single_email_df = parse_email_document(path)

In [26]:
# testing = ['/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/testing_data/arcor/3.',
# '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/testing_data/arcor/1.',
#  '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/testing_data/arcor/2.']

multiple_emails_df = parse_multiple_emails_document(testing)

100%|██████████| 6898/6898 [00:46<00:00, 147.26it/s]


In [43]:
maildir_path = '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir'
maildir_paths = get_email_paths(maildir_path)

In [44]:
maildir_paths

['/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/.DS_Store',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/.DS_Store',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/notes_inbox/36.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/notes_inbox/19.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/notes_inbox/50.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/notes_inbox/3.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/notes_inbox/9.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/notes_inbox/13.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/notes_inbox/75.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/notes_inbox/35.',
 '/Users/luis.morales/Desktop/MLOpsBootcamp/FinalProject/maildir/arnold-j/notes_inbox/53.',
 '/Users/lu

In [32]:
maildir_df = parse_multiple_emails_document(maildir_paths)

100%|██████████| 99/99 [00:00<00:00, 208.66it/s]


In [40]:
# maildir_df.to_parquet('./../data/preprocessing_output/subset')
maildir_df.to_parquet('./../data/preprocessing_output/subset.parquet.gzip', compression='gzip', engine='fastparquet')

In [41]:
pd.read_parquet('./../data/preprocessing_output/subset.parquet.gzip')

Unnamed: 0,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,body
0,<17334447.1075857585446.JavaMail.evans@thyme>,"Thu, 16 Nov 2000 09:30:00 -0800 (PST)","b'[""msagel@home.com""]'","b'[""jarnold@enron.com""]'",Status,,1.0,text/plain; charset=ANSI_X3.4-1968,7bit,,"""Mark Sagel"" <msagel@home.com>","""John Arnold"" <jarnold@enron.com>",,,\John_Arnold_Dec2000\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,John:\n?\nI'm not really sure what happened be...
1,<19171686.1075857585034.JavaMail.evans@thyme>,"Fri, 8 Dec 2000 05:05:00 -0800 (PST)","b'[""slafontaine@globalp.com""]'","b'[""john.arnold@enron.com""]'",re:summer inverses,,1.0,text/plain; charset=us-ascii,7bit,,slafontaine@globalp.com,John.Arnold@enron.com,,,\John_Arnold_Dec2000\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,i suck-hope youve made more money in natgas la...
2,<29887033.1075857630725.JavaMail.evans@thyme>,"Tue, 15 May 2001 09:43:00 -0700 (PDT)","b'[""iceoperations@intcx.com""]'","b'[""icehelpdesk@intcx.com"", ""internalmarketing...",The WTI Bullet swap contracts,,1.0,text/plain; charset=us-ascii,7bit,,ICE Operations <ICEOperations@intcx.com>,"**ICEHELPDESK <**ICEHELPDESK@intcx.com>, **Int...",,,\John_Arnold_Jun2001\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,"Hi,\n\n\n Following the e-mail you have rece..."
3,<29084893.1075849630138.JavaMail.evans@thyme>,"Mon, 27 Nov 2000 01:49:00 -0800 (PST)","b'[""jeff.youngflesh@enron.com""]'","b'[""anthony.gilmore@enron.com"", ""colleen.koeni...",Invitation: EBS/GSS Meeting w/Bristol Babcock ...,,1.0,text/plain; charset=us-ascii,7bit,,Jeff Youngflesh,"Anthony Gilmore, Colleen Koenig, Jennifer Stew...",,,\John_Arnold_Nov2001\Notes Folders\Notes inbox,ARNOLD-J,jarnold.nsf,Conference Room TBD. \n\nThis meeting will be...
4,<30248874.1075857584813.JavaMail.evans@thyme>,"Tue, 12 Dec 2000 09:33:00 -0800 (PST)","b'[""caroline.abramo@enron.com""]'","b'[""mike.grigsby@enron.com""]'",Harvard Mgmt,"b'[""john.arnold@enron.com""]'",1.0,text/plain; charset=us-ascii,7bit,"b'[""john.arnold@enron.com""]'",Caroline Abramo,Mike Grigsby,John Arnold,,\John_Arnold_Dec2000\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,Mike- I have their trader coming into the offi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,<24569099.1075849631099.JavaMail.evans@thyme>,"Mon, 4 Dec 2000 03:08:00 -0800 (PST)","b'[""jeff.youngflesh@enron.com""]'","b'[""daniel.coleman@enron.com""]'",Re: Vulcan Signs,"b'[""craig.brown@enron.com"", ""colleen.koenig@en...",1.0,text/plain; charset=us-ascii,7bit,"b'[""craig.brown@enron.com"", ""colleen.koenig@en...",Jeff Youngflesh,Daniel Coleman,"Craig H Brown, Colleen Koenig, Jennifer Medcal...",,\John_Arnold_Nov2001\Notes Folders\Vulcan signs,ARNOLD-J,jarnold.nsf,"Dan, \nThank you for the opportunity. I am op..."
95,<12009781.1075849631122.JavaMail.evans@thyme>,"Mon, 4 Dec 2000 23:08:00 -0800 (PST)","b'[""craig.brown@enron.com""]'","b'[""heidi.smith@enron.com""]'",Re: Vulcan Signs,"b'[""jeff.youngflesh@enron.com"", ""jennifer.medc...",1.0,text/plain; charset=us-ascii,7bit,"b'[""jeff.youngflesh@enron.com"", ""jennifer.medc...",Craig H Brown,Heidi Smith,"Jeff Youngflesh, Jennifer Medcalf",,\John_Arnold_Nov2001\Notes Folders\Vulcan signs,ARNOLD-J,jarnold.nsf,Heidi:\n\nPlease outline the Vulcan contract f...
96,<25351532.1075852689302.JavaMail.evans@thyme>,"Fri, 5 Oct 2001 07:56:38 -0700 (PDT)","b'[""soblander@carrfut.com""]'","b'[""soblander@carrfut.com""]'",option candlesticks as a hot link 10/5,,1.0,text/plain; charset=ANSI_X3.4-1968,7bit,,soblander@carrfut.com@ENRON <IMCEANOTES-soblan...,soblander@carrfut.com,,,"\JARNOLD (Non-Privileged)\Arnold, John\Deleted...",Arnold-J,JARNOLD (Non-Privileged).pst,The information contained herein is based on s...
97,<22134312.1075861665211.JavaMail.evans@thyme>,"Tue, 20 Nov 2001 16:08:27 -0800 (PST)","b'[""errol.mclaughlin@enron.com""]'","b'[""john.arnold@enron.com"", ""bilal.bajwa@enron...",TRV Notification: (NG - PROPT P/L - 11/20/2001),,1.0,text/plain; charset=us-ascii,7bit,,"McLaughlin Jr., Errol </O=ENRON/OU=NA/CN=RECIP...","Arnold, John </O=ENRON/OU=NA/CN=RECIPIENTS/CN=...",,,"\JARNOLD (Non-Privileged)\Arnold, John\Deleted...",Arnold-J,JARNOLD (Non-Privileged).pst,The report named: NG - PROPT P/L <http://trv.c...
