In [1]:
import pandas as pd
import numpy as np
import email
import re
import os 
from typing import List, Union
from tqdm import tqdm
import sys
import datetime
from dateutil import parser

import spacy
import nltk
from nltk.corpus import stopwords

### Utilities

In [145]:
def parse_email_document(path: str) -> pd.DataFrame:
    """
    Parses email raw file into a dataframe
    """
    
    # TODO: Some files have encoding troubles, as there are asci characteres that raises troubles in the open() block
    # TODO: Fix the encoding characters trouble
    try:
        with open(path) as f:
            contents = f.read()

        msg = email.message_from_string(contents)    

        if 'Cc' in msg:
            _cc = [re.sub('\s+','', msg['Cc']).split(',')] 
        else: 
            _cc = [np.nan]
            
        if 'Bcc' in msg:
            _bcc = [re.sub('\s+','', msg['Cc']).split(',')] 
        else: 
            _bcc = [np.nan]
            
        if 'To' in msg:
            _to = [re.sub('\s+','', msg['To']).split(',')]
        else:
            _to = [np.nan]
        
        attributes = {  
            "Message-ID": [msg["Message-ID"]],
            "Date": [msg["Date"]],
            "From": [re.sub('\s+','', msg['From']).split(',')],
            "To": _to,
            "Subject": [msg["Subject"]],
            "Cc": _cc,
            "Mime-Version": [msg["Mime-Version"]],
            "Content-Type": [msg["Content-Type"]],
            "Content-Transfer-Encoding": [msg["Content-Transfer-Encoding"]],
            "Bcc": _bcc,
            "X-From": [msg["X-From"]],
            "X-To": [msg["X-To"]],
            "X-cc": [msg["X-cc"]],
            "X-bcc": [msg["X-bcc"]],
            "X-Folder": [msg["X-Folder"]],
            "X-Origin": [msg["X-Origin"]],
            "X-FileName": [msg["X-FileName"]]
        }

        if msg.is_multipart():
            for part in email.get_payload():
                body = part.get_payload() 
        else:
            body = msg.get_payload() 
            
        attributes['body'] = body
        df = pd.DataFrame(attributes, columns=attributes.keys())
        return df
    except:
        pass


def get_email_paths(root_directory: str):
    """
    Get list of all files in subfolders of root directory
    """
    files_to_scratch = []
    for path, subdirs, files in os.walk(root_directory):
        
        for name in files:
            files_to_scratch.append(os.path.join(path, name))
            # print(os.path.join(path, name))
    
    return files_to_scratch


def parse_multiple_emails_document(files: List[str]) -> pd.DataFrame:
    """
    Parse list of mails into a dataframe
    """
    columns = ['Message-ID', 'Date', 'From', 'To', 'Subject', 'Cc', 'Mime-Version',
       'Content-Type', 'Content-Transfer-Encoding', 'Bcc', 'X-From', 'X-To',
       'X-cc', 'X-bcc', 'X-Folder', 'X-Origin', 'X-FileName', 'body']
    complete_df = pd.DataFrame(columns=columns)
    
    for path in tqdm(files):
        df = parse_email_document(path)
        complete_df = pd.concat([complete_df, df])
    
    return complete_df.reset_index(drop=True)



In [146]:
def change_date_type(dates: Union[pd.DataFrame, pd.Series]) -> List:
    """
    Formats string column into datetime object
    """
    column = []
    
    for date in dates:
        column.append(parser.parse(date).strftime("%d-%m-%Y %H:%M:%S"))
    
    series = pd.Series(column)
    return pd.to_datetime(series)

In [147]:
def str_to_list(row):
    """convert a string List into a List"""
    row = str(row).strip("[]").replace("'","")
    return row

### Cleaning

In [148]:
# Remember to pull the data from dvc
complete_df = pd.read_parquet('../data/preprocessing_output/parsedmails.parquet.gzip')

In [149]:
complete_df.head()

Unnamed: 0,Message-ID,Date,From,To,Subject,Cc,Mime-Version,Content-Type,Content-Transfer-Encoding,Bcc,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,body
0,<17334447.1075857585446.JavaMail.evans@thyme>,"Thu, 16 Nov 2000 09:30:00 -0800 (PST)","b'[""msagel@home.com""]'","b'[""jarnold@enron.com""]'",Status,,1.0,text/plain; charset=ANSI_X3.4-1968,7bit,,"""Mark Sagel"" <msagel@home.com>","""John Arnold"" <jarnold@enron.com>",,,\John_Arnold_Dec2000\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,John:\n?\nI'm not really sure what happened be...
1,<19171686.1075857585034.JavaMail.evans@thyme>,"Fri, 8 Dec 2000 05:05:00 -0800 (PST)","b'[""slafontaine@globalp.com""]'","b'[""john.arnold@enron.com""]'",re:summer inverses,,1.0,text/plain; charset=us-ascii,7bit,,slafontaine@globalp.com,John.Arnold@enron.com,,,\John_Arnold_Dec2000\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,i suck-hope youve made more money in natgas la...
2,<29887033.1075857630725.JavaMail.evans@thyme>,"Tue, 15 May 2001 09:43:00 -0700 (PDT)","b'[""iceoperations@intcx.com""]'","b'[""icehelpdesk@intcx.com"", ""internalmarketing...",The WTI Bullet swap contracts,,1.0,text/plain; charset=us-ascii,7bit,,ICE Operations <ICEOperations@intcx.com>,"**ICEHELPDESK <**ICEHELPDESK@intcx.com>, **Int...",,,\John_Arnold_Jun2001\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,"Hi,\n\n\n Following the e-mail you have rece..."
3,<29084893.1075849630138.JavaMail.evans@thyme>,"Mon, 27 Nov 2000 01:49:00 -0800 (PST)","b'[""jeff.youngflesh@enron.com""]'","b'[""anthony.gilmore@enron.com"", ""colleen.koeni...",Invitation: EBS/GSS Meeting w/Bristol Babcock ...,,1.0,text/plain; charset=us-ascii,7bit,,Jeff Youngflesh,"Anthony Gilmore, Colleen Koenig, Jennifer Stew...",,,\John_Arnold_Nov2001\Notes Folders\Notes inbox,ARNOLD-J,jarnold.nsf,Conference Room TBD. \n\nThis meeting will be...
4,<30248874.1075857584813.JavaMail.evans@thyme>,"Tue, 12 Dec 2000 09:33:00 -0800 (PST)","b'[""caroline.abramo@enron.com""]'","b'[""mike.grigsby@enron.com""]'",Harvard Mgmt,"b'[""john.arnold@enron.com""]'",1.0,text/plain; charset=us-ascii,7bit,"b'[""john.arnold@enron.com""]'",Caroline Abramo,Mike Grigsby,John Arnold,,\John_Arnold_Dec2000\Notes Folders\Notes inbox,Arnold-J,Jarnold.nsf,Mike- I have their trader coming into the offi...


In [150]:
complete_df['Date'] = change_date_type(complete_df['Date'])

In [151]:
complete_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517311 entries, 0 to 517310
Data columns (total 18 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Message-ID                 517311 non-null  object        
 1   Date                       517311 non-null  datetime64[ns]
 2   From                       517311 non-null  object        
 3   To                         495466 non-null  object        
 4   Subject                    517311 non-null  object        
 5   Cc                         127847 non-null  object        
 6   Mime-Version               517282 non-null  object        
 7   Content-Type               517282 non-null  object        
 8   Content-Transfer-Encoding  517282 non-null  object        
 9   Bcc                        127847 non-null  object        
 10  X-From                     517282 non-null  object        
 11  X-To                       517282 non-null  object  

In [152]:
complete_df['body'] = complete_df['body'].str.replace('\n','').str.replace('\t','')

In [153]:
complete_df['body']

0         John:?I'm not really sure what happened betwee...
1         i suck-hope youve made more money in natgas la...
2          Hi,  Following the e-mail you have received y...
3         Conference Room TBD.  This meeting will be to ...
4         Mike- I have their trader coming into the offi...
                                ...                        
517306    I (PSCO TP & CA) agree with Don.  I thought we...
517307    FYI!!-----Original Message-----From: Barbara B...
517308    Nothing is easy is it??  At the risk of the cr...
517309    I think you are right on! In addition, I would...
517310    >  -----Original Message-----> From: Hara, Kat...
Name: body, Length: 517311, dtype: object

In [154]:
complete_df['To'] = complete_df['To'].astype('str').str.replace('b','')
complete_df['To'] = complete_df['To'].apply(str_to_list)
complete_df['To']

0                                     ["jarnold@enron.com"]
1                                 ["john.arnold@enron.com"]
2         ["icehelpdesk@intcx.com", "internalmarketing@i...
3         ["anthony.gilmore@enron.com", "colleen.koenig@...
4                                 ["mike.grigsy@enron.com"]
                                ...                        
517306                                    ["isas@wscc.com"]
517307                                    ["isas@wscc.com"]
517308                                    ["isas@wscc.com"]
517309    ["\hara@enron.com", "khara@avistaenergy.com", ...
517310    ["cara.semperger@enron.com", "demetrios.fotiou...
Name: To, Length: 517311, dtype: object

In [156]:
complete_df['From'] = complete_df['From'].astype('str').str.replace('b','')
complete_df['From'] = complete_df['From'].apply(str_to_list)
complete_df['From']

0                   ["msagel@home.com"]
1            ["slafontaine@gloalp.com"]
2           ["iceoperations@intcx.com"]
3         ["jeff.youngflesh@enron.com"]
4          ["caroline.aramo@enron.com"]
                      ...              
517306    ["jodi.droll@xcelenergy.com"]
517307         ["mark.hackney@aps.com"]
517308     ["paul.rice@pacificorp.com"]
517309              ["gjcarter@pa.gov"]
517310       ["khara@avistaenergy.com"]
Name: From, Length: 517311, dtype: object

In [159]:
output_path = '/Users/luis.morales/Desktop/MLOpsBootcamp/MLOpsCapstoneProject/data/preprocessing_output/cleaned_mails_v0'
complete_df.to_parquet(f"{output_path}.parquet.gzip", compression='gzip', engine='fastparquet')