In [None]:
import pandas as pd
import email
from email.message import EmailMessage
from email import policy
from email.parser import BytesParser
import glob
import os
from collections import defaultdict
from bs4 import BeautifulSoup
import random
from typing import Tuple, List, Dict, Union
import re

In [None]:
# Show the notebook's working directory. `pwd` is not a Python name (it only
# resolves through IPython's magics), so use the portable stdlib call instead.
os.getcwd()

In [None]:
# glob returns matches in arbitrary, OS-dependent order; sort them so that the
# positional picks used later in this notebook (e.g. [3], [:100]) select the
# same files on every run.
ceo_fraud_gc_paths = sorted(glob.glob('/data/dataset/CEO_Fraud_-_Gift_Cards/*/*.eml'))

#### Function for extracting email body content - to be called within `extract_email_metadata` below

In [None]:
def _safe_get_content(part: EmailMessage) -> Union[str, None]:
    """Return the decoded content of a leaf part, or None if it cannot be decoded."""
    try:
        return part.get_content()
    except Exception:
        # Best-effort by design: malformed mail (e.g. an unknown charset raises
        # LookupError) must not abort a batch run over thousands of files.
        return None


def _collect_body_parts(msg: EmailMessage) -> Tuple[List[str], str, str, Union[str, None], List[str]]:
    """
    Depth-first walk of `msg`.

    Returns (parts, text_plain, text_from_html, text_html, attachment_types).
    Genuine `text/plain` content and text derived from HTML are tracked
    separately so the caller can prefer the former regardless of part order.
    """
    content_type = msg.get_content_type()
    parts = [content_type]
    text_plain = ''
    text_from_html = ''
    text_html = None
    attachment_types = []

    if msg.is_multipart():
        for part in msg.iter_parts():
            sub_parts, sub_plain, sub_from_html, sub_html, sub_attachments = _collect_body_parts(part)
            parts.extend(sub_parts)
            attachment_types.extend(sub_attachments)

            # Keep only the first non-empty value of each kind.
            text_plain = text_plain or sub_plain
            text_from_html = text_from_html or sub_from_html
            if text_html is None:
                text_html = sub_html
        return parts, text_plain, text_from_html, text_html, attachment_types

    if content_type == 'text/html':
        text_html = _safe_get_content(msg)
        if text_html is not None:
            try:
                text_from_html = BeautifulSoup(text_html, 'html.parser').get_text()
            except Exception:
                # Unparseable markup: keep the raw HTML, skip the text fallback.
                text_from_html = ''
    elif content_type == 'text/plain':
        text_plain = _safe_get_content(msg) or ''

    # Disposition is only used to list attachment types; it does not exclude a
    # text part from being considered as body content.
    if msg.get_content_disposition() == 'attachment':
        attachment_types.append(content_type)

    return parts, text_plain, text_from_html, text_html, attachment_types


def extract_body(msg: EmailMessage) -> Tuple[List[str], str, str, Union[str, None], List[str]]:
    """
    Recursively extracts MIME content types, plain text, HTML, and attachment types from an EmailMessage object.

    The message structure is traversed depth-first to collect:

    - All MIME content types found in the message parts.
    - The first non-empty plain text body (`text/plain`).
    - The first decodable HTML body (`text/html`).
    - A fallback plain text version extracted from HTML, used only if no `text/plain` text is found.
    - A list of MIME types for all parts whose content disposition is `attachment`.

    Parameters
    ----------
    msg : EmailMessage
        The email message object from which to extract content.

    Returns
    -------
    tuple :
        A 5-tuple containing:
        - parts (list of str): All MIME content types found (e.g., 'text/plain', 'text/html', etc.).
        - text_plain (str): The raw plain text body of the email, or text extracted from HTML if not available.
        - text_clean (str): Cleaned version of `text_plain`, with whitespace normalized and the caution tag removed.
        - text_html (str or None): The raw HTML body of the email, if present; otherwise None.
        - attachment_types (list of str): MIME content types of all attachments.

    Raises
    ------
    TypeError
        If the input `msg` is not an instance of `EmailMessage`.

    Notes
    -----
    - Text from a `text/plain` part always wins over text derived from HTML, even if the HTML part
      appears earlier in the message.
    - Parts whose content cannot be decoded (for example an unknown charset) are skipped rather than
      raising, so one malformed email does not stop a batch run.
    - The cleaning step collapses whitespace and removes UBC's caution tag: `[CAUTION: Non-UBC Email]`.
    """

    if not isinstance(msg, EmailMessage):
        raise TypeError(f"Expect msg to be a EmailMessage but got {type(msg)}")

    parts, text_plain, text_from_html, text_html, attachment_types = _collect_body_parts(msg)

    # Prioritize plain text from text/plain over plain text parsed from html.
    raw_text = text_plain if text_plain != '' else text_from_html
    text_clean = " ".join(raw_text.split()).replace('[CAUTION: Non-UBC Email]', '').lstrip()

    return parts, raw_text, text_clean, text_html, attachment_types

In [None]:
# #### SECOND VERSION 

# def extract_body(msg: EmailMessage) -> Tuple[List[str], str, str]:
#     """
#     Recursively extracts content from an `EmailMessage` object, including content types, plain text, HTML content,
#     and any attachments present in the email.

#     This function handles both multipart and single-part messages. For multipart messages, it performs a depth-first 
#     traversal of the message parts to collect:
    
#     - All MIME content types found in the message.
#     - The first encountered plain text and HTML body content.
#     - The content types of all attachments found within the email.

#     Additionally, the plain text content is cleaned by removing excessive whitespace and a specific cautionary tag
#     from UBC's email system.

#     Parameters
#     ----------
#     msg : EmailMessage
#         The email message object to extract content from.

#     Returns
#     -------
#     tuple of (list of str, str, str, list of str)
#         A tuple containing:

#         - parts : list of str
#             A list of MIME content types found in the message parts (e.g., 'text/plain', 'text/html').
#         - text_plain : str
#             The plain text content of the email. If not directly available, it may be extracted from the HTML part.
#             Cleaned of extra whitespace and UBC's email caution notice.
#         - text_html : str
#             The HTML content of the email, if any (empty string if not found).
#         - attachment_types : list of str
#             A list of MIME content types corresponding to attachments found in the email.

#     Raises
#     ------
#     TypeError
#         If the input `msg` is not an instance of `EmailMessage`.
    
#     """

#     if not isinstance(msg, EmailMessage):
#         raise TypeError(f"Expect msg to be a EmailMessage but got {type(msg)}")
        
#     parts = []
#     text_plain = '' 
#     text_plain_from_html = '' # only use this if above is blank 
#     text_clean = '' 
#     text_html = None 
#     attachment_types = []

#     for part in msg.walk():
#         content_type = part.get_content_type()
#         disposition = part.get_content_disposition()
#         parts.append(content_type)

        
#         if content_type == 'text/html':
#             try:
#                 if text_html is None and msg.get_content() is not None: # prioritize first html content found 
#                     text_html = msg.get_content()
#                     text_plain_from_html = BeautifulSoup(text_html, 'html.parser').get_text()
#             except Exception:
#                 pass

#         if content_type == 'text/plain':
#             if text_plain == '' and msg.get_content() != '': # prioritize first text/plain content found 
#                 text_plain = msg.get_content()

#         if disposition == 'attachment':
#             attachment_types.append(content_type)

#     if text_plain != '': # prioritize plain text from text/plain over plain text parsed from html 
#         text_clean = " ".join(text_plain.split())  
#         text_clean =  text_clean.replace('[CAUTION: Non-UBC Email]', '').lstrip()

#         return parts, text_plain, text_clean, text_html, attachment_types
    
#     else:
#         text_clean = " ".join(text_plain_from_html.split())  
#         text_clean =  text_plain_from_html.replace('[CAUTION: Non-UBC Email]', '').lstrip()

#         return parts, text_plain_from_html, text_clean, text_html, attachment_types


#### Function for extracting all needed features 

In [None]:
def extract_email_metadata(path: Union[str, pd.Series]) -> Union[Dict[str, Union[str, bool, None, List[str]]], pd.Series]:
    """
    Extracts detailed metadata and content from one or more `.eml` email files.

    This function parses a single email file path or a series of file paths, reading and processing
    each `.eml` file to extract a comprehensive set of metadata and body content features. It includes:

    - Header information (sender, receiver, subject, routing, authentication)
    - MIME content structure
    - Plain and HTML body content
    - Attachment types

    Parameters
    ----------
    path : str or pd.Series
        A file path to a single `.eml` email file, or a pandas Series of file paths.

    Returns
    -------
    dict or pd.Series
        If a single file path is provided, returns a dictionary with extracted metadata and content.
        If a Series of file paths is provided, returns a Series of dictionaries, one per file
        (in input order, with a fresh default index).

        The extracted metadata includes:

        Header Information:
        - 'path' : str
            File path to the parsed email.
        - 'is_multipart' : bool
            Whether the email is a multipart message.
        - 'From' : str
            Raw 'From' header value ('' if the header is missing or empty).
        - 'From_name' : str
            Display name of the first sender address (if present).
        - 'From_email' : str
            Email address of the first sender address.
        - 'From_email_domain' : str
            Portion of 'From_email' after the last '@'.
        - 'To' : str
            Raw 'To' header value ('' if the header is missing or empty).
        - 'To_name' : str
            Display name of the first recipient (if present).
        - 'To_email' : str
            Email address of the first recipient.
        - 'To_email_domain' : str
            Portion of 'To_email' after the last '@'.
        - 'Subject' : str or None
            Email subject line.
        - 'Received' : list of str
            List of 'Received' headers showing the delivery path.
        - 'Authentication-Results' : str or None
            SPF, DKIM, and DMARC authentication results, if present.
        - 'received-spf' : str or None
            Sender Policy Framework result, if present.
        - 'DKIM-Signature' : str or None
            DomainKeys Identified Mail signature, if present.
        - 'Return-Path' : str or None
            Return path address for bounces, if present.
        - 'Content-Language' : str or None
            Language declared in the email content.
        - 'Reply-To' : str or None
            Reply-to address, if present.

        Body Content:
        - 'Content_types' : list of str
            List of MIME content types found in the email body (e.g., 'text/plain', 'text/html').
        - 'text_plain' : str
            Raw plain text body of the email, or empty string if not present.
        - 'text_clean' : str
            Cleaned version of the plain text, with whitespace normalized and cautionary tags removed.
        - 'text_html' : str or None
            Raw HTML body of the email, if available.
        - 'attachment_types' : list of str
            MIME types of all attachments found in the message.

    Raises
    ------
    TypeError
        If the input `path` is not a string or a pandas Series.

    Examples
    --------
    >>> extract_email_metadata("sample_email.eml")
    {'From_email': 'alice@example.com', 'Subject': 'Meeting Reminder', ...}

    >>> paths = pd.Series(["email1.eml", "email2.eml"])
    >>> metadata_series = extract_email_metadata(paths)
    >>> metadata_series.iloc[0]['From_email']
    'bob@example.com'
    """

    def _split_address_header(header):
        """Return (raw, display_name, address, domain) for a From/To header value (or None)."""
        if not header:
            return '', '', '', ''

        # Headers parsed with policy.default expose the individual mailboxes via
        # `.addresses`. Using them (instead of a greedy "<(.*)>" regex over the
        # whole header) keeps the result correct when several addresses are listed.
        addresses = getattr(header, 'addresses', ())
        if addresses:
            first = addresses[0]
            name = first.display_name
            # addr_spec renders an empty mailbox as '<>'; report '' instead.
            address = first.addr_spec if (first.username or first.domain) else ''
        else:
            # Nothing parseable (e.g. an empty group): fall back to the raw value.
            name = ''
            address = str(header)

        return header, name, address, address.split('@')[-1]

    def _extract_email_metadata(path):
        with open(path, 'rb') as fp:
            msg = BytesParser(policy=policy.default).parse(fp)

        sender, sender_name, sender_email, sender_email_domain = _split_address_header(msg['From'])
        receiver, receiver_name, receiver_email, receiver_email_domain = _split_address_header(msg['To'])

        content_types, text_plain, text_clean, text_html, attachment_types = extract_body(msg)

        return {
            'path': path,
            'is_multipart': msg.is_multipart(),
            'From': sender,
            'From_name': sender_name,
            'From_email': sender_email,
            'From_email_domain': sender_email_domain,
            'To': receiver,
            'To_name': receiver_name,
            'To_email': receiver_email,
            'To_email_domain': receiver_email_domain,
            'Subject': msg['Subject'],
            # get_all returns the fallback when the header is absent.
            'Received': list(msg.get_all('Received', [])),
            'Authentication-Results': msg['Authentication-Results'],
            'received-spf': msg['received-spf'],
            'DKIM-Signature': msg['DKIM-Signature'],
            'Return-Path': msg['Return-Path'],
            'Content-Language': msg['Content-Language'],
            'Reply-To': msg['Reply-To'],
            'Content_types': list(content_types) if content_types else [],
            'text_plain': text_plain,
            'text_clean': text_clean,
            'text_html': text_html,
            'attachment_types': attachment_types,
        }

    if isinstance(path, str):
        return _extract_email_metadata(path)

    if isinstance(path, pd.Series):
        records = [_extract_email_metadata(individual_path) for individual_path in path]
        # Explicit object dtype keeps an empty input from producing a float Series.
        return pd.Series(records, dtype=object)

    raise TypeError(f"Expect path to be either a str or pd.Series but got {type(path)}")



#### Testing new function (taking in path instead of msg)

In [None]:
# Run the path-based extractor on a single .eml file (index 3 of the gift-card paths).
toy_output = extract_email_metadata(ceo_fraud_gc_paths[3])
# Wrap the returned dict in a list so it is displayed as a one-row DataFrame.
pd.DataFrame([toy_output])

In [None]:
# Pass a Series holding the first 100 paths; the function returns a Series of dicts.
toy_output_series = extract_email_metadata(pd.Series(ceo_fraud_gc_paths[:100]))
toy_output_series

In [None]:
# Same extraction over the first 100 paths (re-parses the files).
toy_output_series = extract_email_metadata(pd.Series(ceo_fraud_gc_paths[:100]))
# Expand the Series of dicts into one DataFrame column per extracted feature.
pd.DataFrame(toy_output_series.to_list())

In [None]:
# A plain list is neither a str nor a pd.Series, so this call raises TypeError
# (presumably an intentional check of the type guard — confirm).
extract_email_metadata(ceo_fraud_gc_paths[:100])

##### Testing parsing one single msg

In [None]:
# Keep a parsed copy of the sample message for the inspection cells that follow.
with open(ceo_fraud_gc_paths[3], 'rb') as fp:
    msg = BytesParser(policy=policy.default).parse(fp)

# extract_email_metadata accepts a file path (str) or a Series of paths and parses
# the file itself; handing it the already-parsed message raises TypeError.
toy_output = extract_email_metadata(ceo_fraud_gc_paths[3])

In [None]:
# One-row DataFrame built from the single extracted feature dict.
toy_df = pd.DataFrame([toy_output])

In [None]:
# Content type of every part of the sample message, in walk() order.
parts = [part.get_content_type() for part in msg.walk()]

parts

In [None]:
# Inspect the payload of the first sub-part. Indexing with [0] assumes `msg` is
# multipart, so that get_payload() returns a list of parts — TODO confirm for this sample.
msg.get_payload()[0].get_payload()

In [None]:
# Display the extracted features for the sample email as a one-row DataFrame.
pd.DataFrame([toy_output])

In [None]:
# Pull out the raw plain-text body from the feature dict for inspection.
text_plain = toy_output['text_plain']
text_plain

In [None]:
" ".join(" ".split())

In [None]:
# Sample header with a quoted display name. The value is kept in `address_header`
# rather than `email`, which would rebind the `email` module imported at the top.
address_header = '"Woo, Clement" <clement.woo@ubc.ca>'
# Group 1 captures the display name (quotes included), group 2 the address.
email_regex = re.search(r"^(.*?)\s<(.*)>", address_header)

In [None]:
# Strip the surrounding double quotes from a quoted display name.
name = '"Woo, Clement"'
if name[0] == "\"":
    name = name[1:-1]
name

In [None]:
# Sample header that is a bare address with no display name. The value is kept in
# `bare_address` rather than `email`, which would rebind the `email` module
# imported at the top.
bare_address = 'melanie.kuxdorf@ubc.ca'
# No "Name <address>" shape here, so the search result is None.
email_regex = re.search(r"^(.*?)\s<(.*)>", bare_address)

In [None]:
# Show the type of the last search result (re.search returns None when nothing matches).
type(email_regex)

##### Testing parsing 100 msg

In [None]:
# First 100 gift-card fraud paths, used as a small batch.
toy_list = ceo_fraud_gc_paths[:100]

In [None]:
# extract_email_metadata opens and parses each file itself, so it must be given
# the path; passing a parsed message raises TypeError.
extracted_dict_list = [extract_email_metadata(path) for path in toy_list]
# The ticket id is the name of the directory that contains the .eml file.
ticket_id_list = [path.split('/')[-2] for path in toy_list]



In [None]:
# One row per email, one column per extracted feature.
df = pd.DataFrame(extracted_dict_list)
# Put the ticket id first; relies on both lists having been built in the same order.
df.insert(0, 'ticket_id', ticket_id_list)

In [None]:
# Preview the first rows of the feature table.
df.head()

#### Testing parsing 5000 msg

In [None]:
all_paths = glob.glob('/data/dataset/*/*/*.eml')
# A path counts as legitimate when it contains 'Legit' or 'Spam'; every other path
# is treated as malicious. Both lists are built by testing the path string directly,
# which avoids a linear `not in legit_paths` list scan for every path.
legit_paths = [path for path in all_paths if 'Legit' in path or 'Spam' in path]
malicious_path = [path for path in all_paths if 'Legit' not in path and 'Spam' not in path]

In [None]:
# Draw 5000 malicious paths without replacement. random.sample raises ValueError
# if fewer than 5000 paths are available, and no seed is set here, so the sample
# differs between runs.
malicious_sampled = random.sample(malicious_path, 5000)

In [None]:
# extract_email_metadata opens and parses each file itself, so it must be given
# the path; passing a parsed message raises TypeError.
extracted_dict_list = [extract_email_metadata(path) for path in malicious_sampled]
# The ticket id is the name of the directory that contains the .eml file.
ticket_id_list = [path.split('/')[-2] for path in malicious_sampled]

In [None]:
# One row per sampled email, one column per extracted feature.
df = pd.DataFrame(extracted_dict_list)
# Put the ticket id first; relies on both lists having been built in the same order.
df.insert(0, 'ticket_id', ticket_id_list)
df.head()

#### Self-phishing emails

In [None]:
# All .eml files one directory level below the Self-Phishing folder.
self_phish_paths = glob.glob('/data/dataset/Self-Phishing/*/*.eml')
# Unseeded sample of 500 paths; random.sample raises ValueError if fewer than 500 exist.
sample_self_phish_paths = random.sample(self_phish_paths, 500)

In [None]:
# extract_email_metadata opens and parses each file itself, so it must be given
# the path; passing a parsed message raises TypeError.
extracted_dict_list_sp = [extract_email_metadata(path) for path in sample_self_phish_paths]
# The ticket id is the name of the directory that contains the .eml file.
ticket_id_list_sp = [path.split('/')[-2] for path in sample_self_phish_paths]

# One row per email, with the ticket id as the first column.
df_sp = pd.DataFrame(extracted_dict_list_sp)
df_sp.insert(0, 'ticket_id', ticket_id_list_sp)
df_sp.head()

#### legit emails

In [None]:
# Unseeded sample of 500 legitimate paths; random.sample raises ValueError if
# legit_paths holds fewer than 500 entries.
sample_legit_paths = random.sample(legit_paths, 500)

In [None]:
# extract_email_metadata opens and parses each file itself, so it must be given
# the path; passing a parsed message raises TypeError.
extracted_dict_list_legit = [extract_email_metadata(path) for path in sample_legit_paths]
# The ticket id is the name of the directory that contains the .eml file.
ticket_id_list_legit = [path.split('/')[-2] for path in sample_legit_paths]

# One row per email, with the ticket id as the first column.
df_legit = pd.DataFrame(extracted_dict_list_legit)
df_legit.insert(0, 'ticket_id', ticket_id_list_legit)
df_legit.head()