In [None]:
import pandas as pd
import email
from email.message import EmailMessage
from email import policy
from email.parser import BytesParser
import glob
import os
from collections import defaultdict
from bs4 import BeautifulSoup
import random
from typing import Tuple, List, Dict, Union
import re

In [None]:
# Collect the path of every .eml file located one directory level below the
# gift-card CEO-fraud dataset root.
ceo_fraud_gc_paths = glob.glob('/data/dataset/CEO_Fraud_-_Gift_Cards/*/*.eml')
# Keep at most the first 100 paths (in the order glob returned them) as a small working sample.
toy_list = ceo_fraud_gc_paths[:100]

#### Loading in the `extract_body` and `extract_email_metadata` functions 

In [None]:
def extract_body(msg: EmailMessage) -> Tuple[List[str], str, str, List[str]]:
    """
    Recursively extract content types, body text and attachment types from an `EmailMessage`.

    Handles both multipart and single-part messages. For multipart messages the
    function performs a depth-first traversal of the parts, collects every leaf
    content type, and keeps the first non-empty plain text and HTML bodies it
    encounters. For a `text/html` leaf the plain text is derived from the HTML
    with BeautifulSoup.

    Decoding is best-effort: if a text part cannot be decoded (for example
    because it declares an unknown charset) its body is treated as empty
    instead of aborting the extraction of the whole message.

    Parameters
    ----------
    msg : EmailMessage
        The email message object to extract features from.

    Returns
    -------
    tuple of (list of str, str, str, list of str)
        A tuple containing:

        - parts : list of str
            MIME content types of all leaf parts, in traversal order.
        - plain_text_body_clean : str
            The plain text body with every run of whitespace collapsed to a
            single space (empty string if not found).
        - html_text_body : str
            The HTML body of the email (empty string if not found).
        - attachment_types : list of str
            Content types of the leaf parts whose Content-Disposition is
            'attachment'.

    Raises
    ------
    TypeError
        If `msg` is not an instance of `EmailMessage`.
    """

    if not isinstance(msg, EmailMessage):
        raise TypeError(f"Expect msg to be a EmailMessage but got {type(msg)}")

    parts = []
    plain_text_body = ''
    html_text_body = ''
    attachment_types = []

    if msg.is_multipart():
        for part in msg.iter_parts():
            sub_parts, plain, html, sub_attachments = extract_body(part)
            parts.extend(sub_parts)
            attachment_types.extend(sub_attachments)

            # Prioritize first plain or html content found
            if not plain_text_body and plain:
                plain_text_body = plain
            if not html_text_body and html:
                html_text_body = html
    else:
        content_type = msg.get_content_type()
        parts.append(content_type)

        # Broad excepts are deliberate below: malformed real-world mail can fail
        # in many ways and a single bad part must not abort the extraction.
        if content_type == 'text/plain':
            try:
                plain_text_body = msg.get_content()
            except Exception:
                plain_text_body = ''
        elif content_type == 'text/html':
            try:
                html_text_body = msg.get_content()
            except Exception:
                html_text_body = ''
            try:
                plain_text_body = BeautifulSoup(html_text_body, 'html.parser').get_text()
            except Exception:
                plain_text_body = ''

        if msg.get_content_disposition() == 'attachment':
            attachment_types.append(content_type)

    # Collapse newlines, tabs and repeated spaces into single spaces.
    plain_text_body_clean = " ".join(plain_text_body.split())

    return parts, plain_text_body_clean, html_text_body, attachment_types

In [None]:
def _split_address_header(header_value) -> Tuple[str, str, str, str]:
    """
    Split an address header into (raw value, display name, address, domain).

    Accepts "Name <addr>", '"Name" <addr>', "<addr>" and bare "addr" forms.
    When the header lists several addresses only the first bracketed one is
    used. A missing or empty header yields four empty strings.
    """
    if not header_value:
        return '', '', '', ''

    # [^<>]* keeps the capture inside the first pair of angle brackets, and
    # \s* (rather than \s) also accepts a header that is only "<addr>".
    match = re.search(r'^(.*?)\s*<([^<>]*)>', header_value)
    if match:
        name = match.group(1).strip()
        # Strip quotes only when the name is really wrapped in them; the name
        # may be empty, so it must not be indexed unconditionally.
        if len(name) >= 2 and name[0] == '"' and name[-1] == '"':
            name = name[1:-1]
        address = match.group(2).strip()
    else:
        # No angle brackets: the header holds only the address.
        name = ''
        address = str(header_value).strip()

    return header_value, name, address, address.split('@')[-1]


def extract_email_metadata(msg: EmailMessage) -> Dict[str, Union[str, bool, None, List[str]]]:
    """
    Extracts metadata and content from an `EmailMessage` object, including headers,
    multipart status, content types, plain text, and HTML body parts.

    Parameters
    ----------
    msg : EmailMessage
        The email message object to extract features from.

    Returns
    -------
    dict
        A dictionary containing extracted fields:

        - 'is_multipart' : bool
            Whether the email is multipart.
        - 'From', 'To' : str
            Raw sender / receiver header ('' if the header is missing).
        - 'From_name', 'To_name' : str
            Display name with surrounding quotes removed ('' if absent).
        - 'From_email', 'To_email' : str
            Address part of the header ('' if the header is missing).
        - 'From_email_domain', 'To_email_domain' : str
            Text after the last '@' of the address.
        - 'Subject' : str or None
            Subject of the email.
        - 'Received' : list of str
            List of 'Received' headers (empty list if none).
        - 'Authentication-Results', 'received-spf', 'DKIM-Signature',
          'Return-Path', 'Content-Language', 'Reply-To' : str or None
            The corresponding header, or None when it is not present.
        - 'Content_types' : list of str
            MIME content types present in the message.
        - 'plain_text_body_clean' : str
            Whitespace-normalized plain text body of the email.
        - 'html_text_body' : str
            Extracted HTML body of the email, if present.
        - 'attachment_types' : list of str
            Content types of the parts marked as attachments.

    Raises
    ------
    TypeError
        If `msg` is not an instance of `EmailMessage`.

    Examples
    --------
    >>> features = extract_email_metadata(msg)
    """
    if not isinstance(msg, EmailMessage):
        raise TypeError(f"Expect msg to be a EmailMessage but got {type(msg)}")

    sender, sender_name, sender_email, sender_email_domain = _split_address_header(msg['From'])
    receiver, receiver_name, receiver_email, receiver_email_domain = _split_address_header(msg['To'])

    received = msg.get_all('Received')
    content_types, plain_text_body_clean, html_text_body, attachment_types = extract_body(msg)

    # Key order is kept stable because it determines the DataFrame column order downstream.
    features_dict = {
        'is_multipart': msg.is_multipart(),
        'From': sender,
        'From_name': sender_name,
        'From_email': sender_email,
        'From_email_domain': sender_email_domain,
        'To': receiver,
        'To_name': receiver_name,
        'To_email': receiver_email,
        'To_email_domain': receiver_email_domain,
        'Subject': msg['Subject'],
        'Received': list(received) if received else [],
        'Authentication-Results': msg['Authentication-Results'],
        'received-spf': msg['received-spf'],
        'DKIM-Signature': msg['DKIM-Signature'],
        'Return-Path': msg['Return-Path'],
        'Content-Language': msg['Content-Language'],
        'Reply-To': msg['Reply-To'],
        'Content_types': list(content_types) if content_types else [],
        'plain_text_body_clean': plain_text_body_clean,
        'html_text_body': html_text_body,
        'attachment_types': attachment_types,
    }

    return features_dict

#### Run a sample dataset of size 100

In [None]:
extracted_dict_list = []
ticket_id_list = []

# Parse every sampled .eml file and collect its extracted features.
for path in toy_list:

    # BytesParser reads from a binary file object; policy.default makes it
    # build EmailMessage objects, which extract_email_metadata requires.
    with open(path, 'rb') as fp:
        msg = BytesParser(policy=policy.default).parse(fp)

    extracted_dict_list.append(extract_email_metadata(msg))
    # The ticket id is the name of the directory that contains the .eml file.
    ticket_id_list.append(path.split('/')[-2])


# One row per email, with the ticket id inserted as the first column.
df = pd.DataFrame(extracted_dict_list)
df.insert(0, 'ticket_id', ticket_id_list)
df.head()

#### `has_header_value` 

- Returns a boolean indicating whether the SPF, DKIM, and Authentication-Results headers are present in the email

In [None]:
def has_header_value(header_value: Union[str, pd.Series, None]) -> Union[bool, pd.Series]:
    """
    Check whether an email header (e.g., Received-SPF, DKIM-Signature, Authentication-Result) is present.

    A header counts as present when it is non-null; an empty string is still
    considered present. `None` is what a message lookup returns for a missing
    header, so it is accepted and reported as absent.

    Parameters
    ----------
    header_value : str, None or pandas.Series
        The header string (or None when the header is missing), or a Series of
        header strings.

    Returns
    -------
    bool or pandas.Series
        True or boolean Series indicating presence (non-null).

    Raises
    ------
    TypeError
        If the input is not a string, None, or a pandas Series.

    Examples
    --------
    >>> has_header_value("v=spf1 include:_spf.google.com ~all")
    True

    >>> has_header_value(None)
    False

    >>> import pandas as pd
    >>> s = pd.Series(["v=spf1", None, ""])
    >>> has_header_value(s)
    0     True
    1    False
    2     True
    dtype: bool
    """
    if isinstance(header_value, pd.Series):
        return header_value.notnull()
    if header_value is None:
        # Missing header: absent, mirroring the element-wise Series behaviour.
        return False
    if isinstance(header_value, str):
        return True
    raise TypeError("Input must be a string, None, or a pandas Series")

In [None]:
# The SPF header is stored under the lowercase 'received-spf' column by
# extract_email_metadata; 'Received-SPF' does not exist and raises KeyError.
has_spf_signature = has_header_value(df['received-spf'])
has_spf_signature

In [None]:
# Boolean Series: True where the email carries a DKIM-Signature header.
has_dkim_signature = has_header_value(df['DKIM-Signature'])
has_dkim_signature

In [None]:
# Boolean Series: True where the email carries an Authentication-Results header.
has_auth_result = has_header_value(df['Authentication-Results'])
has_auth_result

#### `has_dmarc_authentication`

- Check whether the `Authentication-Results` header reports a DMARC result for the sending domain

In [None]:
def has_dmarc_authentication(auth_result: Union[str, pd.Series]) -> Union[bool, pd.Series]:
    """
    Tell whether an 'Authentication-Results' value mentions a DMARC result.

    The check is a case-insensitive search for the substring 'dmarc='.

    Parameters
    ----------
    auth_result : str or pandas.Series
        One 'Authentication-Results' header, or a Series of them.

    Returns
    -------
    bool or pandas.Series
        A single boolean for a string input; an element-wise boolean Series
        for a Series input, with missing entries reported as False.

    Raises
    ------
    TypeError
        If the input is neither a string nor a pandas Series.

    Examples
    --------
    >>> has_dmarc_authentication("mx.google.com; dmarc=pass (p=NONE)")
    True

    >>> import pandas as pd
    >>> s = pd.Series(["spf=pass", "dmarc=fail", None])
    >>> has_dmarc_authentication(s)
    0    False
    1     True
    2    False
    dtype: bool
    """
    # Reject unsupported inputs up front so the two valid cases read linearly.
    if not isinstance(auth_result, (pd.Series, str)):
        raise TypeError("Input must be a string or a pandas Series")

    if isinstance(auth_result, str):
        lowered = auth_result.lower()
        return 'dmarc=' in lowered

    # na=False maps missing headers to False instead of NaN.
    return auth_result.str.contains('dmarc=', case=False, na=False)

In [None]:
# Boolean Series: True where Authentication-Results contains a 'dmarc=' entry.
has_dmarc_authentication_test = has_dmarc_authentication(df['Authentication-Results'])
has_dmarc_authentication_test

#### `get_dkim_result`

- Extract DKIM result from `Authentication-Results`

In [None]:
def get_dkim_result(auth_result: Union[str, pd.Series]) -> Union[str, pd.Series]:
    """
    Pull the DKIM verdict out of an 'Authentication-Results' header.

    The verdict is the run of word characters that follows the first
    'dkim=' in the header.

    Parameters
    ----------
    auth_result : str or pandas.Series
        One 'Authentication-Results' header, or a Series of them.

    Returns
    -------
    str or pandas.Series
        The verdict (e.g., 'pass', 'fail', 'neutral'), or 'none' when no
        'dkim=' entry is found. A Series input yields a Series of verdicts in
        which non-string entries are reported as 'none'.

    Raises
    ------
    TypeError
        If the input is neither a string nor a pandas Series.

    Examples
    --------
    >>> get_dkim_result("mx.google.com; dkim=pass header.d=example.com")
    'pass'

    >>> import pandas as pd
    >>> s = pd.Series([
    ...     "dkim=pass header.d=example.com",
    ...     "spf=pass",
    ...     None
    ... ])
    >>> get_dkim_result(s)
    0    pass
    1    none
    2    none
    dtype: object
    """

    def _verdict(value) -> str:
        # Non-string entries (missing headers) have no verdict.
        if not isinstance(value, str):
            return 'none'
        found = re.search(r'dkim=(\w+)', value)
        return 'none' if found is None else found.group(1)

    if isinstance(auth_result, pd.Series):
        return auth_result.apply(_verdict)
    if isinstance(auth_result, str):
        return _verdict(auth_result)
    raise TypeError("Input must be a string or a pandas Series")
    

In [None]:
# DKIM verdict per email, read from the Authentication-Results column ('none' when absent).
dkim_result = get_dkim_result(df['Authentication-Results'])
dkim_result

#### `get_spf_result`

- Extract SPF result from `received-spf`

In [None]:
def get_spf_result(receive_spf: Union[str, pd.Series, None]) -> Union[str, pd.Series]:
    """
    Extract the SPF result from the 'Received-SPF' header.

    The result is the leading run of word characters of the header value,
    i.e. its first word (e.g., "pass", "fail", "softfail", "neutral"). Values
    that are missing, empty, or do not start with a word character yield
    'none'.

    Parameters
    ----------
    receive_spf : str, None or pandas.Series
        A string or Series representing the 'Received-SPF' header from an
        email. None (header missing) is accepted and treated as no result.

    Returns
    -------
    str or pandas.Series
        The extracted SPF result, exactly as written in the header, or 'none'
        if no result can be read.
        If input is a Series, returns a Series of results.
        If input is a string or None, returns a single result string.

    Raises
    ------
    TypeError
        If the input is not a string, None, or a pandas Series.

    Examples
    --------
    >>> get_spf_result("pass (google.com: domain of example.com designates ...")
    'pass'

    >>> get_spf_result("")
    'none'

    >>> import pandas as pd
    >>> s = pd.Series([
    ...     "fail (google.com: domain of attacker.com ...)",
    ...     "softfail (google.com ...)",
    ...     None
    ... ])
    >>> get_spf_result(s)
    0        fail
    1    softfail
    2        none
    dtype: object
    """

    def _leading_word(value) -> str:
        if not isinstance(value, str):
            return 'none'
        # The match must be checked before .group(): an empty value or one
        # starting with punctuation produces no match at all.
        match = re.search(r'^(\w+)', value)
        return match.group(0) if match else 'none'

    if isinstance(receive_spf, pd.Series):
        return receive_spf.apply(_leading_word)
    if receive_spf is None or isinstance(receive_spf, str):
        return _leading_word(receive_spf)
    raise TypeError("Input must be a string or a pandas Series")

In [None]:
# SPF verdict per email, read from the 'received-spf' column ('none' when absent).
spf_result = get_spf_result(df['received-spf'])
spf_result

#### `get_dmarc_result`

- Extract DMARC result from `Authentication-Results`

In [None]:
def get_dmarc_result(auth_result: Union[str, pd.Series]) -> Union[str, pd.Series]:
    """
    Pull the DMARC verdict out of an Authentication-Results header.

    The verdict is the run of word characters that follows the first
    'dmarc=' in the header.

    Parameters
    ----------
    auth_result : str or pandas.Series
        One Authentication-Results header, or a Series of them.

    Returns
    -------
    str or pandas.Series
        The verdict (`pass`, `fail`, etc.), or 'none' when no 'dmarc=' entry
        is found. A Series input yields a Series of verdicts in which
        non-string entries are reported as 'none'.

    Raises
    ------
    TypeError
        If the input is neither a string nor a pandas Series.

    Examples
    --------
    >>> get_dmarc_result("Authentication-Results: mx.google.com; dmarc=pass header.from=example.com")
    'pass'

    >>> import pandas as pd
    >>> s = pd.Series([
    ...     "dmarc=pass header.from=example.com",
    ...     "spf=pass smtp.mailfrom=example.com",
    ...     None
    ... ])
    >>> get_dmarc_result(s)
    0     pass
    1     none
    2     none
    dtype: object
    """

    def _verdict(value) -> str:
        # Non-string entries (missing headers) have no verdict.
        if not isinstance(value, str):
            return 'none'
        found = re.search(r'dmarc=(\w+)', value)
        return 'none' if found is None else found.group(1)

    if isinstance(auth_result, pd.Series):
        return auth_result.apply(_verdict)
    if isinstance(auth_result, str):
        return _verdict(auth_result)
    raise TypeError("Input must be a string or a pandas Series")

In [None]:
# DMARC verdict per email, read from the Authentication-Results column ('none' when absent).
dmarc_result = get_dmarc_result(df['Authentication-Results'])
dmarc_result 

#### `extract_dkim_domain`, `dkim_domain_matches_sender`

- Extracts the signing domain from `DKIM-Signature` and compares it to the `From` domain

In [None]:
def extract_dkim_domain(dkim_signature: Union[str, pd.Series]) -> Union[str, pd.Series]:
    """
    Read the signing domain out of a DKIM-Signature header.

    The domain is the run of word characters, dots and hyphens that follows
    the first 'd=' found in the header; it is returned in lowercase.

    Parameters
    ----------
    dkim_signature : str or pandas.Series
        One DKIM-Signature header, or a Series of them.

    Returns
    -------
    str, None or pandas.Series
        For a string input, the lowercase domain or None when no 'd=' value is
        found. For a Series input, a Series of such values in which non-string
        entries are reported as None.

    Raises
    ------
    TypeError
        If the input is neither a string nor a pandas Series.

    Examples
    --------
    >>> extract_dkim_domain("v=1; a=rsa-sha256; d=example.com; s=selector1;")
    'example.com'

    >>> import pandas as pd
    >>> s = pd.Series([
    ...     "v=1; a=rsa-sha256; d=example.com; s=selector1;",
    ...     "v=1; a=rsa-sha256; d=another.org; s=selector2;",
    ...     None
    ... ])
    >>> extract_dkim_domain(s)
    0    example.com
    1    another.org
    2          None
    dtype: object
    """
    if not isinstance(dkim_signature, (pd.Series, str)):
        raise TypeError("dkim_signature must be either a string or a pandas Series.")

    def _signing_domain(signature) -> Union[str, None]:
        # Missing headers arrive as non-string entries and carry no domain.
        found = re.search(r'd=([\w\.-]+)', signature) if isinstance(signature, str) else None
        if found is None:
            return None
        return found.group(1).lower()

    if isinstance(dkim_signature, str):
        return _signing_domain(dkim_signature)
    return dkim_signature.apply(_signing_domain)
    

def dkim_domain_matches_sender(dkim_signature: Union[str, pd.Series], sender_domain: Union[str, pd.Series]) -> Union[bool, pd.Series]:
    """
    Compare the domain used in the DKIM-Signature header (`d=`) with the sender's domain.

    Parameters
    ----------
    dkim_signature : str or pandas.Series
        The DKIM-Signature header string or a Series of such strings from email headers.
        Expected to contain a `d=` tag that specifies the signing domain.

    sender_domain : str or pandas.Series
        The domain part of the sender's email address (e.g., the part after '@') or a Series of such domains.

    Returns
    -------
    bool or pandas.Series
        For string inputs, True if the DKIM domain equals the sender domain
        (case-insensitive exact match) and False otherwise, including when no
        DKIM domain can be extracted. For Series inputs, a boolean Series of
        element-wise comparisons; entries with a missing signature are False.

    Raises
    ------
    TypeError
        If the inputs are not both strings or both pandas Series.

    Examples
    --------
    >>> dkim_domain_matches_sender("v=1; a=rsa-sha256; d=example.com; s=selector1;", "example.com")
    True

    >>> import pandas as pd
    >>> sigs = pd.Series(["v=1; d=example.com;", "v=1; d=another.org;", None])
    >>> senders = pd.Series(["example.com", "another.org", "missing.com"])
    >>> dkim_domain_matches_sender(sigs, senders)
    0     True
    1     True
    2    False
    dtype: bool
    """

    if isinstance(dkim_signature, pd.Series) and isinstance(sender_domain, pd.Series):
        dkim_domains = extract_dkim_domain(dkim_signature)
        # Missing sender domains become '' so they never equal an extracted domain.
        sender_domains = sender_domain.fillna('').str.lower()
        return dkim_domains == sender_domains

    if isinstance(dkim_signature, str) and isinstance(sender_domain, str):
        dkim_domain = extract_dkim_domain(dkim_signature)
        # No extractable signing domain means there is nothing to match.
        if not dkim_domain:
            return False
        return dkim_domain == sender_domain.lower()

    raise TypeError("Both inputs must be either str or pd.Series of the same length.")

In [None]:
# Boolean Series: True where the DKIM signing domain equals the From address domain.
dkim_domain_match = dkim_domain_matches_sender(df['DKIM-Signature'], df['From_email_domain'])
dkim_domain_match

In [None]:
# Signing domain taken from each DKIM-Signature header (None where it cannot be read).
extracted_domain_from_dkim = extract_dkim_domain(df['DKIM-Signature'])
extracted_domain_from_dkim

#### `has_attachment`

- Return true if the email has one or more attachments

In [None]:
def has_attachment(attachment_list: Union[List, pd.Series]) -> Union[bool, pd.Series]:
    """
    Determines whether attachments are present.

    Parameters
    ----------
    attachment_list : list or pd.Series
        A list of attachments or a Series of such lists.

    Returns
    -------
    bool or pd.Series
        True if the list has attachments, or a Series of booleans for each list.
        Series entries that are not lists are reported as False.

    Raises
    ------
    TypeError
        If the input is neither a list nor a pandas Series.

    Examples
    --------
    >>> has_attachment(['application/pdf'])
    True

    >>> has_attachment([])
    False
    """

    if isinstance(attachment_list, pd.Series):
        # Check against the builtin `list`; non-list entries count as "no attachment".
        return attachment_list.apply(lambda x: isinstance(x, list) and len(x) > 0)

    if isinstance(attachment_list, list):
        return len(attachment_list) > 0

    raise TypeError("attachment_list must be either a List or a pandas Series.")

In [None]:
# Store the result under its own name: assigning it to `has_attachment` would
# overwrite the function, so any later call (or re-running this cell) would fail.
has_attachment_test = has_attachment(df['attachment_types'])
has_attachment_test

#### `number_of_received`

- Returns the number of email servers an email has passed through, based on the count of "Received" headers.

In [None]:
def number_of_received(received_headers: Union[List[str], pd.Series]) -> Union[int, pd.Series]:
    """
    Count the "Received" headers of an email, i.e. the number of mail
    servers it has passed through.

    Parameters
    ----------
    received_headers : list of str or pandas.Series
        The "Received" headers of one email as a list, or a Series holding one
        such list per email.

    Returns
    -------
    int or pandas.Series
        - For a list: its length (the number of hops).
        - For a Series: a Series with the length of each list; entries that
          are not lists count as 0.

    Raises
    ------
    TypeError
        If the input is neither a list nor a pandas Series.

    Examples
    --------
    >>> number_of_received(['from server1', 'from server2', 'from server3'])
    3

    >>> import pandas as pd
    >>> s = pd.Series([
    ...     ['from server1', 'from server2'],
    ...     ['from server1'],
    ...     []
    ... ])
    >>> number_of_received(s)
    0    2
    1    1
    2    0
    dtype: int64
    """
    if isinstance(received_headers, list):
        return len(received_headers)

    if not isinstance(received_headers, pd.Series):
        raise TypeError("received_headers must be either a list or a pandas Series.")

    def _hop_count(headers) -> int:
        # Anything that is not a list (e.g. a missing value) contributes no hops.
        if isinstance(headers, list):
            return len(headers)
        return 0

    return received_headers.apply(_hop_count)

In [None]:
# Number of Received headers (hops) recorded for each email.
number_received = number_of_received(df['Received'])
number_received

#### `to_from_match`

- Return true if the "To" email address matches the "From" email address. 

In [None]:
def to_from_match(to_email: Union[str, pd.Series], from_email: Union[str, pd.Series]) -> Union[bool, pd.Series]: 
    """
    Report whether the recipient and sender addresses are the same.

    The comparison ignores letter case.

    Parameters
    ----------
    to_email : str or pd.Series
        The recipient email address or a Series of addresses.
    from_email : str or pd.Series
        The sender email address or a Series of addresses.

    Returns
    -------
    bool or pd.Series
        A single boolean for two strings, or an element-wise boolean Series
        for two Series.

    Raises
    ------
    TypeError
        If the inputs are not both strings or both pandas Series.

    Examples
    --------
    >>> to_from_match("alice@example.com", "Alice@Example.com")
    True

    >>> to_from_match(
    ...     pd.Series(["alice@example.com", "bob@example.com"]),
    ...     pd.Series(["ALICE@example.com", "eve@example.com"])
    ... )
    0     True
    1    False
    dtype: bool
    """
    both_series = isinstance(to_email, pd.Series) and isinstance(from_email, pd.Series)
    both_strings = isinstance(to_email, str) and isinstance(from_email, str)

    # Mixed or unsupported input types are rejected before any comparison.
    if not (both_series or both_strings):
        raise TypeError("Both inputs must be either str or pd.Series of equal length.")

    if both_strings:
        return to_email.lower() == from_email.lower()

    lowered_to = to_email.str.lower()
    lowered_from = from_email.str.lower()
    return lowered_to == lowered_from

In [None]:
# Compare the extracted addresses rather than the raw 'To'/'From' headers:
# the raw headers also carry display names, so identical addresses with
# different names would be reported as a mismatch.
to_from_match_test = to_from_match(df['To_email'], df['From_email'])
to_from_match_test

#### `extract_spf_email`, `spf_email_matches_sender`

- Extracts the sender email address from `received-spf` and compares it to the `From` email address

In [None]:
def extract_spf_email(received_spf: Union[str, pd.Series]) -> Union[str, pd.Series]:
    """
    Extract the sender email from the `Received-SPF` header field.

    This function looks for the `envelope-from=` field inside the SPF result string and extracts the email address.
    The value may be bare, wrapped in single or double quotes, or wrapped in
    angle brackets; the wrapper and any trailing ';' are not part of the result.

    Parameters
    ----------
    received_spf : str or pandas.Series
        The `Received-SPF` header string or a Series of such strings from email headers.
        Expected to contain the `envelope-from=` field.

    Returns
    -------
    str, None or pandas.Series
        The email extracted from the `envelope-from=` field, in lowercase, or
        None when the field is not found. For a Series input, a Series of such
        values (with None for unparsable or missing entries).

    Raises
    ------
    TypeError
        If the input is neither a string nor a pandas Series.

    Examples
    --------
    >>> extract_spf_email('pass (google.com: domain of test@example.com designates 1.2.3.4 as permitted sender) client-ip=1.2.3.4; envelope-from="test@example.com"; helo=mail.example.com;')
    'test@example.com'

    >>> extract_spf_email('pass client-ip=1.2.3.4; envelope-from=test@example.com; helo=mail.example.com;')
    'test@example.com'

    >>> import pandas as pd
    >>> s = pd.Series([
    ...     'pass (google.com: domain of test@example.com designates 1.2.3.4 as permitted sender) envelope-from="test@example.com";',
    ...     'neutral (spf=neutral) envelope-from="user@another.org";',
    ...     None
    ... ])
    >>> extract_spf_email(s)
    0    test@example.com
    1    user@another.org
    2                None
    dtype: object
    """

    def extract(spf: str) -> Union[str, None]:
        if not isinstance(spf, str):
            return None
        # Skip one optional opening quote or '<', then capture up to the next
        # quote, angle bracket, ';' or whitespace so the delimiter that ends
        # the field never becomes part of the address.
        match = re.search(r'envelope-from=["\'<]?([^"\'<>;\s]+)', spf)
        return match.group(1).lower() if match else None

    if isinstance(received_spf, pd.Series):
        return received_spf.apply(extract)
    elif isinstance(received_spf, str):
        return extract(received_spf)
    else:
        raise TypeError("received_spf must be either a string or a pandas Series.")
    

def spf_email_matches_sender(received_spf: Union[str, pd.Series], sender_email: Union[str, pd.Series]) -> Union[bool, pd.Series]:
    """
    Compare the sender email address in the SPF record (`envelope-from=`) with the sender's email address.

    This function extracts the full email address from the SPF record's `envelope-from=` field
    and compares it to the provided sender email address. The comparison is case-insensitive 
    and supports both string inputs and pandas Series for batch evaluation.

    Parameters
    ----------
    received_spf : str or pandas.Series
        A single `Received-SPF` header string or a Series of such strings from email headers.

    sender_email : str or pandas.Series
        The sender's email address (e.g., "user@example.com") or a Series of such addresses.

    Returns
    -------
    bool or pandas.Series
        For string inputs, True if the SPF `envelope-from=` address equals the
        given sender email (case-insensitive exact match) and False otherwise,
        including when no address can be extracted. For Series inputs, a
        boolean Series of element-wise comparisons; entries with a missing
        header are False.

    Raises
    ------
    TypeError
        If the inputs are not both strings or both pandas Series.

    Examples
    --------
    >>> spf_email_matches_sender(
    ...     'pass (google.com: domain of test@example.com designates 1.2.3.4 as permitted sender) envelope-from="test@example.com";',
    ...     'test@example.com'
    ... )
    True

    >>> import pandas as pd
    >>> spf_headers = pd.Series([
    ...     'pass envelope-from="user@example.com";',
    ...     'pass envelope-from="admin@another.org";',
    ...     None
    ... ])
    >>> sender_emails = pd.Series(['user@example.com', 'admin@another.org', 'no-reply@other.com'])
    >>> spf_email_matches_sender(spf_headers, sender_emails)
    0     True
    1     True
    2    False
    dtype: bool
    """

    if isinstance(received_spf, pd.Series) and isinstance(sender_email, pd.Series):
        spf_emails = extract_spf_email(received_spf)
        # Missing sender addresses become '' so they never equal an extracted address.
        sender_emails = sender_email.fillna('').str.lower()
        return spf_emails == sender_emails

    if isinstance(received_spf, str) and isinstance(sender_email, str):
        spf_email = extract_spf_email(received_spf)
        # No extractable envelope-from address means there is nothing to match.
        if not spf_email:
            return False
        return spf_email == sender_email.lower()

    raise TypeError("Both inputs must be either str or pd.Series of the same length.")

In [None]:
# Boolean Series: True where the SPF envelope-from address equals the From address.
spf_email_match = spf_email_matches_sender(df['received-spf'], df['From_email'])
spf_email_match