In [None]:
import pandas as pd
import re
import warnings

In [None]:
# Load the raw sample and drop every row whose `target_3` label is "self_phishing".
raw_df = pd.read_parquet('/data/workspace/dataset/sampled-dataset/raw/sample-small.parquet').query('`target_3` != "self_phishing"')
# Load the file stored under the `processed` directory; no row filter is applied to it here.
processed_df = pd.read_parquet('/data/workspace/dataset/sampled-dataset/processed/sample-small.parquet')

## Match sender's email domain using `From` and `Reply-To` fields

### Extract email address from any header field

In [None]:
from email.utils import parseaddr

def get_name_from_email_header(header_field: str | pd.Series) -> str | pd.Series | None:
    """
    Extract display name from an email field's 'From' or similar headers.
        
    This function parses email header fields to extract the human-readable name
    part, which helps identify how the sender presented themselves in the email.
        
    Parameters
    ----------
    header_field : str or pandas.Series
        The email header field content (like 'From', 'To', 'Reply-To', etc.)
        that typically contains name and email address
            
    Returns
    -------
    str, pandas.Series or None
        The extracted name part of the email header
        - If input is a string: Returns name as string or header_field if no name found
        - If input is a Series: Returns a Series of names
        - Returns None if the input is None
            
    Examples
    --------
    >>> get_name_from_email_header('John Doe <john@example.com>')
    'John Doe'
    >>> get_name_from_email_header('no-name@example.com')
    'no-name@example.com'
    """
    if isinstance(header_field, pd.Series):
        return header_field.apply(get_name_from_email_header) # type: ignore

    if not header_field:
        warnings.warn("None entry found in `header_field`")
        return None
    
    if not isinstance(header_field, str):
        raise ValueError("`header_field` must be str or pandas.Series")
    
    name, email = parseaddr(header_field)
    
    if (name == '' and email == ''):
        name = header_field

    return name

def get_email_addr_from_email_header(header_field: str | pd.Series) -> str | pd.Series | None:
    """
    Extract email address from an email field's 'From' or similar headers.
        
    This function parses email header fields to extract the email address part,
    which helps identify the actual address used by the sender.
        
    Parameters
    ----------
    header_field : str or pandas.Series
        The email header field content (like 'From', 'To', 'Reply-To', etc.)
        that typically contains name and email address
            
    Returns
    -------
    str, pandas.Series or None
        The extracted email address part of the email header
        - If input is a string: Returns email address as string or None if no valid address found
        - If input is a Series: Returns a Series of email addresses
        - Returns None if the input is None
            
    Examples
    --------
    >>> get_email_addr_from_email_header('John Doe <john@example.com>')
    'john@example.com'
    >>> get_email_addr_from_email_header('invalid-format')
    None
    """
    if isinstance(header_field, pd.Series):
        return header_field.apply(get_email_addr_from_email_header) # type: ignore

    if not header_field:
        warnings.warn("None entry found in `header_field`")
        return None
    
    if not isinstance(header_field, str):
        raise ValueError("`header_field` must be str or pandas.Series")
    
    name, email = parseaddr(header_field)

    return email

In [None]:
# Parse the address part out of every `Reply-To` header in the raw sample.
reply_to_emails = get_email_addr_from_email_header(raw_df['Reply-To'])

# Bare expression: shows the resulting Series as the cell output.
reply_to_emails

### Extract domain of email address

In [None]:
def get_email_domain(email_address: str | pd.Series | None) -> str | pd.Series | None:
    """
    Extract domain from an email address.
        
    Parameters
    ----------
    email_address : str or pandas.Series
        Email address or series of email addresses
        
    Returns
    -------
    str, pandas.Series or None
        The extracted domain part of the email address(es)
        - If input is a string: Returns domain as string or None if not a valid email
        - If input is a Series: Returns a Series of domains
        
    Examples
    --------
    >>> extract_email_domain('user@example.com')
    'example.com'
    >>> extract_email_domain('invalid-email')
    None
    """
    if isinstance(email_address, pd.Series):
        return email_address.apply(get_email_domain) # type: ignore

    if not email_address:
        warnings.warn("None entry found in `email_address`")
        return None
    
    if not isinstance(email_address, str):
        raise ValueError("`email_address` must be str or pandas.Series or None")

    parts = email_address.strip().split('@')
    
    if len(parts) == 2 and parts[1]:
        return parts[1].lower()
    else:
        return None

In [None]:
# Reduce each parsed Reply-To address to its (lower-cased) domain.
reply_to_domain = get_email_domain(reply_to_emails)
# Bare expression: shows the resulting Series as the cell output.
reply_to_domain

### Match domains in `From` and `Reply-To` fields

In [None]:
def check_different_reply_domain(from_domain: str | pd.Series | None, reply_to_domain: str | pd.Series | None) -> bool | pd.Series:
    """
    Check if the domain from 'From' header matches the domain from 'Reply-To' header.

    This function evaluates if there's a mismatch between the sender's domain in the 'From' 
    field and the domain in the 'Reply-To' field, which is a common indicator of phishing.

    Parameters
    ----------
    from_domain : str or pandas.Series or None
        Domain extracted from the 'From' email header field
    reply_to_domain : str or pandas.Series or None
        Domain extracted from the 'Reply-To' email header field

    Returns
    -------
    bool or pandas.Series of bool
        True if there's a suspicious configuration (different domains or missing fields),
        False if the configuration appears normal

    Notes
    -----
    Returns True (suspicious) in the following cases:
    - Reply-To exists but From doesn't
    - Both Reply-To and From are missing
    - Reply-To and From domains don't match
    """
    if isinstance(from_domain, pd.Series) and isinstance(reply_to_domain, pd.Series):
        result = pd.Series(False, index=from_domain.index)
        
        # Case where Reply-To domain doesn't exist
        no_reply_mask = reply_to_domain.isna() | (reply_to_domain == '')
        # Subcase: From exists (normal)
        has_from_mask = ~(from_domain.isna() | (from_domain == ''))
        
        # Both missing is suspicious
        result[no_reply_mask & ~has_from_mask] = True
        
        # Case where Reply-To domain exists
        has_reply_mask = ~no_reply_mask
        # Different domains is suspicious
        result[has_reply_mask & has_from_mask & (from_domain != reply_to_domain)] = True
        # Missing From but having Reply-To is suspicious
        result[has_reply_mask & ~has_from_mask] = True
        
        return result
    
    if not (isinstance(from_domain, str) and isinstance(reply_to_domain, str)):
        raise ValueError("`from_domain` and `reply_to_domain` must both be str or pandas.Series")
    
    # Original logic for string inputs
    # Case 1 & 2: Reply-To domain does not exist
    if not reply_to_domain:
        # If From exists, this is normal configuration
        if from_domain:
            return False
        # Both missing is suspicious
        else:
            return True
    
    # Case 3, 4 & 5: Reply-To domain exists
    else:
        # From domain exists
        if from_domain:
            # Match is normal
            if reply_to_domain == from_domain:
                return False
            # Different domains is suspicious
            else:
                return True
        # Missing From but having Reply-To is suspicious
        else:
            return True

In [None]:
# Compare the `From_email_domain` column of the raw sample with the Reply-To domains derived above.
# NOTE(review): the comparison is case-sensitive and `reply_to_domain` is lower-cased by
# get_email_domain — confirm `From_email_domain` is lower-cased as well.
reply_domains_different = check_different_reply_domain(raw_df['From_email_domain'], reply_to_domain)
# Bare expression: shows the resulting Series as the cell output.
reply_domains_different

## Use information from first legitimate `Received` header

### Extract first `Received` header that records the first server-to-server transfer (internal to external transfer)

In [None]:
from ipaddress import ip_address
from array import array

def extract_first_server_transfer(received_headers: list | pd.Series | array | None) -> str | pd.Series | None:
    """
    Extract the first meaningful 'Received' header that captures the server-to-server transfer.

    This function identifies the first server-to-server email transfer from external
    to UBC mail systems, which is useful for analyzing the email's origin.

    Parameters
    ----------
    received_headers : list or pandas.Series or array.array or None
        A list of 'Received' headers from an email or collection of emails,
        usually ordered from most recent (index 0) to oldest (index n)

    Returns
    -------
    str or pandas.Series or None
        The identified first server-to-server transfer header, or None if not found
        - If input is a list: Returns a string with the first relevant header
        - If input is a Series: Returns a Series with processed headers
        
    Notes
    -----
    The function prioritizes finding:
    1. Headers showing the UBC mail relay connection (indicating an incoming email)
    2. Headers with external IP addresses (often indicating external connections)
    3. Any header with both "from" and "by" components that isn't internal
    """
    if isinstance(received_headers, pd.Series):
        return received_headers.apply(extract_first_server_transfer) # type: ignore
    
    if not isinstance(received_headers, list) and hasattr(received_headers, 'tolist'):
        received_headers = received_headers.tolist() # type: ignore

    if not received_headers:
        warnings.warn("None entry found in `received_headers`")
        return None
    
    if not isinstance(received_headers, list):
        raise ValueError("`received_headers` must either be a list or able to be converted to a list")
    
    # Reverse the headers to start from the earliest transfer
    for header in reversed(received_headers):
        if not isinstance(header, str):
            continue
            
        # Skip internal transfers and known patterns for local processing
        if "localhost" in header or "127.0.0.1" in header or \
           "with mapi id" in header or \
           "envelope-from" in header:
            continue

        # Look for the UBC mail relay pattern
        if re.search(r'by\s+[a-zA-Z0-9-]+\.mail-relay\.ubc\.ca', header):
            return header
        
        # Look for headers with external IP addresses
        # This pattern matches "unknown [IP]" which often indicates external connections
        if re.search(r'unknown \[\d+\.\d+\.\d+\.\d+\]', header):
            return header
        
        # Also match any "from" that has a real IP that's not localhost
        if "from" in header and "by" in header:
            ip_matches = re.findall(r'\[(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\]', header)
            for ip in ip_matches:
                address = ip_address(ip)
                if address.is_global:
                    return header
    
    # If nothing matched our criteria, use the first header with from and by
    for header in reversed(received_headers):
        # Skip internal transfers and known patterns for local processing
        if "localhost" in header or "127.0.0.1" in header or \
           "with mapi id" in header or \
           "envelope-from" in header:
            continue

        if isinstance(header, str) and "from" in header and "by" in header:
            return header
    
    return None

In [None]:
# Pick, for every email, the `Received` header selected by extract_first_server_transfer.
first_received_headers = extract_first_server_transfer(raw_df['Received'])

# Bare expression: shows the resulting Series as the cell output.
first_received_headers

### Extract the domain name of the sending server

In [None]:
def extract_domain_from_received_header(received_header: str | pd.Series | None) -> str | pd.Series | None:
    """
    Extract domain name from 'From' field in Received email header.
    
    This function parses a Received email header to extract the domain name
    of the sending server, which helps establish the source of the email.
    
    Parameters
    ----------
    received_header : str or pandas.Series or None
        The Received header field containing information about email routing
        
    Returns
    -------
    str, pandas.Series or None
        The extracted domain name of the sending mail server
        - If input is a string: Returns domain as string or None if not found
        - If input is a Series: Returns a Series of domain names
        
    Examples
    --------
    >>> header = "from mail-server.example.com ([192.168.1.1]) by receiver.com"
    >>> extract_domain_from_received_header(header)
    'mail-server.example.com'
    """
    if isinstance(received_header, pd.Series):
        return received_header.apply(extract_domain_from_received_header) # type: ignore

    if not received_header:
        warnings.warn("None entry found in `received_header`")
        return None
    
    if not isinstance(received_header, str):
        raise ValueError("`received_header` must be str or pandas.Series or None")
        
    domain_pattern = r'from\s+([a-zA-Z0-9][-a-zA-Z0-9.]*\.[a-zA-Z0-9][-a-zA-Z0-9.]*)[\s\(]'
    
    match = re.search(domain_pattern, received_header)
    if match:
        return match.group(1)
    
    fallback_pattern = r'from\s+[^\(]*\([^H]*HELO\s+([a-zA-Z0-9][-a-zA-Z0-9.]*\.[a-zA-Z0-9][-a-zA-Z0-9.]*)'
    
    match = re.search(fallback_pattern, received_header)
    if match:
        return match.group(1)
    
    return None

In [None]:
# Host name of the sending server, taken from each selected `Received` header.
sender_server_domain = extract_domain_from_received_header(first_received_headers)

# Bare expression: shows the resulting Series as the cell output.
sender_server_domain

In [None]:
def extract_ip_from_received_header(received_header: str | pd.Series | None) -> str | pd.Series | None:
    """
    Extract IP address from 'Received' header in an email.

    This function identifies and extracts the IP address of the sending mail server
    from the 'Received' header, useful for analyzing the true origin of an email.

    Parameters
    ----------
    received_header : str or pandas.Series or None
        The Received header string containing information about email routing,
        or a Series of such headers
        
    Returns
    -------
    str, pandas.Series or None
        The extracted IP address of the sending server
        - If input is a string: Returns the first IP address found or None
        - If input is a Series: Returns a Series of extracted IP addresses
        
    Examples
    --------
    >>> header = "from mail-server.example.com ([192.168.1.1]) by receiver.com"
    >>> extract_ip_from_received_header(header)
    '192.168.1.1'
    """
    if isinstance(received_header, pd.Series):
        return received_header.apply(extract_ip_from_received_header) # type: ignore

    if not received_header:
        warnings.warn("None entry found in `received_header`")
        return None
    
    if not isinstance(received_header, str):
        raise ValueError("`received_header` must be str or pandas.Series or None")
    
    if isinstance(received_header, str):
        # Extract IPv4 addresses (looking for standard dotted decimal format in brackets)
        ipv4_pattern = r'\[(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\]'
        ipv4_matches = re.findall(ipv4_pattern, received_header)
        
        if ipv4_matches:
            return ipv4_matches[0]
        
        # Extract IPv6 addresses (various formats)
        # Standard IPv6 format with colons in brackets
        ipv6_pattern1 = r'\[([0-9a-fA-F:]+:[0-9a-fA-F:]+)\]'
        # IPv6 with port specification
        ipv6_pattern2 = r'([0-9a-fA-F:]+:[0-9a-fA-F:]+)\)'
        # IPv6 with HELO
        ipv6_pattern3 = r'\(([0-9a-fA-F:]+:[0-9a-fA-F:]+)\)'
        
        ipv6_matches = []
        ipv6_matches.extend(re.findall(ipv6_pattern1, received_header))
        ipv6_matches.extend(re.findall(ipv6_pattern2, received_header))
        ipv6_matches.extend(re.findall(ipv6_pattern3, received_header))
        
        # Filter out non-IP-looking matches and add valid IPv6 addresses
        for match in ipv6_matches:
            if ':' in match and match.count(':') >= 2:  # Simple validation for IPv6
                return match
        
        return None
    
    if not received_header or not isinstance(received_header, list):
        return 
    
    result = []
    
    for header in received_header:
        if not isinstance(header, str):
            result.append(None)
            continue
            
        # Extract IPv4 addresses
        ipv4_pattern = r'\[(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\]'
        ipv4_matches = re.findall(ipv4_pattern, header)
        
        if ipv4_matches:
            result.append(ipv4_matches[0])
            continue
        
        # Extract IPv6 addresses (various formats)
        ipv6_pattern1 = r'\[([0-9a-fA-F:]+:[0-9a-fA-F:]+)\]'
        ipv6_pattern2 = r'([0-9a-fA-F:]+:[0-9a-fA-F:]+)\)'
        ipv6_pattern3 = r'\(([0-9a-fA-F:]+:[0-9a-fA-F:]+)\)'
        
        ipv6_matches = []
        ipv6_matches.extend(re.findall(ipv6_pattern1, header))
        ipv6_matches.extend(re.findall(ipv6_pattern2, header))
        ipv6_matches.extend(re.findall(ipv6_pattern3, header))
        
        for match in ipv6_matches:
            if ':' in match and match.count(':') >= 2:
                result.append(match)
                break
        else:  # No IPv6 match found
            result.append(None)
    
    return result if result else None

In [None]:
# IP address found in each selected `Received` header (None where nothing matches).
sender_server_ip = extract_ip_from_received_header(first_received_headers)

# Bare expression: shows the resulting Series as the cell output.
sender_server_ip

### Check if email originates from UBC (signs of Outbound phishing or spam)

In [None]:
def check_email_from_ubc(received_header: str | pd.Series | None) -> bool | pd.Series:
    """
    Check if an email originates from UBC based on received header.

    This function identifies emails that originate from UBC's email infrastructure,
    which can help distinguish between legitimate internal emails and external phishing
    attempts masquerading as internal communications.

    Parameters
    ----------
    received_header : str or pandas.Series or None
        The first meaningful 'Received' header from an email or collection of emails
        
    Returns
    -------
    bool or pandas.Series
        True if the email originates from a UBC mail server, False otherwise
        - If input is a string: Returns a boolean indicating UBC origin
        - If input is a Series: Returns a Series of boolean values
        
    Examples
    --------
    >>> received = "from mail-server-1.ubc.ca (10.10.10.10) by mail-server-0.ubc.ca"
    >>> check_email_from_ubc(received)
    True
    """
    if isinstance(received_header, pd.Series):
        return received_header.apply(check_email_from_ubc) # type: ignore
        
    if not received_header:
        warnings.warn("None entry found in `received_header`")
        return False
    
    if not isinstance(received_header, str):
        raise ValueError("`received_header` must be str or pandas.Series or None")
    
    pattern = r'from\s+[^\s]+\.ubc\.ca\b'
    match = re.search(pattern, received_header, re.IGNORECASE)
    
    return match is not None

In [None]:
# Flag emails whose selected `Received` header names a ubc.ca host as the sender.
originates_from_ubc = check_email_from_ubc(first_received_headers)

# Bare expression: shows the resulting Series as the cell output.
originates_from_ubc

### Check if `name_servers` of `From_email_domain` matches that of sending server in `Received`

In [None]:
from whois import whois
from functools import cache
import time

@cache
def __get_name_servers(domain: str) -> list | None:
    """
    Private method for `get_name_servers`.

    Parameters
    ----------
    domain : str
        Domain name to query for name servers
        
    Returns
    -------
    list or None
        List of name servers for the domain,
        or None if lookup fails or domain doesn't exist
    """    
    if not domain:
        warnings.warn("None entry found in `domain`")
        return None
    
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            result = whois(domain)
            if 'name_servers' in result:
                return result['name_servers']
        except Exception as e:
            if attempt == max_attempts - 1:
                return None
        time.sleep(0.5)
    
    return None

def get_name_servers(domain: str | pd.Series | None) -> list | pd.Series | None:
    """
    Extract name servers for a domain using Python's whois library.

    This function looks up DNS name servers for a domain name, which helps verify 
    the authenticity of domain ownership and identify mismatches that might 
    indicate spoofing attempts.

    Parameters
    ----------
    domain : str or pandas.Series
        Domain name(s) to query for name servers
        
    Returns
    -------
    list, pandas.Series, or None
        List of name servers for the domain, Series of lists for multiple domains,
        or None if lookup fails or domain doesn't exist

    Examples
    --------
    >>> get_name_servers('google.com')
    ['ns1.google.com', 'ns2.google.com', 'ns3.google.com', 'ns4.google.com']
    >>> get_name_servers('nonexistentdomain123456789.com')
    None
    """
    if isinstance(domain, pd.Series):
        return domain.apply(lambda x: __get_name_servers(x))   # type: ignore
    
    return __get_name_servers(domain)

In [None]:
# Look up the name servers of the sending-server domain taken from the `Received` header.
# NOTE(review): despite the `_from_` in its name, this variable is derived from `Received`;
# the names of this variable and `sender_received_name_servers` look swapped. The later
# match check compares the two symmetrically, so its result does not depend on the naming.
sender_from_name_servers = get_name_servers(sender_server_domain)

# Bare expression: shows the resulting Series as the cell output.
sender_from_name_servers

In [None]:
# Look up the name servers of the domain in the `From_email_domain` column.
# NOTE(review): despite the `_received_` in its name, this variable is derived from the
# `From` domain; see the matching remark on `sender_from_name_servers`.
sender_received_name_servers = get_name_servers(raw_df['From_email_domain'])

# Bare expression: shows the resulting Series as the cell output.
sender_received_name_servers

In [None]:
from collections import Counter

def check_sender_name_servers_match(sender_from_name_servers: list | pd.Series | None, 
                                    sender_received_name_servers: list | pd.Series | None) -> bool | pd.Series:
    """
    Check if name servers from sender domain (based on 'From') match name servers from 'Received' domain.
    
    Parameters
    ----------
    sender_from_name_servers : list, pandas.Series, or None
        List of name servers from the sender's domain
    sender_received_name_servers : list, pandas.Series, or None
        List of name servers from the received message domain
        
    Returns
    -------
    bool or pandas.Series
        True if name servers match, False otherwise
    """
    if isinstance(sender_from_name_servers, pd.Series) and isinstance(sender_received_name_servers, pd.Series):
        matches = pd.Series(False, index=sender_from_name_servers.index)
        
        for idx in sender_from_name_servers.index:
            try:
                from_ns = sender_from_name_servers.loc[idx]
                received_ns = sender_received_name_servers.loc[idx]
                
                if not from_ns or not received_ns:
                    matches[idx] = False
                    continue
                    
                matches[idx] = Counter(from_ns) == Counter(received_ns)
            except Exception as e:
                matches[idx] = False
                
        return matches
    
    if hasattr(sender_from_name_servers, 'tolist'):
        sender_from_name_servers = sender_from_name_servers.tolist()
    if hasattr(sender_received_name_servers, 'tolist'):
        sender_received_name_servers = sender_received_name_servers.tolist()
    
    if not sender_from_name_servers or not sender_received_name_servers:
        return False
    
    if not (isinstance(sender_from_name_servers, list) and isinstance(sender_received_name_servers, list)):
        raise ValueError("Both inputs must be a list or coercible to a list")
    
    return Counter(sender_from_name_servers) == Counter(sender_received_name_servers)

In [None]:
# Row-wise comparison of the two name-server lookups computed above.
name_servers_match = check_sender_name_servers_match(sender_from_name_servers, sender_received_name_servers)

# Bare expression: shows the resulting Series as the cell output.
name_servers_match

### Count number of internal server-to-server transfers

In [None]:
def get_routing_before_ubc(received_headers: list | pd.Series) -> list | pd.Series:
    """
    Extract routing headers before UBC mail servers.

    This function filters out the received headers to identify server transfer paths
    before reaching UBC's mail infrastructure, which helps analyze the origin
    and path of potentially suspicious emails.

    Parameters
    ----------
    received_headers : list or pandas.Series
        List of received headers from an email or Series of such lists

    Returns
    -------
    list or pandas.Series
        Headers that indicate routing path before entering UBC mail servers
        - If input is a list: Returns filtered list of received headers
        - If input is a Series: Returns Series with filtered headers for each email
    """
    if isinstance(received_headers, pd.Series):
        return received_headers.apply(get_routing_before_ubc)   # type: ignore

    domain_pattern = r'from\s+[^\s]+\.ubc\.ca\b'

    servers = []
    
    external = False
    for header in received_headers:
        match = re.search(domain_pattern, header)
        if not match:
            external = True

        if external == True:
            servers.append(header)

    return servers

In [None]:
# For every email, keep the `Received` headers from the first non-UBC sender onwards.
routing_before_ubc = get_routing_before_ubc(raw_df['Received'])
# Bare expression: shows the resulting Series as the cell output.
routing_before_ubc

In [None]:
def get_internal_server_transfer_count(received_header: list | pd.Series) -> int | pd.Series:
    """
    Count the number of internal server transfers in the email routing path.

    This function analyzes the received headers to identify how many internal server 
    transfers occurred before reaching the email's final destination, which helps 
    detect potential internal mail server abuse or unusual routing patterns.

    Parameters
    ----------
    received_header : list or pandas.Series
        List of received headers from an email or Series of such lists

    Returns
    -------
    int or pandas.Series
        The number of internal server transfers detected
        - If input is a list: Returns integer count of internal transfers
        - If input is a Series: Returns Series of counts for each email
        
    Examples
    --------
    >>> headers = ["from internal.example.com ([192.168.1.1])", "from mail.external.com ([203.0.113.1])"]
    >>> get_internal_server_transfer_count(headers)
    1
    """
    if isinstance(received_header, pd.Series):
        return received_header.apply(get_internal_server_transfer_count)    # type: ignore
    
    if not isinstance(received_header, list):
        return 0

    results = []
    for header in received_header:
        try:
            ip = ip_address(extract_ip_from_received_header(header))    # type: ignore
            results.append(ip.is_private)
        except:
            results.append(False)

    return sum(results)

In [None]:
# Count, per email, the pre-UBC routing headers whose sender IP is a private address.
internal_server_count_before_ubc = get_internal_server_transfer_count(routing_before_ubc)
# Bare expression: shows the resulting Series as the cell output.
internal_server_count_before_ubc

## Extract hyperlinks from `text_html`

In [None]:
from lxml import html

def extract_hyperlinks(text_html: bytes | str | pd.Series) -> set | pd.Series:
    """
    Extract hyperlinks from HTML content.

    This function parses HTML content to extract URL links (<a href> tags),
    which helps analyze email content for phishing indicators and malicious URLs.

    Parameters
    ----------
    text_html : bytes, str, or pandas.Series
        HTML content from an email or collection of emails

    Returns
    -------
    set or pandas.Series
        Set of extracted hyperlinks or Series of sets for multiple emails
        - If input is a string or bytes: Returns a set of URLs
        - If input is a Series: Returns a Series of URL sets

    Examples
    --------
    >>> extract_hyperlinks('<a href="https://example.com">Link</a>')
    {'https://example.com'}
    >>> extract_hyperlinks('<html><body>No links here</body></html>')
    set()
    """
    if isinstance(text_html, pd.Series):
        return text_html.apply(extract_hyperlinks) # type: ignore

    try:
        tree = html.fromstring(text_html)
    except:
        return set()
    
    links = tree.xpath('//a[@href]')
    
    urls = {link.get('href') for link in links}

    return urls

In [None]:
# Collect the set of `<a href>` targets from each email's `text_html` column.
hyperlinks = extract_hyperlinks(raw_df['text_html'])

# Bare expression: shows the resulting Series as the cell output.
hyperlinks

## More granular content information

In [None]:
def get_content_count(content_types: list | array | pd.Series) -> dict | pd.Series:
    """
    Count different content types in an email.

    This function analyzes the Content-Type headers from an email and categorizes them
    into text, multimedia, and other types, providing a count for each category.

    Parameters
    ----------
    content_types : list, array.array, or pandas.Series
        A list or Series of content type strings from email headers
        
    Returns
    -------
    dict or pandas.Series
        Dictionary or Series of dictionaries with counts for each content type category:
        - text: Count of text/* content types (text/plain, text/html, etc.)
        - multimedia: Count of image/*, audio/*, video/* content types
        - others: Count of all other content types
        
    Examples
    --------
    >>> get_content_count(['text/plain', 'text/html', 'image/jpeg'])
    {'text': 2, 'multimedia': 1, 'others': 0}
    """
    if isinstance(content_types, pd.Series):
        return content_types.apply(get_content_count)  # type: ignore
    
    if not isinstance(content_types, pd.Series) and hasattr(content_types, 'tolist'):
        content_types = content_types.tolist() # type: ignore
    
    if not isinstance(content_types, list):
        raise ValueError("`content_types` must be a list or array.array")
        
    count = {
        'text': 0,
        'multimedia': 0,
        'others': 0,
    }
    
    for content_type in content_types:
        if not isinstance(content_type, str):
            raise ValueError
        
        if content_type.startswith('multipart/'):
            continue
        elif content_type.startswith('text/'):
            count['text'] += 1
        elif content_type.startswith('image/') or content_type.startswith('audio/') or content_type.startswith('video/'):
            count['multimedia'] += 1
        else:
            count['others'] += 1
    
    return count

In [None]:
# Per-email counts of text, multimedia and other parts from the `Content_types` column.
results = get_content_count(raw_df['Content_types'])

# Bare expression: shows the resulting Series as the cell output.
results