In [10]:
import pandas as pd
import re
import hashlib

# Row cap for the load below (passed to pandas as `nrows`).
N_ROWS = 1000

# Read the first N_ROWS rows of "emails.csv"
emails_df = pd.read_csv("emails.csv", nrows=N_ROWS)

# Non-null count per column of the loaded sample
print(emails_df.count())

file       1000
message    1000
dtype: int64


In [15]:
import email
from email import policy
import re

def parse_email(email_str):
    """Turn the raw text of one email into a message object.

    Args:
        email_str: Full text of the email (headers, blank line, body).

    Returns:
        The object built by ``email.message_from_string`` with
        ``policy.default``.
    """
    return email.message_from_string(email_str, policy=policy.default)

def extract_forwarded_email_headers(payload):
    """Collect header-looking fragments from the text of a forwarded email.

    Each pattern is applied with ``re.findall`` over the whole text, so every
    occurrence is captured, not only the first. The patterns are not anchored
    to the start of a line, so e.g. ``To: `` also matches inside ``X-To: ``.

    Args:
        payload: Text to scan.

    Returns:
        dict with the keys 'From', 'To', 'Date' and 'Subject' (in that order).
        Each value is the list of strings captured after the label, or an
        empty list when the label does not occur.
    """
    header_patterns = (
        ('From', r'From: (.+)'),
        ('To', r'To: (.+)'),
        ('Date', r'Date: (.+)'),
        ('Subject', r'Subject: (.+)'),
    )
    return {label: re.findall(pattern, payload) for label, pattern in header_patterns}

def find_included_email_parts(msg):
    """Return the leaf parts of ``msg`` whose text looks like it embeds another email.

    Multipart messages are walked recursively via ``iter_parts``. A leaf part
    qualifies when its (undecoded) payload is a string containing at least one
    of the marker substrings below.

    Args:
        msg: Message object exposing ``is_multipart``, ``iter_parts`` and
            ``get_payload``.

    Returns:
        List of matching leaf message objects, in traversal order; empty when
        nothing matches.
    """
    if msg.is_multipart():
        matches = []
        for child in msg.iter_parts():
            matches += find_included_email_parts(child)
        return matches

    markers = ("Forwarded message", "Original Message", "From:", "Sent:", "To:", "Subject:")
    body = msg.get_payload()
    if isinstance(body, str) and any(marker in body for marker in markers):
        return [msg]
    return []

def extract_included_email(msg):
    """Extract the text and headers of an email embedded in ``msg``.

    For a multipart message the parts are searched recursively in order and
    the result of the first part that contains an included email is returned.
    For a leaf part, the transfer-decoded payload is scanned for the marker
    substrings below; the included email is taken to start at the earliest
    marker and to run to the end of the payload.

    Args:
        msg: Message object exposing ``is_multipart``, ``iter_parts``,
            ``get_payload`` and ``get_content_charset``.

    Returns:
        ``(headers, text)`` where ``text`` is the payload from the earliest
        marker onward and ``headers`` is the result of
        ``extract_forwarded_email_headers(text)``; ``(None, None)`` when the
        payload is empty or no marker is found.
    """
    if msg.is_multipart():
        # Propagate the first hit from the parts instead of discarding it.
        for part in msg.iter_parts():
            headers, text = extract_included_email(part)
            if text is not None:
                return headers, text
        return None, None

    raw = msg.get_payload(decode=True)
    if not raw:
        return None, None

    # Try strict UTF-8 first so inputs that already decoded cleanly give the
    # same text as before; only fall back when that fails.
    try:
        payload = raw.decode()
    except UnicodeDecodeError:
        charset = msg.get_content_charset() or 'latin-1'
        try:
            payload = raw.decode(charset, errors='replace')
        except LookupError:
            # Declared charset is unknown to Python; latin-1 maps every byte.
            payload = raw.decode('latin-1')

    indicators = ("Forwarded message", "Original Message", "From:", "Sent:", "To:", "Subject:")
    positions = [payload.find(indicator) for indicator in indicators]
    hits = [pos for pos in positions if pos != -1]
    if not hits:
        return None, None

    included_text = payload[min(hits):]
    return extract_forwarded_email_headers(included_text), included_text

In [46]:
from fuzzywuzzy import fuzz

class EmailNamesResult:
    """Accumulator of address-to-name pairs, split by direction.

    ``result`` is a dict with two sub-dicts, 'from' and 'to'; each maps an
    address to the name recorded for it. Adding the same address again
    overwrites the earlier name.
    """

    def __init__(self):
        self.result = {direction: {} for direction in ('from', 'to')}

    def add_from(self, from_email, from_name):
        """Record a sender pair; a falsy ``from_email`` is ignored."""
        if not from_email:
            return
        self.result['from'][from_email] = from_name

    def add_to(self, to_email, to_name):
        """Record a recipient pair; a falsy ``to_email`` is ignored."""
        if not to_email:
            return
        self.result['to'][to_email] = to_name

def get_names_from_email(email, threshold=40):
    """Pair the address headers of a message with its display-name headers.

    The 'From' value is paired with 'X-From', and each 'To' value with the
    first 'X-To' value, when ``fuzz.partial_ratio`` of the two strings is
    strictly greater than ``threshold``.

    Note: the parameter name shadows the ``email`` module inside this
    function, so the module must not be referenced here.

    Args:
        email: Message object exposing ``get`` and ``get_all``.
        threshold: Minimum (exclusive) ``fuzz.partial_ratio`` score for a pair
            to be accepted. Defaults to 40, the previously hard-coded cutoff.

    Returns:
        dict of the form ``{'from': {address: name}, 'to': {address: name}}``
        holding only the pairs that cleared the threshold.
    """
    result = EmailNamesResult()

    # Sender: a single 'From' value matched against a single 'X-From' value.
    from_email = email.get('From')
    x_from = email.get('X-From')
    if from_email and x_from and fuzz.partial_ratio(from_email, x_from) > threshold:
        result.add_from(from_email, x_from)

    # Recipients: each 'To' value takes the first 'X-To' value that clears
    # the threshold; values are compared as whole header strings.
    x_to = email.get_all('X-To', [])
    for to_email in email.get_all('To', []):
        for x_to_entry in x_to:
            if fuzz.partial_ratio(to_email, x_to_entry) > threshold:
                result.add_to(to_email, x_to_entry)
                break

    return result.result

In [50]:
# Parse the message at position 123 and display its transfer-decoded payload (bytes)
entry = parse_email(emails_df['message'].iloc[123])
entry.get_payload(decode=True)


b'---------------------- Forwarded by Phillip K Allen/HOU/ECT on 07/26/2000 \n10:49 AM ---------------------------\n   \n\tEnron North America Corp.\n\t\n\tFrom:  Kristian J Lande                           07/25/2000 02:24 PM\n\t\n\nTo: Christopher F Calger/PDX/ECT@ECT, Jake Thomas/HOU/ECT@ECT, Frank W \nVickers/HOU/ECT@ECT, Elliot Mainzer/PDX/ECT@ECT, Michael McDonald/SF/ECT@ECT, \nDavid Parquet/SF/ECT@ECT, Laird Dyer/SF/ECT@ECT, Jim Buerkle/PDX/ECT@ECT, Jim \nGilbert/PDX/ECT@ECT, Terry W Donovan/HOU/ECT@ECT, Jeff G \nSlaughter/ENRON_DEVELOPMENT@ENRON_DEVELOPMENT, Tim Belden/HOU/ECT@ECT, Mike \nSwerzbin/HOU/ECT@ECT, Matt Motley/PDX/ECT@ECT, Robert Badeer/HOU/ECT@ECT, \nSean Crandall/PDX/ECT@ECT, Diana Scholtes/HOU/ECT@ECT, Tom \nAlonso/PDX/ECT@ECT, Mark Fischer/PDX/ECT@ECT, Tim Heizenrader/PDX/ECT@ECT\ncc: Phillip K Allen/HOU/ECT@ECT \nSubject: New Generation Update 7/24/00\n\n\n'

In [None]:
# Parse the message at position 123 and print its string form
entry = parse_email(emails_df['message'].iloc[123])
print(entry)