In [51]:
import imaplib
import email
import pandas as pd
from bs4 import BeautifulSoup
import re
from dateutil import parser

def fetch_emails(credentials, since_date="01-Jan-2025"):

    # Extract email address and password from credentials
    email_addr = credentials['gmail_credentials']['email']
    email_pass = credentials['gmail_credentials']['password']
    
    # Connect to the Gmail IMAP server
    mail = imaplib.IMAP4_SSL('imap.gmail.com')
    mail.login(email_addr, email_pass)

    # Select the mailbox you want to use
    mail.select('inbox')

    # Search for emails since the specified date
    status, messages = mail.search(None, f'(SINCE "{since_date}")')

    # Get the email IDs
    email_ids = messages[0].split()

    # Initialize a list to store email details
    email_details = []

    # Fetch the email data and store in the list
    for num in email_ids:
        status, data = mail.fetch(num, '(RFC822)')
        msg = email.message_from_bytes(data[0][1])
        email_info = {
            'Date': msg['date'],
            'Subject': msg['subject'],
            'From': msg['from'],
            'To': msg['to'],
            'Message-ID': msg['message-id'],
            'Body': None,
            'Reply-To': msg['reply-to']
        }
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == 'text/plain':
                    email_info['Body'] = part.get_payload(decode=True).decode(errors='ignore')
                    break
                elif part.get_content_type() == 'text/html':
                    html_content = part.get_payload(decode=True).decode(errors='ignore')
                    soup = BeautifulSoup(html_content, 'html.parser')
                    email_info['Body'] = soup.get_text()
                    break
        else:
            email_info['Body'] = msg.get_payload(decode=True).decode(errors='ignore')
        email_details.append(email_info)

    # Convert the list to a pandas DataFrame
    df_emails_imap = pd.DataFrame(email_details)

    # Remove any characters from the datestring that appear like this ' (*)'
    df_emails_imap['Date'] = df_emails_imap['Date'].str.replace(r'\s*\(.*?\)', '', regex=True)
    
    # Remove any date strings that end in 'GMT'
    df_emails_imap['Date'] = df_emails_imap['Date'].str.replace(r'GMT$', '', regex=True)
    
    # Convert the 'Date' column to datetime with specified format, handling timezones
    def parse_date(date_str):
        try:
            return parser.parse(date_str)
        except (parser.ParserError, TypeError):
            return pd.NaT

    df_emails_imap['Date'] = df_emails_imap['Date'].apply(parse_date)
    df_emails_imap['Date'] = pd.to_datetime(df_emails_imap['Date'], errors='coerce', utc=True)
    
    # If 'To' column is missing, set it to the email address from credentials
    df_emails_imap['To'] = df_emails_imap['To'].fillna(email_addr)
    
    # If 'Reply-To' column is missing, set it to the 'From' address
    df_emails_imap['Reply-To'] = df_emails_imap['Reply-To'].fillna(df_emails_imap['From'])
    return df_emails_imap


In [52]:
import yaml

# Define the Since Date
since_date = "01-Dec-2024"

# Load credentials from yaml file
with open('../../../Credentials/gmail_credentials.yaml', 'r') as file:
    credentials = yaml.safe_load(file)

df_emails_imap = fetch_emails(credentials, since_date=since_date)
print(df_emails_imap)

                         Date  \
0   2024-12-01 08:19:12+00:00   
1   2024-12-01 08:51:40+00:00   
2   2024-12-01 12:03:31+00:00   
3   2024-12-01 12:27:32+00:00   
4   2024-12-01 13:10:38+00:00   
..                        ...   
977 2025-01-22 09:16:50+00:00   
978 2025-01-22 09:28:43+00:00   
979 2025-01-22 10:48:00+00:00   
980 2025-01-22 12:25:31+00:00   
981 2025-01-22 12:47:36+00:00   

                                               Subject  \
0                     See what's streaming in December   
1    Coming soon to State Theatre New Jersey: Cirqu...   
2                 =?utf-8?q?Will_Obama_go_to_jail=3F?=   
3     Your Daily Digest for Sun, 12/1 is ready to view   
4    Your Weekly Ad + Holiday Savings to Deck the H...   
..                                                 ...   
977                                     Account update   
978  Thanks for your interest in the Senior Presale...   
979  CSS Experiments, Aurelia 2, Postgres vs MySQL,...   
980  Advisory Message: 

In [53]:
import pickle
import os

# Create the pickles directory if it does not exist
pickles_dir = '../.pickles'
os.makedirs(pickles_dir, exist_ok=True)

# List of DataFrame names
df_names = ['df_emails_imap']

# Loop through the list and store each DataFrame
for name in df_names:
    with open(f"{pickles_dir}/{name}.pkl", 'wb') as f:
        pickle.dump(globals()[name], f)


In [54]:
import yaml

import pickle

# Open the pickle file and load the DataFrame
with open('../.pickles/df_emails_imap.pkl', 'rb') as f:
    df_emails_imap = pickle.load(f)

# Find the latest date timestamp
latest_date = df_emails_imap['Date'].max()

# Define the Since Date
since_date = latest_date.strftime('%d-%b-%Y')

# Load credentials from yaml file
with open('../../../Credentials/gmail_credentials.yaml', 'r') as file:
    credentials = yaml.safe_load(file)

df_emails_imap = fetch_emails(credentials, since_date=since_date)
print(df_emails_imap)

                       Date  \
0 2025-01-22 05:14:18+00:00   
1 2025-01-22 06:24:50+00:00   
2 2025-01-22 08:06:54+00:00   
3 2025-01-22 09:16:50+00:00   
4 2025-01-22 09:28:43+00:00   
5 2025-01-22 10:48:00+00:00   
6 2025-01-22 12:25:31+00:00   
7 2025-01-22 12:47:36+00:00   

                                             Subject  \
0                      Job application status update   
1        Thanks for your interest in Aera Technology   
2      [Newsletter] Secrets of high-performing teams   
3                                     Account update   
4  Thanks for your interest in the Senior Presale...   
5  CSS Experiments, Aurelia 2, Postgres vs MySQL,...   
6  Advisory Message: Treptow Rd will be closed to...   
7   Your Daily Digest for Wed, 1/22 is ready to view   

                                                From                      To  \
0                    Stryker <stryker@myworkday.com>  Mike.Cancell@gmail.com   
1           Aera Technology <no-reply@hire.lever.co>  m