In [16]:
import imaplib
import email
import pandas as pd
from bs4 import BeautifulSoup
import re

def fetch_emails(credentials, since_date="01-Jan-2025"):

    # Extract email address and password from credentials
    email_addr = credentials['gmail_credentials']['email']
    email_pass = credentials['gmail_credentials']['password']
    
    # Connect to the Gmail IMAP server
    mail = imaplib.IMAP4_SSL('imap.gmail.com')
    mail.login(email_addr, email_pass)

    # Select the mailbox you want to use
    mail.select('inbox')

    # Search for emails since the specified date
    status, messages = mail.search(None, f'(SINCE "{since_date}")')

    # Get the email IDs
    email_ids = messages[0].split()

    # Initialize a list to store email details
    email_details = []

    # Fetch the email data and store in the list
    for num in email_ids:
        status, data = mail.fetch(num, '(RFC822)')
        msg = email.message_from_bytes(data[0][1])
        email_info = {
            'Date': msg['date'],
            'Subject': msg['subject'],
            'From': msg['from'],
            'To': msg['to'],
            'Message-ID': msg['message-id'],
            'Body': None,
            'Reply-To': msg['reply-to']
        }
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == 'text/plain':
                    email_info['Body'] = part.get_payload(decode=True).decode(errors='ignore')
                    break
                elif part.get_content_type() == 'text/html':
                    html_content = part.get_payload(decode=True).decode(errors='ignore')
                    soup = BeautifulSoup(html_content, 'html.parser')
                    email_info['Body'] = soup.get_text()
                    break
        else:
            email_info['Body'] = msg.get_payload(decode=True).decode(errors='ignore')
        email_details.append(email_info)

    # Convert the list to a pandas DataFrame
    df_emails_imap = pd.DataFrame(email_details)

    # Convert the 'Date' column to datetime with specified format, handling timezones
    df_emails_imap['Date'] = pd.to_datetime(df_emails_imap['Date'], errors='coerce', utc=True)

    return df_emails_imap

In [17]:
import yaml
import imaplib

# Define the Since Date
since_date = "01-Jan-2025"

# Load credentials from yaml file
with open('../../../Credentials/gmail_credentials.yaml', 'r') as file:
    credentials = yaml.safe_load(file)

# Initialize the mail variable
mail = imaplib.IMAP4_SSL('imap.gmail.com')

df_emails_imap = fetch_emails(credentials, since_date=since_date)
print(df_emails_imap)

                         Date  \
0   2025-01-01 08:46:16+00:00   
1   2025-01-01 12:41:22+00:00   
2   2025-01-01 15:29:30+00:00   
3   2025-01-01 16:25:11+00:00   
4   2025-01-01 16:26:57+00:00   
..                        ...   
435 2025-01-21 18:02:57+00:00   
436 2025-01-21 20:32:57+00:00   
437 2025-01-21 20:54:13+00:00   
438 2025-01-21 21:07:44+00:00   
439 2025-01-21 21:28:59+00:00   

                                               Subject  \
0                                       Account update   
1    Bonus available:  Michael, you've scored an op...   
2           The 50/30/20 budget: Calculate your budget   
3           Mike, we make vehicle maintenance a breeze   
4    Google: We've received your payment for 3318-8...   
..                                                 ...   
435                    RE: Feedback for Matt Krayowski   
436  Five years, same price: Michael, add Fios Home...   
437             MojoTech - Data and Analytics Director   
438                   N

  df_emails_imap['Date'] = pd.to_datetime(df_emails_imap['Date'], errors='coerce', utc=True)
