# Main Func to Retrieve Emails

## For Gmail, this Method Requires an App Password 

### See https://myaccount.google.com/security to Administer

### Note the Other Way to do this is to Create OATH2 Credentials but until/unless your App is put into Prod, you will need to Authorize each time you run

In [1]:
import imaplib
import email
import pandas as pd
from bs4 import BeautifulSoup
import re
from dateutil import parser
from tqdm import tqdm

def fetch_emails(credentials, since_date="01-Jan-2025"):

    # Extract email address and password from credentials
    email_addr = credentials['gmail_credentials']['email']
    email_pass = credentials['gmail_credentials']['password']
    
    # Connect to the Gmail IMAP server
    mail = imaplib.IMAP4_SSL('imap.gmail.com')
    mail.login(email_addr, email_pass)

    # Select the mailbox you want to use
    mail.select('inbox')

    # Search for emails since the specified date
    status, messages = mail.search(None, f'(SINCE "{since_date}")')

    # Get the email IDs
    email_ids = messages[0].split()

    # Initialize a list to store email details
    email_details = []

    # Fetch the email data and store in the list
    for num in tqdm(email_ids, desc="Fetching emails"):
        status, data = mail.fetch(num, '(RFC822)')
        msg = email.message_from_bytes(data[0][1])
        email_info = {
            'Date': msg['date'],
            'Subject': msg['subject'],
            'From': msg['from'],
            'To': msg['to'],
            'Message-ID': msg['message-id'],
            'Body': None,
            'Reply-To': msg['reply-to']
        }
        if msg.is_multipart():
            for part in msg.walk():
                if part.get_content_type() == 'text/plain':
                    email_info['Body'] = part.get_payload(decode=True).decode(errors='ignore')
                    break
                elif part.get_content_type() == 'text/html':
                    html_content = part.get_payload(decode=True).decode(errors='ignore')
                    soup = BeautifulSoup(html_content, 'html.parser')
                    email_info['Body'] = soup.get_text()
                    break
        else:
            email_info['Body'] = msg.get_payload(decode=True).decode(errors='ignore')
        email_details.append(email_info)

    # Convert the list to a pandas DataFrame
    df_emails_imap = pd.DataFrame(email_details)

    # Remove any characters from the datestring that appear like this ' (*)'
    df_emails_imap['Date'] = df_emails_imap['Date'].str.replace(r'\s*\(.*?\)', '', regex=True)
    
    # Remove any date strings that end in 'GMT'
    df_emails_imap['Date'] = df_emails_imap['Date'].str.replace(r'GMT$', '', regex=True)
    
    # Convert the 'Date' column to datetime with specified format, handling timezones
    def parse_date(date_str):
        try:
            return parser.parse(date_str)
        except (parser.ParserError, TypeError):
            return pd.NaT

    df_emails_imap['Date'] = df_emails_imap['Date'].apply(parse_date)
    df_emails_imap['Date'] = pd.to_datetime(df_emails_imap['Date'], errors='coerce', utc=True)
    
    # If 'To' column is missing, set it to the email address from credentials
    df_emails_imap['To'] = df_emails_imap['To'].fillna(email_addr)
    
    # If 'Reply-To' column is missing, set it to the 'From' address
    df_emails_imap['Reply-To'] = df_emails_imap['Reply-To'].fillna(df_emails_imap['From'])
    return df_emails_imap


# Initial One-time Call The Fetch Func to Retrieve Emails 

## Use this format for the Credential YAML 
gmail_credentials:
  email: "YOUR_EMAIL@gmail.com"
  password: "YOUR_APP_KEY"

In [2]:
import yaml

# Define the Since Date
since_date = "01-Dec-2024"

# Load credentials from yaml file
with open('../../../Credentials/gmail_credentials.yaml', 'r') as file:
    credentials = yaml.safe_load(file)

df_emails_imap = fetch_emails(credentials, since_date=since_date)
print(df_emails_imap)

Fetching emails: 100%|██████████| 1178/1178 [01:15<00:00, 15.70it/s]

                          Date  \
0    2024-12-01 08:19:12+00:00   
1    2024-12-01 08:51:40+00:00   
2    2024-12-01 12:03:31+00:00   
3    2024-12-01 12:27:32+00:00   
4    2024-12-01 13:10:38+00:00   
...                        ...   
1173 2025-01-29 15:27:38+00:00   
1174 2025-01-29 17:32:11+00:00   
1175 2025-01-29 17:37:16+00:00   
1176 2025-01-29 19:05:23+00:00   
1177 2025-01-29 19:27:20+00:00   

                                                Subject  \
0                      See what's streaming in December   
1     Coming soon to State Theatre New Jersey: Cirqu...   
2                  =?utf-8?q?Will_Obama_go_to_jail=3F?=   
3      Your Daily Digest for Sun, 12/1 is ready to view   
4     Your Weekly Ad + Holiday Savings to Deck the H...   
...                                                 ...   
1173                            Your Delta Dental Quote   
1174                 Access your recent insurance quote   
1175                                 Welcome to EyeMed!   
1




## Save the DF to a Pickle to be Accessed Downstream

In [3]:
import pickle
import os

# Create the pickles directory if it does not exist
pickles_dir = '../.pickles'
os.makedirs(pickles_dir, exist_ok=True)

# List of DataFrame names
df_names = ['df_emails_imap']

# Loop through the list and store each DataFrame
for name in df_names:
    with open(f"{pickles_dir}/{name}.pkl", 'wb') as f:
        pickle.dump(globals()[name], f)


# For Incremental Loads

## Get Most Recent Date from Pickled DF to Incremental Email Fetches

## And Use Below to Get Daily Incremental Emails into DF

In [4]:
import yaml

import pickle

# Open the pickle file and load the DataFrame
with open('../.pickles/df_emails_imap.pkl', 'rb') as f:
    df_emails_imap = pickle.load(f)

# Find the latest date timestamp
latest_date = df_emails_imap['Date'].max()

# Define the Since Date
since_date = latest_date.strftime('%d-%b-%Y')

# Load credentials from yaml file
with open('../../../Credentials/gmail_credentials.yaml', 'r') as file:
    credentials = yaml.safe_load(file)

df_emails_imap = fetch_emails(credentials, since_date=since_date)
print(df_emails_imap)

                        Date  \
0  2025-01-22 05:14:18+00:00   
1  2025-01-22 06:24:50+00:00   
2  2025-01-22 08:06:54+00:00   
3  2025-01-22 09:16:50+00:00   
4  2025-01-22 09:28:43+00:00   
5  2025-01-22 10:48:00+00:00   
6  2025-01-22 12:25:31+00:00   
7  2025-01-22 12:47:36+00:00   
8  2025-01-22 14:38:26+00:00   
9  2025-01-22 14:50:17+00:00   
10 2025-01-22 15:00:32+00:00   
11 2025-01-22 15:19:12+00:00   
12 2025-01-22 15:43:38+00:00   
13 2025-01-22 15:45:11+00:00   
14 2025-01-22 16:05:05+00:00   
15 2025-01-22 16:09:19+00:00   
16 2025-01-22 16:12:06+00:00   
17 2025-01-22 16:13:09+00:00   
18 2025-01-22 16:19:00+00:00   
19 2025-01-22 16:23:03+00:00   
20 2025-01-22 16:24:19+00:00   
21 2025-01-22 16:24:34+00:00   
22 2025-01-22 16:28:15+00:00   
23 2025-01-22 16:38:01+00:00   

                                              Subject  \
0                       Job application status update   
1         Thanks for your interest in Aera Technology   
2       [Newsletter] Secrets