# Main Func to Retrieve Emails

## For Gmail, this Method Requires an App Password 

### See https://myaccount.google.com/security to Administer

### Note: the other way to do this is to create OAuth2 credentials, but until/unless your app is put into production, you will need to authorize each time you run

In [1]:
import imaplib
import email
import pandas as pd
from bs4 import BeautifulSoup
import re
from dateutil import parser
from tqdm import tqdm

def fetch_emails(credentials, since_date="01-Jan-2025"):
    """Fetch inbox messages from Gmail over IMAP and return them as a DataFrame.

    Args:
        credentials: Mapping with a 'gmail_credentials' entry that holds the
            'email' address and 'password' used to log in.
        since_date: Date string placed in the IMAP ``SINCE`` search
            criterion (e.g. "01-Jan-2025").

    Returns:
        A pandas DataFrame with the columns Date, Subject, From, To,
        Message-ID, Body and Reply-To. 'Date' is converted to timezone-aware
        UTC datetimes (NaT when unparseable), a missing 'To' is filled with
        the login address and a missing 'Reply-To' with the 'From' value.
        When no message matches, an empty DataFrame with the same columns is
        returned.

    Raises:
        imaplib.IMAP4.error: If login, mailbox selection or the search fails.
    """
    columns = ['Date', 'Subject', 'From', 'To', 'Message-ID', 'Body', 'Reply-To']

    def decode_part(part):
        # get_payload(decode=True) returns None for multipart containers.
        payload = part.get_payload(decode=True)
        if payload is None:
            return None
        # Honour the charset declared by the part; fall back to UTF-8 when it
        # is absent or unknown to Python.
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, errors='ignore')
        except LookupError:
            return payload.decode('utf-8', errors='ignore')

    def extract_body(msg):
        # Non-multipart messages carry their body directly.
        if not msg.is_multipart():
            return decode_part(msg)
        # Otherwise use the first text/plain or text/html part encountered.
        for part in msg.walk():
            content_type = part.get_content_type()
            if content_type == 'text/plain':
                return decode_part(part)
            if content_type == 'text/html':
                html_content = decode_part(part)
                if html_content is None:
                    return None
                return BeautifulSoup(html_content, 'html.parser').get_text()
        return None

    # Extract email address and password from credentials
    email_addr = credentials['gmail_credentials']['email']
    email_pass = credentials['gmail_credentials']['password']

    # Initialize a list to store email details
    email_details = []

    # Connect to the Gmail IMAP server
    mail = imaplib.IMAP4_SSL('imap.gmail.com')
    try:
        mail.login(email_addr, email_pass)

        # Select the mailbox you want to use
        status, _ = mail.select('inbox')
        if status != 'OK':
            raise imaplib.IMAP4.error(f"Could not select inbox (status {status})")

        # Search for emails since the specified date
        status, messages = mail.search(None, f'(SINCE "{since_date}")')
        if status != 'OK':
            raise imaplib.IMAP4.error(f"IMAP search failed (status {status})")

        # Get the email IDs (an empty result yields an empty list)
        email_ids = messages[0].split() if messages and messages[0] else []

        # Fetch the email data and store in the list
        for num in tqdm(email_ids, desc="Fetching emails"):
            status, data = mail.fetch(num, '(RFC822)')
            # Skip entries the server did not return a message tuple for
            # instead of crashing on data[0][1].
            if status != 'OK' or not data or not isinstance(data[0], tuple):
                continue
            msg = email.message_from_bytes(data[0][1])
            email_details.append({
                'Date': msg['date'],
                'Subject': msg['subject'],
                'From': msg['from'],
                'To': msg['to'],
                'Message-ID': msg['message-id'],
                'Body': extract_body(msg),
                'Reply-To': msg['reply-to'],
            })
    finally:
        # Always release the server connection; a failure while logging out
        # must not mask an error raised above.
        try:
            mail.logout()
        except (imaplib.IMAP4.error, OSError):
            pass

    # Convert the list to a pandas DataFrame; explicit columns keep the
    # schema stable even when nothing was fetched.
    df_emails_imap = pd.DataFrame(email_details, columns=columns)
    if df_emails_imap.empty:
        df_emails_imap['Date'] = pd.to_datetime(df_emails_imap['Date'], errors='coerce', utc=True)
        return df_emails_imap

    # Remove any characters from the datestring that appear like this ' (*)'
    df_emails_imap['Date'] = df_emails_imap['Date'].str.replace(r'\s*\(.*?\)', '', regex=True)

    # Strip a trailing 'GMT' from date strings
    df_emails_imap['Date'] = df_emails_imap['Date'].str.replace(r'GMT$', '', regex=True)

    # Parse each date string individually so one bad value becomes NaT
    # instead of failing the whole column.
    def parse_date(date_str):
        try:
            return parser.parse(date_str)
        except (parser.ParserError, TypeError, OverflowError):
            return pd.NaT

    df_emails_imap['Date'] = df_emails_imap['Date'].apply(parse_date)
    df_emails_imap['Date'] = pd.to_datetime(df_emails_imap['Date'], errors='coerce', utc=True)

    # If 'To' is missing, set it to the email address from credentials
    df_emails_imap['To'] = df_emails_imap['To'].fillna(email_addr)

    # If 'Reply-To' is missing, set it to the 'From' address
    df_emails_imap['Reply-To'] = df_emails_imap['Reply-To'].fillna(df_emails_imap['From'])
    return df_emails_imap


# Initial One-time Call The Fetch Func to Retrieve Emails 

## Use this format for the Credential YAML 
gmail_credentials:
  email: "YOUR_EMAIL@gmail.com"
  password: "YOUR_APP_KEY"

In [2]:
import yaml

# Earliest message date to pull for the one-time initial load
since_date = "01-Dec-2024"

# Read the Gmail login details from the credentials YAML
with open('../../../Credentials/gmail_credentials.yaml', 'r') as file:
    credentials = yaml.safe_load(file)

# Pull every inbox message since that date and show the resulting frame
df_emails_imap = fetch_emails(credentials, since_date)
print(df_emails_imap)

Fetching emails: 100%|██████████| 1203/1203 [01:25<00:00, 14.02it/s]


                          Date  \
0    2024-12-01 08:19:12+00:00   
1    2024-12-01 08:51:40+00:00   
2    2024-12-01 12:03:31+00:00   
3    2024-12-01 12:27:32+00:00   
4    2024-12-01 13:10:38+00:00   
...                        ...   
1198 2025-01-30 13:37:39+00:00   
1199 2025-01-30 13:47:51+00:00   
1200 2025-01-30 13:51:06+00:00   
1201 2025-01-30 14:05:01+00:00   
1202 2025-01-30 14:27:47+00:00   

                                                Subject  \
0                      See what's streaming in December   
1     Coming soon to State Theatre New Jersey: Cirqu...   
2                  =?utf-8?q?Will_Obama_go_to_jail=3F?=   
3      Your Daily Digest for Sun, 12/1 is ready to view   
4     Your Weekly Ad + Holiday Savings to Deck the H...   
...                                                 ...   
1198  Your Application - (US) Head of Product Analytics   
1199               Thank you for applying to the ASPCA!   
1200                      We received your application!   
1

## Save the DF to a Pickle to be Accessed Downstream

In [3]:
import pickle
import os

# Make sure the target directory for the pickles is present
pickles_dir = '../.pickles'
os.makedirs(pickles_dir, exist_ok=True)

# Names of the module-level DataFrames to persist
df_names = ['df_emails_imap']

# Write each named DataFrame to "<pickles_dir>/<name>.pkl"
for name in df_names:
    frame = globals()[name]
    with open(f"{pickles_dir}/{name}.pkl", 'wb') as f:
        pickle.dump(frame, f)


# For Incremental Loads

## Get the Most Recent Date from the Pickled DF to Drive Incremental Email Fetches

## And Use Below to Get Daily Incremental Emails into DF

#### Now this is updated to Support both Initial and Incremental Loads. 
NOTE: This would be the pipeline to automate in a workflow.

In [6]:
import yaml
import pickle
import os
from datetime import datetime, timedelta

# Pickle that accumulates every email fetched so far
pickle_path = '../.pickles/df_emails_imap.pkl'

# Define the Since Date: resume from the newest stored email when a pickle
# exists, otherwise fall back to an initial load covering the last year.
try:
    # NOTE: only load pickles written by this pipeline; unpickling untrusted
    # files can execute arbitrary code.
    with open(pickle_path, 'rb') as f:
        df_emails_imap = pickle.load(f)
except FileNotFoundError:
    # No previous data: mark the frame as absent so the steps below do not
    # hit an undefined name on a first run.
    df_emails_imap = None
    since_date = (datetime.now() - timedelta(days=365)).strftime('%d-%b-%Y')
    print("Pickle file not found. Setting since_date to one year ago.")
else:
    # Find the latest date timestamp
    latest_date = df_emails_imap['Date'].max()
    if pd.isna(latest_date):
        # An empty frame or all-NaT dates give no usable timestamp.
        since_date = (datetime.now() - timedelta(days=365)).strftime('%d-%b-%Y')
        print("No valid email date in pickle. Setting since_date to one year ago.")
    else:
        since_date = latest_date.strftime('%d-%b-%Y')
        print(f"Latest email date found: {latest_date}")

print(f"Fetching emails since: {since_date}")

# Load credentials from yaml file
with open('../../../Credentials/gmail_credentials.yaml', 'r') as file:
    credentials = yaml.safe_load(file)

# Fetch new emails
df_new_emails = fetch_emails(credentials, since_date=since_date)

# Number of rows before adding new emails (zero on an initial load)
rows_before = 0 if df_emails_imap is None else len(df_emails_imap)

# Append new data rows to the existing DataFrame. The SINCE search has day
# granularity, so already-stored messages from the latest day come back again
# and are removed by drop_duplicates().
if df_emails_imap is None:
    df_emails_imap = df_new_emails.drop_duplicates().reset_index(drop=True)
else:
    df_emails_imap = pd.concat([df_emails_imap, df_new_emails]).drop_duplicates().reset_index(drop=True)

# Write the updated DataFrame back to the original pickle file, creating the
# directory first so an initial load can succeed.
os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
with open(pickle_path, 'wb') as f:
    pickle.dump(df_emails_imap, f)

print(f"Number of new emails fetched: {len(df_new_emails)}")
print(f"Total number of emails after update: {len(df_emails_imap)}")
print(f"Number of rows before adding new emails: {rows_before}")
print(df_emails_imap)

Latest email date found: 2025-01-30 14:27:47+00:00
Fetching emails since: 30-Jan-2025


Fetching emails: 100%|██████████| 19/19 [00:02<00:00,  7.31it/s]

Number of new emails fetched: 19
Total number of emails after update: 1203
Number of rows before adding new emails: 1203
                          Date  \
0    2024-12-01 08:19:12+00:00   
1    2024-12-01 08:51:40+00:00   
2    2024-12-01 12:03:31+00:00   
3    2024-12-01 12:27:32+00:00   
4    2024-12-01 13:10:38+00:00   
...                        ...   
1198 2025-01-30 13:37:39+00:00   
1199 2025-01-30 13:47:51+00:00   
1200 2025-01-30 13:51:06+00:00   
1201 2025-01-30 14:05:01+00:00   
1202 2025-01-30 14:27:47+00:00   

                                                Subject  \
0                      See what's streaming in December   
1     Coming soon to State Theatre New Jersey: Cirqu...   
2                  =?utf-8?q?Will_Obama_go_to_jail=3F?=   
3      Your Daily Digest for Sun, 12/1 is ready to view   
4     Your Weekly Ad + Holiday Savings to Deck the H...   
...                                                 ...   
1198  Your Application - (US) Head of Product Analytics  


