In [1]:
# todo
# - read the email df
# - use the df and sample one
# - write test cases for each or pick that sample up and write for that case

In [1]:
import os
import re
from pathlib import Path
import email
from datetime import datetime
import imaplib
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv

In [2]:
load_dotenv(find_dotenv())
EMAIL = os.environ.get("EMAIL")
APP_PASSWORD = os.environ.get("APP_PASSWORD")

In [3]:
# dataset for testing filter logic
data_dir = Path("./data")
email_dataset = pd.read_csv(data_dir / "mail_samples_7th_Apr_24.csv")
email_dataset.head()

Unnamed: 0,subject,from,body,sent_files,date
0,Check out New Jobs Posted by Top Companies,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-03-31 16:34:37+00:00
1,lunarflu mentioned you in Hugging Face,Discord <notifications@discord.com>,"\r\n <!doctype html>\r\n <html xmlns=""ht...",,2024-03-31 15:50:21+00:00
2,"""Comparing with Apple.... too much""",Reddit <noreply@redditmail.com>,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 S...",,2024-03-31 15:44:47+00:00
3,Suprriya Kauul ( She / Her / Hers) sent you a ...,LinkedIn <messages-noreply@linkedin.com>,-----------------------------------------\r\n\...,,2024-03-31 13:21:48+00:00
4,Rina (Owner) has a Residential House in Kaikh...,Magicbricks <projectsonmb@magicbricks.com>,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T...",,2024-03-31 18:47:30+05:30


In [4]:
for idx, row in email_dataset.iterrows():
    matched = re.findall(r"\<(.*)@(.*)\.com\>", row["from"])
    if matched:
        email_dataset.at[idx, "email_type"] = matched[0][0]
        email_dataset.at[idx, "sender"] = matched[0][1]

In [5]:
def login_to_gmail(email: str, app_password: str) -> imaplib.IMAP4_SSL:
    try:
        connection = imaplib.IMAP4_SSL("imap.gmail.com")
        connection.login(email, app_password)
        print("Login successful")
        return connection
    except imaplib.IMAP4.error:
        print("Login failed")
        return None


def list_mailboxes(connection: imaplib.IMAP4_SSL) -> None:
    print("List of Mailboxes:")
    status, mailbox_list = connection.list()
    if status == "OK":
        for mailbox in mailbox_list:
            mbox_str = [
                re.sub(r"[\\\(\)]", "", e.strip())
                for e in mailbox.decode().split('"/"')
            ]
            print(f"{mbox_str[1]}: {mbox_str[0]}")


def get_body_text(msg):
    body_texts = []
    # process multi-part message
    if msg.is_multipart():
        for part in msg.walk():
            if "attachment" not in str(part.get("Content-Disposition")):
                body = part.get_payload(decode=True)
                if body:
                    try:
                        body_texts.append(body.decode())
                    except UnicodeDecodeError:
                        continue
    else:
        # process message with single part
        try:
            body_texts.append(msg.get_payload(decode=True).decode())
        except UnicodeDecodeError:
            pass
    return "\n".join(body_texts)


def mailbox_df(email_id_list, connection):
    all_emails = []
    for entry in tqdm(email_id_list):

        status, msg_data = connection.fetch(entry, "(RFC822)")
        if status == "OK":
            msg = email.message_from_bytes(msg_data[0][1])
            all_emails.append(
                {
                    "subject": msg.get("Subject"),
                    "from": msg.get("From"),
                    "body": get_body_text(msg),
                    "date": msg.get("Date"),
                }
            )
    return pd.DataFrame(all_emails)

In [10]:
def delete_emails(imap_connection, email_ids):
    """
    Marks the emails with the given IDs for deletion and expunges them.

    Args:
        imap_connection: The IMAP connection object.
        email_ids: A list of email IDs (strings) to delete.
    """
    if not email_ids:
        print("Email list to delete is empty.")
        return

    status, _ = imap_connection.store(" ".join(email_ids), "+FLAGS", "\\Deleted")
    if status != "OK":
        print(f"Error marking emails for deletion, emails: {email_ids}")

    status, _ = imap_connection.expunge()
    if status != "OK":
        print(f"Error expunging deleted emails")
    print("Email deletion process completed.")




In [11]:
connection = login_to_gmail(EMAIL, APP_PASSWORD)
mailbox = "INBOX"

status, _ = connection.select(mailbox)
if status == "OK":
    print(f"{mailbox} Selected")
else:
    print(f"Error Selecting {mailbox} ")

job_alert_on = False
if not job_alert_on:
    mark_job_alert_emails(connection)

OK
OK


In [12]:
status, _ = connection.expunge()
if status != "OK":
    print(f"Error expunging deleted emails")
print("Email deletion process completed.")

Email deletion process completed.


In [58]:
connection.close()
connection.logout()

### Data Analysis

In [6]:
# linkedin specific
# have a status called looking for job or not

In [9]:
linkedin_mails = email_dataset.loc[email_dataset["sender"] == "linkedin"]
linkedin_mails.head()

Unnamed: 0,subject,from,body,sent_files,date,email_type,sender
3,Suprriya Kauul ( She / Her / Hers) sent you a ...,LinkedIn <messages-noreply@linkedin.com>,-----------------------------------------\r\n\...,,2024-03-31 13:21:48+00:00,messages-noreply,linkedin
5,=?UTF-8?Q?=E2=80=9Cmachine_learning_engineer=E...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for machine learning engineer i...,,2024-03-31 11:53:07+00:00,jobalerts-noreply,linkedin
25,=?UTF-8?Q?=E2=80=9Cdata_scientist=E2=80=9D_opp...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for data scientist in Bengaluru...,,2024-03-31 11:53:07+00:00,jobalerts-noreply,linkedin
40,"SAJID, you have new application updates this week",LinkedIn <jobs-noreply@linkedin.com>,Check out the status of your applications on L...,,2024-04-02 21:18:08+00:00,jobs-noreply,linkedin
46,"Add Goutam Bhat, Assistant Manager | Data Scie...",LinkedIn <messages-noreply@linkedin.com>,Do you know Goutam Bhat?\r\n22 mutual connecti...,,2024-04-02 14:17:39+00:00,messages-noreply,linkedin


In [10]:
email_dataset[email_dataset.sender == "iimjobs"]

Unnamed: 0,subject,from,body,sent_files,date,email_type,sender
0,Check out New Jobs Posted by Top Companies,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-03-31 16:34:37+00:00,info,iimjobs
7,Interesting Job Opportunity JP Morgan Chase - ...,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-03-31 09:35:53+00:00,info,iimjobs
15,Some Handpicked Jobs for you!,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-03-31 06:35:00+00:00,info,iimjobs
20,iimjobs.com - Your Personalized Jobfeed,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-03-31 03:14:59+00:00,info,iimjobs
27,Top Jobs from Leading Companies based on your ...,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-03-31 11:36:05+00:00,info,iimjobs
38,iimjobs.com - Your Personalized Jobfeed,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-04-03 02:52:01+00:00,info,iimjobs
44,WorkIndia is hiring for Data Analyst - eCommer...,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-04-02 15:31:41+00:00,info,iimjobs
74,WorkIndia is hiring for Data Analyst - eCommer...,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-04-02 09:33:29+00:00,info,iimjobs
77,Tata Steel - Senior Technologist - Process Sim...,Team iimjobs <info@iimjobs.com>,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",,2024-04-02 07:30:25+00:00,info,iimjobs
79,Some Handpicked Jobs for you!,"""iimjobs.com"" <info@iimjobs.com>",Please Enable Javascript\n<!DOCTYPE html> <htm...,,2024-04-02 06:30:58+00:00,info,iimjobs


In [7]:
email_dataset.loc[email_dataset["sender_entity"] == "iimjobs"]

KeyError: 'sender_entity'

In [7]:
linkedin_mails.mail_type.value_counts()

mail_type
jobalerts-noreply        28
messages-noreply          5
jobs-listings             3
inmail-hit-reply          2
jobs-noreply              1
notifications-noreply     1
Name: count, dtype: int64

In [8]:
linkedin_mails.loc[linkedin_mails["mail_type"] == "jobalerts-noreply"]

Unnamed: 0,subject,from,body,sent_files,date,mail_type,sender_entity
5,=?UTF-8?Q?=E2=80=9Cmachine_learning_engineer=E...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for machine learning engineer i...,,2024-03-31 11:53:07+00:00,jobalerts-noreply,linkedin
25,=?UTF-8?Q?=E2=80=9Cdata_scientist=E2=80=9D_opp...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for data scientist in Bengaluru...,,2024-03-31 11:53:07+00:00,jobalerts-noreply,linkedin
68,=?UTF-8?Q?=E2=80=9Cmachine_learning_engineer=E...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for machine learning engineer i...,,2024-04-02 14:48:22+00:00,jobalerts-noreply,linkedin
71,=?UTF-8?Q?=E2=80=9Cdata_scientist=E2=80=9D_opp...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for data scientist in Bengaluru...,,2024-04-02 11:49:28+00:00,jobalerts-noreply,linkedin
72,=?UTF-8?Q?=E2=80=9Cmachine_learning_enginee?=\...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for machine learning engineer i...,,2024-04-02 11:49:28+00:00,jobalerts-noreply,linkedin
78,=?UTF-8?Q?=E2=80=9Cdata_scientist=E2=80=9D_opp...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for data scientist in Dublin\r\...,,2024-04-02 06:41:09+00:00,jobalerts-noreply,linkedin
88,=?UTF-8?Q?=E2=80=9Cdata_scientist=E2=80=9D_opp...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for data scientist in Ireland\r...,,2024-04-02 06:41:09+00:00,jobalerts-noreply,linkedin
99,=?UTF-8?Q?=E2=80=9Cdata_scientist=E2=80=9D:_22...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for data scientist in Bengaluru...,,2024-04-01 11:57:10+00:00,jobalerts-noreply,linkedin
100,=?UTF-8?Q?=E2=80=9Cmachine_learning_engineer=E...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for machine learning engineer i...,,2024-04-01 11:57:11+00:00,jobalerts-noreply,linkedin
116,=?UTF-8?Q?Now_hiring:_4_=E2=80=9Cdata_scientis...,LinkedIn Job Alerts <jobalerts-noreply@linkedi...,Your job alert for data scientist in Dublin\r\...,,2024-04-03 06:58:35+00:00,jobalerts-noreply,linkedin


('BYE', [b'LOGOUT Requested'])