# Get the Emails from Pickled DF

In [24]:
import pickle
import os

pickle_path = '../.pickles/df_emails_imap.pkl'

if os.path.exists(pickle_path):
    print("Loading df_emails_imap DataFrame from pickle file...")
    with open(pickle_path, 'rb') as f:
        df_emails_imap = pickle.load(f)
    print("DataFrame loaded successfully.")
else:
    print(f"Error: Pickle file {pickle_path} not found.")


Loading df_emails_imap DataFrame from pickle file...
DataFrame loaded successfully.


In [25]:
from IPython.display import display

display(df_emails_imap.tail(3))
df_emails_imap.info()

Unnamed: 0,Date,Subject,From,To,Message-ID,Body,Reply-To
1200,2025-01-30 13:51:06+00:00,We received your application!,eisneramper@myworkday.com,mike@mikecancell.com,<1409290418.13919826.1738245066869@myworkday.com>,"<!doctype html><html xmlns:v=""urn:schemas-micr...",noreply.workday@eisnerampermail.com
1201,2025-01-30 14:05:01+00:00,Thank you for your application to Figma,no-reply@figma.com,Mike.Cancell@gmail.com,<20250130140501.abd7bf94be4ee386@figma.com>,"Hi Michael,\r\n\r\nThank you for your interest...",no-reply@figma.com
1202,2025-01-30 14:27:47+00:00,50% off one full-price item!,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?= <extracare@yo...,mike@mikecancell.com,<0.1.F.2D9.1DB7323229121DA.0@omp.your.cvs.com>,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?= <noreply@your...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1203 entries, 0 to 1202
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   Date        1203 non-null   datetime64[ns, UTC]
 1   Subject     1203 non-null   object             
 2   From        1203 non-null   object             
 3   To          1203 non-null   object             
 4   Message-ID  1203 non-null   object             
 5   Body        1203 non-null   object             
 6   Reply-To    1203 non-null   object             
dtypes: datetime64[ns, UTC](1), object(6)
memory usage: 65.9+ KB


## Add Some Additional Date Info

In [26]:
import pandas as pd

# Check if 'Timestamp' column exists, if not rename 'Date' to 'Timestamp'
if 'Timestamp' not in df_emails_imap.columns:
    df_emails_imap.rename(columns={'Date': 'Timestamp'}, inplace=True)

# Convert 'Timestamp' column to datetime format only if needed
if df_emails_imap['Timestamp'].dtype != 'datetime64[ns]':
    df_emails_imap['Timestamp'] = pd.to_datetime(df_emails_imap['Timestamp'], errors='coerce')

# Remove the duplicate 'Timestamp' column if it exists
if df_emails_imap.columns.duplicated().any():
    df_emails_imap = df_emails_imap.loc[:, ~df_emails_imap.columns.duplicated()]

# Add new columns derived from 'Timestamp'
df_emails_imap['Date'] = df_emails_imap['Timestamp'].dt.date
df_emails_imap['Time'] = df_emails_imap['Timestamp'].dt.time
df_emails_imap['Day_of_Week'] = df_emails_imap['Timestamp'].dt.dayofweek
df_emails_imap['Day_of_Week_String'] = df_emails_imap['Timestamp'].dt.strftime('%a')

# Display the schema of the updated DataFrame
df_emails_imap.info()
# Display the last 5 rows of the updated DataFrame
display(df_emails_imap.tail(3))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1203 entries, 0 to 1202
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   Timestamp           1203 non-null   datetime64[ns, UTC]
 1   Subject             1203 non-null   object             
 2   From                1203 non-null   object             
 3   To                  1203 non-null   object             
 4   Message-ID          1203 non-null   object             
 5   Body                1203 non-null   object             
 6   Reply-To            1203 non-null   object             
 7   Date                1203 non-null   object             
 8   Time                1203 non-null   object             
 9   Day_of_Week         1203 non-null   int32              
 10  Day_of_Week_String  1203 non-null   object             
dtypes: datetime64[ns, UTC](1), int32(1), object(9)
memory usage: 98.8+ KB


Unnamed: 0,Timestamp,Subject,From,To,Message-ID,Body,Reply-To,Date,Time,Day_of_Week,Day_of_Week_String
1200,2025-01-30 13:51:06+00:00,We received your application!,eisneramper@myworkday.com,mike@mikecancell.com,<1409290418.13919826.1738245066869@myworkday.com>,"<!doctype html><html xmlns:v=""urn:schemas-micr...",noreply.workday@eisnerampermail.com,2025-01-30,13:51:06,3,Thu
1201,2025-01-30 14:05:01+00:00,Thank you for your application to Figma,no-reply@figma.com,Mike.Cancell@gmail.com,<20250130140501.abd7bf94be4ee386@figma.com>,"Hi Michael,\r\n\r\nThank you for your interest...",no-reply@figma.com,2025-01-30,14:05:01,3,Thu
1202,2025-01-30 14:27:47+00:00,50% off one full-price item!,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?= <extracare@yo...,mike@mikecancell.com,<0.1.F.2D9.1DB7323229121DA.0@omp.your.cvs.com>,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?= <noreply@your...,2025-01-30,14:27:47,3,Thu


## From the From and To Cols, Extract the Actual Addr and the Display Name in Sep Cols

In [27]:
import pandas as pd
import re

# Function to extract email address and display name
def extract_email_and_name(email):
    match = re.match(r'(?:"?([^"]*)"?\s)?(?:<?(.+@[^>]+)>?)', email)
    if match:
        display_name, email_address = match.groups()
        if not display_name:
            display_name = email_address.split('@')[0]
        return display_name, email_address
    return email, email

# Apply the function to the 'From' column if 'From_Display_Name' and 'From_Addr' columns do not exist
if 'From_Display_Name' not in df_emails_imap.columns and 'From_Addr' not in df_emails_imap.columns:
    df_emails_imap[['From_Display_Name', 'From_Addr']] = df_emails_imap['From'].apply(lambda x: pd.Series(extract_email_and_name(x)))

# Apply the function to the 'To' column if 'To_Display_Name' and 'To_Addr' columns do not exist
if 'To_Display_Name' not in df_emails_imap.columns and 'To_Addr' not in df_emails_imap.columns:
    df_emails_imap[['To_Display_Name', 'To_Addr']] = df_emails_imap['To'].apply(lambda x: pd.Series(extract_email_and_name(x)))

# Apply the function to the 'Reply-To' column if 'Reply-To_Display_Name' and 'Reply-To_Addr' columns do not exist
if 'Reply-To_Display_Name' not in df_emails_imap.columns and 'Reply-To_Addr' not in df_emails_imap.columns:
    df_emails_imap[['Reply-To_Display_Name', 'Reply-To_Addr']] = df_emails_imap['Reply-To'].apply(lambda x: pd.Series(extract_email_and_name(x)))

# Function to extract main domain from email address
def extract_main_domain(email):
    domain = email.split('@')[-1]
    main_domain = '.'.join(domain.split('.')[-2:])
    return main_domain

# Apply the function to the 'From_Addr' column to create 'From_Domain' column if it does not exist
if 'From_Domain' not in df_emails_imap.columns:
    df_emails_imap['From_Domain'] = df_emails_imap['From_Addr'].apply(extract_main_domain)

# Display the last 10 rows of the updated DataFrame
display(df_emails_imap.tail(3))
df_emails_imap.info()

Unnamed: 0,Timestamp,Subject,From,To,Message-ID,Body,Reply-To,Date,Time,Day_of_Week,Day_of_Week_String,From_Display_Name,From_Addr,To_Display_Name,To_Addr,Reply-To_Display_Name,Reply-To_Addr,From_Domain
1200,2025-01-30 13:51:06+00:00,We received your application!,eisneramper@myworkday.com,mike@mikecancell.com,<1409290418.13919826.1738245066869@myworkday.com>,"<!doctype html><html xmlns:v=""urn:schemas-micr...",noreply.workday@eisnerampermail.com,2025-01-30,13:51:06,3,Thu,eisneramper,eisneramper@myworkday.com,mike,mike@mikecancell.com,noreply.workday,noreply.workday@eisnerampermail.com,myworkday.com
1201,2025-01-30 14:05:01+00:00,Thank you for your application to Figma,no-reply@figma.com,Mike.Cancell@gmail.com,<20250130140501.abd7bf94be4ee386@figma.com>,"Hi Michael,\r\n\r\nThank you for your interest...",no-reply@figma.com,2025-01-30,14:05:01,3,Thu,no-reply,no-reply@figma.com,Mike.Cancell,Mike.Cancell@gmail.com,no-reply,no-reply@figma.com,figma.com
1202,2025-01-30 14:27:47+00:00,50% off one full-price item!,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?= <extracare@yo...,mike@mikecancell.com,<0.1.F.2D9.1DB7323229121DA.0@omp.your.cvs.com>,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?= <noreply@your...,2025-01-30,14:27:47,3,Thu,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?=,extracare@your.cvs.com,mike,mike@mikecancell.com,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?=,noreply@your.cvs.com,cvs.com


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1203 entries, 0 to 1202
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   Timestamp              1203 non-null   datetime64[ns, UTC]
 1   Subject                1203 non-null   object             
 2   From                   1203 non-null   object             
 3   To                     1203 non-null   object             
 4   Message-ID             1203 non-null   object             
 5   Body                   1203 non-null   object             
 6   Reply-To               1203 non-null   object             
 7   Date                   1203 non-null   object             
 8   Time                   1203 non-null   object             
 9   Day_of_Week            1203 non-null   int32              
 10  Day_of_Week_String     1203 non-null   object             
 11  From_Display_Name      1203 non-null   object           

## Quick Check for Missing Data

In [28]:
missing_data = df_emails_imap.isnull().sum()
print(missing_data)

Timestamp                0
Subject                  0
From                     0
To                       0
Message-ID               0
Body                     0
Reply-To                 0
Date                     0
Time                     0
Day_of_Week              0
Day_of_Week_String       0
From_Display_Name        0
From_Addr                0
To_Display_Name          0
To_Addr                  0
Reply-To_Display_Name    0
Reply-To_Addr            0
From_Domain              0
dtype: int64


## Clean up the Body to remove HTML & CSS Markup

In [29]:
import warnings
from bs4 import BeautifulSoup
import re

# Ignore MarkupResemblesLocatorWarning
warnings.filterwarnings("ignore", category=UserWarning, message="MarkupResemblesLocatorWarning:*")

# Function to remove HTML markup, CSS style tags, and URLs
def remove_markup_and_urls(text):
    soup = BeautifulSoup(text, 'html.parser')
    # Remove CSS style tags
    for style in soup(["style"]):
        style.decompose()
    text_without_markup = soup.get_text()
    text_without_urls = re.sub(r'http\S+|www\S+', '', text_without_markup, flags=re.MULTILINE)
    # Remove any remaining CSS markup
    text_without_css = re.sub(r'{.*?}', '', text_without_urls, flags=re.DOTALL)
    # Remove any remaining HTML tags
    clean_text = re.sub(r'<.*?>', '', text_without_css)
    return clean_text.strip()

# Add 'Body_Text' column if it doesn't already exist
if 'Body_Text' not in df_emails_imap.columns:
    df_emails_imap['Body_Text'] = df_emails_imap['Body'].apply(remove_markup_and_urls)

# Display the schema of the updated DataFrame
df_emails_imap.info()

  soup = BeautifulSoup(text, 'html.parser')
  soup = BeautifulSoup(text, 'html.parser')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1203 entries, 0 to 1202
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   Timestamp              1203 non-null   datetime64[ns, UTC]
 1   Subject                1203 non-null   object             
 2   From                   1203 non-null   object             
 3   To                     1203 non-null   object             
 4   Message-ID             1203 non-null   object             
 5   Body                   1203 non-null   object             
 6   Reply-To               1203 non-null   object             
 7   Date                   1203 non-null   object             
 8   Time                   1203 non-null   object             
 9   Day_of_Week            1203 non-null   int32              
 10  Day_of_Week_String     1203 non-null   object             
 11  From_Display_Name      1203 non-null   object           

## Drop the Original From/To Cols (post split)

In [30]:
# Drop the specified columns if they exist
columns_to_drop = ['From', 'To', 'Body', 'Reply-To']
df_emails_imap.drop(columns=[col for col in columns_to_drop if col in df_emails_imap.columns], inplace=True)

# Display the last 3 rows of the remaining columns
display(df_emails_imap.tail(3))

Unnamed: 0,Timestamp,Subject,Message-ID,Date,Time,Day_of_Week,Day_of_Week_String,From_Display_Name,From_Addr,To_Display_Name,To_Addr,Reply-To_Display_Name,Reply-To_Addr,From_Domain,Body_Text
1200,2025-01-30 13:51:06+00:00,We received your application!,<1409290418.13919826.1738245066869@myworkday.com>,2025-01-30,13:51:06,3,Thu,eisneramper,eisneramper@myworkday.com,mike,mike@mikecancell.com,noreply.workday,noreply.workday@eisnerampermail.com,myworkday.com,"Hi Michael Cancell , Thank you for your intere..."
1201,2025-01-30 14:05:01+00:00,Thank you for your application to Figma,<20250130140501.abd7bf94be4ee386@figma.com>,2025-01-30,14:05:01,3,Thu,no-reply,no-reply@figma.com,Mike.Cancell,Mike.Cancell@gmail.com,no-reply,no-reply@figma.com,figma.com,"Hi Michael,\r\n\r\nThank you for your interest..."
1202,2025-01-30 14:27:47+00:00,50% off one full-price item!,<0.1.F.2D9.1DB7323229121DA.0@omp.your.cvs.com>,2025-01-30,14:27:47,3,Thu,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?=,extracare@your.cvs.com,mike,mike@mikecancell.com,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?=,noreply@your.cvs.com,cvs.com,CVS\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...


## Create a Category from the Body and From Addr

### Manual classification

In [31]:
# Define a function to categorize the body text
def categorize_body_text(body_text):
    body_text_lower = body_text.lower()
    keywords = {
        'Job Rejection': ['rejection', 'not selected', 'unfortunately', 'keep your resume on file', 'regret'],
        'Job Application': ['application', 'received', 'apply', 'your interest', 'employment'],
        'Invoice': ['invoice'],
        'Promotion': ['promotion'],
        'Newsletter': ['newsletter'],
        'Reminder': ['reminder'],
        'Meeting': ['meeting', 'schedule'],
        'Password Reset': ['password', 'reset'],
        'Shipping': ['shipping', 'delivery'],
        'Receipt': ['receipt', 'purchase'],
        'Subscription': ['welcome to']
    }
    for category, words in keywords.items():
        if any(word in body_text_lower for word in words):
            return category
    return 'Other'

# Apply the function to the 'Body_Text' column
df_emails_imap['Category'] = df_emails_imap['Body_Text'].apply(categorize_body_text)

# Function to categorize by From_Addr
def categorize_from_addr(from_addr):
    keywords = {
        'Bank': ['wellsfargo', 'chase', 'bankofamerica', 'citibank', 'usbank', 'pnc', 'tdbank', 'capitalone', 
                 'bbt', 'suntrust', 'ally', 'hsbc', 'santander', 'citizensbank', 'fifththirdbank', 'keybank', 
                 'huntington', 'bmo', 'm&t', 'regions', 'synchrony', 'americanexpress', 'bank of princeton'],
        'Job Application': ['greenhouse', 'lever', 'dayforce', 'trimble', 'workday', 'career', 'job', 'talent', 
                            'linkedin', 'indeed', 'glassdoor', 'monster', 'ziprecruiter', 'simplyhired', 'angel.co', 
                            'hired', 'jobvite', 'smartrecruiters', 'icims', 'bamboohr', 'jazzhr', 'recruitee', 
                            'jobscore', 'recruiterbox', 'clearcompany', 'breezyhr', 'jobadder', 'workable', 
                            'recruitcrm', 'hiretual', 'hireez', 'hirebridge', 'hireology', 'hirevue', 'jobsoid', 
                            'recruiterflow', 'talentreef', 'teamtailor', 'workpop', 'zoho', 'recruiter', 
                            'recruitment', 'recruiting', 'careers'],
        'Shipping': ['usps', 'ups', 'fedex', 'dhl', 'ontrac', 'lasership', 'amazon'],
        'Insurance': ['njmmail', 'bcbs', 'aetna', 'cigna', 'humana', 'unitedhealthcare', 'anthem', 'metlife', 
                      'prudential', 'allstate', 'statefarm', 'geico', 'libertymutual', 'progressive', 'nationwide', 
                      'farmers', 'travelers', 'ameriprise', 'mutualofomaha', 'guardianlife','njm'],
        'Entertainment': ['imdb'],
        'Music': ['bandsintown'],
        'Newsletter': ['quora'],
        'Finance': ['synchronyfinancial'],
        'Tech': ['google', 'aws', 'azure', 'gcp', 'cloud'],
        'Food': ['chick-fil-a']
    }
    for category, words in keywords.items():
        if any(word in from_addr.lower() for word in words):
            return category
    return 'Other'

# Apply the function to the 'From_Addr' column if 'From_Addr_Category' column does not exist
if 'From_Addr_Category' not in df_emails_imap.columns:
    df_emails_imap['From_Addr_Category'] = df_emails_imap['From_Addr'].apply(categorize_from_addr)

# Combine the categories from Body_Text and From_Addr
def combine_categories(row):
    return row['Category'] if row['Category'] != 'Other' else row['From_Addr_Category']

# Apply the function to combine categories only if 'Email_Category' column does not exist
if 'Email_Category' not in df_emails_imap.columns:
    df_emails_imap['Email_Category'] = df_emails_imap.apply(combine_categories, axis=1)

# Drop the intermediate 'From_Addr_Category' column if it exists
if 'From_Addr_Category' in df_emails_imap.columns:
    df_emails_imap.drop(columns=['From_Addr_Category'], inplace=True)

# Display the last 3 rows of the updated DataFrame
display(df_emails_imap.tail(3))


Unnamed: 0,Timestamp,Subject,Message-ID,Date,Time,Day_of_Week,Day_of_Week_String,From_Display_Name,From_Addr,To_Display_Name,To_Addr,Reply-To_Display_Name,Reply-To_Addr,From_Domain,Body_Text,Category,Email_Category
1200,2025-01-30 13:51:06+00:00,We received your application!,<1409290418.13919826.1738245066869@myworkday.com>,2025-01-30,13:51:06,3,Thu,eisneramper,eisneramper@myworkday.com,mike,mike@mikecancell.com,noreply.workday,noreply.workday@eisnerampermail.com,myworkday.com,"Hi Michael Cancell , Thank you for your intere...",Job Application,Job Application
1201,2025-01-30 14:05:01+00:00,Thank you for your application to Figma,<20250130140501.abd7bf94be4ee386@figma.com>,2025-01-30,14:05:01,3,Thu,no-reply,no-reply@figma.com,Mike.Cancell,Mike.Cancell@gmail.com,no-reply,no-reply@figma.com,figma.com,"Hi Michael,\r\n\r\nThank you for your interest...",Job Rejection,Job Rejection
1202,2025-01-30 14:27:47+00:00,50% off one full-price item!,<0.1.F.2D9.1DB7323229121DA.0@omp.your.cvs.com>,2025-01-30,14:27:47,3,Thu,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?=,extracare@your.cvs.com,mike,mike@mikecancell.com,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?=,noreply@your.cvs.com,cvs.com,CVS\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,Job Application,Job Application


### Machine Learning Classification
Have tried several diff libs for AI classification. None are working well. Not enough of a sample. Will circle back to this later.

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

# Prepare the data
X = df_emails_imap['Body_Text']
y = df_emails_imap['Category']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline that combines the TF-IDF vectorizer with a Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(stop_words='english'), MultinomialNB())

# Train the model
model.fit(X_train, y_train)

# Predict the categories for the test set
y_pred = model.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))

# Apply the model to the entire dataset to create a new column with the predicted categories
df_emails_imap['ml_body_cat'] = model.predict(df_emails_imap['Body_Text'])

# Display the last 3 rows of the updated DataFrame
display(df_emails_imap.tail(3))


                 precision    recall  f1-score   support

        Invoice       0.83      0.45      0.59        11
Job Application       0.58      0.99      0.73       114
  Job Rejection       0.00      0.00      0.00        11
        Meeting       0.00      0.00      0.00        10
     Newsletter       0.00      0.00      0.00         2
          Other       0.69      0.38      0.49        53
 Password Reset       0.00      0.00      0.00         2
      Promotion       0.00      0.00      0.00         2
        Receipt       0.00      0.00      0.00         7
       Reminder       0.00      0.00      0.00         3
       Shipping       1.00      0.63      0.77        19
   Subscription       0.00      0.00      0.00         7

       accuracy                           0.62       241
      macro avg       0.26      0.20      0.22       241
   weighted avg       0.54      0.62      0.54       241



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Timestamp,Subject,Message-ID,Date,Time,Day_of_Week,Day_of_Week_String,From_Display_Name,From_Addr,To_Display_Name,To_Addr,Reply-To_Display_Name,Reply-To_Addr,From_Domain,Body_Text,Category,Email_Category,ml_body_cat
1200,2025-01-30 13:51:06+00:00,We received your application!,<1409290418.13919826.1738245066869@myworkday.com>,2025-01-30,13:51:06,3,Thu,eisneramper,eisneramper@myworkday.com,mike,mike@mikecancell.com,noreply.workday,noreply.workday@eisnerampermail.com,myworkday.com,"Hi Michael Cancell , Thank you for your intere...",Job Application,Job Application,Job Application
1201,2025-01-30 14:05:01+00:00,Thank you for your application to Figma,<20250130140501.abd7bf94be4ee386@figma.com>,2025-01-30,14:05:01,3,Thu,no-reply,no-reply@figma.com,Mike.Cancell,Mike.Cancell@gmail.com,no-reply,no-reply@figma.com,figma.com,"Hi Michael,\r\n\r\nThank you for your interest...",Job Rejection,Job Rejection,Job Application
1202,2025-01-30 14:27:47+00:00,50% off one full-price item!,<0.1.F.2D9.1DB7323229121DA.0@omp.your.cvs.com>,2025-01-30,14:27:47,3,Thu,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?=,extracare@your.cvs.com,mike,mike@mikecancell.com,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?=,noreply@your.cvs.com,cvs.com,CVS\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...,Job Application,Job Application,Job Application


# Design a Schema for the Data

## Create an Email Addresses Dim

In [33]:
# Extract unique 'From' addresses and display names
from_addresses = df_emails_imap[['From_Display_Name', 'From_Addr']].drop_duplicates()
from_addresses.columns = ['Display_Name', 'fk_Email_Addr']

# Extract unique 'To' addresses and display names
to_addresses = df_emails_imap[['To_Display_Name', 'To_Addr']].drop_duplicates()
to_addresses.columns = ['Display_Name', 'fk_Email_Addr']

# Combine the two DataFrames
df_email_addresses_dim = pd.concat([from_addresses, to_addresses]).drop_duplicates().reset_index(drop=True)

# Reorder columns to make 'fk_Email_Addr' the first column
cols = ['fk_Email_Addr'] + [col for col in df_email_addresses_dim.columns if col != 'fk_Email_Addr']
df_email_addresses_dim = df_email_addresses_dim[cols]

# Display the email addresses dimension
display(df_email_addresses_dim)

Unnamed: 0,fk_Email_Addr,Display_Name
0,do-not-reply@imdb.com,IMDb.com
1,notifications@updates.bandsintown.com,Bandsintown
2,english-personalized-digest@quora.com,Quora Digest
3,USPSInformeddelivery@email.informeddelivery.us...,USPS Informed Delivery
4,extracare@your.cvs.com,=?UTF-8?B?Q1ZTIEV4dHJhQ2FyZQ==?=
...,...,...
531,MIKE.CANCELL@GMAIL.COM,'MIKE.CANCELL@GMAIL.COM'
532,mikecancell@gmail.com,mikecancell
533,dbriere@interactions.com,Benemax Service <Benemax.service@onedigital.co...
534,mike@mikecancell.com,'mike@mikecancell.com'


### Save the Email Addresses to a Pickle for Caching

In [34]:
import pickle
import os

# Create the pickles directory if it does not exist
pickles_dir = '../.pickles'
os.makedirs(pickles_dir, exist_ok=True)

# Save the df_email_addresses_dim DataFrame to a pickle file
with open(f"{pickles_dir}/df_email_addresses_dim.pkl", 'wb') as f:
    pickle.dump(df_email_addresses_dim, f)

## Create Dim for Domains 

In [35]:
import pandas as pd
import pickle
import os
from tqdm import tqdm

# Load the existing df_main_domains_dim DataFrame from the pickle file
pickle_path = '../.pickles/df_main_domains_dim.pkl'
if os.path.exists(pickle_path):
    with open(pickle_path, 'rb') as f:
        df_main_domains_dim = pickle.load(f)
    print("DataFrame loaded successfully from pickle file.")
else:
    print("Pickle file not found. Please ensure the file exists and try again.")

# Ensure df_emails_imap is defined before this cell is executed
if 'df_emails_imap' in globals():
    # Extract unique main domains from the 'From_Domain' column
    unique_main_domains = df_emails_imap['From_Domain'].unique()

    # Filter out domains that already exist in df_main_domains_dim
    new_domains = [domain for domain in tqdm(unique_main_domains, desc="Filtering new domains") if domain not in df_main_domains_dim['Main_Domain'].values]

    # Create a new DataFrame with these new main domains
    df_new_main_domains = pd.DataFrame(new_domains, columns=['Main_Domain'])

    # Append the new domains to the existing df_main_domains_dim DataFrame
    df_main_domains_dim = pd.concat([df_main_domains_dim, df_new_main_domains], ignore_index=True)

    # Display the updated DataFrame
    display(df_main_domains_dim)

    # Save the updated df_main_domains_dim DataFrame back to the pickle file
    with open(pickle_path, 'wb') as f:
        pickle.dump(df_main_domains_dim, f)
    print("Updated DataFrame saved to pickle file.")
    print(f"Number of new domains added: {len(new_domains)}")
else:
    print("Error: df_emails_imap is not defined. Please ensure the cell defining df_emails_imap is executed.")


DataFrame loaded successfully from pickle file.


Filtering new domains:   0%|          | 0/251 [00:00<?, ?it/s]

Filtering new domains: 100%|██████████| 251/251 [00:00<00:00, 144195.36it/s]


Unnamed: 0,Main_Domain,Domain_Name,Org_Name,Name,Whois_Retrieved,was_scraped,og_site_name,og_title,og_description,og_url,og_image
0,imdb.com,IMDB.COM,Unknown,Unknown,True,True,Unknown,"IMDb: Ratings, Reviews, and Where to Watch the...",IMDb is the world's most popular and authorita...,https://www.imdb.com/,https://m.media-amazon.com/images/G/01/imdb/im...
1,bandsintown.com,BANDSINTOWN.COM,Identity Protection Service,On behalf of bandsintown.com owner,True,True,Unknown,Unknown,Unknown,Unknown,Unknown
2,quora.com,QUORA.COM,"Quora, Inc",Legal Team,True,True,Unknown,Quora,,https://www.quora.com/,https://qsf.cf2.quoracdn.net/-4-images.share_d...
3,usps.com,USPS.COM,Unknown,Unknown,True,True,Unknown,Welcome | USPS,"Welcome to USPS.com. Track packages, pay and p...",https://www.usps.com/,https://www.usps.com/assets/images/welcome/usp...
4,cvs.com,CVS.COM,Unknown,Unknown,True,True,Unknown,Unknown,Unknown,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...,...
246,login.gov,,,,,,,,,,
247,marinerwealth.com,,,,,,,,,,
248,auctiontechnologygroup.com,,,,,,,,,,
249,aspca.org,,,,,,,,,,


Updated DataFrame saved to pickle file.
Number of new domains added: 6


### Now from the Main Domain, Let's try to Get the Company Name from the Internet

#### Uses Parallelization for Performance

In [36]:
import whois
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import time
import pandas as pd
import os
import pickle

# Function to get detailed information from main domain using whois with retry mechanism
def get_whois_info(main_domain, retries=3, delay=5):
    for attempt in range(retries):
        try:
            time.sleep(1.5)  # Add a delay before each whois call to address rate limiting
            domain_info = whois.whois(main_domain)
            return {
                'Domain_Name': domain_info.get('domain_name', 'Unknown'),
                'Org_Name': domain_info.get('org', 'Unknown'),
                'Name': domain_info.get('name', 'Unknown')
            }
        except Exception as e:
            print(f"Error resolving whois info for domain {main_domain} on attempt {attempt + 1}: {e}")
            time.sleep(delay)
    return {
        'Domain_Name': 'Unknown',
        'Org_Name': 'Unknown',
        'Name': 'Unknown'
    }

# Function to apply get_whois_info in parallel
def fetch_whois_info(domains):
    whois_info_list = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(tqdm(executor.map(get_whois_info, domains), total=len(domains), desc="Fetching WHOIS info"))
    for result in results:
        whois_info_list.append(result)
    return whois_info_list

# Load the existing df_main_domains_dim DataFrame from the pickle file if it exists
pickle_path = '../.pickles/df_main_domains_dim.pkl'
if os.path.exists(pickle_path):
    print("Loading existing df_main_domains_dim DataFrame from pickle file...")
    with open(pickle_path, 'rb') as f:
        df_main_domains_dim = pickle.load(f)
    print("DataFrame loaded successfully.")
else:
    print("Pickle file not found. A new pickle file will be created after data is retrieved.")
    # Initialize the DataFrame with necessary columns if pickle file is not found
    df_main_domains_dim = df_main_domains_dim.assign(
        Domain_Name='Unknown',
        Org_Name='Unknown',
        Name='Unknown',
        Whois_Retrieved=False
    )

# Ensure the necessary columns are present in the DataFrame
required_columns = ['Domain_Name', 'Org_Name', 'Name', 'Whois_Retrieved']
for col in required_columns:
    if col not in df_main_domains_dim.columns:
        df_main_domains_dim[col] = 'Unknown' if col != 'Whois_Retrieved' else False

# Filter domains that need whois info
print("Filtering domains that need whois info...")
domains_to_process = df_main_domains_dim[
    (df_main_domains_dim['Whois_Retrieved'] == False) | 
    (df_main_domains_dim['Whois_Retrieved'].isnull())
]['Main_Domain']
print(f"Found {len(domains_to_process)} domains to process.")
total_domains = len(df_main_domains_dim)
skipped_domains = total_domains - len(domains_to_process)
print(f"Total domains: {total_domains}")
print(f"Skipped domains (already have WHOIS info): {skipped_domains}")

# Apply the function to the filtered domains with progress indicator
if len(domains_to_process) > 0:
    print("Starting to fetch WHOIS info in parallel...")
    whois_info = fetch_whois_info(domains_to_process)
    print("WHOIS info fetching completed.")

    # Convert the list of dictionaries to a DataFrame
    print("Converting WHOIS info to DataFrame...")
    whois_info_df = pd.DataFrame(whois_info, index=domains_to_process.index)
    print("Conversion completed.")

    # Update the existing DataFrame with the new whois info
    print("Updating the existing DataFrame with the new WHOIS info...")
    df_main_domains_dim.update(whois_info_df)
    print("DataFrame updated successfully.")

    # Add a boolean column indicating that WHOIS data has been retrieved
    df_main_domains_dim.loc[domains_to_process.index, 'Whois_Retrieved'] = True

# Display the updated DataFrame
display(df_main_domains_dim)
# Save the updated df_main_domains_dim DataFrame back to the pickle file
with open(pickle_path, 'wb') as f:
    pickle.dump(df_main_domains_dim, f)
print("Updated DataFrame saved to pickle file.")

Loading existing df_main_domains_dim DataFrame from pickle file...
DataFrame loaded successfully.
Filtering domains that need whois info...
Found 6 domains to process.
Total domains: 251
Skipped domains (already have WHOIS info): 245
Starting to fetch WHOIS info in parallel...


Fetching WHOIS info: 100%|██████████| 6/6 [00:02<00:00,  2.58it/s]

WHOIS info fetching completed.
Converting WHOIS info to DataFrame...
Conversion completed.
Updating the existing DataFrame with the new WHOIS info...
DataFrame updated successfully.





Unnamed: 0,Main_Domain,Domain_Name,Org_Name,Name,Whois_Retrieved,was_scraped,og_site_name,og_title,og_description,og_url,og_image
0,imdb.com,IMDB.COM,Unknown,Unknown,True,True,Unknown,"IMDb: Ratings, Reviews, and Where to Watch the...",IMDb is the world's most popular and authorita...,https://www.imdb.com/,https://m.media-amazon.com/images/G/01/imdb/im...
1,bandsintown.com,BANDSINTOWN.COM,Identity Protection Service,On behalf of bandsintown.com owner,True,True,Unknown,Unknown,Unknown,Unknown,Unknown
2,quora.com,QUORA.COM,"Quora, Inc",Legal Team,True,True,Unknown,Quora,,https://www.quora.com/,https://qsf.cf2.quoracdn.net/-4-images.share_d...
3,usps.com,USPS.COM,Unknown,Unknown,True,True,Unknown,Welcome | USPS,"Welcome to USPS.com. Track packages, pay and p...",https://www.usps.com/,https://www.usps.com/assets/images/welcome/usp...
4,cvs.com,CVS.COM,Unknown,Unknown,True,True,Unknown,Unknown,Unknown,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...,...
246,login.gov,login.gov,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,True,,,,,,
247,marinerwealth.com,MARINERWEALTH.COM,Identity Protection Service,On behalf of marinerwealth.com owner,True,,,,,,
248,auctiontechnologygroup.com,AUCTIONTECHNOLOGYGROUP.COM,"Domains By Proxy, LLC",Registration Private,True,,,,,,
249,aspca.org,aspca.org,The American Society for the Prevention of Cru...,REDACTED FOR PRIVACY,True,,,,,,


Updated DataFrame saved to pickle file.


### Now try to scrape the domain for meta data

In [37]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import pickle
import os

# Load the existing df_main_domains_dim DataFrame from the pickle file if it exists
pickle_path = '../.pickles/df_main_domains_dim.pkl'
if os.path.exists(pickle_path):
    with open(pickle_path, 'rb') as f:
        df_main_domains_dim = pickle.load(f)
    print("DataFrame loaded successfully from pickle file.")
else:
    print("Pickle file not found. Please ensure the file exists and try again.")

# Add 'was_scraped' column if it doesn't exist
if 'was_scraped' not in df_main_domains_dim.columns:
    df_main_domains_dim['was_scraped'] = False

# Add meta data columns if they don't exist
meta_columns = ['og_site_name', 'og_title', 'og_description', 'og_url', 'og_image']
for col in meta_columns:
    if col not in df_main_domains_dim.columns:
        df_main_domains_dim[col] = 'Unknown'

# Function to get meta tag data with retry logic
def get_meta_data(domain_name, retries=3, delay=5):
    meta_data = {
        'og_site_name': 'Unknown',
        'og_title': 'Unknown',
        'og_description': 'Unknown',
        'og_url': 'Unknown',
        'og_image': 'Unknown'
    }
    for attempt in range(retries):
        try:
            url = f"http://{domain_name}"
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            for tag in meta_data.keys():
                meta_tag = soup.find('meta', attrs={'name': tag.replace('_', ':')}) or soup.find('meta', attrs={'property': tag.replace('_', ':')})
                if meta_tag and 'content' in meta_tag.attrs:
                    meta_data[tag] = meta_tag['content']
            return meta_data
        except Exception as e:
            print(f"Error fetching meta data for domain {domain_name} on attempt {attempt + 1}: {e}")
            time.sleep(delay)
    return meta_data

# Filter domains that need meta data (excluding 'Unknown' as valid data and not already scraped)
domains_to_process = df_main_domains_dim[
    (df_main_domains_dim['was_scraped'] == False)
]['Main_Domain']

print(f"Total domains to process: {len(domains_to_process)}")

# Function to apply get_meta_data in parallel
def fetch_meta_data(domains):
    meta_data_list = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        results = list(tqdm(executor.map(get_meta_data, domains), total=len(domains), desc="Fetching meta data"))
    for result in results:
        meta_data_list.append(result)
    return meta_data_list

# Apply the function to the filtered domains with progress indicator
meta_data = fetch_meta_data(domains_to_process)

# Convert the list of dictionaries to a DataFrame
meta_data_df = pd.DataFrame(meta_data, index=domains_to_process.index)

# Update the existing DataFrame with the new meta data
df_main_domains_dim.update(meta_data_df)

# Set 'was_scraped' to True for the processed domains
df_main_domains_dim.loc[domains_to_process.index, 'was_scraped'] = True

# Display the updated DataFrame
display(df_main_domains_dim)

# Save the updated df_main_domains_dim DataFrame to a pickle file
with open(pickle_path, 'wb') as f:
    pickle.dump(df_main_domains_dim, f)

print("Meta data fetching and processing completed successfully.")
print(f"Total domains processed: {len(domains_to_process)}")
print(f"Total domains skipped (already had meta data): {skipped_domains}")
print(f"Updated DataFrame saved to pickle file: {pickle_path}")


DataFrame loaded successfully from pickle file.
Total domains to process: 0


Fetching meta data: 0it [00:00, ?it/s]


Unnamed: 0,Main_Domain,Domain_Name,Org_Name,Name,Whois_Retrieved,was_scraped,og_site_name,og_title,og_description,og_url,og_image
0,imdb.com,IMDB.COM,Unknown,Unknown,True,True,Unknown,"IMDb: Ratings, Reviews, and Where to Watch the...",IMDb is the world's most popular and authorita...,https://www.imdb.com/,https://m.media-amazon.com/images/G/01/imdb/im...
1,bandsintown.com,BANDSINTOWN.COM,Identity Protection Service,On behalf of bandsintown.com owner,True,True,Unknown,Unknown,Unknown,Unknown,Unknown
2,quora.com,QUORA.COM,"Quora, Inc",Legal Team,True,True,Unknown,Quora,,https://www.quora.com/,https://qsf.cf2.quoracdn.net/-4-images.share_d...
3,usps.com,USPS.COM,Unknown,Unknown,True,True,Unknown,Welcome | USPS,"Welcome to USPS.com. Track packages, pay and p...",https://www.usps.com/,https://www.usps.com/assets/images/welcome/usp...
4,cvs.com,CVS.COM,Unknown,Unknown,True,True,Unknown,Unknown,Unknown,Unknown,Unknown
...,...,...,...,...,...,...,...,...,...,...,...
246,login.gov,login.gov,REDACTED FOR PRIVACY,REDACTED FOR PRIVACY,True,,,,,,
247,marinerwealth.com,MARINERWEALTH.COM,Identity Protection Service,On behalf of marinerwealth.com owner,True,,,,,,
248,auctiontechnologygroup.com,AUCTIONTECHNOLOGYGROUP.COM,"Domains By Proxy, LLC",Registration Private,True,,,,,,
249,aspca.org,aspca.org,The American Society for the Prevention of Cru...,REDACTED FOR PRIVACY,True,,,,,,


Meta data fetching and processing completed successfully.
Total domains processed: 0
Total domains skipped (already had meta data): 245
Updated DataFrame saved to pickle file: ../.pickles/df_main_domains_dim.pkl


## Create a Fact Table

### First Let's Review the Dimensions so we know what to join

In [38]:
# Display the schema of df_email_addresses_dim
print("Schema of df_email_addresses_dim:")
df_email_addresses_dim.info()

Schema of df_email_addresses_dim:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   fk_Email_Addr  536 non-null    object
 1   Display_Name   536 non-null    object
dtypes: object(2)
memory usage: 8.5+ KB


In [39]:
# Display the schema of df_main_domains_dim
print("\nSchema of df_main_domains_dim:")
df_main_domains_dim.info()


Schema of df_main_domains_dim:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251 entries, 0 to 250
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Main_Domain      251 non-null    object
 1   Domain_Name      251 non-null    object
 2   Org_Name         251 non-null    object
 3   Name             251 non-null    object
 4   Whois_Retrieved  251 non-null    object
 5   was_scraped      245 non-null    object
 6   og_site_name     245 non-null    object
 7   og_title         245 non-null    object
 8   og_description   245 non-null    object
 9   og_url           245 non-null    object
 10  og_image         245 non-null    object
dtypes: object(11)
memory usage: 21.7+ KB


In [40]:
# Display the schema of df_emails_imap
print("\nSchema of df_emails_imap:")
df_emails_imap.info()


Schema of df_emails_imap:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1203 entries, 0 to 1202
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   Timestamp              1203 non-null   datetime64[ns, UTC]
 1   Subject                1203 non-null   object             
 2   Message-ID             1203 non-null   object             
 3   Date                   1203 non-null   object             
 4   Time                   1203 non-null   object             
 5   Day_of_Week            1203 non-null   int32              
 6   Day_of_Week_String     1203 non-null   object             
 7   From_Display_Name      1203 non-null   object             
 8   From_Addr              1203 non-null   object             
 9   To_Display_Name        1203 non-null   object             
 10  To_Addr                1203 non-null   object             
 11  Reply-To_Display_Name  1203 n

# Save the DFs as Pickles

### Check the DFs that Exist

In [41]:
# List all DataFrames
dataframes = [var for var in globals() if isinstance(globals()[var], pd.DataFrame)]
print(dataframes)

['df_emails_imap', 'from_addresses', 'to_addresses', 'df_email_addresses_dim', 'df_main_domains_dim', 'df_new_main_domains', 'whois_info_df', 'meta_data_df']


### Only Save the Ones we Really Need

In [42]:
import pickle
import os

# Create the pickles directory if it does not exist
pickles_dir = '../.pickles'
os.makedirs(pickles_dir, exist_ok=True)

# List of DataFrame names
df_names = ['df_emails_imap', 'df_main_domains_dim']

# Loop through the list and store each DataFrame if it exists
for name in df_names:
    if name in globals():
        with open(f"{pickles_dir}/{name}.pkl", 'wb') as f:
            pickle.dump(globals()[name], f)
    else:
        print(f"DataFrame {name} does not exist and will not be pickled.")
