# Email Sender Analysis

This notebook analyzes email data to extract and explore sender information.

In [1]:
import json
from pathlib import Path

# Load the JSON file
data_path = Path('../data/1000mails_30days.json')
with open(data_path, 'r') as f:
    emails = json.load(f)

print(f"Loaded {len(emails)} emails")

Loaded 660 emails


In [3]:
import re

# Extract only email addresses (not names)
def extract_email(sender):
    """Extract email address from sender field"""
    # Match email in angle brackets: Name <email@domain.com>
    match = re.search(r'<([^>]+)>', sender)
    if match:
        return match.group(1)
    # Otherwise, return the whole string (it's probably just an email)
    return sender

sender_emails = [extract_email(email['sender']) for email in emails]

# Get unique emails
unique_emails = sorted(set(sender_emails))

# Display first 10 unique emails
print("First 10 unique sender emails:")
for i, email in enumerate(unique_emails[:10], 1):
    print(f"{i}. {email}")

print(f"\nTotal emails: {len(sender_emails)}")
print(f"Unique emails: {len(unique_emails)}")

# Save unique emails to file
output_path = Path('../data/unique_emails.txt')
with open(output_path, 'w') as f:
    for email in unique_emails:
        f.write(f"{email}\n")

print(f"\n✓ Saved {len(unique_emails)} unique emails to {output_path}")

First 10 unique sender emails:
1. MaredoGrill@hello.sevenrooms.com
2. Quanta@SimonsFoundation.org
3. Quanta@simonsfoundation.org
4. SWTOR@email.swtor.com
5. StarTrekOnline@arcgames.com
6. account-update@amazon.de
7. allianz@allianz.hr
8. architect-newsletter@mail.infoq.com
9. austinm@mail.packtpub.com
10. aws-marketing-email-replies@amazon.com

Total emails: 660
Unique emails: 105

✓ Saved 105 unique emails to ../data/unique_emails.txt


In [12]:
# Filter out unwanted email domains
filter_keywords = [
    'microsoft', 'aws', 'nvidia', 'boosteroid', 'linkedin', 
    'gamespot', 'atlassian', 'epicgames', 'blender', 'baeldung', 
    'croz', 'gog', 'glovo', 'allianz', 'overseas', 'gls',
    'sevenrooms', 'simonsfoundation', 'swtor', 'packtpub', 
    'amazon', '2k.com', 'arcgames', 'stardockcorporation', 
    'stardockentertainment', 'bolt', 'computervision', 'ebay', 'google',
    'netflix', 'pickboxnow', 'pbz', 'links', 'azfond', 'ubisoft',
    'asuswebstorage', 'paypal', 'gov.hr', 'clouding', 'asus', 'steampowered',
    'unicreditgroup', 'oracle', 'vaadin', 'mailboxde', 'openrouter'
]

def should_filter(email_address):
    """Check if email should be filtered out"""
    email_lower = email_address.lower()
    return any(keyword in email_lower for keyword in filter_keywords)

# Filter emails
filtered_emails = [email for email in unique_emails if not should_filter(email)]

print(f"Original unique emails: {len(unique_emails)}")
print(f"Filtered emails: {len(filtered_emails)}")
print(f"Removed: {len(unique_emails) - len(filtered_emails)}")

# Display first 10 filtered emails
print("\nFirst 10 filtered emails:")
for i, email in enumerate(filtered_emails[:20], 1):
    print(f"{i}. {email}")

# Save filtered emails to file
filtered_output_path = Path('../data/filtered_emails.txt')
with open(filtered_output_path, 'w') as f:
    for email in filtered_emails:
        f.write(f"{email}\n")

print(f"\n✓ Saved {len(filtered_emails)} filtered emails to {filtered_output_path}")

Original unique emails: 105
Filtered emails: 22
Removed: 83

First 10 filtered emails:
1. architect-newsletter@mail.infoq.com
2. bytebytego@substack.com
3. editor1@kdnuggets.com
4. hello@newsletters.venturebeat.com
5. info@email.meetup.com
6. marktechpost-newsletter@mail.beehiiv.com
7. news@prot.is
8. newsletters@arstechnica.com
9. no-reply@substack.com
10. noreply.uk@asendia.com
11. noreply@email.openai.com
12. not-for-reply@physorg.com
13. pragmaticengineer+announcements@substack.com
14. pragmaticengineer+deepdives@substack.com
15. pragmaticengineer@substack.com
16. rwieruch@substack.com
17. sebastianraschka@substack.com
18. secpro@substack.com
19. superhuman@mail.joinsuperhuman.ai
20. team@m.ngrok.com

✓ Saved 22 filtered emails to ../data/filtered_emails.txt
