### Install Dependencies

In [31]:
# !pip install langdetect geoip2
!pwd

/home/sagemaker-user/snitchmail-eda


In [32]:
import os
import json
import gzip
import shutil
import pandas as pd
from email.header import decode_header, make_header
from langdetect import detect
import geoip2.database
from bs4 import XMLParsedAsHTMLWarning
import warnings
from bs4 import BeautifulSoup
from collections import defaultdict
import awswrangler as wr
wr.engine.set("python") 
from multiprocessing import Pool
import logging
logging.getLogger('nltk').setLevel(logging.ERROR)

warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

### Connect to Sendgrid Role

In [33]:
import boto3
from pprint import pprint

# Assume the delegate role through STS and keep the temporary credentials it
# returns; every later S3 call goes through `session` / `s3`.
sts = boto3.Session().client("sts", region_name="us-east-1")
response = sts.assume_role(
    RoleSessionName="mimesamples-access",
    RoleArn="arn:aws:iam::375084544312:role/mimesample_delegate",
)

_credentials = response["Credentials"]
ACCESS_KEY, SECRET_KEY, SESSION_TOKEN = (
    _credentials["AccessKeyId"],
    _credentials["SecretAccessKey"],
    _credentials["SessionToken"],
)

# Session bound to the assumed role's temporary credentials.
session = boto3.Session(
    aws_session_token=SESSION_TOKEN,
    aws_secret_access_key=SECRET_KEY,
    aws_access_key_id=ACCESS_KEY,
)
s3 = session.client('s3')

# Bucket that holds the email samples read below.
bucket_name = 'mime-samples-production'

In [34]:
# Local working folders: downloaded .gz archives and the JSON extracted from them.
COMPRESSED_DATA_PATH = './raw_emails/'
JSON_PATH = './raw_emails_json/'

# Create both folders up front; exist_ok makes this cell safe to re-run.
for _folder in (COMPRESSED_DATA_PATH, JSON_PATH):
    os.makedirs(_folder, exist_ok=True)

# Snapshot of what is already on disk from previous runs.
files_in_compressed_data = os.listdir(COMPRESSED_DATA_PATH)
files_in_json_data = os.listdir(JSON_PATH)

In [35]:
len(files_in_compressed_data), len(files_in_json_data)

(1, 2)

In [36]:
# List every object stored under one sample user's prefix and print its S3 URI.
# The listing goes through awswrangler with the assumed-role session; an earlier
# boto3 paginator variant (list_objects_v2) was used here before.
file_list = wr.s3.list_objects("s3://mime-samples-production/users/50733733/", boto3_session=session)
for file in file_list:
    print(file)
# The same listing approach applies to date prefixes such as 'json/day=20250412/'.

s3://mime-samples-production/users/50733733/4b3544d1-837b-4995-87a6-5d9ad434f216.json
s3://mime-samples-production/users/50733733/c57823f7-4fe1-43a0-a231-ae0fb1a7e433.json
s3://mime-samples-production/users/50733733/cda86c6d-f6c3-474f-9644-29175d574016.json
s3://mime-samples-production/users/50733733/dafed507-8e22-4ad5-ba83-63e753d4778a.json


In [47]:
# Download every object listed in `file_list` to
# JSON_PATH/users/<user id>/<file name>.
for key in file_list:
    # key has the form s3://<bucket>/users/<user id>/<file name>, so after
    # splitting on '/' the user id is element 4 and the file name element 5.
    key_parts = key.split('/')
    user = key_parts[4]
    user_base_path = os.path.join(JSON_PATH, 'users', user)
    # makedirs also creates the intermediate 'users' folder, which os.mkdir
    # would not, and exist_ok keeps the cell re-runnable.
    os.makedirs(user_base_path, exist_ok=True)
    local_filename = os.path.join(user_base_path, key_parts[5])  # extract just the filename
    print(f"Downloading: {key} to {local_filename}")

    # Download file from S3 to local
    wr.s3.download(key, local_filename, boto3_session=session)
    print(f"Downloaded {local_filename} successfully.")

Downloading: s3://mime-samples-production/users/50733733/4b3544d1-837b-4995-87a6-5d9ad434f216.json to ./raw_emails_json/users/50733733/4b3544d1-837b-4995-87a6-5d9ad434f216.json
Downloaded ./raw_emails_json/users/50733733/4b3544d1-837b-4995-87a6-5d9ad434f216.json successfully.
Downloading: s3://mime-samples-production/users/50733733/c57823f7-4fe1-43a0-a231-ae0fb1a7e433.json to ./raw_emails_json/users/50733733/c57823f7-4fe1-43a0-a231-ae0fb1a7e433.json
Downloaded ./raw_emails_json/users/50733733/c57823f7-4fe1-43a0-a231-ae0fb1a7e433.json successfully.
Downloading: s3://mime-samples-production/users/50733733/cda86c6d-f6c3-474f-9644-29175d574016.json to ./raw_emails_json/users/50733733/cda86c6d-f6c3-474f-9644-29175d574016.json
Downloaded ./raw_emails_json/users/50733733/cda86c6d-f6c3-474f-9644-29175d574016.json successfully.
Downloading: s3://mime-samples-production/users/50733733/dafed507-8e22-4ad5-ba83-63e753d4778a.json to ./raw_emails_json/users/50733733/dafed507-8e22-4ad5-ba83-63e753d477

## S3 File Structure
1 page = 1000 files
1 file = .gz file --> unzip --> json 
1 json file ~= 2000 emails

json/day=20250412/<filename>.gz --> 1 JSON file --> multiple emails (each line = 1 email) ~ 2000

users/<userid>/<filename>.json --> single email in 1 line = 1 email

Suspended User  Suspension Date
<user_id>       <date>

Labeling rule: emails sent from (<date> - 3 days) through <date> (the suspension date) = Phish

- number of users
- emails per date

In [10]:
from preprocessor import PreProcessor

2025-04-22 15:12:40.276166: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
def process_file(file_path, date):
    """Parse one JSON-lines email dump into column lists, then delete the file.

    Every non-blank line of ``file_path`` is parsed as one JSON object that
    must provide ``raw_mime`` and an ``event.payload`` mapping. A line that
    cannot be parsed, or that misses a required key, is reported with
    ``print`` and skipped as a whole, so all returned lists always have the
    same length.

    Args:
        file_path: Path of the local JSON-lines file. The file is removed once
            it has been read.
        date: Label of the date folder being processed. Not used by the
            function; kept so existing ``starmap`` callers keep working.

    Returns:
        dict mapping each column name to a list with one entry per email that
        was parsed successfully.
    """
    columns = (
        'user_id', 'msg_id', 'sg_event_id', 'subject', 'mime',
        'email_from', 'email_to', 'email_date',
        'originating_ip', 'originating_ip_country', 'lang',
    )
    local_data = {name: [] for name in columns}

    # Open the GeoIP database once per file rather than once per email.
    with geoip2.database.Reader('./GeoIP2-Country.mmdb') as reader, \
            open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                j = json.loads(line)
                raw_mime = j["raw_mime"]
                payload = j["event"]["payload"]

                # Subject: decode RFC 2047 encoded words into plain text.
                raw_subject = payload["subject"]
                subject = str(make_header(decode_header(raw_subject))) if raw_subject else None

                # Country lookup is best effort: any lookup failure maps to 'N/A'.
                originating_ip = payload["originating_ip"]
                try:
                    country = reader.country(originating_ip).country.name
                except Exception:
                    country = 'N/A'

                # Language detection is best effort too, so a subject the
                # detector cannot classify does not drop the whole email.
                lang = 'N/A'
                if subject:
                    try:
                        lang = detect(subject)
                    except Exception:
                        lang = 'N/A'

                # Collect the complete row first; nothing is appended until
                # every field has been read successfully.
                row = {
                    'user_id': payload["userid"],
                    'msg_id': payload["msgid"],
                    'sg_event_id': payload["sg_event_id"],
                    'email_from': payload["email_from"],
                    'email_to': payload["email"],
                    'email_date': payload["date"],
                    'subject': subject,
                    'mime': raw_mime,
                    'originating_ip': originating_ip,
                    'originating_ip_country': country,
                    'lang': lang,
                }
            except Exception as e:
                print(f"Error in {file_path}: {str(e)}")
                continue

            for name, value in row.items():
                local_data[name].append(value)

    os.remove(file_path)
    return local_data

In [12]:
def process_date_folder(date_folder):
    """Run ``process_file`` on every file of a date folder and merge the results.

    Args:
        date_folder: Directory holding the JSON-lines files of one day. Its
            base name is passed to ``process_file`` as the date label.

    Returns:
        dict mapping each column name to the concatenation of that column
        across all files, or an empty dict when the folder contains no files.
    """
    date = os.path.basename(date_folder)
    files = [
        path
        for path in (os.path.join(date_folder, name) for name in os.listdir(date_folder))
        if os.path.isfile(path)
    ]

    # Nothing to process: skip the worker pool and avoid indexing results[0].
    if not files:
        return {}

    with Pool() as pool:
        results = pool.starmap(process_file, [(f, date) for f in files])

    # Aggregate the per-file column lists into one set of columns.
    combined = {k: [] for k in results[0]}
    for result in results:
        for k, v in result.items():
            combined[k].extend(v)
    return combined

In [13]:
from datetime import datetime, timedelta

def get_date_list(start_date_str, end_date_str):
    """Return every day from start to end, both inclusive, as 'YYYYMMDD' strings.

    Args:
        start_date_str: First day, formatted as 'YYYYMMDD'.
        end_date_str: Last day, formatted as 'YYYYMMDD'.

    Returns:
        List of 'YYYYMMDD' strings in ascending order; empty when the end date
        lies before the start date.
    """
    day_format = '%Y%m%d'
    first_day = datetime.strptime(start_date_str, day_format)
    last_day = datetime.strptime(end_date_str, day_format)

    # Both values sit at midnight, so the difference is a whole number of days.
    day_span = (last_day - first_day).days
    return [
        (first_day + timedelta(days=offset)).strftime(day_format)
        for offset in range(day_span + 1)
    ]

# Example usage
start = '20250320'
end = '20250404'
dates = get_date_list(start, end)

In [None]:

# Date-wise export: for each day (newest first) download the day's .gz
# archives, unzip them to JSON, parse them and write one CSV per day.
csv_output = 'csv_files/wo_label/date_wise/'
os.makedirs(csv_output, exist_ok=True)

for d in dates[::-1]:
    date_base_path = os.path.join(COMPRESSED_DATA_PATH, d)
    os.makedirs(date_base_path, exist_ok=True)

    json_date_base_path = os.path.join(JSON_PATH, 'dates', d)
    os.makedirs(json_date_base_path, exist_ok=True)

    # The trailing slash restricts the listing to exactly this day's prefix.
    location = f"s3://mime-samples-production/json/day={d}/"
    keys = wr.s3.list_objects(location, boto3_session=session)
    for key in keys:
        archive_name = key.split('/')[-1]  # extract just the filename
        local_filename = os.path.join(date_base_path, archive_name)
        print(f"Downloading: {key} to {local_filename}")

        # Download file from S3 to local
        wr.s3.download(key, local_filename, boto3_session=session)
        print(f"Downloaded {local_filename} successfully.")

        # Unzip to JSON, then drop the archive to save disk space.
        base_name, _ = os.path.splitext(archive_name)
        filepath_out = os.path.join(json_date_base_path, base_name + '.json')
        with gzip.open(local_filename, 'rb') as f_in, open(filepath_out, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.remove(local_filename)

        print(f"Unzipped file saved to: {filepath_out}")

    # A day without any JSON file would make the aggregation and the sort
    # below fail and abort the remaining dates, so skip it explicitly.
    has_json_files = any(
        os.path.isfile(os.path.join(json_date_base_path, name))
        for name in os.listdir(json_date_base_path)
    )
    if not has_json_files:
        print(f"No files found for {d}: {keys}")
        continue

    processed = process_date_folder(json_date_base_path)

    df = pd.DataFrame(processed)
    df = df.sort_values(['user_id', 'sg_event_id'])

    output_filename = os.path.join(csv_output, f'data_woLabel_{d}.csv')
    df.to_csv(output_filename, index=False)
    print(f"\n\nProcessed CSV with DF of size {len(df)} and saved to {output_filename}!\n")

Downloading: s3://mime-samples-production/json/day=20250404/email-content-raw-production-1-2025-04-04-00-10-43-2f4655c2-c8f4-4471-8e2b-6b9065d1e36c.gz to ./raw_emails/20250404/email-content-raw-production-1-2025-04-04-00-10-43-2f4655c2-c8f4-4471-8e2b-6b9065d1e36c.gz


In [36]:
df.head(), df.shape

(      user_id                                             msg_id  \
 236  35459534  -h2aAimDR4OLypvFDN1mZQ.recvd-7c497566f-fz9wr-1...   
 21   35459534  6iZejZ1CSVuIyMctJWs9YA.recvd-7c497566f-vxxrj-1...   
 286  35459534  8CDbYFszRgqdvmqNpixCQQ.recvd-7c497566f-b6nlg-1...   
 419  35459534  MCxFLDVbTLuyFHiEu4V-AA.recvd-7c497566f-gv7jj-1...   
 633  35459534  MFlmznL3Q6usAQriKNjyTQ.recvd-7c497566f-l7v8h-1...   
 
                                            sg_event_id  \
 236  cHJvY2Vzc2VkLTM1NDU5NTM0LS1oMmFBaW1EUjRPTHlwdk...   
 21   cHJvY2Vzc2VkLTM1NDU5NTM0LTZpWmVqWjFDU1Z1SXlNY3...   
 286  cHJvY2Vzc2VkLTM1NDU5NTM0LThDRGJZRnN6UmdxZHZtcU...   
 419  cHJvY2Vzc2VkLTM1NDU5NTM0LU1DeEZMRFZiVEx1eUZIaU...   
 633  cHJvY2Vzc2VkLTM1NDU5NTM0LU1GbG16bkwzUTZ1c0FRcm...   
 
                                                subject  \
 236  CoreTest message for - TestV3SendPersonalizati...   
 21   CoreTest message for - TestV3SendPersonalizati...   
 286  CoreTest message for - TestV3SendPersonalizat

## User-wise CSV generation

# Snowflake Connection to get Suspended Accounts

In [15]:
import snowflake.connector

# Settings for the Snowflake session used by the queries below. Login is
# delegated to the 'externalbrowser' authenticator.
snowflake_settings = {
    'user': 'monica.tare@sendgrid.com',
    'account': 'sendgrid.us-east-1',
    'authenticator': 'externalbrowser',
    'warehouse': 'TRUST_INSIGHTS_STAGING_WH',
    'database': 'TRUST_INSIGHTS_PROD',
    'role': 'MDS_UNMASKED_ACCESS_ROLLUP',  # alternative role: TRUST_INSIGHTS_DEVELOPER
    'schema': "trust_insights_prod_schema",
}
conn = snowflake.connector.connect(**snowflake_settings)

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...
Going to open: https://sendgrid.us-east-1.snowflakecomputing.com/console/login?login_name=monica.tare%40sendgrid.com&browser_mode_redirect_port=35815&proof_key=rIIhqMkUWFnINOswRacCKQ%2FApZh6ksijtf2Kz4x5UxY%3D to authenticate...
We were unable to open a browser window for you, please open the url above manually then paste the URL you are redirected to into the terminal.


Enter the URL the SSO URL redirected you to:  http://localhost:35815/?token=<REDACTED>

In [16]:
import pandas as pd
def query_snowflake(query):
    """Run ``query`` on the module-level ``conn`` and return the rows as a DataFrame.

    Args:
        query: SQL text to execute.

    Returns:
        pandas.DataFrame holding the result set, with every column name
        converted to lower case.
    """
    result = pd.read_sql(query, conn)
    # Lower-case the column names so later cells can use lower-case keys.
    lowered_names = [name.lower() for name in result.columns]
    result.columns = lowered_names
    return result

In [17]:
# Fetch the admin notes created during the last two months whose text contains
# the marker 'DIRMBS:RElSTUJT' (presumably the tag written when an account is
# suspended — TODO confirm with the data owners).
# NOTE: this rebinds `df`, which the date-wise cell also uses for its CSV frame.
query = """
select * from edo_dw_prod.acq_maildb.acq_hist_maildb_admin_note
where agent_notes like '%DIRMBS:RElSTUJT%'
    and created_at >= dateadd(month, -2, current_timestamp)
"""
df = query_snowflake(query)

  df = pd.read_sql(


In [18]:
df.columns # agent_notes, account column = user_id

Index(['etl_acq_row_num', 'etl_src_id', 'etl_src_abbrv', 'etl_src_table_name',
       'etl_src_start_date', 'etl_src_end_date', 'etl_uid_code', 'etl_md5',
       'etl_ins_run_id', 'etl_upd_run_id', 'etl_ins_date', 'etl_upd_date',
       'etl_del_date', 'id', 'user_id', 'agent_id', 'admin_notes_category_id',
       'package_id', 'package_name', 'parent_id', 'linked_note_id',
       'agent_notes', 'created_at', 'updated_at'],
      dtype='object')

In [20]:
users = df[['user_id', 'created_at']]

In [23]:
# Keep one row per user: the note with the latest `created_at`.
latest_note_index = df.groupby('user_id')['created_at'].idxmax()
suspended_users = df.loc[latest_note_index, ['user_id', 'created_at']]
suspended_users = suspended_users.reset_index(drop=True)

In [24]:
suspended_users

Unnamed: 0,user_id,created_at
0,3956193,2025-03-05 19:20:32
1,4828170,2025-03-13 08:18:54
2,7955388,2025-03-21 20:41:19
3,13503997,2025-02-24 13:25:19
4,23017485,2025-02-24 08:37:59
...,...,...
801,52181849,2025-04-22 15:24:32
802,52183024,2025-04-22 15:24:32
803,52184050,2025-04-22 16:34:13
804,52184741,2025-04-22 16:34:13


In [25]:
suspended_users.to_csv('csv_files/suspended_users_20250424.csv', index=False)

In [26]:
unique_users = list(suspended_users['user_id'].unique())

In [63]:
len(unique_users)

806

In [50]:
def process_json_lines(json_lines, s3_loc):
    """Turn already-parsed email records into column lists.

    Each record must provide ``raw_mime`` and an ``event.payload`` mapping. A
    record that misses a required key is reported with ``print`` and skipped
    as a whole, so all returned lists always have the same length.

    Args:
        json_lines: Iterable of dicts, one per email.
        s3_loc: Location the records were read from; stored unchanged in the
            ``s3_path`` column and used in error messages.

    Returns:
        dict mapping each column name to a list with one entry per record that
        was processed successfully.
    """
    columns = (
        'user_id', 'msg_id', 'sg_event_id', 'subject', 'mime',
        'email_from', 'email_to', 'email_date',
        'originating_ip', 'originating_ip_country', 'lang', 's3_path',
    )
    local_data = {name: [] for name in columns}

    # Open the GeoIP database once per call rather than once per email.
    with geoip2.database.Reader('./GeoIP2-Country.mmdb') as reader:
        for j in json_lines:
            try:
                raw_mime = j["raw_mime"]
                payload = j["event"]["payload"]

                # Subject: decode RFC 2047 encoded words into plain text.
                raw_subject = payload["subject"]
                subject = str(make_header(decode_header(raw_subject))) if raw_subject else None

                # Country lookup is best effort: any lookup failure maps to 'N/A'.
                originating_ip = payload["originating_ip"]
                try:
                    country = reader.country(originating_ip).country.name
                except Exception:
                    country = 'N/A'

                # Language detection is best effort too, so a subject the
                # detector cannot classify does not drop the whole email.
                lang = 'N/A'
                if subject:
                    try:
                        lang = detect(subject)
                    except Exception:
                        lang = 'N/A'

                # Collect the complete row first; nothing is appended until
                # every field has been read successfully.
                row = {
                    'user_id': payload["userid"],
                    'msg_id': payload["msgid"],
                    'sg_event_id': payload["sg_event_id"],
                    'email_from': payload["from"],
                    'email_to': payload["email"],
                    'email_date': payload["date"],
                    'subject': subject,
                    'mime': raw_mime,
                    'originating_ip': originating_ip,
                    'originating_ip_country': country,
                    'lang': lang,
                    's3_path': s3_loc,
                }
            except Exception as e:
                print(f"Error in {s3_loc}: {str(e)}")
                continue

            for name, value in row.items():
                local_data[name].append(value)

    return local_data

In [60]:
# User-wise export: read every sample file of each suspended user straight
# from S3, parse the emails and write one CSV per user. `user_email_count`
# records how many emails were exported for each user.
csv_output = 'csv_files/wo_label/user_wise/'
os.makedirs(csv_output, exist_ok=True)

user_email_count = defaultdict(int)

# One worker pool serves all users instead of a fresh pool per user.
with Pool() as pool:
    for user in unique_users:
        # Guards against the same id appearing twice in `unique_users`.
        if user in user_email_count:
            continue

        user_emails = defaultdict(list)
        # The trailing slash matters: without it the prefix 'users/123' would
        # also match 'users/1234/...' and mix other users' mail into this CSV.
        location = f"s3://mime-samples-production/users/{user}/"
        files = wr.s3.list_objects(location, boto3_session=session)
        for filename in files:
            print(f"Reading: {filename}")

            # Read file from S3 in memory
            key = '/'.join(filename.split('/')[3:])
            obj = s3.get_object(Bucket=bucket_name, Key=key)
            lines = obj['Body'].read().decode("utf-8").splitlines()
            # Blank lines carry no email and would make json.loads raise.
            json_objects = [json.loads(line) for line in lines if line.strip()]
            user_emails[key] = json_objects

        # Skip the pool round-trip entirely when the user has no files.
        results = []
        if user_emails:
            results = pool.starmap(
                process_json_lines,
                [(user_emails[s3_loc], s3_loc) for s3_loc in user_emails],
            )

        # Aggregate results
        if results:
            processed = {k: [] for k in results[0]}
            for result in results:
                for k, v in result.items():
                    processed[k].extend(v)
            df = pd.DataFrame(processed)
            df = df.sort_values(['sg_event_id'])

            output_filename = os.path.join(csv_output, f'data_woLabel_{user}.csv')
            df.to_csv(output_filename, index=False)
            user_email_count[user] = len(df)
            print(f"\n\nProcessed CSV with DF of size {len(df)} and saved to {output_filename}!\n")
        else:
            user_email_count[user] = 0
            print(f"No files found for {user}: {files}")

Reading: s3://mime-samples-production/users/3956193/05df04f3-b0c5-4249-bc8c-fede78cfd990.json
Reading: s3://mime-samples-production/users/3956193/05f9bcf0-7fe5-4cf0-9793-0deb09120de4.json
Reading: s3://mime-samples-production/users/3956193/06b648f5-4da8-4c48-b5ee-f09e0acbb32a.json
Reading: s3://mime-samples-production/users/3956193/0a010116-fa72-46ad-9e8b-86f61d917dfd.json
Reading: s3://mime-samples-production/users/3956193/0b3c405f-a6ea-42fc-9f30-6a2fe8c09048.json
Reading: s3://mime-samples-production/users/3956193/1311f2e2-cd28-4a77-bb59-2653f3dde436.json
Reading: s3://mime-samples-production/users/3956193/1561d429-37aa-4e92-85a0-8d761b706991.json
Reading: s3://mime-samples-production/users/3956193/161cdb2d-701c-466c-b278-900a0c471476.json
Reading: s3://mime-samples-production/users/3956193/191817d2-e513-4d5a-aaf7-75d3fcdb5f14.json
Reading: s3://mime-samples-production/users/3956193/1b3e8b76-4f84-4fa0-aa99-439942b52891.json
Reading: s3://mime-samples-production/users/3956193/1b920e66

In [61]:
len(user_email_count)

806

In [62]:
# One row per user with the number of emails exported for that user.
user_stats = pd.DataFrame().assign(
    user=list(user_email_count.keys()),
    email_count=list(user_email_count.values()),
)

In [65]:
user_stats.to_csv('csv_files/user_stats_20250424.csv', index=False)