In [2]:
import awswrangler as wr
import boto3
import gzip
import json
import io
import pandas as pd
from tqdm import tqdm
from datetime import datetime, timedelta
from concurrent.futures import ProcessPoolExecutor, as_completed

In [3]:
# Raise pandas' display limits so long cell values and wide/tall frames are
# rendered in full instead of being truncated.
for _display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 500)

In [25]:
# Assume the delegate role through STS and build a boto3 session from the
# credentials it returns.
# NOTE(review): these credentials are temporary (AWS documents a one-hour
# default when DurationSeconds is not passed) -- confirm that long-running
# cells below cannot outlive them.
sts = boto3.Session().client("sts", region_name="us-east-1")
response = sts.assume_role(
    RoleArn="arn:aws:iam::375084544312:role/mimesample_delegate",
    RoleSessionName="mimesamples-access",
)

# Pull the three credential fields out of the STS response in one step.
ACCESS_KEY, SECRET_KEY, SESSION_TOKEN = (
    response["Credentials"][field]
    for field in ("AccessKeyId", "SecretAccessKey", "SessionToken")
)

session = boto3.Session(
    aws_access_key_id=ACCESS_KEY,
    aws_secret_access_key=SECRET_KEY,
    aws_session_token=SESSION_TOKEN,
)

In [26]:
# Low-level S3 client bound to the assumed-role session.
s3 = session.client(service_name="s3")
# Bucket that holds the MIME sample data.
bucket_name = "mime-samples-production"

## Getting the email count of suspended users

In [15]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

def process_user(user_id):
    """Count the S3 objects and the total JSON records stored for one user.

    Lists every object whose key starts with ``users/{user_id}`` in the
    ``mime-samples-production`` bucket, reads each one with
    ``wr.s3.read_json`` and adds up the row counts.

    Args:
        user_id: Identifier interpolated into the S3 prefix.

    Returns:
        A ``(user_id, file_count, record_count)`` tuple. If listing or reading
        fails for any reason, the error is printed together with the user id
        and ``(user_id, 0, 0)`` is returned. A failed user therefore looks the
        same as an empty one in the result; check the printed log to tell
        them apart.
    """
    try:
        # NOTE(review): the prefix has no trailing "/", so prefix matching may
        # also return keys of a user whose id merely starts with this id --
        # confirm against the actual key layout.
        s3_path = f's3://mime-samples-production/users/{user_id}'
        file_list = wr.s3.list_objects(s3_path, boto3_session=session)
        actual_email_count = sum(
            wr.s3.read_json(file, boto3_session=session).shape[0]
            for file in file_list
        )
        return (user_id, len(file_list), actual_email_count)
    except Exception as e:
        # Include the user id: with several workers running concurrently a
        # bare exception message cannot be traced back to its user.
        print(f"[Error] Failed to process user {user_id}: {e}")
        return (user_id, 0, 0)

# Fan process_user out over a thread pool and collect one result tuple per user.
# NOTE: `users` (an iterable of user ids) is not defined in the visible part of
# this notebook; it must be loaded before this cell is run.
results = []
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_user, user_id) for user_id in users]
    # Size the progress bar from the submitted futures so `users` may be any
    # iterable, not only a sequence that supports len().
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing users"):
        results.append(future.result())

# Column order follows process_user's return tuple: "email_count" is the number
# of S3 objects listed, "actual_email_count" the summed row count of those objects.
df = pd.DataFrame(results, columns=["user_id", "email_count", "actual_email_count"])


NameError: name 'users' is not defined

In [None]:
# Persist the per-user counts without the DataFrame index column.
df.to_csv(path_or_buf="suspended_user_email_count.csv", index=False)

## Getting the email count datewise

In [27]:
import awswrangler as wr
import boto3
import gzip
import pandas as pd
from datetime import datetime, timedelta
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm

# Inclusive date window to scan. Both bounds are currently the same day, so
# exactly one day (2025-03-17) is processed; move start_date / end_date apart
# to cover a longer range.
start_date = datetime.strptime("2025-03-17", "%Y-%m-%d")
end_date = datetime.strptime("2025-03-17", "%Y-%m-%d")
days = (end_date - start_date).days + 1

# One YYYYMMDD string per day in the window, oldest first.
dates = [
    (start_date + timedelta(days=offset)).strftime("%Y%m%d")
    for offset in range(days)
]

# Only count lines in each gzipped file (each line = one email)
def _count_gzip_lines(stream):
    """Return the number of lines in a gzip-compressed binary stream.

    The stream is closed before returning or raising: ``GzipFile.close()`` does
    not close the ``fileobj`` it wraps, so it is closed explicitly here.
    """
    try:
        with gzip.GzipFile(fileobj=stream) as gz:
            return sum(1 for _ in gz)
    finally:
        stream.close()


def process_day(day_str):
    """Count the lines of every gzipped object under one ``day=`` prefix.

    Args:
        day_str: Day in ``YYYYMMDD`` form, interpolated into the S3 prefix
            ``json/day={day_str}``.

    Returns:
        A DataFrame with one row per listed object and the columns ``date``,
        ``source_file`` and ``email_count``. Objects that could not be read get
        ``email_count`` 0 and the exception text in an ``error`` column, which
        is only present when at least one object failed. An empty DataFrame is
        returned when nothing is listed for the day or when the day as a whole
        fails; in the latter case the error is printed.
    """
    try:
        s3_path = f"s3://mime-samples-production/json/day={day_str}"
        file_list = wr.s3.list_objects(s3_path, boto3_session=session)

        if not file_list:
            return pd.DataFrame()  # Nothing for this day

        # Loop-invariant: parse the day once rather than once per object.
        day = pd.to_datetime(day_str).date()

        counts = []
        for key in file_list:
            row = {"date": day, "source_file": key}
            try:
                # "s3://bucket/some/key" -> ("bucket", "some/key")
                bucket, object_key = key.replace("s3://", "", 1).split("/", 1)
                obj = s3.get_object(Bucket=bucket, Key=object_key)
                row["email_count"] = _count_gzip_lines(obj["Body"])
            except Exception as e:
                # Keep going: record the failure on this object's row instead
                # of losing the whole day.
                row["email_count"] = 0
                row["error"] = str(e)
            counts.append(row)

        return pd.DataFrame(counts)

    except Exception as e:
        print(f"[Error] Failed to process day {day_str}: {e}")
        return pd.DataFrame()

# Run process_day for every date in parallel, one task per date.
# Parallelism is per date, so a single-date run keeps only one worker busy.
all_dfs = []
MAX_WORKERS = 4

with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Map each future back to its date so failures can be reported by date.
    futures = {executor.submit(process_day, day): day for day in dates}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing dates"):
        try:
            result = future.result()
            if not result.empty:
                all_dfs.append(result)
        except Exception as e:
            # Workers are processes, not threads; label the failure accordingly.
            print(f"[Worker Error] Date {futures[future]} failed: {e}")

# Final aggregation
if all_dfs:
    dfs = pd.concat(all_dfs, ignore_index=True)
else:
    # Keep the expected columns so a later groupby('date')['email_count']
    # yields an empty result instead of raising KeyError.
    dfs = pd.DataFrame(columns=["date", "source_file", "email_count"])


Processing dates: 100%|██████████| 1/1 [35:58<00:00, 2158.68s/it]


In [24]:
# Total email count per day, returned as a flat two-column frame.
(
    dfs
    .groupby('date')['email_count']
    .sum()
    .reset_index()
)

Unnamed: 0,date,email_count
0,2025-03-22,5040177
