In [1]:
# Install boto3 into the running kernel's environment. The explicit %pip magic
# does not depend on IPython's automagic setting.
# TODO(review): pin the version (boto3==<version>) so reruns are reproducible.
%pip install boto3

Note: you may need to restart the kernel to use updated packages.


In [6]:
import boto3
import joblib
import pandas as pd
import re
from io import StringIO
from email import policy
from email.parser import BytesParser
from io import BytesIO

In [3]:
# Text normalisation applied to every email before it is passed to the model
def preprocess_text(text):
    """Return a lower-cased copy of ``text`` with digits and punctuation removed.

    The substitutions run in a fixed order:

    1. runs of digits are deleted,
    2. runs of whitespace are collapsed to a single space,
    3. every character that is neither a word character nor whitespace
       is deleted.

    Because punctuation is stripped *after* whitespace is collapsed, the
    result may still contain consecutive spaces (``"a - b"`` -> ``"a  b"``).
    """
    # (pattern, replacement) pairs; the order matters, see the docstring.
    substitutions = (
        (r'\d+', ''),
        (r'\s+', ' '),
        (r'[^\w\s]', ''),
    )

    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [4]:
# Load the saved model
# The path has no directory component, so it is resolved against the kernel's
# current working directory.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load model files that come from a trusted source.
# Presumably a scikit-learn pipeline (CountVectorizer + LogisticRegression),
# judging by the file name -- TODO confirm.
model_filename = 'CountVectorizer_Logistic_Regression_model.pkl'  
model = joblib.load(model_filename)

In [5]:
# AWS S3 configuration
# Bucket the emails are listed in and downloaded from.
s3_bucket_name = 'spam-email-test-data'
# Key prefix passed to the object listing, so only keys that start with it are
# returned.
s3_folder = 'test-emails/'

In [7]:
# Create an S3 client
# No credentials or region are passed here; boto3 is expected to resolve them
# from its default configuration (environment variables, shared config files,
# attached role, ...) -- TODO confirm for the environment this runs in.
s3_client = boto3.client('s3')

In [8]:
# Function to read email content from S3
def read_email_from_s3(file_key):
    """Download one email object from S3 and return its body.

    The object is fetched from ``s3_bucket_name`` with the module-level
    ``s3_client`` and parsed with the ``email`` package's default policy.
    A ``text/plain`` body part is preferred; ``text/html`` is the fallback.

    Parameters
    ----------
    file_key : str
        Key of the email object inside ``s3_bucket_name``.

    Returns
    -------
    str
        Content of the selected body part, or ``''`` when the message has
        neither a plain-text nor an HTML body (for example a message that
        consists only of an attachment).
    """
    obj = s3_client.get_object(Bucket=s3_bucket_name, Key=file_key)
    email_bytes = obj['Body'].read()
    email_message = BytesParser(policy=policy.default).parsebytes(email_bytes)

    # get_body() returns None when no part matches the preference list;
    # calling .get_content() on that would raise AttributeError.
    body_part = email_message.get_body(preferencelist=('plain', 'html'))
    if body_part is None:
        return ''
    return body_part.get_content()

In [9]:
# List objects in the S3 folder and classify every .eml file found there.
# list_objects_v2 returns at most 1,000 keys per call, so a paginator is used
# to walk every page instead of silently stopping after the first one.
paginator = s3_client.get_paginator('list_objects_v2')
emails_processed = 0

for response in paginator.paginate(Bucket=s3_bucket_name, Prefix=s3_folder):
    # A page has no 'Contents' entry when nothing matches the prefix.
    for content in response.get('Contents', []):
        file_key = content['Key']
        if not file_key.endswith('.eml'):  # Ensure we're processing .eml files
            continue

        # Read the email from S3
        input_email = read_email_from_s3(file_key)

        # Preprocess the input email
        processed_email = preprocess_text(input_email)

        # Predict using the loaded model; predict() takes a sequence of
        # documents, hence the one-element list and the [0].
        prediction = model.predict([processed_email])[0]

        # Output the result (label 1 is treated as spam)
        label = 'Spam' if prediction == 1 else 'Not Spam'
        print(f'The email from {file_key} is classified as: {label}')
        emails_processed += 1

# Also report when the prefix holds objects but none of them is an .eml file
# (for example only a folder placeholder key).
if emails_processed == 0:
    print("No emails found in the specified S3 folder.")

The email from test-emails/Red White & Glow Sale!.eml is classified as: Spam
The email from test-emails/𝗬𝗼𝘂 𝗿𝗲𝗰𝗲𝗶𝘃𝗲𝗱 𝗮 𝗱𝗶𝗿𝗲𝗰𝘁 𝗱𝗲𝗽𝗼𝘀𝗶𝘁𝗲𝗱 𝗼𝗳 $18000,00.eml is classified as: Spam
The email from test-emails/🟢 𝗬𝗼𝘂 𝗿𝗲𝗰𝗲𝗶𝘃𝗲𝗱 𝗮 𝗱𝗶𝗿𝗲𝗰𝘁 𝗱𝗲𝗽𝗼𝘀𝗶𝘁𝗲𝗱 𝗼𝗳 💲 5 500.00 -- ID_#44267.eml is classified as: Spam
