In [1]:
!pip install pandas scikit-learn boto3



In [12]:
import boto3
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib

In [13]:
# Step 1: Download Dataset from S3
# Source object (bucket + key) and the local path it will be copied to.
bucket_name, file_key = 'the-enron-email-dataset', 'combined_data.csv'
local_file = 'dataset.csv'

In [14]:
# Build an S3 client and copy the configured object to the local file.
s3_client = boto3.client('s3')
s3_client.download_file(
    Bucket=bucket_name,
    Key=file_key,
    Filename=local_file,
)

In [15]:
# Read the downloaded CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=local_file)

In [16]:
# Step 2: Preprocess the Data
def preprocess_text(text):
    """Normalize a raw text value before vectorization.

    The text is lowercased, runs of digits are removed, every character that
    is neither a word character nor whitespace is removed (underscores are
    kept because ``\\w`` matches them), and finally each run of whitespace is
    collapsed to a single space.

    Whitespace is collapsed *last* so that gaps left behind by removed digits
    or punctuation (e.g. ``"a - b"``) do not survive as double spaces; this
    also makes the function idempotent.

    Parameters
    ----------
    text : object
        The value to clean. Missing values (``None`` / NaN) yield an empty
        string; any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The normalized text. Leading/trailing single spaces are not stripped.
    """
    if not isinstance(text, str):
        # Empty CSV cells are loaded as NaN (a float), which has no .lower().
        if pd.isna(text):
            return ''
        text = str(text)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs (must run last)
    return text

In [17]:
# Clean every entry of the 'text' column with preprocess_text.
data['text'] = data['text'].map(preprocess_text)

In [18]:
# Step 3: Split the Data
# Features are the cleaned text column; targets are the 'label' column.
X, y = data['text'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Define vectorizers and models
# Every vectorizer below is paired with every model in the training loop.
vectorizers = {
    'CountVectorizer': CountVectorizer(),
    'TfidfVectorizer': TfidfVectorizer()
}

models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Random forests are stochastic (bootstrap + feature sampling); seed them
    # with the same value used for the train/test split so that reported
    # accuracies and the saved model are reproducible across runs.
    'Random Forest': RandomForestClassifier(random_state=42)
}

In [None]:
# Fit, score and persist one pipeline per (vectorizer, model) combination.
for vectorizer_name, vectorizer in vectorizers.items():
    for model_name, model in models.items():
        # Label and output filename are derived once and reused below.
        run_label = f'{vectorizer_name} + {model_name}'
        model_path = f'{vectorizer_name.replace(" ", "_")}_{model_name.replace(" ", "_")}_model.pkl'

        pipeline = Pipeline(steps=[
            ('vectorizer', vectorizer),
            ('classifier', model),
        ])

        # fit() returns the pipeline itself, so training and prediction chain.
        y_pred = pipeline.fit(X_train, y_train).predict(X_test)

        # Report hold-out accuracy for this combination.
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{run_label} Accuracy: {accuracy:.4f}')

        # Persist the fitted pipeline (vectorizer + classifier) to disk.
        joblib.dump(pipeline, model_path)
        print(f'{run_label} model saved as {model_path}')

CountVectorizer + Naive Bayes Accuracy: 0.9753
CountVectorizer + Naive Bayes model saved as CountVectorizer_Naive_Bayes_model.pkl
CountVectorizer + Logistic Regression Accuracy: 0.9859
CountVectorizer + Logistic Regression model saved as CountVectorizer_Logistic_Regression_model.pkl
TfidfVectorizer + Random Forest Accuracy: 0.9841
TfidfVectorizer + Random Forest model saved as TfidfVectorizer_Random_Forest_model.pkl
