In [1]:
import boto3
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import joblib 

In [2]:
# Step 1: Download the preprocessed dataset from S3
bucket_name = 'the-enron-email-dataset'  
file_key = 'preprocessed_data.csv'
local_file = 'preprocessed_data.csv'

In [3]:
# Create an S3 client
s3_client = boto3.client('s3')
s3_client.download_file(bucket_name, file_key, local_file)

# Load the preprocessed dataset
data = pd.read_csv(local_file)

In [4]:
# Step 2: Handle Missing Values
# Drop rows with missing text or label values
data = data.dropna(subset=['text', 'label'])

# Ensure 'text' column is of string type
data['text'] = data['text'].astype(str)

In [5]:
# Step 2: Prepare the data
X = data['text']
y = data['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Step 3: Define the pipeline with different vectorizers for different models
pipelines = {
    'Naive Bayes': Pipeline([
        ('vect', CountVectorizer()),
        ('clf', MultinomialNB())
    ]),
    'Logistic Regression': Pipeline([
        ('vect', CountVectorizer()),  
        ('clf', LogisticRegression(max_iter=1000))  
    ]),
    'Random Forest': Pipeline([
        ('vect', TfidfVectorizer()),  
        ('clf', RandomForestClassifier())
    ])
}


In [7]:
# Step 5: Train and evaluate each model
results = {}
for name, pipeline in pipelines.items():
    print(f"Training {name}...")
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    
    results[name] = {
        'accuracy': accuracy,
        'classification_report': classification_rep
    }
    
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_rep)

Training Naive Bayes...
Results for Naive Bayes:
Accuracy: 0.9735
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      7947
           1       0.98      0.96      0.97      8742

    accuracy                           0.97     16689
   macro avg       0.97      0.97      0.97     16689
weighted avg       0.97      0.97      0.97     16689

Training Logistic Regression...
Results for Logistic Regression:
Accuracy: 0.9844
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      7947
           1       0.98      0.99      0.99      8742

    accuracy                           0.98     16689
   macro avg       0.98      0.98      0.98     16689
weighted avg       0.98      0.98      0.98     16689

Training Random Forest...
Results for Random Forest:
Accuracy: 0.9808
Classification Report:
              precision    recall  f1-score   support

    

In [18]:
# Initialize S3 client
s3_client = boto3.client('s3')

# Define the models to save and upload
models = {
    'Logistic Regression': pipeline_logistic_regression,
    'Naive Bayes': pipeline_naive_bayes,
    'Random Forest': pipeline_random_forest
}

# Local paths for saving the models
local_model_paths = {
    'Logistic Regression': 'CountVectorizer_Logistic_Regression_model.pkl',
    'Naive Bayes': 'CountVectorizer_Naive_Bayes_model.pkl',
    'Random Forest': 'TfidfVectorizer_Random_Forest_model.pkl'
}

# S3 paths for uploading the models
s3_model_paths = {
    'Logistic Regression': 'models/CountVectorizer_Logistic_Regression_model.pkl',
    'Naive Bayes': 'models/CountVectorizer_Naive_Bayes_model.pkl',
    'Random Forest': 'models/TfidfVectorizer_Random_Forest_model.pkl'
}

# Save and upload each model
for model_name, model in models.items():
    local_path = local_model_paths[model_name]
    s3_path = s3_model_paths[model_name]

    # Save the model locally
    try:
        joblib.dump(model, local_path)
        print(f'{model_name} model saved locally as {local_path}')
    except Exception as e:
        print(f'Error saving {model_name} model locally: {e}')

    # Upload the model to S3
    try:
        s3_client.upload_file(local_path, bucket_name, s3_path)
        print(f'{local_path} has been uploaded to s3://{bucket_name}/{s3_path}')
    except Exception as e:
        print(f'Error uploading {local_path} to S3: {e}')

NameError: name 'pipeline_logistic_regression' is not defined