In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc

# Load the Enron spam CSV and immediately keep a 10% random sample to reduce
# its size; the fixed random_state makes the sample repeatable.
df = pd.read_csv('enron_spam_data.csv').sample(frac=0.1, random_state=42)

In [2]:
import re

def regexClean(message_content):
    """Normalise a raw message body before further text cleaning.

    Steps, in order: URLs (http/https/ftp) are replaced by a single space,
    e-mail addresses and HTML-style tags are removed, newlines become
    spaces, and the result is lower-cased.

    Parameters
    ----------
    message_content : object
        Raw message. ``None`` and float NaN (a missing cell in a DataFrame
        column) are treated as an empty message instead of being turned
        into the literal tokens "none" / "nan"; any other non-string value
        is converted with ``str``.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # NaN is the only float that is not equal to itself.
    if message_content is None or (
        isinstance(message_content, float) and message_content != message_content
    ):
        return ''
    message_content = str(message_content)
    # Consume the whole URL up to whitespace or an angle bracket, so that
    # characters such as '-', '?', '=' do not leave URL fragments behind.
    message_content = re.sub(r'(?:https?|ftp)://[^\s<>]+', ' ', message_content)
    # In a raw string a literal dot is written '\.'; '\\.' would require a
    # backslash in the address and therefore never match a real e-mail.
    # Dotted sub-domains and TLDs longer than three letters are accepted.
    message_content = re.sub(
        r'[A-Za-z0-9._-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}',
        '',
        message_content,
    )
    message_content = re.sub(r'<[^<]+?>', '', message_content)
    message_content = message_content.replace('\n', ' ')
    return message_content.lower()

# Keep the raw 'Message' column and store the regex-normalised text next to it.
df['Cleaned_Message'] = df['Message'].map(regexClean)

In [5]:
from string import punctuation

def punct_removal(message_content):
    """Strip digits, punctuation and any other non-letter symbols.

    Runs of digits are replaced by a single space, ASCII punctuation
    (``string.punctuation``) is deleted, and every remaining character that
    is neither an ASCII letter nor whitespace is replaced by a space.
    Whitespace is kept so that word boundaries survive for the later
    ``split()``-based tokenisation.

    Parameters
    ----------
    message_content : object
        Text to clean; non-string values are converted with ``str``.

    Returns
    -------
    str
        Text containing only ASCII letters and whitespace.
    """
    message_content = str(message_content)
    message_content = re.sub(r'\d+', ' ', message_content)
    message_content = message_content.translate(str.maketrans('', '', punctuation))
    # str.replace() treats its argument as literal text, so a regex pattern
    # passed to it never matches; re.sub is required. The class excludes \s
    # on purpose: removing whitespace would glue all words together.
    message_content = re.sub(r'[^a-zA-Z\s]', ' ', message_content)
    return message_content

# assign() returns a new frame, so `df` itself is left untouched while the
# copy gets its 'Cleaned_Message' column replaced by the punctuation- and
# digit-free text.
df_with_punct_numb_removed = df.assign(
    Cleaned_Message=df['Cleaned_Message'].apply(punct_removal)
)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

def stem_stopword_rem(message_content):
    """Drop English stop words and stem the remaining tokens.

    The text is split on whitespace, tokens found in NLTK's English
    stop-word list are discarded, and each surviving token is reduced to its
    Snowball stem. The NLTK 'stopwords' corpus must be available locally.

    Parameters
    ----------
    message_content : object
        Text to process; non-string values are converted with ``str``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    message_content = str(message_content)
    # A set gives constant-time membership tests; with the plain list every
    # token would trigger a linear scan over all stop words.
    stop_words = set(stopwords.words('english'))
    stemmer = SnowballStemmer('english')
    # NOTE(review): tokens are compared verbatim, so stop words spelled with
    # an apostrophe presumably no longer match if punctuation was stripped
    # earlier - confirm whether that matters for this pipeline.
    return ' '.join(
        stemmer.stem(eachword)
        for eachword in message_content.split()
        if eachword not in stop_words
    )

# Derive a new frame whose 'Cleaned_Message' holds the stop-word-free,
# stemmed text; the punctuation-free frame itself is not modified.
df_stem_stopword_removed = df_with_punct_numb_removed.assign(
    Cleaned_Message=df_with_punct_numb_removed['Cleaned_Message'].apply(stem_stopword_rem)
)
display(df_stem_stopword_removed)

In [None]:
# Turn the cleaned messages into TF-IDF features, capped at 3000 vocabulary
# terms, and materialise the sparse result as a dense array.
# NOTE(review): the vectorizer is fitted on every row, i.e. before the
# train/test split performed further down, so its vocabulary and IDF weights
# also reflect the test rows - consider fitting on the training rows only.
vectorizer = TfidfVectorizer(max_features=3000)
cleaned_messages = df_stem_stopword_removed['Cleaned_Message']
X = vectorizer.fit_transform(cleaned_messages).toarray()

# One column per vocabulary term makes the matrix easier to inspect.
feature_columns = vectorizer.get_feature_names_out()
vectorized_df = pd.DataFrame(X, columns=feature_columns)
print(vectorized_df.head())

In [None]:
# Define the target variable (row order matches the rows X was built from)
y = df['Spam/Ham']

# Split the data into training and testing sets: 80% train / 20% test,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for Grid Search.
# 'auto' is intentionally not listed for max_features: scikit-learn has
# deprecated and then removed it for RandomForestClassifier, and its
# documentation describes it as equivalent to 'sqrt' for classifiers, so it
# would only add redundant (or, on recent releases, failing) fits.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy']
}

# Perform Grid Search: 3-fold cross-validation over every combination
# (3 * 2 * 5 * 2 = 60 candidates, 180 fits), using all available cores.
model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(f"Best Parameters: {best_params}")

# Predict on the test set with the best model
y_pred = best_model.predict(X_test)

In [None]:
# Score the test-set predictions: overall accuracy first, then the
# per-class precision / recall / F1 table.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

In [None]:
# Confusion Matrix
def plot_confusion_matrix(y_test, y_pred, class_labels=None, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw the confusion matrix of ``y_pred`` against ``y_test`` as a heat map.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    class_labels : sequence of str, optional
        Tick labels for the axes, one per class. They should be given in the
        order ``confusion_matrix`` arranges the classes (sorted label
        order). Defaults to ``['Ham', 'Spam']``.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colour map used for the cells.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    conf_matrix = confusion_matrix(y_test, y_pred)
    # Honour the caller's labels; fall back to the previous fixed pair.
    labels = ['Ham', 'Spam'] if class_labels is None else list(class_labels)
    tick_marks = list(range(len(labels)))

    plt.figure(figsize=(10, 7))
    plt.imshow(conf_matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)

    # Write each count into its cell; switch to white text on dark cells so
    # the number stays readable.
    thresh = conf_matrix.max() / 2.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            plt.text(j, i, format(conf_matrix[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if conf_matrix[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()
    
# class_labels is left at its default (None).
plot_confusion_matrix(y_test, y_pred, title='Random Trees on stemmed data')

In [None]:
# ROC Curve
# Column 1 of predict_proba is used as the score for the 'spam' class.
# NOTE(review): this assumes 'spam' is the second entry of
# best_model.classes_ - confirm against the label values.
y_prob = best_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob, pos_label='spam')
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
importances = best_model.feature_importances_
# Positions of the ten largest importances, in ascending order so the
# largest one ends up as the last bar.
indices = sorted(range(len(importances)), key=importances.__getitem__)[-10:]  # Top 10 features
feature_names = vectorizer.get_feature_names_out()
bar_positions = range(len(indices))

plt.figure(figsize=(10, 7))
plt.barh(bar_positions, [importances[idx] for idx in indices], align='center')
plt.yticks(bar_positions, [feature_names[idx] for idx in indices])
plt.xlabel('Relative Importance')
plt.title('Top 10 Feature Importances')
plt.show()