In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import classification_report, accuracy_score

# Load the SMS Spam Collection dataset
df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])

# Data preprocessing
df['spam'] = df['label'].apply(lambda x: 1 if x == 'spam' else 0)
X = df['message']
y = df['spam']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature extraction
vectorizer = CountVectorizer()
X_train_count = vectorizer.fit_transform(X_train)
X_test_count = vectorizer.transform(X_test)

# Define parameter grids for each model
param_grids = {
    'Gaussian Naive Bayes': {},
    'Multinomial Naive Bayes': {'alpha': [0.5, 1.0, 1.5, 2.0]},
    'Bernoulli Naive Bayes': {'alpha': [0.5, 1.0, 1.5, 2.0], 'binarize': [0.0, 0.5, 1.0]}
}

# Models without hyperparameters
base_models = {
    'Gaussian Naive Bayes': GaussianNB(),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Bernoulli Naive Bayes': BernoulliNB()
}
# Hyperparameter tuning and evaluation
for name, model in base_models.items():
    if name == 'Gaussian Naive Bayes':
        # Gaussian Naive Bayes does not have specific parameters to tune with GridSearchCV
        model.fit(X_train_count.toarray(), y_train)
        y_pred = model.predict(X_test_count.toarray())
    else:
        grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy')
        grid_search.fit(X_train_count, y_train)
        model = grid_search.best_estimator_
        y_pred = model.predict(X_test_count)

    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print(f"=== {name} ===")
    if name != 'Gaussian Naive Bayes':
        print("Best Parameters:", grid_search.best_params_)
    print("Accuracy:", accuracy)
    print(report)

=== Gaussian Naive Bayes ===
Accuracy: 0.9067264573991032
              precision    recall  f1-score   support

           0       0.99      0.90      0.94       966
           1       0.60      0.93      0.73       149

    accuracy                           0.91      1115
   macro avg       0.79      0.91      0.84      1115
weighted avg       0.94      0.91      0.91      1115

=== Multinomial Naive Bayes ===
Best Parameters: {'alpha': 1.0}
Accuracy: 0.9919282511210762
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       966
           1       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115

=== Bernoulli Naive Bayes ===
Best Parameters: {'alpha': 0.5, 'binarize': 0.0}
Accuracy: 0.9883408071748879
              precision    recall  f1-score   support

           0       0.99      1.0