In [1]:
import pickle

import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB, ComplementNB
from sklearn.svm import SVC

In [2]:
# Read the pickled dataset from disk into `data`.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# only load pickles that were produced by a trusted step of this project.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
with open('/Users/osiprovin/freelance_1/Spam Filter Project/Data/data.pkl', 'rb') as f:
    data = pickle.loads(f.read())

In [3]:
# Split the loaded dict into the train/test inputs and labels that the
# model cells below pass to fit() / predict() / the metric functions.
trainx, trainy, testx, testy = (
    data["trainx"],
    data["trainy"],
    data["testx"],
    data["testy"],
)

In [4]:
# Candidate classifiers to compare, keyed by the display name that the
# evaluation loop prints.
#
# Random Forest and Gradient Boosting are stochastic: without a fixed
# random_state their scores change on every run, so the metrics recorded in
# this notebook could not be reproduced. A shared seed makes Restart & Run All
# deterministic. The Naive Bayes variants are deterministic, and SVC only uses
# its random_state when probability estimates are enabled, so they are left at
# their defaults.
SEED = 42

models = {
    "Multinomial Naive Bayes": MultinomialNB(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(random_state=SEED),
    "Gradient Boosting": GradientBoostingClassifier(random_state=SEED),
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
}

In [5]:
# Fit each candidate on the training split, then print its accuracy and
# per-class precision/recall/F1 on the test split.
for name, model in models.items():
    model.fit(trainx, trainy)
    prediction = model.predict(testx)
    accuracy = accuracy_score(testy, prediction)
    # Some models never predict one of the classes, which leaves that class's
    # precision undefined. zero_division=0 reports it as 0.0 (the same number
    # the default prints) without emitting an UndefinedMetricWarning into the
    # cell output for every affected average.
    report = classification_report(testy, prediction, zero_division=0)

    print(f"Model: {name}")
    print(f"Accuracy: {accuracy}")
    print("Classification Report:")
    print(report)
    print("----------------------------------\n")

Model: Multinomial Naive Bayes
Accuracy: 0.9766816143497757
Classification Report:
              precision    recall  f1-score   support

          -1       0.99      0.99      0.99       978
           1       0.91      0.90      0.90       137

    accuracy                           0.98      1115
   macro avg       0.95      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115

----------------------------------



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Model: Support Vector Machine
Accuracy: 0.8771300448430494
Classification Report:
              precision    recall  f1-score   support

          -1       0.88      1.00      0.93       978
           1       0.00      0.00      0.00       137

    accuracy                           0.88      1115
   macro avg       0.44      0.50      0.47      1115
weighted avg       0.77      0.88      0.82      1115

----------------------------------

Model: Random Forest
Accuracy: 0.884304932735426
Classification Report:
              precision    recall  f1-score   support

          -1       0.88      1.00      0.94       978
           1       1.00      0.06      0.11       137

    accuracy                           0.88      1115
   macro avg       0.94      0.53      0.52      1115
weighted avg       0.90      0.88      0.84      1115

----------------------------------

Model: Gradient Boosting
Accuracy: 0.8977578475336323
Classification Report:
              precision    recall  f1-score

In [6]:
# Hyperparameter tuning for Multinomial Naive Bayes.
# (GridSearchCV, MultinomialNB and numpy are imported in the first cell, so
# they are not re-imported here.)

# Candidate values for the additive smoothing parameter `alpha`:
# 250 evenly spaced points in [0.001, 5].
param_grid = {
    'alpha': np.linspace(0.001, 5, 250)
}

# Base estimator whose `alpha` is being tuned
nb = MultinomialNB()

# 5-fold cross-validated search over the grid, ranking candidates by accuracy
grid_search = GridSearchCV(estimator=nb, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1)

# Run the search: every candidate is cross-validated on the training data.
# With GridSearchCV's default refit=True, the best candidate is then refit on
# the full training set and used by grid_search.predict().
grid_search.fit(trainx, trainy)

# Report the winning hyperparameters
print("Best parameters found by GridSearchCV:", grid_search.best_params_)


Fitting 5 folds for each of 250 candidates, totalling 1250 fits
Best parameters found by GridSearchCV: {'alpha': 1.1855020080321284}


In [7]:
# Score the tuned model on the held-out test split.
# grid_search.predict() uses the estimator refit with the best parameters
# found by the search (GridSearchCV's default refit behaviour).
prediction = grid_search.predict(testx)

accuracy, report = (
    accuracy_score(testy, prediction),
    classification_report(testy, prediction),
)

# One print call with a newline separator: same three output lines as before.
print(f"Accuracy: {accuracy}", "Classification Report:", report, sep="\n")

Accuracy: 0.97847533632287
Classification Report:
              precision    recall  f1-score   support

          -1       0.99      0.99      0.99       978
           1       0.91      0.91      0.91       137

    accuracy                           0.98      1115
   macro avg       0.95      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [8]:
# Persist the fitted GridSearchCV object (the whole search, not only its best
# estimator) so that it can be reloaded with pickle later.
# NOTE(review): absolute, machine-specific output path.
with open("/Users/osiprovin/freelance_1/Spam Filter Project/Data/model.pkl", "wb") as f:
    f.write(pickle.dumps(grid_search))