# SPAM DETECTOR

In [1]:
import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

def spam_detector(train_df, valid_df, test_df):
    """Train four classifiers on TF-IDF text features and select one on validation data.

    One ``TfidfVectorizer`` is fitted on the training text only and then reused
    to transform the validation and test text, so no vocabulary or IDF
    statistics come from the held-out splits. Each candidate model is fitted on
    the training split and scored on the validation split.

    Model selection: the model with the smallest ``cm[0, 1]`` on the validation
    split wins, i.e. the fewest samples whose true label is 0 but were
    predicted as 1. When several models tie, the one listed first in the
    candidate dictionary is kept.

    Parameters
    ----------
    train_df, valid_df : DataFrame-like
        Must expose a ``"text"`` column and a ``"label"`` column. Confusion
        matrices are built with ``labels=[0, 1]``, so labels are expected to
        be 0/1.
    test_df : DataFrame-like
        Must expose a ``"text"`` column; no label column is read.

    Returns
    -------
    dict
        * ``"LogisticRegression"``, ``"MultinomialNB"``,
          ``"DecisionTreeClassifier"``, ``"LinearSVC"`` -- 2x2 validation
          confusion matrix for each model (rows = true label, columns =
          predicted label, both ordered 0 then 1).
        * ``"BestClassifier"`` -- name of the selected model.
        * ``"TfidfVectorizer"`` -- the TF-IDF matrix of the test text (the
          transformed features, not the vectorizer object).
        * ``"Prediction"`` -- the selected model's predictions on the test text.
        * ``"F1 Results"`` -- macro-averaged validation F1 per model, rounded
          to 4 decimals.
    """
    vectorizer = TfidfVectorizer(
        min_df=2,
        ngram_range=(1, 2),
        lowercase=True,
    )

    # Fit on the training text only; validation/test are transformed with the
    # vocabulary learned here.
    X_train = vectorizer.fit_transform(train_df["text"])
    X_val = vectorizer.transform(valid_df["text"])
    X_test = vectorizer.transform(test_df["text"])
    y_train = train_df["label"].values
    y_val = valid_df["label"].values

    # Every estimator that accepts a seed gets one so that repeated runs select
    # the same model and return the same predictions. LinearSVC was previously
    # left unseeded, unlike the other stochastic estimators here.
    models = {
        "LogisticRegression": LogisticRegression(random_state=0, max_iter=1000),
        "MultinomialNB": MultinomialNB(),
        "DecisionTreeClassifier": DecisionTreeClassifier(random_state=0),
        "LinearSVC": LinearSVC(random_state=0),
    }

    confusion_matrices = {}
    f1_scores = {}
    best_name, best_model, best_false_pos = None, None, None
    for name, clf in models.items():
        clf.fit(X_train, y_train)
        val_pred = clf.predict(X_val)

        cm = confusion_matrix(y_val, val_pred, labels=[0, 1])
        confusion_matrices[name] = cm
        f1_scores[name] = np.round(f1_score(y_val, val_pred, average="macro"), 4)

        # Row 0 / column 1: true label 0 predicted as 1.
        # NOTE(review): presumably label 1 means "spam", which would make this
        # the count of legitimate messages flagged as spam -- confirm.
        false_pos = int(cm[0, 1])

        # Strict "<" keeps the earliest model when counts are tied.
        if best_false_pos is None or false_pos < best_false_pos:
            best_false_pos = false_pos
            best_name = name
            best_model = clf

    test_pred = best_model.predict(X_test)

    # confusion_matrices was filled in the same order as `models`, so unpacking
    # it yields the four per-model keys in the original order.
    results = {
        **confusion_matrices,
        "BestClassifier": best_name,
        "TfidfVectorizer": X_test,
        "Prediction": test_pred,
        "F1 Results": f1_scores,
    }
    return results

In [2]:
# Folder holding train.csv, valid.csv and test.csv. Set the environment
# variable SPAM_DETECTOR_DATA_DIR to run the notebook on another machine; the
# original local path is kept as the fallback so existing runs are unaffected.
data_directory = os.environ.get(
    "SPAM_DETECTOR_DATA_DIR",
    r"C:\Users\sb013698\Desktop\ML Test\Datasets\Spam Detector",
)
train_df, valid_df, test_df = [
    pd.read_csv(os.path.join(data_directory, f"{split}.csv"))
    for split in ["train", "valid", "test"]
]

# Train all candidate models and report which one was selected on validation.
result_dict = spam_detector(train_df, valid_df, test_df)
print(f'Best Classifier: {result_dict["BestClassifier"]}')
print(f'F1 Scores: \n{result_dict["F1 Results"]}')

Best Classifier: LinearSVC
F1 Scores: 
{'LogisticRegression': 0.9193, 'MultinomialNB': 0.9193, 'DecisionTreeClassifier': 0.9194, 'LinearSVC': 0.9737}


# END