In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix

In [2]:
def spam_detector(train_df, valid_df, test_df):
    """Train four text classifiers and predict with the one that misses fewest positives.

    A TF-IDF vectorizer (unigrams and bigrams, ``min_df=2``) is fitted on the
    training text only and then applied to the validation and test text.
    Each classifier is fitted on the training split and scored on the
    validation split by its number of false negatives, i.e. rows whose true
    label is 1 but which were predicted as 0. The classifier with the fewest
    false negatives is used to predict the test split; on a tie the earliest
    classifier wins, except that ``LinearSVC`` takes precedence over any
    classifier it ties with.

    Parameters
    ----------
    train_df, valid_df : pandas.DataFrame
        Must provide a ``'text'`` column and a ``'label'`` column that can be
        cast to ``int``. The confusion matrix is computed for labels 0 and 1.
    test_df : pandas.DataFrame
        Must provide a ``'text'`` column; its labels are never read.

    Returns
    -------
    dict
        * one entry per classifier name (``'LogisticRegression'``,
          ``'MultinomialNB'``, ``'DecisionTreeClassifier'``, ``'LinearSVC'``),
          each a dict with ``'ConfusionMatrix'`` (2x2 validation confusion
          matrix, rows = true label, columns = predicted label) and
          ``'Model'`` (the fitted estimator);
        * ``'BestClassifier'``: name of the selected classifier;
        * ``'TfidfVectorizer'``: the TF-IDF matrix of the test text (the
          transformed features, not the vectorizer object);
        * ``'Prediction'``: 1D array of predictions of the selected
          classifier on the test split.
    """
    vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2))
    # Fit the vocabulary on the training text only, then reuse it unchanged.
    X_train = vectorizer.fit_transform(train_df['text'])
    X_valid = vectorizer.transform(valid_df['text'])
    X_test = vectorizer.transform(test_df['text'])

    y_train = train_df['label'].astype(int)
    y_valid = valid_df['label'].astype(int)

    classifiers = {
        'LogisticRegression': LogisticRegression(random_state=0, max_iter=1000),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(random_state=0),
        # Seeded like the other estimators: the dual solver shuffles the data,
        # so an unseeded LinearSVC can change the selected model between runs.
        'LinearSVC': LinearSVC(random_state=0),
    }

    result = {}
    best_model_name = None
    best_model = None
    min_false_negative = float('inf')

    for name, clf in classifiers.items():
        clf.fit(X_train, y_train)
        y_pred_valid = clf.predict(X_valid)
        # labels=[0, 1] pins the matrix to 2x2 even if a class is absent from
        # the validation data, so cm[1, 0] is always a valid index.
        cm = confusion_matrix(y_valid, y_pred_valid, labels=[0, 1])

        result[name] = {"ConfusionMatrix": cm, "Model": clf}

        # Row 1 / column 0: true label 1 predicted as 0.
        false_negative = cm[1, 0]

        is_better = false_negative < min_false_negative
        # LinearSVC is preferred whenever it ties with the current best.
        wins_tie = false_negative == min_false_negative and name == 'LinearSVC'
        if is_better or wins_tie:
            min_false_negative = false_negative
            best_model_name = name
            best_model = clf

    y_pred_test = best_model.predict(X_test)  # 1D output

    # Per-classifier entries first (same order as `classifiers`), then the summary keys.
    results = {name: result[name] for name in classifiers}
    results["BestClassifier"] = best_model_name
    results["TfidfVectorizer"] = X_test
    results["Prediction"] = y_pred_test  # 1D output
    return results



In [3]:
# Read the train / validation / test splits from the local data/ directory.
train_df, valid_df, test_df = (
    pd.read_csv(f"data/{name}.csv") for name in ('train', 'valid', 'test')
)

In [4]:
# Fit all classifiers, select the best one on validation data and predict the test split.
output_result = spam_detector(
    train_df=train_df,
    valid_df=valid_df,
    test_df=test_df,
)

In [8]:
# Show the selected classifier's name, then its validation confusion matrix.
print(
    output_result['BestClassifier'],
    output_result[output_result['BestClassifier']]['ConfusionMatrix'],
    sep='\n',
)

LinearSVC
[[ 61   6]
 [  0 518]]
