In [1]:
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# Load the raw CSV, discard the three unnamed filler columns, give the two
# remaining columns working names, and remove rows with missing values.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
data = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=unused_columns)
    .set_axis(['label', 'text'], axis=1)
    .dropna()
)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [3]:
# TF-IDF text featurizer used by the model pipelines in the following cells;
# configured with scikit-learn's 'english' stop-word setting.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
)

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test-split metrics.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.
        X_train: Training inputs passed to ``model.fit``.
        y_train: Training labels passed to ``model.fit``.
        X_test: Held-out inputs passed to ``model.predict``.
        y_test: True labels the predictions are scored against.

    Returns:
        None. The accuracy and the classification report are printed.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Score and print one metric at a time so the accuracy line is emitted
    # before the report is computed.
    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy}")

    report = classification_report(y_test, predictions)
    print(report)

In [4]:
# Naive Bayes
# Pipeline fits its steps in place, so the vectorizer is cloned from the
# shared `tfidf_vectorizer` configuration: this pipeline then owns its own
# fitted vocabulary and cannot be altered by fitting another pipeline.
nb_pipeline = Pipeline([
    ('tfidf', clone(tfidf_vectorizer)),
    ('nb', MultinomialNB())
])

print("Naive Bayes Model:")
evaluate_model(nb_pipeline, X_train, y_train, X_test, y_test)

Naive Bayes Model:
Accuracy: 0.9668161434977578
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [5]:
# Logistic Regression
# Pipeline fits its steps in place, so the vectorizer is cloned from the
# shared `tfidf_vectorizer` configuration: this pipeline then owns its own
# fitted vocabulary and cannot be altered by fitting another pipeline.
lr_pipeline = Pipeline([
    ('tfidf', clone(tfidf_vectorizer)),
    # max_iter is raised to 1000 to give the solver room to converge.
    ('lr', LogisticRegression(max_iter=1000))
])

print("\nLogistic Regression Model:")
evaluate_model(lr_pipeline, X_train, y_train, X_test, y_test)


Logistic Regression Model:
Accuracy: 0.9524663677130045
              precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.97      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115



In [6]:
# Support Vector Machine model
# Pipeline fits its steps in place, so the vectorizer is cloned from the
# shared `tfidf_vectorizer` configuration: this pipeline then owns its own
# fitted vocabulary and cannot be altered by fitting another pipeline.
svm_pipeline = Pipeline([
    ('tfidf', clone(tfidf_vectorizer)),
    ('svm', SVC())
])

print("\nSupport Vector Machine Model:")
evaluate_model(svm_pipeline, X_train, y_train, X_test, y_test)



Support Vector Machine Model:
Accuracy: 0.9766816143497757
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       965
        spam       0.99      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115

