In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [21]:
df = pd.read_csv("spam.csv", encoding="latin-1")

df = df[['label', 'message']]
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.value_counts('label')

label
0    4825
1     747
Name: count, dtype: int64

In [22]:
X = df['message']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
vectorizer = TfidfVectorizer(stop_words='english',max_df=0.9)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [25]:
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM (Linear)": SVC(kernel='linear'),
    "SVM (RBF)": SVC(kernel='rbf'),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}


In [26]:
results = {}

for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    preds = model.predict(X_test_tfidf)
    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"{name}: {acc:.4f}")


KNN: 0.9130
Logistic Regression: 0.9525
SVM (Linear): 0.9794
SVM (RBF): 0.9767
Decision Tree: 0.9695
Naive Bayes: 0.9668
Random Forest: 0.9767
Gradient Boosting: 0.9587


In [27]:
sorted(results.items(), key=lambda x: x[1], reverse=True)


[('SVM (Linear)', 0.979372197309417),
 ('SVM (RBF)', 0.9766816143497757),
 ('Random Forest', 0.9766816143497757),
 ('Decision Tree', 0.9695067264573991),
 ('Naive Bayes', 0.9668161434977578),
 ('Gradient Boosting', 0.9587443946188341),
 ('Logistic Regression', 0.9524663677130045),
 ('KNN', 0.9130044843049328)]

In [28]:
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]

print("Best model:", best_model_name)
print(classification_report(y_test, best_model.predict(X_test_tfidf)))


Best model: SVM (Linear)
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.97      0.87      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

