In [49]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [50]:
# Load the SMS dataset; the file is decoded as latin-1.
df = pd.read_csv("spam.csv", encoding="latin-1")

# Keep only the two columns used downstream. Taking an explicit copy makes `df`
# an independent frame, so later column assignments (such as re-encoding
# `label`) modify it directly instead of writing into a column-subset selection
# of the loaded frame, which can raise pandas' SettingWithCopyWarning.
df = df[['label', 'message']].copy()
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [51]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# The mapping is skipped when the column is already integer-typed, so re-running
# this cell does not turn the 0/1 codes into NaN (Series.map yields NaN for any
# value that is not a key of the mapping).
if not pd.api.types.is_integer_dtype(df['label']):
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.value_counts('label')

label
0    4825
1     747
Name: count, dtype: int64

In [52]:
# Features are the raw message text; the target is the encoded label column.
X, y = df['message'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [53]:
# TF-IDF features with English stop words removed and max_df set to 0.9.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.9,
)

# The vectorizer is fitted on the training messages only; the test messages are
# only transformed, so nothing about the test set influences the fitted state.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [54]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [55]:
# Candidate classifiers keyed by display name. Insertion order is the order in
# which the models are later fitted and reported.
models = dict([
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    # Both SVMs use balanced class weights.
    ("SVM (Linear)", SVC(kernel='linear', class_weight='balanced')),
    ("SVM (RBF)", SVC(kernel='rbf', class_weight='balanced')),
    # Tree-based models are seeded so repeated runs give the same fit.
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Naive Bayes", MultinomialNB()),
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
])


In [56]:
# Test-set accuracy per model name.
results = {}

for name, model in models.items():
    # Train on the TF-IDF training matrix, then predict the held-out matrix
    # (fit returns the fitted estimator, so the calls can be chained).
    preds = model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    acc = accuracy_score(y_test, preds)
    results[name] = acc
    print(f"{name}: {acc:.4f}")


KNN: 0.9130
Logistic Regression: 0.9525
SVM (Linear): 0.9803
SVM (RBF): 0.9767
Decision Tree: 0.9695
Naive Bayes: 0.9668
Random Forest: 0.9767
Gradient Boosting: 0.9587


In [57]:
# Leaderboard: (model name, accuracy) pairs, highest accuracy first.
sorted(
    results.items(),
    key=lambda pair: pair[1],
    reverse=True,
)


[('SVM (Linear)', 0.9802690582959641),
 ('SVM (RBF)', 0.9766816143497757),
 ('Random Forest', 0.9766816143497757),
 ('Decision Tree', 0.9695067264573991),
 ('Naive Bayes', 0.9668161434977578),
 ('Gradient Boosting', 0.9587443946188341),
 ('Logistic Regression', 0.9524663677130045),
 ('KNN', 0.9130044843049328)]

In [58]:
# Select the model name with the highest accuracy recorded in `results`
# and fetch the corresponding fitted estimator.
best_model_name = max(results, key=lambda model_name: results[model_name])
best_model = models[best_model_name]

print("Best model:", best_model_name)
# Per-class precision / recall / F1 of the selected model on the test set.
print(
    classification_report(
        y_test,
        best_model.predict(X_test_tfidf),
    )
)


Best model: SVM (Linear)
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.95      0.90      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [59]:
    # NOTE(review): this cell is uniformly indented by four spaces; the
    # indentation is preserved here so the cell is replaced in place.
    from sklearn.metrics import (
        classification_report,
        confusion_matrix,
    )

    # Look up the linear SVM by name and predict the held-out set.
    svm_linear = models["SVM (Linear)"]
    preds = svm_linear.predict(X_test_tfidf)

    # Per-class metrics first, then the confusion matrix for the same predictions.
    print(classification_report(y_test, preds))
    print(confusion_matrix(y_test, preds))


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       965
           1       0.95      0.90      0.92       150

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

[[958   7]
 [ 15 135]]


In [60]:
# Vocabulary terms learned by the TF-IDF vectorizer.
feature_names = vectorizer.get_feature_names_out()
# Densify the linear SVM's coefficient matrix and take its first row as the
# per-term weight vector.
coefficients = svm_linear.coef_.toarray()[0]

# argsort is ascending: the last ten indices carry the largest weights (spam
# side, strongest term last) and the first ten the smallest weights (ham side,
# strongest term first).
weight_order = np.argsort(coefficients)
top_spam = weight_order[-10:]
top_ham = weight_order[:10]

print("Top spam words:", feature_names[top_spam], sep="\n")

print("\nTop ham words:", feature_names[top_ham], sep="\n")


Top spam words:
['service' 'ringtone' '88066' 'www' 'claim' '07090201529' 'mobile' 'txt'
 '50' 'uk']

Top ham words:
['sir' 'way' 'happy' 'later' 'home' 'ok' 'lt' 'hey' 'said' 'll']


In [61]:
# Re-bind the linear SVM (fitted in the model comparison loop) so this cell
# states explicitly which model the helper below uses.
svm_linear = models["SVM (Linear)"]

def test_message(text):
    vec = vectorizer.transform([text])
    pred = svm_linear.predict(vec)[0]
    return "Spam" if pred == 1 else "Ham"

# Smoke-test the helper on a prize-announcement style message.
test_message("Congratulations! You have won a free iPhone")


'Spam'

In [62]:
# An everyday conversational message, for contrast with the promotional examples.
test_message("Hey, are you coming to class tomorrow?")

'Ham'

In [63]:
# A short message with urgency / claim wording.
test_message("URGENT! Claim your reward now")

'Spam'

In [64]:
# Two bare tokens only; 'ringtone' is one of the top spam-weighted terms printed earlier.
test_message("http ringtone")

'Spam'

In [65]:
# Re-display the class balance (same call as right after the label encoding).
df.value_counts('label')


label
0    4825
1     747
Name: count, dtype: int64

In [66]:
# Signed decision score of the linear SVM for one message.
svm_linear.decision_function(vectorizer.transform(["URGENT! Claim your reward now"]))


array([1.43609847])

In [68]:
# Signed decision scores of the linear SVM for every test message.
scores = svm_linear.decision_function(X_test_tfidf)

# Positions of the ten test rows whose scores are closest to zero, i.e. the
# predictions with the smallest margin.
uncertain_idx = np.abs(scores).argsort()[:10]

# Show those messages next to their score and true label.
pd.DataFrame(
    {
        "message": X_test.iloc[uncertain_idx],
        "score": scores[uncertain_idx],
        "actual": y_test.iloc[uncertain_idx],
    }
)


Unnamed: 0,message,score,actual
1044,We know someone who you know that fancies you....,-0.022883,1
4904,Warner Village 83118 C Colin Farrell in SWAT t...,-0.023001,1
4071,Loans for any purpose even if you have Bad Cre...,-0.065629,1
3758,"GOD ASKED, \What is forgiveness?\"" A little ch...",-0.079443,0
5120,PRIVATE! Your 2003 Account Statement for 078,0.08818,1
416,Alright i have a new goal now,0.115075,0
3195,"I av a new number, . Wil u only use this one,ta.",-0.127968,0
683,Hi I'm sue. I am 20 years old and work as a la...,0.134368,1
4147,Please call Amanda with regard to renewing or ...,-0.153762,1
1268,Can U get 2 phone NOW? I wanna chat 2 set up m...,0.169264,1


array([[0.3541289, 0.6458711]])