In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Steps 1-2: Load the dataset and encode the labels.
# Only the 'v1' (class name) and 'v2' (message text) columns are kept; they are
# renamed to 'label' / 'message', and 'label' is replaced by integer codes.
# LabelEncoder numbers classes in sorted order, so ham -> 0 and spam -> 1
# (assumes 'ham' and 'spam' are the only values in 'v1' -- TODO confirm).
df = (
    pd.read_csv("spam.csv", encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'message'})
    .assign(label=lambda frame: LabelEncoder().fit_transform(frame['label']))
)

# Step 3: Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
# NOTE(review): the split is not stratified. If the classes are imbalanced,
# consider passing stratify=df['label'] (this would change the reported metrics).
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Step 4: TF-IDF features (English stop words removed) feeding a multinomial
# Naive Bayes classifier, bundled so raw text can be passed straight in.
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', MultinomialNB()),
])

# Step 5: Fit the vectorizer and the classifier on the training messages only.
model.fit(X_train, y_train)

# Step 6: Score the held-out messages. precision_score / recall_score use their
# default positive label (1), which is the 'spam' class under the encoding above.
y_pred = model.predict(X_test)
accuracy, precision, recall = [
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
]

print(
    f"✅ Accuracy: {accuracy*100:.2f}%",
    f"✅ Precision: {precision:.2f}",
    f"✅ Recall: {recall:.2f}",
    sep="\n",
)


✅ Accuracy: 96.68%
✅ Precision: 1.00
✅ Recall: 0.75


In [2]:
from sklearn.svm import SVC

# Build and train the SVM variant in one expression: same TF-IDF front end as
# the Naive Bayes pipeline, with a linear-kernel SVC as the classifier.
# Pipeline.fit returns the fitted pipeline itself, so the chained call leaves
# svm_model bound to the trained model.
svm_model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', SVC(kernel='linear')),  # linear kernel for sparse text features
]).fit(X_train, y_train)

# Predict on the same held-out messages used for the first model and report
# the same three metrics so the two classifiers can be compared directly.
y_pred_svm = svm_model.predict(X_test)
print(f"SVM Accuracy: {accuracy_score(y_test, y_pred_svm) * 100:.2f}%")
print(f"SVM Precision: {precision_score(y_test, y_pred_svm):.2f}")
print(f"SVM Recall: {recall_score(y_test, y_pred_svm):.2f}")


SVM Accuracy: 97.94%
SVM Precision: 0.97
SVM Recall: 0.87


In [3]:
import joblib

# Persist the fitted SVM pipeline (vectorizer + classifier) to disk so it can be
# reloaded later with joblib.load. The call is the cell's last expression, so
# the list of written file names is displayed as the cell output.
# NOTE: joblib files are pickle-based; only load them from trusted sources.
joblib.dump(value=svm_model, filename='svm_spam.pkl')


['svm_spam.pkl']