In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
# 1. Load Dataset

df = pd.read_csv(
    "SpamClassifier.csv",
    engine='python',
    quotechar='"',
    on_bad_lines='skip'
)
df.columns = ['label', 'text']

In [4]:
# 2. Split Data

X = df['text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [5]:

# 3. Text Vectorization (TF-IDF)

vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [6]:
# 4. Define Models & Hyperparameters

models = {
    "SVM": SVC(),
    "RandomForest": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(max_iter=1000)
}

param_grids = {
    "SVM": {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    "RandomForest": {
        'n_estimators': [50, 100],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5]
    },
    "LogisticRegression": {
        'C': [0.1, 1, 10],
        'solver': ['lbfgs', 'liblinear']
    }
}

# Initialize dictionary to store best models
best_models = {}


In [7]:
# 5. GridSearchCV for each model
#GridSearchCV gets you the best hyperparameters for your problem
for model_name, model in models.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grids[model_name],
        cv=5,
        scoring='accuracy',
        verbose=1,
        n_jobs=-1
    )
    grid.fit(X_train_vec, y_train)  # Fit on training data
    best_models[model_name] = grid.best_estimator_  # Save the best model
    print(f"Best Parameters for {model_name}: {grid.best_params_}")
    print(f"Best CV Accuracy for {model_name}: {grid.best_score_}\n")


Running GridSearchCV for SVM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best CV Accuracy for SVM: 0.9798002445984508

Running GridSearchCV for RandomForest...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Parameters for RandomForest: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best CV Accuracy for RandomForest: 0.9753126682872573

Running GridSearchCV for LogisticRegression...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Parameters for LogisticRegression: {'C': 10, 'solver': 'lbfgs'}
Best CV Accuracy for LogisticRegression: 0.9721703834558844



In [8]:
# 6. Evaluate each model on test set

for model_name, model in best_models.items():
    y_pred = model.predict(X_test_vec)
    print(f"--- {model_name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), "\n")

--- SVM ---
Accuracy: 0.9856502242152466
Classification Report:
               precision    recall  f1-score   support

         ham       0.99      1.00      0.99       955
        spam       0.97      0.93      0.95       160

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Confusion Matrix:
 [[951   4]
 [ 12 148]] 

--- RandomForest ---
Accuracy: 0.9811659192825112
Classification Report:
               precision    recall  f1-score   support

         ham       0.98      1.00      0.99       955
        spam       0.99      0.88      0.93       160

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

Confusion Matrix:
 [[954   1]
 [ 20 140]] 

--- LogisticRegression ---
Accuracy: 0.9856502242152466
Classification Report:
               precision    recall  f1-s

In [9]:
# 7. Find the best model
best_model_name = max(best_models, key=lambda name: accuracy_score(y_test, best_models[name].predict(X_test_vec)))
print(f"Best model on test set: {best_model_name}")

Best model on test set: SVM
