# Linear SVM from Scratch for Spam Detection
We implement a linear Support Vector Machine in its primal form with the hinge loss. The optimisation goal is to minimise `lambda * ||w||^2` plus the hinge loss `max(0, 1 - y * (w·x + b))`, i.e. to keep the weights small while keeping examples on the correct side of the margin. We use stochastic (per-sample) sub-gradient descent: for each training point, if it violates the margin (`y * (w·x + b) < 1`) we nudge the weight vector and bias toward the correct label; otherwise we only apply weight decay. Although this implementation is simple, it demonstrates how SVMs emphasise the hardest-to-classify samples.

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Toy corpus as (message, label) pairs; label 1 is reported as "spam" and 0 as
# "normal" by the evaluation cell.
labelled_messages = [
    ("limited time offer claim your prize now", 1),
    ("meeting reminder for project update", 0),
    ("win cash by entering free lottery", 1),
    ("family dinner plans for saturday", 0),
    ("exclusive deal just for you click", 1),
    ("invoice attached for last month", 0),
    ("cheap meds available order today", 1),
    ("team outing scheduled at 5pm", 0),
    ("congratulations you have won a voucher", 1),
    ("please review the attached report", 0),
]
records = pd.DataFrame(labelled_messages, columns=["text", "label"])

# Reproducible split: sample 70% of the rows for training and keep every row
# that was not sampled (in its original order) for testing.
train_df = records.sample(frac=0.7, random_state=1)
is_held_out = ~records.index.isin(train_df.index)
test_df = records.loc[is_held_out]

# The vocabulary comes from the training messages only, so tokens that appear
# solely in the test split have no index.
token_sets = train_df["text"].str.lower().str.split().map(set).tolist()
vocab = sorted({token for tokens in token_sets for token in tokens})
index_map = dict(zip(vocab, range(len(vocab))))

def vectorize(text):
    """Return the bag-of-words count vector of ``text`` over the vocabulary.

    The text is lower-cased and split on whitespace. Every token present in
    the module-level ``index_map`` increments its slot in a float vector of
    length ``len(vocab)``; tokens outside the vocabulary are skipped.
    """
    counts = np.zeros(len(vocab))
    slots = (index_map.get(word) for word in text.lower().split())
    for slot in slots:
        if slot is None:
            # Out-of-vocabulary token: contributes nothing.
            continue
        counts[slot] += 1
    return counts

# Stack one bag-of-words row per message into a 2-D feature matrix.
X_train = np.vstack([vectorize(message) for message in train_df["text"]])
X_test = np.vstack([vectorize(message) for message in test_df["text"]])

# Recode the labels for the SVM: 1 stays +1, every other value becomes -1.
y_train = np.where(train_df["label"].to_numpy() == 1, 1, -1).astype(np.int64)
y_test = np.where(test_df["label"].to_numpy() == 1, 1, -1).astype(np.int64)

In [None]:
def train_linear_svm(X, y, learning_rate=0.01, lambda_reg=0.01, epochs=40):
    """Fit a linear SVM by per-sample sub-gradient descent on the hinge loss.

    For each sample the objective is
    ``lambda_reg * ||w||^2 + max(0, 1 - y * (w . x + b))``. Samples are
    visited in the order given, once per epoch, starting from zero weights
    and zero bias.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). Lists and other
            array-likes are converted to a float array.
        y: Array-like of n_samples labels, each -1 or +1. Column vectors are
            flattened.
        learning_rate: Step size applied to every sub-gradient update.
        lambda_reg: L2 regularisation strength.
        epochs: Number of full passes over the data.

    Returns:
        A ``(weights, bias)`` tuple: ``weights`` is a 1-D float array of
        length n_features and ``bias`` is a scalar.

    Raises:
        ValueError: If ``X`` is not 2-D, if ``X`` and ``y`` disagree on the
            number of samples, or if ``y`` holds a value other than -1 or +1.
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y).ravel()

    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (n_samples, n_features), got {X.ndim}-D")
    if y.shape[0] != X.shape[0]:
        # zip() below would otherwise silently drop the surplus samples.
        raise ValueError(
            f"X and y disagree on sample count: {X.shape[0]} vs {y.shape[0]}"
        )
    if not np.isin(y, (-1, 1)).all():
        # With 0/1 labels the 0 class would contribute no gradient at all.
        raise ValueError("y must contain only -1 and +1 labels")

    weights = np.zeros(X.shape[1])
    bias = 0.0

    for _ in range(epochs):
        for xi, yi in zip(X, y):
            margin = yi * (np.dot(xi, weights) + bias)
            if margin >= 1:
                # Correct side of the margin: hinge loss is zero, so only the
                # L2 term contributes (weight decay).
                weights -= learning_rate * (2 * lambda_reg * weights)
            else:
                # Margin violated (inside the margin or misclassified): step
                # along yi * xi so the sample's score moves toward its label.
                weights -= learning_rate * (2 * lambda_reg * weights - yi * xi)
                bias += learning_rate * yi
    return weights, bias

# Fit the SVM on the training split with the default hyper-parameters.
weights, bias = train_linear_svm(X=X_train, y=y_train)

In [None]:
def predict_linear_svm(X, weights, bias):
    """Classify each row of ``X`` by the sign of its linear decision value.

    Returns an array holding 1 where ``X @ weights + bias`` is non-negative
    and -1 everywhere else, so a score of exactly zero is labelled 1.
    """
    on_positive_side = (bias + X @ weights) >= 0
    return np.where(on_positive_side, 1, -1)

y_pred = predict_linear_svm(X_test, weights, bias)

# Map the SVM's {-1, +1} labels to {0, 1} once and reuse for both metrics.
y_true_binary = (y_test == 1).astype(int)
y_pred_binary = (y_pred == 1).astype(int)

# Pin the label set explicitly. The hold-out split is tiny, so one class can
# be absent from both y_test and y_pred; without `labels` the confusion matrix
# would collapse to 1x1 and the report would no longer line up with the two
# target_names.
class_ids = [0, 1]

print("Confusion matrix:")
print(confusion_matrix(y_true_binary, y_pred_binary, labels=class_ids))

print("\nDetailed report:")
print(classification_report(y_true_binary, y_pred_binary, labels=class_ids, target_names=["normal", "spam"]))