# 📧 SMS Spam Classifier – Full Pipeline (Colab Ready)

This notebook implements an end-to-end **SMS spam detection** project:

- Load & clean the UCI SMS Spam dataset  
- Text preprocessing + TF-IDF feature extraction  
- Train multiple ML models (MultinomialNB, Logistic Regression, Linear SVM)  
- Hyperparameter tuning with `RandomizedSearchCV`  
- Model evaluation (Accuracy, Precision, Recall, F1, ROC-AUC)  
- Graphs for model performance (ROC curves, F1 comparison, metric bars)  
- `classify_sms()` helper  
- Simple interactive textbox (ipywidgets)  
- FastAPI app exercised via `TestClient` (REST-style API inside Colab)  


## 1. Imports & Configuration

In [None]:
import os, re, string, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, classification_report,
    confusion_matrix, roc_auc_score, roc_curve, auc
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt

# Single seed reused by the data splits, the CV splitter, and the estimators.
RANDOM_STATE = 42

# Zipped SMS Spam Collection on the UCI repository (used when no local file is given).
UCI_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
LOCAL_DATA_PATH = None  # if you already have SMSSpamCollection TSV, put its path here

# Directory that receives saved artifacts.
# NOTE(review): the "/content" prefix looks Colab-specific — adjust when running elsewhere.
SAVE_DIR = "/content/artifacts_sms_spam"
os.makedirs(SAVE_DIR, exist_ok=True)  # created up front; no error if it already exists

print("Environment ready.")

## 2. Load UCI SMS Spam Dataset

In [None]:
import zipfile, io, requests

def load_uci_sms_spam(local_path: str | None = None) -> pd.DataFrame:
    """Return DataFrame with columns ['label', 'message'].

    If local_path is provided, read it as TSV. Otherwise, download from UCI.
    """
    if local_path and os.path.exists(local_path):
        df = pd.read_csv(local_path, sep="\t", header=None, names=["label", "message"], encoding="utf-8")
        return df
    
    r = requests.get(UCI_URL, timeout=30)
    r.raise_for_status()
    z = zipfile.ZipFile(io.BytesIO(r.content))
    with z.open("SMSSpamCollection") as f:
        df = pd.read_csv(f, sep="\t", header=None, names=["label", "message"], encoding="utf-8")
    return df

# Fetch the corpus and discard rows missing either field.
raw_df = load_uci_sms_spam(LOCAL_DATA_PATH)
df = raw_df.dropna(subset=["label", "message"]).copy()

# Canonical labels: surrounding whitespace removed, lower-cased.
normalized_labels = df["label"].str.strip().str.lower()
df["label"] = normalized_labels

# Only the two known classes may remain after normalisation.
observed_labels = set(df["label"].unique())
assert observed_labels.issubset({"ham", "spam"}), "Unexpected labels present!"

print("Dataset shape:", df.shape)
class_counts = df["label"].value_counts()
print(class_counts)

## 3. Text Cleaning / Preprocessing

In [None]:
# Pre-compiled patterns and the punctuation-deleting table used by clean_text().
URL_RE = re.compile(r"(https?://\S+|www\.\S+)", flags=re.IGNORECASE)
HTML_RE = re.compile(r"<.*?>")
NUM_RE  = re.compile(r"\d+")
PUNCT_TABLE = str.maketrans("", "", string.punctuation)

def clean_text(s: str) -> str:
    """Normalise one raw SMS into a lower-case, punctuation-free string.

    Steps, in order: trim and lower-case; replace URLs with ``URL``; blank out
    HTML-like tags; replace digit runs with ``NUM``; delete ASCII punctuation;
    collapse whitespace runs into single spaces and trim both ends.
    """
    text = s.strip().lower()
    # Applied in this order so URLs are replaced before their digits are.
    substitutions = (
        (URL_RE, " URL "),
        (HTML_RE, " "),
        (NUM_RE, " NUM "),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    without_punct = text.translate(PUNCT_TABLE)
    return re.sub(r"\s+", " ", without_punct).strip()

# Cast to str first so every cell is a string before it reaches clean_text.
df["message_clean"] = df["message"].astype(str).apply(clean_text)
# Bare expression: a preview of the first rows as the cell's result.
df[["label", "message_clean"]].head()

## 4. Train / Validation / Test Split

In [None]:
# Features are the cleaned texts; the target is binary with 1 = spam, 0 = ham.
X = df["message_clean"].values
y = (df["label"].values == "spam").astype(int)

# First split: 75% train, 25% held back, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE, stratify=y
)

# Second split: the held-back quarter is halved into validation and test
# (12.5% of the data each), again stratified.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE, stratify=y_temp
)

print(f"Train: {len(X_train)}, Valid: {len(X_valid)}, Test: {len(X_test)}")

## 5. TF-IDF Feature Extractors

In [None]:
# Word-level TF-IDF over unigrams and bigrams. Terms must occur in at least
# 2 documents (min_df) and in at most 98% of them (max_df); the vocabulary is
# capped at 200,000 features.
tfidf_word = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.98,
    max_features=200_000
)

# Character-level TF-IDF over 3- to 5-grams using the "char_wb" analyzer,
# with the same minimum document frequency and feature cap.
tfidf_char = TfidfVectorizer(
    analyzer="char_wb",
    ngram_range=(3, 5),
    min_df=2,
    max_features=200_000
)

print("Vectorizers ready.")

## 6. Model Pipelines

In [None]:
from sklearn.base import clone  # used to hand each pipeline its own vectorizer

# Registry of candidate models, keyed by a short name used in reports.
pipelines = {}

# Every pipeline gets an unfitted *copy* of the vectorizer. Sharing one
# instance would let fitting one pipeline overwrite the vocabulary another
# pipeline's classifier was trained against.
pipelines["mnb_word"] = Pipeline([
    ("tfidf", clone(tfidf_word)),
    ("clf", MultinomialNB(alpha=0.5))
])

# Logistic regression on character n-grams, with class weights balanced.
pipelines["lr_char"] = Pipeline([
    ("tfidf", clone(tfidf_char)),
    ("clf", LogisticRegression(
        solver="liblinear",
        penalty="l2",
        C=2.0,
        max_iter=200,
        class_weight="balanced",
        random_state=RANDOM_STATE
    ))
])

# Linear SVM wrapped in a sigmoid calibrator (3-fold) so the pipeline
# exposes predict_proba like the other candidates.
pipelines["svm_char_calibrated"] = Pipeline([
    ("tfidf", clone(tfidf_char)),
    ("svm_cal", CalibratedClassifierCV(
        LinearSVC(C=1.0, random_state=RANDOM_STATE),
        method="sigmoid",
        cv=3
    ))
])

print("Base pipelines:", list(pipelines.keys()))

## 7. Hyperparameter Tuning – Logistic Regression (RandomizedSearchCV)

In [None]:
# Search space: only C varies (12 log-spaced values); penalty and solver are pinned.
param_dist = dict(
    clf__C=np.logspace(-2, 1.3, 12),
    clf__penalty=["l2"],
    clf__solver=["liblinear"],
)

# Template pipeline that the search evaluates for each sampled setting.
lr_base_pipeline = Pipeline([
    ("tfidf", tfidf_char),
    ("clf", LogisticRegression(
        class_weight="balanced",
        max_iter=300,
        random_state=RANDOM_STATE
    ))
])

# Shuffled, stratified 5-fold splitter with a fixed seed.
lr_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

lr_search = RandomizedSearchCV(
    estimator=lr_base_pipeline,
    param_distributions=param_dist,
    n_iter=12,
    scoring="f1",
    cv=lr_cv,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=0
)

print("Running LR RandomizedSearchCV ...")
lr_search.fit(X_train, y_train)

# Register the winning configuration next to the hand-set pipelines.
pipelines["lr_char_tuned"] = lr_search.best_estimator_

print("Best LR params:", lr_search.best_params_)
print("Pipelines now:", list(pipelines.keys()))

## 8. Evaluate Models on Validation Set

In [None]:
def eval_model(name, pipe, X_tr, y_tr, X_va, y_va):
    """Fit ``pipe`` on the training split, print validation metrics, return a summary.

    The pipeline is fitted in place. Continuous spam scores come from
    ``predict_proba`` when that call succeeds, otherwise from a min-max scaled
    ``decision_function``; if neither works, ROC-AUC is reported as NaN.

    Returns:
        dict with keys ``name``, ``pipe`` (the fitted pipeline), ``acc``,
        ``f1`` and ``auc``.
    """
    pipe.fit(X_tr, y_tr)
    y_hat = pipe.predict(X_va)

    spam_scores = None
    try:
        spam_scores = pipe.predict_proba(X_va)[:, 1]
    except Exception:
        try:
            margins = pipe.decision_function(X_va)
            spam_scores = MinMaxScaler().fit_transform(margins.reshape(-1,1)).ravel()
        except Exception:
            spam_scores = None
    has_scores = spam_scores is not None

    acc = accuracy_score(y_va, y_hat)
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_va, y_hat, average="binary", zero_division=0
    )
    if has_scores:
        auc_score = roc_auc_score(y_va, spam_scores)
    else:
        auc_score = np.nan

    print(f"\n=== {name} ===")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}")
    if has_scores:
        print(f"ROC-AUC : {auc_score:.4f}")
    report = classification_report(y_va, y_hat, target_names=["ham","spam"], zero_division=0)
    print("\nClassification Report:\n", report)
    print("Confusion Matrix:\n", confusion_matrix(y_va, y_hat))

    return dict(name=name, pipe=pipe, acc=acc, f1=f1, auc=auc_score)

# One summary dict per registered pipeline, in registration order.
results = [
    eval_model(model_name, model, X_train, y_train, X_valid, y_valid)
    for model_name, model in pipelines.items()
]

print("\nDone evaluating all models.")

## 9. Pick Best Model & Evaluate on Test Set

In [None]:
# Rank the validation results per metric; rank 1 is the highest value.
res_df = pd.DataFrame(results)
res_df["auc_rank"] = res_df["auc"].rank(ascending=False, method="min")
res_df["f1_rank"] = res_df["f1"].rank(ascending=False, method="min")
# Combined score (lower is better): the F1 rank counts twice as much as the AUC rank.
res_df["score"] = 2*res_df["f1_rank"] + res_df["auc_rank"]

# Lowest combined score first; ties broken by higher F1, then higher AUC.
leaderboard = res_df.sort_values(["score", "f1", "auc"], ascending=[True, False, False])
best_row = leaderboard.iloc[0]
best_model = best_row["pipe"]

print("Leaderboard (Validation):")
# NOTE(review): `display` is not imported in this cell — presumably the
# notebook's built-in; confirm before running this as a plain script.
display(leaderboard[["name", "acc", "f1", "auc", "score"]])

print("\nSelected best model:", best_row["name"])

# Refit the winner on train + validation before touching the test split.
X_train_full = np.concatenate([X_train, X_valid])
y_train_full = np.concatenate([y_train, y_valid])
best_model.fit(X_train_full, y_train_full)

test_preds = best_model.predict(X_test)

# Continuous spam scores: predict_proba if it works, otherwise a min-max
# scaled decision_function, otherwise None (ROC-AUC is then skipped).
try:
    test_proba = best_model.predict_proba(X_test)[:, 1]
except Exception:
    try:
        scores = best_model.decision_function(X_test)
        test_proba = MinMaxScaler().fit_transform(scores.reshape(-1,1)).ravel()
    except Exception:
        test_proba = None

test_acc = accuracy_score(y_test, test_preds)
test_p, test_r, test_f1, _ = precision_recall_fscore_support(
    y_test, test_preds, average="binary", zero_division=0
)
test_auc = roc_auc_score(y_test, test_proba) if test_proba is not None else np.nan

print(f"\n=== Final Test Evaluation (Best Model: {best_row['name']}) ===")
print(f"Accuracy : {test_acc:.4f}")
print(f"Precision: {test_p:.4f} | Recall: {test_r:.4f} | F1: {test_f1:.4f}")
if test_proba is not None:
    print(f"ROC-AUC : {test_auc:.4f}")
print("\nClassification Report (Test):\n", classification_report(y_test, test_preds, target_names=["ham","spam"], zero_division=0))
print("Confusion Matrix (Test):\n", confusion_matrix(y_test, test_preds))

## 10. Visualizations – Model Metrics & ROC Curves

In [None]:
# 10.1 F1 score comparison across models
# One bar per evaluated pipeline, labelled with its validation F1.
model_names = res_df["name"].tolist()
f1_scores = res_df["f1"].tolist()

plt.figure(figsize=(8, 5))
bars = plt.bar(model_names, f1_scores)
plt.ylim(0, 1)
plt.xlabel("Model")
plt.ylabel("F1 Score")
plt.title("F1 Score Comparison Across Models (Validation)")
for rect in bars:
    height = rect.get_height()
    center_x = rect.get_x() + rect.get_width()/2
    # Value label sits slightly above the top of the bar.
    plt.text(center_x, height + 0.01, f"{height:.3f}", ha="center")
plt.show()

# 10.2 Best model metrics on test set
# One bar per headline test metric of the selected model.
metrics = {
    "Accuracy": test_acc,
    "Precision": test_p,
    "Recall": test_r,
    "F1 Score": test_f1
}

plt.figure(figsize=(6, 5))
bars = plt.bar(list(metrics.keys()), list(metrics.values()))
plt.ylim(0, 1)
# The separator is a real en dash so the title renders without stray bytes.
plt.title("Best Model – Test Metrics")
for bar in bars:
    yval = bar.get_height()
    # Value label is centred over the bar, slightly above its top.
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.02, f"{yval:.3f}", ha="center")
plt.show()

# 10.3 ROC curve for best model
# Drawn only when continuous test scores were obtained for the best model.
if test_proba is not None:
    fpr, tpr, _ = roc_curve(y_test, test_proba)
    roc_auc_val = auc(fpr, tpr)

    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, linewidth=2, label=f"Best model (AUC = {roc_auc_val:.3f})")
    plt.plot([0, 1], [0, 1], linestyle="--")  # diagonal reference line
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    # The separator is a real en dash so the title renders without stray bytes.
    plt.title("ROC Curve – Best Model (Test)")
    plt.legend()
    plt.grid(True)
    plt.show()

# 10.4 ROC comparison for all models
# Each evaluated pipeline is refit in place on train + validation and scored
# on the test split. NOTE(review): this figure looks at the test split for
# every candidate, so treat it as descriptive only, not as a selection step.
plt.figure(figsize=(8, 6))
for r in results:
    name = r["name"]
    pipe = r["pipe"]
    pipe.fit(X_train_full, y_train_full)
    # Same score fallback chain as the evaluation cells; a model with no
    # usable score is left out of the plot.
    try:
        proba = pipe.predict_proba(X_test)[:, 1]
    except Exception:
        try:
            scores = pipe.decision_function(X_test)
            proba = MinMaxScaler().fit_transform(scores.reshape(-1,1)).ravel()
        except Exception:
            continue
    fpr, tpr, _ = roc_curve(y_test, proba)
    roc_auc_all = auc(fpr, tpr)
    plt.plot(fpr, tpr, linewidth=1.5, label=f"{name} (AUC={roc_auc_all:.3f})")

plt.plot([0, 1], [0, 1], linestyle="--")  # diagonal reference line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# The separator is a real en dash so the title renders without stray bytes.
plt.title("ROC Curve Comparison – All Models (Test)")
plt.legend()
plt.grid(True)
plt.show()

## 11. Save Best Model & Helper Function `classify_sms`

In [None]:
import joblib

# Persist the selected pipeline; the file name embeds the model's leaderboard name.
MODEL_PATH = os.path.join(SAVE_DIR, f"best_model_{best_row['name']}.joblib")
joblib.dump(best_model, MODEL_PATH)
print("Saved best model to:", MODEL_PATH)

def classify_sms(messages):
    """Classify a single SMS string or list of strings.

    Args:
        messages: One message (``str``) or an iterable of messages.

    Returns:
        For a ``str`` input, one dict with keys ``"input"`` (the original
        text), ``"pred_label"`` (``"spam"`` or ``"ham"``) and
        ``"spam_probability"`` (float). For any other input, a list of such
        dicts in input order; an empty input yields an empty list.

    Notes:
        ``spam_probability`` comes from ``predict_proba`` when the model
        provides it. Otherwise it is a logistic squashing of the decision
        margin (a monotone score in [0, 1], not a calibrated probability),
        and as a last resort the hard decision expressed as 0.0 or 1.0.
    """
    single = isinstance(messages, str)
    # Materialise once so a one-shot iterable is not exhausted before zipping.
    messages = [messages] if single else list(messages)
    if not messages:
        # Nothing to score; also avoids handing the model a zero-sample batch.
        return []

    cleaned = [clean_text(m) for m in messages]
    preds = best_model.predict(cleaned)

    try:
        proba = best_model.predict_proba(cleaned)[:, 1]
    except Exception:
        try:
            scores = np.asarray(best_model.decision_function(cleaned), dtype=float)
            # Per-message logistic sigmoid, written in its overflow-safe tanh
            # form. Unlike min-max scaling over the batch, the result does not
            # depend on the other messages and is meaningful for a single one.
            proba = 0.5 * (1.0 + np.tanh(0.5 * scores))
        except Exception:
            # No continuous score available: mirror the hard decision so the
            # reported number never contradicts the predicted label.
            proba = np.array([1.0 if p == 1 else 0.0 for p in preds])

    out = [
        {
            "input": m,
            "pred_label": "spam" if p == 1 else "ham",
            "spam_probability": float(s),
        }
        for m, p, s in zip(messages, preds, proba)
    ]
    return out[0] if single else out

# Smoke check on two hand-written messages, one spam-like and one ordinary.
print("\nExample classifications:")
print(classify_sms("WINNER!! You have won a free ticket, reply NOW!"))
print(classify_sms("Hey, are we still on for lunch at 12?"))

## 12. Interactive SMS Classifier (ipywidgets)

In [None]:
from ipywidgets import Textarea, Button, VBox, Output, HBox
from IPython.display import display

# Multi-line textbox, pre-filled with a sample message to classify.
sms_input = Textarea(
    value="WINNER!! You have won a free ticket to Bahamas, reply NOW to claim",
    placeholder="Type an SMS message here...",
    description="SMS:",
    layout={"width": "100%", "height": "80px"},
    disabled=False,
)

# Button that triggers classification of the textbox content.
classify_button = Button(
    description="Classify",
    disabled=False,
    tooltip="Click to classify this SMS as ham/spam",
)

# Output area the click handler prints into.
out = Output()

def on_classify_clicked(b):
    """Button callback: classify the textbox content and show the verdict in ``out``.

    ``b`` is the argument the button hands to its click handler; it is unused.
    """
    out.clear_output()
    message = sms_input.value.strip()
    if message:
        verdict = classify_sms(message)
        with out:
            print("Input:", verdict["input"])
            print("Predicted label:", verdict["pred_label"])
            print("Spam probability:", f"{verdict['spam_probability']:.4f}")
        return
    # Blank textbox: prompt the user instead of classifying.
    with out:
        print("Please type a message first.")

# Run the handler on every click of the button.
classify_button.on_click(on_classify_clicked)

# Stack the textbox, the button row, and the output area vertically, then render.
ui = VBox([sms_input, HBox([classify_button]), out])
display(ui)

## 13. FastAPI TestClient – REST-style API Inside Colab

In [None]:
!pip -q install fastapi

from fastapi import FastAPI
from pydantic import BaseModel
from fastapi.testclient import TestClient

# Application object the /predict route is registered on.
app = FastAPI(title="SMS Spam Classifier API (Colab Demo)")

# Request body: the raw SMS text to classify.
class SMSRequest(BaseModel):
    message: str

# Response body: same three fields as the dict returned by classify_sms.
class SMSResponse(BaseModel):
    input: str
    pred_label: str
    spam_probability: float

@app.post("/predict", response_model=SMSResponse)
def predict_sms(req: SMSRequest):
    # POST /predict: classify the submitted message and wrap the three
    # result fields in the response model.
    verdict = classify_sms(req.message)
    response_fields = {
        key: verdict[key]
        for key in ("input", "pred_label", "spam_probability")
    }
    return SMSResponse(**response_fields)

# In-process client bound to the app; requests are made through it directly.
client = TestClient(app)
print("FastAPI app and TestClient ready.")

### 13.1 Test the `/predict` Endpoint

In [None]:
# Send one sample message to /predict and show the status and JSON reply.
payload = {"message": "WINNER!! You have won a free ticket to Bahamas, reply NOW to claim"}
r = client.post("/predict", json=payload)
print("Status code:", r.status_code)
print("JSON response:", r.json())