---
### 📌 Executive Summary
This project compares **Naive Bayes** and **Logistic Regression** models for SMS spam detection using both **CountVectorizer** and **TF-IDF** features.  
Results show that while Naive Bayes is faster and simpler, **Logistic Regression with TF-IDF** achieves the **best balance of accuracy, recall, and overall performance**.
---


In [None]:
from math import e
# LOAD DATASET USING URL
from urllib.request import urlretrieve
import zipfile
import os
import pandas as pd
# Download the UCI SMS Spam Collection archive, unpack it into the working
# directory, and delete the archive afterwards.
URL="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
archive_path = "smsspamcollection.zip"
urlretrieve(URL, archive_path)
try:
    # The context manager closes the archive even if extraction raises.
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall()
finally:
    # Always remove the downloaded archive, so a failed or partial extraction
    # does not leave a stale zip behind for the next run.
    os.remove(archive_path)

# Read the tab-separated file into a two-column DataFrame.
# header=None makes pandas treat the first row as data, not column names.
txt = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "message"],
)
# Quick inspection calls; info() prints a summary, while the head()/keys()
# return values are not captured here.
txt.head()
txt.info()
txt.keys()

# Encode the string labels as integers: ham -> 0, spam -> 1.
label_to_int = {"ham": 0, "spam": 1}
txt["label"] = txt["label"].map(label_to_int)

# Normalize the text: lowercase everything, then replace each character that
# is neither a word character nor whitespace with a single space.
import string
lowered_messages = txt["message"].str.lower()
txt["message"] = lowered_messages.str.replace(r'[^\w\s]', " ", regex=True)

#TRAIN/TEST SPLIT
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    txt["message"],
    txt["label"],
    test_size=0.2,
    random_state=42,
)

#VECTORIZATION
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Both vectorizers share the same settings: English stop words removed,
# vocabulary capped at 2000 entries, unigrams and bigrams.
vectorizer_options = {
    "stop_words": 'english',
    "max_features": 2000,
    "ngram_range": (1, 2),
}
counts = CountVectorizer(**vectorizer_options)
tfidf = TfidfVectorizer(**vectorizer_options)

# Learn each vocabulary on the training messages only, then reuse the fitted
# vectorizer to encode the test messages.
x_train_counts = counts.fit_transform(X_train)
X_test_counts = counts.transform(X_test)

x_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

#TRAIN MODELS
# NOTE: estimator.fit() returns the estimator itself, so fitting ONE instance
# twice yields two names bound to the same object, holding only the last fit.
# Each (model, feature set) pair therefore gets its own estimator instance,
# so that every model is evaluated on the features it was trained on.

#1)NAIVE BAYES MODEL
from sklearn.naive_bayes import MultinomialNB
nb_tfidf = MultinomialNB().fit(x_train_tfidf, y_train)
nb_counts = MultinomialNB().fit(x_train_counts, y_train)
# Keep the original `nb` name available; it used to end up fitted on counts.
nb = nb_counts

#2)LOGISTIC REGRESSION
from sklearn.linear_model import LogisticRegression
# Shared hyper-parameters for both logistic-regression models.
lr_params = {
    "max_iter": 2000,
    "solver": 'saga',
    "penalty": 'l2',
    "C": 1.0,
    "random_state": 42,
}
lr_counts = LogisticRegression(**lr_params).fit(x_train_counts, y_train)
lr_tfidf = LogisticRegression(**lr_params).fit(x_train_tfidf, y_train)
# Keep the original `lr` name available; it used to end up fitted on TF-IDF.
lr = lr_tfidf

#PREDICTIONS AND PROBABILITIES
# For each fitted-model name, compute hard class predictions and per-class
# probabilities on the matching test matrix (counts or TF-IDF).

# Naive Bayes
y_pred_nb_counts = nb_counts.predict(X_test_counts)
y_pob_nb_counts = nb_counts.predict_proba(X_test_counts)
y_pred_nb_tfidf = nb_tfidf.predict(X_test_tfidf)
y_pob_nb_tfidf = nb_tfidf.predict_proba(X_test_tfidf)

# Logistic Regression
y_pred_lr_counts = lr_counts.predict(X_test_counts)
y_pob_lr_counts = lr_counts.predict_proba(X_test_counts)
y_pred_lr_tfidf = lr_tfidf.predict(X_test_tfidf)
y_pob_lr_tfidf = lr_tfidf.predict_proba(X_test_tfidf)

#EVALUATION
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score,recall_score,roc_auc_score
def evaluation(y_test,y_pred,y_pob):
    """Return five metrics, each rounded to 4 decimals, as a list.

    Order: accuracy, precision, recall, F1, ROC AUC.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, used for the first four metrics.
        y_pob: 2-D array of predicted probabilities; column 1 is used as the
            score for ROC AUC.
    """
    label_metrics = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [round(metric(y_test, y_pred), 4) for metric in label_metrics]
    # ROC AUC is computed from the probability column, not the hard labels.
    scores.append(round(roc_auc_score(y_test, y_pob[:, 1]), 4))
    return scores


# Collect the metric list for every model / feature-set combination.
results = {
    "NB Counts": evaluation(y_test, y_pred_nb_counts, y_pob_nb_counts),
    "NB TFIDF": evaluation(y_test, y_pred_nb_tfidf, y_pob_nb_tfidf),
    "LR Counts": evaluation(y_test, y_pred_lr_counts, y_pob_lr_counts),
    "LR TFIDF": evaluation(y_test, y_pred_lr_tfidf, y_pob_lr_tfidf),
}

# One row per model, one column per metric (same order as evaluation()).
metric_columns = [
    "Accuracy Score",
    "Precision Score",
    "Recall Score",
    "F1 Score",
    "ROC AUC Score",
]
df_results = pd.DataFrame.from_dict(results, orient="index", columns=metric_columns)
display(df_results.head())

#CONFUSION MATRIX
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
def plot_confusion_matrix(name,y_test,y_pred):
    """Print a header and show the confusion matrix for one model as a heatmap.

    Args:
        name: Model label used in the printed header and the plot title.
        y_test: True labels.
        y_pred: Predicted labels.
    """
    print(f"Confusion matrix of {name} :")
    class_names = ["Ham", "Spam"]
    matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",  # integer cell annotations
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f"Confusion Matrix for {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()


# Draw one confusion matrix per model / feature-set combination, in order.
for model_name, model_predictions in (
    ("NB Counts", y_pred_nb_counts),
    ("NB TFIDF", y_pred_nb_tfidf),
    ("LR Counts", y_pred_lr_counts),
    ("LR TFIDF", y_pred_lr_tfidf),
):
    plot_confusion_matrix(model_name, y_test, model_predictions)

---
### 📌 Bottom Line

- **Naive Bayes (NB)**  
  ✅ Very fast, lightweight, and easy to implement.  
  ✅ Performs surprisingly well on text data.  
  ❌ Assumes words are conditionally independent given the class (the "naive" assumption), which rarely holds in real text.  
  ❌ Slightly lower recall compared to Logistic Regression (misses some spam).  

- **Logistic Regression (LR)**  
  ✅ More powerful and flexible than NB.  
  ✅ Generally achieves higher accuracy, recall, and F1-score.  
  ✅ Handles overlapping word distributions better.  
  ❌ Slower to train compared to NB.  
  ❌ Requires parameter tuning (e.g., regularization, solver).  

- **Feature Comparison**  
  - **CountVectorizer**: Simple word frequency counts, fast, but less informative.  
  - **TF-IDF**: Weighs words by importance, reduces impact of common words → **consistently better performance**.  

📊 **Best performer in this study:**  
**Logistic Regression with TF-IDF** → balanced precision, recall, and overall accuracy, making it the most reliable for spam detection in this dataset.
---
