# SPAM SMS DETECTION

In [1]:
import os

import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

In [2]:
# Load the SMS dataset from the CSV file.
# The location can be overridden with the SPAM_CSV_PATH environment variable so the
# notebook runs on other machines; when unset it falls back to the original path.
# The file is read as latin-1, as in the original cell.
DATA_PATH = os.environ.get("SPAM_CSV_PATH", "C:/Users/admin/Desktop/spam.csv")
data = pd.read_csv(DATA_PATH, encoding='latin-1')


In [3]:
# Keep only the two columns of interest (v1, v2) and give them descriptive
# names: v1 becomes 'label', v2 becomes 'message'.
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

In [4]:
# Encode the target column as integers: 'ham' -> 0, 'spam' -> 1.
label_codes = {'ham': 0, 'spam': 1}
data['label'] = data['label'].map(label_codes)


In [5]:
# Normalise the message text: lowercase it, then strip every character that is
# neither a word character nor whitespace (i.e. punctuation).
# The pattern is a raw string so that \w and \s reach the regex engine as-is
# instead of being treated as (invalid) Python string escapes.
data['message'] = (
    data['message']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [6]:
# Split the dataset into training and testing sets.
# test_size=0.2 holds out 20% of the messages for evaluation; random_state=42
# fixes the shuffle so the same split is produced on every run.
# train_test_split is provided by sklearn.model_selection.
X_train, X_test, y_train, y_test = train_test_split(
    data['message'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# TF-IDF feature extractor shared by the models trained below.
# NLTK's English stop-word list is removed from the vocabulary, and max_df=0.85
# drops terms that occur in more than 85% of the documents.
# NOTE(review): stopwords.words() needs the NLTK "stopwords" corpus to be
# available locally - confirm it has been downloaded in this environment.
english_stop_words = stopwords.words("english")
tfidf_vectorizer = TfidfVectorizer(stop_words=english_stop_words, max_df=0.85)

In [8]:
# Candidate models, keyed by the display name used in the printed report.
classifiers = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
    "Support Vector Machine": SVC()
}

# Fitted pipelines are kept (keyed by the same display name) so that a trained
# model is still available after the loop finishes.
fitted_pipelines = {}

for clf_name, clf in classifiers.items():
    # Each pipeline chains the TF-IDF vectorizer with one classifier.
    # Every pipeline wraps the same tfidf_vectorizer object; it is refit on the
    # same X_train in each iteration.
    pipeline = make_pipeline(tfidf_vectorizer, clf)
    pipeline.fit(X_train, y_train)
    fitted_pipelines[clf_name] = pipeline

    # Evaluate on the held-out test messages.
    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Classifier: {clf_name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Confusion Matrix:\n{confusion}")
    print(f"Classification Report:\n{report}")
    print("=" * 50)

# The example cells further down call `classifier.predict(...)`, but no cell in
# this notebook defines `classifier`, so they fail on a fresh kernel.
# NOTE(review): which model the original `classifier` referred to is not
# recoverable from this file; the SVM pipeline is used here because it has the
# highest accuracy in the recorded output - confirm this is the intended model.
classifier = fitted_pipelines["Support Vector Machine"]


Classifier: Naive Bayes
Accuracy: 0.97
Confusion Matrix:
[[965   0]
 [ 36 114]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115

Classifier: Logistic Regression
Accuracy: 0.95
Confusion Matrix:
[[961   4]
 [ 50 100]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.96      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115

Classifier: Support Vector Machine
Accuracy: 0.98
Confusion Matrix:
[[963   2]
 [ 25 125]]
Classification Report:
              prec

In [15]:
# Example: classify a single message that looks like spam.
# The model returns 0 for ham and 1 for spam (see the label encoding above).
example_message = ["Free gift! Click here to claim now!"]
prediction = classifier.predict(example_message)
verdict = "The message is HAM (not spam)." if prediction[0] == 0 else "The message is SPAM."
print(verdict)


The message is SPAM.


In [16]:
# Example: classify a single message that looks like ordinary (ham) text.
# The model returns 0 for ham and 1 for spam (see the label encoding above).
example_message = ["U dun say so early hor... U c already then say..."]
prediction = classifier.predict(example_message)
verdict = "The message is HAM (not spam)." if prediction[0] == 0 else "The message is SPAM."
print(verdict)


The message is HAM (not spam).
