In [98]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import numpy as np


In [99]:
# load dataset
df = pd.read_csv("smsspamcollection/SMSSpamCollection", sep='\t', header=None, names=["label", "message"])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [100]:
# extra_spam = pd.DataFrame({
#     'label': ['spam']*5,
#     'message': [
#         "Win $1000 now, no experience needed!",
#         "Claim your free iPhone. Limited time offer!",
#         "Click here to earn money fast!",
#         "You have won a lottery. Contact now!",
#         "Work from home and earn daily!"
#     ]
# })
# df = pd.concat([df, extra_spam], ignore_index=True)


In [101]:
df.shape

(5572, 2)

In [102]:
df.notnull()

Unnamed: 0,label,message
0,True,True
1,True,True
2,True,True
3,True,True
4,True,True
...,...,...
5567,True,True
5568,True,True
5569,True,True
5570,True,True


In [103]:
# Convert Labels to 0 & 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [104]:
# # Features and labels
# X = df['message']
# y = df['label']

# # Convert text to TF-IDF vectors
# tfidf = TfidfVectorizer(stop_words='english')  # removes common stopwords like 'the', 'and', etc.
# X_vect = tfidf.fit_transform(X)

# # # Split into train/test sets
# # X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.2, random_state=42)

`X_vect` is the TF-IDF representation of all text.

Then you use `kf.split(X_vect)` to apply K-Fold on the already vectorized data.

In [105]:
import math

def classification_performance(conf_matrix):
    TP = conf_matrix[1, 1]
    TN = conf_matrix[0, 0]
    FP = conf_matrix[0, 1]
    FN = conf_matrix[1, 0]

    acc = (TP + TN) / (TP + TN + FP + FN)

    prec = TP / (TP + FP) if (TP + FP) != 0 else 0
    rec = TP / (TP + FN) if (TP + FN) != 0 else 0
    spec = TN / (TN + FP) if (TN + FP) != 0 else 0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) != 0 else 0
    GM = math.sqrt(rec * spec)

    return acc, prec, rec, spec, f1, GM


## Train the model using Multinominal Bayes

#### Trains a Multinomial Naive Bayes model using your `X_train` (TF-IDF) and `y_train` (labels: 0 = ham, 1 = spam)

#### Internally, the model learns which word patterns are more likely to be found in spam or ham

In [106]:
# k = 10
# kf = KFold(n_splits=k, shuffle=True, random_state=42)

# # Prepare arrays to store metrics
# acc = np.zeros(k)
# prec = np.zeros(k)
# recall = np.zeros(k)
# spec = np.zeros(k)
# F1 = np.zeros(k)
# GM = np.zeros(k)

# # Run K-Fold
# for i, (train_idx, test_idx) in enumerate(kf.split(X_vect)):
#     print(f"\n🔁 Fold {i+1}/{k}")
    
#     X_train, X_test = X_vect[train_idx], X_vect[test_idx]
#     y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]  # Use .iloc to avoid index mismatch

#     model = MultinomialNB()
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)

#     conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
#     acc[i], prec[i], recall[i], spec[i], F1[i], GM[i] = classification_performance(conf_matrix)

#     print(f"✅ Accuracy: {acc[i]*100:.2f}%, Precision: {prec[i]*100:.2f}%, Recall: {recall[i]*100:.2f}%, Specificity: {spec[i]*100:.2f}%, F1: {F1[i]*100:.2f}%, GM: {GM[i]*100:.2f}%")

# # Print final stats
# print('\n📊 ========== FINAL SUMMARY ==========')
# print(f"Mean Accuracy:     {acc.mean()*100:.2f}%")
# print(f"Mean Precision:    {prec.mean()*100:.2f}%")
# print(f"Mean Recall:       {recall.mean()*100:.2f}%")
# print(f"Mean Specificity:  {spec.mean()*100:.2f}%")
# print(f"Mean F1 Score:     {F1.mean()*100:.2f}%")
# print(f"Mean Geometric Mean: {GM.mean()*100:.2f}%")

## Result Analysis of this
Accuracy	97.42%	Almost all messages are classified correctly.

Precision	99.84%	Nearly all predicted spams are truly spam — minimal false alarms.

Recall	81.03%	Model catches ~81% of actual spam — a few spam messages might slip through.

Specificity	99.98%	Model correctly identifies nearly all ham (non-spam) messages.

F1 Score	89.37%	A strong balance between precision and recall.

G-Mean	89.97%	Shows model is balanced across both classes.

# step 2 now we will use k fold on n training data nd test using testing data seperately

### Split into tarin and test set first

In [107]:
# Map ham/spam to 0/1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Drop any rows where label is NaN (i.e., unexpected values)
df = df.dropna(subset=['label'])

# Now define X and y
X = df['message']
y = df['label']


In [108]:
print("Shape of original DataFrame:", df.shape)
print("Unique values in 'label':", df['label'].unique())
print(df.head())


Shape of original DataFrame: (0, 2)
Unique values in 'label': []
Empty DataFrame
Columns: [label, message]
Index: []


In [111]:
# Reload the dataset if df is empty
if df.empty:
    df = pd.read_csv("smsspamcollection/SMSSpamCollection", sep='\t', header=None, names=["label", "message"])
    df['label'] = df['label'].map({'ham': 0, 'spam': 1})
    df = df.dropna(subset=['label'])

X = df['message']
y = df['label']

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95)
# X_train_vect = vectorizer.fit_transform(X_train_raw)
# X_test_vect = vectorizer.transform(X_test_raw)

In [112]:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
models = LogisticRegression()

k = 10
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

acc = np.zeros(k)
prec = np.zeros(k)
recall = np.zeros(k)
spec = np.zeros(k)
F1 = np.zeros(k)
GM = np.zeros(k)

# Loop through each fold
for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(f"\n🔁 Fold {i+1}/{k}")
    
    X_train_raw, X_test_raw = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Vectorize text
    tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=2, stop_words='english')
    X_train_vect = tfidf.fit_transform(X_train_raw)
    X_test_vect = tfidf.transform(X_test_raw)
    
    # Train model
    model = MultinomialNB()
    models.fit(X_train_vect, y_train)
    y_pred = models.predict(X_test_vect)

    # Confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    acc[i], prec[i], recall[i], spec[i], F1[i], GM[i] = classification_performance(conf_matrix)

    # Print fold metrics
    print(f"✅ Accuracy: {acc[i]*100:.2f}%, Precision: {prec[i]*100:.2f}%, Recall: {recall[i]*100:.2f}%, Specificity: {spec[i]*100:.2f}%, F1: {F1[i]*100:.2f}%, GM: {GM[i]*100:.2f}%")

print('\n📊 ========= FINAL SUMMARY =========')
print(f"Mean Accuracy:     {acc.mean()*100:.2f}%")
print(f"Mean Precision:    {prec.mean()*100:.2f}%")
print(f"Mean Recall:       {recall.mean()*100:.2f}%")
print(f"Mean Specificity:  {spec.mean()*100:.2f}%")
print(f"Mean F1 Score:     {F1.mean()*100:.2f}%")
print(f"Mean Geometric Mean: {GM.mean()*100:.2f}%")



🔁 Fold 1/10
✅ Accuracy: 97.31%, Precision: 98.39%, Recall: 81.33%, Specificity: 99.79%, F1: 89.05%, GM: 90.09%

🔁 Fold 2/10
✅ Accuracy: 97.13%, Precision: 100.00%, Recall: 78.67%, Specificity: 100.00%, F1: 88.06%, GM: 88.69%

🔁 Fold 3/10
✅ Accuracy: 96.59%, Precision: 96.61%, Recall: 77.03%, Specificity: 99.59%, F1: 85.71%, GM: 87.58%

🔁 Fold 4/10
✅ Accuracy: 97.31%, Precision: 100.00%, Recall: 79.73%, Specificity: 100.00%, F1: 88.72%, GM: 89.29%

🔁 Fold 5/10
✅ Accuracy: 96.95%, Precision: 100.00%, Recall: 77.03%, Specificity: 100.00%, F1: 87.02%, GM: 87.77%

🔁 Fold 6/10
✅ Accuracy: 97.49%, Precision: 98.41%, Recall: 82.67%, Specificity: 99.79%, F1: 89.86%, GM: 90.83%

🔁 Fold 7/10
✅ Accuracy: 96.59%, Precision: 98.28%, Recall: 76.00%, Specificity: 99.79%, F1: 85.71%, GM: 87.09%

🔁 Fold 8/10
✅ Accuracy: 96.41%, Precision: 96.61%, Recall: 76.00%, Specificity: 99.59%, F1: 85.07%, GM: 87.00%

🔁 Fold 9/10
✅ Accuracy: 96.41%, Precision: 96.61%, Recall: 76.00%, Specificity: 99.59%, F1: 85.07

In [None]:
import joblib

# Save model and vectorizer
joblib.dump(model, 'spam_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [None]:
# Load model and vectorizer
loaded_model = joblib.load('spam_model.pkl')
loaded_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Example test message
test_message = ["Win a free vacation! Reply YES to claim."]
test_vector = loaded_vectorizer.transform(test_message)
prediction = loaded_model.predict(test_vector)

print("Spam" if prediction[0] == 1 else "Ham")


Spam


In [115]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf', tfidf),
    ('model', models)
])

# Save entire pipeline
joblib.dump(pipeline, 'spam_pipeline.pkl')

# Load and test
loaded_pipeline = joblib.load('spam_pipeline.pkl')
result = loaded_pipeline.predict(["Free entry in 2 a wkly comp to win tickets!"])
print("Prediction:", "Spam" if result[0] == 1 else "Ham")

Prediction: Ham


In [116]:
from joblib import load

# Load the saved pipeline
pipeline = load("spam_pipeline.pkl")

# Sample messages for testing
test_messages = [
    "Hey John, I thought you might like this opportunity — earn $500/day working from home, no experience needed!",
    "Congratulations! You've won a $1000 Walmart gift card. Click here to claim.",
    "Are we still meeting at 6 PM today?",
    "URGENT! Your mobile number has won 1 million dollars!",
    "I'll call you back in 10 mins.",
    "Free entry in 2 a weekly competition to win FA Cup tickets. Text FA to 12345",
    "Hey, what time is the class tomorrow?",
]

# Predict
predictions = pipeline.predict(test_messages)

# Show results
for msg, label in zip(test_messages, predictions):
    print(f"'{msg}' => {'Spam' if label == 1 else 'Ham'}")


'Hey John, I thought you might like this opportunity — earn $500/day working from home, no experience needed!' => Ham
'Congratulations! You've won a $1000 Walmart gift card. Click here to claim.' => Spam
'Are we still meeting at 6 PM today?' => Ham
'URGENT! Your mobile number has won 1 million dollars!' => Spam
'I'll call you back in 10 mins.' => Ham
'Free entry in 2 a weekly competition to win FA Cup tickets. Text FA to 12345' => Ham
'Hey, what time is the class tomorrow?' => Ham
