In [None]:
# ✅ Step 0: Import Required Libraries

# Data handling and numerical operations
import numpy as np
import pandas as pd

# Machine Learning Libraries
from sklearn.model_selection import train_test_split

# Text Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# Machine Learning Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

# Model Evaluation
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

print("✅ Libraries imported successfully.")


✅ Libraries imported successfully.


In [73]:
# Step 1: Load the dataset from a CSV file and data cleaned
cvs_path = 'mail_l7_dataset.csv'
df = pd.read_csv(cvs_path)

print("✅ Dataset loaded successfully.")

✅ Dataset loaded successfully.


In [74]:
# Step 1: Data processing , cleaning and handling missing values in the dataset

# Step 1.0: Display the initial head of the dataset
print("=== INITIAL HEAD ===")
print(df.head())
print("✅ Initial head displayed successfully.")

=== INITIAL HEAD ===
  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
✅ Initial head displayed successfully.


In [75]:
# Step 1.1: Display the initial info of the dataset
print("=== INITIAL INFO ===")
print(df.info())
print("✅ Initial info displayed successfully.")

=== INITIAL INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
✅ Initial info displayed successfully.


In [76]:
# Step 1.2: Display the initial describe of the dataset
print("=== INITIAL DESCRIBE ===")
print(df.describe())
print("✅ Initial describe displayed successfully.")

=== INITIAL DESCRIBE ===
       Category                 Message
count      5572                    5572
unique        2                    5157
top         ham  Sorry, I'll call later
freq       4825                      30
✅ Initial describe displayed successfully.


In [77]:
# Step 1.3: Display the initial shape of the dataset
print("=== INITIAL SHAPE ===")
print(df.shape)
print("✅ Initial shape displayed successfully.")

=== INITIAL SHAPE ===
(5572, 2)
✅ Initial shape displayed successfully.


In [78]:
# Step 1.4: Display the initial columns of the dataset
print("=== INITIAL COLUMNS ===")
print(df.columns)
print("✅ Initial columns displayed successfully.")

=== INITIAL COLUMNS ===
Index(['Category', 'Message'], dtype='object')
✅ Initial columns displayed successfully.


In [79]:
# Step 1.5: Display the initial duplicates of the dataset
print("=== INITIAL DUPLICATES ===")
print(df.duplicated().sum())
print("✅ Initial duplicates displayed successfully.")

=== INITIAL DUPLICATES ===
415
✅ Initial duplicates displayed successfully.


In [80]:
# Step 1.6: Display the initial missing values of the dataset
print("=== INITIAL MISSING VALUES ===")
print(df.isnull().sum())
print("✅ Initial missing values displayed successfully.")

=== INITIAL MISSING VALUES ===
Category    0
Message     0
dtype: int64
✅ Initial missing values displayed successfully.


In [81]:
# Step 1.7: Display the initial value counts of the dataset
print("=== INITIAL VALUE COUNTS ===")
print(df.value_counts())
print("✅ Initial value counts displayed successfully.")

=== INITIAL VALUE COUNTS ===
Category  Message                                                                                                                                                                     
ham       Sorry, I'll call later                                                                                                                                                          30
          I cant pick the phone right now. Pls send a message                                                                                                                             12
          Ok...                                                                                                                                                                           10
          Okie                                                                                                                                                                             4
          Say this slowly.? GOD,

In [82]:
# Step 2: Handle missing values (replace with empty strings).
df = df.where(pd.notnull(df), "")

print("✅ NaNs replaced with empty strings successfully.")

✅ NaNs replaced with empty strings successfully.


In [83]:
# Step 3: Encode labels: spam -> 0, ham -> 1  (keep your original convention)
df.loc[df["Category"].str.lower().str.strip() == "spam", "Category"] = 0
df.loc[df["Category"].str.lower().str.strip() == "ham", "Category"] = 1

print("✅ Labels encoded successfully.")

✅ Labels encoded successfully.


In [84]:
# Step 4.1 : Head of the dataset
print("=== HEAD OF THE DATASET ===")
print(df.head())
print("✅ Head displayed successfully.")

=== HEAD OF THE DATASET ===
  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...
✅ Head displayed successfully.


In [85]:
# Step 4: Split into features (X = messages) and target (y = category).
X = df["Message"].astype(str)
y = df["Category"].astype(int)

print("✅ Features and target split successfully.")

✅ Features and target split successfully.


In [86]:
# Step 5: Split the data — 80% for training and 20% for testing

# Set a random state for reproducibility
RANDOM_STATE = 42

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Print the split sizes
print("\n=== SPLIT SIZES ===")
print("Train:", len(X_train), "| Test:", len(X_test))

print("\n✅ Data has been successfully split into training (80%) and testing (20%) sets.")



=== SPLIT SIZES ===
Train: 4457 | Test: 1115

✅ Data has been successfully split into training (80%) and testing (20%) sets.


In [87]:
# Step 6: Text Feature Extraction

# Use TfidfVectorizer to transform text into numeric vectors.
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = tfidf.fit_transform(X_train)
X_test_features  = tfidf.transform(X_test)

print("\n=== TF-IDF SHAPES ===")
print("X_train:", X_train_features.shape, " | X_test:", X_test_features.shape)

print("\n ✅ Text features extracted successfully.")


=== TF-IDF SHAPES ===
X_train: (4457, 7440)  | X_test: (1115, 7440)

 ✅ Text features extracted successfully.


In [88]:
# Step 7: Train Models

# STrep 7.1: Train a Logistic Regression model.
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
lr_pred = lr.predict(X_test_features)

print("\n ✅ Logistic Regression model trained successfully.")



 ✅ Logistic Regression model trained successfully.


In [89]:
# Step 7.2: Train a Random Forest Classifier model.
rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features, y_train)
rf_pred = rf.predict(X_test_features.toarray())

print("\n✅ Random Forest model trained successfully.")


✅ Random Forest model trained successfully.


In [90]:
# Step 7.3:Train a Naive Bayes (MultinomialNB) classifier.

nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_pred = nb.predict(X_test_features.toarray())

print("\n✅ Naive Bayes (MultinomialNB) model trained successfully.")


✅ Naive Bayes (MultinomialNB) model trained successfully.


In [91]:
# Step 8: Helper functions: metrics + confusion matrix print
def print_clf_metrics(name, y_true, y_pred, pos_label=0):
    """Print Accuracy, Precision, Recall, F1. pos_label=0 means 'spam' is positive."""
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec  = recall_score(y_true, y_pred, pos_label=pos_label)
    f1   = f1_score(y_true, y_pred, pos_label=pos_label)
    print(f"\n{name} Performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}  (positive = spam=0)")
    print(f"  Recall   : {rec:.3f}  (positive = spam=0)")
    print(f"  F1-Score : {f1:.3f}  (positive = spam=0)")

print("\n✅ Helper function 'print_clf_metrics()' defined successfully.")


✅ Helper function 'print_clf_metrics()' defined successfully.


In [92]:
# Step 8.1:  
def print_confmat(name, y_true, y_pred):
    """
    Confusion matrix with readable labels.
    Rows = Actual, Cols = Predicted
    Order: [Ham(1), Spam(0)] so you can see both classes clearly.
    """
    cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
    cm_df = pd.DataFrame(
        cm,
        index   = ["Actual: Ham (1)",  "Actual: Spam (0)"],
        columns = ["Pred: Ham (1)",    "Pred: Spam (0)"]
    )
    print(f"\n{name} – Confusion Matrix:\n{cm_df}")

print("\n✅ Helper function 'print_confmat()' defined successfully.")


✅ Helper function 'print_confmat()' defined successfully.


In [None]:
# ✅ Step 8.2: Show Evaluation Results for All Models

# Logistic Regression Results
print_clf_metrics("Logistic Regression", y_test, lr_pred)
print_confmat("Logistic Regression", y_test, lr_pred)

# Random Forest Results
print_clf_metrics("Random Forest", y_test, rf_pred)
print_confmat("Random Forest", y_test, rf_pred)

# Naive Bayes Results
print_clf_metrics("Naive Bayes", y_test, nb_pred)
print_confmat("Naive Bayes", y_test, nb_pred)


print("\n✅ All model evaluation results displayed successfully.")


Logistic Regression Performance:
  Accuracy : 0.968
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.758  (positive = spam=0)
  F1-Score : 0.863  (positive = spam=0)

Logistic Regression – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             36             113

Random Forest Performance:
  Accuracy : 0.983
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.872  (positive = spam=0)
  F1-Score : 0.932  (positive = spam=0)

Random Forest – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             19             130

Naive Bayes Performance:
  Accuracy : 0.977
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.826  (positive = spam=0)
  F1-Score : 0.904  (positive = spam=0)

Naive Bayes – Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Ac

In [98]:
# ✅ Step 9: Sanity Check - Inspect a Single Test Message

# Choose a sample index to inspect
i = 8  # Change this number to check other test messages

# Get the sample text and true label
sample_text = X_test.iloc[i]
true_label  = y_test.iloc[i]

# Predict the label using the models
lr_pred_one = int(lr.predict(tfidf.transform([sample_text]))[0])
rf_pred_one = int(rf.predict(tfidf.transform([sample_text]).toarray())[0])
nb_pred_one = int(nb.predict(tfidf.transform([sample_text]))[0])

# Print the results
def lab2str(v):  # same readable output style
    return "Spam (0)" if v == 0 else "Ham (1)"

print("\n=== SINGLE MESSAGE CHECK ===")
snippet = (sample_text[:160] + "...") if len(sample_text) > 160 else sample_text
print("Text snippet:", snippet)
print("Actual      :", lab2str(true_label))
print("LR Pred     :", lab2str(lr_pred_one))
print("RF Pred     :", lab2str(rf_pred_one))
print("NB Pred     :", lab2str(nb_pred_one))

print("\n✅ Sanity check successful.")


=== SINGLE MESSAGE CHECK ===
Text snippet: Dear good morning now only i am up
Actual      : Ham (1)
LR Pred     : Ham (1)
RF Pred     : Ham (1)
NB Pred     : Ham (1)

✅ Sanity check successful.
