In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score


In [43]:
# Load the labelled messages from the CSV file (relative path, resolved
# against the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="spam_pred_dataset.csv")


In [44]:
# Encode the Category column as integers: spam -> 0, ham -> 1.
#
# The labels are normalised once, up front, and both masks are computed
# before anything is written back. Going through astype(str) keeps the .str
# accessor usable even when the column already holds integers, so re-running
# this cell is a harmless no-op instead of raising AttributeError.
category_norm = df["Category"].astype(str).str.lower().str.strip()
is_spam = category_norm == "spam"
is_ham = category_norm == "ham"

# Values that are neither "spam" nor "ham" are left untouched.
df.loc[is_spam, "Category"] = 0
df.loc[is_ham, "Category"] = 1

print(df.head(5))

  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


In [45]:
# Separate the target (y) from the input text (X).
y = df["Category"].astype(int)
X = df["Message"].astype(str)

# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [52]:
# Build the TF-IDF vectorizer once (the earlier default-constructed instance
# was overwritten immediately and never used).
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then
# apply that same fitted transform to the test messages.
X_train_features = tfidf.fit_transform(X_train)
X_test_features  = tfidf.transform(X_test)

print("X_train rows:", X_train_features.shape[0], "X_test rows:", X_test_features.shape[0])


X_train rows: 4457 X_test rows: 1115


In [53]:
# print(df.head(10))

In [54]:
# Logistic regression: fit on the TF-IDF training features.
# fit() returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_features, y_train)

# Predict labels for the held-out messages and preview the first five.
lr_pred = lr.predict(X_test_features)
print("Predictions:", lr_pred[:5])

Predictions: [1 1 1 1 1]


In [55]:
# Random forest: 200 trees, seeded for repeatable results.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_features, y_train)

# Predict directly on the sparse TF-IDF matrix. The forest was trained on the
# sparse training matrix and accepts sparse input at predict time as well, so
# converting the test set to a dense array only costs memory.
rf_pred = rf.predict(X_test_features)

In [56]:
# Multinomial naive Bayes on the TF-IDF features.
# fit() returns the estimator itself, so construction and training can be chained.
nb = MultinomialNB().fit(X_train_features, y_train)

# Predicted labels for the held-out messages.
nb_pred = nb.predict(X_test_features)

In [59]:
def print_metrics(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Args:
        name: Model name shown in the report header.
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
        pos_label: Class treated as "positive" when computing precision,
            recall and F1. Defaults to 0, the code this notebook assigns
            to spam.
    """
    # Build the "(positive = ...)" annotation from the pos_label actually
    # passed in, so the printed text cannot contradict the numbers when a
    # caller scores the other class. For the default it reads "spam=0".
    class_names = {0: "spam", 1: "ham"}
    positive = f"{class_names.get(pos_label, 'class')}={pos_label}"

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)

    print(f"\n{name} Performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}  (positive = {positive})")
    print(f"  Recall   : {rec:.3f}  (positive = {positive})")
    print(f"  F1-Score : {f1:.3f}  (positive = {positive})")
def conf_mat(name, y_true, y_pred):
    """Print a labelled confusion matrix for one model's predictions.

    Rows are the actual classes and columns the predicted classes, both
    ordered ham (1) first and spam (0) second.

    Args:
        name: Model name shown in the header line.
        y_true: Ground-truth class labels.
        y_pred: Predicted class labels, aligned with ``y_true``.
    """
    row_labels = ["Actual Ham (1)", "Actual Spam (0)"]
    col_labels = ["Pred Ham", "Pred Spam"]

    # labels=[1, 0] fixes the row/column order so it matches the labels above.
    counts = confusion_matrix(y_true, y_pred, labels=[1, 0])
    table = pd.DataFrame(counts, index=row_labels, columns=col_labels)

    print(f"\n{name} – Confusion Matrix:\n{table}")




# Report the metrics and confusion matrix for each model, in the same order
# every time, scoring spam (0) as the positive class.
for model_name, model_pred in (
    ("Logistic Regression", lr_pred),
    ("Random Forest", rf_pred),
    ("Naive Bayes", nb_pred),
):
    print_metrics(model_name, y_test, model_pred, pos_label=0)
    conf_mat(model_name, y_test, model_pred)


Logistic Regression Performance:
  Accuracy : 0.968
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.758  (positive = spam=0)
  F1-Score : 0.863  (positive = spam=0)

Logistic Regression – Confusion Matrix:
                 Pred Ham  Pred Spam
Actual Ham (1)        966          0
Actual Spam (0)        36        113

Random Forest Performance:
  Accuracy : 0.983
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.872  (positive = spam=0)
  F1-Score : 0.932  (positive = spam=0)

Random Forest – Confusion Matrix:
                 Pred Ham  Pred Spam
Actual Ham (1)        966          0
Actual Spam (0)        19        130

Naive Bayes Performance:
  Accuracy : 0.977
  Precision: 1.000  (positive = spam=0)
  Recall   : 0.826  (positive = spam=0)
  F1-Score : 0.904  (positive = spam=0)

Naive Bayes – Confusion Matrix:
                 Pred Ham  Pred Spam
Actual Ham (1)        966          0
Actual Spam (0)        26        123


In [61]:
# Sanity check: run all three models on one held-out message.
m = 40  # positional index of the message within the test split

simple_text = X_test.iloc[m]
label_true = y_test.iloc[m]

# Vectorize the message once with the already-fitted TF-IDF transform and
# reuse the same sparse row for every model; no dense conversion is needed.
simple_features = tfidf.transform([simple_text])

# Make the predictions (each predict() returns a length-1 array).
lr_pred_one = int(lr.predict(simple_features)[0])
rf_pred_one = int(rf.predict(simple_features)[0])
nb_pred_one = int(nb.predict(simple_features)[0])

# Helper that turns a numeric class code into a readable label.
def lab2str(r):
    """Return "spam (0)" when ``r`` equals 0, otherwise "ham (1)"."""
    if r == 0:
        return "spam (0)"
    return "ham (1)"

# Show the true label next to each model's prediction for the chosen message.
print("\nSINGLE MESSAGE CHECK :")
for tag, code in (
    ("actual :", label_true),
    ("LR :", lr_pred_one),
    ("RF :", rf_pred_one),
    ("NB :", nb_pred_one),
):
    print(tag, lab2str(code))



SINGLE MESSAGE CHECK :
actual : spam (0)
LR : ham (1)
RF : spam (0)
NB : spam (0)
