# General Steps
 - Load cleaned data
 - Train-Test split
 - TF-IDF vectorization

In [None]:
# Core libraries
import pandas as pd
import numpy as np

# Train / test split
from sklearn.model_selection import train_test_split

# Text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

# Model persistence
import joblib


In [3]:
# Load the preprocessed SMS dataset; the columns used below are `label` and `clean_message`.
# NOTE(review): absolute, machine-specific path — consider a path relative to the project folder.
DATA_PATH = "E:\\Desktop\\02VC_Group_09\\datasets\\SMS_spam_collection\\sms_clean.csv"
df = pd.read_csv(DATA_PATH)

# Encode labels: ham -> 0, spam -> 1.
# `Series.map` turns every value that is not a key of the dict into NaN, so
# check for unmapped labels before overwriting the column instead of letting
# NaN targets leak into the split / training cells.
encoded_label = df['label'].map({'ham': 0, 'spam': 1})
unmapped = encoded_label.isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, 'label'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'label' column: {bad_values}")
df['label'] = encoded_label

# Features and target
X = df['clean_message']
y = df['label']

print("Dataset size:", df.shape)
print(df.head())


Dataset size: (5572, 2)
   label                                      clean_message
0      0  go jurong point crazi avail bugi n great world...
1      0                              ok lar joke wif u oni
2      1  free entri wkli comp win fa cup final tkt st m...
3      0                u dun say earli hor u c alreadi say
4      0          nah dont think goe usf live around though


In [4]:
# Hold out 20% of the rows for testing. Stratifying on y preserves the class
# distribution in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 4457
Testing samples: 1115


In [5]:
# Replace missing messages with empty strings in both splits
X_train, X_test = X_train.fillna(''), X_test.fillna('')

# Report how many missing values remain in each split
print("Missing values in training set:", X_train.isna().sum())
print("Missing values in test set:", X_test.isna().sum())

Missing values in training set: 0
Missing values in test set: 0


In [6]:
# TF-IDF over unigrams + bigrams, with min_df=2 and at most 3000 features
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=3000)

# Fit on the training split only; the test split is transformed with the
# vectorizer fitted on the training data
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

print("TF-IDF feature shape:", X_train_vec.shape)


TF-IDF feature shape: (4457, 3000)


In [56]:
def predict_sms(message, model, vectorizer, dense=False):
    """
    Classify one SMS message as "Spam" or "Ham".

    Parameters
    ----------
    message : the message text; it is wrapped in a one-element list and
        passed to ``vectorizer.transform``.
    model : object with a ``predict`` method; a prediction equal to 1 means spam.
    vectorizer : object with a ``transform`` method.
    dense : when True, ``.toarray()`` is called on the transformed features
        before they are handed to the model.

    Returns
    -------
    "Spam" if the first prediction equals 1, otherwise "Ham".

    NOTE(review): in this notebook the vectorizer is fitted on the
    `clean_message` column, while this helper forwards `message` unchanged —
    confirm whether callers should apply the same cleaning first.
    """
    features = vectorizer.transform([message])
    features = features.toarray() if dense else features
    first_prediction = model.predict(features)[0]
    if first_prediction == 1:
        return "Spam"
    return "Ham"

# 1. Bagging
- Random Forest
- Bagging Classifier + SVM


## 1.1. Random Forest

In [None]:
# Random Forest model
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Random Forest: 200 trees with no depth limit, balanced class weights to
# handle class imbalance, a fixed seed, and all CPU cores (n_jobs=-1)
rf_model = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    max_depth=None,
    min_samples_leaf=1,
    min_samples_split=2,
    n_jobs=-1,
    random_state=42,
)

# Fit on the TF-IDF training features
rf_model.fit(X_train_vec, y_train)


In [8]:
# Score the Random Forest on the held-out test features
y_pred = rf_model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class report (label 0 is shown as Ham, label 1 as Spam)
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=['Ham', 'Spam']),
    sep="\n",
)

# Confusion matrix
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 0.9766816143497757

Classification Report:
              precision    recall  f1-score   support

         Ham       0.97      1.00      0.99       966
        Spam       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
[[966   0]
 [ 26 123]]


In [39]:
# Persist the fitted Random Forest and the fitted TF-IDF vectorizer with joblib.
# The vectorizer is saved next to the model, in the same output folder.
# NOTE(review): absolute, machine-specific paths — these only resolve on a machine with this folder layout.
joblib.dump(rf_model, r"E:\Desktop\02VC_Group_09\spamSMS\random_forest_sms.joblib")
joblib.dump(tfidf, r"E:\Desktop\02VC_Group_09\spamSMS\tfidf_vectorizer.joblib")

['E:\\Desktop\\02VC_Group_09\\spamSMS\\tfidf_vectorizer.joblib']

In [24]:
# Classify one hand-written message with the Random Forest.
# NOTE(review): the vectorizer was fitted on the `clean_message` column, while
# this raw text is transformed as-is — confirm whether it should be cleaned first.
example = "Congratulations! You've won a free prize. Call now!"
example_vec = tfidf.transform([example])
prediction = rf_model.predict(example_vec)
if prediction[0] == 1:
    print("Prediction for example message:", "Spam")
else:
    print("Prediction for example message:", "Ham")

Prediction for example message: Ham


In [58]:
# Same kind of spot check, this time through the predict_sms helper
predict_sms(
    message="Urrgent! text me to get your lucky prize iphone 17 promax",
    model=rf_model,
    vectorizer=tfidf,
)

'Ham'

## 1.2. Bagging Classifier + SVM

In [12]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier

In [13]:
# Linear SVM used as the base estimator of the bagging ensemble
# (balanced class weights, fixed seed, up to 5000 iterations)
svm_base = LinearSVC(C=1.0, class_weight="balanced", random_state=42, max_iter=5000)

In [14]:
# Bagging ensemble: 15 copies of the base SVM, each trained on a bootstrap
# sample sized at 80% of the training rows
bagging_svm = BaggingClassifier(
    estimator=svm_base,
    n_estimators=15,
    bootstrap=True,
    max_samples=0.8,
    random_state=42,
    n_jobs=-1,
)

# Fit on the TF-IDF training features
bagging_svm.fit(X_train_vec, y_train)

In [15]:
# Score the bagging ensemble on the held-out test features
y_pred = bagging_svm.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=["Ham", "Spam"]),
    sep="\n",
)

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.9829596412556054

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.99      0.99       966
        Spam       0.96      0.91      0.93       149

    accuracy                           0.98      1115
   macro avg       0.97      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
[[961   5]
 [ 14 135]]


In [None]:
# Persist the fitted bagging + LinearSVC ensemble with joblib.
# The TF-IDF vectorizer is already written to disk in the Random Forest section, so it is not saved again here.
# NOTE(review): absolute, machine-specific path.
joblib.dump(bagging_svm, r"E:\Desktop\02VC_Group_09\spamSMS\bagging_svm_sms.joblib")

['E:\\Desktop\\02VC_Group_09\\spamSMS\\tfidf_vectorizer.joblib']

In [26]:
# Classify one hand-written message with the bagging ensemble
example_sms = ["Congratulations! You have won a free prize. Call now"]
example_vec = tfidf.transform(example_sms)
prediction = bagging_svm.predict(example_vec)[0]

if prediction == 1:
    print("Prediction:", "Spam")
else:
    print("Prediction:", "Ham")

Prediction: Spam


# 2. Boosting
- AdaBoost
- Gradient Boosting Classifier

## 2.1. AdaBoost

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

In [19]:
# Weak learner: a depth-1 decision tree (decision stump)
base_estimator = DecisionTreeClassifier(max_depth=1, random_state=42)

# AdaBoost over 200 stumps with learning rate 0.5 and a fixed seed
adaboost_model = AdaBoostClassifier(
    estimator=base_estimator,
    learning_rate=0.5,
    n_estimators=200,
    random_state=42,
)

# Fit on the TF-IDF training features
adaboost_model.fit(X_train_vec, y_train)



In [20]:
# Score AdaBoost on the held-out test features
y_pred = adaboost_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=["Ham", "Spam"]),
    sep="\n",
)

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

Accuracy: 0.9748878923766816

Classification Report:
              precision    recall  f1-score   support

         Ham       0.98      0.99      0.99       966
        Spam       0.95      0.86      0.90       149

    accuracy                           0.97      1115
   macro avg       0.96      0.93      0.94      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
[[959   7]
 [ 21 128]]


In [27]:
# Persist the fitted AdaBoost model with joblib.
# The TF-IDF vectorizer is already written to disk in the Random Forest section, so it is not saved again here.
# NOTE(review): absolute, machine-specific path.
joblib.dump(adaboost_model, r"E:\Desktop\02VC_Group_09\spamSMS\adaboost_sms.joblib")

['E:\\Desktop\\02VC_Group_09\\spamSMS\\adaboost_sms.joblib']

In [28]:
# Classify one hand-written message with AdaBoost
example_sms = ["Congratulations! You have won a free prize. Call now"]
example_vec = tfidf.transform(example_sms)
prediction = adaboost_model.predict(example_vec)[0]
if prediction == 1:
    print("Prediction:", "Spam")
else:
    print("Prediction:", "Ham")

Prediction: Spam


## 2.2. Gradient Boosting Classifier

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

In [30]:
# Gradient Boosting: 150 depth-3 trees, learning rate 0.1, each stage fitted
# with subsample=0.8, fixed seed
gb_model = GradientBoostingClassifier(
    n_estimators=150,
    max_depth=3,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42,
)

# Fit on the TF-IDF training features, converted to a dense array first
gb_model.fit(X_train_vec.toarray(), y_train)

In [31]:
# Score Gradient Boosting on the held-out test features (dense, matching how it was fitted)
y_pred_gb = gb_model.predict(X_test_vec.toarray())

print("Accuracy:", accuracy_score(y_test, y_pred_gb))

print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_gb, target_names=["Ham", "Spam"]),
    sep="\n",
)

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_gb), sep="\n")

Accuracy: 0.9695067264573991

Classification Report:
              precision    recall  f1-score   support

         Ham       0.97      1.00      0.98       966
        Spam       0.98      0.79      0.87       149

    accuracy                           0.97      1115
   macro avg       0.97      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
[[963   3]
 [ 31 118]]


In [None]:
# Persist the fitted Gradient Boosting model with joblib.
# The TF-IDF vectorizer is already written to disk in the Random Forest section, so it is not saved again here.
# NOTE(review): absolute, machine-specific path.
joblib.dump(gb_model, r"E:\Desktop\02VC_Group_09\spamSMS\gradient_boosting_sms.joblib")

['E:\\Desktop\\02VC_Group_09\\spamSMS\\tfidf_vectorizer.joblib']

In [None]:
# Spot-check the Gradient Boosting model on two hand-written messages
new_sms = [
    "Free entry in weekly competition to win cash prize",
    "I'll call you later when I get home"
]

# Transform with the TF-IDF vectorizer that was already fitted on the training
# split. Do NOT refit it here: every trained model, and the vectorizer saved to
# disk, depends on that exact vocabulary and feature order, so mutating the
# shared vectorizer after training is a hidden-state hazard.
# The model was fitted on a dense array, so densify here as well.
new_sms_vec = tfidf.transform(new_sms).toarray()

# Predict
predictions = gb_model.predict(new_sms_vec)

# Display results
for msg, pred in zip(new_sms, predictions):
    label = "Spam" if pred == 1 else "Ham"
    print(f"Message: {msg}")
    print(f"Prediction: {label}\n")

Message: Free entry in weekly competition to win cash prize
Prediction: Spam

Message: I'll call you later when I get home
Prediction: Ham



In [61]:
# Pass the already-fitted vectorizer directly, like the other predict_sms
# calls in this notebook. Refitting it inline would mutate the shared
# vectorizer that all trained models were built on.
# dense=True because gb_model was fitted on a dense array.
predict_sms(
    "Free entry in weekly competition to win cash prize",
    gb_model,
    tfidf,
    dense=True
)

'Spam'

# 3. Stacking
- Logistic + SVM + RF ==> LR
- CNN1 + CNN2 ==> MLP


## 3.1. Logistic + SVM + RF ==> LR
### Base learners:
+ Logistic Regression
+ Linear SVM
+ Random Forest
### Meta learner:
+ Logistic Regression

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

In [63]:
# Base learners for the stacking ensemble, as (name, estimator) pairs.
base_estimators = [
    ("lr", LogisticRegression(
        max_iter=5000,
        class_weight="balanced"
    )),
    # Same settings as the `svm_base` model used in the bagging section:
    # a fixed seed for reproducible fits and a raised iteration cap.
    ("svm", LinearSVC(
        C=1.0,
        class_weight="balanced",
        max_iter=5000,
        random_state=42
    )),
    ("rf", RandomForestClassifier(
        n_estimators=200,
        n_jobs=-1,
        random_state=42,
        class_weight="balanced"
    ))
]


In [64]:
# Stacking ensemble: the base learners feed a class-balanced Logistic
# Regression meta learner; cv=5 and n_jobs=-1 are passed to the stacker
meta_free_kwargs = None  # placeholder removed below; kept out of the model
del meta_free_kwargs
stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(class_weight="balanced", max_iter=5000),
    n_jobs=-1,
    cv=5,
)

# Fit on the TF-IDF training features
stacking_model.fit(X_train_vec, y_train)


In [65]:
# Score the stacking ensemble on the held-out test features
y_pred = stacking_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))

print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=["Ham", "Spam"]),
    sep="\n",
)

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")


Accuracy: 0.9713004484304932

Classification Report:
              precision    recall  f1-score   support

         Ham       0.99      0.98      0.98       966
        Spam       0.87      0.93      0.90       149

    accuracy                           0.97      1115
   macro avg       0.93      0.95      0.94      1115
weighted avg       0.97      0.97      0.97      1115


Confusion Matrix:
[[945  21]
 [ 11 138]]


In [67]:
# Persist the fitted stacking ensemble with joblib.
# The TF-IDF vectorizer is already written to disk in the Random Forest section, so it is not saved again here.
# NOTE(review): absolute, machine-specific path.
joblib.dump(stacking_model, r"E:\Desktop\02VC_Group_09\spamSMS\stacking_lr_svm_rf.joblib")

['E:\\Desktop\\02VC_Group_09\\spamSMS\\stacking_lr_svm_rf.joblib']

In [68]:
# Spot-check the stacking ensemble through the predict_sms helper
predict_sms(
    message="Free entry in weekly competition to win cash prize",
    model=stacking_model,
    vectorizer=tfidf,
)

'Spam'

## 3.2. CNN1 + CNN2 ==> MLP

In [69]:
# TODO: implement the CNN1 + CNN2 ==> MLP stacking model (placeholder cell, no code yet)