# General Steps
 - Load cleaned data
 - Train-Test split
 - TF-IDF vectorization

In [None]:
# Core libraries
import pandas as pd
import numpy as np

# Train / test split
from sklearn.model_selection import train_test_split

# Text vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)

# Model persistence
import joblib


In [3]:
# Load the preprocessed SMS dataset (the columns used below are 'label' and 'clean_message').
# NOTE(review): DATA_PATH is an absolute, machine-specific path - adjust it for your environment.
DATA_PATH = "E:\\Desktop\\02VC_Group_09\\datasets\\SMS_spam_collection\\sms_clean.csv"
df = pd.read_csv(DATA_PATH)

# Encode labels: ham -> 0, spam -> 1
LABEL_MAP = {'ham': 0, 'spam': 1}
encoded_labels = df['label'].map(LABEL_MAP)

# Series.map() converts every value that is not a key of the dict into NaN
# without any warning. Fail fast here, with the offending values in the
# message, instead of letting NaN targets reach the split / training cells.
unmapped = encoded_labels.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected values in 'label' column (expected 'ham' or 'spam'): "
        f"{df.loc[unmapped, 'label'].unique().tolist()}"
    )
df['label'] = encoded_labels

# Features (cleaned message text) and target (encoded label)
X = df['clean_message']
y = df['label']

print("Dataset size:", df.shape)
print(df.head())


Dataset size: (5572, 2)
   label                                      clean_message
0      0  go jurong point crazi avail bugi n great world...
1      0                              ok lar joke wif u oni
2      1  free entri wkli comp win fa cup final tkt st m...
3      0                u dun say earli hor u c alreadi say
4      0          nah dont think goe usf live around though


In [4]:
# Hold out 20% of the samples for testing. Stratifying on y preserves the
# class distribution in both splits; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 4457
Testing samples: 1115


In [5]:
# Replace missing messages with empty strings in both splits, so the
# vectorization step below receives only string values.
X_train, X_test = X_train.fillna(''), X_test.fillna('')

# Sanity check: both counts should now be zero
n_missing_train = X_train.isna().sum()
n_missing_test = X_test.isna().sum()
print("Missing values in training set:", n_missing_train)
print("Missing values in test set:", n_missing_test)

Missing values in training set: 0
Missing values in test set: 0


In [6]:
# TF-IDF over unigrams + bigrams; min_df=2 ignores terms that appear in fewer
# than 2 training documents, and max_features caps the vocabulary at 3000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_features=3000)

# Learn the vocabulary and IDF weights from the training split only, then
# apply the already-fitted vectorizer to the test split.
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

print("TF-IDF feature shape:", X_train_vec.shape)


TF-IDF feature shape: (4457, 3000)


# 1. Bagging
- Random Forest
- Bagging Classifier + SVM


## 1.1. Random Forest

In [None]:
# Random Forest model
from sklearn.ensemble import RandomForestClassifier

In [7]:
# Random Forest hyperparameters, gathered in one place for easy tuning
rf_params = {
    "n_estimators": 200,         # number of trees
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "class_weight": "balanced",  # handle class imbalance
    "n_jobs": -1,                # use all CPU cores
    "random_state": 42,
}
rf_model = RandomForestClassifier(**rf_params)

# Fit the forest on the TF-IDF training matrix
rf_model.fit(X_train_vec, y_train)


In [8]:
# Predict labels for the held-out test set
y_pred = rf_model.predict(X_test_vec)

# Compute all metrics first, then report them together.
# target_names follows the label encoding used earlier: 0 -> Ham, 1 -> Spam.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Ham', 'Spam'])
matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)

print("\nClassification Report:")
print(report)

print("\nConfusion Matrix:")
print(matrix)


Accuracy: 0.9766816143497757

Classification Report:
              precision    recall  f1-score   support

         Ham       0.97      1.00      0.99       966
        Spam       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.95      1115
weighted avg       0.98      0.98      0.98      1115


Confusion Matrix:
[[966   0]
 [ 26 123]]


In [11]:
# Persist the trained classifier and the fitted vectorizer side by side;
# both are needed to score new messages later.
RF_MODEL_PATH = r"E:\Desktop\02VC_Group_09\spamSMS\random_forest_sms.joblib"
TFIDF_PATH = r"E:\Desktop\02VC_Group_09\spamSMS\tfidf_vectorizer.joblib"

joblib.dump(rf_model, RF_MODEL_PATH)
joblib.dump(tfidf, TFIDF_PATH)

['E:\\Desktop\\02VC_Group_09\\spamSMS\\tfidf_vectorizer.joblib']

## 1.2. Bagging Classifier + SVM

# 2. Boosting
- AdaBoost
- Gradient Boosting Classifier

## 2.1. AdaBoost

In [None]:
# TODO: implement the AdaBoost model (section 2.1)

## 2.2. Gradient Boosting Classifier

In [None]:
# TODO: implement the Gradient Boosting Classifier model (section 2.2)

# 3. Stacking
- Logistic + SVM + RF ==> LR
- CNN1 + CNN2 ==> MLP


## 3.1. Logistic + SVM + RF ==> LR

In [None]:
# TODO: implement the Logistic + SVM + RF ==> LR stacking model (section 3.1)

## 3.2. CNN1 + CNN2 ==> MLP

In [None]:
# TODO: implement the CNN1 + CNN2 ==> MLP stacking model (section 3.2)