In [None]:
# T·∫° Cao S∆°n - B22DCVT445
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Download NLTK data
nltk.download('stopwords', quiet=True)

print(f"TensorFlow version: {tf.__version__}")
print(f"GPU available: {tf.config.list_physical_devices('GPU')}")

In [None]:
# T·∫° Cao S∆°n - B22DCVT445

# Load dataset 1: combined_dataset.csv
df1 = pd.read_csv('combined_dataset.csv')
df1.columns = ['label', 'text']
print(f"Dataset 1 shape: {df1.shape}")
print(f"Dataset 1 head:\n{df1.head()}\n")

# Load dataset 2: spam_ham_dataset.csv
df2 = pd.read_csv('spam_ham_dataset.csv')
df2 = df2[['label', 'text']]
print(f"Dataset 2 shape: {df2.shape}")
print(f"Dataset 2 head:\n{df2.head()}\n")

# Load dataset 3: spam.csv
df3 = pd.read_csv('spam.csv', encoding='latin-1')
df3 = df3.iloc[:, :2]  # Only take first 2 columns
df3.columns = ['label', 'text']
print(f"Dataset 3 shape: {df3.shape}")
print(f"Dataset 3 head:\n{df3.head()}\n")

# Merge all datasets
df = pd.concat([df1, df2, df3], ignore_index=True)
print(f"\nMerged dataset shape: {df.shape}")
print(f"Label distribution:\n{df['label'].value_counts()}")

In [None]:

# Check for missing values
print("Missing values:")
print(df.isnull().sum())

# Remove duplicates
df = df.drop_duplicates()
print(f"\nAfter removing duplicates: {df.shape}")

# Visualize label distribution
plt.figure(figsize=(8, 5))
df['label'].value_counts().plot(kind='bar', color=['green', 'red'])
plt.title('Distribution of Spam vs Ham')
plt.xlabel('Label')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.show()

# Text length distribution
df['text_length'] = df['text'].apply(len)
print(f"\nText length statistics:")
print(df.groupby('label')['text_length'].describe())

In [None]:
# T·∫° Cao S∆°n - B22DCVT445
def preprocess_text(text):
    """
    Ti·ªÅn x·ª≠ l√Ω vƒÉn b·∫£n:
    - Chuy·ªÉn v·ªÅ ch·ªØ th∆∞·ªùng
    - Lo·∫°i b·ªè s·ªë
    - Lo·∫°i b·ªè d·∫•u c√¢u
    - Lo·∫°i b·ªè stopwords
    - Stemming
    """
    # Convert to lowercase
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    # Remove stopwords and apply stemming
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    return ' '.join(words)

# Apply preprocessing
print("Preprocessing text data...")
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Show examples
print("\nOriginal vs Cleaned examples:")
for i in range(3):
    print(f"\nOriginal: {df['text'].iloc[i][:100]}...")
    print(f"Cleaned: {df['cleaned_text'].iloc[i][:100]}...")

In [None]:
# T·∫° Cao S∆°n - B22DCVT445

# Convert labels to binary (spam=1, ham=0)
df['label_binary'] = df['label'].map({'spam': 1, 'ham': 0})

# Split features and labels
X = df['cleaned_text'].values
y = df['label_binary'].values

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"Training label distribution: {np.bincount(y_train)}")
print(f"Testing label distribution: {np.bincount(y_test)}")

In [None]:
# T·∫° Cao S∆°n - B22DCVT445

# Tokenization parameters
MAX_WORDS = 10000
MAX_LEN = 200
EMBEDDING_DIM = 128

# Initialize tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert text to sequences
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN, padding='post', truncating='post')

print(f"Vocabulary size: {len(tokenizer.word_index)}")
print(f"Training data shape: {X_train_pad.shape}")
print(f"Testing data shape: {X_test_pad.shape}")
print(f"\nExample sequence: {X_train_pad[0][:20]}")

In [None]:
def build_dense_model():
    """
    M√¥ h√¨nh Dense v·ªõi 7 layers:
    1. Embedding layer
    2. GlobalMaxPooling1D
    3-7. Dense layers v·ªõi Dropout
    """
    model = Sequential([
        # Layer 1: Embedding
        Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIM, input_length=MAX_LEN),
        
        # Layer 2: Global Max Pooling
        GlobalMaxPooling1D(),
        
        # Layer 3: Dense layer
        Dense(256, activation='relu'),
        Dropout(0.5),
        
        # Layer 4: Dense layer
        Dense(128, activation='relu'),
        Dropout(0.4),
        
        # Layer 5: Dense layer
        Dense(64, activation='relu'),
        Dropout(0.3),
        
        # Layer 6: Dense layer
        Dense(32, activation='relu'),
        Dropout(0.2),
        
        # Layer 7: Output layer
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Build Dense model
dense_model = build_dense_model()
dense_model.summary()

In [None]:
# T·∫° Cao S∆°n - B22DCVT445

# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7, verbose=1)

# Train Dense model
print("Training Dense model...")
history_dense = dense_model.fit(
    X_train_pad, y_train,
    epochs=30,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

In [None]:
# T·∫° Cao S∆°n - B22DCVT445

# Evaluate Dense model
y_pred_dense = (dense_model.predict(X_test_pad) > 0.5).astype(int)

print("="*50)
print("DENSE MODEL EVALUATION")
print("="*50)
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_dense):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred_dense, target_names=['Ham', 'Spam'])}")

# Confusion Matrix
cm_dense = confusion_matrix(y_test, y_pred_dense)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_dense, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.title('Confusion Matrix - Dense Model')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot training history
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].plot(history_dense.history['accuracy'], label='Train Accuracy')
axes[0].plot(history_dense.history['val_accuracy'], label='Val Accuracy')
axes[0].set_title('Dense Model - Accuracy')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Accuracy')
axes[0].legend()
axes[0].grid(True)

axes[1].plot(history_dense.history['loss'], label='Train Loss')
axes[1].plot(history_dense.history['val_loss'], label='Val Loss')
axes[1].set_title('Dense Model - Loss')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Loss')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()

In [None]:
def build_lstm_model():
    """
    M√¥ h√¨nh LSTM v·ªõi 7 layers:
    1. Embedding layer
    2. Bidirectional LSTM
    3. LSTM layer
    4-6. Dense layers v·ªõi Dropout
    7. Output layer
    """
    model = Sequential([
        # Layer 1: Embedding
        Embedding(input_dim=MAX_WORDS, output_dim=EMBEDDING_DIM, input_length=MAX_LEN),
        
        # Layer 2: Bidirectional LSTM
        Bidirectional(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
        
        # Layer 3: LSTM layer
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        
        # Layer 4: Dense layer
        Dense(128, activation='relu'),
        Dropout(0.5),
        
        # Layer 5: Dense layer
        Dense(64, activation='relu'),
        Dropout(0.4),
        
        # Layer 6: Dense layer
        Dense(32, activation='relu'),
        Dropout(0.3),
        
        # Layer 7: Output layer
        Dense(1, activation='sigmoid')
    ])
    
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

# Build LSTM model
lstm_model = build_lstm_model()
lstm_model.summary()

In [None]:
# Train LSTM model
print("Training LSTM model...")
history_lstm = lstm_model.fit(
    X_train_pad, y_train,
    epochs=30,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

In [None]:
# Evaluate LSTM model
y_pred_lstm = (lstm_model.predict(X_test_pad) > 0.5).astype(int)

print("="*50)
print("LSTM MODEL EVALUATION")
print("="*50)
print(f"\nAccuracy: {accuracy_score(y_test, y_pred_lstm):.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred_lstm, target_names=['Ham', 'Spam'])}")

# Confusion Matrix
cm_lstm = confusion_matrix(y_test, y_pred_lstm)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_lstm, annot=True, fmt='d', cmap='Greens', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.title('Confusion Matrix - LSTM Model')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Plot training history
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].plot(history_lstm.history['accuracy'], label='Train Accuracy')
axes[0].plot(history_lstm.history['val_accuracy'], label='Val Accuracy')
axes[0].set_title('LSTM Model - Accuracy')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Accuracy')
axes[0].legend()
axes[0].grid(True)

axes[1].plot(history_lstm.history['loss'], label='Train Loss')
axes[1].plot(history_lstm.history['val_loss'], label='Val Loss')
axes[1].set_title('LSTM Model - Loss')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Loss')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Compare model performance
comparison_data = {
    'Model': ['Dense', 'LSTM'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_dense),
        accuracy_score(y_test, y_pred_lstm)
    ]
}

comparison_df = pd.DataFrame(comparison_data)
print("="*50)
print("MODEL COMPARISON")
print("="*50)
print(comparison_df.to_string(index=False))

# Bar chart comparison
plt.figure(figsize=(10, 6))
plt.bar(comparison_df['Model'], comparison_df['Accuracy'], color=['blue', 'green'], alpha=0.7)
plt.ylim(0.9, 1.0)
plt.ylabel('Accuracy')
plt.title('Model Comparison: Dense vs LSTM')
plt.grid(axis='y', alpha=0.3)

for i, v in enumerate(comparison_df['Accuracy']):
    plt.text(i, v + 0.002, f'{v:.4f}', ha='center', fontweight='bold')

plt.show()

# Training history comparison
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].plot(history_dense.history['accuracy'], label='Dense Train', linestyle='-', color='blue')
axes[0].plot(history_dense.history['val_accuracy'], label='Dense Val', linestyle='--', color='blue')
axes[0].plot(history_lstm.history['accuracy'], label='LSTM Train', linestyle='-', color='green')
axes[0].plot(history_lstm.history['val_accuracy'], label='LSTM Val', linestyle='--', color='green')
axes[0].set_title('Training Accuracy Comparison')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Accuracy')
axes[0].legend()
axes[0].grid(True)

axes[1].plot(history_dense.history['loss'], label='Dense Train', linestyle='-', color='blue')
axes[1].plot(history_dense.history['val_loss'], label='Dense Val', linestyle='--', color='blue')
axes[1].plot(history_lstm.history['loss'], label='LSTM Train', linestyle='-', color='green')
axes[1].plot(history_lstm.history['val_loss'], label='LSTM Val', linestyle='--', color='green')
axes[1].set_title('Training Loss Comparison')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Loss')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()

In [None]:
def predict_message(text, model, model_name):
    """Predict if a message is spam or ham"""
    # Preprocess
    cleaned = preprocess_text(text)
    
    # Tokenize and pad
    sequence = tokenizer.texts_to_sequences([cleaned])
    padded = pad_sequences(sequence, maxlen=MAX_LEN, padding='post', truncating='post')
    
    # Predict
    prediction = model.predict(padded, verbose=0)[0][0]
    label = 'SPAM' if prediction > 0.5 else 'HAM'
    confidence = prediction if prediction > 0.5 else 1 - prediction
    
    print(f"\n{model_name} Model:")
    print(f"Message: {text[:100]}...")
    print(f"Prediction: {label} (Confidence: {confidence:.2%})")
    
    return label, confidence

# Test samples
test_messages = [
    "Congratulations! You've won a $1000 gift card. Click here to claim now!",
    "Hey, are we still meeting for lunch tomorrow at noon?",
    "URGENT: Your account has been suspended. Verify your identity immediately.",
    "Thanks for the meeting notes. I'll review them and get back to you.",
    "Free entry in 2 a weekly comp to win iPhone. Text WIN to 12345 now!"
]

print("="*70)
print("TESTING WITH SAMPLE MESSAGES")
print("="*70)

for i, msg in enumerate(test_messages, 1):
    print(f"\n{'='*70}")
    print(f"Test Message {i}:")
    print(f"{'='*70}")
    
    # Predict with both models
    dense_label, dense_conf = predict_message(msg, dense_model, "Dense")
    lstm_label, lstm_conf = predict_message(msg, lstm_model, "LSTM")

## 16. Nh·∫≠n x√©t v√† ph√¢n t√≠ch k·∫øt qu·∫£

### üìä K·∫øt qu·∫£ chi ti·∫øt:

#### **1. M√¥ h√¨nh Dense Network:**
- **Accuracy:** 97.77% ‚≠ê‚≠ê‚≠ê‚≠ê‚≠ê
- **Precision (Ham):** 99% - R·∫•t t·ªët trong vi·ªác nh·∫≠n di·ªán tin nh·∫Øn kh√¥ng ph·∫£i spam
- **Precision (Spam):** 95% - T·ªët trong vi·ªác ph√°t hi·ªán spam
- **Recall (Ham):** 98% - √çt b·ªè s√≥t tin nh·∫Øn ham
- **Recall (Spam):** 96% - Ph√°t hi·ªán ƒë∆∞·ª£c h·∫ßu h·∫øt spam
- **F1-Score:** 0.97-0.99 (r·∫•t c√¢n b·∫±ng)

**Confusion Matrix Dense:**
- True Ham: 2313, False Spam: 41 (ch·ªâ 1.7% ham b·ªã nh·∫≠n nh·∫ßm)
- True Spam: 762, False Ham: 29 (ch·ªâ 3.7% spam b·ªã b·ªè s√≥t)

---

#### **2. M√¥ h√¨nh LSTM:**
- **Accuracy:** 80.51% ‚≠ê‚≠ê‚≠ê
- **Precision (Ham):** 87%
- **Precision (Spam):** 61% - Th·∫•p h∆°n ƒë√°ng k·ªÉ
- **Recall (Ham):** 87%
- **Recall (Spam):** 62% - B·ªè s√≥t nhi·ªÅu spam
- **F1-Score:** 0.62-0.87 (kh√¥ng c√¢n b·∫±ng)

**Confusion Matrix LSTM:**
- True Ham: 2042, False Spam: 312 (13.2% ham b·ªã nh·∫≠n nh·∫ßm - cao!)
- True Spam: 490, False Ham: 301 (38% spam b·ªã b·ªè s√≥t - r·∫•t cao!)

---

### üîç Ph√¢n t√≠ch nguy√™n nh√¢n:

#### **T·∫°i sao Dense model t·ªët h∆°n LSTM (97.77% vs 80.51%)?**

**1. ƒê·∫∑c ƒëi·ªÉm d·ªØ li·ªáu:**
- Spam detection d·ª±a nhi·ªÅu v√†o **t·ª´ kh√≥a ƒë·∫∑c tr∆∞ng** ("free", "win", "congratulations", "claim", "urgent")
- **Kh√¥ng c·∫ßn context tu·∫ßn t·ª±** - ch·ªâ c·∫ßn bi·∫øt c√≥ xu·∫•t hi·ªán t·ª´ spam hay kh√¥ng
- Dense model v·ªõi GlobalMaxPooling1D b·∫Øt ƒë∆∞·ª£c c√°c t·ª´ kh√≥a quan tr·ªçng nh·∫•t

**2. Ki·∫øn tr√∫c ph√π h·ª£p:**
- **Dense model:** Nh·∫π, nhanh, ƒë∆°n gi·∫£n - ph√π h·ª£p v·ªõi bag-of-words features
- **LSTM model:** Ph·ª©c t·∫°p h∆°n, c·∫ßn h·ªçc sequential patterns - **overkill** cho b√†i to√°n n√†y
- LSTM c√≥ th·ªÉ b·ªã **underfitting** do:
  - Training ch·ªâ 5 epochs (stopped early)
  - C·∫ßn nhi·ªÅu data v√† epochs h∆°n ƒë·ªÉ converge

**3. Training behavior:**
- **Dense:** Converge nhanh (epoch 2 ƒë√£ ƒë·∫°t ~99% train accuracy), stable
- **LSTM:** H·ªçc ch·∫≠m h∆°n, validation loss dao ƒë·ªông (30% cu·ªëi c√πng), ch∆∞a converge t·ªët

**4. Overfitting:**
- **Dense:** Val accuracy = 98.5%, Train accuracy = 100% ‚Üí generalization t·ªët
- **LSTM:** Val accuracy cao h∆°n train accuracy ban ƒë·∫ßu, sau ƒë√≥ dao ƒë·ªông ‚Üí kh√¥ng ·ªïn ƒë·ªãnh

---

### ‚úÖ K·∫øt lu·∫≠n:

**M√¥ h√¨nh Dense Network l√† l·ª±a ch·ªçn t·ªët nh·∫•t** cho b√†i to√°n spam detection n√†y v√¨:

1. ‚úÖ **Hi·ªáu su·∫•t cao:** 97.77% accuracy
2. ‚úÖ **Precision/Recall c√¢n b·∫±ng:** C·∫£ 2 class ƒë·ªÅu > 95%
3. ‚úÖ **Training nhanh:** Ch·ªâ 8 epochs, m·ªói epoch < 1 gi√¢y
4. ‚úÖ **ƒê∆°n gi·∫£n, d·ªÖ deploy:** √çt parameters h∆°n LSTM
5. ‚úÖ **Robust:** Kh√¥ng overfitting, validation loss stable

**LSTM kh√¥ng ph√π h·ª£p** trong tr∆∞·ªùng h·ª£p n√†y v√¨:
- ‚ùå Qu√° ph·ª©c t·∫°p cho b√†i to√°n keyword-based
- ‚ùå Training ch·∫≠m (61ms/step vs 1ms/step c·ªßa Dense)
- ‚ùå K·∫øt qu·∫£ k√©m h∆°n nhi·ªÅu (80% vs 97%)
- ‚ùå B·ªè s√≥t 38% spam (301/791) - kh√¥ng ch·∫•p nh·∫≠n ƒë∆∞·ª£c

---

### üí° Khuy·∫øn ngh·ªã:

**S·ª≠ d·ª•ng Dense model** cho production v√¨:
- High accuracy + Fast inference
- Low false positive rate (ch·ªâ 1.7%)
- Low false negative rate (ch·ªâ 3.7%)

**ƒê·ªÉ c·∫£i thi·ªán LSTM** (n·∫øu mu·ªën th·ª≠):
- Train nhi·ªÅu epochs h∆°n (20-30 epochs)
- TƒÉng batch size
- Th·ª≠ learning rate kh√°c
- Th√™m data augmentation
- Nh∆∞ng v·∫´n kh√≥ v∆∞·ª£t qua Dense model cho b√†i to√°n n√†y!