In [None]:
# SMS Spam Detection - Google Colab Notebook

# Step 1: Install & Import Dependencies
!pip install nltk scikit-learn tensorflow wordcloud


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
import tensorflow as tf
import joblib
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dropout, LSTM, Bidirectional, Attention
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download the NLTK data packages: the stop-word list, the Punkt tokenizer
# models (both 'punkt' and 'punkt_tab') and OMW 1.4.
for nltk_package in ('stopwords', 'punkt', 'omw-1.4', 'punkt_tab'):
    nltk.download(nltk_package)

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Step 2: Load dataset 1 (tab-separated file; column names are supplied explicitly)
df1 = pd.read_csv(
    "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv",
    sep='\t',
    names=['label', 'message'],
)
# Encode the text labels numerically: ham -> 0, spam -> 1
df1 = df1.assign(label=df1['label'].map({'ham': 0, 'spam': 1}))


In [None]:
# Structural overview of dataset 1.
# A notebook cell only echoes its final expression, so the shape has to be
# printed explicitly; info() prints its own report.
print(df1.shape)
df1.info()
df1.columns

# Class balance of dataset 1 (disabled)
# print(df1['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   int64 
 1   message  5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


Index(['label', 'message'], dtype='object')

In [None]:
# # Columns 2, 3 and 4 consist mostly of missing values, so it is better to drop them (only needed when using the original CSV from different sites)
# df1.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)

# #displaying the edited dataframe
# df1

In [None]:
# Load dataset 2 (Telegram spam dataset) and give it the same column names
# as dataset 1: text_type -> label, text -> message
df2 = (
    pd.read_csv("/content/dataset.csv")
    .rename(columns={'text_type': 'label', 'text': 'message'})
)

# Encode the text labels numerically: ham -> 0, spam -> 1
df2['label'] = df2['label'].map({'ham': 0, 'spam': 1})

# Class balance and structure of dataset 2
print(df2['label'].value_counts())
df2.info()


label
0    14337
1     6011
Name: count, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20348 entries, 0 to 20347
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    20348 non-null  int64 
 1   message  20348 non-null  object
dtypes: int64(1), object(1)
memory usage: 318.1+ KB


In [None]:
# Load dataset 3 (Enron spam dataset).
# The label and message columns are read as strings.
df3 = pd.read_csv(
    "/content/enron_spam_data.csv",
    dtype={'Spam/Ham': str, 'Message': str},
    low_memory=False,
)


In [None]:
# Drop unnamed columns and the metadata columns that are not used for modelling
df3 = df3.loc[:, ~df3.columns.str.contains('Unnamed')]
df3 = df3.drop(columns=['Message ID', 'Subject', 'Date'])

# Rename columns to the schema shared with the other datasets
df3 = df3.rename(columns={'Spam/Ham': 'label', 'Message': 'message'})

# Drop rows with a missing label BEFORE casting to str: astype(str) turns NaN
# into the literal string 'nan', which dropna() no longer treats as missing.
df3 = df3.dropna(subset=['label']).copy()

# Remove leading/trailing spaces from the labels
df3['label'] = df3['label'].astype(str).str.strip()

# Convert cells to a single line (non-string cells such as NaN are left untouched)
df3['message'] = df3['message'].apply(lambda x: x.replace('\n', ' ') if isinstance(x, str) else x)

# Keep only valid labels ("spam" or "ham")
df3 = df3[df3['label'].isin(['spam', 'ham'])].copy()  # .copy() so later column assignments do not warn

In [None]:
# Encode the labels as integers: ham -> 0, spam -> 1
df3['label'] = df3['label'].map({'ham': 0, 'spam': 1}).astype(int)

# Drop rows without message text and renumber the index from 0
df3 = df3.dropna(subset=['message']).reset_index(drop=True)

# Put the columns in the same order as the other datasets
df3 = df3[['label', 'message']]

# Final sanity check
print(df3['label'].unique())  # Should be [0, 1]
# info() prints its report itself and returns None; wrapping it in print()
# only appended a stray "None" to the output.
df3.info()  # Verify non-null values & correct dtypes

[0 1]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33268 entries, 0 to 33267
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    33268 non-null  int64 
 1   message  33268 non-null  object
dtypes: int64(1), object(1)
memory usage: 519.9+ KB
None


In [None]:
# Stack the three datasets into one frame with a fresh, continuous index
df = pd.concat((df1, df2, df3), ignore_index=True)

# Structure of the merged frame, followed by its class balance
df.info()
print(df['label'].value_counts())

# Write the merged data (before de-duplication) to a CSV file in the working directory
df.to_csv("merged_spam_ham.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59188 entries, 0 to 59187
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    59188 non-null  int64 
 1   message  59188 non-null  object
dtypes: int64(1), object(1)
memory usage: 924.9+ KB
label
0    35578
1    23610
Name: count, dtype: int64


In [None]:
# A notebook cell only echoes its final expression, so print the shape explicitly
print(df.shape)
# Number of missing values per column
df.isnull().sum()


Unnamed: 0,0
label,0
message,0


In [None]:
# Count rows that exactly repeat an earlier row (all columns compared)
df.duplicated(keep='first').sum()

4278

In [None]:
# The dataset has duplicate rows, so remove them (the first occurrence is kept)
df = df.drop_duplicates(keep='first')

# Confirm that no duplicates remain. This is printed because only the cell's
# final expression is echoed automatically.
print(df.duplicated().sum())

# Display the edited dataframe (last expression, so it is rendered as a table)
df.head()

0

In [None]:
# Stop words to strip from messages: the NLTK English list minus negations and
# typical spam trigger words, which should survive preprocessing.
# Built once as a set so the per-token membership test below is O(1).
custom_stopwords = set(stopwords.words('english')) - {"no", "not", "won't", "don't", "urgent", "free", "win", "claim", "offer"}

def preprocess_text(text):
    """Normalise a raw message into a space-separated string of tokens.

    Steps, in order: lower-case, remove URLs, remove punctuation, tokenize
    with NLTK's ``word_tokenize`` and drop every token found in
    ``custom_stopwords``.

    Args:
        text: Raw message text. Any non-string value (e.g. None or NaN)
            is treated as an empty message.

    Returns:
        str: The cleaned message; empty if nothing survives the filtering.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Punctuation is stripped before tokenizing, so contractions arrive here
    # without their apostrophe (e.g. "don't" -> "dont").
    text = re.sub(r'[^\w\s]', '', text)
    # Filter against the prepared custom set. The previous code re-read the
    # full NLTK list for every token and ignored the custom list entirely.
    return ' '.join(word for word in word_tokenize(text) if word not in custom_stopwords)


# Build the cleaned text column by passing every raw message through preprocess_text
df['cleaned_message'] = df['message'].map(preprocess_text)

In [None]:
# Number of missing values per column after adding cleaned_message
df.isnull().sum(axis=0)


Unnamed: 0,0
label,0
message,0
cleaned_message,0


In [None]:
# Count rows that repeat an earlier row in every column, cleaned_message included
df.duplicated(keep='first').sum()

0

In [None]:
# Drop rows that repeat an earlier row in every column (label, message and
# cleaned_message), keeping the first occurrence.
# NOTE(review): rows whose raw message differs but whose cleaned_message is
# identical are not treated as duplicates by this all-column comparison.
df = df.drop_duplicates()

# Preview the first five rows of the resulting frame
df.head(5)

Unnamed: 0,label,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


In [None]:
# #Step 4: Exploratory Data Analysis
# plt.pie(df['label'].value_counts(),  labels = ['NOT SPAM', 'SPAM'], autopct = '%0.2f', radius = 0.8)
# plt.show()

In [None]:
# Length-based features computed from the raw (uncleaned) message text:
# number of characters, NLTK word tokens and NLTK sentences per message
df['countCharacters'] = df['message'].map(len)
df['countWords'] = df['message'].map(lambda msg: len(nltk.word_tokenize(msg)))
df['countSentences'] = df['message'].map(lambda msg: len(nltk.sent_tokenize(msg)))


In [None]:
# Summary statistics of the three length features
length_features = ['countCharacters', 'countWords', 'countSentences']
print(df[length_features].describe())

In [None]:
# print(df[df['label'] == 1]['cleaned_message'].head(10))

In [None]:
# Concatenate all cleaned spam messages (label == 1) into one string
is_spam = df['label'] == 1
spam_text = df.loc[is_spam, 'cleaned_message'].str.cat(sep=" ")

# Ensure there's at least one word: fall back to a placeholder token
if not spam_text.strip():
    spam_text = "no_spam_words_found"

spam_wc = WordCloud(width=500, height=500, background_color='white').generate(spam_text)

# Render the cloud without axis ticks
plt.figure(figsize=(12, 6))
plt.imshow(spam_wc)
plt.title("Word Cloud - Spam Messages")
plt.axis("off")
plt.show()


In [None]:
# Concatenate all cleaned non-spam messages (label == 0) into one string
ham_text = df[df['label'] == 0]['cleaned_message'].str.cat(sep=" ")

# Ensure there's at least one word, as the spam word cloud does: generating a
# cloud from text without any words fails, so fall back to a placeholder token.
if len(ham_text.strip()) == 0:
    ham_text = "no_ham_words_found"

ham_wc = WordCloud(width=500, height=500, background_color='white').generate(ham_text)

# Render the cloud without axis ticks
plt.figure(figsize=(12, 6))
plt.imshow(ham_wc)
plt.axis("off")
plt.title("Word Cloud - Non-Spam Messages")
plt.show()

In [None]:
# Pairwise plots of the length features, coloured by class label

pairplot_columns = ['countCharacters', 'countWords', 'countSentences', 'label']
sns.pairplot(df[pairplot_columns], hue="label")
plt.show()

In [None]:
 #Correlation Matrix and Heatmap

# Pairwise correlation between the length features and the label
corr_columns = ['countCharacters', 'countWords', 'countSentences', 'label']
corr_matrix = df[corr_columns].corr(method='pearson')
print(corr_matrix)

# Annotated heatmap of the correlation matrix (values shown with two decimals)
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Split the dataset into train (80%), validation (10%), and test (10%).
# Both splits are stratified on the label so every subset keeps the same
# ham/spam ratio; random_state makes the split reproducible.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['label']
)
# Halve the 20% hold-out into validation and test
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Training set size: 43928
Validation set size: 5491
Test set size: 5491


In [None]:
# TF-IDF Vectorization: the vectorizer is fitted on the training split only
# and then reused unchanged for the validation and test splits
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(train_df['cleaned_message'])
X_val_vec, X_test_vec = (
    vectorizer.transform(split_df['cleaned_message'])
    for split_df in (val_df, test_df)
)

# Target labels for each split
y_train = train_df['label']
y_val = val_df['label']
y_test = test_df['label']





In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features
nb_model = MultinomialNB().fit(X_train_vec, y_train)
# Echo the fitted estimator as the cell output
nb_model

In [None]:
# Metrics reported for each split, in print order
nb_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)

# Evaluate on validation set
y_val_pred = nb_model.predict(X_val_vec)
print("Naive Bayes Validation Set Performance:")
for metric_name, metric_fn in nb_metrics:
    print(f"{metric_name}: {metric_fn(y_val, y_val_pred):.4f}")

# Evaluate on test set
y_test_pred = nb_model.predict(X_test_vec)
print("Naive Bayes Test Set Performance:")
for metric_name, metric_fn in nb_metrics:
    print(f"{metric_name}: {metric_fn(y_test, y_test_pred):.4f}")

Naive Bayes Validation Set Performance:
Accuracy: 0.9390
Precision: 0.9577
Recall: 0.8747
F1 Score: 0.9143
Naive Bayes Test Set Performance:
Accuracy: 0.9421
Precision: 0.9544
Recall: 0.8809
F1 Score: 0.9162


In [None]:
# Step 8: Build Bi-LSTM Model
MAX_WORDS = 5_000  # vocabulary size given to the Tokenizer and the Embedding layer
MAX_LEN = 100  # length every token sequence is padded to


In [None]:
# Fit the tokenizer's word index on the training split only
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(train_df['cleaned_message'])

# Encode the cleaned text of each split as sequences of integer word ids
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_df['cleaned_message'])
    for split_df in (train_df, val_df, test_df)
)

# Bring every sequence to length MAX_LEN
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=MAX_LEN)
    for sequences in (X_train_seq, X_val_seq, X_test_seq)
)





In [None]:
from tensorflow.keras import layers
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

# Define Attention Layer
class AttentionLayer(layers.Layer):
    """Parameter-free re-weighting layer.

    Takes a softmax over axis 1 of the input tensor and multiplies the
    result element-wise with the input, so the output has the same shape
    as the input. The layer defines no weights of its own.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def call(self, inputs):
        """Return ``inputs`` scaled by their own softmax taken along axis 1."""
        # Softmax along axis 1; in this notebook the layer follows an LSTM
        # with return_sequences=True.
        attention_weights = tf.nn.softmax(inputs, axis=1)
        return inputs * attention_weights

# Define the Optimized Bi-LSTM Model with Attention
model = keras.Sequential([
    # Declare the input shape explicitly. The Embedding `input_length`
    # argument used previously is deprecated in current Keras releases.
    keras.Input(shape=(MAX_LEN,)),
    layers.Embedding(MAX_WORDS, 128),

    # First Bi-LSTM layer returns the full sequence so that AttentionLayer
    # can re-weight it, followed by dropout
    Bidirectional(LSTM(128, return_sequences=True)),
    AttentionLayer(),
    Dropout(0.6),

    # Second Bi-LSTM layer collapses the sequence to one vector, followed by dropout
    Bidirectional(LSTM(64)),
    Dropout(0.6),

    # Dense Layers
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define Class Weights: errors on the spam class (1) count 1.5x in the loss
class_weights = {0: 1, 1: 1.5}

# Define Early Stopping: stop after 3 epochs without val_loss improvement
# and roll back to the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the Model
history = model.fit(
    X_train_pad, y_train,
    epochs=10,
    batch_size=256,
    validation_data=(X_val_pad, y_val),
    class_weight=class_weights,
    callbacks=[early_stopping]
)

Epoch 1/10




[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 58ms/step - accuracy: 0.6941 - loss: 0.6925 - val_accuracy: 0.9404 - val_loss: 0.1696
Epoch 2/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 56ms/step - accuracy: 0.9455 - loss: 0.1894 - val_accuracy: 0.9479 - val_loss: 0.1357
Epoch 3/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 58ms/step - accuracy: 0.9622 - loss: 0.1309 - val_accuracy: 0.9530 - val_loss: 0.1426
Epoch 4/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 56ms/step - accuracy: 0.9683 - loss: 0.1125 - val_accuracy: 0.9536 - val_loss: 0.1324
Epoch 5/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 55ms/step - accuracy: 0.9747 - loss: 0.0895 - val_accuracy: 0.9526 - val_loss: 0.1444
Epoch 6/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 53ms/step - accuracy: 0.9745 - loss: 0.0888 - val_accuracy: 0.9550 - val_loss: 0.1346
Epoch 7/10
[1m172/172[0m 

In [None]:

# Evaluate on test set: a predicted probability above 0.3 is labelled spam (1)
lstm_test_probs = model.predict(X_test_pad)
y_test_pred_lstm = (lstm_test_probs > 0.3).astype(int)

print("LSTM Test Set Performance:")
for metric_name, metric_fn in (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
):
    print(f"{metric_name}: {metric_fn(y_test, y_test_pred_lstm):.4f}")

[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step
LSTM Test Set Performance:
Accuracy: 0.9534
Precision: 0.9153
Recall: 0.9589
F1 Score: 0.9366


In [None]:
import joblib
# Step 9: Save Model & Vectorizer
# NOTE(review): the model contains the custom AttentionLayer, so loading this
# file elsewhere presumably needs that class passed in - confirm in the consumer.
model.save("sms_spam_model.h5")

# Pickle the TF-IDF vectorizer and the Keras tokenizer with joblib
for artifact, artifact_path in (
    (vectorizer, "tfidf_vectorizer.pkl"),
    (tokenizer, "tokenizer.pkl"),
):
    joblib.dump(artifact, artifact_path)

print("✅ Model Training & Export Complete!")




✅ Model Training & Export Complete!


In [None]:
def predict_spam(text, threshold=0.5):
    """Classify a single message with the trained Bi-LSTM model.

    The message is cleaned with ``preprocess_text``, encoded with the fitted
    ``tokenizer``, padded to ``MAX_LEN`` and scored by ``model``. The
    predicted probability is printed as a side effect.

    Args:
        text: Raw message text.
        threshold: Probability above which the message is reported as spam.
            Note that the test-set evaluation in this notebook uses 0.3,
            not this default.

    Returns:
        str: "SPAM" if the predicted probability exceeds ``threshold``,
        otherwise "NOT SPAM".
    """
    processed_text = preprocess_text(text)
    text_seq = tokenizer.texts_to_sequences([processed_text])
    # Same padding helper and length as used for the training data
    text_pad = pad_sequences(text_seq, maxlen=MAX_LEN)
    # predict() returns one row with one value for the single message; pull
    # out the scalar so the comparison below is on a plain float rather than
    # on an array. verbose=0 suppresses the per-call progress bar.
    probability = float(model.predict(text_pad, verbose=0)[0][0])
    print(f"Prediction Probability: {probability:.4f}")  # Debugging output
    return "SPAM" if probability > threshold else "NOT SPAM"

In [None]:
# Hand-written sample messages for a quick manual check of predict_spam
test_messages = [
    "Win a free iPhone today! Reply with your details.",
    "Limited offer! Get 50% off on all products, click here!",
    "Hey, how are you doing? Long time no see!",
    "Click here to claim your $500 reward from Amazon!",
    "Urgent: Your Netflix account has been suspended. Click here to reactivate.",
    "You have a message from a friend. Open this link to view it.",
    "Dear customer, your package is waiting for pickup at the nearest location.",
    "Special promotion: Earn money fast by working from home! Contact us now.",
    "Your prize is ready! Claim your winnings before the deadline.",
    "Reminder: Pay your outstanding bill before it's too late.",
    "Don't miss out! Final hours of our mega sale, shop now!",
    "You've been selected for an exclusive giveaway. Act now!",
    "How are you!"
]

# Classify each message with the default threshold and print the verdict
for text in test_messages:
    verdict = predict_spam(text)
    print(f"Message: '{text}' -> Prediction: {verdict}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step
Prediction Probability: 0.9156
Message: 'Win a free iPhone today! Reply with your details.' -> Prediction: SPAM
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
Prediction Probability: 0.9923
Message: 'Limited offer! Get 50% off on all products, click here!' -> Prediction: SPAM
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
Prediction Probability: 0.0168
Message: 'Hey, how are you doing? Long time no see!' -> Prediction: NOT SPAM
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
Prediction Probability: 0.9956
Message: 'Click here to claim your $500 reward from Amazon!' -> Prediction: SPAM
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
Prediction Probability: 0.9845
Message: 'Urgent: Your Netflix account has been suspended. Click here to reactivate.' -> Prediction: SPAM
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m