#SMS SPAM DETECTION USING RNN

## Loading Libraries


In [1]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

## Loading Dataset

In [2]:
def load_data(filepath):
 # Read the data - tab separated
 data = pd.read_csv(filepath, sep='\t', names=['label', 'text'])
 return data

In [3]:
df = load_data('/content/SMSSpamCollection')

## Viewing Dataset

In [4]:
df.head(5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Preprocessing

In [5]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
from sklearn.preprocessing import LabelEncoder

# Loading dataset (assuming df has 'text' and 'label' columns)
max_words = 5000  # Vocabulary size
max_len = 100  # Max sequence length

# Tokenization
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(sequences, maxlen=max_len)


label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

## Preprocessing Info


In [7]:
print("\nPreprocessing Info:")
print(f"Vocabulary size: {len(tokenizer.word_index) + 1}")
print(f"Sequence length: {max_len}")
print(f"Total samples: {len(X)}")


Preprocessing Info:
Vocabulary size: 9011
Sequence length: 100
Total samples: 5572


## Building RNN model




In [8]:
vocab_size = min(len(tokenizer.word_index) + 1, max_words)
inputs = tf.keras.Input(shape=(max_len,))
x = tf.keras.layers.Embedding(vocab_size, 128)(inputs)
x = tf.keras.layers.SimpleRNN(64, return_sequences=True)(x)
x = tf.keras.layers.SimpleRNN(32)(x)
x = tf.keras.layers.Dense(16, activation='relu')(x)
x = tf.keras.layers.Dropout(0.5)(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

## Model Compiling

In [9]:
model.compile(optimizer='adam',
 loss='binary_crossentropy',
 metrics=['accuracy'])

## Trainin the model

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',   # Monitor validation loss
    patience=3,           # Stop training if val_loss doesn't improve for 3 consecutive epochs
    restore_best_weights=True  # Restore the best model weights after training stops
)


history = model.fit(
    X_train, y_train,
    epochs=10,           # Increased epochs to allow for better training
    batch_size=32,       # Mini-batch size of 32
    validation_data=(X_test, y_test),  # Use X_test and y_test for validation
    callbacks=[early_stopping]
)

Epoch 1/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 50ms/step - accuracy: 0.8602 - loss: 0.4200 - val_accuracy: 0.9812 - val_loss: 0.0844
Epoch 2/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.9805 - loss: 0.0746 - val_accuracy: 0.9830 - val_loss: 0.0486
Epoch 3/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9924 - loss: 0.0272 - val_accuracy: 0.9901 - val_loss: 0.0407
Epoch 4/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.9968 - loss: 0.0174 - val_accuracy: 0.9919 - val_loss: 0.0433
Epoch 5/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9971 - loss: 0.0112 - val_accuracy: 0.9928 - val_loss: 0.0472
Epoch 6/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9975 - loss: 0.0045 - val_accuracy: 0.9910 - val_loss: 0.0525


## Model Evaluation

In [12]:
loss,accuracy = model.evaluate(X_test,y_test)
print("Accuracy:",accuracy)
print("Loss:",loss)

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9891 - loss: 0.0420
Accuracy: 0.9901345372200012
Loss: 0.0406571701169014


## Prediction

In [13]:
def predict_sms_spam(text, max_length=100):

    sequences = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequences, maxlen=max_length, padding='post')
    prediction = model.predict(padded_sequence)
    return "Spam" if prediction[0] > 0.5 else "Not Spam"

sms_text = "Congratulations! You've won a $1000 gift card. Claim now."
print(predict_sms_spam(sms_text))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step   
Not Spam
