In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import nltk
from nltk.corpus import stopwords
import string

# Load dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

# Use only necessary columns and rename them
df.columns = ['Class', 'Message']

# Encode labels: ham=0, spam=1
df['Class'] = df['Class'].map({'ham': 0, 'spam': 1})

# Download nltk stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase
    text = ''.join(ch for ch in text if ch not in punctuations)  # Remove punctuation
    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

# Apply preprocessing
df['cleaned_text'] = df['Message'].apply(preprocess_text)

# Parameters for tokenization and padding
max_words = 5000
max_seq_len = 100

# Tokenizer
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])

# Padding sequences to fixed length
X = pad_sequences(sequences, maxlen=max_seq_len)
y = df['Class'].values

# Split dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build RNN model
embedding_dim = 50
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_seq_len),
    SimpleRNN(64, activation='tanh'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

# Evaluate the model
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1-score: {f1:.4f}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mohan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Epoch 1/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.8672 - loss: 0.3858 - val_accuracy: 0.9574 - val_loss: 0.1491
Epoch 2/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9758 - loss: 0.0900 - val_accuracy: 0.9596 - val_loss: 0.1419
Epoch 3/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9904 - loss: 0.0314 - val_accuracy: 0.9664 - val_loss: 0.1067
Epoch 4/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9956 - loss: 0.0221 - val_accuracy: 0.9529 - val_loss: 0.1712
Epoch 5/5
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.9908 - loss: 0.0372 - val_accuracy: 0.9664 - val_loss: 0.1233
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step
Test Accuracy: 0.9767
Test Precision: 0.9769
Test Recall: 0.8467
Test F1-score: 0.9071
