# Import libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.layers import SimpleRNN, Embedding, Dense

# Load the dataset

In [2]:
# Read the raw SMS corpus into a DataFrame (path is relative to the working directory).
csv_path = 'SPAM - Data.csv'
data = pd.read_csv(csv_path)

In [3]:
# Print the frame's column names, non-null counts and dtypes as a quick sanity check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


# Preprocess the data

In [4]:
# Message bodies, in row order, as a NumPy string array.
texts = np.asarray(list(data['Message']))

# Binary target derived from the Category column: 'ham' -> 0, anything else -> 1.
labels = np.asarray([0 if category == 'ham' else 1 for category in data['Category']])

In [5]:
# Spot-check: first message text.
texts[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [6]:
# Spot-check: encoded label of the first message (0 = ham, 1 = anything else).
labels[0]

0

In [7]:
# Tokenization
# Cap the vocabulary at the size of the embedding table the model uses
# (Embedding(max_features, 32) in the model-definition cell). An uncapped
# Tokenizer assigns an index to every distinct word, so any index
# >= max_features would fall outside the embedding lookup range.
max_features = 10000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)
# With num_words set, only the most frequent words are kept when converting
# to sequences, so every emitted id is < max_features. The same tokenizer is
# reused for new messages, so inference sees the identical vocabulary.
sequences = tokenizer.texts_to_sequences(texts)

In [8]:
# Padding
# Bring every token sequence to a common length of max_len so the batch is a
# rectangular integer array (the displayed first row shows zeros at the front).
# NOTE(review): this rebinds `data` from the loaded DataFrame to the padded
# array; re-running the preprocessing cell afterwards requires reloading the CSV.
max_len = 500
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
data = pad_sequences(sequences, maxlen=max_len)

In [9]:
# Spot-check: first padded sequence (length max_len).
data[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [10]:
# Shuffle the data

# Fixed seed so the permutation drawn below is reproducible.
np.random.seed(42)
indices = np.arange(len(data))
np.random.shuffle(indices)
# Apply the same permutation to both arrays so rows and labels stay aligned.
data, labels = data[indices], labels[indices]

In [11]:
# Display the shuffled label vector (NumPy abbreviates long arrays).
labels

array([0, 0, 0, ..., 0, 0, 0])

# Split the data into training and testing sets

In [12]:
# First 80% of the (already shuffled) rows are used for training, the rest for testing.
# The cut-off is derived from the actual row count instead of a hard-coded 5572,
# so the split stays 80/20 if the dataset is filtered or replaced.
training_samples = int(len(data) * 0.8)
X_train, X_test = data[:training_samples], data[training_samples:]
y_train, y_test = labels[:training_samples], labels[training_samples:]

In [13]:
# Number of training rows.
len(X_train)

4457

In [14]:
# Number of held-out test rows.
len(X_test)

1115

# Define the RNN model

In [15]:
# Hyper-parameters for the network and the training run.
max_features = 10000  # number of rows in the embedding table (input_dim)
num_epochs = 10
batch_size = 60

# Embedding -> single SimpleRNN layer -> one sigmoid unit for the binary output.
model = Sequential([
    Embedding(max_features, 32),
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
])

# Compile the model

In [16]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)

# Train the model

In [17]:
# Train on the training split; a 0.2 fraction of it is set aside for validation.
# The returned History object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Evaluate the model

In [19]:
# Model outputs for the test split (not used further in this cell).
predict = model.predict(X_test)
# With the compile settings above, evaluate() yields the loss followed by the 'acc' metric.
accuracy = model.evaluate(X_test, y_test)

print("Test loss is {0:.2f} accuracy is {1:.2f}".format(*accuracy))

Test loss is 0.19 accuracy is 0.93


In [22]:
# Classify one unseen message with the fitted tokenizer and trained model.
nem_message = "Eh u remember how 2 spell his name... Yes i did. He v naughty make until i v wet."

# Same preprocessing as the training data: token ids, then padding to max_len.
nem_message_seq = tokenizer.texts_to_sequences([nem_message])
nem_message_pad = tf.keras.preprocessing.sequence.pad_sequences(
    nem_message_seq,
    maxlen=max_len,
)

prediction = model.predict(nem_message_pad)

# One input row -> one sigmoid score; scores above 0.5 are reported as spam.
if prediction[0][0] > 0.5:
    predicted_label = "spam"
else:
    predicted_label = "ham"

print("Predicted label:", predicted_label)

Predicted label: ham
