## Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
import tensorflow as tf

## Reading the dataset

In [2]:
# Load the raw spam dataset from the working directory, decoding the file as latin-1.
df = pd.read_csv(
    "spam.csv",
    sep=",",
    encoding="latin-1",
)
# Bare last expression so the notebook renders the loaded frame as the cell output.
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


## Preprocessing

In [3]:
# Keep only the label and message columns and give them descriptive names.
#
# This is written as a non-mutating chain with errors="ignore" so the cell can
# be executed more than once on the same kernel: an in-place drop raises
# KeyError on the second run because the "Unnamed: *" columns are already gone.
# rename() ignores keys that are absent, so it is a no-op on a re-run as well.
df = (
    df
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")
    .rename(columns={"v1": "Labels", "v2": "Text"})
)

# Fail here, with the offending column list, if the CSV layout is not the
# two-column one the following cells rely on (df.Labels / df.Text).
assert list(df.columns) == ["Labels", "Text"], df.columns.tolist()

# Summarise dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Labels  5572 non-null   object
 1   Text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [4]:
# Model inputs: the raw message strings.
X = df["Text"]

# Encode the string class labels as integers; the fitted encoder is kept in
# `le` so the integer codes can be mapped back to label names later.
le = LabelEncoder()

# Encode and reshape in one step to a column vector of shape (n_samples, 1).
Y = le.fit_transform(df["Labels"]).reshape(-1, 1)

In [5]:
# Vocabulary cap handed to the tokenizer, and the fixed length of every padded sequence.
max_words, max_length = 1000, 150

# NOTE(review): the tokenizer is fitted on every message, i.e. before the
# train/test split performed in a later cell, so the vocabulary is also
# informed by text that ends up in the held-out set.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)

# Convert each message to a list of integer word ids, then pad/truncate every
# list to exactly `max_length` entries so the result is a rectangular matrix.
sequences = tokenizer.texts_to_sequences(X)
sequences_matrix = sequence.pad_sequences(
    sequences,
    maxlen=max_length,
)

# Display (n_messages, max_length) as the cell output.
sequences_matrix.shape

(5572, 150)

In [6]:
# Hold out 15% of the rows for the final evaluation. Shuffling is enabled and
# seeded with random_state=0, so the same rows land in each split on every run.
x_train, x_test, y_train, y_test = train_test_split(
    sequences_matrix,
    Y,
    test_size=0.15,
    random_state=0,
    shuffle=True,
)

# Display the shapes of the four resulting arrays as a tuple of tuples.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((4736, 150), (836, 150), (4736, 1), (836, 1))

## Creating the LSTM model

In [7]:
# Seed TensorFlow's global random generator before any layer is created.
# The train/test split is seeded (random_state=0) but nothing seeded the
# network, so weight initialisation -- and therefore every reported metric --
# changed on each Restart & Run All. Some GPU kernels may still introduce
# small run-to-run differences.
tf.random.set_seed(0)

model = tf.keras.Sequential([
    # Maps each integer word id (vocabulary size `max_words`) to a 128-d vector;
    # the input is a sequence of `max_length` ids per message.
    tf.keras.layers.Embedding(max_words, output_dim=128, input_shape=(max_length,)),
    # Reads the embedded sequence and emits a single 128-d vector per message.
    tf.keras.layers.LSTM(128),
    # Hidden fully-connected layer on top of the sequence summary.
    tf.keras.layers.Dense(64, activation="relu"),
    # Single sigmoid unit: a probability-like score for the positive class.
    tf.keras.layers.Dense(1, activation="sigmoid")
])

# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 128)          128000    
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 267,905
Trainable params: 267,905
Non-trainable params: 0
_________________________________________________________________


## Compile and Train the LSTM model

In [8]:
# Configure training: binary cross-entropy pairs with the model's single
# sigmoid output unit, Adam runs with a learning rate of 3e-4, and accuracy is
# reported alongside the loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train for 8 epochs in mini-batches of 128. validation_split=0.2 sets aside a
# fifth of the training arrays for per-epoch validation instead of fitting on them.
# Left as the last expression so the returned History object is displayed.
model.fit(
    x=x_train,
    y=y_train,
    epochs=8,
    batch_size=128,
    validation_split=0.2,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f22ce216e50>

## Save the LSTM Spam Classifier

In [9]:
# Persist the trained network (architecture + weights) to an HDF5 file.
model.save("spam_classifier.h5")

# The network consumes integer word ids, not text, so the saved file alone
# cannot classify a new message: the word -> id mapping lives only in the
# fitted `tokenizer`. Store it next to the model so both can be reloaded
# together (tf.keras.preprocessing.text.tokenizer_from_json restores it).
with open("spam_tokenizer.json", "w", encoding="utf-8") as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

## Test the model

In [10]:
# Score the trained model on the held-out split. The model was compiled with
# one metric, so evaluate() yields two values: the loss, then the accuracy.
results = model.evaluate(x_test, y_test)
test_loss, test_accuracy = results

# Report the loss to two decimals and the accuracy as a percentage.
print(f'Test Loss: {test_loss:.2f}\tTest Accuracy: {test_accuracy*100:.2f}%')

Test Loss: 0.05	Test Accuracy: 98.68%
