In [None]:
import keras, re
import pandas as pd
from keras import ops
from keras import layers 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
class TransformerBlock(layers.Layer):
    """
    Taken from Keras Tutorial
    https://keras.io/examples/nlp/text_classification_with_transformer/
    """
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)


In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = ops.shape(x)[-1]
        positions = ops.arange(start=0, stop=maxlen, step=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


In [None]:
def tokenize(text):
    '''
    Simple text tokenizer that is used with the Term Frequency Inverse Document Frequency Vectorizer.\n 
    The way this function works is that it takes stop words from the Natural Language Toolkit NTLK[https://www.nltk.org/] 
    Passes them into a regex and uses WordNetLemmatizer to tokenize the text.
    :param text: to be tokenized
    :return: text tokens
    '''    
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()

    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    tokens = word_tokenize(text)

    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return tokens

In [7]:
sms = pd.read_csv('SMSSpamCollection', sep="\t", names=['label', 'text'])
all_words = CountVectorizer(tokenizer=tokenize)
word_counts = all_words.fit_transform(sms['text'].values)

sms['label'] = sms['label'].map({'spam': 1, 'ham': 0})
y = sms['label'].values

X_train, X_test, y_train, y_test = train_test_split(word_counts.toarray(), y, train_size=0.8)


X_train = keras.utils.pad_sequences(X_train, maxlen=250)
X_test = keras.utils.pad_sequences(X_test, maxlen=250)
print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")



Train shape: (4457, 250)
Test shape: (1115, 250)


In [7]:
maxlen = 250 
vocab_size = len(all_words.get_feature_names_out())  
embed_dim = 32  
num_heads = 2  
ff_dim = 32 

inputs = layers.Input(shape=(maxlen,))
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
transformer_block = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(embedding_layer, embedding_layer)
x = layers.GlobalAveragePooling1D()(transformer_block)
x = layers.Dropout(0.1)(x)
x = layers.Dense(16, activation="relu")(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(8, activation="relu")(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(4, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)
optimizer = keras.optimizers.Adam(learning_rate=0.001)
# Compile the model
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 44ms/step - accuracy: 0.8261 - loss: 0.4852 - val_accuracy: 0.8735 - val_loss: 0.3774
Epoch 2/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 41ms/step - accuracy: 0.8598 - loss: 0.4447 - val_accuracy: 0.8735 - val_loss: 0.3746
Epoch 3/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.8569 - loss: 0.4386 - val_accuracy: 0.8735 - val_loss: 0.3759
Epoch 4/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 40ms/step - accuracy: 0.8600 - loss: 0.4097 - val_accuracy: 0.8735 - val_loss: 0.3767
Epoch 5/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 45ms/step - accuracy: 0.8621 - loss: 0.4026 - val_accuracy: 0.8735 - val_loss: 0.3712
Epoch 6/10
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 42ms/step - accuracy: 0.8619 - loss: 0.4076 - val_accuracy: 0.8735 - val_loss: 0.3728
Epoch 7/10
[1m140/14