In [205]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers
from tensorflow.keras import losses
import re
import string
import keras

In [145]:
# Some initial setup steps and parameters for easy changeability

SEQUENCE_LENGTH = 100
TRAIN_TEST_SPLIT = 0.25
MAX_FEATURES = 10000
EMBEDDING_DIM = 16

BATCH_SIZE = 32
EPOCHS = 10

In [146]:
# function to load in the data from the file

def load_data():
    texts, labels = [], []
    with open("data/SMSSpamCollection") as f:
        for line in f:
            split = line.split()
            labels.append(split[0].strip())
            texts.append(' '.join(split[1:]).strip())
    return texts, labels

In [147]:
# loading in the data, x has the text, y has whether that text is spam or not (ham); 
    # both in arrays, use corresponding indices

x, y = load_data()

In [148]:
# Creating the train, test, validation sets

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=TRAIN_TEST_SPLIT, random_state=412)
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=102)

train_size = len(X_train)
val_size = len(X_val)

In [181]:
# combining the text and labels into one variable for each set

# raw_train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
# raw_test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
# raw_val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

raw_train_x = np.array(X_train)
raw_train_y = np.array(y_train)
raw_test_x = np.array(X_test)
raw_test_y = np.array(y_test)
raw_val_x = np.array(X_val)
raw_val_y = np.array(y_val)

In [207]:
# standardizing text inputs

tf.keras.saving.register_keras_serializable(package="custom_standardization", name="custom_standardization")
def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
    return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

AttributeError: module 'tensorflow.keras' has no attribute 'saving'

In [208]:
# layer used to tokenize the text

vectorize_layer = layers.TextVectorization(
    standardize='lower_and_strip_punctuation',
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH
)

In [209]:
# testing the vectorize_layer

train_text = raw_train_dataset.map(lambda x,y : x)
#vectorize_layer.adapt(train_text)
vectorize_layer.adapt(raw_train_x)

In [210]:
# function to vectorize all the text of all sets

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

def vectorize_text_only(text):
    text = tf.expand_dims(text, -1)
    vlayer = vectorize_layer(text)
    vlayer = tf.reshape(vlayer, (100))
    return vlayer

In [211]:
# vectorizing all sets

# train_dataset = raw_train_dataset.map(vectorize_text)
# test_dataset = raw_test_dataset.map(vectorize_text)
# val_dataset = raw_val_dataset.map(vectorize_text)

# print(train_dataset)

train_x_array = []
for element in raw_train_x:
    train_x_array.append(vectorize_text_only(element))
    
train_x = np.array(train_x_array)

test_x_array = []
for element in raw_test_x:
    test_x_array.append(vectorize_text_only(element))
    
test_x = np.array(test_x_array)

val_x_array = []
for element in raw_val_x:
    val_x_array.append(vectorize_text_only(element))
    
val_x = np.array(val_x_array)

train_y_array = []
for element in raw_train_y:
    if element == "ham":
        train_y_array.append(0)
    else:
        train_y_array.append(1)

train_y = np.array(train_y_array)

test_y_array = []
for element in raw_test_y:
    if element == "ham":
        test_y_array.append(0)
    else:
        test_y_array.append(1)

test_y = np.array(test_y_array)

val_y_array = []
for element in raw_val_y:
    if element == "ham":
        val_y_array.append(0)
    else:
        val_y_array.append(1)

val_y = np.array(val_y_array)


In [212]:
print(train_x.shape)
print(train_y.shape)

(5016, 100)
(5016,)


In [213]:
# Building the model layers

model = tf.keras.Sequential([
    layers.Embedding(MAX_FEATURES, EMBEDDING_DIM),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1)
])

In [214]:
# Building the loss function

model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [215]:
# fitting the model

spamDetect = model.fit(
    train_x,
    train_y,
    epochs=EPOCHS,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [216]:
# evaluating the model

model.evaluate(test_x,test_y)



[0.19192317128181458, 0.9153515100479126]

In [217]:
# exporting the model to work with strings

export_model = tf.keras.Sequential([
  vectorize_layer,
  model,
  layers.Activation('sigmoid')
])

In [218]:
export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

In [219]:
export_model.evaluate(raw_train_x, train_y)



[0.19184905290603638, 0.9166666865348816]

In [220]:
export_model.save('SPAM_DETECTOR', save_format="keras")



INFO:tensorflow:Assets written to: SPAM_DETECTOR/assets


INFO:tensorflow:Assets written to: SPAM_DETECTOR/assets
