In [37]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras import losses
import re
import string

In [38]:
# Some initial setup steps and parameters for easy changeability

SEQUENCE_LENGTH = 100
TRAIN_TEST_SPLIT = 0.25
MAX_FEATURES = 10000
EMBEDDING_DIM = 16

EPOCHS = 10

labelToInteger = {'ham': 0, 'spam': 1}
integerToLabel = {0: 'ham', 1: 'spam'}

In [68]:
# function to load in the data from the file

def load_data():
    texts, labels = [], []
    with open("data/SMSSpamCollection") as f:
        for line in f:
            split = line.split()
            labels.append(split[0].strip())
            texts.append(' '.join(split[1:]).strip())
    return texts, labels

In [53]:
# loading in the data, x has the text, y has whether that text is spam or not (ham); 
    # both in arrays, use corresponding indices

x, y = load_data()

In [54]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=TRAIN_TEST_SPLIT, random_state=412)
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=102)

In [25]:
# raw_train_data = []
# raw_test_data = []
# raw_val_data = []

# for i in range(len(y_train)):
#     raw_train_data.append([X_train[i],y_train[i]])
    
# for i in range(len(y_test)):
#     raw_test_data.append([X_test[i],y_test[i]])

# for i in range(len(y_val)):
#     raw_val_data.append([X_val[i],y_val[i]])

In [55]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [56]:
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=MAX_FEATURES,
    output_mode='int',
    output_sequence_length=SEQUENCE_LENGTH)

In [57]:
raw_train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
raw_test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
raw_val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

train_text = raw_train_dataset.map(lambda x,y : x)

In [58]:
vectorize_layer.adapt(train_text)

In [67]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [66]:
train_dataset = raw_train_dataset.map(vectorize_text)
test_dataset = raw_test_dataset.map(vectorize_text)
val_dataset = raw_val_dataset.map(vectorize_text)

print(raw_train_dataset)
print(train_dataset)

<TensorSliceDataset element_spec=(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.string, name=None))>
<MapDataset element_spec=(TensorSpec(shape=(None, 100), dtype=tf.int64, name=None), TensorSpec(shape=(2,), dtype=tf.float32, name=None))>


In [32]:
# Building the model layers

model = tf.keras.Sequential([
    layers.Embedding(MAX_FEATURES, EMBEDDING_DIM),
    layers.Dropout(0.2),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(1)]
)

In [33]:
# Building the loss function

model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [34]:
spamDetect = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS)

Epoch 1/10


ValueError: in user code:

    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/engine/training.py", line 1024, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/engine/training.py", line 1082, in compute_loss
        return self.compiled_loss(
    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/engine/compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/losses.py", line 284, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/losses.py", line 2176, in binary_crossentropy
        backend.binary_crossentropy(y_true, y_pred, from_logits=from_logits),
    File "/Users/BenAnderson/Library/Python/3.8/lib/python/site-packages/keras/backend.py", line 5680, in binary_crossentropy
        return tf.nn.sigmoid_cross_entropy_with_logits(

    ValueError: `logits` and `labels` must have the same shape, received ((None, 1) vs ()).


In [None]:
loss, accuracy = model.evaluate(test_dataset)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

In [None]:
model.save("spam_detector.keras")

In [None]:
print(test_dataset)