In [18]:
import pathlib
import pandas as pd
# Anchor every path at the parent of the current working directory so the
# notebook does not depend on an absolute, machine-specific location.
BASE_DIR = pathlib.Path.cwd().resolve().parent
EXPORT_DIR = BASE_DIR.joinpath("spam_data_dir", "exports")
SPAM_DATASET_PATH = EXPORT_DIR.joinpath("spam_dataset.csv")

In [19]:
# Load the exported dataset and copy the text and label columns into
# plain Python lists.
df = pd.read_csv(SPAM_DATASET_PATH)
label_legend = dict(ham=0, spam=1)  # label name -> integer class id
texts = df["text"].to_list()
labels = df["label"].to_list()

In [20]:
# Replace the `label` column with one float indicator column per class
# (`label_ham` / `label_spam` in the output below); other columns are kept.
df_one_hot = pd.get_dummies(data=df, columns=["label"], dtype=float)
df_one_hot

Unnamed: 0,text,source,label_ham,label_spam
0,"Go until jurong point, crazy.. Available only ...",sms-spam,1.0,0.0
1,Ok lar... Joking wif u oni...,sms-spam,1.0,0.0
2,Free entry in 2 a wkly comp to win FA Cup fina...,sms-spam,0.0,1.0
3,U dun say so early hor... U c already then say...,sms-spam,1.0,0.0
4,"Nah I don't think he goes to usf, he lives aro...",sms-spam,1.0,0.0
...,...,...,...,...
7523,I love this song because we sing it at Camp al...,yt-spam,1.0,0.0
7524,I love this song for two reasons: 1.it is abou...,yt-spam,1.0,0.0
7525,wow,yt-spam,1.0,0.0
7526,Shakira u are so wiredo,yt-spam,1.0,0.0


In [21]:
from keras import layers
# Passed as the first argument (max_tokens) of the TextVectorization layer.
MAX_TOKENS = 68_000

In [22]:
# NOTE(review): despite the name, this is the whole corpus -- `texts` holds the
# `text` value of every row in the CSV, and this binds the same list object.
# The vectorizer is adapted on it before the train/test split, so rows that
# later end up in the held-out set also contribute to the vocabulary.
train_sentences = texts

In [23]:
# Text -> integer-id layer. Every sentence is padded or truncated to 55 ids,
# which is the sequence length the model's input layer expects.
vectorizer = layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=55,
    pad_to_max_tokens=False,
)
# Build the vocabulary from the sentences.
vectorizer.adapt(train_sentences)

In [24]:
# Sanity check: vectorize a single sentence (wrapped in a list to form a batch
# of one) and inspect the token ids; the output is zero-padded to length 55.
vectorizer([train_sentences[5]])

<tf.Tensor: shape=(1, 55), dtype=int64, numpy=
array([[ 1076,    87,    76,  2164,    48,   119,   207,   736,    32,
            7,    46,   397,    95,   458,    40,   112,   435,     4,
           42,    14,    15,    88,  2311,    63,   453,  1310, 11188,
            3,    89,   551,     3,  4028,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0]], dtype=int64)>

In [25]:
# Keep the learned vocabulary: its length sizes the embedding layer and it is
# exported alongside the other vectorizer settings at the end of the notebook.
vocabulary = vectorizer.get_vocabulary()

In [10]:
!pip install scikit-learn




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip





In [26]:
# Features: token-id matrix for every sentence, converted to a NumPy array.
X = vectorizer(train_sentences).numpy()

# Targets: the two one-hot label columns, in (ham, spam) order.
y = df_one_hot[["label_ham", "label_spam"]]
print(y.head(5))
y = y.to_numpy()

# Confirm both are plain ndarrays before handing them to scikit-learn.
print(type(X))
print(type(y))
print(y)

   label_ham  label_spam
0        1.0         0.0
1        1.0         0.0
2        0.0         1.0
3        1.0         0.0
4        1.0         0.0
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
[[1. 0.]
 [1. 0.]
 [0. 1.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [34]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
# Hold out 30% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Wrap each split in a tf.data pipeline: batches of 32 with background prefetch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset = (
    tf.data.Dataset.from_tensor_slices((X_test, y_test))
    .batch(32)
    .prefetch(tf.data.AUTOTUNE)
)

In [35]:
# Trainable lookup table: one 128-dimensional row per entry in `vocabulary`.
embeddings = layers.Embedding(
    input_dim=len(vocabulary), output_dim=128, name="token_embedding"
)

In [36]:
import keras

In [52]:
# Input: one row of 55 token ids, matching the vectorizer's output length.
inputs = layers.Input(shape=(55,))

# Map every token id to its embedding vector.
token_embeddings = embeddings(inputs)

# Convolve over windows of 5 tokens, then average over the sequence axis to
# get one fixed-size feature vector per sentence.
x = layers.Conv1D(filters=64, kernel_size=5, padding="same", activation="relu")(
    token_embeddings
)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(units=32, activation="relu")(x)

# Two softmax outputs, one per one-hot label column.
outputs = layers.Dense(units=2, activation="softmax")(x)
model_1 = keras.Model(inputs=inputs, outputs=outputs)

In [53]:
# The network ends in a 2-way softmax and the targets are one-hot rows, so use
# categorical cross-entropy (the loss documented for one-hot class targets)
# rather than the binary variant, which is meant for a single-probability
# output. For two softmax classes both give the same per-sample value, so the
# training signal is unchanged; the loss now simply matches the output head.
model_1.compile(
    loss=keras.losses.CategoricalCrossentropy(),
    optimizer=keras.optimizers.Adam(),
    metrics=["accuracy"],
)
model_1.summary()

In [54]:
# Train for 5 epochs, evaluating on the held-out split after each one.
# NOTE(review): in the recorded run val_loss is lowest at epoch 3 and rises
# afterwards while the training loss keeps falling.
model_1.fit(train_dataset, epochs=5, validation_data = val_dataset)

Epoch 1/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 20ms/step - accuracy: 0.8182 - loss: 0.4279 - val_accuracy: 0.9509 - val_loss: 0.1502
Epoch 2/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9569 - loss: 0.1198 - val_accuracy: 0.9571 - val_loss: 0.1275
Epoch 3/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.9808 - loss: 0.0621 - val_accuracy: 0.9553 - val_loss: 0.1249
Epoch 4/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.9889 - loss: 0.0323 - val_accuracy: 0.9544 - val_loss: 0.1352
Epoch 5/5
[1m165/165[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.9958 - loss: 0.0166 - val_accuracy: 0.9531 - val_loss: 0.1480


<keras.src.callbacks.history.History at 0x1f3cbebf7a0>

In [55]:
# Persist the trained model next to the project root in the .keras format.
MODEL_PATH = BASE_DIR.joinpath("model.keras")
model_1.save(MODEL_PATH)

In [56]:
# Predict on the held-out rows. Each output row is the pair of softmax
# probabilities; the columns follow the target order built earlier
# (label_ham first, label_spam second).
model_1.predict(X_test)

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


array([[9.9721611e-01, 2.7838994e-03],
       [1.0000000e+00, 1.3236371e-09],
       [9.9991918e-01, 8.0808670e-05],
       ...,
       [8.5820353e-01, 1.4179647e-01],
       [1.0000000e+00, 3.2014809e-09],
       [9.9999988e-01, 1.0099214e-07]], dtype=float32)

In [57]:
# Everything needed to rebuild the vectorizer elsewhere. The numeric settings
# are read back from the fitted layer instead of being typed again as
# literals, so the exported metadata cannot drift from the vectorizer that
# actually produced the training data.
_vectorizer_config = vectorizer.get_config()
vectorizer_metadata = {
    "vocabulary": vocabulary,
    "MAX_TOKENS": _vectorizer_config["max_tokens"],
    "output_length": _vectorizer_config["output_sequence_length"],
}

In [58]:
import json
# Destination file for the exported vectorizer settings.
VECTORIZER_DATA_PATH = BASE_DIR.joinpath("vectorizer_metadata.json")

In [59]:
# Serialize the metadata to JSON and write it out in one go.
with VECTORIZER_DATA_PATH.open("w") as outfile:
    outfile.write(json.dumps(vectorizer_metadata))