In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

# json is used further down to export the vectoriser vocabulary, which is needed to use the model.
import json

# Load the url dataset
df = pd.read_csv('urls_dataset.csv')

# Drop rows with a missing title or label before casting. Without this,
# astype(str) turns a missing title into the literal string 'nan' (which would
# then be trained on as if it were a real title) and astype(int) raises on a
# missing label. The raw frame `df` is left untouched.
df_clean = df.dropna(subset=['title', 'distraction'])

# Preprocess the data: convert titles to strings and distractions to integers
texts = df_clean['title'].astype(str).tolist()
labels = df_clean['distraction'].astype(int).tolist()

# Split into train and test sets (80/20); the fixed random_state keeps the
# split identical from run to run
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Convert labels to NumPy arrays
y_train = np.array(y_train)
y_test = np.array(y_test)

# Text vectorisation settings: vocabulary size cap and fixed token count per title
max_features = 10000
sequence_length = 100

vectorizer = layers.TextVectorization(
    output_sequence_length=sequence_length,
    output_mode='int',
    max_tokens=max_features
)
# Build the vocabulary from the training titles only
vectorizer.adapt(x_train)


def _vectorize(titles):
    """Run `vectorizer` over a list of title strings and return the result as a NumPy array."""
    # Wrap every title in its own single-element row before calling the layer
    column = np.array([[title] for title in titles])
    return vectorizer(column).numpy()


# Integer token sequences for the training and testing titles
x_train_vec = _vectorize(x_train)
x_test_vec = _vectorize(x_test)

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling (and therefore the reported metrics)
# are repeatable from run to run.
keras.utils.set_random_seed(42)

# Build the model: embed token ids, average the embeddings over the sequence,
# then classify with a small dense head ending in a single sigmoid unit
model = keras.Sequential([
    layers.Embedding(input_dim=max_features, output_dim=64),
    layers.GlobalAveragePooling1D(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model for binary classification
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Train the model using validation_split: 20% of the training data is held
# out and used only to report val_loss / val_accuracy each epoch
model.fit(x_train_vec, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model on the untouched test set
loss, accuracy = model.evaluate(x_test_vec, y_test)
print(f'Test accuracy: {accuracy:.3f}')

# Export the trained model in TensorFlow SavedModel format
tf.saved_model.save(model, 'distraction_detector_model_tf')

# Write the vectoriser's vocabulary (its list of tokens) to vocab.json
vocab = vectorizer.get_vocabulary()
with open('vocab.json', 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab))


2025-04-03 11:28:33.844623: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-03 11:28:33.845377: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-03 11:28:33.849875: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-04-03 11:28:33.859836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743640113.875940  528998 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743640113.88

Epoch 1/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.7496 - loss: 0.5581 - val_accuracy: 0.7541 - val_loss: 0.5284
Epoch 2/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7816 - loss: 0.4711 - val_accuracy: 0.8770 - val_loss: 0.3119
Epoch 3/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8832 - loss: 0.2990 - val_accuracy: 0.8640 - val_loss: 0.3475
Epoch 4/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8963 - loss: 0.2589 - val_accuracy: 0.8815 - val_loss: 0.2754
Epoch 5/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9020 - loss: 0.2450 - val_accuracy: 0.8994 - val_loss: 0.2668
Epoch 6/10
[1m551/551[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.9036 - loss: 0.2315 - val_accuracy: 0.8826 - val_loss: 0.2898
Epoch 7/10
[1m551/551[0m 

INFO:tensorflow:Assets written to: distraction_detector_model_tf/assets
