In [1]:
import os
import re
import string

import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from wandb.keras import WandbCallback

import wandb

# Dataset folders. Names ending in "test" / "test_set_folder" are the held-out
# counterparts of the training folders (paired by name).
old_dataset = "data/filesplit2/"
old_testset = "data/test/"
kaggle_set_folder = "data/kaggle_set_folder"
kaggle_test_set_folder = "data/kaggle_test_set_folder"
small_kaggle_set_folder = "data/small_kaggle_set_folder"
small_kaggle_test_set_folder = "data/small_kaggle_test_set_folder"
history_set_folder = "data/history_set_folder"
history_test_set_folder = "data/history_test_set_folder"
combined_set_folder = "data/combined_set_folder"
combined_test_set_folder = "data/combined_test_set_folder"
MODELS_FOLDER = "IA_models"

# Experiment settings. Everything a reader might tune lives here.
MODEL_NAME = "combined.h5"
MAX_FEATURES = 2000        # vocabulary cap passed to TextVectorization(max_tokens=...)
BATCH_SIZE = 128           # default batch size when no sweep overrides it
LEARNING_RATE = 0.01       # default Adam learning rate when no sweep overrides it
EPOCHS = 15
OUTPUT_DIM = 10            # embedding width
TRAIN_SET = combined_set_folder
TEST_SET = combined_test_set_folder

SEQUENCE_LENGTH = 150      # every text is padded/truncated to this many tokens
SEED = 42                  # fixes the training/validation split

# Defaults are derived from the constants above so that editing BATCH_SIZE or
# LEARNING_RATE actually changes the run (previously the literals were
# duplicated here and the BATCH_SIZE constant was dead).
config_defaults = {
    'batch_size': BATCH_SIZE,
    'learning_rate': LEARNING_RATE
}
wandb.init(project="url_watcher", config=config_defaults)

[34m[1mwandb[0m: Currently logged in as: [33mwawann[0m (use `wandb login --relogin` to force relogin)


In [2]:
def custom_standardization(input_data):
    """Normalise raw text before tokenisation.

    Applies, in order: lower-casing, replacement of the literal ``<br />``
    tag with a single space, and removal of every character listed in
    ``string.punctuation``.

    Args:
        input_data: string tensor handed over by ``TextVectorization``.

    Returns:
        A string tensor with the three transformations applied.
    """
    # Character class matching any single ASCII punctuation mark.
    punctuation_class = "[%s]" % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_class, "")
    return text


# Load the training portion (80 %) of TRAIN_SET; SEED keeps the split stable.
raw_train_data = tf.keras.preprocessing.text_dataset_from_directory(
    TRAIN_SET,
    batch_size=wandb.config.batch_size,
    validation_split=0.2,
    subset="training",
    label_mode="binary",
    seed=SEED,
)

# Integer-encoding vectoriser: at most MAX_FEATURES tokens in the vocabulary,
# every sample padded/truncated to SEQUENCE_LENGTH ids.
vectorize_layer = TextVectorization(
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode="int",
    max_tokens=MAX_FEATURES,
    standardize=custom_standardization,
)

# The vocabulary is learned from the texts only, so the labels are dropped.
text_ds = raw_train_data.map(lambda text, _label: text, num_parallel_calls=tf.data.AUTOTUNE)
vectorize_layer.adapt(text_ds)


def vectorize_text(text, label):
    """Turn a (text, label) pair into (token ids, label).

    A trailing axis is appended to ``text`` before it is passed through the
    module-level ``vectorize_layer``; ``label`` is returned untouched.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = vectorize_layer(text_column)
    return token_ids, label

Found 71578 files belonging to 2 classes.
Using 57263 files for training.


In [3]:
def train():
    """Train and evaluate one model configuration and log it to Weights & Biases.

    Intended to be used as the ``function`` of ``wandb.agent``: the sweep
    supplies ``batch_size`` and ``learning_rate`` through ``wandb.config``;
    the values in ``config_defaults`` only apply when no sweep provides them.

    Steps:
        1. Start a W&B run and record the epoch count.
        2. Load the training/validation split of ``TRAIN_SET`` and the
           separate ``TEST_SET``.
        3. Fit a ``TextVectorization`` layer on the training texts and
           integer-encode all three datasets with it.
        4. Build, compile and fit the model, evaluate it on the test set.
        5. Save the model in the run directory and log the test error rate.

    Returns:
        None. Results are reported through W&B, stdout and the saved ``.h5``.
    """
    config_defaults = {
        'batch_size': BATCH_SIZE,
        'learning_rate': 0.01
    }
    wandb.init(project="url_watcher", config=config_defaults)
    wandb.config.epochs = EPOCHS
    # NOTE: learning_rate is deliberately NOT reassigned here. It is the
    # hyperparameter under search, so it must come from the sweep (or from
    # config_defaults when running standalone).
    config = wandb.config
    layers = tf.keras.layers

    # ---- data -----------------------------------------------------------
    # Same seed + same validation_split for both calls so the two subsets
    # are complementary.
    raw_train_data = tf.keras.preprocessing.text_dataset_from_directory(
        TRAIN_SET, batch_size=config.batch_size,
        validation_split=0.2, subset="training",
        label_mode="binary", seed=SEED)
    raw_val_data = tf.keras.preprocessing.text_dataset_from_directory(
        TRAIN_SET, batch_size=config.batch_size,
        validation_split=0.2, subset="validation",
        label_mode="binary", seed=SEED)
    # label_mode="binary" so test labels have the same dtype/shape as the
    # training and validation labels.
    raw_test_data = tf.keras.preprocessing.text_dataset_from_directory(
        TEST_SET, batch_size=config.batch_size, label_mode="binary")

    # ---- vectorisation --------------------------------------------------
    vectorize_layer = TextVectorization(
        standardize=custom_standardization,
        max_tokens=MAX_FEATURES,
        output_mode="int",
        output_sequence_length=SEQUENCE_LENGTH,
    )
    # Vocabulary is learned from training texts only (labels dropped).
    text_ds = raw_train_data.map(lambda x, y: x, num_parallel_calls=tf.data.AUTOTUNE)
    vectorize_layer.adapt(text_ds)

    def _vectorize(text, label):
        # Uses the layer adapted just above, so this function does not depend
        # on a vectoriser left behind in the notebook's global namespace.
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    train_ds = raw_train_data.map(_vectorize, num_parallel_calls=tf.data.AUTOTUNE)
    val_ds = raw_val_data.map(_vectorize, num_parallel_calls=tf.data.AUTOTUNE)
    test_ds = raw_test_data.map(_vectorize, num_parallel_calls=tf.data.AUTOTUNE)

    train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
    test_ds = test_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

    # ---- model ----------------------------------------------------------
    # NOTE(review): none of the Dense layers is given an activation, so apart
    # from dropout the stack after the embedding looks purely linear -
    # confirm this is intended before changing it (it would alter results).
    model = tf.keras.Sequential([
        layers.Embedding(MAX_FEATURES + 1, output_dim=OUTPUT_DIM),
        layers.Dense(1000),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dense(100),
        layers.Dropout(0.2),
        layers.Dense(1)])
    model.summary()

    # The last layer emits a raw logit, hence from_logits=True and an
    # accuracy threshold of 0.0 (logit 0 <=> probability 0.5).
    model.compile(loss=tf.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.Adam(learning_rate=config.learning_rate),
                  metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

    model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=config.epochs,
        callbacks=[WandbCallback(log_weights=True)])
    loss, accuracy = model.evaluate(test_ds)

    # File name encodes the hyperparameters of this run.
    model.save(os.path.join(wandb.run.dir, f"batch_{config.batch_size}_lr_{config.learning_rate}.h5"))

    print("Base Model")
    print("Loss: ", loss)
    print("Accuracy: ", accuracy)
    wandb.log({'Test Error Rate': round((1 - accuracy) * 100, 2)})

In [None]:
# Upper bound on the number of trials this agent executes. Without it a
# 'bayes' sweep keeps proposing configurations until interrupted by hand, so
# the cell never finishes under "Restart & Run All". Tune as needed.
SWEEP_RUN_COUNT = 20

sweep_config = {
    # Bayesian search over the grid below, minimising validation loss.
    'method': 'bayes',
    'metric': {
        'name': 'val_loss',
        'goal': 'minimize'
    },
    # Hyperband may stop poorly performing runs, but not before 5 iterations.
    'early_terminate': {
        'type': 'hyperband',
        'min_iter': 5
    },
    'parameters': {
        'batch_size': {
            'values': [8, 16, 32, 64, 128, 256]
        },
        'learning_rate': {
            'values': [0.01, 0.005, 0.001, 0.0005, 0.0001]
        }
    }
}
sweep_id = wandb.sweep(sweep_config, project="url_watcher")
# Pass the project explicitly so the agent does not depend on ambient W&B
# state, and bound the number of runs so the cell terminates.
wandb.agent(sweep_id, function=train, project="url_watcher", count=SWEEP_RUN_COUNT)



Create sweep with ID: k8e07x9n
Sweep URL: https://wandb.ai/wawann/url_watcher/sweeps/k8e07x9n


[34m[1mwandb[0m: Agent Starting Run: 226k9a1c with config:
[34m[1mwandb[0m: 	batch_size: 8
[34m[1mwandb[0m: 	learning_rate: 0.01


Found 71578 files belonging to 2 classes.
Using 57263 files for training.
Found 71578 files belonging to 2 classes.
Using 14315 files for validation.
Found 2490 files belonging to 2 classes.




Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          20010     
_________________________________________________________________
dense (Dense)                (None, None, 1000)        11000     
_________________________________________________________________
dropout (Dropout)            (None, None, 1000)        0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               100100    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1

VBox(children=(Label(value=' 1.54MB of 1.54MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,14.0
loss,0.24583
binary_accuracy,0.91419
val_loss,0.23631
val_binary_accuracy,0.90171
_runtime,270.0
_timestamp,1623206098.0
_step,15.0
best_val_loss,0.2122
best_epoch,3.0


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▃▂▂▂▂▂▂▂▃▁▁▁▂▁
binary_accuracy,▁▅▆▇▇█▆▅▅▆▆█▇▇▇
val_loss,█▄▇▁▅▄▃▃▆▆▆▄▄▄▄
val_binary_accuracy,▁▅▂█▃▄▅▅▅▆▄▅▅▆▃
_runtime,▁▂▂▃▃▄▄▄▅▅▆▇▇███
_timestamp,▁▂▂▃▃▄▄▄▅▅▆▇▇███
_step,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
Test Error Rate,▁


[34m[1mwandb[0m: Agent Starting Run: 8apou7r7 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0005


Found 71578 files belonging to 2 classes.
Using 57263 files for training.
Found 71578 files belonging to 2 classes.
Using 14315 files for validation.
Found 2490 files belonging to 2 classes.




Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          20010     
_________________________________________________________________
dense (Dense)                (None, None, 1000)        11000     
_________________________________________________________________
dropout (Dropout)            (None, None, 1000)        0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               100100    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1

VBox(children=(Label(value=' 1.54MB of 1.54MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,14.0
loss,0.20918
binary_accuracy,0.9247
val_loss,0.21279
val_binary_accuracy,0.92679
_runtime,179.0
_timestamp,1623206282.0
_step,15.0
best_val_loss,0.21089
best_epoch,3.0


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁
binary_accuracy,▁▇▇█████▇██████
val_loss,█▅▂▁▁▂▂▂▂▄▄▃▃▄▃
val_binary_accuracy,▁▃▅▆█▇▇▇▇▇▇▇▇▆▇
_runtime,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
_timestamp,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
_step,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
Test Error Rate,▁


[34m[1mwandb[0m: Agent Starting Run: co7j1gc0 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	learning_rate: 0.0005


Found 71578 files belonging to 2 classes.
Using 57263 files for training.
Found 71578 files belonging to 2 classes.
Using 14315 files for validation.
Found 2490 files belonging to 2 classes.




Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 10)          20010     
_________________________________________________________________
dense (Dense)                (None, None, 1000)        11000     
_________________________________________________________________
dropout (Dropout)            (None, None, 1000)        0         
_________________________________________________________________
global_average_pooling1d (Gl (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               100100    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 1