In [1]:
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
# ---------------------------------------------------------------------------
# Dataset folders (relative paths). Each corpus has a training folder and a
# matching test folder; the names follow the folder names on disk.
# ---------------------------------------------------------------------------
old_dataset = "data/filesplit2/"
old_testset = "data/test/"

kaggle_set_folder = "data/kaggle_set_folder"
small_kaggle_set_folder = "data/small_kaggle_set_folder"
history_set_folder = "data/history_set_folder"
combined_set_folder = "data/combined_set_folder"

kaggle_test_set_folder = "data/kaggle_test_set_folder"
small_kaggle_test_set_folder = "data/small_kaggle_test_set_folder"
history_test_set_folder = "data/history_test_set_folder"
combined_test_set_folder = "data/combined_test_set_folder"

# Corpus used by the loading cell below.
TRAIN_SET = combined_set_folder
TEST_SET = combined_test_set_folder

# ---------------------------------------------------------------------------
# Saved model location: loaded from f"{MODELS_FOLDER}/{MODEL_NAME}".
# ---------------------------------------------------------------------------
MODELS_FOLDER = "IA_models"
MODEL_NAME = "model-best.h5"

# ---------------------------------------------------------------------------
# Tunables.
# ---------------------------------------------------------------------------
SEED = 42               # seed passed to the dataset loaders
BATCH_SIZE = 128        # batch size passed to the dataset loaders
MAX_FEATURES = 2000     # max_tokens of the TextVectorization layer
SEQUENCE_LENGTH = 150   # output_sequence_length of the TextVectorization layer
# Not referenced in the visible cells; presumably shared with the training
# notebook -- TODO confirm before removing.
EPOCHS = 15
OUTPUT_DIM = 10

In [2]:
# Training / validation split of TRAIN_SET. Both calls use the same
# validation_split and seed; only `subset` differs, so each call returns its
# own side of the same 80/20 split.
raw_train_data = tf.keras.preprocessing.text_dataset_from_directory(
    TRAIN_SET,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    subset="training",
    label_mode="binary",
    seed=SEED,
)
raw_val_data = tf.keras.preprocessing.text_dataset_from_directory(
    TRAIN_SET,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    subset="validation",
    label_mode="binary",
    seed=SEED,
)
# Held-out test set. label_mode and seed are passed explicitly so the labels
# have the same encoding as the training/validation sets (the loader's default
# label_mode is "int") and the shuffle order is reproducible.
raw_test_data = tf.keras.preprocessing.text_dataset_from_directory(
    TEST_SET,
    batch_size=BATCH_SIZE,
    label_mode="binary",
    seed=SEED,
)

Found 71578 files belonging to 2 classes.
Using 57263 files for training.
Found 71578 files belonging to 2 classes.
Using 14315 files for validation.
Found 2490 files belonging to 2 classes.


In [3]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re


def custom_standardization(input_data):
    """Normalise raw text before it is tokenised.

    The input is lower-cased, every literal "<br />" is replaced by a single
    space, and every character listed in ``string.punctuation`` is removed.

    Args:
        input_data: string tensor to clean.

    Returns:
        The cleaned string tensor.
    """
    # Character class matching any single punctuation character.
    punctuation_class = f"[{re.escape(string.punctuation)}]"

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, "<br />", " ")
    text = tf.strings.regex_replace(text, punctuation_class, "")
    return text


# Text -> integer-id layer: vocabulary capped at MAX_FEATURES entries, every
# sample padded or truncated to SEQUENCE_LENGTH ids.
vectorize_layer = TextVectorization(
    max_tokens=MAX_FEATURES,
    output_sequence_length=SEQUENCE_LENGTH,
    output_mode="int",
    standardize=custom_standardization,
)

# The vocabulary is learnt from the training texts alone, so the labels are
# dropped before calling adapt().
text_ds = raw_train_data.map(
    lambda text, _label: text,
    num_parallel_calls=tf.data.AUTOTUNE,
)
vectorize_layer.adapt(text_ds)


def vectorize_text(text, label):
    """Turn a (text, label) pair into (token ids, label).

    Args:
        text: string tensor taken from a raw text dataset.
        label: label tensor, passed through unchanged.

    Returns:
        Tuple of the output of ``vectorize_layer`` for ``text`` and ``label``.
    """
    # Add a trailing axis before handing the strings to the layer.
    expanded = tf.expand_dims(text, -1)
    token_ids = vectorize_layer(expanded)
    return token_ids, label

# Vectorised view of the held-out test set, cached after the first pass and
# prefetched so input preparation overlaps with model execution.
test_ds = (
    raw_test_data
    .map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE)
    .cache()
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)

In [4]:
# Load the saved classifier from disk.
model = tf.keras.models.load_model(f"{MODELS_FOLDER}/{MODEL_NAME}")

# Wrap it so the pipeline takes raw strings end to end:
#   raw text -> token ids -> saved model -> sigmoid
# NOTE(review): the extra sigmoid presumes the saved model emits logits -- confirm.
pipeline_layers = [
    vectorize_layer,
    model,
    layers.Activation("sigmoid"),
]
complete_model = tf.keras.Sequential(pipeline_layers)

# Compiled so that evaluate() can report loss and accuracy.
complete_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# Spot-check the model on a few hand-picked URLs
def adapt(url: str, max_chars: int = 75) -> str:
    """Format a URL as space-separated characters of fixed length.

    The URL is right-padded with spaces (or truncated) to exactly
    ``max_chars`` characters, and a single space is written after every
    character, so the result is always ``2 * max_chars`` characters long.
    With the default of 75 this gives the 150-character string produced by
    the original ``url.ljust(75, " ").replace("", " ")[1: 151]`` expression.

    Args:
        url: the raw URL.
        max_chars: number of leading URL characters to keep (default 75).

    Returns:
        The characters of the padded/truncated URL, each followed by a space.

    Raises:
        ValueError: if ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError(f"max_chars must be >= 1, got {max_chars}")
    # Pad short URLs and cut long ones so exactly max_chars characters remain.
    fixed_width = url.ljust(max_chars, " ")[:max_chars]
    # join() puts a space between characters; the trailing one is added by hand.
    return " ".join(fixed_width) + " "


# URLs expected to be flagged as malicious.
malicious_urls = [
    "https://crackyourgames.com/",
    "https://fitgirl-repacks.to/",
    "http://eliteloungegroup.com.au/sample/",
    "http://neverclick.net/uw45692d61/367c70453965c3e5c2ab1f6d/index.php?id=5e63",
    "http://iitp.org.br/invel/wp-includes/quotationfile/doc/dxx/b727e1fe915abd93",
]
# URLs expected to be classified as valid.
valid_urls = [
    "https://www.reddit.com/r/gamedev/wiki/getting_started#wiki_you_must_learn_how_to_seek_out_resources",
    "https://www.twitch.tv/",
    "https://www.lebigdata.fr/cryptbb-dark-web-tout-savoir",
    "https://bitwarden.com/browser-start/",
    "https://stackoverflow.com/",
    "https://stackoverflow.com/questions/59285984/custom-layer-in-keras-dimension-problem",
    "https://ent2.utt.fr/uPortal/render.userLayoutRootNode.uP",
]

# Same order as the two lists above: malicious first, then valid.
test = [adapt(raw_url) for raw_url in malicious_urls + valid_urls]

complete_model.predict(test)

array([[0.53608024],
       [0.52395886],
       [0.31355068],
       [0.16352417],
       [0.42541954],
       [0.99653184],
       [0.9781075 ],
       [0.9722692 ],
       [0.92360723],
       [0.81990445],
       [0.9809037 ],
       [0.6220234 ]], dtype=float32)

In [5]:
# Evaluate the wrapped model on the history, kaggle-test and combined corpora.
def _load_binary_dataset(folder):
    """Load `folder` as a batched text dataset with binary labels."""
    return tf.keras.preprocessing.text_dataset_from_directory(
        folder,
        batch_size=BATCH_SIZE,
        label_mode="binary",
        seed=SEED,
    )


# Load every corpus first.
raw_history_data = _load_binary_dataset(history_set_folder)
raw_history_test_data = _load_binary_dataset(history_test_set_folder)
raw_kaggle_test_data = _load_binary_dataset(kaggle_test_set_folder)
raw_combined_data = _load_binary_dataset(combined_set_folder)
raw_combined_test_data = _load_binary_dataset(combined_test_set_folder)

# evaluate() yields (loss, accuracy) here; only the accuracy is reported, and
# `loss` ends up holding the value from the last evaluation.
loss, history_accuracy = complete_model.evaluate(raw_history_data)
loss, history_test_accuracy = complete_model.evaluate(raw_history_test_data)
loss, kaggle_test_accuracy = complete_model.evaluate(raw_kaggle_test_data)
loss, combined_accuracy = complete_model.evaluate(raw_combined_data)
loss, combined_test_accuracy = complete_model.evaluate(raw_combined_test_data)

# Summary, one line per corpus.
print(f"Accuracy for history: {history_accuracy}")
print(f"Accuracy for history test: {history_test_accuracy}")
print(f"Accuracy for kaggle test: {kaggle_test_accuracy}")
print(f"Accuracy for combined: {combined_accuracy}")
print(f"Accuracy for combined test: {combined_test_accuracy}")


Found 21576 files belonging to 2 classes.
Found 1488 files belonging to 2 classes.
Found 44953 files belonging to 2 classes.
Found 71578 files belonging to 2 classes.
Found 2490 files belonging to 2 classes.
Accuracy for history: 0.7213570475578308
Accuracy for history test: 0.8736559152603149
Accuracy for kaggle test: 0.8820545673370361
Accuracy for combined: 0.9153231382369995
Accuracy for combined test: 0.924096405506134
