In [1]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix)
from keras import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout
from keras_tuner.tuners import GridSearch

2024-10-17 17:18:35.783043: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-17 17:18:35.802604: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-17 17:18:35.808827: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-17 17:18:35.825814: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
seed = 42
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)
tf.keras.utils.set_random_seed(seed)

In [3]:
# Load Data
df = pd.read_csv('dataset_phishing.csv')

In [4]:
#Top 27 features from feature selection
features = ['shortest_word_path',
 'ratio_intMedia',
 'links_in_tags',
 'nb_hyphens',
 'page_rank',
 'avg_word_path',
 'ratio_extHyperlinks',
 'longest_words_raw',
 'google_index',
 'length_hostname',
 'longest_word_host',
 'domain_registration_length',
 'nb_www',
 'nb_underscore',
 'nb_dots',
 'ratio_extMedia',
 'phish_hints',
 'domain_in_title',
 'web_traffic',
 'safe_anchor',
 'nb_space',
 'shortening_service',
 'ip',
 'domain_age',
 'nb_qm',
 'nb_hyperlinks',
 'nb_slash']

In [5]:
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_reshaped = X_scaled.reshape((X.shape[0],1,X.shape[1])) #Reshape features with 1 timestep to fit into RNN 

le = LabelEncoder()
y = le.fit_transform(df["status"])
X_train, X_temp, y_train, y_temp = train_test_split(X_reshaped, y, test_size=0.3, random_state=seed, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=seed, stratify=y_temp)

In [6]:
#Define model to fir into GridSearch later
def build_model(hp):
    model = Sequential()
    model.add(LSTM(hp.Choice('units',[128,512,1024,2048]), activation=hp.Choice('activation',['relu','tanh']),input_shape=(1,X.shape[1]))) #4 Different layer sizes
    model.add(Dropout(hp.Float('dropout',0.1,0.5,step=0.1))) #5 different dropout values
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [7]:
#Grid search for best hyperparameters
tuner = GridSearch(
    build_model,
    objective='val_accuracy',
    max_trials=20
)
tuner.search(X_train,y_train,epochs=10,validation_data=(X_val,y_val))
best_model = tuner.get_best_models()[0]
tuner.results_summary()
history = best_model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val,y_val))

y_pred = (best_model.predict(X_test) > 0.5).astype('int32')

2024-10-17 17:18:38.607646: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22455 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:01:00.0, compute capability: 8.6


Reloading Tuner from ./untitled_project/tuner0.json


  super().__init__(**kwargs)


Results summary
Results in ./untitled_project
Showing 10 best trials
Objective(name="val_accuracy", direction="max")

Trial 2 summary
Hyperparameters:
units: 1024
activation: relu
dropout: 0.1
Score: 0.13861757516860962

Trial 1 summary
Hyperparameters:
units: 512
Score: 0.13782855868339539

Trial 0 summary
Hyperparameters:
units: 128
Score: 0.13699819147586823
Epoch 1/5


  saveable.load_own_variables(weights_store.get(inner_path))
I0000 00:00:1729156721.973444  885520 service.cc:146] XLA service 0xaa41120 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1729156721.973483  885520 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2024-10-17 17:18:42.012431: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-10-17 17:18:42.229277: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907






[1m225/251[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 737us/step - accuracy: 0.9594 - loss: 0.1237

I0000 00:00:1729156727.383563  885520 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 24ms/step - accuracy: 0.9591 - loss: 0.1232 - val_accuracy: 0.9481 - val_loss: 0.1357
Epoch 2/5
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 915us/step - accuracy: 0.9617 - loss: 0.1063 - val_accuracy: 0.9492 - val_loss: 0.1352
Epoch 3/5
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 918us/step - accuracy: 0.9622 - loss: 0.1006 - val_accuracy: 0.9504 - val_loss: 0.1359
Epoch 4/5
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 905us/step - accuracy: 0.9630 - loss: 0.0963 - val_accuracy: 0.9510 - val_loss: 0.1346
Epoch 5/5
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 902us/step - accuracy: 0.9661 - loss: 0.0918 - val_accuracy: 0.9527 - val_loss: 0.1353
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 52ms/step 


In [8]:
print(f"Accuracy: {accuracy_score(y_test,y_pred):.4f}")
print(f"Precision: {precision_score(y_test,y_pred):.4f}")
print(f"Recall:  {recall_score(y_test,y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test,y_pred):.4f}")
print(f"ROC_AUC Score: 
print(classification_report(y_test, y_pred))

Accuracy: 0.9644
Precision: 0.9660
Recall:  0.9627
F1 Score: 0.9643
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       858
           1       0.97      0.96      0.96       857

    accuracy                           0.96      1715
   macro avg       0.96      0.96      0.96      1715
weighted avg       0.96      0.96      0.96      1715

