In [1]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix)
from keras import Sequential
from keras.layers import Dense, LSTM, Activation, Dropout, Embedding, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasClassifier
from keras_tuner.tuners import GridSearch
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-11-06 16:00:02.366136: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-06 16:00:02.466068: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-06 16:00:02.487916: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-06 16:00:02.610792: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
seed = 42
np.random.seed(seed)
random.seed(seed)
tf.random.set_seed(seed)
tf.keras.utils.set_random_seed(seed)

In [3]:
# Load Data
df = pd.read_csv('dataset_phishing.csv')

In [4]:
#Top 27 features from feature selection
features = ['shortest_word_path',
 'ratio_intMedia',
 'links_in_tags',
 'nb_hyphens',
 'page_rank',
 'avg_word_path',
 'ratio_extHyperlinks',
 'longest_words_raw',
 'google_index',
 'length_hostname',
 'longest_word_host',
 'domain_registration_length',
 'nb_www',
 'nb_underscore',
 'nb_dots',
 'ratio_extMedia',
 'phish_hints',
 'domain_in_title',
 'web_traffic',
 'safe_anchor',
 'nb_space',
 'shortening_service',
 'ip',
 'domain_age',
 'nb_qm',
 'nb_hyperlinks',
 'nb_slash']

In [5]:
X = df[features]
scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)



le = LabelEncoder()
y = le.fit_transform(df["status"])

X_train, X_temp, y_train, y_temp= train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Reshape for CNN
X_train_cnn = X_train_scaled.reshape(-1, X_train_scaled.shape[1], 1)
X_val_cnn = X_val_scaled.reshape(-1, X_val_scaled.shape[1], 1)
X_test_cnn = X_test_scaled.reshape(-1, X_test_scaled.shape[1], 1)

In [6]:
def create_model(filters_1=32, kernel_size_1=3, dropout_rate_1=0.2,
                 filters_2=64, kernel_size_2=3, dropout_rate_2=0.2,
                 dense_units=128, learning_rate=0.001):
    model = Sequential()
    
    # First Conv1D layer
    model.add(Conv1D(filters=filters_1, kernel_size=kernel_size_1,
                     activation='relu', input_shape=(X_train_cnn.shape[1], 1),
                     padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(dropout_rate_1))
    
    # Second Conv1D layer
    model.add(Conv1D(filters=filters_2, kernel_size=kernel_size_2,
                     activation='relu', padding='same'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(dropout_rate_2))
    
    model.add(Flatten())
    model.add(Dense(dense_units, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                 loss='binary_crossentropy',
                 metrics=['accuracy', tf.keras.metrics.Recall()])
    
    return model

In [7]:
# ## For grid search, skip this and run cell below this to use the parameters obtained from previous grid search result.

# # Wrap the model
# model = KerasClassifier(
#     model=create_model,
#     verbose=0,
#     random_state=seed
# )

# # Define the grid search parameters
# param_grid = {
#     'model__filters_1': [32, 64],
#     'model__kernel_size_1': [2, 3],
#     'model__dropout_rate_1': [0.2, 0.3],
#     'model__filters_2': [32, 64],
#     'model__kernel_size_2': [2, 3],
#     'model__dropout_rate_2': [0.2, 0.3],
#     'model__dense_units': [64, 128],
#     'model__learning_rate': [0.001, 0.01]
# }

# # Create GridSearchCV object
# grid = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     scoring='recall',
#     cv=3,
#     n_jobs=-1,
#     verbose=2
# )

# # Fit the grid search
# print("Starting Grid Search...")
# grid_result = grid.fit(X_train_cnn, y_train)

# # Print the best parameters
# print("\nBest parameters found:")
# # Remove 'model__' prefix from parameter names for clarity
# best_params = {k.replace('model__', ''): v for k, v in grid_result.best_params_.items()}
# print(best_params)
# print("\nBest recall score:", grid_result.best_score_)

In [8]:
best_params = {'dense_units': 64, 'dropout_rate_1': 0.2, 'dropout_rate_2': 0.2, 'filters_1': 32, 'filters_2': 64, 'kernel_size_1': 3, 'kernel_size_2': 2, 'learning_rate': 0.01}

In [9]:
# Create and train the model with best parameters
print("\nTraining final model with best parameters...")
CNN_model = create_model(**best_params)

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_recall', patience=5, 
                             restore_best_weights=True, mode='max')

# Train the model
history = CNN_model.fit(
    X_train_cnn, y_train,
    epochs=50,
    validation_data=(X_val_cnn, y_val),
    callbacks=[early_stopping],
    verbose=1
)

y_val_pred_cnn = CNN_model.predict(X_val_cnn)
y_test_pred_cnn = CNN_model.predict(X_test_cnn)


Training final model with best parameters...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-11-06 16:00:05.676953: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22455 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3090, pci bus id: 0000:81:00.0, compute capability: 8.6


Epoch 1/50


I0000 00:00:1730880008.334276   50229 service.cc:146] XLA service 0x7fe854006ac0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1730880008.334318   50229 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce RTX 3090, Compute Capability 8.6
2024-11-06 16:00:08.392748: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-11-06 16:00:08.662232: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


[1m150/251[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 1ms/step - accuracy: 0.8759 - loss: 0.3087 - recall: 0.8666

I0000 00:00:1730880010.853727   50229 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.8909 - loss: 0.2752 - recall: 0.8828 - val_accuracy: 0.9201 - val_loss: 0.2099 - val_recall: 0.8623
Epoch 2/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9289 - loss: 0.1828 - recall: 0.9222 - val_accuracy: 0.9370 - val_loss: 0.1625 - val_recall: 0.9440
Epoch 3/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9375 - loss: 0.1723 - recall: 0.9354 - val_accuracy: 0.9440 - val_loss: 0.1530 - val_recall: 0.9568
Epoch 4/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9419 - loss: 0.1581 - recall: 0.9408 - val_accuracy: 0.9422 - val_loss: 0.1549 - val_recall: 0.9370
Epoch 5/50
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9444 - loss: 0.1521 - recall: 0.9410 - val_accuracy: 0.9405 - val_loss: 0.1510 - val_recall: 0.9370
Epoch 6/50

In [10]:
# Handle class imbalance by oversampling the minority class (phishing)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)

# Define the MLPClassifier with early stopping to avoid overfitting
mlp = MLPClassifier(random_state=42, early_stopping=True, validation_fraction=0.1, activation='tanh', alpha=0.0006, hidden_layer_sizes=(100,50),
                    learning_rate='constant',learning_rate_init=0.008,solver='sgd')

# # Set up hyperparameter grid for tuning
# param_grid = {
#     'hidden_layer_sizes': [(200, 100)],
#     'activation': ['relu', 'tanh', 'logistic'],
#     'solver': ['adam', 'sgd'],
#     'alpha': [0.0001],
#     'learning_rate': ['constant', 'adaptive'],
#     'learning_rate_init': [0.008],
#     # 'max_iter': [10000],
#     # 'tol': [1e-4],
#     # 'verbose': [True]
# }

# # Use GridSearchCV to find the best hyperparameters
# grid_search = GridSearchCV(mlp, param_grid, cv=5, scoring='recall', n_jobs=-1, verbose=2)

# grid_search.fit(X_resampled, y_resampled)
mlp.fit(X_resampled, y_resampled)
# Get the best model from the grid search
# best_mlp = grid_search.best_estimator_
y_val_pred_mlp = mlp.predict(X_val)
y_test_pred_mlp = mlp.predict(X_test)



In [11]:
X = df[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_reshaped = X_scaled.reshape((X.shape[0],1,X.shape[1]))  

le = LabelEncoder()
y = le.fit_transform(df["status"])
X_train_lstm, X_temp_lstm, y_train_lstm, y_temp_lstm = train_test_split(X_reshaped, y, test_size=0.3, random_state=42, stratify=y)
X_val_lstm, X_test_lstm, y_val_lstm, y_test_lstm = train_test_split(X_temp_lstm, y_temp_lstm, test_size=0.5, random_state=42, stratify=y_temp_lstm)

lstm_model = Sequential()
lstm_model.add(LSTM(1024, activation='relu',input_shape=(1,X.shape[1]))) 
lstm_model.add(Dropout(0.1))
lstm_model.add(Dense(1, activation='sigmoid'))
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# best_model = tuner.get_best_models()[0]
# best_hp = tuner.get_best_hyperparameters()[0]
# print(best_hp.values)
# tuner.results_summary()

history = lstm_model.fit(X_train_lstm, y_train_lstm, epochs=5, batch_size=32, validation_split=0.1)

y_val_pred_lstm = (lstm_model.predict(X_val_lstm) > 0.5)
y_test_pred_lstm = (lstm_model.predict(X_test_lstm) > 0.5)

  super().__init__(**kwargs)


Epoch 1/5








[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.9142 - loss: 0.3356 - val_accuracy: 0.9426 - val_loss: 0.1563
Epoch 2/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 887us/step - accuracy: 0.9451 - loss: 0.1428 - val_accuracy: 0.9513 - val_loss: 0.1444
Epoch 3/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step - accuracy: 0.9526 - loss: 0.1266 - val_accuracy: 0.9563 - val_loss: 0.1369
Epoch 4/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 859us/step - accuracy: 0.9589 - loss: 0.1156 - val_accuracy: 0.9600 - val_loss: 0.1327
Epoch 5/5
[1m225/225[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 853us/step - accuracy: 0.9619 - loss: 0.1076 - val_accuracy: 0.9588 - val_loss: 0.1320
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step 
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step


In [12]:
# ======================================= METHOD 1: SVM ================================
# Hyperparameter tuning for SVM
# svm_param_grid = {
#     'C': [0.1, 1, 10],
#     'gamma': ['scale', 'auto'],
#     'kernel': ['linear', 'rbf']
# }

# svm_grid_search = GridSearchCV(SVC(probability=True, random_state=42), svm_param_grid, cv=5)
# svm_grid_search.fit(X_train, y_train)

# # Best parameters for SVM
# print("Best parameters for SVM:", svm_grid_search.best_params_)

# Validate and test the best SVM model
# svm_model = svm_grid_search.best_estimator_
svm_model = SVC(probability=True, random_state=42, C=10, gamma='scale', kernel='rbf')
svm_model.fit(X_train, y_train)
y_val_pred_svm = svm_model.predict(X_val)
y_test_pred_svm = svm_model.predict(X_test)

In [13]:
# ================================ METHOD 2: Traditional Tree ==================

# Hyperparameter tuning for Decision Tree
# dt_param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
# }

# dt_grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), dt_param_grid, cv=5)
# dt_grid_search.fit(X_train, y_train)

# Best parameters for Decision Tree
# print("Best parameters for Decision Tree:", dt_grid_search.best_params_)

# Validate and test the best Decision Tree model
# dt_model = dt_grid_search.best_estimator_
dt_model = DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=10, min_samples_leaf=1, min_samples_split=2)
dt_model.fit(X_train, y_train)
y_val_pred_dt = dt_model.predict(X_val)
y_test_pred_dt = dt_model.predict(X_test)


In [14]:
# Define KNN model and hyperparameter grid
# knn = KNeighborsClassifier()
# param_grid = {
#     'n_neighbors': range(1, 21),           # Test k values from 1 to 20
#     'weights': ['uniform', 'distance'],    # Uniform or distance-weighted voting
#     'metric': ['euclidean', 'manhattan', 'minkowski'],  # Different distance metrics
#     'p': [1, 2]                            # Power parameter for Minkowski (p=1 is Manhattan, p=2 is Euclidean)
# }

# # Perform GridSearchCV with validation set
# grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='recall', verbose=1)
# grid_search.fit(X_train, y_train)

# Get the best model from the grid search
# best_knn = grid_search.best_estimator_
best_knn = KNeighborsClassifier(n_neighbors=16,metric='manhattan',p=1,weights='distance')
best_knn.fit(X_train,y_train)
print(best_knn)

# Evaluate on the validation set
y_val_pred_knn = best_knn.predict(X_val)
y_test_pred_knn = best_knn.predict(X_test)


KNeighborsClassifier(metric='manhattan', n_neighbors=16, p=1,
                     weights='distance')


In [15]:
# Define the Logistic Regression model and a hyperparameter grid for tuning
# logreg = LogisticRegression(max_iter=1000)  # Default Logistic Regression
# param_grid = {
#     'C': [0.1, 1, 10, 100],  # Regularization parameter
#     'solver': ['liblinear', 'lbfgs']  # Solvers
# }

# # Perform GridSearchCV with validation set
# grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='recall', verbose=1)
# grid_search.fit(X_train, y_train)

# Get the best model from the grid search
# best_logreg = grid_search.best_estimator_
best_logreg = LogisticRegression(max_iter=1000,C=100,solver='liblinear')
best_logreg.fit(X_train, y_train)
# print(best_logreg)

# Evaluate on the validation set
y_val_pred_lr = best_logreg.predict(X_val)
y_test_pred_lr = best_logreg.predict(X_test)


In [16]:
# Define the models and hyperparameter grids
models = {
    'GaussianNB': (GaussianNB(), {'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}),
    'BernoulliNB': (BernoulliNB(), {'alpha': [0.5, 1.0, 1.5, 2.0], 'binarize': [0.0, 0.5, 1.0]})
}
gauss = GaussianNB()
bern = BernoulliNB(alpha=0.5)
gauss.fit(X_train, y_train)
bern.fit(X_train, y_train)
y_val_pred_gauss = gauss.predict(X_val)
y_test_pred_gauss = gauss.predict(X_test)
y_val_pred_bern = bern.predict(X_val)
y_test_pred_bern = bern.predict(X_test)

In [17]:
stacked_val = np.column_stack((y_val_pred_cnn,y_val_pred_mlp,y_val_pred_svm,y_val_pred_dt,y_val_pred_knn,y_val_pred_lr,
                              y_val_pred_gauss,y_val_pred_bern))

hybrid_model = LogisticRegression()
hybrid_model.fit(stacked_val, y_val)


stacked_test =np.column_stack((y_test_pred_cnn,y_test_pred_mlp,y_test_pred_svm,y_test_pred_dt,y_test_pred_knn,y_test_pred_lr,
                              y_test_pred_gauss,y_test_pred_bern))

final_pred = hybrid_model.predict(stacked_test)

print(f"Accuracy: {accuracy_score(y_test,final_pred):.4f}")
print(f"Precision: {precision_score(y_test,final_pred):.4f}")
print(f"Recall:  {recall_score(y_test,final_pred):.4f}")
print(f"F1 Score: {f1_score(y_test,final_pred):.4f}")
print(f"ROC_AUC Score: {roc_auc_score(y_test,final_pred):.4f}")
print(classification_report(y_test, final_pred))

Accuracy: 0.9609
Precision: 0.9551
Recall:  0.9673
F1 Score: 0.9612
ROC_AUC Score: 0.9609
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       858
           1       0.96      0.97      0.96       857

    accuracy                           0.96      1715
   macro avg       0.96      0.96      0.96      1715
weighted avg       0.96      0.96      0.96      1715

