In [1]:
pip install pandas numpy matplotlib seaborn scikit-learn tensorflow keras-tuner

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import random
import numpy as np
import tensorflow as tf

# Define a seed value for reproducibility
seed = 42

# 1. Set the PYTHONHASHSEED environment variable for Python hash-based operations
os.environ['PYTHONHASHSEED'] = str(seed)

# 2. Set the Python built-in pseudo-random generator at a fixed value
random.seed(seed)

# 3. Set the NumPy pseudo-random generator at a fixed value
np.random.seed(seed)

# 4. Set the TensorFlow pseudo-random generator at a fixed value
tf.random.set_seed(seed)


In [2]:
# Import Required Packages


import pandas as pd

# For data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For performance metrics
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, classification_report, 
                             confusion_matrix)

# For building the neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Conv1D, MaxPooling1D, Flatten, Dense, Dropout, 
                                     BatchNormalization)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# For hyperparameter tuning
import keras_tuner as kt
from keras_tuner.tuners import RandomSearch

# =============================================
# Version Information (Optional)
# =============================================

print(f'TensorFlow version: {tf.__version__}')
print(f'Keras Tuner version: {kt.__version__}')



TensorFlow version: 2.17.0
Keras Tuner version: 1.4.7


In [3]:
# Data Loading and Preprocessing


# Read the dataset
data = pd.read_csv('dataset_phishing.csv')

# Define features and target variable
features = ['shortest_word_path', 'ratio_intMedia', 'links_in_tags',
            'nb_hyphens', 'page_rank', 'avg_word_path', 
            'ratio_extHyperlinks', 'longest_words_raw', 'google_index',
            'length_hostname', 'longest_word_host', 
            'domain_registration_length', 'nb_www', 'nb_underscore', 
            'nb_dots', 'ratio_extMedia', 'phish_hints', 'domain_in_title', 
            'web_traffic', 'safe_anchor', 'nb_space', 'shortening_service', 
            'ip', 'domain_age', 'nb_qm', 'nb_hyperlinks', 'nb_slash']

X = data[features]
y = data['status']  # 'status' is 'legitimate' or 'phishing'

# Map the 'status' to numerical labels
label_mapping = {'legitimate': 0, 'phishing': 1}
y = y.map(label_mapping)

# Check for any unmapped values or missing values
if y.isnull().any():
    print("Some 'status' values were not mapped properly. Please check the 'status' column for unexpected values.")
    print(y[y.isnull()])


# Handle missing values in features if any
X.fillna(0, inplace=True)

# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=seed, stratify=y)

# Standardize the features
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape data to be suitable for Conv1D (samples, timesteps, features)
# Since we have numerical features, we treat each feature as a "time step"
X_train_cnn = X_train_scaled.reshape(-1, X_train_scaled.shape[1], 1)
X_test_cnn = X_test_scaled.reshape(-1, X_test_scaled.shape[1], 1)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)


In [4]:
# Model Building and Hyperparameter Tuning


def build_model(hp):
    model = Sequential()
    
    # First Conv1D layer
    model.add(Conv1D(
        filters=hp.Int('filters_1', min_value=16, max_value=128, step=16),
        kernel_size=hp.Int('kernel_size_1', min_value=2, max_value=5, step=1),
        activation='relu',
        input_shape=(X_train_cnn.shape[1], 1),
        padding='same'
    ))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(hp.Float('dropout_rate_1', min_value=0.0, max_value=0.5, step=0.1)))
    
    # Optional second Conv1D layer
    if hp.Boolean('second_conv_layer'):
        model.add(Conv1D(
            filters=hp.Int('filters_2', min_value=16, max_value=128, step=16),
            kernel_size=hp.Int('kernel_size_2', min_value=2, max_value=5, step=1),
            activation='relu',
            padding='same'
        ))
        model.add(BatchNormalization())
        model.add(MaxPooling1D(pool_size=2))
        model.add(Dropout(hp.Float('dropout_rate_2', min_value=0.0, max_value=0.5, step=0.1)))
    
    model.add(Flatten())
    
    # Dense layers
    for i in range(hp.Int('num_dense_layers', 1, 3)):
        model.add(Dense(
            units=hp.Int(f'units_{i}', min_value=32, max_value=256, step=32),
            activation='relu'
        ))
        model.add(BatchNormalization())
        model.add(Dropout(hp.Float(f'dropout_rate_dense_{i}', min_value=0.0, max_value=0.5, step=0.1)))
    
    # Output layer
    model.add(Dense(1, activation='sigmoid'))
    
    # Compile the model
    model.compile(
        optimizer=Adam(
            hp.Float('lr', min_value=1e-4, max_value=1e-2, sampling='LOG', default=1e-3)
        ),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

# Initialize the tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=20,
    executions_per_trial=1,
    directory='tuning',
    project_name='cnn',
    seed=seed, 
    overwrite=True  
)

# Early stopping callback to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Run the hyperparameter search
tuner.search(
    x=X_train_cnn,
    y=y_train,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print(f"""
The hyperparameter search is complete. 
The optimal number of filters in the first Conv1D layer is {best_hps.get('filters_1')}, 
kernel size is {best_hps.get('kernel_size_1')}, 
and the learning rate is {best_hps.get('lr')}.
""")

# Build the model with the best hyperparameters
best_model = tuner.hypermodel.build(best_hps)

# Train the best model
history = best_model.fit(
    X_train_cnn, y_train,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1 
)



Trial 20 Complete [00h 00m 13s]
val_accuracy: 0.9633679389953613

Best val_accuracy So Far: 0.96555495262146
Total elapsed time: 00h 07m 33s

The hyperparameter search is complete. 
The optimal number of filters in the first Conv1D layer is 48, 
kernel size is 4, 
and the learning rate is 0.0017529957488324514.

Epoch 1/50
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8747 - loss: 0.3162 - val_accuracy: 0.9060 - val_loss: 0.2940
Epoch 2/50
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9347 - loss: 0.1753 - val_accuracy: 0.9584 - val_loss: 0.1454
Epoch 3/50
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9403 - loss: 0.1634 - val_accuracy: 0.9584 - val_loss: 0.1268
Epoch 4/50
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9469 - loss: 0.1450 - val_accuracy: 0.9584 - val_loss: 0.1247
Epoch 5/50
[1m229/229[0m [32m━━━━━━

In [10]:
# Model Evaluation

# Evaluate the model on test data
test_loss, test_acc = best_model.evaluate(X_test_cnn, y_test)
print('Test Accuracy: {:.4f}'.format(test_acc))

# Predict probabilities and classes
y_pred_proba = best_model.predict(X_test_cnn).ravel()
y_pred = (y_pred_proba >= 0.5).astype(int)

# Calculate and print metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print('\nModel Performance Metrics:')
print('--------------------------')
print('Accuracy  : {:.4f}'.format(accuracy))
print('Precision : {:.4f}'.format(precision))
print('Recall    : {:.4f}'.format(recall))
print('F1 Score  : {:.4f}'.format(f1))
print('ROC AUC   : {:.4f}'.format(roc_auc))

print('\nConfusion Matrix:')
print(confusion_matrix(y_test, y_pred))

print('\nClassification Report:')
print(classification_report(y_test, y_pred))

[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9546 - loss: 0.1256
Test Accuracy: 0.9532
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

Model Performance Metrics:
--------------------------
Accuracy  : 0.9532
Precision : 0.9536
Recall    : 0.9528
F1 Score  : 0.9532
ROC AUC   : 0.9881

Confusion Matrix:
[[1090   53]
 [  54 1089]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1143
           1       0.95      0.95      0.95      1143

    accuracy                           0.95      2286
   macro avg       0.95      0.95      0.95      2286
weighted avg       0.95      0.95      0.95      2286

