# **Phishing Domain Detection (Training Models)**

### The objective of this notebook is to test a couple of models to classify which one is best suited for detecting malicious domains

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from concurrent.futures import ThreadPoolExecutor

# Load the dataset
urldata = pd.read_csv("./Url_Processed.csv")

# Inspect the dataset
print("Dataset Columns:", urldata.columns.tolist())

# Drop unnecessary columns if they exist
urldata.drop("Unnamed: 0", axis=1, inplace=True, errors='ignore')  # Drop unnecessary column
if "url" in urldata.columns and "label" in urldata.columns:
    urldata.drop(["url", "label"], axis=1, inplace=True)  # Drop 'url' and 'label' columns if they exist

# Define required columns for x
required_columns = [
    'hostname_length', 'path_length', 'fd_length', 'count-', 'count@', 'count?',
    'count%', 'count.', 'count=', 'count-http', 'count-https', 'count-www',
    'count-digits', 'count-letters', 'count_dir', 'use_of_ip'
]

# Check for missing columns
missing_columns = [col for col in required_columns if col not in urldata.columns]
if missing_columns:
    print(f"Warning: Missing columns in dataset: {missing_columns}")
    # Option 1: Drop missing columns from required_columns
    required_columns = [col for col in required_columns if col in urldata.columns]
    # Option 2: Create placeholder columns with default values (e.g., 0)
    for col in missing_columns:
        urldata[col] = 0

# Configure dependent variables (x) and independent variable (y)
x = urldata[required_columns]
if 'result' in urldata.columns:
    y = urldata['result']
else:
    raise ValueError("'result' column not found in dataset")

# Oversampling using SMOTE
x_sample, y_sample = SMOTE().fit_resample(x, y)
x_sample = pd.DataFrame(x_sample, columns=required_columns)
y_sample = pd.DataFrame(y_sample, columns=['result'])

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_sample, y_sample, test_size=0.2, random_state=42)
print(f"Shape of x_train: {x_train.shape}, x_test: {x_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

# Decision Tree Model
dt_model = DecisionTreeClassifier(max_depth=7)
dt_model.fit(x_train, y_train)

# Evaluate Decision Tree Model
y_pred_train_dt = dt_model.predict(x_train)
y_pred_test_dt = dt_model.predict(x_test)
print("Decision Tree - Training Accuracy:", accuracy_score(y_train, y_pred_train_dt))
print("Decision Tree - Testing Accuracy:", accuracy_score(y_test, y_pred_test_dt))
print("Decision Tree - Classification Report:\n", classification_report(y_test, y_pred_test_dt, target_names=["legitimate", "malicious"]))

# Random Forest Model
rf_model = RandomForestClassifier(max_depth=7)
rf_model.fit(x_train, y_train)

# Evaluate Random Forest Model
y_pred_train_rf = rf_model.predict(x_train)
y_pred_test_rf = rf_model.predict(x_test)
print("Random Forest - Training Accuracy:", accuracy_score(y_train, y_pred_train_rf))
print("Random Forest - Testing Accuracy:", accuracy_score(y_test, y_pred_test_rf))
print("Random Forest - Classification Report:\n", classification_report(y_test, y_pred_test_rf, target_names=["legitimate", "malicious"]))

# Neural Network Model using Keras
nn_model = Sequential([
    Dense(64, input_dim=x_train.shape[1], activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile and train Neural Network model
nn_model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = nn_model.fit(x_train, y_train, epochs=10, batch_size=256, validation_data=(x_test, y_test),
                       callbacks=[early_stopping], verbose=1)

# Save the best model
nn_model.save("Malicious_URL_Prediction.keras")
print("Model saved as Malicious_URL_Prediction.keras")

Dataset Columns: ['url', 'label', 'result', 'url_length', 'hostname_length', 'path_length', 'fd_length', 'count_-', 'count_@', 'count_?', 'count_%', 'count_.', 'count_=', 'count_http', 'count_https', 'count_www', 'count_digits', 'count_letters', 'count_dir', 'use_of_ip', 'short_url']
Shape of x_train: (553180, 5), x_test: (138296, 5), y_train: (553180, 1), y_test: (138296, 1)
Decision Tree - Training Accuracy: 0.7061589356086626
Decision Tree - Testing Accuracy: 0.7060074044079366
Decision Tree - Classification Report:
               precision    recall  f1-score   support

  legitimate       0.68      0.76      0.72     69063
   malicious       0.73      0.65      0.69     69233

    accuracy                           0.71    138296
   macro avg       0.71      0.71      0.71    138296
weighted avg       0.71      0.71      0.71    138296



  return fit_method(estimator, *args, **kwargs)


Random Forest - Training Accuracy: 0.7267760945804259
Random Forest - Testing Accuracy: 0.7270926129461445
Random Forest - Classification Report:
               precision    recall  f1-score   support

  legitimate       0.70      0.80      0.75     69063
   malicious       0.77      0.65      0.71     69233

    accuracy                           0.73    138296
   macro avg       0.73      0.73      0.73    138296
weighted avg       0.73      0.73      0.73    138296

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2161/2161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.6293 - loss: 0.6529 - val_accuracy: 0.6765 - val_loss: 0.6196
Epoch 2/10
[1m2161/2161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.6776 - loss: 0.6167 - val_accuracy: 0.6779 - val_loss: 0.6091
Epoch 3/10
[1m2161/2161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.6818 - loss: 0.6084 - val_accuracy: 0.6837 - val_loss: 0.6027
Epoch 4/10
[1m2161/2161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.6845 - loss: 0.6036 - val_accuracy: 0.6960 - val_loss: 0.5963
Epoch 5/10
[1m2161/2161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.6933 - loss: 0.5959 - val_accuracy: 0.7026 - val_loss: 0.5875
Epoch 6/10
[1m2161/2161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step - accuracy: 0.7004 - loss: 0.5855 - val_accuracy: 0.7020 - val_loss: 0.5762
Epoch 7/10
[1m2161/2161[0