# Phishing Domain Detection (Using Deep Learning for Classification)

[Dataset Link](https://data.mendeley.com/datasets/72ptz43s9v/1)<br>
[Dataset Description](https://www.sciencedirect.com/science/article/pii/S2352340920313202)

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
import warnings
import os

In [2]:
os.chdir('..')

In [3]:
df = pd.read_csv("data/dataset_full.csv")
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [4]:
X = df.drop(columns=['phishing'])
X.head()

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_and_url,qty_exclamation_url,qty_space_url,...,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened
0,3,0,0,1,0,0,0,0,0,0,...,-1,1,2,0,892,0,0,0,0,0
1,5,0,1,3,0,3,0,2,0,0,...,150,1,2,1,9540,1,0,0,0,0
2,2,0,0,1,0,0,0,0,0,0,...,-1,1,2,3,589,1,0,0,0,0
3,4,0,2,5,0,0,0,0,0,0,...,-1,1,2,0,292,1,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,306,1,2,1,3597,0,1,0,0,0


In [5]:
y = df['phishing']
y

0        1
1        1
2        0
3        1
4        0
        ..
88642    0
88643    0
88644    1
88645    1
88646    0
Name: phishing, Length: 87209, dtype: int64

In [6]:
y.values.ravel()

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

In [7]:
#Oversampling using SMOTE
from imblearn.over_sampling import SMOTE

X, y = SMOTE().fit_resample(X, y)

# checking the sizes of the sample data
print("Size of X:", X.shape)
print("Size of y:", y.shape)

Size of X: (113424, 111)
Size of y: (113424,)


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X.shape

(113424, 111)

In [9]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((90739, 111), (22685, 111), (90739,), (22685,))

In [10]:
# import pickle
# pickle.dump(scaler, open('scaling.pkl','wb'))

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

time_step = 111

# Define the model
model = Sequential()
model.add(Dense(64, activation = 'relu', input_shape=(time_step, )))
model.add(Dense(32, activation = 'relu'))
model.add(Dense(16, activation = 'relu'))
model.add(Dense(8, activation = 'relu'))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


In [21]:
from tensorflow.keras.callbacks import Callback
class myCallback(Callback):
    def on_epoch_end(self, epoch, logs=None):
        if logs.get('val_loss') < 0.1:
            print("\nReached 0.1 val_loss, so stopping training!")
            self.model.stop_training = True

callback = myCallback()

In [22]:
# start training the model 
history = model.fit(X_train, y_train, epochs=50,batch_size=256, callbacks=[callback],validation_data=(X_test,y_test),verbose=1)

Epoch 1/50


[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8909 - loss: 0.2808 - val_accuracy: 0.9456 - val_loss: 0.1401
Epoch 2/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9476 - loss: 0.1341 - val_accuracy: 0.9514 - val_loss: 0.1231
Epoch 3/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9551 - loss: 0.1174 - val_accuracy: 0.9546 - val_loss: 0.1171
Epoch 4/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9591 - loss: 0.1094 - val_accuracy: 0.9548 - val_loss: 0.1168
Epoch 5/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9602 - loss: 0.1070 - val_accuracy: 0.9590 - val_loss: 0.1106
Epoch 6/50
[1m355/355[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9630 - loss: 0.1027 - val_accuracy: 0.9604 - val_loss: 0.1085
Epoch 7/50
[1m355/355[0m [32m━━━━━━━

In [23]:
# Display final training and validation accuracy
final_train_acc = history.history['accuracy'][-1]
final_val_acc = history.history['val_accuracy'][-1]

# Evaluate the model on the test data
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

print(f"Final Training Accuracy: {final_train_acc:.4f}")
print(f"Final Validation Accuracy: {final_val_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

Final Training Accuracy: 0.9671
Final Validation Accuracy: 0.9633
Test Accuracy: 0.9633


In [26]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the test data
y_test_pred = model.predict(X_test)

# Convert predicted probabilities to binary predictions:
# Threshold predicted probabilities (> 0.5) to classify as positive (1) or negative (0) class
y_test_pred = (y_test_pred > 0.5).astype(int)

# Function to evaluate the model
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    class_report = classification_report(predicted, true, target_names=["legitimate", "malicious"])
    return accuracy, class_report

# Evaluate the model using the defined function
accuracy, class_report = evaluate_model(y_test, y_test_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(class_report)

[1m709/709[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Accuracy: 0.9633
Classification Report:
              precision    recall  f1-score   support

  legitimate       0.97      0.96      0.96     11503
   malicious       0.96      0.96      0.96     11182

    accuracy                           0.96     22685
   macro avg       0.96      0.96      0.96     22685
weighted avg       0.96      0.96      0.96     22685

