# Phishing Domain Detection
## Testing all model with 53 features

[Dataset Link](https://data.mendeley.com/datasets/72ptz43s9v/1)<br>
[Dataset Description](https://www.sciencedirect.com/science/article/pii/S2352340920313202)

In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

# Modelling
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings
import os

In [4]:
df = pd.read_csv("data/final_data.csv")

In [5]:
df['phishing'].value_counts()

phishing
0    56706
1    30497
Name: count, dtype: int64

In [6]:
df.shape

(87203, 54)

In [7]:
X = df.drop(columns=['phishing'])
X.head()

Unnamed: 0,directory_length,time_domain_activation,qty_slash_directory,qty_at_file,qty_slash_file,qty_equal_file,qty_dot_file,ttl_hostname,qty_equal_directory,qty_plus_file,...,qty_hyphen_domain,qty_and_directory,qty_questionmark_file,qty_hashtag_directory,params_length,qty_dot_params,qty_params,url_shortened,qty_equal_params,qty_space_params
0,8,-1,1,0,0,0,1,892,0,0,...,0,0,0,0,-1,-1,-1,0,-1,-1
1,42,579,3,0,0,0,1,9540,0,0,...,0,0,0,0,165,0,3,0,3,0
2,1,-1,1,0,0,0,0,589,0,0,...,0,0,0,0,-1,-1,-1,0,-1,-1
3,62,-1,5,0,0,0,1,292,0,0,...,0,0,0,0,-1,-1,-1,0,-1,-1
4,-1,6998,-1,-1,-1,-1,-1,3597,-1,-1,...,0,-1,-1,-1,-1,-1,-1,0,-1,-1


In [8]:
X_cols = X.columns
X_cols

Index(['directory_length', 'time_domain_activation', 'qty_slash_directory',
       'qty_at_file', 'qty_slash_file', 'qty_equal_file', 'qty_dot_file',
       'ttl_hostname', 'qty_equal_directory', 'qty_plus_file', 'asn_ip',
       'time_response', 'time_domain_expiration', 'qty_underline_file',
       'domain_length', 'qty_percent_directory', 'qty_dot_domain',
       'qty_hyphen_file', 'file_length', 'qty_asterisk_directory',
       'qty_exclamation_directory', 'qty_asterisk_file', 'qty_tilde_file',
       'qty_at_directory', 'qty_vowels_domain', 'qty_plus_directory',
       'qty_exclamation_file', 'qty_dot_directory', 'qty_mx_servers',
       'qty_nameservers', 'qty_underline_directory', 'qty_hyphen_directory',
       'qty_comma_directory', 'qty_space_file', 'qty_and_file',
       'qty_dollar_directory', 'qty_questionmark_directory',
       'qty_space_directory', 'qty_ip_resolved', 'qty_redirects',
       'tls_ssl_certificate', 'qty_percent_file', 'domain_spf',
       'qty_hyphen_domai

In [9]:
y = df['phishing']
y

0        1
1        1
2        0
3        1
4        0
        ..
87198    0
87199    0
87200    1
87201    1
87202    0
Name: phishing, Length: 87203, dtype: int64

In [10]:
y.values.ravel()

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X.shape

(87203, 53)

In [13]:
X

array([[-0.12067099, -1.12987003,  0.12201639, ..., -0.07486279,
        -0.24617076, -0.30325457],
       [ 1.27116209, -0.93909471,  1.02376606, ..., -0.07486279,
         3.33910332,  3.28490121],
       [-0.40722485, -1.12987003,  0.12201639, ..., -0.07486279,
        -0.24617076, -0.30325457],
       ...,
       [ 1.51677969, -0.52300715,  1.92551573, ..., -0.07486279,
        -0.24617076, -0.30325457],
       [-0.40722485, -1.12987003,  0.12201639, ..., -0.07486279,
        -0.24617076, -0.30325457],
       [-0.48909739, -1.03086421, -0.77973328, ..., -0.07486279,
        -0.24617076, -0.30325457]])

In [14]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((69762, 53), (17441, 53), (69762,), (17441,))

In [15]:
# import pickle
# pickle.dump(scaler, open('scaling.pkl','wb'))

In [16]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    class_report = classification_report(predicted , true, target_names=["legitimate","malicious"])
    return accuracy, class_report


In [17]:
# Define models
models = {
    "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "XGBClassifier": XGBClassifier(random_state=42),
    "CatBoost Classifier": CatBoostClassifier(verbose=False, random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42)
}

model_list = []
train_accuracies = []
test_accuracies = []

for name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_accuracy, model_classification_report_train = evaluate_model(y_train, y_train_pred)
    model_test_accuracy, model_classification_report_test = evaluate_model(y_test, y_test_pred)

    print(name)
    print('-' * 20)
    print('Train Accuracy:', model_train_accuracy)
    print('Train Classification Report:\n', model_classification_report_train)
    print('-' * 35)
    print('Test Accuracy:', model_test_accuracy)
    print('Test Classification Report:\n', model_classification_report_test)
    print('=' * 35)
    print('\n')
    
    model_list.append(name)
    train_accuracies.append(model_train_accuracy)
    test_accuracies.append(model_test_accuracy)

Decision Tree Classifier
--------------------
Train Accuracy: 0.9999856655485795
Train Classification Report:
               precision    recall  f1-score   support

  legitimate       1.00      1.00      1.00     45227
   malicious       1.00      1.00      1.00     24535

    accuracy                           1.00     69762
   macro avg       1.00      1.00      1.00     69762
weighted avg       1.00      1.00      1.00     69762

-----------------------------------
Test Accuracy: 0.9537297173327217
Test Classification Report:
               precision    recall  f1-score   support

  legitimate       0.96      0.97      0.96     11469
   malicious       0.93      0.93      0.93      5972

    accuracy                           0.95     17441
   macro avg       0.95      0.95      0.95     17441
weighted avg       0.95      0.95      0.95     17441



Random Forest Classifier
--------------------
Train Accuracy: 0.999971331097159
Train Classification Report:
               precision 

In [18]:
results_df = pd.DataFrame(list(zip(model_list, test_accuracies)), columns=['Model Name', 'Test Accuracy']).sort_values(by=["Test Accuracy"],ascending=False)
results_df

Unnamed: 0,Model Name,Test Accuracy
1,Random Forest Classifier,0.971905
2,XGBClassifier,0.969841
3,CatBoost Classifier,0.969841
0,Decision Tree Classifier,0.95373
4,Gradient Boosting Classifier,0.953156


In [19]:
import pickle

best_model_name = results_df.iloc[0]['Model Name']
best_model = models[best_model_name]
# pickle.dump(best_model, 'best_model.pkl')

print(f"The best model is {best_model_name} with a test accuracy of {results_df.iloc[0]['Test Accuracy']:.4f}")

The best model is Random Forest Classifier with a test accuracy of 0.9719
