# Phishing Domain Detection
## Experiment with Recursive Feature Elimination (RFE) as explained [here](https://www.sciencedirect.com/science/article/abs/pii/S1389128622004418?via%3Dihub)

[Dataset Link](https://data.mendeley.com/datasets/72ptz43s9v/1)<br>
[Dataset Description](https://www.sciencedirect.com/science/article/pii/S2352340920313202)

In [2]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
sns.set_style("whitegrid")
%matplotlib inline

# Modelling
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import warnings
import os

In [3]:
df = pd.read_csv("data/78_features.csv")
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [4]:
X = df.drop(columns=['phishing'])
X.head()

Unnamed: 0,qty_dot_domain,qty_hyphen_domain,qty_vowels_domain,domain_length,domain_in_ip,server_client_domain,qty_dot_directory,qty_hyphen_directory,qty_underline_directory,qty_slash_directory,...,time_domain_expiration,qty_ip_resolved,qty_nameservers,qty_mx_servers,ttl_hostname,tls_ssl_certificate,qty_redirects,url_google_index,domain_google_index,url_shortened
0,2,0,4,17,0,0,1,0,0,1,...,-1,1,2,0,892,0,0,0,0,0
1,2,0,5,16,0,0,3,0,0,3,...,150,1,2,1,9540,1,0,0,0,0
2,2,0,3,14,0,0,0,0,0,1,...,-1,1,2,3,589,1,0,0,0,0
3,2,0,7,19,0,0,2,0,2,5,...,-1,1,2,0,292,1,0,0,0,0
4,2,0,5,19,0,0,-1,-1,-1,-1,...,306,1,2,1,3597,0,1,0,0,0


In [5]:
y = df['phishing']
y

0        1
1        1
2        0
3        1
4        0
        ..
87204    0
87205    0
87206    1
87207    1
87208    0
Name: phishing, Length: 87203, dtype: int64

In [6]:
y.values.ravel()

array([1, 1, 0, ..., 1, 1, 0], dtype=int64)

In [7]:
# checking the sizes of the sample data
print("Size of X:", X.shape)
print("Size of y:", y.shape)

Size of X: (87203, 77)
Size of y: (87203,)


In [8]:
X_cols = X.columns.tolist()
X_cols

['qty_dot_domain',
 'qty_hyphen_domain',
 'qty_vowels_domain',
 'domain_length',
 'domain_in_ip',
 'server_client_domain',
 'qty_dot_directory',
 'qty_hyphen_directory',
 'qty_underline_directory',
 'qty_slash_directory',
 'qty_questionmark_directory',
 'qty_equal_directory',
 'qty_at_directory',
 'qty_and_directory',
 'qty_exclamation_directory',
 'qty_space_directory',
 'qty_tilde_directory',
 'qty_comma_directory',
 'qty_plus_directory',
 'qty_asterisk_directory',
 'qty_hashtag_directory',
 'qty_dollar_directory',
 'qty_percent_directory',
 'directory_length',
 'qty_dot_file',
 'qty_hyphen_file',
 'qty_underline_file',
 'qty_slash_file',
 'qty_questionmark_file',
 'qty_equal_file',
 'qty_at_file',
 'qty_and_file',
 'qty_exclamation_file',
 'qty_space_file',
 'qty_tilde_file',
 'qty_comma_file',
 'qty_plus_file',
 'qty_asterisk_file',
 'qty_hashtag_file',
 'qty_dollar_file',
 'qty_percent_file',
 'file_length',
 'qty_dot_params',
 'qty_hyphen_params',
 'qty_underline_params',
 'qty_s

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)
X.shape

(87203, 77)

In [11]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((69762, 77), (17441, 77), (69762,), (17441,))

In [12]:
# import pickle
# pickle.dump(scaler, open('scaling.pkl','wb'))

In [13]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, average='weighted')
    recall = recall_score(true, predicted, average='weighted')
    f1 = f1_score(true, predicted, average='weighted')
    class_report = classification_report(predicted , true, target_names=["legitimate","malicious"])
    return accuracy, precision, recall, f1, class_report

In [15]:
# Define models
models = {
    "Random Forest Classifier": RandomForestClassifier(random_state=42)
    # "Decision Tree Classifier": DecisionTreeClassifier(random_state=42),
    # "XGBClassifier": XGBClassifier(random_state=42),
    # "CatBoost Classifier": CatBoostClassifier(verbose=False, random_state=42),
    # "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42)
}

model_list = []
train_accuracies = []
train_precisions = []
train_recalls = []
train_f1_scores = []
test_accuracies = []
test_precisions = []
test_recalls = []
test_f1_scores = []

for name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train dataset
    model_train_accuracy, model_train_precision, model_train_recall, model_train_f1, model_classification_report_train = evaluate_model(y_train, y_train_pred)
    # Evaluate Test dataset
    model_test_accuracy, model_test_precision, model_test_recall, model_test_f1, model_classification_report_test = evaluate_model(y_test, y_test_pred)

    print(name)
    print('-' * 20)
    print('Train Accuracy:', model_train_accuracy)
    print('Train Classification Report:\n', model_classification_report_train)
    print('-' * 35)
    print('Test Accuracy:', model_test_accuracy)
    print('Test Classification Report:\n', model_classification_report_test)
    print('=' * 35)
    print('\n')
    
    model_list.append(name)

    train_accuracies.append(model_train_accuracy)
    train_precisions.append(model_train_precision)
    train_recalls.append(model_train_recall)
    train_f1_scores.append(model_train_f1)
    
    test_accuracies.append(model_test_accuracy)
    test_precisions.append(model_test_precision)
    test_recalls.append(model_test_recall)
    test_f1_scores.append(model_test_f1)

Random Forest Classifier
--------------------
Train Accuracy: 0.999971331097159
Train Classification Report:
               precision    recall  f1-score   support

  legitimate       1.00      1.00      1.00     45228
   malicious       1.00      1.00      1.00     24534

    accuracy                           1.00     69762
   macro avg       1.00      1.00      1.00     69762
weighted avg       1.00      1.00      1.00     69762

-----------------------------------
Test Accuracy: 0.9709879020698354
Test Classification Report:
               precision    recall  f1-score   support

  legitimate       0.98      0.98      0.98     11434
   malicious       0.96      0.95      0.96      6007

    accuracy                           0.97     17441
   macro avg       0.97      0.97      0.97     17441
weighted avg       0.97      0.97      0.97     17441





In [16]:
# results_df = pd.DataFrame(list(zip(model_list, test_accuracies)), columns=['Model Name', 'Test Accuracy']).sort_values(by=["Test Accuracy"],ascending=False)
# results_df

In [17]:

results_df = pd.DataFrame({
    'Model Name': model_list,
    'Test Accuracy': test_accuracies,
    'Test Precision': test_precisions,
    'Test Recall': test_recalls,
    'Test F1-score': test_f1_scores
})

results_df

Unnamed: 0,Model Name,Test Accuracy,Test Precision,Test Recall,Test F1-score
0,Random Forest Classifier,0.970988,0.971056,0.970988,0.971015


In [18]:
results_df.loc[results_df['Test Accuracy'].idxmax()]

Model Name        Random Forest Classifier
Test Accuracy                     0.970988
Test Precision                    0.971056
Test Recall                       0.970988
Test F1-score                     0.971015
Name: 0, dtype: object

In [19]:
results_df.loc[results_df['Test Precision'].idxmax()]

Model Name        Random Forest Classifier
Test Accuracy                     0.970988
Test Precision                    0.971056
Test Recall                       0.970988
Test F1-score                     0.971015
Name: 0, dtype: object

In [20]:
results_df.loc[results_df['Test Recall'].idxmax()]

Model Name        Random Forest Classifier
Test Accuracy                     0.970988
Test Precision                    0.971056
Test Recall                       0.970988
Test F1-score                     0.971015
Name: 0, dtype: object

In [21]:
results_df.loc[results_df['Test F1-score'].idxmax()]

Model Name        Random Forest Classifier
Test Accuracy                     0.970988
Test Precision                    0.971056
Test Recall                       0.970988
Test F1-score                     0.971015
Name: 0, dtype: object

In [22]:
import pickle

best_model_name = results_df.loc[results_df['Test F1-score'].idxmax()]['Model Name']
best_model = models[best_model_name]
# pickle.dump(best_model, 'best_model.pkl')

print(f"The best model is {best_model_name} with a Test F1-score of {results_df.iloc[0]['Test F1-score']}")

The best model is Random Forest Classifier with a Test F1-score of 0.9710145362863138


---------------------------------------------------------------------------------------------------------------------------

## RFE

In [23]:
from sklearn.feature_selection import RFE

def recursive_feature_elimination(X_train=X_train, y_train=y_train, n_features_to_select=14, estimator=RandomForestClassifier()):

    rfe = RFE(estimator=estimator, n_features_to_select=n_features_to_select)
    rfe.fit(X_train, y_train)
    
    selected_features = [X_cols[i] for i in range(len(X_cols)) if rfe.support_[i]]
    
    return selected_features

In [24]:
# selected_features = recursive_feature_elimination()
# selected_features

In [25]:
selected_features = ['domain_length',
 'qty_hyphen_directory',
 'qty_slash_directory',
 'directory_length',
 'qty_dot_file',
 'qty_exclamation_file',
 'qty_space_file',
 'qty_tilde_file',
 'qty_percent_file',
 'file_length',
 'time_response',
 'asn_ip',
 'time_domain_activation',
 'ttl_hostname']

In [35]:
# selected_features2 = [
#     "qty_dot_domain",
#     "qty_vowels_domain",
#     "domain_length",
#     "qty_dot_directory",
#     "qty_slash_directory",
#     "directory_length",
#     "qty_dot_file",
#     "file_length",
#     "params_length",
#     "time_response",
#     "asn_ip",
#     "time_domain_activation",
#     "time_domain_expiration",
#     "ttl_hostname"
# ]


## PREDICTION

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score

# Assuming 'importances', 'indices', and 'df' are already defined

def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, average='weighted')
    recall = recall_score(true, predicted, average='weighted')
    f1 = f1_score(true, predicted, average='weighted')
    class_report = classification_report(predicted, true, target_names=["legitimate", "malicious"])
    return accuracy, precision, recall, f1, class_report

# Initialize lists to store results
j_values = []
num_features = []
f1_scores = []

# Iterate over the range of j
for j in [x * 0.001 for x in range(0, 101)]:
    important_features = []
    for i in indices:
        if importances[i] >= j:
            important_features.append(df.columns[i])
    
    # Store the number of important features
    num_features.append(len(important_features))
    
    # Check if there are features selected
    if len(important_features) == 0:
        f1_scores.append(np.nan)  # Handle case where no features are selected
        continue

    # Prepare the data
    X = df[important_features]
    y = df['phishing']
    y = y.values.ravel()

    # Standardize the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define and train the model
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Make predictions
    y_test_pred = model.predict(X_test)
    
    # Evaluate the model
    _, _, _, f1, _ = evaluate_model(y_test, y_test_pred)
    
    # Store the F1-score
    f1_scores.append(f1)
    
    # Store the current j value
    j_values.append(j)