<a href="https://colab.research.google.com/github/nasoufisdim/EKPA/blob/main/phishing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
data = pd.read_csv("https://raw.githubusercontent.com/nasoufisdim/EKPA/main/phishing_dataset.csv", sep=',' )

In [9]:
data = data.sample(frac=1).reset_index(drop=True)
data.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards,Label
0,mootah-medical.com,0,1,1,1,0,0,0,1,0,1,0,1,0,0,1,0,1
1,cellidplus.com,0,0,1,5,0,0,0,0,0,1,0,1,0,0,1,0,1
2,elitedaily.com,0,0,1,4,0,0,0,0,0,1,0,1,0,0,1,0,0
3,secure.runescape.rs-e.xyz,0,0,1,2,0,0,0,1,0,1,0,1,1,1,1,1,1
4,ap.org,0,0,1,4,0,0,0,0,0,1,0,1,0,0,1,0,0


In [10]:
X = data.drop(['Domain', 'Label'], axis=1)

In [11]:
X.head()

Unnamed: 0,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,Web_Traffic,Domain_Age,Domain_End,iFrame,Mouse_Over,Right_Click,Web_Forwards
0,0,1,1,1,0,0,0,1,0,1,0,1,0,0,1,0
1,0,0,1,5,0,0,0,0,0,1,0,1,0,0,1,0
2,0,0,1,4,0,0,0,0,0,1,0,1,0,0,1,0
3,0,0,1,2,0,0,0,1,0,1,0,1,1,1,1,1
4,0,0,1,4,0,0,0,0,0,1,0,1,0,0,1,0


In [12]:
y=np.ravel(data['Label'])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state=43)

In [14]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [15]:
# Define classifiers
classifiers = [
    LogisticRegression(),
    SVC(),
    RandomForestClassifier(),
    MLPClassifier(),
    KNeighborsClassifier(),
    DecisionTreeClassifier()
]

In [16]:
# Evaluate and compare classifiers
results = {'Classifier': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': []}

for clf in classifiers:
    clf_name = clf.__class__.__name__

    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Calculate evaluation metrics
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)

    # Display results
    print(f"\n{clf_name} Metrics:")
    print(f"Accuracy: {acc}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print("Confusion Matrix:")
    print(cm)

    # Store results for comparison
    results['Classifier'].append(clf_name)
    results['Accuracy'].append(acc)
    results['Precision'].append(precision)
    results['Recall'].append(recall)
    results['F1 Score'].append(f1)


LogisticRegression Metrics:
Accuracy: 0.8096666666666666
Precision: 0.8356664468864469
Recall: 0.8096666666666666
F1 Score: 0.8052889095900119
Confusion Matrix:
[[1453   74]
 [ 497  976]]

SVC Metrics:
Accuracy: 0.8333333333333334
Precision: 0.8619792927492346
Recall: 0.8333333333333334
F1 Score: 0.829447181171319
Confusion Matrix:
[[1490   37]
 [ 463 1010]]

RandomForestClassifier Metrics:
Accuracy: 0.8613333333333333
Precision: 0.8671815275498045
Recall: 0.8613333333333333
F1 Score: 0.860586951540841
Confusion Matrix:
[[1416  111]
 [ 305 1168]]





MLPClassifier Metrics:
Accuracy: 0.8573333333333333
Precision: 0.8671665908209331
Recall: 0.8573333333333333
F1 Score: 0.8561160934322932
Confusion Matrix:
[[1438   89]
 [ 339 1134]]

KNeighborsClassifier Metrics:
Accuracy: 0.8556666666666667
Precision: 0.8598560264043349
Recall: 0.8556666666666667
F1 Score: 0.8550715582194439
Confusion Matrix:
[[1394  133]
 [ 300 1173]]

DecisionTreeClassifier Metrics:
Accuracy: 0.8603333333333333
Precision: 0.8679941420957591
Recall: 0.8603333333333333
F1 Score: 0.8593822032355207
Confusion Matrix:
[[1428   99]
 [ 320 1153]]


In [17]:
# Display comparison results
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False)
print("\nComparison of Classifiers:")
print(results_df)



Comparison of Classifiers:
               Classifier  Accuracy  Precision    Recall  F1 Score
2  RandomForestClassifier  0.861333   0.867182  0.861333  0.860587
5  DecisionTreeClassifier  0.860333   0.867994  0.860333  0.859382
3           MLPClassifier  0.857333   0.867167  0.857333  0.856116
4    KNeighborsClassifier  0.855667   0.859856  0.855667  0.855072
1                     SVC  0.833333   0.861979  0.833333  0.829447
0      LogisticRegression  0.809667   0.835666  0.809667  0.805289
