In [1]:
import pandas as pd

# Location of the Kaggle phishing dataset CSV.
PHISHING = '/kaggle/input/web-page-phishing-dataset/web-page-phishing.csv'
# Read the CSV and append a boolean copy of the integer 'phishing' label as 'Phishing'.
df = pd.read_csv(PHISHING).assign(Phishing=lambda frame: frame['phishing'].astype(bool))
df.head()

Unnamed: 0,url_length,n_dots,n_hypens,n_underline,n_slash,n_questionmark,n_equal,n_at,n_and,n_exclamation,...,n_tilde,n_comma,n_plus,n_asterisk,n_hastag,n_dollar,n_percent,n_redirection,phishing,Phishing
0,37,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,False
1,77,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,True
2,126,4,1,2,0,1,3,0,2,0,...,0,0,0,0,0,0,0,1,1,True
3,18,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,False
4,55,2,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,False


In [2]:
import warnings
from plotly import express
# Ignore every FutureWarning raised from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Pie chart of the label distribution, one slice per value of 'phishing'.
express.pie(df, names='phishing', color='phishing')

Our classes are unbalanced, about 2:1 in favor of not-phishing.

In [3]:
# Correlation of every column with the integer label, as a two-column frame ('index', 'phishing').
correlation_df = df.corr()['phishing'].reset_index()
# Drop the two label rows so only feature correlations are plotted.
express.histogram(
    data_frame=correlation_df.loc[lambda frame: ~frame['index'].isin({'phishing', 'Phishing'})],
    x='index',
    y='phishing',
)

Our correlations are interesting in that almost all features are positively correlated with the target variable.

In [4]:
# Display the DataFrame's column labels.
df.columns

Index(['url_length', 'n_dots', 'n_hypens', 'n_underline', 'n_slash',
       'n_questionmark', 'n_equal', 'n_at', 'n_and', 'n_exclamation',
       'n_space', 'n_tilde', 'n_comma', 'n_plus', 'n_asterisk', 'n_hastag',
       'n_dollar', 'n_percent', 'n_redirection', 'phishing', 'Phishing'],
      dtype='object')

In [5]:
from umap import UMAP
from arrow import now

# Feature columns: everything in df except the two label columns ('phishing' and 'Phishing').
columns = ['url_length', 'n_dots', 'n_hypens', 'n_underline', 'n_slash', 'n_questionmark', 'n_equal', 'n_at', 'n_and', 'n_exclamation',
           'n_space', 'n_tilde', 'n_comma', 'n_plus', 'n_asterisk', 'n_hastag', 'n_dollar', 'n_percent', 'n_redirection']
target = 'Phishing'

time_start = now()
# running UMAP over all our data takes forever, and we should be able to get a sense
# of how strong a signal is in our data from a sample
sample_df = df.sample(n=10000, random_state=2024).copy()
reducer = UMAP(n_components=2, random_state=2024, transform_seed=2024, verbose=True, n_jobs=1, n_epochs=100)
# fit_transform returns one embedding row per input row, in input order.
embedding = reducer.fit_transform(X=sample_df[columns])
# Assigning a DataFrame to columns aligns on index labels. sample_df keeps the
# (shuffled) labels of the rows drawn from df, so the embedding frame must carry
# that same index; a default RangeIndex would leave most rows NaN and attach the
# remaining coordinates to the wrong rows.
sample_df[['x', 'y']] = pd.DataFrame(data=embedding, index=sample_df.index)
express.scatter(data_frame=sample_df, x='x', y='y', color=target, height=800, ).show()
print('UMAP done in {}'.format(now() - time_start))

2024-03-06 20:43:46.191787: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-06 20:43:46.191967: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-06 20:43:46.351102: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(n_epochs=100, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Wed Mar  6 20:43:59 2024 Construct fuzzy simplicial set
Wed Mar  6 20:43:59 2024 Finding Nearest Neighbors
Wed Mar  6 20:43:59 2024 Building RP forest with 10 trees
Wed Mar  6 20:44:05 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	Stopping threshold met -- exiting after 2 iterations
Wed Mar  6 20:44:25 2024 Finished Nearest Neighbor Search
Wed Mar  6 20:44:29 2024 Construct embedding


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
Wed Mar  6 20:44:38 2024 Finished embedding


UMAP done in 0:00:39.186702


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

target = 'Phishing'

# Hold out 20% of the rows for testing; stratify on the label so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df[columns],
    df[target],
    test_size=0.2,
    random_state=2024,
    stratify=df[target],
)

# Baseline: plain logistic regression on the raw (unscaled) features.
regression = LogisticRegression(max_iter=1000, tol=1e-6).fit(X=X_train, y=y_train)
print(f'fit complete after {regression.n_iter_[0]} iterations.')
print(f'accuracy: {regression.score(X=X_test, y=y_test):5.4f} ')
# One bar per feature showing its fitted coefficient.
express.histogram(x=columns, y=regression.coef_[0].tolist()).show(validate=True)

fit complete after 391 iterations.
accuracy: 0.8585 


It is interesting to note that our correlations and our regression coefficients look nothing alike.

In [7]:
from arrow import now
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers keyed by display name, in the order they are evaluated.
# Entries left commented out are disabled candidates.
MODEL = {
    'Naive Bayes': GaussianNB(),
    # 'QDA': QuadraticDiscriminantAnalysis(),
    # '3 Nearest Neighbors': KNeighborsClassifier(n_neighbors=3),
    # '5 Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    **{
        '{} Nearest Neighbors'.format(k): KNeighborsClassifier(n_neighbors=k)
        for k in (7, 8, 9, 10)
    },
    # 'Linear SVM': SVC(kernel='linear', C=0.025, random_state=2024),
    # 'Gaussian Process': GaussianProcessClassifier(1.0 * RBF(1.0), random_state=2024),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=2024),
    **{
        '{} estimator Random Forest'.format(n): RandomForestClassifier(
            max_depth=5, n_estimators=n, max_features=1, random_state=2024
        )
        for n in (10, 20)
    },
    'Neural Net': MLPClassifier(alpha=1, max_iter=1000, random_state=2024),
    'AdaBoost': AdaBoostClassifier(algorithm='SAMME', random_state=2024),
    'RBF SVM': SVC(gamma=2, C=1, random_state=2024),
}

# Fit each candidate behind a StandardScaler, record its test accuracy, and
# print accuracy / elapsed time / name as we go.
result = []
for name, estimator in MODEL.items():
    time_start = now()
    clf = make_pipeline(StandardScaler(), estimator)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    result.append((score, name))
    print(f'{score:5.4f} {now() - time_start} {name}')

# Highest accuracy first; the sort is stable, so ties keep evaluation order.
result.sort(key=lambda pair: pair[0], reverse=True)
best_score, best_name = result[0]
print(f'best: {best_score} {best_name}')

0.7028 0:00:00.078700 Naive Bayes
0.8903 0:00:05.473607 7 Nearest Neighbors
0.8910 0:00:05.131281 8 Nearest Neighbors
0.8931 0:00:05.094336 9 Nearest Neighbors
0.8908 0:00:05.147526 10 Nearest Neighbors
0.8716 0:00:00.138513 Decision Tree
0.7620 0:00:00.235188 10 estimator Random Forest
0.8093 0:00:00.461518 20 estimator Random Forest
0.8755 0:00:19.246555 Neural Net
0.8640 0:00:02.019712 AdaBoost
0.8887 0:10:22.359691 RBF SVM
best: 0.8930855315747402 9 Nearest Neighbors
