In [1]:
# Suppress the unnecessary warnings so they do not clutter the cell outputs
import warnings
warnings.filterwarnings(action="ignore")

In [2]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# Seed NumPy's global random number generator so that results are repeatable
np.random.seed(seed=7)

In [3]:
# Read the phishing dataset from the CSV file (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer="Phishing.csv")

In [4]:
# Rename the target column "Result" to "Class" and recode its labels from {-1, 1} to {0, 1}.
# The frame is reassigned instead of mutated with inplace=True so the step stays explicit.
data = data.rename(columns={"Result": "Class"})

# The 0 -> 0 entry makes this cell safe to re-run: labels that were already
# recoded stay as they are instead of being mapped to NaN.
# NOTE(review): this assumes the raw column only ever holds -1 and 1 — a raw 0
# label would now be kept as class 0 rather than becoming NaN; confirm against the data.
data["Class"] = data["Class"].map({-1: 0, 0: 0, 1: 1})

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Features are the first 30 columns, the label is the column at position 30;
# both are converted to integer NumPy arrays.
X = data.iloc[:, 0:30].values.astype(int)
y = data.iloc[:, 30].values.astype(int)

# Hold out 20% of the rows as the test set.
# random_state must receive the seed itself: np.random.seed(7) returns None, so
# passing its result left random_state unset and the split was only repeatable
# through the side effect of reseeding NumPy's global RNG during the call.
# NOTE(review): with the same seed value the partition is expected to match the
# earlier runs, but the global RNG is no longer reseeded or advanced here —
# confirm if later cells relied on that side effect.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [7]:
# Serialize the train and test arrays to .npy files in the working directory
np.save("X_train.npy", X_train)
np.save("y_train.npy", y_train)

# The test files must contain the held-out split; they previously received
# X_train / y_train, which made the saved "test" data a copy of the training data.
np.save("X_test.npy", X_test)
np.save("y_test.npy", y_test)

(None, None)

In [8]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import wandb
import time

In [9]:
# Baseline: fit a Logistic Regression with default hyperparameters, time the fit,
# log the test-set metrics to Weights & Biases and print the evaluation summary.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

logreg = LogisticRegression()
start = time.time()
logreg.fit(X_train, y_train)
end = time.time() - start  # elapsed fit time in seconds (a duration, not an end timestamp)
y_pred = logreg.predict(X_test)
print('Accuracy score of the Logistic Regression Classifier with default hyperparameter values: {:.2f}'.format(logreg.score(X_test, y_test)))
print('---- Classification report of the Logistic Regression Classifier with default hyperparameter values ---- ')

# Compute the macro-averaged scores once and reuse them, instead of calling
# precision_recall_fscore_support twice with identical arguments.
macro_precision, macro_recall = precision_recall_fscore_support(y_test, y_pred, average='macro')[:2]

wandb.init(project="phishing-websites-detection", name='Logistic Regression')
# Accuracy is logged as a percentage; precision and recall are logged as fractions.
wandb.log({"accuracy": accuracy_score(y_test, y_pred) * 100.0,
           "precision": macro_precision,
           "recall": macro_recall,
           "training_time": end})

# classification_report comes from the metrics import cell, so it is not re-imported here.
print(classification_report(y_test, y_pred, target_names=['Phising Websites', 'Normal Websites']))

# NOTE: this rebinds the name `confusion_matrix` from the sklearn function to the
# resulting array; the import at the top of this cell restores the function on re-run.
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

Accuracy score of the Logistic Regression Classifier with default hyperparameter values: 0.94
---- Classification report of the Logistic Regression Classifier with default hyperparameter values ---- 


Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
                  precision    recall  f1-score   support

Phising Websites       0.94      0.92      0.93       974
 Normal Websites       0.94      0.95      0.94      1237

        accuracy                           0.94      2211
       macro avg       0.94      0.94      0.94      2211
    weighted avg       0.94      0.94      0.94      2211

[[ 896   78]
 [  61 1176]]


In [10]:
# Randomized hyperparameter search over a liblinear Logistic Regression:
# time the search, evaluate the refitted best model on the test set and log
# those metrics to Weights & Biases.
from sklearn.model_selection import RandomizedSearchCV

penalty = ['l1', 'l2']
C = [0.8, 0.9, 1.0]
# NOTE(review): 0.001 appears twice in this list — possibly a typo for 0.0001;
# left unchanged pending confirmation.
tol = [0.01, 0.001, 0.001]
max_iter = [100, 150, 200, 250]
hyperparameters = {'penalty': penalty,
                   'C': C,
                   'tol': tol,
                   'max_iter': max_iter}

# Explicit seeds (same value as the notebook-wide np.random.seed(7)) make the
# sampled candidates and the liblinear fits repeatable regardless of how much of
# the global RNG stream earlier cells have consumed.
randomCV = RandomizedSearchCV(LogisticRegression(solver='liblinear', random_state=7),
                              param_distributions=hyperparameters,
                              random_state=7)
start = time.time()
randomCV.fit(X_train, y_train)
end = time.time() - start  # elapsed search time in seconds

# Predict BEFORE logging: the metrics were previously computed from the y_pred left
# over from the preceding cell, so this run was logged with another model's scores.
y_pred = randomCV.predict(X_test)
macro_precision, macro_recall = precision_recall_fscore_support(y_test, y_pred, average='macro')[:2]

wandb.init(project="phishing-websites-detection", name='CV Random Search')
wandb.log({"accuracy": accuracy_score(y_test, y_pred) * 100.0,
           "precision": macro_precision,
           "recall": macro_recall,
           "training_time": end})

# best_score_ is the search's cross-validated score for the best candidate,
# not a test-set score.
best_score = randomCV.best_score_
print('Best score: ','{:2.2f}'.format(best_score*100), ' using ', randomCV.best_params_ )

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
Best score:  92.44  using  {'tol': 0.001, 'penalty': 'l1', 'max_iter': 250, 'C': 1.0}
