# Training Models

## Setup

We use the `wandb` library to log the metrics and hyperparameters of our model.
Import the W&B Python SDK and log in (you'll need to paste your API key).

In [None]:
import os

# A bare `wandb.Settings(quiet=True)` only constructs a settings object that is
# thrown away immediately, so it never reduced W&B's console output.
# The WANDB_QUIET environment variable is read by the SDK itself; `setdefault`
# keeps any value that has already been exported in the environment.
os.environ.setdefault("WANDB_QUIET", "true")

import wandb

# Authenticate this session with W&B (asks for an API key if one is needed).
wandb.login()

We'll define a `set_seeds` function that will set the seeds for reproducibility across our libraries.

In [None]:
import random
import numpy as np


def set_seeds(seed: int):
    """
    Seed the random number generators used in this notebook.

    Applies the same integer to Python's built-in `random` module and to
    NumPy's global generator so that repeated runs draw identical numbers.

    Args:
        seed: Integer passed unchanged to each seeding function.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

In [None]:
# Single seed value reused by the seeding helper and by the estimators
# constructed later in the notebook.
random_seed = 42
# Seed Python's `random` module and NumPy's global generator.
set_seeds(random_seed)

## Load Dataset

In [None]:
import pandas as pd

# Load the extracted-features table. The path is relative, so it resolves
# against the directory the kernel was started in.
df = pd.read_csv("../data/extracted_features.csv")

In [None]:
# Drop every row that contains at least one missing value.
# NOTE: this rebinds `df`, so the unfiltered frame can only be recovered by
# re-running the load cell above.
df = df.dropna()

In [None]:
# Columns kept out of the feature matrix: the label itself plus three
# columns that are not used as model inputs.
non_feature_columns = ["url", "is_phishing", "domain", "tld"]

# Target vector and feature matrix.
y = df["is_phishing"]
X = df.drop(columns=non_feature_columns)

## Prepare Dataset

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing consists of a single standardisation step.
std_scaler_step = ('std_scaler', StandardScaler())
num_pipeline = Pipeline(steps=[std_scaler_step])

In [None]:
from sklearn.compose import ColumnTransformer

# Every column of X is routed through the numeric pipeline.
num_attribs = X.columns.tolist()

full_pipeline = ColumnTransformer(
    transformers=[("num", num_pipeline, num_attribs)],
)

# Fit the transformer on X and transform X in the same call.
# Note that the fit here sees every row of X.
X_prepared = full_pipeline.fit_transform(X)

In [None]:
# (rows, columns) of the feature frame before preprocessing.
X.shape

In [None]:
# (rows, columns) after preprocessing, for comparison with X.shape.
X_prepared.shape

In [None]:
from sklearn.model_selection import train_test_split

# Split the *raw* features so the held-out rows never contribute to the
# scaler's mean/variance. Fitting the preprocessing on the full dataset and
# splitting afterwards leaks test-set statistics into training.
# The shared `random_seed` replaces a second hard-coded 42.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_seed
)

# Re-fit the preprocessing on the training rows only, then apply that same
# fitted transform to the test rows. The fitted `full_pipeline` is what gets
# bundled with the model when it is saved.
X_train = full_pipeline.fit_transform(X_train_raw)
X_test = full_pipeline.transform(X_test_raw)

## Model Selection

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Iteration cap passed to the two estimators that accept `max_iter`.
max_iter = 1000

# Candidate classifiers as (display name, estimator) pairs; converting the
# list to a dict keeps this order when the candidates are iterated.
candidates = [
    ("Logistic Regression", LogisticRegression(random_state=random_seed, max_iter=max_iter)),
    ("Random Forest", RandomForestClassifier(random_state=random_seed)),
    ("SVM", SVC(random_state=random_seed, max_iter=max_iter)),
    ("K-Nearest Neighbors", KNeighborsClassifier()),
]
models = dict(candidates)

In [None]:
from sklearn.model_selection import cross_val_score

# One W&B run per candidate model, all grouped under "model_selection".
for model_name, model in models.items():
    # Using the run as a context manager finishes it even when
    # cross-validation raises. Previously an exception skipped
    # wandb.finish() and left the run open for the next iteration.
    with wandb.init(
        project='phishing-url-detection',
        group='model_selection',
        notes="Standardizes features",
        config={
            "model": model_name,
        }
    ) as run:
        # 5-fold cross-validated accuracy, computed on the training split only.
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

        # Log the mean across folds as this run's accuracy.
        run.log({
            "accuracy": scores.mean()
        })

Comparing the mean 5-fold cross-validated accuracy logged to W&B for each run above, the Random Forest model scores highest, so we will proceed with this model.

## Hyperparameter Tuning

In [None]:
# Search space for tuning the random forest: every combination of the
# values below is one candidate.
param_grid = dict(
    n_estimators=[100, 200, 300],          # number of trees
    max_depth=[None, 10, 20, 30],          # maximum depth of each tree
    min_samples_split=[2, 5, 10],          # minimum samples needed to split a node
    min_samples_leaf=[1, 2, 4],            # minimum samples required at a leaf
    max_features=['sqrt', 'log2', None],   # number of features considered per split
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid`, scored by 5-fold accuracy.
rf_clf = RandomForestClassifier(random_state=random_seed)
grid_search = GridSearchCV(
    estimator=rf_clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split
grid_search.fit(X_train, y_train)

# Report the winning parameter combination and its cross-validated score
for label, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best Cross-Validated Score:", grid_search.best_score_),
):
    print(label, value)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   4.5s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=   5.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   6.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   3.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.8s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   1.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   3.9s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Best Cross-Validated Score: 0.986376655925528


In [None]:
# Best model from the search: the estimator that GridSearchCV exposes as
# `best_estimator_` for the winning parameter combination.
best_rf = grid_search.best_estimator_

## Evaluate Model

In [None]:
# Predict labels for the held-out test split with the tuned model.
y_pred = best_rf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix

# scikit-learn lays the matrix out with rows = actual class and
# columns = predicted class, classes in sorted label order (0 before 1).
# For a binary 0/1 target, with 1 as the positive class, that is:
#
#                 Predicted 0    Predicted 1
#   Actual 0          TN             FP
#   Actual 1          FN             TP
#
# The note that used to sit here listed the positive class first
# (TP/FN over FP/TN), which is not what confusion_matrix returns.
confusion_matrix(y_test, y_pred)

array([[8204,   25],
       [ 126, 5610]])

In [None]:
# Row 1, column 0: samples whose actual label is the second class but which
# were predicted as the first class.
confusion_matrix(y_test, y_pred)[1, 0]

126

In [None]:
import json
from sklearn.metrics import precision_recall_fscore_support

metrics = {}

# Precision / recall / F1 averaged over both classes, weighted by class support.
overall_metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
metrics["precision"] = overall_metrics[0]
metrics["recall"] = overall_metrics[1]
metrics["f1"] = overall_metrics[2]

# Compute the confusion matrix once and unpack it. For a binary 0/1 target
# scikit-learn returns [[TN, FP], [FN, TP]], so ravel() yields tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# The descriptions below assume is_phishing == 1 marks a phishing URL.
# False positive rate = FP / all actual negatives:
# legitimate URLs classified as phishing (Type I error).
metrics["false_positive_rate"] = float(fp / (fp + tn))
# False negative rate = FN / all actual positives:
# phishing URLs classified as legitimate (Type II error).
metrics["false_negative_rate"] = float(fn / (fn + tp))

print(json.dumps(metrics, indent=4))

{
    "precision": 0.9892645641827459,
    "recall": 0.9891872538489079,
    "f1": 0.989172208765938,
    "false_positive_rate": 0.0030380362133916637,
    "false_negative_rate": 0.021966527196652718
}


## Save Model

In [None]:
from pathlib import Path
from joblib import dump

# Bundle the fitted preprocessing and the tuned classifier into one object,
# so the saved artifact can be applied to raw feature rows directly.
pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('model', best_rf)
])

model_path = "../models/random_forest_with_pipeline.joblib"

# dump() fails if the target directory is missing, so create it first
# (no-op when it already exists).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

# Left as the last expression so the cell displays the written file name(s).
dump(pipeline, model_path)

['../models/random_forest_with_pipeline.joblib']