In [12]:
# Used dependencies
import pandas as pd
import numpy as np
import warnings
import time
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import lightgbm as lgbm
from sklearn.preprocessing import MinMaxScaler

In [2]:
warnings.filterwarnings("ignore", category=UserWarning)
# Loading cleaned training data
data = pd.read_csv('./files/training_features_cleaned.csv')
X = data.drop(columns='CLASS_LABEL')
y = data['CLASS_LABEL']

Given the relatively small size of the dataset, we adopted a 5‑fold cross‑validation strategy for all model training and evaluation. This approach ensures more reliable performance estimates and reduces the risk of overfitting. For this dataset, we focused on evaluating four ML algorithms: 
K‑Nearest Neighbors (KNN), 
Random Forest, 
Decision Tree,
LightGBM. 
As an initial step, each model was tested using default parameters to validate the implementation and establish baseline performance scores before proceeding with further tuning and optimization.

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

models = {
    'KNN': Pipeline([
        ('scaler', MinMaxScaler()), 
        ('clf', KNeighborsClassifier(n_neighbors=5))
        ]),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'LightGBM': Pipeline([
        ('scaler', MinMaxScaler()), 
        ('clf', lgbm.LGBMClassifier(random_state=42, verbose=-1))
        ])
}

results = []
for model_name, model in models.items():
    cv_results = cross_val_score(model, X.values, y.values, cv=kf, scoring='accuracy')
    results.append({
        'Model': model_name,
        'Mean Accuracy': cv_results.mean(),
        'Std Dev': cv_results.std()
    })
    print(f"{model_name} - Mean Accuracy: {cv_results.mean():.4f}, Std Dev: {cv_results.std():.4f}")

summary_df = pd.DataFrame(results).sort_values(by='Mean Accuracy', ascending=False) 
print("\nSummary of Model Performances:")
print(summary_df)   

KNN - Mean Accuracy: 0.9410, Std Dev: 0.0056
RandomForest - Mean Accuracy: 0.9777, Std Dev: 0.0037
DecisionTree - Mean Accuracy: 0.9634, Std Dev: 0.0038
LightGBM - Mean Accuracy: 0.9827, Std Dev: 0.0025

Summary of Model Performances:
          Model  Mean Accuracy   Std Dev
3      LightGBM       0.982714  0.002531
1  RandomForest       0.977714  0.003709
2  DecisionTree       0.963429  0.003844
0           KNN       0.941000  0.005581


In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define models as pipelines
model_defs = {
    "KNN": Pipeline([("scaler", MinMaxScaler()), ("clf", KNeighborsClassifier())]),
    "Random Forest": Pipeline([("clf", RandomForestClassifier(random_state=42))]),
    "Decision Tree": Pipeline([("clf", DecisionTreeClassifier(random_state=42))]),
    "LightGBM": Pipeline([("scaler", MinMaxScaler()), ("clf", lgbm.LGBMClassifier(random_state=42, verbose=-1))])
}

# Parameter grids (same as your definition)
param_grids = {
    "KNN": {
        'clf__n_neighbors': [1, 3, 5, 7, 9],
        'clf__weights': ['uniform', 'distance'],
        'clf__metric': ['euclidean', 'manhattan']
    },
    "Random Forest": {
        'clf__max_depth': [10, 20, 30, None],
        'clf__n_estimators': [100, 200, 300],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__max_features': ['sqrt', 'log2', None]
    },
    "Decision Tree": {
        'clf__max_depth': [5, 10, 15, None],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 5],
        'clf__criterion': ['gini', 'entropy']
    },
    "LightGBM": {
        'clf__learning_rate': [0.01, 0.05, 0.1],
        'clf__n_estimators': [100, 200],
        'clf__num_leaves': [31, 63, 127],
        'clf__max_depth': [-1, 10, 20],
        'clf__min_child_samples': [10, 20, 30],
        'clf__subsample': [0.8, 1.0],
        'clf__colsample_bytree': [0.8, 1.0],
    }
}

summary = {"Model": [], "Best Accuracy": [], "Best Parameters": [], "Time (s)": []}

for name, pipe in model_defs.items():
    print(f"\n=== Running GridSearch for {name} ===")
    start = time.time()
    grid = GridSearchCV(pipe, param_grids[name], cv=kf, scoring="accuracy", n_jobs=-1, verbose=0)
    grid.fit(X, y)
    elapsed = time.time() - start
    
    summary["Model"].append(name)
    summary["Best Accuracy"].append(grid.best_score_)
    summary["Best Parameters"].append(grid.best_params_)
    summary["Time (s)"].append(round(elapsed, 2))

summary_df = pd.DataFrame(summary)
summary_df["Best Accuracy"] = summary_df["Best Accuracy"].round(4)
summary_df = summary_df.sort_values(by="Best Accuracy", ascending=False).reset_index(drop=True)
summary_df["Best Parameters"] = summary_df["Best Parameters"].apply(lambda d: str(d))

print("\n=== Final Summary Table ===")
pd.set_option("display.max_colwidth", None)
print(summary_df)


=== Running GridSearch for KNN ===

=== Running GridSearch for Random Forest ===

=== Running GridSearch for Decision Tree ===

=== Running GridSearch for LightGBM ===

=== Final Summary Table ===
           Model  Best Accuracy  \
0       LightGBM         0.9844   
1  Random Forest         0.9791   
2  Decision Tree         0.9677   
3            KNN         0.9539   

                                                                                                                                                                          Best Parameters  \
0  {'clf__colsample_bytree': 0.8, 'clf__learning_rate': 0.1, 'clf__max_depth': -1, 'clf__min_child_samples': 10, 'clf__n_estimators': 200, 'clf__num_leaves': 127, 'clf__subsample': 0.8}   
1                                                  {'clf__max_depth': 20, 'clf__max_features': 'log2', 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 100}   
2                                                          

A large range of parameters was tested and, for what was already the model wih the best accuracy results on the first run (LightGBM), there as been only an increase from 0.982714 to 0.9844. Given the small gain in accuracy (0.17%) over what took a significant computational time (21678s) it's not worth to test more parameters on these models. 

One thing worth testing on LightGBM with these optimized parameters is to enable early stopping, as running the training for the full number of runs can lead to overfitting.

In [12]:
params = {
    'objective': 'binary' if len(y.unique()) == 2 else 'multiclass',
    'num_class': len(y.unique()) if len(y.unique()) > 2 else 1,
    'metric': 'binary_logloss' if len(y.unique()) == 2 else 'multi_logloss',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'seed': 42,
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': -1,
    'min_child_samples': 10,
    'n_estimators': 200,
    'num_leaves': 127,
    'subsample': 0.8
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
results = []
fold_num = 1

for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    train_data = lgbm.Dataset(X_train_scaled, label=y_train)
    val_data = lgbm.Dataset(X_val_scaled, label=y_val, reference=train_data)

    gbm = lgbm.train(
        params,
        train_data,
        valid_sets=[val_data],
        num_boost_round=200,
        callbacks=[lgbm.early_stopping(stopping_rounds=10, verbose=False)]
    )

    y_pred_prob = gbm.predict(X_val_scaled, num_iteration=gbm.best_iteration)
    if len(y.unique()) > 2:
        y_pred = y_pred_prob.argmax(axis=1)
    else:
        y_pred = (y_pred_prob > 0.5).astype(int)

    acc = accuracy_score(y_val, y_pred)
    results.append(acc)

    fold_num += 1

print("\n=== LightGBM Early-Stop KFold Validation Summary ===")
print(f"Mean Accuracy: {np.mean(results):.4f}")
print(f"Std Dev: {np.std(results):.4f}")



=== LightGBM Early-Stop KFold Validation Summary ===
Mean Accuracy: 0.9809
Std Dev: 0.0038


Although the accuracy is lower than before, this might still be a better model for general use since before we could be overfitting the model the this training data, causing it to be worst for data outside this.
Since the work of hyperparameter tuning has been done for all models, I will save all optimized models in disk. For the final train of the model all data from the dataset is used, instead of using k-fold like we have done during tuning.

In [3]:
lgbm_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', lgbm.LGBMClassifier(
        colsample_bytree=0.8,
        learning_rate=0.1,
        max_depth=-1,
        min_child_samples=10,
        n_estimators=200,
        num_leaves=127,
        subsample=0.8,
        objective='binary' if len(y.unique()) == 2 else 'multiclass',
        random_state=42,
        verbosity=-1
    ))
])
lgbm_pipeline.fit(X, y)
joblib.dump(lgbm_pipeline, './models/lightgbm.pkl')


best_params = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': -1,
    'min_child_samples': 10,
    'n_estimators': 200,
    'num_leaves': 127,
    'subsample': 0.8,
    'random_state': 42,
    'verbose': -1
}

# Pipeline with scaler + LightGBM
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('clf', lgbm.LGBMClassifier(**best_params))
])

pipeline.fit(
    X, y,
    clf__eval_set=[(X, y)],
    clf__eval_metric="accuracy",
    clf__callbacks=[lgbm.early_stopping(stopping_rounds=10, verbose=False)]
)

joblib.dump(pipeline, './models/lightgbm_best_es.pkl')



['./models/lightgbm_best_es.pkl']

In [6]:
models = {
    'KNN': Pipeline([
        ('scaler', MinMaxScaler()),
        ('clf', KNeighborsClassifier(n_neighbors=3, weights='distance', metric='manhattan'))
    ]),
    'RandomForest': Pipeline([
        ('clf', RandomForestClassifier(max_depth=20, max_features='log2',
                                       min_samples_leaf=1, min_samples_split=2,
                                       n_estimators=100, random_state=42))
    ]),
    'DecisionTree': Pipeline([
        ('clf', DecisionTreeClassifier(max_depth=10, min_samples_split=2,
                                       min_samples_leaf=1, criterion='gini',
                                       random_state=42))
    ]),
    'LGBM': Pipeline([
        ('scaler', MinMaxScaler()),
        ('clf', lgbm.LGBMClassifier(colsample_bytree=0.8, learning_rate=0.1,
                               max_depth=-1, min_child_samples=10,
                               n_estimators=200, num_leaves=127, subsample=0.8,
                               objective='binary' if len(y.unique()) == 2 else 'multiclass',
                               random_state=42, verbosity=-1))
    ])
}
lgbm_es = models['LGBM']

for name, model in models.items():
    joblib.dump(model.fit(X, y), f'./models/{name.lower()}.pkl')

lgbm_es.fit(X, y, clf__eval_set=[(X, y)], clf__eval_metric="accuracy", clf__callbacks=[lgbm.early_stopping(stopping_rounds=10, verbose=False)])
joblib.dump(lgbm_es, './models/lgbm_es.pkl')

['./models/lgbm_es.pkl']

It's now time to properly test the models agaist the "test" data we have separated at the beginning, which as never been 'seen' by the models.

In [13]:
warnings.filterwarnings("ignore", category=UserWarning)
test_data = pd.read_csv('./files/testing_features_cleaned.csv')
X_test = test_data.drop(columns='CLASS_LABEL')
y_test = test_data['CLASS_LABEL']

model_paths = {
    'KNN': './models/knn.pkl',
    'RandomForest': './models/randomforest.pkl',
    'DecisionTree': './models/decisiontree.pkl',
    'LGBM': './models/lgbm.pkl',
    'LGBM_es': './models/lgbm_es.pkl'
}

models = {name: joblib.load(path) for name, path in model_paths.items()}

def evaluate_model(model, X, y, name):
    y_pred = model.predict(X)
    acc = accuracy_score(y, y_pred)
    pr = precision_score(y, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y, y_pred, average='weighted', zero_division=0)

    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

    return {
        'Model': name,
        'Accuracy': round(acc, 4),
        'Precision': round(pr, 4),
        'Recall': round(rec, 4),
        'F1-Score': round(f1, 4),
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn
    }

results = []
for name, model in models.items():
    result = evaluate_model(model, X_test, y_test, name)
    results.append(result)

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)
print("\n=== Model Evaluation on Test Set ===")
print(results_df)



=== Model Evaluation on Test Set ===
          Model  Accuracy  Precision  Recall  F1-Score    TP    TN  FP  FN
0          LGBM    0.9833     0.9833  0.9833    0.9833  1473  1477  23  27
1       LGBM_es    0.9797     0.9797  0.9797    0.9797  1473  1466  34  27
2  RandomForest    0.9780     0.9780  0.9780    0.9780  1465  1469  31  35
3  DecisionTree    0.9637     0.9637  0.9637    0.9637  1450  1441  59  50
4           KNN    0.9610     0.9615  0.9610    0.9610  1466  1417  83  34


As expected, we have the best results with the LightGBM model. Although RandomForest is close in terms of accuracy, the FP/FN ratio is lower. If we think the model goal is to detect malicious URL's, from a security perspective it is preferred to have more FP (legitimate URL flagged as malicioous) than FN (malicious URL flagged as legitimate).