# Load packages

In [1]:
# skip reinstalling packages we already have
using Pkg

pkgs = [
    "MLJ", "MLJBase", "MLJModels", "MLJEnsembles", "MLJLinearModels",
    "DecisionTree", "MLJDecisionTreeInterface", "NaiveBayes", 
    "MLJNaiveBayesInterface", "EvoTrees", "CategoricalArrays", "Random",
    "LIBSVM", "MLJLIBSVMInterface", "Plots", "MLJModelInterface",
    "CSV", "DataFrames", "UrlDownload", "XGBoost", "NNlib"
]

# Filter out packages already installed
missing_pkgs = filter(pkg -> !(pkg in keys(Pkg.project().dependencies)), pkgs)

if !isempty(missing_pkgs)
    println("Installing missing packages: ", missing_pkgs)
    Pkg.add(missing_pkgs)
else
    println(" All required packages are already installed.")
end


 All required packages are already installed.


In [32]:
using MLJ
using LIBSVM
using NNlib
using Flux
using Flux.Losses
using Statistics

In [54]:
#Load your library of functions
include("utils.jl")
# Set a global random seed for reproducibility
using Random
Random.seed!(42)

TaskLocalRNG()

# Load Data

In [55]:
using CSV, DataFrames, Random
using CategoricalArrays

df = CSV.read("./data/updated_pollution_dataset.csv", DataFrame)

# Some log
println("First 5 rows of df:")
show(df[1:5, :], allcols=true)

# Convert last column to categorical (in-place!)
df[!, end] = categorical(df[!, end])

# Extract the integer codes of the categories
targets = Float32.(levelcode.(df[!, end]))

# Use all columns except the last one as inputs
inputs = Matrix{Float32}(df[:, 1:end-1])

println("First 5 inputs::")
for i in 1:5
    println(inputs[i, :])
end

println("\n\nFirst 5 targets:")
println(targets[1:5])

# Extract labels (categories) as strings
label_names = levels(df[!, 10])
println("Labels: ", label_names)

First 5 rows of df:
[1m5×10 DataFrame[0m
[1m Row [0m│[1m Temperature [0m[1m Humidity [0m[1m PM2.5   [0m[1m PM10    [0m[1m NO2     [0m[1m SO2     [0m[1m CO      [0m[1m Proximity_to_Industrial_Areas [0m[1m Population_Density [0m[1m Air Quality [0m
     │[90m Float64     [0m[90m Float64  [0m[90m Float64 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64                       [0m[90m Int64              [0m[90m String15    [0m
─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1 │        29.8      59.1      5.2     17.9     18.9      9.2     1.72                            6.3                 319  Moderate
   2 │        28.3      75.6      2.3     12.2     30.8      9.7     1.64                            6.0                 611  Moderate
   3 │        23.1      74.7     26.7     33.8     24.4     12.6     1.63                   

# Split train-test datasets

In [56]:
trainIdx, testIdx = holdOut(size(df,1), 0.2)

trainingInputs  = inputs[trainIdx, :]
testInputs      = inputs[testIdx, :]

trainingTargets = targets[trainIdx]
testTargets     = targets[testIdx]

1000-element Vector{Float32}:
 2.0
 3.0
 1.0
 2.0
 1.0
 1.0
 4.0
 4.0
 2.0
 1.0
 3.0
 1.0
 4.0
 ⋮
 3.0
 1.0
 1.0
 1.0
 1.0
 4.0
 4.0
 3.0
 3.0
 2.0
 1.0
 3.0

In [57]:
results = Dict()
crossValidationIndices = crossvalidation(trainingTargets, 5)

4000-element Vector{Int64}:
 5
 5
 3
 2
 2
 1
 2
 1
 3
 2
 1
 2
 3
 ⋮
 1
 3
 2
 3
 4
 4
 4
 4
 4
 3
 5
 5

In [37]:
function printExperimentResult(model, hyperparams, results, class_labels)
    (
        (accuracy_mean, accuracy_std),
        (error_rate_mean, error_rate_std),
        (sensitivity_mean, sensitivity_std),
        (specificity_mean, specificity_std),
        (ppv_mean, ppv_std),
        (npv_mean, npv_std),
        (f1_mean, f1_std),
        cm
    ) = results

    println("\n=====================================================")
    println(" Model: $model")
    println(" Hyperparameters: $hyperparams")
    println("=====================================================")

    println(" Accuracy (mean)               : ", round(accuracy_mean, digits=4))
    println(" Accuracy (std)                : ", round(accuracy_std, digits=4))

    println(" Error Rate (mean)             : ", round(error_rate_mean, digits=4))
    println(" Error Rate (std)              : ", round(error_rate_std, digits=4))

    println(" Sensitivity/Recall (mean)     : ", round(sensitivity_mean, digits=4))
    println(" Sensitivity/Recall (std)      : ", round(sensitivity_std,  digits=4))

    println(" Specificity (mean)            : ", round(specificity_mean, digits=4))
    println(" Specificity (std)             : ", round(specificity_std,  digits=4))

    println(" PPV (mean)                    : ", round(ppv_mean,         digits=4))
    println(" PPV (std)                     : ", round(ppv_std,          digits=4))

    println(" NPV (mean)                    : ", round(npv_mean,         digits=4))
    println(" NPV (std)                     : ", round(npv_std,          digits=4))

    println(" F1 Score (mean)               : ", round(f1_mean,          digits=4))
    println(" F1 Score (std)                : ", round(f1_std,           digits=4))

    println("\nConfusion Matrix:")
    println(cm)

    PrettyTables.pretty_table(DataFrame(cm, :auto); header=class_labels, row_labels=class_labels)

    println("=====================================================\n")
end


printExperimentResult (generic function with 1 method)

# Artificial Neural Networks

In [26]:
############# 1. ARTIFICIAL NEURAL NETWORKS (8+ topologies) #############
default_ann = Dict(      
    "numExecutions" => 5,
    #"transferFunctions" => [σ, σ, σ, σ],
    "maxEpochs" => 200,
    "minLoss" => 0.0,
    "learningRate" => 0.01,
    "validationRatio" => 0.1,
    "maxEpochsVal" => 20
)

ann_search_space = [
    Dict("topology"=>[4, 4]),
    Dict("topology"=>[8, 8]),
    Dict("topology"=>[16, 16]),
    Dict("topology"=>[10, 4]),
    Dict("topology"=>[10, 6, 4]),
    Dict("topology"=>[10, 8, 4]),
    Dict("topology"=>[10, 8, 6, 4]),
    Dict("topology"=>[10, 12, 6, 4])
]

8-element Vector{Dict{String, Vector{Int64}}}:
 Dict("topology" => [4, 4])
 Dict("topology" => [8, 8])
 Dict("topology" => [16, 16])
 Dict("topology" => [10, 4])
 Dict("topology" => [10, 6, 4])
 Dict("topology" => [10, 8, 4])
 Dict("topology" => [10, 8, 6, 4])
 Dict("topology" => [10, 12, 6, 4])

In [27]:
########################
# 1. ANN GRID SEARCH
########################
ann_results = []

for hp in ann_search_space
    println("\n=== ANN experiment: topology = $(hp["topology"]) ===")
    full_hp = merge(default_ann, hp)
    res = modelCrossValidation(:ANN, full_hp, (trainingInputs, trainingTargets), crossValidationIndices)
    push!(ann_results, (model=:ANN, hyperparams=hp, results=res))
end

results[:ANN] = ann_results


=== ANN experiment: topology = [4, 4] ===



=== ANN experiment: topology = [8, 8] ===

=== ANN experiment: topology = [16, 16] ===

=== ANN experiment: topology = [10, 4] ===

=== ANN experiment: topology = [10, 6, 4] ===

=== ANN experiment: topology = [10, 8, 4] ===

=== ANN experiment: topology = [10, 8, 6, 4] ===

=== ANN experiment: topology = [10, 12, 6, 4] ===


8-element Vector{Any}:
 (model = :ANN, hyperparams = Dict("topology" => [4, 4]), results = ((0.94074315f0, 0.0042731683f0), (0.059256874f0, 0.0042731655f0), (0.94074315f0, 0.0042731683f0), (0.98274595f0, 0.0025842295f0), (0.9379606f0, 0.009482839f0), (0.98563397f0, 0.0011026161f0), (0.9388195f0, 0.007021723f0), Float32[267.36002 0.52 2.76 2.76; 3.64 173.36002 9.72 4.4800005; 2.3200002 4.0 211.23999 6.04; 2.76 4.5199995 3.8799999 100.64]))
 (model = :ANN, hyperparams = Dict("topology" => [8, 8]), results = ((0.94404685f0, 0.0044486574f0), (0.055953186f0, 0.004448656f0), (0.94404685f0, 0.0044486574f0), (0.9838977f0, 0.0011569984f0), (0.944022f0, 0.0051273378f0), (0.9860857f0, 0.0008367489f0), (0.94367075f0, 0.004500241f0), Float32[267.6 0.6 2.52 2.6799998; 3.36 176.12 7.2400002 4.48; 2.28 4.0800004 211.8 5.44; 2.8799999 4.7200003 4.48 99.72]))
 (model = :ANN, hyperparams = Dict("topology" => [16, 16]), results = ((0.9439484f0, 0.0041749845f0), (0.056051623f0, 0.0041750036f0), (0.9439484f

In [29]:
for entry in results[:ANN]
    printExperimentResult(entry.model, entry.hyperparams, entry.results, label_names)
end


 Model: ANN
 Hyperparameters: Dict("topology" => [4, 4])
 Accuracy (mean)               : 0.9407
 Accuracy (std)                : 0.0043
 Error Rate (mean)             : 0.0593
 Error Rate (std)              : 0.0043
 Sensitivity/Recall (mean)     : 0.9407
 Sensitivity/Recall (std)      : 0.0043
 Specificity (mean)            : 0.9827
 Specificity (std)             : 0.0026
 PPV (mean)                    : 0.938
 PPV (std)                     : 0.0095
 NPV (mean)                    : 0.9856
 NPV (std)                     : 0.0011
 F1 Score (mean)               : 0.9388
 F1 Score (std)                : 0.007

Confusion Matrix:
Float32[267.36002 0.52 2.76 2.76; 3.64 173.36002 9.72 4.4800005; 2.3200002 4.0 211.23999 6.04; 2.76 4.5199995 3.8799999 100.64]
┌───────────┬────────┬───────────┬──────────┬────────┐
│[1m           [0m│[1m   Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m   Poor [0m│
├───────────┼────────┼───────────┼──────────┼────────┤
│[1m      Good [0m│ 267.36 │  

In [30]:
########################
# 1. ANN FINAL TESTING
########################

# BEST HYPERPARAMETERS
topology = [10, 8, 6, 4]

hp = Dict(      
    "numExecutions" => 5,
    "maxEpochs" => 200,
    "minLoss" => 0.0,
    "learningRate" => 0.01,
    "validationRatio" => 0.1,
    "maxEpochsVal" => 20
)

learningRate = hp["learningRate"]
epochs = hp["maxEpochs"]
validationRatio = hp["validationRatio"]

# ============================
# NORMALIZATION
# ============================
normParams = calculateMinMaxNormalizationParameters(trainingInputs)
norm_train_val_inputs = normalizeMinMax(trainingInputs, normParams)
norm_test_inputs  = normalizeMinMax(testInputs,  normParams)

# ============================
# CREATE VALIDATION SPLIT
# ============================
N = size(inputs, 1)
nTrainVal = size(norm_train_val_inputs, 1)

nVal = Int(floor(validationRatio * N))
realValidationRatio = nVal / nTrainVal

(train_idx, val_idx) = holdOut(nTrainVal, realValidationRatio)

norm_train_inputs = norm_train_val_inputs[train_idx, :]
norm_val_inputs   = norm_train_val_inputs[val_idx, :]

# ============================
# ONE–HOT ENCODING FOR TARGETS
# ============================
trainTargetsVec = trainingTargets[train_idx]
valTargetsVec   = trainingTargets[val_idx]
testTargetsVec  = testTargets

trainTargetsOH = Matrix(oneHotEncoding(vec(trainTargetsVec), sort(levels(categorical(trainingTargets)))))
valTargetsOH   = Matrix(oneHotEncoding(vec(valTargetsVec),   sort(levels(categorical(trainingTargets)))))
testTargetsOH  = Matrix(oneHotEncoding(vec(testTargetsVec),  sort(levels(categorical(trainingTargets)))))

# ============================
# TRAIN ANN
# ============================
finalAnn, trainLoss, valLoss, testLoss = trainClassANN(
    topology,
    (norm_train_inputs, trainTargetsOH),
    validationDataset = (norm_val_inputs, valTargetsOH),
    testDataset = (norm_test_inputs, testTargetsOH),
    maxEpochs = hp["maxEpochs"],
    minLoss = hp["minLoss"],
    learningRate = hp["learningRate"],
    maxEpochsVal = hp["maxEpochsVal"],
    showText = false
)

# ============================
# PREDICTION
# ============================
testOutputs = finalAnn(norm_test_inputs')
testPredictions = classifyOutputs(testOutputs')   # boolean matrix (N × classes)

# ============================
# METRICS
# ============================
metrics = confusionMatrix(testPredictions, testTargetsOH)

printANNResult(
    :ANN,
    Dict("topology" => topology),
    metrics,
    label_names
)



 Model: ANN   
 Hyperparameters: Dict("topology" => [10, 8, 6, 4])
 Accuracy                : 0.852
 Error Rate              : 0.148
 Sensitivity/Recall      : 0.852
 Specificity             : 0.9532
 PPV                     : 0.781
 NPV                     : 0.9741
 F1 Score                : 0.8124

Confusion Matrix:
[383 0 0 0; 0 0 0 103; 2 0 285 9; 0 0 34 184]

┌───────────┬──────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼──────┼───────────┼──────────┼──────┤
│[1m      Good [0m│  383 │         0 │        0 │    0 │
│[1m Hazardous [0m│    0 │         0 │        0 │  103 │
│[1m  Moderate [0m│    2 │         0 │      285 │    9 │
│[1m      Poor [0m│    0 │         0 │       34 │  184 │
└───────────┴──────┴───────────┴──────────┴──────┘


# Support Vector Machines

In [8]:
SVMClassifier = MLJ.@load SVC pkg=LIBSVM verbosity=0

MLJLIBSVMInterface.SVC

In [9]:
############# 2. SVM (8+ configs: kernels × C) #############
default_svm = Dict(
    "gamma" => 1.0,
    "degree" => Int32(3),
    "coef0" => 0.0
)
svm_search_space = [
    Dict("kernel"=>"linear", "C"=>0.1),
    Dict("kernel"=>"linear", "C"=>1.0),
    Dict("kernel"=>"linear", "C"=>10.0),

    Dict("kernel"=>"rbf", "C"=>1.0, "gamma" => 2.0),
    Dict("kernel"=>"rbf", "C"=>10.0, "gamma" => 0.5),

    Dict("kernel"=>"sigmoid", "C"=>1.0, "gamma" => 1.0),

    Dict("kernel"=>"poly", "C"=>1.0, "degree" => 3, "gamma" => 1),
    Dict("kernel"=>"poly", "C"=>5.0, "degree" => 4, "gamma" => 0.5),
]

8-element Vector{Dict{String, Any}}:
 Dict("C" => 0.1, "kernel" => "linear")
 Dict("C" => 1.0, "kernel" => "linear")
 Dict("C" => 10.0, "kernel" => "linear")
 Dict("C" => 1.0, "kernel" => "rbf", "gamma" => 2.0)
 Dict("C" => 10.0, "kernel" => "rbf", "gamma" => 0.5)
 Dict("C" => 1.0, "kernel" => "sigmoid", "gamma" => 1.0)
 Dict("C" => 1.0, "kernel" => "poly", "gamma" => 1, "degree" => 3)
 Dict("C" => 5.0, "kernel" => "poly", "gamma" => 0.5, "degree" => 4)

In [10]:
########################
# 2. SVM GRID SEARCH
########################
svm_results = []

for hp in svm_search_space
    println("\n=== SVM experiment: kernel=$(hp["kernel"]) C=$(get(hp,"C","-")) ===")
    full_hp = merge(default_svm, hp)
    res = modelCrossValidation(:SVC, full_hp, (trainingInputs, trainingTargets), crossValidationIndices)
    push!(svm_results, (model=:SVC, hyperparams=hp, results=res))
end

results[:SVC] = svm_results


=== SVM experiment: kernel=linear C=0.1 ===

=== SVM experiment: kernel=linear C=1.0 ===

=== SVM experiment: kernel=linear C=10.0 ===

=== SVM experiment: kernel=rbf C=1.0 ===

=== SVM experiment: kernel=rbf C=10.0 ===

=== SVM experiment: kernel=sigmoid C=1.0 ===

=== SVM experiment: kernel=poly C=1.0 ===

=== SVM experiment: kernel=poly C=5.0 ===


8-element Vector{Any}:
 (model = :SVC, hyperparams = Dict{String, Any}("C" => 0.1, "kernel" => "linear"), results = ((0.9304966f0, 0.005035769f0), (0.06950344f0, 0.005035765f0), (0.9304966f0, 0.005035769f0), (0.9750918f0, 0.00083392154f0), (0.9309021f0, 0.005551302f0), (0.9843459f0, 0.0011032501f0), (0.9289301f0, 0.0056161913f0), Float32[268.0 0.2 4.8 0.4; 5.4 175.8 8.6 1.4; 2.4 6.4 209.0 5.8; 5.0 8.8 6.4 91.6]))
 (model = :SVC, hyperparams = Dict{String, Any}("C" => 1.0, "kernel" => "linear"), results = ((0.9417473f0, 0.0034394085f0), (0.05825276f0, 0.0034394148f0), (0.9417473f0, 0.0034394085f0), (0.98124313f0, 0.0008948613f0), (0.9411415f0, 0.0038164093f0), (0.98613167f0, 0.0007897962f0), (0.94100016f0, 0.003653778f0), Float32[268.0 0.4 3.4 1.6; 4.2 177.0 6.6 3.4; 2.4 5.4 210.6 5.2; 3.4 5.4 5.2 97.8]))
 (model = :SVC, hyperparams = Dict{String, Any}("C" => 10.0, "kernel" => "linear"), results = ((0.9417513f0, 0.005474671f0), (0.0582487f0, 0.005474683f0), (0.9417513f0, 0.005474671f0),

In [11]:
for entry in results[:SVC]
    printExperimentResult(entry.model, entry.hyperparams, entry.results, label_names)
end


 Model: SVC
 Hyperparameters: Dict{String, Any}("C" => 0.1, "kernel" => "linear")
 Accuracy (mean)               : 0.9305
 Accuracy (std)                : 0.005
 Error Rate (mean)             : 0.0695
 Error Rate (std)              : 0.005
 Sensitivity/Recall (mean)     : 0.9305
 Sensitivity/Recall (std)      : 0.005
 Specificity (mean)            : 0.9751
 Specificity (std)             : 0.0008
 PPV (mean)                    : 0.9309
 PPV (std)                     : 0.0056
 NPV (mean)                    : 0.9843
 NPV (std)                     : 0.0011
 F1 Score (mean)               : 0.9289
 F1 Score (std)                : 0.0056

Confusion Matrix:
Float32[268.0 0.2 4.8 0.4; 5.4 175.8 8.6 1.4; 2.4 6.4 209.0 5.8; 5.0 8.8 6.4 91.6]
┌───────────┬───────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m  Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼───────┼───────────┼──────────┼──────┤
│[1m      Good [0m│ 268.0 │       0.2 │      4.8 │  0.4 │
│

In [13]:
########################
# 2. SVM FINAL TESTING 
########################

# BEST HYPERPARAMETERS
hp = Dict("kernel"=>"rbf", "C"=>1.0, "gamma"=>2.0)

# Compute normalization parameters from TRAINING set only
normParams = calculateMinMaxNormalizationParameters(trainingInputs)
train_df = normalizeMinMax(trainingInputs, normParams)
test_df = normalizeMinMax(testInputs, normParams)

# --- Convert targets ---
train_y_cat = categorical(vec(trainingTargets))
test_y_cat  = categorical(vec(testTargets))

# Load model (insert hyperparameters manually)
model = SVMClassifier(kernel=LIBSVM.Kernel.RadialBasis, cost=Float64(1), gamma=Float64(2))
    
# --- Train machine ---
mach = machine(model, MLJ.table(train_df), train_y_cat)
MLJ.fit!(mach)
    
# --- Predict ---
ŷ = MLJ.predict(mach, test_df)              # probabilistic predictions
y_pred = CategoricalArray(ŷ)         # convert to class labels
    
# --- Compute metrics ---
metrics = confusionMatrix(y_pred, test_y_cat)

printResult(:SVC, hp, metrics, label_names)




[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(SVC(kernel = RadialBasis, …), …).


 Model: SVC
 Hyperparameters: Dict{String, Any}("C" => 1.0, "kernel" => "rbf", "gamma" => 2.0)
 Accuracy                : 0.928
 Error Rate              : 0.072
 Sensitivity/Recall      : 0.928
 Specificity             : 0.9784
 PPV                     : 0.9273
 NPV                     : 0.9824
 F1 Score                : 0.9271

Confusion Matrix:
[77 0 0 26; 0 284 1 11; 0 0 383 0; 12 22 0 184]

┌───────────┬──────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼──────┼───────────┼──────────┼──────┤
│[1m      Good [0m│   77 │         0 │        0 │   26 │
│[1m Hazardous [0m│    0 │       284 │        1 │   11 │
│[1m  Moderate [0m│    0 │         0 │      383 │    0 │
│[1m      Poor [0m│   12 │        22 │        0 │  184 │
└───────────┴──────┴───────────┴──────────┴──────┘


# Decission Trees

In [14]:
DTClassifier = MLJ.@load DecisionTreeClassifier pkg=DecisionTree verbosity=0

MLJDecisionTreeInterface.DecisionTreeClassifier

In [15]:
############# 3. DECISION TREES (6 depths) #############
default_dt = Dict(
    "rng" => Random.MersenneTwister(1)
)

dt_search_space = [
    Dict("max_depth"=>2),
    Dict("max_depth"=>3),
    Dict("max_depth"=>4),
    Dict("max_depth"=>5),
    Dict("max_depth"=>6),
    Dict("max_depth"=>8)
]

6-element Vector{Dict{String, Int64}}:
 Dict("max_depth" => 2)
 Dict("max_depth" => 3)
 Dict("max_depth" => 4)
 Dict("max_depth" => 5)
 Dict("max_depth" => 6)
 Dict("max_depth" => 8)

In [17]:
########################
# 3. DECISION TREE GRID SEARCH
########################
dt_results = []

for hp in dt_search_space
    println("\n=== Decision Tree experiment: max_depth=$(hp["max_depth"]) ===")
    full_hp = merge(default_dt, hp) 
    res = modelCrossValidation(:DecisionTreeClassifier, full_hp, (trainingInputs, trainingTargets), crossValidationIndices)
    push!(dt_results, (model=:DT, hyperparams=hp, results=res))
end

results[:DT] = dt_results


=== Decision Tree experiment: max_depth=2 ===

=== Decision Tree experiment: max_depth=3 ===

=== Decision Tree experiment: max_depth=4 ===

=== Decision Tree experiment: max_depth=5 ===

=== Decision Tree experiment: max_depth=6 ===

=== Decision Tree experiment: max_depth=8 ===


6-element Vector{Any}:
 (model = :DT, hyperparams = Dict("max_depth" => 2), results = ((0.8247405f0, 0.006871341f0), (0.1752595f0, 0.0068713464f0), (0.8247405f0, 0.006871341f0), (0.9448327f0, 0.0034481913f0), (0.7582999f0, 0.007281201f0), (0.96567047f0, 0.0036217559f0), (0.78805333f0, 0.0068843667f0), Float32[260.8 2.6 8.4 1.6; 8.6 154.4 28.0 0.2; 3.6 11.6 199.8 8.6; 16.6 31.8 18.6 44.8]))
 (model = :DT, hyperparams = Dict("max_depth" => 3), results = ((0.8897629f0, 0.009560379f0), (0.11023704f0, 0.009560377f0), (0.8897629f0, 0.009560379f0), (0.9693753f0, 0.0016073347f0), (0.88850796f0, 0.009277768f0), (0.97364223f0, 0.004388472f0), (0.8877665f0, 0.008821769f0), Float32[260.4 2.6 5.2 5.2; 4.8 164.4 13.6 8.4; 4.0 9.8 204.2 5.6; 8.0 11.0 10.0 82.8]))
 (model = :DT, hyperparams = Dict("max_depth" => 4), results = ((0.9032677f0, 0.011706319f0), (0.096732296f0, 0.011706326f0), (0.9032677f0, 0.011706319f0), (0.9724762f0, 0.0030279022f0), (0.9031806f0, 0.010536734f0), (0.9754895f0, 0.00429728

In [18]:
for entry in results[:DT]
    printExperimentResult(entry.model, entry.hyperparams, entry.results, label_names)
end


 Model: DT
 Hyperparameters: Dict("max_depth" => 2)
 Accuracy (mean)               : 0.8247
 Accuracy (std)                : 0.0069
 Error Rate (mean)             : 0.1753
 Error Rate (std)              : 0.0069
 Sensitivity/Recall (mean)     : 0.8247
 Sensitivity/Recall (std)      : 0.0069
 Specificity (mean)            : 0.9448
 Specificity (std)             : 0.0034
 PPV (mean)                    : 0.7583
 PPV (std)                     : 0.0073
 NPV (mean)                    : 0.9657
 NPV (std)                     : 0.0036
 F1 Score (mean)               : 0.7881
 F1 Score (std)                : 0.0069

Confusion Matrix:
Float32[260.8 2.6 8.4 1.6; 8.6 154.4 28.0 0.2; 3.6 11.6 199.8 8.6; 16.6 31.8 18.6 44.8]
┌───────────┬───────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m  Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼───────┼───────────┼──────────┼──────┤
│[1m      Good [0m│ 260.8 │       2.6 │      8.4 │  1.6 │
│[1m Hazardous [0m│   

In [20]:
########################
# 3. DT FINAL TESTING 
########################

# BEST HYPERPARAMETERS
hp = Dict("max_depth"=>8)

# Compute normalization parameters from TRAINING set only
normParams = calculateMinMaxNormalizationParameters(trainingInputs)
train_df = normalizeMinMax(trainingInputs, normParams)
test_df = normalizeMinMax(testInputs, normParams)

# --- Convert targets ---
train_y_cat = categorical(vec(trainingTargets))
test_y_cat  = categorical(vec(testTargets))

# Load model (insert hyperparameters manually)
model = DTClassifier(max_depth=6, rng=Random.MersenneTwister(1))    

# --- Train machine ---
mach = machine(model, MLJ.table(train_df), train_y_cat)
MLJ.fit!(mach)
    
# --- Predict ---
ŷ = MLJ.predict(mach, MLJ.table(test_df))              # probabilistic predictions
y_pred = CategoricalArray(mode.(ŷ))         # convert to class labels
    
# --- Compute metrics ---
metrics = confusionMatrix(y_pred, test_y_cat)

printResult(:DT, hp, metrics, label_names)

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DecisionTreeClassifier(max_depth = 6, …), …).



 Model: DT
 Hyperparameters: Dict("max_depth" => 8)
 Accuracy                : 0.921
 Error Rate              : 0.079
 Sensitivity/Recall      : 0.921
 Specificity             : 0.9775
 PPV                     : 0.9247
 NPV                     : 0.9779
 F1 Score                : 0.9214

Confusion Matrix:
[76 0 0 27; 0 269 1 26; 0 2 381 0; 11 12 0 195]

┌───────────┬──────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼──────┼───────────┼──────────┼──────┤
│[1m      Good [0m│   76 │         0 │        0 │   27 │
│[1m Hazardous [0m│    0 │       269 │        1 │   26 │
│[1m  Moderate [0m│    0 │         2 │      381 │    0 │
│[1m      Poor [0m│   11 │        12 │        0 │  195 │
└───────────┴──────┴───────────┴──────────┴──────┘


# K-Nearest Neighbors

In [38]:
kNNClassifier = MLJ.@load KNNClassifier pkg=NearestNeighborModels verbosity=0

NearestNeighborModels.KNNClassifier

In [39]:
############# 4. kNN (6 values) #############
knn_search_space = [
    Dict("K"=>1),
    Dict("K"=>3),
    Dict("K"=>5),
    Dict("K"=>7),
    Dict("K"=>9),
    Dict("K"=>11)
]

6-element Vector{Dict{String, Int64}}:
 Dict("K" => 1)
 Dict("K" => 3)
 Dict("K" => 5)
 Dict("K" => 7)
 Dict("K" => 9)
 Dict("K" => 11)

In [40]:
########################
# 4. KNN GRID SEARCH
########################
knn_results = []

for hp in knn_search_space
    println("\n=== kNN experiment: K=$(hp["K"]) ===")
    res = modelCrossValidation(:KNeighborsClassifier, hp, (trainingInputs, trainingTargets), crossValidationIndices)
    push!(knn_results, (model=:KNN, hyperparams=hp, results=res))
end

results[:KNN] = knn_results


=== kNN experiment: K=1 ===



=== kNN experiment: K=3 ===

=== kNN experiment: K=5 ===

=== kNN experiment: K=7 ===

=== kNN experiment: K=9 ===

=== kNN experiment: K=11 ===


6-element Vector{Any}:
 (model = :KNN, hyperparams = Dict("K" => 1), results = ((0.91574585f0, 0.011013997f0), (0.08425412f0, 0.011013993f0), (0.91574585f0, 0.011013997f0), (0.97418725f0, 0.003764723f0), (0.9146147f0, 0.010846263f0), (0.97899264f0, 0.003636073f0), (0.91474324f0, 0.010814225f0), Float32[264.4 0.8 5.6 2.6; 3.8 171.2 11.8 4.4; 4.4 7.6 203.4 8.2; 4.8 6.6 6.8 93.6]))
 (model = :KNN, hyperparams = Dict("K" => 3), results = ((0.93125135f0, 0.0065393206f0), (0.06874873f0, 0.0065393127f0), (0.93125135f0, 0.0065393206f0), (0.9782092f0, 0.0030846733f0), (0.93018085f0, 0.0067967884f0), (0.98437274f0, 0.0026192376f0), (0.9300531f0, 0.006574418f0), Float32[267.6 0.0 4.4 1.4; 3.4 174.4 9.2 4.2; 2.4 6.2 208.8 6.2; 4.6 8.0 5.0 94.2]))
 (model = :KNN, hyperparams = Dict("K" => 5), results = ((0.93400156f0, 0.002801467f0), (0.06599842f0, 0.0028014837f0), (0.93400156f0, 0.002801467f0), (0.97837484f0, 0.001903854f0), (0.9328934f0, 0.0028109066f0), (0.9855808f0, 0.0013901182f0), (0.9326143f

In [41]:
for entry in results[:KNN]
    printExperimentResult(entry.model, entry.hyperparams, entry.results, label_names)
end


 Model: KNN
 Hyperparameters: Dict("K" => 1)
 Accuracy (mean)               : 0.9157
 Accuracy (std)                : 0.011
 Error Rate (mean)             : 0.0843
 Error Rate (std)              : 0.011
 Sensitivity/Recall (mean)     : 0.9157
 Sensitivity/Recall (std)      : 0.011
 Specificity (mean)            : 0.9742
 Specificity (std)             : 0.0038
 PPV (mean)                    : 0.9146
 PPV (std)                     : 0.0108
 NPV (mean)                    : 0.979
 NPV (std)                     : 0.0036
 F1 Score (mean)               : 0.9147
 F1 Score (std)                : 0.0108

Confusion Matrix:
Float32[264.4 0.8 5.6 2.6; 3.8 171.2 11.8 4.4; 4.4 7.6 203.4 8.2; 4.8 6.6 6.8 93.6]
┌───────────┬───────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m  Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼───────┼───────────┼──────────┼──────┤
│[1m      Good [0m│ 264.4 │       0.8 │      5.6 │  2.6 │
│[1m Hazardous [0m│   3.8 │     171.2

In [42]:
########################
# 4. KNN FINAL TESTING 
########################

# BEST HYPERPARAMETERS
hp = Dict("K"=>5)

# Compute normalization parameters from TRAINING set only
normParams = calculateMinMaxNormalizationParameters(trainingInputs)
train_df = normalizeMinMax(trainingInputs, normParams)
test_df = normalizeMinMax(testInputs, normParams)

# --- Convert targets ---
train_y_cat = categorical(vec(trainingTargets))
test_y_cat  = categorical(vec(testTargets))

# Load model (insert hyperparameters manually)
model = kNNClassifier(K=5)

# --- Train machine ---
mach = machine(model, MLJ.table(train_df), train_y_cat)
MLJ.fit!(mach)
    
# --- Predict ---
ŷ = MLJ.predict(mach, MLJ.table(test_df))              # probabilistic predictions
y_pred = CategoricalArray(mode.(ŷ))         # convert to class labels
    
# --- Compute metrics ---
metrics = confusionMatrix(y_pred, test_y_cat)

printResult(:KNN, hp, metrics, label_names)

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(KNNClassifier(K = 5, …), …).



 Model: KNN
 Hyperparameters: Dict("K" => 5)
 Accuracy                : 0.909
 Error Rate              : 0.091
 Sensitivity/Recall      : 0.909
 Specificity             : 0.9692
 PPV                     : 0.9087
 NPV                     : 0.9785
 F1 Score                : 0.9066

Confusion Matrix:
[67 0 0 36; 0 282 6 8; 0 0 383 0; 8 33 0 177]

┌───────────┬──────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼──────┼───────────┼──────────┼──────┤
│[1m      Good [0m│   67 │         0 │        0 │   36 │
│[1m Hazardous [0m│    0 │       282 │        6 │    8 │
│[1m  Moderate [0m│    0 │         0 │      383 │    0 │
│[1m      Poor [0m│    8 │        33 │        0 │  177 │
└───────────┴──────┴───────────┴──────────┴──────┘


# Stacking Ensemble

In [58]:
SVMClassifier = @load ProbabilisticSVC pkg=LIBSVM verbosity=0
DTClassifier  = @load DecisionTreeClassifier pkg=DecisionTree verbosity=0
kNNClassifier = @load KNNClassifier pkg=NearestNeighborModels verbosity=0

trainIdx, testIdx = holdOut(size(df,1), 0.2)

trainingInputs  = inputs[trainIdx, :]
testInputs      = inputs[testIdx, :]

trainingTargets = targets[trainIdx]
testTargets     = targets[testIdx]

normParams = calculateMinMaxNormalizationParameters(trainingInputs)
trainingInputs = normalizeMinMax(trainingInputs, normParams)
testInputs  = normalizeMinMax(testInputs,  normParams)

# X as DataFrame
train_df = DataFrame(trainingInputs, :auto)
test_df  = DataFrame(testInputs, :auto)

# y as categorical
train_y_cat = categorical(trainingTargets)
test_y_cat  = categorical(testTargets)

# ---- Base Models ----
base_models_dict = Dict(
    "SVM" => SVMClassifier(kernel=LIBSVM.Kernel.RadialBasis, cost=1.0, gamma=2.0),
    "DT"  => DTClassifier(max_depth=8, rng=Random.MersenneTwister(1)),
    "kNN" => kNNClassifier(K=5)
)

# Convert to NamedTuple for Stack
base_models_NT = (; (Symbol(k)=>v for (k,v) in base_models_dict)...)

# ---- Stacking Model ----
stack_model = Stack(;
    metalearner = DTClassifier(max_depth=3, rng=Random.MersenneTwister(1)),
    resampling = CV(nfolds=5, shuffle=true, rng=123),  # deterministic
    measure = accuracy, 
    base_models_NT...
)

# Train the stacking model on your train dataset
stack_mach = machine(stack_model, train_df, train_y_cat) |> MLJ.fit!

y_pred = mode.(MLJ.predict(stack_mach, test_df))
acc = MLJ.accuracy(y_pred, test_y_cat)
println("Stack ensemble accuracy = $(round(acc*100, digits=2)) %")

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(ProbabilisticStack(metalearner = DecisionTreeClassifier(max_depth = 3, …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:SVM, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:DT, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:kNN, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:SVM, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:DT, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:kNN, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:SVM, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:DT, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:kNN, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:SVM, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:DT, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining 

Stack ensemble accuracy = 93.1 %
